-
Notifications
You must be signed in to change notification settings - Fork 3
/
Copy pathpost_srun.cori
executable file
·208 lines (182 loc) · 5.42 KB
/
post_srun.cori
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
#!/bin/csh
# cori post-srun script:
# ./post_srun.cori with the following optional argument keyword/value pairs, in any order
# -case <case name>
# -stdout <standard output from application>
# -jobid <should be same as $SLURM_JOB_ID>
# -user <should be same as $LOGNAME>
# -subdir <directory from which job script was submitted>
# -rundir <run directory>
# -save <TRUE|true|anything else>
# -archive <performance archive root directory>
# -sampling <sampling interval in seconds>
# -sample_script <script used to sample system status>
# -outpe_num <profile_outpe_num setting>
# -flush_count <mon_flush_count setting>
# -nprocess <number of MPI processes>
# -est_remaining <TRUE|true|anything else>
# NOTE: must be consistent with arguments in pre_srun.cori call, as variables set in pre_srun.cori are not 'inherited' by post_srun.cori
# Default value for every option. Each one can be replaced by the matching
# keyword/value pair on the command line.

# Job identity: taken from the SLURM environment and the login name.
set case = $SLURM_JOB_NAME
set app_jobid = $SLURM_JOB_ID
set user = $LOGNAME

# The submission directory and the run directory both default to the
# SLURM submit directory.
set sub_dir = $SLURM_SUBMIT_DIR
set run_dir = $SLURM_SUBMIT_DIR

# File name of the application standard output, looked up in the run directory.
set app_stdout = "xgc2.out"

# Performance archiving: on by default, rooted at save_timing_dir.
set save_timing = "TRUE"
set save_timing_dir = "/project/projectdirs/m499"

# Settings accepted so that the argument list can mirror the pre_srun.cori
# call. They are recorded here but not referenced again in the visible script.
set sample_interval = 900
set syslog_script = "/project/projectdirs/m499/camtimers/xgc_syslog.cori"
set outpe_num = 1
set flush_count = 12
set n_process = 0
set est_remaining = "FALSE"
# Walk the argument list as keyword/value pairs, in any order.
# The loop stops before the final argument, so a trailing keyword that has
# no value after it is ignored. Unrecognized words are skipped one at a time.
# The X prefix keeps the comparison well formed for arguments that are empty
# or begin with a dash.
set i = 1
while ($i < $#argv)
  switch ("X$argv[$i]")
    case X-case:
      @ i++
      set case = $argv[$i]
      breaksw
    case X-stdout:
      @ i++
      set app_stdout = $argv[$i]
      breaksw
    case X-jobid:
      @ i++
      set app_jobid = $argv[$i]
      breaksw
    case X-user:
      @ i++
      set user = $argv[$i]
      breaksw
    case X-subdir:
      @ i++
      set sub_dir = $argv[$i]
      breaksw
    case X-rundir:
      @ i++
      set run_dir = $argv[$i]
      breaksw
    case X-save:
      @ i++
      set save_timing = $argv[$i]
      breaksw
    case X-archive:
      @ i++
      set save_timing_dir = $argv[$i]
      breaksw
    case X-sampling:
      @ i++
      set sample_interval = $argv[$i]
      breaksw
    case X-sample_script:
      @ i++
      set syslog_script = $argv[$i]
      breaksw
    case X-outpe_num:
      @ i++
      set outpe_num = $argv[$i]
      breaksw
    case X-flush_count:
      @ i++
      set flush_count = $argv[$i]
      breaksw
    case X-nprocess:
      @ i++
      set n_process = $argv[$i]
      breaksw
    case X-est_remaining:
      @ i++
      set est_remaining = $argv[$i]
      breaksw
  endsw
  # Step past the word just handled: the value of a recognized keyword,
  # or the unrecognized word itself.
  @ i++
end
# Announce completion and remember the end time for the status file.
echo "`date` -- APPLICATION EXECUTION HAS FINISHED"
set sdate = `date +"%Y-%m-%d %H:%M:%S"`

# Find the batch job output file in the submission directory: the first
# non-directory entry whose name ends with the job id. The -p flag makes ls
# append a slash to directories so that grep can drop them.
cd $sub_dir
set candidates = `ls -pd *$app_jobid | grep -v / `

# Fall back to a placeholder name when nothing matched.
set job_out = "no_job_out"
if ("X$candidates" != "X") then
  # Keep only the first blank-separated name from the listing.
  set job_out = `echo $candidates | sed 's/^[ ]*\([^ ]*\).*/\1/' `
endif

# Make sure the chosen file exists, then open up its permissions.
if (! -e $job_out) then
  touch $job_out
endif
chmod g+w $job_out
chmod a+rX $job_out
# Finalize the run directory: close out the status file, remove run-time
# marker files, stop the process recorded in the syslog id file, and restore
# the perf_in and mon_in files from their job-specific copies.
cd $run_dir

# Drop the .X suffix from the status file, then append the srun end time
# captured earlier in sdate. The >>& form appends both stdout and stderr.
mv CaseStatus.$app_jobid.X CaseStatus.$app_jobid
echo "srun ended $sdate" >>& CaseStatus.$app_jobid

# Remove marker files left in the run directory.
if (-e xgc_mpi_init) then
  /bin/rm -f xgc_mpi_init
endif
if (-e Walltime.Remaining) then
  /bin/rm -f Walltime.Remaining
endif

# Read the id stored in the job-specific syslog id file, if present, and
# remove that file. Presumably the id belongs to the background status
# sampler started by pre_srun.cori -- confirm against that script.
set syslog_jobid = 0
if (-e syslog_jobid.$app_jobid.X) then
  set syslog_jobid = `cat syslog_jobid.$app_jobid.X`
  /bin/rm -f syslog_jobid.$app_jobid.X
endif

# The comparison is quoted and X-prefixed so that an empty id file cannot
# turn the test into a csh expression syntax error, which would abort the
# script before any archiving is done. An empty id is treated like 0.
# NOTE(review): kill is a csh builtin; confirm whether a failure here,
# for example a process that has already exited, should be tolerated.
if (("X$syslog_jobid" != "X") && ("X$syslog_jobid" != "X0")) then
  kill $syslog_jobid
endif

# Replace perf_in with the job-specific copy when one exists; the current
# perf_in is removed either way.
if (-e perf_in) then
  /bin/rm -f perf_in
endif
if (-e perf_in.$app_jobid.X) then
  mv perf_in.$app_jobid.X perf_in
endif

# Same treatment for mon_in.
if (-e mon_in) then
  /bin/rm -f mon_in
endif
if (-e mon_in.$app_jobid.X) then
  mv mon_in.$app_jobid.X mon_in
endif
# Timestamp that labels everything saved from this run.
set lid = `date +"%y%m%d-%H%M%S"`

# Rename the timing directory with the job id and timestamp, pack it into a
# gzip-compressed tar file, and remove the renamed directory.
set timing_tag = timing.$app_jobid.$lid
mv timing ${timing_tag}
tar cf ${timing_tag}.tar ${timing_tag}
gzip ${timing_tag}.tar
/bin/rm -rf ${timing_tag}

# Give the log summary a job-specific name, creating an empty one when the
# run produced none, then open up its permissions.
set summary_log = log_summary_out.$app_jobid
if (! -e log_summary_out) then
  touch ${summary_log}
else
  mv log_summary_out ${summary_log}
endif
chmod g+w ${summary_log}
chmod a+rX ${summary_log}
# Copy the results of this run into the performance archive. This happens
# only when saving is enabled (TRUE or true) and the archive root exists.
# Layout: <root>/performance_archive/<user>/<case>/<jobid>/<lid>
if (($save_timing == 'TRUE') || ($save_timing == 'true')) then
  if (-d $save_timing_dir) then
    cd $save_timing_dir

    # The shared top-level directory is created world-writable with the
    # set-ID bits added.
    if (! -d performance_archive) then
      mkdir performance_archive
      chmod 777 performance_archive
      chmod a+s performance_archive
    endif
    cd performance_archive

    # Descend through user, case and job id, creating each level on demand
    # with group write and world read/search permissions.
    foreach level ($user $case $app_jobid)
      if (! -d $level) then
        mkdir $level
        chmod g+w $level
        chmod a+rX $level
      endif
      cd $level
    end

    # The per-run directory is created the same way, but it is entered only
    # after the checkpoints entry of the job directory has been moved into it.
    if (! -d $lid) then
      mkdir $lid
      chmod g+w $lid
      chmod a+rX $lid
    endif
    # NOTE(review): checkpoints is expected to exist in the job directory
    # already; presumably pre_srun.cori creates it -- confirm.
    mv checkpoints $lid/checkpoints.$lid
    cd $lid

    # Common suffix for the archived copies: job id plus timestamp.
    set stamp = $app_jobid.$lid

    # Pull the saved input out of the checkpoints directory, then copy the
    # run outputs next to it while keeping their timestamps.
    mv checkpoints.$lid/input.$app_jobid input.${stamp}
    cp --preserve=timestamps $run_dir/$app_stdout $app_stdout.${stamp}
    cp --preserve=timestamps $run_dir/CaseStatus.$app_jobid CaseStatus.${stamp}
    cp --preserve=timestamps $run_dir/timing.${stamp}.tar.gz timing.${stamp}.tar.gz
    cp --preserve=timestamps $run_dir/log_summary_out.$app_jobid log_summary_out.${stamp}
    cp --preserve=timestamps $sub_dir/$job_out $job_out.$lid

    # Open up permissions on everything archived, then compress it all
    # recursively.
    chmod -R g+w *
    chmod -R a+rX *
    gzip -r *
  endif
endif