options.py
import argparse


def get_parser():
    parser = argparse.ArgumentParser(description='WoodFisher: model compression')
    # The following are required parameters; the defaults are for formatting-example purposes
    parser.add_argument('--dset', default='imagenet', type=str,
                        help='dataset for the task (default: "imagenet")')
    parser.add_argument('--dset_path', default='/home/dalistar/ILSVRC', type=str,
                        help='path to the dataset (default: "/home/dalistar/ILSVRC")')
    parser.add_argument('--arch', default='efficientnetb0', type=str,
                        help='NN architecture for the task (default: "efficientnetb0")')
    parser.add_argument('--config_path', type=str,
                        help='path to config file')
    parser.add_argument('--pretrained', action='store_true',
                        help='use a pretrained model')
    parser.add_argument('--use_butterfly', action='store_true',
                        help='replace all 2D convolutional layers with Butterfly convolutions')
    parser.add_argument('--use_se', action='store_true',
                        help='use SE in mixed conv resnet')
    parser.add_argument('--se_ratio', type=float, default=None,
                        help='se ratio for SELayer (default: 0.5)')
    parser.add_argument('--kernel_sizes', type=int, default=3)
    parser.add_argument('--p', type=int, default=3)
    parser.add_argument('--aa', action='store_true', help='use auto-augment for imagenet')
    # Training-related parameters
    parser.add_argument('--epochs', type=int,
                        help='number of epochs to run (default: 20)')
    parser.add_argument('--batch_size', default=256, type=int,
                        help='mini-batch size (default: 256)')
    parser.add_argument('--test-batch_size', default=None, type=int,
                        help='mini-batch size for test (default: None)')
    parser.add_argument('--recompute_bn_stats', action='store_true',
                        help='recompute bn statistics after pruning')
    parser.add_argument('--num_samples', default=4096, type=int,
                        help='number of samples to compute pruning statistics for Fisher and SNIP based pruners (default: 4096)')
    # Compute parameters
    parser.add_argument('--workers', default=4, type=int,
                        help='number of workers to load the data (default: 4)')
    parser.add_argument('--cpu', action='store_true',
                        help='force training on CPU')
    parser.add_argument('--gpus', default=None,
                        help='comma-separated list of GPU device ids to use; this assumes that parallel is applied (default: all devices)')
    # Run history management
    # The most convenient way is to specify --exp_name; then the logs and models will be stored under
    # ../exp_root/{exp_name}/{current_inferred_datetime}/
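    # For example (hypothetical name; the exact datetime format depends on how the run dir is
    # created elsewhere): with --exp_name prune_rn50, each run would end up under
    # ../exp_root/prune_rn50/<datetime-of-run>/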
    parser.add_argument('--experiment_root_path', type=str, default='../exp_root',
                        help='path to directory under which all experiments will be stored; you can leave this argument as is')
    parser.add_argument('--exp_name', type=str, default='default_exp',
                        help='name of the experiment, will be used to name a subdirectory of experiments_root; ' +
                             'in this subdirectory, all runs (named by datetime) of this experiment will be stored')
    parser.add_argument('--logging_level', type=str, default='info',
                        help='logging level: debug, info, warning, error, critical (default: info)')
    parser.add_argument('--training_stats_freq', type=int, default=30,
                        help='the frequency (number of minibatches) to track training stats, e.g., loss, accuracy etc. (default: 30)')
    parser.add_argument('--checkpoint_freq', type=int, default=1,
                        help='epoch frequency with which the checkpoints are dumped; at each time, two checkpoints are maintained: ' +
                             'latest and best on validation/test set')
    parser.add_argument('--from_checkpoint_path', type=str, default=None,
                        help='specifies path to *run_dir* from which the progress should be resumed')
    parser.add_argument('--use-model-config', action='store_true',
                        help='use the current model config for the checkpoint too; used in case the ckpt was not saved in the right way')
    parser.add_argument('--reset_training_policy', action='store_true',
                        help='reset the training policy optimizer and lr_scheduler to the initial config')
    parser.add_argument('--load_distiller_weights_from', type=str, default=None,
                        help='path from which to load the model weights from a distiller checkpoint')
    parser.add_argument('--ckpt-epoch', default=-1, type=int, help='at what epoch was the checkpoint taken')
    # MLPNet specific args
    parser.add_argument('--num-hidden-nodes1', default=40, type=int,
                        help='mlpnet: number of hidden nodes in hidden layer 1')
    parser.add_argument('--num-hidden-nodes2', default=20, type=int,
                        help='mlpnet: number of hidden nodes in hidden layer 2')
    parser.add_argument('--num-classes', default=10, type=int,
                        help='number of target classes')
    parser.add_argument('--disable-log-soft', action='store_true', help='disable log softmax for mlpnet')
    parser.add_argument('--seed', default=0, type=int, help='seed the computations!')
    parser.add_argument('--enable-dropout', action='store_true', help='enable dropout for MLPNet')
    parser.add_argument('--disable_bias', action='store_false', help='disable bias in the neural network layers')
    # fisher related
    parser.add_argument('--fisher-seed', default=-1, type=int, help='seed for the compute_emprical_fisher method')
    parser.add_argument('--fisher-parts', default=5, type=int, help='num of parts to divide fisher computation into')
    parser.add_argument('--fisher-optimized', action='store_true',
                        help='split the addition of outer products into parts to save memory')
    parser.add_argument('--save-fisher', action='store_true', help='whether to dump the emp_fisher to storage')
    parser.add_argument('--fisher-cpu', action='store_true', help='compute the fisher on cpu!!')
    parser.add_argument('--load-fisher', default="", type=str, help='path from which to load the emp_fisher')
    parser.add_argument('--fisher-subsample-size', type=int, default=32, action='store',
                        help='number of training samples over which the empirical fisher is computed')
    parser.add_argument('--fisher-damp', type=float, default=1e-3, action='store',
                        help='dampening factor to scale the identity matrix and make the fisher invertible')
    parser.add_argument('--aux-gpu-id', default=-1, type=int, help='id of the auxiliary GPU to use')
    parser.add_argument('--update-config', action='store_true',
                        help='update the config; the flags below take effect only when this flag is enabled')
    parser.add_argument('--prune-class', type=str, default='woodfisher',
                        choices=['magni', 'globalmagni', 'naivehess', 'diagfisher',
                                 'woodfisher', 'woodtaylor', 'woodfisherblock', 'woodtaylorblock',
                                 'kfac', 'woodfisherblockgroup', 'woodfisherblockdynamic', 'woodfisherblock_flops'],
                        help='which pruner to use (assumes only 1 pruner)')
    parser.add_argument('--prune-optimizer', type=str, default=None, choices=['Adam', 'SGD', 'RMSprop'],
                        help='optimizer to use for retraining while pruning')
    parser.add_argument('--prune-start', type=int, default=None,
                        help='starting epoch for the gradual pruning procedure')
    parser.add_argument('--prune-end', type=int, default=None,
                        help='ending epoch for the gradual pruning procedure')
    parser.add_argument('--prune-freq', type=int, default=None,
                        help='frequency at which pruning is carried out!')
    parser.add_argument('--prune-modules', type=str, default=None,
                        help='which modules to prune: space-separated string with module names')
    parser.add_argument('--untrained-last', action='store_true', help="don't prune after the last epoch!")
    parser.add_argument('--init-sparsity', action='store', type=float, default=None,
                        help='initial sparsity')
    parser.add_argument('--target-sparsity', action='store', type=float, default=None,
                        help='target sparsity to achieve!')
    parser.add_argument('--one-shot', action='store_true',
                        help='one-shot pruning after training for prune_start-prune_end epochs!')
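    # Illustrative one-shot pruning invocation (sketch only: the entry-point script name and
    # the specific flag values below are assumptions, not part of this file):
    #   python main.py --dset imagenet --arch efficientnetb0 --pretrained \
    #       --update-config --config_path <config.yaml> \
    #       --prune-class woodfisherblock --one-shot --target-sparsity 0.8 \
    #       --fisher-subsample-size 400 --fisher-mini-bsz 400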
    parser.add_argument('--prune-bias', action='store_true', default=False, help='prune biases as well')
    parser.add_argument('--prune-all', action='store_true', default=False, help='prune all modules in the network')
    parser.add_argument('--prune-lr', default=None, type=float,
                        help='learning rate for the retraining part of pruning')
    parser.add_argument('--prune-momentum', default=None, type=float,
                        help='momentum for the retraining part of pruning')
    parser.add_argument('--set-prune-momentum', action='store_true',
                        help='add momentum to the config file if not already set there')
    parser.add_argument('--prune-wdecay', default=None, type=float, help='weight decay for the retraining part of pruning')
    parser.add_argument('--result-file', action='store', type=str, default='',
                        help='path to the file containing the saved results')
    parser.add_argument('--sweep-id', default=-1, type=int, help='id of the experiment in the sweep')
    parser.add_argument('--not-oldfashioned', action='store_true', help='the checkpoints are not old_fashioned!')
    parser.add_argument('--ignore-prefix', action='store_true', help='ignore the module prefix used!')
    parser.add_argument('--batched-test', action='store_true', help='custom test dataset in batched mode!')
    parser.add_argument('--cache-subset-ids', action='store_true',
                        help='sample subset indices only once, in the hope of reducing randomness!')
    parser.add_argument('--full-subsample', action='store_true', help='do full subsampling in every epoch!')
    parser.add_argument('--deterministic', action='store_true', help='makes things deterministic!')
    parser.add_argument('--normalize-hgp', action='store_true',
                        help='normalize the hessian gradient product term for the update in woodtaylor!')
    parser.add_argument('--prune-direction', action='store_true', help='get the pruning direction for loss analysis')
    parser.add_argument('--num-path-steps', default=-1, type=float, action='store',
                        help='num of discretization steps for loss path analysis')
    parser.add_argument('--previous-mask', action='store_true', help='use the mask from before pruning for loss analysis')
    parser.add_argument('--zero-after-prune', action='store_true',
                        help='set the update so that it is zero after pruning (for woodburry-like pruners)')
    parser.add_argument('--compare-models', action='store_true', help='compare models with one copied beforehand')
    parser.add_argument('--no-dataparallel', action='store_true', help="don't use dataparallel")
    parser.add_argument('--always-eval-test', action='store_true',
                        help='always test in eval mode (even when evaluating train loss)')
    parser.add_argument('--disable-train-random-transforms', action='store_true',
                        help='disable random transforms in the train dataset')
    parser.add_argument('--disable-train-shuffle', action='store_true', help='disable shuffling in the train dataloader')
    parser.add_argument('--check-train-loss', action='store_true', help='check the train loss at various places')
    parser.add_argument('--save-before-prune-ckpt', action='store_true', help='save a checkpoint just before pruning!')
    parser.add_argument('--inspect-inv', action='store_true', help='inspect the inverse!')
    parser.add_argument('--fisher-mini-bsz', default=1, type=int, action='store',
                        help='number of gradients to average into a mini-batch before taking their outer product!')
    parser.add_argument('--max-mini-bsz', default=None, type=int, action='store',
                        help='max mini-batch size that fits on the machine; if fisher-mini-bsz is greater than this, split it into batches of this size')
    parser.add_argument('--prune-at-launch', action='store_true',
                        help='start pruning at the very first epoch when pretraining!')
    parser.add_argument('--layer-trace-stat', action='store_true',
                        help='use layer trace stats and multiply by the weight_stats computed in the layer!')
    parser.add_argument('--woodburry-joint-sparsify', action='store_true',
                        help='jointly compute param stats in woodburry!')
    parser.add_argument('--dump-grads-mat', action='store_true',
                        help='dump grads in matlab format!')
    parser.add_argument('--dump-fisher-inv-mat', action='store_true',
                        help='dump the fisher inverse computed via woodfisher in matlab format!')
    parser.add_argument('--fisher-trace', action='store_true',
                        help='compute layerwise traces of the empirical fisher matrix')
    parser.add_argument('--eps', default=1e-10, type=float, action='store',
                        help='constant added to prevent divide by 0')
    parser.add_argument('--check-grads', action='store_true',
                        help='check if the grads based on which jl is computed are the same as the current grads')
    parser.add_argument('--true-fisher', action='store_true',
                        help='use the true fisher (sample y from the model) rather than the empirical fisher')
    parser.add_argument('--fisher-split-grads', action='store_true',
                        help='split the grads to fit the outer product in memory!')
    parser.add_argument('--offload-inv', action='store_true',
                        help='offload block inverses to CPU for efficient woodburry joint sparsify!')
    parser.add_argument('--fittable-params', action='store', default=-1, type=int,
                        help='number of parameters which woodfisher can accommodate in GPU memory!')
    parser.add_argument('--offload-grads', action='store_true',
                        help='offload the collected grads to prevent OOM!')
    parser.add_argument('--eval-fast', action='store_true',
                        help='do eval faster via the my_test methods instead of my_test_dataset')
    parser.add_argument('--export_onnx', action='store_true', help='export onnx')
    parser.add_argument('--fisher-damp-correction', action='store_true',
                        help='fix the incorrect division in the 1st iterate numerator')
    parser.add_argument('--grad-subsample-size', type=int, default=None, action='store',
                        help='number of training samples over which the full gradient is computed for the taylor series')
    parser.add_argument('--normalize-update', action='store_true', help='normalize the weight update for woodtaylor!')
    parser.add_argument('--normalize-update-mult', type=float, default=1, action='store',
                        help='multiplier while normalizing the weight update for woodtaylor!')
    parser.add_argument('--kfac-pi', action='store_true', help='use pi-based dampening for KFAC!')
    parser.add_argument('--local-quadratic', action='store_true',
                        help='analyse the training loss given by the local quadratic model')
    parser.add_argument('--compare-globalmagni-mask', action='store_true',
                        help='compare the layerwise mask generated via (joint) WF with global magni')
    parser.add_argument('--spearman-globalmagni', action='store_true',
                        help='rank correlation between the pruning statistic from global magni and WF')
    # UPDATE: both --subtract-min and --check-reintro are not an issue,
    # as the diagonal of the inverse fisher will always be positive.
    parser.add_argument('--subtract-min', action='store_true',
                        help='subtract the minimum of the statistic for WF')
    parser.add_argument('--repeated-one-shot', action='store_true',
                        help='repeated one-shot pruning after training, with no fine-tuning!')
    parser.add_argument('--scale-prune-update', type=float, default=1, action='store',
                        help='multiplier for reducing the effect of calibrating the other weights!')
    parser.add_argument('--centered', action='store_true',
                        help='center the empirical fisher matrix (currently supported in WoodTaylor)!')
    parser.add_argument('--flops', action='store_true', help='get flops')
    parser.add_argument('--flops-power', type=float, default=0,
                        help='exponent of the flop-based statistic which gets multiplied to form the overall stat')
    parser.add_argument('--flops-per-param', action='store_true',
                        help='normalize by the param count of the respective layer')
    parser.add_argument('--flops-normalize', type=str, default=None, help='balance out the flop counts')
    parser.add_argument('--flops-target', type=float, default=-1, help='precise target of FLOPs to achieve')
    parser.add_argument('--flops-epsilon', type=float, default=1, help='tolerance in achieving the target FLOP count')
    parser.add_argument('--topk', action='store_true', help='print topK accuracy')
    parser.add_argument('--mask-onnx', action='store_true', help='apply masks for the onnx checkpoint')
    parser.add_argument('--onnx-nick', type=str, default=None, action='store', help='name for the generated onnx')
    parser.add_argument('--save-dense-also', action='store_true',
                        help='also save the dense model by applying masks to the sparsified model')
    parser.add_argument('--recompute-schedule', type=str, default=None, action='store', choices=["linear", "poly"],
                        help='type of schedule for recomputation')
    parser.add_argument('--recompute-num', type=int, default=None, help='number of recompute steps to carry out')
    parser.add_argument('--recompute-degree', type=float, default=None, help='degree of the polynomial recompute steps')
    parser.add_argument('--disable-wdecay-after-prune', action='store_true',
                        help='disable the weight decay after pruning (set it to zero)')
    parser.add_argument('--woodtaylor-abs', action='store_true', help='consider the absolute value in woodtaylor stats')
    parser.add_argument('--fisher-effective-damp', action='store_true',
                        help='use the effective dampening constant, i.e., WF (positive definite) + training')
    # Flags which are not used for the primary results, but can be played around with if needed!
    parser.add_argument('--label-smoothing', type=float, default=0,
                        help='how much to soften the labels for the training loss')
    parser.add_argument('--hess-label-smoothing', type=float, default=None,
                        help='how much to soften the labels for the hessian')
    return parser.parse_args()
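
# Usage sketch (an illustrative assumption, not part of the original file): since get_parser()
# already calls parse_args(), a training entry point would typically consume it as
#
#     from options import get_parser
#
#     args = get_parser()
#     print(args.arch, args.batch_size, args.prune_class, args.target_sparsity)
#
# Note that argparse converts dashes to underscores, e.g. --prune-class is read as args.prune_class.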