%% [x, f, g, status] = optm_vmlmb(fg, x0, 'key', val, ...);
%%
%% Apply the VMLMB algorithm to minimize a multi-variate differentiable
%% objective function, possibly under separable bound constraints. VMLMB is a
%% quasi-Newton method ("VM" is for "Variable Metric") with low memory
%% requirements ("LM" is for "Limited Memory") which can optionally take
%% into account separable bound constraints (the final "B") on the variables.
%% To determine efficient search directions, VMLMB approximates the Hessian of
%% the objective function by a limited memory version of the model assumed in
%% the Broyden-Fletcher-Goldfarb-Shanno algorithm (called L-BFGS for short).
%% Hence VMLMB is well suited to solving optimization problems with a very
%% large number of variables, possibly with bound constraints.
%%
%% The method has two required arguments: `fg`, the function to call to
%% compute the objective function and its gradient, and `x0`, the initial
%% variables (VMLMB is an iterative method). The initial variables may be an
%% array of any dimensions.
%%
%% The method returns `x` the best solution found during iterations, `f` and
%% `g` the value and the gradient of the objective at `x`, and `status` an
%% integer code indicating the reason for the termination of the algorithm
%% (see `optm_reason`).
%%
%% The function `fg` shall be implemented as follows:
%%
%% function [f, g] = fg(x)
%% f = ...; % value of the objective function at `x`
%%         g = ...; % gradient of the objective function at `x`
%% end
%%
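%% For instance, a minimal sketch (using the standard Rosenbrock test problem
%% for illustration; `rosenbrock_fg` is not part of this file) may look like:
%%
%%     function [f, g] = rosenbrock_fg(x)
%%         f = 100*(x(2) - x(1)^2)^2 + (1 - x(1))^2;
%%         g = [-400*(x(2) - x(1)^2)*x(1) - 2*(1 - x(1));
%%              200*(x(2) - x(1)^2)];
%%     end
%%
%%     [x, f, g, status] = optm_vmlmb(@rosenbrock_fg, [-1.2; 1.0]);
%%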
%% All other settings are specified by keyword names followed by their value
%% (e.g. `'key',val`); the possible keywords are listed below.
%%
%% - Keywords `upper` and `lower` specify an upper and/or a lower bound for
%%   the variables (see the example after this list). If unspecified or set
%%   to an empty array, a given bound is considered as unlimited. Bounds must
%%   be conformable with the variables.
%%
%% - Keyword `mem` specifies the memory used by the algorithm, that is the
%% number of previous steps memorized to approximate the Hessian of the
%% objective function. With `mem=0`, the algorithm behaves as a steepest
%% descent method. The default is `mem=5`.
%%
%% - Keywords `ftol`, `gtol` and `xtol` specify tolerances for deciding the
%% convergence of the algorithm.
%%
%% Convergence in the function occurs if one of the following conditions
%%   holds:
%%
%% f ≤ fatol
%% |f - fp| ≤ frtol⋅max(|f|, |fp|)
%%
%% where `f` and `fp` are the values of the objective function at the current
%% and previous iterates. In these conditions, `fatol` and `frtol` are
%% absolute and relative tolerances specified by `ftol` which can be
%%   `ftol=[fatol,frtol]` or `ftol=frtol`, in which case `fatol=-Inf` is
%%   assumed. The default is `ftol=1e-8`.
%%
%% Convergence in the gradient occurs if the following condition holds:
%%
%% ‖g‖ ≤ max(0, gatol, grtol⋅‖g0‖)
%%
%%   where `‖g‖` is the Euclidean norm of the projected gradient and `g0` is
%%   the projected gradient at the initial solution. In this condition, `gatol`
%% and `grtol` are absolute and relative gradient tolerances specified by
%%   `gtol` which can be `gtol=[gatol,grtol]` or `gtol=grtol`, in which case
%%   `gatol=0` is assumed. The default is `gtol=1e-5`.
%%
%% Convergence in the variables occurs if the following condition holds:
%%
%%       ‖x - xp‖ ≤ max(0, xatol, xrtol⋅‖x‖)
%%
%% where `x` and `xp` are the current and previous variables. In this
%% condition, `xatol` and `xrtol` are absolute and relative tolerances
%%   specified by `xtol` which can be `xtol=[xatol,xrtol]` or `xtol=xrtol`,
%%   in which case `xatol=0` is assumed. The default is `xtol=1e-6`.
%%
%% - Keywords `maxiter` and `maxeval` are to specify a maximum number of
%%   algorithm iterations or evaluations of the objective function
%% implemented by `fg`. By default, these are unlimited.
%%
%% - Keyword `lnsrch` specifies line-search settings different from the
%%   default (see `optm_new_line_search`).
%%
%% - Keywords `f2nd`, `fmin`, `dxrel`, and `dxabs` are used to determine the
%%   step length along the steepest descent (see `optm_steepest_descent_step`).
%%
%% - Keyword `epsilon` specifies a threshold for a sufficient descent
%% condition. If `epsilon > 0`, then a search direction `d` computed by the
%% L-BFGS approximation is considered as acceptable if:
%%
%% ⟨d,g⟩ ≤ -epsilon⋅‖d‖⋅‖g‖
%%
%% where `g` denotes the projected gradient of the objective function (which
%%   is just the gradient in the unconstrained case). Otherwise, the condition
%%   is `⟨d,g⟩ < 0`. The default is `epsilon = 0`, so only the latter
%%   condition is checked.
%%
%% - Keyword `blmvm` (false by default) specifies whether to use the BLMVM
%%   trick to account for the bound constraints in the L-BFGS model of the
%%   Hessian. If `blmvm` is set to true, the overhead of the algorithm may be
%%   reduced, but the L-BFGS model of the Hessian is more likely to be
%%   inaccurate, causing the algorithm to choose the steepest descent
%%   direction more often.
%%
%% - Keyword `verb`, if positive, specifies that information is printed every
%%   `verb` iterations. Nothing is printed if `verb ≤ 0`. By default,
%%   `verb = 0`.
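%%
%% For illustration, a sketch of a call with a nonnegativity constraint on the
%% variables, explicit tolerances, and printing every 10 iterations (all the
%% values below are arbitrary choices, not defaults) may look like:
%%
%%     [x, f, g, status] = optm_vmlmb(fg, x0, 'lower', 0, 'mem', 7, ...
%%                                    'ftol', [0, 1e-10], 'gtol', 1e-6, ...
%%                                    'maxiter', 1000, 'verb', 10);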
function [x, f, g, status] = optm_vmlmb(fg, x, varargin)
if nargin < 2
print_usage;
end
%% Constants. Calling inf, nan, true or false takes too much time (2.1µs
%% instead of 0.2µs if stored in a variable), so use local variables to pay
%% the price only once.
INF = inf();
NAN = nan();
TRUE = true();
FALSE = false();
%% Parse settings.
lower = [];
upper = [];
mem = 5;
maxiter = INF;
maxeval = INF;
ftol = 1e-8;
gtol = 1e-5;
xtol = 1e-6;
lnsrch = [];
verb = 0;
f2nd = NAN;
fmin = NAN;
dxrel = NAN;
dxabs = 1.0;
epsilon = 0.0;
blmvm = FALSE;
if mod(length(varargin), 2) ~= 0
error('parameters must be specified as pairs of names and values');
end
for i = 1:2:length(varargin)
key = varargin{i};
val = varargin{i+1};
switch key
case 'lower'
lower = val;
case 'upper'
upper = val;
case 'mem'
mem = val;
case 'maxiter'
maxiter = val;
case 'maxeval'
maxeval = val;
case 'ftol'
ftol = val;
case 'gtol'
gtol = val;
case 'xtol'
xtol = val;
case 'lnsrch'
lnsrch = val;
case 'verb'
verb = val;
case 'f2nd'
f2nd = val;
case 'fmin'
fmin = val;
case 'dxrel'
dxrel = val;
case 'dxabs'
dxabs = val;
case 'epsilon'
epsilon = val;
case 'blmvm'
blmvm = (val ~= 0);
otherwise
error('invalid parameter name `%s`', key);
end
end
%% Tolerances. Most of these are forced to be nonnegative to simplify
%% tests.
if isscalar(ftol)
fatol = -INF;
frtol = max(0.0, ftol);
else
fatol = ftol(1);
frtol = max(0.0, ftol(2));
end
if isscalar(gtol)
gatol = 0.0;
grtol = max(0.0, gtol);
else
gatol = max(0.0, gtol(1));
grtol = max(0.0, gtol(2));
end
if isscalar(xtol)
xatol = 0.0;
xrtol = max(0.0, xtol);
else
xatol = max(0.0, xtol(1));
xrtol = max(0.0, xtol(2));
end
%% Bound constraints. For faster code, unlimited bounds are preferentially
%% represented by empty arrays.
if ~isempty(lower) && all(lower(:) == -INF)
lower = [];
end
if ~isempty(upper) && all(upper(:) == +INF)
upper = [];
end
bounded = (~isempty(lower) || ~isempty(upper));
if ~bounded
        blmvm = FALSE; % no need to use the BLMVM trick in the unconstrained case
end
%% If the caller does not retrieve the status argument, failures are
%% reported by throwing an error.
throwerrors = (nargout < 4);
%% Other initialization.
g = []; % gradient
f0 = +INF; % function value at start of line-search
g0 = []; % gradient at start of line-search
d = []; % search direction
s = []; % effective step
pg = []; % projected gradient
pg0 = []; % projected gradient at start of line search
pgnorm = 0.0; % Euclidean norm of the (projected) gradient
alpha = 0.0; % step length
evals = 0; % number of calls to `fg`
iters = 0; % number of iterations
projs = 0; % number of projections onto the feasible set
rejects = 0; % number of search direction rejections
status = 0; % non-zero when algorithm is about to terminate
best_f = +INF; % function value at `best_x`
best_g = []; % gradient at `best_x`
best_x = []; % best solution found so far
    best_pgnorm = -1; % norm of projected gradient at `best_x` (< 0 if unknown)
best_alpha = 0; % step length at `best_x`
best_evals = -1; % number of calls to `fg` at `best_x`
last_evals = -1; % number of calls to `fg` at last iterate
last_print = -1; % iteration number for last print
freevars = []; % subset of free variables (not yet known)
if isempty(lnsrch)
lnsrch = optm_new_line_search();
end
if ischar(fg)
fg = str2func(fg);
end
lbfgs = optm_new_lbfgs(mem);
if verb > 0
time = @() 86400E3*now(); % yields number of milliseconds
t0 = time();
end
%% Algorithm stage follows that of the line-search, it is one of:
%% 0 = initially;
%% 1 = line-search in progress;
%% 2 = line-search has converged.
stage = 0;
while TRUE
%% Make the variables feasible.
if bounded
%% In principle, we can avoid projecting the variables whenever
%% `alpha ≤ amin` (because the feasible set is convex) but rounding
%% errors could make this wrong. It is safer to always project the
            %% variables. This costs O(n) operations, which are probably
%% negligible compared to, say, computing the objective function
%% and its gradient.
x = optm_clamp(x, lower, upper);
projs = projs + 1;
end
%% Compute objective function and its gradient.
[f, g] = fg(x);
evals = evals + 1;
if f < best_f || evals == 1
%% Save best solution so far.
best_f = f;
best_g = g;
best_x = x;
best_pgnorm = -1; % must be recomputed
best_alpha = alpha;
best_evals = evals;
end
if stage ~= 0
%% Line-search in progress, check for line-search convergence.
lnsrch = optm_iterate_line_search(lnsrch, f);
stage = lnsrch.stage;
if stage == 2
%% Line-search has converged, `x` is the next iterate.
iters = iters + 1;
last_evals = evals;
elseif stage == 1
%% Line-search has not converged, peek next trial step.
alpha = lnsrch.step;
else
error('something is wrong!');
end
end
if stage ~= 1
%% Initial or next iterate after convergence of line-search.
if bounded
%% Determine the subset of free variables and compute the norm
%% of the projected gradient (needed to check for convergence).
freevars = optm_unblocked_variables(x, lower, upper, g);
pg = freevars .* g;
pgnorm = optm_norm2(pg);
else
%% Just compute the norm of the gradient.
pgnorm = optm_norm2(g);
end
if evals == best_evals
%% Now we know the norm of the (projected) gradient at the best
%% solution so far.
best_pgnorm = pgnorm;
end
%% Check for algorithm convergence or termination.
if evals == 1
%% Compute value for testing the convergence in the gradient.
gtest = max(gatol, grtol*pgnorm);
end
if pgnorm <= gtest
%% Convergence in gradient.
status = optm_status('GTEST_SATISFIED');
break
end
if stage == 2
%% Check convergence in relative function reduction.
if f <= fatol || abs(f - f0) <= frtol*max(abs(f), abs(f0))
status = optm_status('FTEST_SATISFIED');
break
end
%% Compute the effective change of variables.
s = x - x0;
snorm = optm_norm2(s);
%% Check convergence in variables.
if snorm <= xatol || (xrtol > 0 && snorm <= xrtol*optm_norm2(x))
status = optm_status('XTEST_SATISFIED');
break
end
end
if iters >= maxiter
status = optm_status('TOO_MANY_ITERATIONS');
break
end
end
if evals >= maxeval
status = optm_status('TOO_MANY_EVALUATIONS');
break
end
if stage ~= 1
%% Initial iteration or line-search has converged.
if verb > 0 && mod(iters, verb) == 0
%% Print iteration information.
print_iteration(iters, time() - t0, evals, rejects, ...
f, pgnorm, alpha);
last_print = iters;
end
%% Update L-BFGS approximation if a new step has been performed.
if stage ~= 0
if blmvm
lbfgs = optm_update_lbfgs(lbfgs, s, pg - pg0);
else
lbfgs = optm_update_lbfgs(lbfgs, s, g - g0);
end
end
%% Use L-BFGS approximation to determine a new search direction.
if blmvm
[d, scaled] = optm_apply_lbfgs(lbfgs, -pg);
d = d .* freevars;
else
[d, scaled] = optm_apply_lbfgs(lbfgs, -g, freevars);
end
%% Check whether `d` is an acceptable search direction and set
%% `flg` to 0 if `d` is not acceptable,
%% to 1 if `d` is acceptable with rescaling,
%% or to 2 if `d` is acceptable without rescaling.
flg = 2; %% assume no rescaling needed
dg = optm_inner(d, g);
if ~scaled
%% No exploitable curvature information, `d` is the unscaled
%% steepest feasible direction, that is the opposite of the
%% projected gradient.
flg = 1; % rescaling needed
else
                %% Some exploitable curvature information was available.
if dg >= 0
%% L-BFGS approximation does not yield a descent direction.
flg = 0; % discard search direction
if ~bounded
if throwerrors
error('L-BFGS approximation is not positive definite');
end
status = optm_status('NOT_POSITIVE_DEFINITE');
break
end
elseif epsilon > 0
%% A more restrictive criterion has been specified for
%% accepting a search direction.
if dg > -epsilon*optm_norm2(d)*pgnorm
flg = 0; % discard search direction
end
end
%% Take the steepest feasible descent direction if the search
%% direction given by L-BFGS has been rejected.
if flg == 0
if bounded
d = -pg;
else
d = -g;
end
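                    %% In either case, ⟨d,g⟩ = -‖pg‖² with `pg` the
                    %% (projected) gradient, whose norm is `pgnorm`.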
dg = -pgnorm^2;
flg = 1; % rescaling needed
end
end
if flg ~= 2 && iters > 0
%% L-BFGS search direction has been rejected.
rejects = rejects + 1;
end
%% Determine the length `alpha` of the initial step along `d`.
if flg == 2
%% The search direction needs no rescaling.
alpha = 1.0;
else
%% Find a suitable step size along the steepest feasible
%% descent direction `d`. Note that `pgnorm`, the Euclidean
%% norm of the (projected) gradient, is also that of `d` in
%% that case.
alpha = optm_steepest_descent_step(x, pgnorm, f, ...
f2nd, fmin, dxrel, dxabs);
end
if bounded
                %% Safeguard the step to avoid searching in a region where
                %% all bounds are exceeded.
amax = optm_line_search_step_max(x, lower, upper, 1, d);
alpha = min(alpha, amax);
end
%% Initialize line-search.
lnsrch = optm_start_line_search(lnsrch, f, dg, alpha);
stage = lnsrch.stage;
if stage ~= 1
                error('initialization of line-search failed!');
end
%% Save iterate at start of line-search.
f0 = f;
x0 = x;
if blmvm
pg0 = pg;
else
g0 = g;
end
end
%% Compute next iterate.
if alpha == 1
x = x0 + d;
else
x = x0 + alpha*d;
end
end
    %% In case of abnormal termination, some progress may have been made
%% since the start of the line-search. In that case, we restore the best
%% solution so far.
if best_f < f
f = best_f;
g = best_g;
x = best_x;
if verb > 0
%% Restore other information for printing.
alpha = best_alpha;
if best_pgnorm >= 0
pgnorm = best_pgnorm;
else
%% Re-compute the norm of the (projected) gradient.
if bounded
freevars = optm_unblocked_variables(x, lower, upper, g);
pgnorm = optm_norm2(g .* freevars);
else
pgnorm = optm_norm2(g);
end
end
if f < f0
                %% Some progress since the last iterate; pretend that one more
                %% iteration has been performed.
                iters = iters + 1;
end
end
end
if verb > 0
if iters > last_print
print_iteration(iters, time() - t0, evals, rejects, ...
f, pgnorm, alpha);
last_print = iters;
end
fprintf('# Termination: %s\n', optm_reason(status));
end
end
function print_iteration(iters, t, evals, rejects, f, pgnorm, alpha)
if iters < 1
fprintf('%s%s\n%s%s\n', ...
'# Iter. Time (ms) Eval. Reject.', ...
' Obj. Func. Grad. Step', ...
'# ----------------------------------', ...
'-----------------------------------------------');
end
fprintf('%7d %11.3f %7d %7d %23.15e %11.3e %11.3e\n', ...
iters, t, evals, rejects, f, pgnorm, alpha);
end