
Commit
make cpp impl for adabelief match whatever paper says
khaotik committed Mar 13, 2021
1 parent 30a1794 commit c4141d7
Showing 2 changed files with 12 additions and 11 deletions.
17 changes: 9 additions & 8 deletions src/operator/contrib/adabelief-inl.h
@@ -107,16 +107,17 @@ struct MPAdaBeliefKernel {
                                   const float param_rescale_grad, const float param_epsilon) {
     float w = weight32[i];
     float scaled_grad = param_rescale_grad*static_cast<float>(grad_data[i]);
-    if (param_clip_gradient >= 0.0f)
+    if (param_clip_gradient >= 0.f)
       scaled_grad = mshadow_op::clip::Map(scaled_grad, param_clip_gradient);
     scaled_grad += param_wd * weight_data[i];

-    float mean = mean_data[i] = param_beta1 * mean_data[i] + (1.0f - param_beta1) * scaled_grad;
-    float var = var_data[i] = param_beta2 * var_data[i] +
-                (1.0f - param_beta2) * mshadow_op::square::Map(mean - scaled_grad);
+    const float mean = param_beta1 * (mean_data[i] - scaled_grad) + scaled_grad;
+    const float adj = mshadow_op::square::Map(mean - scaled_grad);
+    const float var = param_beta2 * var_data[i] + (1.f - param_beta2) * adj + param_epsilon;

-    w = w - param_eta * (param_lr * mean / (mshadow_op::square_root::Map(var) + param_epsilon)
-                         + param_wd * w);
+    mean_data[i] = mean;
+    var_data[i] = var;
+    w = w - param_eta * (param_lr * mean / (mshadow_op::square_root::Map(var) + param_epsilon));
     weight32[i] = w;
     KERNEL_ASSIGN(out_data[i], req, w);
   }
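For context, the per-element update that the patched kernel performs can be written as a small standalone routine. The sketch below is illustrative only: the names AdaBeliefState and adabelief_step are hypothetical, not from the repository, and it assumes plain float32 throughout. It mirrors the arithmetic above, with epsilon accumulated into the variance term (as the AdaBelief paper prescribes) in addition to the epsilon added after the square root.

#include <algorithm>
#include <cmath>

// Illustrative stand-in for one element of the patched MPAdaBeliefKernel.
struct AdaBeliefState {
  float mean = 0.f;  // m_t, first-moment estimate
  float var = 0.f;   // s_t, second-moment ("belief") estimate
};

// One AdaBelief step for a single float32 weight. Hyperparameter names follow
// the kernel arguments; the helper itself is hypothetical.
inline float adabelief_step(float w, float grad, AdaBeliefState &st,
                            float lr, float eta, float beta1, float beta2,
                            float wd, float epsilon, float clip_gradient,
                            float rescale_grad) {
  float g = rescale_grad * grad;
  if (clip_gradient >= 0.f)
    g = std::max(-clip_gradient, std::min(g, clip_gradient));
  g += wd * w;  // weight decay folded into the gradient, as in the kernel

  // beta1 * (m - g) + g  ==  beta1 * m + (1 - beta1) * g
  st.mean = beta1 * (st.mean - g) + g;
  const float adj = (st.mean - g) * (st.mean - g);
  // After this commit, epsilon is also added inside the variance accumulator,
  // matching s_t in the AdaBelief paper.
  st.var = beta2 * st.var + (1.f - beta2) * adj + epsilon;

  return w - eta * (lr * st.mean / (std::sqrt(st.var) + epsilon));
}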
@@ -359,9 +360,9 @@ struct MultiMPAdaBeliefKernel {
       MPDType scaled_grad = static_cast<MPDType>(rescale_grad)*
                             static_cast<MPDType>(param.grad_data[index][i]);

-      if (param.clip_gradient >= 0.0f)
+      if (param.clip_gradient >= 0.f)
         scaled_grad = mshadow_op::clip::Map(scaled_grad, param.clip_gradient) ;
-      scaled_grad += param.wds[index]*w;
+      scaled_grad += param.wds[index] * w;

       const auto mean = param.beta1 * (param.mean_data[index][i]- scaled_grad) + scaled_grad;
       const auto adj = mshadow_op::square::Map(mean - scaled_grad);
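The multi-tensor kernel above applies the same arithmetic per tensor, but reads gradients and weights in their storage precision, accumulates in float32 master weights (weight32), and writes the result back in both precisions. The following rough, self-contained sketch shows that mixed-precision pattern; the half16 wrapper, the TensorSlot fields, and the function name are illustrative stand-ins, not MXNet types, and the variance line mirrors the single-tensor kernel rather than the hidden part of this hunk.

#include <cmath>
#include <cstddef>
#include <vector>

// Stand-in for a 16-bit storage type so the sketch compiles on its own;
// the real kernel works with types such as mshadow::half::half_t.
struct half16 {
  float v = 0.f;
  explicit half16(float x = 0.f) : v(x) {}
  explicit operator float() const { return v; }
};

// One parameter tensor: storage-precision data plus float32 master state.
struct TensorSlot {
  std::vector<half16> weight, grad;  // as stored by the model
  std::vector<float> weight32;       // float32 master weights
  std::vector<float> mean, var;      // per-element m_t and s_t
  float lr, wd;                      // per-tensor hyperparameters
};

void multi_mp_adabelief(std::vector<TensorSlot> &slots, float eta,
                        float beta1, float beta2, float epsilon,
                        float clip_gradient, float rescale_grad) {
  for (auto &t : slots) {
    for (std::size_t i = 0; i < t.weight32.size(); ++i) {
      float w = t.weight32[i];
      float g = rescale_grad * static_cast<float>(t.grad[i]);
      if (clip_gradient >= 0.f)
        g = std::fmax(-clip_gradient, std::fmin(g, clip_gradient));
      g += t.wd * w;

      const float mean = beta1 * (t.mean[i] - g) + g;
      const float adj = (mean - g) * (mean - g);
      const float var = beta2 * t.var[i] + (1.f - beta2) * adj + epsilon;
      t.mean[i] = mean;
      t.var[i] = var;

      w -= eta * (t.lr * mean / (std::sqrt(var) + epsilon));
      t.weight32[i] = w;        // keep the float32 master copy
      t.weight[i] = half16(w);  // write back in storage precision
    }
  }
}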
6 changes: 3 additions & 3 deletions src/operator/contrib/adabelief.cc
@@ -44,7 +44,7 @@ are 1st and 2nd order moment estimates (mean and variance).

  g_t = \nabla J(W_{t-1}) + w * wd \\
  m_t = \beta_1 m_{t-1} + (1 - \beta_1) g_t\\
- s_t = \beta_2 v_{t-1} + (1 - \beta_2) (g_t - m_t)^2\\
+ s_t = \beta_2 v_{t-1} + (1 - \beta_2) (g_t - m_t)^2 + \epsilon\\
  W_t = W_{t-1} - \eta_t (\alpha \frac{ m_t }{ \sqrt{ v_t } + \epsilon })

 It updates the weights using::
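For reference, the update rule the revised docstring describes, written with a single symbol s_t for the second-moment estimate (the docstring itself mixes s_t and v_t), is roughly:

g_t = \nabla J(W_{t-1}) + wd \cdot W_{t-1} \\
m_t = \beta_1 m_{t-1} + (1 - \beta_1)\, g_t \\
s_t = \beta_2 s_{t-1} + (1 - \beta_2)\, (g_t - m_t)^2 + \epsilon \\
W_t = W_{t-1} - \eta_t\, \alpha\, \frac{m_t}{\sqrt{s_t} + \epsilon}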
@@ -89,7 +89,7 @@ are 1st and 2nd order moment estimates (mean and variance).

  g_t = \nabla J(W_{t-1}) + w * wd \\
  m_t = \beta_1 m_{t-1} + (1 - \beta_1) g_t\\
- s_t = \beta_2 v_{t-1} + (1 - \beta_2) (g_t - m_t)^2\\
+ s_t = \beta_2 v_{t-1} + (1 - \beta_2) (g_t - m_t)^2 + \epsilon\\
  W_t = W_{t-1} - \eta_t (\alpha \frac{ m_t }{ \sqrt{ v_t } + \epsilon })

 It updates the weights using::
@@ -213,7 +213,7 @@ are 1st and 2nd order moment estimates (mean and variance).

  g_t = \nabla J(W_{t-1}) + w * wd \\
  m_t = \beta_1 m_{t-1} + (1 - \beta_1) g_t\\
- s_t = \beta_2 v_{t-1} + (1 - \beta_2) (g_t - m_t)^2\\
+ s_t = \beta_2 v_{t-1} + (1 - \beta_2) (g_t - m_t)^2 + \epsilon\\
  W_t = W_{t-1} - \eta_t (\alpha \frac{ m_t }{ \sqrt{ v_t } + \epsilon })

 It updates the weights using::
