Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Workaround for issue #2492 - disable ConvBinWinoRxS when granularity loss is huge #2507

Merged
merged 2 commits into from
Nov 7, 2023
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
77 changes: 54 additions & 23 deletions src/solver/conv_winoRxS.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,22 @@
// we will keep ConvBinWinoRxS<2,3> for group convolutions only.
#define WORKAROUND_ISSUE_1681 0

/// \anchor disable_winograd_with_huge_granularity_loss
/// This is to exclude Winograd from PyTorch correctness tests.
/// These tests are written in incorrect manner.
/// They are very sensitive to the values in the output tensors,
/// but do not provide inputs that guarantee minimal deviation
/// (a numerical difference between kernel output and theoretically
/// "ideal" output). This is especially important for Winograd.
/// As a result, the PyTorch correctness tests often produce false
/// failures when Winograd algorithm is used. As PyTorch correctness
/// tests use very small convolutions and Winograd algorithm is
/// ineffective with such small configs due to huge granularity loss,
/// we can disable Winograd without any performance implications.
#define WORKAROUND_ISSUE_2493 1

MIOPEN_DECLARE_ENV_VAR(MIOPEN_DEBUG_WORKAROUND_ISSUE_2493)

MIOPEN_DECLARE_ENV_VAR(MIOPEN_DEBUG_AMD_WINOGRAD_RXS_F2X3)
MIOPEN_DECLARE_ENV_VAR(MIOPEN_DEBUG_AMD_WINOGRAD_RXS_F2X3_PERF_VALS)
MIOPEN_DECLARE_ENV_VAR(MIOPEN_DEBUG_AMD_WINOGRAD_RXS_F2X3_G1)
Expand All @@ -71,6 +87,7 @@ MIOPEN_DECLARE_ENV_VAR(MIOPEN_DEBUG_AMD_WINOGRAD_RXS_F3X2_PERF_VALS)
/// models for other solvers, OR when GEMM WTI model is improved.
/// --atamazov 2020-11-07.
#define WTI_MODEL_ALLOW_ANY_RS 1
#define WTI_MODEL_ALLOW_ANY_CK 1

static inline size_t Ceil(const size_t v, const size_t m)
{
Expand Down Expand Up @@ -433,6 +450,7 @@ class ShaderModel : public UnifiedDescriptionConv2d
bool is_2x3;

bool out_of_model_scope; // Shader model produces unreliable results.
double granularity_loss;

public:
ShaderModel(const ExecutionContext& ctx,
Expand All @@ -448,19 +466,21 @@ class ShaderModel : public UnifiedDescriptionConv2d
is_2x3{IS2X3},
out_of_model_scope
{
!(problem.GetGroupCount() == 1) || //
!(U == 1) || //
!(V == 1) || //
!(input_stride_h == 1) || //
!(input_stride_w == 1) || //
!(filter_stride_h == 1) || //
!(filter_stride_w == 1) || //
!(problem.GetGroupCount() == 1) //
|| !(U == 1) //
|| !(V == 1) //
|| !(input_stride_h == 1) //
|| !(input_stride_w == 1) //
|| !(filter_stride_h == 1) //
|| !(filter_stride_w == 1) //
#if !WTI_MODEL_ALLOW_ANY_RS
!(R <= 5) || //
!(S <= 5) || //
|| !(R <= 5) //
|| !(S <= 5) //
#endif
#if !WTI_MODEL_ALLOW_ANY_CK
|| !(C >= 16) //
|| !(K >= 16)
#endif
!(C >= 16) || //
!(K >= 16)
}
{
/// \todo add G to UnifiedDescriptionConv2d
Expand Down Expand Up @@ -513,13 +533,6 @@ class ShaderModel : public UnifiedDescriptionConv2d
R_loops = Rg / Tr;
C_loops = Cg / C_factor;

if(G == 1)
{
n_groups = n_CU;
n_works_per_CU = Ceil(n_works, n_CU);
return;
}

const auto NKWH_w = K_factor * NHW_tiles_factor * Toh * Tow;
const auto grid_g = static_cast<double>(NKWH_w * Cg * Rg * Sg) / 1e6;

Expand All @@ -531,15 +544,23 @@ class ShaderModel : public UnifiedDescriptionConv2d
return 1. - dc_macs / macs_g;
};

if(G == 1)
{
n_groups = n_CU;
n_works_per_CU = Ceil(n_works, n_CU);
granularity_loss = compute_granularity_loss(n_groups);
return;
}

n_groups = 1;
double best_loss = compute_granularity_loss(n_groups);
granularity_loss = compute_granularity_loss(n_groups);
for(auto i = n_groups + 1; i < n_CU; ++i)
{
auto loss = compute_granularity_loss(i);
if(loss < best_loss)
if(loss < granularity_loss)
{
n_groups = i;
best_loss = loss;
n_groups = i;
granularity_loss = loss;
}
}

Expand Down Expand Up @@ -606,6 +627,8 @@ class ShaderModel : public UnifiedDescriptionConv2d

return WTI_predicted;
}

double GetGranularityLoss() const { return granularity_loss; }
};

template <int Winodata, int Winofilter>
Expand Down Expand Up @@ -656,7 +679,15 @@ static bool IsApplicableBase(const ExecutionContext& ctx, const ProblemDescripti
&& problem.GetBias() == 0
&& problem.GetInLayout() == "NCHW"))
return false;
// clang-format on
// clang-format on

#if WORKAROUND_ISSUE_2493
if(!miopen::IsDisabled(MIOPEN_DEBUG_WORKAROUND_ISSUE_2493{}))
{
if(ShaderModel(ctx, problem, Winodata, Winofilter).GetGranularityLoss() > 0.995)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Perhaps we can move the constant to a define or a constexpr variable so that the limit is documented ?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Resolved in PR #2510, commit f186faf.

return false;
}
#endif

const auto n_inputs_per_group = problem.GetInChannels_() / problem.GetGroupCount(),
n_outputs_per_group = problem.GetOutChannels_() / problem.GetGroupCount();
Expand Down