AI Based Parameter Prediction Model for conv_hip_igemm_group_fwd_xdlops Solver (#2523)
Dmantri98 authored Dec 8, 2023
1 parent 3c0dfcb commit 4d5a184
Showing 10 changed files with 342 additions and 171 deletions.
30 changes: 17 additions & 13 deletions src/conv/heuristics/ai_heuristics.cpp
@@ -436,7 +436,8 @@ Metadata::Metadata(const std::string& arch, const std::string& solver)
const nlohmann::json metadata =
common::LoadJSON(GetSystemDbPath() + "/" + arch + "_" + solver + "_metadata.ktn.model");
num_tuning_params = metadata["num_tuning_params"].get<std::size_t>();
tuning_decodings = metadata["decodings"]["tunings"].get<std::unordered_map<std::string, int>>();
tuning_decodings =
metadata["decodings"]["tunings"].get<std::unordered_map<std::string, std::string>>();
}

class Model
@@ -450,9 +451,11 @@ class Model
{
}
virtual ~Model() = default;
fdeep::tensors Encode(const std::vector<float>& features, std::size_t dim) const
fdeep::tensors Encode(const std::vector<float>& features, std::size_t dim, bool transform) const
{
fdeep::tensor input_tensor = fdeep::tensor(fdeep::tensor_shape(dim, dim), features);
const auto tensor_shape_depth = transform ? dim : 1;
fdeep::tensor input_tensor =
fdeep::tensor(fdeep::tensor_shape(dim, tensor_shape_depth), features);
return encoder.predict({input_tensor});
}
fdeep::tensors Decode(const float prev_token, const fdeep::tensors& context) const
@@ -488,10 +491,6 @@ class Model

std::shared_ptr<Model> GetModel(const std::string& arch, const std::string& solver)
{
static const std::string prevArch{arch};

if(prevArch != arch)
MIOPEN_THROW("Cannot use AI tuning models for multiple gpu architectures");
static std::map<std::string, std::shared_ptr<Model>> models;
auto it = models.find(solver);
if(it == models.end())
@@ -509,11 +508,16 @@ std::shared_ptr<Model> GetModel(const std::string& arch, const std::string& solver)
bool ModelSetParams(const std::string& arch,
const std::string& solver,
const std::vector<float>& features,
std::function<bool(int, int)> validator)
bool transform_features,
std::function<bool(std::size_t, std::string)> validator)
{
auto model = GetModel(arch, solver);
int dim = std::sqrt(features.size());
fdeep::tensors context = model->Encode(features, dim);
auto model = GetModel(arch, solver);
int dim = 0;
if(transform_features)
dim = std::sqrt(features.size());
else
dim = features.size();
fdeep::tensors context = model->Encode(features, dim, transform_features);
float decoder_input = 0.0;
for(std::size_t i = 0; i < model->metadata.num_tuning_params; ++i)
{
@@ -529,9 +533,9 @@ bool ModelSetParams(const std::string& arch,
{
int token = pq.top().second;
// convert index to token value
int value = model->metadata.tuning_decodings[std::to_string(token)];
std::string value = model->metadata.tuning_decodings[std::to_string(token)];
pq.pop();
if(value < 0)
if(value == "-1")
return false;
if(validator(i, value))
{
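
The hunks above restructure ModelSetParams around string-valued decodings. As a reading aid, here is a minimal, self-contained sketch (not MIOpen code; all names are illustrative) of the decode-and-validate contract the function now follows. Falling through to the next-best token after a rejected candidate is an assumption based on the priority-queue usage visible above.

#include <cstddef>
#include <functional>
#include <iostream>
#include <queue>
#include <string>
#include <unordered_map>
#include <utility>
#include <vector>

using Decodings = std::unordered_map<std::string, std::string>;

// For each tuning parameter i, try candidate token indices in descending
// score order; decode each index through the metadata table; "-1" aborts;
// the first value the validator accepts is kept.
bool SetParamsFromScores(const std::vector<std::vector<float>>& step_scores,
                         const Decodings& decodings,
                         const std::function<bool(std::size_t, std::string)>& validator)
{
    for(std::size_t i = 0; i < step_scores.size(); ++i)
    {
        std::priority_queue<std::pair<float, int>> pq; // max-heap over (score, token)
        for(int tok = 0; tok < static_cast<int>(step_scores[i].size()); ++tok)
            pq.push(std::make_pair(step_scores[i][tok], tok));

        bool accepted = false;
        while(!pq.empty())
        {
            const int token = pq.top().second;
            pq.pop();
            const std::string value = decodings.at(std::to_string(token));
            if(value == "-1") // sentinel: no usable value for this parameter
                return false;
            if(validator(i, value)) // caller applies the value to its perf-config
            {
                accepted = true;
                break;
            }
        }
        if(!accepted)
            return false;
    }
    return true;
}

int main()
{
    const Decodings decodings{{"0", "-1"}, {"1", "4"}, {"2", "2"}};
    const std::vector<std::vector<float>> scores{{0.1f, 0.2f, 0.7f}}; // one parameter
    const bool ok = SetParamsFromScores(
        scores, decodings, [](std::size_t, std::string v) { return v == "2"; });
    std::cout << (ok ? "params set by model" : "fall back to static heuristic") << '\n';
}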
5 changes: 3 additions & 2 deletions src/include/miopen/conv/heuristics/ai_heuristics.hpp
@@ -81,14 +81,15 @@ namespace tuning {
struct Metadata
{
std::size_t num_tuning_params;
std::unordered_map<std::string, int> tuning_decodings;
std::unordered_map<std::string, std::string> tuning_decodings;
Metadata(const std::string& arch, const std::string& solver);
};

bool ModelSetParams(const std::string& arch,
const std::string& solver,
const std::vector<float>& features,
std::function<bool(int, int)> validator);
bool transform_features,
std::function<bool(std::size_t, std::string)> validator);
} // namespace tuning
#endif // MIOPEN_ENABLE_AI_KERNEL_TUNING
} // namespace ai
24 changes: 17 additions & 7 deletions src/include/miopen/solver.hpp
@@ -370,12 +370,8 @@ struct PerformanceConfigConvAsm1x1U : PerfConfigBase<PerformanceConfigConvAsm1x1U>

void StaticHeuristic(const miopen::conv::ProblemDescription& problem);
void HeuristicInit(const ExecutionContext&, const miopen::conv::ProblemDescription&);
#if MIOPEN_ENABLE_AI_KERNEL_TUNING
void RunParmeterPredictionModel(const ExecutionContext&,
const miopen::conv::ProblemDescription&,
bool& valid);
bool ModelApplyToken(int index, int value, const miopen::conv::ProblemDescription&);
#endif
bool IsModelApplicable(const ExecutionContext& ctx,
const miopen::conv::ProblemDescription& problem) const;
bool IsValidValue() const { return IsValidValueImpl(8); }
bool SetNextValue(const miopen::conv::ProblemDescription&);
bool IsValid(const ExecutionContext&, const miopen::conv::ProblemDescription& problem) const
@@ -399,6 +395,9 @@ struct PerformanceConfigConvAsm1x1U : PerfConfigBase<PerformanceConfigConvAsm1x1U>
{
return IsValidValueImpl(sequence_length);
}
bool RunParameterPredictionModel(const ExecutionContext&,
const miopen::conv::ProblemDescription&);
bool ModelApplyToken(int index, std::string value, const miopen::conv::ProblemDescription&);
#endif
bool IsValidImpl(const miopen::conv::ProblemDescription& problem, int sequence_length) const;
bool IsValidValueImpl(int sequence_length) const;
@@ -4492,7 +4491,7 @@ struct PerformanceConfigHipImplicitGemmGroupFwdXdlops
: PerformanceConfigHipImplicitGemmGroupFwdXdlops(0, "")
{
}
void HeuristicInit(const miopen::conv::ProblemDescription&);
void HeuristicInit(const ExecutionContext&, const miopen::conv::ProblemDescription&);
bool SetNextValue(const miopen::conv::ProblemDescription&);
bool IsValidValue() const;
bool IsValid(const ExecutionContext&, const miopen::conv::ProblemDescription& problem) const
@@ -4501,8 +4500,19 @@ struct PerformanceConfigHipImplicitGemmGroupFwdXdlops
}
bool IsValid(const miopen::conv::ProblemDescription&) const;
bool operator==(const PerformanceConfigHipImplicitGemmGroupFwdXdlops& other) const;
bool IsModelApplicable(const ExecutionContext& ctx,
const miopen::conv::ProblemDescription& problem) const;

private:
#if MIOPEN_ENABLE_AI_KERNEL_TUNING
std::vector<int> heuristic_indexes;
std::vector<std::vector<std::string>> heuristic_kernels;
template <typename DataType>
bool RunParameterPredictionModel(const ExecutionContext& ctx,
const miopen::conv::ProblemDescription& problem);
void InitHeuristicKernelIDs();
bool ModelApplyToken(int idx, std::string value);
#endif
template <typename DataType>
void Init(const miopen::conv::ProblemDescription&);
template <typename DataType>
76 changes: 38 additions & 38 deletions src/kernels/gfx908_ConvAsm1x1U_metadata.ktn.model
@@ -2,44 +2,44 @@
"num_tuning_params": 8,
"decodings": {
"tunings": {
"0": -1,
"1": 4,
"2": 2,
"3": 1,
"4": 3,
"5": 16,
"6": 8,
"7": 1,
"8": 4,
"9": 32,
"10": 4,
"11": 1,
"12": 2,
"13": 5,
"14": 7,
"15": 3,
"16": 6,
"17": 8,
"18": 64,
"19": 16,
"20": 32,
"21": 4,
"22": 1,
"23": 1,
"24": 3,
"25": 2,
"26": 4,
"27": 2,
"28": 4,
"29": 1,
"30": 2,
"31": 1,
"32": 4,
"33": 2,
"34": 4,
"35": 8,
"36": 1,
"37": -1
"0": "-1",
"1": "4",
"2": "2",
"3": "1",
"4": "3",
"5": "16",
"6": "8",
"7": "1",
"8": "4",
"9": "32",
"10": "4",
"11": "1",
"12": "2",
"13": "5",
"14": "7",
"15": "3",
"16": "6",
"17": "8",
"18": "64",
"19": "16",
"20": "32",
"21": "4",
"22": "1",
"23": "1",
"24": "3",
"25": "2",
"26": "4",
"27": "2",
"28": "4",
"29": "1",
"30": "2",
"31": "1",
"32": "4",
"33": "2",
"34": "4",
"35": "8",
"36": "1",
"37": "-1"
}
}
}
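
For illustration, the model's decoded token indices are used as string keys into this table; the mapped string is the tuning value, and "-1" is the invalid/terminal sentinel. A hypothetical stand-alone lookup (values copied from the table above, not MIOpen code):

#include <cassert>
#include <string>
#include <unordered_map>

int main()
{
    const std::unordered_map<std::string, std::string> tunings{
        {"0", "-1"}, {"5", "16"}, {"18", "64"}, {"37", "-1"}};
    assert(tunings.at("5") == "16"); // token 5 decodes to tuning value "16"
    assert(tunings.at("0") == "-1"); // sentinel: abort prediction, use fallback
}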

Large diffs are not rendered by default.

Large diffs are not rendered by default.

36 changes: 36 additions & 0 deletions src/kernels/gfx90a_ConvHipIgemmGroupFwdXdlops_metadata.ktn.model
@@ -0,0 +1,36 @@
{
"num_tuning_params": 9,
"decodings": {
"tunings": {
"0": "-1",
"1": "64",
"2": "256",
"3": "128",
"4": "64",
"5": "128",
"6": "32",
"7": "256",
"8": "32",
"9": "128",
"10": "64",
"11": "256",
"12": "32",
"13": "16",
"14": "Default",
"15": "OddC",
"16": "2",
"17": "1",
"18": "4",
"19": "1",
"20": "2",
"21": "4",
"22": "8",
"23": "1",
"24": "4",
"25": "8",
"26": "1",
"27": "4",
"28": "-1"
}
}
}
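
Unlike the gfx908 table above, this new table mixes numeric entries with nominal ones such as "Default" and "OddC", which is presumably why tuning_decodings was widened from std::unordered_map<std::string, int> to string values throughout this commit.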
65 changes: 31 additions & 34 deletions src/solver/conv_asm_1x1u.cpp
@@ -39,7 +39,6 @@
#include <miopen/logger.hpp>
#include <miopen/solver.hpp>
#include <miopen/conv/heuristics/ai_heuristics.hpp>
#include <nlohmann/json_fwd.hpp>

MIOPEN_DECLARE_ENV_VAR_STR(MIOPEN_DEBUG_CONV_DIRECT_ASM_1X1U_PERF_VALS)
MIOPEN_DECLARE_ENV_VAR_BOOL(MIOPEN_DEBUG_CONV_DIRECT_ASM_1X1U_SEARCH_OPTIMIZED)
@@ -367,39 +366,29 @@ bool PerformanceConfigConvAsm1x1U::IsValidImpl(const ProblemDescription& problem
}
return true;
}
#if MIOPEN_ENABLE_AI_KERNEL_TUNING

#if MIOPEN_ENABLE_AI_KERNEL_TUNING
bool PerformanceConfigConvAsm1x1U::ModelApplyToken(int index,
int value,
std::string value,
const ProblemDescription& problem)
{
int val = stoi(value);
switch(index)
{
case 0: read_size = value; break;
case 1: k_mult = value; break;
case 2: chunks_per_wave = value; break;
case 3: chunk_size = value; break;
case 4: n_mult = value; break;
case 5: c_mult = value; break;
case 6: waves_c_in_group = value; break;
case 7: waves_k_in_group = value; break;
case 0: read_size = val; break;
case 1: k_mult = val; break;
case 2: chunks_per_wave = val; break;
case 3: chunk_size = val; break;
case 4: n_mult = val; break;
case 5: c_mult = val; break;
case 6: waves_c_in_group = val; break;
case 7: waves_k_in_group = val; break;
default: return false;
}
// this function may leave PerformanceConfigConvAsm1x1U in a partially valid or invalid state
return this->IsPartiallyValid(problem, index + 1);
}

static bool IsModelApplicable(const ExecutionContext& ctx, const ProblemDescription& problem)
{
if(!miopen::IsEnabled(ENV(MIOPEN_DEBUG_CONV_DIRECT_ASM_1X1U_AI_HEUR)))
return false;
if(ctx.GetStream().GetDeviceName() != "gfx908")
return false;
if(problem.GetKernelStrideH() != 1)
return false;
return true;
}

static std::vector<float> TransformFeatures(const ProblemDescription& problem, std::size_t n)
{
assert(n == 8); // n = 6 (numerical conv params) * 1 + 1 (nominal conv params) * 2(amount of
Expand All @@ -418,21 +407,21 @@ static std::vector<float> TransformFeatures(const ProblemDescription& problem, s
return features;
}

void PerformanceConfigConvAsm1x1U::RunParmeterPredictionModel(const ExecutionContext& ctx,
const ProblemDescription& problem,
bool& valid)
bool PerformanceConfigConvAsm1x1U::RunParameterPredictionModel(const ExecutionContext& ctx,
const ProblemDescription& problem)
{
static const std::size_t n = 8;
static const std::string& arch = ctx.GetStream().GetDeviceName();
static const std::string solver = "ConvAsm1x1U";
std::vector<float> features = TransformFeatures(problem, n);
if(ai::tuning::ModelSetParams(arch, solver, features, [&](int idx, int value) {
if(ai::tuning::ModelSetParams(arch, solver, features, true, [&](int idx, std::string value) {
return this->ModelApplyToken(idx, value, problem);
}))
{
MIOPEN_LOG_I("Params set by AI: " << ToString());
valid = true;
return true;
}
return false;
}
#endif

@@ -482,22 +471,30 @@ void PerformanceConfigConvAsm1x1U::StaticHeuristic(const ProblemDescription& problem)
}
}

void PerformanceConfigConvAsm1x1U::HeuristicInit(const ExecutionContext& ctx,
bool PerformanceConfigConvAsm1x1U::IsModelApplicable(const ExecutionContext& ctx,
const ProblemDescription& problem) const
{
if(miopen::IsDisabled(ENV(MIOPEN_DEBUG_CONV_DIRECT_ASM_1X1U_AI_HEUR)))
return false;
if(ctx.GetStream().GetDeviceName() != "gfx908")
return false;
if(problem.GetKernelStrideH() != 1)
return false;
return true;
}

void PerformanceConfigConvAsm1x1U::HeuristicInit([[maybe_unused]] const ExecutionContext& ctx,
const ProblemDescription& problem)
{
if(problem.GetInDataType() == miopenDouble)
MIOPEN_THROW("Double data type is not supported by ConvAsm1x1U");

#if MIOPEN_ENABLE_AI_KERNEL_TUNING
if(IsModelApplicable(ctx, problem))
{
bool valid = false;
RunParmeterPredictionModel(ctx, problem, valid);
if(valid)

if(RunParameterPredictionModel(ctx, problem))
return;
}
#else
std::ignore = ctx;
#endif
StaticHeuristic(problem);
MIOPEN_LOG_I(ToString());