Merge pull request #2829 from ROCm/r213-revert-last-two-with-correct-patch

R213 revert last two with correct patch
i-chaochen authored Jan 31, 2025
2 parents cebf33a + f3483b5 commit 924550f
Showing 11 changed files with 30 additions and 51 deletions.
11 changes: 4 additions & 7 deletions tensorflow/compiler/xla/service/gpu/autotuner_util.cc
@@ -60,15 +60,12 @@ void CSVLegend(std::ostream& os) {
 
   os << kCsvComment << " m" << kCsvSep << "n" << kCsvSep << "k" << kCsvSep
      << "batch_count" << kCsvSep << "trans_a" << kCsvSep
-     << "trans_b" << kCsvSep << "type_a" << kCsvSep << "type_b" << kCsvSep
+     << "trans_b" << kCsvSep
+     << "type_a" << kCsvSep << "type_b" << kCsvSep
      << "type_c" << kCsvSep << "lda" << kCsvSep << "ldb" << kCsvSep
      << "ldc" << kCsvSep << "stride_a" << kCsvSep
-     << "stride_b" << kCsvSep << "stride_c";
-  if (full_string) {
-    os << kCsvSep << "alpha_re" << kCsvSep << "alpha_im" << kCsvSep
-       << "beta" << kCsvSep << "epilogue";
-  }
-  os << kCsvSep << "alg_index" << std::endl;
+     << "stride_b" << kCsvSep << "stride_c" << kCsvSep
+     << "alg_index" << std::endl;
 }
 
 }  // namespace
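For reference, a minimal standalone sketch of the legend row that the reverted CSVLegend hunk above produces. The separator and comment values below are assumptions for illustration, not taken from this diff:

// Minimal sketch of the reverted CSV legend, compiled stand-alone.
// kCsvComment and kCsvSep values are assumed; the real constants live elsewhere in the file.
#include <iostream>

int main() {
  const char* kCsvComment = "#";
  const char kCsvSep = ',';
  std::cout << kCsvComment << " m" << kCsvSep << "n" << kCsvSep << "k" << kCsvSep
            << "batch_count" << kCsvSep << "trans_a" << kCsvSep << "trans_b" << kCsvSep
            << "type_a" << kCsvSep << "type_b" << kCsvSep << "type_c" << kCsvSep
            << "lda" << kCsvSep << "ldb" << kCsvSep << "ldc" << kCsvSep
            << "stride_a" << kCsvSep << "stride_b" << kCsvSep << "stride_c" << kCsvSep
            << "alg_index" << std::endl;
  // Prints: # m,n,k,batch_count,trans_a,trans_b,type_a,type_b,type_c,lda,ldb,ldc,stride_a,stride_b,stride_c,alg_index
  return 0;
}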
@@ -173,9 +173,7 @@ auto CublasLtMatmulThunk::GetCachedMatmulPlan(
         return std::move(plan);
       }
     }
-    TF_RETURN_IF_ERROR(plan->SetAlgorithm(algorithms[0]));
-    LOG(WARNING) << "Wrong algorithm ID: " << algorithm_id << " use default instead.";
-    return std::move(plan);
+    return InternalError("Wrong algorithm ID: %d", algorithm_id);
   };
   return cache.GetOrCreate(canonical_hlo_, create);
 }
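The hunk above restores the upstream behavior of GetCachedMatmulPlan: an unknown algorithm ID is now an error rather than a silent fallback to algorithms[0] with a warning. A minimal, self-contained sketch of that pattern in generic C++ (hypothetical names, not the XLA cache API):

// Generic illustration of "error out on an unknown algorithm ID" versus
// "fall back to a default". All names here are hypothetical.
#include <cstdio>
#include <optional>
#include <vector>

struct Algorithm { int id; };

std::optional<Algorithm> PickAlgorithm(const std::vector<Algorithm>& algorithms,
                                       int algorithm_id) {
  for (const Algorithm& a : algorithms) {
    if (a.id == algorithm_id) return a;  // exact match, as in the cached-plan lookup
  }
  // Reverted behavior: report failure instead of quietly returning algorithms[0].
  return std::nullopt;
}

int main() {
  std::vector<Algorithm> algorithms = {{0}, {3}, {7}};
  if (!PickAlgorithm(algorithms, 5)) {
    std::fprintf(stderr, "Wrong algorithm ID: %d\n", 5);  // caller turns this into an error status
  }
  return 0;
}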
2 changes: 1 addition & 1 deletion tensorflow/compiler/xla/service/gpu/ir_emission_utils.h
@@ -49,7 +49,7 @@ inline constexpr int64_t WarpSize() { return 32; }
 
 // Need at least 1024 threads/block for reasonable tree reduction
 // performance (assuming all data fits).
-inline constexpr int64_t MinThreadsXRowReduction() { return 512; }
+inline constexpr int64_t MinThreadsXRowReduction() { return 1024; }
 
 // When doing batched row reduction, how big the batch dimension could be.
 inline constexpr int64_t BatchedReductionRaceFreeBound() { return 8; }
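As a quick worked example of what the restored value implies, the two constants shown in this hunk give the minimum warp count of a row-reduction block. This is an illustration only, not an XLA call site:

// Worked arithmetic for the reverted constant: with a 1024-thread floor and
// the 32-thread WarpSize() declared above, a row-reduction block spans at
// least 1024 / 32 = 32 warps.
#include <cstdint>
#include <cstdio>

inline constexpr int64_t WarpSize() { return 32; }
inline constexpr int64_t MinThreadsXRowReduction() { return 1024; }

int main() {
  const int64_t min_warps_per_block = MinThreadsXRowReduction() / WarpSize();
  std::printf("minimum warps per row-reduction block: %lld\n",
              static_cast<long long>(min_warps_per_block));  // prints 32
  return 0;
}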
23 changes: 11 additions & 12 deletions tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc
@@ -1998,9 +1998,8 @@ Status IrEmitterUnnested::EmitUnnestedTranspose(
   // TODO(cheshire): have a more robust way of checking this.
   CHECK(dims_and_order.has_value());
 
-  constexpr int kNumRows = 8;
-  constexpr int warp_size = 64;
-  CHECK_EQ(warp_size % kNumRows, 0);
+  constexpr int kNumRows = 4;
+  CHECK_EQ(WarpSize() % kNumRows, 0);
 
   // 3D view over the input shape.
   Vector3 dims = dims_and_order->first;
@@ -2009,9 +2008,9 @@ Status IrEmitterUnnested::EmitUnnestedTranspose(
   CHECK_NE(order[2], 2);
   Vector3 permuted_dims = {dims[order[0]], dims[order[1]], dims[order[2]]};
   Vector3 tile_sizes{1, 1, 1};
-  tile_sizes[order[2]] = warp_size / kNumRows;
-  Vector3 num_threads{1, 1, warp_size};
-  num_threads[order[2]] = kNumRows;
+  tile_sizes[order[2]] = WarpSize() / 8;
+  Vector3 num_threads{1, 1, WarpSize()};
+  num_threads[order[2]] = 8;
 
   TilingScheme tiling_scheme(
       /*permuted_dims*/ permuted_dims,
@@ -4364,7 +4363,7 @@ llvm::Value* IrEmitterUnnested::EmitThreadId(int64_t threads_per_block,
 }
 
 StatusOr<IrEmitterUnnested::ThreadIdInfo> IrEmitterUnnested::EmitThreadIdInfo(
-    const TilingScheme& tiling_scheme, llvm::Type* index_ty, const int warp_size) {
+    const TilingScheme& tiling_scheme, llvm::Type* index_ty) {
   auto constant = [&](uint64_t c) -> llvm::Constant* {
     return llvm::ConstantInt::get(index_ty, c);
   };
@@ -4401,23 +4400,23 @@ StatusOr<IrEmitterUnnested::ThreadIdInfo> IrEmitterUnnested::EmitThreadIdInfo(
       /*thread_id_y=*/
       b_.CreateUDiv(thread_id_logical, num_threads_x_v, "thread_id.y"),
       /*lane_id=*/
-      b_.CreateURem(thread_id_logical, constant(warp_size), "lane_id"),
+      b_.CreateURem(thread_id_logical, constant(WarpSize()), "lane_id"),
       /*block_id=*/block_id_logical,
       /*scaling=*/scaling}};
 }
 
 StatusOr<IrEmitterUnnested::TilingKernelInfo>
 IrEmitterUnnested::EmitTilingKernel(
     const TilingScheme& tiling_scheme, llvm::Type* index_ty,
-    const TileElementGenerator& tile_element_generator, const int warp_size) {
+    const TileElementGenerator& tile_element_generator) {
   absl::Span<const int64_t> dims_in_elems = tiling_scheme.GetDimsInElems();
   Vector3 dims_in_blocks = tiling_scheme.GetDimsInBlocks();
   auto constant = [&](uint64_t c) -> llvm::Constant* {
     return llvm::ConstantInt::get(index_ty, c);
   };
 
   TF_ASSIGN_OR_RETURN(ThreadIdInfo thread_id_info,
-                      EmitThreadIdInfo(tiling_scheme, index_ty, warp_size));
+                      EmitThreadIdInfo(tiling_scheme, index_ty));
 
   KernelSupportLibrary ksl(&b_, llvm_ir::UnrollMode::kDefaultUnroll);
 
@@ -4678,7 +4677,7 @@ Status IrEmitterUnnested::EmitTransposeTile(
 
   llvm::Type* index_type = GetIndexTypeForKernel(
       fusion.getOperation(), launch_dimensions.launch_bound(), &b_);
-  return EmitTilingKernel(tiling_scheme, index_type, tile_generator, 64).status();
+  return EmitTilingKernel(tiling_scheme, index_type, tile_generator).status();
 }
 
 namespace {
@@ -5159,7 +5158,7 @@ Status IrEmitterUnnested::EmitIRForReduction(
                    ValueVector2 tile_dimensions) {
                  EmitTile(codegen_state.GetTilingScheme(), index, thread_id_info,
                           tile_dimensions, emit_reduction_element);
-               }, 32));
+               }));
 
   KernelSupportLibrary ksl(&b_);
   for (const HloReduceInstruction* reduce : reductions) {
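The EmitThreadIdInfo and EmitTilingKernel hunks above drop the explicit warp_size parameter and go back to WarpSize(). As a plain-arithmetic sketch of the two emitted IR instructions (CreateUDiv for thread_id.y, CreateURem for lane_id), with example values that are assumptions, not taken from the diff:

// Plain-arithmetic version of the index math emitted above in LLVM IR:
// thread_id.y comes from a udiv by num_threads_x, lane_id from a urem by WarpSize().
#include <cstdint>
#include <cstdio>

inline constexpr int64_t WarpSize() { return 32; }  // value from ir_emission_utils.h above

int main() {
  const int64_t num_threads_x = 256;      // example block width (assumed value)
  const int64_t thread_id_logical = 300;  // example logical thread id
  const int64_t thread_id_y = thread_id_logical / num_threads_x;  // 300 / 256 = 1
  const int64_t lane_id = thread_id_logical % WarpSize();         // 300 % 32 = 12
  std::printf("thread_id.y=%lld lane_id=%lld\n",
              static_cast<long long>(thread_id_y),
              static_cast<long long>(lane_id));
  return 0;
}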
4 changes: 2 additions & 2 deletions tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.h
@@ -587,7 +587,7 @@ class IrEmitterUnnested : public IrEmitter {
   // scheme.
   StatusOr<TilingKernelInfo> EmitTilingKernel(
       const TilingScheme& tiling_scheme, llvm::Type* index_ty,
-      const TileElementGenerator& tile_element_generator, const int warp_size);
+      const TileElementGenerator& tile_element_generator);
 
   // Emits code to iterate through a 2-dimensional tile with a given tile
   // dimensions and given strides, and call the callback at each iteration.,
@@ -846,7 +846,7 @@ class IrEmitterUnnested : public IrEmitter {
   // combination of thread_id/block_id does not correspond to a real block.
   // Assumes the current function returns void.
   StatusOr<ThreadIdInfo> EmitThreadIdInfo(const TilingScheme& tiling_scheme,
-                                          llvm::Type* index_ty, const int warp_size);
+                                          llvm::Type* index_ty);
   // Emit __syncthreads(), synchronization barrier for all threads in a block.
   llvm::CallInst* EmitSyncThreads();
 
4 changes: 2 additions & 2 deletions tensorflow/compiler/xla/service/gpu/launch_dimensions.cc
@@ -48,7 +48,7 @@ static int64_t ThreadsPerBlockLimit(GpuDeviceInfo gpu_device_info) {
     threads_per_block = gpu_device_info.threads_per_warp;
     if (threads_per_block == 0) {
       // Fall back to *something* if we can't even get num threads per warp.
-      threads_per_block = 64;
+      threads_per_block = 32;
     }
   }
   return threads_per_block;
@@ -112,7 +112,7 @@ StatusOr<LaunchDimensions> CalculateLaunchDimensions(
             ? threads_per_block_row_vectorized
             : RoundUpTo(ThreadsPerBlockLimit(gpu_device_info) /
                             dim_config.unroll_factor,
-                        int64_t{64});
+                        int64_t{32});
     if (num_elements < max_threads_per_block_x) {
       return num_elements;
     }
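The launch_dimensions.cc hunks above revert both the warp-size fallback and the rounding granularity from 64 back to 32. A tiny sketch of the round-up arithmetic involved; RoundUpTo here is a local re-implementation for illustration, and the input values are assumed examples, not XLA's utility or data:

// Round-up-to-multiple arithmetic used when sizing max_threads_per_block_x.
#include <cstdint>
#include <cstdio>

int64_t RoundUpTo(int64_t value, int64_t multiple) {
  // Assumes value >= 0 and multiple > 0.
  return ((value + multiple - 1) / multiple) * multiple;
}

int main() {
  const int64_t threads_per_block_limit = 1024;  // assumed ThreadsPerBlockLimit() result
  const int64_t unroll_factor = 4;               // example value
  // After the revert: round 1024 / 4 = 256 up to a multiple of 32 -> 256.
  std::printf("%lld\n",
              static_cast<long long>(RoundUpTo(threads_per_block_limit / unroll_factor,
                                               int64_t{32})));
  return 0;
}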
3 changes: 2 additions & 1 deletion tensorflow/compiler/xla/stream_executor/gpu/gpu_blas_lt.cc
@@ -272,7 +272,8 @@ std::string ToCSVString(const GemmConfig& cfg, bool full_string) {
 
   if (full_string) {
     // NOTE: epilogue is required for MatmulPlan caching !
-    oss << kCsvSep << cfg.alpha.real() << kCsvSep << cfg.alpha.imag() << kCsvSep << cfg.beta << kCsvSep << (int64_t)cfg.epilogue;
+    oss //<< kCsvSep << cfg.alpha << kCsvSep << cfg.beta
+        << kCsvSep << (int64_t)cfg.epilogue;
   }
 
   return oss.str();
4 changes: 2 additions & 2 deletions third_party/gpus/crosstool/BUILD.rocm.tpl
@@ -87,14 +87,14 @@ cc_toolchain_config(
         "-fuse-ld=gold",
         "-Wl,-no-as-needed",
         "-Wl,-z,relro,-z,now",
-        # "-pass-exit-codes",
+        "-pass-exit-codes",
         "-lstdc++",
         "-lm",
     ],
     link_libs = [],
     opt_link_flags = [],
     unfiltered_compile_flags = [
-        # "-fno-canonical-system-headers",
+        "-fno-canonical-system-headers",
         "-Wno-builtin-macro-redefined",
         "-D__DATE__=\"redacted\"",
         "-D__TIMESTAMP__=\"redacted\"",
@@ -75,9 +75,7 @@ def GetHostCompilerOptions(argv):
   parser.add_argument('-iquote', nargs='*', action='append')
   parser.add_argument('--sysroot', nargs=1)
   parser.add_argument('-g', nargs='*', action='append')
-  parser.add_argument('-no-canonical-prefixes', action='store_true')
-  parser.add_argument('-Wno-unused-variable', action='store_true')
-  parser.add_argument('-Wno-unused-but-set-variable', action='store_true')
+  parser.add_argument('-fno-canonical-system-headers', action='store_true')
 
   args, _ = parser.parse_known_args(argv)
 
@@ -89,16 +87,10 @@ def GetHostCompilerOptions(argv):
     opts += ' -iquote ' + ' -iquote '.join(sum(args.iquote, []))
   if args.g:
     opts += ' -g' + ' -g'.join(sum(args.g, []))
-  if args.no_canonical_prefixes:
+  if args.fno_canonical_system_headers:
     opts += ' -no-canonical-prefixes'
   if args.sysroot:
     opts += ' --sysroot ' + args.sysroot[0]
-  if args.Wno_unused_variable:
-    opts += ' -Wno-unused-variable'
-
-  if args.Wno_unused_but_set_variable:
-    opts += ' -Wno-unused-but-set-variable'
-
 
   return opts
 
@@ -290,13 +282,7 @@ def main():
                         if not flag.startswith(('--rocm_log'))]
 
   # XXX: SE codes need to be built with gcc, but need this macro defined
-  cpu_compiler_flags.append("-D__HIP_PLATFORM_AMD__")
-  cpu_compiler_flags.append('-L' + HIP_RUNTIME_PATH)
-  cpu_compiler_flags.append('-Wl,-rpath=' + HIP_RUNTIME_PATH)
-  cpu_compiler_flags.append('-l' + HIP_RUNTIME_LIBRARY)
-  cpu_compiler_flags.append("-lrt")
-  cpu_compiler_flags.append("-Wno-unused-command-line-argument")
-  cpu_compiler_flags.append("-Wno-gnu-offsetof-extensions")
+  cpu_compiler_flags.append("-D__HIP_PLATFORM_HCC__")
   if VERBOSE: print(' '.join([CPU_COMPILER] + cpu_compiler_flags))
   return subprocess.call([CPU_COMPILER] + cpu_compiler_flags)
 
@@ -1046,7 +1046,7 @@ def _impl(ctx):
             flag_group(
                 flags = [
                     "-no-canonical-prefixes",
-                    #"-fno-canonical-system-headers",
+                    "-fno-canonical-system-headers",
                 ]
             ),
         ],
4 changes: 1 addition & 3 deletions third_party/gpus/rocm_configure.bzl
@@ -715,14 +715,12 @@ def _create_local_rocm_repository(repository_ctx):
     # .d file - given that includes that are prefixed with "../" multiple
     # time quickly grow longer than the root of the tree, this can lead to
     # bazel's header check failing.
-    rocm_defines["%{extra_no_canonical_prefixes_flags}"] = ""
+    rocm_defines["%{extra_no_canonical_prefixes_flags}"] = "\"-fno-canonical-system-headers\""
 
     rocm_defines["%{unfiltered_compile_flags}"] = to_list_of_strings([
         "-DTENSORFLOW_USE_ROCM=1",
         "-D__HIP_PLATFORM_AMD__",
         "-DEIGEN_USE_HIP",
-        "-Wno-unused-but-set-variable",
-        "-Wno-c++11-narrowing",
     ])
 
     rocm_defines["%{host_compiler_path}"] = "clang/bin/crosstool_wrapper_driver_is_not_gcc"
