Merge pull request #2829 from ROCm/r213-revert-last-two-with-correct-patch

R213 revert last two with correct patch
i-chaochen authored Jan 31, 2025
2 parents cebf33a + f3483b5 commit 924550f
Showing 11 changed files with 30 additions and 51 deletions.
11 changes: 4 additions & 7 deletions tensorflow/compiler/xla/service/gpu/autotuner_util.cc
@@ -60,15 +60,12 @@ void CSVLegend(std::ostream& os) {
 
   os << kCsvComment << " m" << kCsvSep << "n" << kCsvSep << "k" << kCsvSep
      << "batch_count" << kCsvSep << "trans_a" << kCsvSep
-     << "trans_b" << kCsvSep << "type_a" << kCsvSep << "type_b" << kCsvSep
+     << "trans_b" << kCsvSep
+     << "type_a" << kCsvSep << "type_b" << kCsvSep
      << "type_c" << kCsvSep << "lda" << kCsvSep << "ldb" << kCsvSep
      << "ldc" << kCsvSep << "stride_a" << kCsvSep
-     << "stride_b" << kCsvSep << "stride_c";
-  if (full_string) {
-    os << kCsvSep << "alpha_re" << kCsvSep << "alpha_im" << kCsvSep
-       << "beta" << kCsvSep << "epilogue";
-  }
-  os << kCsvSep << "alg_index" << std::endl;
+     << "stride_b" << kCsvSep << "stride_c" << kCsvSep
+     << "alg_index" << std::endl;
 }
 
 }  // namespace
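For reference, a minimal standalone sketch of the legend row that the reverted CSVLegend hunk above produces. The separator and comment values below are assumptions for illustration, not taken from this diff:

// Minimal sketch of the reverted CSV legend, compiled stand-alone.
// kCsvComment and kCsvSep values are assumed; the real constants live elsewhere in the file.
#include <iostream>

int main() {
  const char* kCsvComment = "#";
  const char kCsvSep = ',';
  std::cout << kCsvComment << " m" << kCsvSep << "n" << kCsvSep << "k" << kCsvSep
            << "batch_count" << kCsvSep << "trans_a" << kCsvSep << "trans_b" << kCsvSep
            << "type_a" << kCsvSep << "type_b" << kCsvSep << "type_c" << kCsvSep
            << "lda" << kCsvSep << "ldb" << kCsvSep << "ldc" << kCsvSep
            << "stride_a" << kCsvSep << "stride_b" << kCsvSep << "stride_c" << kCsvSep
            << "alg_index" << std::endl;
  // Prints: # m,n,k,batch_count,trans_a,trans_b,type_a,type_b,type_c,lda,ldb,ldc,stride_a,stride_b,stride_c,alg_index
  return 0;
}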
@@ -173,9 +173,7 @@ auto CublasLtMatmulThunk::GetCachedMatmulPlan(
         return std::move(plan);
       }
     }
-    TF_RETURN_IF_ERROR(plan->SetAlgorithm(algorithms[0]));
-    LOG(WARNING) << "Wrong algorithm ID: " << algorithm_id << " use default instead.";
-    return std::move(plan);
+    return InternalError("Wrong algorithm ID: %d", algorithm_id);
   };
   return cache.GetOrCreate(canonical_hlo_, create);
 }
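The hunk above restores the upstream behavior of GetCachedMatmulPlan: an unknown algorithm ID is now an error rather than a silent fallback to algorithms[0] with a warning. A minimal, self-contained sketch of that pattern in generic C++ (hypothetical names, not the XLA cache API):

// Generic illustration of "error out on an unknown algorithm ID" versus
// "fall back to a default". All names here are hypothetical.
#include <cstdio>
#include <optional>
#include <vector>

struct Algorithm { int id; };

std::optional<Algorithm> PickAlgorithm(const std::vector<Algorithm>& algorithms,
                                       int algorithm_id) {
  for (const Algorithm& a : algorithms) {
    if (a.id == algorithm_id) return a;  // exact match, as in the cached-plan lookup
  }
  // Reverted behavior: report failure instead of quietly returning algorithms[0].
  return std::nullopt;
}

int main() {
  std::vector<Algorithm> algorithms = {{0}, {3}, {7}};
  if (!PickAlgorithm(algorithms, 5)) {
    std::fprintf(stderr, "Wrong algorithm ID: %d\n", 5);  // caller turns this into an error status
  }
  return 0;
}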
2 changes: 1 addition & 1 deletion tensorflow/compiler/xla/service/gpu/ir_emission_utils.h
@@ -49,7 +49,7 @@ inline constexpr int64_t WarpSize() { return 32; }
 
 // Need at least 1024 threads/block for reasonable tree reduction
 // performance (assuming all data fits).
-inline constexpr int64_t MinThreadsXRowReduction() { return 512; }
+inline constexpr int64_t MinThreadsXRowReduction() { return 1024; }
 
 // When doing batched row reduction, how big the batch dimension could be.
 inline constexpr int64_t BatchedReductionRaceFreeBound() { return 8; }
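As a quick worked example of what the restored value implies, the two constants shown in this hunk give the minimum warp count of a row-reduction block. This is an illustration only, not an XLA call site:

// Worked arithmetic for the reverted constant: with a 1024-thread floor and
// the 32-thread WarpSize() declared above, a row-reduction block spans at
// least 1024 / 32 = 32 warps.
#include <cstdint>
#include <cstdio>

inline constexpr int64_t WarpSize() { return 32; }
inline constexpr int64_t MinThreadsXRowReduction() { return 1024; }

int main() {
  const int64_t min_warps_per_block = MinThreadsXRowReduction() / WarpSize();
  std::printf("minimum warps per row-reduction block: %lld\n",
              static_cast<long long>(min_warps_per_block));  // prints 32
  return 0;
}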
23 changes: 11 additions & 12 deletions tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc
@@ -1998,9 +1998,8 @@ Status IrEmitterUnnested::EmitUnnestedTranspose(
   // TODO(cheshire): have a more robust way of checking this.
   CHECK(dims_and_order.has_value());
 
-  constexpr int kNumRows = 8;
-  constexpr int warp_size = 64;
-  CHECK_EQ(warp_size % kNumRows, 0);
+  constexpr int kNumRows = 4;
+  CHECK_EQ(WarpSize() % kNumRows, 0);
 
   // 3D view over the input shape.
   Vector3 dims = dims_and_order->first;
@@ -2009,9 +2008,9 @@ Status IrEmitterUnnested::EmitUnnestedTranspose(
   CHECK_NE(order[2], 2);
   Vector3 permuted_dims = {dims[order[0]], dims[order[1]], dims[order[2]]};
   Vector3 tile_sizes{1, 1, 1};
-  tile_sizes[order[2]] = warp_size / kNumRows;
-  Vector3 num_threads{1, 1, warp_size};
-  num_threads[order[2]] = kNumRows;
+  tile_sizes[order[2]] = WarpSize() / 8;
+  Vector3 num_threads{1, 1, WarpSize()};
+  num_threads[order[2]] = 8;
 
   TilingScheme tiling_scheme(
       /*permuted_dims*/ permuted_dims,
@@ -4364,7 +4363,7 @@ llvm::Value* IrEmitterUnnested::EmitThreadId(int64_t threads_per_block,
 }
 
 StatusOr<IrEmitterUnnested::ThreadIdInfo> IrEmitterUnnested::EmitThreadIdInfo(
-    const TilingScheme& tiling_scheme, llvm::Type* index_ty, const int warp_size) {
+    const TilingScheme& tiling_scheme, llvm::Type* index_ty) {
   auto constant = [&](uint64_t c) -> llvm::Constant* {
     return llvm::ConstantInt::get(index_ty, c);
   };
@@ -4401,23 +4400,23 @@ StatusOr<IrEmitterUnnested::ThreadIdInfo> IrEmitterUnnested::EmitThreadIdInfo(
       /*thread_id_y=*/
       b_.CreateUDiv(thread_id_logical, num_threads_x_v, "thread_id.y"),
       /*lane_id=*/
-      b_.CreateURem(thread_id_logical, constant(warp_size), "lane_id"),
+      b_.CreateURem(thread_id_logical, constant(WarpSize()), "lane_id"),
       /*block_id=*/block_id_logical,
       /*scaling=*/scaling}};
 }
 
 StatusOr<IrEmitterUnnested::TilingKernelInfo>
 IrEmitterUnnested::EmitTilingKernel(
     const TilingScheme& tiling_scheme, llvm::Type* index_ty,
-    const TileElementGenerator& tile_element_generator, const int warp_size) {
+    const TileElementGenerator& tile_element_generator) {
   absl::Span<const int64_t> dims_in_elems = tiling_scheme.GetDimsInElems();
   Vector3 dims_in_blocks = tiling_scheme.GetDimsInBlocks();
   auto constant = [&](uint64_t c) -> llvm::Constant* {
     return llvm::ConstantInt::get(index_ty, c);
   };
 
   TF_ASSIGN_OR_RETURN(ThreadIdInfo thread_id_info,
-                      EmitThreadIdInfo(tiling_scheme, index_ty, warp_size));
+                      EmitThreadIdInfo(tiling_scheme, index_ty));
 
   KernelSupportLibrary ksl(&b_, llvm_ir::UnrollMode::kDefaultUnroll);
 
@@ -4678,7 +4677,7 @@ Status IrEmitterUnnested::EmitTransposeTile(
 
   llvm::Type* index_type = GetIndexTypeForKernel(
       fusion.getOperation(), launch_dimensions.launch_bound(), &b_);
-  return EmitTilingKernel(tiling_scheme, index_type, tile_generator, 64).status();
+  return EmitTilingKernel(tiling_scheme, index_type, tile_generator).status();
 }
 
 namespace {
@@ -5159,7 +5158,7 @@ Status IrEmitterUnnested::EmitIRForReduction(
                    ValueVector2 tile_dimensions) {
                  EmitTile(codegen_state.GetTilingScheme(), index, thread_id_info,
                           tile_dimensions, emit_reduction_element);
-               }, 32));
+               }));
 
   KernelSupportLibrary ksl(&b_);
   for (const HloReduceInstruction* reduce : reductions) {
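The EmitThreadIdInfo and EmitTilingKernel hunks above drop the explicit warp_size parameter and go back to WarpSize(). As a plain-arithmetic sketch of the two emitted IR instructions (CreateUDiv for thread_id.y, CreateURem for lane_id), with example values that are assumptions, not taken from the diff:

// Plain-arithmetic version of the index math emitted above in LLVM IR:
// thread_id.y comes from a udiv by num_threads_x, lane_id from a urem by WarpSize().
#include <cstdint>
#include <cstdio>

inline constexpr int64_t WarpSize() { return 32; }  // value from ir_emission_utils.h above

int main() {
  const int64_t num_threads_x = 256;      // example block width (assumed value)
  const int64_t thread_id_logical = 300;  // example logical thread id
  const int64_t thread_id_y = thread_id_logical / num_threads_x;  // 300 / 256 = 1
  const int64_t lane_id = thread_id_logical % WarpSize();         // 300 % 32 = 12
  std::printf("thread_id.y=%lld lane_id=%lld\n",
              static_cast<long long>(thread_id_y),
              static_cast<long long>(lane_id));
  return 0;
}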
4 changes: 2 additions & 2 deletions tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.h
@@ -587,7 +587,7 @@ class IrEmitterUnnested : public IrEmitter {
   // scheme.
   StatusOr<TilingKernelInfo> EmitTilingKernel(
       const TilingScheme& tiling_scheme, llvm::Type* index_ty,
-      const TileElementGenerator& tile_element_generator, const int warp_size);
+      const TileElementGenerator& tile_element_generator);
 
   // Emits code to iterate through a 2-dimensional tile with a given tile
   // dimensions and given strides, and call the callback at each iteration.,
@@ -846,7 +846,7 @@ class IrEmitterUnnested : public IrEmitter {
   // combination of thread_id/block_id does not correspond to a real block.
   // Assumes the current function returns void.
   StatusOr<ThreadIdInfo> EmitThreadIdInfo(const TilingScheme& tiling_scheme,
-                                          llvm::Type* index_ty, const int warp_size);
+                                          llvm::Type* index_ty);
   // Emit __syncthreads(), synchronization barrier for all threads in a block.
   llvm::CallInst* EmitSyncThreads();
 
4 changes: 2 additions & 2 deletions tensorflow/compiler/xla/service/gpu/launch_dimensions.cc
@@ -48,7 +48,7 @@ static int64_t ThreadsPerBlockLimit(GpuDeviceInfo gpu_device_info) {
     threads_per_block = gpu_device_info.threads_per_warp;
     if (threads_per_block == 0) {
       // Fall back to *something* if we can't even get num threads per warp.
-      threads_per_block = 64;
+      threads_per_block = 32;
     }
   }
   return threads_per_block;
@@ -112,7 +112,7 @@ StatusOr<LaunchDimensions> CalculateLaunchDimensions(
             ? threads_per_block_row_vectorized
             : RoundUpTo(ThreadsPerBlockLimit(gpu_device_info) /
                             dim_config.unroll_factor,
-                        int64_t{64});
+                        int64_t{32});
     if (num_elements < max_threads_per_block_x) {
       return num_elements;
     }
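The launch_dimensions.cc hunks above revert both the warp-size fallback and the rounding granularity from 64 back to 32. A tiny sketch of the round-up arithmetic involved; RoundUpTo here is a local re-implementation for illustration, and the input values are assumed examples, not XLA's utility or data:

// Round-up-to-multiple arithmetic used when sizing max_threads_per_block_x.
#include <cstdint>
#include <cstdio>

int64_t RoundUpTo(int64_t value, int64_t multiple) {
  // Assumes value >= 0 and multiple > 0.
  return ((value + multiple - 1) / multiple) * multiple;
}

int main() {
  const int64_t threads_per_block_limit = 1024;  // assumed ThreadsPerBlockLimit() result
  const int64_t unroll_factor = 4;               // example value
  // After the revert: round 1024 / 4 = 256 up to a multiple of 32 -> 256.
  std::printf("%lld\n",
              static_cast<long long>(RoundUpTo(threads_per_block_limit / unroll_factor,
                                               int64_t{32})));
  return 0;
}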
3 changes: 2 additions & 1 deletion tensorflow/compiler/xla/stream_executor/gpu/gpu_blas_lt.cc
@@ -272,7 +272,8 @@ std::string ToCSVString(const GemmConfig& cfg, bool full_string) {
 
   if (full_string) {
     // NOTE: epilogue is required for MatmulPlan caching !
-    oss << kCsvSep << cfg.alpha.real() << kCsvSep << cfg.alpha.imag() << kCsvSep << cfg.beta << kCsvSep << (int64_t)cfg.epilogue;
+    oss //<< kCsvSep << cfg.alpha << kCsvSep << cfg.beta
+        << kCsvSep << (int64_t)cfg.epilogue;
   }
 
   return oss.str();
4 changes: 2 additions & 2 deletions third_party/gpus/crosstool/BUILD.rocm.tpl
@@ -87,14 +87,14 @@ cc_toolchain_config(
         "-fuse-ld=gold",
         "-Wl,-no-as-needed",
         "-Wl,-z,relro,-z,now",
-        # "-pass-exit-codes",
+        "-pass-exit-codes",
         "-lstdc++",
         "-lm",
     ],
     link_libs = [],
     opt_link_flags = [],
     unfiltered_compile_flags = [
-        # "-fno-canonical-system-headers",
+        "-fno-canonical-system-headers",
         "-Wno-builtin-macro-redefined",
         "-D__DATE__=\"redacted\"",
         "-D__TIMESTAMP__=\"redacted\"",
@@ -75,9 +75,7 @@ def GetHostCompilerOptions(argv):
   parser.add_argument('-iquote', nargs='*', action='append')
   parser.add_argument('--sysroot', nargs=1)
   parser.add_argument('-g', nargs='*', action='append')
-  parser.add_argument('-no-canonical-prefixes', action='store_true')
-  parser.add_argument('-Wno-unused-variable', action='store_true')
-  parser.add_argument('-Wno-unused-but-set-variable', action='store_true')
+  parser.add_argument('-fno-canonical-system-headers', action='store_true')
 
   args, _ = parser.parse_known_args(argv)
 
@@ -89,16 +87,10 @@ def GetHostCompilerOptions(argv):
     opts += ' -iquote ' + ' -iquote '.join(sum(args.iquote, []))
   if args.g:
     opts += ' -g' + ' -g'.join(sum(args.g, []))
-  if args.no_canonical_prefixes:
+  if args.fno_canonical_system_headers:
     opts += ' -no-canonical-prefixes'
   if args.sysroot:
     opts += ' --sysroot ' + args.sysroot[0]
-  if args.Wno_unused_variable:
-    opts += ' -Wno-unused-variable'
-
-  if args.Wno_unused_but_set_variable:
-    opts += ' -Wno-unused-but-set-variable'
-
 
   return opts
 
@@ -290,13 +282,7 @@ def main():
                         if not flag.startswith(('--rocm_log'))]
 
   # XXX: SE codes need to be built with gcc, but need this macro defined
-  cpu_compiler_flags.append("-D__HIP_PLATFORM_AMD__")
-  cpu_compiler_flags.append('-L' + HIP_RUNTIME_PATH)
-  cpu_compiler_flags.append('-Wl,-rpath=' + HIP_RUNTIME_PATH)
-  cpu_compiler_flags.append('-l' + HIP_RUNTIME_LIBRARY)
-  cpu_compiler_flags.append("-lrt")
-  cpu_compiler_flags.append("-Wno-unused-command-line-argument")
-  cpu_compiler_flags.append("-Wno-gnu-offsetof-extensions")
+  cpu_compiler_flags.append("-D__HIP_PLATFORM_HCC__")
   if VERBOSE: print(' '.join([CPU_COMPILER] + cpu_compiler_flags))
   return subprocess.call([CPU_COMPILER] + cpu_compiler_flags)
 
@@ -1046,7 +1046,7 @@ def _impl(ctx):
             flag_group(
                 flags = [
                     "-no-canonical-prefixes",
-                    #"-fno-canonical-system-headers",
+                    "-fno-canonical-system-headers",
                 ]
             ),
         ],
4 changes: 1 addition & 3 deletions third_party/gpus/rocm_configure.bzl
@@ -715,14 +715,12 @@ def _create_local_rocm_repository(repository_ctx):
     # .d file - given that includes that are prefixed with "../" multiple
     # time quickly grow longer than the root of the tree, this can lead to
     # bazel's header check failing.
-    rocm_defines["%{extra_no_canonical_prefixes_flags}"] = ""
+    rocm_defines["%{extra_no_canonical_prefixes_flags}"] = "\"-fno-canonical-system-headers\""
 
     rocm_defines["%{unfiltered_compile_flags}"] = to_list_of_strings([
         "-DTENSORFLOW_USE_ROCM=1",
         "-D__HIP_PLATFORM_AMD__",
         "-DEIGEN_USE_HIP",
-        "-Wno-unused-but-set-variable",
-        "-Wno-c++11-narrowing",
     ])
 
     rocm_defines["%{host_compiler_path}"] = "clang/bin/crosstool_wrapper_driver_is_not_gcc"
