From e47cab373b720f4fd12e5600df99d0159f6854d7 Mon Sep 17 00:00:00 2001
From: Alexander Grund
Date: Tue, 8 Aug 2023 15:59:02 +0200
Subject: [PATCH 1/2] fix PyTorch-1.13.1-foss-2022a on POWER

---
 ...fix-fp16-quantization-without-fbgemm.patch | 25 ++++++++++
 ...12.0_fix-EmbeddingBag-without-fbgemm.patch | 48 +++++++++++++++++++
 .../p/PyTorch/PyTorch-1.13.1-foss-2022a.eb    |  6 +++
 3 files changed, 79 insertions(+)
 create mode 100644 easybuild/easyconfigs/p/PyTorch/PyTorch-1.11.0_fix-fp16-quantization-without-fbgemm.patch
 create mode 100644 easybuild/easyconfigs/p/PyTorch/PyTorch-1.12.0_fix-EmbeddingBag-without-fbgemm.patch

diff --git a/easybuild/easyconfigs/p/PyTorch/PyTorch-1.11.0_fix-fp16-quantization-without-fbgemm.patch b/easybuild/easyconfigs/p/PyTorch/PyTorch-1.11.0_fix-fp16-quantization-without-fbgemm.patch
new file mode 100644
index 00000000000..b690fc529d5
--- /dev/null
+++ b/easybuild/easyconfigs/p/PyTorch/PyTorch-1.11.0_fix-fp16-quantization-without-fbgemm.patch
@@ -0,0 +1,25 @@
+Fix use-after free leading to random failures in nn/test_embedding
+on e.g. POWER platforms where FBGEMM isn't used
+
+From https://github.com/pytorch/pytorch/pull/84750
+
+Author: Alexander Grund (TU Dresden)
+
+diff --git a/aten/src/ATen/native/quantized/cpu/qembeddingbag_prepack.cpp b/aten/src/ATen/native/quantized/cpu/qembeddingbag_prepack.cpp
+index 224a66f8abf..f4d018007bf 100644
+--- a/aten/src/ATen/native/quantized/cpu/qembeddingbag_prepack.cpp
++++ b/aten/src/ATen/native/quantized/cpu/qembeddingbag_prepack.cpp
+@@ -252,9 +252,10 @@ Tensor& qembeddingbag_byte_prepack_out(Tensor& output, const Tensor& weight) {
+   }
+ 
+ #else
+-  const auto weight_data = weight_contig->scalar_type() == at::ScalarType::Half
+-      ? weight_contig->to(at::ScalarType::Float).data_ptr<float>()
+-      : weight_contig->data_ptr<float>();
++  const Tensor& float_weight = weight_contig->scalar_type() == at::ScalarType::Half
++      ? weight_contig->to(at::ScalarType::Float)
++      : *weight_contig;
++  const auto weight_data = float_weight.data_ptr<float>();
+   constexpr float kEpsilon = 1e-8f;
+   for (auto row : c10::irange(embedding_rows)) {
+     const float* input_row = weight_data + row * embedding_cols;
diff --git a/easybuild/easyconfigs/p/PyTorch/PyTorch-1.12.0_fix-EmbeddingBag-without-fbgemm.patch b/easybuild/easyconfigs/p/PyTorch/PyTorch-1.12.0_fix-EmbeddingBag-without-fbgemm.patch
new file mode 100644
index 00000000000..644c5d87545
--- /dev/null
+++ b/easybuild/easyconfigs/p/PyTorch/PyTorch-1.12.0_fix-EmbeddingBag-without-fbgemm.patch
@@ -0,0 +1,48 @@
+There is a bug in the fallback path for the case where FBGEMM isn't available (e.g. on POWER)
+which leads to a race condition:
+Data is "copied" for the full buffer while it is processed in chunks by different threads.
+This a) duplicates the work and b) might write incomplete/wrong data to the output.
+
+Found in failing test_embedding_bag_half_cpu_* of nn/test_embedding:
+ERROR: test_embedding_bag_half_cpu_int32_int32 (__main__.TestEmbeddingNNDeviceTypeCPU)
+----------------------------------------------------------------------
+Traceback (most recent call last):
+  File "/dev/shm/s3248973-EasyBuild/PyTorch/1.13.1/foss-2022a/pytorch-v1.13.1/test/nn/test_embedding.py", line 936, in _test_EmbeddingBag_vs_Embedding
+    self.assertEqual(output, ref_output, atol=dtype2prec_DONTUSE[wdtype], rtol=0)
+  File "/tmp/eb-tmp-2022a/lib/python3.10/site-packages/torch/testing/_internal/common_utils.py", line 2470, in assertEqual
+    assert_equal(
+  File "/tmp/eb-tmp-2022a/lib/python3.10/site-packages/torch/testing/_comparison.py", line 1093, in assert_equal
+    raise error_metas[0].to_error(msg)
+AssertionError: Tensor-likes are not close!
+
+Mismatched elements: 1 / 4 (25.0%)
+Greatest absolute difference: 1.18359375 at index (1, 1) (up to 0.01 allowed)
+Greatest relative difference: 1.0 at index (1, 1) (up to 0 allowed)
+
+
+Introduced by https://github.com/pytorch/pytorch/pull/74844
+
+Author: Alexander Grund (TU Dresden)
+
+diff --git a/aten/src/ATen/native/EmbeddingBag.cpp b/aten/src/ATen/native/EmbeddingBag.cpp
+index 6d8cea26f52..604ea16bace 100644
+--- a/aten/src/ATen/native/EmbeddingBag.cpp
++++ b/aten/src/ATen/native/EmbeddingBag.cpp
+@@ -246,7 +246,7 @@ index_select_add(const Tensor &select_indices,
+           /*scale_bias=*/nullptr,
+           /*normalize_by_lengths=*/false,
+           /*out=*/output_data_fp32 + start_idx * ddim);
+-      for (const auto i : c10::irange(output_size)) {
++      for (const auto i : c10::irange(start_idx, end_idx)) {
+         // Convert FP32 intermediate buffer result back to FP16 for output dtype
+         for (const auto d : c10::irange(ddim)) {
+           (output_data + i * ddim)[d] = static_cast<at::Half>((output_data_fp32 + ddim * i)[d]);
+@@ -590,7 +590,7 @@ index_select_scale_add(const Tensor &select_indices,
+           /*scale_bias=*/nullptr,
+           /*normalize_by_lengths=*/false,
+           /*out=*/output_data_fp32 + start_idx * ddim);
+-      for (const auto i : c10::irange(output_size)) {
++      for (const auto i : c10::irange(start_idx, end_idx)) {
+         // Convert FP32 intermediate buffer result back to FP16 for output dtype
+         for (const auto d : c10::irange(ddim)) {
+           (output_data + i * ddim)[d] = static_cast<at::Half>((output_data_fp32 + ddim * i)[d]);
diff --git a/easybuild/easyconfigs/p/PyTorch/PyTorch-1.13.1-foss-2022a.eb b/easybuild/easyconfigs/p/PyTorch/PyTorch-1.13.1-foss-2022a.eb
index 645b2fb8883..25c524c6b73 100644
--- a/easybuild/easyconfigs/p/PyTorch/PyTorch-1.13.1-foss-2022a.eb
+++ b/easybuild/easyconfigs/p/PyTorch/PyTorch-1.13.1-foss-2022a.eb
@@ -12,7 +12,9 @@ sources = ['%(namelower)s-v%(version)s.tar.gz']
 patches = [
     'PyTorch-1.7.0_disable-dev-shm-test.patch',
     'PyTorch-1.10.0_fix-kineto-crash.patch',
+    'PyTorch-1.11.0_fix-fp16-quantization-without-fbgemm.patch',
     'PyTorch-1.11.1_skip-test_init_from_local_shards.patch',
+    'PyTorch-1.12.0_fix-EmbeddingBag-without-fbgemm.patch',
     'PyTorch-1.12.1_add-hypothesis-suppression.patch',
     'PyTorch-1.12.1_fix-skip-decorators.patch',
     'PyTorch-1.12.1_fix-test_cpp_extensions_jit.patch',
@@ -36,8 +38,12 @@ checksums = [
     {'pytorch-v1.13.1.tar.gz': 'dbc229ee9750b02b514937d017744443a269ea0241ed3f32b9af0703589d25d4'},
     {'PyTorch-1.7.0_disable-dev-shm-test.patch': '622cb1eaeadc06e13128a862d9946bcc1f1edd3d02b259c56a9aecc4d5406b8a'},
     {'PyTorch-1.10.0_fix-kineto-crash.patch': 'dc467333b28162149af8f675929d8c6bf219f23230bfc0d39af02ba4f6f882eb'},
+    {'PyTorch-1.11.0_fix-fp16-quantization-without-fbgemm.patch':
+     'cc526130b6446bbbf5f0f7372d3aeee3e7d4c4d6e471524dff028b430b152934'},
     {'PyTorch-1.11.1_skip-test_init_from_local_shards.patch':
      '4aeb1b0bc863d4801b0095cbce69f8794066748f0df27c6aaaf729c5ecba04b7'},
+    {'PyTorch-1.12.0_fix-EmbeddingBag-without-fbgemm.patch':
+     '090598592283e3fc46ee08a68b6a6afe07be41b26514afba51834408bf1c98ed'},
     {'PyTorch-1.12.1_add-hypothesis-suppression.patch':
      'e71ffb94ebe69f580fa70e0de84017058325fdff944866d6bd03463626edc32c'},
     {'PyTorch-1.12.1_fix-skip-decorators.patch': 'e3ca6e42b2fa592ea095939fb59ab875668a058479407db3f3684cc5c6f4146c'},

From 906923b000d1bcf998f7d66bd32f785fdafb9fdd Mon Sep 17 00:00:00 2001
From: Alexander Grund
Date: Tue, 8 Aug 2023 16:18:48 +0200
Subject: [PATCH 2/2] Try to fix test_jit_legacy failing sometimes

---
 .../p/PyTorch/PyTorch-1.13.1-foss-2022a.eb    |  2 ++
 .../PyTorch-1.13.1_fix-flaky-jit-test.patch   | 21 +++++++++++++++++++
 2 files changed, 23 insertions(+)
 create mode 100644 easybuild/easyconfigs/p/PyTorch/PyTorch-1.13.1_fix-flaky-jit-test.patch

diff --git a/easybuild/easyconfigs/p/PyTorch/PyTorch-1.13.1-foss-2022a.eb b/easybuild/easyconfigs/p/PyTorch/PyTorch-1.13.1-foss-2022a.eb
index 25c524c6b73..c2eb02d4435 100644
--- a/easybuild/easyconfigs/p/PyTorch/PyTorch-1.13.1-foss-2022a.eb
+++ b/easybuild/easyconfigs/p/PyTorch/PyTorch-1.13.1-foss-2022a.eb
@@ -24,6 +24,7 @@ patches = [
     'PyTorch-1.12.1_fix-vsx-loadu.patch',
     'PyTorch-1.12.1_fix-vsx-vector-funcs.patch',
     'PyTorch-1.12.1_skip-test_round_robin.patch',
+    'PyTorch-1.13.1_fix-flaky-jit-test.patch',
     'PyTorch-1.13.1_fix-fsdp-fp16-test.patch',
     'PyTorch-1.13.1_fix-pytest-args.patch',
     'PyTorch-1.13.1_fix-test-ops-conf.patch',
@@ -57,6 +58,7 @@
     {'PyTorch-1.12.1_fix-vsx-loadu.patch': '8bfe3c94ada1dd1f7974a1261a8b576fb7ae944050fa1c7830fca033831123b2'},
     {'PyTorch-1.12.1_fix-vsx-vector-funcs.patch': 'caccbf60f62eac313896c1eaec78b08f5d0fdfcb907079087490bb13d1561aa2'},
     {'PyTorch-1.12.1_skip-test_round_robin.patch': '63d4849b78605aa088fdff695637d9473ea60dee603a3ff7f788690d70c55349'},
+    {'PyTorch-1.13.1_fix-flaky-jit-test.patch': '71efdeb29b5e5b4982c9f5cb2182733654a34d52f85bb5487bc4d7d99b86101b'},
     {'PyTorch-1.13.1_fix-fsdp-fp16-test.patch': '8ae68e60d6e1f92f50322b7f0381c7e65251fba32d7606e3a238a36a2f55b5cf'},
     {'PyTorch-1.13.1_fix-pytest-args.patch': 'd3e3c841cf8d73683750f29326f2be56ee0bb5df7ff522baf7d7c3f301a91ec2'},
     {'PyTorch-1.13.1_fix-test-ops-conf.patch': 'df652eec7753864ebebbfeca546929a53e3fb8f24259d5c9b964266a8551198c'},
diff --git a/easybuild/easyconfigs/p/PyTorch/PyTorch-1.13.1_fix-flaky-jit-test.patch b/easybuild/easyconfigs/p/PyTorch/PyTorch-1.13.1_fix-flaky-jit-test.patch
new file mode 100644
index 00000000000..f04de0cc8ed
--- /dev/null
+++ b/easybuild/easyconfigs/p/PyTorch/PyTorch-1.13.1_fix-flaky-jit-test.patch
@@ -0,0 +1,21 @@
+Especially `test_jit_legacy` seems to be flaky.
+https://github.com/pytorch/pytorch/commit/316ba9e6fc9e2c309c1b3785e35393b4a727b918
+makes the JIT tests run serially avoiding potential races.
+So backport that commit.
+
+Author: Alexander Grund (TU Dresden)
+
+diff --git a/test/run_test.py b/test/run_test.py
+index f7c80f3f0a6..9d9b0c553e9 100755
+--- a/test/run_test.py
++++ b/test/run_test.py
+@@ -994,7 +994,8 @@ def must_serial(file: str) -> bool:
+         "distributed" in file or
+         file in CUSTOM_HANDLERS or
+         file in RUN_PARALLEL_BLOCKLIST or
+-        file in CI_SERIAL_LIST
++        file in CI_SERIAL_LIST or
++        file in JIT_EXECUTOR_TESTS
+     )
+ 
+
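
The fp16-quantization patch in the first commit removes a use-after-free: the pointer returned by
weight_contig->to(at::ScalarType::Float).data_ptr<float>() points into a temporary tensor that is
destroyed at the end of that statement. The same lifetime rule can be shown with a minimal
standalone C++ sketch; std::vector and the to_float() helper below are stand-ins for at::Tensor
and Tensor::to(), not PyTorch code.

// Sketch of the lifetime issue: a pointer taken from a temporary vs. a
// temporary bound to a const reference (which extends its lifetime).
#include <vector>

static std::vector<float> to_float(const std::vector<int>& v) {
  // stands in for weight_contig->to(at::ScalarType::Float)
  return std::vector<float>(v.begin(), v.end());
}

int main() {
  const std::vector<int> weights{1, 2, 3};

  // Buggy shape (what the patch removes): the temporary returned by
  // to_float() is destroyed at the end of this statement, so the pointer
  // would dangle before it is ever read:
  //   const float* weight_data = to_float(weights).data();

  // Fixed shape (what the patch introduces as `const Tensor& float_weight`):
  // binding the temporary to a const reference keeps it alive for the whole
  // scope, so the pointer stays valid while it is used.
  const std::vector<float>& float_weights = to_float(weights);
  const float* weight_data = float_weights.data();

  return weight_data[0] == 1.0f ? 0 : 1;
}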
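
The EmbeddingBag patch narrows the FP16 write-back loop from c10::irange(output_size) to
c10::irange(start_idx, end_idx), so each parallel chunk converts only the rows it computed itself.
The standalone sketch below reproduces that pattern; std::thread, uint16_t, rows, dim and
to_half() are illustrative stand-ins for at::parallel_for, at::Half and the real shapes and
conversion, not PyTorch code.

#include <cstdint>
#include <thread>
#include <vector>

// placeholder fp32 -> "fp16" conversion, only for the sketch
static uint16_t to_half(float f) { return static_cast<uint16_t>(f); }

int main() {
  constexpr int64_t rows = 1024, dim = 8, num_threads = 4;
  std::vector<float> fp32_buf(rows * dim);    // shared intermediate buffer
  std::vector<uint16_t> output(rows * dim);   // final low-precision output

  std::vector<std::thread> workers;
  const int64_t chunk = rows / num_threads;
  for (int64_t t = 0; t < num_threads; ++t) {
    const int64_t start_idx = t * chunk;
    const int64_t end_idx = (t + 1 == num_threads) ? rows : start_idx + chunk;
    workers.emplace_back([&, start_idx, end_idx] {
      // Each worker fills only its own rows of the fp32 buffer ...
      for (int64_t i = start_idx; i < end_idx; ++i)
        for (int64_t d = 0; d < dim; ++d)
          fp32_buf[i * dim + d] = static_cast<float>(i + d);
      // ... and converts back only its own rows.  Looping over all `rows`
      // here (the pre-patch behaviour) duplicates work and reads rows that
      // other workers may still be writing.
      for (int64_t i = start_idx; i < end_idx; ++i)
        for (int64_t d = 0; d < dim; ++d)
          output[i * dim + d] = to_half(fp32_buf[i * dim + d]);
    });
  }
  for (auto& w : workers) w.join();
  return output.empty() ? 1 : 0;
}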