[TF] Update TF v2.16.1 (without libfft) #9388

Closed · wants to merge 22 commits
3 changes: 2 additions & 1 deletion abseil-cpp.spec
@@ -1,8 +1,9 @@
### RPM external abseil-cpp 20230125.3
### RPM external abseil-cpp 20230802.2
## INCLUDE cpp-standard

Source: https://github.com/abseil/abseil-cpp/archive/%{realversion}.tar.gz
Source2: https://patch-diff.githubusercontent.com/raw/abseil/abseil-cpp/pull/1732.diff

BuildRequires: cmake gmake

%prep
10 changes: 5 additions & 5 deletions bazel-absl.patch
@@ -8,20 +8,20 @@ index b896f1b..0646f73 100755
"com_google_absl": {
- "archive": "20211102.0.tar.gz",
- "sha256": "dcf71b9cba8dc0ca9940c4b316a0c796be8fab42b070bb6b7cab62b48f0e66c4",
+ "archive": "20230125.3.tar.gz",
+ "sha256": "5366d7e7fa7ba0d915014d387b66d0d002c03236448e1ba9ef98122c13b35c36",
+ "archive": "20230802.2.tar.gz",
+ "sha256": "7c11539617af1f332f0854a6fb21e296a1b29c27d03f23c7b49d4adefcd102cc",
"urls": [
- "https://mirror.bazel.build/github.com/abseil/abseil-cpp/archive/refs/tags/20211102.0.tar.gz",
- "https://github.com/abseil/abseil-cpp/archive/refs/tags/20211102.0.tar.gz",
+ "https://mirror.bazel.build/github.com/abseil/abseil-cpp/archive/refs/tags/20230125.3.tar.gz",
+ "https://github.com/abseil/abseil-cpp/archive/refs/tags/20230125.3.tar.gz",
+ "https://mirror.bazel.build/github.com/abseil/abseil-cpp/archive/refs/tags/20230802.2.tar.gz",
+ "https://github.com/abseil/abseil-cpp/archive/refs/tags/20230802.2.tar.gz",
],
"used_in": [
"additional_distfiles",
"test_WORKSPACE_files",
],
- "strip_prefix": "abseil-cpp-20211102.0",
+ "strip_prefix": "abseil-cpp-20230125.3",
+ "strip_prefix": "abseil-cpp-20230802.2",
},
"zstd-jni": {
"archive": "v1.5.2-3.zip",
3 changes: 0 additions & 3 deletions pip/blosc2.file
@@ -1,5 +1,2 @@
Requires: py3-scikit-build py3-cython py3-numpy py3-msgpack py3-ndindex py3-py-cpuinfo py3-rich
BuildRequires: cmake ninja

%define PipPostPost \
%{relocateConfig}lib64/pkgconfig/blosc2.pc
2 changes: 2 additions & 0 deletions pip/keras.file
@@ -1,3 +1,5 @@
## INITENV SET KERAS_BACKEND tensorflow
%define PipDownloadSourceType none
Requires: py3-PyYAML py3-six
Requires: py3-h5py py3-keras-applications py3-keras-preprocessing
Requires: py3-absl-py py3-ml_dtypes py3-namex py3-optree py3-rich
5 changes: 2 additions & 3 deletions pip/numpy.file
@@ -1,3 +1,4 @@
BuildRequires: py3-meson-python
Requires: py3-cython
Requires: zlib OpenBLAS

@@ -28,8 +29,6 @@ EOF
ln -s ${numpy_core} %{i}/c-api/core

%define PipPostPost \
%{relocateConfig}lib/python*/site-packages/numpy/__config__.py \
%{relocateConfig}lib/python*/site-packages/numpy/distutils/__config__.py \
%{relocateConfig}lib/python*/site-packages/numpy/distutils/site.cfg
%{relocateConfig}lib/python*/site-packages/numpy/__config__.py

%define PipPostBuildPy rm -f %{i}/bin/f2py
1 change: 1 addition & 0 deletions pip/optree.file
@@ -0,0 +1 @@
Requires: py3-typing-extensions
22 changes: 11 additions & 11 deletions pip/requirements.txt
@@ -45,7 +45,7 @@ beautifulsoup4==4.12.3
beniget==0.4.1
bleach==6.1.0
# NO_AUTO_UPDATE: needs Python 3.10
blosc2==2.2.0
blosc2==2.5.1
bokeh==3.4.1
boost-histogram==1.4.1
bottle==0.12.25
@@ -84,7 +84,7 @@ cryptography==42.0.5
cx-Oracle==8.3.0
cycler==0.12.1
# NO_AUTO_UPDATE: numpy 1.24.3 doesn't support cython 3.x
cython==0.29.35
cython==3.0.10
dask==2024.4.2
dask-awkward==2024.3.0
decorator==5.1.1
@@ -185,8 +185,7 @@ jupyter-server-mathjax==0.2.6
jupyter-server-terminals==0.5.3
jupyterlab==4.1.8
jupyterlab-widgets==3.0.10
#NO_AUTO_UPDATE: needs newer TF
keras==2.15.0
keras==3.5.0
keras-applications==1.0.8
keras-preprocessing==1.1.2
keras2onnx==1.7.0
@@ -224,6 +223,7 @@ mplhep-data==0.0.3
mpmath==1.3.0
msgpack==1.0.8
multidict==6.0.5
namex==0.0.8
nbclassic==1.0.0
nbclient==0.10.0
nbconvert==7.16.4
@@ -244,12 +244,13 @@ nvidia-ml-py==12.550.52
numexpr==2.8.4
# setuptools version <64 is needed by numpy: https://github.com/pypa/setuptools/issues/3549
# NO_AUTO_UPDATE: update together with tensorflow
numpy==1.24.3
numpy==1.26.4
# NO_AUTO_UPDATE:1
onnx==1.16.0
onnxmltools==1.12.0
onnxconverter-common==1.14.0
oauthlib==3.2.2
optree==0.12.1
opt-einsum==3.3.0
pkginfo==1.10.0
packaging==24.0
@@ -349,7 +350,7 @@ scikit-build==0.16.7
scikit-build-core==0.9.3
scikit-learn==1.2.2
scinum==2.1.0
scipy==1.10.0
scipy==1.12.0
secretstorage==3.3.3
semantic-version==2.10.0
threadpoolctl==3.5.0
@@ -371,14 +372,13 @@ stack-data==0.6.3
stevedore==5.2.0
subprocess32==3.5.4
sympy==1.12
# NO_AUTO_UPDATE: Needs newer blosc2
tables==3.8.0
tables==3.9.0
tblib==1.7.0
tabulate==0.9.0
tenacity==8.2.3
#NO_AUTO_UPDATE:1: Force to use tensorflow 2.15x; this should match the version in tensorflow-sources.spec
tensorflow==2.15.0
tensorboard==2.16.2
#NO_AUTO_UPDATE:1: Force to use tensorflow 2.16.1; this should match the version in tensorflow-sources.spec
tensorflow==2.16.1
tensorboard==2.17.0
tensorflow-io-gcs-filesystem==0.37.0
tensorflow-estimator==2.15.0
tensorboard-data-server==0.7.2
1 change: 1 addition & 0 deletions scram-tools.file/tools/tensorflow/tensorflow.xml
@@ -3,6 +3,7 @@
<environment name="TENSORFLOW_BASE" default="@TOOL_ROOT@"/>
<environment name="LIBDIR" default="$TENSORFLOW_BASE/lib"/>
<environment name="INCLUDE" default="$TENSORFLOW_BASE/include"/>
<environment name="KERAS_BACKEND" default="tensorflow"/>
Contributor:
it should be a <runtime ..../> type variable (see ROOTSYS as an example).
Did you run the tests locally to see if the GPU unit tests pass after setting this?

Contributor Author:
I tested it by setting the environment variable manually (not via toolfile).

Contributor:
for me all the unit tests still fail with this error:

##Failure Location unknown## : Error
Test name: testHelloWorldCUDA::test
uncaught exception of type std::exception (or derived).
- An exception of category 'UnavailableAccelerator' occurred while
   [0] Calling tensorflow::setBackend()
Exception Message:
Cuda backend requested, NVIDIA GPU visible to cmssw, but not visible to TensorFlow in the job

Contributor Author:
Some tests that were failing previously passed after setting KERAS_BACKEND. Yes, I saw these failures as well; I thought I had missed some setup step needed to make them work (in a container started with the --nv flag).

Contributor Author:
e.g. testTFConstSession was failing with ValueError: Unable to import backend : theano, but after setting KERAS_BACKEND it passed.
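
A minimal sketch of why the variable matters, assuming the Keras 3.x that this PR moves to: Keras resolves its backend from KERAS_BACKEND at import time (falling back, as far as I understand, to ~/.keras/keras.json), so the variable has to be set before keras is imported. The script below is only illustrative.

    # Sketch: check which backend Keras 3 picks up.
    import os

    # The toolfile change exports this; here it is set by hand before importing keras.
    os.environ["KERAS_BACKEND"] = "tensorflow"

    import keras

    # Expected to print "tensorflow"; without the variable, Keras falls back to
    # whatever ~/.keras/keras.json contains (e.g. a stale "theano" entry).
    print(keras.backend.backend())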

Contributor Author:
Could the failure be due to 12.4 not being an officially tested CUDA version for TF 2.16.1 (or even 2.17)? The link lists 12.3 as the officially tested version.
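
One hedged way to check this directly: tf.sysconfig.get_build_info() reports the CUDA and cuDNN versions the TensorFlow wheel or build was compiled against, which can be compared with the toolkit/driver available in the job. A small diagnostic sketch:

    # Diagnostic sketch: print the CUDA/cuDNN versions this TF build expects
    # and the GPUs it can actually see.
    import tensorflow as tf

    info = tf.sysconfig.get_build_info()
    print("TF version:", tf.__version__)
    print("built against CUDA", info.get("cuda_version"), "/ cuDNN", info.get("cudnn_version"))
    print("visible GPUs:", tf.config.list_physical_devices("GPU"))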

Contributor Author:
Running python -c "import tensorflow as tf; print(tf.config.list_physical_devices('GPU'))" prints this message:

successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355

and returns an empty list []. I googled this message, and there are basically three solutions:

  1. Run TensorFlow using the official Docker image
  2. Install TensorFlow using conda and prebuilt wheels
  3. Force-connect the NUMA node (as suggested in the document that TensorFlow prints out), namely run sudo echo 0 | sudo tee -a /sys/bus/pci/devices/0000\:06\:10.0/numa_node after each reboot. But that requires sudo rights (and, I would imagine, on the host rather than in the container).
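
For reference, a small diagnostic sketch (an illustration, not a fix): it lists the PCI devices whose sysfs entry reports numa_node = -1, which is the condition the warning above refers to.

    # Sketch: list PCI devices whose sysfs entry reports numa_node == -1,
    # i.e. the value TensorFlow warns about and silently maps to node 0.
    from pathlib import Path

    for node_file in sorted(Path("/sys/bus/pci/devices").glob("*/numa_node")):
        if node_file.read_text().strip() == "-1":
            print(f"{node_file.parent.name}: numa_node=-1")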

Contributor:
Are you sure you started cmssw-el8 with the --nv option? For me the following command

python3 -c "import tensorflow as tf; print(tf.config.list_physical_devices('GPU'))"

runs fine (both for this PR and the TF_X IBs) and returns

[PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]

Contributor Author:
Now it works for me as well, weird.

</client>
<runtime name="PATH" value="$TENSORFLOW_BASE/bin" type="path"/>
<flags SYSTEM_INCLUDE="1"/>
2 changes: 1 addition & 1 deletion tensorflow-sources.file
@@ -225,7 +225,7 @@ popd
#FIXME: Create missing externals links
pushd bazel-tensorflow-%{realversion}/external
srcdir=$(dirname $(readlink zlib))
for e in ducc farmhash_gpu_archive stablehlo ; do
for e in ducc farmhash_gpu_archive stablehlo cudnn_frontend_archive ; do
echo "Check external link: $e"
if [ -e ${srcdir}/$e ] ; then
if [ ! -e $e ] ; then
4 changes: 2 additions & 2 deletions tensorflow-sources.spec
@@ -1,5 +1,5 @@
### RPM external tensorflow-sources 2.15.0
%define tag 3f5b9e20a2e5f678f79baffc9c2a59ac554053b2
### RPM external tensorflow-sources 2.16.1
%define tag 6edb11fa48921474f9476c070c02c7bac37c02cd
%define branch cms/v%{realversion}
%define github_user cms-externals
%define build_type opt
2 changes: 2 additions & 0 deletions tensorflow-xla-runtime.spec
@@ -25,6 +25,8 @@ CXXFLAGS="-fPIC -Wl,-z,defs %{arch_build_flags} ${CMS_EIGEN_CXX_FLAGS} %{selecte
pushd tensorflow/xla_aot_runtime_src
# remove unnecessary implementations that use symbols that are not even existing
find . -type f -path '*/service/cpu/runtime_fork_join.cc' | xargs rm -f
find . -type f -path '*/service/cpu/runtime_fft.cc' | xargs rm -f
find . -type f -path '*/service/cpu/runtime_single_threaded_fft.cc' | xargs rm -f

cmake . \
-DCMAKE_CXX_FLAGS="${CXXFLAGS}" \
2 changes: 1 addition & 1 deletion tensorflow.spec
@@ -1,4 +1,4 @@
### RPM external tensorflow 2.15.0
### RPM external tensorflow 2.16.1
%if "%{?vectorized_package:set}" != "set"
%define source_package tensorflow-sources
%else