From 7aefed5ec4521d6a6b55b5f13f1fead6bedbe38d Mon Sep 17 00:00:00 2001 From: Yifan Li Date: Wed, 30 Oct 2024 14:11:06 -0400 Subject: [PATCH 1/3] move the illegal memory reminder from base class AutoBatchSize to the inherited class under tf --- deepmd/tf/utils/batch_size.py | 13 +++++++++++++ deepmd/utils/batch_size.py | 5 ----- 2 files changed, 13 insertions(+), 5 deletions(-) diff --git a/deepmd/tf/utils/batch_size.py b/deepmd/tf/utils/batch_size.py index 33f1ec0da0..421d1afa72 100644 --- a/deepmd/tf/utils/batch_size.py +++ b/deepmd/tf/utils/batch_size.py @@ -1,4 +1,6 @@ # SPDX-License-Identifier: LGPL-3.0-or-later +import os +import logging from packaging.version import ( Version, ) @@ -11,9 +13,20 @@ OutOfMemoryError, ) from deepmd.utils.batch_size import AutoBatchSize as AutoBatchSizeBase +from deepmd.utils.batch_size import log class AutoBatchSize(AutoBatchSizeBase): + def __init__(self, initial_batch_size: int = 1024, factor: float = 2.0) -> None: + super().__init__(initial_batch_size, factor) + DP_INFER_BATCH_SIZE = int(os.environ.get("DP_INFER_BATCH_SIZE", 0)) + if not DP_INFER_BATCH_SIZE > 0: + log.info( + "If you encounter the error 'an illegal memory access was encountered', this may be due to a TensorFlow issue. " + "To avoid this, set the environment variable DP_INFER_BATCH_SIZE to a smaller value than the last adjusted batch size. " + "The environment variable DP_INFER_BATCH_SIZE controls the inference batch size (nframes * natoms). " + ) + def is_gpu_available(self) -> bool: """Check if GPU is available. diff --git a/deepmd/utils/batch_size.py b/deepmd/utils/batch_size.py index 259fe93bdb..5ab06e55e2 100644 --- a/deepmd/utils/batch_size.py +++ b/deepmd/utils/batch_size.py @@ -61,11 +61,6 @@ def __init__(self, initial_batch_size: int = 1024, factor: float = 2.0) -> None: self.maximum_working_batch_size = initial_batch_size if self.is_gpu_available(): self.minimal_not_working_batch_size = 2**31 - log.info( - "If you encounter the error 'an illegal memory access was encountered', this may be due to a TensorFlow issue. " - "To avoid this, set the environment variable DP_INFER_BATCH_SIZE to a smaller value than the last adjusted batch size. " - "The environment variable DP_INFER_BATCH_SIZE controls the inference batch size (nframes * natoms). " - ) else: self.minimal_not_working_batch_size = ( self.maximum_working_batch_size + 1 From 94b2d41a8ac6abd0db26bd75d12b9c8baf4ce44f Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Wed, 30 Oct 2024 18:24:54 +0000 Subject: [PATCH 2/3] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- deepmd/tf/utils/batch_size.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/deepmd/tf/utils/batch_size.py b/deepmd/tf/utils/batch_size.py index 421d1afa72..3f1fb61cae 100644 --- a/deepmd/tf/utils/batch_size.py +++ b/deepmd/tf/utils/batch_size.py @@ -1,6 +1,6 @@ # SPDX-License-Identifier: LGPL-3.0-or-later import os -import logging + from packaging.version import ( Version, ) @@ -13,7 +13,9 @@ OutOfMemoryError, ) from deepmd.utils.batch_size import AutoBatchSize as AutoBatchSizeBase -from deepmd.utils.batch_size import log +from deepmd.utils.batch_size import ( + log, +) class AutoBatchSize(AutoBatchSizeBase): From 5870a9379e3d3330e294bc78b9589a0c59adbb93 Mon Sep 17 00:00:00 2001 From: Yifan Li Date: Wed, 30 Oct 2024 15:20:33 -0400 Subject: [PATCH 3/3] only show the reminder when GPU is used --- deepmd/tf/utils/batch_size.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/deepmd/tf/utils/batch_size.py b/deepmd/tf/utils/batch_size.py index 3f1fb61cae..438bf36703 100644 --- a/deepmd/tf/utils/batch_size.py +++ b/deepmd/tf/utils/batch_size.py @@ -23,11 +23,12 @@ def __init__(self, initial_batch_size: int = 1024, factor: float = 2.0) -> None: super().__init__(initial_batch_size, factor) DP_INFER_BATCH_SIZE = int(os.environ.get("DP_INFER_BATCH_SIZE", 0)) if not DP_INFER_BATCH_SIZE > 0: - log.info( - "If you encounter the error 'an illegal memory access was encountered', this may be due to a TensorFlow issue. " - "To avoid this, set the environment variable DP_INFER_BATCH_SIZE to a smaller value than the last adjusted batch size. " - "The environment variable DP_INFER_BATCH_SIZE controls the inference batch size (nframes * natoms). " - ) + if self.is_gpu_available(): + log.info( + "If you encounter the error 'an illegal memory access was encountered', this may be due to a TensorFlow issue. " + "To avoid this, set the environment variable DP_INFER_BATCH_SIZE to a smaller value than the last adjusted batch size. " + "The environment variable DP_INFER_BATCH_SIZE controls the inference batch size (nframes * natoms). " + ) def is_gpu_available(self) -> bool: """Check if GPU is available.