diff --git a/mlflow/data/digest_utils.py b/mlflow/data/digest_utils.py index 73b5d4d81f6c9..849b613a4b35f 100644 --- a/mlflow/data/digest_utils.py +++ b/mlflow/data/digest_utils.py @@ -1,10 +1,10 @@ +import hashlib from typing import Any from packaging.version import Version from mlflow.exceptions import MlflowException from mlflow.protos.databricks_pb2 import INVALID_PARAMETER_VALUE -from mlflow.utils import insecure_hash MAX_ROWS = 10000 @@ -101,7 +101,7 @@ def get_normalized_md5_digest(elements: list[Any]) -> str: INVALID_PARAMETER_VALUE, ) - md5 = insecure_hash.md5() + md5 = hashlib.md5(usedforsecurity=False) for element in elements: md5.update(element) diff --git a/mlflow/data/evaluation_dataset.py b/mlflow/data/evaluation_dataset.py index e936f4932f679..87de8419ee0d2 100644 --- a/mlflow/data/evaluation_dataset.py +++ b/mlflow/data/evaluation_dataset.py @@ -1,3 +1,4 @@ +import hashlib import json import logging import math @@ -10,7 +11,6 @@ from mlflow.entities import RunTag from mlflow.exceptions import MlflowException from mlflow.protos.databricks_pb2 import INVALID_PARAMETER_VALUE -from mlflow.utils import insecure_hash from mlflow.utils.string_utils import generate_feature_name_if_not_string try: @@ -406,7 +406,7 @@ def __init__( ) # generate dataset hash - md5_gen = insecure_hash.md5() + md5_gen = hashlib.md5(usedforsecurity=False) _gen_md5_for_arraylike_obj(md5_gen, self._features_data) if self._labels_data is not None: _gen_md5_for_arraylike_obj(md5_gen, self._labels_data) diff --git a/mlflow/pyfunc/__init__.py b/mlflow/pyfunc/__init__.py index cc46e951006a5..d802f81b19e18 100644 --- a/mlflow/pyfunc/__init__.py +++ b/mlflow/pyfunc/__init__.py @@ -398,6 +398,7 @@ def predict(self, context, model_input, params=None): import collections import functools +import hashlib import importlib import inspect import json @@ -508,7 +509,6 @@ def predict(self, context, model_input, params=None): databricks_utils, find_free_port, get_major_minor_py_version, - insecure_hash, ) from mlflow.utils import env_manager as _EnvManager from mlflow.utils._spark_utils import modified_environ @@ -2499,7 +2499,7 @@ def batch_predict_fn(pdf, params=None): model_path = os.path.join( tempfile.gettempdir(), "mlflow", - insecure_hash.sha1(model_uri.encode()).hexdigest(), + hashlib.sha1(model_uri.encode(), usedforsecurity=False).hexdigest(), # Use pid to avoid conflict when multiple spark UDF tasks str(os.getpid()), ) diff --git a/mlflow/store/tracking/file_store.py b/mlflow/store/tracking/file_store.py index d894cd825a901..1aac2e415514f 100644 --- a/mlflow/store/tracking/file_store.py +++ b/mlflow/store/tracking/file_store.py @@ -1,3 +1,4 @@ +import hashlib import json import logging import os @@ -49,7 +50,7 @@ ) from mlflow.store.tracking.abstract_store import AbstractStore from mlflow.tracing.utils import generate_request_id -from mlflow.utils import get_results_from_paginated_fn, insecure_hash +from mlflow.utils import get_results_from_paginated_fn from mlflow.utils.file_utils import ( append_to, exists, @@ -1185,13 +1186,13 @@ def log_inputs(self, run_id: str, datasets: Optional[list[DatasetInput]] = None) @staticmethod def _get_dataset_id(dataset_name: str, dataset_digest: str) -> str: - md5 = insecure_hash.md5(dataset_name.encode("utf-8")) + md5 = hashlib.md5(dataset_name.encode("utf-8"), usedforsecurity=False) md5.update(dataset_digest.encode("utf-8")) return md5.hexdigest() @staticmethod def _get_input_id(dataset_id: str, run_id: str) -> str: - md5 = insecure_hash.md5(dataset_id.encode("utf-8")) + md5 = hashlib.md5(dataset_id.encode("utf-8"), usedforsecurity=False) md5.update(run_id.encode("utf-8")) return md5.hexdigest() diff --git a/mlflow/utils/conda.py b/mlflow/utils/conda.py index 7d6b612609122..94f3570dce4dc 100644 --- a/mlflow/utils/conda.py +++ b/mlflow/utils/conda.py @@ -1,3 +1,4 @@ +import hashlib import json import logging import os @@ -6,7 +7,7 @@ from mlflow.environment_variables import MLFLOW_CONDA_CREATE_ENV_CMD, MLFLOW_CONDA_HOME from mlflow.exceptions import ExecutionException -from mlflow.utils import insecure_hash, process +from mlflow.utils import process from mlflow.utils.environment import Environment from mlflow.utils.os import is_windows @@ -62,13 +63,15 @@ def _get_conda_env_name(conda_env_path, env_id=None, env_root_dir=None): conda_env_contents += env_id env_name = "mlflow-{}".format( - insecure_hash.sha1(conda_env_contents.encode("utf-8")).hexdigest() + hashlib.sha1(conda_env_contents.encode("utf-8"), usedforsecurity=False).hexdigest() ) if env_root_dir: env_root_dir = os.path.normpath(env_root_dir) # Generate env name with format "mlflow-{conda_env_contents_hash}-{env_root_dir_hash}" # hashing `conda_env_contents` and `env_root_dir` separately helps debugging - env_name += "-{}".format(insecure_hash.sha1(env_root_dir.encode("utf-8")).hexdigest()) + env_name += "-{}".format( + hashlib.sha1(env_root_dir.encode("utf-8"), usedforsecurity=False).hexdigest() + ) return env_name diff --git a/mlflow/utils/environment.py b/mlflow/utils/environment.py index 0512c61fbb516..2302bdf82c99b 100644 --- a/mlflow/utils/environment.py +++ b/mlflow/utils/environment.py @@ -1,3 +1,4 @@ +import hashlib import importlib.metadata import logging import os @@ -20,7 +21,7 @@ from mlflow.exceptions import MlflowException from mlflow.protos.databricks_pb2 import INVALID_PARAMETER_VALUE from mlflow.tracking import get_tracking_uri -from mlflow.utils import PYTHON_VERSION, insecure_hash +from mlflow.utils import PYTHON_VERSION from mlflow.utils.databricks_utils import ( _get_databricks_serverless_env_vars, get_databricks_env_vars, @@ -746,7 +747,7 @@ def _get_mlflow_env_name(s): (e.g. "mlflow-da39a3ee5e6b4b0d3255bfef95601890afd80709") """ - return "mlflow-" + insecure_hash.sha1(s.encode("utf-8")).hexdigest() + return "mlflow-" + hashlib.sha1(s.encode("utf-8"), usedforsecurity=False).hexdigest() def _get_pip_install_mlflow(): diff --git a/mlflow/utils/insecure_hash.py b/mlflow/utils/insecure_hash.py deleted file mode 100644 index 7807a6d2e4ff3..0000000000000 --- a/mlflow/utils/insecure_hash.py +++ /dev/null @@ -1,15 +0,0 @@ -import functools -import hashlib -import sys - -# DO NOT use this function for security purposes (e.g., password hashing). -# -# In Python >= 3.9, insecure hashing algorithms such as MD5 fail in FIPS-compliant -# environments unless `usedforsecurity=False` is explicitly passed. -# -# References: -# - https://github.com/mlflow/mlflow/issues/9905 -# - https://docs.python.org/3/library/hashlib.html -_kwargs = {"usedforsecurity": False} if sys.version_info >= (3, 9) else {} -md5 = functools.partial(hashlib.md5, **_kwargs) -sha1 = functools.partial(hashlib.sha1, **_kwargs) diff --git a/tests/evaluate/test_evaluation.py b/tests/evaluate/test_evaluation.py index 38bf7523f4447..2cd68c768520b 100644 --- a/tests/evaluate/test_evaluation.py +++ b/tests/evaluate/test_evaluation.py @@ -1,3 +1,4 @@ +import hashlib import inspect import io import json @@ -59,7 +60,6 @@ from mlflow.tracing.constant import TraceMetadataKey from mlflow.tracing.fluent import TRACE_BUFFER from mlflow.tracking.artifact_utils import get_artifact_uri -from mlflow.utils import insecure_hash from mlflow.utils.autologging_utils import ( MLFLOW_EVALUATE_RESTRICT_LANGCHAIN_AUTOLOG_TO_TRACES_CONFIG, ) @@ -863,7 +863,7 @@ def test_dataset_metadata(): def test_gen_md5_for_arraylike_obj(): def get_md5(data): - md5_gen = insecure_hash.md5() + md5_gen = hashlib.md5(usedforsecurity=False) _gen_md5_for_arraylike_obj(md5_gen, data) return md5_gen.hexdigest() @@ -881,7 +881,7 @@ def get_md5(data): def test_gen_md5_for_arraylike_obj_with_pandas_df_using_float_idx_does_not_raise_keyerror(): float_indices = np.random.uniform(low=0.5, high=13.3, size=(10,)) df = pd.DataFrame(np.random.randn(10, 4), index=float_indices, columns=["A", "B", "C", "D"]) - md5_gen = insecure_hash.md5() + md5_gen = hashlib.md5(usedforsecurity=False) assert _gen_md5_for_arraylike_obj(md5_gen, df) is None diff --git a/tests/projects/test_projects_cli.py b/tests/projects/test_projects_cli.py index 42e85875517f4..72ea6cc5dfe56 100644 --- a/tests/projects/test_projects_cli.py +++ b/tests/projects/test_projects_cli.py @@ -1,3 +1,4 @@ +import hashlib import json import logging import os @@ -8,7 +9,7 @@ from click.testing import CliRunner from mlflow import MlflowClient, cli -from mlflow.utils import insecure_hash, process +from mlflow.utils import process from tests.integration.utils import invoke_cli_runner from tests.projects.utils import ( @@ -90,7 +91,7 @@ def test_run_local_conda_env(): with open(os.path.join(TEST_PROJECT_DIR, "conda.yaml")) as handle: conda_env_contents = handle.read() expected_env_name = "mlflow-{}".format( - insecure_hash.sha1(conda_env_contents.encode("utf-8")).hexdigest() + hashlib.sha1(conda_env_contents.encode("utf-8"), usedforsecurity=False).hexdigest() ) try: process._exec_cmd(cmd=["conda", "env", "remove", "--name", expected_env_name]) diff --git a/tests/resources/data/dataset.py b/tests/resources/data/dataset.py index 4ec5fe91d6dc7..306e6fca25bea 100644 --- a/tests/resources/data/dataset.py +++ b/tests/resources/data/dataset.py @@ -1,4 +1,5 @@ import base64 +import hashlib import json from typing import Any, Optional @@ -8,7 +9,6 @@ from mlflow.data.dataset import Dataset from mlflow.types import Schema from mlflow.types.utils import _infer_schema -from mlflow.utils import insecure_hash from tests.resources.data.dataset_source import SampleDatasetSource @@ -29,7 +29,7 @@ def _compute_digest(self) -> str: Computes a digest for the dataset. Called if the user doesn't supply a digest when constructing the dataset. """ - hash_md5 = insecure_hash.md5() + hash_md5 = hashlib.md5(usedforsecurity=False) for hash_part in pd.util.hash_array(np.array(self._data_list)): hash_md5.update(hash_part) return base64.b64encode(hash_md5.digest()).decode("ascii") diff --git a/tests/store/tracking/test_file_store.py b/tests/store/tracking/test_file_store.py index faa0851573e59..46ee3d1e2fc9d 100644 --- a/tests/store/tracking/test_file_store.py +++ b/tests/store/tracking/test_file_store.py @@ -1,3 +1,4 @@ +import hashlib import json import os import posixpath @@ -43,7 +44,6 @@ from mlflow.store.tracking.file_store import FileStore from mlflow.tracing.constant import TraceMetadataKey, TraceTagKey from mlflow.tracking._tracking_service.utils import _use_tracking_uri -from mlflow.utils import insecure_hash from mlflow.utils.file_utils import ( TempDir, path_to_local_file_uri, @@ -2659,7 +2659,7 @@ def assert_expected_input_storage_ids_present(run, dataset_storage_ids): inputs_dir = os.path.join(run_dir, FileStore.INPUTS_FOLDER_NAME) expected_input_storage_ids = [] for dataset_storage_id in dataset_storage_ids: - md5 = insecure_hash.md5(dataset_storage_id.encode("utf-8")) + md5 = hashlib.md5(dataset_storage_id.encode("utf-8"), usedforsecurity=False) md5.update(run.info.run_id.encode("utf-8")) expected_input_storage_ids.append(md5.hexdigest()) assert set(os.listdir(inputs_dir)) == set(expected_input_storage_ids)