fix spark connect (#9986)

Signed-off-by: Serena Ruan <[email protected]>
mlflow · Oct 24, 2023 · e6af17a · e6af17a
1 parent ae7f779
commit e6af17a
Show file tree

Hide file tree

Showing 2 changed files with 22 additions and 0 deletions.
diff --git a/mlflow/pyfunc/__init__.py b/mlflow/pyfunc/__init__.py
@@ -1473,6 +1473,8 @@ def _predict_row_batch(predict_fn, args):
         pandas.DataFrame if isinstance(result_type, SparkStructType) else pandas.Series
     )
 
+    tracking_uri = mlflow.get_tracking_uri()
+
     @pandas_udf(result_type)
     def udf(
         iterator: Iterator[Tuple[Union[pandas.Series, pandas.DataFrame], ...]]
@@ -1492,6 +1494,9 @@ def udf(
         if mlflow_testing:
             _MLFLOW_TESTING.set(mlflow_testing)
         scoring_server_proc = None
+        # Set tracking_uri inside the UDF so that, with Spark Connect,
+        # the model can be loaded from the correct path.
+        mlflow.set_tracking_uri(tracking_uri)
 
         if env_manager != _EnvManager.LOCAL:
             if should_use_spark_to_broadcast_file:

diff --git a/tests/pyfunc/test_spark_connect.py b/tests/pyfunc/test_spark_connect.py
@@ -41,3 +41,20 @@ def test_spark_udf_spark_connect_unsupported_env_manager(spark, tmp_path, env_ma
         match=f"Environment manager {env_manager!r} is not supported",
     ):
         mlflow.pyfunc.spark_udf(spark, str(tmp_path), env_manager=env_manager)
+
+
+def test_spark_udf_spark_connect_with_model_logging(spark, tmp_path):
+    X, y = load_iris(return_X_y=True, as_frame=True)
+    model = LogisticRegression().fit(X, y)
+
+    mlflow.set_tracking_uri(tmp_path.joinpath("mlruns").as_uri())
+    mlflow.set_experiment("test")
+    with mlflow.start_run():
+        signature = mlflow.models.infer_signature(X, y)
+        model_info = mlflow.sklearn.log_model(model, "model", signature=signature)
+
+    udf = mlflow.pyfunc.spark_udf(spark, model_info.model_uri, env_manager="local")
+    X_test = X.head(5)
+    sdf = spark.createDataFrame(X_test)
+    preds = sdf.select(udf(*X_test.columns).alias("preds")).toPandas()["preds"]
+    np.testing.assert_array_almost_equal(preds, model.predict(X_test))