dask · BhuvanashreeM · Aug 29, 2021 · Aug 31, 2021 · Sep 3, 2021 · Sep 3, 2021
diff --git a/dask_ml/datasets.py b/dask_ml/datasets.py
@@ -381,10 +381,11 @@ def make_classification(
     return X, y
 
 
-def random_date(start, end):
-    delta = end - start
-    int_delta = (delta.days * 24 * 60 * 60) + delta.seconds
-    random_second = np.random.randint(int_delta)
-    return start + timedelta(seconds=random_second)
+def random_date(start, end, random_state=None):
+    """Return ``start`` plus a random whole-second offset below ``end - start``."""
+    rng = sklearn.utils.check_random_state(random_state)
+    span = end - start
+    offset = rng.randint((span.days * 24 * 60 * 60) + span.seconds)
+    return start + timedelta(seconds=offset)
 
 
@@ -430,6 +431,13 @@ def make_classification_df(
         The output values.
 
     """
+    # The date column seeds each row with ``random_state + i``, so an int
+    # is required.  Keep a caller-supplied int; anything else (None, a
+    # RandomState, ...) falls back to a fixed seed.  An ``or`` chain of
+    # negated isinstance checks is true for every input, so avoid it.
+    if not isinstance(random_state, int):
+        random_state = 42
+
     X_array, y_array = make_classification(
         n_samples=n_samples,
         flip_y=(1 - predictability),
@@ -451,8 +459,13 @@ def make_classification_df(
             [
                 X_df,
                 dd.from_array(
-                    np.array([random_date(*dates)] * len(X_df)),
-                    chunksize=chunks,
+                    np.array(
+                        [
+                            random_date(*dates, random_state + i)
+                            for i in range(len(X_df))
+                        ]
+                    ),
+                    chunksize=n_samples,
                     columns=["date"],
                 ),
             ],

diff --git a/tests/test_datasets.py b/tests/test_datasets.py
@@ -73,10 +73,22 @@ def test_make_classification_df():
         dates=(date(2014, 1, 1), date(2015, 1, 1)),
     )
 
+    X_df1, y_series1 = dask_ml.datasets.make_classification_df(
+        n_samples=100,
+        n_features=5,
+        random_state=123,
+        chunks=100,
+        dates=(date(2014, 1, 1), date(2015, 1, 1)),
+    )
+    check_randomness = np.unique((X_df["date"] == X_df1["date"]).compute())
+
     assert X_df is not None
     assert y_series is not None
     assert "date" in X_df.columns
     assert len(X_df.columns) == 6
     assert len(X_df) == 100
     assert len(y_series) == 100
     assert isinstance(y_series, dask.dataframe.core.Series)
+    # Elements of a NumPy bool array are np.bool_, so `is True` never holds.
+    assert check_randomness.tolist() == [True]
+    assert np.unique(X_df["date"]).size >= 2