mosaicml · es94129 · Jan 15, 2025 · Jan 15, 2025
@@ -601,6 +601,9 @@ class StreamingFinetuningDataset(StreamingDataset):
         replication (int, optional): Determines how many consecutive devices will receive the same
             samples. Useful for training with tensor or sequence parallelism, where multiple
             devices need to see the same partition of the dataset. Defaults to ``None``.
+        stream_name (str, optional): The name of the Stream to use, as registered in
+            streaming.base.stream.streams_registry. Defaults to ``stream``.
+        stream_config (dict[str, Any], optional): Additional arguments for the Stream constructor.
     """
 
     def __init__(
@@ -632,6 +635,8 @@ def __init__(
         allow_unsafe_types: bool = False,
         replication: Optional[int] = None,
         packing_ratio: Optional[float] = None,
+        stream_name: str = 'stream',
+        stream_config: Optional[dict[str, Any]] = None,
         **kwargs: Any,
     ):
 
@@ -675,6 +680,8 @@ def __init__(
             batching_method=batching_method,
             allow_unsafe_types=allow_unsafe_types,
             replication=replication,
+            stream_name=stream_name,
+            stream_config=stream_config,
             **kwargs,
         )
 

@@ -105,6 +105,9 @@ class StreamingTextDataset(StreamingDataset):
         replication (int, optional): Determines how many consecutive devices will receive the same
             samples. Useful for training with tensor or sequence parallelism, where multiple
             devices need to see the same partition of the dataset. Defaults to ``None``.
+        stream_name (str, optional): The name of the Stream to use, as registered in
+            streaming.base.stream.streams_registry. Defaults to ``stream``.
+        stream_config (dict[str, Any], optional): Additional arguments for the Stream constructor.
     """
 
     def __init__(
@@ -135,6 +138,8 @@ def __init__(
         batching_method: str = 'random',
         allow_unsafe_types: bool = False,
         replication: Optional[int] = None,
+        stream_name: str = 'stream',
+        stream_config: Optional[dict[str, Any]] = None,
         **kwargs: Any,
     ):
 
@@ -183,6 +188,8 @@ def __init__(
             batching_method=batching_method,
             allow_unsafe_types=allow_unsafe_types,
             replication=replication,
+            stream_name=stream_name,
+            stream_config=stream_config,
             **kwargs,
         )
         self.tokenizer = tokenizer

@@ -56,7 +56,7 @@
     'mlflow>=2.14.1,<2.19',
     'accelerate>=0.25,<1.2',  # for HF inference `device_map`
     'transformers>=4.43.2,<4.47',
-    'mosaicml-streaming>=0.10.0,<0.11',
+    'mosaicml-streaming>=0.11.0,<0.12',
     'torch>=2.5.1,<2.5.2',
     'datasets>=2.20.0,<3.3',
     'fsspec==2023.6.0',  # newer version results in a bug in datasets that duplicates data