Steffen/cleanup #1

Open · wants to merge 386 commits into base: main

386 commits
ec23196
Implement cleanup loop in validator and associated local model store …
Dec 27, 2023
8b59645
Add implementations for storing/retrieving data on chain and in Huggi…
Dec 27, 2023
5730065
Format all files for consistency. (#3)
Dec 27, 2023
beeb0c1
Refactor to use hotkeys not uids for miner identification. (#4)
Dec 27, 2023
f4a0ad3
Adds the Perf Monitor
Dec 28, 2023
3bb2289
Merge pull request #5 from RaoFoundation/perf-tracker
Dec 28, 2023
1b180c2
Merge branch 'dev' into miner_tracker
Dec 28, 2023
cac0feb
Improve model tracker comments and logging.
Dec 27, 2023
bcb801d
Delete .vscode/settings.json which is now in the ..gitignore.
Dec 29, 2023
b8a3193
Merge pull request #6 from RaoFoundation/miner_tracker
Dec 29, 2023
07eaedb
Merge pull request #7 from RaoFoundation/model_cleaner
Dec 30, 2023
fac0eb0
Add helper to get hash of directory.
Dec 30, 2023
71e789f
Add logic to redownload and get hash in upload_model.
Dec 30, 2023
9768cb7
Update to only store model for hash in a tmp folder.
Dec 30, 2023
762792d
Address PR feedback.
Dec 30, 2023
6d674ae
Merge pull request #8 from RaoFoundation/dirHash
Dec 30, 2023
3f90a28
Update the Miner
Dec 30, 2023
56e8137
Address feedback
Dec 30, 2023
b6e4e47
More PR feedback
Dec 30, 2023
2755cf9
Merge pull request #9 from RaoFoundation/miner-updates
Dec 30, 2023
f78cfd3
Update model tracker to track metadata.
Dec 30, 2023
c891add
Update validator eval loop to use new stores.
Dec 30, 2023
7f17118
Miner fixes
Dec 30, 2023
6d656fa
Merge pull request #11 from RaoFoundation/miner-fixes
Dec 30, 2023
d33d4fb
Use AutoModelForCausalLM.
Dec 30, 2023
8c1b7ac
Also update mining test to use same model type.
Dec 30, 2023
c8855a5
Merge pull request #12 from RaoFoundation/autoModelLM
Dec 30, 2023
29ae1a0
Pass netuid to the chain store
Dec 31, 2023
c94f391
Handle exceptions calculating miner losses.
Dec 31, 2023
b4d2325
Support loading a non hugging face saved model
Dec 31, 2023
d3d2f3d
Make a new wandb run for the validator if logging there.
Dec 31, 2023
38a11a5
Merge pull request #14 from RaoFoundation/miner-fixes2
Dec 31, 2023
44a3e73
Address PR fixes.
Dec 31, 2023
67574c9
Merge branch 'dev' into valEval
Dec 31, 2023
ced19a6
Add size check before downloading from hugging face.
Dec 30, 2023
cd0e717
Merge pull request #10 from RaoFoundation/valEval
Dec 31, 2023
4f773e8
Merge pull request #13 from RaoFoundation/checkRepoSize
Dec 31, 2023
7f66a39
Add checks in Model Updater for bad models.
Dec 31, 2023
b0fc67d
Merge pull request #15 from RaoFoundation/exceptOnBadModels
Dec 31, 2023
d6f2904
Improve test logging.
Dec 31, 2023
db56c27
Collected fixes.
Dec 31, 2023
de6edac
Exception handling improvements.
Dec 31, 2023
3004f06
Fix update loop sleep logic when revisiting recently.
Dec 31, 2023
928b61f
Uid state handling fixes.
Dec 31, 2023
a8e8a2f
Sleep in run step for readability.
Dec 31, 2023
3d22ac1
Align local and remote directory pathing.
Dec 31, 2023
81edb9d
Compute_losses on the pt_model not the Model.
Dec 31, 2023
695e738
Validator wandb run logging fixes.
Dec 31, 2023
359fa9c
Update comments on expected directory structure.
Dec 31, 2023
d5b7825
Merge pull request #16 from RaoFoundation/vali-fixes
Dec 31, 2023
adaf416
Add a new tool to upload a trained model
Dec 31, 2023
a3e073f
Merge pull request #17 from RaoFoundation/miner-push-only
Dec 31, 2023
88c8216
Clean-up
Dec 31, 2023
7d80098
Create a new validator wandb run every 100 run steps.
Dec 31, 2023
87d4f88
Merge pull request #18 from RaoFoundation/clean-up
Dec 31, 2023
8fdb630
Add auto-update script
Dec 31, 2023
28e5769
Fix directory hash after downloading models.
Dec 31, 2023
4b08bdf
Merge pull request #20 from RaoFoundation/auto-update
Dec 31, 2023
da72955
Merge pull request #21 from RaoFoundation/hash_location_fix
Dec 31, 2023
ee0b22e
Merge pull request #19 from RaoFoundation/new_wandb_runs
Dec 31, 2023
3b27b56
Remove unused import
Dec 31, 2023
c57f2ef
Merge pull request #22 from RaoFoundation/logs
Dec 31, 2023
9da0a5c
Split out miner/vali docs and update.
Dec 31, 2023
0278385
Improve Miner docs.
Jan 1, 2024
175a58c
Merge pull request #23 from RaoFoundation/docs
Jan 1, 2024
ae42a47
Update scoring temperature to 0.04.
Jan 3, 2024
ef67494
Merge pull request #24 from RaoFoundation/temp_update
surcyf123 Jan 3, 2024
0ddead7
Update validator score boosting of earlier models.
Jan 3, 2024
43d6a6a
Merge pull request #25 from RaoFoundation/epsilon_update
surcyf123 Jan 3, 2024
972950a
Merge pull request #26 from RaoFoundation/dev
Jan 3, 2024
93a1d98
Formatting fixes for miner docs
Jan 3, 2024
8e91a9f
Merge pull request #27 from RaoFoundation/doc-format
Jan 3, 2024
2aac764
Merge pull request #28 from RaoFoundation/dev
Jan 3, 2024
4c9f60f
Fix for pending uids to eval in next loop.
Jan 5, 2024
3d65475
Merge pull request #29 from RaoFoundation/updatedEvalCheck
Jan 5, 2024
5e2aaa9
Also update to a new uids file.
Jan 5, 2024
9958cd4
Merge pull request #30 from RaoFoundation/updatedEvalCheck
Jan 5, 2024
9cb69dc
Merge pull request #31 from RaoFoundation/dev
Jan 5, 2024
edeac8d
Realize symlinks on download from remote store.
Jan 8, 2024
0ac4c65
Update to improve error logging around failures to parse the metadata…
Jan 9, 2024
bf1dc9d
Model_id locality fix.
Jan 9, 2024
b3dde1a
Merge pull request #32 from RaoFoundation/log_improvements
Jan 9, 2024
173ad38
Merge pull request #33 from RaoFoundation/remove_symlink
Jan 9, 2024
88ee418
Merge pull request #34 from RaoFoundation/dev
Jan 9, 2024
a8abc8b
Add a notebook to check latest vali perf
Jan 13, 2024
62ca7e9
Clear all outputs
Jan 13, 2024
143f0cd
Merge pull request #35 from RaoFoundation/vali-perf
Jan 14, 2024
014531d
Increase max model size to 186M
Jan 15, 2024
960163e
Perform a full eval after vali upgrade
Jan 15, 2024
560d8e6
Make the clean loop delay larger
Jan 15, 2024
2eea7a5
Update the miner docs
Jan 15, 2024
8b46e81
Keep losses to math.inf when failing to evaluate model.
Jan 15, 2024
6647082
Merge pull request #38 from RaoFoundation/model_loss_none_fix
Jan 15, 2024
ddc0e58
Merge pull request #36 from RaoFoundation/vali-updates
Jan 15, 2024
d1d4b50
Include repo_id in error messages
Jan 15, 2024
6ed4577
Merge pull request #39 from RaoFoundation/improve-errors
Jan 15, 2024
25b91ed
Read back the metadata commit after writing
Jan 15, 2024
2300785
Merge pull request #40 from RaoFoundation/dev
Jan 15, 2024
ae65103
Merge pull request #41 from RaoFoundation/read-metadata
Jan 15, 2024
b1a0bdd
Update setup.py to point to new version location.
Jan 16, 2024
714dff7
Correct the docs
Jan 17, 2024
56b4a52
Merge pull request #37 from RaoFoundation/model-increase
Jan 17, 2024
27fa33b
Merge pull request #42 from RaoFoundation/setup_fix
Jan 17, 2024
edf58fb
Bump version
Jan 17, 2024
9c25951
Merge pull request #43 from RaoFoundation/bump-version
Jan 17, 2024
06eecdd
Merge pull request #44 from RaoFoundation/dev
Jan 17, 2024
c9ec6bc
Simplify the mining API
Jan 20, 2024
5d45fc7
Merge pull request #45 from RaoFoundation/api
Jan 20, 2024
ac31bb6
Run each eval in a subprocess to avoid a bad model being able to corr…
Feb 2, 2024
bdae9e6
Merge pull request #46 from RaoFoundation/debug
Feb 2, 2024
198e103
Remove model with inf loss
Feb 2, 2024
1f96e89
Fix dict .get()
Feb 2, 2024
45595cc
Merge pull request #47 from RaoFoundation/remove-bad-miners
Feb 2, 2024
65b29aa
Clean-up accidental test code
Feb 2, 2024
563dfdb
Merge pull request #48 from RaoFoundation/clean-up2
Feb 2, 2024
4402b91
Merge pull request #49 from RaoFoundation/dev
Feb 2, 2024
4d09328
Correctly call is_dir() method.
Feb 2, 2024
9a6695d
Add test for is_dir() behavior.
Feb 3, 2024
c563c26
Log but do not throw for expected model sync failures.
Feb 3, 2024
3ab91cd
Only keep hotkeys to be evaluated in storage.
Feb 3, 2024
c2c8f6a
Only allow at most 10 new models to be pending eval.
Feb 3, 2024
eb6b471
Merge pull request #50 from RaoFoundation/is_dir_fix
Feb 3, 2024
34e08e0
Merge pull request #51 from RaoFoundation/downgrade_model_size_log
Feb 3, 2024
7b1e494
Add lock around metagraph for sub threads and remove grace period on …
Feb 3, 2024
f877806
Merge pull request #52 from RaoFoundation/limit_stored_models
Feb 3, 2024
8dba6f3
Merge pull request #53 from RaoFoundation/limit_pending_models
Feb 3, 2024
69c2749
Only filter out uids with weights at 0 in addition to inf loss.
Feb 4, 2024
c496bf2
Merge pull request #54 from RaoFoundation/inf_and_weight_check
Feb 4, 2024
45d9bc1
Move state file to the model dir
Feb 4, 2024
bcb696e
Merge pull request #55 from RaoFoundation/perplexity
Feb 4, 2024
1f7345d
Revert "Only allow at most 10 new models to be pending eval."
Feb 4, 2024
172e4e3
Merge pull request #56 from RaoFoundation/revert-53-limit_pending_models
Feb 4, 2024
c8a9eba
Only allow at most 20 new models to be pending eval.
Feb 3, 2024
47a444c
PR Feedback.
Feb 4, 2024
c247220
Handle shutil.rmtree FIleNotFoundError.
Feb 4, 2024
a89a67f
Merge pull request #58 from RaoFoundation/shutil_exception
Feb 4, 2024
4c313ce
Merge pull request #57 from RaoFoundation/limit_pending_models
Feb 4, 2024
2d86ecd
Catch all exceptions from shutil rmtree.
Feb 4, 2024
613fe76
Merge pull request #59 from RaoFoundation/catch_all_rmtree
Feb 4, 2024
c952148
Reapply grace period of 300s.
Feb 4, 2024
56e1665
Catch exceptions in the clean-up loop.
Feb 4, 2024
47e166d
Add handling around computation of file timestamps if the file no lon…
Feb 4, 2024
f6206de
Merge pull request #60 from RaoFoundation/grace_reapply
Feb 4, 2024
7702da1
Merge pull request #61 from RaoFoundation/catch-cleanup
Feb 4, 2024
78864de
Update docs to point to the leaderboard
Feb 4, 2024
4321f85
Fix get_newest_datetime_under_path to get newest not oldest.
Feb 4, 2024
fbbd159
Merge pull request #63 from RaoFoundation/get_latest_under_path_fix
Feb 5, 2024
40f31f8
Standardize the loss function
Feb 5, 2024
5a4ebd0
Bump version
Feb 5, 2024
fb44be8
Merge pull request #66 from RaoFoundation/loss
Feb 5, 2024
71dd311
Merge pull request #65 from RaoFoundation/bump_version
Feb 5, 2024
1dffefc
Merge pull request #62 from RaoFoundation/update-docs
Feb 5, 2024
7e3b2c4
Merge pull request #67 from RaoFoundation/dev
Feb 5, 2024
430cb5a
Require models have max_position_embeddings=1024.
Feb 11, 2024
ccba669
Also reduce severity of logs when failing to download model.
Feb 11, 2024
3b2d967
Update spec version to 2.2.1 to ensure validators get new state.
Feb 11, 2024
b341ed6
Restrict model types.
Feb 11, 2024
cdb622d
Move list of allowed models to constants.
Feb 11, 2024
c8573f9
Merge pull request #69 from RaoFoundation/restrict_model_types
Feb 11, 2024
bd1f026
Merge pull request #70 from RaoFoundation/dev
Feb 11, 2024
a8da485
Update docs for allowed model types.
Feb 11, 2024
d06c77e
Merge pull request #71 from RaoFoundation/doc_update
Feb 11, 2024
e18fdd4
Add tool for running a benchmark
Feb 13, 2024
4cf9e0b
Remove test notebook
Feb 13, 2024
f94fc93
Merge pull request #72 from RaoFoundation/benchmarks
Feb 13, 2024
28a1afe
Allow larger models after a defined block
Feb 14, 2024
81c8b78
Increase max repo size
Feb 14, 2024
d8a7bdc
Add gpt2-large to benchmark
Feb 16, 2024
2234478
Merge pull request #73 from RaoFoundation/block-max
Feb 16, 2024
937afae
Merge pull request #74 from RaoFoundation/add-gpt2-large
Feb 16, 2024
7f1ec1e
Merge pull request #75 from RaoFoundation/dev
Feb 16, 2024
4e5cc6d
Update README.md
dougsillars Feb 16, 2024
7140510
Load model in the subprocess to avoid pickling
Feb 21, 2024
0f237c2
Fix missing method
Feb 21, 2024
725365f
Bump ttl to 150 seconds
Feb 21, 2024
3ab1102
Bump tranformers version
Feb 21, 2024
dd26bcc
Merge pull request #78 from RaoFoundation/bump-transformers
Feb 21, 2024
abb7496
Track total eval perf
Feb 21, 2024
6122352
Don't bump spec version
Feb 21, 2024
7c5fe35
Clean-up vali-perf notebook
Feb 21, 2024
4091575
Merge pull request #77 from RaoFoundation/qol
Feb 21, 2024
98a21b5
Revert "Merge pull request #77 from RaoFoundation/qol"
Feb 22, 2024
4fccab7
Merge pull request #80 from RaoFoundation/undo-77
Feb 22, 2024
c935923
Increase alpha. Log weight failures
Feb 22, 2024
c06992f
Merge pull request #81 from RaoFoundation/alpha
Feb 22, 2024
d2faaec
Merge pull request #79 from RaoFoundation/dev
Feb 22, 2024
5409309
Update model size on downloads based on block.
Mar 17, 2024
8c13811
Use optimizations at new block for inference.
Mar 18, 2024
c8cb2b8
Limit model types based on block.
Mar 18, 2024
e8206a7
Run inference with sequence length based on block.
Mar 18, 2024
9f8ae23
Doc updates.
Mar 18, 2024
3f5748c
Adjust temperature to prioritize top 1 model.
Mar 19, 2024
bd7501c
Adjust to only keep 10 best models + eval up to 15 new per loop.
Mar 19, 2024
90b870f
Check for updates to models with incentive first.
Mar 19, 2024
28916ff
Remove notebook and update cadence for check.
Mar 19, 2024
ea91667
Update to only 6 min, 14 max models by default.
Mar 19, 2024
c83a787
Fix docs + increase time for eval + adjust sample model parameters.
Mar 19, 2024
4957e80
Refactor to use ModelParameters + pass sequence length.
Mar 20, 2024
e520ff1
Rename to Model Criteria for clarity.
Mar 20, 2024
d8af206
Update docs to point to correct line for ModelCriteria.
Mar 20, 2024
2e9d6dd
Check generated outputs before calculating losses.
Mar 22, 2024
82e74a3
Send inputs to the same device as the model.
Mar 22, 2024
7eb4b4e
Refactor check out to a helper function.
Mar 22, 2024
1177610
Bump spec version to force reload of models.
Mar 22, 2024
6160d49
Pass tokenizer eos token id to remove warning message.
Mar 22, 2024
d80f965
Start iterator at 200 for fresh start.
Mar 22, 2024
b4d1207
Merge pull request #86 from RaoFoundation/disallow_attn
Mar 22, 2024
706f659
Merge pull request #87 from RaoFoundation/dev
Mar 22, 2024
99afe25
Update to use 6.9 params, 8192 seqeuence length, and block 2735661.
Mar 23, 2024
56a3713
Update to 24 pages and add clarify TFLOPs required.
Mar 23, 2024
0f26862
Update documentation on vali requirements and flash-attn requirements.
Mar 24, 2024
fadbe82
Merge branch 'dev' into next_milestone
Mar 24, 2024
7ae6d0c
Merge pull request #83 from RaoFoundation/next_milestone
Mar 24, 2024
5213654
Merge branch 'dev' into eval_loop_adjustments
Mar 24, 2024
3c1c44a
Merge pull request #76 from dougsillars/main
Mar 24, 2024
99e0588
Merge pull request #84 from RaoFoundation/eval_loop_adjustments
Mar 24, 2024
fd4681c
Add a new tokenizer for 7B
Mar 21, 2024
cd9819a
Bump to 6 minute timeouts and go back to random iterator start.
Mar 24, 2024
fe2a0c3
Update to 4k seq length + lower pages + adjust tokenizer.
Mar 24, 2024
fca0dd4
Pass pad token id to avoid instantiating new tokenizer every loss com…
Mar 24, 2024
732f904
Add Model Criteria for block 0 and improve logging.
Mar 24, 2024
4309982
Calculate average loss correctly in log_step.
Mar 25, 2024
e8bfe81
Move to GPT4 tokenizer instead of GPT3_5.
Mar 27, 2024
0771aaa
Push switchover block out by a week.
Mar 27, 2024
c0cf96c
Merge pull request #88 from RaoFoundation/update_tokenizer
Mar 28, 2024
18f0056
Merge pull request #89 from RaoFoundation/dev
Mar 28, 2024
d9fe3a1
Raise threshhold for unreasonable output and keep models with weights.
Mar 28, 2024
ae1fd35
Also prioritize keeping higher weights when filtering.
Mar 28, 2024
8b1e8bb
Adjust output lengths and check reptitiveness for all outputs.
Mar 29, 2024
bf5cc6e
Handle failures to load tracker state gracefully.
Mar 29, 2024
43b2428
Also test redownloading works as expected.
Mar 29, 2024
24f4b76
Merge pull request #91 from RaoFoundation/handle_corrupt_state
Mar 29, 2024
e72efee
Refactor model prioritization for clarity + correctness.
Mar 29, 2024
63271e1
Handle failures to load uids to eval state gracefully.
Mar 29, 2024
2b71a5d
Wipe tracker state in case of no uids to eval.
Mar 29, 2024
8a73df8
Also wipe the state in case of multiple bad restarts.
Mar 29, 2024
63bd73e
Merge pull request #90 from RaoFoundation/improve_model_check
Apr 1, 2024
d0716fd
Merge pull request #93 from RaoFoundation/eval_state
Apr 1, 2024
fee6b41
Retry evaluation for discarded models with incentive periodically.
Apr 1, 2024
9347268
Merge pull request #94 from RaoFoundation/retry_incentive
Apr 1, 2024
2c377cf
Merge pull request #95 from RaoFoundation/dev
Apr 1, 2024
9a6e0c0
Initialize uids_to_eval as set().
Apr 2, 2024
5a36e47
Fix docstring
steffencruz Apr 3, 2024
feb620c
Enable uploading a model with bfloat 16.
Apr 12, 2024
1e6e6ef
Add 7b models to the benchmark script
Apr 12, 2024
7b8e7b5
Default to upload with b16 for manual upload.
Apr 12, 2024
b1247e8
Merge pull request #96 from RaoFoundation/type_fix
Apr 12, 2024
6948e7a
Merge pull request #100 from RaoFoundation/benchmark-7b
Apr 12, 2024
57e5f82
Merge pull request #98 from RaoFoundation/upload_arg_opt
Apr 12, 2024
2477d4a
Merge branch 'dev' of github.com:RaoFoundation/pretraining into steff…
steffencruz Apr 13, 2024
Align local and remote directory pathing.
Sid committed Dec 31, 2023
commit 3d22ac1764343941db40b458bd8deba122a51d0a
4 changes: 2 additions & 2 deletions model/model_updater.py
@@ -43,8 +43,8 @@ async def sync_model(self, hotkey: str) -> bool:
         if metadata == tracker_model_metadata:
             return False

-        # Get the local path based on the local store.
-        path = self.local_store.get_path(hotkey, metadata.id)
+        # Get the local path based on the local store to download to (top level hotkey path)
+        path = self.local_store.get_path(hotkey)

         # Otherwise we need to download the new model based on the metadata.
         model = await self.remote_store.download_model(metadata.id, path)
54 changes: 40 additions & 14 deletions model/storage/disk/disk_model_store.py
@@ -13,15 +13,19 @@ class DiskModelStore(LocalModelStore):
     def __init__(self, base_dir: str):
         self.base_dir = base_dir

-    def get_path(self, hotkey: str, model_id: ModelId) -> str:
-        """Returns the path to where this store would locate this model."""
-        return utils.get_local_model_dir(self.base_dir, hotkey, model_id)
+    def get_path(self, hotkey: str) -> str:
+        """Returns the path to where this store would locate this hotkey."""
+        return utils.get_local_miner_dir(self.base_dir, hotkey)

     def store_model(self, hotkey: str, model: Model) -> ModelId:
         """Stores a trained model locally."""

+        # Note that the revision argument here does not affect the directory path like with hugging face downloads.
         model.pt_model.save_pretrained(
-            save_directory=utils.get_local_model_dir(self.base_dir, hotkey, model.id),
+            save_directory=utils.get_local_model_snapshot_dir(
+                self.base_dir, hotkey, model.id
+            ),
+            revision=model.id.commit,
             safe_serialization=True,
         )

@@ -32,7 +36,7 @@ def retrieve_model(self, hotkey: str, model_id: ModelId) -> Model:
         """Retrieves a trained model locally."""

         model = AutoModelForCausalLM.from_pretrained(
-            pretrained_model_name_or_path=utils.get_local_model_dir(
+            pretrained_model_name_or_path=utils.get_local_model_snapshot_dir(
                 self.base_dir, hotkey, model_id
             ),
             revision=model_id.commit,

@@ -50,7 +54,7 @@ def delete_unreferenced_models(
         valid_model_paths = set()
         for hotkey, model_id in valid_models_by_hotkey.items():
             valid_model_paths.add(
-                utils.get_local_model_dir(self.base_dir, hotkey, model_id)
+                utils.get_local_model_snapshot_dir(self.base_dir, hotkey, model_id)
             )

         # For each hotkey path on disk using listdir to go one level deep.

@@ -63,20 +67,42 @@ def delete_unreferenced_models(
             # If it is not in valid_hotkeys and out of grace period remove it.
             if hotkey not in valid_models_by_hotkey:
-                bt.logging.trace(
-                    f"Removing directory for unreferenced hotkey: {hotkey} if out of grace."
-                )
-                utils.remove_dir_out_of_grace(hotkey_path, grace_period_seconds)
+                deleted_hotkey = utils.remove_dir_out_of_grace(
+                    hotkey_path, grace_period_seconds
+                )
+                if deleted_hotkey:
+                    bt.logging.trace(
+                        f"Removed directory for unreferenced hotkey: {hotkey}."
+                    )
             else:
                 # Check all the model subfolder paths.
                 hotkey_dir = Path(hotkey_path)
                 model_subfolder_paths = [
                     str(d) for d in hotkey_dir.iterdir() if d.is_dir
                 ]

+                # Check all the snapshot subfolder paths
                 for model_path in model_subfolder_paths:
-                    if model_path not in valid_model_paths:
-                        bt.logging.trace(
-                            f"Removing directory for unreferenced model at: {model_path} if out of grace."
-                        )
-                        utils.remove_dir_out_of_grace(model_path, grace_period_seconds)
+                    model_dir = Path(model_path)
+                    snapshot_subfolder_paths = [
+                        str(d) for d in model_dir.iterdir() if d.is_dir
+                    ]
+
+                    # Check all the actual model snapshot paths
+                    for snapshot_path in snapshot_subfolder_paths:
+                        snapshot_dir = Path(snapshot_path)
+                        commit_subfolder_paths = [
+                            str(d) for d in snapshot_dir.iterdir() if d.is_dir
+                        ]
+
+                        # Reached the end. Check all the actual commit subfolders.
+                        for commit_path in commit_subfolder_paths:
+                            if commit_path not in valid_model_paths:
+                                deleted_model = utils.remove_dir_out_of_grace(
+                                    commit_path, grace_period_seconds
+                                )
+                                if deleted_model:
+                                    bt.logging.trace(
+                                        f"Removing directory for unreferenced model at: {commit_path}."
+                                    )
17 changes: 15 additions & 2 deletions model/storage/disk/utils.py
@@ -15,10 +15,19 @@ def get_local_miner_dir(base_dir: str, hotkey: str) -> str:
     return os.path.join(get_local_miners_dir(base_dir), hotkey)


+# Hugging face stores models under models--namespace--name/snapshots/commit when downloading.
 def get_local_model_dir(base_dir: str, hotkey: str, model_id: ModelId) -> str:
     return os.path.join(
         get_local_miner_dir(base_dir, hotkey),
-        model_id.namespace + "_" + model_id.name + "_" + model_id.commit,
+        "models" + "--" + model_id.namespace + "--" + model_id.name,
     )


+def get_local_model_snapshot_dir(base_dir: str, hotkey: str, model_id: ModelId) -> str:
+    return os.path.join(
+        get_local_model_dir(base_dir, hotkey, model_id),
+        "snapshots",
+        model_id.commit,
+    )


@@ -39,12 +48,16 @@ def get_newest_datetime_under_path(path: str) -> datetime.datetime:
     return datetime.datetime.fromtimestamp(newest_filetime)


-def remove_dir_out_of_grace(path: str, grace_period_seconds: int):
+def remove_dir_out_of_grace(path: str, grace_period_seconds: int) -> bool:
+    """Removes a dir if the last modified time is out of grace period secs. Returns if it was deleted."""
     last_modified = get_newest_datetime_under_path(path)
     grace = datetime.timedelta(seconds=grace_period_seconds)

     if last_modified < datetime.datetime.now() - grace:
         shutil.rmtree(path=path, ignore_errors=True)
+        return True
+
+    return False


 def get_hash_of_file(path: str) -> str:
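The new helpers mirror the cache layout Hugging Face uses on download: `models--<namespace>--<name>/snapshots/<commit>`. A minimal sketch of the resulting paths, using plain strings in place of `ModelId` (the `models/` segment comes from `get_local_miners_dir` in the real module):

```python
import os

def get_local_miner_dir(base_dir: str, hotkey: str) -> str:
    # <base_dir>/models/<hotkey>: the top-level directory for one miner.
    return os.path.join(base_dir, "models", hotkey)

def get_local_model_dir(base_dir: str, hotkey: str, namespace: str, name: str) -> str:
    # Matches Hugging Face's cache naming scheme: models--<namespace>--<name>.
    return os.path.join(
        get_local_miner_dir(base_dir, hotkey), "models--" + namespace + "--" + name
    )

def get_local_model_snapshot_dir(
    base_dir: str, hotkey: str, namespace: str, name: str, commit: str
) -> str:
    # The actual weights land under snapshots/<commit>, as with HF downloads.
    return os.path.join(
        get_local_model_dir(base_dir, hotkey, namespace, name), "snapshots", commit
    )

print(get_local_model_snapshot_dir("test-models", "hotkey0", "ns", "gpt2", "abc123"))
```

On POSIX this prints the nested `test-models/models/hotkey0/models--ns--gpt2/snapshots/abc123` path, matching the expected path built up in the new unit test further down.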
6 changes: 3 additions & 3 deletions model/storage/hugging_face/hugging_face_model_store.py
@@ -105,7 +105,7 @@ async def test_roundtrip_model():
     # Retrieve the model from hf.
     retrieved_model = await hf_model_store.download_model(
         model_id=model.id,
-        local_path=utils.get_local_model_dir("test-models", "hotkey0", model.id),
+        local_path=utils.get_local_miner_dir("test-models", "hotkey0"),
     )

     # Check that they match.

@@ -129,7 +129,7 @@ async def test_retrieve_model():
     # Retrieve the model from hf (first run) or cache.
     model = await hf_model_store.download_model(
         model_id=model_id,
-        local_path=utils.get_local_model_dir("test-models", "hotkey0", model_id),
+        local_path=utils.get_local_miner_dir("test-models", "hotkey0"),
     )

     print(f"Finished retrieving the model with id: {model.id}")

@@ -149,7 +149,7 @@ async def test_retrieve_oversized_model():
     try:
         model = await hf_model_store.download_model(
             model_id=model_id,
-            local_path=utils.get_local_model_dir("test-models", "hotkey0", model_id),
+            local_path=utils.get_local_miner_dir("test-models", "hotkey0"),
         )
     except ValueError as ve:
         print(f"Caught expected exception for downloading too large of a model: {ve}")
2 changes: 1 addition & 1 deletion model/storage/local_model_store.py
@@ -12,7 +12,7 @@ def store_model(self, hotkey: str, model: Model) -> ModelId:
         pass

     @abc.abstractmethod
-    def get_path(self, hotkey: str, model_id: ModelId) -> str:
+    def get_path(self, hotkey: str) -> str:
         """Returns the path to the appropriate location based on implementation."""
         pass
10 changes: 2 additions & 8 deletions tests/model/storage/disk/test_disk_model_store.py
@@ -16,15 +16,9 @@ def tearDown(self):

     def test_get_path(self):
         hotkey = "hotkey0"
-        model_id = ModelId(
-            namespace="test_model",
-            name="test_name",
-            commit="test_commit",
-            hash="test_hash",
-        )
-
-        expected_path = utils.get_local_model_dir("test-models", hotkey, model_id)
-        actual_path = self.disk_store.get_path(hotkey, model_id)
+        expected_path = utils.get_local_miner_dir("test-models", hotkey)
+        actual_path = self.disk_store.get_path(hotkey)

         self.assertEqual(expected_path, actual_path)
38 changes: 34 additions & 4 deletions tests/model/storage/disk/test_utils.py
@@ -48,10 +48,38 @@ def test_get_local_model_dir(self):
             + self.sep
             + hotkey
             + self.sep
+            + "models--"
             + namespace
-            + "_"
+            + "--"
             + name
-            + "_"
         )
         self.assertEqual(model_dir, expected_path)

+    def test_get_local_model_snapshot_dir(self):
+        hotkey = "test-hotkey"
+        namespace = "test-namespace"
+        name = "test-name"
+        commit = "test-commit"
+        model_id = ModelId(
+            namespace=namespace, name=name, hash="test-hash", commit=commit
+        )
+
+        model_dir = utils.get_local_model_snapshot_dir(self.base_dir, hotkey, model_id)
+
+        expected_path = (
+            self.base_dir
+            + self.sep
+            + "models"
+            + self.sep
+            + hotkey
+            + self.sep
+            + "models--"
+            + namespace
+            + "--"
+            + name
+            + self.sep
+            + "snapshots"
+            + self.sep
+            + commit
+        )
+        self.assertEqual(model_dir, expected_path)

@@ -91,7 +119,8 @@ def test_remove_dir_out_of_grace(self):
         time.sleep(1)

         self.assertTrue(os.path.exists(self.base_dir))
-        utils.remove_dir_out_of_grace(self.base_dir, 0)
+        deleted = utils.remove_dir_out_of_grace(self.base_dir, 0)
+        self.assertTrue(deleted)
         self.assertFalse(os.path.exists(self.base_dir))

     def test_remove_dir_out_of_grace_in_grace(self):

@@ -104,7 +133,8 @@ def test_remove_dir_out_of_grace_in_grace(self):
         file.close()

         self.assertTrue(os.path.exists(self.base_dir))
-        utils.remove_dir_out_of_grace(self.base_dir, 60)
+        deleted = utils.remove_dir_out_of_grace(self.base_dir, 60)
+        self.assertFalse(deleted)
         self.assertTrue(os.path.exists(self.base_dir))

     def test_get_hash_of_file(self):
16 changes: 15 additions & 1 deletion tests/model/storage/fake_model_metadata_store.py
@@ -12,7 +12,21 @@ def __init__(self):
         self.metadata = dict()
         self.store_errors = deque()

-    async def store_model_metadata(self, hotkey: str, model_metadata: ModelMetadata):
+    async def store_model_metadata(self, hotkey: str, model_id: ModelId):
+        """Fake stores model metadata for a specific hotkey."""
+
+        # Return an injected error if we have one.
+        if len(self.store_errors) > 0:
+            raise self.store_errors.popleft()
+
+        model_metadata = ModelMetadata(id=model_id, block=self.current_block)
+        self.current_block += 1
+
+        self.metadata[hotkey] = model_metadata
+
+    async def store_model_metadata_exact(
+        self, hotkey: str, model_metadata: ModelMetadata
+    ):
         """Fake stores model metadata for a specific hotkey."""

         # Return an injected error if we have one.
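The fake now assigns increasing block numbers itself, while the `_exact` variant keeps the old behavior for tests that need a specific block. A synchronous sketch of that split, with a minimal `ModelMetadata` stand-in rather than the repo's real class:

```python
from dataclasses import dataclass

@dataclass
class ModelMetadata:  # minimal stand-in for model.data.ModelMetadata
    id: str
    block: int

class FakeMetadataStore:
    def __init__(self):
        self.metadata = {}
        self.current_block = 1

    def store_model_metadata(self, hotkey: str, model_id: str) -> None:
        # Auto-assign the next block, like a chain commit would.
        self.metadata[hotkey] = ModelMetadata(id=model_id, block=self.current_block)
        self.current_block += 1

    def store_model_metadata_exact(self, hotkey: str, metadata: ModelMetadata) -> None:
        # Store exactly what the caller provides (used by the updater tests).
        self.metadata[hotkey] = metadata
```

Tests that only care that metadata exists use the first method; tests that assert on a particular block pin it with the second.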
8 changes: 7 additions & 1 deletion tests/model/storage/fake_remote_model_store.py
@@ -1,4 +1,5 @@
 from model.data import Model, ModelId
+from model.storage.disk import utils
 from model.storage.remote_model_store import RemoteModelStore


@@ -21,9 +22,14 @@ async def download_model(self, model_id: ModelId, local_path: str) -> Model:
         model = self.remote_models[model_id]

+        # Parse out the hotkey and the base path from local_path to replicate hugging face logic.
+        split_string = local_path.split("/")
+
         # Store it at the local_path
         model.pt_model.save_pretrained(
-            save_directory=local_path,
+            save_directory=utils.get_local_model_snapshot_dir(
+                split_string[0], split_string[2], model_id
+            ),
             safe_serialization=True,
         )
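The fake store recovers the base directory and hotkey positionally from `local_path`, which only works when the path has the exact `<base_dir>/models/<hotkey>` shape produced by `get_local_miner_dir`. A sketch of that assumption (`os.path.normpath` plus splitting on `os.sep` would be more portable, but the `split("/")` below mirrors the fake's logic):

```python
# Hypothetical path shaped like get_local_miner_dir's output: <base>/models/<hotkey>.
local_path = "test-models/models/hotkey0"

# Mirrors the fake store's positional parsing; brittle if the shape changes.
split_string = local_path.split("/")
base_dir, hotkey = split_string[0], split_string[2]
print(base_dir, hotkey)  # test-models hotkey0
```

Index 0 is the base directory and index 2 the hotkey because index 1 is always the fixed `models` segment.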
24 changes: 18 additions & 6 deletions tests/model/test_model_updater.py
@@ -38,7 +38,9 @@ def test_get_metadata(self):
         )
         model_metadata = ModelMetadata(id=model_id, block=1)

-        asyncio.run(self.metadata_store.store_model_metadata(hotkey, model_metadata))
+        asyncio.run(
+            self.metadata_store.store_model_metadata_exact(hotkey, model_metadata)
+        )

         metadata = asyncio.run(self.model_updater._get_metadata(hotkey))

@@ -56,7 +58,9 @@ def test_sync_model_bad_metadata(self):
         model_metadata = ModelMetadata(id=model_id, block=1)

         # Setup the metadata with a commit that doesn't exist in the remote store.
-        asyncio.run(self.metadata_store.store_model_metadata(hotkey, model_metadata))
+        asyncio.run(
+            self.metadata_store.store_model_metadata_exact(hotkey, model_metadata)
+        )

         # FakeRemoteModelStore raises a KeyError but HuggingFace may raise other exceptions.
         with self.assertRaises(Exception):

@@ -77,7 +81,9 @@ def test_sync_model_same_metadata(self):
         model = Model(id=model_id, pt_model=pt_model)

         # Setup the metadata, local, and model_tracker to match.
-        asyncio.run(self.metadata_store.store_model_metadata(hotkey, model_metadata))
+        asyncio.run(
+            self.metadata_store.store_model_metadata_exact(hotkey, model_metadata)
+        )
         self.local_store.store_model(hotkey, model)

         self.model_tracker.on_miner_model_updated(hotkey, model_metadata)

@@ -105,7 +111,9 @@ def test_sync_model_new_metadata(self):
         model = Model(id=model_id, pt_model=pt_model)

         # Setup the metadata and remote store but not local or the model_tracker.
-        asyncio.run(self.metadata_store.store_model_metadata(hotkey, model_metadata))
+        asyncio.run(
+            self.metadata_store.store_model_metadata_exact(hotkey, model_metadata)
+        )
         asyncio.run(self.remote_store.upload_model(model))

         self.assertIsNone(

@@ -148,7 +156,9 @@ def test_sync_model_bad_hash(self):
         model = Model(id=model_id, pt_model=pt_model)

         # Setup the metadata and remote store and but not local or the model tracker.
-        asyncio.run(self.metadata_store.store_model_metadata(hotkey, model_metadata))
+        asyncio.run(
+            self.metadata_store.store_model_metadata_exact(hotkey, model_metadata)
+        )
         self.remote_store.inject_mismatched_model(model_id_chain, model)

         # Assert we fail due to the hash mismatch between the model in remote store and the metadata on chain.

@@ -177,7 +187,9 @@ def test_sync_model_over_max_parameters(self):
         model = Model(id=model_id, pt_model=pt_model)

         # Setup the metadata and remote store but not local or the model_tracker.
-        asyncio.run(self.metadata_store.store_model_metadata(hotkey, model_metadata))
+        asyncio.run(
+            self.metadata_store.store_model_metadata_exact(hotkey, model_metadata)
+        )
         asyncio.run(self.remote_store.upload_model(model))

         # Assert we fail due to exceeding the maximum allowed parameter size.