Support multinode training #3

Open · wants to merge 19 commits into main

2 changes: 2 additions & 0 deletions .gitignore
@@ -2,6 +2,8 @@
 data_out/*
 exp_out/*
 exp_logs/*
 datasets/*/data/*
+packaging/input_models/
+packaging/output_models/

 *.swp

18 changes: 12 additions & 6 deletions discogs/datamodule.py
@@ -37,6 +37,8 @@ def default_config():

     clip_length = 10

+    num_replicas = 1
+
     roll = {
         "do": False,  # apply roll augmentation
         "axis": -1,

@@ -97,13 +99,15 @@ def __iter__(self):

 class DiscogsDataModule(pl.LightningDataModule):
     @datamodule_ing.capture
-    def __init__(self, masking):
+    def __init__(self, masking, num_replicas):
         super().__init__()

         if masking["do"]:
             params = {k: v for k, v in masking.items() if k != "do"}
             self.spec_masking = SpecMasking(**params)

+        self.num_replicas = num_replicas
+
     @datamodule_ing.capture(prefix="roll")
     def get_roll_func(self, axis, shift, shift_range):
         _logger.info("rolling...")

@@ -183,11 +187,13 @@ def get_ft_weighted_sampler(
         epoch_len,
         sampler_replace,
     ):
-        world_size = int(os.environ.get("WORLD_SIZE", 1))
+        num_replicas = self.num_replicas
         local_rank = int(os.environ.get("LOCAL_RANK", 0))
-        if world_size > 1:
-            _logger.info(f"WORLD_SIZE: {world_size}")
-            _logger.info(f"LOCAL_RANK: {local_rank}")
+
+        if num_replicas > 1:
+            _logger.debug("Distributed training:")
+            _logger.debug(f"  num_replicas: {num_replicas}")
+            _logger.debug(f"  local_rank: {local_rank}")

         sample_weights = self.get_ft_cls_balanced_sample_weights(
             groundtruth=groundtruth

@@ -198,7 +204,7 @@ def get_ft_weighted_sampler(
                 sample_weights, num_samples=epoch_len, replacement=sampler_replace
             ),
             dataset=range(epoch_len),
-            num_replicas=world_size,
+            num_replicas=num_replicas,
             rank=local_rank,
         )
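
The wrapping above only balances classes correctly under DDP if every replica materializes the same weighted epoch and then keeps a disjoint shard of it. Below is a minimal sketch of that idea; TinyDistributedSamplerWrapper is a hypothetical stand-in that assumes nothing about the repo's actual DistributedSamplerWrapper beyond its num_replicas/rank arguments.

# Toy illustration only: shard one epoch of weighted draws across replicas.
import torch
from torch.utils.data import WeightedRandomSampler


class TinyDistributedSamplerWrapper:
    def __init__(self, sampler, num_replicas, rank):
        self.sampler = sampler
        self.num_replicas = num_replicas
        self.rank = rank

    def __iter__(self):
        # Every rank must materialize the same draw, so seed identically
        # before consuming the base sampler (real wrappers reseed per epoch).
        torch.manual_seed(0)
        indices = list(self.sampler)
        # Keep every num_replicas-th index, offset by this replica's rank,
        # so the shards are disjoint and together cover the whole epoch.
        return iter(indices[self.rank :: self.num_replicas])

    def __len__(self):
        return len(self.sampler) // self.num_replicas


weights = [0.1, 0.9, 0.5, 0.5]  # per-sample balancing weights, as in the diff
base = WeightedRandomSampler(weights, num_samples=8, replacement=True)
shard = TinyDistributedSamplerWrapper(base, num_replicas=2, rank=0)
print(list(shard))  # this replica's four of the epoch's eight draws

With num_replicas=2, rank 0 and rank 1 each receive four of the eight draws, and together they cover the epoch exactly once.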

10 changes: 6 additions & 4 deletions ex_maest.py
@@ -42,6 +42,7 @@
 def default_conf():
     process_id = os.getpid()
     timestamp = datetime.now().strftime("%y%m%d-%H%M%S")
+    ckpt_path = False

     trainer = {
         "max_epochs": 130,
@@ -55,6 +56,7 @@ def default_conf():
         "reload_dataloaders_every_n_epochs": 1,
         "strategy": "ddp_find_unused_parameters_true",
         "default_root_dir": "exp_logs",
+        "num_nodes": 1,
     }

     predict = {
@@ -83,9 +85,9 @@ def main(_run, _config, _log, _rnd, _seed):
     else:
         module = Module(distributed_mode=distributed_mode)

-    data = DiscogsDataModule()
+    data = DiscogsDataModule(num_replicas=_config["trainer"]["devices"])

-    trainer.fit(module, data)
+    trainer.fit(module, data, ckpt_path=_config["ckpt_path"])
     return {"done": True}


@@ -96,7 +98,7 @@ def test(_run, _config, _log, _rnd, _seed):
     module = Module()
     module.do_swa = False

-    data = DiscogsDataModule()
+    data = DiscogsDataModule(num_replicas=_config["trainer"]["devices"])

     trainer.test(module, data)
     return {"done": True}
@@ -165,7 +167,7 @@ def predict(_run, _config, _log, _rnd, _seed, output_name=""):
     module.set_prediction_tranformer_block(_config["predict"]["transformer_block"])
     module.eval()

-    data = DiscogsDataModule()
+    data = DiscogsDataModule(num_replicas=_config["trainer"]["devices"])

     outputs = trainer.predict(module, data)
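
For context on the new ckpt_path entry: Sacred exposes config defaults to the command line, so the value added to default_conf above can be overridden with the `with` syntax (as ex_maest519.sh does below) and read back through _config. A minimal, self-contained sketch of that pattern; the experiment name maest_sketch is hypothetical.

# Minimal Sacred sketch of the ckpt_path flow assumed above.
from sacred import Experiment

ex = Experiment("maest_sketch")


@ex.config
def default_conf():
    ckpt_path = False  # False: train from scratch; a path resumes training


@ex.automain
def main(_config):
    # `python sketch.py with ckpt_path=some.ckpt` overrides the default,
    # mirroring trainer.fit(module, data, ckpt_path=_config["ckpt_path"]).
    print(_config["ckpt_path"])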

21 changes: 21 additions & 0 deletions ex_maest519.sh
@@ -0,0 +1,21 @@
+set -e
+
+# NCCL_SOCKET_IFNAME=vlan884 MASTER_ADDR=10.55.0.129 MASTER_PORT=6666 NCCL_IB_DISABLE=1 NODE_RANK=0 NCCL_BLOCKING_WAIT=0
+
+CUDA_VISIBLE_DEVICES=0,1,2,3 NCCL_BLOCKING_WAIT=0 python ex_maest.py -l INFO with maest_30s_from_passt_pretrain \
+    ckpt_path="exp_logs/lightning_logs/428/checkpoints/epoch=121.no_swa.ckpt" \
+    trainer.num_sanity_val_steps=0 \
+    trainer.num_nodes=1 \
+    trainer.devices=4 \
+    trainer.log_every_n_steps=100 \
+    datamodule.batch_size_train=6 \
+    datamodule.batch_size_test=6 \
+    maest.n_classes=519 \
+    datamodule.groundtruth_train=merged.pk \
+    datamodule.groundtruth_val=/home/palonso/reps/dt-training/src/preprocessing/231026-preprocessing/groundtruth_val.pk.fix \
+    datamodule.groundtruth_test=/home/palonso/reps/dt-training/src/preprocessing/231026-preprocessing/groundtruth_salmorejo.pk.fix \
+    datamodule.base_dir=/mnt/projects/discotube-melspectrograms/ \
+    datamodule.base_dir_val=/home/palonso/data/discotube/discotube-specs/ \
+    # ckpt_path="exp_logs/lightning_logs/306/checkpoints/epoch=44-val_loss=0.01-best.ckpt " \
+    # ckpt_path="exp_logs/lightning_logs/306/checkpoints/epoch=57.no.swa.ckpt" \
+    # ckpt_path="exp_logs/lightning_logs/306/checkpoints/epoch=57.no.swa.ckpt" \
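
The commented-out variables sketch how a second host would join the job: every node runs the same command with its own NODE_RANK, and the effective world size is num_nodes times devices. A small illustration of the rank arithmetic; the NUM_NODES and DEVICES variable names are hypothetical, while NODE_RANK and LOCAL_RANK follow the conventions used above.

# Illustration of DDP rank arithmetic for the launch above.
import os

num_nodes = int(os.environ.get("NUM_NODES", 1))    # hosts in the job (hypothetical var)
devices = int(os.environ.get("DEVICES", 4))        # GPUs per host (hypothetical var)
node_rank = int(os.environ.get("NODE_RANK", 0))    # this host's index
local_rank = int(os.environ.get("LOCAL_RANK", 0))  # this GPU's index on the host

world_size = num_nodes * devices                   # total DDP replicas
global_rank = node_rank * devices + local_rank     # unique rank of this process
print(f"world_size={world_size}, global_rank={global_rank}")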