
Adding atmo task and training_job, along with metastore #197

Open · wants to merge 52 commits into main

Commits (52):
f90413f
adding testing files
SallyElHajjar Oct 29, 2024
d987ea3
adding new functions
SallyElHajjar Oct 31, 2024
373bf2c
adding new corrections
SallyElHajjar Oct 31, 2024
44cf2a4
adding new data
SallyElHajjar Nov 4, 2024
ba92ee1
Fix data loading from GCS and update test mocks
jainrajan98 Nov 5, 2024
ad65bf6
fixing flake8 errors and time steps
SallyElHajjar Nov 5, 2024
c992f2c
removing some subfunctions to module level and adding docs
SallyElHajjar Nov 6, 2024
964b1e7
testingwith true data
SallyElHajjar Nov 12, 2024
8d87a62
fixing black error
SallyElHajjar Nov 12, 2024
3b7aea0
improving the testing file
SallyElHajjar Nov 15, 2024
3c49619
Add Notebook
SallyElHajjar Nov 15, 2024
f02522a
Merge branch 'main' of https://github.com/UrbanSystemsLab/climateiq-c…
SallyElHajjar Nov 15, 2024
9d465a0
Merge branch 'dataset' of https://github.com/UrbanSystemsLab/climatei…
SallyElHajjar Nov 15, 2024
292e546
Update dataset notebook
Katsutoshii Nov 15, 2024
22ef360
Fix shape errors in dataset.py
Katsutoshii Nov 18, 2024
8a74aba
Fix shape issues for atmo ML training.
Katsutoshii Nov 20, 2024
8df4d23
correcting training
SallyElHajjar Nov 21, 2024
eb66377
remove mock url
SallyElHajjar Nov 22, 2024
2c74dc6
fix flake8
SallyElHajjar Nov 22, 2024
b4d70f9
Fix dataset test
Katsutoshii Nov 25, 2024
c1a91aa
Merge branch 'modelchange' of https://github.com/UrbanSystemsLab/clim…
Katsutoshii Nov 25, 2024
e15db9a
Remove print statments
Katsutoshii Nov 25, 2024
a1e1eb4
fixing flake8 error
SallyElHajjar Nov 27, 2024
d7ec502
fixing black error
SallyElHajjar Nov 27, 2024
3942a9f
fixing a typo in training notebook
SallyElHajjar Nov 27, 2024
cd8b9ef
testing atmo_utils
SallyElHajjar Nov 27, 2024
4423f18
Fixing shape errors in the model testing
SallyElHajjar Nov 27, 2024
6e4f4cc
Fix mypy errors on testing __init__.py.
Katsutoshii Nov 27, 2024
2a98fb8
fixing black error
SallyElHajjar Dec 2, 2024
8cbb012
Merge branch 'modelchange' of https://github.com/UrbanSystemsLab/clim…
SallyElHajjar Dec 2, 2024
46ab794
fixing black error
SallyElHajjar Dec 2, 2024
a8f3d2b
Adding atmo task and training_job, along with metastore
SallyElHajjar Dec 2, 2024
c98f0c2
Expose max_blobs in dataset API
Katsutoshii Dec 2, 2024
c95fd46
fixing flake8 error
SallyElHajjar Dec 2, 2024
e3397bd
Merge branch 'modelchange' of https://github.com/UrbanSystemsLab/clim…
Katsutoshii Dec 2, 2024
483a59a
fixing flake8 error
SallyElHajjar Dec 2, 2024
53a028a
Some updates to atmo_task
Katsutoshii Dec 2, 2024
071c03a
Merge branch 'modelchange' of https://github.com/UrbanSystemsLab/clim…
Katsutoshii Dec 2, 2024
fcf4f2f
Update trainer code
Katsutoshii Dec 2, 2024
f432890
adding hash function for dataset split
SallyElHajjar Dec 9, 2024
5be0e46
update dataset
SallyElHajjar Dec 10, 2024
b396af8
adding some functions at the bottom of the dataset file to load each day
SallyElHajjar Dec 10, 2024
2f7305c
Make atmo dataset work!
Katsutoshii Dec 11, 2024
66d1af5
Add a todo
Katsutoshii Dec 11, 2024
7fae44f
trying to remove the checks
SallyElHajjar Dec 11, 2024
a0c9db5
uncomment test-local
SallyElHajjar Dec 12, 2024
9f1a833
adding cache to gitignore
SallyElHajjar Dec 12, 2024
6e59d9d
adding traineratmo to yml
SallyElHajjar Dec 12, 2024
512f8d5
removing traineratmo from yml
SallyElHajjar Dec 12, 2024
794a737
Add logging statements
Katsutoshii Dec 13, 2024
51e9537
trying checks again
SallyElHajjar Dec 16, 2024
a4c5707
remodifying .vscode
SallyElHajjar Dec 16, 2024
2 changes: 1 addition & 1 deletion .github/workflows/cloud_functions/test-local.sh
@@ -2,4 +2,4 @@
 flake8 usl_pipeline/cloud_functions --show-source --statistics
 black usl_pipeline/cloud_functions --check
 pytest usl_pipeline/cloud_functions
-mypy usl_pipeline/cloud_functions
\ No newline at end of file
+mypy usl_pipeline/cloud_functions
6 changes: 6 additions & 0 deletions .github/workflows/usl_models/test-local.sh
@@ -0,0 +1,6 @@
# .github/workflows/usl_models/test-local.sh
# If execution is denied, make the script executable first:
# chmod +x .github/workflows/usl_models/test-local.sh
flake8 usl_models --show-source --statistics
black usl_models --check
pytest usl_models -k "not integration"
mypy usl_models
2 changes: 2 additions & 0 deletions .gitignore
@@ -1,6 +1,8 @@
 build
 dist
 __pycache__
+.mypy_cache
+.pytest_cache
 *.egg-info
 venv

6 changes: 3 additions & 3 deletions .vscode/settings.json
@@ -1,9 +1,9 @@
 {
     "jupyter.notebookFileRoot": "${fileDirname}/..",
     "python.testing.pytestArgs": [
-        "usl_models", "--rootdir=usl_models", "-k", "not integration"
+        "usl_models"
     ],
-    "python.testing.cwd": "usl_models",
     "python.testing.unittestEnabled": false,
-    "python.testing.pytestEnabled": true
+    "python.testing.pytestEnabled": true,
+    "python.testing.pytestPath": "pytest"
 }
1,585 changes: 1,585 additions & 0 deletions usl_models/notebooks/train_atmo_model.ipynb

Large diffs are not rendered by default.

87 changes: 87 additions & 0 deletions usl_models/scripts/run_atmo_training_job.py
@@ -0,0 +1,87 @@
import argparse
import distutils.core
import os
import pathlib

from google.cloud import aiplatform
from google.cloud import storage # type:ignore[attr-defined]

IMAGE = "us-docker.pkg.dev/vertex-ai/training/tf-gpu.2-14.py310:latest"


def main():
cli_args = _parse_args()

# Build a source distribution for the usl_models package.
this_file = pathlib.Path(os.path.realpath(__file__))
package_dir = this_file.parent.parent
setup_path = package_dir / "setup.py"
distutils.core.run_setup(setup_path, script_args=["sdist", "--format=gztar"])

# Upload the usl_models package to GCS.
source_dist_path = package_dir / "dist" / "usl_models-0.0.0.tar.gz"
client = storage.Client(project="climateiq")
bucket = client.bucket("climateiq-vertexai")
bucket.blob("usl_models-0.0.0.tar.gz").upload_from_filename(str(source_dist_path))

# Run the training script trainer/atmo_task.py in Vertex AI.
job = aiplatform.CustomPythonPackageTrainingJob(
display_name=cli_args.model_name,
python_package_gcs_uri="gs://climateiq-vertexai/usl_models-0.0.0.tar.gz",
python_module_name="trainer.atmo_task",
container_uri=IMAGE,
model_serving_container_image_uri=IMAGE,
staging_bucket="gs://climateiq-vertexai",
)

job_args = ["--sim-names", *cli_args.sim_names]
if cli_args.epochs:
job_args.extend(("--epochs", str(cli_args.epochs)))
if cli_args.batch_size:
job_args.extend(("--batch-size", str(cli_args.batch_size)))
if cli_args.model_name:
job_args.extend(("--model-name", cli_args.model_name))

print(f"Creating training job with arguments {job_args}")
job.run(
model_display_name="atmo-model",
args=job_args,
replica_count=1,
machine_type="a2-highgpu-1g",
accelerator_type="NVIDIA_TESLA_A100",
accelerator_count=1,
sync=True,
)


def _parse_args():
parser = argparse.ArgumentParser()
parser.add_argument(
"--model-name",
dest="model_name",
type=str,
help="A name for the model.",
required=True,
)
parser.add_argument("--epochs", dest="epochs", type=int, help="Number of epochs.")
parser.add_argument(
"--batch-size", dest="batch_size", type=int, help="Size of a batch."
)
parser.add_argument(
"--sim-names",
dest="sim_names",
nargs="+",
type=str,
required=True,
help=(
"Space-separated set of simulations to train the model against, e.g. "
"--sim-names "
"NYC_Heat_Test/NYC_summer_ "
"Phoenix_Heat_Test/Phoenix_summer_"
),
)
return parser.parse_args()


if __name__ == "__main__":
main()
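For reference, the flag-forwarding logic in `main()` above can be exercised in isolation. This sketch (the `build_job_args` helper name and the sample values are hypothetical, introduced only for illustration) mirrors how the optional flags are appended to `job_args` only when the user supplied them:

```python
import argparse


def build_job_args(args: argparse.Namespace) -> list[str]:
    # Mirrors main(): --sim-names is always forwarded; the optional
    # flags are appended only when they were set on the command line.
    job_args = ["--sim-names", *args.sim_names]
    if args.epochs:
        job_args.extend(("--epochs", str(args.epochs)))
    if args.batch_size:
        job_args.extend(("--batch-size", str(args.batch_size)))
    if args.model_name:
        job_args.extend(("--model-name", args.model_name))
    return job_args


ns = argparse.Namespace(
    sim_names=["NYC_Heat_Test/NYC_summer_"],
    epochs=5,
    batch_size=None,
    model_name="atmo-test",
)
print(build_job_args(ns))
# → ['--sim-names', 'NYC_Heat_Test/NYC_summer_', '--epochs', '5', '--model-name', 'atmo-test']
```

Because falsy values are skipped, an unset `--batch-size` (here `None`) never reaches the Vertex AI job's argument list.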
8 changes: 4 additions & 4 deletions usl_models/tests/atmo_ml/atmo_model_test.py
@@ -7,10 +7,10 @@
 from usl_models.atmo_ml import constants
 from usl_models.atmo_ml import model_params

-_TEST_MAP_HEIGHT = 100
-_TEST_MAP_WIDTH = 100
-_TEST_SPATIAL_FEATURES = 17  # lu_index is now separate
-_TEST_SPATIOTEMPORAL_FEATURES = 9
+_TEST_MAP_HEIGHT = 200
+_TEST_MAP_WIDTH = 200
+_TEST_SPATIAL_FEATURES = 22  # lu_index is now separate
+_TEST_SPATIOTEMPORAL_FEATURES = 12
 _LU_INDEX_VOCAB_SIZE = 61
 _EMBEDDING_DIM = 8
8 changes: 0 additions & 8 deletions usl_models/tests/atmo_ml/atmo_utils_test.py
@@ -62,10 +62,6 @@ def test_split_time_step_pairs():
     expected_output = tf.constant(
         [
             [
-                [
-                    [[0, 1], [4, 5]],
-                    [[8, 9], [12, 13]],
-                ],
                 [
                     [[2, 3], [6, 7]],
                     [[10, 11], [14, 15]],
@@ -82,10 +78,6 @@
                 [
                     [[32, 33], [36, 37]],
                     [[40, 41], [44, 45]],
                 ],
-                [
-                    [[34, 35], [38, 39]],
-                    [[42, 43], [46, 47]],
-                ],
             ],
         ]
     )
106 changes: 106 additions & 0 deletions usl_models/tests/atmo_ml/dataset_test.py
@@ -0,0 +1,106 @@
# TODO: fix this file
import io

from unittest import mock
from unittest.mock import MagicMock

import numpy as np

import usl_models.testing
from usl_models.atmo_ml import dataset
from usl_models.atmo_ml import constants


def create_mock_blob(data, dtype=np.float32, allow_pickle=True):
"""Create a mock blob with simulated data and return it."""
blob = MagicMock()
buf = io.BytesIO()
np.save(buf, data.astype(dtype), allow_pickle=allow_pickle)
buf.seek(0)
blob.open.return_value = buf
return blob


class TestAtmoMLDataset(usl_models.testing.TestCase):
@mock.patch("google.cloud.storage.Client")
def test_load_dataset_structure(self, mock_storage_client):
"""Test creating AtmoML dataset from GCS with expected structure and shapes."""
# Mock GCS client and bucket
mock_storage_client_instance = mock_storage_client.return_value
mock_bucket = MagicMock()
mock_storage_client_instance.bucket.return_value = mock_bucket

num_days = 4
timesteps_per_day = 6
num_timesteps = num_days * timesteps_per_day
batch_size = 2

B = batch_size
H, W = constants.MAP_HEIGHT, constants.MAP_WIDTH
F_S = constants.NUM_SAPTIAL_FEATURES
F_ST = constants.NUM_SPATIOTEMPORAL_FEATURES
C = constants.OUTPUT_CHANNELS
T_I, T_O = constants.INPUT_TIME_STEPS, constants.OUTPUT_TIME_STEPS

# Simulate mock blobs for datasets
mock_spatial_blob = create_mock_blob(
np.random.rand(H, W, F_S).astype(np.float32)
)
mock_spatiotemporal_tensor = np.random.rand(H, W, F_ST).astype(np.float32)
mock_spatiotemporal_blobs = [
create_mock_blob(mock_spatiotemporal_tensor) for _ in range(num_timesteps)
]
mock_lu_index_blob = create_mock_blob(
np.random.randint(
low=0,
high=10,
size=(H, W),
).astype(np.int32)
)
mock_label_blobs = [
create_mock_blob(np.random.rand(H, W, C).astype(np.float32))
for _ in range(num_timesteps)
]

# Mock blob listing behavior to simulate folder structure
mock_bucket.list_blobs.side_effect = lambda prefix: {
"sim1/spatial": [mock_spatial_blob],
"sim1/spatiotemporal": mock_spatiotemporal_blobs,
"sim1/lu_index": [mock_lu_index_blob],
"sim1": mock_label_blobs,
}[prefix]

# Define bucket names and folder paths
data_bucket_name = "test-data-bucket"
label_bucket_name = "test-label-bucket"

# Call the function under test
ds = dataset.load_dataset(
data_bucket_name=data_bucket_name,
label_bucket_name=label_bucket_name,
sim_names=["sim1"],
timesteps_per_day=timesteps_per_day,
storage_client=mock_storage_client_instance,
)
ds = ds.batch(batch_size=batch_size)

inputs, labels = zip(*ds)
num_batches = num_days // batch_size
self.assertShapesRecursive(
list(inputs),
[
{
"spatiotemporal": (B, T_I, H, W, F_ST),
"spatial": (B, H, W, F_S),
"lu_index": (B, H, W),
}
]
* num_batches,
)
self.assertShapesRecursive(
list(labels),
[
(B, T_O, H, W, C),
]
* num_batches,
)
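The `create_mock_blob` helper above relies on NumPy round-tripping an array through an in-memory buffer to stand in for a GCS blob's `open()` stream. A minimal standalone sketch of that pattern (the array contents here are illustrative):

```python
import io

import numpy as np

# Serialize an array into an in-memory buffer, the same way
# create_mock_blob prepares the stream a mocked blob will return.
arr = np.arange(6, dtype=np.float32).reshape(2, 3)
buf = io.BytesIO()
np.save(buf, arr)
buf.seek(0)  # rewind so the reader sees the .npy header from the start

# np.load parses the .npy header and reconstructs the original array.
loaded = np.load(buf)
print(loaded.shape, loaded.dtype)  # → (2, 3) float32
```

Forgetting the `buf.seek(0)` is the classic failure mode here: `np.load` would then read from the end of the buffer and raise, which is why the helper rewinds before handing the stream to the mock.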