
Commit 654f703

addresses #4
1 parent ccc8e79 commit 654f703

File tree

2 files changed: +48 −35 lines


computation_scripts/master.py

+21 −21
@@ -413,27 +413,27 @@ def concatenate_outputs() -> None:
     if not qouts:
         raise FileNotFoundError("No Qout files found. RAPID probably not run correctly.")

-    unique_start_dates = sorted({os.path.basename(f).split('_')[2] for f in qouts})
-
-    for unique_start_date in unique_start_dates:
-        with xr.open_zarr(local_zarr) as retro_ds:
-            chunks = retro_ds.chunks
-            with xr.open_mfdataset(
-                    [qout for qout in qouts if unique_start_date in qout],
-                    combine='nested',
-                    concat_dim='rivid',
-                    preprocess=drop_coords
-            ).reindex(rivid=retro_ds['rivid']) as new_ds:
-                earliest_date = np.datetime_as_string(new_ds.time[0].values, unit="h")
-                latest_date = np.datetime_as_string(new_ds.time[-1].values, unit="h")
-                CL.log_message('RUNNING', f'Appending to zarr: {earliest_date} to {latest_date}')
-                logging.info(f'Appending to zarr: {earliest_date} to {latest_date}')
-                (
-                    new_ds
-                    .chunk({"time": chunks["time"][0], "rivid": chunks["rivid"][0]})
-                    .to_zarr(local_zarr, mode='a', append_dim='time', consolidated=True)
-                )
-                logging.info(f'Finished appending')
+    with xr.open_zarr(local_zarr) as retro_ds:
+        chunks = retro_ds.chunks
+        with xr.open_mfdataset(
+                qouts,
+                combine='nested',
+                concat_dim='rivid',
+                parallel=True,
+                preprocess=drop_coords
+        ).reindex(rivid=retro_ds['rivid']) as new_ds:
+            earliest_date = np.datetime_as_string(new_ds.time[0].values, unit="h")
+            latest_date = np.datetime_as_string(new_ds.time[-1].values, unit="h")
+            new_ds = new_ds.round(decimals=3)
+            new_ds = new_ds.where(new_ds['Qout'] >= 0.0, 0.0)
+            CL.log_message('RUNNING', f'Appending to zarr: {earliest_date} to {latest_date}')
+            logging.info(f'Appending to zarr: {earliest_date} to {latest_date}')
+            (
+                new_ds
+                .chunk({"time": chunks["time"][0], "rivid": chunks["rivid"][0]})
+                .to_zarr(local_zarr, mode='a', append_dim='time', consolidated=True)
+            )
+            logging.info(f'Finished appending')
     return

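The new code path drops the per-start-date loop: all Qout files are opened in one open_mfdataset call, the discharge is rounded to three decimals and negative values are clamped to zero, and the result is appended to the store in a single pass. A minimal standalone sketch of the same pattern, assuming a hypothetical zarr store at retro.zarr and Qout NetCDF files under outputs/ (these stand in for the script's local_zarr and qouts; the project's drop_coords preprocess hook is omitted, and parallel=True requires dask):

import glob

import xarray as xr

# hypothetical stand-ins for the script's local_zarr and qouts
local_zarr = 'retro.zarr'
qouts = sorted(glob.glob('outputs/Qout_*.nc'))

with xr.open_zarr(local_zarr) as retro_ds:
    chunks = retro_ds.chunks  # reuse the chunk sizes already in the store
    with xr.open_mfdataset(qouts, combine='nested', concat_dim='rivid', parallel=True) as new_ds:
        new_ds = new_ds.reindex(rivid=retro_ds['rivid'])   # align river order with the store
        new_ds = new_ds.round(decimals=3)                  # trim floating-point noise
        new_ds = new_ds.where(new_ds['Qout'] >= 0.0, 0.0)  # clamp negative discharge to zero
        (
            new_ds
            .chunk({'time': chunks['time'][0], 'rivid': chunks['rivid'][0]})
            .to_zarr(local_zarr, mode='a', append_dim='time', consolidated=True)
        )

Rechunking to the store's existing chunk sizes before to_zarr keeps the appended data compatible with the chunk layout already on disk, which appending along time requires.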
downloader_scripts/check_and_download_era.py

+27 −14
@@ -164,17 +164,30 @@ def download_era5() -> None:
         return

     print('converting to daily cumulative')
-    for downloaded_file in downloaded_files:
-        daily_cumulative_file_name = os.path.basename(downloaded_file).replace('.nc', '_daily_cumulative.nc')
-        with xr.open_dataset(downloaded_file) as ds:
-            print(f'processing {downloaded_file}')
-
-            if ds['time'].shape[0] == 0:
-                print(f'No time steps were downloaded- the shape of the time array is 0.')
-                print(f'Removing {downloaded_file}')
-                os.remove(downloaded_file)
-                continue
-
+    year_1 = year_month_combos[0][0]
+    month_1 = year_month_combos[0][1]
+    if len(year_month_combos) > 1:
+        year_2 = year_month_combos[1][0]
+        month_2 = year_month_combos[1][1]
+    else:
+        year_2 = year_1
+        month_2 = month_1
+    day_1 = min({d.day for d in date_range if d.year == year_1 and d.month == month_1})
+    day_2 = max({d.day for d in date_range if d.year == year_2 and d.month == month_2})
+    daily_cumulative_file_name = f'era5_{year_1}{str(month_1).zfill(2)}{str(day_1).zfill(2)}-{year_2}{str(month_2).zfill(2)}{str(day_2).zfill(2)}_daily_cumulative.nc'
+    with xr.open_mfdataset(downloaded_files,
+                           concat_dim='time',
+                           combine='nested',
+                           parallel=True,
+                           chunks={'time': 'auto', 'lat': 'auto', 'lon': 'auto'},  # prevents odd slicing behavior and missing data
+                           ) as ds:
+        print(f'processing {", ".join(downloaded_files)}')
+
+        if ds['time'].shape[0] == 0:
+            print('No time steps were downloaded - the shape of the time array is 0.')
+            print(f'Removing {", ".join(downloaded_files)}')
+            {os.remove(downloaded_file) for downloaded_file in downloaded_files}
+        else:
             if 'expver' in ds.dims:
                 print('expver in dims')
                 # find the time steps where the runoff is not nan when expver=1

@@ -211,10 +224,10 @@ def download_era5() -> None:
             ds.to_netcdf(os.path.join(era_dir, daily_cumulative_file_name))
             print(f'uploading {daily_cumulative_file_name}')
             subprocess.call(['aws', 's3', 'cp', os.path.join(era_dir, daily_cumulative_file_name),
-                             os.path.join(s3_era_bucket, os.path.basename(downloaded_file))])
+                             os.path.join(s3_era_bucket, os.path.basename(daily_cumulative_file_name))])

-            # remove the original file
-            os.remove(downloaded_file)
+            # remove the original files
+            {os.remove(downloaded_file) for downloaded_file in downloaded_files}

             # remove the consolidated file
             os.remove(os.path.join(era_dir, daily_cumulative_file_name))
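The monthly files are now merged into one consolidated dataset whose filename encodes the full date span of the download window rather than a per-file name. A runnable sketch of that naming logic, with hypothetical year_month_combos and date_range values standing in for what the real script derives from the requested dates (using :02d formatting in place of str().zfill(2), with the same result):

import datetime

# hypothetical inputs: two consecutive months covering 2024-01-15 through 2024-02-18
year_month_combos = [(2024, 1), (2024, 2)]
date_range = [datetime.date(2024, 1, 15) + datetime.timedelta(days=i) for i in range(35)]

year_1, month_1 = year_month_combos[0]
if len(year_month_combos) > 1:
    year_2, month_2 = year_month_combos[1]
else:
    year_2, month_2 = year_1, month_1

# first day present in the first month, last day present in the last month
day_1 = min(d.day for d in date_range if d.year == year_1 and d.month == month_1)
day_2 = max(d.day for d in date_range if d.year == year_2 and d.month == month_2)

name = f'era5_{year_1}{month_1:02d}{day_1:02d}-{year_2}{month_2:02d}{day_2:02d}_daily_cumulative.nc'
print(name)  # era5_20240115-20240218_daily_cumulative.nc

With these inputs the sketch prints era5_20240115-20240218_daily_cumulative.nc, matching the span-based name the script now uploads to S3.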
