Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

On ppan, use hsmget to copy model files before opening #144

Merged
merged 3 commits into from
Feb 13, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 3 additions & 1 deletion diagnostics/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -3,4 +3,6 @@ This folder provides scripts, that can be utilized for analyzing model results.
Please contact the code manager Yi-Cheng Teng (Contact: [email protected]) if you have any questions.

## Prerequisite
"The majority of the scripts are written in Python. One can follow the instructions (README.md under tools folder) to create a Python virtual environment on NOAA R&D HPCs or GFDL Analysis.
The majority of the scripts are written in Python. One can follow the instructions (README.md under tools folder) to create a Python virtual environment on NOAA R&D HPCs or GFDL Analysis.

The Python scripts that read model data are written to take advantage of the `hsmget` command for faster reading and caching of data from the archive filesystem. When using GFDL Analysis, it is recommended to load the hsm module (`module load hsm/1.3.0`) before running any of the diagnostic scripts.
87 changes: 73 additions & 14 deletions diagnostics/physics/plot_common.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,16 @@
import cartopy.crs as ccrs
from cartopy.mpl.ticker import LongitudeFormatter, LatitudeFormatter, LatitudeLocator, LongitudeLocator
from dataclasses import dataclass
from getpass import getuser
from glob import glob
from matplotlib.colors import BoundaryNorm, ListedColormap
import matplotlib.pyplot as plt
import numpy as np
import os
from pathlib import Path
import re
from subprocess import run, DEVNULL
from shutil import which
import xarray
import xskillscore
import logging
Expand All @@ -14,6 +20,50 @@
# Configure logging for plot_common
logger = logging.getLogger(__name__)

# hsmget, available on GFDL PPAN, will make it faster, easier, and safer
# to read data from /archive.
# To use this, run `module load hsm/1.3.0` beforehand.
@dataclass
class HSMGet():
    """Stage files from /archive onto faster local storage using hsmget.

    Call an instance with a single Path or an iterable of Paths under
    ``archive``. When hsmget (or the expected filesystems) are unavailable,
    the input path(s) are returned unchanged, so callers can open the
    returned path(s) either way.
    """
    archive: Path = Path('/') # this will duplicate paths used by frepp
    ptmp: Path = Path('/ptmp') / getuser()
    tmp: Path = Path(os.environ.get('TMPDIR', ptmp))

    def _run(self, cmd, stdout=DEVNULL, stderr=DEVNULL):
        # This will escape things like (1) in the file name
        # so that it can be run as a shell command.
        esc = re.sub(r'([\(\)])', r'\\\1', cmd)
        return run(esc, shell=True, check=True, stdout=stdout, stderr=stderr)

    def _dirs_exist(self):
        # All three filesystems (archive source, ptmp cache, tmp work area)
        # must be present for hsmget to be usable.
        return self.archive.is_dir() and self.ptmp.is_dir() and self.tmp.is_dir()

    def __call__(self, path_or_paths):
        """Stage one Path or an iterable of Paths; return the staged path(s).

        Returns the input unchanged when hsmget cannot be used, a single
        Path under ``tmp`` for a single Path input, or a list of Paths
        under ``tmp`` for an iterable input.

        Raises:
            Exception: if the argument is neither a Path nor an iterable.
            ValueError: if a path does not lie under ``archive``.
        """
        if which('hsmget') is None or not self._dirs_exist():
            # If hsmget or /archive, /ptmp, etc are not available, this will just return the input path(s).
            logger.info('Not using hsmget. If running on GFDL analysis, run `module load hsm/1.3.0` to enable using hsmget. ')
            return path_or_paths
        elif isinstance(path_or_paths, Path):
            # Find the file path on archive, relative to the root part of archive.
            # (This is how hsmget will want it).
            relative = path_or_paths.relative_to(self.archive)
            # hsmget will do the dmget first and this is fine since it's one file
            cmd = f'hsmget -q -a {self.archive} -w {self.tmp} -p {self.ptmp} {relative}'
            self._run(cmd)
            return (self.tmp / relative)
        elif hasattr(path_or_paths, '__iter__'):
            # Materialize once: the input may be a one-shot generator and
            # we need to traverse it twice (dmget, then hsmget).
            paths = list(path_or_paths)
            if not paths:
                # Nothing to stage; avoid running dmget/hsmget with no arguments.
                return []
            # dmget all files with one dmget command.
            p_str = ' '.join([p.as_posix() for p in paths])
            self._run(f'dmget {p_str}')
            relative = [p.relative_to(self.archive) for p in paths]
            rel_str = ' '.join(map(str, relative))
            cmd = f'hsmget -q -a {self.archive} -w {self.tmp} -p {self.ptmp} {rel_str}'
            self._run(cmd)
            return [self.tmp / r for r in relative]
        else:
            # Note: previously this branch was unreachable because
            # `iter(x)` raises TypeError for non-iterables; using
            # hasattr(x, '__iter__') above makes this error message fire.
            raise Exception('Need a Path or iterable of Paths to get')


def center_to_outer(center, left=None, right=None):
"""
Given an array of center coordinates, find the edge coordinates,
Expand Down Expand Up @@ -138,23 +188,32 @@ def add_ticks(ax, xticks=np.arange(-100, -31, 1), yticks=np.arange(2, 61, 1), xl
ax.xaxis.set_major_formatter(lon_formatter)
ax.yaxis.set_major_formatter(lat_formatter)

def open_var(pp_root, kind, var, hsmget=None):
    """Open one postprocessed model variable as an xarray object.

    Args:
        pp_root: str or Path root of the post-processing tree (above 'pp').
        var: variable name, used both in the file name and to index the dataset.
        hsmget: optional HSMGet instance used to stage files from /archive;
            defaults to a fresh HSMGet() (avoids a shared mutable default
            constructed at import time).

    Returns:
        The variable read with xarray.open_dataset (single long file) or
        xarray.open_mfdataset (yearly or 5-yearly chunked files).

    Raises:
        Exception: if no matching postprocessed files are found.
    """
    if hsmget is None:
        hsmget = HSMGet()
    if isinstance(pp_root, str):
        pp_root = Path(pp_root)
    freq = 'daily' if 'daily' in kind else 'monthly'
    longslice = '19930101-20191231' if freq == 'daily' else '199301-201912'
    # For now, replicate previous behavior by looking for a single file covering 1993--2019,
    # and if not found then looking for multiple files written in either yearly or 5-yearly chunks.
    # Later this can be replaced to more intelligently look for the right file(s).
    longfile = pp_root / 'pp' / kind / 'ts' / freq / '27yr' / f'{kind}.{longslice}.{var}.nc'
    if longfile.exists():
        tmpfile = hsmget(longfile)
        return xarray.open_dataset(tmpfile, decode_timedelta=True)[var] # Avoid FutureWarning about decode_timedelta
    possible_chunks = [1, 5]
    for chunk in possible_chunks:
        short_dir = pp_root / 'pp' / kind / 'ts' / freq / f'{chunk}yr'
        if short_dir.is_dir():
            break
    else:
        # No-break: neither the 1yr nor the 5yr directory exists.
        raise Exception('Did not find directory for postprocessed files')
    # Sort once here (glob order is arbitrary) so files open in time order.
    short_files = sorted(short_dir.glob(f'{kind}.*.{var}.nc'))
    if short_files:
        tmpfiles = hsmget(short_files)
        return xarray.open_mfdataset(tmpfiles, decode_timedelta=True)[var] # Avoid FutureWarning about decode_timedelta
    else:
        raise Exception('Did not find postprocessed files')

def save_figure(fname, label='', pdf=False, output_dir='figures'):
if label == '':
Expand Down