Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

On ppan, use hsmget to copy model files before opening #144

Merged
merged 3 commits into from
Feb 13, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 3 additions & 1 deletion diagnostics/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -3,4 +3,6 @@ This folder provides scripts, that can be utilized for analyzing model results.
Please contact the code manager Yi-Cheng Teng (Contact: [email protected]) if you have any questions.

## Prerequisite
"The majority of the scripts are written in Python. One can follow the instructions (README.md under tools folder) to create a Python virtual environment on NOAA R&D HPCs or GFDL Analysis.
The majority of the scripts are written in Python. One can follow the instructions (README.md under tools folder) to create a Python virtual environment on NOAA R&D HPCs or GFDL Analysis.

The Python scripts that read model data are written to take advantage of the `hsmget` command for faster reading and caching of data from the archive filesystem. When using GFDL Analysis, it is recommended to load the hsm module (`module load hsm/1.3.0`) before running any of the diagnostic scripts.
87 changes: 73 additions & 14 deletions diagnostics/physics/plot_common.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,16 @@
import cartopy.crs as ccrs
from cartopy.mpl.ticker import LongitudeFormatter, LatitudeFormatter, LatitudeLocator, LongitudeLocator
from dataclasses import dataclass
from getpass import getuser
from glob import glob
from matplotlib.colors import BoundaryNorm, ListedColormap
import matplotlib.pyplot as plt
import numpy as np
import os
from pathlib import Path
import re
from subprocess import run, DEVNULL
from shutil import which
import xarray
import xskillscore
import logging
Expand All @@ -14,6 +20,50 @@
# Configure logging for plot_common
logger = logging.getLogger(__name__)

# hsmget, available on GFDL PPAN, will make it faster, easier, and safer
# to read data from /archive.
# To use this, run `module load hsm/1.3.0` beforehand.
@dataclass
class HSMGet():
    """Stage files from /archive onto faster local storage using hsmget.

    Call an instance with a single Path or an iterable of Paths under
    ``archive``. When hsmget (or the expected filesystems) are unavailable,
    the input path(s) are returned unchanged, so callers can open the
    returned path(s) either way.
    """
    archive: Path = Path('/') # this will duplicate paths used by frepp
    ptmp: Path = Path('/ptmp') / getuser()
    tmp: Path = Path(os.environ.get('TMPDIR', ptmp))

    def _run(self, cmd, stdout=DEVNULL, stderr=DEVNULL):
        # This will escape things like (1) in the file name
        # so that it can be run as a shell command.
        esc = re.sub(r'([\(\)])', r'\\\1', cmd)
        return run(esc, shell=True, check=True, stdout=stdout, stderr=stderr)

    def _dirs_exist(self):
        # All three filesystems (archive source, ptmp cache, tmp work area)
        # must be present for hsmget to be usable.
        return self.archive.is_dir() and self.ptmp.is_dir() and self.tmp.is_dir()

    def __call__(self, path_or_paths):
        """Stage one Path or an iterable of Paths; return the staged path(s).

        Returns the input unchanged when hsmget cannot be used, a single
        Path under ``tmp`` for a single Path input, or a list of Paths
        under ``tmp`` for an iterable input.

        Raises:
            Exception: if the argument is neither a Path nor an iterable.
            ValueError: if a path does not lie under ``archive``.
        """
        if which('hsmget') is None or not self._dirs_exist():
            # If hsmget or /archive, /ptmp, etc are not available, this will just return the input path(s).
            logger.info('Not using hsmget. If running on GFDL analysis, run `module load hsm/1.3.0` to enable using hsmget. ')
            return path_or_paths
        elif isinstance(path_or_paths, Path):
            # Find the file path on archive, relative to the root part of archive.
            # (This is how hsmget will want it).
            relative = path_or_paths.relative_to(self.archive)
            # hsmget will do the dmget first and this is fine since it's one file
            cmd = f'hsmget -q -a {self.archive} -w {self.tmp} -p {self.ptmp} {relative}'
            self._run(cmd)
            return (self.tmp / relative)
        elif hasattr(path_or_paths, '__iter__'):
            # Materialize once: the input may be a one-shot generator and
            # we need to traverse it twice (dmget, then hsmget).
            paths = list(path_or_paths)
            if not paths:
                # Nothing to stage; avoid running dmget/hsmget with no arguments.
                return []
            # dmget all files with one dmget command.
            p_str = ' '.join([p.as_posix() for p in paths])
            self._run(f'dmget {p_str}')
            relative = [p.relative_to(self.archive) for p in paths]
            rel_str = ' '.join(map(str, relative))
            cmd = f'hsmget -q -a {self.archive} -w {self.tmp} -p {self.ptmp} {rel_str}'
            self._run(cmd)
            return [self.tmp / r for r in relative]
        else:
            # Note: previously this branch was unreachable because
            # `iter(x)` raises TypeError for non-iterables; using
            # hasattr(x, '__iter__') above makes this error message fire.
            raise Exception('Need a Path or iterable of Paths to get')


def center_to_outer(center, left=None, right=None):
"""
Given an array of center coordinates, find the edge coordinates,
Expand Down Expand Up @@ -138,23 +188,32 @@ def add_ticks(ax, xticks=np.arange(-100, -31, 1), yticks=np.arange(2, 61, 1), xl
ax.xaxis.set_major_formatter(lon_formatter)
ax.yaxis.set_major_formatter(lat_formatter)

def open_var(pp_root, kind, var, hsmget=None):
    """Open one postprocessed model variable as an xarray object.

    Args:
        pp_root: str or Path root of the post-processing tree (above 'pp').
        var: variable name, used both in the file name and to index the dataset.
        hsmget: optional HSMGet instance used to stage files from /archive;
            defaults to a fresh HSMGet() (avoids a shared mutable default
            constructed at import time).

    Returns:
        The variable read with xarray.open_dataset (single long file) or
        xarray.open_mfdataset (yearly or 5-yearly chunked files).

    Raises:
        Exception: if no matching postprocessed files are found.
    """
    if hsmget is None:
        hsmget = HSMGet()
    if isinstance(pp_root, str):
        pp_root = Path(pp_root)
    freq = 'daily' if 'daily' in kind else 'monthly'
    longslice = '19930101-20191231' if freq == 'daily' else '199301-201912'
    # For now, replicate previous behavior by looking for a single file covering 1993--2019,
    # and if not found then looking for multiple files written in either yearly or 5-yearly chunks.
    # Later this can be replaced to more intelligently look for the right file(s).
    longfile = pp_root / 'pp' / kind / 'ts' / freq / '27yr' / f'{kind}.{longslice}.{var}.nc'
    if longfile.exists():
        tmpfile = hsmget(longfile)
        return xarray.open_dataset(tmpfile, decode_timedelta=True)[var] # Avoid FutureWarning about decode_timedelta
    possible_chunks = [1, 5]
    for chunk in possible_chunks:
        short_dir = pp_root / 'pp' / kind / 'ts' / freq / f'{chunk}yr'
        if short_dir.is_dir():
            break
    else:
        # No-break: neither the 1yr nor the 5yr directory exists.
        raise Exception('Did not find directory for postprocessed files')
    # Sort once here (glob order is arbitrary) so files open in time order.
    short_files = sorted(short_dir.glob(f'{kind}.*.{var}.nc'))
    if short_files:
        tmpfiles = hsmget(short_files)
        return xarray.open_mfdataset(tmpfiles, decode_timedelta=True)[var] # Avoid FutureWarning about decode_timedelta
    else:
        raise Exception('Did not find postprocessed files')

def save_figure(fname, label='', pdf=False, output_dir='figures'):
if label == '':
Expand Down