From 3e4f4fe6d504a7555485685fbd96bdcbd00274cf Mon Sep 17 00:00:00 2001
From: Ed Slavich
Date: Fri, 21 May 2021 15:16:31 -0400
Subject: [PATCH] Add migrate_data script to make older x1d files readable by
 later versions of jwst

---
 CHANGES.rst                           |   8 +-
 docs/jwst/data_products/index.rst     |   4 +-
 docs/jwst/data_products/migrating.rst |  30 ++++++
 jwst/regtest/conftest.py              |  10 +-
 jwst/regtest/regtestdata.py           |  20 ++--
 jwst/regtest/test_migrate_data.py     |  43 ++++++++
 scripts/migrate_data                  | 142 ++++++++++++++++++++++++++
 7 files changed, 238 insertions(+), 19 deletions(-)
 create mode 100644 docs/jwst/data_products/migrating.rst
 create mode 100644 jwst/regtest/test_migrate_data.py
 create mode 100755 scripts/migrate_data

diff --git a/CHANGES.rst b/CHANGES.rst
index 71b359dcf4..941b3fa378 100644
--- a/CHANGES.rst
+++ b/CHANGES.rst
@@ -11,7 +11,7 @@ combine_1d
 
 - Added SRCTYPE to COMBINE1D output extension headers, propagated from
   EXTRACT1D inputs [#6079]
-
+
 cube_build
 ----------
 
@@ -40,6 +40,12 @@ lib
 
 - Updated set_telescope_pointing to populate ENGQLPTG keyword with new
   allowed values [#6088]
 
+scripts
+-------
+
+- Add migrate_data command with support for migrating spec_table in
+  x1d files produced with versions <= 1.1.0 of this package. [#6055]
+
 1.2.0 (2021-05-24)
 ==================
 
diff --git a/docs/jwst/data_products/index.rst b/docs/jwst/data_products/index.rst
index 1d5882052e..d9cc684110 100644
--- a/docs/jwst/data_products/index.rst
+++ b/docs/jwst/data_products/index.rst
@@ -5,7 +5,7 @@ Data Products Information
 
 .. toctree::
    :maxdepth: 3
-
+
    stages.rst
    file_naming.rst
    product_types.rst
@@ -13,4 +13,4 @@ Data Products Information
    science_products.rst
    nonscience_products.rst
    guidestar_products.rst
-
+   migrating.rst
diff --git a/docs/jwst/data_products/migrating.rst b/docs/jwst/data_products/migrating.rst
new file mode 100644
index 0000000000..b9b52b648c
--- /dev/null
+++ b/docs/jwst/data_products/migrating.rst
@@ -0,0 +1,30 @@
+Migrating deprecated products
+-----------------------------
+
+On rare occasions, the model schemas are changed in ways that break
+compatibility with data products produced by earlier versions
+of this package. When these older files are opened, the software
+will report validation errors:
+
+.. doctest-skip::
+
+    >>> from jwst import datamodels
+    >>> datamodels.open("jw95115001001_02102_00001_nrs1_x1d.fits")
+    ...
+    ValueError: Column names don't match schema...
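+
+The version of this package that produced a file is recorded in its
+``CAL_VER`` primary header keyword (the same keyword the migration
+tool consults), so it can be inspected before attempting a migration.
+For example (the value shown here is illustrative):
+
+.. doctest-skip::
+
+    >>> from astropy.io import fits
+    >>> fits.getval("jw95115001001_02102_00001_nrs1_x1d.fits", "CAL_VER")
+    '1.1.0'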
+
+In some cases it will be possible to update the file to the
+new format using the ``migrate_data`` tool included with this package:
+::
+
+    $ migrate_data jw95115001001_02102_00001_nrs1_x1d.fits --in-place
+
+It can also be run on multiple files:
+::
+
+    $ migrate_data *_x1d.fits --in-place
+
+Or configured to write updated files to a separate output directory:
+::
+
+    $ migrate_data *_x1d.fits --output-dir some/other/directory
diff --git a/jwst/regtest/conftest.py b/jwst/regtest/conftest.py
index 257e40aa35..e9c903acce 100644
--- a/jwst/regtest/conftest.py
+++ b/jwst/regtest/conftest.py
@@ -182,20 +182,18 @@ def generate_upload_schema(pattern, target, recursive=False):
 def _rtdata_fixture_implementation(artifactory_repos, envopt, request):
     """Provides the RemoteResource class"""
     inputs_root, results_root = artifactory_repos
-    rtdata = RegtestData(env=envopt, inputs_root=inputs_root,
-                         results_root=results_root)
-
-    yield rtdata
+    return RegtestData(env=envopt, inputs_root=inputs_root,
+                       results_root=results_root)
 
 
 @pytest.fixture(scope='function')
 def rtdata(artifactory_repos, envopt, request, _jail):
-    yield from _rtdata_fixture_implementation(artifactory_repos, envopt, request)
+    return _rtdata_fixture_implementation(artifactory_repos, envopt, request)
 
 
 @pytest.fixture(scope='module')
 def rtdata_module(artifactory_repos, envopt, request, jail):
-    yield from _rtdata_fixture_implementation(artifactory_repos, envopt, request)
+    return _rtdata_fixture_implementation(artifactory_repos, envopt, request)
 
 
 @pytest.fixture
diff --git a/jwst/regtest/regtestdata.py b/jwst/regtest/regtestdata.py
index 8bbc9972fc..91c58cb0ab 100644
--- a/jwst/regtest/regtestdata.py
+++ b/jwst/regtest/regtestdata.py
@@ -34,7 +34,7 @@ def __init__(self, env="dev", inputs_root="jwst-pipeline",
                  input=None, input_remote=None, output=None, truth=None,
                  truth_remote=None, remote_results_path=None, test_name=None,
                  traceback=None, **kwargs):
-        self._env = env
+        self.env = env
         self._inputs_root = inputs_root
         self._results_root = results_root
         self._bigdata_root = get_bigdata_root()
@@ -142,9 +142,9 @@ def get_data(self, path=None, docopy=None):
             self.input_remote = path
         if docopy is None:
             docopy = self.docopy
-        self.input = get_bigdata(self._inputs_root, self._env, path,
+        self.input = get_bigdata(self._inputs_root, self.env, path,
                                  docopy=docopy)
-        self.input_remote = os.path.join(self._inputs_root, self._env, path)
+        self.input_remote = os.path.join(self._inputs_root, self.env, path)
 
         return self.input
 
@@ -161,13 +161,13 @@ def data_glob(self, path=None, glob='*', docopy=None):
         # is a local path or URL.
root = self.bigdata_root if op.exists(root): - root_path = op.join(root, self._inputs_root, self._env) + root_path = op.join(root, self._inputs_root, self.env) root_len = len(root_path) + 1 path = op.join(root_path, path) file_paths = _data_glob_local(path, glob) elif check_url(root): - root_len = len(self._env) + 1 - file_paths = _data_glob_url(self._inputs_root, self._env, path, glob, root=root) + root_len = len(self.env) + 1 + file_paths = _data_glob_url(self._inputs_root, self.env, path, glob, root=root) else: raise BigdataError('Path cannot be found: {}'.format(path)) @@ -192,9 +192,9 @@ def get_truth(self, path=None, docopy=None): os.makedirs('truth', exist_ok=True) os.chdir('truth') try: - self.truth = get_bigdata(self._inputs_root, self._env, path, + self.truth = get_bigdata(self._inputs_root, self.env, path, docopy=docopy) - self.truth_remote = os.path.join(self._inputs_root, self._env, path) + self.truth_remote = os.path.join(self._inputs_root, self.env, path) except BigdataError: os.chdir('..') raise @@ -232,7 +232,7 @@ def get_asn(self, path=None, docopy=True, get_members=True): docopy = self.docopy # Get the association JSON file - self.input = get_bigdata(self._inputs_root, self._env, path, + self.input = get_bigdata(self._inputs_root, self.env, path, docopy=docopy) with open(self.input) as fp: asn = load_asn(fp) @@ -245,7 +245,7 @@ def get_asn(self, path=None, docopy=True, get_members=True): fullpath = os.path.join( os.path.dirname(self.input_remote), member['expname']) - get_bigdata(self._inputs_root, self._env, fullpath, + get_bigdata(self._inputs_root, self.env, fullpath, docopy=self.docopy) def to_asdf(self, path): diff --git a/jwst/regtest/test_migrate_data.py b/jwst/regtest/test_migrate_data.py new file mode 100644 index 0000000000..ae66bc394c --- /dev/null +++ b/jwst/regtest/test_migrate_data.py @@ -0,0 +1,43 @@ +""" +Tests of the migrate_data script, which attempts to update +files that have become invalid due to changes in model schemas +between versions of this package. + +Obtains examples of files from artifactory truth data for +older releases. +""" +import subprocess + +from astropy.io import fits +import pytest + +from jwst import datamodels + + +@pytest.fixture(autouse=True) +def strict_validation(monkeypatch): + monkeypatch.setenv("STRICT_VALIDATION", "true") + yield + + +@pytest.mark.bigdata +@pytest.mark.parametrize("truth_path", [ + "truth/test_miri_lrs_slit_spec2/jw00623032001_03102_00001_mirimage_x1d.fits", + "truth/test_nirspec_mos_spec2/f170lp-g235m_mos_observation-6-c0e0_001_dn_nrs1_mod_x1d.fits", +], ids=["miri-lrs-x1d", "nirspec-mos-x1d"]) +def test_x1d_spec_table(truth_path, rtdata): + rtdata.env = "1.1.0" + rtdata.get_truth(truth_path) + + # Confirm that the file doesn't initially validate + # (open with fits first so that the failed call to open doesn't leave behind an open file) + with fits.open(rtdata.truth, memmap=False) as hdu: + with pytest.raises(ValueError, match="Column names don't match schema"): + with datamodels.open(hdu): + pass + + subprocess.check_call(["migrate_data", rtdata.truth, "--in-place"]) + + # Now the model should validate + with datamodels.open(rtdata.truth): + pass diff --git a/scripts/migrate_data b/scripts/migrate_data new file mode 100755 index 0000000000..c36d95a894 --- /dev/null +++ b/scripts/migrate_data @@ -0,0 +1,142 @@ +#!/usr/bin/env python3 +""" +Migrate .fits files whose format has changed between jwst package versions. 
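+
+Example usage, as documented in docs/jwst/data_products/migrating.rst:
+
+    migrate_data jw95115001001_02102_00001_nrs1_x1d.fits --in-place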
+""" +import argparse +from datetime import datetime +import os +import re +import traceback +import warnings + +import asdf +from astropy.io import fits +import numpy as np +from packaging.specifiers import SpecifierSet + +import jwst +from jwst import datamodels + + +def parse_args(): + parser = argparse.ArgumentParser("migrate_data", "migrate .fits files whose format has changed between jwst package versions") + + parser.add_argument("files", nargs="+", help="one or more .fits files") + + output_group = parser.add_mutually_exclusive_group(required=True) + output_group.add_argument("--output-dir", help="write modified files to an output directory") + output_group.add_argument("--in-place", help="modify files in-place", action="store_true") + + return parser.parse_args() + + +# If there get to be many of these we may want to move +# them to jwst.datamodels somewhere: + +def migrate_spec_table_1_1_0(hdul): + """ + spectable.schema added additional columns and renamed + two columns. + """ + schema = asdf.schema.load_schema("http://stsci.edu/schemas/jwst_datamodel/spectable.schema") + dtype = asdf.tags.core.ndarray.asdf_datatype_to_numpy_dtype(schema["datatype"]) + renamed_columns = { + "ERROR": "FLUX_ERROR", + "BERROR": "BKGD_ERROR", + } + + for hdu in hdul: + if hdu.name == "EXTRACT1D": + new_data = np.zeros(hdu.data.shape, dtype=dtype) + for column_name in hdu.data.dtype.names: + new_data[renamed_columns.get(column_name, column_name)] = hdu.data[column_name] + hdu.data = new_data + + +# The first key is a model class name, the second +# a jwst package version specifier. The value +# is a method that accepts an HDUList and modifies +# it in-place. +_MIGRATE_METHODS = { + "SpecModel": { + "> 0.13.1, <= 1.1.0": migrate_spec_table_1_1_0, + }, + "MultiSpecModel": { + "> 0.13.1, <= 1.1.0": migrate_spec_table_1_1_0, + }, +} + + +def migrate_file(filename, args): + if args.in_place: + mode = "update" + else: + mode = "readonly" + + with fits.open(filename, memmap=False, mode=mode) as hdul: + model_type = hdul[0].header.get("DATAMODL") + jwst_version = hdul[0].header.get("CAL_VER") + + if not (model_type and jwst_version): + print(f"Unable to migrate {filename}: DATAMODL and CAL_VER keywords are required") + return + + match = re.match(r'^[0-9]+\.[0-9]+\.[0-9]+', jwst_version) + if match is None: + print(f"Unable to migrate {filename}: CAL_VER not understood") + return + jwst_version = match.group(0) + + if model_type not in _MIGRATE_METHODS: + print(f"Migration for {filename} DATAMODL {model_type} not implemented") + return + + with warnings.catch_warnings(): + warnings.simplefilter("ignore") + exception_raised = False + try: + getattr(datamodels, model_type)(hdul, strict_validation=True) + except Exception: + exception_raised = True + if not exception_raised: + print(f"{filename} is already valid") + return + + migrate_method = next((m for s, m in _MIGRATE_METHODS[model_type].items() if jwst_version in SpecifierSet(s)), None) + if migrate_method is None: + print(f"Migration for {filename} CAL_VER {jwst_version} not implemented") + return + + migrate_method(hdul) + hdul[0].header["HISTORY"] = f"Migrated with jwst {jwst.__version__} migrate_data script {datetime.utcnow().isoformat()}" + + try: + getattr(datamodels, model_type)(hdul, strict_validation=True) + except Exception: + print(f"Migration for {filename} failed to produce a valid model:\n") + traceback.print_exc() + return + + if args.in_place: + hdul.flush() + else: + output_filename = os.path.join(args.output_dir, 
+                                           os.path.basename(filename))
+            hdul.writeto(output_filename, checksum=True, overwrite=True)
+
+
+def main():
+    args = parse_args()
+
+    if args.output_dir:
+        os.makedirs(args.output_dir, exist_ok=True)
+
+    for file in args.files:
+        try:
+            migrate_file(file, args)
+        except Exception:
+            print(f"Error migrating {file}:\n")
+            traceback.print_exc()
+
+
+if __name__ == "__main__":
+    main()
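
The version dispatch in migrate_file works by matching the file's CAL_VER
against PEP 440 version specifiers via packaging.specifiers.SpecifierSet.
Below is a minimal standalone sketch of that lookup, with a toy registry,
toy version strings, and a hypothetical find_migration helper; none of
these are part of the patch itself:

    from packaging.specifiers import SpecifierSet

    # Toy registry mirroring the _MIGRATE_METHODS structure above:
    # model class name -> version specifier -> migration callable.
    MIGRATIONS = {
        "SpecModel": {
            "> 0.13.1, <= 1.1.0": lambda hdul: print("renaming spec_table columns"),
        },
    }

    def find_migration(model_type, version):
        """Return the first migration whose specifier matches version, else None."""
        methods = MIGRATIONS.get(model_type, {})
        return next((m for spec, m in methods.items() if version in SpecifierSet(spec)), None)

    assert find_migration("SpecModel", "1.1.0") is not None  # inside the range
    assert find_migration("SpecModel", "1.2.0") is None      # new enough, no migration needed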