From 3e4f4fe6d504a7555485685fbd96bdcbd00274cf Mon Sep 17 00:00:00 2001
From: Ed Slavich
Date: Fri, 21 May 2021 15:16:31 -0400
Subject: [PATCH] Add migrate_data script to make older x1d files readable by
 later versions of jwst

---
 CHANGES.rst                           |   8 +-
 docs/jwst/data_products/index.rst     |   4 +-
 docs/jwst/data_products/migrating.rst |  30 ++++++
 jwst/regtest/conftest.py              |  10 +-
 jwst/regtest/regtestdata.py           |  20 ++--
 jwst/regtest/test_migrate_data.py     |  43 ++++++++
 scripts/migrate_data                  | 142 ++++++++++++++++++++++++++
 7 files changed, 238 insertions(+), 19 deletions(-)
 create mode 100644 docs/jwst/data_products/migrating.rst
 create mode 100644 jwst/regtest/test_migrate_data.py
 create mode 100755 scripts/migrate_data

diff --git a/CHANGES.rst b/CHANGES.rst
index 71b359dcf4..941b3fa378 100644
--- a/CHANGES.rst
+++ b/CHANGES.rst
@@ -11,7 +11,7 @@ combine_1d
 
 - Added SRCTYPE to COMBINE1D output extension headers, propagated from
   EXTRACT1D inputs [#6079]
-
+
 cube_build
 ----------
 
@@ -40,6 +40,12 @@ lib
 
 - Updated set_telescope_pointing to populate ENGQLPTG keyword with new
   allowed values [#6088]
 
+scripts
+-------
+
+- Add migrate_data command with support for migrating spec_table in
+  x1d files produced with versions <= 1.1.0 of this package. [#6055]
+
 1.2.0 (2021-05-24)
 ==================
 
diff --git a/docs/jwst/data_products/index.rst b/docs/jwst/data_products/index.rst
index 1d5882052e..d9cc684110 100644
--- a/docs/jwst/data_products/index.rst
+++ b/docs/jwst/data_products/index.rst
@@ -5,7 +5,7 @@ Data Products Information
 
 .. toctree::
    :maxdepth: 3
-
+
    stages.rst
    file_naming.rst
    product_types.rst
@@ -13,4 +13,4 @@ Data Products Information
    science_products.rst
    nonscience_products.rst
    guidestar_products.rst
-
+   migrating.rst
diff --git a/docs/jwst/data_products/migrating.rst b/docs/jwst/data_products/migrating.rst
new file mode 100644
index 0000000000..b9b52b648c
--- /dev/null
+++ b/docs/jwst/data_products/migrating.rst
@@ -0,0 +1,30 @@
+Migrating deprecated products
+-----------------------------
+
+On rare occasions, the model schemas are changed in ways that break
+compatibility with data products produced by earlier versions
+of this package. When these older files are opened, the software
+will report validation errors:
+
+.. doctest-skip::
+
+    >>> from jwst import datamodels
+    >>> datamodels.open("jw95115001001_02102_00001_nrs1_x1d.fits")
+    ...
+    ValueError: Column names don't match schema...
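+
+The version of this package that produced a file is recorded in its
+``CAL_VER`` primary header keyword (the same keyword the migration
+tool consults), so it can be inspected before attempting a migration.
+For example (the value shown here is illustrative):
+
+.. doctest-skip::
+
+    >>> from astropy.io import fits
+    >>> fits.getval("jw95115001001_02102_00001_nrs1_x1d.fits", "CAL_VER")
+    '1.1.0'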
+
+In some cases it will be possible to update the file to the
+new format using the ``migrate_data`` tool included with this package:
+::
+
+    $ migrate_data jw95115001001_02102_00001_nrs1_x1d.fits --in-place
+
+It can also be run on multiple files:
+::
+
+    $ migrate_data *_x1d.fits --in-place
+
+Or configured to write updated files to a separate output directory:
+::
+
+    $ migrate_data *_x1d.fits --output-dir some/other/directory
diff --git a/jwst/regtest/conftest.py b/jwst/regtest/conftest.py
index 257e40aa35..e9c903acce 100644
--- a/jwst/regtest/conftest.py
+++ b/jwst/regtest/conftest.py
@@ -182,20 +182,18 @@ def generate_upload_schema(pattern, target, recursive=False):
 def _rtdata_fixture_implementation(artifactory_repos, envopt, request):
     """Provides the RemoteResource class"""
     inputs_root, results_root = artifactory_repos
-    rtdata = RegtestData(env=envopt, inputs_root=inputs_root,
-                         results_root=results_root)
-
-    yield rtdata
+    return RegtestData(env=envopt, inputs_root=inputs_root,
+                       results_root=results_root)
 
 
 @pytest.fixture(scope='function')
 def rtdata(artifactory_repos, envopt, request, _jail):
-    yield from _rtdata_fixture_implementation(artifactory_repos, envopt, request)
+    return _rtdata_fixture_implementation(artifactory_repos, envopt, request)
 
 
 @pytest.fixture(scope='module')
 def rtdata_module(artifactory_repos, envopt, request, jail):
-    yield from _rtdata_fixture_implementation(artifactory_repos, envopt, request)
+    return _rtdata_fixture_implementation(artifactory_repos, envopt, request)
 
 
 @pytest.fixture
diff --git a/jwst/regtest/regtestdata.py b/jwst/regtest/regtestdata.py
index 8bbc9972fc..91c58cb0ab 100644
--- a/jwst/regtest/regtestdata.py
+++ b/jwst/regtest/regtestdata.py
@@ -34,7 +34,7 @@ def __init__(self, env="dev", inputs_root="jwst-pipeline",
                  input=None, input_remote=None, output=None, truth=None,
                  truth_remote=None, remote_results_path=None, test_name=None,
                  traceback=None, **kwargs):
-        self._env = env
+        self.env = env
         self._inputs_root = inputs_root
         self._results_root = results_root
         self._bigdata_root = get_bigdata_root()
@@ -142,9 +142,9 @@ def get_data(self, path=None, docopy=None):
             self.input_remote = path
         if docopy is None:
             docopy = self.docopy
-        self.input = get_bigdata(self._inputs_root, self._env, path,
+        self.input = get_bigdata(self._inputs_root, self.env, path,
                                  docopy=docopy)
-        self.input_remote = os.path.join(self._inputs_root, self._env, path)
+        self.input_remote = os.path.join(self._inputs_root, self.env, path)
 
         return self.input
 
@@ -161,13 +161,13 @@ def data_glob(self, path=None, glob='*', docopy=None):
         # is a local path or URL.
root = self.bigdata_root if op.exists(root): - root_path = op.join(root, self._inputs_root, self._env) + root_path = op.join(root, self._inputs_root, self.env) root_len = len(root_path) + 1 path = op.join(root_path, path) file_paths = _data_glob_local(path, glob) elif check_url(root): - root_len = len(self._env) + 1 - file_paths = _data_glob_url(self._inputs_root, self._env, path, glob, root=root) + root_len = len(self.env) + 1 + file_paths = _data_glob_url(self._inputs_root, self.env, path, glob, root=root) else: raise BigdataError('Path cannot be found: {}'.format(path)) @@ -192,9 +192,9 @@ def get_truth(self, path=None, docopy=None): os.makedirs('truth', exist_ok=True) os.chdir('truth') try: - self.truth = get_bigdata(self._inputs_root, self._env, path, + self.truth = get_bigdata(self._inputs_root, self.env, path, docopy=docopy) - self.truth_remote = os.path.join(self._inputs_root, self._env, path) + self.truth_remote = os.path.join(self._inputs_root, self.env, path) except BigdataError: os.chdir('..') raise @@ -232,7 +232,7 @@ def get_asn(self, path=None, docopy=True, get_members=True): docopy = self.docopy # Get the association JSON file - self.input = get_bigdata(self._inputs_root, self._env, path, + self.input = get_bigdata(self._inputs_root, self.env, path, docopy=docopy) with open(self.input) as fp: asn = load_asn(fp) @@ -245,7 +245,7 @@ def get_asn(self, path=None, docopy=True, get_members=True): fullpath = os.path.join( os.path.dirname(self.input_remote), member['expname']) - get_bigdata(self._inputs_root, self._env, fullpath, + get_bigdata(self._inputs_root, self.env, fullpath, docopy=self.docopy) def to_asdf(self, path): diff --git a/jwst/regtest/test_migrate_data.py b/jwst/regtest/test_migrate_data.py new file mode 100644 index 0000000000..ae66bc394c --- /dev/null +++ b/jwst/regtest/test_migrate_data.py @@ -0,0 +1,43 @@ +""" +Tests of the migrate_data script, which attempts to update +files that have become invalid due to changes in model schemas +between versions of this package. + +Obtains examples of files from artifactory truth data for +older releases. +""" +import subprocess + +from astropy.io import fits +import pytest + +from jwst import datamodels + + +@pytest.fixture(autouse=True) +def strict_validation(monkeypatch): + monkeypatch.setenv("STRICT_VALIDATION", "true") + yield + + +@pytest.mark.bigdata +@pytest.mark.parametrize("truth_path", [ + "truth/test_miri_lrs_slit_spec2/jw00623032001_03102_00001_mirimage_x1d.fits", + "truth/test_nirspec_mos_spec2/f170lp-g235m_mos_observation-6-c0e0_001_dn_nrs1_mod_x1d.fits", +], ids=["miri-lrs-x1d", "nirspec-mos-x1d"]) +def test_x1d_spec_table(truth_path, rtdata): + rtdata.env = "1.1.0" + rtdata.get_truth(truth_path) + + # Confirm that the file doesn't initially validate + # (open with fits first so that the failed call to open doesn't leave behind an open file) + with fits.open(rtdata.truth, memmap=False) as hdu: + with pytest.raises(ValueError, match="Column names don't match schema"): + with datamodels.open(hdu): + pass + + subprocess.check_call(["migrate_data", rtdata.truth, "--in-place"]) + + # Now the model should validate + with datamodels.open(rtdata.truth): + pass diff --git a/scripts/migrate_data b/scripts/migrate_data new file mode 100755 index 0000000000..c36d95a894 --- /dev/null +++ b/scripts/migrate_data @@ -0,0 +1,142 @@ +#!/usr/bin/env python3 +""" +Migrate .fits files whose format has changed between jwst package versions. 
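+
+Example usage, as documented in docs/jwst/data_products/migrating.rst:
+
+    migrate_data jw95115001001_02102_00001_nrs1_x1d.fits --in-place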
+""" +import argparse +from datetime import datetime +import os +import re +import traceback +import warnings + +import asdf +from astropy.io import fits +import numpy as np +from packaging.specifiers import SpecifierSet + +import jwst +from jwst import datamodels + + +def parse_args(): + parser = argparse.ArgumentParser("migrate_data", "migrate .fits files whose format has changed between jwst package versions") + + parser.add_argument("files", nargs="+", help="one or more .fits files") + + output_group = parser.add_mutually_exclusive_group(required=True) + output_group.add_argument("--output-dir", help="write modified files to an output directory") + output_group.add_argument("--in-place", help="modify files in-place", action="store_true") + + return parser.parse_args() + + +# If there get to be many of these we may want to move +# them to jwst.datamodels somewhere: + +def migrate_spec_table_1_1_0(hdul): + """ + spectable.schema added additional columns and renamed + two columns. + """ + schema = asdf.schema.load_schema("http://stsci.edu/schemas/jwst_datamodel/spectable.schema") + dtype = asdf.tags.core.ndarray.asdf_datatype_to_numpy_dtype(schema["datatype"]) + renamed_columns = { + "ERROR": "FLUX_ERROR", + "BERROR": "BKGD_ERROR", + } + + for hdu in hdul: + if hdu.name == "EXTRACT1D": + new_data = np.zeros(hdu.data.shape, dtype=dtype) + for column_name in hdu.data.dtype.names: + new_data[renamed_columns.get(column_name, column_name)] = hdu.data[column_name] + hdu.data = new_data + + +# The first key is a model class name, the second +# a jwst package version specifier. The value +# is a method that accepts an HDUList and modifies +# it in-place. +_MIGRATE_METHODS = { + "SpecModel": { + "> 0.13.1, <= 1.1.0": migrate_spec_table_1_1_0, + }, + "MultiSpecModel": { + "> 0.13.1, <= 1.1.0": migrate_spec_table_1_1_0, + }, +} + + +def migrate_file(filename, args): + if args.in_place: + mode = "update" + else: + mode = "readonly" + + with fits.open(filename, memmap=False, mode=mode) as hdul: + model_type = hdul[0].header.get("DATAMODL") + jwst_version = hdul[0].header.get("CAL_VER") + + if not (model_type and jwst_version): + print(f"Unable to migrate {filename}: DATAMODL and CAL_VER keywords are required") + return + + match = re.match(r'^[0-9]+\.[0-9]+\.[0-9]+', jwst_version) + if match is None: + print(f"Unable to migrate {filename}: CAL_VER not understood") + return + jwst_version = match.group(0) + + if model_type not in _MIGRATE_METHODS: + print(f"Migration for {filename} DATAMODL {model_type} not implemented") + return + + with warnings.catch_warnings(): + warnings.simplefilter("ignore") + exception_raised = False + try: + getattr(datamodels, model_type)(hdul, strict_validation=True) + except Exception: + exception_raised = True + if not exception_raised: + print(f"{filename} is already valid") + return + + migrate_method = next((m for s, m in _MIGRATE_METHODS[model_type].items() if jwst_version in SpecifierSet(s)), None) + if migrate_method is None: + print(f"Migration for {filename} CAL_VER {jwst_version} not implemented") + return + + migrate_method(hdul) + hdul[0].header["HISTORY"] = f"Migrated with jwst {jwst.__version__} migrate_data script {datetime.utcnow().isoformat()}" + + try: + getattr(datamodels, model_type)(hdul, strict_validation=True) + except Exception: + print(f"Migration for {filename} failed to produce a valid model:\n") + traceback.print_exc() + return + + if args.in_place: + hdul.flush() + else: + output_filename = os.path.join(args.output_dir, 
+                                           os.path.basename(filename))
+            hdul.writeto(output_filename, checksum=True, overwrite=True)
+
+
+def main():
+    args = parse_args()
+
+    if args.output_dir:
+        os.makedirs(args.output_dir, exist_ok=True)
+
+    for file in args.files:
+        try:
+            migrate_file(file, args)
+        except Exception:
+            print(f"Error migrating {file}:\n")
+            traceback.print_exc()
+
+
+if __name__ == "__main__":
+    main()
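
The version dispatch in migrate_file works by matching the file's CAL_VER
against PEP 440 version specifiers via packaging.specifiers.SpecifierSet.
Below is a minimal standalone sketch of that lookup, with a toy registry,
toy version strings, and a hypothetical find_migration helper; none of
these are part of the patch itself:

    from packaging.specifiers import SpecifierSet

    # Toy registry mirroring the _MIGRATE_METHODS structure above:
    # model class name -> version specifier -> migration callable.
    MIGRATIONS = {
        "SpecModel": {
            "> 0.13.1, <= 1.1.0": lambda hdul: print("renaming spec_table columns"),
        },
    }

    def find_migration(model_type, version):
        """Return the first migration whose specifier matches version, else None."""
        methods = MIGRATIONS.get(model_type, {})
        return next((m for spec, m in methods.items() if version in SpecifierSet(spec)), None)

    assert find_migration("SpecModel", "1.1.0") is not None  # inside the range
    assert find_migration("SpecModel", "1.2.0") is None      # new enough, no migration needed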