Merge branch 'master' into feat/add-pypi

uc-cdis · Feb 26, 2025 · fc99a7c · fc99a7c
2 parents a82f3e2 + 65b395f
commit fc99a7c
Show file tree

Hide file tree

Showing 33 changed files with 2,595 additions and 813 deletions.
diff --git a/heal/cli/extract.py b/heal/cli/extract.py
@@ -10,10 +10,17 @@
 @click.option(
     "--input_file",
     "input_file",
-    required=True,
     help="name of file to extract HEAL-compliant VLMD file",
+    required=True,
     type=click.Path(writable=True),
 )
+@click.option(
+    "--title",
+    "title",
+    help="Root level title for the dictionary (required if extracting from csv to json)",
+    default=None,
+    type=str,
+)
 @click.option(
     "--output_dir",
     "output_dir",
@@ -22,12 +29,16 @@
     type=click.Path(writable=True),
     show_default=True,
 )
-def extract(input_file, output_dir):
+def extract(input_file, title, output_dir):
     """Extract HEAL-compliant VLMD file from input file"""
 
     logging.info(f"Extracting VLMD from {input_file}")
 
     try:
-        vlmd_extract(input_file, output_dir=output_dir)
+        vlmd_extract(
+            input_file,
+            title,
+            output_dir=output_dir,
+        )
     except Exception as e:
         logging.error(f"Extraction error {str(e)}")
diff --git a/heal/harvard_downloads.py b/heal/harvard_downloads.py
@@ -11,7 +11,7 @@
 
 from pathlib import Path
 from typing import Dict, List
-from utils import unpackage_object, get_id, download_from_url
+from heal.utils import unpackage_object, get_id, download_from_url
 
 from cdislogging import get_logger
 from gen3.tools.download.drs_download import DownloadStatus
@@ -20,12 +20,14 @@
 
 
 def get_harvard_dataverse_files(
-    file_metadata_list: List, download_path: str = "."
+    wts_hostname: str, auth, file_metadata_list: List, download_path: str = "."
 ) -> Dict:
     """
     Retrieves external data from the Harvard Dataverse.
 
     Args:
+        wts_hostname (str): hostname for commons with wts (not being used at this moment)
+        auth (Gen3Auth): auth for commons with wts (not being used at this moment)
         file_metadata_list (List of Dict): list of studies or files
         download_path (str): path to download files and unpack
 
@@ -102,6 +104,8 @@ def get_download_url_for_harvard_dataverse(file_metadata: Dict) -> str:
         base_url = "https://demo.dataverse.org/api/access"
     if "study_id" in file_metadata:
         url = f"{base_url}/dataset/:persistentId/?persistentId={file_metadata.get('study_id')}"
+    elif "file_id" in file_metadata:
+        url = f"{base_url}/datafile/{file_metadata.get('file_id')}"
     else:
         url = None
 
@@ -122,7 +126,7 @@ def is_valid_harvard_file_metadata(file_metadata: Dict) -> bool:
     if not isinstance(file_metadata, dict):
         logger.critical(f"Invalid metadata - item is not a dict: {file_metadata}")
         return False
-    if "study_id" not in file_metadata:
+    if "study_id" not in file_metadata and "file_id" not in file_metadata:
         logger.critical(
             f"Invalid metadata - missing required Harvard Dataverse keys {file_metadata}"
         )

diff --git a/heal/qdr_downloads.py b/heal/qdr_downloads.py
@@ -22,7 +22,7 @@
 
 from pathlib import Path
 from typing import Dict, List
-from utils import unpackage_object, get_id, download_from_url, get_idp_access_token
+from heal.utils import unpackage_object, get_id, download_from_url, get_idp_access_token
 
 from cdislogging import get_logger
 from gen3.tools.download.drs_download import DownloadStatus, wts_get_token

diff --git a/heal/vlmd/README.md b/heal/vlmd/README.md
@@ -57,7 +57,8 @@ except ExtractionError as e_err:
 
 The extract module implements extraction and conversion of dictionaries into different formats.
 
-The current formats are csv, json, and tsv.
+The current formats are csv, json, and tsv. A `title=<TITLE>` parameter should be supplied when
+converting from non-json to json format.
 
 The `vlmd_extract()` method raises a `jsonschema.ValidationError` for an invalid input files
 and raises an `ExtractionError` for any other type of error.
@@ -70,7 +71,7 @@ from jsonschema import ValidationError
 from heal.vlmd import vlmd_extract, ExtractionError
 
 try:
-  vlmd_extract("vlmd_for_extraction.csv", output_dir="./output")
+  vlmd_extract("vlmd_for_extraction.csv", title="the dictionary title", output_dir="./output")
 
 except ValidationError as v_err:
   # handle validation error
@@ -115,4 +116,6 @@ For example, the following can validate a VLMD file in csv format:
 The following would extract a json format VLMD file from a csv format input file and
 write a json file in the directory `output`:
 
-`heal vlmd extract --input_file "vlmd_for_extraction.csv" --output_dir "./output"`
+`heal vlmd extract --input_file "vlmd_for_extraction.csv" --title "The dictionary title" --output_dir "./output"`
+
+The `--title` option is required when extracting from `csv` to `json`.
diff --git a/heal/vlmd/config.py b/heal/vlmd/config.py
@@ -6,7 +6,7 @@
 
 # file suffixes
 ALLOWED_INPUT_TYPES = ["csv", "tsv", "json"]
-ALLOWED_FILE_TYPES = ["auto"] + ALLOWED_INPUT_TYPES
+ALLOWED_FILE_TYPES = ["auto", "csv", "tsv", "json", "redcap"]
 ALLOWED_SCHEMA_TYPES = ["auto", "csv", "json", "tsv"]
 ALLOWED_OUTPUT_TYPES = ["csv", "json"]
 
@@ -23,6 +23,8 @@
 
 # schema
 JSON_SCHEMA_VERSION = JSON_SCHEMA.get("version", "0.3.2")
+# The title is a default title used in the validation process.
+# It will get overwritten by a user-specified title in the extraction process.
 TOP_LEVEL_PROPS = {
     "schemaVersion": JSON_SCHEMA_VERSION,
     "title": "HEAL Data Dictionary",

diff --git a/heal/vlmd/extract/conversion.py b/heal/vlmd/extract/conversion.py
@@ -7,6 +7,7 @@
 from heal.vlmd.config import JSON_SCHEMA, TOP_LEVEL_PROPS
 from heal.vlmd.extract.csv_dict_conversion import convert_datadict_csv
 from heal.vlmd.extract.json_dict_conversion import convert_template_json
+from heal.vlmd.extract.redcap_csv_dict_conversion import convert_redcap_csv
 from heal.vlmd.utils import clean_json_fields
 
 logger = get_logger("vlmd-conversion", log_level="debug")
@@ -18,6 +19,7 @@
         recode_map=mappings.recode_map,
     ),
     "json-template": convert_template_json,
+    "redcap-csv-dict": convert_redcap_csv,
 }
 
 ext_map = {
@@ -45,7 +47,8 @@ def convert_to_vlmd(
         input_type (str): The input type. See keys of 'choice_fxn' dict for options, currently:
             csv-data-dict, json-template.
         data_dictionary_props (dict):
-            The other data-dictionary level properties. By default, will give the data_dictionary `title` property as the file name stem.
+            The other data-dictionary level properties. By default,
+            will give the data_dictionary `title` property as the file name stem.
 
     Returns
         Dictionary with:
@@ -71,7 +74,6 @@ def convert_to_vlmd(
     data_dictionary_package = choice_fxn[input_type](
         input_filepath, data_dictionary_props
     )
-    logger.debug(f"Data Dictionary Package keys {data_dictionary_package.keys()}")
 
     # For now we return the csv and json in one package.
     # If any multiple data dictionaries are needed then implement the methods in

diff --git a/heal/vlmd/extract/csv_dict_conversion.py b/heal/vlmd/extract/csv_dict_conversion.py
@@ -4,10 +4,15 @@
 
 import pandas as pd
 
+from cdislogging import get_logger
 from heal.vlmd.config import JSON_SCHEMA
 from heal.vlmd.extract import utils
+from heal.vlmd.extract.redcap_csv_dict_conversion import convert_redcap_csv
+from heal.vlmd.utils import has_redcap_headers
 from heal.vlmd.validate.utils import read_delim
 
+logger = get_logger("csv-conversion", log_level="debug")
+
 
 def _parse_string_objects(
     tbl_csv: pd.DataFrame, field_properties: dict
@@ -75,7 +80,8 @@ def convert_datadict_csv(
             The HEAL-specified data dictionary properties.
         rename_map: A mapping of source (current) column headers to target (desired -- conforming to CVS HEAL spec)
             column headers
-        recode_map: A mapping of values for each column in HEAL spec -- {..."column_name":{"old_value":"new_value"...}...}
+        recode_map: A mapping of values for each column in HEAL spec, eg,
+            {..."column_name":{"old_value":"new_value"...}...}
         drop_list: a list of variables to drop from headers before processing
         item_sep:str (default:"|") Used to split stringified items (in objects and arrays)
         key_val_sep:str (default:"=") Used to split stringified each key-value pair
@@ -123,10 +129,22 @@ def infer_delim(series: pd.Series, char_list: list, firstmatch: bool):
         return inferred_delim
 
     if isinstance(csv_template, (str, PathLike)):
+        logger.debug("Getting data from path to CSV file")
         template_tbl = read_delim(str(Path(csv_template)))
     else:
+        logger.debug("Getting data from input dataframe")
         template_tbl = pd.DataFrame(csv_template)
 
+    # If REDCap then get dictionary and return.
+    column_names = template_tbl.columns
+    if has_redcap_headers(column_names):
+        logger.debug("File appears to have REDCap headers. Ready to convert.")
+        converted_dict = convert_redcap_csv(template_tbl)
+        return converted_dict
+    else:
+        logger.debug("File is CSV dictionary, not REDCap dictionary.")
+
+    # Regular CSV, not REDCap.
     if not rename_map:
         rename_map = {}
 

diff --git a/heal/vlmd/extract/extract.py b/heal/vlmd/extract/extract.py
@@ -1,3 +1,4 @@
+import os
 from os.path import isfile
 from pathlib import Path
 
@@ -9,13 +10,50 @@
     ALLOWED_INPUT_TYPES,
     ALLOWED_OUTPUT_TYPES,
 )
+from heal.vlmd.extract.conversion import convert_to_vlmd
 from heal.vlmd.file_utils import get_output_filepath, write_vlmd_dict
+from heal.vlmd.validate.validate import file_type_to_fxn_map
+
 
 logger = get_logger("extract", log_level="debug")
 
 
+def set_title_if_missing(file_type: str, title: str, converted_dict: dict) -> dict:
+    """
+    Ensure the converted dictionary has a root-level title; return the same dict.
+
+    A user-supplied `title` always wins. Otherwise, for non-json input, the
+    title is taken from standardsMappings[0].instrument.title.
+    Raises ExtractionError if non-json input has neither source of title.
+    """
+    if title is not None:
+        # Return even when unchanged so the instrument title never overrides it.
+        if title != converted_dict.get("title"):
+            logger.debug(f"JSON dictionary setting user-defined title '{title}'")
+            converted_dict["title"] = title
+        return converted_dict
+
+    instrument_title = None
+    try:
+        standards_mappings = converted_dict.get("standardsMappings")
+        if standards_mappings and len(standards_mappings) >= 1:
+            instrument_title = standards_mappings[0].get("instrument").get("title")
+    except Exception:
+        logger.warning("standardsMapping does not have 'instrument.title'")
+    if file_type != "json":
+        if instrument_title is None:
+            message = "Title must be supplied when extracting from non-json to json"
+            logger.error(message)
+            raise ExtractionError(message)
+        logger.debug(
+            f"JSON dictionary setting title to instrument title '{instrument_title}'"
+        )
+        converted_dict["title"] = instrument_title
+    return converted_dict
+
+
 def vlmd_extract(
-    input_file, file_type="auto", output_dir=".", output_type="json"
+    input_file, title=None, file_type="auto", output_dir=".", output_type="json"
 ) -> bool:
     """
     Extract a HEAL compliant csv and json format VLMD data dictionary
@@ -24,9 +62,11 @@ def vlmd_extract(
     Args:
         input_file (str): the path of the input HEAL VLMD file to be extracted
             into HEAL-compliant VLMD file(s).
+        title (str): the root level title of the dictionary (required if extracting from csv to json)
         file_type (str): the type of the input file that will be extracted into a
             HEAL-compliant VLMD file.
-            Allowed values are "auto", “csv”, "json", "tsv".
+            Allowed values are "auto", “csv”, "json", "tsv", and "redcap"
+            where "redcap" is a csv file of a REDCap dictionary export.
             Defaults to “auto”.
         output_dir (str): the directory of where the extracted VLMD file will
             be written. Defaults to “.”
@@ -38,7 +78,9 @@ def vlmd_extract(
         Raises ExtractionError in input VLMD is not valid or could not be converted.
     """
 
-    logger.info(f"Extracting VLMD file '{input_file}' with file_type '{file_type}'")
+    logger.info(
+        f"Extracting VLMD file '{input_file}' with input file_type '{file_type}'"
+    )
 
     file_suffix = Path(input_file).suffix.replace(".", "")
     if file_suffix not in ALLOWED_INPUT_TYPES:
@@ -62,15 +104,58 @@ def vlmd_extract(
         logger.error(message)
         raise ExtractionError(message)
 
+    logger.debug(f"File type is set to '{file_type}'")
     # validate
     try:
+        # csv files are converted as part of validate
         converted_dictionary = vlmd_validate(
-            input_file, output_type=output_type, return_converted_output=True
+            input_file,
+            file_type=file_type,
+            output_type=output_type,
+            return_converted_output=True,
         )
-    except Exception as e:
+    except Exception as err:
         logger.error(f"Error in validating and extracting dictionary from {input_file}")
-        logger.error(e)
-        raise ExtractionError(str(e))
+        logger.error(err)
+        raise ExtractionError(str(err))
+
+    # input json file require explicit conversion and post validation steps
+    if file_type == "json":
+        file_convert_function = file_type_to_fxn_map.get(file_type)
+        data_dictionary_props = {}
+        try:
+            logger.debug("Ready to convert json input to VLMD")
+            data_dictionaries = convert_to_vlmd(
+                input_filepath=input_file,
+                input_type=file_convert_function,
+                data_dictionary_props=data_dictionary_props,
+            )
+            if output_type == "json":
+                converted_dictionary = data_dictionaries["template_json"]
+                logger.debug(
+                    f"Ready to validate converted dict with output type '{output_type}'"
+                )
+                is_valid = vlmd_validate(
+                    converted_dictionary,
+                    file_type=file_type,
+                    output_type=output_type,
+                    return_converted_output=False,
+                )
+                logger.debug(f"Converted dictionary is valid: {is_valid}")
+            else:
+                converted_dictionary = data_dictionaries["template_csv"]["fields"]
+        except Exception as err:
+            logger.error(f"Error in extracting JSON dictionary from {input_file}")
+            logger.error(err)
+            raise ExtractionError(str(err))
+
+    if output_type == "json":
+        converted_dictionary = set_title_if_missing(
+            file_type=file_type, title=title, converted_dict=converted_dictionary
+        )
+        if converted_dictionary.get("title") is None:
+            logger.error("JSON dictionary is missing 'title'")
+            raise ExtractionError("JSON dictionary is missing 'title'")
 
     # write to file
     output_filepath = get_output_filepath(
@@ -79,9 +164,9 @@ def vlmd_extract(
     logger.info(f"Writing converted dictionary to {output_filepath}")
     try:
         write_vlmd_dict(converted_dictionary, output_filepath, file_type=output_type)
-    except Exception as e:
+    except Exception as err:
         logger.error("Error in writing converted dictionary")
-        logger.error(e)
+        logger.error(err)
         raise ExtractionError("Error in writing converted dictionary")
 
     return True