Merge branch 'master' into piotr/harvard

uc-cdis · Jan 21, 2025 · 14b9e53 · 14b9e53
2 parents b1533f5 + 7dd2ca9
commit 14b9e53
Show file tree

Hide file tree

Showing 52 changed files with 5,598 additions and 798 deletions.
diff --git a/README.md b/README.md
@@ -9,6 +9,16 @@ The qdr_downloads module include a retriever function for downloading files from
 
 This is intended to be called by gen3sdk [external download functions](https://github.com/uc-cdis/gen3sdk-python/blob/master/gen3/tools/download/external_file_download.py). It is also possible to write a wrapper script for the qdr download functions.
 
+### Notebooks
+
+In the notebooks directory there are Jupyter notebooks that may be used to download files from a corresponding platform. For instance, the qdr_data_download.ipynb notebook may be used to download files from Syracuse QDR.
+
+These notebooks perform optimally within a HEAL Gen3 Workspace and the notebooks will be automatically installed to a user's workspace when the workspace is initiated. However, you may also use these notebooks on your local machine.
+
+### VLMD extraction and validation
+
+The [VLMD docs](heal/vlmd/README.md) describe how to use the SDK for extracting and validating VLMD dictionaries.
+
 ### Run tests
 
 ```

diff --git a/heal/vlmd/README.md b/heal/vlmd/README.md
@@ -0,0 +1,72 @@
+# VLMD methods
+
+## VLMD validation
+
+This module validates VLMD data dictionaries against stored schemas. The `vlmd_validate()` method
+will attempt an extraction as part of the validation process.
+
+The `vlmd_validate()` method raises a `jsonschema.ValidationError` for an invalid input file and
+will raise an `ExtractionError` if the input file cannot be converted.
+
+Example validation code:
+
+```
+from jsonschema import ValidationError
+
+from heal.vlmd import vlmd_validate, ExtractionError
+
+input_file = "vlmd_dd.json"
+try:
+    vlmd_validate(input_file)
+
+except ValidationError as v_err:
+    # handle validation error
+    pass
+
+except ExtractionError as e_err:
+    # handle extraction error
+    pass
+
+```
+
+## VLMD extract
+
+The extract module implements extraction and conversion of dictionaries into different formats.
+
+The current formats are csv, json, and tsv.
+
+The `vlmd_extract()` method raises a `jsonschema.ValidationError` for an invalid input file
+and raises an `ExtractionError` for any other type of error.
+
+Example extraction code:
+
+```
+from jsonschema import ValidationError
+
+from heal.vlmd import vlmd_extract, ExtractionError
+
+try:
+    vlmd_extract("vlmd_for_extraction.csv", output_dir="./output")
+
+except ValidationError as v_err:
+    # handle validation error
+    pass
+
+except ExtractionError as e_err:
+    # handle extraction error
+    pass
+```
+
+The above will write a HEAL-compliant VLMD json dictionary to
+
+`output/heal-dd_vlmd_for_extraction.json`
+
+## Adding new file types for extraction and validation
+
+The above modules currently handle the following types of dictionaries: csv, json, tsv.
+
+To add code for a new dictionary file type:
+
+* Create a new schema for the data type or validate against the existing json schema
+* If possible create a new validator module for the new file type
+* Call the new validator module from the `validate.py` module
+* Create a new extractor module for the new file type, possibly using `pandas`
+* Call the new extractor module from the `conversion.py` module
+* Add new file writing utilities if saving converted dictionaries in the new format
+* Create unit tests as needed for new code
diff --git a/heal/vlmd/__init__.py b/heal/vlmd/__init__.py
@@ -0,0 +1,4 @@
+from heal.vlmd.validate.validate import vlmd_validate, ExtractionError
+
+# place 'extract' import after 'validate' import
+from heal.vlmd.extract.extract import vlmd_extract
diff --git a/heal/vlmd/config.py b/heal/vlmd/config.py
@@ -0,0 +1,26 @@
+import json
+
+# file prefix
+OUTPUT_FILE_PREFIX = "heal-dd"
+
+# file suffixes
+ALLOWED_INPUT_TYPES = ["csv", "tsv", "json"]
+ALLOWED_FILE_TYPES = ["auto"] + ALLOWED_INPUT_TYPES
+ALLOWED_SCHEMA_TYPES = ["auto", "csv", "json", "tsv"]
+ALLOWED_OUTPUT_TYPES = ["csv", "json"]
+
+# schemas
+csv_schema_file = "heal/vlmd/schemas/heal_csv.json"
+with open(csv_schema_file, "r") as f:
+    CSV_SCHEMA = json.load(f)
+
+json_schema_file = "heal/vlmd/schemas/heal_json.json"
+with open(json_schema_file, "r") as f:
+    JSON_SCHEMA = json.load(f)
+
+# schema
+JSON_SCHEMA_VERSION = JSON_SCHEMA.get("version", "0.3.2")
+TOP_LEVEL_PROPS = {
+    "schemaVersion": JSON_SCHEMA_VERSION,
+    "title": "HEAL Data Dictionary",
+}
diff --git a/heal/vlmd/extract/conversion.py b/heal/vlmd/extract/conversion.py
@@ -0,0 +1,91 @@
+from functools import partial
+from pathlib import Path
+
+from cdislogging import get_logger
+
+from heal.vlmd import mappings
+from heal.vlmd.config import JSON_SCHEMA, TOP_LEVEL_PROPS
+from heal.vlmd.extract.csv_dict_conversion import convert_datadict_csv
+from heal.vlmd.extract.json_dict_conversion import convert_template_json
+from heal.vlmd.utils import clean_json_fields
+
+# Module-level logger for the conversion step, created at debug level.
+logger = get_logger("vlmd-conversion", log_level="debug")
+
+# Maps each supported input type name to the function that converts it.
+# The csv converter is pre-bound with the rename/recode maps from `mappings`.
+choice_fxn = {
+    "csv-data-dict": partial(
+        convert_datadict_csv,
+        rename_map=mappings.rename_map,
+        recode_map=mappings.recode_map,
+    ),
+    "json-template": convert_template_json,
+}
+
+# Maps a file extension to the input type name used as a key in `choice_fxn`.
+# NOTE(review): there is no ".tsv" entry, so extension-based detection yields
+# None for tsv files even though config lists tsv as an allowed input type;
+# confirm whether callers pass `input_type` explicitly for tsv.
+ext_map = {
+    ".csv": "csv-data-dict",
+    ".json": "json-template",
+}
+
+
+def _detect_input_type(filepath, ext_to_input_type=ext_map):
+    ext = filepath.suffix
+    input_type = ext_to_input_type.get(ext, None)
+    return input_type
+
+
+def convert_to_vlmd(
+    input_filepath,
+    input_type=None,
+    data_dictionary_props=None,
+) -> dict:
+    """
+    Converts a data dictionary to HEAL compliant json or csv format.
+
+    Args
+        input_filepath (str): Path to input file. Currently converts data dictionaries in csv, json, and tsv.
+        input_type (str): The input type. See keys of 'choice_fxn' dict for options, currently:
+            csv-data-dict, json-template.
+        data_dictionary_props (dict):
+            The other data-dictionary level properties. By default, will give the data_dictionary `title` property as the file name stem.
+
+    Returns
+        Dictionary with:
+         1. csvtemplated array of fields.
+         2. jsontemplated data dictionary object as specified by an originally drafted design doc.
+            That is, a dictionary with title:<title>,description:<description>,data_dictionary:<fields>
+            where data dictionary is an array of fields as specified by the JSON schema.
+
+    """
+
+    input_filepath = Path(input_filepath)
+
+    input_type = input_type or _detect_input_type(input_filepath)
+    logger.debug(f"Converting file '{input_filepath}' of input_type '{input_type}'")
+    if input_type not in choice_fxn.keys():
+        logger.error(f"Unexpected input type {input_type}")
+        raise ValueError(
+            f"Unexpected input_type '{input_type}', not in {choice_fxn.keys()}"
+        )
+
+    # get data dictionary package based on the input type
+    data_dictionary_props = data_dictionary_props or {}
+    data_dictionary_package = choice_fxn[input_type](
+        input_filepath, data_dictionary_props
+    )
+    logger.debug(f"Data Dictionary Package keys {data_dictionary_package.keys()}")
+
+    # For now we return the csv and json in one package.
+    # If any multiple data dictionaries are needed then implement the methods in
+    # https://github.com/HEAL/healdata-utils/blob/5080227454d8e731d46a51aa6933c93523eb3b9a/src/healdata_utils/conversion.py#L196
+    package = data_dictionary_package
+
+    # add schema version
+    for field in package["template_csv"]["fields"]:
+        field.update({"schemaVersion": JSON_SCHEMA["version"], **field})
+
+    # remove empty json fields, add schema version (in TOP_LEVEL_PROPS)
+    package["template_json"]["fields"] = clean_json_fields(
+        package["template_json"]["fields"]
+    )
+    package["template_json"] = {**TOP_LEVEL_PROPS, **dict(package["template_json"])}
+
+    return package