-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge branch 'master' into piotr/harvard
- Loading branch information
Showing
52 changed files
with
5,598 additions
and
798 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,72 @@ | ||
# VLMD methods | ||
|
||
## VLMD validation | ||
|
||
This module validates VLMD data dictionaries against stored schemas. The `vlmd_validate()` method | ||
will attempt an extraction as part of the validation process. | ||
|
||
The `vlmd_validate()` method raises a `jsonschema.ValidationError` for an invalid input file and | ||
will raise an `ExtractionError` if the input file cannot be converted. | ||
|
||
Example validation code: | ||
|
||
``` | ||
from jsonschema import ValidationError | ||
from heal.vlmd import vlmd_validate, ExtractionError | ||
input_file = "vlmd_dd.json" | ||
try: | ||
vlmd_validate(input_file) | ||
except ValidationError as v_err: | ||
# handle validation error | ||
except ExtractionError as e_err: | ||
# handle extraction error | ||
``` | ||
|
||
## VLMD extract | ||
|
||
The extract module implements extraction and conversion of dictionaries into different formats. | ||
|
||
The current formats are csv, json, and tsv. | ||
|
||
The `vlmd_extract()` method raises a `jsonschema.ValidationError` for an invalid input file | ||
and raises an `ExtractionError` for any other type of error. | ||
|
||
Example extraction code: | ||
|
||
``` | ||
from jsonschema import ValidationError | ||
from heal.vlmd import vlmd_extract, ExtractionError | ||
try: | ||
vlmd_extract("vlmd_for_extraction.csv", output_dir="./output") | ||
except ValidationError as v_err: | ||
# handle validation error | ||
except ExtractionError as e_err: | ||
# handle extraction error | ||
``` | ||
|
||
The above will write a HEAL-compliant VLMD json dictionary to | ||
|
||
`output/heal-dd_vlmd_for_extraction.json` | ||
|
||
## Adding new file types for extraction and validation | ||
|
||
The above modules currently handle the following types of dictionaries: csv, json, tsv. | ||
|
||
To add code for a new dictionary file type: | ||
|
||
* Create a new schema for the data type or validate against the existing json schema | ||
* If possible create a new validator module for the new file type | ||
* Call the new validator module from the `validate.py` module | ||
* Create a new extractor module for the new file type, possibly using `pandas` | ||
* Call the new extractor module from the `conversion.py` module | ||
* Add new file writing utilities if saving converted dictionaries in the new format | ||
* Create unit tests as needed for new code |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,4 @@ | ||
from heal.vlmd.validate.validate import vlmd_validate, ExtractionError | ||
|
||
# place 'extract' import after 'validate' import | ||
from heal.vlmd.extract.extract import vlmd_extract |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,26 @@ | ||
import json
from pathlib import Path

# Prefix used in the names of generated output files
# (e.g. "heal-dd_<input stem>.json").
OUTPUT_FILE_PREFIX = "heal-dd"

# File suffixes (without the leading dot) accepted for each role.
ALLOWED_INPUT_TYPES = ["csv", "tsv", "json"]
ALLOWED_FILE_TYPES = ["auto"] + ALLOWED_INPUT_TYPES
ALLOWED_SCHEMA_TYPES = ["auto", "csv", "json", "tsv"]
ALLOWED_OUTPUT_TYPES = ["csv", "json"]

# Schemas are located relative to this module rather than the current working
# directory, so importing the package works from any directory.
# NOTE(review): assumes this module lives in heal/vlmd/ next to the `schemas/`
# directory (the original relative path was "heal/vlmd/schemas/...") - confirm.
_SCHEMA_DIR = Path(__file__).resolve().parent / "schemas"

# The *_schema_file names stay plain strings for backward compatibility.
csv_schema_file = str(_SCHEMA_DIR / "heal_csv.json")
with open(csv_schema_file, "r", encoding="utf-8") as f:
    CSV_SCHEMA = json.load(f)

json_schema_file = str(_SCHEMA_DIR / "heal_json.json")
with open(json_schema_file, "r", encoding="utf-8") as f:
    JSON_SCHEMA = json.load(f)

# Schema version stamped onto converted dictionaries; falls back to "0.3.2"
# when the json schema does not declare a "version".
JSON_SCHEMA_VERSION = JSON_SCHEMA.get("version", "0.3.2")
TOP_LEVEL_PROPS = {
    "schemaVersion": JSON_SCHEMA_VERSION,
    "title": "HEAL Data Dictionary",
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,91 @@ | ||
from functools import partial | ||
from pathlib import Path | ||
|
||
from cdislogging import get_logger | ||
|
||
from heal.vlmd import mappings | ||
from heal.vlmd.config import JSON_SCHEMA, TOP_LEVEL_PROPS | ||
from heal.vlmd.extract.csv_dict_conversion import convert_datadict_csv | ||
from heal.vlmd.extract.json_dict_conversion import convert_template_json | ||
from heal.vlmd.utils import clean_json_fields | ||
|
||
logger = get_logger("vlmd-conversion", log_level="debug") | ||
|
||
# CSV converter with the project-wide rename/recode mappings pre-bound.
_csv_data_dict_converter = partial(
    convert_datadict_csv,
    rename_map=mappings.rename_map,
    recode_map=mappings.recode_map,
)

# Registry mapping each supported input type to the callable that converts it.
choice_fxn = {
    "csv-data-dict": _csv_data_dict_converter,
    "json-template": convert_template_json,
}
|
||
# Maps a lower-case file extension (with leading dot) to an input type.
# NOTE(review): the docs list tsv as a supported format but there is no ".tsv"
# entry here - confirm whether tsv files should map to "csv-data-dict".
ext_map = {
    ".csv": "csv-data-dict",
    ".json": "json-template",
}


def _detect_input_type(filepath, ext_to_input_type=ext_map):
    """
    Infer the input type of a file from its extension.

    Args
        filepath (str or Path): Path to the input file.
        ext_to_input_type (dict): Mapping of lower-case extension
            (including the leading dot) to input type.
    Returns
        The mapped input type, or None if the extension is not in the mapping.
    """
    # Coerce so plain strings work too, and compare case-insensitively so
    # that e.g. "DATA.CSV" is detected the same way as "data.csv".
    ext = Path(filepath).suffix.lower()
    return ext_to_input_type.get(ext)
|
||
|
||
def convert_to_vlmd(
    input_filepath,
    input_type=None,
    data_dictionary_props=None,
) -> dict:
    """
    Converts a data dictionary file into HEAL compliant csv and json templates.

    Args
        input_filepath (str or Path): Path to the input file.
        input_type (str): The input type. See keys of 'choice_fxn' dict for options, currently:
            csv-data-dict, json-template.
            When not given, it is detected from the file extension.
        data_dictionary_props (dict):
            The other data-dictionary level properties, passed through to the
            converter. Defaults to an empty dict.
            NOTE(review): the title is presumably defaulted to the file name
            stem by the converter - not verifiable from this module.
    Returns
        The package produced by the converter, containing:
            1. "template_csv": its "fields" entries get a "schemaVersion" key
               if they do not already have one.
            2. "template_json": TOP_LEVEL_PROPS overlaid with the converter's
               json template (converter values win), with "fields" passed
               through clean_json_fields.
    Raises
        ValueError: If the given or detected input type is not a key of 'choice_fxn'.
    """
    input_filepath = Path(input_filepath)

    input_type = input_type or _detect_input_type(input_filepath)
    logger.debug(f"Converting file '{input_filepath}' of input_type '{input_type}'")
    if input_type not in choice_fxn:
        logger.error(f"Unexpected input type {input_type}")
        raise ValueError(
            f"Unexpected input_type '{input_type}', not in {choice_fxn.keys()}"
        )

    # get data dictionary package based on the input type
    converter = choice_fxn[input_type]
    package = converter(input_filepath, data_dictionary_props or {})
    logger.debug(f"Data Dictionary Package keys {package.keys()}")

    # For now we return the csv and json in one package.
    # If any multiple data dictionaries are needed then implement the methods in
    # https://github.com/HEAL/healdata-utils/blob/5080227454d8e731d46a51aa6933c93523eb3b9a/src/healdata_utils/conversion.py#L196

    # add schema version to csv fields that do not already carry one.
    # TOP_LEVEL_PROPS holds the same version used for the json template,
    # including the config default when the schema declares no "version".
    schema_version = TOP_LEVEL_PROPS["schemaVersion"]
    for field in package["template_csv"]["fields"]:
        field.setdefault("schemaVersion", schema_version)

    # remove empty json fields, add schema version (in TOP_LEVEL_PROPS)
    json_template = package["template_json"]
    json_template["fields"] = clean_json_fields(json_template["fields"])
    package["template_json"] = {**TOP_LEVEL_PROPS, **dict(json_template)}

    return package
Oops, something went wrong.