Skip to content

Commit

Permalink
Merge branch 'master' into feat/add-pypi
Browse files Browse the repository at this point in the history
  • Loading branch information
george42-ctds authored Feb 26, 2025
2 parents a82f3e2 + 65b395f commit fc99a7c
Show file tree
Hide file tree
Showing 33 changed files with 2,595 additions and 813 deletions.
17 changes: 14 additions & 3 deletions heal/cli/extract.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,10 +10,17 @@
@click.option(
"--input_file",
"input_file",
required=True,
help="name of file to extract HEAL-compliant VLMD file",
required=True,
type=click.Path(writable=True),
)
@click.option(
"--title",
"title",
help="Root level title for the dictionary (required if extracting from csv to json)",
default=None,
type=str,
)
@click.option(
"--output_dir",
"output_dir",
Expand All @@ -22,12 +29,16 @@
type=click.Path(writable=True),
show_default=True,
)
def extract(input_file, output_dir):
def extract(input_file, title, output_dir):
"""Extract HEAL-compliant VLMD file from input file"""

logging.info(f"Extracting VLMD from {input_file}")

try:
vlmd_extract(input_file, output_dir=output_dir)
vlmd_extract(
input_file,
title,
output_dir=output_dir,
)
except Exception as e:
logging.error(f"Extraction error {str(e)}")
10 changes: 7 additions & 3 deletions heal/harvard_downloads.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@

from pathlib import Path
from typing import Dict, List
from utils import unpackage_object, get_id, download_from_url
from heal.utils import unpackage_object, get_id, download_from_url

from cdislogging import get_logger
from gen3.tools.download.drs_download import DownloadStatus
Expand All @@ -20,12 +20,14 @@


def get_harvard_dataverse_files(
file_metadata_list: List, download_path: str = "."
wts_hostname: str, auth, file_metadata_list: List, download_path: str = "."
) -> Dict:
"""
Retrieves external data from the Harvard Dataverse.
Args:
wts_hostname (str): hostname for commons with wts (not being used at this moment)
auth (Gen3Auth): auth for commons with wts (not being used at this moment)
file_metadata_list (List of Dict): list of studies or files
download_path (str): path to download files and unpack
Expand Down Expand Up @@ -102,6 +104,8 @@ def get_download_url_for_harvard_dataverse(file_metadata: Dict) -> str:
base_url = "https://demo.dataverse.org/api/access"
if "study_id" in file_metadata:
url = f"{base_url}/dataset/:persistentId/?persistentId={file_metadata.get('study_id')}"
elif "file_id" in file_metadata:
url = f"{base_url}/datafile/{file_metadata.get('file_id')}"
else:
url = None

Expand All @@ -122,7 +126,7 @@ def is_valid_harvard_file_metadata(file_metadata: Dict) -> bool:
if not isinstance(file_metadata, dict):
logger.critical(f"Invalid metadata - item is not a dict: {file_metadata}")
return False
if "study_id" not in file_metadata:
if "study_id" not in file_metadata and "file_id" not in file_metadata:
logger.critical(
f"Invalid metadata - missing required Harvard Dataverse keys {file_metadata}"
)
Expand Down
2 changes: 1 addition & 1 deletion heal/qdr_downloads.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@

from pathlib import Path
from typing import Dict, List
from utils import unpackage_object, get_id, download_from_url, get_idp_access_token
from heal.utils import unpackage_object, get_id, download_from_url, get_idp_access_token

from cdislogging import get_logger
from gen3.tools.download.drs_download import DownloadStatus, wts_get_token
Expand Down
9 changes: 6 additions & 3 deletions heal/vlmd/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -57,7 +57,8 @@ except ExtractionError as e_err:

The extract module implements extraction and conversion of dictionaries into different formats.

The current formats are csv, json, and tsv.
The current formats are csv, json, and tsv. A `title=<TITLE>` parameter should be supplied when
converting from non-json to json format.

The `vlmd_extract()` method raises a `jsonschema.ValidationError` for an invalid input file
and raises an `ExtractionError` for any other type of error.
Expand All @@ -70,7 +71,7 @@ from jsonschema import ValidationError
from heal.vlmd import vlmd_extract, ExtractionError

try:
vlmd_extract("vlmd_for_extraction.csv", output_dir="./output")
vlmd_extract("vlmd_for_extraction.csv", title="the dictionary title", output_dir="./output")

except ValidationError as v_err:
# handle validation error
Expand Down Expand Up @@ -115,4 +116,6 @@ For example, the following can validate a VLMD file in csv format:
The following would extract a json format VLMD file from a csv format input file and
write a json file in the directory `output`:

`heal vlmd extract --input_file "vlmd_for_extraction.csv" --output_dir "./output"`
`heal vlmd extract --input_file "vlmd_for_extraction.csv" --title "The dictionary title" --output_dir "./output"`

The `--title` option is required when extracting from `csv` to `json`.
4 changes: 3 additions & 1 deletion heal/vlmd/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@

# file suffixes
ALLOWED_INPUT_TYPES = ["csv", "tsv", "json"]
ALLOWED_FILE_TYPES = ["auto"] + ALLOWED_INPUT_TYPES
ALLOWED_FILE_TYPES = ["auto", "csv", "tsv", "json", "redcap"]
ALLOWED_SCHEMA_TYPES = ["auto", "csv", "json", "tsv"]
ALLOWED_OUTPUT_TYPES = ["csv", "json"]

Expand All @@ -23,6 +23,8 @@

# schema
JSON_SCHEMA_VERSION = JSON_SCHEMA.get("version", "0.3.2")
# The title is a default title used in the validation process.
# It will get overwritten by a user-specified title in the extraction process.
TOP_LEVEL_PROPS = {
"schemaVersion": JSON_SCHEMA_VERSION,
"title": "HEAL Data Dictionary",
Expand Down
6 changes: 4 additions & 2 deletions heal/vlmd/extract/conversion.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
from heal.vlmd.config import JSON_SCHEMA, TOP_LEVEL_PROPS
from heal.vlmd.extract.csv_dict_conversion import convert_datadict_csv
from heal.vlmd.extract.json_dict_conversion import convert_template_json
from heal.vlmd.extract.redcap_csv_dict_conversion import convert_redcap_csv
from heal.vlmd.utils import clean_json_fields

logger = get_logger("vlmd-conversion", log_level="debug")
Expand All @@ -18,6 +19,7 @@
recode_map=mappings.recode_map,
),
"json-template": convert_template_json,
"redcap-csv-dict": convert_redcap_csv,
}

ext_map = {
Expand Down Expand Up @@ -45,7 +47,8 @@ def convert_to_vlmd(
input_type (str): The input type. See keys of 'choice_fxn' dict for options, currently:
csv-data-dict, json-template.
data_dictionary_props (dict):
The other data-dictionary level properties. By default, will give the data_dictionary `title` property as the file name stem.
The other data-dictionary level properties. By default,
will give the data_dictionary `title` property as the file name stem.
Returns
Dictionary with:
Expand All @@ -71,7 +74,6 @@ def convert_to_vlmd(
data_dictionary_package = choice_fxn[input_type](
input_filepath, data_dictionary_props
)
logger.debug(f"Data Dictionary Package keys {data_dictionary_package.keys()}")

# For now we return the csv and json in one package.
# If any multiple data dictionaries are needed then implement the methods in
Expand Down
20 changes: 19 additions & 1 deletion heal/vlmd/extract/csv_dict_conversion.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,10 +4,15 @@

import pandas as pd

from cdislogging import get_logger
from heal.vlmd.config import JSON_SCHEMA
from heal.vlmd.extract import utils
from heal.vlmd.extract.redcap_csv_dict_conversion import convert_redcap_csv
from heal.vlmd.utils import has_redcap_headers
from heal.vlmd.validate.utils import read_delim

logger = get_logger("csv-conversion", log_level="debug")


def _parse_string_objects(
tbl_csv: pd.DataFrame, field_properties: dict
Expand Down Expand Up @@ -75,7 +80,8 @@ def convert_datadict_csv(
The HEAL-specified data dictionary properties.
rename_map: A mapping of source (current) column headers to target (desired -- conforming to CSV HEAL spec)
column headers
recode_map: A mapping of values for each column in HEAL spec -- {..."column_name":{"old_value":"new_value"...}...}
recode_map: A mapping of values for each column in HEAL spec, eg,
{..."column_name":{"old_value":"new_value"...}...}
drop_list: a list of variables to drop from headers before processing
item_sep:str (default:"|") Used to split stringified items (in objects and arrays)
key_val_sep:str (default:"=") Used to split stringified each key-value pair
Expand Down Expand Up @@ -123,10 +129,22 @@ def infer_delim(series: pd.Series, char_list: list, firstmatch: bool):
return inferred_delim

if isinstance(csv_template, (str, PathLike)):
logger.debug("Getting data from path to CSV file")
template_tbl = read_delim(str(Path(csv_template)))
else:
logger.debug("Getting data from input dataframe")
template_tbl = pd.DataFrame(csv_template)

# If REDCap then get dictionary and return.
column_names = template_tbl.columns
if has_redcap_headers(column_names):
logger.debug("File appears to have REDCap headers. Ready to convert.")
converted_dict = convert_redcap_csv(template_tbl)
return converted_dict
else:
logger.debug("File is CSV dictionary, not REDCap dictionary.")

# Regular CSV, not REDCap.
if not rename_map:
rename_map = {}

Expand Down
103 changes: 94 additions & 9 deletions heal/vlmd/extract/extract.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
import os
from os.path import isfile
from pathlib import Path

Expand All @@ -9,13 +10,50 @@
ALLOWED_INPUT_TYPES,
ALLOWED_OUTPUT_TYPES,
)
from heal.vlmd.extract.conversion import convert_to_vlmd
from heal.vlmd.file_utils import get_output_filepath, write_vlmd_dict
from heal.vlmd.validate.validate import file_type_to_fxn_map


logger = get_logger("extract", log_level="debug")


def set_title_if_missing(file_type: str, title: str, converted_dict: dict) -> dict:
    """
    Ensure the converted dictionary has a root level title for JSON output.

    The title is resolved with the following precedence:
      1. A user-supplied `title` always wins and is written to the dictionary.
      2. Otherwise, for non-json input, the title is taken from
         standardsMappings[0].instrument.title.
      3. Otherwise, for json input, the dictionary is returned unchanged.

    Args:
        file_type (str): the type of the input file, eg, "csv", "tsv", "json".
        title (str): the user-supplied root level title, or None if not supplied.
        converted_dict (dict): the converted dictionary. It is modified in place.

    Returns:
        The same `converted_dict` object, with `title` updated where applicable.

    Raises ExtractionError if the input is not json and neither a user-supplied
    title nor an instrument title is available.
    """
    if title is not None:
        # The user-supplied title takes precedence over every other source.
        # Return here even when the dictionary already carries the same title,
        # otherwise the instrument title below would overwrite the user's choice.
        if title != converted_dict.get("title"):
            logger.debug(f"JSON dictionary setting user-defined title '{title}'")
            converted_dict["title"] = title
        return converted_dict

    instrument_title = None
    try:
        standards_mappings = converted_dict.get("standardsMappings")
        if standards_mappings:
            instrument_title = standards_mappings[0].get("instrument").get("title")
    except (AttributeError, IndexError, KeyError, TypeError):
        # Malformed or missing 'instrument' entry: treat as "no instrument title".
        logger.warning("standardsMapping does not have 'instrument.title'")

    # JSON input keeps whatever title it already has.
    if file_type == "json":
        return converted_dict

    if instrument_title is None:
        message = "Title must be supplied when extracting from non-json to json"
        logger.error(message)
        raise ExtractionError(message)

    logger.debug(
        f"JSON dictionary setting title to instrument title '{instrument_title}'"
    )
    converted_dict["title"] = instrument_title
    return converted_dict


def vlmd_extract(
input_file, file_type="auto", output_dir=".", output_type="json"
input_file, title=None, file_type="auto", output_dir=".", output_type="json"
) -> bool:
"""
Extract a HEAL compliant csv and json format VLMD data dictionary
Expand All @@ -24,9 +62,11 @@ def vlmd_extract(
Args:
input_file (str): the path of the input HEAL VLMD file to be extracted
into HEAL-compliant VLMD file(s).
title (str): the root level title of the dictionary (required if extracting from csv to json)
file_type (str): the type of the input file that will be extracted into a
HEAL-compliant VLMD file.
Allowed values are "auto", “csv”, "json", "tsv".
Allowed values are "auto", “csv”, "json", "tsv", and "redcap"
where "redcap" is a csv file of a REDCap dictionary export.
Defaults to “auto”.
output_dir (str): the directory of where the extracted VLMD file will
be written. Defaults to “.”
Expand All @@ -38,7 +78,9 @@ def vlmd_extract(
Raises ExtractionError if input VLMD is not valid or could not be converted.
"""

logger.info(f"Extracting VLMD file '{input_file}' with file_type '{file_type}'")
logger.info(
f"Extracting VLMD file '{input_file}' with input file_type '{file_type}'"
)

file_suffix = Path(input_file).suffix.replace(".", "")
if file_suffix not in ALLOWED_INPUT_TYPES:
Expand All @@ -62,15 +104,58 @@ def vlmd_extract(
logger.error(message)
raise ExtractionError(message)

logger.debug(f"File type is set to '{file_type}'")
# validate
try:
# csv files are converted as part of validate
converted_dictionary = vlmd_validate(
input_file, output_type=output_type, return_converted_output=True
input_file,
file_type=file_type,
output_type=output_type,
return_converted_output=True,
)
except Exception as e:
except Exception as err:
logger.error(f"Error in validating and extracting dictionary from {input_file}")
logger.error(e)
raise ExtractionError(str(e))
logger.error(err)
raise ExtractionError(str(err))

# input json file require explicit conversion and post validation steps
if file_type == "json":
file_convert_function = file_type_to_fxn_map.get(file_type)
data_dictionary_props = {}
try:
logger.debug("Ready to convert json input to VLMD")
data_dictionaries = convert_to_vlmd(
input_filepath=input_file,
input_type=file_convert_function,
data_dictionary_props=data_dictionary_props,
)
if output_type == "json":
converted_dictionary = data_dictionaries["template_json"]
logger.debug(
f"Ready to validate converted dict with output type '{output_type}'"
)
is_valid = vlmd_validate(
converted_dictionary,
file_type=file_type,
output_type=output_type,
return_converted_output=False,
)
logger.debug(f"Converted dictionary is valid: {is_valid}")
else:
converted_dictionary = data_dictionaries["template_csv"]["fields"]
except Exception as err:
logger.error(f"Error in extracting JSON dictionary from {input_file}")
logger.error(err)
raise ExtractionError(str(err))

if output_type == "json":
converted_dictionary = set_title_if_missing(
file_type=file_type, title=title, converted_dict=converted_dictionary
)
if converted_dictionary.get("title") is None:
logger.error("JSON dictionary is missing 'title'")
raise ExtractionError("JSON dictionary is missing 'title'")

# write to file
output_filepath = get_output_filepath(
Expand All @@ -79,9 +164,9 @@ def vlmd_extract(
logger.info(f"Writing converted dictionary to {output_filepath}")
try:
write_vlmd_dict(converted_dictionary, output_filepath, file_type=output_type)
except Exception as e:
except Exception as err:
logger.error("Error in writing converted dictionary")
logger.error(e)
logger.error(err)
raise ExtractionError("Error in writing converted dictionary")

return True
Loading

0 comments on commit fc99a7c

Please sign in to comment.