From 448119098d3e1566ba3696c1c7f38b234191593d Mon Sep 17 00:00:00 2001 From: Craig Barnes Date: Fri, 8 Oct 2021 16:09:27 -0500 Subject: [PATCH 01/70] adding support for aggregation/guppy --- src/mds/agg_mds/commons.py | 57 ++++- src/mds/agg_mds/datastore/__init__.py | 20 +- .../agg_mds/datastore/elasticsearch_dao.py | 206 +++++++++++++++--- src/mds/agg_mds/functions.py | 0 src/mds/agg_mds/query.py | 28 ++- src/mds/populate.py | 67 +++++- 6 files changed, 318 insertions(+), 60 deletions(-) create mode 100644 src/mds/agg_mds/functions.py diff --git a/src/mds/agg_mds/commons.py b/src/mds/agg_mds/commons.py index b577fad4..27f2feca 100644 --- a/src/mds/agg_mds/commons.py +++ b/src/mds/agg_mds/commons.py @@ -1,8 +1,6 @@ from dataclasses import dataclass, field from dataclasses_json import dataclass_json -from typing import Any, Dict, List, Optional -from datetime import datetime -import json +from typing import Any, Dict, List, Optional, Union @dataclass_json @@ -10,7 +8,7 @@ class ColumnsToFields: """ A more complex mapping object for mapping column names to MDS fields - allows to explictly mark a field as missing, a default value and it's resources type + allows to explicitly mark a field as missing, a default value and it's resources type """ name: str @@ -18,17 +16,55 @@ class ColumnsToFields: default: str = "" type: str = "string" + def get_value(self, info: dict): + return info.get(self.name, self.default) + + +@dataclass_json +@dataclass +class FieldAggregation: + """ + Provides a description of what fields to compute summary information. 
+ The default assumes computing the sum of the field, assuming it is a number + the functions supported are: sum and count + """ + + type: str = "number" + function: str = "sum" + + +@dataclass_json +@dataclass +class FieldDefinition: + """ + Provides a description of a field defined in the metadata + While other fields are defined dynamically, these help "tune" + certain fields + * type: one of string, number, object, nested (deeper object) + * aggregate: aggregation is available + """ + + type: str = "string" + aggregate: bool = False + @dataclass_json @dataclass class MDSInstance: mds_url: str commons_url: str - columns_to_fields: Optional[Dict[str, Any]] = None + columns_to_fields: Optional[ + Union[Dict[str, str], Dict[str, ColumnsToFields]] + ] = None study_data_field: str = "gen3_discovery" guid_type: str = "discovery_metadata" select_field: Optional[Dict[str, str]] = None + def __post_init__(self): + for name, value in self.columns_to_fields.items(): + if isinstance(value, dict): + self.columns_to_fields[name] = ColumnsToFields.from_dict(value) + @dataclass_json @dataclass @@ -50,9 +86,8 @@ class AdapterMDSInstance: class Commons: gen3_commons: Dict[str, MDSInstance] adapter_commons: Dict[str, AdapterMDSInstance] - aggregation: List[str] = field( - default_factory=lambda: ["_unique_id", "_subjects_count"] - ) + aggregations: Optional[Dict[str, FieldAggregation]] + fields: Optional[Dict[str, FieldDefinition]] def parse_config(data: Dict[str, Any]) -> Commons: @@ -63,7 +98,9 @@ def parse_config(data: Dict[str, Any]) -> Commons: return Commons.from_dict( { - "gen3_commons": data.get("gen3_commons", dict()), - "adapter_commons": data.get("adapter_commons", dict()), + "gen3_commons": data.get("gen3_commons", {}), + "adapter_commons": data.get("adapter_commons", {}), + "aggregations": data.get("aggregations", {}), + "fields": data.get("fields", {}), } ) diff --git a/src/mds/agg_mds/datastore/__init__.py b/src/mds/agg_mds/datastore/__init__.py index 
a049d1e2..5d2fb137 100644 --- a/src/mds/agg_mds/datastore/__init__.py +++ b/src/mds/agg_mds/datastore/__init__.py @@ -13,8 +13,8 @@ async def init(hostname, port): await client.init(hostname, port) -async def drop_all(): - await client.drop_all() +async def drop_all(commons_mapping): + await client.drop_all(commons_mapping) async def close(): @@ -32,10 +32,22 @@ async def update_metadata(*args): await client.update_metadata(*args) +async def update_global_info(*args): + await client.update_global_info(*args) + + +async def update_config_info(*args): + await client.update_config_info(*args) + + async def get_commons_metadata(*args): return await client.get_commons_metadata(*args) +async def get_all_tags(): + return await client.metadata_tags() + + async def get_all_named_commons_metadata(*args): return await client.get_all_named_commons_metadata(*args) @@ -60,5 +72,9 @@ async def get_aggregations(*args): return await client.get_aggregations(*args) +async def get_number_aggregations(*args): + return await client.get_number_aggregation_for_field(*args) + + async def search(*args): return await client.search(*args) diff --git a/src/mds/agg_mds/datastore/elasticsearch_dao.py b/src/mds/agg_mds/datastore/elasticsearch_dao.py index 39509226..e1aa6775 100644 --- a/src/mds/agg_mds/datastore/elasticsearch_dao.py +++ b/src/mds/agg_mds/datastore/elasticsearch_dao.py @@ -2,13 +2,12 @@ from typing import List, Dict import json from mds import logger -from mds.config import AGG_MDS_NAMESPACE - +from mds.config import AGG_MDS_NAMESPACE, ES_RETRY_LIMIT, ES_RETRY_INTERVAL # TODO WFH Why do we have both __manifest and _file_manifest? # TODO WFH These are bugs. If we have to check whether an object is a string or -# an object, the data is bad. -FIELD_NORMALIZERS = { +# an object, the data is bad. 
+DEFAULT_FIELD_NORMALIZERS = { "__manifest": "object", "_file_manifest": "object", "advSearchFilters": "object", @@ -16,33 +15,83 @@ "sites": "number", } - AGG_MDS_INDEX = f"{AGG_MDS_NAMESPACE}-commons-index" AGG_MDS_TYPE = "commons" - AGG_MDS_INFO_INDEX = f"{AGG_MDS_NAMESPACE}-commons-info-index" AGG_MDS_INFO_TYPE = "commons-info" +AGG_MDS_CONFIG_INDEX = f"{AGG_MDS_NAMESPACE}-commons-config-index" +AGG_MDS_CONFIG_TYPE = "commons-config" MAPPING = { + "settings": { + "index": { + "number_of_shards": 1, + "number_of_replicas": 0, + "analysis": { + "tokenizer": { + "ngram_tokenizer": { + "type": "ngram", + "min_gram": 2, + "max_gram": 20, + "token_chars": ["letter", "digit"], + } + }, + "analyzer": { + "ngram_analyzer": { + "type": "custom", + "tokenizer": "ngram_tokenizer", + "filter": ["lowercase"], + }, + "search_analyzer": { + "type": "custom", + "tokenizer": "keyword", + "filter": "lowercase", + }, + }, + }, + } + }, "mappings": { "commons": { "properties": { + "auth_resource_path": {"type": "keyword"}, "__manifest": { "type": "nested", }, "tags": { "type": "nested", + "properties": { + "name": { + "type": "text", + "fields": { + "analyzed": { + "type": "text", + "term_vector": "with_positions_offsets", + "analyzer": "ngram_analyzer", + "search_analyzer": "search_analyzer", + } + }, + }, + "category": {"type": "text"}, + }, }, - "data_dictionary": { + "advSearchFilters": { "type": "nested", }, } } - } + }, +} + +CONFIG = { + "settings": {"index": {"number_of_shards": 1, "number_of_replicas": 0}}, + "mappings": {"_doc": {"properties": {"array": {"type": "keyword"}}}}, } +SAMPLE = {"array": ["tags", "advSearchFilters"]} + elastic_search_client = None @@ -52,19 +101,21 @@ async def init(hostname: str = "0.0.0.0", port: int = 9200): [hostname], scheme="http", port=port, - timeout=30, - max_retries=7, + timeout=ES_RETRY_INTERVAL, + max_retries=ES_RETRY_LIMIT, retry_on_timeout=True, ) -async def drop_all(): - for index in [AGG_MDS_INDEX, AGG_MDS_INFO_INDEX]: +async 
def drop_all(common_mapping: dict): + for index in [AGG_MDS_INDEX, AGG_MDS_INFO_INDEX, AGG_MDS_CONFIG_TYPE]: res = elastic_search_client.indices.delete(index=index, ignore=[400, 404]) logger.debug(f"deleted index: {index}") try: - res = elastic_search_client.indices.create(index=AGG_MDS_INDEX, body=MAPPING) + res = elastic_search_client.indices.create( + index=AGG_MDS_INDEX, body=common_mapping + ) logger.debug(f"created index {AGG_MDS_INDEX}: {res}") except es_exceptions.RequestError as ex: if ex.error == "resource_already_exists_exception": @@ -86,12 +137,24 @@ async def drop_all(): else: # Other exception - raise it raise ex + try: + res = elastic_search_client.indices.create( + index=AGG_MDS_CONFIG_INDEX, body=CONFIG + ) + logger.debug(f"created index {AGG_MDS_CONFIG_INDEX}: {res}") + except es_exceptions.RequestError as ex: + if ex.error == "resource_already_exists_exception": + logger.warning(f"index already exists: {AGG_MDS_CONFIG_INDEX}") + pass # Index already exists. Ignore. + else: # Other exception - raise it + raise ex + def normalize_field(doc, key, normalize_type): try: if normalize_type == "object" and isinstance(doc[key], str): value = doc[key] - doc[key] = None if value is "" else json.loads(value) + doc[key] = None if value == "" else json.loads(value) if normalize_type == "number" and isinstance(doc[key], str): doc[key] = None except: @@ -105,6 +168,7 @@ async def update_metadata( guid_arr: List[str], tags: Dict[str, List[str]], info: Dict[str, str], + field_normalizers: Dict[str, str], study_data_field: str, ): elastic_search_client.index( @@ -114,20 +178,36 @@ async def update_metadata( body=info, ) + unified_field_normalizers = {**field_normalizers} for doc in data: key = list(doc.keys())[0] # Flatten out this structure doc = doc[key][study_data_field] - for field in FIELD_NORMALIZERS.keys(): + for field in unified_field_normalizers.keys(): if field in doc: - normalize_field(doc, field, FIELD_NORMALIZERS[field]) + normalize_field(doc, field, 
unified_field_normalizers[field]) elastic_search_client.index( index=AGG_MDS_INDEX, doc_type=AGG_MDS_TYPE, id=key, body=doc ) +async def update_global_info(key, doc) -> None: + elastic_search_client.index( + index=AGG_MDS_INFO_INDEX, doc_type=AGG_MDS_INFO_TYPE, id=key, body=doc + ) + + +async def update_config_info(doc) -> None: + elastic_search_client.index( + index=AGG_MDS_CONFIG_INDEX, + doc_type="_doc", + id=AGG_MDS_INDEX, + body=doc, + ) + + async def get_status(): if not elastic_search_client.ping(): raise ValueError("Connection failed") @@ -157,27 +237,41 @@ async def get_commons(): return [] -async def get_all_metadata(limit, offset): +async def get_all_metadata(limit, offset, flatten=False): try: res = elastic_search_client.search( index=AGG_MDS_INDEX, body={"size": limit, "from": offset, "query": {"match_all": {}}}, ) - byCommons = {} - for record in res["hits"]["hits"]: - id = record["_id"] - normalized = record["_source"] - commons_name = normalized["commons_name"] - if commons_name not in byCommons: - byCommons[commons_name] = [] - byCommons[commons_name].append( - { - id: { - "gen3_discovery": normalized, + if flatten: + flat = [] + for record in res["hits"]["hits"]: + id = record["_id"] + normalized = record["_source"] + flat.append( + { + id: { + "gen3_discovery": normalized, + } } - } - ) - return byCommons + ) + else: + byCommons = {} + for record in res["hits"]["hits"]: + id = record["_id"] + normalized = record["_source"] + commons_name = normalized["commons_name"] + + if commons_name not in byCommons: + byCommons[commons_name] = [] + byCommons[commons_name].append( + { + id: { + "gen3_discovery": normalized, + } + } + ) + return byCommons except Exception as error: logger.error(error) return {} @@ -195,9 +289,9 @@ async def get_all_named_commons_metadata(name): return {} -async def metadata_tags(name): +async def metadata_tags(): try: - return elastic_search_client.search( + res = elastic_search_client.search( index=AGG_MDS_INDEX, body={ 
"size": 0, @@ -220,6 +314,16 @@ async def metadata_tags(name): }, }, ) + results = {} + + for info in res["aggregations"]["tags"]["categories"]["buckets"]: + results[info["key"]] = { + "total": info["doc_count"], + "names": [{x["key"]: x["doc_count"] for x in info["name"]["buckets"]}], + } + + return results + except Exception as error: logger.error(error) return [] @@ -267,6 +371,42 @@ async def get_aggregations(name): return [] +async def get_number_aggregation_for_field(field: str): + try: + # get the total number of documents in a commons namespace + query = { + "size": 0, + "aggs": { + field: {"sum": {"field": field}}, + "missing": {"missing": {"field": field}}, + "types_count": {"value_count": {"field": field}}, + }, + } + nested = False + parts = field.split(".") + if len(parts) == 2: + nested = True + query["aggs"] = { + field: {"nested": {"path": parts[0]}, "aggs": query["aggs"]} + } + + res = elastic_search_client.search(index=AGG_MDS_INDEX, body=query) + + agg_results = res["aggregations"][field] if nested else res["aggregations"] + + return { + field: { + "total_items": res["hits"]["total"], + "sum": agg_results[field]["value"], + "missing": agg_results["missing"]["doc_count"], + } + } + + except Exception as error: + logger.error(error) + return {} + + async def get_by_guid(guid): try: data = elastic_search_client.get( diff --git a/src/mds/agg_mds/functions.py b/src/mds/agg_mds/functions.py new file mode 100644 index 00000000..e69de29b diff --git a/src/mds/agg_mds/query.py b/src/mds/agg_mds/query.py index 23b500c9..5eedf472 100644 --- a/src/mds/agg_mds/query.py +++ b/src/mds/agg_mds/query.py @@ -25,7 +25,7 @@ async def metadata( offset: int = Query(0, description="Return results at this given offset."), ): # TODO WFH How to properly return this? We think grouping by MDS is probably - # not ideal in reality. We already have commons_name in the results. + # not ideal in reality. We already have commons_name in the results. 
""" Returns all metadata from all registered commons in the form: { @@ -71,6 +71,21 @@ async def metadata_tags(name: str): ) +@mod.get("/aggregate/tags") +async def metadata_tags(): + """ + Returns the tags associated with the named commons. + """ + res = await datastore.get_all_tags() + if res: + return res + else: + raise HTTPException( + HTTP_404_NOT_FOUND, + {"message": f"error retrieving tags from service", "code": 404}, + ) + + @mod.get("/aggregate/metadata/{name}/info") async def metadata_info(name: str): """ @@ -86,15 +101,18 @@ async def metadata_info(name: str): ) -@mod.get("/aggregate/metadata/{name}/aggregations") -async def metadata_aggregations(name: str): - res = await datastore.get_aggregations(name) +@mod.get("/aggregate/summary/{field}") +async def metadata_aggregations(field: str): + res = await datastore.get_number_aggregations(field) if res: return res else: raise HTTPException( HTTP_404_NOT_FOUND, - {"message": f"no common exists with the given: {name}", "code": 404}, + { + "message": f"metadata_aggregations: no common exists with the given: {field}", + "code": 404, + }, ) diff --git a/src/mds/populate.py b/src/mds/populate.py index 06727044..ffcc71e5 100644 --- a/src/mds/populate.py +++ b/src/mds/populate.py @@ -3,7 +3,7 @@ from typing import Any, Dict, List from mds.agg_mds import datastore, adapters from mds.agg_mds.mds import pull_mds -from mds.agg_mds.commons import MDSInstance, AdapterMDSInstance, Commons, parse_config +from mds.agg_mds.commons import MDSInstance, ColumnsToFields, Commons, parse_config from mds import config, logger from pathlib import Path from urllib.parse import urlparse @@ -58,10 +58,15 @@ def normalize(entry: dict) -> Any: for column, field in common.columns_to_fields.items(): if field == column: continue - if field in entry[common.study_data_field]: - entry[common.study_data_field][column] = entry[ - common.study_data_field - ][field] + if isinstance(field, ColumnsToFields): + entry[common.study_data_field][column] 
= field.get_value( + entry[common.study_data_field] + ) + else: + if field in entry[common.study_data_field]: + entry[common.study_data_field][column] = entry[ + common.study_data_field + ][field] return entry entry = normalize(entry) @@ -81,12 +86,35 @@ def normalize(entry: dict) -> Any: keys = list(results.keys()) info = {"commons_url": common.commons_url} + + # build ES normalization dictionary + field_typing = { + field: "object" if info.type in ["nested", "array"] else info.type + for field, info in commons.fields.items() + } + await datastore.update_metadata( - name, mds_arr, keys, tags, info, common.study_data_field + name, mds_arr, keys, tags, info, field_typing, common.study_data_field ) -async def main(commons_config: Commons, hostname: str, port: int) -> None: +async def populate_info(commons_config: Commons) -> None: + agg_info = { + key: value.to_dict() for key, value in commons_config.aggregations.items() + } + await datastore.update_global_info("aggregations", agg_info) + + +async def populate_config(commons_config: Commons) -> None: + array_definition = { + "array": [ + field for field, value in commons.fields.items() if value.type == "array" + ] + } + await datastore.update_config_info(array_definition) + + +async def main(commons_config: Commons) -> None: """ Given a config structure, pull all metadata from each one in the config and cache into the following structure: @@ -103,13 +131,28 @@ async def main(commons_config: Commons, hostname: str, port: int) -> None: """ if not config.USE_AGG_MDS: - print("aggregate MDS disabled") + logger.info("aggregate MDS disabled") exit(1) url_parts = urlparse(config.ES_ENDPOINT) await datastore.init(hostname=url_parts.hostname, port=url_parts.port) - await datastore.drop_all() + + # build mapping table for commons index + + field_mapping = { + "mappings": { + "commons": { + "properties": { + field: {"type": {"array": "nested"}.get(info.type, info.type)} + for field, info in commons.fields.items() + if info.type 
in ["array", "nested"] + } + } + } + } + + await datastore.drop_all(commons_mapping=field_mapping) for name, common in commons_config.gen3_commons.items(): logger.info(f"Populating {name} using Gen3 MDS connector") @@ -134,6 +177,10 @@ async def main(commons_config: Commons, hostname: str, port: int) -> None: if len(results) > 0: await populate_metadata(name, common, results) + # populate global information index + await populate_info(commons_config) + # populate array index information to support guppy + await populate_config(commons_config) res = await datastore.get_status() print(res) await datastore.close() @@ -184,4 +231,4 @@ def parse_config_from_file(path: Path) -> Commons: """ args: Namespace = parse_args(sys.argv) commons = parse_config_from_file(Path(args.config)) - asyncio.run(main(commons_config=commons, hostname=args.hostname, port=args.port)) + asyncio.run(main(commons_config=commons)) From 810cbfda9896dacee645dc84f492394c0624292b Mon Sep 17 00:00:00 2001 From: Craig Barnes Date: Thu, 14 Oct 2021 12:55:38 -0500 Subject: [PATCH 02/70] add support for default value --- src/mds/agg_mds/adapters.py | 37 ++++++++++++++++++++++++------------- src/mds/config.py | 3 +++ 2 files changed, 27 insertions(+), 13 deletions(-) diff --git a/src/mds/agg_mds/adapters.py b/src/mds/agg_mds/adapters.py index 972446e4..362f1352 100644 --- a/src/mds/agg_mds/adapters.py +++ b/src/mds/agg_mds/adapters.py @@ -54,14 +54,20 @@ def execute(cls, name, value): return FieldFilters.filters[name](value) -def get_json_path_value(expression: str, item: dict) -> Union[str, List[Any]]: +def get_json_path_value( + expression: str, + item: dict, + has_default_value: bool = False, + default_value: str = "", +) -> Union[str, List[Any]]: """ Given a JSON Path expression and a dictionary, using the path expression - to find the value. If not found return an empty string + to find the value. 
If not found return and default value define return it, else + return None """ if expression is None: - return "" + return default_value if has_default_value else None try: jsonpath_expr = parse(expression) @@ -69,11 +75,11 @@ def get_json_path_value(expression: str, item: dict) -> Union[str, List[Any]]: logger.error( f"Invalid JSON Path expression {exc} . See https://github.com/json-path/JsonPath. Returning ''" ) - return "" + return default_value if has_default_value else None v = jsonpath_expr.find(item) - if len(v) == 0: # nothing found use default value of empty string - return "" + if len(v) == 0: # nothing found, deal with this + return default_value if has_default_value else None if len(v) == 1: # convert array length 1 to a value return v[0].value @@ -132,6 +138,7 @@ def mapFields(item: dict, mappings: dict, global_filters=None) -> dict: field: { path: JSON Path filters: [process field filters] + default_value(optional): Any Value } :param item: dictionary to map fields to @@ -147,7 +154,11 @@ def mapFields(item: dict, mappings: dict, global_filters=None) -> dict: for key, value in mappings.items(): if isinstance(value, dict): # have a complex assignment expression = value.get("path", None) - field_value = get_json_path_value(expression, item) + if hasDefaultValue := "default_value" in value: + default_value = value["default_value"] + field_value = get_json_path_value( + expression, item, hasDefaultValue, default_value + ) filters = value.get("filters", []) for filter in filters: @@ -217,7 +228,7 @@ def getRemoteDataAsJson(self, **kwargs) -> Dict: raise except httpx.HTTPError as exc: logger.error( - f"An HTTP error { exc.response.status_code if exc.response is not None else '' } occurred while requesting {exc.request.url}. Skipping {id}" + f"An HTTP error {exc.response.status_code if exc.response is not None else ''} occurred while requesting {exc.request.url}. 
Skipping {id}" ) except ValueError as exc: logger.error( @@ -366,17 +377,17 @@ def getRemoteDataAsJson(self, **kwargs) -> Dict: raise except httpx.HTTPError as exc: logger.error( - f"An HTTP error {exc.response.status_code if exc.response is not None else ''} occurred while requesting {exc.request.url}. Returning { len(results['results'])} results" + f"An HTTP error {exc.response.status_code if exc.response is not None else ''} occurred while requesting {exc.request.url}. Returning {len(results['results'])} results" ) break # need to break here as cannot be assured of leaving while loop except ValueError as exc: logger.error( - f"An error occurred while requesting {mds_url} {exc}. Returning { len(results['results'])} results." + f"An error occurred while requesting {mds_url} {exc}. Returning {len(results['results'])} results." ) break except Exception as exc: logger.error( - f"An error occurred while requesting {mds_url} {exc}. Returning { len(results['results'])} results." + f"An error occurred while requesting {mds_url} {exc}. Returning {len(results['results'])} results." ) break @@ -479,7 +490,7 @@ def getRemoteDataAsJson(self, **kwargs) -> Dict: raise except httpx.HTTPError as exc: logger.error( - f"An HTTP error { exc.response.status_code if exc.response is not None else '' } occurred while requesting {exc.request.url}. Skipping {id}" + f"An HTTP error {exc.response.status_code if exc.response is not None else ''} occurred while requesting {exc.request.url}. Skipping {id}" ) return results @@ -584,7 +595,7 @@ def getRemoteDataAsJson(self, **kwargs) -> Dict: raise except httpx.HTTPError as exc: logger.error( - f"An HTTP error { exc.response.status_code if exc.response is not None else '' } occurred while requesting {exc.request.url}. Returning { len(results['results'])} results." + f"An HTTP error {exc.response.status_code if exc.response is not None else ''} occurred while requesting {exc.request.url}. Returning {len(results['results'])} results." 
) break diff --git a/src/mds/config.py b/src/mds/config.py index 020c7432..6af9694a 100644 --- a/src/mds/config.py +++ b/src/mds/config.py @@ -59,6 +59,9 @@ def __init__(self, value): DB_RETRY_LIMIT = config("DB_RETRY_LIMIT", cast=int, default=DB_CONNECT_RETRIES) DB_RETRY_INTERVAL = config("DB_RETRY_INTERVAL", cast=int, default=1) +# Elasticsearch +ES_RETRY_INTERVAL = config("ES_RETRY_INTERVAL", cast=int, default=20) +ES_RETRY_LIMIT = config("ES_RETRY_LIMIT", cast=int, default=5) # Security From 1b908686810fc4c7e3f7b7ef39edc5c1e6c4e0f5 Mon Sep 17 00:00:00 2001 From: Craig Barnes Date: Mon, 18 Oct 2021 15:42:11 -0500 Subject: [PATCH 03/70] schema support and summary/search --- docs/metadata_adapters.md | 33 ++++++++-- src/mds/agg_mds/adapters.py | 11 ++++ src/mds/agg_mds/commons.py | 61 +++++++++++++++---- .../agg_mds/datastore/elasticsearch_dao.py | 25 +++++--- src/mds/agg_mds/query.py | 16 ----- src/mds/populate.py | 19 +++--- 6 files changed, 116 insertions(+), 49 deletions(-) diff --git a/docs/metadata_adapters.md b/docs/metadata_adapters.md index f2b0496a..ebf9594e 100644 --- a/docs/metadata_adapters.md +++ b/docs/metadata_adapters.md @@ -49,7 +49,8 @@ A metadata service is configurable via a JSON object, with the following format: "location": "path:coverage[0]", "summary": { "path":"description", - "filters": ["strip_html"] + "filters": ["strip_html"], + "default_value" : "N/A" }, ... }, @@ -106,10 +107,16 @@ The above methods should allow you to pull any nested value from a metadata entr ```json "summary": { "path":"description", - "filters": ["strip_html"] + "filters": ["strip_html"], + "default_value" : "N/A" } ``` -In this case, the ```summary``` is set to a JSON object which optionally defines a JSON path and an array of one or more filters to apply. The filters are applied to the text value of the remote field. Furthermore, the filters are applied in the order they appear. 
The current set of filters are: +In this case, the ```summary``` is set to a JSON object which optionally defines: +* a JSON path +* an array of one or more filters to apply +* default value to set if that field is not found + +The filters are applied to the text value of the remote field. Furthermore, the filters are applied in the order they appear. The current set of filters are: * strip_html: remove HTML tags from a text field * strip_email: remove email addresses from a text field @@ -123,7 +130,25 @@ def filter_function(s:str) -> str: ``` ### Default Values -Defining default values for fields is quite simple: define the normalized field name and a value. If a remote metadata field has a value, it will override the default. +Defining default values for fields is handled in one of two way: +If a field in the metdata does not need a path, simply define the +field name and a value. If a remote metadata field has a value, it will override the default. +If a path is use then use the longer form and set the ```default_value``` to use +if the path is not found. + +```json +{ + ... + "summary": { + "path": "description", + "filters": [ + "strip_html" + ], + "default_value": "N/A" + }, + ... 
+} +``` ### Per Item Overrides diff --git a/src/mds/agg_mds/adapters.py b/src/mds/agg_mds/adapters.py index 362f1352..797f0317 100644 --- a/src/mds/agg_mds/adapters.py +++ b/src/mds/agg_mds/adapters.py @@ -19,6 +19,8 @@ def strip_email(text: str): + if not isinstance(text, str): + return text rgx = r"[\w.+-]+@[\w-]+\.[\w.-]+" matches = re.findall(rgx, text) for cur_word in matches: @@ -27,14 +29,20 @@ def strip_email(text: str): def strip_html(s: str): + if not isinstance(s, str): + return s return bleach.clean(s, tags=[], strip=True) def add_icpsr_source_url(study_id: str): + if not isinstance(study_id, str): + return study_id return f"https://www.icpsr.umich.edu/web/NAHDAP/studies/{study_id}" def add_clinical_trials_source_url(study_id: str): + if not isinstance(study_id, str): + return study_id return f"https://clinicaltrials.gov/ct2/show/{study_id}" @@ -154,8 +162,11 @@ def mapFields(item: dict, mappings: dict, global_filters=None) -> dict: for key, value in mappings.items(): if isinstance(value, dict): # have a complex assignment expression = value.get("path", None) + + default_value = None if hasDefaultValue := "default_value" in value: default_value = value["default_value"] + field_value = get_json_path_value( expression, item, hasDefaultValue, default_value ) diff --git a/src/mds/agg_mds/commons.py b/src/mds/agg_mds/commons.py index 27f2feca..2462a0e4 100644 --- a/src/mds/agg_mds/commons.py +++ b/src/mds/agg_mds/commons.py @@ -1,6 +1,6 @@ from dataclasses import dataclass, field from dataclasses_json import dataclass_json -from typing import Any, Dict, List, Optional, Union +from typing import Any, Dict, List, Optional, Union, TypeVar @dataclass_json @@ -31,6 +31,10 @@ class FieldAggregation: type: str = "number" function: str = "sum" + chart: str = "text" + + +FieldDefinition = TypeVar("FieldDefinition") @dataclass_json @@ -46,6 +50,31 @@ class FieldDefinition: type: str = "string" aggregate: bool = False + properties: Optional[Dict[str, FieldDefinition]] 
= None + + ES_TYPE_MAPPING = { + "array": "nested", + "string": "text", + "integer": "long", + } + + def __post_init__(self): + if self.properties is not None: + self.properties = { + k: FieldDefinition.from_dict(v) for k, v in self.properties.items() + } + + def to_schema(self, es_types: bool = False): + res = { + "type": FieldDefinition.ES_TYPE_MAPPING.get(self.type, self.type) + if es_types + else self.type + } + if self.properties is not None: + res["properties"] = { + k: v.to_schema(True) for k, v in self.properties.items() + } + return res @dataclass_json @@ -61,6 +90,9 @@ class MDSInstance: select_field: Optional[Dict[str, str]] = None def __post_init__(self): + if self.columns_to_fields is None: + return + for name, value in self.columns_to_fields.items(): if isinstance(value, dict): self.columns_to_fields[name] = ColumnsToFields.from_dict(value) @@ -81,13 +113,23 @@ class AdapterMDSInstance: global_field_filters: List[str] = field(default_factory=list) +@dataclass_json +@dataclass +class Config: + fields: Optional[Dict[str, FieldDefinition]] = field(default_factory=dict) + settings: Optional[Dict[str, Any]] = field(default_factory=dict) + aggregations: Optional[Dict[str, FieldAggregation]] = field(default_factory=dict) + search_settings: Optional[Dict[str, FieldAggregation]] = field(default_factory=dict) + + @dataclass_json @dataclass class Commons: - gen3_commons: Dict[str, MDSInstance] - adapter_commons: Dict[str, AdapterMDSInstance] - aggregations: Optional[Dict[str, FieldAggregation]] - fields: Optional[Dict[str, FieldDefinition]] + configuration: Optional[Config] = None + gen3_commons: Dict[str, MDSInstance] = field(default_factory=dict) + adapter_commons: Dict[str, AdapterMDSInstance] = field(default_factory=dict) + aggregations: Optional[Dict[str, FieldAggregation]] = field(default_factory=dict) + fields: Optional[Dict[str, FieldDefinition]] = field(default_factory=dict) def parse_config(data: Dict[str, Any]) -> Commons: @@ -96,11 +138,4 @@ def 
parse_config(data: Dict[str, Any]) -> Commons: for the Ecosystem browser. Returns a dictionary of MDSInfo entries """ - return Commons.from_dict( - { - "gen3_commons": data.get("gen3_commons", {}), - "adapter_commons": data.get("adapter_commons", {}), - "aggregations": data.get("aggregations", {}), - "fields": data.get("fields", {}), - } - ) + return Commons.from_json(data) diff --git a/src/mds/agg_mds/datastore/elasticsearch_dao.py b/src/mds/agg_mds/datastore/elasticsearch_dao.py index e1aa6775..bd5ce1ef 100644 --- a/src/mds/agg_mds/datastore/elasticsearch_dao.py +++ b/src/mds/agg_mds/datastore/elasticsearch_dao.py @@ -90,8 +90,6 @@ "mappings": {"_doc": {"properties": {"array": {"type": "keyword"}}}}, } -SAMPLE = {"array": ["tags", "advSearchFilters"]} - elastic_search_client = None @@ -110,7 +108,7 @@ async def init(hostname: str = "0.0.0.0", port: int = 9200): async def drop_all(common_mapping: dict): for index in [AGG_MDS_INDEX, AGG_MDS_INFO_INDEX, AGG_MDS_CONFIG_TYPE]: res = elastic_search_client.indices.delete(index=index, ignore=[400, 404]) - logger.debug(f"deleted index: {index}") + logger.debug(f"deleted index: {index}: {res}") try: res = elastic_search_client.indices.create( @@ -188,9 +186,12 @@ async def update_metadata( if field in doc: normalize_field(doc, field, unified_field_normalizers[field]) - elastic_search_client.index( - index=AGG_MDS_INDEX, doc_type=AGG_MDS_TYPE, id=key, body=doc - ) + try: + elastic_search_client.index( + index=AGG_MDS_INDEX, doc_type=AGG_MDS_TYPE, id=key, body=doc + ) + except Exception as ex: + print(ex) async def update_global_info(key, doc) -> None: @@ -391,7 +392,6 @@ async def get_number_aggregation_for_field(field: str): } res = elastic_search_client.search(index=AGG_MDS_INDEX, body=query) - agg_results = res["aggregations"][field] if nested else res["aggregations"] return { @@ -407,6 +407,17 @@ async def get_number_aggregation_for_field(field: str): return {} +async def does_exists(field): + try: + query = {"size": 0, 
"query": {"bool": {"must": {"exists": {"field": field}}}}} + res = elastic_search_client.search(index=AGG_MDS_INDEX, body=query) + if res["hits"]["total"] > 0: + return True + except Exception as error: + logger.error(error) + return False + + async def get_by_guid(guid): try: data = elastic_search_client.get( diff --git a/src/mds/agg_mds/query.py b/src/mds/agg_mds/query.py index 5eedf472..6b7924e5 100644 --- a/src/mds/agg_mds/query.py +++ b/src/mds/agg_mds/query.py @@ -3,7 +3,6 @@ from mds import config from mds.agg_mds import datastore - mod = APIRouter() @@ -56,21 +55,6 @@ async def metadata_name(name: str): ) -@mod.get("/aggregate/metadata/{name}/tags") -async def metadata_tags(name: str): - """ - Returns the tags associated with the named commons. - """ - res = await datastore.get_commons_attribute(name, "tags") - if res: - return res - else: - raise HTTPException( - HTTP_404_NOT_FOUND, - {"message": f"no common exists with the given: {name}", "code": 404}, - ) - - @mod.get("/aggregate/tags") async def metadata_tags(): """ diff --git a/src/mds/populate.py b/src/mds/populate.py index ffcc71e5..450c2551 100644 --- a/src/mds/populate.py +++ b/src/mds/populate.py @@ -89,8 +89,8 @@ def normalize(entry: dict) -> Any: # build ES normalization dictionary field_typing = { - field: "object" if info.type in ["nested", "array"] else info.type - for field, info in commons.fields.items() + field: "object" if x.type in ["nested", "array"] else x.type + for field, x in commons.fields.items() } await datastore.update_metadata( @@ -143,11 +143,7 @@ async def main(commons_config: Commons) -> None: field_mapping = { "mappings": { "commons": { - "properties": { - field: {"type": {"array": "nested"}.get(info.type, info.type)} - for field, info in commons.fields.items() - if info.type in ["array", "nested"] - } + "properties": {k: v.to_schema(True) for k, v in commons.fields.items()} } } } @@ -221,8 +217,13 @@ async def filter_entries( def parse_config_from_file(path: Path) -> 
Commons: - with open(path, "rt") as infile: - return parse_config(json.load(infile)) + if not path.exists(): + logger.error(f"configuration file: {path} does not exist") + try: + return parse_config(path.read_text()) + except IOError as ex: + logger.error(f"cannot read configuration file {path}: {ex}") + raise ex if __name__ == "__main__": From d2c7d432ce7510104f2bfcf18c1c03f3652f3467 Mon Sep 17 00:00:00 2001 From: Craig Barnes Date: Mon, 18 Oct 2021 15:42:34 -0500 Subject: [PATCH 04/70] schema support and summary/search --- docs/metadata_adapters.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/metadata_adapters.md b/docs/metadata_adapters.md index ebf9594e..762dd4fe 100644 --- a/docs/metadata_adapters.md +++ b/docs/metadata_adapters.md @@ -131,9 +131,9 @@ def filter_function(s:str) -> str: ### Default Values Defining default values for fields is handled in one of two way: -If a field in the metdata does not need a path, simply define the +If a field in the metdata does not need a path, simply define the field name and a value. If a remote metadata field has a value, it will override the default. -If a path is use then use the longer form and set the ```default_value``` to use +If a path is use then use the longer form and set the ```default_value``` to use if the path is not found. 
```json From d54df2a221ed6194e1cf1c7a7a84deff70394df3 Mon Sep 17 00:00:00 2001 From: Craig Barnes Date: Wed, 20 Oct 2021 15:26:39 -0500 Subject: [PATCH 05/70] add configuration + schema --- src/mds/agg_mds/commons.py | 5 ++--- src/mds/populate.py | 5 ++++- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/src/mds/agg_mds/commons.py b/src/mds/agg_mds/commons.py index 2462a0e4..2d54d8c0 100644 --- a/src/mds/agg_mds/commons.py +++ b/src/mds/agg_mds/commons.py @@ -45,11 +45,10 @@ class FieldDefinition: While other fields are defined dynamically, these help "tune" certain fields * type: one of string, number, object, nested (deeper object) - * aggregate: aggregation is available """ type: str = "string" - aggregate: bool = False + description: str = "" properties: Optional[Dict[str, FieldDefinition]] = None ES_TYPE_MAPPING = { @@ -116,7 +115,7 @@ class AdapterMDSInstance: @dataclass_json @dataclass class Config: - fields: Optional[Dict[str, FieldDefinition]] = field(default_factory=dict) + schema: Optional[Dict[str, FieldDefinition]] = field(default_factory=dict) settings: Optional[Dict[str, Any]] = field(default_factory=dict) aggregations: Optional[Dict[str, FieldAggregation]] = field(default_factory=dict) search_settings: Optional[Dict[str, FieldAggregation]] = field(default_factory=dict) diff --git a/src/mds/populate.py b/src/mds/populate.py index 450c2551..2f5e34ff 100644 --- a/src/mds/populate.py +++ b/src/mds/populate.py @@ -143,7 +143,10 @@ async def main(commons_config: Commons) -> None: field_mapping = { "mappings": { "commons": { - "properties": {k: v.to_schema(True) for k, v in commons.fields.items()} + "properties": { + k: v.to_schema(True) + for k, v in commons.configuration.schema.items() + } } } } From 7ba372086157ed36023933305b4b70ea48b49e72 Mon Sep 17 00:00:00 2001 From: Craig Barnes Date: Thu, 21 Oct 2021 23:21:46 -0500 Subject: [PATCH 06/70] add schema suppoort and schema introspection endpoint --- src/mds/agg_mds/adapters.py | 77 
+++++++++++---- src/mds/agg_mds/commons.py | 96 +++++++++++++++++-- .../agg_mds/datastore/elasticsearch_dao.py | 6 +- src/mds/agg_mds/query.py | 15 +++ src/mds/populate.py | 27 ++++-- 5 files changed, 189 insertions(+), 32 deletions(-) diff --git a/src/mds/agg_mds/adapters.py b/src/mds/agg_mds/adapters.py index 797f0317..7f9216ae 100644 --- a/src/mds/agg_mds/adapters.py +++ b/src/mds/agg_mds/adapters.py @@ -16,6 +16,7 @@ before_sleep_log, ) from mds import logger +import json def strip_email(text: str): @@ -130,7 +131,7 @@ def normalizeToGen3MDSFields(self, data, **kwargs) -> Dict: """needs to be implemented in derived class""" @staticmethod - def mapFields(item: dict, mappings: dict, global_filters=None) -> dict: + def mapFields(item: dict, mappings: dict, global_filters=None, schema=None) -> dict: """ Given a MetaData entry as a dict, and dictionary describing fields to add and optionally where to map an item entry from. @@ -154,6 +155,9 @@ def mapFields(item: dict, mappings: dict, global_filters=None) -> dict: :return: """ + if schema is None: + schema = {} + if global_filters is None: global_filters = [] @@ -163,9 +167,19 @@ def mapFields(item: dict, mappings: dict, global_filters=None) -> dict: if isinstance(value, dict): # have a complex assignment expression = value.get("path", None) + hasDefaultValue = False default_value = None - if hasDefaultValue := "default_value" in value: - default_value = value["default_value"] + # get adapter's default value if set + if "default" in value: + hasDefaultValue = True + default_value = value["default"] + + # get schema default value if set + if hasDefaultValue is False: + d = schema.get(key, {}).get("default", None) + if d is not None: + hasDefaultValue = True + default_value = d field_value = get_json_path_value( expression, item, hasDefaultValue, default_value @@ -178,12 +192,25 @@ def mapFields(item: dict, mappings: dict, global_filters=None) -> dict: elif isinstance(value, str) and "path:" in value: # process as 
json path expression = value.split("path:")[1] - field_value = get_json_path_value(expression, item) + + hasDefaultValue = False + default_value = None + if key in schema: + d = schema[key].default + if d is not None: + hasDefaultValue = True + default_value = d + + field_value = get_json_path_value( + expression, item, hasDefaultValue, default_value + ) else: field_value = value for f in global_filters: field_value = FieldFilters.execute(f, field_value) + if key in schema: + field_value = schema[key].normalize_value(field_value) results[key] = field_value return results @@ -257,11 +284,13 @@ def buildIdentifier(id: str): return id.replace("http://doi.org/", "").replace("dc:", "") @staticmethod - def addGen3ExpectedFields(item, mappings, keepOriginalFields, globalFieldFilters): + def addGen3ExpectedFields( + item, mappings, keepOriginalFields, globalFieldFilters, schema + ): results = item if mappings is not None: mapped_fields = RemoteMetadataAdapter.mapFields( - item, mappings, globalFieldFilters + item, mappings, globalFieldFilters, schema ) if keepOriginalFields: results.update(mapped_fields) @@ -284,6 +313,7 @@ def normalizeToGen3MDSFields(self, data, **kwargs) -> Dict[str, Any]: mappings = kwargs.get("mappings", None) keepOriginalFields = kwargs.get("keepOriginalFields", True) globalFieldFilters = kwargs.get("globalFieldFilters", []) + schema = kwargs.get("schema", {}) results = {} for record in data["results"]: @@ -299,7 +329,7 @@ def normalizeToGen3MDSFields(self, data, **kwargs) -> Dict[str, Any]: else: item[str.replace(key, "dc:", "")] = value normalized_item = ISCPSRDublin.addGen3ExpectedFields( - item, mappings, keepOriginalFields, globalFieldFilters + item, mappings, keepOriginalFields, globalFieldFilters, schema ) results[item["identifier"]] = { "_guid_type": "discovery_metadata", @@ -405,7 +435,9 @@ def getRemoteDataAsJson(self, **kwargs) -> Dict: return results @staticmethod - def addGen3ExpectedFields(item, mappings, keepOriginalFields, 
globalFieldFilters): + def addGen3ExpectedFields( + item, mappings, keepOriginalFields, globalFieldFilters, schema + ): """ Map item fields to gen3 normalized fields using the mapping and adding the location @@ -413,7 +445,7 @@ def addGen3ExpectedFields(item, mappings, keepOriginalFields, globalFieldFilters results = item if mappings is not None: mapped_fields = RemoteMetadataAdapter.mapFields( - item, mappings, globalFieldFilters + item, mappings, globalFieldFilters, schema ) if keepOriginalFields: results.update(mapped_fields) @@ -441,13 +473,14 @@ def normalizeToGen3MDSFields(self, data, **kwargs) -> Dict[str, Any]: mappings = kwargs.get("mappings", None) keepOriginalFields = kwargs.get("keepOriginalFields", True) globalFieldFilters = kwargs.get("globalFieldFilters", []) + schema = kwargs.get("schema", {}) results = {} for item in data["results"]: item = item["Study"] item = flatten(item) normalized_item = ClinicalTrials.addGen3ExpectedFields( - item, mappings, keepOriginalFields, globalFieldFilters + item, mappings, keepOriginalFields, globalFieldFilters, schema ) results[item["NCTId"]] = { "_guid_type": "discovery_metadata", @@ -507,7 +540,9 @@ def getRemoteDataAsJson(self, **kwargs) -> Dict: return results @staticmethod - def addGen3ExpectedFields(item, mappings, keepOriginalFields, globalFieldFilters): + def addGen3ExpectedFields( + item, mappings, keepOriginalFields, globalFieldFilters, schema + ): """ Maps the items fields into Gen3 resources fields if keepOriginalFields is False: only those fields will be included in the final entry @@ -515,7 +550,7 @@ def addGen3ExpectedFields(item, mappings, keepOriginalFields, globalFieldFilters results = item if mappings is not None: mapped_fields = RemoteMetadataAdapter.mapFields( - item, mappings, globalFieldFilters + item, mappings, globalFieldFilters, schema ) if keepOriginalFields: results.update(mapped_fields) @@ -536,11 +571,12 @@ def normalizeToGen3MDSFields(self, data, **kwargs) -> Dict[str, Any]: mappings = 
kwargs.get("mappings", None) keepOriginalFields = kwargs.get("keepOriginalFields", True) globalFieldFilters = kwargs.get("globalFieldFilters", []) + schema = kwargs.get("schema", {}) results = {} for item in data["results"]: normalized_item = PDAPS.addGen3ExpectedFields( - item, mappings, keepOriginalFields, globalFieldFilters + item, mappings, keepOriginalFields, globalFieldFilters, schema ) if "display_id" not in item: continue @@ -614,7 +650,7 @@ def getRemoteDataAsJson(self, **kwargs) -> Dict: @staticmethod def addGen3ExpectedFields( - item, mappings, keepOriginalFields, globalFieldFilters + item, mappings, keepOriginalFields, globalFieldFilters, schema ) -> Dict[str, Any]: """ Given an item (metadata as a dict), map the item's keys into @@ -629,7 +665,7 @@ def addGen3ExpectedFields( results = item if mappings is not None: mapped_fields = RemoteMetadataAdapter.mapFields( - item, mappings, globalFieldFilters + item, mappings, globalFieldFilters, schema ) if keepOriginalFields: results.update(mapped_fields) @@ -655,11 +691,16 @@ def normalizeToGen3MDSFields(self, data, **kwargs) -> Dict[str, Any]: study_field = config.get("study_field", "gen3_discovery") keepOriginalFields = kwargs.get("keepOriginalFields", True) globalFieldFilters = kwargs.get("globalFieldFilters", []) + schema = kwargs.get("schema", {}) results = {} for guid, record in data["results"].items(): item = Gen3Adapter.addGen3ExpectedFields( - record[study_field], mappings, keepOriginalFields, globalFieldFilters + record[study_field], + mappings, + keepOriginalFields, + globalFieldFilters, + schema, ) results[guid] = { "_guid_type": "discovery_metadata", @@ -682,6 +723,7 @@ def gather_metadata( perItemValues, keepOriginalFields, globalFieldFilters, + schema, ): try: json_data = gather.getRemoteDataAsJson( @@ -694,6 +736,7 @@ def gather_metadata( perItemValues=perItemValues, keepOriginalFields=keepOriginalFields, globalFieldFilters=globalFieldFilters, + schema=schema, ) return results except ValueError 
as exc: @@ -720,6 +763,7 @@ def get_metadata( perItemValues=None, keepOriginalFields=False, globalFieldFilters=None, + schema=None, ): if config is None: config = {} @@ -746,4 +790,5 @@ def get_metadata( perItemValues=perItemValues, keepOriginalFields=keepOriginalFields, globalFieldFilters=globalFieldFilters, + schema=schema, ) diff --git a/src/mds/agg_mds/commons.py b/src/mds/agg_mds/commons.py index 2d54d8c0..221554cc 100644 --- a/src/mds/agg_mds/commons.py +++ b/src/mds/agg_mds/commons.py @@ -1,6 +1,8 @@ from dataclasses import dataclass, field from dataclasses_json import dataclass_json from typing import Any, Dict, List, Optional, Union, TypeVar +from mds import logger +import json @dataclass_json @@ -37,6 +39,32 @@ class FieldAggregation: FieldDefinition = TypeVar("FieldDefinition") +def string_to_array(s: str) -> List[str]: + return [s] + + +def string_to_integer(s: str) -> int: + return int(s) if s.isnumeric() else None + + +def string_to_number(s: str) -> Optional[float]: + try: + return float(s) + except ValueError: + return None + + +def string_to_dict(s: str) -> Optional[Dict[Any, Any]]: + try: + return json.loads(s) + except json.JSONDecodeError: + return None + + +def dict_to_array(d: dict) -> List[Dict[Any, Any]]: + return [d] + + @dataclass_json @dataclass class FieldDefinition: @@ -49,12 +77,31 @@ class FieldDefinition: type: str = "string" description: str = "" + default: Optional[Any] = None properties: Optional[Dict[str, FieldDefinition]] = None + items: Optional[Dict[str, str]] = None ES_TYPE_MAPPING = { "array": "nested", + "object": "nested", "string": "text", "integer": "long", + "number": "float", + } + + FIELD_NORMALIZATION = { + "string_to_array": string_to_array, + "string_to_number": string_to_number, + "string_to_integer": string_to_integer, + "string_to_object": string_to_dict, + "dict_to_array": dict_to_array, + } + + MAP_TYPE_TO_JSON_SCHEMA_TYPES = { + "str": "string", + "int": "integer", + "list": "array", + "dict": "object", } def 
__post_init__(self): @@ -63,22 +110,53 @@ def __post_init__(self): k: FieldDefinition.from_dict(v) for k, v in self.properties.items() } - def to_schema(self, es_types: bool = False): - res = { - "type": FieldDefinition.ES_TYPE_MAPPING.get(self.type, self.type) - if es_types - else self.type - } + def get_es_type(self): + type = FieldDefinition.ES_TYPE_MAPPING.get(self.type, self.type) + if self.type == "array" and self.items and self.items["type"] == "string": + type = "text" + return type + + def to_schema(self, es_types: bool = False, all_fields: bool = False): + """ + Maps the FieldDefinition to either a JSON schema or a Elastic Search mapping + """ + res = {"type": self.get_es_type() if es_types else self.type} if self.properties is not None: res["properties"] = { - k: v.to_schema(True) for k, v in self.properties.items() + k: v.to_schema(es_types, all_fields) for k, v in self.properties.items() } + if all_fields: + if self.items is not None: + res["items"] = self.items + if self.description is not None: + res["description"] = self.description + if self.default is not None: + res["default"] = self.default return res + def normalize_value(self, value) -> Any: + value_type = FieldDefinition.MAP_TYPE_TO_JSON_SCHEMA_TYPES.get( + type(value).__name__, type(value).__name__ + ) + + if value_type == self.type: + return value + + conversion = f"{value_type}_to_{self.type}" + converter = FieldDefinition.FIELD_NORMALIZATION.get(conversion, None) + if converter is None: + logger.debug(f"error normalizing {value}") + return value + return converter(value) + @dataclass_json @dataclass class MDSInstance: + """ + Handles pulling and processing data from a Gen3 metadata-service + """ + mds_url: str commons_url: str columns_to_fields: Optional[ @@ -130,6 +208,10 @@ class Commons: aggregations: Optional[Dict[str, FieldAggregation]] = field(default_factory=dict) fields: Optional[Dict[str, FieldDefinition]] = field(default_factory=dict) + def __post_init__(self): + if 
self.configuration is None: + self.configuration = Config() + def parse_config(data: Dict[str, Any]) -> Commons: """ diff --git a/src/mds/agg_mds/datastore/elasticsearch_dao.py b/src/mds/agg_mds/datastore/elasticsearch_dao.py index bd5ce1ef..6ab60068 100644 --- a/src/mds/agg_mds/datastore/elasticsearch_dao.py +++ b/src/mds/agg_mds/datastore/elasticsearch_dao.py @@ -1,5 +1,5 @@ from elasticsearch import Elasticsearch, exceptions as es_exceptions -from typing import List, Dict +from typing import Any, List, Dict import json from mds import logger from mds.config import AGG_MDS_NAMESPACE, ES_RETRY_LIMIT, ES_RETRY_INTERVAL @@ -106,7 +106,7 @@ async def init(hostname: str = "0.0.0.0", port: int = 9200): async def drop_all(common_mapping: dict): - for index in [AGG_MDS_INDEX, AGG_MDS_INFO_INDEX, AGG_MDS_CONFIG_TYPE]: + for index in [AGG_MDS_INDEX, AGG_MDS_INFO_INDEX, AGG_MDS_CONFIG_INDEX]: res = elastic_search_client.indices.delete(index=index, ignore=[400, 404]) logger.debug(f"deleted index: {index}: {res}") @@ -166,7 +166,7 @@ async def update_metadata( guid_arr: List[str], tags: Dict[str, List[str]], info: Dict[str, str], - field_normalizers: Dict[str, str], + field_normalizers: Dict[str, Any], study_data_field: str, ): elastic_search_client.index( diff --git a/src/mds/agg_mds/query.py b/src/mds/agg_mds/query.py index 6b7924e5..d4196e82 100644 --- a/src/mds/agg_mds/query.py +++ b/src/mds/agg_mds/query.py @@ -15,6 +15,21 @@ async def get_commons(): return await datastore.get_commons() +@mod.get("/aggregate/info/{what}") +async def get_commons(what: str): + """ + Returns information from the aggregate metadata service. 
+ """ + res = await datastore.get_commons_attribute(what, "") + if res: + return res + else: + raise HTTPException( + HTTP_404_NOT_FOUND, + {"message": f"information for {what} not found", "code": 404}, + ) + + @mod.get("/aggregate/metadata") async def metadata( _: Request, diff --git a/src/mds/populate.py b/src/mds/populate.py index 2f5e34ff..80faf9b3 100644 --- a/src/mds/populate.py +++ b/src/mds/populate.py @@ -9,7 +9,6 @@ from urllib.parse import urlparse import argparse import sys -import json def parse_args(argv: List[str]) -> Namespace: @@ -75,10 +74,12 @@ def normalize(entry: dict) -> Any: entry[common.study_data_field]["commons_name"] = name # add to tags - for t in entry[common.study_data_field]["tags"]: - if t["category"] not in tags: - tags[t["category"]] = set() - tags[t["category"]].add(t["name"]) + item_tags = entry[common.study_data_field].get("tags", {}) + if item_tags is not None: + for t in item_tags: + if t["category"] not in tags: + tags[t["category"]] = set() + tags[t["category"]].add(t["name"]) # process tags set to list for k, v in tags.items(): @@ -104,11 +105,20 @@ async def populate_info(commons_config: Commons) -> None: } await datastore.update_global_info("aggregations", agg_info) + if commons_config.configuration.schema: + json_schema = { + k: v.to_schema(all_fields=True) + for k, v in commons.configuration.schema.items() + } + await datastore.update_global_info("schema", json_schema) + async def populate_config(commons_config: Commons) -> None: array_definition = { "array": [ - field for field, value in commons.fields.items() if value.type == "array" + field + for field, value in commons.configuration.schema.items() + if value.type == "array" ] } await datastore.update_config_info(array_definition) @@ -119,6 +129,10 @@ async def main(commons_config: Commons) -> None: Given a config structure, pull all metadata from each one in the config and cache into the following structure: { + "configuration" : { + schema: { dict of data schema 
for normalized fields } + settings: { dict of additional configuration properties } + }, "commons_name" : { "metadata" : [ array of metadata entries ], "field_mapping" : { dictionary of field_name to column_name }, @@ -171,6 +185,7 @@ async def main(commons_config: Commons) -> None: common.per_item_values, common.keep_original_fields, common.global_field_filters, + schema=commons_config.configuration.schema, ) logger.info(f"Received {len(results)} from {name}") if len(results) > 0: From 32b0d00464c51c1158eb6d68a7880952c899846f Mon Sep 17 00:00:00 2001 From: Craig Barnes Date: Wed, 27 Oct 2021 12:04:35 -0500 Subject: [PATCH 07/70] refine schema support --- src/mds/agg_mds/adapters.py | 82 +++++++++++++++++-- src/mds/agg_mds/commons.py | 35 ++++++-- .../agg_mds/datastore/elasticsearch_dao.py | 67 +++++---------- src/mds/agg_mds/query.py | 2 + src/mds/populate.py | 21 +++-- 5 files changed, 141 insertions(+), 66 deletions(-) diff --git a/src/mds/agg_mds/adapters.py b/src/mds/agg_mds/adapters.py index 7f9216ae..be6a5bf3 100644 --- a/src/mds/agg_mds/adapters.py +++ b/src/mds/agg_mds/adapters.py @@ -16,7 +16,6 @@ before_sleep_log, ) from mds import logger -import json def strip_email(text: str): @@ -176,10 +175,9 @@ def mapFields(item: dict, mappings: dict, global_filters=None, schema=None) -> d # get schema default value if set if hasDefaultValue is False: - d = schema.get(key, {}).get("default", None) - if d is not None: + if key in schema and schema[key].default is not None: hasDefaultValue = True - default_value = d + default_value = schema[key].default field_value = get_json_path_value( expression, item, hasDefaultValue, default_value @@ -557,8 +555,8 @@ def addGen3ExpectedFields( else: results = mapped_fields - if isinstance(results["investigators"], list): - results["investigators"] = results["investigators"].join(", ") + # if isinstance(results["investigators"], list): + # results["investigators"] = results["investigators"].join(", ") return results def 
normalizeToGen3MDSFields(self, data, **kwargs) -> Dict[str, Any]: @@ -714,6 +712,77 @@ def normalizeToGen3MDSFields(self, data, **kwargs) -> Dict[str, Any]: return results +class DRSIndexdAdapter(RemoteMetadataAdapter): + @staticmethod + def clean_dist_entry(s: str) -> str: + """ + Cleans the string returning a proper DRS prefix + @param s: string to clean + @return: cleaned string + """ + return s.replace("\\.", ".").replace(".*", "") + + @staticmethod + def clean_http_url(s: str) -> str: + """ + Cleans input string removing http(s) prefix and all trailing paths + @param s: string to clean + @return: cleaned string + """ + return ( + s.replace("/index", "")[::-1] + .replace("/", "", 1)[::-1] + .replace("http://", "") + .replace("https://", "") + .replace("/ga4gh/drs/v1/objects", "") + ) + + def getRemoteDataAsJson(self, **kwargs) -> Dict: + from datetime import datetime, timezone + + results = {"results": {}} + + mds_url = kwargs.get("mds_url", None) + if mds_url is None: + return results + + try: + response = httpx.get(f"{mds_url}/index/_dist") + response.raise_for_status() + data = response.json() + # process the entries and create a DRS cache + results = { + "info": { + "created": datetime.now(timezone.utc).strftime( + "%m/%d/%Y %H:%M:%S:%Z" + ) + }, + "cache": {}, + } + for entry in data: + if entry["type"] != "indexd": + continue + host = DRSIndexdAdapter.clean_http_url(entry["host"]) + name = entry.get("name", "") + for x in entry["hints"]: + drs_prefix = DRSIndexdAdapter.clean_dist_entry(x) + results["cache"][drs_prefix] = { + "host": host, + "name": name, + "type": entry["type"], + } + + except httpx.HTTPError as exc: + logger.error( + f"An HTTP error {exc.response.status_code if exc.response is not None else ''} occurred while requesting {exc.request.url}." 
+ ) + + return results + + def normalizeToGen3MDSFields(self, data, **kwargs) -> Dict[str, Any]: + return data + + def gather_metadata( gather, mds_url, @@ -751,6 +820,7 @@ def gather_metadata( "clinicaltrials": ClinicalTrials, "pdaps": PDAPS, "gen3": Gen3Adapter, + "drs_indexd": DRSIndexdAdapter, } diff --git a/src/mds/agg_mds/commons.py b/src/mds/agg_mds/commons.py index 221554cc..41634d0a 100644 --- a/src/mds/agg_mds/commons.py +++ b/src/mds/agg_mds/commons.py @@ -84,7 +84,7 @@ class FieldDefinition: ES_TYPE_MAPPING = { "array": "nested", "object": "nested", - "string": "text", + "string": "keyword", "integer": "long", "number": "float", } @@ -104,6 +104,9 @@ class FieldDefinition: "dict": "object", } + def has_default_value(self): + return self.default is not None + def __post_init__(self): if self.properties is not None: self.properties = { @@ -113,14 +116,28 @@ def __post_init__(self): def get_es_type(self): type = FieldDefinition.ES_TYPE_MAPPING.get(self.type, self.type) if self.type == "array" and self.items and self.items["type"] == "string": - type = "text" - return type + type = "keyword" + + if type == "keyword": + return { + "type": type, + "fields": { + "analyzed": { + "type": "text", + "analyzer": "ngram_analyzer", + "search_analyzer": "search_analyzer", + "term_vector": "with_positions_offsets", + } + }, + } + + return {"type": type} def to_schema(self, es_types: bool = False, all_fields: bool = False): """ Maps the FieldDefinition to either a JSON schema or a Elastic Search mapping """ - res = {"type": self.get_es_type() if es_types else self.type} + res = self.get_es_type() if es_types else {"type": self.type} if self.properties is not None: res["properties"] = { k: v.to_schema(es_types, all_fields) for k, v in self.properties.items() @@ -190,11 +207,19 @@ class AdapterMDSInstance: global_field_filters: List[str] = field(default_factory=list) +@dataclass_json +@dataclass +class Settings: + cache_drs: bool = False + drs_indexd_server: str = 
"https://dataguids.org" + timestamp_entry: bool = False + + @dataclass_json @dataclass class Config: + settings: Settings schema: Optional[Dict[str, FieldDefinition]] = field(default_factory=dict) - settings: Optional[Dict[str, Any]] = field(default_factory=dict) aggregations: Optional[Dict[str, FieldAggregation]] = field(default_factory=dict) search_settings: Optional[Dict[str, FieldAggregation]] = field(default_factory=dict) diff --git a/src/mds/agg_mds/datastore/elasticsearch_dao.py b/src/mds/agg_mds/datastore/elasticsearch_dao.py index 6ab60068..1536491d 100644 --- a/src/mds/agg_mds/datastore/elasticsearch_dao.py +++ b/src/mds/agg_mds/datastore/elasticsearch_dao.py @@ -24,7 +24,22 @@ AGG_MDS_CONFIG_INDEX = f"{AGG_MDS_NAMESPACE}-commons-config-index" AGG_MDS_CONFIG_TYPE = "commons-config" -MAPPING = { +# Setting Commons Info ES index to only store documents +# will not be searching on it +INFO_MAPPING = { + "mappings": { + AGG_MDS_INFO_TYPE: { + "dynamic": False, + } + } +} + +CONFIG = { + "settings": {"index": {"number_of_shards": 1, "number_of_replicas": 0}}, + "mappings": {"_doc": {"properties": {"array": {"type": "keyword"}}}}, +} + +SEARCH_CONFIG = { "settings": { "index": { "number_of_shards": 1, @@ -52,42 +67,7 @@ }, }, } - }, - "mappings": { - "commons": { - "properties": { - "auth_resource_path": {"type": "keyword"}, - "__manifest": { - "type": "nested", - }, - "tags": { - "type": "nested", - "properties": { - "name": { - "type": "text", - "fields": { - "analyzed": { - "type": "text", - "term_vector": "with_positions_offsets", - "analyzer": "ngram_analyzer", - "search_analyzer": "search_analyzer", - } - }, - }, - "category": {"type": "text"}, - }, - }, - "advSearchFilters": { - "type": "nested", - }, - } - } - }, -} - -CONFIG = { - "settings": {"index": {"number_of_shards": 1, "number_of_replicas": 0}}, - "mappings": {"_doc": {"properties": {"array": {"type": "keyword"}}}}, + } } elastic_search_client = None @@ -111,9 +91,8 @@ async def 
drop_all(common_mapping: dict): logger.debug(f"deleted index: {index}: {res}") try: - res = elastic_search_client.indices.create( - index=AGG_MDS_INDEX, body=common_mapping - ) + mapping = {**SEARCH_CONFIG, **common_mapping} + res = elastic_search_client.indices.create(index=AGG_MDS_INDEX, body=mapping) logger.debug(f"created index {AGG_MDS_INDEX}: {res}") except es_exceptions.RequestError as ex: if ex.error == "resource_already_exists_exception": @@ -124,7 +103,7 @@ async def drop_all(common_mapping: dict): try: res = elastic_search_client.indices.create( - index=AGG_MDS_INFO_INDEX, + index=AGG_MDS_INFO_INDEX, body=INFO_MAPPING ) logger.debug(f"created index {AGG_MDS_INFO_INDEX}: {res}") @@ -166,7 +145,6 @@ async def update_metadata( guid_arr: List[str], tags: Dict[str, List[str]], info: Dict[str, str], - field_normalizers: Dict[str, Any], study_data_field: str, ): elastic_search_client.index( @@ -176,16 +154,11 @@ async def update_metadata( body=info, ) - unified_field_normalizers = {**field_normalizers} for doc in data: key = list(doc.keys())[0] # Flatten out this structure doc = doc[key][study_data_field] - for field in unified_field_normalizers.keys(): - if field in doc: - normalize_field(doc, field, unified_field_normalizers[field]) - try: elastic_search_client.index( index=AGG_MDS_INDEX, doc_type=AGG_MDS_TYPE, id=key, body=doc diff --git a/src/mds/agg_mds/query.py b/src/mds/agg_mds/query.py index d4196e82..6d64d96d 100644 --- a/src/mds/agg_mds/query.py +++ b/src/mds/agg_mds/query.py @@ -2,6 +2,8 @@ from starlette.status import HTTP_404_NOT_FOUND from mds import config from mds.agg_mds import datastore +from typing import Optional + mod = APIRouter() diff --git a/src/mds/populate.py b/src/mds/populate.py index 80faf9b3..2c8525aa 100644 --- a/src/mds/populate.py +++ b/src/mds/populate.py @@ -88,14 +88,8 @@ def normalize(entry: dict) -> Any: keys = list(results.keys()) info = {"commons_url": common.commons_url} - # build ES normalization dictionary - 
field_typing = { - field: "object" if x.type in ["nested", "array"] else x.type - for field, x in commons.fields.items() - } - await datastore.update_metadata( - name, mds_arr, keys, tags, info, field_typing, common.study_data_field + name, mds_arr, keys, tags, info, common.study_data_field ) @@ -111,13 +105,23 @@ async def populate_info(commons_config: Commons) -> None: for k, v in commons.configuration.schema.items() } await datastore.update_global_info("schema", json_schema) + await populate_drs_info(commons_config) + + +async def populate_drs_info(commons_config: Commons) -> None: + if commons_config.configuration.settings.cache_drs: + server = commons_config.configuration.settings.drs_indexd_server + if server is not None: + drs_data = adapters.get_metadata("drs_indexd", server, None) + for id, entry in drs_data.get("cache", {}).items(): + await datastore.update_global_info(id, entry) async def populate_config(commons_config: Commons) -> None: array_definition = { "array": [ field - for field, value in commons.configuration.schema.items() + for field, value in commons_config.configuration.schema.items() if value.type == "array" ] } @@ -195,6 +199,7 @@ async def main(commons_config: Commons) -> None: await populate_info(commons_config) # populate array index information to support guppy await populate_config(commons_config) + res = await datastore.get_status() print(res) await datastore.close() From c43727625460902f05b64c84a134937e30a91e01 Mon Sep 17 00:00:00 2001 From: Craig Barnes Date: Mon, 22 Nov 2021 12:12:33 -0600 Subject: [PATCH 08/70] update ES configuration --- src/mds/agg_mds/adapters.py | 5 ++ .../agg_mds/datastore/elasticsearch_dao.py | 11 ---- tests/test_agg_mds_elasticsearch_dao.py | 52 +++++++++++++++---- 3 files changed, 46 insertions(+), 22 deletions(-) diff --git a/src/mds/agg_mds/adapters.py b/src/mds/agg_mds/adapters.py index be6a5bf3..7a1a3ab5 100644 --- a/src/mds/agg_mds/adapters.py +++ b/src/mds/agg_mds/adapters.py @@ -713,6 +713,11 @@ 
def normalizeToGen3MDSFields(self, data, **kwargs) -> Dict[str, Any]: class DRSIndexdAdapter(RemoteMetadataAdapter): + """ + Pulls the DRS hostname from a ga4gh (indexd) server to cache + them to support local compact DRS resolution. + """ + @staticmethod def clean_dist_entry(s: str) -> str: """ diff --git a/src/mds/agg_mds/datastore/elasticsearch_dao.py b/src/mds/agg_mds/datastore/elasticsearch_dao.py index 1536491d..bbcbd876 100644 --- a/src/mds/agg_mds/datastore/elasticsearch_dao.py +++ b/src/mds/agg_mds/datastore/elasticsearch_dao.py @@ -4,17 +4,6 @@ from mds import logger from mds.config import AGG_MDS_NAMESPACE, ES_RETRY_LIMIT, ES_RETRY_INTERVAL -# TODO WFH Why do we have both __manifest and _file_manifest? -# TODO WFH These are bugs. If we have to check whether an object is a string or -# an object, the data is bad. -DEFAULT_FIELD_NORMALIZERS = { - "__manifest": "object", - "_file_manifest": "object", - "advSearchFilters": "object", - "data_dictionary": "object", - "sites": "number", -} - AGG_MDS_INDEX = f"{AGG_MDS_NAMESPACE}-commons-index" AGG_MDS_TYPE = "commons" diff --git a/tests/test_agg_mds_elasticsearch_dao.py b/tests/test_agg_mds_elasticsearch_dao.py index 2a6dd629..2fe4299e 100644 --- a/tests/test_agg_mds_elasticsearch_dao.py +++ b/tests/test_agg_mds_elasticsearch_dao.py @@ -1,8 +1,34 @@ from unittest.mock import patch, call, MagicMock import pytest from mds.agg_mds.datastore import elasticsearch_dao -from mds.agg_mds.datastore.elasticsearch_dao import MAPPING +from mds.agg_mds.datastore.elasticsearch_dao import ( + INFO_MAPPING, + AGG_MDS_INDEX, + AGG_MDS_INFO_INDEX, + AGG_MDS_CONFIG_INDEX, + CONFIG, + SEARCH_CONFIG, +) from elasticsearch import Elasticsearch, exceptions as es_exceptions +from mds.config import ES_RETRY_LIMIT, ES_RETRY_INTERVAL + +COMMON_MAPPING = { + "mappings": { + "commons": { + "properties": { + "__manifest": { + "type": "nested", + }, + "tags": { + "type": "nested", + }, + "data_dictionary": { + "type": "nested", + }, + } + } 
+ } +} @pytest.mark.asyncio @@ -15,8 +41,8 @@ async def test_init(): ["myhost"], port=9200, scheme="http", - timeout=30, - max_retries=7, + timeout=ES_RETRY_INTERVAL, + max_retries=ES_RETRY_LIMIT, retry_on_timeout=True, ) @@ -27,17 +53,20 @@ async def test_drop_all(): "mds.agg_mds.datastore.elasticsearch_dao.elastic_search_client.indices", MagicMock(), ) as mock_indices: - await elasticsearch_dao.drop_all() + await elasticsearch_dao.drop_all(COMMON_MAPPING) mock_indices.delete.assert_has_calls( [ - call(index="default_namespace-commons-index", ignore=[400, 404]), - call(index="default_namespace-commons-info-index", ignore=[400, 404]), - ] + call(index=AGG_MDS_INDEX, ignore=[400, 404]), + call(index=AGG_MDS_INFO_INDEX, ignore=[400, 404]), + call(index=AGG_MDS_CONFIG_INDEX, ignore=[400, 404]), + ], + any_order=True, ) mock_indices.create.assert_has_calls( [ - call(body=MAPPING, index="default_namespace-commons-index"), - call(index="default_namespace-commons-info-index"), + call(body={**SEARCH_CONFIG, **COMMON_MAPPING}, index=AGG_MDS_INDEX), + call(body=INFO_MAPPING, index=AGG_MDS_INFO_INDEX), + call(body=CONFIG, index=AGG_MDS_CONFIG_INDEX), ], any_order=True, ) @@ -53,7 +82,7 @@ async def test_create_if_exists(): ) ), ) as mock_indices: - await elasticsearch_dao.drop_all() + await elasticsearch_dao.drop_all(COMMON_MAPPING) @pytest.mark.asyncio @@ -64,7 +93,7 @@ async def test_create_index_raise_exception(): MagicMock(side_effect=es_exceptions.RequestError(403, "expect_to_fail")), ) as mock_indices: try: - await elasticsearch_dao.drop_all() + await elasticsearch_dao.drop_all(COMMON_MAPPING) except Exception as exc: assert isinstance(exc, es_exceptions.RequestError) == True @@ -108,6 +137,7 @@ async def test_update_metadata(): index="default_namespace-commons-index", ), ], + any_order=True, ) From e3bbf40921d7920985534e430a74a106d902c820 Mon Sep 17 00:00:00 2001 From: Craig Barnes Date: Mon, 28 Feb 2022 12:25:58 -0600 Subject: [PATCH 09/70] update brh_config add 
support for json schema introspection --- configs/brh_config.json | 21 +++++++++++++++++++++ src/mds/agg_mds/commons.py | 2 +- tests/test_agg_mds_adapters.py | 6 +++--- 3 files changed, 25 insertions(+), 4 deletions(-) diff --git a/configs/brh_config.json b/configs/brh_config.json index eb25e9d1..f5c063cc 100644 --- a/configs/brh_config.json +++ b/configs/brh_config.json @@ -125,6 +125,27 @@ "commons": "Kids First Data Resource Center", "study_url": "path:link" } + }, + "Genomic Data Commons": { + "mds_url": "https://gen3.datacommons.io", + "commons_url": "portal.gdc.cancer.gov", + "adapter": "gen3", + "config" : { + "guid_type": "metadata_object", + "study_field": "dbgap" + }, + "keep_original_fields": false, + "field_mappings" : { + "authz": "path:authz", + "tags": "path:gen3_discovery.tags", + "_unique_id": "path:_unique_id", + "study_id": "path:_unique_id", + "study_description": "path:description", + "full_name": "path:full_name", + "short_name": "path:full_name", + "commons": "Kids First Data Resource Center", + "study_url": "path:link" + } } } } diff --git a/src/mds/agg_mds/commons.py b/src/mds/agg_mds/commons.py index 41634d0a..c8d9c7ea 100644 --- a/src/mds/agg_mds/commons.py +++ b/src/mds/agg_mds/commons.py @@ -235,7 +235,7 @@ class Commons: def __post_init__(self): if self.configuration is None: - self.configuration = Config() + self.configuration = Config(settings=Settings()) def parse_config(data: Dict[str, Any]) -> Commons: diff --git a/tests/test_agg_mds_adapters.py b/tests/test_agg_mds_adapters.py index 00dbbb37..5666b125 100644 --- a/tests/test_agg_mds_adapters.py +++ b/tests/test_agg_mds_adapters.py @@ -3013,7 +3013,7 @@ def test_gen3_adapter(): "gen3_discovery": { "tags": [{"name": "Array", "category": "Data Type"}], "_subjects_count": 48, - "dbgap_accession_number": "", + "dbgap_accession_number": None, "study_description": "The molecular factors involved in the development of Post-traumatic Stress Disorder (PTSD) remain poorly understood. 
Previous transcriptomic studies investigating the mechanisms of PTSD apply targeted approaches to identify individual genes under a cross-sectional framework lack a holistic view of the behaviours and properties of these genes at the system-level. Here we sought to apply an unsupervised gene-network-based approach to a prospective experimental design using whole-transcriptome RNA-Seq gene expression from peripheral blood leukocytes of U.S. Marines (N=188), obtained both pre- and post-deployment to conflict zones. We identified discrete groups of co-regulated genes (i.e., co-expression modules) and tested them for association to PTSD. We identified one module at both pre- and post-deployment containing putative causal signatures for PTSD development displaying an over-expression of genes enriched for functions of innate-immune response and interferon signalling (Type-I and Type-II). Importantly, these results were replicated in a second non-overlapping independent dataset of U.S. Marines (N=96), further outlining the role of innate immune and interferon signalling genes within co-expression modules to explain at least part of the causal pathophysiology for PTSD development. A second module, consequential of trauma exposure, contained PTSD resiliency signatures and an over-expression of genes involved in hemostasis and wound responsiveness suggesting that chronic levels of stress impair proper wound healing during/after exposure to the battlefield while highlighting the role of the hemostatic system as a clinical indicator of chronic-based stress. 
These findings provide novel insights for early preventative measures and advanced PTSD detection, which may lead to interventions that delay or perhaps abrogate the development of PTSD.\nWe used microarrays to characterize both prognostic and diagnostic molecular signatures associated to PTSD risk and PTSD status compared to control subjects.", "number_of_datafiles": 0, "investigator": "me.foo@smartsite.com", @@ -3148,10 +3148,10 @@ def test_json_path_expression(): assert get_json_path_value("study1.summary", sample1) == "This is a summary" # test non existent path - assert get_json_path_value("study2.summary", sample1) == "" + assert get_json_path_value("study2.summary", sample1) is None # test bad path - assert get_json_path_value(".contributors", sample1) == "" + assert get_json_path_value(".contributors", sample1) is None # test single array assert get_json_path_value("study1.contributors", sample1) == ["Bilbo Baggins"] From b73f097eb9a0f1c8c349575d9ae70a96cfc7c3cb Mon Sep 17 00:00:00 2001 From: Craig Barnes Date: Mon, 28 Feb 2022 14:56:04 -0600 Subject: [PATCH 10/70] add check for drs_cache in config->settings --- src/mds/agg_mds/commons.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/mds/agg_mds/commons.py b/src/mds/agg_mds/commons.py index c8d9c7ea..fb7c6aaa 100644 --- a/src/mds/agg_mds/commons.py +++ b/src/mds/agg_mds/commons.py @@ -218,7 +218,7 @@ class Settings: @dataclass_json @dataclass class Config: - settings: Settings + settings: Optional[Dict[str, Settings]] = field(default_factory=dict) schema: Optional[Dict[str, FieldDefinition]] = field(default_factory=dict) aggregations: Optional[Dict[str, FieldAggregation]] = field(default_factory=dict) search_settings: Optional[Dict[str, FieldAggregation]] = field(default_factory=dict) From 721a5455d6d73db66ea9289f80e5717f7682da4d Mon Sep 17 00:00:00 2001 From: Craig Barnes Date: Mon, 28 Feb 2022 14:56:32 -0600 Subject: [PATCH 11/70] add check for drs_cache in config->settings --- 
src/mds/populate.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/mds/populate.py b/src/mds/populate.py index 2c8525aa..d4859790 100644 --- a/src/mds/populate.py +++ b/src/mds/populate.py @@ -109,6 +109,10 @@ async def populate_info(commons_config: Commons) -> None: async def populate_drs_info(commons_config: Commons) -> None: + if len(commons_config.configuration.settings) == 0: + return + if len(commons_config.configuration.settings.cache_dir): + return if commons_config.configuration.settings.cache_drs: server = commons_config.configuration.settings.drs_indexd_server if server is not None: From 17ca5d944db299cf2d021aba4815a498d0061bdb Mon Sep 17 00:00:00 2001 From: Craig Barnes Date: Thu, 12 May 2022 13:38:17 -0500 Subject: [PATCH 12/70] add improved ES mapping support --- src/mds/agg_mds/adapters.py | 2 +- src/mds/agg_mds/commons.py | 24 ++++++----- src/mds/agg_mds/datastore/__init__.py | 8 +++- .../agg_mds/datastore/elasticsearch_dao.py | 40 ++++++++++++++----- src/mds/agg_mds/query.py | 5 ++- src/mds/populate.py | 3 +- tests/test_agg_mds_adapters.py | 2 +- 7 files changed, 60 insertions(+), 24 deletions(-) diff --git a/src/mds/agg_mds/adapters.py b/src/mds/agg_mds/adapters.py index f2574e0e..6a4818d5 100644 --- a/src/mds/agg_mds/adapters.py +++ b/src/mds/agg_mds/adapters.py @@ -640,7 +640,7 @@ def getRemoteDataAsJson(self, **kwargs) -> Dict: results["results"].update(data) numReturned = len(data) - if numReturned == 0 or numReturned < limit: + if numReturned == 0 or numReturned <= limit: moreData = False offset += numReturned diff --git a/src/mds/agg_mds/commons.py b/src/mds/agg_mds/commons.py index fb7c6aaa..dcc85913 100644 --- a/src/mds/agg_mds/commons.py +++ b/src/mds/agg_mds/commons.py @@ -39,7 +39,11 @@ class FieldAggregation: FieldDefinition = TypeVar("FieldDefinition") -def string_to_array(s: str) -> List[str]: +def string_to_array(s: str) -> Optional[List[str]]: + if s is None: + return None + if s == "": + return [] return [s] @@ -90,11 
+94,11 @@ class FieldDefinition: } FIELD_NORMALIZATION = { - "string_to_array": string_to_array, "string_to_number": string_to_number, "string_to_integer": string_to_integer, "string_to_object": string_to_dict, "dict_to_array": dict_to_array, + "string_to_array": string_to_array, } MAP_TYPE_TO_JSON_SCHEMA_TYPES = { @@ -114,13 +118,13 @@ def __post_init__(self): } def get_es_type(self): - type = FieldDefinition.ES_TYPE_MAPPING.get(self.type, self.type) + field_type = FieldDefinition.ES_TYPE_MAPPING.get(self.type, self.type) if self.type == "array" and self.items and self.items["type"] == "string": - type = "keyword" + field_type = "keyword" - if type == "keyword": + if field_type == "keyword": return { - "type": type, + "type": field_type, "fields": { "analyzed": { "type": "text", @@ -131,11 +135,11 @@ def get_es_type(self): }, } - return {"type": type} + return {"type": field_type} def to_schema(self, es_types: bool = False, all_fields: bool = False): """ - Maps the FieldDefinition to either a JSON schema or a Elastic Search mapping + Maps the FieldDefinition to either a JSON schema or an Elasticsearch mapping """ res = self.get_es_type() if es_types else {"type": self.type} if self.properties is not None: @@ -162,7 +166,9 @@ def normalize_value(self, value) -> Any: conversion = f"{value_type}_to_{self.type}" converter = FieldDefinition.FIELD_NORMALIZATION.get(conversion, None) if converter is None: - logger.debug(f"error normalizing {value}") + logger.warning( + f"warning normalizing {value} via converter {conversion} not applied." 
+ ) return value return converter(value) diff --git a/src/mds/agg_mds/datastore/__init__.py b/src/mds/agg_mds/datastore/__init__.py index 5d2fb137..a9d57f13 100644 --- a/src/mds/agg_mds/datastore/__init__.py +++ b/src/mds/agg_mds/datastore/__init__.py @@ -13,8 +13,12 @@ async def init(hostname, port): await client.init(hostname, port) -async def drop_all(commons_mapping): - await client.drop_all(commons_mapping) +async def drop_all(): + await client.drop_all() + + +async def create_indexes(commons_mapping): + await client.create_indexes(commons_mapping) async def close(): diff --git a/src/mds/agg_mds/datastore/elasticsearch_dao.py b/src/mds/agg_mds/datastore/elasticsearch_dao.py index bbcbd876..49c50908 100644 --- a/src/mds/agg_mds/datastore/elasticsearch_dao.py +++ b/src/mds/agg_mds/datastore/elasticsearch_dao.py @@ -1,6 +1,7 @@ from elasticsearch import Elasticsearch, exceptions as es_exceptions from typing import Any, List, Dict import json +from math import ceil from mds import logger from mds.config import AGG_MDS_NAMESPACE, ES_RETRY_LIMIT, ES_RETRY_INTERVAL @@ -37,8 +38,8 @@ "tokenizer": { "ngram_tokenizer": { "type": "ngram", - "min_gram": 2, - "max_gram": 20, + "min_gram": 3, + "max_gram": 3, "token_chars": ["letter", "digit"], } }, @@ -74,11 +75,13 @@ async def init(hostname: str = "0.0.0.0", port: int = 9200): ) -async def drop_all(common_mapping: dict): +async def drop_all(): for index in [AGG_MDS_INDEX, AGG_MDS_INFO_INDEX, AGG_MDS_CONFIG_INDEX]: res = elastic_search_client.indices.delete(index=index, ignore=[400, 404]) logger.debug(f"deleted index: {index}: {res}") + +async def create_indexes(common_mapping: dict): try: mapping = {**SEARCH_CONFIG, **common_mapping} res = elastic_search_client.indices.create(index=AGG_MDS_INDEX, body=mapping) @@ -124,7 +127,9 @@ def normalize_field(doc, key, normalize_type): if normalize_type == "number" and isinstance(doc[key], str): doc[key] = None except: - logger.debug(f"error normalizing {key} for a document") + 
logger.warning( + f"warning: normalizing {key} ({normalize_type}) for a document, elastic search will auto type" + ) doc[key] = None @@ -218,16 +223,33 @@ async def get_all_metadata(limit, offset, flatten=False): } } ) + return { + "results": flat, + "pagination": { + "hits": res["hits"]["total"], + "offset": offset, + "pageSize": limit, + "pages": ceil(int(res["hits"]["total"]) / limit), + }, + } else: - byCommons = {} + byCommons = { + "results": {}, + "pagination": { + "hits": res["hits"]["total"], + "offset": offset, + "pageSize": limit, + "pages": ceil(int(res["hits"]["total"]) / limit), + }, + } for record in res["hits"]["hits"]: id = record["_id"] normalized = record["_source"] commons_name = normalized["commons_name"] if commons_name not in byCommons: - byCommons[commons_name] = [] - byCommons[commons_name].append( + byCommons["results"][commons_name] = [] + byCommons["results"][commons_name].append( { id: { "gen3_discovery": normalized, @@ -341,7 +363,7 @@ async def get_number_aggregation_for_field(field: str): "size": 0, "aggs": { field: {"sum": {"field": field}}, - "missing": {"missing": {"field": field}}, + "missing": {"missing_bucket": {"field": field}}, "types_count": {"value_count": {"field": field}}, }, } @@ -360,7 +382,7 @@ async def get_number_aggregation_for_field(field: str): field: { "total_items": res["hits"]["total"], "sum": agg_results[field]["value"], - "missing": agg_results["missing"]["doc_count"], + "missing": agg_results["missing_bucket"]["doc_count"], } } diff --git a/src/mds/agg_mds/query.py b/src/mds/agg_mds/query.py index 6d64d96d..4e5d1c46 100644 --- a/src/mds/agg_mds/query.py +++ b/src/mds/agg_mds/query.py @@ -39,6 +39,9 @@ async def metadata( 20, description="Maximum number of records returned. (max: 2000)" ), offset: int = Query(0, description="Return results at this given offset."), + flatten: bool = Query( + False, description="Return the results without grouping items by commons." 
+ ), ): # TODO WFH How to properly return this? We think grouping by MDS is probably # not ideal in reality. We already have commons_name in the results. @@ -54,7 +57,7 @@ async def metadata( ... } """ - return await datastore.get_all_metadata(limit, offset) + return await datastore.get_all_metadata(limit, offset, flatten) @mod.get("/aggregate/metadata/{name}") diff --git a/src/mds/populate.py b/src/mds/populate.py index d4859790..fa9c9c82 100644 --- a/src/mds/populate.py +++ b/src/mds/populate.py @@ -173,7 +173,8 @@ async def main(commons_config: Commons) -> None: } } - await datastore.drop_all(commons_mapping=field_mapping) + await datastore.drop_all() # TODO: rename indexes to old + await datastore.create_indexes(commons_mapping=field_mapping) for name, common in commons_config.gen3_commons.items(): logger.info(f"Populating {name} using Gen3 MDS connector") diff --git a/tests/test_agg_mds_adapters.py b/tests/test_agg_mds_adapters.py index 98d18ee4..650fd58f 100644 --- a/tests/test_agg_mds_adapters.py +++ b/tests/test_agg_mds_adapters.py @@ -3151,7 +3151,7 @@ def test_json_path_expression(): ) # test non existent path - assert get_json_path_value("study2.study_description_summary", sample1) == "" + assert get_json_path_value("study2.study_description_summary", sample1) is None # test bad path assert get_json_path_value(".contributors", sample1) is None From 719db0fecf99e1ba071982668999b29a70b7df59 Mon Sep 17 00:00:00 2001 From: Craig Barnes Date: Wed, 6 Jul 2022 16:03:09 -0500 Subject: [PATCH 13/70] fix aggMDS failing unit test: all pass --- src/mds/agg_mds/commons.py | 4 +- .../agg_mds/datastore/elasticsearch_dao.py | 66 +++++++++++----- src/mds/agg_mds/query.py | 66 +++++++++++----- src/mds/populate.py | 4 +- tests/test_agg_mds_commons.py | 76 +++++++++++++++++-- tests/test_agg_mds_elasticsearch_dao.py | 24 ++++-- tests/test_agg_mds_query.py | 75 +++++++++--------- tests/test_populate.py | 37 +++++++-- 8 files changed, 251 insertions(+), 101 deletions(-) diff 
--git a/src/mds/agg_mds/commons.py b/src/mds/agg_mds/commons.py index dcc85913..20b37e05 100644 --- a/src/mds/agg_mds/commons.py +++ b/src/mds/agg_mds/commons.py @@ -244,9 +244,9 @@ def __post_init__(self): self.configuration = Config(settings=Settings()) -def parse_config(data: Dict[str, Any]) -> Commons: +def parse_config(data: str) -> Commons: """ - parses a aggregated config which defines the list of MDS services and the mapping of field to column names + parses an aggregated config which defines the list of MDS services and the mapping of field to column names for the Ecosystem browser. Returns a dictionary of MDSInfo entries """ diff --git a/src/mds/agg_mds/datastore/elasticsearch_dao.py b/src/mds/agg_mds/datastore/elasticsearch_dao.py index 49c50908..a73b88e8 100644 --- a/src/mds/agg_mds/datastore/elasticsearch_dao.py +++ b/src/mds/agg_mds/datastore/elasticsearch_dao.py @@ -1,5 +1,5 @@ from elasticsearch import Elasticsearch, exceptions as es_exceptions -from typing import Any, List, Dict +from typing import List, Dict, Optional, Tuple import json from math import ceil from mds import logger @@ -205,7 +205,45 @@ async def get_commons(): return [] -async def get_all_metadata(limit, offset, flatten=False): +def count(value) -> int: + """ + returns the length of the value if list or dict otherwise returns 0 + """ + if isinstance(value, dict) or isinstance(value, list): + return len(value) + return 0 + + +def process_record(record: dict, counts: Optional[str]) -> Tuple[str, dict]: + """ + processed an MDS record from the search + returns the id and record, if counts is found in the record the length is returned + instead of the entry. 
+ """ + _id = record["_id"] + normalized = record["_source"] + if counts in normalized: + normalized[counts] = count(normalized[counts]) + return _id, normalized + + +async def get_all_metadata(limit, offset, counts: Optional[str] = None, flatten=False): + """ + Queries elastic search for metadata and returns up to the limit + offset: starting index to return + counts: converts the count of the entry[count] if it is a dict or array + returns: + flattend == true + results : MDS results as a dict + paging info + flattend == false + results : { + commonsA: metadata + commonsB: metadata + ... + }, + paging info + """ try: res = elastic_search_client.search( index=AGG_MDS_INDEX, @@ -214,15 +252,8 @@ async def get_all_metadata(limit, offset, flatten=False): if flatten: flat = [] for record in res["hits"]["hits"]: - id = record["_id"] - normalized = record["_source"] - flat.append( - { - id: { - "gen3_discovery": normalized, - } - } - ) + id, normalized = process_record(record, counts) + flat.append({id: {"gen3_discovery": normalized}}) return { "results": flat, "pagination": { @@ -243,19 +274,14 @@ async def get_all_metadata(limit, offset, flatten=False): }, } for record in res["hits"]["hits"]: - id = record["_id"] - normalized = record["_source"] + id, normalized = process_record(record, counts) commons_name = normalized["commons_name"] - - if commons_name not in byCommons: + if commons_name not in byCommons["results"]: byCommons["results"][commons_name] = [] byCommons["results"][commons_name].append( - { - id: { - "gen3_discovery": normalized, - } - } + {id: {"gen3_discovery": normalized}} ) + return byCommons except Exception as error: logger.error(error) diff --git a/src/mds/agg_mds/query.py b/src/mds/agg_mds/query.py index 4e5d1c46..11edd285 100644 --- a/src/mds/agg_mds/query.py +++ b/src/mds/agg_mds/query.py @@ -2,8 +2,6 @@ from starlette.status import HTTP_404_NOT_FOUND from mds import config from mds.agg_mds import datastore -from typing import Optional - mod = 
APIRouter() @@ -32,7 +30,7 @@ async def get_commons(what: str): ) -@mod.get("/aggregate/metadata") +@mod.get("/aggregate/metadata_paged") async def metadata( _: Request, limit: int = Query( @@ -40,11 +38,53 @@ async def metadata( ), offset: int = Query(0, description="Return results at this given offset."), flatten: bool = Query( - False, description="Return the results without grouping items by commons." + True, description="Return the results without grouping items by commons." ), ): # TODO WFH How to properly return this? We think grouping by MDS is probably # not ideal in reality. We already have commons_name in the results. + """ + Returns all metadata from all registered commons in the form: + { + results: { + "commonA" : { + ... Metadata + }, + "commonB" : { + ... Metadata + } + ... + }, + "pagination": { + "hits": 64, + "offset": 0, + "pageSize": 20, + "pages": 4 + } + } + + The flatten option removes the commons namespace so all results are a child or results: + results: { + ... Metadata from commons A + ... Metadata from commons B + } + ... + }, + """ + return await datastore.get_all_metadata(limit, offset, flatten) + + +@mod.get("/aggregate/metadata") +async def metadata( + _: Request, + limit: int = Query( + 20, description="Maximum number of records returned. (max: 2000)" + ), + offset: int = Query(0, description="Return results at this given offset."), + counts: str = Query( + "", description="Return count of a field instead of the value." + ), +): """ Returns all metadata from all registered commons in the form: { @@ -57,7 +97,8 @@ async def metadata( ... 
} """ - return await datastore.get_all_metadata(limit, offset, flatten) + results = await datastore.get_all_metadata(limit, offset, counts, False) + return results.get("results", {}) @mod.get("/aggregate/metadata/{name}") @@ -105,21 +146,6 @@ async def metadata_info(name: str): ) -@mod.get("/aggregate/summary/{field}") -async def metadata_aggregations(field: str): - res = await datastore.get_number_aggregations(field) - if res: - return res - else: - raise HTTPException( - HTTP_404_NOT_FOUND, - { - "message": f"metadata_aggregations: no common exists with the given: {field}", - "code": 404, - }, - ) - - @mod.get("/aggregate/metadata/guid/{guid:path}") async def metadata_name_guid(guid: str): """Get the metadata of the GUID in the named commons.""" diff --git a/src/mds/populate.py b/src/mds/populate.py index fa9c9c82..0c9bc4fa 100644 --- a/src/mds/populate.py +++ b/src/mds/populate.py @@ -102,7 +102,7 @@ async def populate_info(commons_config: Commons) -> None: if commons_config.configuration.schema: json_schema = { k: v.to_schema(all_fields=True) - for k, v in commons.configuration.schema.items() + for k, v in commons_config.configuration.schema.items() } await datastore.update_global_info("schema", json_schema) await populate_drs_info(commons_config) @@ -167,7 +167,7 @@ async def main(commons_config: Commons) -> None: "commons": { "properties": { k: v.to_schema(True) - for k, v in commons.configuration.schema.items() + for k, v in commons_config.configuration.schema.items() } } } diff --git a/tests/test_agg_mds_commons.py b/tests/test_agg_mds_commons.py index 7994b113..2021b59d 100644 --- a/tests/test_agg_mds_commons.py +++ b/tests/test_agg_mds_commons.py @@ -5,14 +5,52 @@ from mds.agg_mds.commons import ( parse_config, Commons, + Config, + FieldDefinition, MDSInstance, AdapterMDSInstance, ) def test_parse_config(): - assert parse_config( + results = parse_config( + """ { + "configuration": { + "schema": { + "_subjects_count": { + "type": "integer" + }, + 
"year_awarded": { + "type": "integer" + }, + "__manifest": { + "type": "array", + "properties": { + "file_name": { + "type": "string" + }, + "file_size": { + "type": "integer" + } + } + }, + "tags": { + "type": "array" + }, + "study_description": {}, + "short_name": {}, + "full_name": {}, + "_unique_id": {}, + "study_id": {}, + "study_url": {}, + "commons_url": {}, + "authz": { + "type": "string" + } + } + }, + "gen3_commons": { "my_gen3_commons": { "mds_url": "http://mds", @@ -23,19 +61,43 @@ def test_parse_config(): "_subjects_count": "_subjects_count", "study_id": "study_id", "_unique_id": "_unique_id", - "study_description": "study_description", - }, + "study_description": "study_description" + } } }, "adapter_commons": { "non_gen3_commons": { "mds_url": "http://non-gen3", "commons_url": "non-gen3", - "adapter": "icpsr", + "adapter": "icpsr" } - }, + } } - ) == Commons( + """ + ) + expected = Commons( + configuration=Config( + schema={ + "_subjects_count": FieldDefinition(type="integer"), + "year_awarded": FieldDefinition(type="integer"), + "__manifest": FieldDefinition( + type="array", + properties={ + "file_name": FieldDefinition(type="string"), + "file_size": FieldDefinition(type="integer"), + }, + ), + "tags": FieldDefinition(type="array"), + "study_description": FieldDefinition(type="string"), + "short_name": FieldDefinition(type="string"), + "full_name": FieldDefinition(type="string"), + "_unique_id": FieldDefinition(type="string"), + "study_id": FieldDefinition(type="string"), + "study_url": FieldDefinition(type="string"), + "commons_url": FieldDefinition(type="string"), + "authz": FieldDefinition(type="string"), + } + ), gen3_commons={ "my_gen3_commons": MDSInstance( "http://mds", @@ -58,3 +120,5 @@ def test_parse_config(): ) }, ) + + assert expected == results diff --git a/tests/test_agg_mds_elasticsearch_dao.py b/tests/test_agg_mds_elasticsearch_dao.py index 2fe4299e..c4a51de0 100644 --- a/tests/test_agg_mds_elasticsearch_dao.py +++ 
b/tests/test_agg_mds_elasticsearch_dao.py @@ -53,7 +53,7 @@ async def test_drop_all(): "mds.agg_mds.datastore.elasticsearch_dao.elastic_search_client.indices", MagicMock(), ) as mock_indices: - await elasticsearch_dao.drop_all(COMMON_MAPPING) + await elasticsearch_dao.drop_all() mock_indices.delete.assert_has_calls( [ call(index=AGG_MDS_INDEX, ignore=[400, 404]), @@ -62,6 +62,15 @@ async def test_drop_all(): ], any_order=True, ) + + +@pytest.mark.asyncio +async def test_create_indexes(): + with patch( + "mds.agg_mds.datastore.elasticsearch_dao.elastic_search_client.indices", + MagicMock(), + ) as mock_indices: + await elasticsearch_dao.create_indexes(common_mapping=COMMON_MAPPING) mock_indices.create.assert_has_calls( [ call(body={**SEARCH_CONFIG, **COMMON_MAPPING}, index=AGG_MDS_INDEX), @@ -82,7 +91,8 @@ async def test_create_if_exists(): ) ), ) as mock_indices: - await elasticsearch_dao.drop_all(COMMON_MAPPING) + await elasticsearch_dao.drop_all() + await elasticsearch_dao.create_indexes(COMMON_MAPPING) @pytest.mark.asyncio @@ -93,7 +103,7 @@ async def test_create_index_raise_exception(): MagicMock(side_effect=es_exceptions.RequestError(403, "expect_to_fail")), ) as mock_indices: try: - await elasticsearch_dao.drop_all(COMMON_MAPPING) + await elasticsearch_dao.create_indexes(common_mapping=COMMON_MAPPING) except Exception as exc: assert isinstance(exc, es_exceptions.RequestError) == True @@ -111,7 +121,7 @@ async def test_update_metadata(): "my_id": { "gen3_discovery": { "some_field": "some_value", - "__manifest": "{}", + "__manifest": {}, "sites": "", } } @@ -131,7 +141,7 @@ async def test_update_metadata(): index="default_namespace-commons-info-index", ), call( - body={"some_field": "some_value", "__manifest": {}, "sites": None}, + body={"some_field": "some_value", "__manifest": {}, "sites": ""}, doc_type="commons", id="my_id", index="default_namespace-commons-index", @@ -222,7 +232,7 @@ async def test_metadata_tags(): with patch( 
"mds.agg_mds.datastore.elasticsearch_dao.elastic_search_client", MagicMock() ) as mock_client: - await elasticsearch_dao.metadata_tags("my-commons") + await elasticsearch_dao.metadata_tags() mock_client.search.assert_called_with( index="default_namespace-commons-index", body={ @@ -247,7 +257,7 @@ async def test_metadata_tags(): "mds.agg_mds.datastore.elasticsearch_dao.elastic_search_client.search", MagicMock(side_effect=Exception("some error")), ) as mock_search: - assert await elasticsearch_dao.metadata_tags("my-commons") == [] + assert await elasticsearch_dao.metadata_tags() == [] @pytest.mark.asyncio diff --git a/tests/test_agg_mds_query.py b/tests/test_agg_mds_query.py index 51b7c381..252c25fe 100644 --- a/tests/test_agg_mds_query.py +++ b/tests/test_agg_mds_query.py @@ -34,24 +34,26 @@ async def test_aggregate_commons(client): @pytest.mark.asyncio async def test_aggregate_metadata(client): with patch.object( - datastore, "get_all_metadata", AsyncMock(return_value=[]) + datastore, "get_all_metadata", AsyncMock(return_value={"results": []}) ) as datastore_mock: resp = client.get("/aggregate/metadata") assert resp.status_code == 200 assert resp.json() == [] - datastore.get_all_metadata.assert_called_with(20, 0) + datastore.get_all_metadata.assert_called_with(20, 0, "", False) mock_data = { - "commons1": [ - { - "study1": {}, - } - ], - "commons2": [ - { - "study2": {}, - } - ], + "results": { + "commons1": [ + { + "study1": {}, + } + ], + "commons2": [ + { + "study2": {}, + } + ], + } } with patch.object( @@ -59,8 +61,8 @@ async def test_aggregate_metadata(client): ) as datastore_mock: resp = client.get("/aggregate/metadata") assert resp.status_code == 200 - assert resp.json() == mock_data - datastore.get_all_metadata.assert_called_with(20, 0) + assert resp.json() == mock_data["results"] + datastore.get_all_metadata.assert_called_with(20, 0, "", False) @pytest.mark.asyncio @@ -92,24 +94,39 @@ async def test_aggregate_metadata_name(client): @pytest.mark.asyncio 
async def test_aggregate_metadata_tags(client): with patch.object( - datastore, "get_commons_attribute", AsyncMock(return_value=None) + datastore, "get_all_tags", AsyncMock(return_value={}) ) as datastore_mock: - resp = client.get("/aggregate/metadata/commons1/tags") + resp = client.get("/aggregate/tags") assert resp.status_code == 404 assert resp.json() == { "detail": { "code": 404, - "message": "no common exists with the given: commons1", + "message": "error retrieving tags from service", } } + tags = { + "Access": {"total": 63, "names": [{"restricted": 63}]}, + "Category": { + "total": 61, + "names": [ + { + "Family/Twin/Trios": 39, + "Prospective Longitudinal Cohort": 10, + "Tumor vs. Matched-Normal": 9, + "Cross-Sectional": 3, + } + ], + }, + } + with patch.object( - datastore, "get_commons_attribute", AsyncMock(return_value=["mytag1"]) + datastore, "get_all_tags", AsyncMock(return_value=tags) ) as datastore_mock: - resp = client.get("/aggregate/metadata/commons1/tags") + resp = client.get("/aggregate/tags") assert resp.status_code == 200 - assert resp.json() == ["mytag1"] - datastore.get_commons_attribute.assert_called_with("commons1", "tags") + assert resp.json() == tags + datastore.get_all_tags.assert_called_with() @pytest.mark.asyncio @@ -138,22 +155,6 @@ async def test_aggregate_metadata_info(client): datastore.get_commons_attribute.assert_called_with("commons1", "info") -@pytest.mark.asyncio -async def test_metadata_aggregations(client): - with patch.object( - datastore, "get_aggregations", AsyncMock(return_value=None) - ) as datastore_mock: - resp = client.get("/aggregate/metadata/commons1/aggregations") - assert resp.status_code == 404 - assert resp.json() == { - "detail": { - "code": 404, - "message": "no common exists with the given: commons1", - } - } - datastore.get_aggregations.assert_called_with("commons1") - - @pytest.mark.asyncio async def test_aggregate_metadata_name_guid(client): with patch.object( diff --git a/tests/test_populate.py 
b/tests/test_populate.py index a81ce3fc..199e2fbe 100644 --- a/tests/test_populate.py +++ b/tests/test_populate.py @@ -7,7 +7,13 @@ main, filter_entries, ) -from mds.agg_mds.commons import AdapterMDSInstance, MDSInstance, Commons +from mds.agg_mds.commons import ( + AdapterMDSInstance, + MDSInstance, + Commons, + FieldDefinition, + Config, +) from mds.agg_mds import adapters from mds.agg_mds import datastore import json @@ -86,7 +92,7 @@ async def test_populate_metadata(): async def test_main(): with patch("mds.config.USE_AGG_MDS", False): with pytest.raises(SystemExit) as pytest_wrapped_e: - await main(None, "", 0) + await main(commons_config=None) assert pytest_wrapped_e.type == SystemExit assert pytest_wrapped_e.value.code == 1 @@ -94,6 +100,7 @@ async def test_main(): patch("mds.populate.pull_mds", MagicMock()).start() patch.object(datastore, "init", AsyncMock()).start() patch.object(datastore, "drop_all", AsyncMock()).start() + patch.object(datastore, "create_indexes", AsyncMock()).start() patch.object(datastore, "get_status", AsyncMock(return_value="OK")).start() patch.object(datastore, "close", AsyncMock()).start() patch.object(datastore, "update_metadata", AsyncMock()).start() @@ -101,6 +108,12 @@ async def test_main(): await main( Commons( + configuration=Config( + schema={ + "_subjects_count": FieldDefinition(type="integer"), + "year_awarded": FieldDefinition(type="integer"), + } + ), gen3_commons={ "my_commons": MDSInstance( mds_url="", @@ -115,9 +128,7 @@ async def test_main(): adapter="icpsr", ), }, - ), - "", - 0, + ) ) @@ -159,6 +170,12 @@ def test_parse_config_from_file(): with NamedTemporaryFile(mode="w+", delete=False) as fp: json.dump( { + "configuration": { + "schema": { + "_subjects_count": {"type": "integer"}, + "study_description": {}, + } + }, "gen3_commons": { "mycommons": { "mds_url": "http://mds", @@ -187,7 +204,13 @@ def test_parse_config_from_file(): assert ( config.to_json() == Commons( - { + configuration=Config( + schema={ + 
"_subjects_count": FieldDefinition(type="integer"), + "study_description": FieldDefinition(type="string"), + } + ), + gen3_commons={ "mycommons": MDSInstance( "http://mds", "http://commons", @@ -201,7 +224,7 @@ def test_parse_config_from_file(): }, ) }, - { + adapter_commons={ "non-gen3": AdapterMDSInstance( "http://non-gen3", "non-gen3", From 8bb67f16c23b71a1c155dbc7b5a092722e4b8373 Mon Sep 17 00:00:00 2001 From: Craig Barnes Date: Wed, 6 Jul 2022 17:14:36 -0500 Subject: [PATCH 14/70] add aggMDS unit test to increase coverage --- .../agg_mds/datastore/elasticsearch_dao.py | 61 ------------------- src/mds/agg_mds/functions.py | 0 src/mds/agg_mds/query.py | 2 +- tests/test_agg_mds_query.py | 50 ++++++++++++++- tests/test_populate.py | 1 + 5 files changed, 51 insertions(+), 63 deletions(-) delete mode 100644 src/mds/agg_mds/functions.py diff --git a/src/mds/agg_mds/datastore/elasticsearch_dao.py b/src/mds/agg_mds/datastore/elasticsearch_dao.py index a73b88e8..b8ec4ae0 100644 --- a/src/mds/agg_mds/datastore/elasticsearch_dao.py +++ b/src/mds/agg_mds/datastore/elasticsearch_dao.py @@ -1,6 +1,5 @@ from elasticsearch import Elasticsearch, exceptions as es_exceptions from typing import List, Dict, Optional, Tuple -import json from math import ceil from mds import logger from mds.config import AGG_MDS_NAMESPACE, ES_RETRY_LIMIT, ES_RETRY_INTERVAL @@ -119,20 +118,6 @@ async def create_indexes(common_mapping: dict): raise ex -def normalize_field(doc, key, normalize_type): - try: - if normalize_type == "object" and isinstance(doc[key], str): - value = doc[key] - doc[key] = None if value == "" else json.loads(value) - if normalize_type == "number" and isinstance(doc[key], str): - doc[key] = None - except: - logger.warning( - f"warning: normalizing {key} ({normalize_type}) for a document, elastic search will auto type" - ) - doc[key] = None - - async def update_metadata( name: str, data: List[Dict], @@ -382,52 +367,6 @@ async def get_aggregations(name): return [] -async def 
get_number_aggregation_for_field(field: str): - try: - # get the total number of documents in a commons namespace - query = { - "size": 0, - "aggs": { - field: {"sum": {"field": field}}, - "missing": {"missing_bucket": {"field": field}}, - "types_count": {"value_count": {"field": field}}, - }, - } - nested = False - parts = field.split(".") - if len(parts) == 2: - nested = True - query["aggs"] = { - field: {"nested": {"path": parts[0]}, "aggs": query["aggs"]} - } - - res = elastic_search_client.search(index=AGG_MDS_INDEX, body=query) - agg_results = res["aggregations"][field] if nested else res["aggregations"] - - return { - field: { - "total_items": res["hits"]["total"], - "sum": agg_results[field]["value"], - "missing": agg_results["missing_bucket"]["doc_count"], - } - } - - except Exception as error: - logger.error(error) - return {} - - -async def does_exists(field): - try: - query = {"size": 0, "query": {"bool": {"must": {"exists": {"field": field}}}}} - res = elastic_search_client.search(index=AGG_MDS_INDEX, body=query) - if res["hits"]["total"] > 0: - return True - except Exception as error: - logger.error(error) - return False - - async def get_by_guid(guid): try: data = elastic_search_client.get( diff --git a/src/mds/agg_mds/functions.py b/src/mds/agg_mds/functions.py deleted file mode 100644 index e69de29b..00000000 diff --git a/src/mds/agg_mds/query.py b/src/mds/agg_mds/query.py index 11edd285..14438222 100644 --- a/src/mds/agg_mds/query.py +++ b/src/mds/agg_mds/query.py @@ -71,7 +71,7 @@ async def metadata( ... 
}, """ - return await datastore.get_all_metadata(limit, offset, flatten) + return await datastore.get_all_metadata(limit, offset, counts=None, flatten=flatten) @mod.get("/aggregate/metadata") diff --git a/tests/test_agg_mds_query.py b/tests/test_agg_mds_query.py index 252c25fe..222da749 100644 --- a/tests/test_agg_mds_query.py +++ b/tests/test_agg_mds_query.py @@ -5,7 +5,6 @@ from unittest.mock import patch from conftest import AsyncMock - # https://github.com/encode/starlette/issues/440 nest_asyncio.apply() @@ -65,6 +64,33 @@ async def test_aggregate_metadata(client): datastore.get_all_metadata.assert_called_with(20, 0, "", False) +@pytest.mark.asyncio +async def test_aggregate_metadata_paged(client): + with patch.object( + datastore, "get_all_metadata", AsyncMock(return_value={"results": []}) + ) as datastore_mock: + resp = client.get("/aggregate/metadata_paged") + assert resp.status_code == 200 + assert resp.json() == {"results": []} + datastore.get_all_metadata.assert_called_with(20, 0, counts=None, flatten=True) + + mock_data = { + "results": [ + {"study1": {}}, + {"study2": {}}, + ], + "pagination": {"hits": 64, "offset": 0, "pageSize": 20, "pages": 4}, + } + + with patch.object( + datastore, "get_all_metadata", AsyncMock(return_value=mock_data) + ) as datastore_mock: + resp = client.get("/aggregate/metadata_paged") + assert resp.status_code == 200 + assert resp.json() == mock_data + datastore.get_all_metadata.assert_called_with(20, 0, counts=None, flatten=True) + + @pytest.mark.asyncio async def test_aggregate_metadata_name(client): with patch.object( @@ -177,3 +203,25 @@ async def test_aggregate_metadata_name_guid(client): assert resp.status_code == 200 assert resp.json() == {"study2": {}} datastore.get_by_guid.assert_called_with("123") + + +@pytest.mark.asyncio +async def test_aggregate_metadata_get_schema(client): + schema = { + "_subjects_count": {"type": "integer", "description": ""}, + "year_awarded": {"type": "integer", "description": ""}, + } + with 
patch.object( + datastore, + "get_commons_attribute", + AsyncMock( + return_value={ + "_subjects_count": {"type": "integer", "description": ""}, + "year_awarded": {"type": "integer", "description": ""}, + } + ), + ) as datastore_mock: + resp = client.get("/aggregate/info/schema") + assert resp.status_code == 200 + assert resp.json() == schema + datastore.get_commons_attribute.assert_called_with("schema", "") diff --git a/tests/test_populate.py b/tests/test_populate.py index 199e2fbe..bbc9d2db 100644 --- a/tests/test_populate.py +++ b/tests/test_populate.py @@ -103,6 +103,7 @@ async def test_main(): patch.object(datastore, "create_indexes", AsyncMock()).start() patch.object(datastore, "get_status", AsyncMock(return_value="OK")).start() patch.object(datastore, "close", AsyncMock()).start() + patch.object(datastore, "update_global_info", AsyncMock()).start() patch.object(datastore, "update_metadata", AsyncMock()).start() patch.object(adapters, "get_metadata", MagicMock()).start() From ba0f4dd2d63193f0a99c5750a50060ec7881d118 Mon Sep 17 00:00:00 2001 From: Craig Barnes Date: Wed, 6 Jul 2022 18:09:38 -0500 Subject: [PATCH 15/70] add__manifest filter test, increase converage --- .../agg_mds/datastore/elasticsearch_dao.py | 1 + tests/test_agg_mds_query.py | 85 ++++++++++++++++++- 2 files changed, 85 insertions(+), 1 deletion(-) diff --git a/src/mds/agg_mds/datastore/elasticsearch_dao.py b/src/mds/agg_mds/datastore/elasticsearch_dao.py index b8ec4ae0..295a043e 100644 --- a/src/mds/agg_mds/datastore/elasticsearch_dao.py +++ b/src/mds/agg_mds/datastore/elasticsearch_dao.py @@ -234,6 +234,7 @@ async def get_all_metadata(limit, offset, counts: Optional[str] = None, flatten= index=AGG_MDS_INDEX, body={"size": limit, "from": offset, "query": {"match_all": {}}}, ) + print(res) if flatten: flat = [] for record in res["hits"]["hits"]: diff --git a/tests/test_agg_mds_query.py b/tests/test_agg_mds_query.py index 222da749..d6cd361d 100644 --- a/tests/test_agg_mds_query.py +++ 
b/tests/test_agg_mds_query.py @@ -2,7 +2,7 @@ import pytest import nest_asyncio from mds.agg_mds import datastore -from unittest.mock import patch +from unittest.mock import patch, MagicMock from conftest import AsyncMock # https://github.com/encode/starlette/issues/440 @@ -91,6 +91,89 @@ async def test_aggregate_metadata_paged(client): datastore.get_all_metadata.assert_called_with(20, 0, counts=None, flatten=True) +@pytest.mark.asyncio +async def test_aggregate_metadata_counts(client): + mock_data = { + "took": 3, + "timed_out": "false", + "_shards": {"total": 1, "successful": 1, "skipped": 0, "failed": 0}, + "hits": { + "total": 161, + "max_score": 1.0, + "hits": [ + { + "_index": "default_namespace-commons-index", + "_type": "commons", + "_id": "815616c0-dfsdfjjj", + "_score": 1.0, + "_source": { + "link": "", + "tags": [ + {"name": "restricted", "category": "Access"}, + {"name": "genomic", "category": "category"}, + ], + "commons": "LI", + "_unique_id": "815616c0-c4a4-4883-9107-a05694499a36", + "dataset_code": "LI", + "brief_summary": "Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua.", + "dataset_title": "Lorem ipsum dolor sit amet", + "samples_count": "", + "subjects_count": "", + "data_files_count": 11062, + "_subjects_count": "", + "study_description": "Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. 
Ultricies tristique nulla aliquet enim tortor at auctor.", + "short_name": "Lorem ipsum dolor sit amet", + "full_name": "Lorem ipsum dolor sit amet, consectetur adipiscing elit", + "commons_name": "Lorem ipsum", + "__manifest": [ + {"filename": "foo2.txt"}, + {"filename": "foo3.txt"}, + ], + }, + } + ], + }, + } + + results = { + "Lorem ipsum": [ + { + "815616c0-dfsdfjjj": { + "gen3_discovery": { + "link": "", + "tags": [ + {"name": "restricted", "category": "Access"}, + {"name": "genomic", "category": "category"}, + ], + "commons": "LI", + "_unique_id": "815616c0-c4a4-4883-9107-a05694499a36", + "dataset_code": "LI", + "brief_summary": "Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua.", + "dataset_title": "Lorem ipsum dolor sit amet", + "samples_count": "", + "subjects_count": "", + "data_files_count": 11062, + "_subjects_count": "", + "study_description": "Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. 
Ultricies tristique nulla aliquet enim tortor at auctor.", + "short_name": "Lorem ipsum dolor sit amet", + "full_name": "Lorem ipsum dolor sit amet, consectetur adipiscing elit", + "commons_name": "Lorem ipsum", + "__manifest": 2, + } + } + } + ] + } + + with patch( + "mds.agg_mds.datastore.elasticsearch_dao.elastic_search_client.search", + MagicMock(return_value=mock_data), + ) as search: + resp = client.get("/aggregate/metadata?counts=__manifest") + assert resp.status_code == 200 + assert resp.json() == results + + @pytest.mark.asyncio async def test_aggregate_metadata_name(client): with patch.object( From 2547b319812a022a8c8fe91910ede5c5b0f3c9d7 Mon Sep 17 00:00:00 2001 From: Craig Barnes Date: Wed, 6 Jul 2022 21:46:07 -0500 Subject: [PATCH 16/70] fix harvard adapter unit test --- tests/test_agg_mds_adapters.py | 109 ++++++++++++++++----------------- 1 file changed, 53 insertions(+), 56 deletions(-) diff --git a/tests/test_agg_mds_adapters.py b/tests/test_agg_mds_adapters.py index 3381c99f..6506d902 100644 --- a/tests/test_agg_mds_adapters.py +++ b/tests/test_agg_mds_adapters.py @@ -3110,15 +3110,18 @@ def test_gen3_adapter(): per_item_override = {"GSE63878": {"dbgap_accession_number": "dg.333344.222"}} - get_metadata( - "gen3", - "http://test/ok/", - None, - config={"batchSize": 64}, - mappings=field_mappings, - keepOriginalFields=False, - perItemValues=per_item_override, - ) == expected + assert ( + get_metadata( + "gen3", + "http://test/ok/", + None, + config={"batchSize": 64}, + mappings=field_mappings, + keepOriginalFields=False, + perItemValues=per_item_override, + ) + == expected + ) respx.get( "http://test/error/mds/metadata?data=True&_guid_type=discovery_metadata&limit=1000&offset=0" @@ -3145,7 +3148,7 @@ def test_gen3_adapter(): get_metadata("gen3", "http://test/timeouterror/", None, field_mappings) except Exception as exc: - assert isinstance(exc, RetryError) == True + assert isinstance(exc, RetryError) @respx.mock @@ -4067,17 +4070,16 @@ def 
test_get_metadata_harvard_dataverse(): # failed calls respx.get( - "http://test/ok/datasets/:persistentId/?persistentId=doi:10.7910/DVN/5B8YM8", - status_code=200, - content=json.loads(dataset_json_response), - content_type="text/plain;charset=UTF-8", + "http://test/ok/datasets/:persistentId/?persistentId=doi:10.7910/DVN/5B8YM8" + ).mock( + return_value=httpx.Response( + status_code=200, + json=json.loads(dataset_json_response), + ) ) - respx.get( - "http://test/ok/access/datafile/6297263/metadata/ddi", - status_code=200, - content=file_ddi_response, - content_type="text/plain;charset=UTF-8", + respx.get("http://test/ok/access/datafile/6297263/metadata/ddi").mock( + return_value=httpx.Response(status_code=200, json=file_ddi_response) ) assert get_metadata("havard_dataverse", "http://test/ok", filters=None) == {} @@ -4100,6 +4102,19 @@ def test_get_metadata_harvard_dataverse(): == {} ) + respx.get( + "http://test/ok/datasets/:persistentId/?persistentId=doi:10.7910/DVN/5B8YM8" + "http://test/ok/datasets/:persistentId/?persistentId=doi:10.7910/DVN/5B8YM8" + ).mock( + return_value=httpx.Response( + status_code=200, json=json.loads(dataset_json_response) + ) + ) + + respx.get("http://test/ok/access/datafile/6297263/metadata/ddi").mock( + return_value=httpx.Response(status_code=200, text=file_ddi_response) + ) + # valid call assert ( get_metadata( @@ -4113,17 +4128,17 @@ def test_get_metadata_harvard_dataverse(): # valid single variable call respx.get( - "http://test/single_variable/datasets/:persistentId/?persistentId=doi:10.7910/DVN/5B8YM8", - status_code=200, - content=json.loads(dataset_json_response), - content_type="text/plain;charset=UTF-8", + "http://test/single_variable/datasets/:persistentId/?persistentId=doi:10.7910/DVN/5B8YM8" + ).mock( + return_value=httpx.Response( + status_code=200, json=json.loads(dataset_json_response) + ) ) - respx.get( - "http://test/single_variable/access/datafile/6297263/metadata/ddi", - status_code=200, - 
content=file_single_variable_ddi_response, - content_type="text/plain;charset=UTF-8", + respx.get("http://test/single_variable/access/datafile/6297263/metadata/ddi").mock( + return_value=httpx.Response( + status_code=200, text=file_single_variable_ddi_response + ) ) assert ( @@ -4138,10 +4153,12 @@ def test_get_metadata_harvard_dataverse(): # invalid responses respx.get( - "http://test/invalid_dataset_response/datasets/:persistentId/?persistentId=doi:10.7910/DVN/5B8YM8", - status_code=200, - content={"status": "ok"}, - content_type="text/plain;charset=UTF-8", + "http://test/invalid_dataset_response/datasets/:persistentId/?persistentId=doi:10.7910/DVN/5B8YM8" + ).mock( + return_value=httpx.Response( + status_code=200, + json={"status": "ok"}, + ) ) assert ( @@ -4155,11 +4172,8 @@ def test_get_metadata_harvard_dataverse(): ) respx.get( - "http://test/err404/datasets/:persistentId/?persistentId=doi:10.7910/DVN/5B8YM8", - status_code=404, - content={}, - content_type="text/plain:charset=UTF-8", - ) + "http://test/err404/datasets/:persistentId/?persistentId=doi:10.7910/DVN/5B8YM8" + ).mock(return_value=httpx.Response(status_code=404, json={})) assert ( get_metadata( @@ -4172,22 +4186,6 @@ def test_get_metadata_harvard_dataverse(): ) # Incorrect keys expected in adapter class - respx.get( - "http://test/different_keys/datasets/:persistentId/?persistentId=doi:10.7910/DVN/5B8YM8", - status_code=200, - content=json.loads(dataset_json_different_keys_response), - content_type="text/plain:charset=UTF-8", - ) - - assert ( - get_metadata( - "harvard_dataverse", - "http://test/different_keys", - filters={"persistent_ids": ["doi:10.7910/DVN/5B8YM8"]}, - mappings=field_mappings, - ) - == {} - ) try: from mds.agg_mds.adapters import HarvardDataverse @@ -4195,9 +4193,8 @@ def test_get_metadata_harvard_dataverse(): HarvardDataverse.getRemoteDataAsJson.retry.wait = wait_none() respx.get( - "http://test/timeouterror/datasets/:persistentId/?persistentId=doi:10.7910/DVN/5B8YM8", - 
content=httpx.TimeoutException, - ) + "http://test/timeouterror/datasets/:persistentId/?persistentId=doi:10.7910/DVN/5B8YM8" + ).mock(side_effect=httpx.TimeoutException) get_metadata( "harvard_dataverse", From 2272cecbc98cc9ac8d657310b3e3b18c0c2370b0 Mon Sep 17 00:00:00 2001 From: Craig Barnes Date: Wed, 6 Jul 2022 21:54:11 -0500 Subject: [PATCH 17/70] fix failing populate test --- tests/test_populate.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/test_populate.py b/tests/test_populate.py index bbc9d2db..876782fa 100644 --- a/tests/test_populate.py +++ b/tests/test_populate.py @@ -101,6 +101,7 @@ async def test_main(): patch.object(datastore, "init", AsyncMock()).start() patch.object(datastore, "drop_all", AsyncMock()).start() patch.object(datastore, "create_indexes", AsyncMock()).start() + patch.object(datastore, "index", AsyncMock()).start() patch.object(datastore, "get_status", AsyncMock(return_value="OK")).start() patch.object(datastore, "close", AsyncMock()).start() patch.object(datastore, "update_global_info", AsyncMock()).start() From a69583a3acbe321e668431e0aaf163665353b1d3 Mon Sep 17 00:00:00 2001 From: Craig Barnes Date: Wed, 6 Jul 2022 22:01:25 -0500 Subject: [PATCH 18/70] fix (again) populate unit test --- tests/test_populate.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_populate.py b/tests/test_populate.py index 876782fa..487f36e8 100644 --- a/tests/test_populate.py +++ b/tests/test_populate.py @@ -101,7 +101,7 @@ async def test_main(): patch.object(datastore, "init", AsyncMock()).start() patch.object(datastore, "drop_all", AsyncMock()).start() patch.object(datastore, "create_indexes", AsyncMock()).start() - patch.object(datastore, "index", AsyncMock()).start() + patch.object(datastore, "update_config_info", AsyncMock()).start() patch.object(datastore, "get_status", AsyncMock(return_value="OK")).start() patch.object(datastore, "close", AsyncMock()).start() patch.object(datastore, "update_global_info", 
AsyncMock()).start() From 35188873ec4318870fbb021f70592145de02cff9 Mon Sep 17 00:00:00 2001 From: Craig Barnes Date: Thu, 7 Jul 2022 13:00:15 -0500 Subject: [PATCH 19/70] increase test coverage --- src/mds/agg_mds/commons.py | 4 +- src/mds/agg_mds/datastore/__init__.py | 12 - .../agg_mds/datastore/elasticsearch_dao.py | 3 +- src/mds/agg_mds/query.py | 8 +- src/mds/populate.py | 5 +- tests/test_agg_mds_adapters.py | 66 ++++- tests/test_agg_mds_commons.py | 237 ++++++++++++++++++ tests/test_agg_mds_elasticsearch_dao.py | 10 +- tests/test_agg_mds_query.py | 194 +++++++++++++- tests/test_populate.py | 54 +++- 10 files changed, 550 insertions(+), 43 deletions(-) diff --git a/src/mds/agg_mds/commons.py b/src/mds/agg_mds/commons.py index 20b37e05..978bfbd7 100644 --- a/src/mds/agg_mds/commons.py +++ b/src/mds/agg_mds/commons.py @@ -40,8 +40,6 @@ class FieldAggregation: def string_to_array(s: str) -> Optional[List[str]]: - if s is None: - return None if s == "": return [] return [s] @@ -97,7 +95,7 @@ class FieldDefinition: "string_to_number": string_to_number, "string_to_integer": string_to_integer, "string_to_object": string_to_dict, - "dict_to_array": dict_to_array, + "object_to_array": dict_to_array, "string_to_array": string_to_array, } diff --git a/src/mds/agg_mds/datastore/__init__.py b/src/mds/agg_mds/datastore/__init__.py index a9d57f13..898f03a3 100644 --- a/src/mds/agg_mds/datastore/__init__.py +++ b/src/mds/agg_mds/datastore/__init__.py @@ -70,15 +70,3 @@ async def get_commons(): async def get_all_metadata(*args): return await client.get_all_metadata(*args) - - -async def get_aggregations(*args): - return await client.get_aggregations(*args) - - -async def get_number_aggregations(*args): - return await client.get_number_aggregation_for_field(*args) - - -async def search(*args): - return await client.search(*args) diff --git a/src/mds/agg_mds/datastore/elasticsearch_dao.py b/src/mds/agg_mds/datastore/elasticsearch_dao.py index 295a043e..6117f0bc 100644 --- 
a/src/mds/agg_mds/datastore/elasticsearch_dao.py +++ b/src/mds/agg_mds/datastore/elasticsearch_dao.py @@ -234,7 +234,6 @@ async def get_all_metadata(limit, offset, counts: Optional[str] = None, flatten= index=AGG_MDS_INDEX, body={"size": limit, "from": offset, "query": {"match_all": {}}}, ) - print(res) if flatten: flat = [] for record in res["hits"]["hits"]: @@ -326,7 +325,7 @@ async def metadata_tags(): return [] -async def get_commons_attribute(name, what): +async def get_commons_attribute(name): try: data = elastic_search_client.search( index=AGG_MDS_INFO_INDEX, diff --git a/src/mds/agg_mds/query.py b/src/mds/agg_mds/query.py index 14438222..62774ec5 100644 --- a/src/mds/agg_mds/query.py +++ b/src/mds/agg_mds/query.py @@ -20,7 +20,7 @@ async def get_commons(what: str): """ Returns information from the aggregate metadata service. """ - res = await datastore.get_commons_attribute(what, "") + res = await datastore.get_commons_attribute(what) if res: return res else: @@ -41,8 +41,6 @@ async def metadata( True, description="Return the results without grouping items by commons." ), ): - # TODO WFH How to properly return this? We think grouping by MDS is probably - # not ideal in reality. We already have commons_name in the results. """ Returns all metadata from all registered commons in the form: { @@ -71,7 +69,7 @@ async def metadata( ... }, """ - return await datastore.get_all_metadata(limit, offset, counts=None, flatten=flatten) + return await datastore.get_all_metadata(limit, offset, None, flatten) @mod.get("/aggregate/metadata") @@ -136,7 +134,7 @@ async def metadata_info(name: str): """ Returns information from the named commons. 
""" - res = await datastore.get_commons_attribute(name, "info") + res = await datastore.get_commons_attribute(name) if res: return res else: diff --git a/src/mds/populate.py b/src/mds/populate.py index 0c9bc4fa..080bd587 100644 --- a/src/mds/populate.py +++ b/src/mds/populate.py @@ -1,6 +1,6 @@ import asyncio from argparse import Namespace -from typing import Any, Dict, List +from typing import Any, Dict, List, Optional from mds.agg_mds import datastore, adapters from mds.agg_mds.mds import pull_mds from mds.agg_mds.commons import MDSInstance, ColumnsToFields, Commons, parse_config @@ -244,9 +244,10 @@ async def filter_entries( return filtered -def parse_config_from_file(path: Path) -> Commons: +def parse_config_from_file(path: Path) -> Optional[Commons]: if not path.exists(): logger.error(f"configuration file: {path} does not exist") + return None try: return parse_config(path.read_text()) except IOError as ex: diff --git a/tests/test_agg_mds_adapters.py b/tests/test_agg_mds_adapters.py index 6506d902..a1169611 100644 --- a/tests/test_agg_mds_adapters.py +++ b/tests/test_agg_mds_adapters.py @@ -323,6 +323,70 @@ def test_get_metadata_icpsr(): assert isinstance(exc, RetryError) == True +@respx.mock +def test_drs_indexd(): + json_data = [ + { + "hints": [".*dg\\.XXTS.*"], + "host": "https://mytest1.commons.io/", + "name": "DataSTAGE", + "type": "indexd", + }, + { + "hints": [".*dg\\.TSXX.*"], + "host": "https://commons2.io/index/", + "name": "Environmental DC", + "type": "indexd", + }, + ] + + expected = { + "info": {"created": "07/07/2022 15:28:46:UTC"}, + "cache": { + "dg.XXTS": { + "host": "mytest1.commons.io", + "name": "DataSTAGE", + "type": "indexd", + }, + "dg.TSXX": { + "host": "commons2.io", + "name": "Environmental DC", + "type": "indexd", + }, + }, + } + + respx.get("http://test/index/_dist").mock( + return_value=httpx.Response( + status_code=200, + json=json_data, + ) + ) + + results = get_metadata( + "drs_indexd", + "http://test", + filters=None, + ) + + 
assert results["cache"] == expected["cache"] + + respx.get("http://test/index/_dist").mock( + return_value=httpx.Response( + status_code=404, + json=None, + ) + ) + + results = get_metadata( + "drs_indexd", + "http://test", + filters=None, + ) + + assert results == {"results": {}} + + @respx.mock def test_get_metadata_clinicaltrials(): json_response = r"""{ @@ -4203,7 +4267,7 @@ def test_get_metadata_harvard_dataverse(): mappings=field_mappings, ) except Exception as exc: - assert isinstance(exc, RetryError) == True + assert isinstance(exc, RetryError) is True def test_missing_adapter(): diff --git a/tests/test_agg_mds_commons.py b/tests/test_agg_mds_commons.py index 2021b59d..18bec43e 100644 --- a/tests/test_agg_mds_commons.py +++ b/tests/test_agg_mds_commons.py @@ -6,12 +6,188 @@ parse_config, Commons, Config, + ColumnsToFields, FieldDefinition, MDSInstance, AdapterMDSInstance, ) +def test_convert_tp_schema(): + schema = FieldDefinition( + type="object", + properties={ + "_subjects_count": FieldDefinition(type="integer"), + "year_awarded": FieldDefinition(type="integer", default=2000), + "__manifest": FieldDefinition( + type="array", + properties={ + "file_name": FieldDefinition(type="string"), + "file_size": FieldDefinition(type="integer"), + }, + ), + "tags": FieldDefinition(type="array"), + "study_description": FieldDefinition(type="string"), + "short_name": FieldDefinition(type="string"), + "full_name": FieldDefinition(type="string"), + "_unique_id": FieldDefinition(type="string"), + "study_id": FieldDefinition(type="string"), + "study_url": FieldDefinition(type="string"), + "commons_url": FieldDefinition(type="string"), + "authz": FieldDefinition(type="string"), + }, + ) + + converted = schema.to_schema(True) + + expected = { + "properties": { + "__manifest": { + "properties": { + "file_name": { + "fields": { + "analyzed": { + "analyzer": "ngram_analyzer", + "search_analyzer": "search_analyzer", + "term_vector": "with_positions_offsets", + "type": "text", + 
} + }, + "type": "keyword", + }, + "file_size": {"type": "long"}, + }, + "type": "nested", + }, + "_subjects_count": {"type": "long"}, + "_unique_id": { + "fields": { + "analyzed": { + "analyzer": "ngram_analyzer", + "search_analyzer": "search_analyzer", + "term_vector": "with_positions_offsets", + "type": "text", + } + }, + "type": "keyword", + }, + "authz": { + "fields": { + "analyzed": { + "analyzer": "ngram_analyzer", + "search_analyzer": "search_analyzer", + "term_vector": "with_positions_offsets", + "type": "text", + } + }, + "type": "keyword", + }, + "commons_url": { + "fields": { + "analyzed": { + "analyzer": "ngram_analyzer", + "search_analyzer": "search_analyzer", + "term_vector": "with_positions_offsets", + "type": "text", + } + }, + "type": "keyword", + }, + "full_name": { + "fields": { + "analyzed": { + "analyzer": "ngram_analyzer", + "search_analyzer": "search_analyzer", + "term_vector": "with_positions_offsets", + "type": "text", + } + }, + "type": "keyword", + }, + "short_name": { + "fields": { + "analyzed": { + "analyzer": "ngram_analyzer", + "search_analyzer": "search_analyzer", + "term_vector": "with_positions_offsets", + "type": "text", + } + }, + "type": "keyword", + }, + "study_description": { + "fields": { + "analyzed": { + "analyzer": "ngram_analyzer", + "search_analyzer": "search_analyzer", + "term_vector": "with_positions_offsets", + "type": "text", + } + }, + "type": "keyword", + }, + "study_id": { + "fields": { + "analyzed": { + "analyzer": "ngram_analyzer", + "search_analyzer": "search_analyzer", + "term_vector": "with_positions_offsets", + "type": "text", + } + }, + "type": "keyword", + }, + "study_url": { + "fields": { + "analyzed": { + "analyzer": "ngram_analyzer", + "search_analyzer": "search_analyzer", + "term_vector": "with_positions_offsets", + "type": "text", + } + }, + "type": "keyword", + }, + "tags": {"type": "nested"}, + "year_awarded": { + "type": "long", + }, + }, + "type": "nested", + } + assert converted == expected + + 
converted = schema.to_schema(False, True) + + expected = { + "type": "object", + "properties": { + "_subjects_count": {"type": "integer", "description": ""}, + "year_awarded": {"type": "integer", "description": "", "default": 2000}, + "__manifest": { + "type": "array", + "properties": { + "file_name": {"type": "string", "description": ""}, + "file_size": {"type": "integer", "description": ""}, + }, + "description": "", + }, + "tags": {"type": "array", "description": ""}, + "study_description": {"type": "string", "description": ""}, + "short_name": {"type": "string", "description": ""}, + "full_name": {"type": "string", "description": ""}, + "_unique_id": {"type": "string", "description": ""}, + "study_id": {"type": "string", "description": ""}, + "study_url": {"type": "string", "description": ""}, + "commons_url": {"type": "string", "description": ""}, + "authz": {"type": "string", "description": ""}, + }, + "description": "", + } + + assert converted == expected + + def test_parse_config(): results = parse_config( """ @@ -122,3 +298,64 @@ def test_parse_config(): ) assert expected == results + + +def test_normalization(): + val = FieldDefinition(type="integer") + assert val.normalize_value("100") == 100 + assert val.normalize_value("bear") is None + + val = FieldDefinition(type="number") + assert val.normalize_value("1.23") == 1.23 + assert val.normalize_value("bear") is None + + val = FieldDefinition(type="array") + assert val.normalize_value("1.23") == ["1.23"] + assert val.normalize_value({"foo": "bar"}) == [{"foo": "bar"}] + + val = FieldDefinition(type="array") + assert val.normalize_value(None) is None + + val = FieldDefinition(type="array") + assert val.normalize_value("") == [] + + val = FieldDefinition(type="object") + assert val.normalize_value('{"foo" : "bar"}') == {"foo": "bar"} + + val = FieldDefinition(type="object") + assert val.normalize_value("bear") is None + + val = FieldDefinition(type="string") + assert val.normalize_value("hello") == "hello" 
+ + val = FieldDefinition(type="bar") + assert val.normalize_value("hello") == "hello" + + val = FieldDefinition(type="string") + val.has_default_value() is False + + val = FieldDefinition(type="string", default="hi") + val.has_default_value() == "hi" + + +def test_mds_instance(): + val = MDSInstance( + mds_url="https://test", + commons_url="http:/commons.io", + ) + assert val.columns_to_fields is None + + val = MDSInstance( + mds_url="https://test", + commons_url="http:/commons.io", + columns_to_fields={ + "val1": "path:root", + "value2": {"name": "value1", "default": "bear"}, + }, + ) + + assert val.columns_to_fields is not None + + val = ColumnsToFields(name="test", default="bear") + assert val.get_value({"test": "fox"}) == "fox" + assert val.get_value({"b": "s"}) == "bear" diff --git a/tests/test_agg_mds_elasticsearch_dao.py b/tests/test_agg_mds_elasticsearch_dao.py index c4a51de0..fb5abe02 100644 --- a/tests/test_agg_mds_elasticsearch_dao.py +++ b/tests/test_agg_mds_elasticsearch_dao.py @@ -9,7 +9,7 @@ CONFIG, SEARCH_CONFIG, ) -from elasticsearch import Elasticsearch, exceptions as es_exceptions +from elasticsearch import exceptions as es_exceptions from mds.config import ES_RETRY_LIMIT, ES_RETRY_INTERVAL COMMON_MAPPING = { @@ -97,7 +97,6 @@ async def test_create_if_exists(): @pytest.mark.asyncio async def test_create_index_raise_exception(): - with patch( "mds.agg_mds.datastore.elasticsearch_dao.elastic_search_client.indices.create", MagicMock(side_effect=es_exceptions.RequestError(403, "expect_to_fail")), @@ -265,7 +264,7 @@ async def test_get_commons_attribute(): with patch( "mds.agg_mds.datastore.elasticsearch_dao.elastic_search_client", MagicMock() ) as mock_client: - await elasticsearch_dao.get_commons_attribute("my-commons", "attribute") + await elasticsearch_dao.get_commons_attribute("my-commons") mock_client.search.assert_called_with( index="default_namespace-commons-info-index", body={"query": {"terms": {"_id": ["my-commons"]}}}, @@ -275,10 +274,7 @@ 
async def test_get_commons_attribute(): "mds.agg_mds.datastore.elasticsearch_dao.elastic_search_client.search", MagicMock(side_effect=Exception("some error")), ) as mock_search: - assert ( - await elasticsearch_dao.get_commons_attribute("my-commons", "attribute") - == None - ) + assert await elasticsearch_dao.get_commons_attribute("my-commons") is None @pytest.mark.asyncio diff --git a/tests/test_agg_mds_query.py b/tests/test_agg_mds_query.py index d6cd361d..e17ef955 100644 --- a/tests/test_agg_mds_query.py +++ b/tests/test_agg_mds_query.py @@ -72,7 +72,7 @@ async def test_aggregate_metadata_paged(client): resp = client.get("/aggregate/metadata_paged") assert resp.status_code == 200 assert resp.json() == {"results": []} - datastore.get_all_metadata.assert_called_with(20, 0, counts=None, flatten=True) + datastore.get_all_metadata.assert_called_with(20, 0, None, True) mock_data = { "results": [ @@ -88,7 +88,94 @@ async def test_aggregate_metadata_paged(client): resp = client.get("/aggregate/metadata_paged") assert resp.status_code == 200 assert resp.json() == mock_data - datastore.get_all_metadata.assert_called_with(20, 0, counts=None, flatten=True) + datastore.get_all_metadata.assert_called_with(20, 0, None, True) + + +@pytest.mark.asyncio +async def test_aggregate_metadata_paged_flat(client): + mock_data = { + "took": 3, + "timed_out": "false", + "_shards": {"total": 1, "successful": 1, "skipped": 0, "failed": 0}, + "hits": { + "total": 161, + "max_score": 1.0, + "hits": [ + { + "_index": "default_namespace-commons-index", + "_type": "commons", + "_id": "815616c0-dfsdfjjj", + "_score": 1.0, + "_source": { + "link": "", + "tags": [ + {"name": "restricted", "category": "Access"}, + {"name": "genomic", "category": "category"}, + ], + "commons": "LI", + "_unique_id": "815616c0-c4a4-4883-9107-a05694499a36", + "dataset_code": "LI", + "brief_summary": "Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore magna 
aliqua.", + "dataset_title": "Lorem ipsum dolor sit amet", + "samples_count": "", + "subjects_count": "", + "data_files_count": 11062, + "_subjects_count": "", + "study_description": "Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ultricies tristique nulla aliquet enim tortor at auctor.", + "short_name": "Lorem ipsum dolor sit amet", + "full_name": "Lorem ipsum dolor sit amet, consectetur adipiscing elit", + "commons_name": "Lorem ipsum", + "__manifest": [ + {"filename": "foo2.txt"}, + {"filename": "foo3.txt"}, + ], + }, + } + ], + }, + } + + results = { + "pagination": {"hits": 161, "offset": 0, "pageSize": 20, "pages": 9}, + "results": [ + { + "815616c0-dfsdfjjj": { + "gen3_discovery": { + "link": "", + "tags": [ + {"name": "restricted", "category": "Access"}, + {"name": "genomic", "category": "category"}, + ], + "commons": "LI", + "_unique_id": "815616c0-c4a4-4883-9107-a05694499a36", + "dataset_code": "LI", + "brief_summary": "Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua.", + "dataset_title": "Lorem ipsum dolor sit amet", + "samples_count": "", + "subjects_count": "", + "data_files_count": 11062, + "_subjects_count": "", + "study_description": "Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. 
Ultricies tristique nulla aliquet enim tortor at auctor.", + "short_name": "Lorem ipsum dolor sit amet", + "full_name": "Lorem ipsum dolor sit amet, consectetur adipiscing elit", + "commons_name": "Lorem ipsum", + "__manifest": [ + {"filename": "foo2.txt"}, + {"filename": "foo3.txt"}, + ], + } + } + } + ], + } + + with patch( + "mds.agg_mds.datastore.elasticsearch_dao.elastic_search_client.search", + MagicMock(return_value=mock_data), + ) as search: + resp = client.get("/aggregate/metadata_paged?flatten=true") + assert resp.status_code == 200 + assert resp.json() == results @pytest.mark.asyncio @@ -174,6 +261,86 @@ async def test_aggregate_metadata_counts(client): assert resp.json() == results +@pytest.mark.asyncio +async def test_aggregate_metadata_counts_null(client): + mock_data = { + "took": 3, + "timed_out": "false", + "_shards": {"total": 1, "successful": 1, "skipped": 0, "failed": 0}, + "hits": { + "total": 161, + "max_score": 1.0, + "hits": [ + { + "_index": "default_namespace-commons-index", + "_type": "commons", + "_id": "815616c0-dfsdfjjj", + "_score": 1.0, + "_source": { + "link": "", + "tags": [ + {"name": "restricted", "category": "Access"}, + {"name": "genomic", "category": "category"}, + ], + "commons": "LI", + "_unique_id": "815616c0-c4a4-4883-9107-a05694499a36", + "dataset_code": "LI", + "brief_summary": "Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua.", + "dataset_title": "Lorem ipsum dolor sit amet", + "samples_count": "", + "subjects_count": "", + "data_files_count": 11062, + "_subjects_count": "", + "study_description": "Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. 
Ultricies tristique nulla aliquet enim tortor at auctor.", + "short_name": "Lorem ipsum dolor sit amet", + "full_name": "Lorem ipsum dolor sit amet, consectetur adipiscing elit", + "commons_name": "Lorem ipsum", + "__manifest": None, + }, + } + ], + }, + } + + results = { + "Lorem ipsum": [ + { + "815616c0-dfsdfjjj": { + "gen3_discovery": { + "link": "", + "tags": [ + {"name": "restricted", "category": "Access"}, + {"name": "genomic", "category": "category"}, + ], + "commons": "LI", + "_unique_id": "815616c0-c4a4-4883-9107-a05694499a36", + "dataset_code": "LI", + "brief_summary": "Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua.", + "dataset_title": "Lorem ipsum dolor sit amet", + "samples_count": "", + "subjects_count": "", + "data_files_count": 11062, + "_subjects_count": "", + "study_description": "Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. 
Ultricies tristique nulla aliquet enim tortor at auctor.", + "short_name": "Lorem ipsum dolor sit amet", + "full_name": "Lorem ipsum dolor sit amet, consectetur adipiscing elit", + "commons_name": "Lorem ipsum", + "__manifest": 0, + } + } + } + ] + } + + with patch( + "mds.agg_mds.datastore.elasticsearch_dao.elastic_search_client.search", + MagicMock(return_value=mock_data), + ) as search: + resp = client.get("/aggregate/metadata?counts=__manifest") + assert resp.status_code == 200 + assert resp.json() == results + + @pytest.mark.asyncio async def test_aggregate_metadata_name(client): with patch.object( @@ -251,7 +418,7 @@ async def test_aggregate_metadata_info(client): "message": "no common exists with the given: commons1", } } - datastore.get_commons_attribute.assert_called_with("commons1", "info") + datastore.get_commons_attribute.assert_called_with("commons1") with patch.object( datastore, @@ -261,7 +428,7 @@ async def test_aggregate_metadata_info(client): resp = client.get("/aggregate/metadata/commons1/info") assert resp.status_code == 200 assert resp.json() == {"commons_url": "http://commons"} - datastore.get_commons_attribute.assert_called_with("commons1", "info") + datastore.get_commons_attribute.assert_called_with("commons1") @pytest.mark.asyncio @@ -303,8 +470,23 @@ async def test_aggregate_metadata_get_schema(client): "year_awarded": {"type": "integer", "description": ""}, } ), - ) as datastore_mock: + ): resp = client.get("/aggregate/info/schema") assert resp.status_code == 200 assert resp.json() == schema - datastore.get_commons_attribute.assert_called_with("schema", "") + datastore.get_commons_attribute.assert_called_with("schema") + + with patch.object( + datastore, + "get_commons_attribute", + AsyncMock(return_value=None), + ) as datastore_mock: + # test for unknown info string + resp = client.get("/aggregate/info/nothing") + assert resp.status_code == 404 + assert resp.json() == { + "detail": { + "code": 404, + "message": "information for nothing 
not found", + } + } diff --git a/tests/test_populate.py b/tests/test_populate.py index 487f36e8..e50a7d8f 100644 --- a/tests/test_populate.py +++ b/tests/test_populate.py @@ -1,4 +1,6 @@ import pytest +import respx +import httpx from argparse import Namespace from mds.populate import ( parse_config_from_file, @@ -88,8 +90,9 @@ async def test_populate_metadata(): ) +@respx.mock @pytest.mark.asyncio -async def test_main(): +async def test_populate_main(): with patch("mds.config.USE_AGG_MDS", False): with pytest.raises(SystemExit) as pytest_wrapped_e: await main(commons_config=None) @@ -97,7 +100,6 @@ async def test_main(): assert pytest_wrapped_e.value.code == 1 patch("mds.config.USE_AGG_MDS", True).start() - patch("mds.populate.pull_mds", MagicMock()).start() patch.object(datastore, "init", AsyncMock()).start() patch.object(datastore, "drop_all", AsyncMock()).start() patch.object(datastore, "create_indexes", AsyncMock()).start() @@ -108,6 +110,33 @@ async def test_main(): patch.object(datastore, "update_metadata", AsyncMock()).start() patch.object(adapters, "get_metadata", MagicMock()).start() + json_response = { + "GSE63878": { + "_guid_type": "discovery_metadata", + "gen3_discovery": { + "link": "https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE63878", + "tags": [{"name": "Array", "category": "Data Type"}], + "source": "Ichan School of Medicine at Mount Sinai", + "funding": "", + "study_description_summary": "The molecular factors involved in the development of Post-traumatic Stress Disorder (PTSD) remain poorly understood. Previous transcriptomic studies investigating the mechanisms of PTSD apply targeted approaches to identify individual genes under a cross-sectional framework lack a holistic view of the behaviours and properties of these genes at the system-level. Here we sought to apply an unsupervised gene-network-based approach to a prospective experimental design using whole-transcriptome RNA-Seq gene expression from peripheral blood leukocytes of U.S. 
Marines (N=188), obtained both pre- and post-deployment to conflict zones. We identified discrete groups of co-regulated genes (i.e., co-expression modules) and tested them for association to PTSD. We identified one module at both pre- and post-deployment containing putative causal signatures for PTSD development displaying an over-expression of genes enriched for functions of innate-immune response and interferon signalling (Type-I and Type-II). Importantly, these results were replicated in a second non-overlapping independent dataset of U.S. Marines (N=96), further outlining the role of innate immune and interferon signalling genes within co-expression modules to explain at least part of the causal pathophysiology for PTSD development. A second module, consequential of trauma exposure, contained PTSD resiliency signatures and an over-expression of genes involved in hemostasis and wound responsiveness suggesting that chronic levels of stress impair proper wound healing during/after exposure to the battlefield while highlighting the role of the hemostatic system as a clinical indicator of chronic-based stress. 
These findings provide novel insights for early preventative measures and advanced PTSD detection, which may lead to interventions that delay or perhaps abrogate the development of PTSD.\nWe used microarrays to characterize both prognostic and diagnostic molecular signatures associated to PTSD risk and PTSD status compared to control subjects.", + "study_title": "Gene Networks Specific for Innate Immunity Define Post-traumatic Stress Disorder [Affymetrix]", + "subjects_count": 48, + "accession_number": "GSE63878", + "data_files_count": 0, + "contributor": "me.foo@smartsite.com", + }, + } + } + + respx.get( + "http://test/ok//mds/metadata?data=True&_guid_type=discovery_metadata&limit=1000&offset=0" + ).mock( + return_value=httpx.Response( + status_code=200, + json=json_response, + ) + ) + await main( Commons( configuration=Config( @@ -118,9 +147,17 @@ async def test_main(): ), gen3_commons={ "my_commons": MDSInstance( - mds_url="", - commons_url="", - columns_to_fields={}, + mds_url="http://test/ok/", + commons_url="test", + columns_to_fields={ + "authz": "path:authz", + "tags": "path:tags", + "_subjects_count": "path:subjects_count", + "dbgap_accession_number": "path:study_id", + "study_description": "path:study_description_summary", + "number_of_datafiles": "path:data_files_count", + "investigator": "path:contributor", + }, ), }, adapter_commons={ @@ -235,3 +272,10 @@ def test_parse_config_from_file(): }, ).to_json() ) + + assert parse_config_from_file(Path("dummmy_files")) is None + + try: + parse_config_from_file(Path("/")) + except Exception as exc: + assert isinstance(exc, IOError) is True From c221a42f7d1b0a794e959a4414e0c4c9614773de Mon Sep 17 00:00:00 2001 From: Craig Barnes Date: Thu, 7 Jul 2022 14:28:11 -0500 Subject: [PATCH 20/70] add array_to_string converter --- src/mds/agg_mds/commons.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/src/mds/agg_mds/commons.py b/src/mds/agg_mds/commons.py index 978bfbd7..b4506ca6 100644 --- 
a/src/mds/agg_mds/commons.py +++ b/src/mds/agg_mds/commons.py @@ -45,6 +45,12 @@ def string_to_array(s: str) -> Optional[List[str]]: return [s] +def array_to_string(arr: Optional[list]) -> Optional[str]: + if arr is None: + return None + return "".join(arr) + + def string_to_integer(s: str) -> int: return int(s) if s.isnumeric() else None @@ -97,6 +103,7 @@ class FieldDefinition: "string_to_object": string_to_dict, "object_to_array": dict_to_array, "string_to_array": string_to_array, + "array_to_string": array_to_string, } MAP_TYPE_TO_JSON_SCHEMA_TYPES = { From 8ada9878c139d2e14064bd5494ecbe72050375fa Mon Sep 17 00:00:00 2001 From: Craig Barnes Date: Thu, 7 Jul 2022 16:59:35 -0500 Subject: [PATCH 21/70] add array_to_string converter unit test --- src/mds/populate.py | 2 +- tests/test_agg_mds_adapters.py | 14 +++++++++++++- tests/test_agg_mds_commons.py | 4 ++++ 3 files changed, 18 insertions(+), 2 deletions(-) diff --git a/src/mds/populate.py b/src/mds/populate.py index 080bd587..4b01aa2b 100644 --- a/src/mds/populate.py +++ b/src/mds/populate.py @@ -219,7 +219,7 @@ async def filter_entries( "select_field": { "field_name" : "commons" , "field_value" : "Proteomic Data Commons" - } + }:q where only the records with the commons field === "Proteomic Data Commons" are added. 
Note the function assumes the field exists in all of the entries in the mds_arr parameter """ diff --git a/tests/test_agg_mds_adapters.py b/tests/test_agg_mds_adapters.py index a1169611..faf3285d 100644 --- a/tests/test_agg_mds_adapters.py +++ b/tests/test_agg_mds_adapters.py @@ -1,11 +1,23 @@ from more_itertools import side_effect import respx import json -from mds.agg_mds.adapters import get_metadata, get_json_path_value +from mds.agg_mds.adapters import ( + get_metadata, + get_json_path_value, + strip_email, + strip_html, + add_icpsr_source_url, +) from tenacity import RetryError, wait_none import httpx +def test_filters_with_bad_entries(): + assert strip_email(100) == 100 + assert strip_html(99) == 99 + assert add_icpsr_source_url(77) == 77 + + @respx.mock def test_get_metadata_icpsr(): xml_response = """ diff --git a/tests/test_agg_mds_commons.py b/tests/test_agg_mds_commons.py index 18bec43e..18d55824 100644 --- a/tests/test_agg_mds_commons.py +++ b/tests/test_agg_mds_commons.py @@ -337,6 +337,10 @@ def test_normalization(): val = FieldDefinition(type="string", default="hi") val.has_default_value() == "hi" + val = FieldDefinition(type="string") + assert val.normalize_value(["hello", "how", "are", "you"]) == "hellohowareyou" + assert val.normalize_value(None) is None + def test_mds_instance(): val = MDSInstance( From ba2c533b3a6023f47adea947853313284033148f Mon Sep 17 00:00:00 2001 From: Craig Barnes Date: Fri, 8 Jul 2022 15:41:32 -0500 Subject: [PATCH 22/70] add filter option to gen3 adapter --- src/mds/agg_mds/adapters.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/mds/agg_mds/adapters.py b/src/mds/agg_mds/adapters.py index 36bdd874..b3d47181 100644 --- a/src/mds/agg_mds/adapters.py +++ b/src/mds/agg_mds/adapters.py @@ -790,6 +790,7 @@ def getRemoteDataAsJson(self, **kwargs) -> Dict: guid_type = config.get("guid_type", "discovery_metadata") field_name = config.get("field_name", None) field_value = config.get("field_value", None) 
+ filters = config.get("filters", None) batchSize = config.get("batchSize", 1000) maxItems = config.get("maxItems", None) @@ -797,9 +798,10 @@ def getRemoteDataAsJson(self, **kwargs) -> Dict: limit = min(maxItems, batchSize) if maxItems is not None else batchSize moreData = True while moreData: - url = f"{mds_url}mds/metadata?data=True&_guid_type={guid_type}&limit={limit}&offset={offset}" try: url = f"{mds_url}mds/metadata?data=True&_guid_type={guid_type}&limit={limit}&offset={offset}" + if filters: + url += f"&{filters}" if field_name is not None and field_value is not None: url += f"&{guid_type}.{field_name}={field_value}" response = httpx.get(url) From 2b40d9f898af71da240341e3b754cf321d99603e Mon Sep 17 00:00:00 2001 From: Craig Barnes Date: Fri, 15 Jul 2022 17:05:39 -0500 Subject: [PATCH 23/70] fix drs caching error, update unit test --- src/mds/agg_mds/commons.py | 2 +- src/mds/populate.py | 4 ---- tests/test_agg_mds_commons.py | 4 +++- tests/test_populate.py | 9 ++++++--- 4 files changed, 10 insertions(+), 9 deletions(-) diff --git a/src/mds/agg_mds/commons.py b/src/mds/agg_mds/commons.py index b4506ca6..76053d56 100644 --- a/src/mds/agg_mds/commons.py +++ b/src/mds/agg_mds/commons.py @@ -229,7 +229,7 @@ class Settings: @dataclass_json @dataclass class Config: - settings: Optional[Dict[str, Settings]] = field(default_factory=dict) + settings: Optional[Settings] = field(default_factory=dict) schema: Optional[Dict[str, FieldDefinition]] = field(default_factory=dict) aggregations: Optional[Dict[str, FieldAggregation]] = field(default_factory=dict) search_settings: Optional[Dict[str, FieldAggregation]] = field(default_factory=dict) diff --git a/src/mds/populate.py b/src/mds/populate.py index 4b01aa2b..7e2bef4b 100644 --- a/src/mds/populate.py +++ b/src/mds/populate.py @@ -109,10 +109,6 @@ async def populate_info(commons_config: Commons) -> None: async def populate_drs_info(commons_config: Commons) -> None: - if len(commons_config.configuration.settings) == 0: - 
return - if len(commons_config.configuration.settings.cache_dir): - return if commons_config.configuration.settings.cache_drs: server = commons_config.configuration.settings.drs_indexd_server if server is not None: diff --git a/tests/test_agg_mds_commons.py b/tests/test_agg_mds_commons.py index 18d55824..807d2d7d 100644 --- a/tests/test_agg_mds_commons.py +++ b/tests/test_agg_mds_commons.py @@ -6,6 +6,7 @@ parse_config, Commons, Config, + Settings, ColumnsToFields, FieldDefinition, MDSInstance, @@ -253,6 +254,7 @@ def test_parse_config(): ) expected = Commons( configuration=Config( + settings=Settings(), schema={ "_subjects_count": FieldDefinition(type="integer"), "year_awarded": FieldDefinition(type="integer"), @@ -272,7 +274,7 @@ def test_parse_config(): "study_url": FieldDefinition(type="string"), "commons_url": FieldDefinition(type="string"), "authz": FieldDefinition(type="string"), - } + }, ), gen3_commons={ "my_gen3_commons": MDSInstance( diff --git a/tests/test_populate.py b/tests/test_populate.py index e50a7d8f..b28c0952 100644 --- a/tests/test_populate.py +++ b/tests/test_populate.py @@ -13,6 +13,7 @@ AdapterMDSInstance, MDSInstance, Commons, + Settings, FieldDefinition, Config, ) @@ -140,10 +141,11 @@ async def test_populate_main(): await main( Commons( configuration=Config( + settings=Settings(), schema={ "_subjects_count": FieldDefinition(type="integer"), "year_awarded": FieldDefinition(type="integer"), - } + }, ), gen3_commons={ "my_commons": MDSInstance( @@ -213,7 +215,7 @@ def test_parse_config_from_file(): "schema": { "_subjects_count": {"type": "integer"}, "study_description": {}, - } + }, }, "gen3_commons": { "mycommons": { @@ -244,10 +246,11 @@ def test_parse_config_from_file(): config.to_json() == Commons( configuration=Config( + settings=Settings(), schema={ "_subjects_count": FieldDefinition(type="integer"), "study_description": FieldDefinition(type="string"), - } + }, ), gen3_commons={ "mycommons": MDSInstance( From 
7c3a2b0cbe7baed8f209b1a1f02bee59107a4f00 Mon Sep 17 00:00:00 2001 From: Craig Barnes Date: Fri, 15 Jul 2022 17:22:55 -0500 Subject: [PATCH 24/70] update tests --- tests/test_agg_mds_adapters.py | 48 ++++++++++++++++------------------ 1 file changed, 23 insertions(+), 25 deletions(-) diff --git a/tests/test_agg_mds_adapters.py b/tests/test_agg_mds_adapters.py index faf3285d..bd61c0f8 100644 --- a/tests/test_agg_mds_adapters.py +++ b/tests/test_agg_mds_adapters.py @@ -1346,12 +1346,7 @@ def test_get_metadata_clinicaltrials(): respx.get( "http://test/ok?expr=should+error+bad+field&fmt=json&min_rnk=1&max_rnk=1" - ).mock( - return_value=httpx.Response( - status_code=200, - content=json.loads(json_response3), - ) - ) + ).mock(return_value=httpx.Response(status_code=200, content=json_response3)) assert ( get_metadata( @@ -4147,15 +4142,10 @@ def test_get_metadata_harvard_dataverse(): # failed calls respx.get( "http://test/ok/datasets/:persistentId/?persistentId=doi:10.7910/DVN/5B8YM8" - ).mock( - return_value=httpx.Response( - status_code=200, - json=json.loads(dataset_json_response), - ) - ) + ).mock(return_value=httpx.Response(status_code=200, content=dataset_json_response)) respx.get("http://test/ok/access/datafile/6297263/metadata/ddi").mock( - return_value=httpx.Response(status_code=200, json=file_ddi_response) + return_value=httpx.Response(status_code=200, content=file_ddi_response) ) assert get_metadata("havard_dataverse", "http://test/ok", filters=None) == {} @@ -4205,15 +4195,11 @@ def test_get_metadata_harvard_dataverse(): # valid single variable call respx.get( "http://test/single_variable/datasets/:persistentId/?persistentId=doi:10.7910/DVN/5B8YM8" - ).mock( - return_value=httpx.Response( - status_code=200, json=json.loads(dataset_json_response) - ) - ) + ).mock(return_value=httpx.Response(status_code=200, content=dataset_json_response)) respx.get("http://test/single_variable/access/datafile/6297263/metadata/ddi").mock( return_value=httpx.Response( - 
status_code=200, text=file_single_variable_ddi_response + status_code=200, content=file_single_variable_ddi_response ) ) @@ -4230,12 +4216,7 @@ def test_get_metadata_harvard_dataverse(): # invalid responses respx.get( "http://test/invalid_dataset_response/datasets/:persistentId/?persistentId=doi:10.7910/DVN/5B8YM8" - ).mock( - return_value=httpx.Response( - status_code=200, - json={"status": "ok"}, - ) - ) + ).mock(return_value=httpx.Response(status_code=200, json={"status": "ok"})) assert ( get_metadata( @@ -4262,6 +4243,23 @@ def test_get_metadata_harvard_dataverse(): ) # Incorrect keys expected in adapter class + respx.get( + "http://test/different_keys/datasets/:persistentId/?persistentId=doi:10.7910/DVN/5B8YM8" + ).mock( + return_value=httpx.Response( + status_code=200, json=dataset_json_different_keys_response + ) + ) + + assert ( + get_metadata( + "harvard_dataverse", + "http://test/different_keys", + filters={"persistent_ids": ["doi:10.7910/DVN/5B8YM8"]}, + mappings=field_mappings, + ) + == {} + ) try: from mds.agg_mds.adapters import HarvardDataverse From 97cc562b356409dc176122ec53c32811eae5f6bb Mon Sep 17 00:00:00 2001 From: Craig Barnes Date: Tue, 26 Jul 2022 12:08:12 -0500 Subject: [PATCH 25/70] prevent missing study field from throwing exception --- src/mds/agg_mds/adapters.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/mds/agg_mds/adapters.py b/src/mds/agg_mds/adapters.py index b3d47181..63ea1d22 100644 --- a/src/mds/agg_mds/adapters.py +++ b/src/mds/agg_mds/adapters.py @@ -873,6 +873,9 @@ def normalizeToGen3MDSFields(self, data, **kwargs) -> Dict[str, Any]: results = {} for guid, record in data["results"].items(): + if study_field not in record: + logger.error(f"Study field not in record. 
Skipping") + continue item = Gen3Adapter.addGen3ExpectedFields( record[study_field], mappings, From 0d185cd06095cdbac9451e19a2cfe24a0b67ed39 Mon Sep 17 00:00:00 2001 From: Craig Barnes Date: Fri, 29 Jul 2022 08:30:58 -0500 Subject: [PATCH 26/70] update README with aggMDS development instructions --- README.md | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/README.md b/README.md index cd55e155..40071f1b 100644 --- a/README.md +++ b/README.md @@ -101,6 +101,18 @@ Run tests: ```bash docker-compose exec app pytest --cov=src --cov=migrations/versions tests ``` +### Aggregate MDS +For local development on macOS, install Elasticsearch and ensure it is running. + +Test the populate script: +```bash +python src/mds/populate.py --config <config file> --hostname localhost --port 9200 +``` +View the loaded data: +```bash +http://localhost:8000/aggregate/metadata?limit=1000 +``` + ## Deployment From 71d70aeafa93a5cacb8d1d4bc9920c7c3effdfef Mon Sep 17 00:00:00 2001 From: Craig Barnes Date: Tue, 2 Aug 2022 23:52:16 -0500 Subject: [PATCH 27/70] change sh to bash to wait for esproxy --- docker-compose.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docker-compose.yml b/docker-compose.yml index 1256f94b..c721ed22 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -36,7 +36,7 @@ services: environment: - USE_AGG_MDS=true - GEN3_ES_ENDPOINT=http://esproxy-service:9200 - command: sh -c 'while [[ "$$(curl --connect-timeout 2 -s -o /dev/null -w ''%{http_code}'' $$GEN3_ES_ENDPOINT)" != "200" ]]; do echo "wait for " $$GEN3_ES_ENDPOINT; sleep 5; done; echo es backend is available;/env/bin/python /src/src/mds/populate.py --config /src/tests/config.json' + command: bash -c 'while [[ "$$(curl --connect-timeout 2 -s -o /dev/null -w ''%{http_code}'' $$GEN3_ES_ENDPOINT)" != "200" ]]; do echo "wait for " $$GEN3_ES_ENDPOINT; sleep 5; done; echo es backend is available;/env/bin/python /src/src/mds/populate.py --config /src/tests/config.json' db: image: postgres environment: From 
0577bc9346be2511a7becd3d79e1ceca2a986983 Mon Sep 17 00:00:00 2001 From: tianj7 Date: Tue, 9 Aug 2022 09:52:48 -0500 Subject: [PATCH 28/70] Change aggregate metadata population script. Retain original index in case of failure. --- src/mds/agg_mds/datastore/__init__.py | 24 +++ .../agg_mds/datastore/elasticsearch_dao.py | 109 ++++++++++++- src/mds/populate.py | 137 +++++++++++----- tests/test_populate.py | 150 ++++++++++++++++++ 4 files changed, 380 insertions(+), 40 deletions(-) diff --git a/src/mds/agg_mds/datastore/__init__.py b/src/mds/agg_mds/datastore/__init__.py index 898f03a3..b7a77c88 100644 --- a/src/mds/agg_mds/datastore/__init__.py +++ b/src/mds/agg_mds/datastore/__init__.py @@ -17,10 +17,22 @@ async def drop_all(): await client.drop_all() +async def drop_all_temp_indexes(): + await client.drop_all_temp_indexes() + + async def create_indexes(commons_mapping): await client.create_indexes(commons_mapping) +async def create_temp_indexes(commons_mapping): + await client.create_temp_indexes(commons_mapping) + + +async def clone_temp_indexes_to_real_indexes(): + await client.clone_temp_indexes_to_real_indexes() + + async def close(): await client.close() @@ -36,14 +48,26 @@ async def update_metadata(*args): await client.update_metadata(*args) +async def update_metadata_to_temp_index(*args): + await client.update_metadata_to_temp_index(*args) + + async def update_global_info(*args): await client.update_global_info(*args) +async def update_global_info_to_temp_index(*args): + await client.update_global_info_to_temp_index(*args) + + async def update_config_info(*args): await client.update_config_info(*args) +async def update_config_info_to_temp_index(*args): + await client.update_config_info_to_temp_index(*args) + + async def get_commons_metadata(*args): return await client.get_commons_metadata(*args) diff --git a/src/mds/agg_mds/datastore/elasticsearch_dao.py b/src/mds/agg_mds/datastore/elasticsearch_dao.py index 6117f0bc..12ff8318 100644 --- 
a/src/mds/agg_mds/datastore/elasticsearch_dao.py +++ b/src/mds/agg_mds/datastore/elasticsearch_dao.py @@ -1,4 +1,4 @@ -from elasticsearch import Elasticsearch, exceptions as es_exceptions +from elasticsearch import Elasticsearch, exceptions as es_exceptions, helpers from typing import List, Dict, Optional, Tuple from math import ceil from mds import logger @@ -6,12 +6,15 @@ AGG_MDS_INDEX = f"{AGG_MDS_NAMESPACE}-commons-index" AGG_MDS_TYPE = "commons" +AGG_MDS_INDEX_TEMP = f"{AGG_MDS_NAMESPACE}-commons-index-temp" AGG_MDS_INFO_INDEX = f"{AGG_MDS_NAMESPACE}-commons-info-index" AGG_MDS_INFO_TYPE = "commons-info" +AGG_MDS_INFO_INDEX_TEMP = f"{AGG_MDS_NAMESPACE}-commons-info-index-temp" AGG_MDS_CONFIG_INDEX = f"{AGG_MDS_NAMESPACE}-commons-config-index" AGG_MDS_CONFIG_TYPE = "commons-config" +AGG_MDS_CONFIG_INDEX_TEMP = f"{AGG_MDS_NAMESPACE}-commons-config-index-temp" # Setting Commons Info ES index to only store documents # will not be searching on it @@ -80,6 +83,27 @@ async def drop_all(): logger.debug(f"deleted index: {index}: {res}") +async def drop_all_temp_indexes(): + for index in [ + AGG_MDS_INDEX_TEMP, + AGG_MDS_INFO_INDEX_TEMP, + AGG_MDS_CONFIG_INDEX_TEMP, + ]: + res = elastic_search_client.indices.delete(index=index, ignore=[400, 404]) + logger.debug(f"deleted index: {index}: {res}") + + +async def clone_temp_indexes_to_real_indexes(): + for index in [AGG_MDS_INDEX, AGG_MDS_INFO_INDEX, AGG_MDS_CONFIG_INDEX]: + source_index = index + "-temp" + reqBody = {"source": {"index": source_index}, "dest": {"index": index}} + logger.debug(f"Cloning index: {source_index} to {index}...") + res = Elasticsearch.reindex(elastic_search_client, reqBody) + # Elasticsearch >7.4 introduces the clone api we could use later on + # res = elastic_search_client.indices.clone(index=source_index, target=index) + logger.debug(f"Cloned index: {source_index} to {index}: {res}") + + async def create_indexes(common_mapping: dict): try: mapping = {**SEARCH_CONFIG, **common_mapping} @@ -118,6 
+142,46 @@ async def create_indexes(common_mapping: dict): raise ex +async def create_temp_indexes(common_mapping: dict): + try: + mapping = {**SEARCH_CONFIG, **common_mapping} + res = elastic_search_client.indices.create( + index=AGG_MDS_INDEX_TEMP, body=mapping + ) + logger.debug(f"created index {AGG_MDS_INDEX_TEMP}: {res}") + except es_exceptions.RequestError as ex: + if ex.error == "resource_already_exists_exception": + logger.warning(f"index already exists: {AGG_MDS_INDEX_TEMP}") + pass # Index already exists. Ignore. + else: # Other exception - raise it + raise ex + + try: + res = elastic_search_client.indices.create( + index=AGG_MDS_INFO_INDEX_TEMP, body=INFO_MAPPING + ) + logger.debug(f"created index {AGG_MDS_INFO_INDEX_TEMP}: {res}") + + except es_exceptions.RequestError as ex: + if ex.error == "resource_already_exists_exception": + logger.warning(f"index already exists: {AGG_MDS_INFO_INDEX_TEMP}") + pass # Index already exists. Ignore. + else: # Other exception - raise it + raise ex + + try: + res = elastic_search_client.indices.create( + index=AGG_MDS_CONFIG_INDEX_TEMP, body=CONFIG + ) + logger.debug(f"created index {AGG_MDS_CONFIG_INDEX_TEMP}: {res}") + except es_exceptions.RequestError as ex: + if ex.error == "resource_already_exists_exception": + logger.warning(f"index already exists: {AGG_MDS_CONFIG_INDEX_TEMP}") + pass # Index already exists. Ignore. 
+ else: # Other exception - raise it + raise ex + + async def update_metadata( name: str, data: List[Dict], @@ -146,12 +210,46 @@ async def update_metadata( print(ex) +async def update_metadata_to_temp_index( + name: str, + data: List[Dict], + guid_arr: List[str], + tags: Dict[str, List[str]], + info: Dict[str, str], + study_data_field: str, +): + elastic_search_client.index( + index=AGG_MDS_INFO_INDEX_TEMP, + doc_type=AGG_MDS_INFO_TYPE, + id=name, + body=info, + ) + + for doc in data: + key = list(doc.keys())[0] + # Flatten out this structure + doc = doc[key][study_data_field] + + try: + elastic_search_client.index( + index=AGG_MDS_INDEX_TEMP, doc_type=AGG_MDS_TYPE, id=key, body=doc + ) + except Exception as ex: + print(ex) + + async def update_global_info(key, doc) -> None: elastic_search_client.index( index=AGG_MDS_INFO_INDEX, doc_type=AGG_MDS_INFO_TYPE, id=key, body=doc ) +async def update_global_info_to_temp_index(key, doc) -> None: + elastic_search_client.index( + index=AGG_MDS_INFO_INDEX_TEMP, doc_type=AGG_MDS_INFO_TYPE, id=key, body=doc + ) + + async def update_config_info(doc) -> None: elastic_search_client.index( index=AGG_MDS_CONFIG_INDEX, @@ -161,6 +259,15 @@ async def update_config_info(doc) -> None: ) +async def update_config_info_to_temp_index(doc) -> None: + elastic_search_client.index( + index=AGG_MDS_CONFIG_INDEX_TEMP, + doc_type="_doc", + id=AGG_MDS_INDEX, + body=doc, + ) + + async def get_status(): if not elastic_search_client.ping(): raise ValueError("Connection failed") diff --git a/src/mds/populate.py b/src/mds/populate.py index 7e2bef4b..4f8c7f3e 100644 --- a/src/mds/populate.py +++ b/src/mds/populate.py @@ -27,7 +27,7 @@ def parse_args(argv: List[str]) -> Namespace: return known_args -async def populate_metadata(name: str, common, results): +async def populate_metadata(name: str, common, results, populate_to_temp_index=False): mds_arr = [{k: v} for k, v in results.items()] total_items = len(mds_arr) @@ -87,10 +87,14 @@ def normalize(entry: 
dict) -> Any: keys = list(results.keys()) info = {"commons_url": common.commons_url} - - await datastore.update_metadata( - name, mds_arr, keys, tags, info, common.study_data_field - ) + if not populate_to_temp_index: + await datastore.update_metadata( + name, mds_arr, keys, tags, info, common.study_data_field + ) + else: + await datastore.update_metadata_to_temp_index( + name, mds_arr, keys, tags, info, common.study_data_field + ) async def populate_info(commons_config: Commons) -> None: @@ -108,16 +112,38 @@ async def populate_info(commons_config: Commons) -> None: await populate_drs_info(commons_config) -async def populate_drs_info(commons_config: Commons) -> None: +async def populate_info_to_temp_index(commons_config: Commons) -> None: + agg_info = { + key: value.to_dict() for key, value in commons_config.aggregations.items() + } + await datastore.update_global_info_to_temp_index("aggregations", agg_info) + + if commons_config.configuration.schema: + json_schema = { + k: v.to_schema(all_fields=True) + for k, v in commons_config.configuration.schema.items() + } + await datastore.update_global_info_to_temp_index("schema", json_schema) + await populate_drs_info(commons_config, populate_to_temp_index=True) + + +async def populate_drs_info( + commons_config: Commons, populate_to_temp_index=False +) -> None: if commons_config.configuration.settings.cache_drs: server = commons_config.configuration.settings.drs_indexd_server if server is not None: drs_data = adapters.get_metadata("drs_indexd", server, None) for id, entry in drs_data.get("cache", {}).items(): - await datastore.update_global_info(id, entry) + if not populate_to_temp_index: + await datastore.update_global_info(id, entry) + else: + await datastore.update_global_info_to_temp_index(id, entry) -async def populate_config(commons_config: Commons) -> None: +async def populate_config( + commons_config: Commons, populate_to_temp_index=False +) -> None: array_definition = { "array": [ field @@ -125,7 +151,10 @@ 
async def populate_config(commons_config: Commons) -> None: if value.type == "array" ] } - await datastore.update_config_info(array_definition) + if not populate_to_temp_index: + await datastore.update_config_info(array_definition) + else: + await datastore.update_config_info_to_temp_index(array_definition) async def main(commons_config: Commons) -> None: @@ -169,37 +198,67 @@ async def main(commons_config: Commons) -> None: } } - await datastore.drop_all() # TODO: rename indexes to old - await datastore.create_indexes(commons_mapping=field_mapping) - - for name, common in commons_config.gen3_commons.items(): - logger.info(f"Populating {name} using Gen3 MDS connector") - results = pull_mds(common.mds_url, common.guid_type) - logger.info(f"Received {len(results)} from {name}") - if len(results) > 0: - await populate_metadata(name, common, results) - - for name, common in commons_config.adapter_commons.items(): - logger.info(f"Populating {name} using adapter: {common.adapter}") - results = adapters.get_metadata( - common.adapter, - common.mds_url, - common.filters, - common.config, - common.field_mappings, - common.per_item_values, - common.keep_original_fields, - common.global_field_filters, - schema=commons_config.configuration.schema, + await datastore.drop_all_temp_indexes() + await datastore.create_temp_indexes(commons_mapping=field_mapping) + + mdsCount = 0 + try: + for name, common in commons_config.gen3_commons.items(): + logger.info(f"Populating {name} using Gen3 MDS connector") + results = pull_mds(common.mds_url, common.guid_type) + logger.info(f"Received {len(results)} from {name}") + if len(results) > 0: + mdsCount += len(results) + await populate_metadata( + name, common, results, populate_to_temp_index=True + ) + + for name, common in commons_config.adapter_commons.items(): + logger.info(f"Populating {name} using adapter: {common.adapter}") + results = adapters.get_metadata( + common.adapter, + common.mds_url, + common.filters, + common.config, + 
common.field_mappings, + common.per_item_values, + common.keep_original_fields, + common.global_field_filters, + schema=commons_config.configuration.schema, + ) + logger.info(f"Received {len(results)} from {name}") + if len(results) > 0: + mdsCount += len(results) + await populate_metadata( + name, common, results, populate_to_temp_index=True + ) + + if mdsCount == 0: + raise ValueError("Could not obtain any metadata from any adapters.") + + # populate global information index + await populate_info_to_temp_index(commons_config) + # populate array index information to support guppy + await populate_config(commons_config, populate_to_temp_index=True) + + except Exception as ex: + logger.error( + "Error occurred during mds population. Existing indexes are left in place." ) - logger.info(f"Received {len(results)} from {name}") - if len(results) > 0: - await populate_metadata(name, common, results) - - # populate global information index - await populate_info(commons_config) - # populate array index information to support guppy - await populate_config(commons_config) + logger.error(ex) + raise ex + + logger.info(f"Temp indexes populated successfully. 
Proceeding to clone") + # All temp indexes created without error, drop current real index, clone temp to real index and then drop temp index + try: + await datastore.drop_all() # TODO: rename indexes to old + await datastore.create_indexes(commons_mapping=field_mapping) + await datastore.clone_temp_indexes_to_real_indexes() + await datastore.drop_all_temp_indexes() + except Exception as ex: + logger.error("Error occurred during cloning.") + logger.error(ex) + raise ex res = await datastore.get_status() print(res) diff --git a/tests/test_populate.py b/tests/test_populate.py index b28c0952..1df0e977 100644 --- a/tests/test_populate.py +++ b/tests/test_populate.py @@ -103,13 +103,19 @@ async def test_populate_main(): patch("mds.config.USE_AGG_MDS", True).start() patch.object(datastore, "init", AsyncMock()).start() patch.object(datastore, "drop_all", AsyncMock()).start() + patch.object(datastore, "drop_all_temp_indexes", AsyncMock()).start() patch.object(datastore, "create_indexes", AsyncMock()).start() + patch.object(datastore, "create_temp_indexes", AsyncMock()).start() patch.object(datastore, "update_config_info", AsyncMock()).start() + patch.object(datastore, "update_config_info_to_temp_index", AsyncMock()).start() patch.object(datastore, "get_status", AsyncMock(return_value="OK")).start() patch.object(datastore, "close", AsyncMock()).start() patch.object(datastore, "update_global_info", AsyncMock()).start() + patch.object(datastore, "update_global_info_to_temp_index", AsyncMock()).start() patch.object(datastore, "update_metadata", AsyncMock()).start() + patch.object(datastore, "update_metadata_to_temp_index", AsyncMock()).start() patch.object(adapters, "get_metadata", MagicMock()).start() + patch.object(datastore, "clone_temp_indexes_to_real_indexes", AsyncMock()).start() json_response = { "GSE63878": { @@ -173,6 +179,150 @@ async def test_populate_main(): ) +@respx.mock +@pytest.mark.asyncio +async def test_populate_main_fail(): + + 
patch("mds.config.USE_AGG_MDS", True).start() + patch.object(datastore, "init", AsyncMock()).start() + patch.object(datastore, "drop_all_temp_indexes", AsyncMock()).start() + patch.object(datastore, "create_indexes", AsyncMock()).start() + patch.object(datastore, "create_temp_indexes", AsyncMock()).start() + patch.object(datastore, "update_config_info_to_temp_index", AsyncMock()).start() + patch.object(datastore, "get_status", AsyncMock(return_value="OK")).start() + patch.object(datastore, "close", AsyncMock()).start() + patch.object(datastore, "update_global_info_to_temp_index", AsyncMock()).start() + patch.object(datastore, "update_metadata_to_temp_index", AsyncMock()).start() + patch.object(adapters, "get_metadata", MagicMock()).start() + patch.object(datastore, "clone_temp_indexes_to_real_indexes", AsyncMock()).start() + + existing_metadata = { + "GSE63878": { + "_guid_type": "discovery_metadata", + "gen3_discovery": { + "link": "https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE63878", + "tags": [{"name": "Array", "category": "Data Type"}], + "source": "Ichan School of Medicine at Mount Sinai", + "funding": "", + "study_description_summary": "The molecular factors involved in the development of Post-traumatic Stress Disorder (PTSD) remain poorly understood. Previous transcriptomic studies investigating the mechanisms of PTSD apply targeted approaches to identify individual genes under a cross-sectional framework lack a holistic view of the behaviours and properties of these genes at the system-level. Here we sought to apply an unsupervised gene-network-based approach to a prospective experimental design using whole-transcriptome RNA-Seq gene expression from peripheral blood leukocytes of U.S. Marines (N=188), obtained both pre- and post-deployment to conflict zones. We identified discrete groups of co-regulated genes (i.e., co-expression modules) and tested them for association to PTSD. 
We identified one module at both pre- and post-deployment containing putative causal signatures for PTSD development displaying an over-expression of genes enriched for functions of innate-immune response and interferon signalling (Type-I and Type-II). Importantly, these results were replicated in a second non-overlapping independent dataset of U.S. Marines (N=96), further outlining the role of innate immune and interferon signalling genes within co-expression modules to explain at least part of the causal pathophysiology for PTSD development. A second module, consequential of trauma exposure, contained PTSD resiliency signatures and an over-expression of genes involved in hemostasis and wound responsiveness suggesting that chronic levels of stress impair proper wound healing during/after exposure to the battlefield while highlighting the role of the hemostatic system as a clinical indicator of chronic-based stress. These findings provide novel insights for early preventative measures and advanced PTSD detection, which may lead to interventions that delay or perhaps abrogate the development of PTSD.\nWe used microarrays to characterize both prognostic and diagnostic molecular signatures associated to PTSD risk and PTSD status compared to control subjects.", + "study_title": "Gene Networks Specific for Innate Immunity Define Post-traumatic Stress Disorder [Affymetrix]", + "subjects_count": 48, + "accession_number": "GSE63878", + "data_files_count": 0, + "contributor": "me.foo@smartsite.com", + }, + } + } + + # Mock get_all_metadata call to return proper document + get_all_metadata_mock = AsyncMock(return_value=existing_metadata) + patch.object(datastore, "get_all_metadata", get_all_metadata_mock).start() + + # If drop_all is called, set get_all_metadata_mock return_value to None + def wipe_return_value(mock: AsyncMock): + mock.return_value = None + + drop_all_indexes_mock = AsyncMock( + side_effect=wipe_return_value(get_all_metadata_mock) + ) + 
patch.object(datastore, "drop_all", drop_all_indexes_mock).start() + + respx.get( + "http://testfail/ok//mds/metadata?data=True&_guid_type=discovery_metadata&limit=1000&offset=0" + ).mock(return_value=httpx.Response(status_code=500)) + with pytest.raises(Exception): + await main( + Commons( + configuration=Config( + settings=Settings(), + schema={ + "_subjects_count": FieldDefinition(type="integer"), + "year_awarded": FieldDefinition(type="integer"), + }, + ), + gen3_commons={ + "my_commons": MDSInstance( + mds_url="http://testfail/ok/", + commons_url="test", + columns_to_fields={ + "authz": "path:authz", + "tags": "path:tags", + "_subjects_count": "path:subjects_count", + "dbgap_accession_number": "path:study_id", + "study_description": "path:study_description_summary", + "number_of_datafiles": "path:data_files_count", + "investigator": "path:contributor", + }, + ), + }, + adapter_commons={ + "adapter_commons": AdapterMDSInstance( + mds_url="", + commons_url="", + adapter="icpsr", + ), + }, + ) + ) + + # check that the get_all_metadata return value has not been changed + # since drop_all should not be called if an exception has been raised + es = await datastore.init("test", 9200) + assert (await es.get_all_metadata()) == existing_metadata + + respx.get( + "http://test/ok//mds/metadata?data=True&_guid_type=discovery_metadata&limit=1000&offset=0" + ).mock( + return_value=httpx.Response( + status_code=200, + json=existing_metadata, + ) + ) + + # Unable to update temp index, raise exception + patch.object( + datastore, + "update_metadata_to_temp_index", + AsyncMock(side_effect=Exception("Unable")), + ).start() + with pytest.raises(Exception): + await main( + Commons( + configuration=Config( + settings=Settings(), + schema={ + "_subjects_count": FieldDefinition(type="integer"), + "year_awarded": FieldDefinition(type="integer"), + }, + ), + gen3_commons={ + "my_commons": MDSInstance( + mds_url="http://test/ok/", + commons_url="test", + columns_to_fields={ + "authz": 
"path:authz", + "tags": "path:tags", + "_subjects_count": "path:subjects_count", + "dbgap_accession_number": "path:study_id", + "study_description": "path:study_description_summary", + "number_of_datafiles": "path:data_files_count", + "investigator": "path:contributor", + }, + ), + }, + adapter_commons={ + "adapter_commons": AdapterMDSInstance( + mds_url="", + commons_url="", + adapter="icpsr", + ), + }, + ) + ) + + assert (await es.get_all_metadata()) == existing_metadata + + @pytest.mark.asyncio async def test_filter_entries(): resp = await filter_entries( From 03cfded918fd15c43b9fde470d9abaea43b68dad Mon Sep 17 00:00:00 2001 From: Craig Barnes Date: Wed, 10 Aug 2022 10:00:20 -0500 Subject: [PATCH 29/70] update Documentation --- README.md | 2 -- 1 file changed, 2 deletions(-) diff --git a/README.md b/README.md index 40071f1b..43f2a38b 100644 --- a/README.md +++ b/README.md @@ -102,8 +102,6 @@ Run tests: docker-compose exec app pytest --cov=src --cov=migrations/versions tests ``` ### Aggregate MDS -For local development on OSX, install elasticsearch and ensure it is running. 
- testing populate: ```bash python src/mds/populate.py --config --hostname localhost --port 9200 From b995fab3304d7410223729a594f0e71bd4866f69 Mon Sep 17 00:00:00 2001 From: Craig Barnes Date: Wed, 10 Aug 2022 10:22:27 -0500 Subject: [PATCH 30/70] remove hostname/port arguments, update documentation --- README.md | 4 +- configs/brh_config.json | 151 -------------------------- docs/sample_aggregate_mds_config.json | 56 ++++++++++ src/mds/populate.py | 4 - tests/test_populate.py | 11 +- 5 files changed, 59 insertions(+), 167 deletions(-) delete mode 100644 configs/brh_config.json create mode 100644 docs/sample_aggregate_mds_config.json diff --git a/README.md b/README.md index 40071f1b..cebc79f9 100644 --- a/README.md +++ b/README.md @@ -102,11 +102,11 @@ Run tests: docker-compose exec app pytest --cov=src --cov=migrations/versions tests ``` ### Aggregate MDS -For local development on OSX, install elasticsearch and ensure it is running. +For local development ensure the docker container is up. 
testing populate: ```bash -python src/mds/populate.py --config --hostname localhost --port 9200 +python src/mds/populate.py --config ``` view the loaded data ```bash diff --git a/configs/brh_config.json b/configs/brh_config.json deleted file mode 100644 index f5c063cc..00000000 --- a/configs/brh_config.json +++ /dev/null @@ -1,151 +0,0 @@ -{ - "gen3_commons": { - "IBD Commons": { - "mds_url": "https://ibdgc.datacommons.io", - "commons_url" : "ibdgc.datacommons.io", - "study_data_field" : "my_metadata", - "guid_type" : "my_metadata", - "columns_to_fields": { - "_subjects_count" : "subjects_count", - "study_description" : "brief_summary", - "short_name": "dataset_title", - "full_name": "dataset_title" - } - }, - "BioData Catalyst": { - "mds_url": "https://gen3.biodatacatalyst.nhlbi.nih.gov", - "commons_url" : "gen3.biodatacatalyst.nhlbi.nih.gov", - "columns_to_fields": { - "short_name": "name", - "_unique_id" : "study_id" - } - }, - "MIDRC": { - "mds_url": "https://data.midrc.org", - "commons_url" : "data.midrc.org", - "study_data_field" : "discovery_metadata", - "columns_to_fields": { - "_subjects_count" : "cases_count", - "study_description" : "research_description", - "_unique_id": "study_id" - } - }, - "NIAID ClinicalData": { - "mds_url": "https://accessclinicaldata.niaid.nih.gov", - "commons_url" : "accessclinicaldata.niaid.nih.gov", - "study_data_field" : "my_metadata", - "guid_type" : "my_metadata", - "columns_to_fields": { - "full_name": "title", - "study_id" : "nct_number", - "_unique_id": "nct_number", - "study_description" : "brief_summary" - } - }, - "JCOIN": { - "mds_url": "https://jcoin.datacommons.io/", - "commons_url" : "jcoin.datacommons.io/", - "columns_to_fields": { - "_subjects_count" : "subjects", - "study_description" : "summary", - "short_name": "study_name", - "full_name": "study_name" - } - }, - "AnVIL": { - "mds_url": "https://internalstaging.theanvil.io", - "commons_url": "gen3.theanvil.io", - "columns_to_fields": { - "name": "name", - 
"full_name": "full_name", - "_subjects_count" : "_subjects_count", - "_unique_id" : "study_id", - "study_description" : "study_description" - } - }, - "Genomic Data Commons": { - "mds_url": "https://gen3.datacommons.io", - "commons_url": "portal.gdc.cancer.gov", - "study_data_field" : "discovery_metadata", - "columns_to_fields": { - "_subjects_count" : "subjects_count", - "dbgap_accession_number" : "study_id", - "study_description" : "description" - }, - "select_field": { - "field_name" : "commons" , - "field_value" : "Genomic Data Commons" - } - }, - "Proteomic Data Commons": { - "mds_url": "https://gen3.datacommons.io", - "commons_url": "proteomic.datacommons.cancer.gov/pdc", - "columns_to_fields": { - "_subjects_count" : "cases_count", - "study_id" : "_unique_id", - "description" : "study_description" - }, - "select_field": { - "field_name" : "commons" , - "field_value" : "Proteomic Data Commons" - } - }, - "Cancer Imaging Data Commons": { - "mds_url": "https://gen3.datacommons.io", - "commons_url": "imaging.datacommons.cancer.gov/", - "columns_to_fields": { - "_subjects_count" : "cases_count", - "study_id" : "_unique_id", - "description" : "study_description" - }, - "select_field": { - "field_name" : "commons" , - "field_value" : "Cancer Imaging Data Commons" - } - } - }, - "adapter_commons": { - "Kids First": { - "mds_url": "https://gen3staging.kidsfirstdrc.org/", - "commons_url": "kidsfirstdrc.org", - "adapter": "gen3", - "config" : { - "guid_type": "metadata_object", - "study_field": "dbgap" - }, - "keep_original_fields": false, - "field_mappings" : { - "authz": "path:authz", - "tags": "path:gen3_discovery.tags", - "_unique_id": "path:_unique_id", - "study_id": "path:_unique_id", - "study_description": "path:description", - "full_name": "path:full_name", - "short_name": "path:full_name", - "commons": "Kids First Data Resource Center", - "study_url": "path:link" - } - }, - "Genomic Data Commons": { - "mds_url": "https://gen3.datacommons.io", - "commons_url": 
"portal.gdc.cancer.gov", - "adapter": "gen3", - "config" : { - "guid_type": "metadata_object", - "study_field": "dbgap" - }, - "keep_original_fields": false, - "field_mappings" : { - "authz": "path:authz", - "tags": "path:gen3_discovery.tags", - "_unique_id": "path:_unique_id", - "study_id": "path:_unique_id", - "study_description": "path:description", - "full_name": "path:full_name", - "short_name": "path:full_name", - "commons": "Kids First Data Resource Center", - "study_url": "path:link" - } - } - } -} diff --git a/docs/sample_aggregate_mds_config.json b/docs/sample_aggregate_mds_config.json new file mode 100644 index 00000000..56884a1e --- /dev/null +++ b/docs/sample_aggregate_mds_config.json @@ -0,0 +1,56 @@ +{ + "configuration": { + "schema": { + "_subjects_count": { + "type": "integer" + }, + "__manifest": { + "type": "array", + "properties": { + "file_name": { + "type": "string" + }, + "file_size": { + "type": "integer" + } + } + }, + "tags": { + "type": "array" + }, + "_unique_id": {}, + "study_description": {}, + "study_id": {}, + "study_url": {}, + "project_id": {}, + "short_name": {}, + "full_name": {}, + "commons_url": {}, + "commons" : {} + }, + "settings" : { + "cache_drs" : false + } + }, + "adapter_commons": { + "Gen3": { + "mds_url": "https://gen3.datacommons.io/", + "commons_url": "gen3.datacommons.io/", + "adapter": "gen3", + "config" : { + "guid_type": "discovery_metadata", + "study_field": "gen3_discovery" + }, + "keep_original_fields": false, + "field_mappings" : { + "tags": "path:tags", + "_unique_id": "path:_unique_id", + "study_description": "path:summary", + "full_name": "path:study_title", + "accession_number": "path:accession_number", + "commons": "Gen3 Data Commons", + "study_url": "path:link" + } + } + } +} diff --git a/src/mds/populate.py b/src/mds/populate.py index 4f8c7f3e..631e746f 100644 --- a/src/mds/populate.py +++ b/src/mds/populate.py @@ -19,10 +19,6 @@ def parse_args(argv: List[str]) -> Namespace: parser = 
argparse.ArgumentParser() parser.add_argument("--config", help="config file to use", type=str, required=True) - parser.add_argument( - "--hostname", help="hostname of server", type=str, default="localhost" - ) - parser.add_argument("--port", help="port of server", type=int, default=6379) known_args, unknown_args = parser.parse_known_args(argv) return known_args diff --git a/tests/test_populate.py b/tests/test_populate.py index 1df0e977..4d93295f 100644 --- a/tests/test_populate.py +++ b/tests/test_populate.py @@ -34,16 +34,7 @@ async def test_parse_args(): assert exception.code == 2 known_args = parse_args(["--config", "some/file.json"]) - assert known_args == Namespace( - config="some/file.json", hostname="localhost", port=6379 - ) - - known_args = parse_args( - ["--config", "some/file.json", "--hostname", "server", "--port", "1000"] - ) - assert known_args == Namespace( - config="some/file.json", hostname="server", port=1000 - ) + assert known_args == Namespace(config="some/file.json") @pytest.mark.asyncio From 914efb9e4b7e5871bd1f2d7e0ed476f4c207904a Mon Sep 17 00:00:00 2001 From: tianj7 Date: Wed, 10 Aug 2022 14:21:47 -0500 Subject: [PATCH 31/70] Add unit tests --- tests/test_agg_mds_datastore.py | 49 +++++++++++++++++++++++++++++++++ 1 file changed, 49 insertions(+) diff --git a/tests/test_agg_mds_datastore.py b/tests/test_agg_mds_datastore.py index 215a9b97..106ca239 100644 --- a/tests/test_agg_mds_datastore.py +++ b/tests/test_agg_mds_datastore.py @@ -83,3 +83,52 @@ async def test_get_all_metadata(): with patch("mds.agg_mds.datastore.client", AsyncMock()) as mock_client: await datastore.get_all_metadata() mock_client.get_all_metadata.assert_called_with() + + +@pytest.mark.asyncio +async def test_drop_all_temp_indexes(): + with patch("mds.agg_mds.datastore.client", AsyncMock()) as mock_client: + await datastore.drop_all_temp_indexes() + mock_client.drop_all_temp_indexes.assert_called_with() + + +@pytest.mark.asyncio +async def test_create_indexes(): + with 
patch("mds.agg_mds.datastore.client", AsyncMock()) as mock_client: + await datastore.create_indexes("{}") + mock_client.create_indexes.assert_called_with("{}") + + +@pytest.mark.asyncio +async def test_create_temp_indexes(): + with patch("mds.agg_mds.datastore.client", AsyncMock()) as mock_client: + await datastore.create_temp_indexes("{}") + mock_client.create_temp_indexes.assert_called_with("{}") + + +@pytest.mark.asyncio +async def test_clone_temp_indexes_to_real_indexes(): + with patch("mds.agg_mds.datastore.client", AsyncMock()) as mock_client: + await datastore.clone_temp_indexes_to_real_indexes() + mock_client.clone_temp_indexes_to_real_indexes.assert_called_with() + + +@pytest.mark.asyncio +async def test_update_metadata_to_temp_index(): + with patch("mds.agg_mds.datastore.client", AsyncMock()) as mock_client: + await datastore.update_metadata_to_temp_index() + mock_client.update_metadata_to_temp_index.assert_called_with() + + +@pytest.mark.asyncio +async def test_update_global_info_to_temp_index(): + with patch("mds.agg_mds.datastore.client", AsyncMock()) as mock_client: + await datastore.update_global_info_to_temp_index() + mock_client.update_global_info_to_temp_index.assert_called_with() + + +@pytest.mark.asyncio +async def test_update_config_info_to_temp_index(): + with patch("mds.agg_mds.datastore.client", AsyncMock()) as mock_client: + await datastore.update_config_info_to_temp_index() + mock_client.update_config_info_to_temp_index.assert_called_with() From 3444b8c5e22a87ac355ef53b0521d952b542fbb9 Mon Sep 17 00:00:00 2001 From: tianj7 Date: Wed, 10 Aug 2022 16:21:56 -0500 Subject: [PATCH 32/70] add unit tests --- tests/test_agg_mds_elasticsearch_dao.py | 232 +++++++++++++++++++++++- 1 file changed, 223 insertions(+), 9 deletions(-) diff --git a/tests/test_agg_mds_elasticsearch_dao.py b/tests/test_agg_mds_elasticsearch_dao.py index fb5abe02..155bd408 100644 --- a/tests/test_agg_mds_elasticsearch_dao.py +++ b/tests/test_agg_mds_elasticsearch_dao.py @@ 
-8,6 +8,13 @@ AGG_MDS_CONFIG_INDEX, CONFIG, SEARCH_CONFIG, + AGG_MDS_INDEX_TEMP, + AGG_MDS_INFO_INDEX_TEMP, + AGG_MDS_CONFIG_INDEX_TEMP, + AGG_MDS_INFO_TYPE, + AGG_MDS_CONFIG_TYPE, + count, + process_record, ) from elasticsearch import exceptions as es_exceptions from mds.config import ES_RETRY_LIMIT, ES_RETRY_INTERVAL @@ -64,6 +71,75 @@ async def test_drop_all(): ) +@pytest.mark.asyncio +async def test_drop_all_temp_indexes(): + with patch( + "mds.agg_mds.datastore.elasticsearch_dao.elastic_search_client.indices", + MagicMock(), + ) as mock_indices: + await elasticsearch_dao.drop_all_temp_indexes() + mock_indices.delete.assert_has_calls( + [ + call(index=AGG_MDS_INDEX_TEMP, ignore=[400, 404]), + call(index=AGG_MDS_INFO_INDEX_TEMP, ignore=[400, 404]), + call(index=AGG_MDS_CONFIG_INDEX_TEMP, ignore=[400, 404]), + ], + any_order=True, + ) + + +@pytest.mark.asyncio +async def test_clone_temp_indexes_to_real_indexes(): + with patch( + "mds.agg_mds.datastore.elasticsearch_dao.Elasticsearch", + MagicMock(), + ) as mock_es: + await elasticsearch_dao.clone_temp_indexes_to_real_indexes() + mock_es.reindex.assert_has_calls( + [ + call( + elasticsearch_dao.elastic_search_client, + { + "source": {"index": AGG_MDS_INDEX_TEMP}, + "dest": {"index": AGG_MDS_INDEX}, + }, + ), + call( + elasticsearch_dao.elastic_search_client, + { + "source": {"index": AGG_MDS_INFO_INDEX_TEMP}, + "dest": {"index": AGG_MDS_INFO_INDEX}, + }, + ), + call( + elasticsearch_dao.elastic_search_client, + { + "source": {"index": AGG_MDS_CONFIG_INDEX_TEMP}, + "dest": {"index": AGG_MDS_CONFIG_INDEX}, + }, + ), + ], + any_order=True, + ) + + +@pytest.mark.asyncio +async def test_create_indexes(): + with patch( + "mds.agg_mds.datastore.elasticsearch_dao.elastic_search_client.indices", + MagicMock(), + ) as mock_indices: + await elasticsearch_dao.create_indexes(common_mapping=COMMON_MAPPING) + mock_indices.create.assert_has_calls( + [ + call(body={**SEARCH_CONFIG, **COMMON_MAPPING}, index=AGG_MDS_INDEX), + 
call(body=INFO_MAPPING, index=AGG_MDS_INFO_INDEX), + call(body=CONFIG, index=AGG_MDS_CONFIG_INDEX), + ], + any_order=True, + ) + + @pytest.mark.asyncio async def test_create_indexes(): with patch( @@ -81,6 +157,23 @@ async def test_create_indexes(): ) +@pytest.mark.asyncio +async def test_create_temp_indexes(): + with patch( + "mds.agg_mds.datastore.elasticsearch_dao.elastic_search_client.indices", + MagicMock(), + ) as mock_indices: + await elasticsearch_dao.create_temp_indexes(common_mapping=COMMON_MAPPING) + mock_indices.create.assert_has_calls( + [ + call(body={**SEARCH_CONFIG, **COMMON_MAPPING}, index=AGG_MDS_INDEX_TEMP), + call(body=INFO_MAPPING, index=AGG_MDS_INFO_INDEX_TEMP), + call(body=CONFIG, index=AGG_MDS_CONFIG_INDEX_TEMP), + ], + any_order=True, + ) + + @pytest.mark.asyncio async def test_create_if_exists(): with patch( @@ -137,19 +230,114 @@ async def test_update_metadata(): body={}, doc_type="commons-info", id="my_commons", - index="default_namespace-commons-info-index", + index=AGG_MDS_INFO_INDEX, + ), + call( + body={"some_field": "some_value", "__manifest": {}, "sites": ""}, + doc_type="commons", + id="my_id", + index=AGG_MDS_INDEX, + ), + ], + any_order=True, + ) + + +@pytest.mark.asyncio +async def test_update_metadata_to_temp_index(): + with patch( + "mds.agg_mds.datastore.elasticsearch_dao.elastic_search_client.index", + MagicMock(), + ) as mock_index: + await elasticsearch_dao.update_metadata_to_temp_index( + "my_commons", + [ + { + "my_id": { + "gen3_discovery": { + "some_field": "some_value", + "__manifest": {}, + "sites": "", + } + } + } + ], + [], + {}, + {}, + "gen3_discovery", + ) + mock_index.assert_has_calls( + [ + call( + body={}, + doc_type="commons-info", + id="my_commons", + index=AGG_MDS_INFO_INDEX_TEMP, ), call( body={"some_field": "some_value", "__manifest": {}, "sites": ""}, doc_type="commons", id="my_id", - index="default_namespace-commons-index", + index=AGG_MDS_INDEX_TEMP, ), ], any_order=True, ) +@pytest.mark.asyncio 
+async def test_update_global_info(): + with patch( + "mds.agg_mds.datastore.elasticsearch_dao.elastic_search_client", + MagicMock(), + ) as mock_client: + await elasticsearch_dao.update_global_info(key="123", doc={}) + + mock_client.index.assert_called_with( + index=AGG_MDS_INFO_INDEX, doc_type=AGG_MDS_INFO_TYPE, id="123", body={} + ) + + +@pytest.mark.asyncio +async def test_update_global_info_to_temp_index(): + with patch( + "mds.agg_mds.datastore.elasticsearch_dao.elastic_search_client", + MagicMock(), + ) as mock_client: + await elasticsearch_dao.update_global_info_to_temp_index(key="123", doc={}) + + mock_client.index.assert_called_with( + index=AGG_MDS_INFO_INDEX_TEMP, doc_type=AGG_MDS_INFO_TYPE, id="123", body={} + ) + + +@pytest.mark.asyncio +async def test_update_config_info(): + with patch( + "mds.agg_mds.datastore.elasticsearch_dao.elastic_search_client", + MagicMock(), + ) as mock_client: + await elasticsearch_dao.update_config_info(doc={}) + + mock_client.index.assert_called_with( + index=AGG_MDS_CONFIG_INDEX, doc_type="_doc", id=AGG_MDS_INDEX, body={} + ) + + +@pytest.mark.asyncio +async def test_update_config_info_to_temp_index(): + with patch( + "mds.agg_mds.datastore.elasticsearch_dao.elastic_search_client", + MagicMock(), + ) as mock_client: + await elasticsearch_dao.update_config_info_to_temp_index(doc={}) + + mock_client.index.assert_called_with( + index=AGG_MDS_CONFIG_INDEX_TEMP, doc_type="_doc", id=AGG_MDS_INDEX, body={} + ) + + @pytest.mark.asyncio async def test_get_status(): with patch( @@ -161,6 +349,11 @@ async def test_get_status(): mock_client.ping.assert_called_with() +@pytest.mark.asyncio +async def close(): + assert True + + @pytest.mark.asyncio async def test_get_commons(): with patch( @@ -169,7 +362,7 @@ async def test_get_commons(): ) as mock_search: await elasticsearch_dao.get_commons() mock_search.assert_called_with( - index="default_namespace-commons-index", + index=AGG_MDS_INDEX, body={ "size": 0, "aggs": {"commons_names": 
{"terms": {"field": "commons_name.keyword"}}}, @@ -183,6 +376,27 @@ async def test_get_commons(): assert await elasticsearch_dao.get_commons() == [] +def test_count_dict(): + assert count({1: 2, 3: 4}) == 2 + + +def test_count_list(): + assert count([1, 2, 3]) == 3 + + +def test_count_fail(): + assert count(123) == 0 + + +def test_process_records(): + _id = "123" + _source = {"count": [1, 2, 3, 4]} + record = {"_id": _id, "_source": _source} + id, normalized = process_record(record, "count") + assert id == _id + assert normalized == {"count": 4} + + @pytest.mark.asyncio async def test_get_all_metadata(): response = { @@ -195,7 +409,7 @@ async def test_get_all_metadata(): ) as mock_search: await elasticsearch_dao.get_all_metadata(5, 9) mock_search.assert_called_with( - index="default_namespace-commons-index", + index=AGG_MDS_INDEX, body={"size": 5, "from": 9, "query": {"match_all": {}}}, ) @@ -213,7 +427,7 @@ async def test_get_all_named_commons_metadata(): ) as mock_client: await elasticsearch_dao.get_all_named_commons_metadata("my-commons") mock_client.search.assert_called_with( - index="default_namespace-commons-index", + index=AGG_MDS_INDEX, body={"query": {"match": {"commons_name.keyword": "my-commons"}}}, ) @@ -233,7 +447,7 @@ async def test_metadata_tags(): ) as mock_client: await elasticsearch_dao.metadata_tags() mock_client.search.assert_called_with( - index="default_namespace-commons-index", + index=AGG_MDS_INDEX, body={ "size": 0, "aggs": { @@ -266,7 +480,7 @@ async def test_get_commons_attribute(): ) as mock_client: await elasticsearch_dao.get_commons_attribute("my-commons") mock_client.search.assert_called_with( - index="default_namespace-commons-info-index", + index=AGG_MDS_INFO_INDEX, body={"query": {"terms": {"_id": ["my-commons"]}}}, ) @@ -284,7 +498,7 @@ async def test_get_aggregations(): ) as mock_client: await elasticsearch_dao.get_aggregations("my-commons") mock_client.search.assert_called_with( - index="default_namespace-commons-index", + 
index=AGG_MDS_INDEX, body={ "size": 0, "query": { @@ -310,7 +524,7 @@ async def test_get_by_guid(): ) as mock_client: await elasticsearch_dao.get_by_guid("my-commons") mock_client.get.assert_called_with( - index="default_namespace-commons-index", + index=AGG_MDS_INDEX, doc_type="commons", id="my-commons", ) From 348e39f8977de6a83be3e118084707ccddb0d5a6 Mon Sep 17 00:00:00 2001 From: tianj7 Date: Thu, 11 Aug 2022 13:09:26 -0500 Subject: [PATCH 33/70] add unit tests --- brh_all.json | 285 ++++++++++++++++++++++++++++++++ tests/test_agg_mds_datastore.py | 107 +++++++----- tests/test_populate.py | 153 ++++++++++++++++- 3 files changed, 501 insertions(+), 44 deletions(-) create mode 100644 brh_all.json diff --git a/brh_all.json b/brh_all.json new file mode 100644 index 00000000..8ec4fbb3 --- /dev/null +++ b/brh_all.json @@ -0,0 +1,285 @@ +{ + "configuration": { + "schema": { + "_subjects_count": { + "type": "integer" + }, + "__manifest": { + "type": "array", + "properties": { + "file_name": { + "type": "string" + }, + "file_size": { + "type": "integer" + } + } + }, + "tags": { + "type": "array" + }, + "_unique_id": {}, + "study_description": {}, + "study_id": {}, + "study_url": {}, + "project_id": {}, + "short_name": {}, + "full_name": {}, + "commons_url": {}, + "commons" : {}, + "authz": { + "type": "string" + } + }, + "settings" : { + "cache_drs" : true + } + }, + "adapter_commons": { + "IBD Commons": { + "mds_url": "https://ibdgc.datacommons.io/", + "commons_url" : "ibdgc.datacommons.io", + "adapter": "gen3", + "config" : { + "guid_type": "my_metadata", + "study_field": "my_metadata" + }, + "keep_original_fields": false, + "field_mappings" : { + "authz": "path:authz", + "tags": "path:tags", + "_unique_id": "path:_unique_id", + "study_id": "path:_unique_id", + "study_description": "path:brief_summary", + "full_name": "path:dataset_title", + "short_name": "path:dataset_code", + "commons": "NIDDK IBD Genetics Consortium Data Commons", + "study_url": "path:link", + 
"_subjects_count" : "path:subjects_count", + "__manifest": "path:__manifest", + "commons_url" : "ibdgc.datacommons.io" + } + }, + "BioData Catalyst": { + "mds_url": "https://gen3.biodatacatalyst.nhlbi.nih.gov/", + "commons_url" : "gen3.biodatacatalyst.nhlbi.nih.gov", + "adapter": "gen3", + "config" : { + "guid_type": "discovery_metadata", + "study_field": "gen3_discovery" + }, + "keep_original_fields": false, + "field_mappings" : { + "authz": "path:authz", + "tags": "path:tags", + "_unique_id": "path:study_id", + "study_id": "path:study_id", + "study_description": "path:study_description", + "full_name": "path:full_name", + "short_name": "path:short_name", + "commons": "BioData Catalyst", + "study_url": "path:dbgap_url", + "_subjects_count" : {"path":"_subjects_count", "default" : 0 }, + "__manifest": "path:__manifest", + "commons_url" : "gen3.biodatacatalyst.nhlbi.nih.gov" + } + }, + "MIDRC": { + "mds_url": "https://data.midrc.org/", + "commons_url" : "data.midrc.org", + "adapter": "gen3", + "config" : { + "guid_type": "discovery_metadata", + "study_field": "discovery_metadata" + }, + "keep_original_fields": false, + "field_mappings" : { + "authz": "path:auth_resource_path", + "tags": "path:tags", + "_unique_id": "path:_unique_id", + "study_id": "path:_publication_id", + "study_description": "path:study_description_summary", + "full_name": "path:study_name_title", + "short_name": "path:study_name_title", + "commons": "Medical Imaging and Data Resource Center (MIDRC)", + "study_url": "path:websites", + "_subjects_count" : {"path":"subjects_count", "default" : 0 }, + "__manifest": "path:__manifest", + "commons_url" : "ibdgc.datacommons.io" + } + }, + "NIAID ClinicalData": { + "mds_url": "https://accessclinicaldata.niaid.nih.gov/", + "commons_url" : "accessclinicaldata.niaid.nih.gov", + "adapter": "gen3", + "config" : { + "guid_type": "my_metadata", + "study_field": "my_metadata" + }, + "keep_original_fields": false, + "field_mappings" : { + "authz": "/open", + 
"tags": "path:tags", + "_unique_id": "path:_unique_id", + "study_id": "path:nct_number", + "study_description": "path:brief_summary", + "full_name": "path:title", + "short_name": "path:title", + "commons": "NCT", + "study_url": "path:publications", + "_subjects_count" : { + "path":"subjects_count", + "default_value": 0 + }, + "__manifest": "path:__manifest", + "commons_url" : "accessclinicaldata.niaid.nih.gov" + } + }, + "JCOIN": { + "mds_url": "https://jcoin.datacommons.io/", + "commons_url" : "jcoin.datacommons.io/", + "adapter": "gen3", + "keep_original_fields": false, + "field_mappings" : { + "authz": "path:authz", + "tags": "path:tags", + "_unique_id": "path:project_number", + "study_id": "path:project_number", + "study_description": "path:study_description_summary", + "full_name": "path:project_title", + "short_name": "path:project_title", + "commons": "JCOIN", + "study_url": "path:publications", + "data_availability": "path:data_availability", + "_subjects_count" : { + "path":"subjects", + "default_value": 0 + }, + "__manifest": "path:__manifest", + "commons_url" : "jcoin.datacommons.io" + } + }, + "AnVIL": { + "mds_url": "https://internalstaging.theanvil.io/", + "commons_url" : "gen3.theanvil.io", + "config" : { + "guid_type": "discovery_metadata", + "study_field": "gen3_discovery" + }, + "adapter": "gen3", + "keep_original_fields": false, + "field_mappings" : { + "authz": "path:authz", + "tags": "path:tags", + "_unique_id": "path:study_id", + "study_id": "path:study_id", + "study_description": "path:study_description", + "full_name": "path:full_name", + "short_name": "path:short_name", + "commons": "AnVIL", + "study_url": "path:publications", + "_subjects_count" : {"path":"_subjects_count", "default" : 0 }, + "__manifest": "path:__manifest", + "commons_url" : "gen3.theanvil.io" + } + }, + "Genomic Data Commons": { + "mds_url": "https://brh.data-commons.org/", + "commons_url" : "portal.gdc.cancer.gov", + "config" : { + "guid_type": "discovery_metadata", + 
"study_field": "gen3_discovery", + "filters": "gen3_discovery.commons=Genomic Data Commons" + }, + "adapter": "gen3", + "keep_original_fields": false, + "field_mappings" : { + "authz": "path:authz", + "tags": "path:tags", + "_unique_id": "path:_unique_id", + "study_id": "path:_unique_id", + "study_description": "path:study_description", + "full_name": "path:full_name", + "short_name": "N/A", + "commons": "GDC", + "study_url": "path:publications", + "_subjects_count" : {"path":"subjects_count", "default" : 0 }, + "__manifest": "path:__manifest", + "commons_url" : "brh.data-commons.org" + } + }, + "Proteomic Data Commons": { + "mds_url": "https://brh.data-commons.org/", + "commons_url" : "proteomic.datacommons.cancer.gov/pdc", + "config" : { + "guid_type": "discovery_metadata", + "study_field": "gen3_discovery", + "filters": "gen3_discovery.commons=Proteomic Data Commons" + }, + "adapter": "gen3", + "keep_original_fields": false, + "field_mappings" : { + "authz": "path:authz", + "tags": "path:tags", + "_unique_id": "path:_unique_id", + "study_id": "path:_unique_id", + "study_description": "path:study_description", + "full_name": "path:full_name", + "short_name": "N/A", + "commons": "Proteomic Data Commons", + "study_url": "path:publications", + "_subjects_count" : {"path":"subjects_count", "default" : 0 }, + "__manifest": "path:__manifest", + "commons_url" : "proteomic.datacommons.cancer.gov/pdc" + } + }, + "Cancer Imaging Data Commons": { + "mds_url": "https://brh.data-commons.org/", + "commons_url" : "imaging.datacommons.cancer.gov", + "config" : { + "guid_type": "discovery_metadata", + "study_field": "gen3_discovery", + "filters": "gen3_discovery.commons=Cancer Imaging Data Commons" + }, + "adapter": "gen3", + "keep_original_fields": false, + "field_mappings" : { + "authz": "path:authz", + "tags": "path:tags", + "_unique_id": "path:_unique_id", + "study_id": "path:_unique_id", + "study_description": "path:study_description", + "full_name": "path:full_name", + 
"short_name": "N/A", + "commons": "Cancer Imaging Data Commons", + "study_url": "path:publications", + "_subjects_count" : {"path":"subjects_count", "default" : 0 }, + "__manifest": "path:__manifest", + "commons_url" : "imaging.datacommons.cancer.gov" + } + }, + "Kids First": { + "mds_url": "https://gen3staging.kidsfirstdrc.org/", + "commons_url": "kidsfirstdrc.org", + "adapter": "gen3", + "config" : { + "guid_type": "metadata_object", + "study_field": "dbgap" + }, + "keep_original_fields": false, + "field_mappings" : { + "authz": "path:authz", + "tags": "path:gen3_discovery.tags", + "_unique_id": "path:_unique_id", + "study_id": "path:_unique_id", + "study_description": "path:description", + "full_name": "path:full_name", + "short_name": "path:full_name", + "commons": "Kids First Data Resource Center", + "study_url": "path:link", + "__manifest": "path:__manifest", + "commons_url" : "kidsfirstdrc.org" + } + } + } +} diff --git a/tests/test_agg_mds_datastore.py b/tests/test_agg_mds_datastore.py index 106ca239..fcd46eba 100644 --- a/tests/test_agg_mds_datastore.py +++ b/tests/test_agg_mds_datastore.py @@ -22,6 +22,34 @@ async def test_drop_all(): mock_client.drop_all.assert_called_with() +@pytest.mark.asyncio +async def test_drop_all_temp_indexes(): + with patch("mds.agg_mds.datastore.client", AsyncMock()) as mock_client: + await datastore.drop_all_temp_indexes() + mock_client.drop_all_temp_indexes.assert_called_with() + + +@pytest.mark.asyncio +async def test_create_indexes(): + with patch("mds.agg_mds.datastore.client", AsyncMock()) as mock_client: + await datastore.create_indexes("{}") + mock_client.create_indexes.assert_called_with("{}") + + +@pytest.mark.asyncio +async def test_create_temp_indexes(): + with patch("mds.agg_mds.datastore.client", AsyncMock()) as mock_client: + await datastore.create_temp_indexes("{}") + mock_client.create_temp_indexes.assert_called_with("{}") + + +@pytest.mark.asyncio +async def test_clone_temp_indexes_to_real_indexes(): + with 
patch("mds.agg_mds.datastore.client", AsyncMock()) as mock_client: + await datastore.clone_temp_indexes_to_real_indexes() + mock_client.clone_temp_indexes_to_real_indexes.assert_called_with() + + @pytest.mark.asyncio async def test_close(): with patch("mds.agg_mds.datastore.client", AsyncMock()) as mock_client: @@ -44,91 +72,84 @@ async def test_update_metadata(): @pytest.mark.asyncio -async def test_get_commons_metadata(): - with patch("mds.agg_mds.datastore.client", AsyncMock()) as mock_client: - await datastore.get_commons_metadata() - mock_client.get_commons_metadata.assert_called_with() - - -@pytest.mark.asyncio -async def test_get_all_named_commons_metadata(): +async def test_update_metadata_to_temp_index(): with patch("mds.agg_mds.datastore.client", AsyncMock()) as mock_client: - await datastore.get_all_named_commons_metadata() - mock_client.get_all_named_commons_metadata.assert_called_with() + await datastore.update_metadata_to_temp_index() + mock_client.update_metadata_to_temp_index.assert_called_with() @pytest.mark.asyncio -async def test_get_by_guid(): +async def test_update_global_info(): with patch("mds.agg_mds.datastore.client", AsyncMock()) as mock_client: - await datastore.get_by_guid("123") - mock_client.get_by_guid.assert_called_with("123") + await datastore.update_global_info() + mock_client.update_global_info.assert_called_with() @pytest.mark.asyncio -async def test_get_commons_attribute(): +async def test_update_global_info_to_temp_index(): with patch("mds.agg_mds.datastore.client", AsyncMock()) as mock_client: - await datastore.get_commons_attribute() - mock_client.get_commons_attribute.assert_called_with() + await datastore.update_global_info_to_temp_index() + mock_client.update_global_info_to_temp_index.assert_called_with() @pytest.mark.asyncio -async def test_get_commons(): +async def test_update_config_info(): with patch("mds.agg_mds.datastore.client", AsyncMock()) as mock_client: - await datastore.get_commons() - 
mock_client.get_commons.assert_called_with() + await datastore.update_config_info() + mock_client.update_config_info.assert_called_with() @pytest.mark.asyncio -async def test_get_all_metadata(): +async def test_update_config_info_to_temp_index(): with patch("mds.agg_mds.datastore.client", AsyncMock()) as mock_client: - await datastore.get_all_metadata() - mock_client.get_all_metadata.assert_called_with() + await datastore.update_config_info_to_temp_index() + mock_client.update_config_info_to_temp_index.assert_called_with() @pytest.mark.asyncio -async def test_drop_all_temp_indexes(): +async def test_get_commons_metadata(): with patch("mds.agg_mds.datastore.client", AsyncMock()) as mock_client: - await datastore.drop_all_temp_indexes() - mock_client.drop_all_temp_indexes.assert_called_with() + await datastore.get_commons_metadata() + mock_client.get_commons_metadata.assert_called_with() @pytest.mark.asyncio -async def test_create_indexes(): +async def test_get_all_named_commons_metadata(): with patch("mds.agg_mds.datastore.client", AsyncMock()) as mock_client: - await datastore.create_indexes("{}") - mock_client.create_indexes.assert_called_with("{}") + await datastore.get_all_named_commons_metadata() + mock_client.get_all_named_commons_metadata.assert_called_with() @pytest.mark.asyncio -async def test_create_temp_indexes(): +async def test_get_all_tags(): with patch("mds.agg_mds.datastore.client", AsyncMock()) as mock_client: - await datastore.create_temp_indexes("{}") - mock_client.create_temp_indexes.assert_called_with("{}") + await datastore.get_all_tags() + mock_client.metadata_tags.assert_called_with() @pytest.mark.asyncio -async def test_clone_temp_indexes_to_real_indexes(): +async def test_get_by_guid(): with patch("mds.agg_mds.datastore.client", AsyncMock()) as mock_client: - await datastore.clone_temp_indexes_to_real_indexes() - mock_client.clone_temp_indexes_to_real_indexes.assert_called_with() + await datastore.get_by_guid("123") + 
mock_client.get_by_guid.assert_called_with("123") @pytest.mark.asyncio -async def test_update_metadata_to_temp_index(): +async def test_get_commons_attribute(): with patch("mds.agg_mds.datastore.client", AsyncMock()) as mock_client: - await datastore.update_metadata_to_temp_index() - mock_client.update_metadata_to_temp_index.assert_called_with() + await datastore.get_commons_attribute() + mock_client.get_commons_attribute.assert_called_with() @pytest.mark.asyncio -async def test_update_global_info_to_temp_index(): +async def test_get_commons(): with patch("mds.agg_mds.datastore.client", AsyncMock()) as mock_client: - await datastore.update_global_info_to_temp_index() - mock_client.update_global_info_to_temp_index.assert_called_with() + await datastore.get_commons() + mock_client.get_commons.assert_called_with() @pytest.mark.asyncio -async def test_update_config_info_to_temp_index(): +async def test_get_all_metadata(): with patch("mds.agg_mds.datastore.client", AsyncMock()) as mock_client: - await datastore.update_config_info_to_temp_index() - mock_client.update_config_info_to_temp_index.assert_called_with() + await datastore.get_all_metadata() + mock_client.get_all_metadata.assert_called_with() diff --git a/tests/test_populate.py b/tests/test_populate.py index 4d93295f..9f96bda3 100644 --- a/tests/test_populate.py +++ b/tests/test_populate.py @@ -8,6 +8,10 @@ populate_metadata, main, filter_entries, + populate_info, + populate_info_to_temp_index, + populate_drs_info, + populate_config, ) from mds.agg_mds.commons import ( AdapterMDSInstance, @@ -20,7 +24,7 @@ from mds.agg_mds import adapters from mds.agg_mds import datastore import json -from unittest.mock import patch, MagicMock +from unittest.mock import patch, call, MagicMock from conftest import AsyncMock from tempfile import NamedTemporaryFile from pathlib import Path @@ -82,6 +86,153 @@ async def test_populate_metadata(): ) +@pytest.mark.asyncio +async def test_populate_info(): + with 
patch("mds.agg_mds.datastore.client", AsyncMock()) as mock_datastore: + with NamedTemporaryFile(mode="w+", delete=False) as fp: + json.dump( + { + "configuration": { + "schema": { + "_subjects_count": {"type": "integer"}, + "study_description": {}, + }, + }, + "gen3_commons": { + "mycommons": { + "mds_url": "http://mds", + "commons_url": "http://commons", + "columns_to_fields": { + "short_name": "name", + "full_name": "full_name", + "_subjects_count": "_subjects_count", + "study_id": "study_id", + "_unique_id": "_unique_id", + "study_description": "study_description", + }, + }, + }, + "adapter_commons": { + "non-gen3": { + "mds_url": "http://non-gen3", + "commons_url": "non-gen3", + "adapter": "icpsr", + } + }, + }, + fp, + ) + config = parse_config_from_file(Path(fp.name)) + await populate_info(config) + mock_datastore.update_global_info.assert_has_calls( + [ + call("aggregations", {}), + call( + "schema", + { + "_subjects_count": {"type": "integer", "description": ""}, + "study_description": {"type": "string", "description": ""}, + }, + ), + ], + any_order=True, + ) + + +@pytest.mark.asyncio +async def test_populate_info_to_temp_indexes(): + with patch("mds.agg_mds.datastore.client", AsyncMock()) as mock_datastore: + with NamedTemporaryFile(mode="w+", delete=False) as fp: + json.dump( + { + "configuration": { + "schema": { + "_subjects_count": {"type": "integer"}, + "study_description": {}, + }, + }, + "gen3_commons": { + "mycommons": { + "mds_url": "http://mds", + "commons_url": "http://commons", + "columns_to_fields": { + "short_name": "name", + "full_name": "full_name", + "_subjects_count": "_subjects_count", + "study_id": "study_id", + "_unique_id": "_unique_id", + "study_description": "study_description", + }, + }, + }, + "adapter_commons": { + "non-gen3": { + "mds_url": "http://non-gen3", + "commons_url": "non-gen3", + "adapter": "icpsr", + } + }, + }, + fp, + ) + config = parse_config_from_file(Path(fp.name)) + await populate_info_to_temp_index(config) + 
mock_datastore.update_global_info_to_temp_index.assert_has_calls( + [ + call("aggregations", {}), + call( + "schema", + { + "_subjects_count": {"type": "integer", "description": ""}, + "study_description": {"type": "string", "description": ""}, + }, + ), + ], + any_order=True, + ) + + +@pytest.mark.asyncio +async def test_populate_config(): + with patch("mds.agg_mds.datastore.client", AsyncMock()) as mock_datastore: + with NamedTemporaryFile(mode="w+", delete=False) as fp: + json.dump( + { + "configuration": { + "schema": { + "_subjects_count": {"type": "array"}, + "study_description": {}, + }, + }, + "gen3_commons": { + "mycommons": { + "mds_url": "http://mds", + "commons_url": "http://commons", + "columns_to_fields": { + "short_name": "name", + "full_name": "full_name", + "_subjects_count": "_subjects_count", + "study_id": "study_id", + "_unique_id": "_unique_id", + "study_description": "study_description", + }, + }, + }, + "adapter_commons": { + "non-gen3": { + "mds_url": "http://non-gen3", + "commons_url": "non-gen3", + "adapter": "icpsr", + } + }, + }, + fp, + ) + config = parse_config_from_file(Path(fp.name)) + await populate_config(config) + mock_datastore.update_config_info.called_with(["_subjects_count"]) + + @respx.mock @pytest.mark.asyncio async def test_populate_main(): From bcf0d900f2ce7838142093c13d81f32efc3ba927 Mon Sep 17 00:00:00 2001 From: tianj7 Date: Thu, 11 Aug 2022 15:39:23 -0500 Subject: [PATCH 34/70] add unit tests --- tests/test_populate.py | 89 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 89 insertions(+) diff --git a/tests/test_populate.py b/tests/test_populate.py index 9f96bda3..9a064d17 100644 --- a/tests/test_populate.py +++ b/tests/test_populate.py @@ -86,6 +86,54 @@ async def test_populate_metadata(): ) +@pytest.mark.asyncio +async def test_populate_metadata_to_temp_index(): + with patch.object( + datastore, "update_metadata_to_temp_index", AsyncMock() + ) as mock_update: + await populate_metadata( + "my_commons", + 
MDSInstance( + mds_url="http://mds", + commons_url="http://commons", + columns_to_fields={"column1": "field1"}, + ), + { + "id1": { + "gen3_discovery": { + "column1": "some data", + "tags": [{"category": "my_category", "name": "my_name"}], + } + } + }, + True, + ) + + mock_update.assert_called_with( + "my_commons", + [ + { + "id1": { + "gen3_discovery": { + "column1": "some data", + "tags": [ + { + "category": "my_category", + "name": "my_name", + }, + ], + "commons_name": "my_commons", + } + } + } + ], + ["id1"], + {"my_category": ["my_name"]}, + {"commons_url": "http://commons"}, + "gen3_discovery", + ) + + @pytest.mark.asyncio async def test_populate_info(): with patch("mds.agg_mds.datastore.client", AsyncMock()) as mock_datastore: @@ -233,6 +281,47 @@ async def test_populate_config(): mock_datastore.update_config_info.called_with(["_subjects_count"]) +@pytest.mark.asyncio +async def test_populate_config_to_temp_index(): + with patch("mds.agg_mds.datastore.client", AsyncMock()) as mock_datastore: + with NamedTemporaryFile(mode="w+", delete=False) as fp: + json.dump( + { + "configuration": { + "schema": { + "_subjects_count": {"type": "array"}, + "study_description": {}, + }, + }, + "gen3_commons": { + "mycommons": { + "mds_url": "http://mds", + "commons_url": "http://commons", + "columns_to_fields": { + "short_name": "name", + "full_name": "full_name", + "_subjects_count": "_subjects_count", + "study_id": "study_id", + "_unique_id": "_unique_id", + "study_description": "study_description", + }, + }, + }, + "adapter_commons": { + "non-gen3": { + "mds_url": "http://non-gen3", + "commons_url": "non-gen3", + "adapter": "icpsr", + } + }, + }, + fp, + ) + config = parse_config_from_file(Path(fp.name)) + await populate_config(config, True) + mock_datastore.update_config_info_to_temp_index.called_with(["_subjects_count"]) + + @respx.mock @pytest.mark.asyncio async def test_populate_main(): From 0bc38c5bd2e193278b5ce1118cc8826bad577d01 Mon Sep 17 00:00:00 2001 From: 
tianj7 Date: Thu, 11 Aug 2022 16:31:47 -0500 Subject: [PATCH 35/70] add unit tests --- tests/test_populate.py | 94 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 94 insertions(+) diff --git a/tests/test_populate.py b/tests/test_populate.py index 9a064d17..3aa3a3a0 100644 --- a/tests/test_populate.py +++ b/tests/test_populate.py @@ -240,6 +240,100 @@ async def test_populate_info_to_temp_indexes(): ) +@pytest.mark.asyncio +@respx.mock +async def test_populate_drs_info(): + mock_adapter = AsyncMock(return_value={}) + patch("mds.agg_mds.adapters.get_metadata", mock_adapter) + with patch("mds.agg_mds.datastore.client", AsyncMock()) as mock_datastore: + with NamedTemporaryFile(mode="w+", delete=False) as fp: + json.dump( + { + "configuration": { + "schema": { + "_subjects_count": {"type": "integer"}, + "study_description": {}, + }, + "settings": { + "cache_drs": True, + "drs_indexd_server": "http://test", + "timestamp_entry": True, + }, + }, + }, + fp, + ) + + json_data = [ + { + "hints": [".*dg\\.XXTS.*"], + "host": "https://mytest1.commons.io/", + "name": "DataSTAGE", + "type": "indexd", + }, + { + "hints": [".*dg\\.TSXX.*"], + "host": "https://commons2.io/index/", + "name": "Environmental DC", + "type": "indexd", + }, + ] + + respx.get("http://test/index/_dist").mock( + return_value=httpx.Response( + status_code=200, + json=json_data, + ) + ) + + config = parse_config_from_file(Path(fp.name)) + await populate_drs_info(config) + mock_datastore.update_global_info.assert_has_calls( + [ + call( + "dg.XXTS", + { + "host": "mytest1.commons.io", + "name": "DataSTAGE", + "type": "indexd", + }, + ), + call( + "dg.TSXX", + { + "host": "commons2.io", + "name": "Environmental DC", + "type": "indexd", + }, + ), + ], + any_order=True, + ) + + await populate_drs_info(config, True) + mock_datastore.update_global_info_to_temp_index.assert_has_calls( + [ + call( + "dg.XXTS", + { + "host": "mytest1.commons.io", + "name": "DataSTAGE", + "type": "indexd", + }, + ), + call( + 
"dg.TSXX", + { + "host": "commons2.io", + "name": "Environmental DC", + "type": "indexd", + }, + ), + ], + any_order=True, + ) + + @pytest.mark.asyncio async def test_populate_config(): with patch("mds.agg_mds.datastore.client", AsyncMock()) as mock_datastore: From 962ec70cc02ea1c9aa5d6bc38d17cc6e1da76d08 Mon Sep 17 00:00:00 2001 From: Craig Barnes Date: Mon, 15 Aug 2022 10:25:17 -0500 Subject: [PATCH 36/70] set default value if available and type conversion is None --- src/mds/agg_mds/adapters.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/mds/agg_mds/adapters.py b/src/mds/agg_mds/adapters.py index 63ea1d22..b835a059 100644 --- a/src/mds/agg_mds/adapters.py +++ b/src/mds/agg_mds/adapters.py @@ -209,6 +209,9 @@ def mapFields(item: dict, mappings: dict, global_filters=None, schema=None) -> d field_value = FieldFilters.execute(f, field_value) if key in schema: field_value = schema[key].normalize_value(field_value) + # set to default if conversion failed and a default value is available + if field_value is None and hasDefaultValue: + field_value = default_value results[key] = field_value return results From 3427395ce1158cb362f8bd61efa26357f57f04cb Mon Sep 17 00:00:00 2001 From: tianj7 Date: Wed, 17 Aug 2022 17:32:28 -0500 Subject: [PATCH 37/70] remove duplicate functions, refactor code and tests --- .secrets.baseline | 6 +- README.md | 24 +- brh_all.json | 285 ------------------ docs/metadata_adapters.md | 4 +- src/mds/agg_mds/datastore/__init__.py | 16 +- .../agg_mds/datastore/elasticsearch_dao.py | 64 +--- src/mds/populate.py | 78 ++--- tests/test_agg_mds_datastore.py | 27 +- tests/test_agg_mds_elasticsearch_dao.py | 15 +- tests/test_populate.py | 131 ++------ 10 files changed, 85 insertions(+), 565 deletions(-) delete mode 100644 brh_all.json diff --git a/.secrets.baseline b/.secrets.baseline index b43b3f20..903bc6d1 100644 --- a/.secrets.baseline +++ b/.secrets.baseline @@ -3,7 +3,7 @@ "files": null, "lines": null }, - "generated_at": 
"2022-07-06T19:29:03Z", + "generated_at": "2022-08-17T22:32:12Z", "plugins_used": [ { "name": "AWSKeyDetector" @@ -70,7 +70,7 @@ { "hashed_secret": "6eae3a5b062c6d0d79f070c26e6d62486b40cb46", "is_verified": false, - "line_number": 50, + "line_number": 62, "type": "Secret Keyword" } ], @@ -78,7 +78,7 @@ { "hashed_secret": "bf7e894868fd96c11edf05ef7d23122cbfa22e7e", "is_verified": false, - "line_number": 60, + "line_number": 61, "type": "Hex High Entropy String" } ], diff --git a/README.md b/README.md index cebc79f9..addd8a7c 100644 --- a/README.md +++ b/README.md @@ -21,6 +21,18 @@ The aggregate metadata APIs and migrations are disabled by default unless `USE_A The aggregate cache is built using Elasticsearch. See the `docker-compose.yaml` file (specifically the `aggregate_migration` service) for details regarding how aggregate data is populated. +### Aggregate MDS +For local development ensure the docker container is up. + +testing populate: +```bash +python src/mds/populate.py --config +``` +view the loaded data +```bash +http://localhost:8000/aggregate/metadata?limit=1000 +``` + ## Installation Install required software: @@ -101,18 +113,6 @@ Run tests: ```bash docker-compose exec app pytest --cov=src --cov=migrations/versions tests ``` -### Aggregate MDS -For local development ensure the docker container is up. 
- -testing populate: -```bash -python src/mds/populate.py --config -``` -view the loaded data -```bash -http://localhost:8000/aggregate/metadata?limit=1000 -``` - ## Deployment diff --git a/brh_all.json b/brh_all.json deleted file mode 100644 index 8ec4fbb3..00000000 --- a/brh_all.json +++ /dev/null @@ -1,285 +0,0 @@ -{ - "configuration": { - "schema": { - "_subjects_count": { - "type": "integer" - }, - "__manifest": { - "type": "array", - "properties": { - "file_name": { - "type": "string" - }, - "file_size": { - "type": "integer" - } - } - }, - "tags": { - "type": "array" - }, - "_unique_id": {}, - "study_description": {}, - "study_id": {}, - "study_url": {}, - "project_id": {}, - "short_name": {}, - "full_name": {}, - "commons_url": {}, - "commons" : {}, - "authz": { - "type": "string" - } - }, - "settings" : { - "cache_drs" : true - } - }, - "adapter_commons": { - "IBD Commons": { - "mds_url": "https://ibdgc.datacommons.io/", - "commons_url" : "ibdgc.datacommons.io", - "adapter": "gen3", - "config" : { - "guid_type": "my_metadata", - "study_field": "my_metadata" - }, - "keep_original_fields": false, - "field_mappings" : { - "authz": "path:authz", - "tags": "path:tags", - "_unique_id": "path:_unique_id", - "study_id": "path:_unique_id", - "study_description": "path:brief_summary", - "full_name": "path:dataset_title", - "short_name": "path:dataset_code", - "commons": "NIDDK IBD Genetics Consortium Data Commons", - "study_url": "path:link", - "_subjects_count" : "path:subjects_count", - "__manifest": "path:__manifest", - "commons_url" : "ibdgc.datacommons.io" - } - }, - "BioData Catalyst": { - "mds_url": "https://gen3.biodatacatalyst.nhlbi.nih.gov/", - "commons_url" : "gen3.biodatacatalyst.nhlbi.nih.gov", - "adapter": "gen3", - "config" : { - "guid_type": "discovery_metadata", - "study_field": "gen3_discovery" - }, - "keep_original_fields": false, - "field_mappings" : { - "authz": "path:authz", - "tags": "path:tags", - "_unique_id": "path:study_id", - "study_id": 
"path:study_id", - "study_description": "path:study_description", - "full_name": "path:full_name", - "short_name": "path:short_name", - "commons": "BioData Catalyst", - "study_url": "path:dbgap_url", - "_subjects_count" : {"path":"_subjects_count", "default" : 0 }, - "__manifest": "path:__manifest", - "commons_url" : "gen3.biodatacatalyst.nhlbi.nih.gov" - } - }, - "MIDRC": { - "mds_url": "https://data.midrc.org/", - "commons_url" : "data.midrc.org", - "adapter": "gen3", - "config" : { - "guid_type": "discovery_metadata", - "study_field": "discovery_metadata" - }, - "keep_original_fields": false, - "field_mappings" : { - "authz": "path:auth_resource_path", - "tags": "path:tags", - "_unique_id": "path:_unique_id", - "study_id": "path:_publication_id", - "study_description": "path:study_description_summary", - "full_name": "path:study_name_title", - "short_name": "path:study_name_title", - "commons": "Medical Imaging and Data Resource Center (MIDRC)", - "study_url": "path:websites", - "_subjects_count" : {"path":"subjects_count", "default" : 0 }, - "__manifest": "path:__manifest", - "commons_url" : "ibdgc.datacommons.io" - } - }, - "NIAID ClinicalData": { - "mds_url": "https://accessclinicaldata.niaid.nih.gov/", - "commons_url" : "accessclinicaldata.niaid.nih.gov", - "adapter": "gen3", - "config" : { - "guid_type": "my_metadata", - "study_field": "my_metadata" - }, - "keep_original_fields": false, - "field_mappings" : { - "authz": "/open", - "tags": "path:tags", - "_unique_id": "path:_unique_id", - "study_id": "path:nct_number", - "study_description": "path:brief_summary", - "full_name": "path:title", - "short_name": "path:title", - "commons": "NCT", - "study_url": "path:publications", - "_subjects_count" : { - "path":"subjects_count", - "default_value": 0 - }, - "__manifest": "path:__manifest", - "commons_url" : "accessclinicaldata.niaid.nih.gov" - } - }, - "JCOIN": { - "mds_url": "https://jcoin.datacommons.io/", - "commons_url" : "jcoin.datacommons.io/", - 
"adapter": "gen3", - "keep_original_fields": false, - "field_mappings" : { - "authz": "path:authz", - "tags": "path:tags", - "_unique_id": "path:project_number", - "study_id": "path:project_number", - "study_description": "path:study_description_summary", - "full_name": "path:project_title", - "short_name": "path:project_title", - "commons": "JCOIN", - "study_url": "path:publications", - "data_availability": "path:data_availability", - "_subjects_count" : { - "path":"subjects", - "default_value": 0 - }, - "__manifest": "path:__manifest", - "commons_url" : "jcoin.datacommons.io" - } - }, - "AnVIL": { - "mds_url": "https://internalstaging.theanvil.io/", - "commons_url" : "gen3.theanvil.io", - "config" : { - "guid_type": "discovery_metadata", - "study_field": "gen3_discovery" - }, - "adapter": "gen3", - "keep_original_fields": false, - "field_mappings" : { - "authz": "path:authz", - "tags": "path:tags", - "_unique_id": "path:study_id", - "study_id": "path:study_id", - "study_description": "path:study_description", - "full_name": "path:full_name", - "short_name": "path:short_name", - "commons": "AnVIL", - "study_url": "path:publications", - "_subjects_count" : {"path":"_subjects_count", "default" : 0 }, - "__manifest": "path:__manifest", - "commons_url" : "gen3.theanvil.io" - } - }, - "Genomic Data Commons": { - "mds_url": "https://brh.data-commons.org/", - "commons_url" : "portal.gdc.cancer.gov", - "config" : { - "guid_type": "discovery_metadata", - "study_field": "gen3_discovery", - "filters": "gen3_discovery.commons=Genomic Data Commons" - }, - "adapter": "gen3", - "keep_original_fields": false, - "field_mappings" : { - "authz": "path:authz", - "tags": "path:tags", - "_unique_id": "path:_unique_id", - "study_id": "path:_unique_id", - "study_description": "path:study_description", - "full_name": "path:full_name", - "short_name": "N/A", - "commons": "GDC", - "study_url": "path:publications", - "_subjects_count" : {"path":"subjects_count", "default" : 0 }, - 
"__manifest": "path:__manifest", - "commons_url" : "brh.data-commons.org" - } - }, - "Proteomic Data Commons": { - "mds_url": "https://brh.data-commons.org/", - "commons_url" : "proteomic.datacommons.cancer.gov/pdc", - "config" : { - "guid_type": "discovery_metadata", - "study_field": "gen3_discovery", - "filters": "gen3_discovery.commons=Proteomic Data Commons" - }, - "adapter": "gen3", - "keep_original_fields": false, - "field_mappings" : { - "authz": "path:authz", - "tags": "path:tags", - "_unique_id": "path:_unique_id", - "study_id": "path:_unique_id", - "study_description": "path:study_description", - "full_name": "path:full_name", - "short_name": "N/A", - "commons": "Proteomic Data Commons", - "study_url": "path:publications", - "_subjects_count" : {"path":"subjects_count", "default" : 0 }, - "__manifest": "path:__manifest", - "commons_url" : "proteomic.datacommons.cancer.gov/pdc" - } - }, - "Cancer Imaging Data Commons": { - "mds_url": "https://brh.data-commons.org/", - "commons_url" : "imaging.datacommons.cancer.gov", - "config" : { - "guid_type": "discovery_metadata", - "study_field": "gen3_discovery", - "filters": "gen3_discovery.commons=Cancer Imaging Data Commons" - }, - "adapter": "gen3", - "keep_original_fields": false, - "field_mappings" : { - "authz": "path:authz", - "tags": "path:tags", - "_unique_id": "path:_unique_id", - "study_id": "path:_unique_id", - "study_description": "path:study_description", - "full_name": "path:full_name", - "short_name": "N/A", - "commons": "Cancer Imaging Data Commons", - "study_url": "path:publications", - "_subjects_count" : {"path":"subjects_count", "default" : 0 }, - "__manifest": "path:__manifest", - "commons_url" : "imaging.datacommons.cancer.gov" - } - }, - "Kids First": { - "mds_url": "https://gen3staging.kidsfirstdrc.org/", - "commons_url": "kidsfirstdrc.org", - "adapter": "gen3", - "config" : { - "guid_type": "metadata_object", - "study_field": "dbgap" - }, - "keep_original_fields": false, - "field_mappings" 
: { - "authz": "path:authz", - "tags": "path:gen3_discovery.tags", - "_unique_id": "path:_unique_id", - "study_id": "path:_unique_id", - "study_description": "path:description", - "full_name": "path:full_name", - "short_name": "path:full_name", - "commons": "Kids First Data Resource Center", - "study_url": "path:link", - "__manifest": "path:__manifest", - "commons_url" : "kidsfirstdrc.org" - } - } - } -} diff --git a/docs/metadata_adapters.md b/docs/metadata_adapters.md index 762dd4fe..2c91c55c 100644 --- a/docs/metadata_adapters.md +++ b/docs/metadata_adapters.md @@ -131,9 +131,9 @@ def filter_function(s:str) -> str: ### Default Values Defining default values for fields is handled in one of two way: -If a field in the metdata does not need a path, simply define the +If a field in the metadata does not need a path, simply define the field name and a value. If a remote metadata field has a value, it will override the default. -If a path is use then use the longer form and set the ```default_value``` to use +If a path is used, then use the longer form and set the ```default_value``` to use if the path is not found. 
```json diff --git a/src/mds/agg_mds/datastore/__init__.py b/src/mds/agg_mds/datastore/__init__.py index b7a77c88..36c17b1f 100644 --- a/src/mds/agg_mds/datastore/__init__.py +++ b/src/mds/agg_mds/datastore/__init__.py @@ -13,8 +13,8 @@ async def init(hostname, port): await client.init(hostname, port) -async def drop_all(): - await client.drop_all() +async def drop_all_non_temp_indexes(): + await client.drop_all_non_temp_indexes() async def drop_all_temp_indexes(): @@ -48,26 +48,14 @@ async def update_metadata(*args): await client.update_metadata(*args) -async def update_metadata_to_temp_index(*args): - await client.update_metadata_to_temp_index(*args) - - async def update_global_info(*args): await client.update_global_info(*args) -async def update_global_info_to_temp_index(*args): - await client.update_global_info_to_temp_index(*args) - - async def update_config_info(*args): await client.update_config_info(*args) -async def update_config_info_to_temp_index(*args): - await client.update_config_info_to_temp_index(*args) - - async def get_commons_metadata(*args): return await client.get_commons_metadata(*args) diff --git a/src/mds/agg_mds/datastore/elasticsearch_dao.py b/src/mds/agg_mds/datastore/elasticsearch_dao.py index 12ff8318..23072505 100644 --- a/src/mds/agg_mds/datastore/elasticsearch_dao.py +++ b/src/mds/agg_mds/datastore/elasticsearch_dao.py @@ -77,7 +77,7 @@ async def init(hostname: str = "0.0.0.0", port: int = 9200): ) -async def drop_all(): +async def drop_all_non_temp_indexes(): for index in [AGG_MDS_INDEX, AGG_MDS_INFO_INDEX, AGG_MDS_CONFIG_INDEX]: res = elastic_search_client.indices.delete(index=index, ignore=[400, 404]) logger.debug(f"deleted index: {index}: {res}") @@ -189,14 +189,17 @@ async def update_metadata( tags: Dict[str, List[str]], info: Dict[str, str], study_data_field: str, + use_temp_index: bool = False, ): + index_to_update = AGG_MDS_INFO_INDEX_TEMP if use_temp_index else AGG_MDS_INFO_INDEX elastic_search_client.index( - 
index=AGG_MDS_INFO_INDEX, + index=index_to_update, doc_type=AGG_MDS_INFO_TYPE, id=name, body=info, ) + index_to_update = AGG_MDS_INDEX_TEMP if use_temp_index else AGG_MDS_INDEX for doc in data: key = list(doc.keys())[0] # Flatten out this structure @@ -204,64 +207,25 @@ async def update_metadata( try: elastic_search_client.index( - index=AGG_MDS_INDEX, doc_type=AGG_MDS_TYPE, id=key, body=doc + index=index_to_update, doc_type=AGG_MDS_TYPE, id=key, body=doc ) except Exception as ex: - print(ex) + raise (ex) -async def update_metadata_to_temp_index( - name: str, - data: List[Dict], - guid_arr: List[str], - tags: Dict[str, List[str]], - info: Dict[str, str], - study_data_field: str, -): - elastic_search_client.index( - index=AGG_MDS_INFO_INDEX_TEMP, - doc_type=AGG_MDS_INFO_TYPE, - id=name, - body=info, - ) - - for doc in data: - key = list(doc.keys())[0] - # Flatten out this structure - doc = doc[key][study_data_field] - - try: - elastic_search_client.index( - index=AGG_MDS_INDEX_TEMP, doc_type=AGG_MDS_TYPE, id=key, body=doc - ) - except Exception as ex: - print(ex) - - -async def update_global_info(key, doc) -> None: +async def update_global_info(key, doc, use_temp_index: bool = False) -> None: + index_to_update = AGG_MDS_INFO_INDEX_TEMP if use_temp_index else AGG_MDS_INFO_INDEX elastic_search_client.index( - index=AGG_MDS_INFO_INDEX, doc_type=AGG_MDS_INFO_TYPE, id=key, body=doc + index=index_to_update, doc_type=AGG_MDS_INFO_TYPE, id=key, body=doc ) -async def update_global_info_to_temp_index(key, doc) -> None: - elastic_search_client.index( - index=AGG_MDS_INFO_INDEX_TEMP, doc_type=AGG_MDS_INFO_TYPE, id=key, body=doc +async def update_config_info(doc, use_temp_index: bool = False) -> None: + index_to_update = ( + AGG_MDS_CONFIG_INDEX_TEMP if use_temp_index else AGG_MDS_CONFIG_INDEX ) - - -async def update_config_info(doc) -> None: - elastic_search_client.index( - index=AGG_MDS_CONFIG_INDEX, - doc_type="_doc", - id=AGG_MDS_INDEX, - body=doc, - ) - - -async def 
update_config_info_to_temp_index(doc) -> None: elastic_search_client.index( - index=AGG_MDS_CONFIG_INDEX_TEMP, + index=index_to_update, doc_type="_doc", id=AGG_MDS_INDEX, body=doc, diff --git a/src/mds/populate.py b/src/mds/populate.py index 631e746f..8ae5e015 100644 --- a/src/mds/populate.py +++ b/src/mds/populate.py @@ -23,7 +23,7 @@ def parse_args(argv: List[str]) -> Namespace: return known_args -async def populate_metadata(name: str, common, results, populate_to_temp_index=False): +async def populate_metadata(name: str, common, results, use_temp_index=False): mds_arr = [{k: v} for k, v in results.items()] total_items = len(mds_arr) @@ -71,11 +71,10 @@ def normalize(entry: dict) -> Any: # add to tags item_tags = entry[common.study_data_field].get("tags", {}) - if item_tags is not None: - for t in item_tags: - if t["category"] not in tags: - tags[t["category"]] = set() - tags[t["category"]].add(t["name"]) + for t in item_tags: + if t["category"] not in tags: + tags[t["category"]] = set() + tags[t["category"]].add(t["name"]) # process tags set to list for k, v in tags.items(): @@ -83,63 +82,38 @@ def normalize(entry: dict) -> Any: keys = list(results.keys()) info = {"commons_url": common.commons_url} - if not populate_to_temp_index: - await datastore.update_metadata( - name, mds_arr, keys, tags, info, common.study_data_field - ) - else: - await datastore.update_metadata_to_temp_index( - name, mds_arr, keys, tags, info, common.study_data_field - ) - - -async def populate_info(commons_config: Commons) -> None: - agg_info = { - key: value.to_dict() for key, value in commons_config.aggregations.items() - } - await datastore.update_global_info("aggregations", agg_info) - if commons_config.configuration.schema: - json_schema = { - k: v.to_schema(all_fields=True) - for k, v in commons_config.configuration.schema.items() - } - await datastore.update_global_info("schema", json_schema) - await populate_drs_info(commons_config) + await datastore.update_metadata( + name, 
mds_arr, keys, tags, info, common.study_data_field, use_temp_index + ) -async def populate_info_to_temp_index(commons_config: Commons) -> None: +async def populate_info(commons_config: Commons, use_temp_index=False) -> None: agg_info = { key: value.to_dict() for key, value in commons_config.aggregations.items() } - await datastore.update_global_info_to_temp_index("aggregations", agg_info) + await datastore.update_global_info("aggregations", agg_info, use_temp_index) if commons_config.configuration.schema: json_schema = { k: v.to_schema(all_fields=True) for k, v in commons_config.configuration.schema.items() } - await datastore.update_global_info_to_temp_index("schema", json_schema) - await populate_drs_info(commons_config, populate_to_temp_index=True) + await datastore.update_global_info("schema", json_schema, use_temp_index) + await populate_drs_info(commons_config, use_temp_index) -async def populate_drs_info( - commons_config: Commons, populate_to_temp_index=False -) -> None: +async def populate_drs_info(commons_config: Commons, use_temp_index=False) -> None: if commons_config.configuration.settings.cache_drs: server = commons_config.configuration.settings.drs_indexd_server if server is not None: drs_data = adapters.get_metadata("drs_indexd", server, None) + for id, entry in drs_data.get("cache", {}).items(): - if not populate_to_temp_index: - await datastore.update_global_info(id, entry) - else: - await datastore.update_global_info_to_temp_index(id, entry) + await datastore.update_global_info(id, entry, use_temp_index) -async def populate_config( - commons_config: Commons, populate_to_temp_index=False -) -> None: +async def populate_config(commons_config: Commons, use_temp_index=False) -> None: array_definition = { "array": [ field @@ -147,10 +121,8 @@ async def populate_config( if value.type == "array" ] } - if not populate_to_temp_index: - await datastore.update_config_info(array_definition) - else: - await 
datastore.update_config_info_to_temp_index(array_definition) + + await datastore.update_config_info(array_definition, use_temp_index) async def main(commons_config: Commons) -> None: @@ -205,9 +177,7 @@ async def main(commons_config: Commons) -> None: logger.info(f"Received {len(results)} from {name}") if len(results) > 0: mdsCount += len(results) - await populate_metadata( - name, common, results, populate_to_temp_index=True - ) + await populate_metadata(name, common, results, use_temp_index=True) for name, common in commons_config.adapter_commons.items(): logger.info(f"Populating {name} using adapter: {common.adapter}") @@ -225,17 +195,15 @@ async def main(commons_config: Commons) -> None: logger.info(f"Received {len(results)} from {name}") if len(results) > 0: mdsCount += len(results) - await populate_metadata( - name, common, results, populate_to_temp_index=True - ) + await populate_metadata(name, common, results, use_temp_index=True) if mdsCount == 0: raise ValueError("Could not obtain any metadata from any adapters.") # populate global information index - await populate_info_to_temp_index(commons_config) + await populate_info(commons_config, use_temp_index=True) # populate array index information to support guppy - await populate_config(commons_config, populate_to_temp_index=True) + await populate_config(commons_config, use_temp_index=True) except Exception as ex: logger.error( @@ -247,7 +215,7 @@ async def main(commons_config: Commons) -> None: logger.info(f"Temp indexes populated successfully. 
Proceeding to clone") # All temp indexes created without error, drop current real index, clone temp to real index and then drop temp index try: - await datastore.drop_all() # TODO: rename indexes to old + await datastore.drop_all_non_temp_indexes() # TODO: rename indexes to old await datastore.create_indexes(commons_mapping=field_mapping) await datastore.clone_temp_indexes_to_real_indexes() await datastore.drop_all_temp_indexes() diff --git a/tests/test_agg_mds_datastore.py b/tests/test_agg_mds_datastore.py index fcd46eba..411d4320 100644 --- a/tests/test_agg_mds_datastore.py +++ b/tests/test_agg_mds_datastore.py @@ -16,10 +16,10 @@ async def test_init(): @pytest.mark.asyncio -async def test_drop_all(): +async def test_drop_all_non_temp_indexes(): with patch("mds.agg_mds.datastore.client", AsyncMock()) as mock_client: - await datastore.drop_all() - mock_client.drop_all.assert_called_with() + await datastore.drop_all_non_temp_indexes() + mock_client.drop_all_non_temp_indexes.assert_called_with() @pytest.mark.asyncio @@ -71,13 +71,6 @@ async def test_update_metadata(): mock_client.update_metadata.assert_called_with() -@pytest.mark.asyncio -async def test_update_metadata_to_temp_index(): - with patch("mds.agg_mds.datastore.client", AsyncMock()) as mock_client: - await datastore.update_metadata_to_temp_index() - mock_client.update_metadata_to_temp_index.assert_called_with() - - @pytest.mark.asyncio async def test_update_global_info(): with patch("mds.agg_mds.datastore.client", AsyncMock()) as mock_client: @@ -85,13 +78,6 @@ async def test_update_global_info(): mock_client.update_global_info.assert_called_with() -@pytest.mark.asyncio -async def test_update_global_info_to_temp_index(): - with patch("mds.agg_mds.datastore.client", AsyncMock()) as mock_client: - await datastore.update_global_info_to_temp_index() - mock_client.update_global_info_to_temp_index.assert_called_with() - - @pytest.mark.asyncio async def test_update_config_info(): with 
patch("mds.agg_mds.datastore.client", AsyncMock()) as mock_client: @@ -99,13 +85,6 @@ async def test_update_config_info(): mock_client.update_config_info.assert_called_with() -@pytest.mark.asyncio -async def test_update_config_info_to_temp_index(): - with patch("mds.agg_mds.datastore.client", AsyncMock()) as mock_client: - await datastore.update_config_info_to_temp_index() - mock_client.update_config_info_to_temp_index.assert_called_with() - - @pytest.mark.asyncio async def test_get_commons_metadata(): with patch("mds.agg_mds.datastore.client", AsyncMock()) as mock_client: diff --git a/tests/test_agg_mds_elasticsearch_dao.py b/tests/test_agg_mds_elasticsearch_dao.py index 155bd408..2a8faf67 100644 --- a/tests/test_agg_mds_elasticsearch_dao.py +++ b/tests/test_agg_mds_elasticsearch_dao.py @@ -55,12 +55,12 @@ async def test_init(): @pytest.mark.asyncio -async def test_drop_all(): +async def test_drop_all_non_temp_indexes(): with patch( "mds.agg_mds.datastore.elasticsearch_dao.elastic_search_client.indices", MagicMock(), ) as mock_indices: - await elasticsearch_dao.drop_all() + await elasticsearch_dao.drop_all_non_temp_indexes() mock_indices.delete.assert_has_calls( [ call(index=AGG_MDS_INDEX, ignore=[400, 404]), @@ -184,7 +184,7 @@ async def test_create_if_exists(): ) ), ) as mock_indices: - await elasticsearch_dao.drop_all() + await elasticsearch_dao.drop_all_non_temp_indexes() await elasticsearch_dao.create_indexes(COMMON_MAPPING) @@ -249,7 +249,7 @@ async def test_update_metadata_to_temp_index(): "mds.agg_mds.datastore.elasticsearch_dao.elastic_search_client.index", MagicMock(), ) as mock_index: - await elasticsearch_dao.update_metadata_to_temp_index( + await elasticsearch_dao.update_metadata( "my_commons", [ { @@ -266,6 +266,7 @@ async def test_update_metadata_to_temp_index(): {}, {}, "gen3_discovery", + use_temp_index=True, ) mock_index.assert_has_calls( [ @@ -305,7 +306,9 @@ async def test_update_global_info_to_temp_index(): 
"mds.agg_mds.datastore.elasticsearch_dao.elastic_search_client", MagicMock(), ) as mock_client: - await elasticsearch_dao.update_global_info_to_temp_index(key="123", doc={}) + await elasticsearch_dao.update_global_info( + key="123", doc={}, use_temp_index=True + ) mock_client.index.assert_called_with( index=AGG_MDS_INFO_INDEX_TEMP, doc_type=AGG_MDS_INFO_TYPE, id="123", body={} @@ -331,7 +334,7 @@ async def test_update_config_info_to_temp_index(): "mds.agg_mds.datastore.elasticsearch_dao.elastic_search_client", MagicMock(), ) as mock_client: - await elasticsearch_dao.update_config_info_to_temp_index(doc={}) + await elasticsearch_dao.update_config_info(doc={}, use_temp_index=True) mock_client.index.assert_called_with( index=AGG_MDS_CONFIG_INDEX_TEMP, doc_type="_doc", id=AGG_MDS_INDEX, body={} diff --git a/tests/test_populate.py b/tests/test_populate.py index 3aa3a3a0..75869e73 100644 --- a/tests/test_populate.py +++ b/tests/test_populate.py @@ -9,7 +9,6 @@ main, filter_entries, populate_info, - populate_info_to_temp_index, populate_drs_info, populate_config, ) @@ -83,54 +82,7 @@ async def test_populate_metadata(): {"my_category": ["my_name"]}, {"commons_url": "http://commons"}, "gen3_discovery", - ) - - -@pytest.mark.asyncio -async def test_populate_metadata_to_temp_index(): - with patch.object( - datastore, "update_metadata_to_temp_index", AsyncMock() - ) as mock_update: - await populate_metadata( - "my_commons", - MDSInstance( - mds_url="http://mds", - commons_url="http://commons", - columns_to_fields={"column1": "field1"}, - ), - { - "id1": { - "gen3_discovery": { - "column1": "some data", - "tags": [{"category": "my_category", "name": "my_name"}], - } - } - }, - True, - ) - - mock_update.assert_called_with( - "my_commons", - [ - { - "id1": { - "gen3_discovery": { - "column1": "some data", - "tags": [ - { - "category": "my_category", - "name": "my_name", - }, - ], - "commons_name": "my_commons", - } - } - } - ], - ["id1"], - {"my_category": ["my_name"]}, - 
{"commons_url": "http://commons"}, - "gen3_discovery", + False, ) @@ -174,66 +126,14 @@ async def test_populate_info(): await populate_info(config) mock_datastore.update_global_info.assert_has_calls( [ - call("aggregations", {}), - call( - "schema", - { - "_subjects_count": {"type": "integer", "description": ""}, - "study_description": {"type": "string", "description": ""}, - }, - ), - ], - any_order=True, - ) - - -@pytest.mark.asyncio -async def test_populate_info_to_temp_indexes(): - with patch("mds.agg_mds.datastore.client", AsyncMock()) as mock_datastore: - with NamedTemporaryFile(mode="w+", delete=False) as fp: - json.dump( - { - "configuration": { - "schema": { - "_subjects_count": {"type": "integer"}, - "study_description": {}, - }, - }, - "gen3_commons": { - "mycommons": { - "mds_url": "http://mds", - "commons_url": "http://commons", - "columns_to_fields": { - "short_name": "name", - "full_name": "full_name", - "_subjects_count": "_subjects_count", - "study_id": "study_id", - "_unique_id": "_unique_id", - "study_description": "study_description", - }, - }, - }, - "adapter_commons": { - "non-gen3": { - "mds_url": "http://non-gen3", - "commons_url": "non-gen3", - "adapter": "icpsr", - } - }, - }, - fp, - ) - config = parse_config_from_file(Path(fp.name)) - await populate_info_to_temp_index(config) - mock_datastore.update_global_info_to_temp_index.assert_has_calls( - [ - call("aggregations", {}), + call("aggregations", {}, False), call( "schema", { "_subjects_count": {"type": "integer", "description": ""}, "study_description": {"type": "string", "description": ""}, }, + False, ), ], any_order=True, @@ -297,6 +197,7 @@ async def test_populate_drs_info(): "name": "DataSTAGE", "type": "indexd", }, + False, ), call( "dg.TSXX", @@ -305,13 +206,14 @@ async def test_populate_drs_info(): "name": "Environmental DC", "type": "indexd", }, + False, ), ], any_order=True, ) await populate_drs_info(config, True) - 
mock_datastore.update_global_info_to_temp_index.assert_has_calls( + mock_datastore.update_global_info.assert_has_calls( [ call( "dg.XXTS", @@ -320,6 +222,7 @@ async def test_populate_drs_info(): "name": "DataSTAGE", "type": "indexd", }, + True, ), call( "dg.TSXX", @@ -328,6 +231,7 @@ async def test_populate_drs_info(): "name": "Environmental DC", "type": "indexd", }, + True, ), ], any_order=True, @@ -413,7 +317,9 @@ async def test_populate_config_to_temp_index(): ) config = parse_config_from_file(Path(fp.name)) await populate_config(config, True) - mock_datastore.update_config_info_to_temp_index.called_with(["_subjects_count"]) + mock_datastore.update_config_info.called_with( + ["_subjects_count"], use_temp_index=True + ) @respx.mock @@ -427,18 +333,15 @@ async def test_populate_main(): patch("mds.config.USE_AGG_MDS", True).start() patch.object(datastore, "init", AsyncMock()).start() - patch.object(datastore, "drop_all", AsyncMock()).start() + patch.object(datastore, "drop_all_non_temp_indexes", AsyncMock()).start() patch.object(datastore, "drop_all_temp_indexes", AsyncMock()).start() patch.object(datastore, "create_indexes", AsyncMock()).start() patch.object(datastore, "create_temp_indexes", AsyncMock()).start() patch.object(datastore, "update_config_info", AsyncMock()).start() - patch.object(datastore, "update_config_info_to_temp_index", AsyncMock()).start() patch.object(datastore, "get_status", AsyncMock(return_value="OK")).start() patch.object(datastore, "close", AsyncMock()).start() patch.object(datastore, "update_global_info", AsyncMock()).start() - patch.object(datastore, "update_global_info_to_temp_index", AsyncMock()).start() patch.object(datastore, "update_metadata", AsyncMock()).start() - patch.object(datastore, "update_metadata_to_temp_index", AsyncMock()).start() patch.object(adapters, "get_metadata", MagicMock()).start() patch.object(datastore, "clone_temp_indexes_to_real_indexes", AsyncMock()).start() @@ -513,11 +416,11 @@ async def 
test_populate_main_fail(): patch.object(datastore, "drop_all_temp_indexes", AsyncMock()).start() patch.object(datastore, "create_indexes", AsyncMock()).start() patch.object(datastore, "create_temp_indexes", AsyncMock()).start() - patch.object(datastore, "update_config_info_to_temp_index", AsyncMock()).start() + patch.object(datastore, "update_config_info", AsyncMock()).start() patch.object(datastore, "get_status", AsyncMock(return_value="OK")).start() patch.object(datastore, "close", AsyncMock()).start() - patch.object(datastore, "update_global_info_to_temp_index", AsyncMock()).start() - patch.object(datastore, "update_metadata_to_temp_index", AsyncMock()).start() + patch.object(datastore, "update_global_info", AsyncMock()).start() + patch.object(datastore, "update_metadata", AsyncMock()).start() patch.object(adapters, "get_metadata", MagicMock()).start() patch.object(datastore, "clone_temp_indexes_to_real_indexes", AsyncMock()).start() @@ -550,7 +453,7 @@ def wipe_return_value(mock: AsyncMock): drop_all_indexes_mock = AsyncMock( side_effect=wipe_return_value(get_all_metadata_mock) ) - patch.object(datastore, "drop_all", drop_all_indexes_mock).start() + patch.object(datastore, "drop_all_non_temp_indexes", drop_all_indexes_mock).start() respx.get( "http://testfail/ok//mds/metadata?data=True&_guid_type=discovery_metadata&limit=1000&offset=0" @@ -607,7 +510,7 @@ def wipe_return_value(mock: AsyncMock): # Unable to update temp index, raise exception patch.object( datastore, - "update_metadata_to_temp_index", + "update_metadata", AsyncMock(side_effect=Exception("Unable")), ).start() with pytest.raises(Exception): From 9f171260eb552233402b6e9666f0057d115df904 Mon Sep 17 00:00:00 2001 From: Craig Barnes Date: Thu, 18 Aug 2022 12:17:12 -0500 Subject: [PATCH 38/70] remove /aggregate/metadata_paged merge functionality to /aggregate/metadata --- src/mds/agg_mds/query.py | 53 ++++++++++++++++--------------------- tests/test_agg_mds_query.py | 10 +++---- 2 files changed, 28 
insertions(+), 35 deletions(-) diff --git a/src/mds/agg_mds/query.py b/src/mds/agg_mds/query.py index 62774ec5..13c46da3 100644 --- a/src/mds/agg_mds/query.py +++ b/src/mds/agg_mds/query.py @@ -30,20 +30,36 @@ async def get_commons(what: str): ) -@mod.get("/aggregate/metadata_paged") +@mod.get("/aggregate/metadata") async def metadata( _: Request, limit: int = Query( 20, description="Maximum number of records returned. (max: 2000)" ), offset: int = Query(0, description="Return results at this given offset."), + counts: str = Query( + "", description="Return count of a field instead of the value." + ), flatten: bool = Query( - True, description="Return the results without grouping items by commons." + False, description="Return the results without grouping items by commons." + ), + pagination: bool = Query( + False, description="If true will return a pagination object in the response" ), ): """ - Returns all metadata from all registered commons in the form: + The pagination option adds a pagination object to the response: { + "commonA" : { + ... Metadata + }, + "commonB" : { + ... Metadata + } + ... + } + + { results: { "commonA" : { ... Metadata @@ -68,35 +84,12 @@ async def metadata( } ... }, - """ - return await datastore.get_all_metadata(limit, offset, None, flatten) - -@mod.get("/aggregate/metadata") -async def metadata( - _: Request, - limit: int = Query( - 20, description="Maximum number of records returned. (max: 2000)" - ), - offset: int = Query(0, description="Return results at this given offset."), - counts: str = Query( - "", description="Return count of a field instead of the value." - ), -): - """ - Returns all metadata from all registered commons in the form: - { - "commonA" : { - ... Metadata - }, - "commonB" : { - ... Metadata - } - ... 
- } """ - results = await datastore.get_all_metadata(limit, offset, counts, False) - return results.get("results", {}) + results = await datastore.get_all_metadata(limit, offset, counts, flatten) + if pagination is False: + return results.get("results", {}) + return results @mod.get("/aggregate/metadata/{name}") diff --git a/tests/test_agg_mds_query.py b/tests/test_agg_mds_query.py index e17ef955..42ffd0fa 100644 --- a/tests/test_agg_mds_query.py +++ b/tests/test_agg_mds_query.py @@ -69,10 +69,10 @@ async def test_aggregate_metadata_paged(client): with patch.object( datastore, "get_all_metadata", AsyncMock(return_value={"results": []}) ) as datastore_mock: - resp = client.get("/aggregate/metadata_paged") + resp = client.get("/aggregate/metadata?pagination=1&flatten=1") assert resp.status_code == 200 assert resp.json() == {"results": []} - datastore.get_all_metadata.assert_called_with(20, 0, None, True) + datastore.get_all_metadata.assert_called_with(20, 0, "", True) mock_data = { "results": [ @@ -85,10 +85,10 @@ async def test_aggregate_metadata_paged(client): with patch.object( datastore, "get_all_metadata", AsyncMock(return_value=mock_data) ) as datastore_mock: - resp = client.get("/aggregate/metadata_paged") + resp = client.get("/aggregate/metadata?pagination=1&flatten=1") assert resp.status_code == 200 assert resp.json() == mock_data - datastore.get_all_metadata.assert_called_with(20, 0, None, True) + datastore.get_all_metadata.assert_called_with(20, 0, "", True) @pytest.mark.asyncio @@ -173,7 +173,7 @@ async def test_aggregate_metadata_paged_flat(client): "mds.agg_mds.datastore.elasticsearch_dao.elastic_search_client.search", MagicMock(return_value=mock_data), ) as search: - resp = client.get("/aggregate/metadata_paged?flatten=true") + resp = client.get("/aggregate/metadata?flatten=true&pagination=true") assert resp.status_code == 200 assert resp.json() == results From b1d14805de0b6f2a69c9d51da121f45f2128a18e Mon Sep 17 00:00:00 2001 From: Craig Barnes Date: 
Thu, 18 Aug 2022 17:11:57 -0500 Subject: [PATCH 39/70] add mapping.ignore_malformed: True, to ES settings --- src/mds/agg_mds/datastore/elasticsearch_dao.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/mds/agg_mds/datastore/elasticsearch_dao.py b/src/mds/agg_mds/datastore/elasticsearch_dao.py index 23072505..502de88e 100644 --- a/src/mds/agg_mds/datastore/elasticsearch_dao.py +++ b/src/mds/agg_mds/datastore/elasticsearch_dao.py @@ -34,6 +34,7 @@ SEARCH_CONFIG = { "settings": { "index": { + "mapping.ignore_malformed": True, "number_of_shards": 1, "number_of_replicas": 0, "analysis": { From a4b193c10eeeb64d0de25ea8e2a79a035480abf6 Mon Sep 17 00:00:00 2001 From: Craig Barnes Date: Fri, 19 Aug 2022 11:34:43 -0500 Subject: [PATCH 40/70] add Schema and Gen3 Adapter documentation --- .secrets.baseline | 4 +- docs/metadata_adapters.md | 225 ++++++++++++++++++++++++-- docs/sample_aggregate_mds_config.json | 17 +- 3 files changed, 229 insertions(+), 17 deletions(-) diff --git a/.secrets.baseline b/.secrets.baseline index 903bc6d1..4da32422 100644 --- a/.secrets.baseline +++ b/.secrets.baseline @@ -3,7 +3,7 @@ "files": null, "lines": null }, - "generated_at": "2022-08-17T22:32:12Z", + "generated_at": "2022-08-19T16:34:02Z", "plugins_used": [ { "name": "AWSKeyDetector" @@ -78,7 +78,7 @@ { "hashed_secret": "bf7e894868fd96c11edf05ef7d23122cbfa22e7e", "is_verified": false, - "line_number": 61, + "line_number": 203, "type": "Hex High Entropy String" } ], diff --git a/docs/metadata_adapters.md b/docs/metadata_adapters.md index 2c91c55c..3a3c791b 100644 --- a/docs/metadata_adapters.md +++ b/docs/metadata_adapters.md @@ -8,7 +8,9 @@ either when it is started or on-demand. The adapter assists in the ETL process to pull, cleanup and normalize metadata before it is loaded. The adapters are configurable by a JSON object which is described below. 
-![](assets/metadata-adapters-fig.png)*Adapters enable pulling metadata from a remote service* +![](assets/metadata-adapters-fig.png) + +*Adapters enable pulling metadata from a remote service* ## Introduction @@ -29,6 +31,146 @@ The adapter works in the following order of operations: ## Configuration A metadata service is configurable via a JSON object, with the following format: +```json lines + "configuration": { + "schema": { + ... + }, + "settings": { + ... + } + }, + "adapter_commons": { + Adapters Configuration + } +``` +### Schema + +The schema section is optional. It allows finer control over the Elasticsearch backend and, if defined, +will allow for schema introspection via a JSON schema. +A schema is of the form: + +```json lines + "schema": { + "__manifest": { + "description": "and array of filename (usually DRS ids and its size", + "type": "array", + "properties": { + "file_name": { + "type": "string" + }, + "file_size": { + "type": "integer" + } + } + }, + "commons_url": {}, + "short_name": { + "default" : "not_set" + }, + "tags": { + "type": "array" + }, +``` + +Each field can be defined with a data type, a description, and a default value. All are optional; the default type is ```string```. Note that any field defined in an adapter field mapping section below but NOT defined in the +schema will be added and automatically typed by Elasticsearch. The purpose of the schema is to provide a way to explicitly type fields, especially nested objects (for example ```__manifest``` above). +It also allows a default value to be defined, which an aggregate metadata field is set to if the value is not present in a metadata object.
+This also allows for introspection by returning a JSON schema form using the ```info``` API call: + +``` +http://localhost:8000/aggregate/info/schema +``` + +```json +{ + "_subjects_count": { + "type": "integer", + "description": "" + }, + "__manifest": { + "type": "array", + "properties": { + "file_name": { + "type": "string", + "description": "" + }, + "file_size": { + "type": "integer", + "description": "" + } + }, + "description": "and array of filename (usually DRS ids and its size" + }, + "tags": { + "type": "array", + "description": "" + }, + "_unique_id": { + "type": "string", + "description": "" + }, + "study_description": { + "type": "string", + "description": "" + }, + "study_id": { + "type": "string", + "description": "" + }, + "study_url": { + "type": "string", + "description": "" + }, + "project_id": { + "type": "string", + "description": "" + }, + "short_name": { + "type": "string", + "description": "", + "default": "not_set" + }, + "year": { + "type": "string", + "description": "", + "default": "not_set" + }, + "full_name": { + "type": "string", + "description": "" + }, + "commons_url": { + "type": "string", + "description": "" + }, + "commons": { + "type": "string", + "description": "" + } +} +``` + +### Settings + +#### DRS Caching +* **cache_drs** : [true|false] - if set to true, the adapter will +connect to dataguids.org and cache the DRS directory information. This information is available via the +info API endpoint: +``` +wget http://localhost:8000/aggregate/info/dg.H35L + +{ + "host": "externaldata.healdata.org", + "name": "External Data HEAL", + "type": "indexd" +} +``` + +## Adapter Configuration + +The ```adapter_commons``` section of the configuration file is used to define where the aggregate metadata service pulls data from. +There can be any number of adapters; in fact, a single Gen3 commons can be queried more than once by defining different adapter settings.
```json { @@ -76,6 +218,8 @@ A metadata service is configurable via a JSON object, with the following format: ``` *A sample configuration file* +A fully working configuration file that pulls sample data from [gen3.datacommons.io](https://gen3.datacommons.io) is available [here](sample_aggregate_mds_config.json). + Any number of adapters can be added to a configuration file as long as the key per adapter is unique. ### Parameters @@ -87,21 +231,44 @@ The parameters of an adapter are: adapter to a site: NOTE there is no checking to ensure that the correct adapters are being used. Usually, in the case of a mismatch, errors are logged and nothing is pulled. -* ```config```: an object defining any additional parameter needed for the adapter. -* ```filters```: the parameters (or filter + * ```config```: an object defining any additional parameters needed for an adapter (see Gen3 Adapter below). + * ```filters```: the parameters (or filter properties) passed to the adapter, this is adapter specific. In the -above example, the ```study_id``` parameter is selecting which study ids to +above example, the ```study_id``` parameter for the ICPSR adapter is used to select which study ids to pull from ICPSR. +#### Adapter Setting + +* **keep_original_fields** ```[true|false]``` - allows the adapter to add all of the original +fields in a study when loading. If set to true, any field already defined and processed will be updated to +the processed value. + +Sometimes a need arises to filter down entries based on a field value. The ```select_field``` +config provides a way to filter out data that does NOT match. The settings are: + +* **field_name** - the field name to filter. Note that the filter is executed +after the data has been processed, so the value needs to be the mapped or normalized name +* **field_value** - set to a string. Any fields NOT matching this value will not be added.
+ +A sample: +```"select_field": { + "field_name": "data_resource", + "field_value": "SAMHDA" + }, +``` + + + + ### Field Mappings The next section of the configuration, is the field mappings which map a field name from the remote metadata into a standard name. This process is also called normalization. The mapping is simply the name of the normalized field (what is stored in the Aggregate metadata service ) to the remote field. Think of it as ```AggMDS field = Remote Field```. While this works for simple cases, there are many instances where the field is deeper in a JSON object. To resolve this you can specify a **path selector** -### Selectors +#### Selectors A path from the start (or root) of a remote metadata field can be described using [JSON path syntax](https://tools.ietf.org/id/draft-goessner-dispatch-jsonpath-00.html). JSON path can be used by prefixing ```path:``` to a JSON path expression to the field you want to get the value for. For example, if you wanted to get the first official name in the array OverallOfficial the selection would be ```path:OverallOfficial[0].OverallOfficialName``` You can experiment with JSON paths using a [JSON Path editor](https://jsonpath.com/). -### Filters +#### Filters The above methods should allow you to pull any nested value from a metadata entry. There are also cases where the metadata needs further processing or filtering. While this can be done in Python, by writing a new or extending an existing adapter, there is also the option to apply a filter. A filter can be added to a field using the long mapping form: ```json @@ -118,10 +285,10 @@ In this case, the ```summary``` is set to a JSON object which optionally defines The filters are applied to the text value of the remote field. Furthermore, the filters are applied in the order they appear. 
The current set of filters are: -* strip_html: remove HTML tags from a text field -* strip_email: remove email addresses from a text field -* add_icpsr_source_url: creates a url to the study data on ICPSR -* add_clinical_trials_source_url: creates a url to the study on clinicaltrials.gov +* **strip_html**: remove HTML tags from a text field +* **strip_email**: remove email addresses from a text field +* **add_icpsr_source_url**: creates a url to the study data on ICPSR +* **add_clinical_trials_source_url**: creates a url to the study on clinicaltrials.gov You can add your own filters, and register them by creating a python function with the signature: ```python @@ -129,7 +296,7 @@ def filter_function(s:str) -> str: ... ``` -### Default Values +#### Default Values Defining default values for fields is handled in one of two way: If a field in the metadata does not need a path, simply define the field name and a value. If a remote metadata field has a value, it will override the default. @@ -284,7 +451,41 @@ The code above does the following: 4. Return the results -While the Adapters use Object Oriented Programming, you actually do not need to extend from the classes as long as you create a class with the above signature you should be fine. +While the Adapters support Object-Oriented Programming, you actually do not need to extend from the classes as long as you create a class with the above signature you should be fine. ### Adding your Adapter Adding your adapter and or filters to be called by the populate process is still in the design phase. Currently, this requires adding the adapter code into the source code of the Gen3 metadata-service. However, shortly we will move to a plugin-based model. + +## Gen3 Adapter +The Gen3 Adapter is used to interface and retrieve data from a Gen3 Datacommons running a metadata-service. +The configuration for the Gen3 Commons is identical to what is described above. 
The **config** section provides a +way define what _guid_type and field to read an entry from. + +### Configuring the metadata schema +Note that the Gen3 metadata is typically in this format: + +```json lines + "ds000030": { + "_guid_type": "discovery_metadata", + "gen3_discovery": { ... +``` +The ```_guid_type``` and ```gen3_discovery``` usually default to +```"discovery_metadata"``` and ```"gen3_discovery"```. However, this is not always the case. +To account for these differences you can add the following to a Gen3 adapter config section +where ```guid_type``` sets the string for ```_guid_type``` +```study_field``` set the name of the metadata filed within +the ```guid_type``` object. + +```json lines + "config" : { + "guid_type": "unregistered_discovery_metadata", + "study_field": "my_metadata" + }, +``` +this will the look for metadata entries such as: + +```json lines + "ds000030": { + "unregistered_discovery_metadata": "discovery_metadata", + "my_metadata": { ... +``` diff --git a/docs/sample_aggregate_mds_config.json b/docs/sample_aggregate_mds_config.json index 56884a1e..863d5210 100644 --- a/docs/sample_aggregate_mds_config.json +++ b/docs/sample_aggregate_mds_config.json @@ -5,6 +5,7 @@ "type": "integer" }, "__manifest": { + "description": "an array of filename (usually DRS ids and its size", "type": "array", "properties": { "file_name": { @@ -23,13 +24,18 @@ "study_id": {}, "study_url": {}, "project_id": {}, - "short_name": {}, + "short_name": { + "default" : "not_set" + }, + "year": { + "default" : "not_set" + }, "full_name": {}, "commons_url": {}, "commons" : {} }, "settings" : { - "cache_drs" : false + "cache_drs" : true } }, "adapter_commons": { @@ -47,9 +53,14 @@ "_unique_id": "path:_unique_id", "study_description": "path:summary", "full_name": "path:study_title", + "short_name": "path:short_name", + "year": "path:year", "accession_number": "path:accession_number", "commons": "Gen3 Data Commons", - "study_url": "path:link" + "study_url": { + 
"path": "link", + "default": "unknown" + } } } } From 75b06486e89b7d4bf2b5eeb3eb3f941b0a9acba9 Mon Sep 17 00:00:00 2001 From: Craig Barnes Date: Fri, 19 Aug 2022 13:57:02 -0500 Subject: [PATCH 41/70] clean up metadata adapter documentation --- .secrets.baseline | 4 ++-- docs/metadata_adapters.md | 35 ++++++++++++++++++----------------- 2 files changed, 20 insertions(+), 19 deletions(-) diff --git a/.secrets.baseline b/.secrets.baseline index 4da32422..4934125c 100644 --- a/.secrets.baseline +++ b/.secrets.baseline @@ -3,7 +3,7 @@ "files": null, "lines": null }, - "generated_at": "2022-08-19T16:34:02Z", + "generated_at": "2022-08-19T18:56:32Z", "plugins_used": [ { "name": "AWSKeyDetector" @@ -78,7 +78,7 @@ { "hashed_secret": "bf7e894868fd96c11edf05ef7d23122cbfa22e7e", "is_verified": false, - "line_number": 203, + "line_number": 204, "type": "Hex High Entropy String" } ], diff --git a/docs/metadata_adapters.md b/docs/metadata_adapters.md index 3a3c791b..da6063b7 100644 --- a/docs/metadata_adapters.md +++ b/docs/metadata_adapters.md @@ -32,11 +32,11 @@ The adapter works in the following order of operations: A metadata service is configurable via a JSON object, with the following format: ```json lines - "configuration": { - "schema": { +"configuration": { + "schema": { ... - }, - "settings": { + }, + "settings": { ... } }, @@ -46,7 +46,7 @@ A metadata service is configurable via a JSON object, with the following format: ``` ### Schema -The schema section is optional. It allows finer controls over the Elastic Search backend and if defined +The schema section is optional. It allows user to have a finer level of control over the Elastic Search backend and if defined will allow for schema introspection via a JSON schema. A schema is of the form: @@ -158,8 +158,9 @@ http://localhost:8000/aggregate/info/schema connected to dataguids.org and cache the DRS directory information. 
This information is available via the into API endpoint: ``` -wget http://localhost:8000/aggregate/info/dg.H35L - +http://localhost:8000/aggregate/info/dg.H35L +``` +``` { "host": "externaldata.healdata.org", "name": "External Data HEAL", @@ -169,8 +170,8 @@ wget http://localhost:8000/aggregate/info/dg.H35L ## Adapter Configuration -The ```adapter_commons``` section of the configuration file is used to define where the aggregate metadata service pull data from. -There can be any of adapters, in fact a single Gen3 commons can be queried more that once by defining different adapter setting. +The ```adapter_commons``` section of the configuration file is used to define where the aggregate metadata service will pull data from. +There can be any of adapters, in fact a single Gen3 commons can be queried more than once by defining different adapter setting. ```json { @@ -221,7 +222,7 @@ There can be any of adapters, in fact a single Gen3 commons can be queried more For a fully working configuration file to pull sample data from [gen3.datacommons.io](gen3.datacommon.io) is [here](sample_aggregate_mds_config.json). Any number of adapters can be added to a configuration file as long as the -key per adapter is unique. +commons name (used as a key) per adapter is unique. ### Parameters The parameters of an adapter are: @@ -251,15 +252,15 @@ after the data has been processed so the values needs to be mapped or normalized * **field_value** - set to a string. Any fields NOT matching this value will ot be added. A sample: -```"select_field": { +``` + ... + "select_field": { "field_name": "data_resource", "field_value": "SAMHDA" }, + ... ``` - - - ### Field Mappings The next section of the configuration, is the field mappings which map a field name from the remote metadata into a standard name. This process is also called normalization. The mapping is simply the name of the normalized field (what is stored in the Aggregate metadata service ) to the remote field. 
Think of it as ```AggMDS field = Remote Field```. While this works for simple cases, there are many instances where the field is deeper in a JSON object. To resolve this you can specify a **path selector** @@ -297,10 +298,10 @@ def filter_function(s:str) -> str: ``` #### Default Values -Defining default values for fields is handled in one of two way: +Defining default values for fields is handled in one of two ways: If a field in the metadata does not need a path, simply define the field name and a value. If a remote metadata field has a value, it will override the default. -If a path is used, then use the longer form and set the ```default_value``` to use +If a path is used, then use the longer form and set the ```default``` to use if the path is not found. ```json @@ -311,7 +312,7 @@ if the path is not found. "filters": [ "strip_html" ], - "default_value": "N/A" + "default": "N/A" }, ... } From 831774cff184485e58a997ca5db26b869c914731 Mon Sep 17 00:00:00 2001 From: Craig Barnes Date: Fri, 19 Aug 2022 14:02:39 -0500 Subject: [PATCH 42/70] clean up metadata adapter documentation (update title) --- docs/metadata_adapters.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/metadata_adapters.md b/docs/metadata_adapters.md index da6063b7..76b4e24c 100644 --- a/docs/metadata_adapters.md +++ b/docs/metadata_adapters.md @@ -1,4 +1,4 @@ -# Gen3 Metadata Adapters +# Configuring the Gen3 Aggregate Metadata Service and Adapters Ingesting data into an Aggregate Metadata Service from a remote Metadata Service is handled by an adapter. 
An adapter is used to interface with a From 1bc26a0826d016baf73db0efc3d40ac757a20a38 Mon Sep 17 00:00:00 2001 From: Craig Barnes Date: Wed, 24 Aug 2022 13:55:06 -0500 Subject: [PATCH 43/70] address comments from reviewers: update documentation, and tests --- docs/metadata_adapters.md | 22 ++++-- .../agg_mds/datastore/elasticsearch_dao.py | 67 ++++++++++++++----- src/mds/agg_mds/query.py | 4 +- tests/test_agg_mds_elasticsearch_dao.py | 48 ++++++++----- tests/test_agg_mds_query.py | 14 ++++ 5 files changed, 114 insertions(+), 41 deletions(-) diff --git a/docs/metadata_adapters.md b/docs/metadata_adapters.md index 76b4e24c..baa15c9c 100644 --- a/docs/metadata_adapters.md +++ b/docs/metadata_adapters.md @@ -244,7 +244,7 @@ pull from ICPSR. field in a study when loading. If set to true, any field already defined and process will be updated to the processed value. -Sometimes a need arises to filter down entries based on a field value. ```select_fields``` +Sometimes a need arises to filter entries based on a field value. ```select_fields``` config provides a way to filter out data that does NOT match. The setting are: * **field_name** - the field name to filter. Note that the filter is executed @@ -276,7 +276,7 @@ The above methods should allow you to pull any nested value from a metadata entr "summary": { "path":"description", "filters": ["strip_html"], - "default_value" : "N/A" + "default" : "N/A" } ``` In this case, the ```summary``` is set to a JSON object which optionally defines: @@ -288,8 +288,8 @@ The filters are applied to the text value of the remote field. 
Furthermore, the * **strip_html**: remove HTML tags from a text field * **strip_email**: remove email addresses from a text field -* **add_icpsr_source_url**: creates a url to the study data on ICPSR -* **add_clinical_trials_source_url**: creates a url to the study on clinicaltrials.gov +* **add_icpsr_source_url**: creates an url to the study data on ICPSR +* **add_clinical_trials_source_url**: creates an url to the study on clinicaltrials.gov You can add your own filters, and register them by creating a python function with the signature: ```python @@ -302,8 +302,18 @@ Defining default values for fields is handled in one of two ways: If a field in the metadata does not need a path, simply define the field name and a value. If a remote metadata field has a value, it will override the default. If a path is used, then use the longer form and set the ```default``` to use -if the path is not found. - +if the path is not found. The longer form of a field mapping is: +```json lines + "summary": { + "path":"description", + "filters": ["strip_html"], + "default" : "N/A" + }, +``` +where: +* ```path``` is the json path to the field +* ```filters```: list of filters to apply (optional) +* ```default```: value to set the field to if the path does not resolve (also optional) ```json { ... 
diff --git a/src/mds/agg_mds/datastore/elasticsearch_dao.py b/src/mds/agg_mds/datastore/elasticsearch_dao.py index 502de88e..8f7f9c50 100644 --- a/src/mds/agg_mds/datastore/elasticsearch_dao.py +++ b/src/mds/agg_mds/datastore/elasticsearch_dao.py @@ -1,5 +1,5 @@ from elasticsearch import Elasticsearch, exceptions as es_exceptions, helpers -from typing import List, Dict, Optional, Tuple +from typing import Any, List, Dict, Union, Optional, Tuple from math import ceil from mds import logger from mds.config import AGG_MDS_NAMESPACE, ES_RETRY_LIMIT, ES_RETRY_INTERVAL @@ -262,25 +262,29 @@ async def get_commons(): return [] -def count(value) -> int: +def count(value) -> Union[int, Any]: """ - returns the length of the value if list or dict otherwise returns 0 + Returns the length of the value if list or dict otherwise returns the value + If value is None returns 0 """ + if value is None: + return 0 if isinstance(value, dict) or isinstance(value, list): return len(value) - return 0 + return value -def process_record(record: dict, counts: Optional[str]) -> Tuple[str, dict]: +def process_record(record: dict, counts: Optional[List[str]]) -> Tuple[str, dict]: """ processed an MDS record from the search - returns the id and record, if counts is found in the record the length is returned + returns the id and record, if an entry in counts is found in the record the length is returned instead of the entry. 
""" _id = record["_id"] normalized = record["_source"] - if counts in normalized: - normalized[counts] = count(normalized[counts]) + for c in counts: + if c in normalized: + normalized[c] = count(normalized[c]) return _id, normalized @@ -290,9 +294,11 @@ async def get_all_metadata(limit, offset, counts: Optional[str] = None, flatten= offset: starting index to return counts: converts the count of the entry[count] if it is a dict or array returns: + flattend == true results : MDS results as a dict paging info + flattend == false results : { commonsA: metadata @@ -300,43 +306,72 @@ async def get_all_metadata(limit, offset, counts: Optional[str] = None, flatten= ... }, paging info + + The counts parameter provides a way to "compress" an array field to it's length. + For example: + if the record is: + {"count": [1, 2, 3, 4], "name": "my_name"} + then setting counts=count the result would be: + {"count": 4, "name": "my_name"} + + counts can take a comma separated list of field names: + { + "count": [1, 2, 3, 4], + "__manifest" : [ + { "filename": "filename1.txt", "filesize": 1000 }, + { "filename": "filename2.txt", "filesize": 5555 }, + ], + "name": "my_name" + } + + setting counts=count,__manifest the result would be: + { + "count": 4, + "__manifest" : 2, + "name": "my_name" + } + + if a counts field is not a list then it is unchanged, unless it + is null, in which case the field will be set to 0 """ try: res = elastic_search_client.search( index=AGG_MDS_INDEX, body={"size": limit, "from": offset, "query": {"match_all": {}}}, ) + hitsTotal = res["hits"]["total"] + toReduce = counts.split(",") if counts is not None else None if flatten: flat = [] for record in res["hits"]["hits"]: - id, normalized = process_record(record, counts) - flat.append({id: {"gen3_discovery": normalized}}) + rid, normalized = process_record(record, toReduce) + flat.append({rid: {"gen3_discovery": normalized}}) return { "results": flat, "pagination": { - "hits": res["hits"]["total"], + "hits": 
hitsTotal, "offset": offset, "pageSize": limit, - "pages": ceil(int(res["hits"]["total"]) / limit), + "pages": ceil(int(hitsTotal) / limit), }, } else: byCommons = { "results": {}, "pagination": { - "hits": res["hits"]["total"], + "hits": hitsTotal, "offset": offset, "pageSize": limit, - "pages": ceil(int(res["hits"]["total"]) / limit), + "pages": ceil(int(hitsTotal) / limit), }, } for record in res["hits"]["hits"]: - id, normalized = process_record(record, counts) + rid, normalized = process_record(record, toReduce) commons_name = normalized["commons_name"] if commons_name not in byCommons["results"]: byCommons["results"][commons_name] = [] byCommons["results"][commons_name].append( - {id: {"gen3_discovery": normalized}} + {rid: {"gen3_discovery": normalized}} ) return byCommons diff --git a/src/mds/agg_mds/query.py b/src/mds/agg_mds/query.py index 13c46da3..432b77fa 100644 --- a/src/mds/agg_mds/query.py +++ b/src/mds/agg_mds/query.py @@ -77,7 +77,7 @@ async def metadata( } } - The flatten option removes the commons namespace so all results are a child or results: + The flatten option removes the commons' namespace so all results are a child or results: results: { ... Metadata from commons A ... Metadata from commons B @@ -85,6 +85,8 @@ async def metadata( ... 
}, + + The counts options when applied to a """ results = await datastore.get_all_metadata(limit, offset, counts, flatten) if pagination is False: diff --git a/tests/test_agg_mds_elasticsearch_dao.py b/tests/test_agg_mds_elasticsearch_dao.py index 2a8faf67..e5637f21 100644 --- a/tests/test_agg_mds_elasticsearch_dao.py +++ b/tests/test_agg_mds_elasticsearch_dao.py @@ -12,7 +12,6 @@ AGG_MDS_INFO_INDEX_TEMP, AGG_MDS_CONFIG_INDEX_TEMP, AGG_MDS_INFO_TYPE, - AGG_MDS_CONFIG_TYPE, count, process_record, ) @@ -183,7 +182,7 @@ async def test_create_if_exists(): 400, "resource_already_exists_exception" ) ), - ) as mock_indices: + ): await elasticsearch_dao.drop_all_non_temp_indexes() await elasticsearch_dao.create_indexes(COMMON_MAPPING) @@ -193,11 +192,11 @@ async def test_create_index_raise_exception(): with patch( "mds.agg_mds.datastore.elasticsearch_dao.elastic_search_client.indices.create", MagicMock(side_effect=es_exceptions.RequestError(403, "expect_to_fail")), - ) as mock_indices: + ): try: await elasticsearch_dao.create_indexes(common_mapping=COMMON_MAPPING) except Exception as exc: - assert isinstance(exc, es_exceptions.RequestError) == True + assert isinstance(exc, es_exceptions.RequestError) is True @pytest.mark.asyncio @@ -375,7 +374,7 @@ async def test_get_commons(): with patch( "mds.agg_mds.datastore.elasticsearch_dao.elastic_search_client.search", MagicMock(side_effect=Exception("some error")), - ) as mock_search: + ): assert await elasticsearch_dao.get_commons() == [] @@ -387,17 +386,30 @@ def test_count_list(): assert count([1, 2, 3]) == 3 -def test_count_fail(): - assert count(123) == 0 +def test_count_value_number(): + assert count(123) == 123 + + +def test_count_value_string(): + assert count("imastring") == "imastring" + + +def test_count_value_none(): + assert count(None) == 0 def test_process_records(): _id = "123" - _source = {"count": [1, 2, 3, 4]} + _source = {"count": [1, 2, 3, 4], "name": "my_name"} record = {"_id": _id, "_source": _source} - id, 
normalized = process_record(record, "count") - assert id == _id - assert normalized == {"count": 4} + rid, normalized = process_record(record, ["count"]) + assert rid == _id + assert normalized == {"count": 4, "name": "my_name"} + + # test if passed dict field is not array + rid, normalized = process_record(record, ["name"]) + assert rid == _id + assert normalized == _source @pytest.mark.asyncio @@ -419,7 +431,7 @@ async def test_get_all_metadata(): with patch( "mds.agg_mds.datastore.elasticsearch_dao.elastic_search_client.search", MagicMock(side_effect=Exception("some error")), - ) as mock_search: + ): assert await elasticsearch_dao.get_all_metadata(5, 9) == {} @@ -437,7 +449,7 @@ async def test_get_all_named_commons_metadata(): with patch( "mds.agg_mds.datastore.elasticsearch_dao.elastic_search_client.search", MagicMock(side_effect=Exception("some error")), - ) as mock_search: + ): assert ( await elasticsearch_dao.get_all_named_commons_metadata("my-commons") == {} ) @@ -472,7 +484,7 @@ async def test_metadata_tags(): with patch( "mds.agg_mds.datastore.elasticsearch_dao.elastic_search_client.search", MagicMock(side_effect=Exception("some error")), - ) as mock_search: + ): assert await elasticsearch_dao.metadata_tags() == [] @@ -490,7 +502,7 @@ async def test_get_commons_attribute(): with patch( "mds.agg_mds.datastore.elasticsearch_dao.elastic_search_client.search", MagicMock(side_effect=Exception("some error")), - ) as mock_search: + ): assert await elasticsearch_dao.get_commons_attribute("my-commons") is None @@ -516,7 +528,7 @@ async def test_get_aggregations(): with patch( "mds.agg_mds.datastore.elasticsearch_dao.elastic_search_client.search", MagicMock(side_effect=Exception("some error")), - ) as mock_search: + ): assert await elasticsearch_dao.get_aggregations("my-commons") == [] @@ -535,5 +547,5 @@ async def test_get_by_guid(): with patch( "mds.agg_mds.datastore.elasticsearch_dao.elastic_search_client.get", MagicMock(side_effect=Exception("some error")), - ) 
as mock_get: - assert await elasticsearch_dao.get_by_guid("my-commons") == None + ): + assert await elasticsearch_dao.get_by_guid("my-commons") is None diff --git a/tests/test_agg_mds_query.py b/tests/test_agg_mds_query.py index 42ffd0fa..1e34d75d 100644 --- a/tests/test_agg_mds_query.py +++ b/tests/test_agg_mds_query.py @@ -260,6 +260,20 @@ async def test_aggregate_metadata_counts(client): assert resp.status_code == 200 assert resp.json() == results + # test multiple counts field + mock_data["hits"]["hits"][0]["_source"]["__manifest"] = [ + {"filename": "foo2.txt"}, + {"filename": "foo3.txt"}, + ] + with patch( + "mds.agg_mds.datastore.elasticsearch_dao.elastic_search_client.search", + MagicMock(return_value=mock_data), + ) as search: + results["Lorem ipsum"][0]["815616c0-dfsdfjjj"]["gen3_discovery"]["tags"] = 2 + resp = client.get("/aggregate/metadata?counts=__manifest,tags") + assert resp.status_code == 200 + assert resp.json() == results + @pytest.mark.asyncio async def test_aggregate_metadata_counts_null(client): From c827ebab259818d21469df62f18c9198e6cf3156 Mon Sep 17 00:00:00 2001 From: Craig Barnes Date: Wed, 24 Aug 2022 17:18:08 -0500 Subject: [PATCH 44/70] add commons_name option, update documentation --- ...data_adapters.md => aggregate_metadata.md} | 19 ++++++++++++++++++- src/mds/agg_mds/commons.py | 1 + src/mds/populate.py | 9 +++++++-- 3 files changed, 26 insertions(+), 3 deletions(-) rename docs/{metadata_adapters.md => aggregate_metadata.md} (94%) diff --git a/docs/metadata_adapters.md b/docs/aggregate_metadata.md similarity index 94% rename from docs/metadata_adapters.md rename to docs/aggregate_metadata.md index baa15c9c..dce3e4f3 100644 --- a/docs/metadata_adapters.md +++ b/docs/aggregate_metadata.md @@ -228,6 +228,10 @@ commons name (used as a key) per adapter is unique. The parameters of an adapter are: * ```mds_url```: URL of the metadata serviceAPI. 
* ```commons_url```: the URL for the homepage the metadata source + * ```commons_name``` : override the commons_name. Typically, the commons is named using the entry name for the adapter. (ICPSR in the above config file). However there are case where +using a different name is preferred. For example if one of more adapters are assigned the same name +all the entries will be added to the commons name in the aggregateMDS. This can use to have multiple adapters +pull data from the same source, but using different mappings of filtering operations. * ```adapter```: registered name of the adapter, used to bind a particular adapter to a site: NOTE there is no checking to ensure that the correct adapters are being used. Usually, in the case of a mismatch, errors are @@ -236,7 +240,8 @@ logged and nothing is pulled. * ```filters```: the parameters (or filter properties) passed to the adapter, this is adapter specific. In the above example, the ```study_id``` parameter for the ICPSR adapter is used to select which study ids to -pull from ICPSR. +pull from ICPSR. Note that adapter themselves can have filtering options, this is +provided as a backup if no other filter option is available. #### Adapter Setting @@ -500,3 +505,15 @@ this will the look for metadata entries such as: "unregistered_discovery_metadata": "discovery_metadata", "my_metadata": { ... ``` +### Advanced filtering + +The Gen3 metadata-service supports filtering as described in the documentation. The Gen3 Adapter +allows a filter option to be configs which is passed to the MDS. Specific studies can +be pulled from the MDS by defining the filters. +The filters are part of the config setting: +```json lines + "config": { + "filters": "gen3_discovery.data_resource=SAMHDA" + }, +``` +Note that this can work along with the ```guid_type``` and ```study_field```. 
diff --git a/src/mds/agg_mds/commons.py b/src/mds/agg_mds/commons.py index 76053d56..4b8b5f2f 100644 --- a/src/mds/agg_mds/commons.py +++ b/src/mds/agg_mds/commons.py @@ -216,6 +216,7 @@ class AdapterMDSInstance: study_data_field: str = "gen3_discovery" keep_original_fields: bool = True global_field_filters: List[str] = field(default_factory=list) + commons_name: Optional[str] = None @dataclass_json diff --git a/src/mds/populate.py b/src/mds/populate.py index 8ae5e015..7809121d 100644 --- a/src/mds/populate.py +++ b/src/mds/populate.py @@ -66,8 +66,13 @@ def normalize(entry: dict) -> Any: entry = normalize(entry) - # add the common field and url to the entry - entry[common.study_data_field]["commons_name"] = name + # add the common field, selecting the name or an override (i.e. commons_name) and url to the entry + + entry[common.study_data_field]["commons_name"] = ( + common.commons_name + if hasattr(common, "commons_name") and common.commons_name is not None + else name + ) # add to tags item_tags = entry[common.study_data_field].get("tags", {}) From d02d1217333b42bbfd4145b0864d75ff11b8145f Mon Sep 17 00:00:00 2001 From: Craig Barnes Date: Mon, 29 Aug 2022 15:51:26 -0500 Subject: [PATCH 45/70] add swagger documentation for aggregate api --- docs/openapi.yaml | 188 ++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 188 insertions(+) diff --git a/docs/openapi.yaml b/docs/openapi.yaml index 55c4746e..a75d7d85 100644 --- a/docs/openapi.yaml +++ b/docs/openapi.yaml @@ -632,3 +632,191 @@ paths: schema: {} description: Successful Response summary: Get Version + /aggregate/info/{what}: + get: + description: "Returns status and configuration information about aggregate metadata\ + \ service. 
Current support only 1 information type:\ + \ **schema**" + operationId: get_aggregate_info + parameters: + - in: path + required: true + schema: + title: What + type: string + name: what + description: type of information to return + responses: + '200': + description: Successful Response + content: + application/json: + schema: {} + summary: Get Config Information + tags: + - Aggregate + /aggregate/commons: + get: + description: "Returns a list of all commons with data in the aggregate metadata-service" + operationId: get_aggregate_commons + responses: + '200': + description: Successful Response + content: + application/json: + schema: + type: object + properties: + commons: + type: array + items: + type: string + example: + - commons: ["commonsA", "commonsB"] + summary: Get Commons + tags: + - Aggregate + /aggregate/tags: + get: + description: "Returns aggregate category, name and counts across all commons" + operationId: get_aggregate_tags + responses: + '200': + description: Successful Response + content: + application/json: + schema: + type: object + example: + - { + "Data Type": { + "total": 275, + "names": [ + { + "Genotype": 103, + "Clinical Phenotype": 100, + "DCC Harmonized": 24, + "WGS": 20, + "SNP/CNV Genotypes (NGS)": 6, + "RNA-Seq": 5, + "WXS": 5, + "Targeted-Capture": 3, + "miRNA-Seq": 3, + "CNV Genotypes": 2 + } + ] + } + } + summary: Get tag counts information + tags: + - Aggregate + /aggregate/metadata: + get: + description: "Returns a list of all commons with data in the aggregate metadata-service" + operationId: get_aggregate_metadata + parameters: + - in: query + name: limit + required: false + schema: + title: limit + type: integer + default: 20 + - in: query + name: offset + schema: + title: offset + type: integer + default: 0 + description: "Return results at this given offset" + - in: query + name: flatten + schema: + title: flatten + type: boolean + default: false + description: "Return the results without grouping items by commons" + - 
in: query + name: pagination + schema: + title: pagination + type: boolean + default: false + description: "If true will return a pagination object in the response" + - in: query + name: counts + schema: + title: counts + type: string + default: "" + description: "Return count of a field instead of the value if field is an array\ + \ otherwise field is unchanged. If field is null will set field to 0.\ + \ Multiple fields can be compressed by comma separating the field names:\ + \ _files,authors" + responses: + '200': + description: Successful Response + content: + application/json: + schema: + type: object + additionalProperties: true + example: + - { + "commonA" : [ { id2: { name: "bear" } } , { id3: { name: "cat" } } ], + "commonB" : [ { id200: { name: "shark" } } , { id312: { name: "bass" }} ] + } + summary: Get metadata records from aggregate metadata + tags: + - Aggregate + /aggregate/metadata/{name}: + get: + description: "Returns an array containing all the metadata entries for a single commons. There are no limit/offset parameters." 
+ operationId: get_aggregate_metadata_for_commons + responses: + '200': + description: Successful Response + content: + application/json: + schema: + type: object + additionalProperties: true + example: + - [ { id2: { name: "bear" } } , { id3: { name: "cat" } }] + summary: Get all metadata records from a commons by name + tags: + - Aggregate + /aggregate/metadata/{name}/info: + get: + description: "Returns an object containing additional information about a commons" + operationId: get_aggregate_metadata_commons_info + responses: + '200': + description: Successful Response + content: + application/json: + schema: + type: object + additionalProperties: true + example: + - { commons_url: "gen3.datacommons.io" } + summary: Get additional named commons information + tags: + - Aggregate + /aggregate/metadata/guid/{guid}: + get: + description: "Returns a metadata record by guid" + operationId: get_aggregate_metadata_guid + responses: + '200': + description: Successful Response + content: + application/json: + schema: + type: object + additionalProperties: true + example: + - { id2: { name: "bear" } } + summary: Get metadata entry by guid + tags: + - Aggregate From e67cbe0dcc15a9f28547f8aacf97da4d976ca93a Mon Sep 17 00:00:00 2001 From: Craig Barnes Date: Mon, 29 Aug 2022 15:55:28 -0500 Subject: [PATCH 46/70] update in-source documentation --- src/mds/agg_mds/datastore/elasticsearch_dao.py | 2 +- src/mds/agg_mds/query.py | 6 ++++-- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/src/mds/agg_mds/datastore/elasticsearch_dao.py b/src/mds/agg_mds/datastore/elasticsearch_dao.py index 8f7f9c50..cc2a8319 100644 --- a/src/mds/agg_mds/datastore/elasticsearch_dao.py +++ b/src/mds/agg_mds/datastore/elasticsearch_dao.py @@ -331,7 +331,7 @@ async def get_all_metadata(limit, offset, counts: Optional[str] = None, flatten= "name": "my_name" } - if a counts field is not a list then it is unchanged, unless it + if a counts field is not a list or dict then it is unchanged, 
unless it is null, in which case the field will be set to 0 """ try: diff --git a/src/mds/agg_mds/query.py b/src/mds/agg_mds/query.py index 432b77fa..b2ff097c 100644 --- a/src/mds/agg_mds/query.py +++ b/src/mds/agg_mds/query.py @@ -34,7 +34,7 @@ async def get_commons(what: str): async def metadata( _: Request, limit: int = Query( - 20, description="Maximum number of records returned. (max: 2000)" + 20, description="Maximum number of records returned. (e.g. max: 2000)" ), offset: int = Query(0, description="Return results at this given offset."), counts: str = Query( @@ -86,7 +86,9 @@ async def metadata( }, - The counts options when applied to a + The counts options when applied to an array or dictionary will replace + the field value with its length. If the field values is None it will replace it with 0. + All other type will be unchanged. """ results = await datastore.get_all_metadata(limit, offset, counts, flatten) if pagination is False: From bf88319b28c2206a8f3441b0a10fe6584a1a23ee Mon Sep 17 00:00:00 2001 From: craigrbarnes Date: Mon, 29 Aug 2022 21:44:13 +0000 Subject: [PATCH 47/70] Apply automatic documentation changes --- docs/openapi.yaml | 188 ---------------------------------------------- 1 file changed, 188 deletions(-) diff --git a/docs/openapi.yaml b/docs/openapi.yaml index a75d7d85..55c4746e 100644 --- a/docs/openapi.yaml +++ b/docs/openapi.yaml @@ -632,191 +632,3 @@ paths: schema: {} description: Successful Response summary: Get Version - /aggregate/info/{what}: - get: - description: "Returns status and configuration information about aggregate metadata\ - \ service. 
Current support only 1 information type:\ - \ **schema**" - operationId: get_aggregate_info - parameters: - - in: path - required: true - schema: - title: What - type: string - name: what - description: type of information to return - responses: - '200': - description: Successful Response - content: - application/json: - schema: {} - summary: Get Config Information - tags: - - Aggregate - /aggregate/commons: - get: - description: "Returns a list of all commons with data in the aggregate metadata-service" - operationId: get_aggregate_commons - responses: - '200': - description: Successful Response - content: - application/json: - schema: - type: object - properties: - commons: - type: array - items: - type: string - example: - - commons: ["commonsA", "commonsB"] - summary: Get Commons - tags: - - Aggregate - /aggregate/tags: - get: - description: "Returns aggregate category, name and counts across all commons" - operationId: get_aggregate_tags - responses: - '200': - description: Successful Response - content: - application/json: - schema: - type: object - example: - - { - "Data Type": { - "total": 275, - "names": [ - { - "Genotype": 103, - "Clinical Phenotype": 100, - "DCC Harmonized": 24, - "WGS": 20, - "SNP/CNV Genotypes (NGS)": 6, - "RNA-Seq": 5, - "WXS": 5, - "Targeted-Capture": 3, - "miRNA-Seq": 3, - "CNV Genotypes": 2 - } - ] - } - } - summary: Get tag counts information - tags: - - Aggregate - /aggregate/metadata: - get: - description: "Returns a list of all commons with data in the aggregate metadata-service" - operationId: get_aggregate_metadata - parameters: - - in: query - name: limit - required: false - schema: - title: limit - type: integer - default: 20 - - in: query - name: offset - schema: - title: offset - type: integer - default: 0 - description: "Return results at this given offset" - - in: query - name: flatten - schema: - title: flatten - type: boolean - default: false - description: "Return the results without grouping items by commons" - - 
in: query - name: pagination - schema: - title: pagination - type: boolean - default: false - description: "If true will return a pagination object in the response" - - in: query - name: counts - schema: - title: counts - type: string - default: "" - description: "Return count of a field instead of the value if field is an array\ - \ otherwise field is unchanged. If field is null will set field to 0.\ - \ Multiple fields can be compressed by comma separating the field names:\ - \ _files,authors" - responses: - '200': - description: Successful Response - content: - application/json: - schema: - type: object - additionalProperties: true - example: - - { - "commonA" : [ { id2: { name: "bear" } } , { id3: { name: "cat" } } ], - "commonB" : [ { id200: { name: "shark" } } , { id312: { name: "bass" }} ] - } - summary: Get metadata records from aggregate metadata - tags: - - Aggregate - /aggregate/metadata/{name}: - get: - description: "Returns an array containing all the metadata entries for a single commons. There are no limit/offset parameters." 
- operationId: get_aggregate_metadata_for_commons - responses: - '200': - description: Successful Response - content: - application/json: - schema: - type: object - additionalProperties: true - example: - - [ { id2: { name: "bear" } } , { id3: { name: "cat" } }] - summary: Get all metadata records from a commons by name - tags: - - Aggregate - /aggregate/metadata/{name}/info: - get: - description: "Returns an object containing additional information about a commons" - operationId: get_aggregate_metadata_commons_info - responses: - '200': - description: Successful Response - content: - application/json: - schema: - type: object - additionalProperties: true - example: - - { commons_url: "gen3.datacommons.io" } - summary: Get additional named commons information - tags: - - Aggregate - /aggregate/metadata/guid/{guid}: - get: - description: "Returns a metadata record by guid" - operationId: get_aggregate_metadata_guid - responses: - '200': - description: Successful Response - content: - application/json: - schema: - type: object - additionalProperties: true - example: - - { id2: { name: "bear" } } - summary: Get metadata entry by guid - tags: - - Aggregate From e41d1b51ff000e5bf3d44d2df2e1923952dfcc27 Mon Sep 17 00:00:00 2001 From: Craig Barnes Date: Tue, 30 Aug 2022 10:26:15 -0500 Subject: [PATCH 48/70] update swagger --- docs/openapi.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/openapi.yaml b/docs/openapi.yaml index a75d7d85..c7dd8509 100644 --- a/docs/openapi.yaml +++ b/docs/openapi.yaml @@ -805,7 +805,7 @@ paths: - Aggregate /aggregate/metadata/guid/{guid}: get: - description: "Returns a metadata record by guid" + description: "Returns a metadata record by GUID" operationId: get_aggregate_metadata_guid responses: '200': From 4762bdfd0f9a0263739c0087d164ae07ea74d862 Mon Sep 17 00:00:00 2001 From: craigrbarnes Date: Tue, 30 Aug 2022 15:28:37 +0000 Subject: [PATCH 49/70] Apply automatic documentation changes --- docs/openapi.yaml | 188 
---------------------------------------------- 1 file changed, 188 deletions(-) diff --git a/docs/openapi.yaml b/docs/openapi.yaml index c7dd8509..55c4746e 100644 --- a/docs/openapi.yaml +++ b/docs/openapi.yaml @@ -632,191 +632,3 @@ paths: schema: {} description: Successful Response summary: Get Version - /aggregate/info/{what}: - get: - description: "Returns status and configuration information about aggregate metadata\ - \ service. Current support only 1 information type:\ - \ **schema**" - operationId: get_aggregate_info - parameters: - - in: path - required: true - schema: - title: What - type: string - name: what - description: type of information to return - responses: - '200': - description: Successful Response - content: - application/json: - schema: {} - summary: Get Config Information - tags: - - Aggregate - /aggregate/commons: - get: - description: "Returns a list of all commons with data in the aggregate metadata-service" - operationId: get_aggregate_commons - responses: - '200': - description: Successful Response - content: - application/json: - schema: - type: object - properties: - commons: - type: array - items: - type: string - example: - - commons: ["commonsA", "commonsB"] - summary: Get Commons - tags: - - Aggregate - /aggregate/tags: - get: - description: "Returns aggregate category, name and counts across all commons" - operationId: get_aggregate_tags - responses: - '200': - description: Successful Response - content: - application/json: - schema: - type: object - example: - - { - "Data Type": { - "total": 275, - "names": [ - { - "Genotype": 103, - "Clinical Phenotype": 100, - "DCC Harmonized": 24, - "WGS": 20, - "SNP/CNV Genotypes (NGS)": 6, - "RNA-Seq": 5, - "WXS": 5, - "Targeted-Capture": 3, - "miRNA-Seq": 3, - "CNV Genotypes": 2 - } - ] - } - } - summary: Get tag counts information - tags: - - Aggregate - /aggregate/metadata: - get: - description: "Returns a list of all commons with data in the aggregate metadata-service" - operationId: 
get_aggregate_metadata - parameters: - - in: query - name: limit - required: false - schema: - title: limit - type: integer - default: 20 - - in: query - name: offset - schema: - title: offset - type: integer - default: 0 - description: "Return results at this given offset" - - in: query - name: flatten - schema: - title: flatten - type: boolean - default: false - description: "Return the results without grouping items by commons" - - in: query - name: pagination - schema: - title: pagination - type: boolean - default: false - description: "If true will return a pagination object in the response" - - in: query - name: counts - schema: - title: counts - type: string - default: "" - description: "Return count of a field instead of the value if field is an array\ - \ otherwise field is unchanged. If field is null will set field to 0.\ - \ Multiple fields can be compressed by comma separating the field names:\ - \ _files,authors" - responses: - '200': - description: Successful Response - content: - application/json: - schema: - type: object - additionalProperties: true - example: - - { - "commonA" : [ { id2: { name: "bear" } } , { id3: { name: "cat" } } ], - "commonB" : [ { id200: { name: "shark" } } , { id312: { name: "bass" }} ] - } - summary: Get metadata records from aggregate metadata - tags: - - Aggregate - /aggregate/metadata/{name}: - get: - description: "Returns an array containing all the metadata entries for a single commons. There are no limit/offset parameters." 
- operationId: get_aggregate_metadata_for_commons - responses: - '200': - description: Successful Response - content: - application/json: - schema: - type: object - additionalProperties: true - example: - - [ { id2: { name: "bear" } } , { id3: { name: "cat" } }] - summary: Get all metadata records from a commons by name - tags: - - Aggregate - /aggregate/metadata/{name}/info: - get: - description: "Returns an object containing additional information about a commons" - operationId: get_aggregate_metadata_commons_info - responses: - '200': - description: Successful Response - content: - application/json: - schema: - type: object - additionalProperties: true - example: - - { commons_url: "gen3.datacommons.io" } - summary: Get additional named commons information - tags: - - Aggregate - /aggregate/metadata/guid/{guid}: - get: - description: "Returns a metadata record by GUID" - operationId: get_aggregate_metadata_guid - responses: - '200': - description: Successful Response - content: - application/json: - schema: - type: object - additionalProperties: true - example: - - { id2: { name: "bear" } } - summary: Get metadata entry by guid - tags: - - Aggregate From 2bb38982fac6eb3ab6cc7eb1811fb20bf10eb6da Mon Sep 17 00:00:00 2001 From: Aarti Venkat Date: Thu, 1 Sep 2022 13:19:01 -0500 Subject: [PATCH 50/70] updated setuptools and poetry lock --- poetry.lock | 2 +- pyproject.toml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/poetry.lock b/poetry.lock index 919183ab..8702a6b5 100644 --- a/poetry.lock +++ b/poetry.lock @@ -901,7 +901,7 @@ python-versions = ">=3.4" [metadata] lock-version = "1.1" python-versions = "^3.9" -content-hash = "44fa6307ade0d2628c8746313b66e99abb1ca7a876b66b280f85f4fcd3de8276" +content-hash = "b578592284fd33943267e0c2ebafeb9ddce4d36d4a062577fe007928db76954f" [metadata.files] alembic = [ diff --git a/pyproject.toml b/pyproject.toml index cfed94dd..7fe300a5 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -20,7 +20,7 @@ 
authutils = "^5.0.4" cdislogging = "^1.0" click = "==7.1.*,>=7.1.2" pyyaml = "==5.4.*,>=5.4.1" -setuptools = "==52.0.*,>=52.0.0" +setuptools = "*" dataclasses-json = "==0.5.*,>=0.5.2" pytest-asyncio = "^0.15.1" jsonpath-ng = "^1.5.3" From 30c4e5c3f7ed211a3fd08ba367ebc7c02c67d609 Mon Sep 17 00:00:00 2001 From: Craig Barnes Date: Thu, 1 Sep 2022 14:23:49 -0500 Subject: [PATCH 51/70] fix typo in aggMDS documentation --- docs/aggregate_metadata.md | 2 +- docs/openapi.yaml | 188 +++++++++++++++++++++++++++++++++++++ 2 files changed, 189 insertions(+), 1 deletion(-) diff --git a/docs/aggregate_metadata.md b/docs/aggregate_metadata.md index dce3e4f3..17bdb474 100644 --- a/docs/aggregate_metadata.md +++ b/docs/aggregate_metadata.md @@ -53,7 +53,7 @@ A schema is of the form: ```json lines "schema": { "__manifest": { - "description": "and array of filename (usually DRS ids and its size", + "description": "an array of filename (usually DRS ids and its size", "type": "array", "properties": { "file_name": { diff --git a/docs/openapi.yaml b/docs/openapi.yaml index 0f672bd4..de3c88ab 100644 --- a/docs/openapi.yaml +++ b/docs/openapi.yaml @@ -632,3 +632,191 @@ paths: schema: {} description: Successful Response summary: Get Version + /aggregate/info/{what}: + get: + description: "Returns status and configuration information about aggregate metadata\ + \ service. 
Current support only 1 information type:\ + \ **schema**" + operationId: get_aggregate_info + parameters: + - in: path + required: true + schema: + title: What + type: string + name: what + description: type of information to return + responses: + '200': + description: Successful Response + content: + application/json: + schema: {} + summary: Get Config Information + tags: + - Aggregate + /aggregate/commons: + get: + description: "Returns a list of all commons with data in the aggregate metadata-service" + operationId: get_aggregate_commons + responses: + '200': + description: Successful Response + content: + application/json: + schema: + type: object + properties: + commons: + type: array + items: + type: string + example: + - commons: ["commonsA", "commonsB"] + summary: Get Commons + tags: + - Aggregate + /aggregate/tags: + get: + description: "Returns aggregate category, name and counts across all commons" + operationId: get_aggregate_tags + responses: + '200': + description: Successful Response + content: + application/json: + schema: + type: object + example: + - { + "Data Type": { + "total": 275, + "names": [ + { + "Genotype": 103, + "Clinical Phenotype": 100, + "DCC Harmonized": 24, + "WGS": 20, + "SNP/CNV Genotypes (NGS)": 6, + "RNA-Seq": 5, + "WXS": 5, + "Targeted-Capture": 3, + "miRNA-Seq": 3, + "CNV Genotypes": 2 + } + ] + } + } + summary: Get tag counts information + tags: + - Aggregate + /aggregate/metadata: + get: + description: "Returns a list of all commons with data in the aggregate metadata-service" + operationId: get_aggregate_metadata + parameters: + - in: query + name: limit + required: false + schema: + title: limit + type: integer + default: 20 + - in: query + name: offset + schema: + title: offset + type: integer + default: 0 + description: "Return results at this given offset" + - in: query + name: flatten + schema: + title: flatten + type: boolean + default: false + description: "Return the results without grouping items by commons" + - 
in: query + name: pagination + schema: + title: pagination + type: boolean + default: false + description: "If true will return a pagination object in the response" + - in: query + name: counts + schema: + title: counts + type: string + default: "" + description: "Return count of a field instead of the value if field is an array\ + \ otherwise field is unchanged. If field is null will set field to 0.\ + \ Multiple fields can be compressed by comma separating the field names:\ + \ _files,authors" + responses: + '200': + description: Successful Response + content: + application/json: + schema: + type: object + additionalProperties: true + example: + - { + "commonA" : [ { id2: { name: "bear" } } , { id3: { name: "cat" } } ], + "commonB" : [ { id200: { name: "shark" } } , { id312: { name: "bass" }} ] + } + summary: Get metadata records from aggregate metadata + tags: + - Aggregate + /aggregate/metadata/{name}: + get: + description: "Returns an array containing all the metadata entries for a single commons. There are no limit/offset parameters." 
+ operationId: get_aggregate_metadata_for_commons + responses: + '200': + description: Successful Response + content: + application/json: + schema: + type: object + additionalProperties: true + example: + - [ { id2: { name: "bear" } } , { id3: { name: "cat" } }] + summary: Get all metadata records from a commons by name + tags: + - Aggregate + /aggregate/metadata/{name}/info: + get: + description: "Returns an object containing additional information about a commons" + operationId: get_aggregate_metadata_commons_info + responses: + '200': + description: Successful Response + content: + application/json: + schema: + type: object + additionalProperties: true + example: + - { commons_url: "gen3.datacommons.io" } + summary: Get additional named commons information + tags: + - Aggregate + /aggregate/metadata/guid/{guid}: + get: + description: "Returns a metadata record by GUID" + operationId: get_aggregate_metadata_guid + responses: + '200': + description: Successful Response + content: + application/json: + schema: + type: object + additionalProperties: true + example: + - { id2: { name: "bear" } } + summary: Get metadata entry by guid + tags: + - Aggregate From 3e38c6b7b6d129722d951701092160264e26aa95 Mon Sep 17 00:00:00 2001 From: Craig Barnes Date: Mon, 12 Sep 2022 15:13:39 -0500 Subject: [PATCH 52/70] revert poetry --- poetry.lock | 128 ++++++++++++++++++++----------------------------- pyproject.toml | 1 + 2 files changed, 53 insertions(+), 76 deletions(-) diff --git a/poetry.lock b/poetry.lock index aa763288..f1a49b9f 100644 --- a/poetry.lock +++ b/poetry.lock @@ -26,8 +26,8 @@ idna = ">=2.8" sniffio = ">=1.1" [package.extras] -doc = ["packaging", "sphinx-rtd-theme", "sphinx-autodoc-typehints (>=1.2.0)"] -test = ["coverage[toml] (>=4.5)", "hypothesis (>=4.0)", "pytest (>=7.0)", "pytest-mock (>=3.6.1)", "trustme", "contextlib2", "uvloop (<0.15)", "mock (>=4)", "uvloop (>=0.15)"] +doc = ["packaging", "sphinx-autodoc-typehints (>=1.2.0)", "sphinx-rtd-theme"] +test = 
["contextlib2", "coverage[toml] (>=4.5)", "hypothesis (>=4.0)", "mock (>=4)", "pytest (>=7.0)", "pytest-mock (>=3.6.1)", "trustme", "uvloop (<0.15)", "uvloop (>=0.15)"] trio = ["trio (>=0.16)"] [[package]] @@ -39,9 +39,9 @@ optional = false python-versions = ">=3.6.0" [package.extras] -dev = ["Cython (>=0.29.24,<0.30.0)", "pytest (>=6.0)", "Sphinx (>=4.1.2,<4.2.0)", "sphinxcontrib-asyncio (>=0.3.0,<0.4.0)", "sphinx-rtd-theme (>=0.5.2,<0.6.0)", "pycodestyle (>=2.7.0,<2.8.0)", "flake8 (>=3.9.2,<3.10.0)", "uvloop (>=0.15.3)"] -docs = ["Sphinx (>=4.1.2,<4.2.0)", "sphinxcontrib-asyncio (>=0.3.0,<0.4.0)", "sphinx-rtd-theme (>=0.5.2,<0.6.0)"] -test = ["pycodestyle (>=2.7.0,<2.8.0)", "flake8 (>=3.9.2,<3.10.0)", "uvloop (>=0.15.3)"] +dev = ["Cython (>=0.29.24,<0.30.0)", "Sphinx (>=4.1.2,<4.2.0)", "flake8 (>=3.9.2,<3.10.0)", "pycodestyle (>=2.7.0,<2.8.0)", "pytest (>=6.0)", "sphinx-rtd-theme (>=0.5.2,<0.6.0)", "sphinxcontrib-asyncio (>=0.3.0,<0.4.0)", "uvloop (>=0.15.3)"] +docs = ["Sphinx (>=4.1.2,<4.2.0)", "sphinx-rtd-theme (>=0.5.2,<0.6.0)", "sphinxcontrib-asyncio (>=0.3.0,<0.4.0)"] +test = ["flake8 (>=3.9.2,<3.10.0)", "pycodestyle (>=2.7.0,<2.8.0)", "uvloop (>=0.15.3)"] [[package]] name = "atomicwrites" @@ -60,10 +60,10 @@ optional = false python-versions = ">=3.5" [package.extras] -dev = ["coverage[toml] (>=5.0.2)", "hypothesis", "pympler", "pytest (>=4.3.0)", "mypy (>=0.900,!=0.940)", "pytest-mypy-plugins", "zope.interface", "furo", "sphinx", "sphinx-notfound-page", "pre-commit", "cloudpickle"] -docs = ["furo", "sphinx", "zope.interface", "sphinx-notfound-page"] -tests = ["coverage[toml] (>=5.0.2)", "hypothesis", "pympler", "pytest (>=4.3.0)", "mypy (>=0.900,!=0.940)", "pytest-mypy-plugins", "zope.interface", "cloudpickle"] -tests_no_zope = ["coverage[toml] (>=5.0.2)", "hypothesis", "pympler", "pytest (>=4.3.0)", "mypy (>=0.900,!=0.940)", "pytest-mypy-plugins", "cloudpickle"] +dev = ["cloudpickle", "coverage[toml] (>=5.0.2)", "furo", "hypothesis", "mypy 
(>=0.900,!=0.940)", "pre-commit", "pympler", "pytest (>=4.3.0)", "pytest-mypy-plugins", "sphinx", "sphinx-notfound-page", "zope.interface"] +docs = ["furo", "sphinx", "sphinx-notfound-page", "zope.interface"] +tests = ["cloudpickle", "coverage[toml] (>=5.0.2)", "hypothesis", "mypy (>=0.900,!=0.940)", "pympler", "pytest (>=4.3.0)", "pytest-mypy-plugins", "zope.interface"] +tests_no_zope = ["cloudpickle", "coverage[toml] (>=5.0.2)", "hypothesis", "mypy (>=0.900,!=0.940)", "pympler", "pytest (>=4.3.0)", "pytest-mypy-plugins"] [[package]] name = "authlib" @@ -96,8 +96,8 @@ pyjwt = {version = ">=1.5,<2.0", extras = ["crypto"]} xmltodict = ">=0.9,<1.0" [package.extras] -flask = ["Flask (>=0.10.1)"] fastapi = ["fastapi (>=0.54.1,<0.55.0)"] +flask = ["Flask (>=0.10.1)"] [[package]] name = "backoff" @@ -121,7 +121,7 @@ webencodings = "*" [package.extras] css = ["tinycss2 (>=1.1.0,<1.2)"] -dev = ["build (==0.8.0)", "flake8 (==4.0.1)", "hashin (==0.17.0)", "pip-tools (==6.6.2)", "pytest (==7.1.2)", "Sphinx (==4.3.2)", "tox (==3.25.0)", "twine (==4.0.1)", "wheel (==0.37.1)", "black (==22.3.0)", "mypy (==0.961)"] +dev = ["Sphinx (==4.3.2)", "black (==22.3.0)", "build (==0.8.0)", "flake8 (==4.0.1)", "hashin (==0.17.0)", "mypy (==0.961)", "pip-tools (==6.6.2)", "pytest (==7.1.2)", "tox (==3.25.0)", "twine (==4.0.1)", "wheel (==0.37.1)"] [[package]] name = "cached-property" @@ -227,7 +227,7 @@ docstest = ["pyenchant (>=1.6.11)", "sphinxcontrib-spelling (>=4.0.1)", "twine ( pep8test = ["black", "flake8", "flake8-import-order", "pep8-naming"] sdist = ["setuptools_rust (>=0.11.4)"] ssh = ["bcrypt (>=3.1.5)"] -test = ["pytest (>=6.2.0)", "pytest-benchmark", "pytest-cov", "pytest-subtests", "pytest-xdist", "pretend", "iso8601", "pytz", "hypothesis (>=1.11.4,!=3.79.2)"] +test = ["hypothesis (>=1.11.4,!=3.79.2)", "iso8601", "pretend", "pytest (>=6.2.0)", "pytest-benchmark", "pytest-cov", "pytest-subtests", "pytest-xdist", "pytz"] [[package]] name = "dataclasses-json" @@ -243,7 +243,7 @@ 
marshmallow-enum = ">=1.5.1,<2.0.0" typing-inspect = ">=0.4.0" [package.extras] -dev = ["pytest (>=6.2.3)", "ipython", "mypy (>=0.710)", "hypothesis", "portray", "flake8", "simplejson", "types-dataclasses"] +dev = ["flake8", "hypothesis", "ipython", "mypy (>=0.710)", "portray", "pytest (>=6.2.3)", "simplejson", "types-dataclasses"] [[package]] name = "decorator" @@ -265,7 +265,7 @@ python-versions = ">=2.6, !=3.0.*, !=3.1.*, !=3.2.*, <4" urllib3 = ">=1.21.1" [package.extras] -develop = ["requests (>=2.0.0,<3.0.0)", "nose", "coverage", "mock", "pyyaml", "nosexcover", "numpy", "pandas", "sphinx (<1.7)", "sphinx-rtd-theme"] +develop = ["coverage", "mock", "nose", "nosexcover", "numpy", "pandas", "pyyaml", "requests (>=2.0.0,<3.0.0)", "sphinx (<1.7)", "sphinx-rtd-theme"] requests = ["requests (>=2.4.0,<3.0.0)"] [[package]] @@ -281,10 +281,10 @@ pydantic = ">=1.6.2,<1.7 || >1.7,<1.7.1 || >1.7.1,<1.7.2 || >1.7.2,<1.7.3 || >1. starlette = "0.14.2" [package.extras] -all = ["requests (>=2.24.0,<3.0.0)", "aiofiles (>=0.5.0,<0.6.0)", "jinja2 (>=2.11.2,<3.0.0)", "python-multipart (>=0.0.5,<0.0.6)", "itsdangerous (>=1.1.0,<2.0.0)", "pyyaml (>=5.3.1,<6.0.0)", "graphene (>=2.1.8,<3.0.0)", "ujson (>=4.0.1,<5.0.0)", "orjson (>=3.2.1,<4.0.0)", "email_validator (>=1.1.1,<2.0.0)", "uvicorn[standard] (>=0.12.0,<0.14.0)", "async_exit_stack (>=1.0.1,<2.0.0)", "async_generator (>=1.10,<2.0.0)"] -dev = ["python-jose[cryptography] (>=3.1.0,<4.0.0)", "passlib[bcrypt] (>=1.7.2,<2.0.0)", "autoflake (>=1.3.1,<2.0.0)", "flake8 (>=3.8.3,<4.0.0)", "uvicorn[standard] (>=0.12.0,<0.14.0)", "graphene (>=2.1.8,<3.0.0)"] -doc = ["mkdocs (>=1.1.2,<2.0.0)", "mkdocs-material (>=7.1.9,<8.0.0)", "markdown-include (>=0.6.0,<0.7.0)", "mkdocs-markdownextradata-plugin (>=0.1.7,<0.2.0)", "typer-cli (>=0.0.12,<0.0.13)", "pyyaml (>=5.3.1,<6.0.0)"] -test = ["pytest (==5.4.3)", "pytest-cov (==2.10.0)", "pytest-asyncio (>=0.14.0,<0.15.0)", "mypy (==0.812)", "flake8 (>=3.8.3,<4.0.0)", "black (==20.8b1)", "isort 
(>=5.0.6,<6.0.0)", "requests (>=2.24.0,<3.0.0)", "httpx (>=0.14.0,<0.15.0)", "email_validator (>=1.1.1,<2.0.0)", "sqlalchemy (>=1.3.18,<1.4.0)", "peewee (>=3.13.3,<4.0.0)", "databases[sqlite] (>=0.3.2,<0.4.0)", "orjson (>=3.2.1,<4.0.0)", "ujson (>=4.0.1,<5.0.0)", "async_exit_stack (>=1.0.1,<2.0.0)", "async_generator (>=1.10,<2.0.0)", "python-multipart (>=0.0.5,<0.0.6)", "aiofiles (>=0.5.0,<0.6.0)", "flask (>=1.1.2,<2.0.0)"] +all = ["aiofiles (>=0.5.0,<0.6.0)", "async_exit_stack (>=1.0.1,<2.0.0)", "async_generator (>=1.10,<2.0.0)", "email_validator (>=1.1.1,<2.0.0)", "graphene (>=2.1.8,<3.0.0)", "itsdangerous (>=1.1.0,<2.0.0)", "jinja2 (>=2.11.2,<3.0.0)", "orjson (>=3.2.1,<4.0.0)", "python-multipart (>=0.0.5,<0.0.6)", "pyyaml (>=5.3.1,<6.0.0)", "requests (>=2.24.0,<3.0.0)", "ujson (>=4.0.1,<5.0.0)", "uvicorn[standard] (>=0.12.0,<0.14.0)"] +dev = ["autoflake (>=1.3.1,<2.0.0)", "flake8 (>=3.8.3,<4.0.0)", "graphene (>=2.1.8,<3.0.0)", "passlib[bcrypt] (>=1.7.2,<2.0.0)", "python-jose[cryptography] (>=3.1.0,<4.0.0)", "uvicorn[standard] (>=0.12.0,<0.14.0)"] +doc = ["markdown-include (>=0.6.0,<0.7.0)", "mkdocs (>=1.1.2,<2.0.0)", "mkdocs-markdownextradata-plugin (>=0.1.7,<0.2.0)", "mkdocs-material (>=7.1.9,<8.0.0)", "pyyaml (>=5.3.1,<6.0.0)", "typer-cli (>=0.0.12,<0.0.13)"] +test = ["aiofiles (>=0.5.0,<0.6.0)", "async_exit_stack (>=1.0.1,<2.0.0)", "async_generator (>=1.10,<2.0.0)", "black (==20.8b1)", "databases[sqlite] (>=0.3.2,<0.4.0)", "email_validator (>=1.1.1,<2.0.0)", "flake8 (>=3.8.3,<4.0.0)", "flask (>=1.1.2,<2.0.0)", "httpx (>=0.14.0,<0.15.0)", "isort (>=5.0.6,<6.0.0)", "mypy (==0.812)", "orjson (>=3.2.1,<4.0.0)", "peewee (>=3.13.3,<4.0.0)", "pytest (==5.4.3)", "pytest-asyncio (>=0.14.0,<0.15.0)", "pytest-cov (==2.10.0)", "python-multipart (>=0.0.5,<0.0.6)", "requests (>=2.24.0,<3.0.0)", "sqlalchemy (>=1.3.18,<1.4.0)", "ujson (>=4.0.1,<5.0.0)"] [[package]] name = "gen3authz" @@ -314,11 +314,11 @@ gino-starlette = {version = ">=0.1.1,<0.2.0", optional = true, markers 
= "python SQLAlchemy = ">=1.2.16,<1.4" [package.extras] +aiohttp = ["gino-aiohttp (>=0.1.0,<0.2.0)"] quart = ["gino-quart (>=0.1.0,<0.2.0)"] sanic = ["gino-sanic (>=0.1.0,<0.2.0)"] -tornado = ["gino-tornado (>=0.1.0,<0.2.0)"] -aiohttp = ["gino-aiohttp (>=0.1.0,<0.2.0)"] starlette = ["gino-starlette (>=0.1.1,<0.2.0)"] +tornado = ["gino-tornado (>=0.1.0,<0.2.0)"] [[package]] name = "gino-starlette" @@ -401,8 +401,8 @@ rfc3986 = {version = ">=1.3,<2", extras = ["idna2008"]} sniffio = "*" [package.extras] -brotli = ["brotlicffi", "brotli"] -cli = ["click (>=8.0.0,<9.0.0)", "rich (>=10,<13)", "pygments (>=2.0.0,<3.0.0)"] +brotli = ["brotli", "brotlicffi"] +cli = ["click (>=8.0.0,<9.0.0)", "pygments (>=2.0.0,<3.0.0)", "rich (>=10,<13)"] http2 = ["h2 (>=3,<5)"] socks = ["socksio (>=1.0.0,<2.0.0)"] @@ -474,9 +474,9 @@ python-versions = ">=3.7" packaging = ">=17.0" [package.extras] -dev = ["pytest", "pytz", "simplejson", "mypy (==0.971)", "flake8 (==5.0.4)", "flake8-bugbear (==22.8.22)", "pre-commit (>=2.4,<3.0)", "tox"] -docs = ["sphinx (==5.1.1)", "sphinx-issues (==3.0.1)", "alabaster (==0.7.12)", "sphinx-version-warning (==1.1.2)", "autodocsumm (==0.2.9)"] -lint = ["mypy (==0.971)", "flake8 (==5.0.4)", "flake8-bugbear (==22.8.22)", "pre-commit (>=2.4,<3.0)"] +dev = ["flake8 (==5.0.4)", "flake8-bugbear (==22.8.22)", "mypy (==0.971)", "pre-commit (>=2.4,<3.0)", "pytest", "pytz", "simplejson", "tox"] +docs = ["alabaster (==0.7.12)", "autodocsumm (==0.2.9)", "sphinx (==5.1.1)", "sphinx-issues (==3.0.1)", "sphinx-version-warning (==1.1.2)"] +lint = ["flake8 (==5.0.4)", "flake8-bugbear (==22.8.22)", "mypy (==0.971)", "pre-commit (>=2.4,<3.0)"] tests = ["pytest", "pytz", "simplejson"] [[package]] @@ -583,6 +583,17 @@ typing-extensions = ">=4.1.0" dotenv = ["python-dotenv (>=0.10.4)"] email = ["email-validator (>=1.0.3)"] +[[package]] +name = "pydash" +version = "5.1.0" +description = "The kitchen sink of Python utility libraries for doing \"stuff\" in a functional way. 
Based on the Lo-Dash Javascript library." +category = "main" +optional = false +python-versions = ">=3.6" + +[package.extras] +dev = ["Sphinx", "black", "coverage", "docformatter", "flake8", "flake8-black", "flake8-bugbear", "flake8-isort", "invoke", "isort", "pylint", "pytest", "pytest-cov", "pytest-flake8", "pytest-pylint", "sphinx-rtd-theme", "tox", "twine", "wheel"] + [[package]] name = "pyjwt" version = "1.7.1" @@ -608,7 +619,7 @@ optional = false python-versions = ">=3.6.8" [package.extras] -diagrams = ["railroad-diagrams", "jinja2"] +diagrams = ["jinja2", "railroad-diagrams"] [[package]] name = "pytest" @@ -660,7 +671,7 @@ pytest = ">=4.6" toml = "*" [package.extras] -testing = ["fields", "hunter", "process-tests", "six", "pytest-xdist", "virtualenv"] +testing = ["fields", "hunter", "process-tests", "pytest-xdist", "six", "virtualenv"] [[package]] name = "python-dotenv" @@ -747,11 +758,11 @@ python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*" [[package]] name = "sniffio" -version = "1.3.0" +version = "1.2.0" description = "Sniff out which async library your code is running under" category = "main" optional = false -python-versions = ">=3.7" +python-versions = ">=3.5" [[package]] name = "sqlalchemy" @@ -771,7 +782,7 @@ postgresql = ["psycopg2"] postgresql_pg8000 = ["pg8000 (<1.16.6)"] postgresql_psycopg2binary = ["psycopg2-binary"] postgresql_psycopg2cffi = ["psycopg2cffi"] -pymysql = ["pymysql (<1)", "pymysql"] +pymysql = ["pymysql", "pymysql (<1)"] [[package]] name = "starlette" @@ -832,8 +843,8 @@ optional = false python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*, !=3.5.*, <4" [package.extras] -brotli = ["brotlicffi (>=0.8.0)", "brotli (>=1.0.9)", "brotlipy (>=0.6.0)"] -secure = ["pyOpenSSL (>=0.14)", "cryptography (>=1.3.4)", "idna (>=2.0.0)", "certifi", "urllib3-secure-extra", "ipaddress"] +brotli = ["brotli (>=1.0.9)", "brotlicffi (>=0.8.0)", "brotlipy (>=0.6.0)"] +secure = ["certifi", "cryptography (>=1.3.4)", "idna (>=2.0.0)", 
"ipaddress", "pyOpenSSL (>=0.14)", "urllib3-secure-extra"] socks = ["PySocks (>=1.5.6,!=1.5.7,<2.0)"] [[package]] @@ -867,9 +878,9 @@ optional = false python-versions = ">=3.7" [package.extras] -dev = ["Cython (>=0.29.24,<0.30.0)", "pytest (>=3.6.0)", "Sphinx (>=4.1.2,<4.2.0)", "sphinxcontrib-asyncio (>=0.3.0,<0.4.0)", "sphinx-rtd-theme (>=0.5.2,<0.6.0)", "aiohttp", "flake8 (>=3.9.2,<3.10.0)", "psutil", "pycodestyle (>=2.7.0,<2.8.0)", "pyOpenSSL (>=19.0.0,<19.1.0)", "mypy (>=0.800)"] -docs = ["Sphinx (>=4.1.2,<4.2.0)", "sphinxcontrib-asyncio (>=0.3.0,<0.4.0)", "sphinx-rtd-theme (>=0.5.2,<0.6.0)"] -test = ["aiohttp", "flake8 (>=3.9.2,<3.10.0)", "psutil", "pycodestyle (>=2.7.0,<2.8.0)", "pyOpenSSL (>=19.0.0,<19.1.0)", "mypy (>=0.800)"] +dev = ["Cython (>=0.29.24,<0.30.0)", "Sphinx (>=4.1.2,<4.2.0)", "aiohttp", "flake8 (>=3.9.2,<3.10.0)", "mypy (>=0.800)", "psutil", "pyOpenSSL (>=19.0.0,<19.1.0)", "pycodestyle (>=2.7.0,<2.8.0)", "pytest (>=3.6.0)", "sphinx-rtd-theme (>=0.5.2,<0.6.0)", "sphinxcontrib-asyncio (>=0.3.0,<0.4.0)"] +docs = ["Sphinx (>=4.1.2,<4.2.0)", "sphinx-rtd-theme (>=0.5.2,<0.6.0)", "sphinxcontrib-asyncio (>=0.3.0,<0.4.0)"] +test = ["aiohttp", "flake8 (>=3.9.2,<3.10.0)", "mypy (>=0.800)", "psutil", "pyOpenSSL (>=19.0.0,<19.1.0)", "pycodestyle (>=2.7.0,<2.8.0)"] [[package]] name = "watchfiles" @@ -1422,6 +1433,10 @@ pydantic = [ {file = "pydantic-1.10.1-py3-none-any.whl", hash = "sha256:f8b10e59c035ff3dcc9791619d6e6c5141e0fa5cbe264e19e267b8d523b210bf"}, {file = "pydantic-1.10.1.tar.gz", hash = "sha256:d41bb80347a8a2d51fbd6f1748b42aca14541315878447ba159617544712f770"}, ] +pydash = [ + {file = "pydash-5.1.0-py3-none-any.whl", hash = "sha256:ced4fedb163eb07fbee376e474bca74029eb9fab215614449fe13164f71dd9e3"}, + {file = "pydash-5.1.0.tar.gz", hash = "sha256:1b2b050ac1bae049cd07f5920b14fabbe52638f485d9ada1eb115a9eebff6835"}, +] pyjwt = [ {file = "PyJWT-1.7.1-py2.py3-none-any.whl", hash = 
"sha256:5c6eca3c2940464d106b99ba83b00c6add741c9becaec087fb7ccdefea71350e"}, {file = "PyJWT-1.7.1.tar.gz", hash = "sha256:8d59a976fb773f3e6a39c85636357c4f0e242707394cadadd9814f5cbaa20e96"}, @@ -1498,47 +1513,9 @@ six = [ {file = "six-1.16.0.tar.gz", hash = "sha256:1e61c37477a1626458e36f7b1d82aa5c9b094fa4802892072e49de9c60c4c926"}, ] sniffio = [ - {file = "sniffio-1.3.0-py3-none-any.whl", hash = "sha256:eecefdce1e5bbfb7ad2eeaabf7c1eeb404d7757c379bd1f7e5cce9d8bf425384"}, - {file = "sniffio-1.3.0.tar.gz", hash = "sha256:e60305c5e5d314f5389259b7f22aaa33d8f7dee49763119234af3755c55b9101"}, -] -sqlalchemy = [ - {file = "SQLAlchemy-1.3.24-cp27-cp27m-macosx_10_14_x86_64.whl", hash = "sha256:87a2725ad7d41cd7376373c15fd8bf674e9c33ca56d0b8036add2d634dba372e"}, - {file = "SQLAlchemy-1.3.24-cp27-cp27m-win32.whl", hash = "sha256:f597a243b8550a3a0b15122b14e49d8a7e622ba1c9d29776af741f1845478d79"}, - {file = "SQLAlchemy-1.3.24-cp27-cp27m-win_amd64.whl", hash = "sha256:fc4cddb0b474b12ed7bdce6be1b9edc65352e8ce66bc10ff8cbbfb3d4047dbf4"}, - {file = "SQLAlchemy-1.3.24-cp35-cp35m-macosx_10_14_x86_64.whl", hash = "sha256:f1149d6e5c49d069163e58a3196865e4321bad1803d7886e07d8710de392c548"}, - {file = "SQLAlchemy-1.3.24-cp35-cp35m-manylinux1_x86_64.whl", hash = "sha256:14f0eb5db872c231b20c18b1e5806352723a3a89fb4254af3b3e14f22eaaec75"}, - {file = "SQLAlchemy-1.3.24-cp35-cp35m-manylinux2010_x86_64.whl", hash = "sha256:e98d09f487267f1e8d1179bf3b9d7709b30a916491997137dd24d6ae44d18d79"}, - {file = "SQLAlchemy-1.3.24-cp35-cp35m-manylinux2014_aarch64.whl", hash = "sha256:fc1f2a5a5963e2e73bac4926bdaf7790c4d7d77e8fc0590817880e22dd9d0b8b"}, - {file = "SQLAlchemy-1.3.24-cp35-cp35m-win32.whl", hash = "sha256:f3c5c52f7cb8b84bfaaf22d82cb9e6e9a8297f7c2ed14d806a0f5e4d22e83fb7"}, - {file = "SQLAlchemy-1.3.24-cp35-cp35m-win_amd64.whl", hash = "sha256:0352db1befcbed2f9282e72843f1963860bf0e0472a4fa5cf8ee084318e0e6ab"}, - {file = "SQLAlchemy-1.3.24-cp36-cp36m-macosx_10_14_x86_64.whl", hash = 
"sha256:2ed6343b625b16bcb63c5b10523fd15ed8934e1ed0f772c534985e9f5e73d894"}, - {file = "SQLAlchemy-1.3.24-cp36-cp36m-manylinux1_x86_64.whl", hash = "sha256:34fcec18f6e4b24b4a5f6185205a04f1eab1e56f8f1d028a2a03694ebcc2ddd4"}, - {file = "SQLAlchemy-1.3.24-cp36-cp36m-manylinux2010_x86_64.whl", hash = "sha256:e47e257ba5934550d7235665eee6c911dc7178419b614ba9e1fbb1ce6325b14f"}, - {file = "SQLAlchemy-1.3.24-cp36-cp36m-manylinux2014_aarch64.whl", hash = "sha256:816de75418ea0953b5eb7b8a74933ee5a46719491cd2b16f718afc4b291a9658"}, - {file = "SQLAlchemy-1.3.24-cp36-cp36m-win32.whl", hash = "sha256:26155ea7a243cbf23287f390dba13d7927ffa1586d3208e0e8d615d0c506f996"}, - {file = "SQLAlchemy-1.3.24-cp36-cp36m-win_amd64.whl", hash = "sha256:f03bd97650d2e42710fbe4cf8a59fae657f191df851fc9fc683ecef10746a375"}, - {file = "SQLAlchemy-1.3.24-cp37-cp37m-macosx_10_14_x86_64.whl", hash = "sha256:a006d05d9aa052657ee3e4dc92544faae5fcbaafc6128217310945610d862d39"}, - {file = "SQLAlchemy-1.3.24-cp37-cp37m-manylinux1_x86_64.whl", hash = "sha256:1e2f89d2e5e3c7a88e25a3b0e43626dba8db2aa700253023b82e630d12b37109"}, - {file = "SQLAlchemy-1.3.24-cp37-cp37m-manylinux2010_x86_64.whl", hash = "sha256:0d5d862b1cfbec5028ce1ecac06a3b42bc7703eb80e4b53fceb2738724311443"}, - {file = "SQLAlchemy-1.3.24-cp37-cp37m-manylinux2014_aarch64.whl", hash = "sha256:0172423a27fbcae3751ef016663b72e1a516777de324a76e30efa170dbd3dd2d"}, - {file = "SQLAlchemy-1.3.24-cp37-cp37m-win32.whl", hash = "sha256:d37843fb8df90376e9e91336724d78a32b988d3d20ab6656da4eb8ee3a45b63c"}, - {file = "SQLAlchemy-1.3.24-cp37-cp37m-win_amd64.whl", hash = "sha256:c10ff6112d119f82b1618b6dc28126798481b9355d8748b64b9b55051eb4f01b"}, - {file = "SQLAlchemy-1.3.24-cp38-cp38-macosx_10_14_x86_64.whl", hash = "sha256:861e459b0e97673af6cc5e7f597035c2e3acdfb2608132665406cded25ba64c7"}, - {file = "SQLAlchemy-1.3.24-cp38-cp38-manylinux1_x86_64.whl", hash = "sha256:5de2464c254380d8a6c20a2746614d5a436260be1507491442cf1088e59430d2"}, - {file = 
"SQLAlchemy-1.3.24-cp38-cp38-manylinux2010_x86_64.whl", hash = "sha256:d375d8ccd3cebae8d90270f7aa8532fe05908f79e78ae489068f3b4eee5994e8"}, - {file = "SQLAlchemy-1.3.24-cp38-cp38-manylinux2014_aarch64.whl", hash = "sha256:014ea143572fee1c18322b7908140ad23b3994036ef4c0d630110faf942652f8"}, - {file = "SQLAlchemy-1.3.24-cp38-cp38-win32.whl", hash = "sha256:6607ae6cd3a07f8a4c3198ffbf256c261661965742e2b5265a77cd5c679c9bba"}, - {file = "SQLAlchemy-1.3.24-cp38-cp38-win_amd64.whl", hash = "sha256:fcb251305fa24a490b6a9ee2180e5f8252915fb778d3dafc70f9cc3f863827b9"}, - {file = "SQLAlchemy-1.3.24-cp39-cp39-macosx_10_14_x86_64.whl", hash = "sha256:01aa5f803db724447c1d423ed583e42bf5264c597fd55e4add4301f163b0be48"}, - {file = "SQLAlchemy-1.3.24-cp39-cp39-manylinux1_x86_64.whl", hash = "sha256:4d0e3515ef98aa4f0dc289ff2eebb0ece6260bbf37c2ea2022aad63797eacf60"}, - {file = "SQLAlchemy-1.3.24-cp39-cp39-manylinux2010_x86_64.whl", hash = "sha256:bce28277f308db43a6b4965734366f533b3ff009571ec7ffa583cb77539b84d6"}, - {file = "SQLAlchemy-1.3.24-cp39-cp39-manylinux2014_aarch64.whl", hash = "sha256:8110e6c414d3efc574543109ee618fe2c1f96fa31833a1ff36cc34e968c4f233"}, - {file = "SQLAlchemy-1.3.24-cp39-cp39-win32.whl", hash = "sha256:ee5f5188edb20a29c1cc4a039b074fdc5575337c9a68f3063449ab47757bb064"}, - {file = "SQLAlchemy-1.3.24-cp39-cp39-win_amd64.whl", hash = "sha256:09083c2487ca3c0865dc588e07aeaa25416da3d95f7482c07e92f47e080aa17b"}, - {file = "SQLAlchemy-1.3.24.tar.gz", hash = "sha256:ebbb777cbf9312359b897bf81ba00dae0f5cb69fba2a18265dcc18a6f5ef7519"}, + {file = "sniffio-1.2.0-py3-none-any.whl", hash = "sha256:471b71698eac1c2112a40ce2752bb2f4a4814c22a54a3eed3676bc0f5ca9f663"}, + {file = "sniffio-1.2.0.tar.gz", hash = "sha256:c4666eecec1d3f50960c6bdf61ab7bc350648da6c126e3cf6898d8cd4ddcd3de"}, ] -<<<<<<< HEAD -======= sqlalchemy = [ {file = "SQLAlchemy-1.3.24-cp27-cp27m-macosx_10_14_x86_64.whl", hash = "sha256:87a2725ad7d41cd7376373c15fd8bf674e9c33ca56d0b8036add2d634dba372e"}, {file = 
"SQLAlchemy-1.3.24-cp27-cp27m-win32.whl", hash = "sha256:f597a243b8550a3a0b15122b14e49d8a7e622ba1c9d29776af741f1845478d79"}, @@ -1575,7 +1552,6 @@ sqlalchemy = [ {file = "SQLAlchemy-1.3.24-cp39-cp39-win_amd64.whl", hash = "sha256:09083c2487ca3c0865dc588e07aeaa25416da3d95f7482c07e92f47e080aa17b"}, {file = "SQLAlchemy-1.3.24.tar.gz", hash = "sha256:ebbb777cbf9312359b897bf81ba00dae0f5cb69fba2a18265dcc18a6f5ef7519"}, ] ->>>>>>> master starlette = [ {file = "starlette-0.14.2-py3-none-any.whl", hash = "sha256:3c8e48e52736b3161e34c9f0e8153b4f32ec5d8995a3ee1d59410d92f75162ed"}, {file = "starlette-0.14.2.tar.gz", hash = "sha256:7d49f4a27f8742262ef1470608c59ddbc66baf37c148e938c7038e6bc7a998aa"}, diff --git a/pyproject.toml b/pyproject.toml index 2c375db1..1bc3cb2f 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -27,6 +27,7 @@ jsonpath-ng = "^1.5.3" elasticsearch = "<7.0" bleach = ">=3.3.1" tenacity = ">=8.0.0" +pydash = "^5.1.0" [tool.poetry.dev-dependencies] pytest = "^5.3" From 039167b02743d234a80e3ba7c88e133752a6f1f0 Mon Sep 17 00:00:00 2001 From: craigrbarnes Date: Mon, 12 Sep 2022 20:14:40 +0000 Subject: [PATCH 53/70] Apply automatic documentation changes --- docs/openapi.yaml | 212 ---------------------------------------------- 1 file changed, 212 deletions(-) diff --git a/docs/openapi.yaml b/docs/openapi.yaml index 973159da..0f672bd4 100644 --- a/docs/openapi.yaml +++ b/docs/openapi.yaml @@ -632,215 +632,3 @@ paths: schema: {} description: Successful Response summary: Get Version - /aggregate/info/{what}: - get: - description: "Returns status and configuration information about aggregate metadata\ - \ service. 
Current support only 1 information type:\ - \ **schema**" - operationId: get_aggregate_info - parameters: - - in: path - required: true - schema: - title: What - type: string - name: what - description: type of information to return - responses: - '200': - description: Successful Response - content: - application/json: - schema: {} - summary: Get Config Information - tags: - - Aggregate - /aggregate/commons: - get: - description: "Returns a list of all commons with data in the aggregate metadata-service" - operationId: get_aggregate_commons - responses: - '200': - description: Successful Response - content: - application/json: - schema: - type: object - properties: - commons: - type: array - items: - type: string - example: - - commons: ["commonsA", "commonsB"] - summary: Get Commons - tags: - - Aggregate - /aggregate/tags: - get: - description: "Returns aggregate category, name and counts across all commons" - operationId: get_aggregate_tags - responses: - '200': - description: Successful Response - content: - application/json: - schema: - type: object - example: - - { - "Data Type": { - "total": 275, - "names": [ - { - "Genotype": 103, - "Clinical Phenotype": 100, - "DCC Harmonized": 24, - "WGS": 20, - "SNP/CNV Genotypes (NGS)": 6, - "RNA-Seq": 5, - "WXS": 5, - "Targeted-Capture": 3, - "miRNA-Seq": 3, - "CNV Genotypes": 2 - } - ] - } - } - summary: Get tag counts information - tags: - - Aggregate - /aggregate/metadata: - get: - description: "Returns a list of all commons with data in the aggregate metadata-service" - operationId: get_aggregate_metadata - parameters: - - in: query - name: limit - required: false - schema: - title: limit - type: integer - default: 20 - - in: query - name: offset - schema: - title: offset - type: integer - default: 0 - description: "Return results at this given offset" - - in: query - name: flatten - schema: - title: flatten - type: boolean - default: false - description: "Return the results without grouping items by commons" - - 
in: query - name: pagination - schema: - title: pagination - type: boolean - default: false - description: "If true will return a pagination object in the response" - - in: query - name: counts - schema: - title: counts - type: string - default: "" - description: "Return count of a field instead of the value if field is an array\ - \ otherwise field is unchanged. If field is null will set field to 0.\ - \ Multiple fields can be compressed by comma separating the field names:\ - \ _files,authors" - responses: - '200': - description: Successful Response - content: - application/json: - schema: - type: object - additionalProperties: true - example: - - { - "commonA" : [ { id2: { name: "bear" } } , { id3: { name: "cat" } } ], - "commonB" : [ { id200: { name: "shark" } } , { id312: { name: "bass" }} ] - } - summary: Get metadata records from aggregate metadata - tags: - - Aggregate - /aggregate/metadata/{name}: - get: - parameters: - - in: path - name: name - required: true - schema: - title: Name - type: string - default: false - description: "Returns an array containing all the metadata entries for a single commons. There are no limit/offset parameters." 
- operationId: get_aggregate_metadata_for_commons - responses: - '200': - description: Successful Response - content: - application/json: - schema: - type: object - additionalProperties: true - example: - - [ { id2: { name: "bear" } } , { id3: { name: "cat" } }] - summary: Get all metadata records from a commons by name - tags: - - Aggregate - /aggregate/metadata/{name}/info: - get: - parameters: - - in: path - name: name - required: true - schema: - title: Name - type: string - default: false - description: "Returns an object containing additional information about a commons" - operationId: get_aggregate_metadata_commons_info - responses: - '200': - description: Successful Response - content: - application/json: - schema: - type: object - additionalProperties: true - example: - - { commons_url: "gen3.datacommons.io" } - summary: Get additional named commons information - tags: - - Aggregate - /aggregate/metadata/guid/{guid}: - get: - parameters: - - in: path - name: guid - required: true - schema: - title: Guid - type: string - default: false - description: "Returns a metadata record by GUID" - operationId: get_aggregate_metadata_guid - responses: - '200': - description: Successful Response - content: - application/json: - schema: - type: object - additionalProperties: true - example: - - { id2: { name: "bear" } } - summary: Get metadata entry by guid - tags: - - Aggregate From f31b276ec0c5d1ccb15cb2a21aa9334e9f914edb Mon Sep 17 00:00:00 2001 From: Craig Barnes Date: Mon, 10 Oct 2022 14:27:02 -0500 Subject: [PATCH 54/70] add missing additions to swagger doc --- docs/openapi.yaml | 212 ++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 212 insertions(+) diff --git a/docs/openapi.yaml b/docs/openapi.yaml index 0f672bd4..973159da 100644 --- a/docs/openapi.yaml +++ b/docs/openapi.yaml @@ -632,3 +632,215 @@ paths: schema: {} description: Successful Response summary: Get Version + /aggregate/info/{what}: + get: + description: "Returns status and 
configuration information about aggregate metadata\ + \ service. Current support only 1 information type:\ + \ **schema**" + operationId: get_aggregate_info + parameters: + - in: path + required: true + schema: + title: What + type: string + name: what + description: type of information to return + responses: + '200': + description: Successful Response + content: + application/json: + schema: {} + summary: Get Config Information + tags: + - Aggregate + /aggregate/commons: + get: + description: "Returns a list of all commons with data in the aggregate metadata-service" + operationId: get_aggregate_commons + responses: + '200': + description: Successful Response + content: + application/json: + schema: + type: object + properties: + commons: + type: array + items: + type: string + example: + - commons: ["commonsA", "commonsB"] + summary: Get Commons + tags: + - Aggregate + /aggregate/tags: + get: + description: "Returns aggregate category, name and counts across all commons" + operationId: get_aggregate_tags + responses: + '200': + description: Successful Response + content: + application/json: + schema: + type: object + example: + - { + "Data Type": { + "total": 275, + "names": [ + { + "Genotype": 103, + "Clinical Phenotype": 100, + "DCC Harmonized": 24, + "WGS": 20, + "SNP/CNV Genotypes (NGS)": 6, + "RNA-Seq": 5, + "WXS": 5, + "Targeted-Capture": 3, + "miRNA-Seq": 3, + "CNV Genotypes": 2 + } + ] + } + } + summary: Get tag counts information + tags: + - Aggregate + /aggregate/metadata: + get: + description: "Returns a list of all commons with data in the aggregate metadata-service" + operationId: get_aggregate_metadata + parameters: + - in: query + name: limit + required: false + schema: + title: limit + type: integer + default: 20 + - in: query + name: offset + schema: + title: offset + type: integer + default: 0 + description: "Return results at this given offset" + - in: query + name: flatten + schema: + title: flatten + type: boolean + default: false + 
description: "Return the results without grouping items by commons" + - in: query + name: pagination + schema: + title: pagination + type: boolean + default: false + description: "If true will return a pagination object in the response" + - in: query + name: counts + schema: + title: counts + type: string + default: "" + description: "Return count of a field instead of the value if field is an array\ + \ otherwise field is unchanged. If field is null will set field to 0.\ + \ Multiple fields can be compressed by comma separating the field names:\ + \ _files,authors" + responses: + '200': + description: Successful Response + content: + application/json: + schema: + type: object + additionalProperties: true + example: + - { + "commonA" : [ { id2: { name: "bear" } } , { id3: { name: "cat" } } ], + "commonB" : [ { id200: { name: "shark" } } , { id312: { name: "bass" }} ] + } + summary: Get metadata records from aggregate metadata + tags: + - Aggregate + /aggregate/metadata/{name}: + get: + parameters: + - in: path + name: name + required: true + schema: + title: Name + type: string + default: false + description: "Returns an array containing all the metadata entries for a single commons. There are no limit/offset parameters." 
+ operationId: get_aggregate_metadata_for_commons + responses: + '200': + description: Successful Response + content: + application/json: + schema: + type: object + additionalProperties: true + example: + - [ { id2: { name: "bear" } } , { id3: { name: "cat" } }] + summary: Get all metadata records from a commons by name + tags: + - Aggregate + /aggregate/metadata/{name}/info: + get: + parameters: + - in: path + name: name + required: true + schema: + title: Name + type: string + default: false + description: "Returns an object containing additional information about a commons" + operationId: get_aggregate_metadata_commons_info + responses: + '200': + description: Successful Response + content: + application/json: + schema: + type: object + additionalProperties: true + example: + - { commons_url: "gen3.datacommons.io" } + summary: Get additional named commons information + tags: + - Aggregate + /aggregate/metadata/guid/{guid}: + get: + parameters: + - in: path + name: guid + required: true + schema: + title: Guid + type: string + default: false + description: "Returns a metadata record by GUID" + operationId: get_aggregate_metadata_guid + responses: + '200': + description: Successful Response + content: + application/json: + schema: + type: object + additionalProperties: true + example: + - { id2: { name: "bear" } } + summary: Get metadata entry by guid + tags: + - Aggregate From fb871ea95ded2653eedbafd8c5cecb7a75a53f31 Mon Sep 17 00:00:00 2001 From: craigrbarnes Date: Tue, 11 Oct 2022 17:18:50 +0000 Subject: [PATCH 55/70] Apply automatic documentation changes --- docs/openapi.yaml | 212 ---------------------------------------------- 1 file changed, 212 deletions(-) diff --git a/docs/openapi.yaml b/docs/openapi.yaml index 31220834..4a555a8d 100644 --- a/docs/openapi.yaml +++ b/docs/openapi.yaml @@ -805,215 +805,3 @@ paths: schema: {} description: Successful Response summary: Get Version - /aggregate/info/{what}: - get: - description: "Returns status and 
configuration information about aggregate metadata\ - \ service. Current support only 1 information type:\ - \ **schema**" - operationId: get_aggregate_info - parameters: - - in: path - required: true - schema: - title: What - type: string - name: what - description: type of information to return - responses: - '200': - description: Successful Response - content: - application/json: - schema: {} - summary: Get Config Information - tags: - - Aggregate - /aggregate/commons: - get: - description: "Returns a list of all commons with data in the aggregate metadata-service" - operationId: get_aggregate_commons - responses: - '200': - description: Successful Response - content: - application/json: - schema: - type: object - properties: - commons: - type: array - items: - type: string - example: - - commons: ["commonsA", "commonsB"] - summary: Get Commons - tags: - - Aggregate - /aggregate/tags: - get: - description: "Returns aggregate category, name and counts across all commons" - operationId: get_aggregate_tags - responses: - '200': - description: Successful Response - content: - application/json: - schema: - type: object - example: - - { - "Data Type": { - "total": 275, - "names": [ - { - "Genotype": 103, - "Clinical Phenotype": 100, - "DCC Harmonized": 24, - "WGS": 20, - "SNP/CNV Genotypes (NGS)": 6, - "RNA-Seq": 5, - "WXS": 5, - "Targeted-Capture": 3, - "miRNA-Seq": 3, - "CNV Genotypes": 2 - } - ] - } - } - summary: Get tag counts information - tags: - - Aggregate - /aggregate/metadata: - get: - description: "Returns a list of all commons with data in the aggregate metadata-service" - operationId: get_aggregate_metadata - parameters: - - in: query - name: limit - required: false - schema: - title: limit - type: integer - default: 20 - - in: query - name: offset - schema: - title: offset - type: integer - default: 0 - description: "Return results at this given offset" - - in: query - name: flatten - schema: - title: flatten - type: boolean - default: false - 
description: "Return the results without grouping items by commons" - - in: query - name: pagination - schema: - title: pagination - type: boolean - default: false - description: "If true will return a pagination object in the response" - - in: query - name: counts - schema: - title: counts - type: string - default: "" - description: "Return count of a field instead of the value if field is an array\ - \ otherwise field is unchanged. If field is null will set field to 0.\ - \ Multiple fields can be compressed by comma separating the field names:\ - \ _files,authors" - responses: - '200': - description: Successful Response - content: - application/json: - schema: - type: object - additionalProperties: true - example: - - { - "commonA" : [ { id2: { name: "bear" } } , { id3: { name: "cat" } } ], - "commonB" : [ { id200: { name: "shark" } } , { id312: { name: "bass" }} ] - } - summary: Get metadata records from aggregate metadata - tags: - - Aggregate - /aggregate/metadata/{name}: - get: - parameters: - - in: path - name: name - required: true - schema: - title: Name - type: string - default: false - description: "Returns an array containing all the metadata entries for a single commons. There are no limit/offset parameters." 
- operationId: get_aggregate_metadata_for_commons - responses: - '200': - description: Successful Response - content: - application/json: - schema: - type: object - additionalProperties: true - example: - - [ { id2: { name: "bear" } } , { id3: { name: "cat" } }] - summary: Get all metadata records from a commons by name - tags: - - Aggregate - /aggregate/metadata/{name}/info: - get: - parameters: - - in: path - name: name - required: true - schema: - title: Name - type: string - default: false - description: "Returns an object containing additional information about a commons" - operationId: get_aggregate_metadata_commons_info - responses: - '200': - description: Successful Response - content: - application/json: - schema: - type: object - additionalProperties: true - example: - - { commons_url: "gen3.datacommons.io" } - summary: Get additional named commons information - tags: - - Aggregate - /aggregate/metadata/guid/{guid}: - get: - parameters: - - in: path - name: guid - required: true - schema: - title: Guid - type: string - default: false - description: "Returns a metadata record by GUID" - operationId: get_aggregate_metadata_guid - responses: - '200': - description: Successful Response - content: - application/json: - schema: - type: object - additionalProperties: true - example: - - { id2: { name: "bear" } } - summary: Get metadata entry by guid - tags: - - Aggregate From bc492fcfe36e8be57db09d1ce97a64b251cb0446 Mon Sep 17 00:00:00 2001 From: Craig Barnes Date: Wed, 12 Oct 2022 12:30:48 -0500 Subject: [PATCH 56/70] add error loging in conversion --- src/mds/agg_mds/commons.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/src/mds/agg_mds/commons.py b/src/mds/agg_mds/commons.py index 4b8b5f2f..69677a1e 100644 --- a/src/mds/agg_mds/commons.py +++ b/src/mds/agg_mds/commons.py @@ -47,11 +47,14 @@ def string_to_array(s: str) -> Optional[List[str]]: def array_to_string(arr: Optional[list]) -> Optional[str]: if arr is None: + logger.error(f"array is None") 
return None return "".join(arr) def string_to_integer(s: str) -> int: + if not s.isnumeric(): + logger.error(f"{s} does not represent a number") return int(s) if s.isnumeric() else None @@ -59,6 +62,7 @@ def string_to_number(s: str) -> Optional[float]: try: return float(s) except ValueError: + logger.error(f"{s} failed to convert to a float") return None @@ -66,6 +70,7 @@ def string_to_dict(s: str) -> Optional[Dict[Any, Any]]: try: return json.loads(s) except json.JSONDecodeError: + logger.error(f"{s} failed to convert to JSON ") return None From 7476088587091e7a088fe3c62041ee9f62812ae4 Mon Sep 17 00:00:00 2001 From: Craig Barnes Date: Wed, 12 Oct 2022 12:34:04 -0500 Subject: [PATCH 57/70] add error loggin with conversions --- docs/openapi.yaml | 483 +++++++++++++++++++++++++--------------------- 1 file changed, 261 insertions(+), 222 deletions(-) diff --git a/docs/openapi.yaml b/docs/openapi.yaml index 4a555a8d..973159da 100644 --- a/docs/openapi.yaml +++ b/docs/openapi.yaml @@ -1,15 +1,5 @@ components: schemas: - AliasObjInput: - description: "Alias object\n\naliases (list, optional): unique names to allow\ - \ using in place of whatever GUID\n specified" - properties: - aliases: - items: {} - title: Aliases - type: array - title: AliasObjInput - type: object CreateObjForIdInput: description: "Create object.\n\nfile_name (str): Name for the file being uploaded\n\ aliases (list, optional): unique name to allow using in place of whatever\ @@ -92,14 +82,14 @@ components: type: http info: title: Framework Services Object Management Service - version: 1.10.0 + version: 1.8.1 openapi: 3.0.2 paths: /_status: get: description: "Returns the status of the MDS:\n * error: if there was no error\ \ this will be \"none\"\n * last_update: timestamp of the last data pull from\ - \ the commons\n * count: number of entries" + \ the commons\n * count: number of entries\n:return:" operationId: get_status__status_get responses: '200': @@ -358,175 +348,6 @@ paths: summary: Update 
Metadata tags: - Maintain - /metadata/{guid}/aliases: - delete: - description: Delete all metadata_aliases of the GUID. - operationId: delete_all_metadata_aliases_metadata__guid__aliases_delete - parameters: - - in: path - name: guid - required: true - schema: - title: Guid - type: string - responses: - '200': - content: - application/json: - schema: {} - description: Successful Response - '422': - content: - application/json: - schema: - $ref: '#/components/schemas/HTTPValidationError' - description: Validation Error - security: - - HTTPBasic: [] - - HTTPBearer: [] - summary: Delete All Metadata Aliases - tags: - - Aliases - get: - description: Get the aliases for the provided GUID - operationId: get_metadata_aliases_metadata__guid__aliases_get - parameters: - - in: path - name: guid - required: true - schema: - title: Guid - type: string - responses: - '200': - content: - application/json: - schema: {} - description: Successful Response - '422': - content: - application/json: - schema: - $ref: '#/components/schemas/HTTPValidationError' - description: Validation Error - summary: Get Metadata Aliases - tags: - - Query - post: - description: Create metadata aliases for the GUID. - operationId: create_metadata_aliases_metadata__guid__aliases_post - parameters: - - in: path - name: guid - required: true - schema: - title: Guid - type: string - requestBody: - content: - application/json: - schema: - $ref: '#/components/schemas/AliasObjInput' - required: true - responses: - '200': - content: - application/json: - schema: {} - description: Successful Response - '422': - content: - application/json: - schema: - $ref: '#/components/schemas/HTTPValidationError' - description: Validation Error - security: - - HTTPBasic: [] - - HTTPBearer: [] - summary: Create Metadata Aliases - tags: - - Aliases - put: - description: 'Update the metadata aliases of the GUID. - - - If `merge` is True, then any aliases that are not in the new data will be - - kept.' 
- operationId: update_metadata_alias_metadata__guid__aliases_put - parameters: - - in: path - name: guid - required: true - schema: - title: Guid - type: string - - in: query - name: merge - required: false - schema: - default: false - title: Merge - type: boolean - requestBody: - content: - application/json: - schema: - $ref: '#/components/schemas/AliasObjInput' - required: true - responses: - '200': - content: - application/json: - schema: {} - description: Successful Response - '422': - content: - application/json: - schema: - $ref: '#/components/schemas/HTTPValidationError' - description: Validation Error - security: - - HTTPBasic: [] - - HTTPBearer: [] - summary: Update Metadata Alias - tags: - - Aliases - /metadata/{guid}/aliases/{alias}: - delete: - description: Delete the specified metadata_alias of the GUID. - operationId: delete_metadata_alias_metadata__guid__aliases__alias__delete - parameters: - - in: path - name: guid - required: true - schema: - title: Guid - type: string - - in: path - name: alias - required: true - schema: - title: Alias - type: string - responses: - '200': - content: - application/json: - schema: {} - description: Successful Response - '422': - content: - application/json: - schema: - $ref: '#/components/schemas/HTTPValidationError' - description: Validation Error - security: - - HTTPBasic: [] - - HTTPBearer: [] - summary: Delete Metadata Alias - tags: - - Aliases /metadata_index: get: description: List all the metadata key paths indexed in the database. @@ -597,8 +418,11 @@ paths: - Index /objects/upload: post: - description: Create object placeholder and attach metadata, return Upload url - to the user. 
+ description: "Create object placeholder and attach metadata, return Upload url\ + \ to the user.\n\nArgs:\n body (CreateObjInput): input body for create\ + \ object\n request (Request): starlette request (which contains reference\ + \ to FastAPI app)\n token (HTTPAuthorizationCredentials, optional): bearer\ + \ token" operationId: create_object_objects_upload_post requestBody: content: @@ -625,18 +449,16 @@ paths: - Object /objects/{guid}: delete: - description: 'Delete the metadata for the specified object and also delete the - record from indexd. - - [Optional] Remove the object from existing bucket location(s) by proxying - to - - fence DELETE /data/file_id by using an additional query parameter `delete_file_locations`. - - Uses the response status code from fence/indexd to determine whether user - has - - permission to delete metadata.' + description: "Delete the metadata for the specified object and also delete the\ + \ record from indexd.\n[Optional] Remove the object from existing bucket location(s)\ + \ by proxying to\nfence DELETE /data/file_id by using an additional query\ + \ parameter `delete_file_locations`.\nUses the response status code from fence/indexd\ + \ to determine whether user has\npermission to delete metadata.\n\nArgs:\n\ + \ guid (str): indexd GUID or alias, or MDS key\n request (Request):\ + \ starlette request (which contains reference to FastAPI app)\nReturns:\n\ + \ 204: if record and metadata are deleted\n 403: if fence/indexd returns\ + \ a 403 unauthorized response\n 500: if fence/indexd does not return 204\ + \ or 403 or there is an error deleting metadata" operationId: delete_object_objects__guid__delete parameters: - in: path @@ -663,10 +485,12 @@ paths: tags: - Object get: - description: 'Get the metadata associated with the provided key. If the key - is an - - indexd GUID or alias, also returns the indexd record.' + description: "Get the metadata associated with the provided key. 
If the key\ + \ is an\nindexd GUID or alias, also returns the indexd record.\n\nArgs:\n\ + \ guid (str): indexd GUID or alias, or MDS key\n request (Request):\ + \ starlette request (which contains reference to FastAPI app)\n\nReturns:\n\ + \ 200: { \"record\": { indexd record }, \"metadata\": { MDS metadata }\ + \ }\n 404: if the key is not in indexd and not in MDS" operationId: get_object_objects__guid__get parameters: - in: path @@ -691,12 +515,13 @@ paths: tags: - Object post: - description: 'Create object placeholder and attach metadata, return Upload url - to the - - user. A new GUID (new version of the provided GUID) will be created for - - this object. The new record will have the same authz as the original one.' + description: "Create object placeholder and attach metadata, return Upload url\ + \ to the\nuser. A new GUID (new version of the provided GUID) will be created\ + \ for\nthis object. The new record will have the same authz as the original\ + \ one.\n\nArgs:\n guid (str): indexd GUID or alias\n body (CreateObjForIdInput):\ + \ input body for create object for ID\n request (Request): starlette request\ + \ (which contains reference to FastAPI app)\n token (HTTPAuthorizationCredentials,\ + \ optional): bearer token" operationId: create_object_for_id_objects__guid__post parameters: - in: path @@ -730,12 +555,15 @@ paths: - Object /objects/{guid}/download: get: - description: 'Send a GET request to the data access service to generate a signed - download - - url for the given GUID or alias. Returns the generated signed download url - - to the user.' + description: "Send a GET request to the data access service to generate a signed\ + \ download\nurl for the given GUID or alias. 
Returns the generated signed\ + \ download url\nto the user.\n\nArgs:\n guid (str): indexd GUID or alias\n\ + \ request (Request): starlette request (which contains reference to FastAPI\ + \ app)\n\nReturns:\n 200: { \"url\": signed download url }\n 404: if\ + \ the data access service can not find GUID/alias in indexd\n 403: if the\ + \ data access service returns a 401 or a 403\n 500: if there is an error\ + \ making the request to the data access service\n or the data access service\ + \ returns any other 400-range or 500-range\n error" operationId: get_object_signed_download_url_objects__guid__download_get parameters: - in: path @@ -761,16 +589,16 @@ paths: - Object /objects/{guid}/latest: get: - description: 'Attempt to fetch the latest version of the provided guid/key from - indexd. - - If the provided guid/key is found in indexd, return the indexd record and - - metadata object associated with the latest guid fetched from indexd. If the - - provided guid/key is not found in indexd, return the metadata object - - associated with the provided guid/key.' + description: "Attempt to fetch the latest version of the provided guid/key from\ + \ indexd.\nIf the provided guid/key is found in indexd, return the indexd\ + \ record and\nmetadata object associated with the latest guid fetched from\ + \ indexd. If the\nprovided guid/key is not found in indexd, return the metadata\ + \ object\nassociated with the provided guid/key.\n\nArgs:\n guid (str):\ + \ indexd GUID or MDS key. 
alias is NOT supported because the\n corresponding\ + \ endpoint in indexd does not accept alias\n request (Request): starlette\ + \ request (which contains reference to FastAPI app)\n\nReturns:\n 200:\ + \ { \"record\": { indexd record }, \"metadata\": { MDS metadata } }\n 404:\ + \ if the key is not in indexd and not in MDS" operationId: get_object_latest_objects__guid__latest_get parameters: - in: path @@ -796,7 +624,6 @@ paths: - Object /version: get: - description: '' operationId: get_version_version_get responses: '200': @@ -805,3 +632,215 @@ paths: schema: {} description: Successful Response summary: Get Version + /aggregate/info/{what}: + get: + description: "Returns status and configuration information about aggregate metadata\ + \ service. Current support only 1 information type:\ + \ **schema**" + operationId: get_aggregate_info + parameters: + - in: path + required: true + schema: + title: What + type: string + name: what + description: type of information to return + responses: + '200': + description: Successful Response + content: + application/json: + schema: {} + summary: Get Config Information + tags: + - Aggregate + /aggregate/commons: + get: + description: "Returns a list of all commons with data in the aggregate metadata-service" + operationId: get_aggregate_commons + responses: + '200': + description: Successful Response + content: + application/json: + schema: + type: object + properties: + commons: + type: array + items: + type: string + example: + - commons: ["commonsA", "commonsB"] + summary: Get Commons + tags: + - Aggregate + /aggregate/tags: + get: + description: "Returns aggregate category, name and counts across all commons" + operationId: get_aggregate_tags + responses: + '200': + description: Successful Response + content: + application/json: + schema: + type: object + example: + - { + "Data Type": { + "total": 275, + "names": [ + { + "Genotype": 103, + "Clinical Phenotype": 100, + "DCC Harmonized": 24, + "WGS": 20, + "SNP/CNV 
Genotypes (NGS)": 6, + "RNA-Seq": 5, + "WXS": 5, + "Targeted-Capture": 3, + "miRNA-Seq": 3, + "CNV Genotypes": 2 + } + ] + } + } + summary: Get tag counts information + tags: + - Aggregate + /aggregate/metadata: + get: + description: "Returns a list of all commons with data in the aggregate metadata-service" + operationId: get_aggregate_metadata + parameters: + - in: query + name: limit + required: false + schema: + title: limit + type: integer + default: 20 + - in: query + name: offset + schema: + title: offset + type: integer + default: 0 + description: "Return results at this given offset" + - in: query + name: flatten + schema: + title: flatten + type: boolean + default: false + description: "Return the results without grouping items by commons" + - in: query + name: pagination + schema: + title: pagination + type: boolean + default: false + description: "If true will return a pagination object in the response" + - in: query + name: counts + schema: + title: counts + type: string + default: "" + description: "Return count of a field instead of the value if field is an array\ + \ otherwise field is unchanged. If field is null will set field to 0.\ + \ Multiple fields can be compressed by comma separating the field names:\ + \ _files,authors" + responses: + '200': + description: Successful Response + content: + application/json: + schema: + type: object + additionalProperties: true + example: + - { + "commonA" : [ { id2: { name: "bear" } } , { id3: { name: "cat" } } ], + "commonB" : [ { id200: { name: "shark" } } , { id312: { name: "bass" }} ] + } + summary: Get metadata records from aggregate metadata + tags: + - Aggregate + /aggregate/metadata/{name}: + get: + parameters: + - in: path + name: name + required: true + schema: + title: Name + type: string + default: false + description: "Returns an array containing all the metadata entries for a single commons. There are no limit/offset parameters." 
+ operationId: get_aggregate_metadata_for_commons + responses: + '200': + description: Successful Response + content: + application/json: + schema: + type: object + additionalProperties: true + example: + - [ { id2: { name: "bear" } } , { id3: { name: "cat" } }] + summary: Get all metadata records from a commons by name + tags: + - Aggregate + /aggregate/metadata/{name}/info: + get: + parameters: + - in: path + name: name + required: true + schema: + title: Name + type: string + default: false + description: "Returns an object containing additional information about a commons" + operationId: get_aggregate_metadata_commons_info + responses: + '200': + description: Successful Response + content: + application/json: + schema: + type: object + additionalProperties: true + example: + - { commons_url: "gen3.datacommons.io" } + summary: Get additional named commons information + tags: + - Aggregate + /aggregate/metadata/guid/{guid}: + get: + parameters: + - in: path + name: guid + required: true + schema: + title: Guid + type: string + default: false + description: "Returns a metadata record by GUID" + operationId: get_aggregate_metadata_guid + responses: + '200': + description: Successful Response + content: + application/json: + schema: + type: object + additionalProperties: true + example: + - { id2: { name: "bear" } } + summary: Get metadata entry by guid + tags: + - Aggregate From 41d28c040c951a036fd7c16e21bdbd5a9caa8b10 Mon Sep 17 00:00:00 2001 From: craigrbarnes Date: Wed, 12 Oct 2022 17:35:34 +0000 Subject: [PATCH 58/70] Apply automatic documentation changes --- docs/openapi.yaml | 483 +++++++++++++++++++++------------------------- 1 file changed, 222 insertions(+), 261 deletions(-) diff --git a/docs/openapi.yaml b/docs/openapi.yaml index 973159da..4a555a8d 100644 --- a/docs/openapi.yaml +++ b/docs/openapi.yaml @@ -1,5 +1,15 @@ components: schemas: + AliasObjInput: + description: "Alias object\n\naliases (list, optional): unique names to allow\ + \ using in place 
of whatever GUID\n specified" + properties: + aliases: + items: {} + title: Aliases + type: array + title: AliasObjInput + type: object CreateObjForIdInput: description: "Create object.\n\nfile_name (str): Name for the file being uploaded\n\ aliases (list, optional): unique name to allow using in place of whatever\ @@ -82,14 +92,14 @@ components: type: http info: title: Framework Services Object Management Service - version: 1.8.1 + version: 1.10.0 openapi: 3.0.2 paths: /_status: get: description: "Returns the status of the MDS:\n * error: if there was no error\ \ this will be \"none\"\n * last_update: timestamp of the last data pull from\ - \ the commons\n * count: number of entries\n:return:" + \ the commons\n * count: number of entries" operationId: get_status__status_get responses: '200': @@ -348,6 +358,175 @@ paths: summary: Update Metadata tags: - Maintain + /metadata/{guid}/aliases: + delete: + description: Delete all metadata_aliases of the GUID. + operationId: delete_all_metadata_aliases_metadata__guid__aliases_delete + parameters: + - in: path + name: guid + required: true + schema: + title: Guid + type: string + responses: + '200': + content: + application/json: + schema: {} + description: Successful Response + '422': + content: + application/json: + schema: + $ref: '#/components/schemas/HTTPValidationError' + description: Validation Error + security: + - HTTPBasic: [] + - HTTPBearer: [] + summary: Delete All Metadata Aliases + tags: + - Aliases + get: + description: Get the aliases for the provided GUID + operationId: get_metadata_aliases_metadata__guid__aliases_get + parameters: + - in: path + name: guid + required: true + schema: + title: Guid + type: string + responses: + '200': + content: + application/json: + schema: {} + description: Successful Response + '422': + content: + application/json: + schema: + $ref: '#/components/schemas/HTTPValidationError' + description: Validation Error + summary: Get Metadata Aliases + tags: + - Query + post: + 
description: Create metadata aliases for the GUID. + operationId: create_metadata_aliases_metadata__guid__aliases_post + parameters: + - in: path + name: guid + required: true + schema: + title: Guid + type: string + requestBody: + content: + application/json: + schema: + $ref: '#/components/schemas/AliasObjInput' + required: true + responses: + '200': + content: + application/json: + schema: {} + description: Successful Response + '422': + content: + application/json: + schema: + $ref: '#/components/schemas/HTTPValidationError' + description: Validation Error + security: + - HTTPBasic: [] + - HTTPBearer: [] + summary: Create Metadata Aliases + tags: + - Aliases + put: + description: 'Update the metadata aliases of the GUID. + + + If `merge` is True, then any aliases that are not in the new data will be + + kept.' + operationId: update_metadata_alias_metadata__guid__aliases_put + parameters: + - in: path + name: guid + required: true + schema: + title: Guid + type: string + - in: query + name: merge + required: false + schema: + default: false + title: Merge + type: boolean + requestBody: + content: + application/json: + schema: + $ref: '#/components/schemas/AliasObjInput' + required: true + responses: + '200': + content: + application/json: + schema: {} + description: Successful Response + '422': + content: + application/json: + schema: + $ref: '#/components/schemas/HTTPValidationError' + description: Validation Error + security: + - HTTPBasic: [] + - HTTPBearer: [] + summary: Update Metadata Alias + tags: + - Aliases + /metadata/{guid}/aliases/{alias}: + delete: + description: Delete the specified metadata_alias of the GUID. 
+ operationId: delete_metadata_alias_metadata__guid__aliases__alias__delete + parameters: + - in: path + name: guid + required: true + schema: + title: Guid + type: string + - in: path + name: alias + required: true + schema: + title: Alias + type: string + responses: + '200': + content: + application/json: + schema: {} + description: Successful Response + '422': + content: + application/json: + schema: + $ref: '#/components/schemas/HTTPValidationError' + description: Validation Error + security: + - HTTPBasic: [] + - HTTPBearer: [] + summary: Delete Metadata Alias + tags: + - Aliases /metadata_index: get: description: List all the metadata key paths indexed in the database. @@ -418,11 +597,8 @@ paths: - Index /objects/upload: post: - description: "Create object placeholder and attach metadata, return Upload url\ - \ to the user.\n\nArgs:\n body (CreateObjInput): input body for create\ - \ object\n request (Request): starlette request (which contains reference\ - \ to FastAPI app)\n token (HTTPAuthorizationCredentials, optional): bearer\ - \ token" + description: Create object placeholder and attach metadata, return Upload url + to the user. 
operationId: create_object_objects_upload_post requestBody: content: @@ -449,16 +625,18 @@ paths: - Object /objects/{guid}: delete: - description: "Delete the metadata for the specified object and also delete the\ - \ record from indexd.\n[Optional] Remove the object from existing bucket location(s)\ - \ by proxying to\nfence DELETE /data/file_id by using an additional query\ - \ parameter `delete_file_locations`.\nUses the response status code from fence/indexd\ - \ to determine whether user has\npermission to delete metadata.\n\nArgs:\n\ - \ guid (str): indexd GUID or alias, or MDS key\n request (Request):\ - \ starlette request (which contains reference to FastAPI app)\nReturns:\n\ - \ 204: if record and metadata are deleted\n 403: if fence/indexd returns\ - \ a 403 unauthorized response\n 500: if fence/indexd does not return 204\ - \ or 403 or there is an error deleting metadata" + description: 'Delete the metadata for the specified object and also delete the + record from indexd. + + [Optional] Remove the object from existing bucket location(s) by proxying + to + + fence DELETE /data/file_id by using an additional query parameter `delete_file_locations`. + + Uses the response status code from fence/indexd to determine whether user + has + + permission to delete metadata.' operationId: delete_object_objects__guid__delete parameters: - in: path @@ -485,12 +663,10 @@ paths: tags: - Object get: - description: "Get the metadata associated with the provided key. If the key\ - \ is an\nindexd GUID or alias, also returns the indexd record.\n\nArgs:\n\ - \ guid (str): indexd GUID or alias, or MDS key\n request (Request):\ - \ starlette request (which contains reference to FastAPI app)\n\nReturns:\n\ - \ 200: { \"record\": { indexd record }, \"metadata\": { MDS metadata }\ - \ }\n 404: if the key is not in indexd and not in MDS" + description: 'Get the metadata associated with the provided key. 
If the key + is an + + indexd GUID or alias, also returns the indexd record.' operationId: get_object_objects__guid__get parameters: - in: path @@ -515,13 +691,12 @@ paths: tags: - Object post: - description: "Create object placeholder and attach metadata, return Upload url\ - \ to the\nuser. A new GUID (new version of the provided GUID) will be created\ - \ for\nthis object. The new record will have the same authz as the original\ - \ one.\n\nArgs:\n guid (str): indexd GUID or alias\n body (CreateObjForIdInput):\ - \ input body for create object for ID\n request (Request): starlette request\ - \ (which contains reference to FastAPI app)\n token (HTTPAuthorizationCredentials,\ - \ optional): bearer token" + description: 'Create object placeholder and attach metadata, return Upload url + to the + + user. A new GUID (new version of the provided GUID) will be created for + + this object. The new record will have the same authz as the original one.' operationId: create_object_for_id_objects__guid__post parameters: - in: path @@ -555,15 +730,12 @@ paths: - Object /objects/{guid}/download: get: - description: "Send a GET request to the data access service to generate a signed\ - \ download\nurl for the given GUID or alias. Returns the generated signed\ - \ download url\nto the user.\n\nArgs:\n guid (str): indexd GUID or alias\n\ - \ request (Request): starlette request (which contains reference to FastAPI\ - \ app)\n\nReturns:\n 200: { \"url\": signed download url }\n 404: if\ - \ the data access service can not find GUID/alias in indexd\n 403: if the\ - \ data access service returns a 401 or a 403\n 500: if there is an error\ - \ making the request to the data access service\n or the data access service\ - \ returns any other 400-range or 500-range\n error" + description: 'Send a GET request to the data access service to generate a signed + download + + url for the given GUID or alias. Returns the generated signed download url + + to the user.' 
operationId: get_object_signed_download_url_objects__guid__download_get parameters: - in: path @@ -589,16 +761,16 @@ paths: - Object /objects/{guid}/latest: get: - description: "Attempt to fetch the latest version of the provided guid/key from\ - \ indexd.\nIf the provided guid/key is found in indexd, return the indexd\ - \ record and\nmetadata object associated with the latest guid fetched from\ - \ indexd. If the\nprovided guid/key is not found in indexd, return the metadata\ - \ object\nassociated with the provided guid/key.\n\nArgs:\n guid (str):\ - \ indexd GUID or MDS key. alias is NOT supported because the\n corresponding\ - \ endpoint in indexd does not accept alias\n request (Request): starlette\ - \ request (which contains reference to FastAPI app)\n\nReturns:\n 200:\ - \ { \"record\": { indexd record }, \"metadata\": { MDS metadata } }\n 404:\ - \ if the key is not in indexd and not in MDS" + description: 'Attempt to fetch the latest version of the provided guid/key from + indexd. + + If the provided guid/key is found in indexd, return the indexd record and + + metadata object associated with the latest guid fetched from indexd. If the + + provided guid/key is not found in indexd, return the metadata object + + associated with the provided guid/key.' operationId: get_object_latest_objects__guid__latest_get parameters: - in: path @@ -624,6 +796,7 @@ paths: - Object /version: get: + description: '' operationId: get_version_version_get responses: '200': @@ -632,215 +805,3 @@ paths: schema: {} description: Successful Response summary: Get Version - /aggregate/info/{what}: - get: - description: "Returns status and configuration information about aggregate metadata\ - \ service. 
Current support only 1 information type:\ - \ **schema**" - operationId: get_aggregate_info - parameters: - - in: path - required: true - schema: - title: What - type: string - name: what - description: type of information to return - responses: - '200': - description: Successful Response - content: - application/json: - schema: {} - summary: Get Config Information - tags: - - Aggregate - /aggregate/commons: - get: - description: "Returns a list of all commons with data in the aggregate metadata-service" - operationId: get_aggregate_commons - responses: - '200': - description: Successful Response - content: - application/json: - schema: - type: object - properties: - commons: - type: array - items: - type: string - example: - - commons: ["commonsA", "commonsB"] - summary: Get Commons - tags: - - Aggregate - /aggregate/tags: - get: - description: "Returns aggregate category, name and counts across all commons" - operationId: get_aggregate_tags - responses: - '200': - description: Successful Response - content: - application/json: - schema: - type: object - example: - - { - "Data Type": { - "total": 275, - "names": [ - { - "Genotype": 103, - "Clinical Phenotype": 100, - "DCC Harmonized": 24, - "WGS": 20, - "SNP/CNV Genotypes (NGS)": 6, - "RNA-Seq": 5, - "WXS": 5, - "Targeted-Capture": 3, - "miRNA-Seq": 3, - "CNV Genotypes": 2 - } - ] - } - } - summary: Get tag counts information - tags: - - Aggregate - /aggregate/metadata: - get: - description: "Returns a list of all commons with data in the aggregate metadata-service" - operationId: get_aggregate_metadata - parameters: - - in: query - name: limit - required: false - schema: - title: limit - type: integer - default: 20 - - in: query - name: offset - schema: - title: offset - type: integer - default: 0 - description: "Return results at this given offset" - - in: query - name: flatten - schema: - title: flatten - type: boolean - default: false - description: "Return the results without grouping items by commons" - - 
in: query - name: pagination - schema: - title: pagination - type: boolean - default: false - description: "If true will return a pagination object in the response" - - in: query - name: counts - schema: - title: counts - type: string - default: "" - description: "Return count of a field instead of the value if field is an array\ - \ otherwise field is unchanged. If field is null will set field to 0.\ - \ Multiple fields can be compressed by comma separating the field names:\ - \ _files,authors" - responses: - '200': - description: Successful Response - content: - application/json: - schema: - type: object - additionalProperties: true - example: - - { - "commonA" : [ { id2: { name: "bear" } } , { id3: { name: "cat" } } ], - "commonB" : [ { id200: { name: "shark" } } , { id312: { name: "bass" }} ] - } - summary: Get metadata records from aggregate metadata - tags: - - Aggregate - /aggregate/metadata/{name}: - get: - parameters: - - in: path - name: name - required: true - schema: - title: Name - type: string - default: false - description: "Returns an array containing all the metadata entries for a single commons. There are no limit/offset parameters." 
- operationId: get_aggregate_metadata_for_commons - responses: - '200': - description: Successful Response - content: - application/json: - schema: - type: object - additionalProperties: true - example: - - [ { id2: { name: "bear" } } , { id3: { name: "cat" } }] - summary: Get all metadata records from a commons by name - tags: - - Aggregate - /aggregate/metadata/{name}/info: - get: - parameters: - - in: path - name: name - required: true - schema: - title: Name - type: string - default: false - description: "Returns an object containing additional information about a commons" - operationId: get_aggregate_metadata_commons_info - responses: - '200': - description: Successful Response - content: - application/json: - schema: - type: object - additionalProperties: true - example: - - { commons_url: "gen3.datacommons.io" } - summary: Get additional named commons information - tags: - - Aggregate - /aggregate/metadata/guid/{guid}: - get: - parameters: - - in: path - name: guid - required: true - schema: - title: Guid - type: string - default: false - description: "Returns a metadata record by GUID" - operationId: get_aggregate_metadata_guid - responses: - '200': - description: Successful Response - content: - application/json: - schema: - type: object - additionalProperties: true - example: - - { id2: { name: "bear" } } - summary: Get metadata entry by guid - tags: - - Aggregate From f2475b05e80e701c92a7e8e9d0a6d280f0a00bfe Mon Sep 17 00:00:00 2001 From: Craig Barnes Date: Wed, 12 Oct 2022 12:35:42 -0500 Subject: [PATCH 59/70] remove :q from docstring --- src/mds/populate.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/mds/populate.py b/src/mds/populate.py index ab20bc68..71b3f24f 100644 --- a/src/mds/populate.py +++ b/src/mds/populate.py @@ -245,7 +245,7 @@ async def filter_entries( "select_field": { "field_name" : "commons" , "field_value" : "Proteomic Data Commons" - }:q + } where only the records with the commons field === "Proteomic Data 
Commons" are added. Note the function assumes the field exists in all of the entries in the mds_arr parameter """ From 6daaf5db8ecbb88add0994a2858822d1ceeb701e Mon Sep 17 00:00:00 2001 From: Craig Barnes Date: Wed, 12 Oct 2022 12:39:06 -0500 Subject: [PATCH 60/70] re-add openapi.yaml --- docs/openapi.yaml | 483 +++++++++++++++++++++++++--------------------- 1 file changed, 261 insertions(+), 222 deletions(-) diff --git a/docs/openapi.yaml b/docs/openapi.yaml index 4a555a8d..973159da 100644 --- a/docs/openapi.yaml +++ b/docs/openapi.yaml @@ -1,15 +1,5 @@ components: schemas: - AliasObjInput: - description: "Alias object\n\naliases (list, optional): unique names to allow\ - \ using in place of whatever GUID\n specified" - properties: - aliases: - items: {} - title: Aliases - type: array - title: AliasObjInput - type: object CreateObjForIdInput: description: "Create object.\n\nfile_name (str): Name for the file being uploaded\n\ aliases (list, optional): unique name to allow using in place of whatever\ @@ -92,14 +82,14 @@ components: type: http info: title: Framework Services Object Management Service - version: 1.10.0 + version: 1.8.1 openapi: 3.0.2 paths: /_status: get: description: "Returns the status of the MDS:\n * error: if there was no error\ \ this will be \"none\"\n * last_update: timestamp of the last data pull from\ - \ the commons\n * count: number of entries" + \ the commons\n * count: number of entries\n:return:" operationId: get_status__status_get responses: '200': @@ -358,175 +348,6 @@ paths: summary: Update Metadata tags: - Maintain - /metadata/{guid}/aliases: - delete: - description: Delete all metadata_aliases of the GUID. 
- operationId: delete_all_metadata_aliases_metadata__guid__aliases_delete - parameters: - - in: path - name: guid - required: true - schema: - title: Guid - type: string - responses: - '200': - content: - application/json: - schema: {} - description: Successful Response - '422': - content: - application/json: - schema: - $ref: '#/components/schemas/HTTPValidationError' - description: Validation Error - security: - - HTTPBasic: [] - - HTTPBearer: [] - summary: Delete All Metadata Aliases - tags: - - Aliases - get: - description: Get the aliases for the provided GUID - operationId: get_metadata_aliases_metadata__guid__aliases_get - parameters: - - in: path - name: guid - required: true - schema: - title: Guid - type: string - responses: - '200': - content: - application/json: - schema: {} - description: Successful Response - '422': - content: - application/json: - schema: - $ref: '#/components/schemas/HTTPValidationError' - description: Validation Error - summary: Get Metadata Aliases - tags: - - Query - post: - description: Create metadata aliases for the GUID. - operationId: create_metadata_aliases_metadata__guid__aliases_post - parameters: - - in: path - name: guid - required: true - schema: - title: Guid - type: string - requestBody: - content: - application/json: - schema: - $ref: '#/components/schemas/AliasObjInput' - required: true - responses: - '200': - content: - application/json: - schema: {} - description: Successful Response - '422': - content: - application/json: - schema: - $ref: '#/components/schemas/HTTPValidationError' - description: Validation Error - security: - - HTTPBasic: [] - - HTTPBearer: [] - summary: Create Metadata Aliases - tags: - - Aliases - put: - description: 'Update the metadata aliases of the GUID. - - - If `merge` is True, then any aliases that are not in the new data will be - - kept.' 
- operationId: update_metadata_alias_metadata__guid__aliases_put - parameters: - - in: path - name: guid - required: true - schema: - title: Guid - type: string - - in: query - name: merge - required: false - schema: - default: false - title: Merge - type: boolean - requestBody: - content: - application/json: - schema: - $ref: '#/components/schemas/AliasObjInput' - required: true - responses: - '200': - content: - application/json: - schema: {} - description: Successful Response - '422': - content: - application/json: - schema: - $ref: '#/components/schemas/HTTPValidationError' - description: Validation Error - security: - - HTTPBasic: [] - - HTTPBearer: [] - summary: Update Metadata Alias - tags: - - Aliases - /metadata/{guid}/aliases/{alias}: - delete: - description: Delete the specified metadata_alias of the GUID. - operationId: delete_metadata_alias_metadata__guid__aliases__alias__delete - parameters: - - in: path - name: guid - required: true - schema: - title: Guid - type: string - - in: path - name: alias - required: true - schema: - title: Alias - type: string - responses: - '200': - content: - application/json: - schema: {} - description: Successful Response - '422': - content: - application/json: - schema: - $ref: '#/components/schemas/HTTPValidationError' - description: Validation Error - security: - - HTTPBasic: [] - - HTTPBearer: [] - summary: Delete Metadata Alias - tags: - - Aliases /metadata_index: get: description: List all the metadata key paths indexed in the database. @@ -597,8 +418,11 @@ paths: - Index /objects/upload: post: - description: Create object placeholder and attach metadata, return Upload url - to the user. 
+ description: "Create object placeholder and attach metadata, return Upload url\ + \ to the user.\n\nArgs:\n body (CreateObjInput): input body for create\ + \ object\n request (Request): starlette request (which contains reference\ + \ to FastAPI app)\n token (HTTPAuthorizationCredentials, optional): bearer\ + \ token" operationId: create_object_objects_upload_post requestBody: content: @@ -625,18 +449,16 @@ paths: - Object /objects/{guid}: delete: - description: 'Delete the metadata for the specified object and also delete the - record from indexd. - - [Optional] Remove the object from existing bucket location(s) by proxying - to - - fence DELETE /data/file_id by using an additional query parameter `delete_file_locations`. - - Uses the response status code from fence/indexd to determine whether user - has - - permission to delete metadata.' + description: "Delete the metadata for the specified object and also delete the\ + \ record from indexd.\n[Optional] Remove the object from existing bucket location(s)\ + \ by proxying to\nfence DELETE /data/file_id by using an additional query\ + \ parameter `delete_file_locations`.\nUses the response status code from fence/indexd\ + \ to determine whether user has\npermission to delete metadata.\n\nArgs:\n\ + \ guid (str): indexd GUID or alias, or MDS key\n request (Request):\ + \ starlette request (which contains reference to FastAPI app)\nReturns:\n\ + \ 204: if record and metadata are deleted\n 403: if fence/indexd returns\ + \ a 403 unauthorized response\n 500: if fence/indexd does not return 204\ + \ or 403 or there is an error deleting metadata" operationId: delete_object_objects__guid__delete parameters: - in: path @@ -663,10 +485,12 @@ paths: tags: - Object get: - description: 'Get the metadata associated with the provided key. If the key - is an - - indexd GUID or alias, also returns the indexd record.' + description: "Get the metadata associated with the provided key. 
If the key\ + \ is an\nindexd GUID or alias, also returns the indexd record.\n\nArgs:\n\ + \ guid (str): indexd GUID or alias, or MDS key\n request (Request):\ + \ starlette request (which contains reference to FastAPI app)\n\nReturns:\n\ + \ 200: { \"record\": { indexd record }, \"metadata\": { MDS metadata }\ + \ }\n 404: if the key is not in indexd and not in MDS" operationId: get_object_objects__guid__get parameters: - in: path @@ -691,12 +515,13 @@ paths: tags: - Object post: - description: 'Create object placeholder and attach metadata, return Upload url - to the - - user. A new GUID (new version of the provided GUID) will be created for - - this object. The new record will have the same authz as the original one.' + description: "Create object placeholder and attach metadata, return Upload url\ + \ to the\nuser. A new GUID (new version of the provided GUID) will be created\ + \ for\nthis object. The new record will have the same authz as the original\ + \ one.\n\nArgs:\n guid (str): indexd GUID or alias\n body (CreateObjForIdInput):\ + \ input body for create object for ID\n request (Request): starlette request\ + \ (which contains reference to FastAPI app)\n token (HTTPAuthorizationCredentials,\ + \ optional): bearer token" operationId: create_object_for_id_objects__guid__post parameters: - in: path @@ -730,12 +555,15 @@ paths: - Object /objects/{guid}/download: get: - description: 'Send a GET request to the data access service to generate a signed - download - - url for the given GUID or alias. Returns the generated signed download url - - to the user.' + description: "Send a GET request to the data access service to generate a signed\ + \ download\nurl for the given GUID or alias. 
Returns the generated signed\ + \ download url\nto the user.\n\nArgs:\n guid (str): indexd GUID or alias\n\ + \ request (Request): starlette request (which contains reference to FastAPI\ + \ app)\n\nReturns:\n 200: { \"url\": signed download url }\n 404: if\ + \ the data access service can not find GUID/alias in indexd\n 403: if the\ + \ data access service returns a 401 or a 403\n 500: if there is an error\ + \ making the request to the data access service\n or the data access service\ + \ returns any other 400-range or 500-range\n error" operationId: get_object_signed_download_url_objects__guid__download_get parameters: - in: path @@ -761,16 +589,16 @@ paths: - Object /objects/{guid}/latest: get: - description: 'Attempt to fetch the latest version of the provided guid/key from - indexd. - - If the provided guid/key is found in indexd, return the indexd record and - - metadata object associated with the latest guid fetched from indexd. If the - - provided guid/key is not found in indexd, return the metadata object - - associated with the provided guid/key.' + description: "Attempt to fetch the latest version of the provided guid/key from\ + \ indexd.\nIf the provided guid/key is found in indexd, return the indexd\ + \ record and\nmetadata object associated with the latest guid fetched from\ + \ indexd. If the\nprovided guid/key is not found in indexd, return the metadata\ + \ object\nassociated with the provided guid/key.\n\nArgs:\n guid (str):\ + \ indexd GUID or MDS key. 
alias is NOT supported because the\n corresponding\ + \ endpoint in indexd does not accept alias\n request (Request): starlette\ + \ request (which contains reference to FastAPI app)\n\nReturns:\n 200:\ + \ { \"record\": { indexd record }, \"metadata\": { MDS metadata } }\n 404:\ + \ if the key is not in indexd and not in MDS" operationId: get_object_latest_objects__guid__latest_get parameters: - in: path @@ -796,7 +624,6 @@ paths: - Object /version: get: - description: '' operationId: get_version_version_get responses: '200': @@ -805,3 +632,215 @@ paths: schema: {} description: Successful Response summary: Get Version + /aggregate/info/{what}: + get: + description: "Returns status and configuration information about aggregate metadata\ + \ service. Current support only 1 information type:\ + \ **schema**" + operationId: get_aggregate_info + parameters: + - in: path + required: true + schema: + title: What + type: string + name: what + description: type of information to return + responses: + '200': + description: Successful Response + content: + application/json: + schema: {} + summary: Get Config Information + tags: + - Aggregate + /aggregate/commons: + get: + description: "Returns a list of all commons with data in the aggregate metadata-service" + operationId: get_aggregate_commons + responses: + '200': + description: Successful Response + content: + application/json: + schema: + type: object + properties: + commons: + type: array + items: + type: string + example: + - commons: ["commonsA", "commonsB"] + summary: Get Commons + tags: + - Aggregate + /aggregate/tags: + get: + description: "Returns aggregate category, name and counts across all commons" + operationId: get_aggregate_tags + responses: + '200': + description: Successful Response + content: + application/json: + schema: + type: object + example: + - { + "Data Type": { + "total": 275, + "names": [ + { + "Genotype": 103, + "Clinical Phenotype": 100, + "DCC Harmonized": 24, + "WGS": 20, + "SNP/CNV 
Genotypes (NGS)": 6, + "RNA-Seq": 5, + "WXS": 5, + "Targeted-Capture": 3, + "miRNA-Seq": 3, + "CNV Genotypes": 2 + } + ] + } + } + summary: Get tag counts information + tags: + - Aggregate + /aggregate/metadata: + get: + description: "Returns a list of all commons with data in the aggregate metadata-service" + operationId: get_aggregate_metadata + parameters: + - in: query + name: limit + required: false + schema: + title: limit + type: integer + default: 20 + - in: query + name: offset + schema: + title: offset + type: integer + default: 0 + description: "Return results at this given offset" + - in: query + name: flatten + schema: + title: flatten + type: boolean + default: false + description: "Return the results without grouping items by commons" + - in: query + name: pagination + schema: + title: pagination + type: boolean + default: false + description: "If true will return a pagination object in the response" + - in: query + name: counts + schema: + title: counts + type: string + default: "" + description: "Return count of a field instead of the value if field is an array\ + \ otherwise field is unchanged. If field is null will set field to 0.\ + \ Multiple fields can be compressed by comma separating the field names:\ + \ _files,authors" + responses: + '200': + description: Successful Response + content: + application/json: + schema: + type: object + additionalProperties: true + example: + - { + "commonA" : [ { id2: { name: "bear" } } , { id3: { name: "cat" } } ], + "commonB" : [ { id200: { name: "shark" } } , { id312: { name: "bass" }} ] + } + summary: Get metadata records from aggregate metadata + tags: + - Aggregate + /aggregate/metadata/{name}: + get: + parameters: + - in: path + name: name + required: true + schema: + title: Name + type: string + default: false + description: "Returns an array containing all the metadata entries for a single commons. There are no limit/offset parameters." 
+ operationId: get_aggregate_metadata_for_commons + responses: + '200': + description: Successful Response + content: + application/json: + schema: + type: object + additionalProperties: true + example: + - [ { id2: { name: "bear" } } , { id3: { name: "cat" } }] + summary: Get all metadata records from a commons by name + tags: + - Aggregate + /aggregate/metadata/{name}/info: + get: + parameters: + - in: path + name: name + required: true + schema: + title: Name + type: string + default: false + description: "Returns an object containing additional information about a commons" + operationId: get_aggregate_metadata_commons_info + responses: + '200': + description: Successful Response + content: + application/json: + schema: + type: object + additionalProperties: true + example: + - { commons_url: "gen3.datacommons.io" } + summary: Get additional named commons information + tags: + - Aggregate + /aggregate/metadata/guid/{guid}: + get: + parameters: + - in: path + name: guid + required: true + schema: + title: Guid + type: string + default: false + description: "Returns a metadata record by GUID" + operationId: get_aggregate_metadata_guid + responses: + '200': + description: Successful Response + content: + application/json: + schema: + type: object + additionalProperties: true + example: + - { id2: { name: "bear" } } + summary: Get metadata entry by guid + tags: + - Aggregate From a1d80fd5ea00a7680348e4ce963dc4fe8a01c3fb Mon Sep 17 00:00:00 2001 From: craigrbarnes Date: Wed, 12 Oct 2022 17:40:33 +0000 Subject: [PATCH 61/70] Apply automatic documentation changes --- docs/openapi.yaml | 483 +++++++++++++++++++++------------------------- 1 file changed, 222 insertions(+), 261 deletions(-) diff --git a/docs/openapi.yaml b/docs/openapi.yaml index 973159da..4a555a8d 100644 --- a/docs/openapi.yaml +++ b/docs/openapi.yaml @@ -1,5 +1,15 @@ components: schemas: + AliasObjInput: + description: "Alias object\n\naliases (list, optional): unique names to allow\ + \ using in place 
of whatever GUID\n specified" + properties: + aliases: + items: {} + title: Aliases + type: array + title: AliasObjInput + type: object CreateObjForIdInput: description: "Create object.\n\nfile_name (str): Name for the file being uploaded\n\ aliases (list, optional): unique name to allow using in place of whatever\ @@ -82,14 +92,14 @@ components: type: http info: title: Framework Services Object Management Service - version: 1.8.1 + version: 1.10.0 openapi: 3.0.2 paths: /_status: get: description: "Returns the status of the MDS:\n * error: if there was no error\ \ this will be \"none\"\n * last_update: timestamp of the last data pull from\ - \ the commons\n * count: number of entries\n:return:" + \ the commons\n * count: number of entries" operationId: get_status__status_get responses: '200': @@ -348,6 +358,175 @@ paths: summary: Update Metadata tags: - Maintain + /metadata/{guid}/aliases: + delete: + description: Delete all metadata_aliases of the GUID. + operationId: delete_all_metadata_aliases_metadata__guid__aliases_delete + parameters: + - in: path + name: guid + required: true + schema: + title: Guid + type: string + responses: + '200': + content: + application/json: + schema: {} + description: Successful Response + '422': + content: + application/json: + schema: + $ref: '#/components/schemas/HTTPValidationError' + description: Validation Error + security: + - HTTPBasic: [] + - HTTPBearer: [] + summary: Delete All Metadata Aliases + tags: + - Aliases + get: + description: Get the aliases for the provided GUID + operationId: get_metadata_aliases_metadata__guid__aliases_get + parameters: + - in: path + name: guid + required: true + schema: + title: Guid + type: string + responses: + '200': + content: + application/json: + schema: {} + description: Successful Response + '422': + content: + application/json: + schema: + $ref: '#/components/schemas/HTTPValidationError' + description: Validation Error + summary: Get Metadata Aliases + tags: + - Query + post: + 
description: Create metadata aliases for the GUID. + operationId: create_metadata_aliases_metadata__guid__aliases_post + parameters: + - in: path + name: guid + required: true + schema: + title: Guid + type: string + requestBody: + content: + application/json: + schema: + $ref: '#/components/schemas/AliasObjInput' + required: true + responses: + '200': + content: + application/json: + schema: {} + description: Successful Response + '422': + content: + application/json: + schema: + $ref: '#/components/schemas/HTTPValidationError' + description: Validation Error + security: + - HTTPBasic: [] + - HTTPBearer: [] + summary: Create Metadata Aliases + tags: + - Aliases + put: + description: 'Update the metadata aliases of the GUID. + + + If `merge` is True, then any aliases that are not in the new data will be + + kept.' + operationId: update_metadata_alias_metadata__guid__aliases_put + parameters: + - in: path + name: guid + required: true + schema: + title: Guid + type: string + - in: query + name: merge + required: false + schema: + default: false + title: Merge + type: boolean + requestBody: + content: + application/json: + schema: + $ref: '#/components/schemas/AliasObjInput' + required: true + responses: + '200': + content: + application/json: + schema: {} + description: Successful Response + '422': + content: + application/json: + schema: + $ref: '#/components/schemas/HTTPValidationError' + description: Validation Error + security: + - HTTPBasic: [] + - HTTPBearer: [] + summary: Update Metadata Alias + tags: + - Aliases + /metadata/{guid}/aliases/{alias}: + delete: + description: Delete the specified metadata_alias of the GUID. 
+ operationId: delete_metadata_alias_metadata__guid__aliases__alias__delete + parameters: + - in: path + name: guid + required: true + schema: + title: Guid + type: string + - in: path + name: alias + required: true + schema: + title: Alias + type: string + responses: + '200': + content: + application/json: + schema: {} + description: Successful Response + '422': + content: + application/json: + schema: + $ref: '#/components/schemas/HTTPValidationError' + description: Validation Error + security: + - HTTPBasic: [] + - HTTPBearer: [] + summary: Delete Metadata Alias + tags: + - Aliases /metadata_index: get: description: List all the metadata key paths indexed in the database. @@ -418,11 +597,8 @@ paths: - Index /objects/upload: post: - description: "Create object placeholder and attach metadata, return Upload url\ - \ to the user.\n\nArgs:\n body (CreateObjInput): input body for create\ - \ object\n request (Request): starlette request (which contains reference\ - \ to FastAPI app)\n token (HTTPAuthorizationCredentials, optional): bearer\ - \ token" + description: Create object placeholder and attach metadata, return Upload url + to the user. 
operationId: create_object_objects_upload_post requestBody: content: @@ -449,16 +625,18 @@ paths: - Object /objects/{guid}: delete: - description: "Delete the metadata for the specified object and also delete the\ - \ record from indexd.\n[Optional] Remove the object from existing bucket location(s)\ - \ by proxying to\nfence DELETE /data/file_id by using an additional query\ - \ parameter `delete_file_locations`.\nUses the response status code from fence/indexd\ - \ to determine whether user has\npermission to delete metadata.\n\nArgs:\n\ - \ guid (str): indexd GUID or alias, or MDS key\n request (Request):\ - \ starlette request (which contains reference to FastAPI app)\nReturns:\n\ - \ 204: if record and metadata are deleted\n 403: if fence/indexd returns\ - \ a 403 unauthorized response\n 500: if fence/indexd does not return 204\ - \ or 403 or there is an error deleting metadata" + description: 'Delete the metadata for the specified object and also delete the + record from indexd. + + [Optional] Remove the object from existing bucket location(s) by proxying + to + + fence DELETE /data/file_id by using an additional query parameter `delete_file_locations`. + + Uses the response status code from fence/indexd to determine whether user + has + + permission to delete metadata.' operationId: delete_object_objects__guid__delete parameters: - in: path @@ -485,12 +663,10 @@ paths: tags: - Object get: - description: "Get the metadata associated with the provided key. If the key\ - \ is an\nindexd GUID or alias, also returns the indexd record.\n\nArgs:\n\ - \ guid (str): indexd GUID or alias, or MDS key\n request (Request):\ - \ starlette request (which contains reference to FastAPI app)\n\nReturns:\n\ - \ 200: { \"record\": { indexd record }, \"metadata\": { MDS metadata }\ - \ }\n 404: if the key is not in indexd and not in MDS" + description: 'Get the metadata associated with the provided key. 
If the key + is an + + indexd GUID or alias, also returns the indexd record.' operationId: get_object_objects__guid__get parameters: - in: path @@ -515,13 +691,12 @@ paths: tags: - Object post: - description: "Create object placeholder and attach metadata, return Upload url\ - \ to the\nuser. A new GUID (new version of the provided GUID) will be created\ - \ for\nthis object. The new record will have the same authz as the original\ - \ one.\n\nArgs:\n guid (str): indexd GUID or alias\n body (CreateObjForIdInput):\ - \ input body for create object for ID\n request (Request): starlette request\ - \ (which contains reference to FastAPI app)\n token (HTTPAuthorizationCredentials,\ - \ optional): bearer token" + description: 'Create object placeholder and attach metadata, return Upload url + to the + + user. A new GUID (new version of the provided GUID) will be created for + + this object. The new record will have the same authz as the original one.' operationId: create_object_for_id_objects__guid__post parameters: - in: path @@ -555,15 +730,12 @@ paths: - Object /objects/{guid}/download: get: - description: "Send a GET request to the data access service to generate a signed\ - \ download\nurl for the given GUID or alias. Returns the generated signed\ - \ download url\nto the user.\n\nArgs:\n guid (str): indexd GUID or alias\n\ - \ request (Request): starlette request (which contains reference to FastAPI\ - \ app)\n\nReturns:\n 200: { \"url\": signed download url }\n 404: if\ - \ the data access service can not find GUID/alias in indexd\n 403: if the\ - \ data access service returns a 401 or a 403\n 500: if there is an error\ - \ making the request to the data access service\n or the data access service\ - \ returns any other 400-range or 500-range\n error" + description: 'Send a GET request to the data access service to generate a signed + download + + url for the given GUID or alias. Returns the generated signed download url + + to the user.' 
operationId: get_object_signed_download_url_objects__guid__download_get parameters: - in: path @@ -589,16 +761,16 @@ paths: - Object /objects/{guid}/latest: get: - description: "Attempt to fetch the latest version of the provided guid/key from\ - \ indexd.\nIf the provided guid/key is found in indexd, return the indexd\ - \ record and\nmetadata object associated with the latest guid fetched from\ - \ indexd. If the\nprovided guid/key is not found in indexd, return the metadata\ - \ object\nassociated with the provided guid/key.\n\nArgs:\n guid (str):\ - \ indexd GUID or MDS key. alias is NOT supported because the\n corresponding\ - \ endpoint in indexd does not accept alias\n request (Request): starlette\ - \ request (which contains reference to FastAPI app)\n\nReturns:\n 200:\ - \ { \"record\": { indexd record }, \"metadata\": { MDS metadata } }\n 404:\ - \ if the key is not in indexd and not in MDS" + description: 'Attempt to fetch the latest version of the provided guid/key from + indexd. + + If the provided guid/key is found in indexd, return the indexd record and + + metadata object associated with the latest guid fetched from indexd. If the + + provided guid/key is not found in indexd, return the metadata object + + associated with the provided guid/key.' operationId: get_object_latest_objects__guid__latest_get parameters: - in: path @@ -624,6 +796,7 @@ paths: - Object /version: get: + description: '' operationId: get_version_version_get responses: '200': @@ -632,215 +805,3 @@ paths: schema: {} description: Successful Response summary: Get Version - /aggregate/info/{what}: - get: - description: "Returns status and configuration information about aggregate metadata\ - \ service. 
Current support only 1 information type:\ - \ **schema**" - operationId: get_aggregate_info - parameters: - - in: path - required: true - schema: - title: What - type: string - name: what - description: type of information to return - responses: - '200': - description: Successful Response - content: - application/json: - schema: {} - summary: Get Config Information - tags: - - Aggregate - /aggregate/commons: - get: - description: "Returns a list of all commons with data in the aggregate metadata-service" - operationId: get_aggregate_commons - responses: - '200': - description: Successful Response - content: - application/json: - schema: - type: object - properties: - commons: - type: array - items: - type: string - example: - - commons: ["commonsA", "commonsB"] - summary: Get Commons - tags: - - Aggregate - /aggregate/tags: - get: - description: "Returns aggregate category, name and counts across all commons" - operationId: get_aggregate_tags - responses: - '200': - description: Successful Response - content: - application/json: - schema: - type: object - example: - - { - "Data Type": { - "total": 275, - "names": [ - { - "Genotype": 103, - "Clinical Phenotype": 100, - "DCC Harmonized": 24, - "WGS": 20, - "SNP/CNV Genotypes (NGS)": 6, - "RNA-Seq": 5, - "WXS": 5, - "Targeted-Capture": 3, - "miRNA-Seq": 3, - "CNV Genotypes": 2 - } - ] - } - } - summary: Get tag counts information - tags: - - Aggregate - /aggregate/metadata: - get: - description: "Returns a list of all commons with data in the aggregate metadata-service" - operationId: get_aggregate_metadata - parameters: - - in: query - name: limit - required: false - schema: - title: limit - type: integer - default: 20 - - in: query - name: offset - schema: - title: offset - type: integer - default: 0 - description: "Return results at this given offset" - - in: query - name: flatten - schema: - title: flatten - type: boolean - default: false - description: "Return the results without grouping items by commons" - - 
in: query - name: pagination - schema: - title: pagination - type: boolean - default: false - description: "If true will return a pagination object in the response" - - in: query - name: counts - schema: - title: counts - type: string - default: "" - description: "Return count of a field instead of the value if field is an array\ - \ otherwise field is unchanged. If field is null will set field to 0.\ - \ Multiple fields can be compressed by comma separating the field names:\ - \ _files,authors" - responses: - '200': - description: Successful Response - content: - application/json: - schema: - type: object - additionalProperties: true - example: - - { - "commonA" : [ { id2: { name: "bear" } } , { id3: { name: "cat" } } ], - "commonB" : [ { id200: { name: "shark" } } , { id312: { name: "bass" }} ] - } - summary: Get metadata records from aggregate metadata - tags: - - Aggregate - /aggregate/metadata/{name}: - get: - parameters: - - in: path - name: name - required: true - schema: - title: Name - type: string - default: false - description: "Returns an array containing all the metadata entries for a single commons. There are no limit/offset parameters." 
- operationId: get_aggregate_metadata_for_commons - responses: - '200': - description: Successful Response - content: - application/json: - schema: - type: object - additionalProperties: true - example: - - [ { id2: { name: "bear" } } , { id3: { name: "cat" } }] - summary: Get all metadata records from a commons by name - tags: - - Aggregate - /aggregate/metadata/{name}/info: - get: - parameters: - - in: path - name: name - required: true - schema: - title: Name - type: string - default: false - description: "Returns an object containing additional information about a commons" - operationId: get_aggregate_metadata_commons_info - responses: - '200': - description: Successful Response - content: - application/json: - schema: - type: object - additionalProperties: true - example: - - { commons_url: "gen3.datacommons.io" } - summary: Get additional named commons information - tags: - - Aggregate - /aggregate/metadata/guid/{guid}: - get: - parameters: - - in: path - name: guid - required: true - schema: - title: Guid - type: string - default: false - description: "Returns a metadata record by GUID" - operationId: get_aggregate_metadata_guid - responses: - '200': - description: Successful Response - content: - application/json: - schema: - type: object - additionalProperties: true - example: - - { id2: { name: "bear" } } - summary: Get metadata entry by guid - tags: - - Aggregate From 7e8ef8d48ac5e9b4017c759d2de1b41b473493d9 Mon Sep 17 00:00:00 2001 From: Craig Barnes Date: Wed, 12 Oct 2022 16:15:36 -0500 Subject: [PATCH 62/70] update docs and tests --- .secrets.baseline | 10 +--------- pyproject.toml | 2 +- src/mds/agg_mds/adapters.py | 2 +- src/mds/agg_mds/query.py | 7 +++---- tests/test_agg_mds_adapters.py | 18 ++++++++++++++++-- tests/test_populate.py | 1 - 6 files changed, 22 insertions(+), 18 deletions(-) diff --git a/.secrets.baseline b/.secrets.baseline index b4e8d976..f21d3f0f 100644 --- a/.secrets.baseline +++ b/.secrets.baseline @@ -3,7 +3,7 @@ "files": null, 
"lines": null }, - "generated_at": "2022-10-03T18:10:28Z", + "generated_at": "2022-10-12T21:15:02Z", "plugins_used": [ { "name": "AWSKeyDetector" @@ -106,14 +106,6 @@ "type": "Hex High Entropy String" } ], - "tests/test_agg_mds_adapters.py": [ - { - "hashed_secret": "143e9f2aca10dbd2711cb96047f4016f095e5709", - "is_verified": false, - "line_number": 3898, - "type": "Hex High Entropy String" - } - ], "tests/test_migrations.py": [ { "hashed_secret": "4dcba4ad1d671981e2d211ebe56da8a5b40f14ef", diff --git a/pyproject.toml b/pyproject.toml index 126cc708..ab60c816 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "mds" -version = "1.10.0" +version = "1.11.0" description = "Metadata Service" authors = ["CTDS UChicago "] license = "Apache-2.0" diff --git a/src/mds/agg_mds/adapters.py b/src/mds/agg_mds/adapters.py index 5ed25f1f..3e067a55 100644 --- a/src/mds/agg_mds/adapters.py +++ b/src/mds/agg_mds/adapters.py @@ -150,7 +150,7 @@ def mapFields(item: dict, mappings: dict, global_filters=None, schema=None) -> d } :param item: dictionary to map fields to - :param mappings: + :param mappings: dictionary describing fields to add :return: """ diff --git a/src/mds/agg_mds/query.py b/src/mds/agg_mds/query.py index b2ff097c..a0c8a4e6 100644 --- a/src/mds/agg_mds/query.py +++ b/src/mds/agg_mds/query.py @@ -47,7 +47,8 @@ async def metadata( False, description="If true will return a pagination object in the response" ), ): - """ + """Returns metadata records + The pagination option adds a pagination object to the response: { "commonA" : { @@ -98,9 +99,7 @@ async def metadata( @mod.get("/aggregate/metadata/{name}") async def metadata_name(name: str): - """ - Returns the all the metadata from the named commons. 
- """ + """Returns the all the metadata from the named commons.""" res = await datastore.get_all_named_commons_metadata(name) if res: return res diff --git a/tests/test_agg_mds_adapters.py b/tests/test_agg_mds_adapters.py index 315b1116..6668123e 100644 --- a/tests/test_agg_mds_adapters.py +++ b/tests/test_agg_mds_adapters.py @@ -7,6 +7,8 @@ strip_email, strip_html, add_icpsr_source_url, + FieldFilters, + get_json_path_value, ) from tenacity import RetryError, wait_none import httpx @@ -18,6 +20,18 @@ def test_filters_with_bad_entries(): assert add_icpsr_source_url(77) == 77 +def test_non_existing_filters(): + assert FieldFilters().execute("nofilter", "passthru") == "passthru" + + +def test_json_path(): + assert get_json_path_value(None, {}) is None + assert get_json_path_value("shark", {"shark": ["great", "white"]}) == [ + "great", + "white", + ] + + @respx.mock def test_get_metadata_icpsr(): xml_response = """ @@ -3080,7 +3094,7 @@ def test_get_metadata_mps(): "authz": "", "sites": "", "summary": "path:description", - "study_url": "path:url", + "study_url": {"path": "url", "default": ""}, "location": "path:data_group", "subjects": "", "__manifest": "", @@ -3089,7 +3103,7 @@ def test_get_metadata_mps(): "institutions": "path:data_group", "year_awarded": "", "investigators": "path:data_group", - "project_title": "path:title", + "project_title": {"path": "title", "default": ""}, "protocol_name": "", "study_summary": "", "_file_manifest": "", diff --git a/tests/test_populate.py b/tests/test_populate.py index 75869e73..cf666570 100644 --- a/tests/test_populate.py +++ b/tests/test_populate.py @@ -410,7 +410,6 @@ async def test_populate_main(): @respx.mock @pytest.mark.asyncio async def test_populate_main_fail(): - patch("mds.config.USE_AGG_MDS", True).start() patch.object(datastore, "init", AsyncMock()).start() patch.object(datastore, "drop_all_temp_indexes", AsyncMock()).start() From fcb736bba4a2027170b2aec6e0f8b56f3a2c03e3 Mon Sep 17 00:00:00 2001 From: craigrbarnes 
Date: Wed, 12 Oct 2022 21:17:17 +0000 Subject: [PATCH 63/70] Apply automatic documentation changes --- docs/openapi.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/openapi.yaml b/docs/openapi.yaml index 4a555a8d..62926316 100644 --- a/docs/openapi.yaml +++ b/docs/openapi.yaml @@ -92,7 +92,7 @@ components: type: http info: title: Framework Services Object Management Service - version: 1.10.0 + version: 1.11.0 openapi: 3.0.2 paths: /_status: From 41c80a73a44f01380ee38287706624cf74731094 Mon Sep 17 00:00:00 2001 From: Craig Barnes Date: Wed, 12 Oct 2022 16:28:07 -0500 Subject: [PATCH 64/70] update Tags --- src/mds/agg_mds/query.py | 11 +++-------- 1 file changed, 3 insertions(+), 8 deletions(-) diff --git a/src/mds/agg_mds/query.py b/src/mds/agg_mds/query.py index a0c8a4e6..c53c02a4 100644 --- a/src/mds/agg_mds/query.py +++ b/src/mds/agg_mds/query.py @@ -8,18 +8,13 @@ @mod.get("/aggregate/commons") async def get_commons(): - """ - Returns a list of all registered commons - :return: - """ + """Returns a list of all registered commons""" return await datastore.get_commons() @mod.get("/aggregate/info/{what}") async def get_commons(what: str): - """ - Returns information from the aggregate metadata service. 
- """ + """Returns information from the aggregate metadata service.""" res = await datastore.get_commons_attribute(what) if res: return res @@ -158,4 +153,4 @@ async def metadata_name_guid(guid: str): def init_app(app): if config.USE_AGG_MDS: - app.include_router(mod, tags=["Query"]) + app.include_router(mod, tags=["Aggregate"]) From e6e9a04e76ad7e1118c704c930f35b1a5362efc7 Mon Sep 17 00:00:00 2001 From: Craig Barnes Date: Wed, 12 Oct 2022 18:51:22 -0500 Subject: [PATCH 65/70] update FastAPI documentation --- src/mds/agg_mds/query.py | 139 ++++++++++++++++++++++++++++++--------- 1 file changed, 107 insertions(+), 32 deletions(-) diff --git a/src/mds/agg_mds/query.py b/src/mds/agg_mds/query.py index c53c02a4..365acc6c 100644 --- a/src/mds/agg_mds/query.py +++ b/src/mds/agg_mds/query.py @@ -2,19 +2,41 @@ from starlette.status import HTTP_404_NOT_FOUND from mds import config from mds.agg_mds import datastore +from typing import Any, Dict, List +from pydantic import BaseModel mod = APIRouter() @mod.get("/aggregate/commons") async def get_commons(): - """Returns a list of all registered commons""" + """Returns a list of all commons with data in the aggregate metadata-service + + Example: + + { commons: ["commonsA", "commonsB" ] } + + """ return await datastore.get_commons() @mod.get("/aggregate/info/{what}") -async def get_commons(what: str): - """Returns information from the aggregate metadata service.""" +async def get_commons_info(what: str): + """Returns status and configuration information about aggregate metadata service. + + Return configuration information. Currently supports only 1 information type: + **schema** + + Example: + + { + schema: { + ... + ... + } + } + + """ res = await datastore.get_commons_attribute(what) if res: return res @@ -26,14 +48,18 @@ async def get_commons(what: str): @mod.get("/aggregate/metadata") -async def metadata( +async def get_aggregate_metadata( _: Request, limit: int = Query( 20, description="Maximum number of records returned. 
(e.g. max: 2000)" ), offset: int = Query(0, description="Return results at this given offset."), counts: str = Query( - "", description="Return count of a field instead of the value." + "", + description="Return count of a field instead of the value if field is an array\ + otherwise field is unchanged. If field is **null** will set field to **0**.\ + Multiple fields can be compressed by comma separating the field names:\ + **files,authors**", ), flatten: bool = Query( False, description="Return the results without grouping items by commons." @@ -44,19 +70,10 @@ async def metadata( ): """Returns metadata records - The pagination option adds a pagination object to the response: - { - "commonA" : { - ... Metadata - }, - "commonB" : { - ... Metadata - } - ... - } + Returns medata records namespaced by commons as a JSON object. + Example without pagination: { - results: { "commonA" : { ... Metadata }, @@ -64,16 +81,30 @@ async def metadata( ... Metadata } ... - }, - "pagination": { - "hits": 64, - "offset": 0, - "pageSize": 20, - "pages": 4 } - } + + The pagination option adds a pagination object to the response: + + { + results: { + "commonA" : { + ... Metadata + }, + "commonB" : { + ... Metadata + } + ... + }, + "pagination": { + "hits": 64, + "offset": 0, + "pageSize": 20, + "pages": 4 + } + } The flatten option removes the commons' namespace so all results are a child or results: + results: { ... Metadata from commons A ... Metadata from commons B @@ -84,7 +115,7 @@ async def metadata( The counts options when applied to an array or dictionary will replace the field value with its length. If the field values is None it will replace it with 0. - All other type will be unchanged. + All other types will be unchanged. 
""" results = await datastore.get_all_metadata(limit, offset, counts, flatten) if pagination is False: @@ -93,8 +124,21 @@ async def metadata( @mod.get("/aggregate/metadata/{name}") -async def metadata_name(name: str): - """Returns the all the metadata from the named commons.""" +async def get_aggregate_metadata_for_commons( + name: str = Query( + False, description="Return the results without grouping items by commons." + ) +): + """et all metadata records from a commons by name + + Returns an array containing all the metadata entries for a single commons. + There are no limit/offset parameters. + + Example: + + [ { id2: { name: "bear" } } , { id3: { name: "cat" } }] + + """ res = await datastore.get_all_named_commons_metadata(name) if res: return res @@ -106,9 +150,30 @@ async def metadata_name(name: str): @mod.get("/aggregate/tags") -async def metadata_tags(): - """ - Returns the tags associated with the named commons. +async def get_aggregate_tags(): + """Returns aggregate category, name and counts across all commons + + Example: + + { + "Data Type": { + "total": 275, + "names": [ + { + "Genotype": 103, + "Clinical Phenotype": 100, + "DCC Harmonized": 24, + "WGS": 20, + "SNP/CNV Genotypes (NGS)": 6, + "RNA-Seq": 5, + "WXS": 5, + "Targeted-Capture": 3, + "miRNA-Seq": 3, + "CNV Genotypes": 2 + } + ] + } + } """ res = await datastore.get_all_tags() if res: @@ -121,9 +186,14 @@ async def metadata_tags(): @mod.get("/aggregate/metadata/{name}/info") -async def metadata_info(name: str): +async def get_aggregate_metadata_commons_info(name: str): """ Returns information from the named commons. 
+ + Example: + + { commons_url: "gen3.datacommons.io" } + """ res = await datastore.get_commons_attribute(name) if res: @@ -136,8 +206,13 @@ async def metadata_info(name: str): @mod.get("/aggregate/metadata/guid/{guid:path}") -async def metadata_name_guid(guid: str): - """Get the metadata of the GUID in the named commons.""" +async def get_aggregate_metadata_guid(guid: str): + """Returns a metadata record by GUID + + Example: + + { id2: { name: "bear" } } + """ res = await datastore.get_by_guid(guid) if res: return res From 3522b4fda2fb09892915c55f0fcf855958684d6a Mon Sep 17 00:00:00 2001 From: Pauline <4224001+paulineribeyre@users.noreply.github.com> Date: Thu, 13 Oct 2022 10:34:33 -0500 Subject: [PATCH 66/70] fix agg mds docs not generated --- run.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/run.py b/run.py index fd354ac7..688a8798 100644 --- a/run.py +++ b/run.py @@ -34,6 +34,8 @@ def _get_schema_with_clean_descriptions(): of the docstring. It does so by splitting on the first well-defined part of the Google-style docstring, the string "Args", and returning only everything before that. 
""" + mds.config.USE_AGG_MDS = True # so the aggregate MDS docs are added + raw_schema = get_app().openapi() output_schema = {} output_schema["openapi"] = raw_schema.get("openapi", {}) From a67826da8d136c122a14927bde8502afebc3449a Mon Sep 17 00:00:00 2001 From: Pauline <4224001+paulineribeyre@users.noreply.github.com> Date: Thu, 13 Oct 2022 10:39:10 -0500 Subject: [PATCH 67/70] set USE_AGG_MDS in workflow --- .github/workflows/docs.yaml | 2 ++ run.py | 2 -- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/docs.yaml b/.github/workflows/docs.yaml index 21c00155..f02d57f6 100644 --- a/.github/workflows/docs.yaml +++ b/.github/workflows/docs.yaml @@ -31,6 +31,8 @@ jobs: poetry install -vv --no-interaction poetry show -vv - name: Build docs + env: + USE_AGG_MDS: true # so the aggregate MDS docs are added run: poetry run python run.py openapi - uses: stefanzweifel/git-auto-commit-action@v4.1.2 diff --git a/run.py b/run.py index 688a8798..fd354ac7 100644 --- a/run.py +++ b/run.py @@ -34,8 +34,6 @@ def _get_schema_with_clean_descriptions(): of the docstring. It does so by splitting on the first well-defined part of the Google-style docstring, the string "Args", and returning only everything before that. 
""" - mds.config.USE_AGG_MDS = True # so the aggregate MDS docs are added - raw_schema = get_app().openapi() output_schema = {} output_schema["openapi"] = raw_schema.get("openapi", {}) From 35f9137a8968befb6ffec7aab9bee567af6ade57 Mon Sep 17 00:00:00 2001 From: paulineribeyre Date: Thu, 13 Oct 2022 15:40:40 +0000 Subject: [PATCH 68/70] Apply automatic documentation changes --- docs/openapi.yaml | 232 ++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 232 insertions(+) diff --git a/docs/openapi.yaml b/docs/openapi.yaml index 62926316..2743ce66 100644 --- a/docs/openapi.yaml +++ b/docs/openapi.yaml @@ -108,6 +108,238 @@ paths: schema: {} description: Successful Response summary: Get Status + /aggregate/commons: + get: + description: "Returns a list of all commons with data in the aggregate metadata-service\n\ + \nExample:\n\n { commons: [\"commonsA\", \"commonsB\" ] }" + operationId: get_commons_aggregate_commons_get + responses: + '200': + content: + application/json: + schema: {} + description: Successful Response + summary: Get Commons + tags: + - Aggregate + /aggregate/info/{what}: + get: + description: "Returns status and configuration information about aggregate metadata\ + \ service.\n\nReturn configuration information. 
Currently supports only 1\ + \ information type:\n**schema**\n\nExample:\n\n {\n schema: {\n \ + \ ...\n ...\n }\n }" + operationId: get_commons_info_aggregate_info__what__get + parameters: + - in: path + name: what + required: true + schema: + title: What + type: string + responses: + '200': + content: + application/json: + schema: {} + description: Successful Response + '422': + content: + application/json: + schema: + $ref: '#/components/schemas/HTTPValidationError' + description: Validation Error + summary: Get Commons Info + tags: + - Aggregate + /aggregate/metadata: + get: + description: "Returns metadata records\n\nReturns metadata records namespaced\ + \ by commons as a JSON object.\nExample without pagination:\n\n {\n \ + \ \"commonA\" : {\n ... Metadata\n },\n \"commonB\"\ + \ : {\n ... Metadata\n }\n ...\n }\n\nThe pagination\ + \ option adds a pagination object to the response:\n\n {\n results:\ + \ {\n \"commonA\" : {\n ... Metadata\n },\n\ + \ \"commonB\" : {\n ... Metadata\n }\n \ + \ ...\n },\n \"pagination\": {\n \"hits\":\ + \ 64,\n \"offset\": 0,\n \"pageSize\": 20,\n \ + \ \"pages\": 4\n }\n }\n\nThe flatten option removes the commons'\ + \ namespace so all results are a child of results:\n\n results: {\n \ + \ ... Metadata from commons A\n ... Metadata from commons B\n\ + \ }\n ...\n },\n\n\nThe counts option when applied to an array\ + \ or dictionary will replace\nthe field value with its length. If the field\ + \ value is None it will replace it with 0.\nAll other types will be unchanged." + operationId: get_aggregate_metadata_aggregate_metadata_get + parameters: + - description: 'Maximum number of records returned. (e.g. max: 2000)' + in: query + name: limit + required: false + schema: + default: 20 + description: 'Maximum number of records returned. (e.g. max: 2000)' + title: Limit + type: integer + - description: Return results at this given offset.
+ in: query + name: offset + required: false + schema: + default: 0 + description: Return results at this given offset. + title: Offset + type: integer + - description: 'Return count of a field instead of the value if field is an + array otherwise field is unchanged. If field is **null** will + set field to **0**. Multiple fields can be compressed by comma + separating the field names: **files,authors**' + in: query + name: counts + required: false + schema: + default: '' + description: 'Return count of a field instead of the value if field is an + array otherwise field is unchanged. If field is **null** will + set field to **0**. Multiple fields can be compressed by comma + separating the field names: **files,authors**' + title: Counts + type: string + - description: Return the results without grouping items by commons. + in: query + name: flatten + required: false + schema: + default: false + description: Return the results without grouping items by commons. + title: Flatten + type: boolean + - description: If true will return a pagination object in the response + in: query + name: pagination + required: false + schema: + default: false + description: If true will return a pagination object in the response + title: Pagination + type: boolean + responses: + '200': + content: + application/json: + schema: {} + description: Successful Response + '422': + content: + application/json: + schema: + $ref: '#/components/schemas/HTTPValidationError' + description: Validation Error + summary: Get Aggregate Metadata + tags: + - Aggregate + /aggregate/metadata/guid/{guid}: + get: + description: "Returns a metadata record by GUID\n\nExample:\n\n { id2: {\ + \ name: \"bear\" } }" + operationId: get_aggregate_metadata_guid_aggregate_metadata_guid__guid__get + parameters: + - in: path + name: guid + required: true + schema: + title: Guid + type: string + responses: + '200': + content: + application/json: + schema: {} + description: Successful Response + '422': + content: + 
application/json: + schema: + $ref: '#/components/schemas/HTTPValidationError' + description: Validation Error + summary: Get Aggregate Metadata Guid + tags: + - Aggregate + /aggregate/metadata/{name}: + get: + description: "Get all metadata records from a commons by name\n\nReturns an array\ + \ containing all the metadata entries for a single commons.\nThere are no\ + \ limit/offset parameters.\n\nExample:\n\n [ { id2: { name: \"bear\" }\ + \ } , { id3: { name: \"cat\" } }]" + operationId: get_aggregate_metadata_for_commons_aggregate_metadata__name__get + parameters: + - in: path + name: name + required: true + schema: + title: Name + type: string + responses: + '200': + content: + application/json: + schema: {} + description: Successful Response + '422': + content: + application/json: + schema: + $ref: '#/components/schemas/HTTPValidationError' + description: Validation Error + summary: Get Aggregate Metadata For Commons + tags: + - Aggregate + /aggregate/metadata/{name}/info: + get: + description: "Returns information from the named commons.\n\nExample:\n\n \ + \ { commons_url: \"gen3.datacommons.io\" }" + operationId: get_aggregate_metadata_commons_info_aggregate_metadata__name__info_get + parameters: + - in: path + name: name + required: true + schema: + title: Name + type: string + responses: + '200': + content: + application/json: + schema: {} + description: Successful Response + '422': + content: + application/json: + schema: + $ref: '#/components/schemas/HTTPValidationError' + description: Validation Error + summary: Get Aggregate Metadata Commons Info + tags: + - Aggregate + /aggregate/tags: + get: + description: "Returns aggregate category, name and counts across all commons\n\ + \nExample:\n\n {\n \"Data Type\": {\n \"total\"\ + : 275,\n \"names\": [\n {\n \"Genotype\"\ + : 103,\n \"Clinical Phenotype\": 100,\n \"DCC\ + \ Harmonized\": 24,\n \"WGS\": 20,\n \"SNP/CNV\ + \ Genotypes (NGS)\": 6,\n \"RNA-Seq\": 5,\n \ + \ \"WXS\": 5,\n \"Targeted-Capture\":
3,\n \"\ + miRNA-Seq\": 3,\n \"CNV Genotypes\": 2\n }\n \ + \ ]\n }\n }" + operationId: get_aggregate_tags_aggregate_tags_get + responses: + '200': + content: + application/json: + schema: {} + description: Successful Response + summary: Get Aggregate Tags + tags: + - Aggregate /metadata: get: description: "Search the metadata.\n\nWithout filters, this will return all\ From 992ddb37ba921a142ac3c9617f8a12f3c88044c0 Mon Sep 17 00:00:00 2001 From: Craig Barnes Date: Thu, 20 Oct 2022 10:42:20 -0500 Subject: [PATCH 69/70] extend httpx and rety timeout for Gen3Adapter --- src/mds/agg_mds/adapters.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/src/mds/agg_mds/adapters.py b/src/mds/agg_mds/adapters.py index 3e067a55..0563dcdd 100644 --- a/src/mds/agg_mds/adapters.py +++ b/src/mds/agg_mds/adapters.py @@ -873,9 +873,9 @@ class Gen3Adapter(RemoteMetadataAdapter): """ @retry( - stop=stop_after_attempt(5), + stop=stop_after_attempt(10), retry=retry_if_exception_type(httpx.TimeoutException), - wait=wait_random_exponential(multiplier=1, max=20), + wait=wait_random_exponential(multiplier=1, max=60), before_sleep=before_sleep_log(logger, logging.DEBUG), ) def getRemoteDataAsJson(self, **kwargs) -> Dict: @@ -896,6 +896,8 @@ def getRemoteDataAsJson(self, **kwargs) -> Dict: offset = 0 limit = min(maxItems, batchSize) if maxItems is not None else batchSize moreData = True + # extend httpx timeout + timeout = httpx.Timeout(connect=60, read=120, write=5, pool=60) while moreData: try: url = f"{mds_url}mds/metadata?data=True&_guid_type={guid_type}&limit={limit}&offset={offset}" @@ -903,7 +905,7 @@ def getRemoteDataAsJson(self, **kwargs) -> Dict: url += f"&{filters}" if field_name is not None and field_value is not None: url += f"&{guid_type}.{field_name}={field_value}" - response = httpx.get(url) + response = httpx.get(url, timeout=timeout) response.raise_for_status() data = response.json() From 750ebc0b3815fe928bddb1d5b86558a852f8cbcb Mon Sep 17 00:00:00 2001 
From: Craig Barnes Date: Wed, 26 Oct 2022 13:43:03 -0500 Subject: [PATCH 70/70] add warning for field not in schema and no value --- src/mds/agg_mds/adapters.py | 31 ++++++++++++++++++++----------- 1 file changed, 20 insertions(+), 11 deletions(-) diff --git a/src/mds/agg_mds/adapters.py b/src/mds/agg_mds/adapters.py index 0563dcdd..26022c3a 100644 --- a/src/mds/agg_mds/adapters.py +++ b/src/mds/agg_mds/adapters.py @@ -210,8 +210,14 @@ def mapFields(item: dict, mappings: dict, global_filters=None, schema=None) -> d if key in schema: field_value = schema[key].normalize_value(field_value) # set to default if conversion failed and a default value is available - if field_value is None and hasDefaultValue: - field_value = default_value + if field_value is None: + if hasDefaultValue: + field_value = default_value + else: + logger.warn( + f"{key} = None{', is not in the schema,' if key not in schema else ''} " + f"and has no default value. Consider adding {key} to the schema" + ) results[key] = field_value return results @@ -271,7 +277,7 @@ def getRemoteDataAsJson(self, **kwargs) -> Tuple[Dict, str]: raise except httpx.HTTPError as exc: logger.error( - f"An HTTP error { exc.response.status_code if exc.response is not None else '' } occurred while requesting {exc.request.url}. Skipping {id}" + f"An HTTP error {exc.response.status_code if exc.response is not None else ''} occurred while requesting {exc.request.url}. 
Skipping {id}" ) break except Exception as exc: @@ -780,7 +786,7 @@ def getRemoteDataAsJson(self, **kwargs) -> Tuple[Dict, str]: for var_iter, var in enumerate(vars): data_file["data_dictionary"].append( { - "name": var.get("@name", f"var{var_iter+1}"), + "name": var.get("@name", f"var{var_iter + 1}"), "label": var.get("labl", {}).get("#text"), "interval": var.get("@intrvl"), "type": var.get("varFormat", {}).get("@type"), @@ -796,17 +802,17 @@ def getRemoteDataAsJson(self, **kwargs) -> Tuple[Dict, str]: raise except httpx.HTTPError as exc: logger.error( - f"An HTTP error {exc.response.status_code if exc.response is not None else ''} occurred while requesting {exc.request.url}. Returning { len(results['results'])} results" + f"An HTTP error {exc.response.status_code if exc.response is not None else ''} occurred while requesting {exc.request.url}. Returning {len(results['results'])} results" ) break # need to break here as cannot be assured of leaving while loop except ValueError as exc: logger.error( - f"An error occurred while requesting {mds_url} {exc}. Returning { len(results['results'])} results." + f"An error occurred while requesting {mds_url} {exc}. Returning {len(results['results'])} results." ) break except Exception as exc: logger.error( - f"An error occurred while requesting {mds_url} {exc}. Returning { len(results['results'])} results." + f"An error occurred while requesting {mds_url} {exc}. Returning {len(results['results'])} results." 
) break @@ -885,6 +891,9 @@ def getRemoteDataAsJson(self, **kwargs) -> Dict: if mds_url is None: return results + if mds_url[-1] != "/": + mds_url += "/" + config = kwargs.get("config", {}) guid_type = config.get("guid_type", "discovery_metadata") field_name = config.get("field_name", None) @@ -897,7 +906,7 @@ def getRemoteDataAsJson(self, **kwargs) -> Dict: limit = min(maxItems, batchSize) if maxItems is not None else batchSize moreData = True # extend httpx timeout - timeout = httpx.Timeout(connect=60, read=120, write=5, pool=60) + # timeout = httpx.Timeout(connect=60, read=120, write=5, pool=60) while moreData: try: url = f"{mds_url}mds/metadata?data=True&_guid_type={guid_type}&limit={limit}&offset={offset}" @@ -905,7 +914,7 @@ def getRemoteDataAsJson(self, **kwargs) -> Dict: url += f"&{filters}" if field_name is not None and field_value is not None: url += f"&{guid_type}.{field_name}={field_value}" - response = httpx.get(url, timeout=timeout) + response = httpx.get(url) response.raise_for_status() data = response.json() @@ -916,12 +925,12 @@ def getRemoteDataAsJson(self, **kwargs) -> Dict: moreData = False offset += numReturned - except httpx.TimeoutException as exc: + except httpx.TimeoutException: logger.error(f"An timeout error occurred while requesting {url}.") raise except httpx.HTTPError as exc: logger.error( - f"An HTTP error {exc.response.status_code if exc.response is not None else ''} occurred while requesting {exc.request.url}. Returning {len(results['results'])} results." + f"An HTTP error {exc if exc is not None else ''} occurred while requesting {exc.request.url}. Returning {len(results['results'])} results." ) break