From a480d4902e4d03a744e22c44dffcb32ae5f10450 Mon Sep 17 00:00:00 2001
From: Diego Oliveira
Date: Tue, 22 Feb 2022 17:35:01 -0300
Subject: [PATCH] [infra] change download functions to consume CKAN endpoints
 #1129 (#1130)

* [infra] add function to wrap bd_dataset_search endpoint
* Update download.py
* [infra] modify list_datasets function to consume CKAN endpoint
* [infra] fix list_dataset function to include limit and remove order_by
* [infra] change function list_dataset_tables to use CKAN endpoint
* [infra] apply PEP8 to list_dataset_tables and respective tests
* add get_dataset_description, get_table_description, get_table_columns
* [infra] fix dataset_config.yaml folder path (#1067)
* feat(infra) merge master
* fix files organization to match master
* remove download.py
* remove test_download
* Delete test_download.py
* remove test files
* remove test_download.py
* remove test_download.py
* remove test_download.py
* remove test_download.py
* add tests metadata
* remove test_download.py
* remove unused imports
* [infra] add _safe_fetch and get_table_size functions

Co-authored-by: lucascr91
---
 python-package/basedosdados/__init__.py      |   3 +-
 .../basedosdados/download/metadata.py        | 420 +++++++++---------
 .../tests/test_download/test_metadata.py     | 138 +++---
 3 files changed, 288 insertions(+), 273 deletions(-)

diff --git a/python-package/basedosdados/__init__.py b/python-package/basedosdados/__init__.py
index 58e6a68ca..fd197e9ab 100644
--- a/python-package/basedosdados/__init__.py
+++ b/python-package/basedosdados/__init__.py
@@ -21,4 +21,5 @@
     get_dataset_description,
     get_table_columns,
     get_table_size,
-)
+    search
+)
\ No newline at end of file
diff --git a/python-package/basedosdados/download/metadata.py b/python-package/basedosdados/download/metadata.py
index fe5d5dff9..9aae894ba 100644
--- a/python-package/basedosdados/download/metadata.py
+++ b/python-package/basedosdados/download/metadata.py
@@ -1,24 +1,25 @@
 from google.cloud import bigquery
 import pandas as pd
+import requests

 from basedosdados.download.base import credentials

-
-def _get_header(text):
-    """Gets first paragraph of a text
-
-    Args:
-        text (str or None): Text to be split
-
-    Returns:
-        str: First paragraph
+def _safe_fetch(url: str):
     """
-
-    if isinstance(text, str):
-        return text.split("\n")[0]
-    elif text is None:
-        return ""
-
+    Safely fetches urls and, if something goes wrong, informs the user of the possible cause
+    """
+    try:
+        response = requests.get(url)
+        response.raise_for_status()
+        return response
+    except requests.exceptions.HTTPError as errh:
+        print("Http Error:", errh)
+    except requests.exceptions.ConnectionError as errc:
+        print("Error Connecting:", errc)
+    except requests.exceptions.Timeout as errt:
+        print("Timeout Error:", errt)
+    # RequestException is the base class of the exceptions above, so it must
+    # come last or the specific handlers are unreachable
+    except requests.exceptions.RequestException as err:
+        print("This url doesn't appear to exist:", err)


 def _fix_size(s, step=80):
@@ -32,7 +33,6 @@

 def _print_output(df):
     """Prints dataframe contents as print blocks
-
     Args:
         df (pd.DataFrame): table to be printed
     """
@@ -46,14 +46,6 @@ def _print_output(df):
         print("-" * (step + 15))
         print()

-    # func = lambda lista, final, step: (
-    #     func(lista[1:],
-    #         (final + lista[0] + ' ')
-    #         if len(final.split('\n')[-1]) <= step
-    #         else final + '\n',
-    #         step
-    #     ) if len(lista) else final)
-

 def _handle_output(verbose, output_type, df, col_name=None):
     """Handles datasets and tables listing outputs based on user's choice.
@@ -91,234 +83,226 @@ def _handle_output(verbose, output_type, df, col_name=None):

     return None

-
-def list_datasets(
-    query_project_id="basedosdados",
-    filter_by=None,
-    with_description=False,
-    from_file=False,
-    verbose=True,
-):
-    """Fetch the dataset_id of datasets available at query_project_id. Prints information on
-    screen or returns it as a list.
+def list_datasets(query, limit=10, with_description=False, verbose=True):
+    """
+    This function uses the `bd_dataset_search` website API
+    endpoint to retrieve a list of available datasets.

     Args:
-        query_project_id (str): Optional.
-            Which project the table lives. You can change this you want to query different projects.
-        filter_by (str): Optional
-            String to be matched in dataset_id.
+        query (str):
+            String to search in datasets' metadata.
+        limit (int):
+            Field to limit the number of results.
         with_description (bool): Optional
             If True, fetch short dataset description for each dataset.
         verbose (bool): Optional.
             If set to True, information is printed to the screen. If set to False,
             a list object is returned.
-
-    Example:
-        list_datasets(
-            filter_by='sp',
-            with_description=True,
-        )
+    Returns:
+        list | stdout
     """
-    client = bigquery.Client(
-        credentials=credentials(from_file=from_file), project=query_project_id
-    )
-
-    datasets_list = list(client.list_datasets())
-
-    datasets = pd.DataFrame(
-        [dataset.dataset_id for dataset in datasets_list], columns=["dataset_id"]
-    )
-
-    if filter_by:
-
-        datasets = datasets.loc[datasets["dataset_id"].str.contains(filter_by)]
-
-    if with_description:
-
-        datasets["description"] = [
-            _get_header(client.get_dataset(dataset).description)
-            for dataset in datasets["dataset_id"]
+    url = f"https://basedosdados.org/api/3/action/bd_dataset_search?q={query}&page_size={limit}&resource_type=bdm_table"
+
+    response = _safe_fetch(url)
+
+    json_response = response.json()
+
+    # this dict has all the information we need to build the function's output
+    dataset_dict = {
+        "dataset_id": [
+            dataset["name"] for dataset in json_response["result"]["datasets"]
+        ],
+        "description": [
+            dataset["notes"] if "notes" in dataset.keys() else None
+            for dataset in json_response["result"]["datasets"]
+        ],
+    }
+
+    # select the desired output using dataset_id info. Note that the output is
+    # either a standardized string or a list
+    if verbose & (with_description == False):
+        return _print_output(pd.DataFrame.from_dict(dataset_dict)[["dataset_id"]])
+    elif verbose & with_description:
+        return _print_output(
+            pd.DataFrame.from_dict(dataset_dict)[["dataset_id", "description"]]
+        )
+    elif (verbose == False) & (with_description == False):
+        return dataset_dict["dataset_id"]
+    elif (verbose == False) & with_description:
+        return [
+            {
+                "dataset_id": dataset_dict["dataset_id"][k],
+                "description": dataset_dict["description"][k],
+            }
+            for k in range(len(dataset_dict["dataset_id"]))
         ]

-    return _handle_output(
-        verbose=verbose,
-        output_type="list",
-        df=datasets,
-        col_name="dataset_id",
-    )


 def list_dataset_tables(
     dataset_id,
-    query_project_id="basedosdados",
-    from_file=False,
-    filter_by=None,
     with_description=False,
     verbose=True,
 ):
-    """Fetch table_id for tables available at the specified dataset_id. Prints the information
-    on screen or returns it as a list.
+    """
+    Fetch table_id for tables available at the specified dataset_id. Prints the information on screen or returns it as a list.

     Args:
         dataset_id (str): Optional.
-            Dataset id available in basedosdados.
-        query_project_id (str): Optional.
-            Which project the table lives. You can change this you want to query different projects.
-        filter_by (str): Optional
-            String to be matched in the table_id.
+            Dataset id returned by the list_datasets function.
         with_description (bool): Optional
             If True, fetch short table descriptions for each table that match the search criteria.
         verbose (bool): Optional.
             If set to True, information is printed to the screen. If set to False,
             a list object is returned.
-    Example:
-        list_dataset_tables(
-            dataset_id='br_ibge_censo2010'
-            filter_by='renda',
-            with_description=True,
-        )
+    Returns:
+        stdout | list
     """
-    client = bigquery.Client(
-        credentials=credentials(from_file=from_file), project=query_project_id
-    )
-
-    dataset = client.get_dataset(dataset_id)
-
-    tables_list = list(client.list_tables(dataset))
-    tables = pd.DataFrame(
-        [table.table_id for table in tables_list], columns=["table_id"]
-    )
-
-    if filter_by:
-
-        tables = tables.loc[tables["table_id"].str.contains(filter_by)]
-
-    if with_description:
-
-        tables["description"] = [
-            _get_header(client.get_table(f"{dataset_id}.{table}").description)
-            for table in tables["table_id"]
+    # The dataset_id pattern in the bd_dataset_search endpoint response uses a
+    # hyphen as a separator, while the endpoint urls that take a dataset_id
+    # parameter use an underscore. See issue #1079
+    dataset_id = dataset_id.replace("-", "_")
+
+    url = f"https://basedosdados.org/api/3/action/bd_bdm_dataset_show?dataset_id={dataset_id}"
+
+    response = _safe_fetch(url)
+
+    json_response = response.json()
+
+    dataset = json_response["result"]
+
+    # this dict has all the information needed to build the function's output
+    table_dict = {
+        "table_id": [
+            dataset["resources"][k]["name"] for k in range(len(dataset["resources"]))
+        ],
+        "description": [
+            dataset["resources"][k]["description"]
+            for k in range(len(dataset["resources"]))
+        ],
+    }
+
+    # select the desired output using table_id info. Note that the output is
+    # either a standardized string or a list
+    if verbose & (with_description == False):
+        return _print_output(pd.DataFrame.from_dict(table_dict)[["table_id"]])
+    elif verbose & with_description:
+        return _print_output(
+            pd.DataFrame.from_dict(table_dict)[["table_id", "description"]]
+        )
+    elif (verbose == False) & (with_description == False):
+        return table_dict["table_id"]
+    elif (verbose == False) & with_description:
+        return [
+            {
+                "table_id": table_dict["table_id"][k],
+                "description": table_dict["description"][k],
+            }
+            for k in range(len(table_dict["table_id"]))
         ]

-    return _handle_output(
-        verbose=verbose,
-        output_type="list",
-        df=tables,
-        col_name="table_id",
-    )


 def get_dataset_description(
-    dataset_id=None,
-    query_project_id="basedosdados",
-    from_file=False,
+    dataset_id,
     verbose=True,
 ):
-    """Prints the full dataset description.
+    """
+    Prints the full dataset description.

     Args:
-        dataset_id (str): Optional.
-            Dataset id available in basedosdados.
-        query_project_id (str): Optional.
-            Which project the table lives. You can change this you want to query different projects.
+        dataset_id (str): Required.
+            Dataset id available in list_datasets.
         verbose (bool): Optional.
             If set to True, information is printed to the screen. If set to False,
             data is returned as a `str`.
+
+    Returns:
+        stdout | str
     """
+    url = f"https://basedosdados.org/api/3/action/bd_bdm_dataset_show?dataset_id={dataset_id}"

-    client = bigquery.Client(
-        credentials=credentials(from_file=from_file), project=query_project_id
-    )
+    response = _safe_fetch(url)

-    dataset = client.get_dataset(dataset_id)
+    json_response = response.json()

-    return _handle_output(verbose=verbose, output_type="str", df=dataset)
+    description = json_response["result"]["notes"]
+
+    if verbose:
+        print(description)
+    else:
+        return description


 def get_table_description(
-    dataset_id=None,
-    table_id=None,
-    query_project_id="basedosdados",
-    from_file=False,
+    dataset_id,
+    table_id,
     verbose=True,
 ):
-    """Prints the full table description.
+    """
+    Prints the full table description.

     Args:
-        dataset_id (str): Optional.
-            Dataset id available in basedosdados. It should always come with table_id.
-        table_id (str): Optional.
-            Table id available in basedosdados.dataset_id.
-            It should always come with dataset_id.
-        query_project_id (str): Optional.
-            Which project the table lives. You can change this you want to query different projects.
+        dataset_id (str): Required.
+            Dataset id available in list_datasets.
+        table_id (str): Required.
+            Table id available in list_dataset_tables.
         verbose (bool): Optional.
             If set to True, information is printed to the screen. If set to False,
             data is returned as a `str`.
+
+    Returns:
+        stdout | str
     """
+    url = f"https://basedosdados.org/api/3/action/bd_bdm_table_show?dataset_id={dataset_id}&table_id={table_id}"
+
+    response = _safe_fetch(url)

-    client = bigquery.Client(
-        credentials=credentials(from_file=from_file), project=query_project_id
-    )
+    json_response = response.json()

-    table = client.get_table(f"{dataset_id}.{table_id}")
+    description = json_response["result"]["description"]

-    return _handle_output(verbose=verbose, output_type="str", df=table)
+    if verbose:
+        print(description)
+    else:
+        return description


 def get_table_columns(
-    dataset_id=None,
-    table_id=None,
-    query_project_id="basedosdados",
-    from_file=False,
+    dataset_id,
+    table_id,
     verbose=True,
 ):
-    """Fetch the names, types and descriptions for the columns in the specified table. Prints
-    information on screen.
-
+    """
+    Fetch the names, types and descriptions for the columns in the specified table. Prints
+    information on screen.

     Args:
-        dataset_id (str): Optional.
-            Dataset id available in basedosdados. It should always come with table_id.
-        table_id (str): Optional.
-            Table id available in basedosdados.dataset_id.
-            It should always come with dataset_id.
-        query_project_id (str): Optional.
-            Which project the table lives. You can change this you want to query different projects.
+        dataset_id (str): Required.
+            Dataset id available in list_datasets.
+        table_id (str): Required.
+            Table id available in list_dataset_tables.
         verbose (bool): Optional.
             If set to True, information is printed to the screen. If set to False,
             data is returned as a `list` of `dict`s.
-    Example:
-        get_table_columns(
-            dataset_id='br_ibge_censo2010',
-            table_id='pessoa_renda_setor_censitario'
-        )
+
+    Returns:
+        stdout | list
     """
-    client = bigquery.Client(
-        credentials=credentials(from_file=from_file), project=query_project_id
-    )
+    url = f"https://basedosdados.org/api/3/action/bd_bdm_table_show?dataset_id={dataset_id}&table_id={table_id}"

-    table_ref = client.get_table(f"{dataset_id}.{table_id}")
+    response = _safe_fetch(url)

-    columns = [
-        (field.name, field.field_type, field.description) for field in table_ref.schema
-    ]
+    json_response = response.json()

-    description = pd.DataFrame(columns, columns=["name", "field_type", "description"])
+    columns = json_response["result"]["columns"]

-    return _handle_output(verbose=verbose, output_type="records", df=description)
+    if verbose:
+        _print_output(pd.DataFrame(columns))
+    else:
+        return columns


 def get_table_size(
     dataset_id,
     table_id,
-    billing_project_id,
-    query_project_id="basedosdados",
-    from_file=False,
     verbose=True,
 ):
-    """Use a query to get the number of rows and size (in Mb) of a table query
-    from BigQuery. Prints information on screen in markdown friendly format.
+    """Gets the number of rows and size (in Mb) of a table, when available, from its metadata.

     WARNING: this query may cost a lot depending on the table.

@@ -328,41 +312,67 @@ def get_table_size(
         table_id (str): Optional.
             Table id available in basedosdados.dataset_id.
             It should always come with dataset_id.
-        query_project_id (str): Optional.
-            Which project the table lives. You can change this you want to query different projects.
-        billing_project_id (str): Optional.
-            Project that will be billed. Find your Project ID here https://console.cloud.google.com/projectselector2/home/dashboard
         verbose (bool): Optional.
             If set to True, information is printed to the screen. If set to False,
             data is returned as a `list` of `dict`s.
-        Example:
-            get_table_size(
-                dataset_id='br_ibge_censo2010',
-                table_id='pessoa_renda_setor_censitario',
-                billing_project_id='yourprojectid'
-            )
     """
-    billing_client = bigquery.Client(
-        credentials=credentials(from_file=from_file), project=billing_project_id
-    )
+    url = f"https://basedosdados.org/api/3/action/bd_bdm_table_show?dataset_id={dataset_id}&table_id={table_id}"

-    query = f"""SELECT COUNT(*) FROM {query_project_id}.{dataset_id}.{table_id}"""
+    response = _safe_fetch(url)

-    job = billing_client.query(query, location="US")
+    json_response = response.json()

-    num_rows = job.to_dataframe().loc[0, "f0_"]
+    size = json_response["result"]["size"]

-    size_mb = round(job.total_bytes_processed / 1024 / 1024, 2)
+    if size is None:
+        print("Size not available")
+    else:
+        if verbose:
+            _print_output(pd.DataFrame(size))
+        else:
+            return size


+def search(query, order_by):
+    """This function works as a wrapper to the `bd_dataset_search` website API
+    endpoint.

-    table_data = pd.DataFrame(
-        [
-            {
-                "project_id": query_project_id,
-                "dataset_id": dataset_id,
-                "table_id": table_id,
-                "num_rows": num_rows,
-                "size_mb": size_mb,
-            }
-        ]
-    )

     Args:
         query (str):
             String to search in datasets and tables' metadata.
         order_by (str): score|popular|recent
             Field by which the results will be ordered.

+    Returns:
+        pd.DataFrame:
+            Response from the API presented as a pandas DataFrame. Each row is
+            a table. Each column is a field identifying the table.
     """

+    # validate order_by input
+    if order_by not in ["score", "popular", "recent"]:
+        raise ValueError(
+            f'order_by must be score, popular or recent. Received "{order_by}"'
+        )
-    return _handle_output(verbose=verbose, output_type="records", df=table_data)
+    url = f"https://basedosdados.org/api/3/action/bd_dataset_search?q={query}&order_by={order_by}&resource_type=bdm_table"
+
+    response = _safe_fetch(url)
+
+    json_response = response.json()
+
+    dataset_dfs = []
+    # the first loop iterates over the datasets returned by the API
+    for dataset in json_response["result"]["datasets"]:
+        tables_dfs = []
+        # the second loop extracts the tables' information for each dataset
+        for table in dataset["resources"]:
+            data_table = pd.DataFrame(
+                {k: str(table[k]) for k in list(table.keys())}, index=[0]
+            )
+            tables_dfs.append(data_table)
+        # append the tables' dataframes for each dataset
+        data_ds = tables_dfs[0].append(tables_dfs[1:]).reset_index(drop=True)
+        dataset_dfs.append(data_ds)
+    # append the datasets' dataframes
+    df = dataset_dfs[0].append(dataset_dfs[1:]).reset_index(drop=True)
+
+    return df
\ No newline at end of file
diff --git a/python-package/tests/test_download/test_metadata.py b/python-package/tests/test_download/test_metadata.py
index 048fcbd08..d6788de83 100644
--- a/python-package/tests/test_download/test_metadata.py
+++ b/python-package/tests/test_download/test_metadata.py
@@ -1,9 +1,9 @@
-from os import read
 import pytest
 from pathlib import Path
 import pandas as pd
 from pandas_gbq.gbq import GenericGBQException
 import shutil
+import requests

 from basedosdados import (
     list_datasets,
@@ -12,110 +12,114 @@
     get_table_description,
     get_table_columns,
     get_table_size,
+    search,
 )
+from basedosdados.download.metadata import _safe_fetch

 TEST_PROJECT_ID = "basedosdados-dev"
 SAVEFILE = Path(__file__).parent / "tmp_bases" / "test.csv"
 SAVEPATH = Path(__file__).parent / "tmp_bases"
 shutil.rmtree(SAVEPATH, ignore_errors=True)

+def test_list_datasets_simple_verbose(capsys):

-def test_list_datasets(capsys):
-
-    list_datasets(from_file=True)
+    out = list_datasets(
+        query="trabalho", limit=10, with_description=False, verbose=True
+    )
     out, err = capsys.readouterr()  # Capture prints
     assert "dataset_id" in out
+    # check input error
+    with pytest.raises(ValueError):
+        search(query="trabalho", order_by="name")

+def test_list_datasets_simple_list():

-def test_list_datasets_complete(capsys):
+    out = list_datasets(query="", limit=12, with_description=False, verbose=False)
+    # check if function returns list
+    assert isinstance(out, list)
+    assert len(out) == 12

-    list_datasets(with_description=True, filter_by="ibge", from_file=True)
+
+def test_list_datasets_complete_list():
+
+    out = list_datasets(
+        query="trabalho", limit=12, with_description=True, verbose=False
+    )
+    # check if function returns list
+    assert isinstance(out, list)
+    assert "dataset_id" in out[0].keys()
+    assert "description" in out[0].keys()
+
+
+def test_list_datasets_complete_verbose(capsys):
+
+    list_datasets(query="trabalho", limit=10, with_description=True, verbose=True)
     out, err = capsys.readouterr()  # Capture prints
     assert "dataset_id" in out
     assert "description" in out


-def test_list_datasets_all_descriptions(capsys):
+def test_list_dataset_tables_simple_verbose(capsys):

-    list_datasets(with_description=True, from_file=True)
+    list_dataset_tables(dataset_id="br_me_caged", with_description=False, verbose=True)
     out, err = capsys.readouterr()  # Capture prints
-    assert len(out) > 0
+    assert "table_id" in out


-def test_list_datasets_verbose_false():
+def test_list_dataset_tables_simple_list():
+
+    out = list_dataset_tables(
+        dataset_id="br_me_caged", with_description=False, verbose=False
+    )

-    out = list_datasets(from_file=True, verbose=False)
     assert type(out) == list
     assert len(out) > 0


-def test_list_dataset_tables(capsys):
-
-    list_dataset_tables(dataset_id="br_ibge_censo_demografico", from_file=True)
-    out, err = capsys.readouterr()  # Capture prints
-    assert "table_id" in out
+def test_list_dataset_tables_complete_verbose(capsys):

+    list_dataset_tables(dataset_id="br_me_caged", with_description=True, verbose=True)

-def test_list_dataset_tables_complete(capsys):
-
-    list_dataset_tables(
-        dataset_id="br_ibge_censo_demografico",
-        filter_by="renda",
-        with_description=True,
-        from_file=True,
-    )
     out, err = capsys.readouterr()  # Capture prints
     assert "table_id" in out
     assert "description" in out
-    assert "renda" in out


-def test_list_dataset_tables_all_descriptions(capsys):
-    list_dataset_tables(
-        dataset_id="br_ibge_censo_demografico", with_description=True, from_file=True
-    )
-    out, err = capsys.readouterr()  # Capture prints
-    assert len(out) > 0
-
-
-def test_list_dataset_tables_verbose_false():
+def test_list_dataset_tables_complete_list():

     out = list_dataset_tables(
-        dataset_id="br_ibge_censo_demografico", from_file=True, verbose=False
+        dataset_id="br_me_caged", with_description=True, verbose=False
     )
+
     assert type(out) == list
-    assert len(out) > 0
+    assert type(out[0]) == dict


 def test_get_dataset_description(capsys):

-    get_dataset_description("br_ibge_censo_demografico", from_file=True)
+    get_dataset_description("br_me_caged", verbose=True)
     out, err = capsys.readouterr()  # Capture prints
     assert len(out) > 0


 def test_get_dataset_description_verbose_false():

-    out = get_dataset_description(
-        "br_ibge_censo_demografico", from_file=True, verbose=False
-    )
+    out = get_dataset_description("br_me_caged", verbose=False)
     assert type(out) == str
     assert len(out) > 0


 def test_get_table_description(capsys):

-    get_table_description(
-        "br_ibge_censo_demografico", "setor_censitario_basico_2010", from_file=True
-    )
+    get_table_description("br_me_caged", "microdados_antigos")
     out, err = capsys.readouterr()  # Capture prints
     assert len(out) > 0


 def test_get_table_description_verbose_false():

     out = get_table_description(
-        dataset_id="br_ibge_censo_demografico",
-        table_id="setor_censitario_basico_2010",
-        from_file=True,
+        dataset_id="br_me_caged",
+        table_id="microdados_antigos",
         verbose=False,
     )
     assert type(out) == str
@@ -124,46 +128,46 @@ def test_get_table_description_verbose_false():

 def test_get_table_columns(capsys):

     get_table_columns(
-        dataset_id="br_ibge_censo_demografico",
-        table_id="setor_censitario_basico_2010",
-        from_file=True,
+        dataset_id="br_me_caged",
+        table_id="microdados_antigos",
     )
     out, err = capsys.readouterr()  # Capture prints
     assert "name" in out
-    assert "field_type" in out
     assert "description" in out


 def test_get_table_columns_verbose_false():

     out = get_table_columns(
-        dataset_id="br_ibge_censo_demografico",
-        table_id="setor_censitario_basico_2010",
-        from_file=True,
+        dataset_id="br_me_caged",
+        table_id="microdados_antigos",
         verbose=False,
     )
     assert type(out) == list
     assert len(out) > 0

+def test_search():
+    out = search(query="agua", order_by="score")
+    # check if function returns pd.DataFrame
+    assert isinstance(out, pd.DataFrame)
+    # check that there are no duplicate tables in the result
+    assert out.id.nunique() == out.shape[0]
+    # check input error
+    with pytest.raises(ValueError):
+        search(query="agua", order_by="name")

 def test_get_table_size(capsys):
     get_table_size(
         dataset_id="br_ibge_censo_demografico",
         table_id="setor_censitario_basico_2010",
-        billing_project_id=TEST_PROJECT_ID,
-        from_file=True,
     )
     out, err = capsys.readouterr()
-    assert "num_rows" in out
-    assert "size_mb" in out
+    assert "not available" in out

+def test__safe_fetch(capsys):

-def test_get_table_size_verbose_false():
-    out = get_table_size(
-        dataset_id="br_ibge_censo_demografico",
-        table_id="setor_censitario_basico_2010",
-        billing_project_id=TEST_PROJECT_ID,
-        from_file=True,
-        verbose=False,
-    )
-    assert type(out) == list
-    assert len(out) > 0
+    _safe_fetch("https://www.lkajsdhgfal.com.br")
+    out, err = capsys.readouterr()  # Capture prints
+    assert "HTTPSConnection" in out
+
+    response = _safe_fetch(
+        "https://basedosdados.org/api/3/action/bd_dataset_search?q=agua&page_size=10&resource_type=bdm_table"
+    )
+    assert type(response.json()) == dict
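
Reviewer note: below is a minimal usage sketch of the API surface this patch introduces,
for trying the new CKAN-backed functions locally. The dataset and table ids
("br_me_caged", "microdados_antigos") and the query strings are illustrative values
taken from the tests above, and the snippet assumes the exports added to __init__.py
in this patch:

    import basedosdados as bd

    # keyword search over dataset metadata; verbose=False returns a plain list
    datasets = bd.list_datasets(query="caged", limit=5, verbose=False)

    # tables of one dataset, with short descriptions, as a list of dicts
    tables = bd.list_dataset_tables(
        dataset_id="br_me_caged", with_description=True, verbose=False
    )

    # full descriptions and column metadata come from the same bd_bdm_* endpoints
    description = bd.get_dataset_description("br_me_caged", verbose=False)
    columns = bd.get_table_columns(
        dataset_id="br_me_caged", table_id="microdados_antigos", verbose=False
    )

    # search() wraps bd_dataset_search and returns one DataFrame row per table
    df = bd.search(query="agua", order_by="score")
    print(df.head())

None of these calls require GCP credentials, which is the point of the change: listing
and describing datasets now goes through the website's CKAN API instead of BigQuery.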