From 378c5c5bb1fb4ea1acf644e8982b324bb1840fe0 Mon Sep 17 00:00:00 2001
From: lucascr91
Date: Sun, 12 Dec 2021 16:24:21 -0300
Subject: [PATCH 01/22] [infra] add function to wrap bd_dataset_search endpoint

---
 .../basedosdados/download/download.py | 60 ++++++++++++++++++-
 python-package/tests/test_download.py | 18 ++++++
 2 files changed, 77 insertions(+), 1 deletion(-)

diff --git a/python-package/basedosdados/download/download.py b/python-package/basedosdados/download/download.py
index c4dd89c25..309ddd034 100644
--- a/python-package/basedosdados/download/download.py
+++ b/python-package/basedosdados/download/download.py
@@ -18,7 +18,7 @@
     BaseDosDadosNoBillingProjectIDException,
 )
 from pandas_gbq.gbq import GenericGBQException
-
+import requests
 
 def credentials(from_file=False, reauth=False):
 
@@ -632,3 +632,61 @@ def get_table_size(
     )
 
     return _handle_output(verbose=verbose, output_type="records", df=table_data)
+
+def search(query, order_by):
+    """This function works as a wrapper to the `bd_dataset_search` website API
+    endpoint.
+
+    Args:
+        query (str):
+            String to search in datasets and tables' metadata.
+        order_by (str): score|popular|recent
+            Field by which the results will be ordered.
+
+    Returns:
+        pd.DataFrame:
+            Response from the API presented as a pandas DataFrame. Each row is
+            a table. Each column is a field identifying the table.
+    """
+
+    #validate base url
+    base_url = "https://basedosdados.org/api/3/action/bd_dataset_search"
+    response = requests.get(base_url)
+    status_code = response.status_code
+    if status_code!=200:
+        raise ValueError(f"The API endpoint doesn't seem to be available. Please check if any change has occurred at url {base_url}")
+
+    #validate order_by input
+    if order_by not in ['score', 'popular','recent']:
+        raise ValueError(f"order_by must be score, popular or recent. Received \"{order_by}\"")
+
+    url = f"https://basedosdados.org/api/3/action/bd_dataset_search?q={query}&order_by={order_by}&resource_type=bdm_table"
+    response = requests.get(url)
+    json_response = response.json()
+
+    dataset_dfs = []
+    n_datasets = len(json_response['result']['datasets'])
+    #first loop identify the number of the tables in each datasets
+    for i in range(n_datasets):
+        tables_dfs = []
+        n_tables = len(json_response['result']['datasets'][i]['resources'])
+        #second loop extracts tables' information for each dataset
+        for j in range(n_tables):
+            table_dict = json_response['result']['datasets'][i]['resources'][j]
+            data_table = pd.DataFrame({k:str(table_dict[k]) for k in list(table_dict.keys())}, index=[0])
+            tables_dfs.append(data_table)
+        #append tables' dataframes for each dataset
+        data_ds = tables_dfs[0].append(tables_dfs[1:]).reset_index(drop=True)
+        dataset_dfs.append(data_ds)
+    #append datasets' dataframes
+    df = dataset_dfs[0].append(dataset_dfs[1:]).reset_index(drop=True)
+
+    return df
+
+
+
+
+
+
+
+
diff --git a/python-package/tests/test_download.py b/python-package/tests/test_download.py
index 3fa9222d7..7a8e1bd3b 100644
--- a/python-package/tests/test_download.py
+++ b/python-package/tests/test_download.py
@@ -4,6 +4,7 @@
 import pandas as pd
 from pandas_gbq.gbq import GenericGBQException
 import shutil
+from basedosdados.download.download import search
 
 from basedosdados import (
     download,
@@ -350,3 +351,20 @@ def test_get_table_size_verbose_false():
     )
     assert type(out) == list
     assert len(out) > 0
+
+def test_search():
+    out = search(
+        query='agua',
+        order_by='score'
+    )
+    #check if function returns pd.DataFrame
+    assert isinstance(out, pd.DataFrame)
+    #check if there is duplicate tables in the result
+    assert out.id.nunique()==out.shape[0]
+    #check input error
+    with pytest.raises(ValueError):
+        search(
+            query='agua',
+            order_by='name'
+        )
+
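A minimal usage sketch of the wrapper introduced above (illustrative, not part of the
diff; it assumes the package is installed with this patch applied and that
basedosdados.org is reachable):

    from basedosdados.download.download import search

    # one row per table; columns are the metadata fields returned by the API
    results = search(query="agua", order_by="score")
    print(results.shape)
    print(results.columns)  # includes "id", which the new test relies on

As test_search above shows, each table's "id" field is expected to be unique, hence
the assertion out.id.nunique() == out.shape[0].
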
From 380ccbcee4b97fb0a61d771c32b6465f67867b70 Mon Sep 17 00:00:00 2001
From: Lucas Cavalcanti Rodrigues
Date: Tue, 14 Dec 2021 18:01:54 -0300
Subject: [PATCH 02/22] Update download.py

---
 .../basedosdados/download/download.py | 52 +++++++++----------
 1 file changed, 24 insertions(+), 28 deletions(-)

diff --git a/python-package/basedosdados/download/download.py b/python-package/basedosdados/download/download.py
index 309ddd034..abc57409c 100644
--- a/python-package/basedosdados/download/download.py
+++ b/python-package/basedosdados/download/download.py
@@ -649,44 +649,40 @@ def search(query, order_by):
         a table. Each column is a field identifying the table.
     """
 
-    #validate base url
+    # validate order_by input
+    if order_by not in ["score", "popular", "recent"]:
+        raise ValueError(
+            f'order_by must be score, popular or recent. Received "{order_by}"'
+        )
+
     base_url = "https://basedosdados.org/api/3/action/bd_dataset_search"
-    response = requests.get(base_url)
-    status_code = response.status_code
-    if status_code!=200:
-        raise ValueError(f"The API endpoint doesn't seem to be available. Please check if any change has occurred at url {base_url}")
-
-    #validate order_by input
-    if order_by not in ['score', 'popular','recent']:
-        raise ValueError(f"order_by must be score, popular or recent. Received \"{order_by}\"")
-
     url = f"https://basedosdados.org/api/3/action/bd_dataset_search?q={query}&order_by={order_by}&resource_type=bdm_table"
     response = requests.get(url)
+    status_code = response.status_code
+
+    # validate url
+    if status_code != 200:
+        raise ValueError(
+            f"The API endpoint doesn't seem to be available. Please check if any change has occurred at url {base_url}"
+        )
+
     json_response = response.json()
 
     dataset_dfs = []
-    n_datasets = len(json_response['result']['datasets'])
-    #first loop identify the number of the tables in each datasets
-    for i in range(n_datasets):
+    # first loop identifies the number of tables in each dataset
+    for dataset in json_response["result"]["datasets"]:
         tables_dfs = []
-        n_tables = len(json_response['result']['datasets'][i]['resources'])
-        #second loop extracts tables' information for each dataset
-        for j in range(n_tables):
-            table_dict = json_response['result']['datasets'][i]['resources'][j]
-            data_table = pd.DataFrame({k:str(table_dict[k]) for k in list(table_dict.keys())}, index=[0])
+        n_tables = len(dataset["resources"])
+        # second loop extracts tables' information for each dataset
+        for table in dataset["resources"]:
+            data_table = pd.DataFrame(
+                {k: str(table[k]) for k in list(table.keys())}, index=[0]
+            )
             tables_dfs.append(data_table)
-        #append tables' dataframes for each dataset
+        # append tables' dataframes for each dataset
         data_ds = tables_dfs[0].append(tables_dfs[1:]).reset_index(drop=True)
         dataset_dfs.append(data_ds)
-    #append datasets' dataframes
+    # append datasets' dataframes
     df = dataset_dfs[0].append(dataset_dfs[1:]).reset_index(drop=True)
 
     return df
-
-
-
-
-
-
-
-
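The refactor above replaces index arithmetic with direct iteration over the payload.
A self-contained sketch of the same flattening, run against a mocked
bd_dataset_search-style response (the field names here are illustrative). Note that
DataFrame.append, used throughout this series, was deprecated in pandas 1.4 and
removed in 2.0; pd.concat is the equivalent:

    import pandas as pd

    # mocked payload standing in for the live API response
    json_response = {
        "result": {
            "datasets": [
                {"resources": [{"id": "a", "name": "table_1"},
                               {"id": "b", "name": "table_2"}]},
                {"resources": [{"id": "c", "name": "table_3"}]},
            ]
        }
    }

    # same flattening as the loop above: one DataFrame row per resource
    rows = [
        pd.DataFrame({k: str(table[k]) for k in table.keys()}, index=[0])
        for dataset in json_response["result"]["datasets"]
        for table in dataset["resources"]
    ]
    df = pd.concat(rows).reset_index(drop=True)
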
From de373ef3ea67d2faf8f348dbd87eb893cd97440e Mon Sep 17 00:00:00 2001
From: lucascr91
Date: Fri, 14 Jan 2022 21:32:04 -0300
Subject: [PATCH 03/22] [infra] modify list_datasets function to consume CKAN
 endpoint

---
 .../basedosdados/download/download.py | 103 +++++++++---------
 python-package/tests/test_download.py |  40 +++++--
 2 files changed, 85 insertions(+), 58 deletions(-)

diff --git a/python-package/basedosdados/download/download.py b/python-package/basedosdados/download/download.py
index abc57409c..c22109bfa 100644
--- a/python-package/basedosdados/download/download.py
+++ b/python-package/basedosdados/download/download.py
@@ -358,63 +358,70 @@ def _handle_output(verbose, output_type, df, col_name=None):
     return None
 
 
-def list_datasets(
-    query_project_id="basedosdados",
-    filter_by=None,
-    with_description=False,
-    from_file=False,
-    verbose=True,
-):
-    """Fetch the dataset_id of datasets available at query_project_id. Prints information on
-    screen or returns it as a list.
+def list_datasets(query, order_by, with_description=False, verbose=True):
+    """This function uses `bd_dataset_search` website API
+    enpoint to retrieve a list of available datasets.
 
     Args:
-        query_project_id (str): Optional.
-            Which project the table lives. You can change this you want to query different projects.
-        filter_by (str): Optional
-            String to be matched in dataset_id.
+        query (str):
+            String to search in datasets and tables' metadata.
+        order_by (str): score|popular|recent
+            Field by which the results will be ordered.
         with_description (bool): Optional
            If True, fetch short dataset description for each dataset.
         verbose (bool): Optional.
            If set to True, information is printed to the screen. If set to False, a list object is returned.
-
-    Example:
-        list_datasets(
-        filter_by='sp',
-        with_description=True,
-        )
+
+    Returns:
+        list | string of datasets
     """
-    client = bigquery.Client(
-        credentials=credentials(from_file=from_file), project=query_project_id
-    )
-
-    datasets_list = list(client.list_datasets())
-
-    datasets = pd.DataFrame(
-        [dataset.dataset_id for dataset in datasets_list], columns=["dataset_id"]
-    )
+    # validate order_by input
+    if order_by not in ["score", "popular", "recent"]:
+        raise ValueError(
+            f'order_by must be score, popular or recent. Received "{order_by}"'
+        )
 
-    if filter_by:
+    url = f"https://basedosdados.org/api/3/action/bd_dataset_search?q={query}&order_by={order_by}&resource_type=bdm_table"
 
-        datasets = datasets.loc[datasets["dataset_id"].str.contains(filter_by)]
+    # validate url
+    try:
+        response = requests.get(url)
+        response.raise_for_status()
+    except requests.exceptions.HTTPError as err:
+        print(err)
 
-    if with_description:
+    json_response = response.json()
 
-        datasets["description"] = [
-            _get_header(client.get_dataset(dataset).description)
-            for dataset in datasets["dataset_id"]
+    # this dict has all information we need to output the function
+    dataset_dict = {
+        "dataset_id": [
+            dataset["name"] for dataset in json_response["result"]["datasets"]
+        ],
+        "description": [
+            dataset["notes"] if "notes" in dataset.keys() else None
+            for dataset in json_response["result"]["datasets"]
+        ],
+    }
+
+    # select desired output using dataset_id info. Note that the output is either a standardized string or a list
+    if verbose & (with_description == False):
+        return _print_output(pd.DataFrame.from_dict(dataset_dict)[["dataset_id"]])
+    elif verbose & with_description:
+        return _print_output(
+            pd.DataFrame.from_dict(dataset_dict)[["dataset_id", "description"]]
+        )
+    elif (verbose == False) & (with_description == False):
+        return dataset_dict["dataset_id"]
+    elif (verbose == False) & with_description:
+        return [
+            {
+                "dataset_id": dataset_dict["dataset_id"][k],
+                "description": dataset_dict["description"][k],
+            }
+            for k in range(len(dataset_dict.keys()))
         ]
 
-    return _handle_output(
-        verbose=verbose,
-        output_type="list",
-        df=datasets,
-        col_name="dataset_id",
-    )
-
-
 def list_dataset_tables(
     dataset_id,
     query_project_id="basedosdados",
@@ -655,16 +662,14 @@ def search(query, order_by):
             f'order_by must be score, popular or recent. Received "{order_by}"'
         )
 
-    base_url = "https://basedosdados.org/api/3/action/bd_dataset_search"
     url = f"https://basedosdados.org/api/3/action/bd_dataset_search?q={query}&order_by={order_by}&resource_type=bdm_table"
-    response = requests.get(url)
-    status_code = response.status_code
 
     # validate url
-    if status_code != 200:
-        raise ValueError(
-            f"The API endpoint doesn't seem to be available. Please check if any change has occurred at url {base_url}"
-        )
+    try:
+        response = requests.get(url)
+        response.raise_for_status()
+    except requests.exceptions.HTTPError as err:
+        print(err)
 
     json_response = response.json()
 
diff --git a/python-package/tests/test_download.py b/python-package/tests/test_download.py
index 7a8e1bd3b..d73ec16df 100644
--- a/python-package/tests/test_download.py
+++ b/python-package/tests/test_download.py
@@ -205,26 +205,48 @@ def test_read_table():
     )
 
 
-def test_list_datasets(capsys):
+def test_list_datasets_default(capsys):
 
-    list_datasets(from_file=True)
+    out = list_datasets(
+        query="trabalho", order_by="score", with_description=False, verbose=True
+    )
     out, err = capsys.readouterr()  # Capture prints
     assert "dataset_id" in out
+    # check input error
+    with pytest.raises(ValueError):
+        search(query="trabalho", order_by="name")
+
+
+def test_list_datasets_noverbose():
+
+    out = list_datasets(
+        query="trabalho", order_by="score", with_description=False, verbose=False
+    )
+    # check if function returns list
+    assert isinstance(out, list)
+
+def test_list_datasets_complete_list():
+
+    out = list_datasets(
+        query="trabalho", order_by="score", with_description=True, verbose=False
+    )
+    # check if function returns list
+    assert isinstance(out, list)
+    assert "dataset_id" in out[0].keys()
+    assert "description" in out[0].keys()
 
-def test_list_datasets_complete(capsys):
 
-    list_datasets(with_description=True, filter_by="ibge", from_file=True)
+def test_list_datasets_complete_verbose(capsys):
+
+    list_datasets(
+        query="trabalho", order_by="score", with_description=True, verbose=True
+    )
     out, err = capsys.readouterr()  # Capture prints
     assert "dataset_id" in out
     assert "description" in out
 
-def test_list_datasets_all_descriptions(capsys):
-
-    list_datasets(with_description=True, from_file=True)
-    out, err = capsys.readouterr()  # Capture prints
-    assert len(out) > 0
 
 def test_list_datasets_verbose_false():

From 23e2c6a86032d32b2221b24b53b9df0c383b42ec Mon Sep 17 00:00:00 2001
From: lucascr91
Date: Sat, 15 Jan 2022 14:30:08 -0300
Subject: [PATCH 04/22] [infra] fix list_dataset function to include limit and
 remove order_by

---
 .../basedosdados/download/download.py | 16 ++++------------
 python-package/tests/test_download.py | 18 +++++-------------
 2 files changed, 9 insertions(+), 25 deletions(-)

diff --git a/python-package/basedosdados/download/download.py b/python-package/basedosdados/download/download.py
index c22109bfa..832e71d8c 100644
--- a/python-package/basedosdados/download/download.py
+++ b/python-package/basedosdados/download/download.py
@@ -358,15 +358,15 @@ def _handle_output(verbose, output_type, df, col_name=None):
     return None
 
 
-def list_datasets(query, order_by, with_description=False, verbose=True):
+def list_datasets(query, limit=10, with_description=False, verbose=True):
     """This function uses `bd_dataset_search` website API
     enpoint to retrieve a list of available datasets.
 
     Args:
         query (str):
             String to search in datasets and tables' metadata.
-        order_by (str): score|popular|recent
-            Field by which the results will be ordered.
+        limit (int):
+            Field to limit the number of results
         with_description (bool): Optional
             If True, fetch short dataset description for each dataset.
         verbose (bool): Optional.
@@ -376,13 +376,7 @@ def list_datasets(query, order_by, with_description=False, verbose=True): list | string of datasets """ - # validate order_by input - if order_by not in ["score", "popular", "recent"]: - raise ValueError( - f'order_by must be score, popular or recent. Received "{order_by}"' - ) - - url = f"https://basedosdados.org/api/3/action/bd_dataset_search?q={query}&order_by={order_by}&resource_type=bdm_table" + url = f"https://basedosdados.org/api/3/action/bd_dataset_search?q={query}&page_size={limit}&resource_type=bdm_table" # validate url try: @@ -432,7 +426,6 @@ def list_dataset_tables( ): """Fetch table_id for tables available at the specified dataset_id. Prints the information on screen or returns it as a list. - Args: dataset_id (str): Optional. Dataset id available in basedosdados. @@ -444,7 +437,6 @@ def list_dataset_tables( If True, fetch short table descriptions for each table that match the search criteria. verbose (bool): Optional. If set to True, information is printed to the screen. If set to False, a list object is returned. - Example: list_dataset_tables( dataset_id='br_ibge_censo2010' diff --git a/python-package/tests/test_download.py b/python-package/tests/test_download.py index d73ec16df..76477b886 100644 --- a/python-package/tests/test_download.py +++ b/python-package/tests/test_download.py @@ -208,7 +208,7 @@ def test_read_table(): def test_list_datasets_default(capsys): out = list_datasets( - query="trabalho", order_by="score", with_description=False, verbose=True + query="trabalho", limit=10, with_description=False, verbose=True ) out, err = capsys.readouterr() # Capture prints assert "dataset_id" in out @@ -220,16 +220,17 @@ def test_list_datasets_default(capsys): def test_list_datasets_noverbose(): out = list_datasets( - query="trabalho", order_by="score", with_description=False, verbose=False + query="", limit=12, with_description=False, verbose=False ) # check if function returns list assert isinstance(out, list) + assert len(out) == 12 def test_list_datasets_complete_list(): out = list_datasets( - query="trabalho", order_by="score", with_description=True, verbose=False + query="trabalho", limit=12, with_description=True, verbose=False ) # check if function returns list assert isinstance(out, list) @@ -240,22 +241,13 @@ def test_list_datasets_complete_list(): def test_list_datasets_complete_verbose(capsys): list_datasets( - query="trabalho", order_by="score", with_description=True, verbose=True + query="trabalho", limit=10, with_description=True, verbose=True ) out, err = capsys.readouterr() # Capture prints assert "dataset_id" in out assert "description" in out - - -def test_list_datasets_verbose_false(): - - out = list_datasets(from_file=True, verbose=False) - assert type(out) == list - assert len(out) > 0 - - def test_list_dataset_tables(capsys): list_dataset_tables(dataset_id="br_ibge_censo_demografico", from_file=True) From e22531e743b94bf76165fc11973d787e2719a8a9 Mon Sep 17 00:00:00 2001 From: lucascr91 Date: Sat, 15 Jan 2022 17:57:20 -0300 Subject: [PATCH 05/22] [infra] change function list_dataset_tables to use CKAN endpoint --- .../basedosdados/download/download.py | 76 ++++++++++--------- python-package/tests/test_download.py | 44 +++++------ 2 files changed, 61 insertions(+), 59 deletions(-) diff --git a/python-package/basedosdados/download/download.py b/python-package/basedosdados/download/download.py index 832e71d8c..cb3da7105 100644 --- a/python-package/basedosdados/download/download.py +++ 
b/python-package/basedosdados/download/download.py @@ -364,7 +364,7 @@ def list_datasets(query, limit=10, with_description=False, verbose=True): Args: query (str): - String to search in datasets and tables' metadata. + String to search in datasets' metadata. limit (int): Field to limit the number of results with_description (bool): Optional @@ -383,7 +383,7 @@ def list_datasets(query, limit=10, with_description=False, verbose=True): response = requests.get(url) response.raise_for_status() except requests.exceptions.HTTPError as err: - print(err) + return err json_response = response.json() @@ -413,30 +413,28 @@ def list_datasets(query, limit=10, with_description=False, verbose=True): "dataset_id": dataset_dict["dataset_id"][k], "description": dataset_dict["description"][k], } - for k in range(len(dataset_dict.keys())) + for k in range(len(dataset_dict["dataset_id"])) ] + def list_dataset_tables( dataset_id, - query_project_id="basedosdados", - from_file=False, - filter_by=None, with_description=False, verbose=True, ): """Fetch table_id for tables available at the specified dataset_id. Prints the information on screen or returns it as a list. + Args: dataset_id (str): Optional. - Dataset id available in basedosdados. - query_project_id (str): Optional. - Which project the table lives. You can change this you want to query different projects. - filter_by (str): Optional - String to be matched in the table_id. + Dataset id returned by list_datasets function + limit (int): + Field to limit the number of results with_description (bool): Optional If True, fetch short table descriptions for each table that match the search criteria. verbose (bool): Optional. If set to True, information is printed to the screen. If set to False, a list object is returned. + Example: list_dataset_tables( dataset_id='br_ibge_censo2010' @@ -444,36 +442,46 @@ def list_dataset_tables( with_description=True, ) """ - client = bigquery.Client( - credentials=credentials(from_file=from_file), project=query_project_id - ) - dataset = client.get_dataset(dataset_id) - - tables_list = list(client.list_tables(dataset)) + url = f"https://basedosdados.org/api/3/action/bd_dataset_search?&resource_type=bdm_table" - tables = pd.DataFrame( - [table.table_id for table in tables_list], columns=["table_id"] - ) - - if filter_by: + # validate url + try: + response = requests.get(url) + response.raise_for_status() + except requests.exceptions.HTTPError as err: + return err - tables = tables.loc[tables["table_id"].str.contains(filter_by)] + json_response = response.json() - if with_description: + #this dict has all information need to output the function + table_dict = { + "table_id": [ + dataset['resources'][k]["name"] for dataset in json_response["result"]["datasets"] for k in range(len(dataset['resources'])) if dataset['name']==dataset_id + ], + "description": [ + dataset['resources'][k]["description"] for dataset in json_response["result"]["datasets"] for k in range(len(dataset['resources'])) if dataset['name']==dataset_id + ], + } - tables["description"] = [ - _get_header(client.get_table(f"{dataset_id}.{table}").description) - for table in tables["table_id"] + # #select desired output using table_id info. 
Note that the output is either a standardized string or a list + if verbose & (with_description == False): + return _print_output(pd.DataFrame.from_dict(table_dict)[["table_id"]]) + elif verbose & with_description: + return _print_output( + pd.DataFrame.from_dict(table_dict)[["table_id", "description"]] + ) + elif (verbose == False) & (with_description == False): + return table_dict["table_id"] + elif (verbose == False) & with_description: + return [ + { + "table_id": table_dict["table_id"][k], + "description": table_dict["description"][k], + } + for k in range(len(table_dict["table_id"])) ] - return _handle_output( - verbose=verbose, - output_type="list", - df=tables, - col_name="table_id", - ) - def get_dataset_description( dataset_id=None, diff --git a/python-package/tests/test_download.py b/python-package/tests/test_download.py index 76477b886..7c878e051 100644 --- a/python-package/tests/test_download.py +++ b/python-package/tests/test_download.py @@ -205,7 +205,7 @@ def test_read_table(): ) -def test_list_datasets_default(capsys): +def test_list_datasets_simple_verbose(capsys): out = list_datasets( query="trabalho", limit=10, with_description=False, verbose=True @@ -217,7 +217,7 @@ def test_list_datasets_default(capsys): search(query="trabalho", order_by="name") -def test_list_datasets_noverbose(): +def test_list_datasets_simple_list(): out = list_datasets( query="", limit=12, with_description=False, verbose=False @@ -248,42 +248,36 @@ def test_list_datasets_complete_verbose(capsys): assert "description" in out -def test_list_dataset_tables(capsys): +def test_list_dataset_tables_simple_verbose(capsys): - list_dataset_tables(dataset_id="br_ibge_censo_demografico", from_file=True) + list_dataset_tables(dataset_id="br-sp-alesp", with_description=False, verbose=True) out, err = capsys.readouterr() # Capture prints assert "table_id" in out -def test_list_dataset_tables_complete(capsys): +def test_list_dataset_tables_simple_list(): + + out = list_dataset_tables(dataset_id="br-sp-alesp", with_description=False, verbose=False) + + assert type(out) == list + assert len(out) > 0 + + +def test_list_dataset_tables_complete_verbose(capsys): + + list_dataset_tables(dataset_id="br-sp-alesp", with_description=True, verbose=True) - list_dataset_tables( - dataset_id="br_ibge_censo_demografico", - filter_by="renda", - with_description=True, - from_file=True, - ) out, err = capsys.readouterr() # Capture prints assert "table_id" in out assert "description" in out - assert "renda" in out - -def test_list_dataset_tables_all_descriptions(capsys): - list_dataset_tables( - dataset_id="br_ibge_censo_demografico", with_description=True, from_file=True - ) - out, err = capsys.readouterr() # Capture prints - assert len(out) > 0 +def test_list_dataset_tables_complete_list(): -def test_list_dataset_tables_verbose_false(): + out = list_dataset_tables(dataset_id="br-sp-alesp", with_description=True, verbose=False) - out = list_dataset_tables( - dataset_id="br_ibge_censo_demografico", from_file=True, verbose=False - ) - assert type(out) == list - assert len(out) > 0 + assert type(out)==list + assert type(out[0])==dict def test_get_dataset_description(capsys): From f0789deeb3ca32549dce7cf2fa3e51312dade043 Mon Sep 17 00:00:00 2001 From: lucascr91 Date: Sat, 15 Jan 2022 19:35:58 -0300 Subject: [PATCH 06/22] [infra] apply PEP8 to list_dataset_tables and respective tests --- python-package/basedosdados/download/download.py | 12 +++++++++--- python-package/tests/test_download.py | 12 ++++++++---- 2 files changed, 17 
insertions(+), 7 deletions(-) diff --git a/python-package/basedosdados/download/download.py b/python-package/basedosdados/download/download.py index cb3da7105..2fe822c02 100644 --- a/python-package/basedosdados/download/download.py +++ b/python-package/basedosdados/download/download.py @@ -454,13 +454,19 @@ def list_dataset_tables( json_response = response.json() - #this dict has all information need to output the function + # this dict has all information need to output the function table_dict = { "table_id": [ - dataset['resources'][k]["name"] for dataset in json_response["result"]["datasets"] for k in range(len(dataset['resources'])) if dataset['name']==dataset_id + dataset["resources"][k]["name"] + for dataset in json_response["result"]["datasets"] + for k in range(len(dataset["resources"])) + if dataset["name"] == dataset_id ], "description": [ - dataset['resources'][k]["description"] for dataset in json_response["result"]["datasets"] for k in range(len(dataset['resources'])) if dataset['name']==dataset_id + dataset["resources"][k]["description"] + for dataset in json_response["result"]["datasets"] + for k in range(len(dataset["resources"])) + if dataset["name"] == dataset_id ], } diff --git a/python-package/tests/test_download.py b/python-package/tests/test_download.py index 7c878e051..02604a350 100644 --- a/python-package/tests/test_download.py +++ b/python-package/tests/test_download.py @@ -257,7 +257,9 @@ def test_list_dataset_tables_simple_verbose(capsys): def test_list_dataset_tables_simple_list(): - out = list_dataset_tables(dataset_id="br-sp-alesp", with_description=False, verbose=False) + out = list_dataset_tables( + dataset_id="br-sp-alesp", with_description=False, verbose=False + ) assert type(out) == list assert len(out) > 0 @@ -274,10 +276,12 @@ def test_list_dataset_tables_complete_verbose(capsys): def test_list_dataset_tables_complete_list(): - out = list_dataset_tables(dataset_id="br-sp-alesp", with_description=True, verbose=False) + out = list_dataset_tables( + dataset_id="br-sp-alesp", with_description=True, verbose=False + ) - assert type(out)==list - assert type(out[0])==dict + assert type(out) == list + assert type(out[0]) == dict def test_get_dataset_description(capsys): From b534862f0e526864e9841a26ddaab414a62758a0 Mon Sep 17 00:00:00 2001 From: lucascr91 Date: Sat, 22 Jan 2022 20:39:21 -0300 Subject: [PATCH 07/22] add get_dataset_description, get_table_description, get_table_columns --- .../basedosdados/download/download.py | 165 +++++++++--------- python-package/tests/test_download.py | 62 +++---- 2 files changed, 109 insertions(+), 118 deletions(-) diff --git a/python-package/basedosdados/download/download.py b/python-package/basedosdados/download/download.py index 2fe822c02..adf532000 100644 --- a/python-package/basedosdados/download/download.py +++ b/python-package/basedosdados/download/download.py @@ -20,6 +20,7 @@ from pandas_gbq.gbq import GenericGBQException import requests + def credentials(from_file=False, reauth=False): if from_file: @@ -359,7 +360,8 @@ def _handle_output(verbose, output_type, df, col_name=None): def list_datasets(query, limit=10, with_description=False, verbose=True): - """This function uses `bd_dataset_search` website API + """ + This function uses `bd_dataset_search` website API enpoint to retrieve a list of available datasets. Args: @@ -373,7 +375,7 @@ def list_datasets(query, limit=10, with_description=False, verbose=True): If set to True, information is printed to the screen. If set to False, a list object is returned. 
Returns: - list | string of datasets + list | stdout """ url = f"https://basedosdados.org/api/3/action/bd_dataset_search?q={query}&page_size={limit}&resource_type=bdm_table" @@ -422,8 +424,8 @@ def list_dataset_tables( with_description=False, verbose=True, ): - """Fetch table_id for tables available at the specified dataset_id. Prints the information - on screen or returns it as a list. + """ + Fetch table_id for tables available at the specified dataset_id. Prints the information on screen or returns it as a list. Args: dataset_id (str): Optional. @@ -435,15 +437,11 @@ def list_dataset_tables( verbose (bool): Optional. If set to True, information is printed to the screen. If set to False, a list object is returned. - Example: - list_dataset_tables( - dataset_id='br_ibge_censo2010' - filter_by='renda', - with_description=True, - ) + Returns: + stdout | list """ - url = f"https://basedosdados.org/api/3/action/bd_dataset_search?&resource_type=bdm_table" + url = f"https://basedosdados.org/api/3/action/bd_bdm_dataset_show?dataset_id={dataset_id}" # validate url try: @@ -454,23 +452,18 @@ def list_dataset_tables( json_response = response.json() + dataset = json_response["result"] # this dict has all information need to output the function table_dict = { "table_id": [ - dataset["resources"][k]["name"] - for dataset in json_response["result"]["datasets"] - for k in range(len(dataset["resources"])) - if dataset["name"] == dataset_id + dataset["resources"][k]["name"] for k in range(len(dataset["resources"])) ], "description": [ dataset["resources"][k]["description"] - for dataset in json_response["result"]["datasets"] for k in range(len(dataset["resources"])) - if dataset["name"] == dataset_id ], } - - # #select desired output using table_id info. Note that the output is either a standardized string or a list + # select desired output using table_id info. Note that the output is either a standardized string or a list if verbose & (with_description == False): return _print_output(pd.DataFrame.from_dict(table_dict)[["table_id"]]) elif verbose & with_description: @@ -490,102 +483,117 @@ def list_dataset_tables( def get_dataset_description( - dataset_id=None, - query_project_id="basedosdados", - from_file=False, + dataset_id, verbose=True, ): - """Prints the full dataset description. + """ + Prints the full dataset description. Args: - dataset_id (str): Optional. - Dataset id available in basedosdados. - query_project_id (str): Optional. - Which project the table lives. You can change this you want to query different projects. + dataset_id (str): Required. + Dataset id available in list_datasets. verbose (bool): Optional. If set to True, information is printed to the screen. If set to False, data is returned as a `str`. 
+ + Returns: + stdout | str """ + url = f"https://basedosdados.org/api/3/action/bd_bdm_dataset_show?dataset_id={dataset_id}" - client = bigquery.Client( - credentials=credentials(from_file=from_file), project=query_project_id - ) + # validate url + try: + response = requests.get(url) + response.raise_for_status() + except requests.exceptions.HTTPError as err: + return err - dataset = client.get_dataset(dataset_id) + json_response = response.json() - return _handle_output(verbose=verbose, output_type="str", df=dataset) + description = json_response["result"]["notes"] + + if verbose: + print(description) + else: + return description def get_table_description( - dataset_id=None, - table_id=None, - query_project_id="basedosdados", - from_file=False, + dataset_id, + table_id, verbose=True, ): - """Prints the full table description. + """ + Prints the full table description. Args: - dataset_id (str): Optional. - Dataset id available in basedosdados. It should always come with table_id. - table_id (str): Optional. - Table id available in basedosdados.dataset_id. - It should always come with dataset_id. - query_project_id (str): Optional. - Which project the table lives. You can change this you want to query different projects. + dataset_id (str): Required. + Dataset id available in list_datasets. + table_id (str): Required. + Table id available in list_dataset_tables verbose (bool): Optional. If set to True, information is printed to the screen. If set to False, data is returned as a `str`. + + Returns: + stdout | str """ - client = bigquery.Client( - credentials=credentials(from_file=from_file), project=query_project_id - ) + url = f"https://basedosdados.org/api/3/action/bd_bdm_table_show?dataset_id={dataset_id}&table_id={table_id}" + + # validate url + try: + response = requests.get(url) + response.raise_for_status() + except requests.exceptions.HTTPError as err: + return err + + json_response = response.json() - table = client.get_table(f"{dataset_id}.{table_id}") + description = json_response["result"]["description"] - return _handle_output(verbose=verbose, output_type="str", df=table) + if verbose: + print(description) + else: + return description def get_table_columns( - dataset_id=None, - table_id=None, - query_project_id="basedosdados", - from_file=False, + dataset_id, + table_id, verbose=True, ): - """Fetch the names, types and descriptions for the columns in the specified table. Prints - information on screen. - + """ + Fetch the names, types and descriptions for the columns in the specified table. Prints + information on screen. Args: - dataset_id (str): Optional. - Dataset id available in basedosdados. It should always come with table_id. - table_id (str): Optional. - Table id available in basedosdados.dataset_id. - It should always come with dataset_id. - query_project_id (str): Optional. - Which project the table lives. You can change this you want to query different projects. + dataset_id (str): Required. + Dataset id available in list_datasets. + table_id (str): Required. + Table id available in list_dataset_tables verbose (bool): Optional. If set to True, information is printed to the screen. If set to False, data is returned as a `list` of `dict`s. 
-    Example:
-        get_table_columns(
-            dataset_id='br_ibge_censo2010',
-            table_id='pessoa_renda_setor_censitario'
-        )
+
+    Returns:
+        stdout | list
     """
+    url = f"https://basedosdados.org/api/3/action/bd_bdm_table_show?dataset_id={dataset_id}&table_id={table_id}"
 
-    client = bigquery.Client(
-        credentials=credentials(from_file=from_file), project=query_project_id
-    )
+    # validate url
+    try:
+        response = requests.get(url)
+        response.raise_for_status()
+    except requests.exceptions.HTTPError as err:
+        return err
 
-    table_ref = client.get_table(f"{dataset_id}.{table_id}")
+    json_response = response.json()
 
-    columns = [
-        (field.name, field.field_type, field.description) for field in table_ref.schema
-    ]
+    columns = json_response["result"]["columns"]
 
-    description = pd.DataFrame(columns, columns=["name", "field_type", "description"])
-
-    return _handle_output(verbose=verbose, output_type="records", df=description)
+    if verbose:
+        _print_output(pd.DataFrame(columns))
+    else:
+        return columns
 
 
 def get_table_size(
@@ -646,6 +654,7 @@ def get_table_size(
 
     return _handle_output(verbose=verbose, output_type="records", df=table_data)
 
+
 def search(query, order_by):
     """This function works as a wrapper to the `bd_dataset_search` website API
     endpoint.
@@ -659,7 +668,7 @@ def search(query, order_by):
     Returns:
         pd.DataFrame:
             Response from the API presented as a pandas DataFrame. Each row is
-            a table. Each column is a field identifying the table.
+            a table. Each column is a field identifying the table.
 
diff --git a/python-package/tests/test_download.py b/python-package/tests/test_download.py
index 02604a350..4cd852843 100644
--- a/python-package/tests/test_download.py
+++ b/python-package/tests/test_download.py
@@ -219,9 +219,7 @@ def test_list_datasets_simple_verbose(capsys):
 
 def test_list_datasets_simple_list():
 
-    out = list_datasets(
-        query="", limit=12, with_description=False, verbose=False
-    )
+    out = list_datasets(query="", limit=12, with_description=False, verbose=False)
     # check if function returns list
     assert isinstance(out, list)
     assert len(out) == 12
@@ -240,9 +238,7 @@ def test_list_datasets_complete_list():
 
 def test_list_datasets_complete_verbose(capsys):
 
-    list_datasets(
-        query="trabalho", limit=10, with_description=True, verbose=True
-    )
+    list_datasets(query="trabalho", limit=10, with_description=True, verbose=True)
     out, err = capsys.readouterr()  # Capture prints
     assert "dataset_id" in out
     assert "description" in out
 
 def test_list_dataset_tables_simple_verbose(capsys):
 
-    list_dataset_tables(dataset_id="br-sp-alesp", with_description=False, verbose=True)
+    list_dataset_tables(dataset_id="br_me_caged", with_description=False, verbose=True)
 
     out, err = capsys.readouterr()  # Capture prints
     assert "table_id" in out
 
 def test_list_dataset_tables_simple_list():
 
     out = list_dataset_tables(
-        dataset_id="br-sp-alesp", with_description=False, verbose=False
+        dataset_id="br_me_caged", with_description=False, verbose=False
     )
 
     assert type(out) == list
     assert len(out) > 0
 
 def test_list_dataset_tables_complete_verbose(capsys):
 
-    list_dataset_tables(dataset_id="br-sp-alesp", with_description=True, verbose=True)
+    list_dataset_tables(dataset_id="br_me_caged", with_description=True, verbose=True)
 
     out, err = capsys.readouterr()  # Capture prints
     assert "table_id" in out
     assert "description" in out
 
 def test_list_dataset_tables_complete_list():
 
     out = list_dataset_tables(
-        dataset_id="br-sp-alesp", with_description=True, verbose=False
+        dataset_id="br_me_caged", with_description=True, verbose=False
     )
 
     assert type(out) == list
     assert type(out[0]) == dict
 
 def test_get_dataset_description(capsys):
 
-    get_dataset_description("br_ibge_censo_demografico", from_file=True)
+    get_dataset_description("br_me_caged", verbose=True)
     out, err = capsys.readouterr()  # Capture prints
     assert len(out) > 0
 
 def test_get_dataset_description_verbose_false():
 
-    out = get_dataset_description(
-        "br_ibge_censo_demografico", from_file=True, verbose=False
-    )
+    out = get_dataset_description("br_me_caged", verbose=False)
     assert type(out) == str
     assert len(out) > 0
 
 def test_get_table_description(capsys):
 
-    get_table_description(
-        "br_ibge_censo_demografico", "setor_censitario_basico_2010", from_file=True
-    )
+    get_table_description("br_me_caged", "microdados_antigos")
     out, err = capsys.readouterr()  # Capture prints
     assert len(out) > 0
 
 def test_get_table_description_verbose_false():
 
     out = get_table_description(
-        dataset_id="br_ibge_censo_demografico",
-        table_id="setor_censitario_basico_2010",
-        from_file=True,
+        dataset_id="br_me_caged",
+        table_id="microdados_antigos",
         verbose=False,
     )
     assert type(out) == str
     assert len(out) > 0
 
 def test_get_table_columns(capsys):
 
     get_table_columns(
-        dataset_id="br_ibge_censo_demografico",
-        table_id="setor_censitario_basico_2010",
-        from_file=True,
+        dataset_id="br_me_caged",
+        table_id="microdados_antigos",
     )
     out, err = capsys.readouterr()  # Capture prints
     assert "name" in out
-    assert "field_type" in out
     assert "description" in out
 
 def test_get_table_columns_verbose_false():
 
     out = get_table_columns(
-        dataset_id="br_ibge_censo_demografico",
-        table_id="setor_censitario_basico_2010",
-        from_file=True,
+        dataset_id="br_me_caged",
+        table_id="microdados_antigos",
         verbose=False,
     )
     assert type(out) == list
     assert len(out) > 0
 
+
 def test_search():
-    out = search(
-        query='agua',
-        order_by='score'
-    )
-    #check if function returns pd.DataFrame
+    out = search(query="agua", order_by="score")
+    # check if the function returns a pd.DataFrame
     assert isinstance(out, pd.DataFrame)
-    #check if there is duplicate tables in the result
+    # check that there are no duplicate tables in the result
     assert out.id.nunique() == out.shape[0]
-    #check input error
+    # check input error
     with pytest.raises(ValueError):
-        search(
-            query='agua',
-            order_by='name'
-        )
-
+        search(query="agua", order_by="name")
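The three metadata helpers added above (get_dataset_description, get_table_description,
get_table_columns) repeat one request pattern: build the action URL, GET it, call
raise_for_status, then unwrap ["result"]. A sketch of that pattern factored into a
helper; the name _ckan_action and its signature are hypothetical, not part of the patch:

    import requests

    def _ckan_action(action, **params):
        url = f"https://basedosdados.org/api/3/action/{action}"
        try:
            response = requests.get(url, params=params)
            response.raise_for_status()
        except requests.exceptions.HTTPError as err:
            # mirrors the patch: the error object is returned, not raised
            return err
        return response.json()["result"]

    # e.g.: _ckan_action("bd_bdm_table_show",
    #                    dataset_id="br_me_caged",
    #                    table_id="microdados_antigos")["columns"]

One design note: because these helpers return the HTTPError instead of raising it,
callers have to type-check the result before using it.
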
From 1a06e76b1a2f0dfdf8738c775dea9e8c414d70da Mon Sep 17 00:00:00 2001
From: Diego Oliveira
Date: Mon, 24 Jan 2022 19:47:19 -0300
Subject: [PATCH 08/22] [infra] fix dataset_config.yaml folder path (#1067)

---
 README.md                                     |   6 +-
 .../br_ibge_inpc/mes_brasil/table_config.yaml |   2 +-
 .../mes_categoria_brasil/table_config.yaml    |   2 +-
.../mes_categoria_municipio/table_config.yaml | 2 +- .../mes_categoria_rm/table_config.yaml | 2 +- bases/br_ibge_pib/code/build.do | 58 +- bases/br_ibge_pib/municipio/publish.sql | 2 +- bases/br_ibge_pib/municipio/schema-prod.json | 2 +- .../br_ibge_pib/municipio/schema-staging.json | 2 +- bases/br_ibge_pib/municipio/table_config.yaml | 332 ++-- .../municipio/table_description.txt | 37 +- bases/br_ibge_pnad/README.md | 7 + bases/br_ibge_pnad/code/br_ibge_pnad.py | 300 ++++ bases/br_ibge_pnad/dataset_config.yaml | 46 + bases/br_ibge_pnad/dicionario/publish.sql | 27 + .../br_ibge_pnad/dicionario/schema-prod.json | 1 + .../dicionario/schema-staging.json | 1 + .../br_ibge_pnad/dicionario/table_config.yaml | 171 ++ .../dicionario/table_description.txt | 44 + .../publish.sql | 125 ++ .../schema-prod.json | 1 + .../schema-staging.json | 1 + .../table_config.yaml | 1476 +++++++++++++++++ .../table_description.txt | 88 + .../ano_brasil_grupo_idade/table_config.yaml | 2 +- .../ano_brasil_raca_cor/table_config.yaml | 2 +- .../table_config.yaml | 2 +- .../ano_municipio_raca_cor/table_config.yaml | 2 +- .../ano_regiao_grupo_idade/table_config.yaml | 2 +- .../table_config.yaml | 2 +- .../table_config.yaml | 2 +- .../ano_regiao_raca_cor/table_config.yaml | 2 +- .../ano_uf_grupo_idade/table_config.yaml | 2 +- .../ano_uf_raca_cor/table_config.yaml | 2 +- bases/br_inep_saeb/README.md | 5 + bases/br_inep_saeb/dataset_config.yml | 30 + .../br_inep_saeb/dicionario/table_config.yaml | 2 +- .../dicionario/table_description.txt | 2 +- .../microdados/table_config.yaml | 3 +- .../table_config.yaml | 3 +- .../microdados_paciente/table_config.yaml | 3 +- .../microdados_vacinacao/table_config.yaml | 3 +- bases/br_ons_energia_armazenada/README.md | 6 +- .../code/br_ons_energia_armazenada.R | 68 +- .../dataset_config.yaml | 70 +- .../subsistemas/publish.sql | 4 +- .../subsistemas/schema-prod.json | 2 +- .../subsistemas/schema-staging.json | 2 +- .../subsistemas/table_config.yaml | 256 +-- .../subsistemas/table_description.txt | 40 +- bases/br_poder360_pesquisas/README.md | 7 + .../br_poder360_pesquisas/dataset_config.yaml | 42 + .../microdados/publish.sql | 45 + .../microdados/schema-prod.json | 1 + .../microdados/schema-staging.json | 1 + .../microdados/table_config.yaml | 446 +++++ .../microdados/table_description.txt | 77 + .../basedosdados/upload/metadata.py | 2 +- 66 files changed, 3465 insertions(+), 426 deletions(-) create mode 100644 bases/br_ibge_pnad/README.md create mode 100644 bases/br_ibge_pnad/code/br_ibge_pnad.py create mode 100644 bases/br_ibge_pnad/dataset_config.yaml create mode 100644 bases/br_ibge_pnad/dicionario/publish.sql create mode 100644 bases/br_ibge_pnad/dicionario/schema-prod.json create mode 100644 bases/br_ibge_pnad/dicionario/schema-staging.json create mode 100644 bases/br_ibge_pnad/dicionario/table_config.yaml create mode 100644 bases/br_ibge_pnad/dicionario/table_description.txt create mode 100644 bases/br_ibge_pnad/microdados_compatibilizado_datazoom/publish.sql create mode 100644 bases/br_ibge_pnad/microdados_compatibilizado_datazoom/schema-prod.json create mode 100644 bases/br_ibge_pnad/microdados_compatibilizado_datazoom/schema-staging.json create mode 100644 bases/br_ibge_pnad/microdados_compatibilizado_datazoom/table_config.yaml create mode 100644 bases/br_ibge_pnad/microdados_compatibilizado_datazoom/table_description.txt create mode 100644 bases/br_inep_saeb/README.md create mode 100644 bases/br_inep_saeb/dataset_config.yml create mode 100644 
bases/br_poder360_pesquisas/README.md create mode 100644 bases/br_poder360_pesquisas/dataset_config.yaml create mode 100644 bases/br_poder360_pesquisas/microdados/publish.sql create mode 100644 bases/br_poder360_pesquisas/microdados/schema-prod.json create mode 100644 bases/br_poder360_pesquisas/microdados/schema-staging.json create mode 100644 bases/br_poder360_pesquisas/microdados/table_config.yaml create mode 100644 bases/br_poder360_pesquisas/microdados/table_description.txt diff --git a/README.md b/README.md index 1da855b9b..755b2b9e6 100644 --- a/README.md +++ b/README.md @@ -196,6 +196,10 @@ Caso encontre algum problema no pacote e queira ajudar, basta documentar o probl ## Desenvolvimento +### Roadmap + +- [Primeiro semestre 2022](https://github.com/basedosdados/mais/milestone/2) + ### Documentação Para rodar a documentação localmente: @@ -221,4 +225,4 @@ nav: - Pacotes: access_data_packages.md - Contribua: colab.md - [Seu novo título]: .md -``` \ No newline at end of file +``` diff --git a/bases/br_ibge_inpc/mes_brasil/table_config.yaml b/bases/br_ibge_inpc/mes_brasil/table_config.yaml index 660c3c4db..ebae528aa 100644 --- a/bases/br_ibge_inpc/mes_brasil/table_config.yaml +++ b/bases/br_ibge_inpc/mes_brasil/table_config.yaml @@ -83,7 +83,7 @@ identifying_columns: last_updated: metadata: - data: + data: 2022_01_12 release: # Versão da tabela. Seguindo o padrão de semantic versioning. diff --git a/bases/br_ibge_inpc/mes_categoria_brasil/table_config.yaml b/bases/br_ibge_inpc/mes_categoria_brasil/table_config.yaml index 69b1ea4cb..95dc74e01 100644 --- a/bases/br_ibge_inpc/mes_categoria_brasil/table_config.yaml +++ b/bases/br_ibge_inpc/mes_categoria_brasil/table_config.yaml @@ -44,7 +44,7 @@ identifying_columns: last_updated: metadata: - data: + data: 2022_01_12 release: # Versão da tabela. Seguindo o padrão de semantic versioning. diff --git a/bases/br_ibge_inpc/mes_categoria_municipio/table_config.yaml b/bases/br_ibge_inpc/mes_categoria_municipio/table_config.yaml index 6c66083e1..0dd73dbaf 100644 --- a/bases/br_ibge_inpc/mes_categoria_municipio/table_config.yaml +++ b/bases/br_ibge_inpc/mes_categoria_municipio/table_config.yaml @@ -46,7 +46,7 @@ identifying_columns: last_updated: metadata: - data: + data: 2022_01_12 release: # Versão da tabela. Seguindo o padrão de semantic versioning. diff --git a/bases/br_ibge_inpc/mes_categoria_rm/table_config.yaml b/bases/br_ibge_inpc/mes_categoria_rm/table_config.yaml index fb77e7021..16a8cb967 100644 --- a/bases/br_ibge_inpc/mes_categoria_rm/table_config.yaml +++ b/bases/br_ibge_inpc/mes_categoria_rm/table_config.yaml @@ -45,7 +45,7 @@ identifying_columns: last_updated: metadata: - data: + data: 2022_01_12 release: # Versão da tabela. Seguindo o padrão de semantic versioning. diff --git a/bases/br_ibge_ipca/mes_brasil/table_config.yaml b/bases/br_ibge_ipca/mes_brasil/table_config.yaml index 68450d7f9..d6928561a 100644 --- a/bases/br_ibge_ipca/mes_brasil/table_config.yaml +++ b/bases/br_ibge_ipca/mes_brasil/table_config.yaml @@ -87,7 +87,7 @@ identifying_columns: last_updated: metadata: - data: + data: 2022_01_12 release: # Versão da tabela. Seguindo o padrão de semantic versioning. 
diff --git a/bases/br_ibge_ipca/mes_categoria_brasil/table_config.yaml b/bases/br_ibge_ipca/mes_categoria_brasil/table_config.yaml index d003d6cb1..ceab39f7f 100644 --- a/bases/br_ibge_ipca/mes_categoria_brasil/table_config.yaml +++ b/bases/br_ibge_ipca/mes_categoria_brasil/table_config.yaml @@ -48,7 +48,7 @@ identifying_columns: last_updated: metadata: - data: + data: 2022_01_12 release: # Versão da tabela. Seguindo o padrão de semantic versioning. diff --git a/bases/br_ibge_ipca/mes_categoria_municipio/table_config.yaml b/bases/br_ibge_ipca/mes_categoria_municipio/table_config.yaml index 556c2499d..a5c87292e 100644 --- a/bases/br_ibge_ipca/mes_categoria_municipio/table_config.yaml +++ b/bases/br_ibge_ipca/mes_categoria_municipio/table_config.yaml @@ -59,7 +59,7 @@ identifying_columns: last_updated: metadata: - data: + data: 2022_01_12 release: # Versão da tabela. Seguindo o padrão de semantic versioning. diff --git a/bases/br_ibge_ipca/mes_categoria_rm/table_config.yaml b/bases/br_ibge_ipca/mes_categoria_rm/table_config.yaml index 15db50609..fa365bd81 100644 --- a/bases/br_ibge_ipca/mes_categoria_rm/table_config.yaml +++ b/bases/br_ibge_ipca/mes_categoria_rm/table_config.yaml @@ -46,7 +46,7 @@ identifying_columns: last_updated: metadata: - data: + data: 2022_01_12 release: # Versão da tabela. Seguindo o padrão de semantic versioning. diff --git a/bases/br_ibge_ipca15/mes_brasil/table_config.yaml b/bases/br_ibge_ipca15/mes_brasil/table_config.yaml index 03846b080..bfb888f6b 100644 --- a/bases/br_ibge_ipca15/mes_brasil/table_config.yaml +++ b/bases/br_ibge_ipca15/mes_brasil/table_config.yaml @@ -63,7 +63,7 @@ identifying_columns: last_updated: metadata: - data: + data: 2022_01_12 release: # Versão da tabela. Seguindo o padrão de semantic versioning. diff --git a/bases/br_ibge_ipca15/mes_categoria_brasil/table_config.yaml b/bases/br_ibge_ipca15/mes_categoria_brasil/table_config.yaml index bf89aed94..ca7b9c511 100644 --- a/bases/br_ibge_ipca15/mes_categoria_brasil/table_config.yaml +++ b/bases/br_ibge_ipca15/mes_categoria_brasil/table_config.yaml @@ -44,7 +44,7 @@ identifying_columns: last_updated: metadata: - data: + data: 2022_01_12 release: # Versão da tabela. Seguindo o padrão de semantic versioning. diff --git a/bases/br_ibge_ipca15/mes_categoria_municipio/table_config.yaml b/bases/br_ibge_ipca15/mes_categoria_municipio/table_config.yaml index 1d5153f61..76ed9d0b1 100644 --- a/bases/br_ibge_ipca15/mes_categoria_municipio/table_config.yaml +++ b/bases/br_ibge_ipca15/mes_categoria_municipio/table_config.yaml @@ -56,7 +56,7 @@ identifying_columns: last_updated: metadata: - data: + data: 2022_01_12 release: # Versão da tabela. Seguindo o padrão de semantic versioning. diff --git a/bases/br_ibge_ipca15/mes_categoria_rm/table_config.yaml b/bases/br_ibge_ipca15/mes_categoria_rm/table_config.yaml index f3d5ae6d9..2b8145379 100644 --- a/bases/br_ibge_ipca15/mes_categoria_rm/table_config.yaml +++ b/bases/br_ibge_ipca15/mes_categoria_rm/table_config.yaml @@ -46,7 +46,7 @@ identifying_columns: last_updated: metadata: - data: + data: 2022_01_12 release: # Versão da tabela. Seguindo o padrão de semantic versioning. 
diff --git a/bases/br_ibge_pib/code/build.do b/bases/br_ibge_pib/code/build.do index 5405a1eb0..bdd6ecf94 100644 --- a/bases/br_ibge_pib/code/build.do +++ b/bases/br_ibge_pib/code/build.do @@ -6,7 +6,8 @@ clear all set more off -cd "/path/to/PIB" +//cd "/path/to/PIB" +cd "~/Dropbox/Academic/Data/Brazil/Municipios/PIB" //----------------------------------------------------------------------------// // build @@ -22,52 +23,63 @@ drop in 1 ren G id_municipio ren A ano -ren AM PIB +ren AM pib ren AL impostos_liquidos -ren AK VA -ren AG VA_agropecuaria -ren AH VA_industria -ren AI VA_servicos -ren AJ VA_ADESPSS +ren AK va +ren AG va_agropecuaria +ren AH va_industria +ren AI va_servicos +ren AJ va_adespss -keep id_municipio ano PIB VA* impostos* +keep id_municipio ano pib va* impostos* tempfile f2002_2009 save `f2002_2009' -import excel "input/PIB dos Municípios - base de dados 2010-2018.xls", clear +//----------------// +// 2010-2019 +//----------------// + +import excel "input/PIB dos Municípios - base de dados 2010-2019.xls", clear drop in 1 ren G id_municipio ren A ano -ren AM PIB +ren AM pib ren AL impostos_liquidos -ren AK VA -ren AG VA_agropecuaria -ren AH VA_industria -ren AI VA_servicos -ren AJ VA_ADESPSS +ren AK va +ren AG va_agropecuaria +ren AH va_industria +ren AI va_servicos +ren AJ va_adespss -keep id_municipio ano PIB impostos* VA* +keep id_municipio ano pib impostos* va* -tempfile f2010_2018 -save `f2010_2018' +tempfile f2010_2019 +save `f2010_2019' + +//----------------// +// append +//----------------// use `f2002_2009', clear -append using `f2010_2018' +append using `f2010_2019' destring, replace -foreach k of varlist PIB impostos_liquidos VA* { +foreach k of varlist pib impostos_liquidos va* { replace `k' = 1000 * `k' } * -order id_municipio ano PIB impostos_liquidos VA VA_agropecuaria VA_industria VA_servicos VA_ADESPSS +order id_municipio ano pib impostos_liquidos va va_agropecuaria va_industria va_servicos va_adespss sort id_municipio ano -format PIB impostos_liquidos VA* %20.0f +format PIB impostos_liquidos va* %20.0f + +export delimited "output/municipio.csv", replace datafmt + + -export delimited "output/municipios.csv", replace datafmt diff --git a/bases/br_ibge_pib/municipio/publish.sql b/bases/br_ibge_pib/municipio/publish.sql index bc44fefa5..f31401d8f 100644 --- a/bases/br_ibge_pib/municipio/publish.sql +++ b/bases/br_ibge_pib/municipio/publish.sql @@ -30,4 +30,4 @@ SAFE_CAST(VA_agropecuaria AS INT64) va_agropecuaria, SAFE_CAST(VA_industria AS INT64) va_industria, SAFE_CAST(VA_servicos AS INT64) va_servicos, SAFE_CAST(VA_ADESPSS AS INT64) va_adespss -from basedosdados-dev.br_ibge_pib_staging.municipio as t \ No newline at end of file +FROM basedosdados-dev.br_ibge_pib_staging.municipio AS t \ No newline at end of file diff --git a/bases/br_ibge_pib/municipio/schema-prod.json b/bases/br_ibge_pib/municipio/schema-prod.json index 683fad586..5d7ccfe3e 100644 --- a/bases/br_ibge_pib/municipio/schema-prod.json +++ b/bases/br_ibge_pib/municipio/schema-prod.json @@ -1 +1 @@ -[{"name": "id_municipio", "description": "ID Munic\u00edpio - IBGE 7 D\u00edgitos", "is_in_staging": true, "is_partition": false, "type": "STRING", "mode": "NULLABLE"}, {"name": "ano", "description": "Ano", "is_in_staging": true, "is_partition": false, "type": "INTEGER", "mode": "NULLABLE"}, {"name": "pib", "description": "Produto Interno Bruto a pre\u00e7os correntes", "is_in_staging": true, "is_partition": false, "type": "INTEGER", "mode": "NULLABLE"}, {"name": "impostos_liquidos", "description": 
"Impostos, l\u00edquidos de subs\u00eddios, sobre produtos a pre\u00e7os correntes", "is_in_staging": true, "is_partition": false, "type": "INTEGER", "mode": "NULLABLE"}, {"name": "va", "description": "Valor adicionado bruto a pre\u00e7os correntes total", "is_in_staging": true, "is_partition": false, "type": "INTEGER", "mode": "NULLABLE"}, {"name": "va_agropecuaria", "description": "Valor adicionado bruto a pre\u00e7os correntes da agropecu\u00e1ria", "is_in_staging": true, "is_partition": false, "type": "INTEGER", "mode": "NULLABLE"}, {"name": "va_industria", "description": "Valor adicionado bruto a pre\u00e7os correntes da ind\u00fastria", "is_in_staging": true, "is_partition": false, "type": "INTEGER", "mode": "NULLABLE"}, {"name": "va_servicos", "description": "Valor adicionado bruto a pre\u00e7os correntes dos servi\u00e7os, exclusive administra\u00e7\u00e3o, defesa, educa\u00e7\u00e3o e sa\u00fade p\u00fablicas e seguridade social", "is_in_staging": true, "is_partition": false, "type": "INTEGER", "mode": "NULLABLE"}, {"name": "va_adespss", "description": "Valor adicionado bruto a pre\u00e7os correntes da administra\u00e7\u00e3o, defesa, educa\u00e7\u00e3o e sa\u00fade p\u00fablicas e seguridade social", "is_in_staging": true, "is_partition": false, "type": "INTEGER", "mode": "NULLABLE"}] \ No newline at end of file +[{"name": "id_municipio", "bigquery_type": "string", "description": "ID Munic\u00edpio - IBGE 7 D\u00edgitos", "temporal_coverage": null, "covered_by_dictionary": false, "directory_column": {"dataset_id": "br_bd_diretorios_brasil", "table_id": "municipio", "column_name": "id_municipio"}, "measurement_unit": null, "has_sensitive_data": false, "is_in_staging": true, "is_partition": false, "type": "STRING", "mode": "NULLABLE"}, {"name": "ano", "bigquery_type": "int64", "description": "Ano", "temporal_coverage": null, "covered_by_dictionary": false, "directory_column": {"dataset_id": "br_bd_diretorios_data_tempo", "table_id": "ano", "column_name": "ano"}, "measurement_unit": "year", "has_sensitive_data": false, "is_in_staging": true, "is_partition": false, "type": "INTEGER", "mode": "NULLABLE"}, {"name": "pib", "bigquery_type": "int64", "description": "Produto Interno Bruto a pre\u00e7os correntes", "temporal_coverage": null, "covered_by_dictionary": false, "directory_column": {"dataset_id": null, "table_id": null, "column_name": null}, "measurement_unit": "brl", "has_sensitive_data": false, "is_in_staging": true, "is_partition": false, "type": "INTEGER", "mode": "NULLABLE"}, {"name": "impostos_liquidos", "bigquery_type": "int64", "description": "Impostos, l\u00edquidos de subs\u00eddios, sobre produtos a pre\u00e7os correntes", "temporal_coverage": null, "covered_by_dictionary": null, "directory_column": {"dataset_id": null, "table_id": null, "column_name": null}, "measurement_unit": null, "has_sensitive_data": null, "is_in_staging": true, "is_partition": false, "type": "INTEGER", "mode": "NULLABLE"}, {"name": "va", "bigquery_type": "int64", "description": "Valor adicionado bruto a pre\u00e7os correntes total", "temporal_coverage": null, "covered_by_dictionary": false, "directory_column": {"dataset_id": null, "table_id": null, "column_name": null}, "measurement_unit": "brl", "has_sensitive_data": false, "is_in_staging": true, "is_partition": false, "type": "INTEGER", "mode": "NULLABLE"}, {"name": "va_agropecuaria", "bigquery_type": "int64", "description": "Valor adicionado bruto a pre\u00e7os correntes da agropecu\u00e1ria", "temporal_coverage": null, 
"covered_by_dictionary": false, "directory_column": {"dataset_id": null, "table_id": null, "column_name": null}, "measurement_unit": "brl", "has_sensitive_data": null, "is_in_staging": true, "is_partition": false, "type": "INTEGER", "mode": "NULLABLE"}, {"name": "va_industria", "bigquery_type": "int64", "description": "Valor adicionado bruto a pre\u00e7os correntes da ind\u00fastria", "temporal_coverage": null, "covered_by_dictionary": false, "directory_column": {"dataset_id": null, "table_id": null, "column_name": null}, "measurement_unit": "brl", "has_sensitive_data": false, "is_in_staging": true, "is_partition": false, "type": "INTEGER", "mode": "NULLABLE"}, {"name": "va_servicos", "bigquery_type": "int64", "description": "Valor adicionado bruto a pre\u00e7os correntes dos servi\u00e7os, exclusive administra\u00e7\u00e3o, defesa, educa\u00e7\u00e3o e sa\u00fade p\u00fablicas e seguridade social", "temporal_coverage": null, "covered_by_dictionary": false, "directory_column": {"dataset_id": null, "table_id": null, "column_name": null}, "measurement_unit": "brl", "has_sensitive_data": false, "is_in_staging": true, "is_partition": false, "type": "INTEGER", "mode": "NULLABLE"}, {"name": "va_adespss", "bigquery_type": "int64", "description": "Valor adicionado bruto a pre\u00e7os correntes da administra\u00e7\u00e3o, defesa, educa\u00e7\u00e3o e sa\u00fade p\u00fablicas e seguridade social", "temporal_coverage": null, "covered_by_dictionary": false, "directory_column": {"dataset_id": null, "table_id": null, "column_name": null}, "measurement_unit": "brl", "has_sensitive_data": false, "is_in_staging": true, "is_partition": false, "type": "INTEGER", "mode": "NULLABLE"}] \ No newline at end of file diff --git a/bases/br_ibge_pib/municipio/schema-staging.json b/bases/br_ibge_pib/municipio/schema-staging.json index c5df68167..cb58a98e9 100644 --- a/bases/br_ibge_pib/municipio/schema-staging.json +++ b/bases/br_ibge_pib/municipio/schema-staging.json @@ -1 +1 @@ -[{"name": "id_municipio", "description": "", "is_in_staging": true, "is_partition": false, "type": "STRING"}, {"name": "ano", "description": "", "is_in_staging": true, "is_partition": false, "type": "STRING"}, {"name": "PIB", "description": "", "is_in_staging": true, "is_partition": false, "type": "STRING"}, {"name": "impostos_liquidos", "description": "", "is_in_staging": true, "is_partition": false, "type": "STRING"}, {"name": "VA", "description": "", "is_in_staging": true, "is_partition": false, "type": "STRING"}, {"name": "VA_agropecuaria", "description": "", "is_in_staging": true, "is_partition": false, "type": "STRING"}, {"name": "VA_industria", "description": "", "is_in_staging": true, "is_partition": false, "type": "STRING"}, {"name": "VA_servicos", "description": "", "is_in_staging": true, "is_partition": false, "type": "STRING"}, {"name": "VA_ADESPSS", "description": "", "is_in_staging": true, "is_partition": false, "type": "STRING"}] \ No newline at end of file +[{"name": "id_municipio", "bigquery_type": "string", "description": "ID Munic\u00edpio - IBGE 7 D\u00edgitos", "temporal_coverage": null, "covered_by_dictionary": false, "directory_column": {"dataset_id": "br_bd_diretorios_brasil", "table_id": "municipio", "column_name": "id_municipio"}, "measurement_unit": null, "has_sensitive_data": false, "is_in_staging": true, "is_partition": false, "type": "STRING"}, {"name": "ano", "bigquery_type": "int64", "description": "Ano", "temporal_coverage": null, "covered_by_dictionary": false, "directory_column": {"dataset_id": 
"br_bd_diretorios_data_tempo", "table_id": "ano", "column_name": "ano"}, "measurement_unit": "year", "has_sensitive_data": false, "is_in_staging": true, "is_partition": false, "type": "STRING"}, {"name": "pib", "bigquery_type": "int64", "description": "Produto Interno Bruto a pre\u00e7os correntes", "temporal_coverage": null, "covered_by_dictionary": false, "directory_column": {"dataset_id": null, "table_id": null, "column_name": null}, "measurement_unit": "brl", "has_sensitive_data": false, "is_in_staging": true, "is_partition": false, "type": "STRING"}, {"name": "impostos_liquidos", "bigquery_type": "int64", "description": "Impostos, l\u00edquidos de subs\u00eddios, sobre produtos a pre\u00e7os correntes", "temporal_coverage": null, "covered_by_dictionary": null, "directory_column": {"dataset_id": null, "table_id": null, "column_name": null}, "measurement_unit": null, "has_sensitive_data": null, "is_in_staging": true, "is_partition": false, "type": "STRING"}, {"name": "va", "bigquery_type": "int64", "description": "Valor adicionado bruto a pre\u00e7os correntes total", "temporal_coverage": null, "covered_by_dictionary": false, "directory_column": {"dataset_id": null, "table_id": null, "column_name": null}, "measurement_unit": "brl", "has_sensitive_data": false, "is_in_staging": true, "is_partition": false, "type": "STRING"}, {"name": "va_agropecuaria", "bigquery_type": "int64", "description": "Valor adicionado bruto a pre\u00e7os correntes da agropecu\u00e1ria", "temporal_coverage": null, "covered_by_dictionary": false, "directory_column": {"dataset_id": null, "table_id": null, "column_name": null}, "measurement_unit": "brl", "has_sensitive_data": null, "is_in_staging": true, "is_partition": false, "type": "STRING"}, {"name": "va_industria", "bigquery_type": "int64", "description": "Valor adicionado bruto a pre\u00e7os correntes da ind\u00fastria", "temporal_coverage": null, "covered_by_dictionary": false, "directory_column": {"dataset_id": null, "table_id": null, "column_name": null}, "measurement_unit": "brl", "has_sensitive_data": false, "is_in_staging": true, "is_partition": false, "type": "STRING"}, {"name": "va_servicos", "bigquery_type": "int64", "description": "Valor adicionado bruto a pre\u00e7os correntes dos servi\u00e7os, exclusive administra\u00e7\u00e3o, defesa, educa\u00e7\u00e3o e sa\u00fade p\u00fablicas e seguridade social", "temporal_coverage": null, "covered_by_dictionary": false, "directory_column": {"dataset_id": null, "table_id": null, "column_name": null}, "measurement_unit": "brl", "has_sensitive_data": false, "is_in_staging": true, "is_partition": false, "type": "STRING"}, {"name": "va_adespss", "bigquery_type": "int64", "description": "Valor adicionado bruto a pre\u00e7os correntes da administra\u00e7\u00e3o, defesa, educa\u00e7\u00e3o e sa\u00fade p\u00fablicas e seguridade social", "temporal_coverage": null, "covered_by_dictionary": false, "directory_column": {"dataset_id": null, "table_id": null, "column_name": null}, "measurement_unit": "brl", "has_sensitive_data": false, "is_in_staging": true, "is_partition": false, "type": "STRING"}] \ No newline at end of file diff --git a/bases/br_ibge_pib/municipio/table_config.yaml b/bases/br_ibge_pib/municipio/table_config.yaml index bf65e89f6..a6fe8ffdc 100644 --- a/bases/br_ibge_pib/municipio/table_config.yaml +++ b/bases/br_ibge_pib/municipio/table_config.yaml @@ -1,80 +1,21 @@ -source_bucket_name: basedosdados-dev -project_id_staging: basedosdados-dev -project_id_prod: basedosdados-dev -table_id: municipio # AUTO 
GENERATED -dataset_id: br_ibge_pib # AUTO GENERATED - -url_ckan: https://basedosdados.org/dataset/br-ibge-pib # AUTO GENERATED -url_github: https://github.com/basedosdados/mais/tree/master/bases/br_ibge_pib # AUTO GENERATED -version: v1.0 # REQUIRED +dataset_id: br_ibge_pib -last_updated: 2021-07-05 # AUTO GENERATED +table_id: municipio # Descreva a tabela. Essas são as primeiras frases que um usuário vai ver. # Você não precisa ser muito conciso. Sinta-se a vontade para dar exemplos de # como usar os dados. -# Se souber, liste também aplicações: pesquisa, apps, etc. que usem os dados. -description: | # REQUIRED - Produto Interno Bruto (PIB) municipal a preços correntes. - -# Quem está completando esse arquivo config? -published_by: - name: Ricardo Dahis - code_url: https://github.com/basedosdados/mais/tree/master/bases/br_ibge_pib/code # REQUIRED - website: www.ricardodahis.com - email: rdahis@basedosdados.org - -# Qual organização/departamento/pessoa tratou os dados? -# As vezes há um ponto intermediário entre os dados originais e subir na Base dos Dados. -# Se essa pessoa é você, preencha abaixo com suas informações. -treated_by: - name: Ricardo Dahis - code_url: https://github.com/basedosdados/mais/tree/master/bases/br_ibge_pib/code # REQUIRED - website: - email: rdahis@basedosdados.org - -# Se houve passos de tratamento, limpeza e manipulação de dados, descreva-os aqui. -treatment_description: | - Mudamos a unidade de medida para R$1. - - -# Com qual frequência a base é atualizada? -# Opções: hora | dia | semana | mes | 1 ano | 2 anos | 5 anos | 10 anos | unico | recorrente -data_update_frequency: 1 ano # REQUIRED - -# Nível da observação (qual é a granularidade de cada linha na tabela) -# Escolha todas as opções necessárias. -# Regras: -# - minúsculo, sem acento, singular. -# - em portugues (ou seja, não use os nomes de colunas abaixo) -# Exemplos: pais, estado, municipio, cidade, hora, dia, semana, mes, ano, etc. -observation_level: #REQUIRED - - municipio - - ano +# Se souber, liste também aplicações: pesquisa, apps, etc. que usem os dados., +description: "Produto Interno Bruto (PIB) municipal a preços correntes.\n" -# Quais colunas identificam uma linha unicamente? -# Preencha com os nomes de colunas. Ex: id_municipio, ano. -# Pode ser vazio pois certas tabelas não possuem identificadores. -primary_keys: - - id_municipio - - ano +# A máxima unidade espacial que a tabela cobre. +spatial_coverage: bra -# Qual é a cobertura espacial da tabela? -# Regras: -# - minúsculo, sem acento, singular -# - descer até o menor nível administrativo cuja cobertura abaixo seja 'todos' -# Exemplo 1: tabela que cubra todos os municípios nos estados de SP e GO -# - brasil -# - SP, GO -# Exemplo 2: tabela que cubra países inteiros na América Latina -# - brasil, argentina, peru, equador -coverage_geo: - - brasil - -# Qual é a cobertura temporal (em anos) da tabela? -# Opções: ..., 1990, 1991, ..., 1999, 2000, 2001, ..., 2019, 2020, ... -coverage_time: +# Anos cobertos pela tabela. +# Preencher como lista de intervalos. +# Exemplo: 1995(1)2019. +temporal_coverage: - 2002 - 2003 - 2004 @@ -92,12 +33,86 @@ coverage_time: - 2016 - 2017 - 2018 + - 2019 + +# A unidade temporal com qual a tabela é atualizada. +# Opções em 'https://basedosdados.org/api/3/action/bd_available_options' +update_frequency: one_year + +# Entidade representada por cada linha. +# Opções em 'https://basedosdados.org/api/3/action/bd_available_options' +entity: + - municipality + +# A unidade temporal representada por cada linha. 
+# Opções em 'https://basedosdados.org/api/3/action/bd_available_options' +time_unit: + +# O conjunto mínimo de colunas identificando cada linha unicamente. +# Preencha com os nomes de colunas. +# Exemplos: id_municipio, ano. +# Pode ser vazio pois certas tabelas não possuem identificadores. +identifying_columns: + - id_municipio + - ano + +last_updated: + metadata: + data: + release: + +# Versão da tabela. Seguindo o padrão de semantic versioning. +# Exemplo: v1.1.3 +version: v1.0 + +# Quem está preenchendo esses metadados? +published_by: + name: Ricardo Dahis + email: rdahis@basedosdados.org + github_user: rdahis + website: www.ricardodahis.com + ckan_user: rdahis + +# Qual organização/departamento/pessoa tratou os dados? +# As vezes há um ponto intermediário entre os dados originais e subir na Base dos Dados. +# Se essa pessoa é você, preencha abaixo com suas informações. +data_cleaned_by: + name: Ricardo Dahis + email: rdahis@basedosdados.org + github_user: rdahis + ckan_user: rdahis + website: www.ricardodahis.com + code_url: https://github.com/basedosdados/mais/tree/master/bases/br_ibge_pib/code + +# Se houve passos de tratamento, limpeza e manipulação de dados, descreva-os aqui. +data_cleaning_description: "Mudamos a unidade de medida para R$1.\n" + +# Url dos dados originais no GCP Storage. +raw_files_url: + +# Url dos arquivos auxiliares no GCP Storage. +auxiliary_files_url: '' + +# Url da tabela de arquitetura no GCP Storage. +architecture_url: + +# A tabela tem colunas que precisam de dicionário? +# Opções: yes, no. +covered_by_dictionary: + +source_bucket_name: basedosdados-dev + +project_id_prod: basedosdados-dev + +project_id_staging: basedosdados-dev # Liste as colunas da tabela que representam partições. # Não esqueça de deletar essas colunas nas tabelas .csv na hora de subir para o BigQuery. # Isso poupará muito tempo e dinheiro às pessoas utilizando essa tabela. # Se não houver partições, não modifique abaixo. -partitions: # REQUIRED +partitions: + +bdm_file_size: # Quais são as colunas? Certifique-se de escrever uma boa descrição, as pessoas vão gostar # para saber sobre o que é a coluna. @@ -107,58 +122,125 @@ partitions: # REQUIRED # Algumas colunas existirão apenas na tabela final, você as construirá em `publish.sql`. # Para esses, defina is_in_staging como False. # Além disso, você deve adicionar as colunas de partição aqui e definir is_partition como True. -columns: # REQUIRED - - - - name: id_municipio - description: ID Município - IBGE 7 Dígitos - is_in_staging: True # Bool [True, False], whether the column is in the staging table - is_partition: False # Bool [True, False], whether the column is a partition. - - - - name: ano - description: Ano - is_in_staging: True # Bool [True, False], whether the column is in the staging table - is_partition: False # Bool [True, False], whether the column is a partition. - - - - name: pib - description: Produto Interno Bruto a preços correntes - is_in_staging: True # Bool [True, False], whether the column is in the staging table - is_partition: False # Bool [True, False], whether the column is a partition. - - - - name: impostos_liquidos - description: Impostos, líquidos de subsídios, sobre produtos a preços correntes - is_in_staging: True # Bool [True, False], whether the column is in the staging table - is_partition: False # Bool [True, False], whether the column is a partition. 
- - - - name: va - description: Valor adicionado bruto a preços correntes total - is_in_staging: True # Bool [True, False], whether the column is in the staging table - is_partition: False # Bool [True, False], whether the column is a partition. - - - - name: va_agropecuaria - description: Valor adicionado bruto a preços correntes da agropecuária - is_in_staging: True # Bool [True, False], whether the column is in the staging table - is_partition: False # Bool [True, False], whether the column is a partition. - - - - name: va_industria - description: Valor adicionado bruto a preços correntes da indústria - is_in_staging: True # Bool [True, False], whether the column is in the staging table - is_partition: False # Bool [True, False], whether the column is a partition. - - - - name: va_servicos - description: Valor adicionado bruto a preços correntes dos serviços, exclusive administração, defesa, educação e saúde públicas e seguridade social - is_in_staging: True # Bool [True, False], whether the column is in the staging table - is_partition: False # Bool [True, False], whether the column is a partition. - - - - name: va_adespss - description: Valor adicionado bruto a preços correntes da administração, defesa, educação e saúde públicas e seguridade social - is_in_staging: True # Bool [True, False], whether the column is in the staging table - is_partition: False # Bool [True, False], whether the column is a partition. \ No newline at end of file +columns: + - name: id_municipio + bigquery_type: string + description: ID Município - IBGE 7 Dígitos + temporal_coverage: + covered_by_dictionary: no + directory_column: + dataset_id: br_bd_diretorios_brasil + table_id: municipio + column_name: id_municipio + measurement_unit: + has_sensitive_data: no + is_in_staging: true + is_partition: false + - name: ano + bigquery_type: int64 + description: Ano + temporal_coverage: + covered_by_dictionary: no + directory_column: + dataset_id: br_bd_diretorios_data_tempo + table_id: ano + column_name: ano + measurement_unit: year + has_sensitive_data: no + is_in_staging: true + is_partition: false + - name: pib + bigquery_type: int64 + description: Produto Interno Bruto a preços correntes + temporal_coverage: + covered_by_dictionary: no + directory_column: + dataset_id: + table_id: + column_name: + measurement_unit: brl + has_sensitive_data: no + is_in_staging: true + is_partition: false + - name: impostos_liquidos + bigquery_type: int64 + description: Impostos, líquidos de subsídios, sobre produtos a preços correntes + temporal_coverage: + covered_by_dictionary: + directory_column: + dataset_id: + table_id: + column_name: + measurement_unit: + has_sensitive_data: + is_in_staging: true + is_partition: false + - name: va + bigquery_type: int64 + description: Valor adicionado bruto a preços correntes total + temporal_coverage: + covered_by_dictionary: no + directory_column: + dataset_id: + table_id: + column_name: + measurement_unit: brl + has_sensitive_data: no + is_in_staging: true + is_partition: false + - name: va_agropecuaria + bigquery_type: int64 + description: Valor adicionado bruto a preços correntes da agropecuária + temporal_coverage: + covered_by_dictionary: no + directory_column: + dataset_id: + table_id: + column_name: + measurement_unit: brl + has_sensitive_data: + is_in_staging: true + is_partition: false + - name: va_industria + bigquery_type: int64 + description: Valor adicionado bruto a preços correntes da indústria + temporal_coverage: + covered_by_dictionary: no + directory_column: + 
dataset_id: + table_id: + column_name: + measurement_unit: brl + has_sensitive_data: no + is_in_staging: true + is_partition: false + - name: va_servicos + bigquery_type: int64 + description: Valor adicionado bruto a preços correntes dos serviços, exclusive + administração, defesa, educação e saúde públicas e seguridade social + temporal_coverage: + covered_by_dictionary: no + directory_column: + dataset_id: + table_id: + column_name: + measurement_unit: brl + has_sensitive_data: no + is_in_staging: true + is_partition: false + - name: va_adespss + bigquery_type: int64 + description: Valor adicionado bruto a preços correntes da administração, defesa, + educação e saúde públicas e seguridade social + temporal_coverage: + covered_by_dictionary: no + directory_column: + dataset_id: + table_id: + column_name: + measurement_unit: brl + has_sensitive_data: no + is_in_staging: true + is_partition: false + +metadata_modified: '2021-09-12T23:21:06.841921' diff --git a/bases/br_ibge_pib/municipio/table_description.txt b/bases/br_ibge_pib/municipio/table_description.txt index aa5359063..d495aeeb1 100644 --- a/bases/br_ibge_pib/municipio/table_description.txt +++ b/bases/br_ibge_pib/municipio/table_description.txt @@ -2,8 +2,8 @@ Produto Interno Bruto (PIB) municipal a preços correntes. Para saber mais acesse: -Website: https://basedosdados.org/dataset/br-ibge-pib -Github: https://github.com/basedosdados/mais/tree/master/bases/br_ibge_pib +Website: +Github: Ajude a manter o projeto :) Apoia-se: https://apoia.se/basedosdados @@ -11,24 +11,27 @@ Apoia-se: https://apoia.se/basedosdados Publicado por ------------- Nome: Ricardo Dahis -Código: https://github.com/basedosdados/mais/tree/master/bases/br_ibge_pib/code +Código: Website: www.ricardodahis.com -Email: rdahis@basedosdados.orgTratado por +Email: rdahis@basedosdados.org +Tratado por ----------- Nome: Ricardo Dahis Código: https://github.com/basedosdados/mais/tree/master/bases/br_ibge_pib/code +Website: www.ricardodahis.com Email: rdahis@basedosdados.org -Nível da Observação (i.e. a granularidade da linha) ------------------- -- municipio -- ano + + Colunas identificando linhas unicamente ------------------- - id_municipio - ano + + + Cobertura Temporal ------------------ - 2002 @@ -48,17 +51,27 @@ Cobertura Temporal - 2016 - 2017 - 2018 +- 2019 + + + Cobertura Espacial ------------------ -- brasil +- bra + + + Tratamento ---------- Mudamos a unidade de medida para R$1. -Frequencia de Atualização ------------------------- -1 ano + +Frequencia de Atualização +------------------------- +one_year diff --git a/bases/br_ibge_pnad/README.md b/bases/br_ibge_pnad/README.md new file mode 100644 index 000000000..0f2cd1f7c --- /dev/null +++ b/bases/br_ibge_pnad/README.md @@ -0,0 +1,7 @@ +Como capturar os dados de br_ibge_pnad? + +Para capturar esses dados, basta verificar o link dos dados originais indicado em dataset_config.yaml no item website. + +Caso tenha sido utilizado algum código de captura ou tratamento, estes estarão contidos em code/. Se o dado publicado for em sua versão bruta, não existirá a pasta code/.
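+
+Um exemplo hipotético de leitura da tabela publicada com o pacote Python basedosdados (o billing_project_id abaixo é uma suposição; substitua pelo seu projeto no GCP):
+
+```python
+import basedosdados as bd
+
+# leitura da tabela publicada; o projeto de billing é uma suposição do usuário
+df = bd.read_table(
+    dataset_id="br_ibge_pnad",
+    table_id="microdados_compatibilizado_datazoom",
+    billing_project_id="<seu-projeto>",
+)
+```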
+ +Os dados publicados estão disponíveis em: https://basedosdados.org/dataset/br-ibge-pnad \ No newline at end of file diff --git a/bases/br_ibge_pnad/code/br_ibge_pnad.py b/bases/br_ibge_pnad/code/br_ibge_pnad.py new file mode 100644 index 000000000..405ef44a6 --- /dev/null +++ b/bases/br_ibge_pnad/code/br_ibge_pnad.py @@ -0,0 +1,300 @@ +#--------------------# +# path +#--------------------# + +path = "/content/gdrive/MyDrive/Base dos Dados/Dados/" +path_dados = path + "Bases/br_ibge_pnad/" + +#--------------------# +# pacotes +#--------------------# + +!pip install unidecode +import unidecode as un + +import pandas as pd +import numpy as np + +#cria pastas ano e UF + +list_anos = [1981, 1982, 1983, 1984, 1985, 1986, 1987, 1988, 1989, 1990, 1992, + 1993, 1995, 1996, 1997, 1998, 1999, 2001, 2002, 2003, 2004, 2005, + 2006, 2007, 2008, 2009, 2011, 2012, 2013, 2014, 2015] + +# list_anos = [1981, 1982] # teste + +ufs = ["AC", "AL", "AM", "AP", "BA", "CE", "DF", "ES", "GO", + "MA", "MG", "MS", "MT", "PA", "PB", "PE", "PI", "PR", + "RJ", "RN", "RO", "RR", "SC", "SE", "RS", "SP", "TO"] + +id_uf = { + "11" : "RO", + "12" : "AC", + "13" : "AM", + "14" : "RR", + "15" : "PA", + "16" : "AP", + "17" : "TO", + "21" : "MA", + "22" : "PI", + "23" : "CE", + "24" : "RN", + "25" : "PB", + "26" : "PE", + "27" : "AL", + "28" : "SE", + "29" : "BA", + "31" : "MG", + "32" : "ES", + "33" : "RJ", + "35" : "SP", + "41" : "PR", + "42" : "SC", + "43" : "RS", + "50" : "MS", + "51" : "MT", + "52" : "GO", + "53" : "DF" +} + +list_rename = { + 'ano':'ano', + 'sigla_uf':'sigla_uf', + 'uf':'id_uf', + 'regiao':'id_regiao', + 'metropol':'id_regiao_metropolitana', + 'id_dom':'id_domicilio', + 'urbana':'zona_urbana', + 'area_censit':'tipo_zona_domicilio', + 'tot_pess':'total_pessoas', + 'tot_pess_10_mais':'total_pessoas_10_mais', + 'especie_dom':'especie_domicilio', + 'tipo_dom':'tipo_domicilio', + 'parede':'tipo_parede', + 'cobertura':'tipo_cobertura', + 'agua_rede':'possui_agua_rede', + 'esgoto':'tipo_esgoto', + 'sanit_excl':'possui_sanitario_exclusivo', + 'lixo':'lixo_coletado', + 'ilum_eletr':'possui_iluminacao_eletrica', + 'comodos':'quantidade_comodos', + 'dormit':'quantidade_dormitorios', + 'sanit':'possui_sanitario', + 'posse_dom':'posse_domocilio', + 'filtro':'possui_filtro', + 'fogao':'possui_fogao', + 'geladeira':'possui_geladeira', + 'radio':'possui_radio', + 'tv':'possui_tv', + 'renda_dom':'renda_mensal_domiciliar', + 'renda_domB':'renda_mensal_domiciliar_compativel_1992', + 'peso':'peso_amostral', + 'aluguel':'valor_aluguel_pago', + 'prestacao':'prestacao', + 'deflator':'deflator', + 'conversor':'conversor_moeda', + 'renda_dom_def':'renda_domicilio_deflacionada', + 'renda_domB_def':'renda_mensal_domiciliar_compativel_1992_deflacionada', + 'aluguel_def':'aluguel_deflacionado', + 'prestacao_def':'prestacao_deflacionado', + 'num_fam':'numero_familia', + 'sexo':'sexo', + 'cond_dom':'condicao_domicilio', + 'cond_fam':'condicao_familia', + 'dia_nasc':'dia_nascimento', + 'mes_nasc':'mes_nascimento', + 'ano_nasc':'ano_nascimento', + 'idade':'idade', + 'ler_escrever':'sabe_ler_escrever', + 'serie_freq':'serie_frequentada', + 'grau_freq':'grau_frequentada', + 'serie_nao_freq':'ultima_seria_frequentada', + 'grau_nao_freq':'ultimo_grau_frequentado', + 'tinha_outro_trab':'tinha_outro_trabalho', + 'ocup_sem':'ocupacao_semana', + 'ramo_negocio_sem':'atividade_ramo_negocio_semana', + 'tem_carteira_assinada':'possui_carteira_assinada', + 'renda_mensal_din':'rendimento_mensal_dinheiro',
'renda_mensal_prod':'rendimento_mensal_produto_mercadoria', + 'horas_trab_sem':'horas_trabalhadas_semana', + 'renda_mensal_din_outra':'rendimento_mensal_dinheiro_outra', + 'renda_mensal_prod_outra':'rendimento_mensal_produto_outra', + 'horas_trab_sem_outro':'horas_trabalhadas_outros_trabalhos', + 'contr_inst_prev':'contribui_previdencia', + 'qual_inst_prev':'tipo_instituto_previdencia', + 'tomou_prov_semana':'tomou_providencia_conseguir_trabalho_semana' , + 'tomou_prov_2meses':'tomou_providencia_ultimos_2_meses', + 'que_prov_tomou':'qual_medida', + 'tinha_cart_assin_ant_ano':'tinha_carteira_assinada_ultimo_emprego' , + 'renda_aposentadoria':'valor_aposentadoria', + 'renda_pensao':'valor_pensao', + 'renda_abono':'valor_abono_permanente', + 'renda_aluguel':'valor_aluguel', + 'renda_outras':'renda_outras', + 'renda_mensal_ocup_prin':'rendimento_mensal_ocupacao_principal', + 'renda_mensal_todos_trab':'rendimento_mensal_todos_trabalhos', + 'renda_mensal_todas_fontes':'rendimento_mensal_todas_fontes', + 'ramo_negocio_agreg':'atividade_ramo_negocio_agregado', + 'horas_trab_todos_trab':'horas_trabalhadas_todos_trabalhos', + 'pos_ocup_sem':'posicao_ocupacao' , + 'grupos_ocup_sem':'grupos_ocupacao', + 'num_pes_fam':'numero_membros_familia' , + 'renda_mensal_fam':'rendimento_mensal_familia' , + 'ordem':'ordem', + 'cor':'raca_cor', + 'educa':'anos_estudo', + 'freq_escola':'frequenta_escola', + 'trabalhou_semana':'trabalhou_semana', + 'tinha_trab_sem':'tinha_trabalhado_semana', + 'ocup_ant_ano':'ocupacao_ano_anterior', + 'ramo_negocio_ant_ano':'atividade_ramo_negocio_anterior', + 'renda_din_def':'rendimento_mensal_dinheiro_deflacionado', + 'renda_prod_def':'rendimento_mensal_produto_mercadoria_deflacionado', + 'renda_din_outra_def':'rendimento_mensal_dinheiro_outra_deflacionado', + 'renda_prod_outra_def':'rendimento_mensal_produto_mercadoria_outra_deflacionado' , + 'renda_ocup_prin_def':'rendimento_mensal_ocupacao_principal_deflacionado', + 'renda_todos_trab_def':'rendimento_mensal_todos_trabalhos_deflacionado', + 'renda_todas_fontes_def':'rendimento_mensal_todas_fontes_deflacionado', + 'renda_fam_def':'rendimento_mensal_familia_deflacionado', + 'renda_aposentadoria_def':'valor_aposentadoria_deflacionado', + 'renda_pensao_def':'valor_pensao_deflacionado', + 'renda_abono_def':'valor_abono_deflacionado', + 'renda_aluguel_def':'valor_aluguel_deflacionado', + 'renda_outras_def':'rendas_outras_deflacionado' +} + + + +list_ordem = ['ano', +'sigla_uf', +'id_uf', +'id_regiao', +'id_regiao_metropolitana', +'id_domicilio', +'zona_urbana', +'tipo_zona_domicilio', +'total_pessoas', +'total_pessoas_10_mais', +'especie_domicilio', +'tipo_domicilio', +'tipo_parede', +'tipo_cobertura', +'possui_agua_rede', +'tipo_esgoto', +'possui_sanitario_exclusivo', +'lixo_coletado', +'possui_iluminacao_eletrica', +'quantidade_comodos', +'quantidade_dormitorios', +'possui_sanitario', +'posse_domocilio', +'possui_filtro', +'possui_fogao', +'possui_geladeira', +'possui_radio', +'possui_tv', +'renda_mensal_domiciliar', +'renda_mensal_domiciliar_compativel_1992', +'peso_amostral', +'valor_aluguel_pago', +'prestacao', +'deflator', +'conversor_moeda', +'renda_domicilio_deflacionada', +'renda_mensal_domiciliar_compativel_1992_deflacionada', +'aluguel_deflacionado', +'prestacao_deflacionado', +'numero_familia', +'sexo', +'condicao_domicilio', +'condicao_familia', +'dia_nascimento', +'mes_nascimento', +'ano_nascimento', +'idade', +'sabe_ler_escrever', +'serie_frequentada', +'grau_frequentada', +'ultima_seria_frequentada', 
+'ultimo_grau_frequentado', +'tinha_outro_trabalho', +'ocupacao_semana', +'atividade_ramo_negocio_semana', +'possui_carteira_assinada', +'rendimento_mensal_dinheiro', +'rendimento_mensal_produto_mercadoria', +'horas_trabalhadas_semana', +'rendimento_mensal_dinheiro_outra', +'rendimento_mensal_produto_outra', +'horas_trabalhadas_outros_trabalhos', +'contribui_previdencia', +'tipo_instituto_previdencia', +'tomou_providencia_conseguir_trabalho_semana' , +'tomou_providencia_ultimos_2_meses', +'qual_medida', +'tinha_carteira_assinada_ultimo_emprego' , +'valor_aposentadoria', +'valor_pensao', +'valor_abono_permanente', +'valor_aluguel', +'renda_outras', +'rendimento_mensal_ocupacao_principal', +'rendimento_mensal_todos_trabalhos', +'rendimento_mensal_todas_fontes', +'atividade_ramo_negocio_agregado', +'horas_trabalhadas_todos_trabalhos', +'posicao_ocupacao' , +'grupos_ocupacao', +'numero_membros_familia' , +'rendimento_mensal_familia' , +'ordem', +'raca_cor', +'anos_estudo', +'frequenta_escola', +'trabalhou_semana', +'tinha_trabalhado_semana', +'ocupacao_ano_anterior', +'atividade_ramo_negocio_anterior', +'rendimento_mensal_dinheiro_deflacionado', +'rendimento_mensal_produto_mercadoria_deflacionado', +'rendimento_mensal_dinheiro_outra_deflacionado', +'rendimento_mensal_produto_mercadoria_outra_deflacionado', +'rendimento_mensal_ocupacao_principal_deflacionado', +'rendimento_mensal_todos_trabalhos_deflacionado', +'rendimento_mensal_todas_fontes_deflacionado', +'rendimento_mensal_familia_deflacionado', +'valor_aposentadoria_deflacionado', +'valor_pensao_deflacionado', +'valor_abono_deflacionado', +'valor_aluguel_deflacionado', +'rendas_outras_deflacionado'] + +#--------------------# +# cria particoes +#--------------------# + +# for i in list_anos: +# for uf in ufs: +# directory = path_dados + 'output/microdados_compatibilizado_datazoom/ano={}/sigla_uf={}'.format(i, uf) +# if not os.path.exists(directory): +# os.makedirs(directory) + +#--------------------# +# tratamento +#--------------------# + +dfs1 = [] + +df_total = 0 + +for ano in list_anos: + df = pd.read_stata(path_dados + 'input/' + f'{ano}_merge.dta') + df['sigla_uf'] = df['uf'].map(id_uf) + df = df.rename(columns = list_rename) + dfs1.append(df) + for uf in ufs: + print("Particionando {}-{}".format(ano, uf)) + df_partition = df[df['sigla_uf'] == uf] + df_partition.drop(['sigla_uf', 'ano'], axis=1, inplace=True) + partition_path = path_dados + 'output/microdados_compatibilizado_datazoom/ano={}/sigla_uf={}/microdados_dz.csv'.format(ano,uf) + df_partition.to_csv(partition_path, index=False, encoding='utf-8', na_rep='', float_format= '%.0f') \ No newline at end of file diff --git a/bases/br_ibge_pnad/dataset_config.yaml b/bases/br_ibge_pnad/dataset_config.yaml new file mode 100644 index 000000000..560e3f114 --- /dev/null +++ b/bases/br_ibge_pnad/dataset_config.yaml @@ -0,0 +1,46 @@ + +# Qual organização disponibiliza os dados originais? +# Opções: escolher dessa lista -> https://basedosdados.org/api/3/action/organization_list +# Exemplos: br-ibge, br-tse, br-rj-gov +organization: br-ibge + +dataset_id: br_ibge_pnad + +# Título do conjunto, a ser exibido no mecanismo de busca. +# Exemplo: População brasileira +title: Pesquisa Nacional por Amostra de Domicílios (PNAD) + +# Descrição e anotações úteis sobre os dados. +description: Microdados da Pesquisa Nacional por Amostra de Domicílios - PNAD, compatibilizados pelo DataZoom. 
A PNAD investiga diversas características da população brasileira, tais como educação, trabalho, rendimento e composição domiciliar. + +# Quais temas caracterizam a base? +# Opções: escolher dessa lista -> https://basedosdados.org/api/3/action/group_list +# Importante: preencher com a chave, e não o valor. +groups: + - economia + - educacao + - saude + +# Quais etiquetas caracterizam a base? +# Opções: escolher dessa lista -> https://basedosdados.org/api/3/action/tag_list +# Exemplos: fertilidade, preco, desmatamento. +# Caso crie etiquetas novas, as regras são: +# - letras minúsculas +# - sem acentos +# - sempre no singular +# - não repita nomes de grupos (ex. educacao, saude, meio ambiente, economia, etc.) +tags: + - trabalho + - emprego + +# Url completa do CKAN já contendo o dataset-id +# Exemplo: https://basedosdados.org/dataset/ +ckan_url: + +# Url completa do Github já contendo o dataset_id +# Exemplo: https://github.com/basedosdados/mais/tree/master/bases/ +github_url: + +# Não altere esse campo. +# Data da última modificação dos metadados gerada automaticamente pelo CKAN. +metadata_modified: diff --git a/bases/br_ibge_pnad/dicionario/publish.sql b/bases/br_ibge_pnad/dicionario/publish.sql new file mode 100644 index 000000000..811b643fd --- /dev/null +++ b/bases/br_ibge_pnad/dicionario/publish.sql @@ -0,0 +1,27 @@ +/* +Query para publicar a tabela. + +Esse é o lugar para: + - modificar nomes, ordem e tipos de colunas + - dar join com outras tabelas + - criar colunas extras (e.g. logs, proporções, etc.) + +Qualquer coluna definida aqui deve também existir em `table_config.yaml`. + +# Além disso, sinta-se à vontade para alterar alguns nomes obscuros +# para algo um pouco mais explícito. + +TIPOS: + - Para modificar tipos de colunas, basta substituir STRING por outro tipo válido. 
+ - Exemplo: `SAFE_CAST(column_name AS NUMERIC) column_name` + - Mais detalhes: https://cloud.google.com/bigquery/docs/reference/standard-sql/data-types +*/ + +CREATE VIEW basedosdados-dev.br_ibge_pnad.dicionario AS +SELECT +SAFE_CAST(id_tabela AS STRING) id_tabela, +SAFE_CAST(nome_coluna AS STRING) nome_coluna, +SAFE_CAST(chave AS STRING) chave, +SAFE_CAST(cobertura_temporal AS STRING) cobertura_temporal, +SAFE_CAST(valor AS STRING) valor +FROM basedosdados-dev.br_ibge_pnad_staging.dicionario AS t \ No newline at end of file diff --git a/bases/br_ibge_pnad/dicionario/schema-prod.json b/bases/br_ibge_pnad/dicionario/schema-prod.json new file mode 100644 index 000000000..e0404396a --- /dev/null +++ b/bases/br_ibge_pnad/dicionario/schema-prod.json @@ -0,0 +1 @@ +[{"name": "id_tabela", "bigquery_type": "string", "description": "ID da tabela", "temporal_coverage": null, "covered_by_dictionary": null, "directory_column": {"dataset_id": null, "table_id": null, "column_name": null}, "measurement_unit": null, "has_sensitive_data": null, "is_in_staging": null, "is_partition": null, "type": "STRING", "mode": "NULLABLE"}, {"name": "nome_coluna", "bigquery_type": "string", "description": "Nome da coluna", "temporal_coverage": null, "covered_by_dictionary": null, "directory_column": {"dataset_id": null, "table_id": null, "column_name": null}, "measurement_unit": null, "has_sensitive_data": null, "is_in_staging": null, "is_partition": null, "type": "STRING", "mode": "NULLABLE"}, {"name": "chave", "bigquery_type": "string", "description": "Chave", "temporal_coverage": null, "covered_by_dictionary": null, "directory_column": {"dataset_id": null, "table_id": null, "column_name": null}, "measurement_unit": null, "has_sensitive_data": null, "is_in_staging": null, "is_partition": null, "type": "STRING", "mode": "NULLABLE"}, {"name": "cobertura_temporal", "bigquery_type": "string", "description": "Cobertura temporal", "temporal_coverage": null, "covered_by_dictionary": null, "directory_column": {"dataset_id": null, "table_id": null, "column_name": null}, "measurement_unit": null, "has_sensitive_data": null, "is_in_staging": null, "is_partition": null, "type": "STRING", "mode": "NULLABLE"}, {"name": "valor", "bigquery_type": "string", "description": "Valor", "temporal_coverage": null, "covered_by_dictionary": null, "directory_column": {"dataset_id": null, "table_id": null, "column_name": null}, "measurement_unit": null, "has_sensitive_data": null, "is_in_staging": null, "is_partition": null, "type": "STRING", "mode": "NULLABLE"}] \ No newline at end of file diff --git a/bases/br_ibge_pnad/dicionario/schema-staging.json b/bases/br_ibge_pnad/dicionario/schema-staging.json new file mode 100644 index 000000000..8cb580b98 --- /dev/null +++ b/bases/br_ibge_pnad/dicionario/schema-staging.json @@ -0,0 +1 @@ +[{"name": "id_tabela", "bigquery_type": null, "description": null, "temporal_coverage": null, "covered_by_dictionary": null, "directory_column": {"dataset_id": null, "table_id": null, "column_name": null}, "measurement_unit": null, "has_sensitive_data": null, "is_in_staging": null, "is_partition": null, "type": "STRING"}, {"name": "nome_coluna", "bigquery_type": null, "description": null, "temporal_coverage": null, "covered_by_dictionary": null, "directory_column": {"dataset_id": null, "table_id": null, "column_name": null}, "measurement_unit": null, "has_sensitive_data": null, "is_in_staging": null, "is_partition": null, "type": "STRING"}, {"name": "chave", "bigquery_type": null, "description": null, 
"temporal_coverage": null, "covered_by_dictionary": null, "directory_column": {"dataset_id": null, "table_id": null, "column_name": null}, "measurement_unit": null, "has_sensitive_data": null, "is_in_staging": null, "is_partition": null, "type": "STRING"}, {"name": "cobertura_temporal", "bigquery_type": null, "description": null, "temporal_coverage": null, "covered_by_dictionary": null, "directory_column": {"dataset_id": null, "table_id": null, "column_name": null}, "measurement_unit": null, "has_sensitive_data": null, "is_in_staging": null, "is_partition": null, "type": "STRING"}, {"name": "valor", "bigquery_type": null, "description": null, "temporal_coverage": null, "covered_by_dictionary": null, "directory_column": {"dataset_id": null, "table_id": null, "column_name": null}, "measurement_unit": null, "has_sensitive_data": null, "is_in_staging": null, "is_partition": null, "type": "STRING"}] \ No newline at end of file diff --git a/bases/br_ibge_pnad/dicionario/table_config.yaml b/bases/br_ibge_pnad/dicionario/table_config.yaml new file mode 100644 index 000000000..5f5ce79d0 --- /dev/null +++ b/bases/br_ibge_pnad/dicionario/table_config.yaml @@ -0,0 +1,171 @@ + +dataset_id: br_ibge_pnad + +table_id: dicionario + +# Descreva a tabela. Essas são as primeiras frases que um usuário vai ver. +# Você não precisa ser muito conciso. Sinta-se a vontade para dar exemplos de +# como usar os dados. +# Se souber, liste também aplicações: pesquisa, apps, etc. que usem os dados., +description: Dicionário + +# A máxima unidade espacial que a tabela cobre. +spatial_coverage: + +# Anos cobertos pela tabela. +# Preencher como lista de intervalos. +# Exemplo: 1995(1)2019. +temporal_coverage: + +# A unidade temporal com qual a tabela é atualizada. +# Opções em 'https://basedosdados.org/api/3/action/bd_available_options' +update_frequency: + +# Entidade representada por cada linha. +# Opções em 'https://basedosdados.org/api/3/action/bd_available_options' +entity: + +# A unidade temporal representada por cada linha. +# Opções em 'https://basedosdados.org/api/3/action/bd_available_options' +time_unit: + +# O conjunto mínimo de colunas identificando cada linha unicamente. +# Preencha com os nomes de colunas. +# Exemplos: id_municipio, ano. +# Pode ser vazio pois certas tabelas não possuem identificadores. +identifying_columns: + +last_updated: + metadata: + data: + release: + +# Versão da tabela. Seguindo o padrão de semantic versioning. +# Exemplo: v1.1.3 +version: + +# Quem está preenchendo esses metadados? +published_by: + name: Crislane Alves + email: crislanealves@basedosdados.org + github_user: crislanealves + website: + ckan_user: crislanealves + +# Qual organização/departamento/pessoa tratou os dados? +# As vezes há um ponto intermediário entre os dados originais e subir na Base dos Dados. +# Se essa pessoa é você, preencha abaixo com suas informações. +data_cleaned_by: + name: Crislane Alves + email: crislanealves@basedosdados.org + github_user: crislanealves + ckan_user: crislanealves + website: + code_url: + +# Se houve passos de tratamento, limpeza e manipulação de dados, descreva-os aqui. +data_cleaning_description: + +# Url dos dados originais no GCP Storage. +raw_files_url: + +# Url dos arquivos auxiliares no GCP Storage. +auxiliary_files_url: + +# Url da tabela de arquitetura no GCP Storage. +architecture_url: + +# A tabela tem colunas que precisam de dicionário? +# Opções: yes, no. 
+covered_by_dictionary: + +source_bucket_name: basedosdados-dev + +project_id_prod: basedosdados-dev + +project_id_staging: basedosdados-dev + +# Liste as colunas da tabela que representam partições. +# Não esqueça de deletar essas colunas nas tabelas .csv na hora de subir para o BigQuery. +# Isso poupará muito tempo e dinheiro às pessoas utilizando essa tabela. +# Se não houver partições, não modifique abaixo. +partitions: + +bdm_file_size: + +# Quais são as colunas? Certifique-se de escrever uma boa descrição, as pessoas vão gostar +# para saber sobre o que é a coluna. +# Adicionar todas as colunas manualmente pode ser bastante cansativo, por isso, quando +# inicializando este arquivo de configuração, você pode apontar a função para uma amostra de dados que +# preencherá automaticamente as colunas. +# Algumas colunas existirão apenas na tabela final, você as construirá em `publish.sql`. +# Para esses, defina is_in_staging como False. +# Além disso, você deve adicionar as colunas de partição aqui e definir is_partition como True. +columns: + - name: id_tabela + bigquery_type: string + description: ID da tabela + temporal_coverage: + covered_by_dictionary: + directory_column: + dataset_id: + table_id: + column_name: + measurement_unit: + has_sensitive_data: + is_in_staging: + is_partition: + - name: nome_coluna + bigquery_type: string + description: Nome da coluna + temporal_coverage: + covered_by_dictionary: + directory_column: + dataset_id: + table_id: + column_name: + measurement_unit: + has_sensitive_data: + is_in_staging: + is_partition: + - name: chave + bigquery_type: string + description: Chave + temporal_coverage: + covered_by_dictionary: + directory_column: + dataset_id: + table_id: + column_name: + measurement_unit: + has_sensitive_data: + is_in_staging: + is_partition: + - name: cobertura_temporal + bigquery_type: string + description: Cobertura temporal + temporal_coverage: + covered_by_dictionary: + directory_column: + dataset_id: + table_id: + column_name: + measurement_unit: + has_sensitive_data: + is_in_staging: + is_partition: + - name: valor + bigquery_type: string + description: Valor + temporal_coverage: + covered_by_dictionary: + directory_column: + dataset_id: + table_id: + column_name: + measurement_unit: + has_sensitive_data: + is_in_staging: + is_partition: + +metadata_modified: '2021-12-31T16:45:48.096008' diff --git a/bases/br_ibge_pnad/dicionario/table_description.txt b/bases/br_ibge_pnad/dicionario/table_description.txt new file mode 100644 index 000000000..7e7efe1ac --- /dev/null +++ b/bases/br_ibge_pnad/dicionario/table_description.txt @@ -0,0 +1,44 @@ +Dicionário + +Para saber mais acesse: +Website: +Github: + +Ajude a manter o projeto :) +Apoia-se: https://apoia.se/basedosdados + +Publicado por +------------- +Nome: Crislane Alves +Código: +Email: crislanealves@basedosdados.org +Tratado por +----------- +Nome: Crislane Alves +Email: crislanealves@basedosdados.org + + + + +Colunas identificando linhas unicamente +------------------- + + + + +Cobertura Temporal +------------------ + + + + +Cobertura Espacial +------------------ + + + + + + + + diff --git a/bases/br_ibge_pnad/microdados_compatibilizado_datazoom/publish.sql b/bases/br_ibge_pnad/microdados_compatibilizado_datazoom/publish.sql new file mode 100644 index 000000000..a2239c1df --- /dev/null +++ b/bases/br_ibge_pnad/microdados_compatibilizado_datazoom/publish.sql @@ -0,0 +1,125 @@ +/* +Query para publicar a tabela. 
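+
+Depois de publicada, a view pode ser consultada diretamente no BigQuery. Um
+exemplo hipotético de consulta agregada (colunas conforme definidas abaixo):
+
+  SELECT ano, sigla_uf, AVG(renda_mensal_domiciliar) AS renda_media
+  FROM `basedosdados-dev.br_ibge_pnad.microdados_compatibilizado_datazoom`
+  GROUP BY ano, sigla_uf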
+ +Esse é o lugar para: + - modificar nomes, ordem e tipos de colunas + - dar join com outras tabelas + - criar colunas extras (e.g. logs, proporções, etc.) + +Qualquer coluna definida aqui deve também existir em `table_config.yaml`. + +# Além disso, sinta-se à vontade para alterar alguns nomes obscuros +# para algo um pouco mais explícito. + +TIPOS: + - Para modificar tipos de colunas, basta substituir STRING por outro tipo válido. + - Exemplo: `SAFE_CAST(column_name AS NUMERIC) column_name` + - Mais detalhes: https://cloud.google.com/bigquery/docs/reference/standard-sql/data-types +*/ + +CREATE VIEW basedosdados-dev.br_ibge_pnad.microdados_compatibilizado_datazoom AS +SELECT +SAFE_CAST(ano AS INT64) ano, +SAFE_CAST(sigla_uf AS STRING) sigla_uf, +SAFE_CAST(id_uf AS STRING) id_uf, +SAFE_CAST(id_regiao AS STRING) id_regiao, +SAFE_CAST(id_regiao_metropolitana AS STRING) id_regiao_metropolitana, +SAFE_CAST(id_domicilio AS STRING) id_domicilio, +SAFE_CAST(zona_urbana AS STRING) zona_urbana, +SAFE_CAST(tipo_zona_domicilio AS STRING) tipo_zona_domicilio, +SAFE_CAST(total_pessoas AS INT64) total_pessoas, +SAFE_CAST(total_pessoas_10_mais AS INT64) total_pessoas_10_mais, +SAFE_CAST(especie_domicilio AS STRING) especie_domicilio, +SAFE_CAST(tipo_domicilio AS STRING) tipo_domicilio, +SAFE_CAST(tipo_parede AS STRING) tipo_parede, +SAFE_CAST(tipo_cobertura AS STRING) tipo_cobertura, +SAFE_CAST(possui_agua_rede AS STRING) possui_agua_rede, +SAFE_CAST(tipo_esgoto AS STRING) tipo_esgoto, +SAFE_CAST(possui_sanitario_exclusivo AS STRING) possui_sanitario_exclusivo, +SAFE_CAST(lixo_coletado AS STRING) lixo_coletado, +SAFE_CAST(possui_iluminacao_eletrica AS STRING) possui_iluminacao_eletrica, +SAFE_CAST(quantidade_comodos AS INT64) quantidade_comodos, +SAFE_CAST(quantidade_dormitorios AS INT64) quantidade_dormitorios, +SAFE_CAST(possui_sanitario AS STRING) possui_sanitario, +SAFE_CAST(posse_domocilio AS STRING) posse_domocilio, +SAFE_CAST(possui_filtro AS STRING) possui_filtro, +SAFE_CAST(possui_fogao AS STRING) possui_fogao, +SAFE_CAST(possui_geladeira AS STRING) possui_geladeira, +SAFE_CAST(possui_radio AS STRING) possui_radio, +SAFE_CAST(possui_tv AS STRING) possui_tv, +SAFE_CAST(renda_mensal_domiciliar AS FLOAT64) renda_mensal_domiciliar, +SAFE_CAST(renda_mensal_domiciliar_compativel_1992 AS FLOAT64) renda_mensal_domiciliar_compativel_1992, +SAFE_CAST(peso_amostral AS FLOAT64) peso_amostral, +SAFE_CAST(valor_aluguel_pago AS FLOAT64) valor_aluguel_pago, +SAFE_CAST(prestacao AS FLOAT64) prestacao, +SAFE_CAST(deflator AS INT64) deflator, +SAFE_CAST(conversor_moeda AS INT64) conversor_moeda, +SAFE_CAST(renda_domicilio_deflacionada AS FLOAT64) renda_domicilio_deflacionada, +SAFE_CAST(renda_mensal_domiciliar_compativel_1992_deflacionada AS FLOAT64) renda_mensal_domiciliar_compativel_1992_deflacionada, +SAFE_CAST(aluguel_deflacionado AS FLOAT64) aluguel_deflacionado, +SAFE_CAST(prestacao_deflacionado AS FLOAT64) prestacao_deflacionado, +SAFE_CAST(numero_familia AS INT64) numero_familia, +SAFE_CAST(sexo AS STRING) sexo, +SAFE_CAST(condicao_domicilio AS STRING) condicao_domicilio, +SAFE_CAST(condicao_familia AS STRING) condicao_familia, +SAFE_CAST(dia_nascimento AS INT64) dia_nascimento, +SAFE_CAST(mes_nascimento AS INT64) mes_nascimento, +SAFE_CAST(ano_nascimento AS INT64) ano_nascimento, +SAFE_CAST(idade AS INT64) idade, +SAFE_CAST(sabe_ler_escrever AS STRING) sabe_ler_escrever, +SAFE_CAST(serie_frequentada AS STRING) serie_frequentada, +SAFE_CAST(grau_frequentada AS STRING) grau_frequentada, 
+SAFE_CAST(ultima_seria_frequentada AS INT64) ultima_seria_frequentada, +SAFE_CAST(ultimo_grau_frequentado AS STRING) ultimo_grau_frequentado, +SAFE_CAST(tinha_outro_trabalho AS STRING) tinha_outro_trabalho, +SAFE_CAST(ocupacao_semana AS INT64) ocupacao_semana, +SAFE_CAST(atividade_ramo_negocio_semana AS INT64) atividade_ramo_negocio_semana, +SAFE_CAST(possui_carteira_assinada AS STRING) possui_carteira_assinada, +SAFE_CAST(rendimento_mensal_dinheiro AS FLOAT64) rendimento_mensal_dinheiro, +SAFE_CAST(rendimento_mensal_produto_mercadoria AS FLOAT64) rendimento_mensal_produto_mercadoria, +SAFE_CAST(horas_trabalhadas_semana AS INT64) horas_trabalhadas_semana, +SAFE_CAST(rendimento_mensal_dinheiro_outra AS FLOAT64) rendimento_mensal_dinheiro_outra, +SAFE_CAST(rendimento_mensal_produto_outra AS FLOAT64) rendimento_mensal_produto_outra, +SAFE_CAST(horas_trabalhadas_outros_trabalhos AS INT64) horas_trabalhadas_outros_trabalhos, +SAFE_CAST(contribui_previdencia AS STRING) contribui_previdencia, +SAFE_CAST(tipo_instituto_previdencia AS STRING) tipo_instituto_previdencia, +SAFE_CAST(tomou_providencia_conseguir_trabalho_semana AS STRING) tomou_providencia_conseguir_trabalho_semana, +SAFE_CAST(tomou_providencia_ultimos_2_meses AS STRING) tomou_providencia_ultimos_2_meses, +SAFE_CAST(qual_medida AS STRING) qual_medida, +SAFE_CAST(tinha_carteira_assinada_ultimo_emprego AS STRING) tinha_carteira_assinada_ultimo_emprego, +SAFE_CAST(valor_aposentadoria AS FLOAT64) valor_aposentadoria, +SAFE_CAST(valor_pensao AS FLOAT64) valor_pensao, +SAFE_CAST(valor_abono_permanente AS FLOAT64) valor_abono_permanente, +SAFE_CAST(valor_aluguel AS FLOAT64) valor_aluguel, +SAFE_CAST(renda_outras AS FLOAT64) renda_outras, +SAFE_CAST(rendimento_mensal_ocupacao_principal AS FLOAT64) rendimento_mensal_ocupacao_principal, +SAFE_CAST(rendimento_mensal_todos_trabalhos AS FLOAT64) rendimento_mensal_todos_trabalhos, +SAFE_CAST(rendimento_mensal_todas_fontes AS FLOAT64) rendimento_mensal_todas_fontes, +SAFE_CAST(atividade_ramo_negocio_agregado AS STRING) atividade_ramo_negocio_agregado, +SAFE_CAST(horas_trabalhadas_todos_trabalhos AS INT64) horas_trabalhadas_todos_trabalhos, +SAFE_CAST(posicao_ocupacao AS STRING) posicao_ocupacao, +SAFE_CAST(grupos_ocupacao AS STRING) grupos_ocupacao, +SAFE_CAST(numero_membros_familia AS STRING) numero_membros_familia, +SAFE_CAST(rendimento_mensal_familia AS STRING) rendimento_mensal_familia, +SAFE_CAST(ordem AS INT64) ordem, +SAFE_CAST(raca_cor AS STRING) raca_cor, +SAFE_CAST(anos_estudo AS INT64) anos_estudo, +SAFE_CAST(frequenta_escola AS STRING) frequenta_escola, +SAFE_CAST(trabalhou_semana AS STRING) trabalhou_semana, +SAFE_CAST(tinha_trabalhado_semana AS STRING) tinha_trabalhado_semana, +SAFE_CAST(ocupacao_ano_anterior AS INT64) ocupacao_ano_anterior, +SAFE_CAST(atividade_ramo_negocio_anterior AS INT64) atividade_ramo_negocio_anterior, +SAFE_CAST(rendimento_mensal_dinheiro_deflacionado AS FLOAT64) rendimento_mensal_dinheiro_deflacionado, +SAFE_CAST(rendimento_mensal_produto_mercadoria_deflacionado AS FLOAT64) rendimento_mensal_produto_mercadoria_deflacionado, +SAFE_CAST(rendimento_mensal_dinheiro_outra_deflacionado AS FLOAT64) rendimento_mensal_dinheiro_outra_deflacionado, +SAFE_CAST(rendimento_mensal_produto_mercadoria_outra_deflacionado AS FLOAT64) rendimento_mensal_produto_mercadoria_outra_deflacionado, +SAFE_CAST(rendimento_mensal_ocupacao_principal_deflacionado AS FLOAT64) rendimento_mensal_ocupacao_principal_deflacionado, +SAFE_CAST(rendimento_mensal_todos_trabalhos_deflacionado AS FLOAT64)
rendimento_mensal_todos_trabalhos_deflacionado, +SAFE_CAST(rendimento_mensal_todas_fontes_deflacionado AS FLOAT64) rendimento_mensal_todas_fontes_deflacionado, +SAFE_CAST(rendimento_mensal_familia_deflacionado AS FLOAT64) rendimento_mensal_familia_deflacionado, +SAFE_CAST(valor_aposentadoria_deflacionado AS FLOAT64) valor_aposentadoria_deflacionado, +SAFE_CAST(valor_pensao_deflacionado AS FLOAT64) valor_pensao_deflacionado, +SAFE_CAST(valor_abono_deflacionado AS FLOAT64) valor_abono_deflacionado, +SAFE_CAST(valor_aluguel_deflacionado AS FLOAT64) valor_aluguel_deflacionado, +SAFE_CAST(rendas_outras_deflacionado AS FLOAT64) rendas_outras_deflacionado +FROM basedosdados-dev.br_ibge_pnad_staging.microdados_compatibilizado_datazoom AS t \ No newline at end of file diff --git a/bases/br_ibge_pnad/microdados_compatibilizado_datazoom/schema-prod.json b/bases/br_ibge_pnad/microdados_compatibilizado_datazoom/schema-prod.json new file mode 100644 index 000000000..6c200976d --- /dev/null +++ b/bases/br_ibge_pnad/microdados_compatibilizado_datazoom/schema-prod.json @@ -0,0 +1 @@ +[{"name": "ano", "bigquery_type": "int64", "description": "Ano da pesquisa", "temporal_coverage": null, "covered_by_dictionary": false, "directory_column": {"dataset_id": "br_bd_diretorios_data_tempo", "table_id": "ano", "column_name": "ano"}, "measurement_unit": "year", "has_sensitive_data": false, "is_in_staging": true, "is_partition": true, "type": "INTEGER", "mode": "NULLABLE"}, {"name": "sigla_uf", "bigquery_type": "string", "description": "Sigla da Unidade da Federa\u00e7\u00e3o", "temporal_coverage": null, "covered_by_dictionary": false, "directory_column": {"dataset_id": "br_bd_diretorios_brasil", "table_id": "uf", "column_name": "sigla"}, "measurement_unit": null, "has_sensitive_data": false, "is_in_staging": true, "is_partition": true, "type": "STRING", "mode": "NULLABLE"}, {"name": "id_regiao", "bigquery_type": "string", "description": "ID da Regi\u00e3o - IBGE", "temporal_coverage": null, "covered_by_dictionary": false, "directory_column": {"dataset_id": null, "table_id": null, "column_name": null}, "measurement_unit": null, "has_sensitive_data": false, "is_in_staging": true, "is_partition": false, "type": "STRING", "mode": "NULLABLE"}, {"name": "id_uf", "bigquery_type": "string", "description": "ID Unidade da Federa\u00e7\u00e3o - IBGE", "temporal_coverage": null, "covered_by_dictionary": false, "directory_column": {"dataset_id": null, "table_id": null, "column_name": null}, "measurement_unit": null, "has_sensitive_data": false, "is_in_staging": true, "is_partition": false, "type": "STRING", "mode": "NULLABLE"}, {"name": "id_domicilio", "bigquery_type": "string", "description": "N\u00famero de identifica\u00e7\u00e3o do domic\u00edlio", "temporal_coverage": null, "covered_by_dictionary": false, "directory_column": {"dataset_id": null, "table_id": null, "column_name": null}, "measurement_unit": null, "has_sensitive_data": false, "is_in_staging": true, "is_partition": false, "type": "STRING", "mode": "NULLABLE"}, {"name": "zona_urbana", "bigquery_type": "string", "description": "Zona Urbana", "temporal_coverage": null, "covered_by_dictionary": false, "directory_column": {"dataset_id": null, "table_id": null, "column_name": null}, "measurement_unit": null, "has_sensitive_data": false, "is_in_staging": true, "is_partition": false, "type": "STRING", "mode": "NULLABLE"}, {"name": "tipo_zona_domicilio", "bigquery_type": "string", "description": "Tipo de zona do domic\u00edlio", "temporal_coverage": null, 
"covered_by_dictionary": false, "directory_column": {"dataset_id": null, "table_id": null, "column_name": null}, "measurement_unit": null, "has_sensitive_data": false, "is_in_staging": true, "is_partition": false, "type": "STRING", "mode": "NULLABLE"}, {"name": "total_pessoas", "bigquery_type": "int64", "description": "Total de pessoas", "temporal_coverage": null, "covered_by_dictionary": true, "directory_column": {"dataset_id": null, "table_id": null, "column_name": null}, "measurement_unit": null, "has_sensitive_data": false, "is_in_staging": true, "is_partition": false, "type": "INTEGER", "mode": "NULLABLE"}, {"name": "total_pessoas_10_mais", "bigquery_type": "int64", "description": "Total de pessoas 10 anos ou mais", "temporal_coverage": null, "covered_by_dictionary": true, "directory_column": {"dataset_id": null, "table_id": null, "column_name": null}, "measurement_unit": "person", "has_sensitive_data": false, "is_in_staging": true, "is_partition": false, "type": "INTEGER", "mode": "NULLABLE"}, {"name": "especie_domicilio", "bigquery_type": "string", "description": "Esp\u00e9cie de domic\u00edlio", "temporal_coverage": null, "covered_by_dictionary": false, "directory_column": {"dataset_id": null, "table_id": null, "column_name": null}, "measurement_unit": null, "has_sensitive_data": false, "is_in_staging": true, "is_partition": false, "type": "STRING", "mode": "NULLABLE"}, {"name": "tipo_domicilio", "bigquery_type": "string", "description": "Tipo de domic\u00edlio", "temporal_coverage": null, "covered_by_dictionary": false, "directory_column": {"dataset_id": null, "table_id": null, "column_name": null}, "measurement_unit": null, "has_sensitive_data": false, "is_in_staging": true, "is_partition": false, "type": "STRING", "mode": "NULLABLE"}, {"name": "tipo_parede", "bigquery_type": "string", "description": "Material Predominante das paredes", "temporal_coverage": null, "covered_by_dictionary": false, "directory_column": {"dataset_id": null, "table_id": null, "column_name": null}, "measurement_unit": null, "has_sensitive_data": false, "is_in_staging": true, "is_partition": false, "type": "STRING", "mode": "NULLABLE"}, {"name": "tipo_cobertura", "bigquery_type": "string", "description": "Material Predominante no telhado", "temporal_coverage": null, "covered_by_dictionary": false, "directory_column": {"dataset_id": null, "table_id": null, "column_name": null}, "measurement_unit": null, "has_sensitive_data": false, "is_in_staging": true, "is_partition": false, "type": "STRING", "mode": "NULLABLE"}, {"name": "possui_agua_rede", "bigquery_type": "string", "description": "\u00c1gua prov\u00e9m de rede?", "temporal_coverage": null, "covered_by_dictionary": false, "directory_column": {"dataset_id": null, "table_id": null, "column_name": null}, "measurement_unit": null, "has_sensitive_data": false, "is_in_staging": true, "is_partition": false, "type": "STRING", "mode": "NULLABLE"}, {"name": "tipo_esgoto", "bigquery_type": "string", "description": "Esgotamento sanit\u00e1rio", "temporal_coverage": null, "covered_by_dictionary": false, "directory_column": {"dataset_id": null, "table_id": null, "column_name": null}, "measurement_unit": null, "has_sensitive_data": false, "is_in_staging": true, "is_partition": false, "type": "STRING", "mode": "NULLABLE"}, {"name": "possui_sanitario_exclusivo", "bigquery_type": "string", "description": "Sanit\u00e1rio exclusivo do domic\u00edlio?", "temporal_coverage": null, "covered_by_dictionary": false, "directory_column": {"dataset_id": null, "table_id": null, 
"column_name": null}, "measurement_unit": null, "has_sensitive_data": false, "is_in_staging": true, "is_partition": false, "type": "STRING", "mode": "NULLABLE"}, {"name": "lixo_coletado", "bigquery_type": "string", "description": "O lixo \u00e9 coletado?", "temporal_coverage": null, "covered_by_dictionary": false, "directory_column": {"dataset_id": null, "table_id": null, "column_name": null}, "measurement_unit": null, "has_sensitive_data": false, "is_in_staging": true, "is_partition": false, "type": "STRING", "mode": "NULLABLE"}, {"name": "possui_iluminacao_eletrica", "bigquery_type": "string", "description": "Possui ilumina\u00e7\u00e3o el\u00e9trica?", "temporal_coverage": null, "covered_by_dictionary": false, "directory_column": {"dataset_id": null, "table_id": null, "column_name": null}, "measurement_unit": null, "has_sensitive_data": false, "is_in_staging": true, "is_partition": false, "type": "STRING", "mode": "NULLABLE"}, {"name": "quantidade_comodos", "bigquery_type": "int64", "description": "Quantidade de c\u00f4modos", "temporal_coverage": null, "covered_by_dictionary": true, "directory_column": {"dataset_id": null, "table_id": null, "column_name": null}, "measurement_unit": null, "has_sensitive_data": false, "is_in_staging": true, "is_partition": false, "type": "INTEGER", "mode": "NULLABLE"}, {"name": "quantidade_dormitorios", "bigquery_type": "int64", "description": "Quantidade de c\u00f4modos servindo como dormit\u00f3rio", "temporal_coverage": null, "covered_by_dictionary": true, "directory_column": {"dataset_id": null, "table_id": null, "column_name": null}, "measurement_unit": null, "has_sensitive_data": false, "is_in_staging": true, "is_partition": false, "type": "INTEGER", "mode": "NULLABLE"}, {"name": "possui_sanitario", "bigquery_type": "string", "description": "Possui sanit\u00e1rio?", "temporal_coverage": null, "covered_by_dictionary": false, "directory_column": {"dataset_id": null, "table_id": null, "column_name": null}, "measurement_unit": null, "has_sensitive_data": false, "is_in_staging": true, "is_partition": false, "type": "STRING", "mode": "NULLABLE"}, {"name": "posse_domocilio", "bigquery_type": "string", "description": "Posse do domic\u00edlio", "temporal_coverage": null, "covered_by_dictionary": false, "directory_column": {"dataset_id": null, "table_id": null, "column_name": null}, "measurement_unit": null, "has_sensitive_data": false, "is_in_staging": true, "is_partition": false, "type": "STRING", "mode": "NULLABLE"}, {"name": "possui_filtro", "bigquery_type": "string", "description": "Possui filtro?", "temporal_coverage": null, "covered_by_dictionary": false, "directory_column": {"dataset_id": null, "table_id": null, "column_name": null}, "measurement_unit": null, "has_sensitive_data": false, "is_in_staging": true, "is_partition": false, "type": "STRING", "mode": "NULLABLE"}, {"name": "possui_fogao", "bigquery_type": "string", "description": "Possui fog\u00e3o?", "temporal_coverage": null, "covered_by_dictionary": false, "directory_column": {"dataset_id": null, "table_id": null, "column_name": null}, "measurement_unit": null, "has_sensitive_data": false, "is_in_staging": true, "is_partition": false, "type": "STRING", "mode": "NULLABLE"}, {"name": "possui_geladeira", "bigquery_type": "string", "description": "Possui geladeira?", "temporal_coverage": null, "covered_by_dictionary": false, "directory_column": {"dataset_id": null, "table_id": null, "column_name": null}, "measurement_unit": null, "has_sensitive_data": false, "is_in_staging": true, 
"is_partition": false, "type": "STRING", "mode": "NULLABLE"}, {"name": "possui_radio", "bigquery_type": "string", "description": "Possui r\u00e1dio?", "temporal_coverage": null, "covered_by_dictionary": false, "directory_column": {"dataset_id": null, "table_id": null, "column_name": null}, "measurement_unit": null, "has_sensitive_data": false, "is_in_staging": true, "is_partition": false, "type": "STRING", "mode": "NULLABLE"}, {"name": "possui_tv", "bigquery_type": "string", "description": "Possui televis\u00e3o?", "temporal_coverage": null, "covered_by_dictionary": false, "directory_column": {"dataset_id": null, "table_id": null, "column_name": null}, "measurement_unit": null, "has_sensitive_data": false, "is_in_staging": true, "is_partition": false, "type": "STRING", "mode": "NULLABLE"}, {"name": "renda_mensal_domiciliar", "bigquery_type": "float64", "description": "Renda mensal domiciliar", "temporal_coverage": null, "covered_by_dictionary": true, "directory_column": {"dataset_id": null, "table_id": null, "column_name": null}, "measurement_unit": null, "has_sensitive_data": false, "is_in_staging": true, "is_partition": false, "type": "FLOAT", "mode": "NULLABLE"}, {"name": "renda_mensal_domiciliar_compativel_1992", "bigquery_type": "float64", "description": "Renda mensal domiciliar compat\u00edvel com 1992", "temporal_coverage": null, "covered_by_dictionary": true, "directory_column": {"dataset_id": null, "table_id": null, "column_name": null}, "measurement_unit": null, "has_sensitive_data": false, "is_in_staging": true, "is_partition": false, "type": "FLOAT", "mode": "NULLABLE"}, {"name": "peso_amostral", "bigquery_type": "float64", "description": "Peso do domic\u00edlio", "temporal_coverage": null, "covered_by_dictionary": true, "directory_column": {"dataset_id": null, "table_id": null, "column_name": null}, "measurement_unit": null, "has_sensitive_data": false, "is_in_staging": true, "is_partition": false, "type": "FLOAT", "mode": "NULLABLE"}, {"name": "valor_aluguel_pago", "bigquery_type": "float64", "description": "Valor do aluguel", "temporal_coverage": null, "covered_by_dictionary": true, "directory_column": {"dataset_id": null, "table_id": null, "column_name": null}, "measurement_unit": "brl", "has_sensitive_data": false, "is_in_staging": true, "is_partition": false, "type": "FLOAT", "mode": "NULLABLE"}, {"name": "prestacao", "bigquery_type": "float64", "description": "Valor da presta\u00e7\u00e3o", "temporal_coverage": null, "covered_by_dictionary": true, "directory_column": {"dataset_id": null, "table_id": null, "column_name": null}, "measurement_unit": "brl", "has_sensitive_data": false, "is_in_staging": true, "is_partition": false, "type": "FLOAT", "mode": "NULLABLE"}, {"name": "deflator", "bigquery_type": "int64", "description": "Deflator (base outubro de 2012)", "temporal_coverage": null, "covered_by_dictionary": true, "directory_column": {"dataset_id": null, "table_id": null, "column_name": null}, "measurement_unit": null, "has_sensitive_data": false, "is_in_staging": true, "is_partition": false, "type": "INTEGER", "mode": "NULLABLE"}, {"name": "conversor_moeda", "bigquery_type": "int64", "description": "Conversor de Moeda", "temporal_coverage": null, "covered_by_dictionary": true, "directory_column": {"dataset_id": null, "table_id": null, "column_name": null}, "measurement_unit": null, "has_sensitive_data": false, "is_in_staging": true, "is_partition": false, "type": "INTEGER", "mode": "NULLABLE"}, {"name": "renda_domicilio_deflacionada", "bigquery_type": "float64", 
"description": "Renda do domic\u00edlio - Valor deflacionado", "temporal_coverage": null, "covered_by_dictionary": true, "directory_column": {"dataset_id": null, "table_id": null, "column_name": null}, "measurement_unit": "brl", "has_sensitive_data": false, "is_in_staging": true, "is_partition": false, "type": "FLOAT", "mode": "NULLABLE"}, {"name": "renda_mensal_domiciliar_compativel_1992_deflacionada", "bigquery_type": "float64", "description": "Renda mensal domiciliar compat\u00edvel com 1992 - Valor deflacionado", "temporal_coverage": null, "covered_by_dictionary": true, "directory_column": {"dataset_id": null, "table_id": null, "column_name": null}, "measurement_unit": "brl", "has_sensitive_data": false, "is_in_staging": true, "is_partition": false, "type": "FLOAT", "mode": "NULLABLE"}, {"name": "aluguel_deflacionado", "bigquery_type": "float64", "description": "Aluguel - Valor deflacionado", "temporal_coverage": null, "covered_by_dictionary": true, "directory_column": {"dataset_id": null, "table_id": null, "column_name": null}, "measurement_unit": "brl", "has_sensitive_data": false, "is_in_staging": true, "is_partition": false, "type": "FLOAT", "mode": "NULLABLE"}, {"name": "prestacao_deflacionado", "bigquery_type": "float64", "description": "Presta\u00e7\u00e3o - Valor deflacionado", "temporal_coverage": null, "covered_by_dictionary": true, "directory_column": {"dataset_id": null, "table_id": null, "column_name": null}, "measurement_unit": "brl", "has_sensitive_data": false, "is_in_staging": true, "is_partition": false, "type": "FLOAT", "mode": "NULLABLE"}, {"name": "numero_familia", "bigquery_type": "int64", "description": "N\u00famero da fam\u00edlia", "temporal_coverage": null, "covered_by_dictionary": true, "directory_column": {"dataset_id": null, "table_id": null, "column_name": null}, "measurement_unit": null, "has_sensitive_data": false, "is_in_staging": true, "is_partition": false, "type": "INTEGER", "mode": "NULLABLE"}, {"name": "id_regiao_metropolitana", "bigquery_type": "string", "description": "ID da Regi\u00e3o Metropolitana", "temporal_coverage": null, "covered_by_dictionary": false, "directory_column": {"dataset_id": null, "table_id": null, "column_name": null}, "measurement_unit": null, "has_sensitive_data": false, "is_in_staging": true, "is_partition": false, "type": "STRING", "mode": "NULLABLE"}, {"name": "sexo", "bigquery_type": "string", "description": "Sexo", "temporal_coverage": null, "covered_by_dictionary": false, "directory_column": {"dataset_id": null, "table_id": null, "column_name": null}, "measurement_unit": null, "has_sensitive_data": false, "is_in_staging": true, "is_partition": false, "type": "STRING", "mode": "NULLABLE"}, {"name": "condicao_domicilio", "bigquery_type": "string", "description": "Condi\u00e7\u00e3o no domic\u00edlio", "temporal_coverage": null, "covered_by_dictionary": false, "directory_column": {"dataset_id": null, "table_id": null, "column_name": null}, "measurement_unit": null, "has_sensitive_data": false, "is_in_staging": true, "is_partition": false, "type": "STRING", "mode": "NULLABLE"}, {"name": "condicao_familia", "bigquery_type": "string", "description": "Condi\u00e7\u00e3o na fam\u00edlia", "temporal_coverage": null, "covered_by_dictionary": false, "directory_column": {"dataset_id": null, "table_id": null, "column_name": null}, "measurement_unit": null, "has_sensitive_data": false, "is_in_staging": true, "is_partition": false, "type": "STRING", "mode": "NULLABLE"}, {"name": "dia_nascimento", "bigquery_type": "int64", 
"description": "Dia de nascimento", "temporal_coverage": null, "covered_by_dictionary": false, "directory_column": {"dataset_id": "br_bd_diretorios_data_tempo", "table_id": "dia", "column_name": "dia"}, "measurement_unit": "day", "has_sensitive_data": false, "is_in_staging": true, "is_partition": false, "type": "INTEGER", "mode": "NULLABLE"}, {"name": "mes_nascimento", "bigquery_type": "int64", "description": "M\u00eas de nascimento", "temporal_coverage": null, "covered_by_dictionary": false, "directory_column": {"dataset_id": "br_bd_diretorios_data_tempo", "table_id": "mes", "column_name": "mes"}, "measurement_unit": "month", "has_sensitive_data": false, "is_in_staging": true, "is_partition": false, "type": "INTEGER", "mode": "NULLABLE"}, {"name": "ano_nascimento", "bigquery_type": "int64", "description": "Ano de nascimento", "temporal_coverage": null, "covered_by_dictionary": false, "directory_column": {"dataset_id": "br_bd_diretorios_data_tempo", "table_id": "ano", "column_name": "ano"}, "measurement_unit": "year", "has_sensitive_data": false, "is_in_staging": true, "is_partition": false, "type": "INTEGER", "mode": "NULLABLE"}, {"name": "idade", "bigquery_type": "int64", "description": "Idade", "temporal_coverage": null, "covered_by_dictionary": true, "directory_column": {"dataset_id": null, "table_id": null, "column_name": null}, "measurement_unit": "year", "has_sensitive_data": false, "is_in_staging": true, "is_partition": false, "type": "INTEGER", "mode": "NULLABLE"}, {"name": "sabe_ler_escrever", "bigquery_type": "string", "description": "Sabe ler e escrever?", "temporal_coverage": null, "covered_by_dictionary": false, "directory_column": {"dataset_id": null, "table_id": null, "column_name": null}, "measurement_unit": null, "has_sensitive_data": false, "is_in_staging": true, "is_partition": false, "type": "STRING", "mode": "NULLABLE"}, {"name": "serie_frequentada", "bigquery_type": "string", "description": "S\u00e9rie que frequenta", "temporal_coverage": null, "covered_by_dictionary": false, "directory_column": {"dataset_id": null, "table_id": null, "column_name": null}, "measurement_unit": null, "has_sensitive_data": false, "is_in_staging": true, "is_partition": false, "type": "STRING", "mode": "NULLABLE"}, {"name": "grau_frequentada", "bigquery_type": "string", "description": "Grau que frequenta", "temporal_coverage": null, "covered_by_dictionary": false, "directory_column": {"dataset_id": null, "table_id": null, "column_name": null}, "measurement_unit": null, "has_sensitive_data": false, "is_in_staging": true, "is_partition": false, "type": "STRING", "mode": "NULLABLE"}, {"name": "ultima_seria_frequentada", "bigquery_type": "int64", "description": "\u00daltima s\u00e9rie frequentada (para quem n\u00e3o frequenta escola)", "temporal_coverage": null, "covered_by_dictionary": true, "directory_column": {"dataset_id": null, "table_id": null, "column_name": null}, "measurement_unit": null, "has_sensitive_data": false, "is_in_staging": true, "is_partition": false, "type": "INTEGER", "mode": "NULLABLE"}, {"name": "ultimo_grau_frequentado", "bigquery_type": "string", "description": "\u00daltimo grau frequentado (para quem n\u00e3o frequenta escola)", "temporal_coverage": null, "covered_by_dictionary": false, "directory_column": {"dataset_id": null, "table_id": null, "column_name": null}, "measurement_unit": null, "has_sensitive_data": false, "is_in_staging": true, "is_partition": false, "type": "STRING", "mode": "NULLABLE"}, {"name": "tinha_outro_trabalho", "bigquery_type": "string", 
"description": "Tinha outro trabalho na semana de refer\u00eancia", "temporal_coverage": null, "covered_by_dictionary": false, "directory_column": {"dataset_id": null, "table_id": null, "column_name": null}, "measurement_unit": null, "has_sensitive_data": false, "is_in_staging": true, "is_partition": false, "type": "STRING", "mode": "NULLABLE"}, {"name": "ocupacao_semana", "bigquery_type": "int64", "description": "Ocupa\u00e7\u00e3o na semana", "temporal_coverage": null, "covered_by_dictionary": true, "directory_column": {"dataset_id": null, "table_id": null, "column_name": null}, "measurement_unit": null, "has_sensitive_data": false, "is_in_staging": true, "is_partition": false, "type": "INTEGER", "mode": "NULLABLE"}, {"name": "atividade_ramo_negocio_semana", "bigquery_type": "int64", "description": "Atividade ou ramo do n\u00e9gocio na semana", "temporal_coverage": null, "covered_by_dictionary": true, "directory_column": {"dataset_id": null, "table_id": null, "column_name": null}, "measurement_unit": null, "has_sensitive_data": false, "is_in_staging": true, "is_partition": false, "type": "INTEGER", "mode": "NULLABLE"}, {"name": "possui_carteira_assinada", "bigquery_type": "string", "description": "Tem carteira de trabalho assinada", "temporal_coverage": null, "covered_by_dictionary": false, "directory_column": {"dataset_id": null, "table_id": null, "column_name": null}, "measurement_unit": null, "has_sensitive_data": false, "is_in_staging": true, "is_partition": false, "type": "STRING", "mode": "NULLABLE"}, {"name": "rendimento_mensal_dinheiro", "bigquery_type": "float64", "description": "Rendimento mensal em dinheiro", "temporal_coverage": null, "covered_by_dictionary": false, "directory_column": {"dataset_id": null, "table_id": null, "column_name": null}, "measurement_unit": "brl", "has_sensitive_data": false, "is_in_staging": true, "is_partition": false, "type": "FLOAT", "mode": "NULLABLE"}, {"name": "rendimento_mensal_produto_mercadoria", "bigquery_type": "float64", "description": "Rendimento mensal em produtos ou mercadorias", "temporal_coverage": null, "covered_by_dictionary": false, "directory_column": {"dataset_id": null, "table_id": null, "column_name": null}, "measurement_unit": "brl", "has_sensitive_data": false, "is_in_staging": true, "is_partition": false, "type": "FLOAT", "mode": "NULLABLE"}, {"name": "horas_trabalhadas_semana", "bigquery_type": "int64", "description": "Horas normalmente trabalhadas na semana - Ocupa\u00e7\u00e3o principal", "temporal_coverage": null, "covered_by_dictionary": false, "directory_column": {"dataset_id": null, "table_id": null, "column_name": null}, "measurement_unit": "hour", "has_sensitive_data": false, "is_in_staging": true, "is_partition": false, "type": "INTEGER", "mode": "NULLABLE"}, {"name": "rendimento_mensal_dinheiro_outra", "bigquery_type": "float64", "description": "Rendimento em mensal dinheiro Outra", "temporal_coverage": null, "covered_by_dictionary": true, "directory_column": {"dataset_id": null, "table_id": null, "column_name": null}, "measurement_unit": "brl", "has_sensitive_data": false, "is_in_staging": true, "is_partition": false, "type": "FLOAT", "mode": "NULLABLE"}, {"name": "rendimento_mensal_produto_outra", "bigquery_type": "float64", "description": "Rendimento mensal em produtos ou mercadorias outras", "temporal_coverage": null, "covered_by_dictionary": true, "directory_column": {"dataset_id": null, "table_id": null, "column_name": null}, "measurement_unit": "brl", "has_sensitive_data": false, "is_in_staging": true, 
"is_partition": false, "type": "FLOAT", "mode": "NULLABLE"}, {"name": "horas_trabalhadas_outros_trabalhos", "bigquery_type": "int64", "description": "Horas normalmente trabalhadas na semana - Outros trabalhos", "temporal_coverage": null, "covered_by_dictionary": false, "directory_column": {"dataset_id": null, "table_id": null, "column_name": null}, "measurement_unit": "hour", "has_sensitive_data": false, "is_in_staging": true, "is_partition": false, "type": "INTEGER", "mode": "NULLABLE"}, {"name": "contribui_previdencia", "bigquery_type": "string", "description": "Contribui para instituto de previd\u00eancia", "temporal_coverage": null, "covered_by_dictionary": false, "directory_column": {"dataset_id": null, "table_id": null, "column_name": null}, "measurement_unit": null, "has_sensitive_data": false, "is_in_staging": true, "is_partition": false, "type": "STRING", "mode": "NULLABLE"}, {"name": "tipo_instituto_previdencia", "bigquery_type": "string", "description": "Tipo de instituto de previd\u00eancia", "temporal_coverage": null, "covered_by_dictionary": false, "directory_column": {"dataset_id": null, "table_id": null, "column_name": null}, "measurement_unit": null, "has_sensitive_data": false, "is_in_staging": true, "is_partition": false, "type": "STRING", "mode": "NULLABLE"}, {"name": "tomou_providencia_conseguir_trabalho_semana", "bigquery_type": null, "description": "Tomou provid\u00eancia para conseguir trabalho na semana", "temporal_coverage": null, "covered_by_dictionary": null, "directory_column": {"dataset_id": null, "table_id": null, "column_name": null}, "measurement_unit": null, "has_sensitive_data": false, "is_in_staging": true, "is_partition": false, "type": "STRING", "mode": "NULLABLE"}, {"name": "tomou_providencia_ultimos_2_meses", "bigquery_type": null, "description": "Tomou provid\u00eancia nos \u00faltimos 2 meses para conseguir trabalho", "temporal_coverage": null, "covered_by_dictionary": null, "directory_column": {"dataset_id": null, "table_id": null, "column_name": null}, "measurement_unit": null, "has_sensitive_data": false, "is_in_staging": true, "is_partition": false, "type": "STRING", "mode": "NULLABLE"}, {"name": "qual_medida", "bigquery_type": "string", "description": "O que foi tomado de medida para encontrar um emprego", "temporal_coverage": null, "covered_by_dictionary": false, "directory_column": {"dataset_id": null, "table_id": null, "column_name": null}, "measurement_unit": null, "has_sensitive_data": false, "is_in_staging": true, "is_partition": false, "type": "STRING", "mode": "NULLABLE"}, {"name": "tinha_carteira_assinada_ultimo_emprego", "bigquery_type": null, "description": "Tinha carteira assinada no \u00faltimo emprego que teve no ano", "temporal_coverage": null, "covered_by_dictionary": null, "directory_column": {"dataset_id": null, "table_id": null, "column_name": null}, "measurement_unit": null, "has_sensitive_data": false, "is_in_staging": true, "is_partition": false, "type": "STRING", "mode": "NULLABLE"}, {"name": "valor_aposentadoria", "bigquery_type": "float64", "description": "Valor da Aposentadoria", "temporal_coverage": null, "covered_by_dictionary": true, "directory_column": {"dataset_id": null, "table_id": null, "column_name": null}, "measurement_unit": "brl", "has_sensitive_data": false, "is_in_staging": true, "is_partition": false, "type": "FLOAT", "mode": "NULLABLE"}, {"name": "valor_pensao", "bigquery_type": "float64", "description": "Valor da Pens\u00e3o", "temporal_coverage": null, "covered_by_dictionary": true, 
"directory_column": {"dataset_id": null, "table_id": null, "column_name": null}, "measurement_unit": "brl", "has_sensitive_data": false, "is_in_staging": true, "is_partition": false, "type": "FLOAT", "mode": "NULLABLE"}, {"name": "valor_abono_permanente", "bigquery_type": "float64", "description": "Valor do b\u00f4nus do sal\u00e1rio permanente", "temporal_coverage": null, "covered_by_dictionary": true, "directory_column": {"dataset_id": null, "table_id": null, "column_name": null}, "measurement_unit": "brl", "has_sensitive_data": false, "is_in_staging": true, "is_partition": false, "type": "FLOAT", "mode": "NULLABLE"}, {"name": "valor_aluguel", "bigquery_type": "float64", "description": "Valor do Aluguel recebido", "temporal_coverage": null, "covered_by_dictionary": true, "directory_column": {"dataset_id": null, "table_id": null, "column_name": null}, "measurement_unit": "brl", "has_sensitive_data": false, "is_in_staging": true, "is_partition": false, "type": "FLOAT", "mode": "NULLABLE"}, {"name": "renda_outras", "bigquery_type": "float64", "description": "Valor de outras rendas", "temporal_coverage": null, "covered_by_dictionary": true, "directory_column": {"dataset_id": null, "table_id": null, "column_name": null}, "measurement_unit": "brl", "has_sensitive_data": false, "is_in_staging": true, "is_partition": false, "type": "FLOAT", "mode": "NULLABLE"}, {"name": "rendimento_mensal_ocupacao_principal", "bigquery_type": "float64", "description": "Rendimento mensal ocupa\u00e7\u00e3o principal", "temporal_coverage": null, "covered_by_dictionary": true, "directory_column": {"dataset_id": null, "table_id": null, "column_name": null}, "measurement_unit": "brl", "has_sensitive_data": false, "is_in_staging": true, "is_partition": false, "type": "FLOAT", "mode": "NULLABLE"}, {"name": "rendimento_mensal_todos_trabalhos", "bigquery_type": "float64", "description": "Rendimento mensal todos trabalhos", "temporal_coverage": null, "covered_by_dictionary": true, "directory_column": {"dataset_id": null, "table_id": null, "column_name": null}, "measurement_unit": "brl", "has_sensitive_data": false, "is_in_staging": true, "is_partition": false, "type": "FLOAT", "mode": "NULLABLE"}, {"name": "rendimento_mensal_todas_fontes", "bigquery_type": "float64", "description": "Rendimento mensal todas fontes", "temporal_coverage": null, "covered_by_dictionary": true, "directory_column": {"dataset_id": null, "table_id": null, "column_name": null}, "measurement_unit": "brl", "has_sensitive_data": false, "is_in_staging": true, "is_partition": false, "type": "FLOAT", "mode": "NULLABLE"}, {"name": "atividade_ramo_negocio_agregado", "bigquery_type": "string", "description": "Atividade ou ramo do n\u00e9gocio na semana - Agregado", "temporal_coverage": null, "covered_by_dictionary": false, "directory_column": {"dataset_id": null, "table_id": null, "column_name": null}, "measurement_unit": null, "has_sensitive_data": false, "is_in_staging": true, "is_partition": false, "type": "STRING", "mode": "NULLABLE"}, {"name": "horas_trabalhadas_todos_trabalhos", "bigquery_type": "int64", "description": "Horas normalmente trabalhadas na semana em todos os trabalhos", "temporal_coverage": null, "covered_by_dictionary": false, "directory_column": {"dataset_id": null, "table_id": null, "column_name": null}, "measurement_unit": "hour", "has_sensitive_data": false, "is_in_staging": true, "is_partition": false, "type": "INTEGER", "mode": "NULLABLE"}, {"name": "posicao_ocupacao", "bigquery_type": null, "description": "Posi\u00e7\u00e3o de 
ocupa\u00e7\u00e3o na semana", "temporal_coverage": null, "covered_by_dictionary": null, "directory_column": {"dataset_id": null, "table_id": null, "column_name": null}, "measurement_unit": null, "has_sensitive_data": false, "is_in_staging": true, "is_partition": false, "type": "STRING", "mode": "NULLABLE"}, {"name": "grupos_ocupacao", "bigquery_type": "string", "description": "Grupos de ocupa\u00e7\u00e3o na semana", "temporal_coverage": null, "covered_by_dictionary": false, "directory_column": {"dataset_id": null, "table_id": null, "column_name": null}, "measurement_unit": null, "has_sensitive_data": false, "is_in_staging": true, "is_partition": false, "type": "STRING", "mode": "NULLABLE"}, {"name": "numero_membros_familia", "bigquery_type": null, "description": "N\u00famero de membros da fam\u00edlia", "temporal_coverage": null, "covered_by_dictionary": null, "directory_column": {"dataset_id": null, "table_id": null, "column_name": null}, "measurement_unit": "person", "has_sensitive_data": false, "is_in_staging": true, "is_partition": false, "type": "STRING", "mode": "NULLABLE"}, {"name": "rendimento_mensal_familia", "bigquery_type": null, "description": "Rendimento mensal da fam\u00edlia", "temporal_coverage": null, "covered_by_dictionary": null, "directory_column": {"dataset_id": null, "table_id": null, "column_name": null}, "measurement_unit": "brl", "has_sensitive_data": false, "is_in_staging": true, "is_partition": false, "type": "STRING", "mode": "NULLABLE"}, {"name": "ordem", "bigquery_type": "int64", "description": "N\u00famero de ordem da pessoa", "temporal_coverage": null, "covered_by_dictionary": false, "directory_column": {"dataset_id": null, "table_id": null, "column_name": null}, "measurement_unit": null, "has_sensitive_data": false, "is_in_staging": true, "is_partition": false, "type": "INTEGER", "mode": "NULLABLE"}, {"name": "raca_cor", "bigquery_type": "string", "description": "Ra\u00e7a ou Cor (autodeclara\u00e7\u00e3o)", "temporal_coverage": null, "covered_by_dictionary": false, "directory_column": {"dataset_id": null, "table_id": null, "column_name": null}, "measurement_unit": null, "has_sensitive_data": false, "is_in_staging": true, "is_partition": false, "type": "STRING", "mode": "NULLABLE"}, {"name": "anos_estudo", "bigquery_type": "int64", "description": "Anos de estudo", "temporal_coverage": null, "covered_by_dictionary": true, "directory_column": {"dataset_id": null, "table_id": null, "column_name": null}, "measurement_unit": "year", "has_sensitive_data": false, "is_in_staging": true, "is_partition": false, "type": "INTEGER", "mode": "NULLABLE"}, {"name": "frequenta_escola", "bigquery_type": "string", "description": "Frequenta a escola?", "temporal_coverage": null, "covered_by_dictionary": false, "directory_column": {"dataset_id": null, "table_id": null, "column_name": null}, "measurement_unit": null, "has_sensitive_data": false, "is_in_staging": true, "is_partition": false, "type": "STRING", "mode": "NULLABLE"}, {"name": "trabalhou_semana", "bigquery_type": "string", "description": "Trabalhou na semana", "temporal_coverage": null, "covered_by_dictionary": false, "directory_column": {"dataset_id": null, "table_id": null, "column_name": null}, "measurement_unit": null, "has_sensitive_data": false, "is_in_staging": true, "is_partition": false, "type": "STRING", "mode": "NULLABLE"}, {"name": "tinha_trabalhado_semana", "bigquery_type": "string", "description": "Tinha trabalhado na semana?", "temporal_coverage": null, "covered_by_dictionary": false, 
"directory_column": {"dataset_id": null, "table_id": null, "column_name": null}, "measurement_unit": null, "has_sensitive_data": false, "is_in_staging": true, "is_partition": false, "type": "STRING", "mode": "NULLABLE"}, {"name": "ocupacao_ano_anterior", "bigquery_type": "int64", "description": "Ocupa\u00e7\u00e3o no ano anterior", "temporal_coverage": null, "covered_by_dictionary": false, "directory_column": {"dataset_id": null, "table_id": null, "column_name": null}, "measurement_unit": null, "has_sensitive_data": false, "is_in_staging": true, "is_partition": false, "type": "INTEGER", "mode": "NULLABLE"}, {"name": "atividade_ramo_negocio_anterior", "bigquery_type": "int64", "description": "Atividade ou ramo do neg\u00f3cio anterior", "temporal_coverage": null, "covered_by_dictionary": false, "directory_column": {"dataset_id": null, "table_id": null, "column_name": null}, "measurement_unit": null, "has_sensitive_data": false, "is_in_staging": true, "is_partition": false, "type": "INTEGER", "mode": "NULLABLE"}, {"name": "rendimento_mensal_dinheiro_deflacionado", "bigquery_type": "float64", "description": "Rendimento mensal em dinheiro - Valor deflacionado", "temporal_coverage": null, "covered_by_dictionary": false, "directory_column": {"dataset_id": null, "table_id": null, "column_name": null}, "measurement_unit": "brl", "has_sensitive_data": false, "is_in_staging": true, "is_partition": false, "type": "FLOAT", "mode": "NULLABLE"}, {"name": "rendimento_mensal_produto_mercadoria_deflacionado", "bigquery_type": "float64", "description": "Rendimento mensal em produto ou mercadoria - Valor deflacionado", "temporal_coverage": null, "covered_by_dictionary": false, "directory_column": {"dataset_id": null, "table_id": null, "column_name": null}, "measurement_unit": "brl", "has_sensitive_data": false, "is_in_staging": true, "is_partition": false, "type": "FLOAT", "mode": "NULLABLE"}, {"name": "rendimento_mensal_dinheiro_outra_deflacionado", "bigquery_type": "float64", "description": "Rendimento mensal em dinheiro outra - Valor deflacionado", "temporal_coverage": null, "covered_by_dictionary": false, "directory_column": {"dataset_id": null, "table_id": null, "column_name": null}, "measurement_unit": "brl", "has_sensitive_data": false, "is_in_staging": true, "is_partition": false, "type": "FLOAT", "mode": "NULLABLE"}, {"name": "rendimento_mensal_produto_mercadoria_outra_deflacionado", "bigquery_type": null, "description": "Rendimento mensal em produto ou mercadoria outra - Valor deflacionado", "temporal_coverage": null, "covered_by_dictionary": null, "directory_column": {"dataset_id": null, "table_id": null, "column_name": null}, "measurement_unit": "brl", "has_sensitive_data": false, "is_in_staging": true, "is_partition": false, "type": "STRING", "mode": "NULLABLE"}, {"name": "rendimento_mensal_ocupacao_principal_deflacionado", "bigquery_type": "float64", "description": "Rendimento mensal da ocupa\u00e7\u00e3o principal - Valor deflacionado", "temporal_coverage": null, "covered_by_dictionary": false, "directory_column": {"dataset_id": null, "table_id": null, "column_name": null}, "measurement_unit": "brl", "has_sensitive_data": false, "is_in_staging": true, "is_partition": false, "type": "FLOAT", "mode": "NULLABLE"}, {"name": "rendimento_mensal_todos_trabalhos_deflacionado", "bigquery_type": "float64", "description": "Rendimento mensal de todos os trabalhos - Valor deflacionado", "temporal_coverage": null, "covered_by_dictionary": false, "directory_column": {"dataset_id": null, "table_id": null, 
"column_name": null}, "measurement_unit": "brl", "has_sensitive_data": false, "is_in_staging": true, "is_partition": false, "type": "FLOAT", "mode": "NULLABLE"}, {"name": "rendimento_mensal_todas_fontes_deflacionado", "bigquery_type": "float64", "description": "Rendimento mensal de todas as fontes - Valor deflacionado", "temporal_coverage": null, "covered_by_dictionary": false, "directory_column": {"dataset_id": null, "table_id": null, "column_name": null}, "measurement_unit": "brl", "has_sensitive_data": false, "is_in_staging": true, "is_partition": false, "type": "FLOAT", "mode": "NULLABLE"}, {"name": "rendimento_mensal_familia_deflacionado", "bigquery_type": "float64", "description": "Rendimento mensal da fam\u00edlia - Valor deflacionado", "temporal_coverage": null, "covered_by_dictionary": false, "directory_column": {"dataset_id": null, "table_id": null, "column_name": null}, "measurement_unit": "brl", "has_sensitive_data": false, "is_in_staging": true, "is_partition": false, "type": "FLOAT", "mode": "NULLABLE"}, {"name": "valor_aposentadoria_deflacionado", "bigquery_type": "float64", "description": "Valor da aposentadoria - Valor deflacionado", "temporal_coverage": null, "covered_by_dictionary": false, "directory_column": {"dataset_id": null, "table_id": null, "column_name": null}, "measurement_unit": "brl", "has_sensitive_data": false, "is_in_staging": true, "is_partition": false, "type": "FLOAT", "mode": "NULLABLE"}, {"name": "valor_pensao_deflacionado", "bigquery_type": "float64", "description": "Valor da Pens\u00e3o - Valor deflacionado", "temporal_coverage": null, "covered_by_dictionary": false, "directory_column": {"dataset_id": null, "table_id": null, "column_name": null}, "measurement_unit": "brl", "has_sensitive_data": false, "is_in_staging": true, "is_partition": false, "type": "FLOAT", "mode": "NULLABLE"}, {"name": "valor_abono_deflacionado", "bigquery_type": "float64", "description": "Valor do abono - Valor deflacionado", "temporal_coverage": null, "covered_by_dictionary": false, "directory_column": {"dataset_id": null, "table_id": null, "column_name": null}, "measurement_unit": "brl", "has_sensitive_data": false, "is_in_staging": true, "is_partition": false, "type": "FLOAT", "mode": "NULLABLE"}, {"name": "valor_aluguel_deflacionado", "bigquery_type": "float64", "description": "Valor do Aluguel - Valor deflacionado", "temporal_coverage": null, "covered_by_dictionary": false, "directory_column": {"dataset_id": null, "table_id": null, "column_name": null}, "measurement_unit": "brl", "has_sensitive_data": false, "is_in_staging": true, "is_partition": false, "type": "FLOAT", "mode": "NULLABLE"}, {"name": "rendas_outras_deflacionado", "bigquery_type": "float64", "description": "Rendas Outras - Valor deflacionado", "temporal_coverage": null, "covered_by_dictionary": false, "directory_column": {"dataset_id": null, "table_id": null, "column_name": null}, "measurement_unit": "brl", "has_sensitive_data": false, "is_in_staging": true, "is_partition": false, "type": "FLOAT", "mode": "NULLABLE"}] \ No newline at end of file diff --git a/bases/br_ibge_pnad/microdados_compatibilizado_datazoom/schema-staging.json b/bases/br_ibge_pnad/microdados_compatibilizado_datazoom/schema-staging.json new file mode 100644 index 000000000..412fec35c --- /dev/null +++ b/bases/br_ibge_pnad/microdados_compatibilizado_datazoom/schema-staging.json @@ -0,0 +1 @@ +[{"name": "id_regiao", "bigquery_type": "STRING", "description": "ID da Regi\u00e3o - IBGE", "temporal_coverage": null, "covered_by_dictionary": 
false, "directory_column": {"dataset_id": null, "table_id": null, "column_name": null}, "measurement_unit": null, "has_sensitive_data": null, "is_in_staging": null, "is_partition": null, "type": "STRING"}, {"name": "id_uf", "bigquery_type": "STRING", "description": "ID Unidade da Federa\u00e7\u00e3o - IBGE", "temporal_coverage": null, "covered_by_dictionary": false, "directory_column": {"dataset_id": null, "table_id": null, "column_name": null}, "measurement_unit": null, "has_sensitive_data": null, "is_in_staging": null, "is_partition": null, "type": "STRING"}, {"name": "id_domicilio", "bigquery_type": "STRING", "description": "N\u00famero de identifica\u00e7\u00e3o do domic\u00edlio", "temporal_coverage": null, "covered_by_dictionary": false, "directory_column": {"dataset_id": null, "table_id": null, "column_name": null}, "measurement_unit": null, "has_sensitive_data": null, "is_in_staging": null, "is_partition": null, "type": "STRING"}, {"name": "zona_urbana", "bigquery_type": "STRING", "description": "Zona Urbana", "temporal_coverage": null, "covered_by_dictionary": false, "directory_column": {"dataset_id": null, "table_id": null, "column_name": null}, "measurement_unit": null, "has_sensitive_data": null, "is_in_staging": null, "is_partition": null, "type": "STRING"}, {"name": "tipo_zona_domicilio", "bigquery_type": "STRING", "description": "Tipo de zona do domic\u00edlio", "temporal_coverage": null, "covered_by_dictionary": false, "directory_column": {"dataset_id": null, "table_id": null, "column_name": null}, "measurement_unit": null, "has_sensitive_data": null, "is_in_staging": null, "is_partition": null, "type": "STRING"}, {"name": "total_pessoas", "bigquery_type": "INT64", "description": "Total de pessoas", "temporal_coverage": null, "covered_by_dictionary": true, "directory_column": {"dataset_id": null, "table_id": null, "column_name": null}, "measurement_unit": null, "has_sensitive_data": null, "is_in_staging": null, "is_partition": null, "type": "STRING"}, {"name": "total_pessoas_10_mais", "bigquery_type": "INT64", "description": "Total de pessoas 10 anos ou mais", "temporal_coverage": null, "covered_by_dictionary": true, "directory_column": {"dataset_id": null, "table_id": null, "column_name": null}, "measurement_unit": null, "has_sensitive_data": null, "is_in_staging": null, "is_partition": null, "type": "STRING"}, {"name": "especie_domicilio", "bigquery_type": "STRING", "description": "Esp\u00e9cie de domic\u00edlio ", "temporal_coverage": null, "covered_by_dictionary": false, "directory_column": {"dataset_id": null, "table_id": null, "column_name": null}, "measurement_unit": null, "has_sensitive_data": null, "is_in_staging": null, "is_partition": null, "type": "STRING"}, {"name": "tipo_domicilio", "bigquery_type": "STRING", "description": "Tipo de domic\u00edlio", "temporal_coverage": null, "covered_by_dictionary": false, "directory_column": {"dataset_id": null, "table_id": null, "column_name": null}, "measurement_unit": null, "has_sensitive_data": null, "is_in_staging": null, "is_partition": null, "type": "STRING"}, {"name": "tipo_parede", "bigquery_type": "STRING", "description": "Material Predominante das paredes", "temporal_coverage": null, "covered_by_dictionary": false, "directory_column": {"dataset_id": null, "table_id": null, "column_name": null}, "measurement_unit": null, "has_sensitive_data": null, "is_in_staging": null, "is_partition": null, "type": "STRING"}, {"name": "tipo_cobertura", "bigquery_type": "STRING", "description": "Material Predominante no telhado", 
"temporal_coverage": null, "covered_by_dictionary": false, "directory_column": {"dataset_id": null, "table_id": null, "column_name": null}, "measurement_unit": null, "has_sensitive_data": null, "is_in_staging": null, "is_partition": null, "type": "STRING"}, {"name": "possui_agua_rede", "bigquery_type": "STRING", "description": "\u00c1gua prov\u00e9m de rede?", "temporal_coverage": null, "covered_by_dictionary": false, "directory_column": {"dataset_id": null, "table_id": null, "column_name": null}, "measurement_unit": null, "has_sensitive_data": null, "is_in_staging": null, "is_partition": null, "type": "STRING"}, {"name": "tipo_esgoto", "bigquery_type": "STRING", "description": "Esgotamento sanit\u00e1rio", "temporal_coverage": null, "covered_by_dictionary": false, "directory_column": {"dataset_id": null, "table_id": null, "column_name": null}, "measurement_unit": null, "has_sensitive_data": null, "is_in_staging": null, "is_partition": null, "type": "STRING"}, {"name": "possui_sanitario_exclusivo", "bigquery_type": "STRING", "description": "Sanit\u00e1rio exclusivo do domic\u00edlio?", "temporal_coverage": null, "covered_by_dictionary": false, "directory_column": {"dataset_id": null, "table_id": null, "column_name": null}, "measurement_unit": null, "has_sensitive_data": null, "is_in_staging": null, "is_partition": null, "type": "STRING"}, {"name": "lixo_coletado", "bigquery_type": "STRING", "description": "O lixo \u00e9 coletado?", "temporal_coverage": null, "covered_by_dictionary": false, "directory_column": {"dataset_id": null, "table_id": null, "column_name": null}, "measurement_unit": null, "has_sensitive_data": null, "is_in_staging": null, "is_partition": null, "type": "STRING"}, {"name": "possui_iluminacao_eletrica", "bigquery_type": "STRING", "description": "Possui ilumina\u00e7\u00e3o el\u00e9trica?", "temporal_coverage": null, "covered_by_dictionary": false, "directory_column": {"dataset_id": null, "table_id": null, "column_name": null}, "measurement_unit": null, "has_sensitive_data": null, "is_in_staging": null, "is_partition": null, "type": "STRING"}, {"name": "quantidade_comodos", "bigquery_type": "INT64", "description": "Quantidade de c\u00f4modos", "temporal_coverage": null, "covered_by_dictionary": true, "directory_column": {"dataset_id": null, "table_id": null, "column_name": null}, "measurement_unit": null, "has_sensitive_data": null, "is_in_staging": null, "is_partition": null, "type": "STRING"}, {"name": "quantidade_dormitorios", "bigquery_type": "INT64", "description": "Quantidade de c\u00f4modos servindo como dormit\u00f3rio", "temporal_coverage": null, "covered_by_dictionary": true, "directory_column": {"dataset_id": null, "table_id": null, "column_name": null}, "measurement_unit": null, "has_sensitive_data": null, "is_in_staging": null, "is_partition": null, "type": "STRING"}, {"name": "possui_sanitario", "bigquery_type": "STRING", "description": "Possui sanit\u00e1rio?", "temporal_coverage": null, "covered_by_dictionary": false, "directory_column": {"dataset_id": null, "table_id": null, "column_name": null}, "measurement_unit": null, "has_sensitive_data": null, "is_in_staging": null, "is_partition": null, "type": "STRING"}, {"name": "posse_domocilio", "bigquery_type": "STRING", "description": "Posse do domic\u00edlio", "temporal_coverage": null, "covered_by_dictionary": false, "directory_column": {"dataset_id": null, "table_id": null, "column_name": null}, "measurement_unit": null, "has_sensitive_data": null, "is_in_staging": null, "is_partition": null, "type": 
"STRING"}, {"name": "possui_filtro", "bigquery_type": "STRING", "description": "Possui filtro?", "temporal_coverage": null, "covered_by_dictionary": false, "directory_column": {"dataset_id": null, "table_id": null, "column_name": null}, "measurement_unit": null, "has_sensitive_data": null, "is_in_staging": null, "is_partition": null, "type": "STRING"}, {"name": "possui_fogao", "bigquery_type": "STRING", "description": "Possui fog\u00e3o?", "temporal_coverage": null, "covered_by_dictionary": false, "directory_column": {"dataset_id": null, "table_id": null, "column_name": null}, "measurement_unit": null, "has_sensitive_data": null, "is_in_staging": null, "is_partition": null, "type": "STRING"}, {"name": "possui_geladeira", "bigquery_type": "STRING", "description": "Possui geladeira?", "temporal_coverage": null, "covered_by_dictionary": false, "directory_column": {"dataset_id": null, "table_id": null, "column_name": null}, "measurement_unit": null, "has_sensitive_data": null, "is_in_staging": null, "is_partition": null, "type": "STRING"}, {"name": "possui_radio", "bigquery_type": "STRING", "description": "Possui r\u00e1dio?", "temporal_coverage": null, "covered_by_dictionary": false, "directory_column": {"dataset_id": null, "table_id": null, "column_name": null}, "measurement_unit": null, "has_sensitive_data": null, "is_in_staging": null, "is_partition": null, "type": "STRING"}, {"name": "possui_tv", "bigquery_type": "STRING", "description": "Possui televis\u00e3o?", "temporal_coverage": null, "covered_by_dictionary": false, "directory_column": {"dataset_id": null, "table_id": null, "column_name": null}, "measurement_unit": null, "has_sensitive_data": null, "is_in_staging": null, "is_partition": null, "type": "STRING"}, {"name": "renda_mensal_domiciliar", "bigquery_type": "FLOAT64", "description": "Renda mensal domiciliar", "temporal_coverage": null, "covered_by_dictionary": true, "directory_column": {"dataset_id": null, "table_id": null, "column_name": null}, "measurement_unit": null, "has_sensitive_data": null, "is_in_staging": null, "is_partition": null, "type": "STRING"}, {"name": "renda_mensal_domiciliar_compativel_1992", "bigquery_type": "FLOAT64", "description": "Renda mensal domiciliar compat\u00edvel com 1992", "temporal_coverage": null, "covered_by_dictionary": true, "directory_column": {"dataset_id": null, "table_id": null, "column_name": null}, "measurement_unit": null, "has_sensitive_data": null, "is_in_staging": null, "is_partition": null, "type": "STRING"}, {"name": "peso_amostral", "bigquery_type": "FLOAT64", "description": "Peso do domic\u00edlio", "temporal_coverage": null, "covered_by_dictionary": true, "directory_column": {"dataset_id": null, "table_id": null, "column_name": null}, "measurement_unit": null, "has_sensitive_data": null, "is_in_staging": null, "is_partition": null, "type": "STRING"}, {"name": "valor_aluguel_pago", "bigquery_type": "FLOAT64", "description": "Valor do aluguel", "temporal_coverage": null, "covered_by_dictionary": true, "directory_column": {"dataset_id": null, "table_id": null, "column_name": null}, "measurement_unit": null, "has_sensitive_data": null, "is_in_staging": null, "is_partition": null, "type": "STRING"}, {"name": "prestacao", "bigquery_type": "FLOAT64", "description": "Valor da presta\u00e7\u00e3o", "temporal_coverage": null, "covered_by_dictionary": true, "directory_column": {"dataset_id": null, "table_id": null, "column_name": null}, "measurement_unit": null, "has_sensitive_data": null, "is_in_staging": null, "is_partition": null, 
"type": "STRING"}, {"name": "deflator", "bigquery_type": "INT64", "description": "Deflator (base outubro de 2012)", "temporal_coverage": null, "covered_by_dictionary": true, "directory_column": {"dataset_id": null, "table_id": null, "column_name": null}, "measurement_unit": null, "has_sensitive_data": null, "is_in_staging": null, "is_partition": null, "type": "STRING"}, {"name": "conversor_moeda", "bigquery_type": "INT64", "description": "Conversor de Moeda", "temporal_coverage": null, "covered_by_dictionary": true, "directory_column": {"dataset_id": null, "table_id": null, "column_name": null}, "measurement_unit": null, "has_sensitive_data": null, "is_in_staging": null, "is_partition": null, "type": "STRING"}, {"name": "renda_domicilio_deflacionada", "bigquery_type": "FLOAT64", "description": "Renda do domic\u00edlio - Valor deflacionado", "temporal_coverage": null, "covered_by_dictionary": true, "directory_column": {"dataset_id": null, "table_id": null, "column_name": null}, "measurement_unit": null, "has_sensitive_data": null, "is_in_staging": null, "is_partition": null, "type": "STRING"}, {"name": "renda_mensal_domiciliar_compativel_1992_deflacionada", "bigquery_type": "FLOAT64", "description": "Renda mensal domiciliar compat\u00edvel com 1992 - Valor deflacionado", "temporal_coverage": null, "covered_by_dictionary": true, "directory_column": {"dataset_id": null, "table_id": null, "column_name": null}, "measurement_unit": null, "has_sensitive_data": null, "is_in_staging": null, "is_partition": null, "type": "STRING"}, {"name": "aluguel_deflacionado", "bigquery_type": "FLOAT64", "description": "Aluguel - Valor deflacionado ", "temporal_coverage": null, "covered_by_dictionary": true, "directory_column": {"dataset_id": null, "table_id": null, "column_name": null}, "measurement_unit": null, "has_sensitive_data": null, "is_in_staging": null, "is_partition": null, "type": "STRING"}, {"name": "prestacao_deflacionado", "bigquery_type": "FLOAT64", "description": "Presta\u00e7\u00e3o - Valor deflacionado", "temporal_coverage": null, "covered_by_dictionary": true, "directory_column": {"dataset_id": null, "table_id": null, "column_name": null}, "measurement_unit": null, "has_sensitive_data": null, "is_in_staging": null, "is_partition": null, "type": "STRING"}, {"name": "numero_familia", "bigquery_type": "INT64", "description": "N\u00famero da fam\u00edlia", "temporal_coverage": null, "covered_by_dictionary": true, "directory_column": {"dataset_id": null, "table_id": null, "column_name": null}, "measurement_unit": null, "has_sensitive_data": null, "is_in_staging": null, "is_partition": null, "type": "STRING"}, {"name": "id_regiao_metropolitana", "bigquery_type": "STRING", "description": "ID da Regi\u00e3o Metropolitana", "temporal_coverage": null, "covered_by_dictionary": false, "directory_column": {"dataset_id": null, "table_id": null, "column_name": null}, "measurement_unit": null, "has_sensitive_data": null, "is_in_staging": null, "is_partition": null, "type": "STRING"}, {"name": "sexo", "bigquery_type": "STRING", "description": "Sexo", "temporal_coverage": null, "covered_by_dictionary": false, "directory_column": {"dataset_id": null, "table_id": null, "column_name": null}, "measurement_unit": null, "has_sensitive_data": null, "is_in_staging": null, "is_partition": null, "type": "STRING"}, {"name": "condicao_domicilio", "bigquery_type": "STRING", "description": "Condi\u00e7\u00e3o no domic\u00edlio", "temporal_coverage": null, "covered_by_dictionary": false, "directory_column": {"dataset_id": 
null, "table_id": null, "column_name": null}, "measurement_unit": null, "has_sensitive_data": null, "is_in_staging": null, "is_partition": null, "type": "STRING"}, {"name": "condicao_familia", "bigquery_type": "STRING", "description": "Condi\u00e7\u00e3o na fam\u00edlia", "temporal_coverage": null, "covered_by_dictionary": false, "directory_column": {"dataset_id": null, "table_id": null, "column_name": null}, "measurement_unit": null, "has_sensitive_data": null, "is_in_staging": null, "is_partition": null, "type": "STRING"}, {"name": "dia_nascimento", "bigquery_type": "INT64", "description": "Dia do nascimento", "temporal_coverage": null, "covered_by_dictionary": true, "directory_column": {"dataset_id": null, "table_id": null, "column_name": null}, "measurement_unit": null, "has_sensitive_data": null, "is_in_staging": null, "is_partition": null, "type": "STRING"}, {"name": "mes_nascimento", "bigquery_type": "INT64", "description": "M\u00eas do nascimento", "temporal_coverage": null, "covered_by_dictionary": false, "directory_column": {"dataset_id": null, "table_id": null, "column_name": null}, "measurement_unit": "m\u00eas", "has_sensitive_data": null, "is_in_staging": null, "is_partition": null, "type": "STRING"}, {"name": "ano_nascimento", "bigquery_type": "INT64", "description": "Ano do nascimento", "temporal_coverage": null, "covered_by_dictionary": true, "directory_column": {"dataset_id": null, "table_id": null, "column_name": null}, "measurement_unit": "ano", "has_sensitive_data": null, "is_in_staging": null, "is_partition": null, "type": "STRING"}, {"name": "idade", "bigquery_type": "INT64", "description": "Idade", "temporal_coverage": null, "covered_by_dictionary": true, "directory_column": {"dataset_id": null, "table_id": null, "column_name": null}, "measurement_unit": null, "has_sensitive_data": null, "is_in_staging": null, "is_partition": null, "type": "STRING"}, {"name": "sabe_ler_escrever", "bigquery_type": "STRING", "description": "Sabe ler e escrever?", "temporal_coverage": null, "covered_by_dictionary": false, "directory_column": {"dataset_id": null, "table_id": null, "column_name": null}, "measurement_unit": null, "has_sensitive_data": null, "is_in_staging": null, "is_partition": null, "type": "STRING"}, {"name": "serie_frequentada", "bigquery_type": "STRING", "description": "S\u00e9rie que frequenta", "temporal_coverage": null, "covered_by_dictionary": false, "directory_column": {"dataset_id": null, "table_id": null, "column_name": null}, "measurement_unit": null, "has_sensitive_data": null, "is_in_staging": null, "is_partition": null, "type": "STRING"}, {"name": "grau_frequentada", "bigquery_type": "STRING", "description": "Grau que frequenta ", "temporal_coverage": null, "covered_by_dictionary": false, "directory_column": {"dataset_id": null, "table_id": null, "column_name": null}, "measurement_unit": null, "has_sensitive_data": null, "is_in_staging": null, "is_partition": null, "type": "STRING"}, {"name": "ultima_seria_frequentada", "bigquery_type": "INT64", "description": "\u00daltima s\u00e9rie frequentada (para quem n\u00e3o frequenta escola)", "temporal_coverage": null, "covered_by_dictionary": true, "directory_column": {"dataset_id": null, "table_id": null, "column_name": null}, "measurement_unit": null, "has_sensitive_data": null, "is_in_staging": null, "is_partition": null, "type": "STRING"}, {"name": "ultimo_grau_frequentado", "bigquery_type": "STRING", "description": "\u00daltimo grau frequentado (para quem n\u00e3o frequenta escola)", "temporal_coverage": 
null, "covered_by_dictionary": false, "directory_column": {"dataset_id": null, "table_id": null, "column_name": null}, "measurement_unit": null, "has_sensitive_data": null, "is_in_staging": null, "is_partition": null, "type": "STRING"}, {"name": "tinha_outro_trabalho", "bigquery_type": "STRING", "description": "Tinha outro trabalho na semana de refer\u00eancia", "temporal_coverage": null, "covered_by_dictionary": false, "directory_column": {"dataset_id": null, "table_id": null, "column_name": null}, "measurement_unit": null, "has_sensitive_data": null, "is_in_staging": null, "is_partition": null, "type": "STRING"}, {"name": "ocupacao_semana", "bigquery_type": "INT64", "description": "Ocupa\u00e7\u00e3o na semana", "temporal_coverage": null, "covered_by_dictionary": true, "directory_column": {"dataset_id": "cbo", "table_id": "cbo", "column_name": "cbo"}, "measurement_unit": null, "has_sensitive_data": null, "is_in_staging": null, "is_partition": null, "type": "STRING"}, {"name": "atividade_ramo_negocio_semana", "bigquery_type": "INT64", "description": "Atividade ou ramo do n\u00e9gocio na semana", "temporal_coverage": null, "covered_by_dictionary": true, "directory_column": {"dataset_id": "cbo", "table_id": "cbo", "column_name": "cbo"}, "measurement_unit": null, "has_sensitive_data": null, "is_in_staging": null, "is_partition": null, "type": "STRING"}, {"name": "possui_carteira_assinada", "bigquery_type": "STRING", "description": "Tem carteira de trabalho assinada", "temporal_coverage": null, "covered_by_dictionary": false, "directory_column": {"dataset_id": null, "table_id": null, "column_name": null}, "measurement_unit": null, "has_sensitive_data": null, "is_in_staging": null, "is_partition": null, "type": "STRING"}, {"name": "rendimento_mensal_dinheiro", "bigquery_type": "FLOAT64", "description": "Rendimento mensal em dinheiro", "temporal_coverage": null, "covered_by_dictionary": false, "directory_column": {"dataset_id": null, "table_id": null, "column_name": null}, "measurement_unit": null, "has_sensitive_data": null, "is_in_staging": null, "is_partition": null, "type": "STRING"}, {"name": "rendimento_mensal_produto_mercadoria", "bigquery_type": "FLOAT64", "description": "Rendimento mensal em produtos ou mercadorias", "temporal_coverage": null, "covered_by_dictionary": false, "directory_column": {"dataset_id": null, "table_id": null, "column_name": null}, "measurement_unit": null, "has_sensitive_data": null, "is_in_staging": null, "is_partition": null, "type": "STRING"}, {"name": "horas_trabalhadas_semana", "bigquery_type": "INT64", "description": "Horas normalmente trabalhadas na semana - Ocupa\u00e7\u00e3o principal", "temporal_coverage": null, "covered_by_dictionary": false, "directory_column": {"dataset_id": null, "table_id": null, "column_name": null}, "measurement_unit": "h", "has_sensitive_data": null, "is_in_staging": null, "is_partition": null, "type": "STRING"}, {"name": "rendimento_mensal_dinheiro_outra", "bigquery_type": "FLOAT64", "description": "Rendimento em mensal dinheiro Outra", "temporal_coverage": null, "covered_by_dictionary": true, "directory_column": {"dataset_id": null, "table_id": null, "column_name": null}, "measurement_unit": null, "has_sensitive_data": null, "is_in_staging": null, "is_partition": null, "type": "STRING"}, {"name": "rendimento_mensal_produto_outra", "bigquery_type": "FLOAT64", "description": "Rendimento mensal em produtos ou mercadorias outras", "temporal_coverage": null, "covered_by_dictionary": true, "directory_column": {"dataset_id": null, 
"table_id": null, "column_name": null}, "measurement_unit": null, "has_sensitive_data": null, "is_in_staging": null, "is_partition": null, "type": "STRING"}, {"name": "horas_trabalhadas_outros_trabalhos", "bigquery_type": "INT64", "description": "Horas normalmente trabalhadas na semana - Outros trabalhos", "temporal_coverage": null, "covered_by_dictionary": false, "directory_column": {"dataset_id": null, "table_id": null, "column_name": null}, "measurement_unit": "h", "has_sensitive_data": null, "is_in_staging": null, "is_partition": null, "type": "STRING"}, {"name": "contribui_previdencia", "bigquery_type": "STRING", "description": "Contribui para instituto de previd\u00eancia", "temporal_coverage": null, "covered_by_dictionary": false, "directory_column": {"dataset_id": null, "table_id": null, "column_name": null}, "measurement_unit": null, "has_sensitive_data": null, "is_in_staging": null, "is_partition": null, "type": "STRING"}, {"name": "tipo_instituto_previdencia", "bigquery_type": "STRING", "description": "Tipo de instituto de previd\u00eancia", "temporal_coverage": null, "covered_by_dictionary": false, "directory_column": {"dataset_id": null, "table_id": null, "column_name": null}, "measurement_unit": null, "has_sensitive_data": null, "is_in_staging": null, "is_partition": null, "type": "STRING"}, {"name": "tomou_providencia_conseguir_trabalho_semana", "bigquery_type": null, "description": null, "temporal_coverage": null, "covered_by_dictionary": null, "directory_column": {"dataset_id": null, "table_id": null, "column_name": null}, "measurement_unit": null, "has_sensitive_data": null, "is_in_staging": null, "is_partition": null, "type": "STRING"}, {"name": "tomou_providencia_ultimos_2_meses", "bigquery_type": null, "description": null, "temporal_coverage": null, "covered_by_dictionary": null, "directory_column": {"dataset_id": null, "table_id": null, "column_name": null}, "measurement_unit": null, "has_sensitive_data": null, "is_in_staging": null, "is_partition": null, "type": "STRING"}, {"name": "qual_medida", "bigquery_type": "STRING", "description": "O que foi tomado de medida para encontrar um emprego", "temporal_coverage": null, "covered_by_dictionary": false, "directory_column": {"dataset_id": null, "table_id": null, "column_name": null}, "measurement_unit": null, "has_sensitive_data": null, "is_in_staging": null, "is_partition": null, "type": "STRING"}, {"name": "tinha_carteira_assinada_ultimo_emprego", "bigquery_type": null, "description": null, "temporal_coverage": null, "covered_by_dictionary": null, "directory_column": {"dataset_id": null, "table_id": null, "column_name": null}, "measurement_unit": null, "has_sensitive_data": null, "is_in_staging": null, "is_partition": null, "type": "STRING"}, {"name": "valor_aposentadoria", "bigquery_type": "FLOAT64", "description": "Valor da Aposentadoria", "temporal_coverage": null, "covered_by_dictionary": true, "directory_column": {"dataset_id": null, "table_id": null, "column_name": null}, "measurement_unit": null, "has_sensitive_data": null, "is_in_staging": null, "is_partition": null, "type": "STRING"}, {"name": "valor_pensao", "bigquery_type": "FLOAT64", "description": "Valor da Pens\u00e3o", "temporal_coverage": null, "covered_by_dictionary": true, "directory_column": {"dataset_id": null, "table_id": null, "column_name": null}, "measurement_unit": null, "has_sensitive_data": null, "is_in_staging": null, "is_partition": null, "type": "STRING"}, {"name": "valor_abono_permanente", "bigquery_type": "FLOAT64", "description": "Valor 
do b\u00f4nus do sal\u00e1rio permanente", "temporal_coverage": null, "covered_by_dictionary": true, "directory_column": {"dataset_id": null, "table_id": null, "column_name": null}, "measurement_unit": null, "has_sensitive_data": null, "is_in_staging": null, "is_partition": null, "type": "STRING"}, {"name": "valor_aluguel", "bigquery_type": "FLOAT64", "description": "Valor do Aluguel recebido", "temporal_coverage": null, "covered_by_dictionary": true, "directory_column": {"dataset_id": null, "table_id": null, "column_name": null}, "measurement_unit": null, "has_sensitive_data": null, "is_in_staging": null, "is_partition": null, "type": "STRING"}, {"name": "renda_outras", "bigquery_type": "FLOAT64", "description": "Valor de outras rendas", "temporal_coverage": null, "covered_by_dictionary": true, "directory_column": {"dataset_id": null, "table_id": null, "column_name": null}, "measurement_unit": null, "has_sensitive_data": null, "is_in_staging": null, "is_partition": null, "type": "STRING"}, {"name": "rendimento_mensal_ocupacao_principal", "bigquery_type": "FLOAT64", "description": "Rendimento mensal ocupa\u00e7\u00e3o principal", "temporal_coverage": null, "covered_by_dictionary": true, "directory_column": {"dataset_id": null, "table_id": null, "column_name": null}, "measurement_unit": null, "has_sensitive_data": null, "is_in_staging": null, "is_partition": null, "type": "STRING"}, {"name": "rendimento_mensal_todos_trabalhos", "bigquery_type": "FLOAT64", "description": "Rendimento mensal todos trabalhos", "temporal_coverage": null, "covered_by_dictionary": true, "directory_column": {"dataset_id": null, "table_id": null, "column_name": null}, "measurement_unit": null, "has_sensitive_data": null, "is_in_staging": null, "is_partition": null, "type": "STRING"}, {"name": "rendimento_mensal_todas_fontes", "bigquery_type": "FLOAT64", "description": "Rendimento mensal todas fontes", "temporal_coverage": null, "covered_by_dictionary": true, "directory_column": {"dataset_id": null, "table_id": null, "column_name": null}, "measurement_unit": null, "has_sensitive_data": null, "is_in_staging": null, "is_partition": null, "type": "STRING"}, {"name": "atividade_ramo_negocio_agregado", "bigquery_type": "STRING", "description": "Atividade ou ramo do n\u00e9gocio na semana - Agregado", "temporal_coverage": null, "covered_by_dictionary": false, "directory_column": {"dataset_id": null, "table_id": null, "column_name": null}, "measurement_unit": null, "has_sensitive_data": null, "is_in_staging": null, "is_partition": null, "type": "STRING"}, {"name": "horas_trabalhadas_todos_trabalhos", "bigquery_type": "INT64", "description": "Horas normalmente trabalhadas na semana em todos os trabalhos", "temporal_coverage": null, "covered_by_dictionary": false, "directory_column": {"dataset_id": null, "table_id": null, "column_name": null}, "measurement_unit": "h", "has_sensitive_data": null, "is_in_staging": null, "is_partition": null, "type": "STRING"}, {"name": "posicao_ocupacao", "bigquery_type": null, "description": null, "temporal_coverage": null, "covered_by_dictionary": null, "directory_column": {"dataset_id": null, "table_id": null, "column_name": null}, "measurement_unit": null, "has_sensitive_data": null, "is_in_staging": null, "is_partition": null, "type": "STRING"}, {"name": "grupos_ocupacao", "bigquery_type": "STRING", "description": "Grupos de ocupa\u00e7\u00e3o na semana", "temporal_coverage": null, "covered_by_dictionary": false, "directory_column": {"dataset_id": null, "table_id": null, "column_name": 
null}, "measurement_unit": null, "has_sensitive_data": null, "is_in_staging": null, "is_partition": null, "type": "STRING"}, {"name": "numero_membros_familia", "bigquery_type": null, "description": null, "temporal_coverage": null, "covered_by_dictionary": null, "directory_column": {"dataset_id": null, "table_id": null, "column_name": null}, "measurement_unit": null, "has_sensitive_data": null, "is_in_staging": null, "is_partition": null, "type": "STRING"}, {"name": "rendimento_mensal_familia", "bigquery_type": null, "description": null, "temporal_coverage": null, "covered_by_dictionary": null, "directory_column": {"dataset_id": null, "table_id": null, "column_name": null}, "measurement_unit": null, "has_sensitive_data": null, "is_in_staging": null, "is_partition": null, "type": "STRING"}, {"name": "ordem", "bigquery_type": "INT64", "description": "N\u00famero de ordem da pessoa", "temporal_coverage": null, "covered_by_dictionary": false, "directory_column": {"dataset_id": null, "table_id": null, "column_name": null}, "measurement_unit": null, "has_sensitive_data": null, "is_in_staging": null, "is_partition": null, "type": "STRING"}, {"name": "raca_cor", "bigquery_type": "STRING", "description": "Ra\u00e7a ou Cor (autodeclara\u00e7\u00e3o)", "temporal_coverage": null, "covered_by_dictionary": false, "directory_column": {"dataset_id": null, "table_id": null, "column_name": null}, "measurement_unit": null, "has_sensitive_data": null, "is_in_staging": null, "is_partition": null, "type": "STRING"}, {"name": "anos_estudo", "bigquery_type": "INT64", "description": "Anos de estudo", "temporal_coverage": null, "covered_by_dictionary": true, "directory_column": {"dataset_id": null, "table_id": null, "column_name": null}, "measurement_unit": "ano", "has_sensitive_data": null, "is_in_staging": null, "is_partition": null, "type": "STRING"}, {"name": "frequenta_escola", "bigquery_type": "STRING", "description": "Frequenta a escola?", "temporal_coverage": null, "covered_by_dictionary": false, "directory_column": {"dataset_id": null, "table_id": null, "column_name": null}, "measurement_unit": null, "has_sensitive_data": null, "is_in_staging": null, "is_partition": null, "type": "STRING"}, {"name": "trabalhou_semana", "bigquery_type": "STRING", "description": "Trabalhou na semana", "temporal_coverage": null, "covered_by_dictionary": false, "directory_column": {"dataset_id": null, "table_id": null, "column_name": null}, "measurement_unit": null, "has_sensitive_data": null, "is_in_staging": null, "is_partition": null, "type": "STRING"}, {"name": "tinha_trabalhado_semana", "bigquery_type": "STRING", "description": "Tinha trabalhado na semana?", "temporal_coverage": null, "covered_by_dictionary": false, "directory_column": {"dataset_id": null, "table_id": null, "column_name": null}, "measurement_unit": null, "has_sensitive_data": null, "is_in_staging": null, "is_partition": null, "type": "STRING"}, {"name": "ocupacao_ano_anterior", "bigquery_type": "INT64", "description": "Ocupa\u00e7\u00e3o no ano anterior", "temporal_coverage": null, "covered_by_dictionary": false, "directory_column": {"dataset_id": null, "table_id": null, "column_name": null}, "measurement_unit": null, "has_sensitive_data": null, "is_in_staging": null, "is_partition": null, "type": "STRING"}, {"name": "atividade_ramo_negocio_anterior", "bigquery_type": "INT64", "description": "Atividade ou ramo do neg\u00f3cio anterior", "temporal_coverage": null, "covered_by_dictionary": false, "directory_column": {"dataset_id": null, "table_id": null, 
"column_name": null}, "measurement_unit": null, "has_sensitive_data": null, "is_in_staging": null, "is_partition": null, "type": "STRING"}, {"name": "rendimento_mensal_dinheiro_deflacionado", "bigquery_type": "FLOAT64", "description": "Rendimento mensal em dinheiro - Valor deflacionado", "temporal_coverage": null, "covered_by_dictionary": false, "directory_column": {"dataset_id": null, "table_id": null, "column_name": null}, "measurement_unit": null, "has_sensitive_data": null, "is_in_staging": null, "is_partition": null, "type": "STRING"}, {"name": "rendimento_mensal_produto_mercadoria_deflacionado", "bigquery_type": "FLOAT64", "description": "Rendimento mensal em produto ou mercadoria - Valor deflacionado", "temporal_coverage": null, "covered_by_dictionary": false, "directory_column": {"dataset_id": null, "table_id": null, "column_name": null}, "measurement_unit": null, "has_sensitive_data": null, "is_in_staging": null, "is_partition": null, "type": "STRING"}, {"name": "rendimento_mensal_dinheiro_outra_deflacionado", "bigquery_type": "FLOAT64", "description": "Rendimento mensal em dinheiro outra - Valor deflacionado", "temporal_coverage": null, "covered_by_dictionary": false, "directory_column": {"dataset_id": null, "table_id": null, "column_name": null}, "measurement_unit": null, "has_sensitive_data": null, "is_in_staging": null, "is_partition": null, "type": "STRING"}, {"name": "rendimento_mensal_produto_mercadoria_outra_deflacionado", "bigquery_type": null, "description": null, "temporal_coverage": null, "covered_by_dictionary": null, "directory_column": {"dataset_id": null, "table_id": null, "column_name": null}, "measurement_unit": null, "has_sensitive_data": null, "is_in_staging": null, "is_partition": null, "type": "STRING"}, {"name": "rendimento_mensal_ocupacao_principal_deflacionado", "bigquery_type": "FLOAT64", "description": "Rendimento mensal da ocupa\u00e7\u00e3o principal - Valor deflacionado", "temporal_coverage": null, "covered_by_dictionary": false, "directory_column": {"dataset_id": null, "table_id": null, "column_name": null}, "measurement_unit": null, "has_sensitive_data": null, "is_in_staging": null, "is_partition": null, "type": "STRING"}, {"name": "rendimento_mensal_todos_trabalhos_deflacionado", "bigquery_type": "FLOAT64", "description": "Rendimento mensal de todos os trabalhos - Valor deflacionado", "temporal_coverage": null, "covered_by_dictionary": false, "directory_column": {"dataset_id": null, "table_id": null, "column_name": null}, "measurement_unit": null, "has_sensitive_data": null, "is_in_staging": null, "is_partition": null, "type": "STRING"}, {"name": "rendimento_mensal_todas_fontes_deflacionado", "bigquery_type": "FLOAT64", "description": "Rendimento mensal de todas as fontes - Valor deflacionado", "temporal_coverage": null, "covered_by_dictionary": false, "directory_column": {"dataset_id": null, "table_id": null, "column_name": null}, "measurement_unit": null, "has_sensitive_data": null, "is_in_staging": null, "is_partition": null, "type": "STRING"}, {"name": "rendimento_mensal_familia_deflacionado", "bigquery_type": "FLOAT64", "description": "Rendimento mensal da fam\u00edlia - Valor deflacionado", "temporal_coverage": null, "covered_by_dictionary": false, "directory_column": {"dataset_id": null, "table_id": null, "column_name": null}, "measurement_unit": null, "has_sensitive_data": null, "is_in_staging": null, "is_partition": null, "type": "STRING"}, {"name": "valor_aposentadoria_deflacionado", "bigquery_type": "FLOAT64", "description": "Valor da 
aposentadoria - Valor deflacionado", "temporal_coverage": null, "covered_by_dictionary": false, "directory_column": {"dataset_id": null, "table_id": null, "column_name": null}, "measurement_unit": null, "has_sensitive_data": null, "is_in_staging": null, "is_partition": null, "type": "STRING"}, {"name": "valor_pensao_deflacionado", "bigquery_type": "FLOAT64", "description": "Valor da Pens\u00e3o - Valor deflacionado", "temporal_coverage": null, "covered_by_dictionary": false, "directory_column": {"dataset_id": null, "table_id": null, "column_name": null}, "measurement_unit": null, "has_sensitive_data": null, "is_in_staging": null, "is_partition": null, "type": "STRING"}, {"name": "valor_abono_deflacionado", "bigquery_type": "FLOAT64", "description": "Valor do abono - Valor deflacionado", "temporal_coverage": null, "covered_by_dictionary": false, "directory_column": {"dataset_id": null, "table_id": null, "column_name": null}, "measurement_unit": null, "has_sensitive_data": null, "is_in_staging": null, "is_partition": null, "type": "STRING"}, {"name": "valor_aluguel_deflacionado", "bigquery_type": "FLOAT64", "description": "Valor do Aluguel - Valor deflacionado", "temporal_coverage": null, "covered_by_dictionary": false, "directory_column": {"dataset_id": null, "table_id": null, "column_name": null}, "measurement_unit": null, "has_sensitive_data": null, "is_in_staging": null, "is_partition": null, "type": "STRING"}, {"name": "rendas_outras_deflacionado", "bigquery_type": "FLOAT64", "description": "Rendas Outras - Valor deflacionado", "temporal_coverage": null, "covered_by_dictionary": false, "directory_column": {"dataset_id": null, "table_id": null, "column_name": null}, "measurement_unit": null, "has_sensitive_data": null, "is_in_staging": null, "is_partition": null, "type": "STRING"}] \ No newline at end of file diff --git a/bases/br_ibge_pnad/microdados_compatibilizado_datazoom/table_config.yaml b/bases/br_ibge_pnad/microdados_compatibilizado_datazoom/table_config.yaml new file mode 100644 index 000000000..e19dc3872 --- /dev/null +++ b/bases/br_ibge_pnad/microdados_compatibilizado_datazoom/table_config.yaml @@ -0,0 +1,1476 @@ +dataset_id: br_ibge_pnad + +table_id: microdados_compatibilizado_datazoom + +# Descreva a tabela. Essas são as primeiras frases que um usuário vai ver. +# Você não precisa ser muito conciso. Sinta-se a vontade para dar exemplos de +# como usar os dados. +# Se souber, liste também aplicações: pesquisa, apps, etc. que usem os dados., +description: Microdados da Pesquisa Nacional por Amostra de Domicílios - PNAD. + +# A máxima unidade espacial que a tabela cobre. +spatial_coverage: bra + + +# Anos cobertos pela tabela. +# Preencher como lista de intervalos. +# Exemplo: 1995(1)2019. +temporal_coverage: + - 1981 + - 1982 + - 1983 + - 1984 + - 1985 + - 1986 + - 1987 + - 1988 + - 1989 + - 1990 + - 1992 + - 1993 + - 1995 + - 1996 + - 1997 + - 1998 + - 1999 + - 2001 + - 2002 + - 2003 + - 2004 + - 2005 + - 2006 + - 2007 + - 2008 + - 2009 + - 2011 + - 2012 + - 2013 + - 2014 + - 2015 + +# A unidade temporal com qual a tabela é atualizada. +# Opções em 'https://basedosdados.org/api/3/action/bd_available_options' +update_frequency: one_year + +# Entidade representada por cada linha. +# Opções em 'https://basedosdados.org/api/3/action/bd_available_options' +entity: person + +# A unidade temporal representada por cada linha. 
+# Opções em 'https://basedosdados.org/api/3/action/bd_available_options' +time_unit: one_year + +# O conjunto mínimo de colunas identificando cada linha unicamente. +# Preencha com os nomes de colunas. +# Exemplos: id_municipio, ano. +# Pode ser vazio pois certas tabelas não possuem identificadores. +identifying_columns: [] + +last_updated: + metadata: + data: + release: + +# Versão da tabela. Seguindo o padrão de semantic versioning. +# Exemplo: v1.1.3 +version: v1.0 + +# Quem está preenchendo esses metadados? +published_by: + name: Crislane Alves + email: crislanealves@basedosdados.org + github_user: crislanealves + website: + ckan_user: crislanealves + +# Qual organização/departamento/pessoa tratou os dados? +# As vezes há um ponto intermediário entre os dados originais e subir na Base dos Dados. +# Se essa pessoa é você, preencha abaixo com suas informações. +data_cleaned_by: + name: Crislane Alves + email: crislanealves@basedosdados.org + github_user: crislanealves + ckan_user: crislanealves + website: + code_url: + +# Se houve passos de tratamento, limpeza e manipulação de dados, descreva-os aqui. +data_cleaning_description: Tratamento e compatibilização feitos pelo pacote "datazoom_pnad" no Stata. E, posteriormente, adicionado ao padrão da Base dos Dados. + +# Url dos dados originais no GCP Storage. +raw_files_url: + +# Url dos arquivos auxiliares no GCP Storage. +auxiliary_files_url: + +# Url da tabela de arquitetura no GCP Storage. +architecture_url: + +# A tabela tem colunas que precisam de dicionário? +# Opções: yes, no. +covered_by_dictionary: + +source_bucket_name: basedosdados-dev + +project_id_prod: basedosdados-dev + +project_id_staging: basedosdados-dev + +# Liste as colunas da tabela que representam partições. +# Não esqueça de deletar essas colunas nas tabelas .csv na hora de subir para o BigQuery. +# Isso poupará muito tempo e dinheiro às pessoas utilizando essa tabela. +# Se não houver partições, não modifique abaixo. +partitions: ano, sigla_uf + +bdm_file_size: + +# Quais são as colunas? Certifique-se de escrever uma boa descrição, as pessoas vão gostar +# para saber sobre o que é a coluna. +# Adicionar todas as colunas manualmente pode ser bastante cansativo, por isso, quando +# inicializando este arquivo de configuração, você pode apontar a função para uma amostra de dados que +# preencherá automaticamente as colunas. +# Algumas colunas existirão apenas na tabela final, você as construirá em `publish.sql`. +# Para esses, defina is_in_staging como False. +# Além disso, você deve adicionar as colunas de partição aqui e definir is_partition como True. 
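The `partitions` note above is the main cost lever for this table: queries that filter on `ano` and `sigla_uf` scan only the matching BigQuery partitions. A minimal sketch of such a filtered read, assuming the basedosdados Python client's public `read_sql` helper, the public `basedosdados` production project, and a placeholder billing project:

```python
import basedosdados as bd

# Filtering on the partition columns (ano, sigla_uf) prunes partitions,
# so BigQuery scans and bills only the requested slices of the table.
df = bd.read_sql(
    query="""
        SELECT *
        FROM `basedosdados.br_ibge_pnad.microdados_compatibilizado_datazoom`
        WHERE ano = 2015 AND sigla_uf = 'SP'
    """,
    billing_project_id="<your-gcp-project>",  # placeholder: any GCP project you can bill
)
```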
+columns: + - name: ano + bigquery_type: int64 + description: Ano da pesquisa + temporal_coverage: + covered_by_dictionary: no + directory_column: + dataset_id: br_bd_diretorios_data_tempo + table_id: ano + column_name: ano + measurement_unit: year + has_sensitive_data: no + is_in_staging: true + is_partition: true + - name: sigla_uf + bigquery_type: string + description: Sigla da Unidade da Federação + temporal_coverage: + covered_by_dictionary: no + directory_column: + dataset_id: br_bd_diretorios_brasil + table_id: uf + column_name: sigla + measurement_unit: + has_sensitive_data: no + is_in_staging: true + is_partition: true + - name: id_regiao + bigquery_type: string + description: ID da Região - IBGE + temporal_coverage: + covered_by_dictionary: no + directory_column: + dataset_id: + table_id: + column_name: + measurement_unit: + has_sensitive_data: no + is_in_staging: true + is_partition: false + - name: id_uf + bigquery_type: string + description: ID Unidade da Federação - IBGE + temporal_coverage: + covered_by_dictionary: no + directory_column: + dataset_id: + table_id: + column_name: + measurement_unit: + has_sensitive_data: no + is_in_staging: true + is_partition: false + - name: id_domicilio + bigquery_type: string + description: Número de identificação do domicílio + temporal_coverage: + covered_by_dictionary: no + directory_column: + dataset_id: + table_id: + column_name: + measurement_unit: + has_sensitive_data: no + is_in_staging: true + is_partition: false + - name: zona_urbana + bigquery_type: string + description: Zona Urbana + temporal_coverage: + covered_by_dictionary: no + directory_column: + dataset_id: + table_id: + column_name: + measurement_unit: + has_sensitive_data: no + is_in_staging: true + is_partition: false + - name: tipo_zona_domicilio + bigquery_type: string + description: Tipo de zona do domicílio + temporal_coverage: + covered_by_dictionary: no + directory_column: + dataset_id: + table_id: + column_name: + measurement_unit: + has_sensitive_data: no + is_in_staging: true + is_partition: false + - name: total_pessoas + bigquery_type: int64 + description: Total de pessoas + temporal_coverage: + covered_by_dictionary: yes + directory_column: + dataset_id: + table_id: + column_name: + measurement_unit: + has_sensitive_data: no + is_in_staging: true + is_partition: false + - name: total_pessoas_10_mais + bigquery_type: int64 + description: Total de pessoas 10 anos ou mais + temporal_coverage: + covered_by_dictionary: yes + directory_column: + dataset_id: + table_id: + column_name: + measurement_unit: person + has_sensitive_data: no + is_in_staging: true + is_partition: false + - name: especie_domicilio + bigquery_type: string + description: Espécie de domicílio + temporal_coverage: + covered_by_dictionary: no + directory_column: + dataset_id: + table_id: + column_name: + measurement_unit: + has_sensitive_data: no + is_in_staging: true + is_partition: false + - name: tipo_domicilio + bigquery_type: string + description: Tipo de domicílio + temporal_coverage: + covered_by_dictionary: no + directory_column: + dataset_id: + table_id: + column_name: + measurement_unit: + has_sensitive_data: no + is_in_staging: true + is_partition: false + - name: tipo_parede + bigquery_type: string + description: Material Predominante das paredes + temporal_coverage: + covered_by_dictionary: no + directory_column: + dataset_id: + table_id: + column_name: + measurement_unit: + has_sensitive_data: no + is_in_staging: true + is_partition: false + - name: tipo_cobertura + 
bigquery_type: string + description: Material Predominante no telhado + temporal_coverage: + covered_by_dictionary: no + directory_column: + dataset_id: + table_id: + column_name: + measurement_unit: + has_sensitive_data: no + is_in_staging: true + is_partition: false + - name: possui_agua_rede + bigquery_type: string + description: Água provém de rede? + temporal_coverage: + covered_by_dictionary: no + directory_column: + dataset_id: + table_id: + column_name: + measurement_unit: + has_sensitive_data: no + is_in_staging: true + is_partition: false + - name: tipo_esgoto + bigquery_type: string + description: Esgotamento sanitário + temporal_coverage: + covered_by_dictionary: no + directory_column: + dataset_id: + table_id: + column_name: + measurement_unit: + has_sensitive_data: no + is_in_staging: true + is_partition: false + - name: possui_sanitario_exclusivo + bigquery_type: string + description: Sanitário exclusivo do domicílio? + temporal_coverage: + covered_by_dictionary: no + directory_column: + dataset_id: + table_id: + column_name: + measurement_unit: + has_sensitive_data: no + is_in_staging: true + is_partition: false + - name: lixo_coletado + bigquery_type: string + description: O lixo é coletado? + temporal_coverage: + covered_by_dictionary: no + directory_column: + dataset_id: + table_id: + column_name: + measurement_unit: + has_sensitive_data: no + is_in_staging: true + is_partition: false + - name: possui_iluminacao_eletrica + bigquery_type: string + description: Possui iluminação elétrica? + temporal_coverage: + covered_by_dictionary: no + directory_column: + dataset_id: + table_id: + column_name: + measurement_unit: + has_sensitive_data: no + is_in_staging: true + is_partition: false + - name: quantidade_comodos + bigquery_type: int64 + description: Quantidade de cômodos + temporal_coverage: + covered_by_dictionary: yes + directory_column: + dataset_id: + table_id: + column_name: + measurement_unit: + has_sensitive_data: no + is_in_staging: true + is_partition: false + - name: quantidade_dormitorios + bigquery_type: int64 + description: Quantidade de cômodos servindo como dormitório + temporal_coverage: + covered_by_dictionary: yes + directory_column: + dataset_id: + table_id: + column_name: + measurement_unit: + has_sensitive_data: no + is_in_staging: true + is_partition: false + - name: possui_sanitario + bigquery_type: string + description: Possui sanitário? + temporal_coverage: + covered_by_dictionary: no + directory_column: + dataset_id: + table_id: + column_name: + measurement_unit: + has_sensitive_data: no + is_in_staging: true + is_partition: false + - name: posse_domocilio + bigquery_type: string + description: Posse do domicílio + temporal_coverage: + covered_by_dictionary: no + directory_column: + dataset_id: + table_id: + column_name: + measurement_unit: + has_sensitive_data: no + is_in_staging: true + is_partition: false + - name: possui_filtro + bigquery_type: string + description: Possui filtro? + temporal_coverage: + covered_by_dictionary: no + directory_column: + dataset_id: + table_id: + column_name: + measurement_unit: + has_sensitive_data: no + is_in_staging: true + is_partition: false + - name: possui_fogao + bigquery_type: string + description: Possui fogão? + temporal_coverage: + covered_by_dictionary: no + directory_column: + dataset_id: + table_id: + column_name: + measurement_unit: + has_sensitive_data: no + is_in_staging: true + is_partition: false + - name: possui_geladeira + bigquery_type: string + description: Possui geladeira? 
+ temporal_coverage: + covered_by_dictionary: no + directory_column: + dataset_id: + table_id: + column_name: + measurement_unit: + has_sensitive_data: no + is_in_staging: true + is_partition: false + - name: possui_radio + bigquery_type: string + description: Possui rádio? + temporal_coverage: + covered_by_dictionary: no + directory_column: + dataset_id: + table_id: + column_name: + measurement_unit: + has_sensitive_data: no + is_in_staging: true + is_partition: false + - name: possui_tv + bigquery_type: string + description: Possui televisão? + temporal_coverage: + covered_by_dictionary: no + directory_column: + dataset_id: + table_id: + column_name: + measurement_unit: + has_sensitive_data: no + is_in_staging: true + is_partition: false + - name: renda_mensal_domiciliar + bigquery_type: float64 + description: Renda mensal domiciliar + temporal_coverage: + covered_by_dictionary: yes + directory_column: + dataset_id: + table_id: + column_name: + measurement_unit: + has_sensitive_data: no + is_in_staging: true + is_partition: false + - name: renda_mensal_domiciliar_compativel_1992 + bigquery_type: float64 + description: Renda mensal domiciliar compatível com 1992 + temporal_coverage: + covered_by_dictionary: yes + directory_column: + dataset_id: + table_id: + column_name: + measurement_unit: + has_sensitive_data: no + is_in_staging: true + is_partition: false + - name: peso_amostral + bigquery_type: float64 + description: Peso do domicílio + temporal_coverage: + covered_by_dictionary: yes + directory_column: + dataset_id: + table_id: + column_name: + measurement_unit: + has_sensitive_data: no + is_in_staging: true + is_partition: false + - name: valor_aluguel_pago + bigquery_type: float64 + description: Valor do aluguel + temporal_coverage: + covered_by_dictionary: yes + directory_column: + dataset_id: + table_id: + column_name: + measurement_unit: brl + has_sensitive_data: no + is_in_staging: true + is_partition: false + - name: prestacao + bigquery_type: float64 + description: Valor da prestação + temporal_coverage: + covered_by_dictionary: yes + directory_column: + dataset_id: + table_id: + column_name: + measurement_unit: brl + has_sensitive_data: no + is_in_staging: true + is_partition: false + - name: deflator + bigquery_type: int64 + description: Deflator (base outubro de 2012) + temporal_coverage: + covered_by_dictionary: yes + directory_column: + dataset_id: + table_id: + column_name: + measurement_unit: + has_sensitive_data: no + is_in_staging: true + is_partition: false + - name: conversor_moeda + bigquery_type: int64 + description: Conversor de Moeda + temporal_coverage: + covered_by_dictionary: yes + directory_column: + dataset_id: + table_id: + column_name: + measurement_unit: + has_sensitive_data: no + is_in_staging: true + is_partition: false + - name: renda_domicilio_deflacionada + bigquery_type: float64 + description: Renda do domicílio - Valor deflacionado + temporal_coverage: + covered_by_dictionary: yes + directory_column: + dataset_id: + table_id: + column_name: + measurement_unit: brl + has_sensitive_data: no + is_in_staging: true + is_partition: false + - name: renda_mensal_domiciliar_compativel_1992_deflacionada + bigquery_type: float64 + description: Renda mensal domiciliar compatível com 1992 - Valor deflacionado + temporal_coverage: + covered_by_dictionary: yes + directory_column: + dataset_id: + table_id: + column_name: + measurement_unit: brl + has_sensitive_data: no + is_in_staging: true + is_partition: false + - name: aluguel_deflacionado + bigquery_type: 
float64 + description: Aluguel - Valor deflacionado + temporal_coverage: + covered_by_dictionary: yes + directory_column: + dataset_id: + table_id: + column_name: + measurement_unit: brl + has_sensitive_data: no + is_in_staging: true + is_partition: false + - name: prestacao_deflacionado + bigquery_type: float64 + description: Prestação - Valor deflacionado + temporal_coverage: + covered_by_dictionary: yes + directory_column: + dataset_id: + table_id: + column_name: + measurement_unit: brl + has_sensitive_data: no + is_in_staging: true + is_partition: false + - name: numero_familia + bigquery_type: int64 + description: Número da família + temporal_coverage: + covered_by_dictionary: yes + directory_column: + dataset_id: + table_id: + column_name: + measurement_unit: + has_sensitive_data: no + is_in_staging: true + is_partition: false + - name: id_regiao_metropolitana + bigquery_type: string + description: ID da Região Metropolitana + temporal_coverage: + covered_by_dictionary: no + directory_column: + dataset_id: + table_id: + column_name: + measurement_unit: + has_sensitive_data: no + is_in_staging: true + is_partition: false + - name: sexo + bigquery_type: string + description: Sexo + temporal_coverage: + covered_by_dictionary: no + directory_column: + dataset_id: + table_id: + column_name: + measurement_unit: + has_sensitive_data: no + is_in_staging: true + is_partition: false + - name: condicao_domicilio + bigquery_type: string + description: Condição no domicílio + temporal_coverage: + covered_by_dictionary: no + directory_column: + dataset_id: + table_id: + column_name: + measurement_unit: + has_sensitive_data: no + is_in_staging: true + is_partition: false + - name: condicao_familia + bigquery_type: string + description: Condição na família + temporal_coverage: + covered_by_dictionary: no + directory_column: + dataset_id: + table_id: + column_name: + measurement_unit: + has_sensitive_data: no + is_in_staging: true + is_partition: false + - name: dia_nascimento + bigquery_type: int64 + description: Dia de nascimento + temporal_coverage: + covered_by_dictionary: no + directory_column: + dataset_id: br_bd_diretorios_data_tempo + table_id: dia + column_name: dia + measurement_unit: day + has_sensitive_data: no + is_in_staging: true + is_partition: false + - name: mes_nascimento + bigquery_type: int64 + description: Mês de nascimento + temporal_coverage: + covered_by_dictionary: no + directory_column: + dataset_id: br_bd_diretorios_data_tempo + table_id: mes + column_name: mes + measurement_unit: month + has_sensitive_data: no + is_in_staging: true + is_partition: false + - name: ano_nascimento + bigquery_type: int64 + description: Ano de nascimento + temporal_coverage: + covered_by_dictionary: no + directory_column: + dataset_id: br_bd_diretorios_data_tempo + table_id: ano + column_name: ano + measurement_unit: year + has_sensitive_data: no + is_in_staging: true + is_partition: false + - name: idade + bigquery_type: int64 + description: Idade + temporal_coverage: + covered_by_dictionary: yes + directory_column: + dataset_id: + table_id: + column_name: + measurement_unit: year + has_sensitive_data: no + is_in_staging: true + is_partition: false + - name: sabe_ler_escrever + bigquery_type: string + description: Sabe ler e escrever? 
+ temporal_coverage: + covered_by_dictionary: no + directory_column: + dataset_id: + table_id: + column_name: + measurement_unit: + has_sensitive_data: no + is_in_staging: true + is_partition: false + - name: serie_frequentada + bigquery_type: string + description: Série que frequenta + temporal_coverage: + covered_by_dictionary: no + directory_column: + dataset_id: + table_id: + column_name: + measurement_unit: + has_sensitive_data: no + is_in_staging: true + is_partition: false + - name: grau_frequentada + bigquery_type: string + description: Grau que frequenta + temporal_coverage: + covered_by_dictionary: no + directory_column: + dataset_id: + table_id: + column_name: + measurement_unit: + has_sensitive_data: no + is_in_staging: true + is_partition: false + - name: ultima_seria_frequentada + bigquery_type: int64 + description: Última série frequentada (para quem não frequenta escola) + temporal_coverage: + covered_by_dictionary: yes + directory_column: + dataset_id: + table_id: + column_name: + measurement_unit: + has_sensitive_data: no + is_in_staging: true + is_partition: false + - name: ultimo_grau_frequentado + bigquery_type: string + description: Último grau frequentado (para quem não frequenta escola) + temporal_coverage: + covered_by_dictionary: no + directory_column: + dataset_id: + table_id: + column_name: + measurement_unit: + has_sensitive_data: no + is_in_staging: true + is_partition: false + - name: tinha_outro_trabalho + bigquery_type: string + description: Tinha outro trabalho na semana de referência + temporal_coverage: + covered_by_dictionary: no + directory_column: + dataset_id: + table_id: + column_name: + measurement_unit: + has_sensitive_data: no + is_in_staging: true + is_partition: false + - name: ocupacao_semana + bigquery_type: int64 + description: Ocupação na semana + temporal_coverage: + covered_by_dictionary: yes + directory_column: + dataset_id: + table_id: + column_name: + measurement_unit: + has_sensitive_data: no + is_in_staging: true + is_partition: false + - name: atividade_ramo_negocio_semana + bigquery_type: int64 + description: Atividade ou ramo do négocio na semana + temporal_coverage: + covered_by_dictionary: yes + directory_column: + dataset_id: + table_id: + column_name: + measurement_unit: + has_sensitive_data: no + is_in_staging: true + is_partition: false + - name: possui_carteira_assinada + bigquery_type: string + description: Tem carteira de trabalho assinada + temporal_coverage: + covered_by_dictionary: no + directory_column: + dataset_id: + table_id: + column_name: + measurement_unit: + has_sensitive_data: no + is_in_staging: true + is_partition: false + - name: rendimento_mensal_dinheiro + bigquery_type: float64 + description: Rendimento mensal em dinheiro + temporal_coverage: + covered_by_dictionary: no + directory_column: + dataset_id: + table_id: + column_name: + measurement_unit: brl + has_sensitive_data: no + is_in_staging: true + is_partition: false + - name: rendimento_mensal_produto_mercadoria + bigquery_type: float64 + description: Rendimento mensal em produtos ou mercadorias + temporal_coverage: + covered_by_dictionary: no + directory_column: + dataset_id: + table_id: + column_name: + measurement_unit: brl + has_sensitive_data: no + is_in_staging: true + is_partition: false + - name: horas_trabalhadas_semana + bigquery_type: int64 + description: Horas normalmente trabalhadas na semana - Ocupação principal + temporal_coverage: + covered_by_dictionary: no + directory_column: + dataset_id: + table_id: + column_name: + 
measurement_unit: hour + has_sensitive_data: no + is_in_staging: true + is_partition: false + - name: rendimento_mensal_dinheiro_outra + bigquery_type: float64 + description: Rendimento em mensal dinheiro Outra + temporal_coverage: + covered_by_dictionary: yes + directory_column: + dataset_id: + table_id: + column_name: + measurement_unit: brl + has_sensitive_data: no + is_in_staging: true + is_partition: false + - name: rendimento_mensal_produto_outra + bigquery_type: float64 + description: Rendimento mensal em produtos ou mercadorias outras + temporal_coverage: + covered_by_dictionary: yes + directory_column: + dataset_id: + table_id: + column_name: + measurement_unit: brl + has_sensitive_data: no + is_in_staging: true + is_partition: false + - name: horas_trabalhadas_outros_trabalhos + bigquery_type: int64 + description: Horas normalmente trabalhadas na semana - Outros trabalhos + temporal_coverage: + covered_by_dictionary: no + directory_column: + dataset_id: + table_id: + column_name: + measurement_unit: hour + has_sensitive_data: no + is_in_staging: true + is_partition: false + - name: contribui_previdencia + bigquery_type: string + description: Contribui para instituto de previdência + temporal_coverage: + covered_by_dictionary: no + directory_column: + dataset_id: + table_id: + column_name: + measurement_unit: + has_sensitive_data: no + is_in_staging: true + is_partition: false + - name: tipo_instituto_previdencia + bigquery_type: string + description: Tipo de instituto de previdência + temporal_coverage: + covered_by_dictionary: no + directory_column: + dataset_id: + table_id: + column_name: + measurement_unit: + has_sensitive_data: no + is_in_staging: true + is_partition: false + - name: tomou_providencia_conseguir_trabalho_semana + bigquery_type: + description: Tomou providência para conseguir trabalho na semana + temporal_coverage: + covered_by_dictionary: + directory_column: + dataset_id: + table_id: + column_name: + measurement_unit: + has_sensitive_data: no + is_in_staging: true + is_partition: false + - name: tomou_providencia_ultimos_2_meses + bigquery_type: + description: Tomou providência nos últimos 2 meses para conseguir trabalho + temporal_coverage: + covered_by_dictionary: + directory_column: + dataset_id: + table_id: + column_name: + measurement_unit: + has_sensitive_data: no + is_in_staging: true + is_partition: false + - name: qual_medida + bigquery_type: string + description: O que foi tomado de medida para encontrar um emprego + temporal_coverage: + covered_by_dictionary: no + directory_column: + dataset_id: + table_id: + column_name: + measurement_unit: + has_sensitive_data: no + is_in_staging: true + is_partition: false + - name: tinha_carteira_assinada_ultimo_emprego + bigquery_type: + description: Tinha carteira assinada no último emprego que teve no ano + temporal_coverage: + covered_by_dictionary: + directory_column: + dataset_id: + table_id: + column_name: + measurement_unit: + has_sensitive_data: no + is_in_staging: true + is_partition: false + - name: valor_aposentadoria + bigquery_type: float64 + description: Valor da Aposentadoria + temporal_coverage: + covered_by_dictionary: yes + directory_column: + dataset_id: + table_id: + column_name: + measurement_unit: brl + has_sensitive_data: no + is_in_staging: true + is_partition: false + - name: valor_pensao + bigquery_type: float64 + description: Valor da Pensão + temporal_coverage: + covered_by_dictionary: yes + directory_column: + dataset_id: + table_id: + column_name: + measurement_unit: brl + 
has_sensitive_data: no + is_in_staging: true + is_partition: false + - name: valor_abono_permanente + bigquery_type: float64 + description: Valor do bônus do salário permanente + temporal_coverage: + covered_by_dictionary: yes + directory_column: + dataset_id: + table_id: + column_name: + measurement_unit: brl + has_sensitive_data: no + is_in_staging: true + is_partition: false + - name: valor_aluguel + bigquery_type: float64 + description: Valor do Aluguel recebido + temporal_coverage: + covered_by_dictionary: yes + directory_column: + dataset_id: + table_id: + column_name: + measurement_unit: brl + has_sensitive_data: no + is_in_staging: true + is_partition: false + - name: renda_outras + bigquery_type: float64 + description: Valor de outras rendas + temporal_coverage: + covered_by_dictionary: yes + directory_column: + dataset_id: + table_id: + column_name: + measurement_unit: brl + has_sensitive_data: no + is_in_staging: true + is_partition: false + - name: rendimento_mensal_ocupacao_principal + bigquery_type: float64 + description: Rendimento mensal ocupação principal + temporal_coverage: + covered_by_dictionary: yes + directory_column: + dataset_id: + table_id: + column_name: + measurement_unit: brl + has_sensitive_data: no + is_in_staging: true + is_partition: false + - name: rendimento_mensal_todos_trabalhos + bigquery_type: float64 + description: Rendimento mensal todos trabalhos + temporal_coverage: + covered_by_dictionary: yes + directory_column: + dataset_id: + table_id: + column_name: + measurement_unit: brl + has_sensitive_data: no + is_in_staging: true + is_partition: false + - name: rendimento_mensal_todas_fontes + bigquery_type: float64 + description: Rendimento mensal todas fontes + temporal_coverage: + covered_by_dictionary: yes + directory_column: + dataset_id: + table_id: + column_name: + measurement_unit: brl + has_sensitive_data: no + is_in_staging: true + is_partition: false + - name: atividade_ramo_negocio_agregado + bigquery_type: string + description: Atividade ou ramo do négocio na semana - Agregado + temporal_coverage: + covered_by_dictionary: no + directory_column: + dataset_id: + table_id: + column_name: + measurement_unit: + has_sensitive_data: no + is_in_staging: true + is_partition: false + - name: horas_trabalhadas_todos_trabalhos + bigquery_type: int64 + description: Horas normalmente trabalhadas na semana em todos os trabalhos + temporal_coverage: + covered_by_dictionary: no + directory_column: + dataset_id: + table_id: + column_name: + measurement_unit: hour + has_sensitive_data: no + is_in_staging: true + is_partition: false + - name: posicao_ocupacao + bigquery_type: + description: Posição de ocupação na semana + temporal_coverage: + covered_by_dictionary: + directory_column: + dataset_id: + table_id: + column_name: + measurement_unit: + has_sensitive_data: no + is_in_staging: true + is_partition: false + - name: grupos_ocupacao + bigquery_type: string + description: Grupos de ocupação na semana + temporal_coverage: + covered_by_dictionary: no + directory_column: + dataset_id: + table_id: + column_name: + measurement_unit: + has_sensitive_data: no + is_in_staging: true + is_partition: false + - name: numero_membros_familia + bigquery_type: + description: Número de membros da família + temporal_coverage: + covered_by_dictionary: + directory_column: + dataset_id: + table_id: + column_name: + measurement_unit: person + has_sensitive_data: no + is_in_staging: true + is_partition: false + - name: rendimento_mensal_familia + bigquery_type: + description: 
Rendimento mensal da família + temporal_coverage: + covered_by_dictionary: + directory_column: + dataset_id: + table_id: + column_name: + measurement_unit: brl + has_sensitive_data: no + is_in_staging: true + is_partition: false + - name: ordem + bigquery_type: int64 + description: Número de ordem da pessoa + temporal_coverage: + covered_by_dictionary: no + directory_column: + dataset_id: + table_id: + column_name: + measurement_unit: + has_sensitive_data: no + is_in_staging: true + is_partition: false + - name: raca_cor + bigquery_type: string + description: Raça ou Cor (autodeclaração) + temporal_coverage: + covered_by_dictionary: no + directory_column: + dataset_id: + table_id: + column_name: + measurement_unit: + has_sensitive_data: no + is_in_staging: true + is_partition: false + - name: anos_estudo + bigquery_type: int64 + description: Anos de estudo + temporal_coverage: + covered_by_dictionary: yes + directory_column: + dataset_id: + table_id: + column_name: + measurement_unit: year + has_sensitive_data: no + is_in_staging: true + is_partition: false + - name: frequenta_escola + bigquery_type: string + description: Frequenta a escola? + temporal_coverage: + covered_by_dictionary: no + directory_column: + dataset_id: + table_id: + column_name: + measurement_unit: + has_sensitive_data: no + is_in_staging: true + is_partition: false + - name: trabalhou_semana + bigquery_type: string + description: Trabalhou na semana + temporal_coverage: + covered_by_dictionary: no + directory_column: + dataset_id: + table_id: + column_name: + measurement_unit: + has_sensitive_data: no + is_in_staging: true + is_partition: false + - name: tinha_trabalhado_semana + bigquery_type: string + description: Tinha trabalhado na semana? + temporal_coverage: + covered_by_dictionary: no + directory_column: + dataset_id: + table_id: + column_name: + measurement_unit: + has_sensitive_data: no + is_in_staging: true + is_partition: false + - name: ocupacao_ano_anterior + bigquery_type: int64 + description: Ocupação no ano anterior + temporal_coverage: + covered_by_dictionary: no + directory_column: + dataset_id: + table_id: + column_name: + measurement_unit: + has_sensitive_data: no + is_in_staging: true + is_partition: false + - name: atividade_ramo_negocio_anterior + bigquery_type: int64 + description: Atividade ou ramo do negócio anterior + temporal_coverage: + covered_by_dictionary: no + directory_column: + dataset_id: + table_id: + column_name: + measurement_unit: + has_sensitive_data: no + is_in_staging: true + is_partition: false + - name: rendimento_mensal_dinheiro_deflacionado + bigquery_type: float64 + description: Rendimento mensal em dinheiro - Valor deflacionado + temporal_coverage: + covered_by_dictionary: no + directory_column: + dataset_id: + table_id: + column_name: + measurement_unit: brl + has_sensitive_data: no + is_in_staging: true + is_partition: false + - name: rendimento_mensal_produto_mercadoria_deflacionado + bigquery_type: float64 + description: Rendimento mensal em produto ou mercadoria - Valor deflacionado + temporal_coverage: + covered_by_dictionary: no + directory_column: + dataset_id: + table_id: + column_name: + measurement_unit: brl + has_sensitive_data: no + is_in_staging: true + is_partition: false + - name: rendimento_mensal_dinheiro_outra_deflacionado + bigquery_type: float64 + description: Rendimento mensal em dinheiro outra - Valor deflacionado + temporal_coverage: + covered_by_dictionary: no + directory_column: + dataset_id: + table_id: + column_name: + measurement_unit: brl + 
has_sensitive_data: no + is_in_staging: true + is_partition: false + - name: rendimento_mensal_produto_mercadoria_outra_deflacionado + bigquery_type: + description: Rendimento mensal em produto ou mercadoria outra - Valor deflacionado + temporal_coverage: + covered_by_dictionary: + directory_column: + dataset_id: + table_id: + column_name: + measurement_unit: brl + has_sensitive_data: no + is_in_staging: true + is_partition: false + - name: rendimento_mensal_ocupacao_principal_deflacionado + bigquery_type: float64 + description: Rendimento mensal da ocupação principal - Valor deflacionado + temporal_coverage: + covered_by_dictionary: no + directory_column: + dataset_id: + table_id: + column_name: + measurement_unit: brl + has_sensitive_data: no + is_in_staging: true + is_partition: false + - name: rendimento_mensal_todos_trabalhos_deflacionado + bigquery_type: float64 + description: Rendimento mensal de todos os trabalhos - Valor deflacionado + temporal_coverage: + covered_by_dictionary: no + directory_column: + dataset_id: + table_id: + column_name: + measurement_unit: brl + has_sensitive_data: no + is_in_staging: true + is_partition: false + - name: rendimento_mensal_todas_fontes_deflacionado + bigquery_type: float64 + description: Rendimento mensal de todas as fontes - Valor deflacionado + temporal_coverage: + covered_by_dictionary: no + directory_column: + dataset_id: + table_id: + column_name: + measurement_unit: brl + has_sensitive_data: no + is_in_staging: true + is_partition: false + - name: rendimento_mensal_familia_deflacionado + bigquery_type: float64 + description: Rendimento mensal da família - Valor deflacionado + temporal_coverage: + covered_by_dictionary: no + directory_column: + dataset_id: + table_id: + column_name: + measurement_unit: brl + has_sensitive_data: no + is_in_staging: true + is_partition: false + - name: valor_aposentadoria_deflacionado + bigquery_type: float64 + description: Valor da aposentadoria - Valor deflacionado + temporal_coverage: + covered_by_dictionary: no + directory_column: + dataset_id: + table_id: + column_name: + measurement_unit: brl + has_sensitive_data: no + is_in_staging: true + is_partition: false + - name: valor_pensao_deflacionado + bigquery_type: float64 + description: Valor da Pensão - Valor deflacionado + temporal_coverage: + covered_by_dictionary: no + directory_column: + dataset_id: + table_id: + column_name: + measurement_unit: brl + has_sensitive_data: no + is_in_staging: true + is_partition: false + - name: valor_abono_deflacionado + bigquery_type: float64 + description: Valor do abono - Valor deflacionado + temporal_coverage: + covered_by_dictionary: no + directory_column: + dataset_id: + table_id: + column_name: + measurement_unit: brl + has_sensitive_data: no + is_in_staging: true + is_partition: false + - name: valor_aluguel_deflacionado + bigquery_type: float64 + description: Valor do Aluguel - Valor deflacionado + temporal_coverage: + covered_by_dictionary: no + directory_column: + dataset_id: + table_id: + column_name: + measurement_unit: brl + has_sensitive_data: no + is_in_staging: true + is_partition: false + - name: rendas_outras_deflacionado + bigquery_type: float64 + description: Rendas Outras - Valor deflacionado + temporal_coverage: + covered_by_dictionary: no + directory_column: + dataset_id: + table_id: + column_name: + measurement_unit: brl + has_sensitive_data: no + is_in_staging: true + is_partition: false + +metadata_modified: '2021-12-31T16:29:47.374174' diff --git 
a/bases/br_ibge_pnad/microdados_compatibilizado_datazoom/table_description.txt b/bases/br_ibge_pnad/microdados_compatibilizado_datazoom/table_description.txt new file mode 100644 index 000000000..d9af61f0a --- /dev/null +++ b/bases/br_ibge_pnad/microdados_compatibilizado_datazoom/table_description.txt @@ -0,0 +1,88 @@ +Microdados da Pesquisa Nacional por Amostra de Domicílios - PNAD. + +Para saber mais acesse: +Website: +Github: + +Ajude a manter o projeto :) +Apoia-se: https://apoia.se/basedosdados + +Publicado por +------------- +Nome: Crislane Alves +Código: +Email: crislanealves@basedosdados.org +Tratado por +----------- +Nome: Crislane Alves +Email: crislanealves@basedosdados.org + + + +Partições (Filtre a tabela por essas colunas para economizar dinheiro e tempo) +--------- +- ano +- sigla_uf + + +Colunas identificando linhas unicamente +------------------- + + + + +Cobertura Temporal +------------------ +- 1981 +- 1982 +- 1983 +- 1984 +- 1985 +- 1986 +- 1987 +- 1988 +- 1989 +- 1990 +- 1992 +- 1993 +- 1995 +- 1996 +- 1997 +- 1998 +- 1999 +- 2001 +- 2002 +- 2003 +- 2004 +- 2005 +- 2006 +- 2007 +- 2008 +- 2009 +- 2011 +- 2012 +- 2013 +- 2014 +- 2015 + + + + +Cobertura Espacial +------------------ +- bra + + + + +Tratamento +---------- +Tratamento e compatibilização feitos pelo pacote "datazoom_pnad" no Stata. E, posteriormente, adicionado ao padrão da Base dos Dados. + + + +Frequência de Atualização +------------------------- +one_year diff --git a/bases/br_ibge_pnadc/ano_brasil_grupo_idade/table_config.yaml b/bases/br_ibge_pnadc/ano_brasil_grupo_idade/table_config.yaml index 47c687fad..670a5b2ef 100644 --- a/bases/br_ibge_pnadc/ano_brasil_grupo_idade/table_config.yaml +++ b/bases/br_ibge_pnadc/ano_brasil_grupo_idade/table_config.yaml @@ -166,4 +166,4 @@ columns: is_in_staging: true is_partition: false -metadata_modified: '2021-09-12T23:21:06.211459' +metadata_modified: '2021-12-29T19:45:27.746244' diff --git a/bases/br_ibge_pnadc/ano_brasil_raca_cor/table_config.yaml b/bases/br_ibge_pnadc/ano_brasil_raca_cor/table_config.yaml index 31c8eaecf..aa27c8276 100644 --- a/bases/br_ibge_pnadc/ano_brasil_raca_cor/table_config.yaml +++ b/bases/br_ibge_pnadc/ano_brasil_raca_cor/table_config.yaml @@ -166,4 +166,4 @@ columns: is_in_staging: true is_partition: false -metadata_modified: '2021-09-12T23:21:06.211459' +metadata_modified: '2021-12-29T11:54:29.822034' diff --git a/bases/br_ibge_pnadc/ano_municipio_grupo_idade/table_config.yaml b/bases/br_ibge_pnadc/ano_municipio_grupo_idade/table_config.yaml index e99d49f3b..9dbce3235 100644 --- a/bases/br_ibge_pnadc/ano_municipio_grupo_idade/table_config.yaml +++ b/bases/br_ibge_pnadc/ano_municipio_grupo_idade/table_config.yaml @@ -181,4 +181,4 @@ columns: is_partition: false -metadata_modified: '2021-09-12T23:21:06.211459' +metadata_modified: '2021-12-29T19:45:27.746244' diff --git a/bases/br_ibge_pnadc/ano_municipio_raca_cor/table_config.yaml b/bases/br_ibge_pnadc/ano_municipio_raca_cor/table_config.yaml index 21b4358b0..51f4d4813 100644 --- a/bases/br_ibge_pnadc/ano_municipio_raca_cor/table_config.yaml +++ b/bases/br_ibge_pnadc/ano_municipio_raca_cor/table_config.yaml @@ -181,4 +181,4 @@ columns: is_in_staging: true is_partition: false -metadata_modified: '2021-09-12T23:21:06.211459' +metadata_modified: '2021-12-29T19:24:45.262102' diff --git a/bases/br_ibge_pnadc/ano_regiao_grupo_idade/table_config.yaml b/bases/br_ibge_pnadc/ano_regiao_grupo_idade/table_config.yaml index ee0235af2..2a276377c 100644 ---
a/bases/br_ibge_pnadc/ano_regiao_grupo_idade/table_config.yaml +++ b/bases/br_ibge_pnadc/ano_regiao_grupo_idade/table_config.yaml @@ -180,4 +180,4 @@ columns: is_in_staging: true is_partition: false -metadata_modified: '2021-09-12T23:21:06.211459' +metadata_modified: '2021-12-29T19:45:27.746244' diff --git a/bases/br_ibge_pnadc/ano_regiao_metropolitana_grupo_idade/table_config.yaml b/bases/br_ibge_pnadc/ano_regiao_metropolitana_grupo_idade/table_config.yaml index 017fbcfac..027815bf7 100644 --- a/bases/br_ibge_pnadc/ano_regiao_metropolitana_grupo_idade/table_config.yaml +++ b/bases/br_ibge_pnadc/ano_regiao_metropolitana_grupo_idade/table_config.yaml @@ -180,4 +180,4 @@ columns: is_in_staging: true is_partition: false -metadata_modified: '2021-09-12T23:21:06.211459' +metadata_modified: '2021-12-29T19:45:27.746244' diff --git a/bases/br_ibge_pnadc/ano_regiao_metropolitana_raca_cor/table_config.yaml b/bases/br_ibge_pnadc/ano_regiao_metropolitana_raca_cor/table_config.yaml index f077ad642..8b8a5ea90 100644 --- a/bases/br_ibge_pnadc/ano_regiao_metropolitana_raca_cor/table_config.yaml +++ b/bases/br_ibge_pnadc/ano_regiao_metropolitana_raca_cor/table_config.yaml @@ -180,4 +180,4 @@ columns: is_in_staging: true is_partition: false -metadata_modified: '2021-09-12T23:21:06.211459' +metadata_modified: '2021-12-29T19:30:23.805564' diff --git a/bases/br_ibge_pnadc/ano_regiao_raca_cor/table_config.yaml b/bases/br_ibge_pnadc/ano_regiao_raca_cor/table_config.yaml index 005713089..83cd4e995 100644 --- a/bases/br_ibge_pnadc/ano_regiao_raca_cor/table_config.yaml +++ b/bases/br_ibge_pnadc/ano_regiao_raca_cor/table_config.yaml @@ -180,4 +180,4 @@ columns: is_in_staging: true is_partition: false -metadata_modified: '2021-09-12T23:21:06.211459' +metadata_modified: '2021-12-29T19:30:23.805564' diff --git a/bases/br_ibge_pnadc/ano_uf_grupo_idade/table_config.yaml b/bases/br_ibge_pnadc/ano_uf_grupo_idade/table_config.yaml index 65b02921c..c8fdb1c0d 100644 --- a/bases/br_ibge_pnadc/ano_uf_grupo_idade/table_config.yaml +++ b/bases/br_ibge_pnadc/ano_uf_grupo_idade/table_config.yaml @@ -179,4 +179,4 @@ columns: has_sensitive_data: no is_in_staging: true is_partition: false -metadata_modified: '2021-09-12T23:21:06.211459' +metadata_modified: '2021-12-29T19:45:27.746244' diff --git a/bases/br_ibge_pnadc/ano_uf_raca_cor/table_config.yaml b/bases/br_ibge_pnadc/ano_uf_raca_cor/table_config.yaml index dbe7774ae..2a760c04c 100644 --- a/bases/br_ibge_pnadc/ano_uf_raca_cor/table_config.yaml +++ b/bases/br_ibge_pnadc/ano_uf_raca_cor/table_config.yaml @@ -180,4 +180,4 @@ columns: is_in_staging: true is_partition: false -metadata_modified: '2021-09-12T23:21:06.211459' +metadata_modified: '2021-12-29T19:30:23.805564' diff --git a/bases/br_inep_saeb/README.md b/bases/br_inep_saeb/README.md new file mode 100644 index 000000000..cdca2b660 --- /dev/null +++ b/bases/br_inep_saeb/README.md @@ -0,0 +1,5 @@ +Dados públicos do Sistema de Avaliação da Educação Básica (Saeb) do INEP. + +Dados originais em https://www.gov.br/inep/pt-br/areas-de-atuacao/avaliacao-e-exames-educacionais/saeb/resultados. + +Todo o código usado na captura e limpeza está em `code/` \ No newline at end of file diff --git a/bases/br_inep_saeb/dataset_config.yml b/bases/br_inep_saeb/dataset_config.yml new file mode 100644 index 000000000..be6d00dab --- /dev/null +++ b/bases/br_inep_saeb/dataset_config.yml @@ -0,0 +1,30 @@ + +# Qual organização disponibiliza os dados originais? 
+# Opções: escolher dessa lista -> https://basedosdados.org/api/3/action/organization_list +organization: + - br-inep + +dataset_id: br_inep_saeb + +title: Sistema de Avaliação da Educação Básica (Saeb) + +# exemplo: descrição e anotações úteis sobre os dados. +description: Dados públicos do Sistema de Avaliação da Educação Básica (Saeb) do INEP. Dados originais em https://www.gov.br/inep/pt-br/areas-de-atuacao/avaliacao-e-exames-educacionais/saeb/resultados. + +# Quais grupos caracterizam a base? +# Opções: escolher dessa lista -> https://basedosdados.org/api/3/action/group_list +groups: [educacao] + +# Quais etiquetas caracterizam a base? +# Opções: escolher dessa lista -> https://basedosdados.org/api/3/action/tag_list +# Caso crie etiquetas novas, as regras são: +# - letras minúsculas +# - sem acentos +# - não repita nomes de grupos (ex. educacao, saude, meio ambiente, economia, etc.) +tags: ['alfabetizacao', 'ensino fundamental'] + +ckan_url: + +github_url: https://github.com/basedosdados/mais/tree/master/bases/br_inep_saeb + +metadata_modified: \ No newline at end of file diff --git a/bases/br_inep_saeb/dicionario/table_config.yaml b/bases/br_inep_saeb/dicionario/table_config.yaml index c11b8ab55..eabc66916 100644 --- a/bases/br_inep_saeb/dicionario/table_config.yaml +++ b/bases/br_inep_saeb/dicionario/table_config.yaml @@ -34,7 +34,7 @@ time_unit: # Exemplos: id_municipio, ano. # Pode ser vazio pois certas tabelas não possuem identificadores. identifying_columns: - - table_id + - id_tabela - coluna - chave - cobertura_temporal diff --git a/bases/br_inep_saeb/dicionario/table_description.txt b/bases/br_inep_saeb/dicionario/table_description.txt index 02b1d182b..88a2b7950 100644 --- a/bases/br_inep_saeb/dicionario/table_description.txt +++ b/bases/br_inep_saeb/dicionario/table_description.txt @@ -23,7 +23,7 @@ Email: lucas.moreira@basedosdados.org Colunas identificando linhas unicamente ------------------- -- table_id +- id_tabela - coluna - chave - cobertura_temporal diff --git a/bases/br_ms_vacinacao_covid19/microdados/table_config.yaml b/bases/br_ms_vacinacao_covid19/microdados/table_config.yaml index 381b910b5..081d9f64d 100644 --- a/bases/br_ms_vacinacao_covid19/microdados/table_config.yaml +++ b/bases/br_ms_vacinacao_covid19/microdados/table_config.yaml @@ -1,4 +1,3 @@ - dataset_id: br_ms_vacinacao_covid19 table_id: microdados @@ -40,7 +39,7 @@ identifying_columns: - id_documento last_updated: - metadata: + metadata: '2022-01-15' data: release: diff --git a/bases/br_ms_vacinacao_covid19/microdados_estabelecimento/table_config.yaml b/bases/br_ms_vacinacao_covid19/microdados_estabelecimento/table_config.yaml index 42c20c592..9fd9cd7ef 100644 --- a/bases/br_ms_vacinacao_covid19/microdados_estabelecimento/table_config.yaml +++ b/bases/br_ms_vacinacao_covid19/microdados_estabelecimento/table_config.yaml @@ -1,4 +1,3 @@ - dataset_id: br_ms_vacinacao_covid19 table_id: microdados_estabelecimento @@ -39,7 +38,7 @@ identifying_columns: - id_estabelecimento last_updated: - metadata: + metadata: '2022-01-15' data: release: diff --git a/bases/br_ms_vacinacao_covid19/microdados_paciente/table_config.yaml b/bases/br_ms_vacinacao_covid19/microdados_paciente/table_config.yaml index 377cf8050..586b59272 100644 --- a/bases/br_ms_vacinacao_covid19/microdados_paciente/table_config.yaml +++
b/bases/br_ms_vacinacao_covid19/microdados_paciente/table_config.yaml
@@ -1,4 +1,3 @@
-
 dataset_id: br_ms_vacinacao_covid19
 
 table_id: microdados_paciente
@@ -38,7 +37,7 @@ identifying_columns:
   - id_paciente
 
 last_updated:
-  metadata:
+  metadata: '2022-01-15'
   data:
   release:
diff --git a/bases/br_ms_vacinacao_covid19/microdados_vacinacao/table_config.yaml b/bases/br_ms_vacinacao_covid19/microdados_vacinacao/table_config.yaml
index ddaef3e0f..426208657 100644
--- a/bases/br_ms_vacinacao_covid19/microdados_vacinacao/table_config.yaml
+++ b/bases/br_ms_vacinacao_covid19/microdados_vacinacao/table_config.yaml
@@ -1,4 +1,3 @@
-
 dataset_id: br_ms_vacinacao_covid19
 
 table_id: microdados_vacinacao
@@ -38,7 +37,7 @@ identifying_columns:
   - id_documento
 
 last_updated:
-  metadata:
+  metadata: '2022-01-15'
   data:
   release:
diff --git a/bases/br_ons_energia_armazenada/README.md b/bases/br_ons_energia_armazenada/README.md
index 53cc7096c..c814348a2 100644
--- a/bases/br_ons_energia_armazenada/README.md
+++ b/bases/br_ons_energia_armazenada/README.md
@@ -1,7 +1,7 @@
 Como capturar os dados de br_ons_energia_armazenada?
 
-1. Para capturar esses dados, basta verificar o link dos dados originais indicado em `dataset_config.yaml` no item `website`.
+Para capturar esses dados, basta verificar o link dos dados originais indicado em dataset_config.yaml no item website.
 
-2. Caso tenha sido utilizado algum código de captura ou tratamento, estes estarão contidos em `code/`. Se o dado publicado for em sua versão bruta, não existirá a pasta `code/`.
+Caso tenha sido utilizado algum código de captura ou tratamento, estes estarão contidos em code/. Se o dado publicado for em sua versão bruta, não existirá a pasta code/.
 
-Os dados publicados estão disponíveis em: https://basedosdados.org/dataset/br-ons-energia-armazenada
\ No newline at end of file
+Os dados publicados estão disponíveis em: https://basedosdados.org/dataset/br-ons-energia-armazenada
\ No newline at end of file
diff --git a/bases/br_ons_energia_armazenada/code/br_ons_energia_armazenada.R b/bases/br_ons_energia_armazenada/code/br_ons_energia_armazenada.R
index 1bdefe21d..d28083e66 100644
--- a/bases/br_ons_energia_armazenada/code/br_ons_energia_armazenada.R
+++ b/bases/br_ons_energia_armazenada/code/br_ons_energia_armazenada.R
@@ -1,69 +1,35 @@
 rm(list = ls())
+setwd("~/basedosdados/br_ons_energia_armazenada/output")
 
-#### Pacotes + Diretorios ####
+# Instala pacote 'reservatoriosBR'
+devtools::install_github('brunomioto/reservatoriosBR')
 
 # Leitura
+library(reservatoriosBR)
 library(dplyr)
 library(tidyverse)
-library(fs)
 
-# Definir e criar pasta diretorios
-setwd('/content/br_ons_subsistemas')
-fs::dir_create(c('input', 'output', 'code'))
 
-#### Tratamento ####
+# Baixar dados
 
 # Lista
-list_rename = c('data' = 'data_medicao',
+list_rename = c('data' = 'data',
                 'energia_armazenada_maxima' = 'ear_max_subsistema_mwmes',
                 'energia_armazenada_verificada' = 'ear_verif_subsistema_mwmes',
                 'proporcao_energia_armazenada_verificada' = 'ear_verif_subsistema_percentual',
                 'subsistema' = 'subsistema')
 
-ons_subsistema <- function(ano_inicial=2000, ano_final=format(Sys.Date(), "%Y")){
-
-  if(ano_inicial < 2000|
-     ano_final > format(Sys.Date(), "%Y")){
-    message("Escolha um ano inicial igual ou maior que 2000 e um ano final igual ou menor que ", format(Sys.Date(), "%Y"))
-  }else{
-
-    anos <- seq(ano_inicial,ano_final)
-
-    message("Buscando dados diários de ", ano_inicial, " até ", ano_final, "...")
-
-    historico <- list()
-
-    for (i in anos) {
-      dados_ons <-
utils::read.csv(glue::glue("https://ons-dl-prod-opendata.s3.amazonaws.com/dataset/ear_subsistema_di/EAR_DIARIO_SUBSISTEMA_{i}.csv"),sep = ";") - historico[[i]] <- dados_ons - } - - historico_ear <- do.call(rbind, historico) - - message("Organizando os dados...") - - historico_ear_clean <- historico_ear %>% - dplyr::mutate(subsistema = dplyr::recode(id_subsistema, - N = "Norte", - NE = "Nordeste", - SE = "Sudeste / Centro-Oeste", - S = "Sul")) %>% - dplyr::select(-id_subsistema, -nom_subsistema) %>% - dplyr::rename(data_medicao = ear_data, - ear_max_subsistema_mwmes = ear_max_subsistema) %>% - dplyr::select(data_medicao, subsistema, ear_max_subsistema_mwmes, ear_verif_subsistema_mwmes, ear_verif_subsistema_percentual) - - historico_ear_clean$data_medicao <- as.Date(historico_ear_clean$data_medicao) - - return(historico_ear_clean) - - } -} -#### Limpa e salva dados #### +#Limpa e salva dados -df = ons_subsistema(ano_inicial= 2000, ano_final= 2021) %>% +df = ONS_EAR_subsistemas(2000) %>% as_tibble()%>% - rename(list_rename)%>% - readr::write_csv(file = "output/microdados.csv") + rename(list_rename) + +write.csv( + df, + "~/output/microdados.csv", + na = " ", + row.names = F, + fileEncoding = "UTF-8" +) diff --git a/bases/br_ons_energia_armazenada/dataset_config.yaml b/bases/br_ons_energia_armazenada/dataset_config.yaml index 9d704da64..b0d04207d 100644 --- a/bases/br_ons_energia_armazenada/dataset_config.yaml +++ b/bases/br_ons_energia_armazenada/dataset_config.yaml @@ -1,74 +1,36 @@ -dataset_id: br_ons_energia_armazenada # AUTO GENERATED -url_ckan: https://basedosdados.org/dataset/br-ons-energia-armazenada -url_github: https://github.com/basedosdados/mais/tree/master/bases/br_ons_energia_armazenada - -# Descreva a base. -# Ela é sobre o que? -# Quais as principais fontes de dados? -# Há links para FAQs e explicações? -description: | # REQUIRED - Dados das grandezas de energia armazenada (EAR) em periodicidade diária. A Energia Armazenada (EAR) representa a energia associada ao volume de água disponível nos reservatórios que pode ser convertido em geração na própria usina e em todas as usinas à jusante na cascata. A grandeza de EAR leva em conta nível verificado nos reservatórios na data de referência. - # Qual organização disponibiliza os dados originais? -# Opções: escolher dessa lista -> https://basedosdados.org/api/3/action/organization_list -organization: operador-nacional-do-sistema-eletrico # REQUIRED +# Opções: escolher dessa lista -> https://basedosdados.org/api/3/action/organization_list= +organization: br-ons -# Qual departamento/grupo/pessoa mantém os dados originais? -author: - name: Operador Nacional do Sistema Elétrico - email: relacionamento.agentes@ons.org.br +dataset_id: br_ons_energia_armazenada -# Onde encontrar os dados originais e mais informações? -website: - - https://dados.ons.org.br/dataset/ear-diario-por-subsistema - - https://github.com/brunomioto/reservatoriosBR +title: Reservatório de Energia Armazenada + +# exemplo: descrição e anotações úteis sobre os dados. +description: + Dados das grandezas de energia armazenada (EAR) em periodicidade diária. A Energia Armazenada (EAR) representa a energia associada ao volume de água disponível nos reservatórios que pode ser convertido em geração na própria usina e em todas as usinas à jusante na cascata. A grandeza de EAR leva em conta nível verificado nos reservatórios na data de referência. # Quais grupos caracterizam a base? 
# Opções: escolher dessa lista -> https://basedosdados.org/api/3/action/group_list groups: - - energia - - meio-ambiente + - energia + - meio-ambiente # Quais etiquetas caracterizam a base? # Opções: escolher dessa lista -> https://basedosdados.org/api/3/action/tag_list # Caso crie etiquetas novas, as regras são: -# - letras minúsculas -# - sem acentos -# - não repita nomes de grupos (ex. educacao, saude, meio ambiente, economia, etc.) +# - letras minúsculas +# - sem acentos +# - não repita nomes de grupos (ex. educacao, saude, meio ambiente, economia, etc.) tags: - agua - apagao - chuva - usinas -# Em quais línguas a base (ou a fonte original) está disponível? -# Regras: minúsculo, sem acentos. -# Opções: portugues, ingles, espanhol, frances, chines, russo, hindi, alemao, etc. -languages: - - portugues - -# Os dados originais estão disponíveis de graça? -free: sim - -# Are microdata available for download? -microdata: sim - -# Existe uma API na fonte original? -API: não - -# É necessário registrar um usuário para baixar os dados originais? -registration: não - -# Como os dados originais estão disponibilizados? -availability: online +ckan_url: -# A fonte original requer IP brasileiro para acesso? -brazilian_IP: não +github_url: -# Essa base está sob qual licença? -# A licença MIT se aplica a bases públicas. -# Caso não seja pública, ver opções aqui: https://help.data.world/hc/en-us/articles/115006114287-Common-license-types-for-datasets -license: - name: MIT # REQUIRED - url: \ No newline at end of file +metadata_modified: diff --git a/bases/br_ons_energia_armazenada/subsistemas/publish.sql b/bases/br_ons_energia_armazenada/subsistemas/publish.sql index ca20e4b1a..6fb769996 100644 --- a/bases/br_ons_energia_armazenada/subsistemas/publish.sql +++ b/bases/br_ons_energia_armazenada/subsistemas/publish.sql @@ -1,5 +1,4 @@ /* - Query para publicar a tabela. Esse é o lugar para: @@ -16,7 +15,6 @@ TIPOS: - Para modificar tipos de colunas, basta substituir STRING por outro tipo válido. 
- Exemplo: `SAFE_CAST(column_name AS NUMERIC) column_name` - Mais detalhes: https://cloud.google.com/bigquery/docs/reference/standard-sql/data-types - */ CREATE VIEW basedosdados-dev.br_ons_energia_armazenada.subsistemas AS @@ -26,4 +24,4 @@ SAFE_CAST(subsistema AS STRING) subsistema, SAFE_CAST(energia_armazenada_maxima AS FLOAT64) energia_armazenada_maxima, SAFE_CAST(energia_armazenada_verificada AS FLOAT64) energia_armazenada_verificada, SAFE_CAST(proporcao_energia_armazenada_verificada AS FLOAT64) proporcao_energia_armazenada_verificada -from basedosdados-dev.br_ons_energia_armazenada_staging.subsistemas as t \ No newline at end of file +FROM basedosdados-dev.br_ons_energia_armazenada_staging.subsistemas AS t \ No newline at end of file diff --git a/bases/br_ons_energia_armazenada/subsistemas/schema-prod.json b/bases/br_ons_energia_armazenada/subsistemas/schema-prod.json index 92e294679..07a1bd7b4 100644 --- a/bases/br_ons_energia_armazenada/subsistemas/schema-prod.json +++ b/bases/br_ons_energia_armazenada/subsistemas/schema-prod.json @@ -1 +1 @@ -[{"name": "data", "description": "Data", "is_in_staging": true, "is_partition": false, "type": "DATE", "mode": "NULLABLE"}, {"name": "subsistema", "description": "Nome do subsistema", "is_in_staging": true, "is_partition": false, "type": "STRING", "mode": "NULLABLE"}, {"name": "energia_armazenada_maxima", "description": "Valor de energia armazenada m\u00e1xima por subsistema", "is_in_staging": true, "is_partition": false, "type": "FLOAT", "mode": "NULLABLE"}, {"name": "energia_armazenada_verificada", "description": "Valor de energia armazenada verificada no dia por subsistema", "is_in_staging": true, "is_partition": false, "type": "FLOAT", "mode": "NULLABLE"}, {"name": "proporcao_energia_armazenada_verificada", "description": "Porcentagem de energia armazenada verificada no dia por subsistema", "is_in_staging": true, "is_partition": false, "type": "FLOAT", "mode": "NULLABLE"}] \ No newline at end of file +[{"name": "data", "bigquery_type": "date", "description": "Data", "temporal_coverage": null, "covered_by_dictionary": false, "directory_column": {"dataset_id": "br_bd_diretorios_data_tempo", "table_id": "data", "column_name": "data"}, "measurement_unit": null, "has_sensitive_data": null, "is_in_staging": true, "is_partition": false, "type": "DATE", "mode": "NULLABLE"}, {"name": "subsistema", "bigquery_type": "string", "description": "Nome do subsistema", "temporal_coverage": null, "covered_by_dictionary": false, "directory_column": {"dataset_id": null, "table_id": null, "column_name": null}, "measurement_unit": null, "has_sensitive_data": null, "is_in_staging": true, "is_partition": false, "type": "STRING", "mode": "NULLABLE"}, {"name": "energia_armazenada_maxima", "bigquery_type": "float64", "description": "Valor de energia armazenada m\u00e1xima por subsistema", "temporal_coverage": null, "covered_by_dictionary": false, "directory_column": {"dataset_id": null, "table_id": null, "column_name": null}, "measurement_unit": null, "has_sensitive_data": null, "is_in_staging": true, "is_partition": false, "type": "FLOAT", "mode": "NULLABLE"}, {"name": "energia_armazenada_verificada", "bigquery_type": "float64", "description": "Valor de energia armazenada verificada no dia por subsistema", "temporal_coverage": null, "covered_by_dictionary": false, "directory_column": {"dataset_id": null, "table_id": null, "column_name": null}, "measurement_unit": null, "has_sensitive_data": null, "is_in_staging": true, "is_partition": false, "type": "FLOAT", 
"mode": "NULLABLE"}, {"name": "proporcao_energia_armazenada_verificada", "bigquery_type": "float64", "description": "Porcentagem de energia armazenada verificada no dia por subsistema", "temporal_coverage": null, "covered_by_dictionary": false, "directory_column": {"dataset_id": null, "table_id": null, "column_name": null}, "measurement_unit": null, "has_sensitive_data": null, "is_in_staging": true, "is_partition": false, "type": "FLOAT", "mode": "NULLABLE"}] \ No newline at end of file diff --git a/bases/br_ons_energia_armazenada/subsistemas/schema-staging.json b/bases/br_ons_energia_armazenada/subsistemas/schema-staging.json index 39aa707c0..1cc41bf13 100644 --- a/bases/br_ons_energia_armazenada/subsistemas/schema-staging.json +++ b/bases/br_ons_energia_armazenada/subsistemas/schema-staging.json @@ -1 +1 @@ -[{"name": "data", "description": "", "is_in_staging": true, "is_partition": false, "type": "STRING"}, {"name": "subsistema", "description": "", "is_in_staging": true, "is_partition": false, "type": "STRING"}, {"name": "energia_armazenada_maxima", "description": "", "is_in_staging": true, "is_partition": false, "type": "STRING"}, {"name": "energia_armazenada_verificada", "description": "", "is_in_staging": true, "is_partition": false, "type": "STRING"}, {"name": "proporcao_energia_armazenada_verificada", "description": "", "is_in_staging": true, "is_partition": false, "type": "STRING"}] \ No newline at end of file +[{"name": "data", "bigquery_type": null, "description": null, "temporal_coverage": null, "covered_by_dictionary": null, "directory_column": {"dataset_id": null, "table_id": null, "column_name": null}, "measurement_unit": null, "has_sensitive_data": null, "is_in_staging": null, "is_partition": null, "type": "STRING"}, {"name": "subsistema", "bigquery_type": null, "description": null, "temporal_coverage": null, "covered_by_dictionary": null, "directory_column": {"dataset_id": null, "table_id": null, "column_name": null}, "measurement_unit": null, "has_sensitive_data": null, "is_in_staging": null, "is_partition": null, "type": "STRING"}, {"name": "energia_armazenada_maxima", "bigquery_type": null, "description": null, "temporal_coverage": null, "covered_by_dictionary": null, "directory_column": {"dataset_id": null, "table_id": null, "column_name": null}, "measurement_unit": null, "has_sensitive_data": null, "is_in_staging": null, "is_partition": null, "type": "STRING"}, {"name": "energia_armazenada_verificada", "bigquery_type": null, "description": null, "temporal_coverage": null, "covered_by_dictionary": null, "directory_column": {"dataset_id": null, "table_id": null, "column_name": null}, "measurement_unit": null, "has_sensitive_data": null, "is_in_staging": null, "is_partition": null, "type": "STRING"}, {"name": "proporcao_energia_armazenada_verificada", "bigquery_type": null, "description": null, "temporal_coverage": null, "covered_by_dictionary": null, "directory_column": {"dataset_id": null, "table_id": null, "column_name": null}, "measurement_unit": null, "has_sensitive_data": null, "is_in_staging": null, "is_partition": null, "type": "STRING"}] \ No newline at end of file diff --git a/bases/br_ons_energia_armazenada/subsistemas/table_config.yaml b/bases/br_ons_energia_armazenada/subsistemas/table_config.yaml index af15bc6b3..b6f8a718a 100644 --- a/bases/br_ons_energia_armazenada/subsistemas/table_config.yaml +++ b/bases/br_ons_energia_armazenada/subsistemas/table_config.yaml @@ -1,78 +1,25 @@ -source_bucket_name: basedosdados-dev -project_id_staging: basedosdados-dev 
-project_id_prod: basedosdados-dev -table_id: subsistemas # AUTO GENERATED -dataset_id: br_ons_energia_armazenada # AUTO GENERATED - -url_ckan: https://basedosdados.org/dataset/br-ons-energia-armazenada # AUTO GENERATED -url_github: https://github.com/basedosdados/mais/tree/master/bases/br_ons_energia_armazenada # AUTO GENERATED -version: v1.0 # REQUIRED +dataset_id: br_ons_energia_armazenada -last_updated: 2021-09-29 # AUTO GENERATED +table_id: subsistemas # Descreva a tabela. Essas são as primeiras frases que um usuário vai ver. # Você não precisa ser muito conciso. Sinta-se a vontade para dar exemplos de # como usar os dados. -# Se souber, liste também aplicações: pesquisa, apps, etc. que usem os dados. -description: | # REQUIRED - Dados das grandezas de energia armazenada em periodicidade diária por subsistemas. - -# Quem está completando esse arquivo config? -published_by: - name: Crislane A. Souza # REQUIRED - code_url: https://github.com/basedosdados/mais/tree/master/bases/br_ons_energia_armazenada/code # REQUIRED - website: https://github.com/crislanealves - email: crislanealves@basedosdados.org - -# Qual organização/departamento/pessoa tratou os dados? -# As vezes há um ponto intermediário entre os dados originais e subir na Base dos Dados. -# Se essa pessoa é você, preencha abaixo com suas informações. -treated_by: - name: Bruno Mioto - code_url: https://github.com/basedosdados/mais/tree/master/bases/br_ons_energia_armazenada/code - website: brunomioto.com.br - email: brunomioto97@gmail.com +# Se souber, liste também aplicações: pesquisa, apps, etc. que usem os dados., +description: Dados das grandezas de energia armazenada em periodicidade diária por subsistemas. -# Se houve passos de tratamento, limpeza e manipulação de dados, descreva-os aqui. -treatment_description: | - - Limpeza para consistência - -# Com qual frequência a base é atualizada? -# Opções: hora | dia | semana | mes | 1 ano | 2 anos | 5 anos | 10 anos | unico | recorrente -data_update_frequency: dia # REQUIRED - -# Nível da observação (qual é a granularidade de cada linha na tabela) -# Escolha todas as opções necessárias. -# Regras: -# - minúsculo, sem acento, singular. -# - em portugues (ou seja, não use os nomes de colunas abaixo) -# Exemplos: pais, estado, municipio, cidade, hora, dia, semana, mes, ano, etc. -observation_level: #REQUIRED - - dia - -# Quais colunas identificam uma linha unicamente? -# Preencha com os nomes de colunas. Ex: id_municipio, ano. -# Pode ser vazio pois certas tabelas não possuem identificadores. -primary_keys: - - data - - subsistema +# A máxima unidade espacial que a tabela cobre. +spatial_coverage: + continent: + - south_america + country: + - bra -# Qual é a cobertura espacial da tabela? -# Regras: -# - minúsculo, sem acento, singular -# - descer até o menor nível administrativo cuja cobertura abaixo seja 'todos' -# Exemplo 1: tabela que cubra todos os municípios nos estados de SP e GO -# - brasil -# - SP, GO -# Exemplo 2: tabela que cubra países inteiros na América Latina -# - brasil, argentina, peru, equador -coverage_geo: - - brasil - -# Qual é a cobertura temporal (em anos) da tabela? -# Opções: ..., 1990, 1991, ..., 1999, 2000, 2001, ..., 2019, 2020, ... -coverage_time: +# Anos cobertos pela tabela. +# Preencher como lista de intervalos. +# Exemplo: 1995(1)2019. +temporal_coverage: - 2000 - 2001 - 2002 @@ -95,12 +42,85 @@ coverage_time: - 2019 - 2020 - 2021 + - 2022 + +# A unidade temporal com qual a tabela é atualizada. 
+# Opções em 'https://basedosdados.org/api/3/action/bd_available_options' +update_frequency: day + +# Entidade representada por cada linha. +# Opções em 'https://basedosdados.org/api/3/action/bd_available_options' +entity: + +# A unidade temporal representada por cada linha. +# Opções em 'https://basedosdados.org/api/3/action/bd_available_options' +time_unit: day + +# O conjunto mínimo de colunas identificando cada linha unicamente. +# Preencha com os nomes de colunas. +# Exemplos: id_municipio, ano. +# Pode ser vazio pois certas tabelas não possuem identificadores. +identifying_columns: + - data + - subsistema + +last_updated: + metadata: + data: + release: + +# Versão da tabela. Seguindo o padrão de semantic versioning. +# Exemplo: v1.1.3 +version: v1.1 + +# Quem está preenchendo esses metadados? +published_by: + name: Crislane A. Souza + email: crislanealves@basedosdados.org + github_user: https://github.com/crislanealves + website: + ckan_user: crislanealves + +# Qual organização/departamento/pessoa tratou os dados? +# As vezes há um ponto intermediário entre os dados originais e subir na Base dos Dados. +# Se essa pessoa é você, preencha abaixo com suas informações. +data_cleaned_by: + name: Bruno Mioto + email: brunomioto97@gmail.com + github_user: + ckan_user: + website: brunomioto.com.br + code_url: + +# Se houve passos de tratamento, limpeza e manipulação de dados, descreva-os aqui. +data_cleaning_description: Limpeza para consistência + +# Url dos dados originais no GCP Storage. +raw_files_url: + +# Url dos arquivos auxiliares no GCP Storage. +auxiliary_files_url: + +# Url da tabela de arquitetura no GCP Storage. +architecture_url: + +# A tabela tem colunas que precisam de dicionário? +# Opções: yes, no. +covered_by_dictionary: no + +source_bucket_name: basedosdados-dev + +project_id_prod: basedosdados-dev + +project_id_staging: basedosdados-dev # Liste as colunas da tabela que representam partições. # Não esqueça de deletar essas colunas nas tabelas .csv na hora de subir para o BigQuery. # Isso poupará muito tempo e dinheiro às pessoas utilizando essa tabela. # Se não houver partições, não modifique abaixo. -partitions: # REQUIRED +partitions: + +bdm_file_size: # Quais são as colunas? Certifique-se de escrever uma boa descrição, as pessoas vão gostar # para saber sobre o que é a coluna. @@ -110,34 +130,72 @@ partitions: # REQUIRED # Algumas colunas existirão apenas na tabela final, você as construirá em `publish.sql`. # Para esses, defina is_in_staging como False. # Além disso, você deve adicionar as colunas de partição aqui e definir is_partition como True. -columns: # REQUIRED - - - - name: data - description: Data - is_in_staging: True # Bool [True, False], whether the column is in the staging table - is_partition: False # Bool [True, False], whether the column is a partition. - - - - name: subsistema - description: Nome do subsistema - is_in_staging: True # Bool [True, False], whether the column is in the staging table - is_partition: False # Bool [True, False], whether the column is a partition. - - - - name: energia_armazenada_maxima - description: Valor de energia armazenada máxima por subsistema - is_in_staging: True # Bool [True, False], whether the column is in the staging table - is_partition: False # Bool [True, False], whether the column is a partition. 
- - - - name: energia_armazenada_verificada - description: Valor de energia armazenada verificada no dia por subsistema - is_in_staging: True # Bool [True, False], whether the column is in the staging table - is_partition: False # Bool [True, False], whether the column is a partition. - - - - name: proporcao_energia_armazenada_verificada - description: Porcentagem de energia armazenada verificada no dia por subsistema - is_in_staging: True # Bool [True, False], whether the column is in the staging table - is_partition: False # Bool [True, False], whether the column is a partition. +columns: + - name: data + bigquery_type: date + description: Data + temporal_coverage: + covered_by_dictionary: no + directory_column: + dataset_id: br_bd_diretorios_data_tempo + table_id: data + column_name: data + measurement_unit: day + has_sensitive_data: no + is_in_staging: true + is_partition: false + - name: subsistema + bigquery_type: string + description: Nome do subsistema + temporal_coverage: + covered_by_dictionary: no + directory_column: + dataset_id: + table_id: + column_name: + measurement_unit: + has_sensitive_data: no + is_in_staging: true + is_partition: false + - name: energia_armazenada_maxima + bigquery_type: float64 + description: Valor de energia armazenada máxima por subsistema (MWmês) + temporal_coverage: + covered_by_dictionary: no + directory_column: + dataset_id: + table_id: + column_name: + measurement_unit: megawatt + has_sensitive_data: no + is_in_staging: true + is_partition: false + - name: energia_armazenada_verificada + bigquery_type: float64 + description: Valor de energia armazenada verificada no dia por subsistema (MWmês) + temporal_coverage: + covered_by_dictionary: no + directory_column: + dataset_id: + table_id: + column_name: + measurement_unit: megawatt + has_sensitive_data: no + is_in_staging: true + is_partition: false + - name: proporcao_energia_armazenada_verificada + bigquery_type: float64 + description: Porcentagem de energia armazenada verificada no dia por subsistema + temporal_coverage: + covered_by_dictionary: no + directory_column: + dataset_id: + table_id: + column_name: + measurement_unit: percent + has_sensitive_data: no + is_in_staging: true + is_partition: false + + +metadata_modified: '2021-12-06T13:54:02.927052' diff --git a/bases/br_ons_energia_armazenada/subsistemas/table_description.txt b/bases/br_ons_energia_armazenada/subsistemas/table_description.txt index 101a23028..be2ec34ae 100644 --- a/bases/br_ons_energia_armazenada/subsistemas/table_description.txt +++ b/bases/br_ons_energia_armazenada/subsistemas/table_description.txt @@ -1,33 +1,34 @@ Dados das grandezas de energia armazenada em periodicidade diária por subsistemas. - Para saber mais acesse: -Website: https://basedosdados.org/dataset/br-ons-energia-armazenada -Github: https://github.com/basedosdados/mais/tree/master/bases/br_ons_energia_armazenada +Website: +Github: Ajude a manter o projeto :) Apoia-se: https://apoia.se/basedosdados Publicado por ------------- -Nome: Crislane A. 
Souza -Código: https://github.com/basedosdados/mais/tree/master/bases/br_ons_energia_armazenada/code -Website: https://github.com/crislanealves -Email: crislanealves@basedosdados.orgTratado por +Nome: Crislane Alves +Código: +Email: crislanealves@basedosdados.org +Tratado por ----------- Nome: Bruno Mioto -Código: https://github.com/basedosdados/mais/tree/master/bases/br_ons_energia_armazenada/code +Código: https://github.com/brunomioto/reservatoriosBR Website: brunomioto.com.br Email: brunomioto97@gmail.com -Nível da Observação (i.e. a granularidade da linha) -------------------- -- dia + + Colunas identificando linhas unicamente ------------------- - data -- subsistemas +- subsistema + + + Cobertura Temporal ------------------ @@ -54,16 +55,21 @@ Cobertura Temporal - 2020 - 2021 + + + Cobertura Espacial ------------------ -- brasil + + + Tratamento ---------- -- Limpeza para consistência +Limpeza para consistência -Frequencia de Atualização -------------------------- -dia +Frequencia de Atualização +------------------------- +day diff --git a/bases/br_poder360_pesquisas/README.md b/bases/br_poder360_pesquisas/README.md new file mode 100644 index 000000000..18c4751b7 --- /dev/null +++ b/bases/br_poder360_pesquisas/README.md @@ -0,0 +1,7 @@ +Como capturar os dados de br_poder360_pesquisas? + +Para capturar esses dados, basta verificar o link dos dados originais indicado em dataset_config.yaml no item website. + +Caso tenha sido utilizado algum código de captura ou tratamento, estes estarão contidos em code/. Se o dado publicado for em sua versão bruta, não existirá a pasta code/. + +Os dados publicados estão disponíveis em: https://basedosdados.org/dataset/br-poder360-pesquisas \ No newline at end of file diff --git a/bases/br_poder360_pesquisas/dataset_config.yaml b/bases/br_poder360_pesquisas/dataset_config.yaml new file mode 100644 index 000000000..5cd1f0a3d --- /dev/null +++ b/bases/br_poder360_pesquisas/dataset_config.yaml @@ -0,0 +1,42 @@ + +# Qual organização disponibiliza os dados originais? +# Opções: escolher dessa lista -> https://basedosdados.org/api/3/action/organization_list +# Exemplos: br-ibge, br-tse, br-rj-gov +organization: br-poder360 + +dataset_id: br_poder360_pesquisas + +# Título do conjunto, a ser exibido no mecanismo de busca. +# Exemplo: População brasileira +title: Pesquisas Eleitorais + +# Descrição e anotações úteis sobre os dados. +description: Banco de pesquisas eleitorais de eleições federais, estaduais, e municipais no Brasil. + +# Quais temas caracterizam a base? +# Opções: escolher dessa lista -> https://basedosdados.org/api/3/action/group_list +# Importante: preencher com a chave, e não o valor. +groups: + - politica + +# Quais etiquetas caracterizam a base? +# Opções: escolher dessa lista -> https://basedosdados.org/api/3/action/tag_list +# Exemplos: fertilidade, preco, desmatamento. +# Caso crie etiquetas novas, as regras são: +# - letras minúsculas +# - sem acentos +# - sempre no singular +# - não repita nomes de grupos (ex. educacao, saude, meio ambiente, economia, etc.) +tags: + +# Url completa do CKAN já contendo o dataset-id +# Exemplo: https://basedosdados.org/dataset/ +ckan_url: https://basedosdados.org/dataset/br_poder360_pesquisas + +# Url completa do Github já contendo o dataset_id +# Exemplo: https://github.com/basedosdados/mais/tree/master/bases/ +github_url: https://github.com/basedosdados/mais/tree/master/bases/br_poder360_pesquisas + +# Não altere esse campo. +# Data da última modificação dos metadados gerada automaticamente pelo CKAN. 
+metadata_modified: diff --git a/bases/br_poder360_pesquisas/microdados/publish.sql b/bases/br_poder360_pesquisas/microdados/publish.sql new file mode 100644 index 000000000..d4b66cc64 --- /dev/null +++ b/bases/br_poder360_pesquisas/microdados/publish.sql @@ -0,0 +1,45 @@ +/* +Query para publicar a tabela. + +Esse é o lugar para: + - modificar nomes, ordem e tipos de colunas + - dar join com outras tabelas + - criar colunas extras (e.g. logs, proporções, etc.) + +Qualquer coluna definida aqui deve também existir em `table_config.yaml`. + +# Além disso, sinta-se à vontade para alterar alguns nomes obscuros +# para algo um pouco mais explícito. + +TIPOS: + - Para modificar tipos de colunas, basta substituir STRING por outro tipo válido. + - Exemplo: `SAFE_CAST(column_name AS NUMERIC) column_name` + - Mais detalhes: https://cloud.google.com/bigquery/docs/reference/standard-sql/data-types +*/ +CREATE VIEW basedosdados-dev.br_poder360_pesquisas.microdados AS +SELECT +SAFE_CAST(id_pesquisa AS STRING) id_pesquisa, +SAFE_CAST(ano AS INT64) ano, +SAFE_CAST(sigla_uf AS STRING) sigla_uf, +SAFE_CAST(nome_municipio AS STRING) nome_municipio, +SAFE_CAST(cargo AS STRING) cargo, +SAFE_CAST(data AS DATE) data, +SAFE_CAST(data_referencia AS STRING) data_referencia, +SAFE_CAST(instituto AS STRING) instituto, +SAFE_CAST(contratante AS STRING) contratante, +SAFE_CAST(orgao_registro AS STRING) orgao_registro, +SAFE_CAST(numero_registro AS STRING) numero_registro, +SAFE_CAST(quantidade_entrevistas AS FLOAT64) quantidade_entrevistas, +SAFE_CAST(margem_mais AS FLOAT64) margem_mais, +SAFE_CAST(margem_menos AS FLOAT64) margem_menos, +SAFE_CAST(tipo AS STRING) tipo, +SAFE_CAST(turno AS INT64) turno, +SAFE_CAST(tipo_voto AS STRING) tipo_voto, +SAFE_CAST(id_cenario AS STRING) id_cenario, +SAFE_CAST(descricao_cenario AS STRING) descricao_cenario, +SAFE_CAST(id_candidato_poder360 AS STRING) id_candidato_poder360, +SAFE_CAST(nome_candidato AS STRING) nome_candidato, +SAFE_CAST(sigla_partido AS STRING) sigla_partido, +SAFE_CAST(condicao AS INT64) condicao, +SAFE_CAST(percentual AS FLOAT64) percentual +FROM basedosdados-dev.br_poder360_pesquisas_staging.microdados AS t \ No newline at end of file diff --git a/bases/br_poder360_pesquisas/microdados/schema-prod.json b/bases/br_poder360_pesquisas/microdados/schema-prod.json new file mode 100644 index 000000000..467c470cf --- /dev/null +++ b/bases/br_poder360_pesquisas/microdados/schema-prod.json @@ -0,0 +1 @@ +[{"name": "id_pesquisa", "bigquery_type": "string", "description": "ID da pesquisa", "temporal_coverage": null, "covered_by_dictionary": false, "directory_column": {"dataset_id": null, "table_id": null, "column_name": null}, "measurement_unit": null, "has_sensitive_data": false, "is_in_staging": true, "is_partition": false, "type": "STRING", "mode": "NULLABLE"}, {"name": "ano", "bigquery_type": "int64", "description": "Ano", "temporal_coverage": null, "covered_by_dictionary": false, "directory_column": {"dataset_id": "br_bd_diretorios_data_tempo", "table_id": "ano", "column_name": "ano"}, "measurement_unit": "year", "has_sensitive_data": false, "is_in_staging": true, "is_partition": false, "type": "INTEGER", "mode": "NULLABLE"}, {"name": "sigla_uf", "bigquery_type": "string", "description": "Sigla da Unidade da Federa\u00e7\u00e3o", "temporal_coverage": null, "covered_by_dictionary": false, "directory_column": {"dataset_id": "br_bd_diretorios_brasil", "table_id": "uf", "column_name": "sigla"}, "measurement_unit": null, "has_sensitive_data": false, "is_in_staging": true, 
"is_partition": false, "type": "STRING", "mode": "NULLABLE"}, {"name": "nome_municipio", "bigquery_type": "string", "description": "Nome do munic\u00edpio", "temporal_coverage": null, "covered_by_dictionary": false, "directory_column": {"dataset_id": "br_bd_diretorios_brasil", "table_id": "municipio", "column_name": "nome"}, "measurement_unit": null, "has_sensitive_data": false, "is_in_staging": true, "is_partition": false, "type": "STRING", "mode": "NULLABLE"}, {"name": "cargo", "bigquery_type": "string", "description": "Cargo", "temporal_coverage": null, "covered_by_dictionary": false, "directory_column": {"dataset_id": null, "table_id": null, "column_name": null}, "measurement_unit": null, "has_sensitive_data": false, "is_in_staging": true, "is_partition": false, "type": "STRING", "mode": "NULLABLE"}, {"name": "data", "bigquery_type": "date", "description": "Data", "temporal_coverage": null, "covered_by_dictionary": false, "directory_column": {"dataset_id": null, "table_id": null, "column_name": null}, "measurement_unit": null, "has_sensitive_data": false, "is_in_staging": true, "is_partition": false, "type": "DATE", "mode": "NULLABLE"}, {"name": "data_referencia", "bigquery_type": "string", "description": "Data de refer\u00eancia", "temporal_coverage": null, "covered_by_dictionary": false, "directory_column": {"dataset_id": null, "table_id": null, "column_name": null}, "measurement_unit": null, "has_sensitive_data": false, "is_in_staging": true, "is_partition": false, "type": "STRING", "mode": "NULLABLE"}, {"name": "instituto", "bigquery_type": "string", "description": "Instituto realizador", "temporal_coverage": null, "covered_by_dictionary": false, "directory_column": {"dataset_id": null, "table_id": null, "column_name": null}, "measurement_unit": null, "has_sensitive_data": false, "is_in_staging": true, "is_partition": false, "type": "STRING", "mode": "NULLABLE"}, {"name": "contratante", "bigquery_type": "string", "description": "Contratante", "temporal_coverage": null, "covered_by_dictionary": false, "directory_column": {"dataset_id": null, "table_id": null, "column_name": null}, "measurement_unit": null, "has_sensitive_data": false, "is_in_staging": true, "is_partition": false, "type": "STRING", "mode": "NULLABLE"}, {"name": "orgao_registro", "bigquery_type": "string", "description": "Org\u00e3o de registro", "temporal_coverage": null, "covered_by_dictionary": false, "directory_column": {"dataset_id": null, "table_id": null, "column_name": null}, "measurement_unit": null, "has_sensitive_data": false, "is_in_staging": true, "is_partition": false, "type": "STRING", "mode": "NULLABLE"}, {"name": "numero_registro", "bigquery_type": "string", "description": "N\u00famero de registro", "temporal_coverage": null, "covered_by_dictionary": false, "directory_column": {"dataset_id": null, "table_id": null, "column_name": null}, "measurement_unit": null, "has_sensitive_data": false, "is_in_staging": true, "is_partition": false, "type": "STRING", "mode": "NULLABLE"}, {"name": "quantidade_entrevistas", "bigquery_type": "float64", "description": "Quantidade de entrevistas", "temporal_coverage": null, "covered_by_dictionary": false, "directory_column": {"dataset_id": null, "table_id": null, "column_name": null}, "measurement_unit": null, "has_sensitive_data": false, "is_in_staging": true, "is_partition": false, "type": "FLOAT", "mode": "NULLABLE"}, {"name": "margem_mais", "bigquery_type": "float64", "description": "Margem a mais", "temporal_coverage": null, "covered_by_dictionary": false, 
"directory_column": {"dataset_id": null, "table_id": null, "column_name": null}, "measurement_unit": "percent", "has_sensitive_data": false, "is_in_staging": true, "is_partition": false, "type": "FLOAT", "mode": "NULLABLE"}, {"name": "margem_menos", "bigquery_type": "float64", "description": "Margem a menos", "temporal_coverage": null, "covered_by_dictionary": false, "directory_column": {"dataset_id": null, "table_id": null, "column_name": null}, "measurement_unit": "percent", "has_sensitive_data": false, "is_in_staging": true, "is_partition": false, "type": "FLOAT", "mode": "NULLABLE"}, {"name": "tipo", "bigquery_type": "string", "description": "Tipo", "temporal_coverage": null, "covered_by_dictionary": false, "directory_column": {"dataset_id": null, "table_id": null, "column_name": null}, "measurement_unit": null, "has_sensitive_data": false, "is_in_staging": true, "is_partition": false, "type": "STRING", "mode": "NULLABLE"}, {"name": "turno", "bigquery_type": "int64", "description": "Turno", "temporal_coverage": null, "covered_by_dictionary": false, "directory_column": {"dataset_id": null, "table_id": null, "column_name": null}, "measurement_unit": null, "has_sensitive_data": false, "is_in_staging": true, "is_partition": false, "type": "INTEGER", "mode": "NULLABLE"}, {"name": "tipo_voto", "bigquery_type": "string", "description": "Tipo do voto", "temporal_coverage": null, "covered_by_dictionary": false, "directory_column": {"dataset_id": null, "table_id": null, "column_name": null}, "measurement_unit": null, "has_sensitive_data": false, "is_in_staging": true, "is_partition": false, "type": "STRING", "mode": "NULLABLE"}, {"name": "id_cenario", "bigquery_type": "string", "description": "ID do cen\u00e1rio", "temporal_coverage": null, "covered_by_dictionary": false, "directory_column": {"dataset_id": null, "table_id": null, "column_name": null}, "measurement_unit": null, "has_sensitive_data": false, "is_in_staging": true, "is_partition": false, "type": "STRING", "mode": "NULLABLE"}, {"name": "descricao_cenario", "bigquery_type": "string", "description": "Descri\u00e7\u00e3o do cen\u00e1rio", "temporal_coverage": null, "covered_by_dictionary": false, "directory_column": {"dataset_id": null, "table_id": null, "column_name": null}, "measurement_unit": null, "has_sensitive_data": false, "is_in_staging": true, "is_partition": false, "type": "STRING", "mode": "NULLABLE"}, {"name": "id_candidato_poder360", "bigquery_type": "string", "description": "ID do candidato - Poder360", "temporal_coverage": null, "covered_by_dictionary": false, "directory_column": {"dataset_id": null, "table_id": null, "column_name": null}, "measurement_unit": null, "has_sensitive_data": false, "is_in_staging": true, "is_partition": false, "type": "STRING", "mode": "NULLABLE"}, {"name": "nome_candidato", "bigquery_type": "string", "description": "Nome do candidato", "temporal_coverage": null, "covered_by_dictionary": false, "directory_column": {"dataset_id": null, "table_id": null, "column_name": null}, "measurement_unit": null, "has_sensitive_data": false, "is_in_staging": true, "is_partition": false, "type": "STRING", "mode": "NULLABLE"}, {"name": "sigla_partido", "bigquery_type": "string", "description": "Sigla do partido", "temporal_coverage": null, "covered_by_dictionary": false, "directory_column": {"dataset_id": null, "table_id": null, "column_name": null}, "measurement_unit": null, "has_sensitive_data": false, "is_in_staging": true, "is_partition": false, "type": "STRING", "mode": "NULLABLE"}, {"name": "condicao", 
"bigquery_type": "int64", "description": "Condi\u00e7\u00e3o", "temporal_coverage": null, "covered_by_dictionary": false, "directory_column": {"dataset_id": null, "table_id": null, "column_name": null}, "measurement_unit": null, "has_sensitive_data": false, "is_in_staging": true, "is_partition": false, "type": "INTEGER", "mode": "NULLABLE"}, {"name": "percentual", "bigquery_type": "float64", "description": "Percentual", "temporal_coverage": null, "covered_by_dictionary": false, "directory_column": {"dataset_id": null, "table_id": null, "column_name": null}, "measurement_unit": "percent", "has_sensitive_data": false, "is_in_staging": true, "is_partition": false, "type": "FLOAT", "mode": "NULLABLE"}] \ No newline at end of file diff --git a/bases/br_poder360_pesquisas/microdados/schema-staging.json b/bases/br_poder360_pesquisas/microdados/schema-staging.json new file mode 100644 index 000000000..b23204012 --- /dev/null +++ b/bases/br_poder360_pesquisas/microdados/schema-staging.json @@ -0,0 +1 @@ +[{"name": "id_pesquisa", "bigquery_type": null, "description": null, "temporal_coverage": null, "covered_by_dictionary": null, "directory_column": {"dataset_id": null, "table_id": null, "column_name": null}, "measurement_unit": null, "has_sensitive_data": null, "is_in_staging": null, "is_partition": null, "type": "STRING"}, {"name": "ano", "bigquery_type": null, "description": null, "temporal_coverage": null, "covered_by_dictionary": null, "directory_column": {"dataset_id": null, "table_id": null, "column_name": null}, "measurement_unit": null, "has_sensitive_data": null, "is_in_staging": null, "is_partition": null, "type": "STRING"}, {"name": "sigla_uf", "bigquery_type": null, "description": null, "temporal_coverage": null, "covered_by_dictionary": null, "directory_column": {"dataset_id": null, "table_id": null, "column_name": null}, "measurement_unit": null, "has_sensitive_data": null, "is_in_staging": null, "is_partition": null, "type": "STRING"}, {"name": "nome_municipio", "bigquery_type": null, "description": null, "temporal_coverage": null, "covered_by_dictionary": null, "directory_column": {"dataset_id": null, "table_id": null, "column_name": null}, "measurement_unit": null, "has_sensitive_data": null, "is_in_staging": null, "is_partition": null, "type": "STRING"}, {"name": "cargo", "bigquery_type": null, "description": null, "temporal_coverage": null, "covered_by_dictionary": null, "directory_column": {"dataset_id": null, "table_id": null, "column_name": null}, "measurement_unit": null, "has_sensitive_data": null, "is_in_staging": null, "is_partition": null, "type": "STRING"}, {"name": "data", "bigquery_type": null, "description": null, "temporal_coverage": null, "covered_by_dictionary": null, "directory_column": {"dataset_id": null, "table_id": null, "column_name": null}, "measurement_unit": null, "has_sensitive_data": null, "is_in_staging": null, "is_partition": null, "type": "STRING"}, {"name": "data_referencia", "bigquery_type": null, "description": null, "temporal_coverage": null, "covered_by_dictionary": null, "directory_column": {"dataset_id": null, "table_id": null, "column_name": null}, "measurement_unit": null, "has_sensitive_data": null, "is_in_staging": null, "is_partition": null, "type": "STRING"}, {"name": "instituto", "bigquery_type": null, "description": null, "temporal_coverage": null, "covered_by_dictionary": null, "directory_column": {"dataset_id": null, "table_id": null, "column_name": null}, "measurement_unit": null, "has_sensitive_data": null, "is_in_staging": null, 
"is_partition": null, "type": "STRING"}, {"name": "contratante", "bigquery_type": null, "description": null, "temporal_coverage": null, "covered_by_dictionary": null, "directory_column": {"dataset_id": null, "table_id": null, "column_name": null}, "measurement_unit": null, "has_sensitive_data": null, "is_in_staging": null, "is_partition": null, "type": "STRING"}, {"name": "orgao_registro", "bigquery_type": null, "description": null, "temporal_coverage": null, "covered_by_dictionary": null, "directory_column": {"dataset_id": null, "table_id": null, "column_name": null}, "measurement_unit": null, "has_sensitive_data": null, "is_in_staging": null, "is_partition": null, "type": "STRING"}, {"name": "numero_registro", "bigquery_type": null, "description": null, "temporal_coverage": null, "covered_by_dictionary": null, "directory_column": {"dataset_id": null, "table_id": null, "column_name": null}, "measurement_unit": null, "has_sensitive_data": null, "is_in_staging": null, "is_partition": null, "type": "STRING"}, {"name": "quantidade_entrevistas", "bigquery_type": null, "description": null, "temporal_coverage": null, "covered_by_dictionary": null, "directory_column": {"dataset_id": null, "table_id": null, "column_name": null}, "measurement_unit": null, "has_sensitive_data": null, "is_in_staging": null, "is_partition": null, "type": "STRING"}, {"name": "margem_mais", "bigquery_type": null, "description": null, "temporal_coverage": null, "covered_by_dictionary": null, "directory_column": {"dataset_id": null, "table_id": null, "column_name": null}, "measurement_unit": null, "has_sensitive_data": null, "is_in_staging": null, "is_partition": null, "type": "STRING"}, {"name": "margem_menos", "bigquery_type": null, "description": null, "temporal_coverage": null, "covered_by_dictionary": null, "directory_column": {"dataset_id": null, "table_id": null, "column_name": null}, "measurement_unit": null, "has_sensitive_data": null, "is_in_staging": null, "is_partition": null, "type": "STRING"}, {"name": "tipo", "bigquery_type": null, "description": null, "temporal_coverage": null, "covered_by_dictionary": null, "directory_column": {"dataset_id": null, "table_id": null, "column_name": null}, "measurement_unit": null, "has_sensitive_data": null, "is_in_staging": null, "is_partition": null, "type": "STRING"}, {"name": "turno", "bigquery_type": null, "description": null, "temporal_coverage": null, "covered_by_dictionary": null, "directory_column": {"dataset_id": null, "table_id": null, "column_name": null}, "measurement_unit": null, "has_sensitive_data": null, "is_in_staging": null, "is_partition": null, "type": "STRING"}, {"name": "tipo_voto", "bigquery_type": null, "description": null, "temporal_coverage": null, "covered_by_dictionary": null, "directory_column": {"dataset_id": null, "table_id": null, "column_name": null}, "measurement_unit": null, "has_sensitive_data": null, "is_in_staging": null, "is_partition": null, "type": "STRING"}, {"name": "id_cenario", "bigquery_type": null, "description": null, "temporal_coverage": null, "covered_by_dictionary": null, "directory_column": {"dataset_id": null, "table_id": null, "column_name": null}, "measurement_unit": null, "has_sensitive_data": null, "is_in_staging": null, "is_partition": null, "type": "STRING"}, {"name": "descricao_cenario", "bigquery_type": null, "description": null, "temporal_coverage": null, "covered_by_dictionary": null, "directory_column": {"dataset_id": null, "table_id": null, "column_name": null}, "measurement_unit": null, "has_sensitive_data": 
null, "is_in_staging": null, "is_partition": null, "type": "STRING"}, {"name": "id_candidato_poder360", "bigquery_type": null, "description": null, "temporal_coverage": null, "covered_by_dictionary": null, "directory_column": {"dataset_id": null, "table_id": null, "column_name": null}, "measurement_unit": null, "has_sensitive_data": null, "is_in_staging": null, "is_partition": null, "type": "STRING"}, {"name": "nome_candidato", "bigquery_type": null, "description": null, "temporal_coverage": null, "covered_by_dictionary": null, "directory_column": {"dataset_id": null, "table_id": null, "column_name": null}, "measurement_unit": null, "has_sensitive_data": null, "is_in_staging": null, "is_partition": null, "type": "STRING"}, {"name": "sigla_partido", "bigquery_type": null, "description": null, "temporal_coverage": null, "covered_by_dictionary": null, "directory_column": {"dataset_id": null, "table_id": null, "column_name": null}, "measurement_unit": null, "has_sensitive_data": null, "is_in_staging": null, "is_partition": null, "type": "STRING"}, {"name": "condicao", "bigquery_type": null, "description": null, "temporal_coverage": null, "covered_by_dictionary": null, "directory_column": {"dataset_id": null, "table_id": null, "column_name": null}, "measurement_unit": null, "has_sensitive_data": null, "is_in_staging": null, "is_partition": null, "type": "STRING"}, {"name": "percentual", "bigquery_type": null, "description": null, "temporal_coverage": null, "covered_by_dictionary": null, "directory_column": {"dataset_id": null, "table_id": null, "column_name": null}, "measurement_unit": null, "has_sensitive_data": null, "is_in_staging": null, "is_partition": null, "type": "STRING"}] \ No newline at end of file diff --git a/bases/br_poder360_pesquisas/microdados/table_config.yaml b/bases/br_poder360_pesquisas/microdados/table_config.yaml new file mode 100644 index 000000000..f23e30ea1 --- /dev/null +++ b/bases/br_poder360_pesquisas/microdados/table_config.yaml @@ -0,0 +1,446 @@ + +dataset_id: br_poder360_pesquisas + +table_id: microdados + +# Descreva a tabela. Essas são as primeiras frases que um usuário vai ver. +# Você não precisa ser muito conciso. Sinta-se a vontade para dar exemplos de +# como usar os dados. +# Se souber, liste também aplicações: pesquisa, apps, etc. que usem os dados., +description: Microdados descrevendo cada pesquisa. + +# A máxima unidade espacial que a tabela cobre. +spatial_coverage: + +# Anos cobertos pela tabela. +# Preencher como lista de intervalos. +# Exemplo: 1995(1)2019. +temporal_coverage: + - 2000 + - 2001 + - 2002 + - 2003 + - 2004 + - 2005 + - 2006 + - 2007 + - 2008 + - 2009 + - 2010 + - 2011 + - 2012 + - 2013 + - 2014 + - 2015 + - 2016 + - 2017 + - 2018 + - 2019 + - 2020 + - 2021 + - 2022 + +# A unidade temporal com qual a tabela é atualizada. +# Opções em 'https://basedosdados.org/api/3/action/bd_available_options' +update_frequency: recurring + +# Entidade representada por cada linha. +# Opções em 'https://basedosdados.org/api/3/action/bd_available_options' +entity: poll + +# A unidade temporal representada por cada linha. +# Opções em 'https://basedosdados.org/api/3/action/bd_available_options' +time_unit: + +# O conjunto mínimo de colunas identificando cada linha unicamente. +# Preencha com os nomes de colunas. +# Exemplos: id_municipio, ano. +# Pode ser vazio pois certas tabelas não possuem identificadores. 
+identifying_columns: + - id_pesquisa + - id_cenario + - id_candidato_poder360 + +last_updated: + metadata: + data: + release: + +# Versão da tabela. Seguindo o padrão de semantic versioning. +# Exemplo: v1.1.3 +version: v0.1 + +# Quem está preenchendo esses metadados? +published_by: + name: Ricardo Dahis + email: rdahis@basedosdados.org + github_user: rdahis + website: www.ricardodahis.com + ckan_user: rdahis + +# Qual organização/departamento/pessoa tratou os dados? +# As vezes há um ponto intermediário entre os dados originais e subir na Base dos Dados. +# Se essa pessoa é você, preencha abaixo com suas informações. +data_cleaned_by: + name: Ricardo Dahis + email: rdahis@basedosdados.org + github_user: rdahis + website: www.ricardodahis.com + ckan_user: rdahis + code_url: + +# Se houve passos de tratamento, limpeza e manipulação de dados, descreva-os aqui. +data_cleaning_description: + "- Excluímos colunas dos dados originais que simplesmente listavam os IDs de cargo, município, instituto, etc. + - Reordenamos colunas para facilitar entendimento." + +# Url dos dados originais no GCP Storage. +raw_files_url: + +# Url dos arquivos auxiliares no GCP Storage. +auxiliary_files_url: + +# Url da tabela de arquitetura no GCP Storage. +architecture_url: + +# A tabela tem colunas que precisam de dicionário? +# Opções: yes, no. +covered_by_dictionary: no + +source_bucket_name: basedosdados-dev + +project_id_prod: basedosdados-dev + +project_id_staging: basedosdados-dev + +# Liste as colunas da tabela que representam partições. +# Não esqueça de deletar essas colunas nas tabelas .csv na hora de subir para o BigQuery. +# Isso poupará muito tempo e dinheiro às pessoas utilizando essa tabela. +# Se não houver partições, não modifique abaixo. +partitions: + +bdm_file_size: + +# Quais são as colunas? Certifique-se de escrever uma boa descrição, as pessoas vão gostar +# para saber sobre o que é a coluna. +# Adicionar todas as colunas manualmente pode ser bastante cansativo, por isso, quando +# inicializando este arquivo de configuração, você pode apontar a função para uma amostra de dados que +# preencherá automaticamente as colunas. +# Algumas colunas existirão apenas na tabela final, você as construirá em `publish.sql`. +# Para esses, defina is_in_staging como False. +# Além disso, você deve adicionar as colunas de partição aqui e definir is_partition como True. 
+columns: + - name: id_pesquisa + bigquery_type: string + description: ID da pesquisa + temporal_coverage: + covered_by_dictionary: no + directory_column: + dataset_id: + table_id: + column_name: + measurement_unit: + has_sensitive_data: no + is_in_staging: true + is_partition: false + - name: ano + bigquery_type: int64 + description: Ano + temporal_coverage: + covered_by_dictionary: no + directory_column: + dataset_id: br_bd_diretorios_data_tempo + table_id: ano + column_name: ano + measurement_unit: year + has_sensitive_data: no + is_in_staging: true + is_partition: false + - name: sigla_uf + bigquery_type: string + description: Sigla da Unidade da Federação + temporal_coverage: + covered_by_dictionary: no + directory_column: + dataset_id: br_bd_diretorios_brasil + table_id: uf + column_name: sigla + measurement_unit: + has_sensitive_data: no + is_in_staging: true + is_partition: false + - name: nome_municipio + bigquery_type: string + description: Nome do município + temporal_coverage: + covered_by_dictionary: no + directory_column: + dataset_id: br_bd_diretorios_brasil + table_id: municipio + column_name: nome + measurement_unit: + has_sensitive_data: no + is_in_staging: true + is_partition: false + - name: cargo + bigquery_type: string + description: Cargo + temporal_coverage: + covered_by_dictionary: no + directory_column: + dataset_id: + table_id: + column_name: + measurement_unit: + has_sensitive_data: no + is_in_staging: true + is_partition: false + - name: data + bigquery_type: date + description: Data + temporal_coverage: + covered_by_dictionary: no + directory_column: + dataset_id: + table_id: + column_name: + measurement_unit: + has_sensitive_data: no + is_in_staging: true + is_partition: false + - name: data_referencia + bigquery_type: string + description: Data de referência + temporal_coverage: + covered_by_dictionary: no + directory_column: + dataset_id: + table_id: + column_name: + measurement_unit: + has_sensitive_data: no + is_in_staging: true + is_partition: false + - name: instituto + bigquery_type: string + description: Instituto realizador + temporal_coverage: + covered_by_dictionary: no + directory_column: + dataset_id: + table_id: + column_name: + measurement_unit: + has_sensitive_data: no + is_in_staging: true + is_partition: false + - name: contratante + bigquery_type: string + description: Contratante + temporal_coverage: + covered_by_dictionary: no + directory_column: + dataset_id: + table_id: + column_name: + measurement_unit: + has_sensitive_data: no + is_in_staging: true + is_partition: false + - name: orgao_registro + bigquery_type: string + description: Orgão de registro + temporal_coverage: + covered_by_dictionary: no + directory_column: + dataset_id: + table_id: + column_name: + measurement_unit: + has_sensitive_data: no + is_in_staging: true + is_partition: false + - name: numero_registro + bigquery_type: string + description: Número de registro + temporal_coverage: + covered_by_dictionary: no + directory_column: + dataset_id: + table_id: + column_name: + measurement_unit: + has_sensitive_data: no + is_in_staging: true + is_partition: false + - name: quantidade_entrevistas + bigquery_type: float64 + description: Quantidade de entrevistas + temporal_coverage: + covered_by_dictionary: no + directory_column: + dataset_id: + table_id: + column_name: + measurement_unit: + has_sensitive_data: no + is_in_staging: true + is_partition: false + - name: margem_mais + bigquery_type: float64 + description: Margem a mais + temporal_coverage: + covered_by_dictionary: 
no + directory_column: + dataset_id: + table_id: + column_name: + measurement_unit: percent + has_sensitive_data: no + is_in_staging: true + is_partition: false + - name: margem_menos + bigquery_type: float64 + description: Margem a menos + temporal_coverage: + covered_by_dictionary: no + directory_column: + dataset_id: + table_id: + column_name: + measurement_unit: percent + has_sensitive_data: no + is_in_staging: true + is_partition: false + - name: tipo + bigquery_type: string + description: Tipo + temporal_coverage: + covered_by_dictionary: no + directory_column: + dataset_id: + table_id: + column_name: + measurement_unit: + has_sensitive_data: no + is_in_staging: true + is_partition: false + - name: turno + bigquery_type: int64 + description: Turno + temporal_coverage: + covered_by_dictionary: no + directory_column: + dataset_id: + table_id: + column_name: + measurement_unit: + has_sensitive_data: no + is_in_staging: true + is_partition: false + - name: tipo_voto + bigquery_type: string + description: Tipo do voto + temporal_coverage: + covered_by_dictionary: no + directory_column: + dataset_id: + table_id: + column_name: + measurement_unit: + has_sensitive_data: no + is_in_staging: true + is_partition: false + - name: id_cenario + bigquery_type: string + description: ID do cenário + temporal_coverage: + covered_by_dictionary: no + directory_column: + dataset_id: + table_id: + column_name: + measurement_unit: + has_sensitive_data: no + is_in_staging: true + is_partition: false + - name: descricao_cenario + bigquery_type: string + description: Descrição do cenário + temporal_coverage: + covered_by_dictionary: no + directory_column: + dataset_id: + table_id: + column_name: + measurement_unit: + has_sensitive_data: no + is_in_staging: true + is_partition: false + - name: id_candidato_poder360 + bigquery_type: string + description: ID do candidato - Poder360 + temporal_coverage: + covered_by_dictionary: no + directory_column: + dataset_id: + table_id: + column_name: + measurement_unit: + has_sensitive_data: no + is_in_staging: true + is_partition: false + - name: nome_candidato + bigquery_type: string + description: Nome do candidato + temporal_coverage: + covered_by_dictionary: no + directory_column: + dataset_id: + table_id: + column_name: + measurement_unit: + has_sensitive_data: no + is_in_staging: true + is_partition: false + - name: sigla_partido + bigquery_type: string + description: Sigla do partido + temporal_coverage: + covered_by_dictionary: no + directory_column: + dataset_id: + table_id: + column_name: + measurement_unit: + has_sensitive_data: no + is_in_staging: true + is_partition: false + - name: condicao + bigquery_type: int64 + description: Condição + temporal_coverage: + covered_by_dictionary: no + directory_column: + dataset_id: + table_id: + column_name: + measurement_unit: + has_sensitive_data: no + is_in_staging: true + is_partition: false + - name: percentual + bigquery_type: float64 + description: Percentual + temporal_coverage: + covered_by_dictionary: no + directory_column: + dataset_id: + table_id: + column_name: + measurement_unit: percent + has_sensitive_data: no + is_in_staging: true + is_partition: false + +metadata_modified: diff --git a/bases/br_poder360_pesquisas/microdados/table_description.txt b/bases/br_poder360_pesquisas/microdados/table_description.txt new file mode 100644 index 000000000..262bedc0a --- /dev/null +++ b/bases/br_poder360_pesquisas/microdados/table_description.txt @@ -0,0 +1,77 @@ +Microdados descrevendo cada pesquisa. 
+
+Para saber mais acesse:
+Website:
+Github:
+
+Ajude a manter o projeto :)
+Apoia-se: https://apoia.se/basedosdados
+
+Publicado por
+-------------
+Nome: Ricardo Dahis
+Código:
+Website: www.ricardodahis.com
+Email: rdahis@basedosdados.org
+Tratado por
+-----------
+Nome: Ricardo Dahis
+Website: www.ricardodahis.com
+Email: rdahis@basedosdados.org
+
+
+
+
+Colunas identificando linhas unicamente
+-------------------
+- id_pesquisa
+- id_cenario
+- id_candidato_poder360
+
+
+
+
+Cobertura Temporal
+------------------
+- 2000
+- 2001
+- 2002
+- 2003
+- 2004
+- 2005
+- 2006
+- 2007
+- 2008
+- 2009
+- 2010
+- 2011
+- 2012
+- 2013
+- 2014
+- 2015
+- 2016
+- 2017
+- 2018
+- 2019
+- 2020
+- 2021
+- 2022
+
+
+
+
+Cobertura Espacial
+------------------
+
+
+
+
+Tratamento
+----------
+- Excluímos colunas dos dados originais que simplesmente listavam os IDs de cargo, município, instituto, etc.
+- Reordenamos colunas para facilitar entendimento.
+
+
+
+Frequência de Atualização
+-------------------------
+recurring
diff --git a/python-package/basedosdados/upload/metadata.py b/python-package/basedosdados/upload/metadata.py
index c32b114bc..65954fef3 100644
--- a/python-package/basedosdados/upload/metadata.py
+++ b/python-package/basedosdados/upload/metadata.py
@@ -324,7 +324,7 @@ def create(
         # if `dataset_config.yaml` doesn't exist but user wants to create
         # it alongside `table_config.yaml`
         dataset_config_exists = (
-            self.metadata_path / "dataset_config.yaml"
+            self.metadata_path / self.dataset_id / "dataset_config.yaml"
        ).exists()
         if self.table_id and not table_only and not dataset_config_exists:
             self.dataset_metadata_obj.create(if_exists=if_exists)

From 6e1911bd66b1a589df3241d991d4d77d26338516 Mon Sep 17 00:00:00 2001
From: d116626
Date: Mon, 24 Jan 2022 19:58:18 -0300
Subject: [PATCH 09/22] feat(infra) merge master

---
 bases/br_ibge_inpc/mes_categoria_municipio/table_config.yaml   | 2 +-
 bases/br_ibge_inpc/mes_categoria_rm/table_config.yaml          | 2 +-
 bases/br_ibge_ipca15/mes_brasil/table_config.yaml              | 2 +-
 bases/br_ibge_ipca15/mes_categoria_brasil/table_config.yaml    | 2 +-
 bases/br_ibge_ipca15/mes_categoria_municipio/table_config.yaml | 2 +-
 bases/br_ibge_ipca15/mes_categoria_rm/table_config.yaml        | 2 +-
 6 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/bases/br_ibge_inpc/mes_categoria_municipio/table_config.yaml b/bases/br_ibge_inpc/mes_categoria_municipio/table_config.yaml
index c5d3913a2..c5b216db0 100644
--- a/bases/br_ibge_inpc/mes_categoria_municipio/table_config.yaml
+++ b/bases/br_ibge_inpc/mes_categoria_municipio/table_config.yaml
@@ -46,7 +46,7 @@ identifying_columns:
 last_updated:
   metadata:
-  data: 2022_01_12
+  data:
   release: # Versão da tabela. Seguindo o padrão de semantic versioning.
diff --git a/bases/br_ibge_inpc/mes_categoria_rm/table_config.yaml b/bases/br_ibge_inpc/mes_categoria_rm/table_config.yaml
index 347103333..7a8c417c5 100644
--- a/bases/br_ibge_inpc/mes_categoria_rm/table_config.yaml
+++ b/bases/br_ibge_inpc/mes_categoria_rm/table_config.yaml
@@ -45,7 +45,7 @@ identifying_columns:
 last_updated:
   metadata:
-  data: 2022_01_12
+  data:
   release: # Versão da tabela. Seguindo o padrão de semantic versioning.
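The one-line `upload/metadata.py` fix above matters because `dataset_config.yaml` lives inside the dataset's own folder, not directly under `metadata_path`. A minimal standalone sketch of the corrected check (the function name and paths here are hypothetical; only the file layout comes from the patch):

    from pathlib import Path

    def dataset_config_exists(metadata_path: Path, dataset_id: str) -> bool:
        # the config sits at <metadata_path>/<dataset_id>/dataset_config.yaml;
        # checking <metadata_path>/dataset_config.yaml, as the old code did,
        # would normally miss it and trigger a needless dataset create()
        return (metadata_path / dataset_id / "dataset_config.yaml").exists()

    # usage sketch with a hypothetical local layout
    print(dataset_config_exists(Path("bases"), "br_poder360_pesquisas"))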
diff --git a/bases/br_ibge_ipca15/mes_brasil/table_config.yaml b/bases/br_ibge_ipca15/mes_brasil/table_config.yaml index 32de989e1..bae155251 100644 --- a/bases/br_ibge_ipca15/mes_brasil/table_config.yaml +++ b/bases/br_ibge_ipca15/mes_brasil/table_config.yaml @@ -63,7 +63,7 @@ identifying_columns: last_updated: metadata: - data: 2022_01_12 + data: release: # Versão da tabela. Seguindo o padrão de semantic versioning. diff --git a/bases/br_ibge_ipca15/mes_categoria_brasil/table_config.yaml b/bases/br_ibge_ipca15/mes_categoria_brasil/table_config.yaml index 05519c70b..e756eec8e 100644 --- a/bases/br_ibge_ipca15/mes_categoria_brasil/table_config.yaml +++ b/bases/br_ibge_ipca15/mes_categoria_brasil/table_config.yaml @@ -44,7 +44,7 @@ identifying_columns: last_updated: metadata: - data: 2022_01_12 + data: release: # Versão da tabela. Seguindo o padrão de semantic versioning. diff --git a/bases/br_ibge_ipca15/mes_categoria_municipio/table_config.yaml b/bases/br_ibge_ipca15/mes_categoria_municipio/table_config.yaml index af40cc27e..8ba06b446 100644 --- a/bases/br_ibge_ipca15/mes_categoria_municipio/table_config.yaml +++ b/bases/br_ibge_ipca15/mes_categoria_municipio/table_config.yaml @@ -56,7 +56,7 @@ identifying_columns: last_updated: metadata: - data: 2022_01_12 + data: release: # Versão da tabela. Seguindo o padrão de semantic versioning. diff --git a/bases/br_ibge_ipca15/mes_categoria_rm/table_config.yaml b/bases/br_ibge_ipca15/mes_categoria_rm/table_config.yaml index 0f44088fd..7b7ebf653 100644 --- a/bases/br_ibge_ipca15/mes_categoria_rm/table_config.yaml +++ b/bases/br_ibge_ipca15/mes_categoria_rm/table_config.yaml @@ -46,7 +46,7 @@ identifying_columns: last_updated: metadata: - data: 2022_01_12 + data: release: # Versão da tabela. Seguindo o padrão de semantic versioning. 
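The next patch consolidates the CKAN-backed helpers into `python-package/basedosdados/download/metadata.py`. A quick usage sketch of those helpers as defined there, assuming network access to basedosdados.org and the package importable locally (the dataset and table ids are just examples):

    from basedosdados.download.metadata import (
        list_datasets,
        list_dataset_tables,
        get_table_columns,
        search,
    )

    # verbose=False returns plain Python objects instead of printing
    datasets = list_datasets(query="educacao", limit=5, verbose=False)
    tables = list_dataset_tables(dataset_id="br_ibge_pib", verbose=False)
    columns = get_table_columns(
        dataset_id="br_ibge_pib", table_id="municipio", verbose=False
    )

    # search returns a pandas DataFrame with one row per table found
    results = search(query="agua", order_by="score")
    print(results.shape)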
From a293d5d994dc122c3219766959065635723b6834 Mon Sep 17 00:00:00 2001
From: lucascr91
Date: Sat, 29 Jan 2022 12:49:43 -0300
Subject: [PATCH 10/22] fix files organization to match master

---
 .../basedosdados/download/metadata.py         | 438 +++++++++++++++++
 python-package/tests/test_metadata.py         | 439 ------------------
 2 files changed, 438 insertions(+), 439 deletions(-)
 create mode 100644 python-package/basedosdados/download/metadata.py
 delete mode 100644 python-package/tests/test_metadata.py

diff --git a/python-package/basedosdados/download/metadata.py b/python-package/basedosdados/download/metadata.py
new file mode 100644
index 000000000..24dbc02cf
--- /dev/null
+++ b/python-package/basedosdados/download/metadata.py
@@ -0,0 +1,438 @@
+from google.cloud import bigquery
+import pandas as pd
+import requests
+
+from basedosdados.download.base import credentials
+
+
+def _get_header(text):
+    """Gets the first paragraph of a text.
+    Args:
+        text (str or None): Text to be split
+    Returns:
+        str: First paragraph
+    """
+
+    if isinstance(text, str):
+        return text.split("\n")[0]
+    elif text is None:
+        return ""
+
+
+def _fix_size(s, step=80):
+    """Wraps a string so that each line holds roughly `step` characters."""
+
+    final = ""
+
+    for word in s.split(" "):
+        # start a new line once the current one reaches `step` characters,
+        # keeping the word that triggered the break
+        final += (word + " ") if len(final.split("\n")[-1]) < step else ("\n" + word + " ")
+
+    return final
+
+
+def _print_output(df):
+    """Prints dataframe contents as print blocks
+    Args:
+        df (pd.DataFrame): table to be printed
+    """
+
+    columns = df.columns
+    step = 80
+    print()
+    for i, row in df.iterrows():
+        for c in columns:
+            print(_fix_size(f"{c}: \n\t{row[c]}"))
+        print("-" * (step + 15))
+    print()
+
+
+def _handle_output(verbose, output_type, df, col_name=None):
+    """Handles datasets and tables listing outputs based on user's choice.
+    Either prints it to the screen or returns it as a `list` object.
+    Args:
+        verbose (bool): amount of verbosity
+        output_type (str): type of output
+        df (pd.DataFrame, bigquery.Dataset or bigquery.Table): table containing datasets metadata
+        col_name (str): name of column with id's data
+    """
+
+    df_is_dataframe = type(df) == pd.DataFrame
+    df_is_bq_dataset_or_table = type(df) == bigquery.Table
+    df_is_bq_dataset_or_table |= type(df) == bigquery.Dataset
+
+    # explicit comparisons with True/False so that non-bool values of
+    # `verbose` fall through to the TypeError below
+    if verbose == True and df_is_dataframe:
+        _print_output(df)
+
+    elif verbose == True and df_is_bq_dataset_or_table:
+        print(df.description)
+
+    elif verbose == False:
+        if output_type == "list":
+            return df[col_name].to_list()
+        elif output_type == "str":
+            return df.description
+        elif output_type == "records":
+            return df.to_dict("records")
+        else:
+            msg = '`output_type` argument must be set to "list", "str" or "records".'
+            raise ValueError(msg)
+
+    else:
+        raise TypeError("`verbose` argument must be of `bool` type.")
+
+    return None
+
+
+def list_datasets(query, limit=10, with_description=False, verbose=True):
+    """
+    This function uses the `bd_dataset_search` website API
+    endpoint to retrieve a list of available datasets.
+
+    Args:
+        query (str):
+            String to search in datasets' metadata.
+        limit (int):
+            Maximum number of results to return.
+        with_description (bool): Optional
+            If True, fetch short dataset description for each dataset.
+        verbose (bool): Optional.
+            If set to True, information is printed to the screen. If set to False, a list object is returned.
+
+    Returns:
+        list | stdout
+    """
+
+    url = f"https://basedosdados.org/api/3/action/bd_dataset_search?q={query}&page_size={limit}&resource_type=bdm_table"
+
+    # validate url
+    try:
+        response = requests.get(url)
+        response.raise_for_status()
+    except requests.exceptions.HTTPError as err:
+        return err
+
+    json_response = response.json()
+
+    # this dict has all information we need to output the function
+    dataset_dict = {
+        "dataset_id": [
+            dataset["name"] for dataset in json_response["result"]["datasets"]
+        ],
+        "description": [
+            dataset["notes"] if "notes" in dataset.keys() else None
+            for dataset in json_response["result"]["datasets"]
+        ],
+    }
+
+    # select desired output using dataset_id info. Note that the output is either a standardized string or a list
+    if verbose & (with_description == False):
+        return _print_output(pd.DataFrame.from_dict(dataset_dict)[["dataset_id"]])
+    elif verbose & with_description:
+        return _print_output(
+            pd.DataFrame.from_dict(dataset_dict)[["dataset_id", "description"]]
+        )
+    elif (verbose == False) & (with_description == False):
+        return dataset_dict["dataset_id"]
+    elif (verbose == False) & with_description:
+        return [
+            {
+                "dataset_id": dataset_dict["dataset_id"][k],
+                "description": dataset_dict["description"][k],
+            }
+            for k in range(len(dataset_dict["dataset_id"]))
+        ]
+
+
+def list_dataset_tables(
+    dataset_id,
+    with_description=False,
+    verbose=True,
+):
+    """
+    Fetch table_id for tables available at the specified dataset_id. Prints the information on screen or returns it as a list.
+
+    Args:
+        dataset_id (str): Required.
+            Dataset id returned by the list_datasets function.
+        with_description (bool): Optional
+            If True, fetch a short description for each table that matches the search criteria.
+        verbose (bool): Optional.
+            If set to True, information is printed to the screen. If set to False, a list object is returned.
+
+    Returns:
+        stdout | list
+    """
+
+    url = f"https://basedosdados.org/api/3/action/bd_bdm_dataset_show?dataset_id={dataset_id}"
+
+    # validate url
+    try:
+        response = requests.get(url)
+        response.raise_for_status()
+    except requests.exceptions.HTTPError as err:
+        return err
+
+    json_response = response.json()
+
+    dataset = json_response["result"]
+    # this dict has all information needed to output the function
+    table_dict = {
+        "table_id": [
+            dataset["resources"][k]["name"] for k in range(len(dataset["resources"]))
+        ],
+        "description": [
+            dataset["resources"][k]["description"]
+            for k in range(len(dataset["resources"]))
+        ],
+    }
+    # select desired output using table_id info. Note that the output is either a standardized string or a list
+    if verbose & (with_description == False):
+        return _print_output(pd.DataFrame.from_dict(table_dict)[["table_id"]])
+    elif verbose & with_description:
+        return _print_output(
+            pd.DataFrame.from_dict(table_dict)[["table_id", "description"]]
+        )
+    elif (verbose == False) & (with_description == False):
+        return table_dict["table_id"]
+    elif (verbose == False) & with_description:
+        return [
+            {
+                "table_id": table_dict["table_id"][k],
+                "description": table_dict["description"][k],
+            }
+            for k in range(len(table_dict["table_id"]))
+        ]
+
+
+def get_dataset_description(
+    dataset_id,
+    verbose=True,
+):
+    """
+    Prints the full dataset description.
+
+    Args:
+        dataset_id (str): Required.
+            Dataset id available in list_datasets.
+        verbose (bool): Optional.
+            If set to True, information is printed to the screen. If set to False, data is returned as a `str`.
+
+    Returns:
+        stdout | str
+    """
+    url = f"https://basedosdados.org/api/3/action/bd_bdm_dataset_show?dataset_id={dataset_id}"
+
+    # validate url
+    try:
+        response = requests.get(url)
+        response.raise_for_status()
+    except requests.exceptions.HTTPError as err:
+        return err
+
+    json_response = response.json()
+
+    description = json_response["result"]["notes"]
+
+    if verbose:
+        print(description)
+    else:
+        return description
+
+
+def get_table_description(
+    dataset_id,
+    table_id,
+    verbose=True,
+):
+    """
+    Prints the full table description.
+
+    Args:
+        dataset_id (str): Required.
+            Dataset id available in list_datasets.
+        table_id (str): Required.
+            Table id available in list_dataset_tables.
+        verbose (bool): Optional.
+            If set to True, information is printed to the screen. If set to False, data is returned as a `str`.
+
+    Returns:
+        stdout | str
+    """
+
+    url = f"https://basedosdados.org/api/3/action/bd_bdm_table_show?dataset_id={dataset_id}&table_id={table_id}"
+
+    # validate url
+    try:
+        response = requests.get(url)
+        response.raise_for_status()
+    except requests.exceptions.HTTPError as err:
+        return err
+
+    json_response = response.json()
+
+    description = json_response["result"]["description"]
+
+    if verbose:
+        print(description)
+    else:
+        return description
+
+
+def get_table_columns(
+    dataset_id,
+    table_id,
+    verbose=True,
+):
+    """
+    Fetch the names, types and descriptions for the columns in the specified table. Prints
+    information on screen.
+    Args:
+        dataset_id (str): Required.
+            Dataset id available in list_datasets.
+        table_id (str): Required.
+            Table id available in list_dataset_tables.
+        verbose (bool): Optional.
+            If set to True, information is printed to the screen. If set to False, data is returned as a `list` of `dict`s.
+
+    Returns:
+        stdout | list
+    """
+
+    url = f"https://basedosdados.org/api/3/action/bd_bdm_table_show?dataset_id={dataset_id}&table_id={table_id}"
+
+    # validate url
+    try:
+        response = requests.get(url)
+        response.raise_for_status()
+    except requests.exceptions.HTTPError as err:
+        return err
+
+    json_response = response.json()
+
+    columns = json_response["result"]["columns"]
+
+    if verbose:
+        _print_output(pd.DataFrame(columns))
+    else:
+        return columns
+
+
+def get_table_size(
+    dataset_id,
+    table_id,
+    billing_project_id,
+    query_project_id="basedosdados",
+    from_file=False,
+    verbose=True,
+):
+    """Use a query to get the number of rows and size (in Mb) of a table query
+    from BigQuery. Prints information on screen in markdown friendly format.
+
+    WARNING: this query may cost a lot depending on the table.
+
+    Args:
+        dataset_id (str): Required.
+            Dataset id available in basedosdados. It should always come with table_id.
+        table_id (str): Required.
+            Table id available in basedosdados.dataset_id.
+            It should always come with dataset_id.
+        billing_project_id (str): Required.
+            Project that will be billed. Find your Project ID here https://console.cloud.google.com/projectselector2/home/dashboard
+        query_project_id (str): Optional.
+            Which project the table lives in. You can change this if you want to query different projects.
+        verbose (bool): Optional.
+            If set to True, information is printed to the screen. If set to False, data is returned as a `list` of `dict`s.
+
+    Example:
+        get_table_size(
+            dataset_id='br_ibge_censo2010',
+            table_id='pessoa_renda_setor_censitario',
+            billing_project_id='yourprojectid'
+        )
+    """
+    billing_client = bigquery.Client(
+        credentials=credentials(from_file=from_file), project=billing_project_id
+    )
+
+    query = f"""SELECT COUNT(*) FROM {query_project_id}.{dataset_id}.{table_id}"""
+
+    job = billing_client.query(query, location="US")
+
+    num_rows = job.to_dataframe().loc[0, "f0_"]
+
+    size_mb = round(job.total_bytes_processed / 1024 / 1024, 2)
+
+    table_data = pd.DataFrame(
+        [
+            {
+                "project_id": query_project_id,
+                "dataset_id": dataset_id,
+                "table_id": table_id,
+                "num_rows": num_rows,
+                "size_mb": size_mb,
+            }
+        ]
+    )
+
+    return _handle_output(verbose=verbose, output_type="records", df=table_data)
+
+
+def search(query, order_by):
+    """This function works as a wrapper to the `bd_dataset_search` website API
+    endpoint.
+
+    Args:
+        query (str):
+            String to search in datasets and tables' metadata.
+        order_by (str): score|popular|recent
+            Field by which the results will be ordered.
+
+    Returns:
+        pd.DataFrame:
+            Response from the API presented as a pandas DataFrame. Each row is
+            a table. Each column is a field identifying the table.
+    """
+
+    # validate order_by input
+    if order_by not in ["score", "popular", "recent"]:
+        raise ValueError(
+            f'order_by must be score, popular or recent. Received "{order_by}"'
+        )
+
+    url = f"https://basedosdados.org/api/3/action/bd_dataset_search?q={query}&order_by={order_by}&resource_type=bdm_table"
+
+    # validate url
+    try:
+        response = requests.get(url)
+        response.raise_for_status()
+    except requests.exceptions.HTTPError as err:
+        # bail out instead of trying to parse a failed response
+        return err
+
+    json_response = response.json()
+
+    dataset_dfs = []
+    # the outer loop iterates over the datasets returned by the API
+    for dataset in json_response["result"]["datasets"]:
+        tables_dfs = []
+        # the inner loop extracts each table's information for this dataset
+        for table in dataset["resources"]:
+            data_table = pd.DataFrame(
+                {k: str(table[k]) for k in list(table.keys())}, index=[0]
+            )
+            tables_dfs.append(data_table)
+        if not tables_dfs:
+            # skip datasets that expose no tables
+            continue
+        # append tables' dataframes for each dataset
+        data_ds = tables_dfs[0].append(tables_dfs[1:]).reset_index(drop=True)
+        dataset_dfs.append(data_ds)
+    if not dataset_dfs:
+        # no dataset matched the query
+        return pd.DataFrame()
+    # append datasets' dataframes
+    df = dataset_dfs[0].append(dataset_dfs[1:]).reset_index(drop=True)
+
+    return df
+
diff --git a/python-package/tests/test_metadata.py b/python-package/tests/test_metadata.py
deleted file mode 100644
index 316adfdfc..000000000
--- a/python-package/tests/test_metadata.py
+++ /dev/null
@@ -1,439 +0,0 @@
-import pytest
-import ruamel.yaml as ryaml
-
-from pathlib import Path
-import shutil
-import random
-import string
-
-from basedosdados import Metadata
-from basedosdados.exceptions import BaseDosDadosException
-
-
-DATASET_ID = "pytest"
-TABLE_ID = "pytest"
-
-METADATA_FILES = {"dataset": "dataset_config.yaml", "table": "table_config.yaml"}
-
-
-@pytest.fixture
-def metadatadir(tmpdir_factory):
-    (Path(__file__).parent / "tmp_bases").mkdir(exist_ok=True)
-    return Path(__file__).parent / "tmp_bases"
-
-
-@pytest.fixture
-def dataset_metadata(metadatadir):
-    return Metadata(dataset_id=DATASET_ID, metadata_path=metadatadir)
-
-
-@pytest.fixture
-def table_metadata(metadatadir):
-    return Metadata(dataset_id=DATASET_ID, table_id=TABLE_ID, metadata_path=metadatadir)
-
-
-@pytest.fixture
-def dataset_metadata_path(metadatadir):
-    return Path(metadatadir) / DATASET_ID
-
-
-@pytest.fixture
-def table_metadata_path(metadatadir):
-
return Path(metadatadir) / DATASET_ID / TABLE_ID - - -def test_create_from_dataset_id(dataset_metadata, dataset_metadata_path): - shutil.rmtree(dataset_metadata_path, ignore_errors=True) - dataset_metadata.create() - assert (dataset_metadata_path / METADATA_FILES["dataset"]).exists() - - -def test_create_from_dataset_and_table_id(table_metadata, table_metadata_path): - shutil.rmtree(table_metadata_path, ignore_errors=True) - table_metadata.create() - assert (table_metadata_path / METADATA_FILES["table"]).exists() - - -def test_create_if_exists_raise(dataset_metadata, table_metadata): - - with pytest.raises(FileExistsError): - dataset_metadata.create(if_exists="raise") - - with pytest.raises(FileExistsError): - table_metadata.create(if_exists="raise") - - -def test_create_if_exists_replace( - dataset_metadata, dataset_metadata_path, table_metadata, table_metadata_path -): - dataset_metadata.create(if_exists="replace") - assert (dataset_metadata_path / METADATA_FILES["dataset"]).exists() - - table_metadata.create(if_exists="replace") - assert (table_metadata_path / METADATA_FILES["table"]).exists() - - -def test_create_if_exists_pass( - dataset_metadata, dataset_metadata_path, table_metadata, table_metadata_path -): - - # make sure new file is created - dataset_metadata.create(if_exists="replace") - assert (dataset_metadata_path / METADATA_FILES["dataset"]).exists() - - # make sure no Exception is raised on `if_exists="pass"` - dataset_metadata.create(if_exists="pass") - - # same procedure for `Table` - table_metadata.create(if_exists="replace") - assert (table_metadata_path / METADATA_FILES["table"]).exists() - table_metadata.create(if_exists="pass") - - -def test_create_columns(table_metadata, table_metadata_path): - shutil.rmtree(table_metadata_path, ignore_errors=True) - table_metadata.create(columns=["column1", "column2"]) - assert (table_metadata_path / METADATA_FILES["table"]).exists() - - -@pytest.fixture -def existent_metadata(metadatadir): - table_metadata_obj = Metadata( - dataset_id="br_me_rais", - table_id="microdados_vinculos", - metadata_path=metadatadir, - ) - return table_metadata_obj - - -@pytest.fixture -def existent_metadata_path(metadatadir): - return Path(metadatadir) / "br_me_rais" / "microdados_vinculos" - - -def test_create_partition_columns_from_existent_table( - existent_metadata: Metadata, - existent_metadata_path: Path, -): - shutil.rmtree(existent_metadata_path, ignore_errors=True) - - existent_metadata.create() - assert existent_metadata_path.exists() - - metadata_dict = existent_metadata.local_metadata - assert metadata_dict.get("partitions") == "ano, sigla_uf, id_municipio" - - -def test_create_partition_columns_from_user_input( - existent_metadata: Metadata, - existent_metadata_path: Path, -): - shutil.rmtree(existent_metadata_path, ignore_errors=True) - - existent_metadata.create(partition_columns=["id_municipio"]) - assert existent_metadata_path.exists() - - metadata_dict = existent_metadata.local_metadata - assert metadata_dict.get("partitions") == "id_municipio" - - -def test_create_force_columns_is_true( - existent_metadata: Metadata, - existent_metadata_path: Path, -): - shutil.rmtree(existent_metadata_path, ignore_errors=True) - existent_metadata.create(columns=["column1", "column2"], force_columns=True) - assert (existent_metadata_path / METADATA_FILES["table"]).exists() - - table_metadata_dict = existent_metadata.local_metadata - assert table_metadata_dict["columns"][0]["name"] == "column1" - assert table_metadata_dict["columns"][1]["name"] == 
"column2" - - -def test_create_force_columns_is_false( - existent_metadata: Metadata, - existent_metadata_path: Path, -): - shutil.rmtree(existent_metadata_path, ignore_errors=True) - existent_metadata.create(columns=["column1", "column2"], force_columns=False) - assert (existent_metadata_path / METADATA_FILES["table"]).exists() - - table_metadata_dict = existent_metadata.local_metadata - assert table_metadata_dict["columns"][0]["name"] != "column1" - assert table_metadata_dict["columns"][1]["name"] != "column2" - - -def test_create_table_only_is_true( - table_metadata, dataset_metadata_path, table_metadata_path - ): - shutil.rmtree(dataset_metadata_path, ignore_errors=True) - shutil.rmtree(table_metadata_path, ignore_errors=True) - - table_metadata.create(table_only=True) - assert (table_metadata_path / METADATA_FILES["table"]).exists() - assert not (dataset_metadata_path / METADATA_FILES["dataset"]).exists() - - -def test_create_table_only_is_false( - table_metadata, dataset_metadata_path, table_metadata_path - ): - shutil.rmtree(dataset_metadata_path, ignore_errors=True) - shutil.rmtree(table_metadata_path, ignore_errors=True) - - table_metadata.create(table_only=False) - assert (table_metadata_path / METADATA_FILES["table"]).exists() - assert (dataset_metadata_path / METADATA_FILES["dataset"]).exists() - - -@pytest.fixture -def out_of_date_metadata_obj(metadatadir): - out_of_date_metadata = Metadata(dataset_id="br_me_caged", metadata_path=metadatadir) - out_of_date_metadata.create(if_exists="replace") - - out_of_date_config = out_of_date_metadata.local_metadata - out_of_date_config["metadata_modified"] = "old_date" - ryaml.dump( - out_of_date_config, open(out_of_date_metadata.filepath, "w", encoding="utf-8") - ) - - return out_of_date_metadata - - -@pytest.fixture -def updated_metadata_obj(metadatadir): - updated_metadata = Metadata(dataset_id="br_me_caged", metadata_path=metadatadir) - updated_metadata.create(if_exists="replace") - - updated_config = updated_metadata.local_metadata - updated_config["metadata_modified"] = updated_metadata.ckan_metadata[ - "metadata_modified" - ] - ryaml.dump(updated_config, open(updated_metadata.filepath, "w", encoding="utf-8")) - - return updated_metadata - - -def test_is_updated_is_true(updated_metadata_obj): - assert updated_metadata_obj.is_updated() == True - - -def test_is_updated_is_false(out_of_date_metadata_obj): - assert out_of_date_metadata_obj.is_updated() == False - - -@pytest.fixture -def valid_metadata_dataset(metadatadir): - dataset_metadata = Metadata(dataset_id="br_ibge_pib", metadata_path=metadatadir) - dataset_metadata.create(if_exists="replace") - dataset_metadata.CKAN_API_KEY = "valid-key" - return dataset_metadata - - -@pytest.fixture -def valid_metadata_table(metadatadir): - table_metadata = Metadata( - dataset_id="br_ibge_pib", - table_id="municipio", - metadata_path=metadatadir, - ) - table_metadata.create(if_exists="replace") - dataset_metadata.CKAN_API_KEY = "valid-key" - return table_metadata - - -def test_validate_is_succesful( - valid_metadata_dataset: Metadata, valid_metadata_table: Metadata -): - assert valid_metadata_dataset.validate() == True - assert valid_metadata_table.validate() == True - - -@pytest.fixture -def invalid_dataset_metadata(metadatadir): - invalid_dataset_metadata = Metadata( - dataset_id="br_ibge_pib", - metadata_path=metadatadir, - ) - invalid_dataset_metadata.create(if_exists="replace") - - invalid_config = invalid_dataset_metadata.local_metadata - invalid_config["title"] = {"this_title": 
"is_not_valid"} - - print(invalid_dataset_metadata.filepath) - - with open(invalid_dataset_metadata.filepath, "w", encoding="utf-8") as file: - ryaml.dump(invalid_config, file) - - return invalid_dataset_metadata - - -@pytest.fixture -def invalid_table_metadata(metadatadir): - invalid_dataset_metadata = Metadata( - dataset_id="br_ibge_pib", - table_id="municipio", - metadata_path=metadatadir, - ) - invalid_dataset_metadata.create(if_exists="replace") - - invalid_config = invalid_dataset_metadata.local_metadata - invalid_config["table_id"] = None - - with open(invalid_dataset_metadata.filepath, "w", encoding="utf-8") as file: - ryaml.dump(invalid_config, file) - - return invalid_dataset_metadata - - -def test_validate_is_not_succesful( - invalid_dataset_metadata: Metadata, - invalid_table_metadata: Metadata, -): - with pytest.raises(BaseDosDadosException): - invalid_table_metadata.validate() - - with pytest.raises(BaseDosDadosException): - invalid_dataset_metadata.validate() - - -@pytest.fixture -def invalid_organization_dataset(metadatadir): - invalid_organization_dataset = Metadata( - dataset_id="br_ibge_pib", - metadata_path=metadatadir, - ) - invalid_organization_dataset.create(if_exists="replace") - - invalid_config = invalid_organization_dataset.local_metadata - invalid_config["organization"] = "not-a-valid-organization" - - with open(invalid_organization_dataset.filepath, "w", encoding="utf-8") as file: - ryaml.dump(invalid_config, file) - - return invalid_organization_dataset - - -def test_validate_organization_not_found(invalid_organization_dataset): - with pytest.raises(BaseDosDadosException, match="Organization not found"): - invalid_organization_dataset.validate() - - -# TODO: Mock ckan server to activate publish tests -@pytest.fixture -def pytest_dataset(metadatadir): - shutil.rmtree(metadatadir, ignore_errors=True) - pytest_dataset = Metadata( - dataset_id="pytest", - metadata_path=metadatadir - ) - pytest_dataset.create(if_exists="replace") - - # fill dataset metadata for it to be publishable - pytest_dataset_metadata = pytest_dataset.local_metadata - pytest_dataset_metadata["organization"] = "acaps" # set valid organization - - # materialize metadata file - ryaml.dump( - pytest_dataset_metadata, - open(pytest_dataset.filepath, "w", encoding="utf-8") - ) - - return pytest_dataset - - -@pytest.fixture -def pytest_table(metadatadir): - shutil.rmtree(metadatadir, ignore_errors=True) - pytest_table = Metadata( - dataset_id="pytest", - table_id="pytest" - ) - pytest_table.create(if_exists="replace") - return pytest_table - - -@pytest.mark.skip( - reason="This test requires a mocked CKAN server and a test dataset/table.") -def test_publish_is_successful( - valid_metadata_dataset: Metadata, - valid_metadata_table: Metadata, -): - assert isinstance(valid_metadata_dataset.publish(), dict) - assert isinstance(valid_metadata_table.publish(), dict) - - -@pytest.mark.skip(reason="This test requires a mocked CKAN server.") -def test_publish_is_not_successful( - invalid_dataset_metadata: Metadata, - invalid_table_metadata: Metadata, -): - with pytest.raises(AssertionError, match="Could not publish"): - invalid_dataset_metadata.publish() - - with pytest.raises(BaseDosDadosException, match="Could not publish"): - invalid_table_metadata.publish() - - -@pytest.mark.skip( - reason="This test requires a mocked CKAN server and a delete endpoint." 
-) -def test_publish_all_is_true( - pytest_dataset: Metadata, - pytest_table: Metadata, -): - res = pytest_table.publish(all=True) - assert isinstance(res, dict) - assert res != {} - assert pytest_dataset.exists_in_ckan() - - -@pytest.mark.skip(reason="This test requires a mocked CKAN server.") -def test_publish_if_exists_raise(valid_metadata_dataset: Metadata): - with pytest.raises(BaseDosDadosException, match="already exists in CKAN"): - valid_metadata_dataset.publish(if_exists="raise") - - -@pytest.mark.skip( - reason="This test requires a mocked CKAN server and a test dataset." -) -def test_publish_if_exists_replace(valid_metadata_dataset: Metadata): - res = valid_metadata_dataset.publish(if_exists="replace") - assert isinstance(res, dict) - assert res != {} - - -@pytest.mark.skip(reason="This test requires a mocked CKAN server.") -def test_publish_if_exists_pass(valid_metadata_dataset: Metadata): - assert isinstance(valid_metadata_dataset.publish(if_exists="pass"), dict) - assert valid_metadata_dataset.publish(if_exists="pass") == {} - - -@pytest.mark.skip(reason="This test requires a mocked CKAN server.") -def test_publish_update_locally_is_true( - pytest_dataset: Metadata -): - date_before = pytest_dataset.local_metadata.get('metadata_modified') - - # update local metadata - new_metadata = pytest_dataset.local_metadata.copy() - - # generate random strings with 3 characters - random_string = "".join(random.choice(string.ascii_uppercase) for _ in range(3)) - - # update metadata tags with random_string - new_metadata["tags"] = [random_string] - ryaml.dump( - new_metadata, open(new_metadata.filepath, "w", encoding="utf-8") - ) - - # publish changes - pytest_dataset.publish(update_locally=True) - - # get new tags from local metadata - new_tags = pytest_dataset.local_metadata.get('tags') - - # get new `metadata_modified` value from local config file - date_after = pytest_dataset.local_metadata.get('metadata_modified') - - assert new_tags == [random_string], "Tags were not updated locally" - assert date_after > date_before, "Date after should be greater than date before" From 1ab4121a287b837a893feb7d707c37932b0380e7 Mon Sep 17 00:00:00 2001 From: lucascr91 Date: Sat, 29 Jan 2022 12:55:18 -0300 Subject: [PATCH 11/22] remove download.py --- .../basedosdados/download/download.py | 362 +++++++----------- 1 file changed, 144 insertions(+), 218 deletions(-) diff --git a/python-package/basedosdados/download/download.py b/python-package/basedosdados/download/download.py index adf532000..c4dd89c25 100644 --- a/python-package/basedosdados/download/download.py +++ b/python-package/basedosdados/download/download.py @@ -18,7 +18,6 @@ BaseDosDadosNoBillingProjectIDException, ) from pandas_gbq.gbq import GenericGBQException -import requests def credentials(from_file=False, reauth=False): @@ -359,241 +358,221 @@ def _handle_output(verbose, output_type, df, col_name=None): return None -def list_datasets(query, limit=10, with_description=False, verbose=True): - """ - This function uses `bd_dataset_search` website API - enpoint to retrieve a list of available datasets. +def list_datasets( + query_project_id="basedosdados", + filter_by=None, + with_description=False, + from_file=False, + verbose=True, +): + """Fetch the dataset_id of datasets available at query_project_id. Prints information on + screen or returns it as a list. Args: - query (str): - String to search in datasets' metadata. - limit (int): - Field to limit the number of results + query_project_id (str): Optional. 
+ Which project the table lives. You can change this you want to query different projects. + filter_by (str): Optional + String to be matched in dataset_id. with_description (bool): Optional If True, fetch short dataset description for each dataset. verbose (bool): Optional. If set to True, information is printed to the screen. If set to False, a list object is returned. - Returns: - list | stdout + + Example: + list_datasets( + filter_by='sp', + with_description=True, + ) """ - url = f"https://basedosdados.org/api/3/action/bd_dataset_search?q={query}&page_size={limit}&resource_type=bdm_table" + client = bigquery.Client( + credentials=credentials(from_file=from_file), project=query_project_id + ) - # validate url - try: - response = requests.get(url) - response.raise_for_status() - except requests.exceptions.HTTPError as err: - return err - - json_response = response.json() - - # this dict has all information we need to output the function - dataset_dict = { - "dataset_id": [ - dataset["name"] for dataset in json_response["result"]["datasets"] - ], - "description": [ - dataset["notes"] if "notes" in dataset.keys() else None - for dataset in json_response["result"]["datasets"] - ], - } - - # select desired output using dataset_id info. Note that the output is either a standardized string or a list - if verbose & (with_description == False): - return _print_output(pd.DataFrame.from_dict(dataset_dict)[["dataset_id"]]) - elif verbose & with_description: - return _print_output( - pd.DataFrame.from_dict(dataset_dict)[["dataset_id", "description"]] - ) - elif (verbose == False) & (with_description == False): - return dataset_dict["dataset_id"] - elif (verbose == False) & with_description: - return [ - { - "dataset_id": dataset_dict["dataset_id"][k], - "description": dataset_dict["description"][k], - } - for k in range(len(dataset_dict["dataset_id"])) + datasets_list = list(client.list_datasets()) + + datasets = pd.DataFrame( + [dataset.dataset_id for dataset in datasets_list], columns=["dataset_id"] + ) + + if filter_by: + + datasets = datasets.loc[datasets["dataset_id"].str.contains(filter_by)] + + if with_description: + + datasets["description"] = [ + _get_header(client.get_dataset(dataset).description) + for dataset in datasets["dataset_id"] ] + return _handle_output( + verbose=verbose, + output_type="list", + df=datasets, + col_name="dataset_id", + ) + def list_dataset_tables( dataset_id, + query_project_id="basedosdados", + from_file=False, + filter_by=None, with_description=False, verbose=True, ): - """ - Fetch table_id for tables available at the specified dataset_id. Prints the information on screen or returns it as a list. + """Fetch table_id for tables available at the specified dataset_id. Prints the information + on screen or returns it as a list. Args: dataset_id (str): Optional. - Dataset id returned by list_datasets function - limit (int): - Field to limit the number of results + Dataset id available in basedosdados. + query_project_id (str): Optional. + Which project the table lives. You can change this you want to query different projects. + filter_by (str): Optional + String to be matched in the table_id. with_description (bool): Optional If True, fetch short table descriptions for each table that match the search criteria. verbose (bool): Optional. If set to True, information is printed to the screen. If set to False, a list object is returned. 
- Returns: - stdout | list + Example: + list_dataset_tables( + dataset_id='br_ibge_censo2010' + filter_by='renda', + with_description=True, + ) """ + client = bigquery.Client( + credentials=credentials(from_file=from_file), project=query_project_id + ) - url = f"https://basedosdados.org/api/3/action/bd_bdm_dataset_show?dataset_id={dataset_id}" + dataset = client.get_dataset(dataset_id) - # validate url - try: - response = requests.get(url) - response.raise_for_status() - except requests.exceptions.HTTPError as err: - return err - - json_response = response.json() - - dataset = json_response["result"] - # this dict has all information need to output the function - table_dict = { - "table_id": [ - dataset["resources"][k]["name"] for k in range(len(dataset["resources"])) - ], - "description": [ - dataset["resources"][k]["description"] - for k in range(len(dataset["resources"])) - ], - } - # select desired output using table_id info. Note that the output is either a standardized string or a list - if verbose & (with_description == False): - return _print_output(pd.DataFrame.from_dict(table_dict)[["table_id"]]) - elif verbose & with_description: - return _print_output( - pd.DataFrame.from_dict(table_dict)[["table_id", "description"]] - ) - elif (verbose == False) & (with_description == False): - return table_dict["table_id"] - elif (verbose == False) & with_description: - return [ - { - "table_id": table_dict["table_id"][k], - "description": table_dict["description"][k], - } - for k in range(len(table_dict["table_id"])) + tables_list = list(client.list_tables(dataset)) + + tables = pd.DataFrame( + [table.table_id for table in tables_list], columns=["table_id"] + ) + + if filter_by: + + tables = tables.loc[tables["table_id"].str.contains(filter_by)] + + if with_description: + + tables["description"] = [ + _get_header(client.get_table(f"{dataset_id}.{table}").description) + for table in tables["table_id"] ] + return _handle_output( + verbose=verbose, + output_type="list", + df=tables, + col_name="table_id", + ) + def get_dataset_description( - dataset_id, + dataset_id=None, + query_project_id="basedosdados", + from_file=False, verbose=True, ): - """ - Prints the full dataset description. + """Prints the full dataset description. Args: - dataset_id (str): Required. - Dataset id available in list_datasets. + dataset_id (str): Optional. + Dataset id available in basedosdados. + query_project_id (str): Optional. + Which project the table lives. You can change this you want to query different projects. verbose (bool): Optional. If set to True, information is printed to the screen. If set to False, data is returned as a `str`. - - Returns: - stdout | str """ - url = f"https://basedosdados.org/api/3/action/bd_bdm_dataset_show?dataset_id={dataset_id}" - # validate url - try: - response = requests.get(url) - response.raise_for_status() - except requests.exceptions.HTTPError as err: - return err - - json_response = response.json() + client = bigquery.Client( + credentials=credentials(from_file=from_file), project=query_project_id + ) - description = json_response["result"]["notes"] + dataset = client.get_dataset(dataset_id) - if verbose: - print(description) - else: - return description + return _handle_output(verbose=verbose, output_type="str", df=dataset) def get_table_description( - dataset_id, - table_id, + dataset_id=None, + table_id=None, + query_project_id="basedosdados", + from_file=False, verbose=True, ): - """ - Prints the full table description. + """Prints the full table description. 
Args: - dataset_id (str): Required. - Dataset id available in list_datasets. - table_id (str): Required. - Table id available in list_dataset_tables + dataset_id (str): Optional. + Dataset id available in basedosdados. It should always come with table_id. + table_id (str): Optional. + Table id available in basedosdados.dataset_id. + It should always come with dataset_id. + query_project_id (str): Optional. + Which project the table lives. You can change this you want to query different projects. verbose (bool): Optional. If set to True, information is printed to the screen. If set to False, data is returned as a `str`. - - Returns: - stdout | str """ - url = f"https://basedosdados.org/api/3/action/bd_bdm_table_show?dataset_id={dataset_id}&table_id={table_id}" - - # validate url - try: - response = requests.get(url) - response.raise_for_status() - except requests.exceptions.HTTPError as err: - return err - - json_response = response.json() + client = bigquery.Client( + credentials=credentials(from_file=from_file), project=query_project_id + ) - description = json_response["result"]["description"] + table = client.get_table(f"{dataset_id}.{table_id}") - if verbose: - print(description) - else: - return description + return _handle_output(verbose=verbose, output_type="str", df=table) def get_table_columns( - dataset_id, - table_id, + dataset_id=None, + table_id=None, + query_project_id="basedosdados", + from_file=False, verbose=True, ): - """ - Fetch the names, types and descriptions for the columns in the specified table. Prints - information on screen. + """Fetch the names, types and descriptions for the columns in the specified table. Prints + information on screen. + Args: - dataset_id (str): Required. - Dataset id available in list_datasets. - table_id (str): Required. - Table id available in list_dataset_tables + dataset_id (str): Optional. + Dataset id available in basedosdados. It should always come with table_id. + table_id (str): Optional. + Table id available in basedosdados.dataset_id. + It should always come with dataset_id. + query_project_id (str): Optional. + Which project the table lives. You can change this you want to query different projects. verbose (bool): Optional. If set to True, information is printed to the screen. If set to False, data is returned as a `list` of `dict`s. 
- - Returns: - stdout | list + Example: + get_table_columns( + dataset_id='br_ibge_censo2010', + table_id='pessoa_renda_setor_censitario' + ) """ - url = f"https://basedosdados.org/api/3/action/bd_bdm_table_show?dataset_id={dataset_id}&table_id={table_id}" + client = bigquery.Client( + credentials=credentials(from_file=from_file), project=query_project_id + ) - # validate url - try: - response = requests.get(url) - response.raise_for_status() - except requests.exceptions.HTTPError as err: - return err + table_ref = client.get_table(f"{dataset_id}.{table_id}") - json_response = response.json() + columns = [ + (field.name, field.field_type, field.description) for field in table_ref.schema + ] - columns = json_response["result"]["columns"] + description = pd.DataFrame(columns, columns=["name", "field_type", "description"]) - if verbose: - _print_output(pd.DataFrame(columns)) - else: - return columns + return _handle_output(verbose=verbose, output_type="records", df=description) def get_table_size( @@ -653,56 +632,3 @@ def get_table_size( ) return _handle_output(verbose=verbose, output_type="records", df=table_data) - - -def search(query, order_by): - """This function works as a wrapper to the `bd_dataset_search` website API - enpoint. - - Args: - query (str): - String to search in datasets and tables' metadata. - order_by (str): score|popular|recent - Field by which the results will be ordered. - - Returns: - pd.DataFrame: - Response from the API presented as a pandas DataFrame. Each row is - a table. Each column is a field identifying the table. - """ - - # validate order_by input - if order_by not in ["score", "popular", "recent"]: - raise ValueError( - f'order_by must be score, popular or recent. Received "{order_by}"' - ) - - url = f"https://basedosdados.org/api/3/action/bd_dataset_search?q={query}&order_by={order_by}&resource_type=bdm_table" - - # validate url - try: - response = requests.get(url) - response.raise_for_status() - except requests.exceptions.HTTPError as err: - print(err) - - json_response = response.json() - - dataset_dfs = [] - # first loop identify the number of the tables in each datasets - for dataset in json_response["result"]["datasets"]: - tables_dfs = [] - n_tables = len(dataset["resources"]) - # second loop extracts tables' information for each dataset - for table in dataset["resources"]: - data_table = pd.DataFrame( - {k: str(table[k]) for k in list(table.keys())}, index=[0] - ) - tables_dfs.append(data_table) - # append tables' dataframes for each dataset - data_ds = tables_dfs[0].append(tables_dfs[1:]).reset_index(drop=True) - dataset_dfs.append(data_ds) - # append datasets' dataframes - df = dataset_dfs[0].append(dataset_dfs[1:]).reset_index(drop=True) - - return df From 874fac94a2ae33e58535f0f9ca9e2e47a04045f9 Mon Sep 17 00:00:00 2001 From: lucascr91 Date: Sat, 29 Jan 2022 12:57:28 -0300 Subject: [PATCH 12/22] remove test_download --- python-package/tests/test_download.py | 122 ++++++++++++++------------ 1 file changed, 64 insertions(+), 58 deletions(-) diff --git a/python-package/tests/test_download.py b/python-package/tests/test_download.py index 4cd852843..7a8e1bd3b 100644 --- a/python-package/tests/test_download.py +++ b/python-package/tests/test_download.py @@ -205,104 +205,101 @@ def test_read_table(): ) -def test_list_datasets_simple_verbose(capsys): +def test_list_datasets(capsys): - out = list_datasets( - query="trabalho", limit=10, with_description=False, verbose=True - ) + list_datasets(from_file=True) out, err = capsys.readouterr() # Capture 
prints assert "dataset_id" in out - # check input error - with pytest.raises(ValueError): - search(query="trabalho", order_by="name") - -def test_list_datasets_simple_list(): - out = list_datasets(query="", limit=12, with_description=False, verbose=False) - # check if function returns list - assert isinstance(out, list) - assert len(out) == 12 +def test_list_datasets_complete(capsys): - -def test_list_datasets_complete_list(): - - out = list_datasets( - query="trabalho", limit=12, with_description=True, verbose=False - ) - # check if function returns list - assert isinstance(out, list) - assert "dataset_id" in out[0].keys() - assert "description" in out[0].keys() - - -def test_list_datasets_complete_verbose(capsys): - - list_datasets(query="trabalho", limit=10, with_description=True, verbose=True) + list_datasets(with_description=True, filter_by="ibge", from_file=True) out, err = capsys.readouterr() # Capture prints assert "dataset_id" in out assert "description" in out -def test_list_dataset_tables_simple_verbose(capsys): +def test_list_datasets_all_descriptions(capsys): - list_dataset_tables(dataset_id="br_me_caged", with_description=False, verbose=True) + list_datasets(with_description=True, from_file=True) out, err = capsys.readouterr() # Capture prints - assert "table_id" in out - + assert len(out) > 0 -def test_list_dataset_tables_simple_list(): - out = list_dataset_tables( - dataset_id="br_me_caged", with_description=False, verbose=False - ) +def test_list_datasets_verbose_false(): + out = list_datasets(from_file=True, verbose=False) assert type(out) == list assert len(out) > 0 -def test_list_dataset_tables_complete_verbose(capsys): +def test_list_dataset_tables(capsys): + + list_dataset_tables(dataset_id="br_ibge_censo_demografico", from_file=True) + out, err = capsys.readouterr() # Capture prints + assert "table_id" in out + - list_dataset_tables(dataset_id="br_me_caged", with_description=True, verbose=True) +def test_list_dataset_tables_complete(capsys): + list_dataset_tables( + dataset_id="br_ibge_censo_demografico", + filter_by="renda", + with_description=True, + from_file=True, + ) out, err = capsys.readouterr() # Capture prints assert "table_id" in out assert "description" in out + assert "renda" in out -def test_list_dataset_tables_complete_list(): +def test_list_dataset_tables_all_descriptions(capsys): + list_dataset_tables( + dataset_id="br_ibge_censo_demografico", with_description=True, from_file=True + ) + out, err = capsys.readouterr() # Capture prints + assert len(out) > 0 + + +def test_list_dataset_tables_verbose_false(): out = list_dataset_tables( - dataset_id="br_me_caged", with_description=True, verbose=False + dataset_id="br_ibge_censo_demografico", from_file=True, verbose=False ) - assert type(out) == list - assert type(out[0]) == dict + assert len(out) > 0 def test_get_dataset_description(capsys): - get_dataset_description("br_me_caged", verbose=True) + get_dataset_description("br_ibge_censo_demografico", from_file=True) out, err = capsys.readouterr() # Capture prints assert len(out) > 0 def test_get_dataset_description_verbose_false(): - out = get_dataset_description("br_me_caged", verbose=False) + out = get_dataset_description( + "br_ibge_censo_demografico", from_file=True, verbose=False + ) assert type(out) == str assert len(out) > 0 def test_get_table_description(capsys): - get_table_description("br_me_caged", "microdados_antigos") + get_table_description( + "br_ibge_censo_demografico", "setor_censitario_basico_2010", from_file=True + ) out, err = 
capsys.readouterr() # Capture prints assert len(out) > 0 def test_get_table_description_verbose_false(): out = get_table_description( - dataset_id="br_me_caged", - table_id="microdados_antigos", + dataset_id="br_ibge_censo_demografico", + table_id="setor_censitario_basico_2010", + from_file=True, verbose=False, ) assert type(out) == str @@ -311,18 +308,21 @@ def test_get_table_description_verbose_false(): def test_get_table_columns(capsys): get_table_columns( - dataset_id="br_me_caged", - table_id="microdados_antigos", + dataset_id="br_ibge_censo_demografico", + table_id="setor_censitario_basico_2010", + from_file=True, ) out, err = capsys.readouterr() # Capture prints assert "name" in out + assert "field_type" in out assert "description" in out def test_get_table_columns_verbose_false(): out = get_table_columns( - dataset_id="br_me_caged", - table_id="microdados_antigos", + dataset_id="br_ibge_censo_demografico", + table_id="setor_censitario_basico_2010", + from_file=True, verbose=False, ) assert type(out) == list @@ -352,13 +352,19 @@ def test_get_table_size_verbose_false(): assert type(out) == list assert len(out) > 0 - def test_search(): - out = search(query="agua", order_by="score") - # check if function returns pd.DataFrame + out = search( + query='agua', + order_by='score' + ) + #check if function returns pd.DataFrame assert isinstance(out, pd.DataFrame) - # check if there is duplicate tables in the result - assert out.id.nunique() == out.shape[0] - # check input error + #check if there is duplicate tables in the result + assert out.id.nunique()==out.shape[0] + #check input error with pytest.raises(ValueError): - search(query="agua", order_by="name") + search( + query='agua', + order_by='name' + ) + From b8af7de6548b08bb08d0d296745106f73e995211 Mon Sep 17 00:00:00 2001 From: Lucas Cavalcanti Rodrigues Date: Sat, 29 Jan 2022 12:58:42 -0300 Subject: [PATCH 13/22] Delete test_download.py --- python-package/tests/test_download.py | 370 -------------------------- 1 file changed, 370 deletions(-) delete mode 100644 python-package/tests/test_download.py diff --git a/python-package/tests/test_download.py b/python-package/tests/test_download.py deleted file mode 100644 index 7a8e1bd3b..000000000 --- a/python-package/tests/test_download.py +++ /dev/null @@ -1,370 +0,0 @@ -from os import read -import pytest -from pathlib import Path -import pandas as pd -from pandas_gbq.gbq import GenericGBQException -import shutil -from basedosdados.download.download import search - -from basedosdados import ( - download, - read_sql, - read_table, - list_datasets, - list_dataset_tables, - get_dataset_description, - get_table_description, - get_table_columns, - get_table_size, -) -from basedosdados.exceptions import ( - BaseDosDadosException, - BaseDosDadosNoBillingProjectIDException, - BaseDosDadosInvalidProjectIDException, -) - - -TEST_PROJECT_ID = "basedosdados-dev" -SAVEFILE = Path(__file__).parent / "tmp_bases" / "test.csv" -SAVEPATH = Path(__file__).parent / "tmp_bases" -shutil.rmtree(SAVEPATH, ignore_errors=True) - - -def test_download_by_query(): - - download( - SAVEFILE, - query="select * from `basedosdados.br_ibge_pib.municipio` limit 10", - billing_project_id=TEST_PROJECT_ID, - index=False, - from_file=True, - ) - - assert SAVEFILE.exists() - - -def test_download_by_table(): - - download( - SAVEFILE, - dataset_id="br_ibge_pib", - table_id="municipio", - billing_project_id=TEST_PROJECT_ID, - limit=10, - from_file=True, - index=False, - ) - - assert SAVEFILE.exists() - - -def 
test_download_save_to_path(): - - download( - SAVEPATH, - dataset_id="br_ibge_pib", - table_id="municipio", - billing_project_id=TEST_PROJECT_ID, - from_file=True, - limit=10, - index=False, - ) - - assert (SAVEPATH / "municipio.csv").exists() - - -def test_download_no_query_or_table(): - - with pytest.raises(BaseDosDadosException): - download( - SAVEFILE, - limit=10, - ) - - -def test_download_pandas_kwargs(): - - download( - SAVEFILE, - dataset_id="br_ibge_pib", - table_id="municipio", - billing_project_id=TEST_PROJECT_ID, - from_file=True, - limit=10, - sep="|", - index=False, - ) - - assert SAVEFILE.exists() - - -def test_read_sql(): - - assert isinstance( - read_sql( - query="select * from `basedosdados.br_ibge_pib.municipio` limit 10", - billing_project_id=TEST_PROJECT_ID, - from_file=True, - ), - pd.DataFrame, - ) - - -def test_read_sql_no_billing_project_id(): - - with pytest.raises(BaseDosDadosNoBillingProjectIDException) as excinfo: - read_sql( - query="select * from `basedosdados.br_ibge_pib.municipio` limit 10", - ) - - assert "We are not sure which Google Cloud project should be billed." in str( - excinfo.value - ) - - -def test_read_sql_invalid_billing_project_id(): - - pattern = r"You are using an invalid `billing_project_id`" - - with pytest.raises(BaseDosDadosInvalidProjectIDException, match=pattern): - read_sql( - query="select * from `basedosdados.br_ibge_pib.municipio` limit 10", - billing_project_id="inexistent_project_id", - from_file=True, - ) - - -def test_read_sql_inexistent_project(): - - with pytest.raises(GenericGBQException) as excinfo: - read_sql( - query="select * from `asedosdados.br_ibge_pib.municipio` limit 10", - billing_project_id=TEST_PROJECT_ID, - from_file=True, - ) - - assert "Reason: 404 Not found: Project" in str(excinfo.value) - - -def test_read_sql_inexistent_dataset(): - - with pytest.raises(GenericGBQException) as excinfo: - read_sql( - query="select * from `basedosdados-dev.br_ibge_inexistent.municipio` limit 10", - billing_project_id=TEST_PROJECT_ID, - from_file=True, - ) - - assert "Reason: 404 Not found: Dataset" in str(excinfo.value) - - -def test_read_sql_inexistent_table(): - - with pytest.raises(GenericGBQException) as excinfo: - read_sql( - query="select * from `basedosdados.br_ibge_pib.inexistent` limit 10", - billing_project_id=TEST_PROJECT_ID, - from_file=True, - ) - - assert "Reason: 404 Not found: Table" in str(excinfo.value) - - -def test_read_sql_syntax_error(): - - with pytest.raises(GenericGBQException) as excinfo: - read_sql( - query="invalid_statement * from `basedosdados.br_ibge_pib.municipio` limit 10", - billing_project_id=TEST_PROJECT_ID, - from_file=True, - ) - - assert "Reason: 400 Syntax error" in str(excinfo.value) - - -def test_read_sql_out_of_bound_date(): - - read_sql( - query="select DATE('1000-01-01')", - billing_project_id=TEST_PROJECT_ID, - from_file=True, - ) - - -def test_read_table(): - - assert isinstance( - read_table( - dataset_id="br_ibge_pib", - table_id="municipio", - billing_project_id=TEST_PROJECT_ID, - from_file=True, - limit=10, - ), - pd.DataFrame, - ) - - -def test_list_datasets(capsys): - - list_datasets(from_file=True) - out, err = capsys.readouterr() # Capture prints - assert "dataset_id" in out - - -def test_list_datasets_complete(capsys): - - list_datasets(with_description=True, filter_by="ibge", from_file=True) - out, err = capsys.readouterr() # Capture prints - assert "dataset_id" in out - assert "description" in out - - -def test_list_datasets_all_descriptions(capsys): - - 
list_datasets(with_description=True, from_file=True) - out, err = capsys.readouterr() # Capture prints - assert len(out) > 0 - - -def test_list_datasets_verbose_false(): - - out = list_datasets(from_file=True, verbose=False) - assert type(out) == list - assert len(out) > 0 - - -def test_list_dataset_tables(capsys): - - list_dataset_tables(dataset_id="br_ibge_censo_demografico", from_file=True) - out, err = capsys.readouterr() # Capture prints - assert "table_id" in out - - -def test_list_dataset_tables_complete(capsys): - - list_dataset_tables( - dataset_id="br_ibge_censo_demografico", - filter_by="renda", - with_description=True, - from_file=True, - ) - out, err = capsys.readouterr() # Capture prints - assert "table_id" in out - assert "description" in out - assert "renda" in out - - -def test_list_dataset_tables_all_descriptions(capsys): - list_dataset_tables( - dataset_id="br_ibge_censo_demografico", with_description=True, from_file=True - ) - out, err = capsys.readouterr() # Capture prints - assert len(out) > 0 - - -def test_list_dataset_tables_verbose_false(): - - out = list_dataset_tables( - dataset_id="br_ibge_censo_demografico", from_file=True, verbose=False - ) - assert type(out) == list - assert len(out) > 0 - - -def test_get_dataset_description(capsys): - - get_dataset_description("br_ibge_censo_demografico", from_file=True) - out, err = capsys.readouterr() # Capture prints - assert len(out) > 0 - - -def test_get_dataset_description_verbose_false(): - out = get_dataset_description( - "br_ibge_censo_demografico", from_file=True, verbose=False - ) - assert type(out) == str - assert len(out) > 0 - - -def test_get_table_description(capsys): - get_table_description( - "br_ibge_censo_demografico", "setor_censitario_basico_2010", from_file=True - ) - out, err = capsys.readouterr() # Capture prints - assert len(out) > 0 - - -def test_get_table_description_verbose_false(): - out = get_table_description( - dataset_id="br_ibge_censo_demografico", - table_id="setor_censitario_basico_2010", - from_file=True, - verbose=False, - ) - assert type(out) == str - assert len(out) > 0 - - -def test_get_table_columns(capsys): - get_table_columns( - dataset_id="br_ibge_censo_demografico", - table_id="setor_censitario_basico_2010", - from_file=True, - ) - out, err = capsys.readouterr() # Capture prints - assert "name" in out - assert "field_type" in out - assert "description" in out - - -def test_get_table_columns_verbose_false(): - out = get_table_columns( - dataset_id="br_ibge_censo_demografico", - table_id="setor_censitario_basico_2010", - from_file=True, - verbose=False, - ) - assert type(out) == list - assert len(out) > 0 - - -def test_get_table_size(capsys): - get_table_size( - dataset_id="br_ibge_censo_demografico", - table_id="setor_censitario_basico_2010", - billing_project_id=TEST_PROJECT_ID, - from_file=True, - ) - out, err = capsys.readouterr() - assert "num_rows" in out - assert "size_mb" in out - - -def test_get_table_size_verbose_false(): - out = get_table_size( - dataset_id="br_ibge_censo_demografico", - table_id="setor_censitario_basico_2010", - billing_project_id=TEST_PROJECT_ID, - from_file=True, - verbose=False, - ) - assert type(out) == list - assert len(out) > 0 - -def test_search(): - out = search( - query='agua', - order_by='score' - ) - #check if function returns pd.DataFrame - assert isinstance(out, pd.DataFrame) - #check if there is duplicate tables in the result - assert out.id.nunique()==out.shape[0] - #check input error - with pytest.raises(ValueError): - search( - 
query='agua',
-            order_by='name'
-        )
-

From b8a5b3de3f0a05d83c773003e1ffad7b94544d1b Mon Sep 17 00:00:00 2001
From: lucascr91
Date: Sat, 29 Jan 2022 13:02:28 -0300
Subject: [PATCH 14/22] add test files
---
 python-package/tests/test_download.py | 392 +++++++++++++++++++++++
 python-package/tests/test_metadata.py | 439 ++++++++++++++++++++++++++
 2 files changed, 831 insertions(+)
 create mode 100644 python-package/tests/test_download.py
 create mode 100644 python-package/tests/test_metadata.py

diff --git a/python-package/tests/test_download.py b/python-package/tests/test_download.py
new file mode 100644
index 000000000..d73ec16df
--- /dev/null
+++ b/python-package/tests/test_download.py
@@ -0,0 +1,392 @@
+from os import read
+import pytest
+from pathlib import Path
+import pandas as pd
+from pandas_gbq.gbq import GenericGBQException
+import shutil
+from basedosdados.download.download import search
+
+from basedosdados import (
+    download,
+    read_sql,
+    read_table,
+    list_datasets,
+    list_dataset_tables,
+    get_dataset_description,
+    get_table_description,
+    get_table_columns,
+    get_table_size,
+)
+from basedosdados.exceptions import (
+    BaseDosDadosException,
+    BaseDosDadosNoBillingProjectIDException,
+    BaseDosDadosInvalidProjectIDException,
+)
+
+
+TEST_PROJECT_ID = "basedosdados-dev"
+SAVEFILE = Path(__file__).parent / "tmp_bases" / "test.csv"
+SAVEPATH = Path(__file__).parent / "tmp_bases"
+shutil.rmtree(SAVEPATH, ignore_errors=True)
+
+
+def test_download_by_query():
+
+    download(
+        SAVEFILE,
+        query="select * from `basedosdados.br_ibge_pib.municipio` limit 10",
+        billing_project_id=TEST_PROJECT_ID,
+        index=False,
+        from_file=True,
+    )
+
+    assert SAVEFILE.exists()
+
+
+def test_download_by_table():
+
+    download(
+        SAVEFILE,
+        dataset_id="br_ibge_pib",
+        table_id="municipio",
+        billing_project_id=TEST_PROJECT_ID,
+        limit=10,
+        from_file=True,
+        index=False,
+    )
+
+    assert SAVEFILE.exists()
+
+
+def test_download_save_to_path():
+
+    download(
+        SAVEPATH,
+        dataset_id="br_ibge_pib",
+        table_id="municipio",
+        billing_project_id=TEST_PROJECT_ID,
+        from_file=True,
+        limit=10,
+        index=False,
+    )
+
+    assert (SAVEPATH / "municipio.csv").exists()
+
+
+def test_download_no_query_or_table():
+
+    with pytest.raises(BaseDosDadosException):
+        download(
+            SAVEFILE,
+            limit=10,
+        )
+
+
+def test_download_pandas_kwargs():
+
+    download(
+        SAVEFILE,
+        dataset_id="br_ibge_pib",
+        table_id="municipio",
+        billing_project_id=TEST_PROJECT_ID,
+        from_file=True,
+        limit=10,
+        sep="|",
+        index=False,
+    )
+
+    assert SAVEFILE.exists()
+
+
+def test_read_sql():
+
+    assert isinstance(
+        read_sql(
+            query="select * from `basedosdados.br_ibge_pib.municipio` limit 10",
+            billing_project_id=TEST_PROJECT_ID,
+            from_file=True,
+        ),
+        pd.DataFrame,
+    )
+
+
+def test_read_sql_no_billing_project_id():
+
+    with pytest.raises(BaseDosDadosNoBillingProjectIDException) as excinfo:
+        read_sql(
+            query="select * from `basedosdados.br_ibge_pib.municipio` limit 10",
+        )
+
+    assert "We are not sure which Google Cloud project should be billed."
in str( + excinfo.value + ) + + +def test_read_sql_invalid_billing_project_id(): + + pattern = r"You are using an invalid `billing_project_id`" + + with pytest.raises(BaseDosDadosInvalidProjectIDException, match=pattern): + read_sql( + query="select * from `basedosdados.br_ibge_pib.municipio` limit 10", + billing_project_id="inexistent_project_id", + from_file=True, + ) + + +def test_read_sql_inexistent_project(): + + with pytest.raises(GenericGBQException) as excinfo: + read_sql( + query="select * from `asedosdados.br_ibge_pib.municipio` limit 10", + billing_project_id=TEST_PROJECT_ID, + from_file=True, + ) + + assert "Reason: 404 Not found: Project" in str(excinfo.value) + + +def test_read_sql_inexistent_dataset(): + + with pytest.raises(GenericGBQException) as excinfo: + read_sql( + query="select * from `basedosdados-dev.br_ibge_inexistent.municipio` limit 10", + billing_project_id=TEST_PROJECT_ID, + from_file=True, + ) + + assert "Reason: 404 Not found: Dataset" in str(excinfo.value) + + +def test_read_sql_inexistent_table(): + + with pytest.raises(GenericGBQException) as excinfo: + read_sql( + query="select * from `basedosdados.br_ibge_pib.inexistent` limit 10", + billing_project_id=TEST_PROJECT_ID, + from_file=True, + ) + + assert "Reason: 404 Not found: Table" in str(excinfo.value) + + +def test_read_sql_syntax_error(): + + with pytest.raises(GenericGBQException) as excinfo: + read_sql( + query="invalid_statement * from `basedosdados.br_ibge_pib.municipio` limit 10", + billing_project_id=TEST_PROJECT_ID, + from_file=True, + ) + + assert "Reason: 400 Syntax error" in str(excinfo.value) + + +def test_read_sql_out_of_bound_date(): + + read_sql( + query="select DATE('1000-01-01')", + billing_project_id=TEST_PROJECT_ID, + from_file=True, + ) + + +def test_read_table(): + + assert isinstance( + read_table( + dataset_id="br_ibge_pib", + table_id="municipio", + billing_project_id=TEST_PROJECT_ID, + from_file=True, + limit=10, + ), + pd.DataFrame, + ) + + +def test_list_datasets_default(capsys): + + out = list_datasets( + query="trabalho", order_by="score", with_description=False, verbose=True + ) + out, err = capsys.readouterr() # Capture prints + assert "dataset_id" in out + # check input error + with pytest.raises(ValueError): + search(query="trabalho", order_by="name") + + +def test_list_datasets_noverbose(): + + out = list_datasets( + query="trabalho", order_by="score", with_description=False, verbose=False + ) + # check if function returns list + assert isinstance(out, list) + + +def test_list_datasets_complete_list(): + + out = list_datasets( + query="trabalho", order_by="score", with_description=True, verbose=False + ) + # check if function returns list + assert isinstance(out, list) + assert "dataset_id" in out[0].keys() + assert "description" in out[0].keys() + + +def test_list_datasets_complete_verbose(capsys): + + list_datasets( + query="trabalho", order_by="score", with_description=True, verbose=True + ) + out, err = capsys.readouterr() # Capture prints + assert "dataset_id" in out + assert "description" in out + + + + +def test_list_datasets_verbose_false(): + + out = list_datasets(from_file=True, verbose=False) + assert type(out) == list + assert len(out) > 0 + + +def test_list_dataset_tables(capsys): + + list_dataset_tables(dataset_id="br_ibge_censo_demografico", from_file=True) + out, err = capsys.readouterr() # Capture prints + assert "table_id" in out + + +def test_list_dataset_tables_complete(capsys): + + list_dataset_tables( + dataset_id="br_ibge_censo_demografico", + 
filter_by="renda", + with_description=True, + from_file=True, + ) + out, err = capsys.readouterr() # Capture prints + assert "table_id" in out + assert "description" in out + assert "renda" in out + + +def test_list_dataset_tables_all_descriptions(capsys): + list_dataset_tables( + dataset_id="br_ibge_censo_demografico", with_description=True, from_file=True + ) + out, err = capsys.readouterr() # Capture prints + assert len(out) > 0 + + +def test_list_dataset_tables_verbose_false(): + + out = list_dataset_tables( + dataset_id="br_ibge_censo_demografico", from_file=True, verbose=False + ) + assert type(out) == list + assert len(out) > 0 + + +def test_get_dataset_description(capsys): + + get_dataset_description("br_ibge_censo_demografico", from_file=True) + out, err = capsys.readouterr() # Capture prints + assert len(out) > 0 + + +def test_get_dataset_description_verbose_false(): + out = get_dataset_description( + "br_ibge_censo_demografico", from_file=True, verbose=False + ) + assert type(out) == str + assert len(out) > 0 + + +def test_get_table_description(capsys): + get_table_description( + "br_ibge_censo_demografico", "setor_censitario_basico_2010", from_file=True + ) + out, err = capsys.readouterr() # Capture prints + assert len(out) > 0 + + +def test_get_table_description_verbose_false(): + out = get_table_description( + dataset_id="br_ibge_censo_demografico", + table_id="setor_censitario_basico_2010", + from_file=True, + verbose=False, + ) + assert type(out) == str + assert len(out) > 0 + + +def test_get_table_columns(capsys): + get_table_columns( + dataset_id="br_ibge_censo_demografico", + table_id="setor_censitario_basico_2010", + from_file=True, + ) + out, err = capsys.readouterr() # Capture prints + assert "name" in out + assert "field_type" in out + assert "description" in out + + +def test_get_table_columns_verbose_false(): + out = get_table_columns( + dataset_id="br_ibge_censo_demografico", + table_id="setor_censitario_basico_2010", + from_file=True, + verbose=False, + ) + assert type(out) == list + assert len(out) > 0 + + +def test_get_table_size(capsys): + get_table_size( + dataset_id="br_ibge_censo_demografico", + table_id="setor_censitario_basico_2010", + billing_project_id=TEST_PROJECT_ID, + from_file=True, + ) + out, err = capsys.readouterr() + assert "num_rows" in out + assert "size_mb" in out + + +def test_get_table_size_verbose_false(): + out = get_table_size( + dataset_id="br_ibge_censo_demografico", + table_id="setor_censitario_basico_2010", + billing_project_id=TEST_PROJECT_ID, + from_file=True, + verbose=False, + ) + assert type(out) == list + assert len(out) > 0 + +def test_search(): + out = search( + query='agua', + order_by='score' + ) + #check if function returns pd.DataFrame + assert isinstance(out, pd.DataFrame) + #check if there is duplicate tables in the result + assert out.id.nunique()==out.shape[0] + #check input error + with pytest.raises(ValueError): + search( + query='agua', + order_by='name' + ) + diff --git a/python-package/tests/test_metadata.py b/python-package/tests/test_metadata.py new file mode 100644 index 000000000..316adfdfc --- /dev/null +++ b/python-package/tests/test_metadata.py @@ -0,0 +1,439 @@ +import pytest +import ruamel.yaml as ryaml + +from pathlib import Path +import shutil +import random +import string + +from basedosdados import Metadata +from basedosdados.exceptions import BaseDosDadosException + + +DATASET_ID = "pytest" +TABLE_ID = "pytest" + +METADATA_FILES = {"dataset": "dataset_config.yaml", "table": "table_config.yaml"} + + 
+@pytest.fixture +def metadatadir(tmpdir_factory): + (Path(__file__).parent / "tmp_bases").mkdir(exist_ok=True) + return Path(__file__).parent / "tmp_bases" + + +@pytest.fixture +def dataset_metadata(metadatadir): + return Metadata(dataset_id=DATASET_ID, metadata_path=metadatadir) + + +@pytest.fixture +def table_metadata(metadatadir): + return Metadata(dataset_id=DATASET_ID, table_id=TABLE_ID, metadata_path=metadatadir) + + +@pytest.fixture +def dataset_metadata_path(metadatadir): + return Path(metadatadir) / DATASET_ID + + +@pytest.fixture +def table_metadata_path(metadatadir): + return Path(metadatadir) / DATASET_ID / TABLE_ID + + +def test_create_from_dataset_id(dataset_metadata, dataset_metadata_path): + shutil.rmtree(dataset_metadata_path, ignore_errors=True) + dataset_metadata.create() + assert (dataset_metadata_path / METADATA_FILES["dataset"]).exists() + + +def test_create_from_dataset_and_table_id(table_metadata, table_metadata_path): + shutil.rmtree(table_metadata_path, ignore_errors=True) + table_metadata.create() + assert (table_metadata_path / METADATA_FILES["table"]).exists() + + +def test_create_if_exists_raise(dataset_metadata, table_metadata): + + with pytest.raises(FileExistsError): + dataset_metadata.create(if_exists="raise") + + with pytest.raises(FileExistsError): + table_metadata.create(if_exists="raise") + + +def test_create_if_exists_replace( + dataset_metadata, dataset_metadata_path, table_metadata, table_metadata_path +): + dataset_metadata.create(if_exists="replace") + assert (dataset_metadata_path / METADATA_FILES["dataset"]).exists() + + table_metadata.create(if_exists="replace") + assert (table_metadata_path / METADATA_FILES["table"]).exists() + + +def test_create_if_exists_pass( + dataset_metadata, dataset_metadata_path, table_metadata, table_metadata_path +): + + # make sure new file is created + dataset_metadata.create(if_exists="replace") + assert (dataset_metadata_path / METADATA_FILES["dataset"]).exists() + + # make sure no Exception is raised on `if_exists="pass"` + dataset_metadata.create(if_exists="pass") + + # same procedure for `Table` + table_metadata.create(if_exists="replace") + assert (table_metadata_path / METADATA_FILES["table"]).exists() + table_metadata.create(if_exists="pass") + + +def test_create_columns(table_metadata, table_metadata_path): + shutil.rmtree(table_metadata_path, ignore_errors=True) + table_metadata.create(columns=["column1", "column2"]) + assert (table_metadata_path / METADATA_FILES["table"]).exists() + + +@pytest.fixture +def existent_metadata(metadatadir): + table_metadata_obj = Metadata( + dataset_id="br_me_rais", + table_id="microdados_vinculos", + metadata_path=metadatadir, + ) + return table_metadata_obj + + +@pytest.fixture +def existent_metadata_path(metadatadir): + return Path(metadatadir) / "br_me_rais" / "microdados_vinculos" + + +def test_create_partition_columns_from_existent_table( + existent_metadata: Metadata, + existent_metadata_path: Path, +): + shutil.rmtree(existent_metadata_path, ignore_errors=True) + + existent_metadata.create() + assert existent_metadata_path.exists() + + metadata_dict = existent_metadata.local_metadata + assert metadata_dict.get("partitions") == "ano, sigla_uf, id_municipio" + + +def test_create_partition_columns_from_user_input( + existent_metadata: Metadata, + existent_metadata_path: Path, +): + shutil.rmtree(existent_metadata_path, ignore_errors=True) + + existent_metadata.create(partition_columns=["id_municipio"]) + assert existent_metadata_path.exists() + + metadata_dict = 
existent_metadata.local_metadata + assert metadata_dict.get("partitions") == "id_municipio" + + +def test_create_force_columns_is_true( + existent_metadata: Metadata, + existent_metadata_path: Path, +): + shutil.rmtree(existent_metadata_path, ignore_errors=True) + existent_metadata.create(columns=["column1", "column2"], force_columns=True) + assert (existent_metadata_path / METADATA_FILES["table"]).exists() + + table_metadata_dict = existent_metadata.local_metadata + assert table_metadata_dict["columns"][0]["name"] == "column1" + assert table_metadata_dict["columns"][1]["name"] == "column2" + + +def test_create_force_columns_is_false( + existent_metadata: Metadata, + existent_metadata_path: Path, +): + shutil.rmtree(existent_metadata_path, ignore_errors=True) + existent_metadata.create(columns=["column1", "column2"], force_columns=False) + assert (existent_metadata_path / METADATA_FILES["table"]).exists() + + table_metadata_dict = existent_metadata.local_metadata + assert table_metadata_dict["columns"][0]["name"] != "column1" + assert table_metadata_dict["columns"][1]["name"] != "column2" + + +def test_create_table_only_is_true( + table_metadata, dataset_metadata_path, table_metadata_path + ): + shutil.rmtree(dataset_metadata_path, ignore_errors=True) + shutil.rmtree(table_metadata_path, ignore_errors=True) + + table_metadata.create(table_only=True) + assert (table_metadata_path / METADATA_FILES["table"]).exists() + assert not (dataset_metadata_path / METADATA_FILES["dataset"]).exists() + + +def test_create_table_only_is_false( + table_metadata, dataset_metadata_path, table_metadata_path + ): + shutil.rmtree(dataset_metadata_path, ignore_errors=True) + shutil.rmtree(table_metadata_path, ignore_errors=True) + + table_metadata.create(table_only=False) + assert (table_metadata_path / METADATA_FILES["table"]).exists() + assert (dataset_metadata_path / METADATA_FILES["dataset"]).exists() + + +@pytest.fixture +def out_of_date_metadata_obj(metadatadir): + out_of_date_metadata = Metadata(dataset_id="br_me_caged", metadata_path=metadatadir) + out_of_date_metadata.create(if_exists="replace") + + out_of_date_config = out_of_date_metadata.local_metadata + out_of_date_config["metadata_modified"] = "old_date" + ryaml.dump( + out_of_date_config, open(out_of_date_metadata.filepath, "w", encoding="utf-8") + ) + + return out_of_date_metadata + + +@pytest.fixture +def updated_metadata_obj(metadatadir): + updated_metadata = Metadata(dataset_id="br_me_caged", metadata_path=metadatadir) + updated_metadata.create(if_exists="replace") + + updated_config = updated_metadata.local_metadata + updated_config["metadata_modified"] = updated_metadata.ckan_metadata[ + "metadata_modified" + ] + ryaml.dump(updated_config, open(updated_metadata.filepath, "w", encoding="utf-8")) + + return updated_metadata + + +def test_is_updated_is_true(updated_metadata_obj): + assert updated_metadata_obj.is_updated() == True + + +def test_is_updated_is_false(out_of_date_metadata_obj): + assert out_of_date_metadata_obj.is_updated() == False + + +@pytest.fixture +def valid_metadata_dataset(metadatadir): + dataset_metadata = Metadata(dataset_id="br_ibge_pib", metadata_path=metadatadir) + dataset_metadata.create(if_exists="replace") + dataset_metadata.CKAN_API_KEY = "valid-key" + return dataset_metadata + + +@pytest.fixture +def valid_metadata_table(metadatadir): + table_metadata = Metadata( + dataset_id="br_ibge_pib", + table_id="municipio", + metadata_path=metadatadir, + ) + table_metadata.create(if_exists="replace") + 
table_metadata.CKAN_API_KEY = "valid-key"
+    return table_metadata
+
+
+def test_validate_is_succesful(
+    valid_metadata_dataset: Metadata, valid_metadata_table: Metadata
+):
+    assert valid_metadata_dataset.validate() == True
+    assert valid_metadata_table.validate() == True
+
+
+@pytest.fixture
+def invalid_dataset_metadata(metadatadir):
+    invalid_dataset_metadata = Metadata(
+        dataset_id="br_ibge_pib",
+        metadata_path=metadatadir,
+    )
+    invalid_dataset_metadata.create(if_exists="replace")
+
+    invalid_config = invalid_dataset_metadata.local_metadata
+    invalid_config["title"] = {"this_title": "is_not_valid"}
+
+    print(invalid_dataset_metadata.filepath)
+
+    with open(invalid_dataset_metadata.filepath, "w", encoding="utf-8") as file:
+        ryaml.dump(invalid_config, file)
+
+    return invalid_dataset_metadata
+
+
+@pytest.fixture
+def invalid_table_metadata(metadatadir):
+    invalid_dataset_metadata = Metadata(
+        dataset_id="br_ibge_pib",
+        table_id="municipio",
+        metadata_path=metadatadir,
+    )
+    invalid_dataset_metadata.create(if_exists="replace")
+
+    invalid_config = invalid_dataset_metadata.local_metadata
+    invalid_config["table_id"] = None
+
+    with open(invalid_dataset_metadata.filepath, "w", encoding="utf-8") as file:
+        ryaml.dump(invalid_config, file)
+
+    return invalid_dataset_metadata
+
+
+def test_validate_is_not_succesful(
+    invalid_dataset_metadata: Metadata,
+    invalid_table_metadata: Metadata,
+):
+    with pytest.raises(BaseDosDadosException):
+        invalid_table_metadata.validate()
+
+    with pytest.raises(BaseDosDadosException):
+        invalid_dataset_metadata.validate()
+
+
+@pytest.fixture
+def invalid_organization_dataset(metadatadir):
+    invalid_organization_dataset = Metadata(
+        dataset_id="br_ibge_pib",
+        metadata_path=metadatadir,
+    )
+    invalid_organization_dataset.create(if_exists="replace")
+
+    invalid_config = invalid_organization_dataset.local_metadata
+    invalid_config["organization"] = "not-a-valid-organization"
+
+    with open(invalid_organization_dataset.filepath, "w", encoding="utf-8") as file:
+        ryaml.dump(invalid_config, file)
+
+    return invalid_organization_dataset
+
+
+def test_validate_organization_not_found(invalid_organization_dataset):
+    with pytest.raises(BaseDosDadosException, match="Organization not found"):
+        invalid_organization_dataset.validate()
+
+
+# TODO: Mock ckan server to activate publish tests
+@pytest.fixture
+def pytest_dataset(metadatadir):
+    shutil.rmtree(metadatadir, ignore_errors=True)
+    pytest_dataset = Metadata(
+        dataset_id="pytest",
+        metadata_path=metadatadir
+    )
+    pytest_dataset.create(if_exists="replace")
+
+    # fill dataset metadata for it to be publishable
+    pytest_dataset_metadata = pytest_dataset.local_metadata
+    pytest_dataset_metadata["organization"] = "acaps"  # set valid organization
+
+    # materialize metadata file
+    ryaml.dump(
+        pytest_dataset_metadata,
+        open(pytest_dataset.filepath, "w", encoding="utf-8")
+    )
+
+    return pytest_dataset
+
+
+@pytest.fixture
+def pytest_table(metadatadir):
+    shutil.rmtree(metadatadir, ignore_errors=True)
+    pytest_table = Metadata(
+        dataset_id="pytest",
+        table_id="pytest"
+    )
+    pytest_table.create(if_exists="replace")
+    return pytest_table
+
+
+@pytest.mark.skip(
+    reason="This test requires a mocked CKAN server and a test dataset/table.")
+def test_publish_is_successful(
+    valid_metadata_dataset: Metadata,
+    valid_metadata_table: Metadata,
+):
+    assert isinstance(valid_metadata_dataset.publish(), dict)
+    assert isinstance(valid_metadata_table.publish(), dict)
+
+
+@pytest.mark.skip(reason="This test requires a mocked CKAN server.")
+def test_publish_is_not_successful(
+    invalid_dataset_metadata: Metadata,
+    invalid_table_metadata: Metadata,
+):
+    with pytest.raises(AssertionError, match="Could not publish"):
+        invalid_dataset_metadata.publish()
+
+    with pytest.raises(BaseDosDadosException, match="Could not publish"):
+        invalid_table_metadata.publish()
+
+
+@pytest.mark.skip(
+    reason="This test requires a mocked CKAN server and a delete endpoint."
+)
+def test_publish_all_is_true(
+    pytest_dataset: Metadata,
+    pytest_table: Metadata,
+):
+    res = pytest_table.publish(all=True)
+    assert isinstance(res, dict)
+    assert res != {}
+    assert pytest_dataset.exists_in_ckan()
+
+
+@pytest.mark.skip(reason="This test requires a mocked CKAN server.")
+def test_publish_if_exists_raise(valid_metadata_dataset: Metadata):
+    with pytest.raises(BaseDosDadosException, match="already exists in CKAN"):
+        valid_metadata_dataset.publish(if_exists="raise")
+
+
+@pytest.mark.skip(
+    reason="This test requires a mocked CKAN server and a test dataset."
+)
+def test_publish_if_exists_replace(valid_metadata_dataset: Metadata):
+    res = valid_metadata_dataset.publish(if_exists="replace")
+    assert isinstance(res, dict)
+    assert res != {}
+
+
+@pytest.mark.skip(reason="This test requires a mocked CKAN server.")
+def test_publish_if_exists_pass(valid_metadata_dataset: Metadata):
+    assert isinstance(valid_metadata_dataset.publish(if_exists="pass"), dict)
+    assert valid_metadata_dataset.publish(if_exists="pass") == {}
+
+
+@pytest.mark.skip(reason="This test requires a mocked CKAN server.")
+def test_publish_update_locally_is_true(
+    pytest_dataset: Metadata
+):
+    date_before = pytest_dataset.local_metadata.get('metadata_modified')
+
+    # update local metadata
+    new_metadata = pytest_dataset.local_metadata.copy()
+
+    # generate random strings with 3 characters
+    random_string = "".join(random.choice(string.ascii_uppercase) for _ in range(3))
+
+    # update metadata tags with random_string
+    new_metadata["tags"] = [random_string]
+    ryaml.dump(
+        new_metadata, open(pytest_dataset.filepath, "w", encoding="utf-8")
+    )
+
+    # publish changes
+    pytest_dataset.publish(update_locally=True)
+
+    # get new tags from local metadata
+    new_tags = pytest_dataset.local_metadata.get('tags')
+
+    # get new `metadata_modified` value from local config file
+    date_after = pytest_dataset.local_metadata.get('metadata_modified')
+
+    assert new_tags == [random_string], "Tags were not updated locally"
+    assert date_after > date_before, "Date after should be greater than date before"

From 1b8aa5849eb0255e4d8e915557980888464e4fa9 Mon Sep 17 00:00:00 2001
From: lucascr91
Date: Sat, 29 Jan 2022 13:05:24 -0300
Subject: [PATCH 15/22] remove test_download.py
---
 python-package/tests/test_download.py | 392 --------------------------
 1 file changed, 392 deletions(-)
 delete mode 100644 python-package/tests/test_download.py

diff --git a/python-package/tests/test_download.py b/python-package/tests/test_download.py
deleted file mode 100644
index d73ec16df..000000000
--- a/python-package/tests/test_download.py
+++ /dev/null
@@ -1,392 +0,0 @@
-from os import read
-import pytest
-from pathlib import Path
-import pandas as pd
-from pandas_gbq.gbq import GenericGBQException
-import shutil
-from basedosdados.download.download import search
-
-from basedosdados import (
-    download,
-    read_sql,
-    read_table,
-    list_datasets,
-    list_dataset_tables,
-    get_dataset_description,
-    get_table_description,
-    get_table_columns,
-    get_table_size,
-)
-from basedosdados.exceptions import (
-
BaseDosDadosException, - BaseDosDadosNoBillingProjectIDException, - BaseDosDadosInvalidProjectIDException, -) - - -TEST_PROJECT_ID = "basedosdados-dev" -SAVEFILE = Path(__file__).parent / "tmp_bases" / "test.csv" -SAVEPATH = Path(__file__).parent / "tmp_bases" -shutil.rmtree(SAVEPATH, ignore_errors=True) - - -def test_download_by_query(): - - download( - SAVEFILE, - query="select * from `basedosdados.br_ibge_pib.municipio` limit 10", - billing_project_id=TEST_PROJECT_ID, - index=False, - from_file=True, - ) - - assert SAVEFILE.exists() - - -def test_download_by_table(): - - download( - SAVEFILE, - dataset_id="br_ibge_pib", - table_id="municipio", - billing_project_id=TEST_PROJECT_ID, - limit=10, - from_file=True, - index=False, - ) - - assert SAVEFILE.exists() - - -def test_download_save_to_path(): - - download( - SAVEPATH, - dataset_id="br_ibge_pib", - table_id="municipio", - billing_project_id=TEST_PROJECT_ID, - from_file=True, - limit=10, - index=False, - ) - - assert (SAVEPATH / "municipio.csv").exists() - - -def test_download_no_query_or_table(): - - with pytest.raises(BaseDosDadosException): - download( - SAVEFILE, - limit=10, - ) - - -def test_download_pandas_kwargs(): - - download( - SAVEFILE, - dataset_id="br_ibge_pib", - table_id="municipio", - billing_project_id=TEST_PROJECT_ID, - from_file=True, - limit=10, - sep="|", - index=False, - ) - - assert SAVEFILE.exists() - - -def test_read_sql(): - - assert isinstance( - read_sql( - query="select * from `basedosdados.br_ibge_pib.municipio` limit 10", - billing_project_id=TEST_PROJECT_ID, - from_file=True, - ), - pd.DataFrame, - ) - - -def test_read_sql_no_billing_project_id(): - - with pytest.raises(BaseDosDadosNoBillingProjectIDException) as excinfo: - read_sql( - query="select * from `basedosdados.br_ibge_pib.municipio` limit 10", - ) - - assert "We are not sure which Google Cloud project should be billed." 
in str( - excinfo.value - ) - - -def test_read_sql_invalid_billing_project_id(): - - pattern = r"You are using an invalid `billing_project_id`" - - with pytest.raises(BaseDosDadosInvalidProjectIDException, match=pattern): - read_sql( - query="select * from `basedosdados.br_ibge_pib.municipio` limit 10", - billing_project_id="inexistent_project_id", - from_file=True, - ) - - -def test_read_sql_inexistent_project(): - - with pytest.raises(GenericGBQException) as excinfo: - read_sql( - query="select * from `asedosdados.br_ibge_pib.municipio` limit 10", - billing_project_id=TEST_PROJECT_ID, - from_file=True, - ) - - assert "Reason: 404 Not found: Project" in str(excinfo.value) - - -def test_read_sql_inexistent_dataset(): - - with pytest.raises(GenericGBQException) as excinfo: - read_sql( - query="select * from `basedosdados-dev.br_ibge_inexistent.municipio` limit 10", - billing_project_id=TEST_PROJECT_ID, - from_file=True, - ) - - assert "Reason: 404 Not found: Dataset" in str(excinfo.value) - - -def test_read_sql_inexistent_table(): - - with pytest.raises(GenericGBQException) as excinfo: - read_sql( - query="select * from `basedosdados.br_ibge_pib.inexistent` limit 10", - billing_project_id=TEST_PROJECT_ID, - from_file=True, - ) - - assert "Reason: 404 Not found: Table" in str(excinfo.value) - - -def test_read_sql_syntax_error(): - - with pytest.raises(GenericGBQException) as excinfo: - read_sql( - query="invalid_statement * from `basedosdados.br_ibge_pib.municipio` limit 10", - billing_project_id=TEST_PROJECT_ID, - from_file=True, - ) - - assert "Reason: 400 Syntax error" in str(excinfo.value) - - -def test_read_sql_out_of_bound_date(): - - read_sql( - query="select DATE('1000-01-01')", - billing_project_id=TEST_PROJECT_ID, - from_file=True, - ) - - -def test_read_table(): - - assert isinstance( - read_table( - dataset_id="br_ibge_pib", - table_id="municipio", - billing_project_id=TEST_PROJECT_ID, - from_file=True, - limit=10, - ), - pd.DataFrame, - ) - - -def test_list_datasets_default(capsys): - - out = list_datasets( - query="trabalho", order_by="score", with_description=False, verbose=True - ) - out, err = capsys.readouterr() # Capture prints - assert "dataset_id" in out - # check input error - with pytest.raises(ValueError): - search(query="trabalho", order_by="name") - - -def test_list_datasets_noverbose(): - - out = list_datasets( - query="trabalho", order_by="score", with_description=False, verbose=False - ) - # check if function returns list - assert isinstance(out, list) - - -def test_list_datasets_complete_list(): - - out = list_datasets( - query="trabalho", order_by="score", with_description=True, verbose=False - ) - # check if function returns list - assert isinstance(out, list) - assert "dataset_id" in out[0].keys() - assert "description" in out[0].keys() - - -def test_list_datasets_complete_verbose(capsys): - - list_datasets( - query="trabalho", order_by="score", with_description=True, verbose=True - ) - out, err = capsys.readouterr() # Capture prints - assert "dataset_id" in out - assert "description" in out - - - - -def test_list_datasets_verbose_false(): - - out = list_datasets(from_file=True, verbose=False) - assert type(out) == list - assert len(out) > 0 - - -def test_list_dataset_tables(capsys): - - list_dataset_tables(dataset_id="br_ibge_censo_demografico", from_file=True) - out, err = capsys.readouterr() # Capture prints - assert "table_id" in out - - -def test_list_dataset_tables_complete(capsys): - - list_dataset_tables( - dataset_id="br_ibge_censo_demografico", - 
filter_by="renda", - with_description=True, - from_file=True, - ) - out, err = capsys.readouterr() # Capture prints - assert "table_id" in out - assert "description" in out - assert "renda" in out - - -def test_list_dataset_tables_all_descriptions(capsys): - list_dataset_tables( - dataset_id="br_ibge_censo_demografico", with_description=True, from_file=True - ) - out, err = capsys.readouterr() # Capture prints - assert len(out) > 0 - - -def test_list_dataset_tables_verbose_false(): - - out = list_dataset_tables( - dataset_id="br_ibge_censo_demografico", from_file=True, verbose=False - ) - assert type(out) == list - assert len(out) > 0 - - -def test_get_dataset_description(capsys): - - get_dataset_description("br_ibge_censo_demografico", from_file=True) - out, err = capsys.readouterr() # Capture prints - assert len(out) > 0 - - -def test_get_dataset_description_verbose_false(): - out = get_dataset_description( - "br_ibge_censo_demografico", from_file=True, verbose=False - ) - assert type(out) == str - assert len(out) > 0 - - -def test_get_table_description(capsys): - get_table_description( - "br_ibge_censo_demografico", "setor_censitario_basico_2010", from_file=True - ) - out, err = capsys.readouterr() # Capture prints - assert len(out) > 0 - - -def test_get_table_description_verbose_false(): - out = get_table_description( - dataset_id="br_ibge_censo_demografico", - table_id="setor_censitario_basico_2010", - from_file=True, - verbose=False, - ) - assert type(out) == str - assert len(out) > 0 - - -def test_get_table_columns(capsys): - get_table_columns( - dataset_id="br_ibge_censo_demografico", - table_id="setor_censitario_basico_2010", - from_file=True, - ) - out, err = capsys.readouterr() # Capture prints - assert "name" in out - assert "field_type" in out - assert "description" in out - - -def test_get_table_columns_verbose_false(): - out = get_table_columns( - dataset_id="br_ibge_censo_demografico", - table_id="setor_censitario_basico_2010", - from_file=True, - verbose=False, - ) - assert type(out) == list - assert len(out) > 0 - - -def test_get_table_size(capsys): - get_table_size( - dataset_id="br_ibge_censo_demografico", - table_id="setor_censitario_basico_2010", - billing_project_id=TEST_PROJECT_ID, - from_file=True, - ) - out, err = capsys.readouterr() - assert "num_rows" in out - assert "size_mb" in out - - -def test_get_table_size_verbose_false(): - out = get_table_size( - dataset_id="br_ibge_censo_demografico", - table_id="setor_censitario_basico_2010", - billing_project_id=TEST_PROJECT_ID, - from_file=True, - verbose=False, - ) - assert type(out) == list - assert len(out) > 0 - -def test_search(): - out = search( - query='agua', - order_by='score' - ) - #check if function returns pd.DataFrame - assert isinstance(out, pd.DataFrame) - #check if there is duplicate tables in the result - assert out.id.nunique()==out.shape[0] - #check input error - with pytest.raises(ValueError): - search( - query='agua', - order_by='name' - ) - From 06a951ed5252a2d43e634aefc118702d666a2a8a Mon Sep 17 00:00:00 2001 From: lucascr91 Date: Sat, 29 Jan 2022 13:14:32 -0300 Subject: [PATCH 16/22] remove test_download.py --- python-package/tests/test_download.py | 378 ++++++++++++++++++++++++++ 1 file changed, 378 insertions(+) create mode 100644 python-package/tests/test_download.py diff --git a/python-package/tests/test_download.py b/python-package/tests/test_download.py new file mode 100644 index 000000000..7c878e051 --- /dev/null +++ b/python-package/tests/test_download.py @@ -0,0 +1,378 @@ +from os 
import read +import pytest +from pathlib import Path +import pandas as pd +from pandas_gbq.gbq import GenericGBQException +import shutil +from basedosdados.download.download import search + +from basedosdados import ( + download, + read_sql, + read_table, + list_datasets, + list_dataset_tables, + get_dataset_description, + get_table_description, + get_table_columns, + get_table_size, +) +from basedosdados.exceptions import ( + BaseDosDadosException, + BaseDosDadosNoBillingProjectIDException, + BaseDosDadosInvalidProjectIDException, +) + + +TEST_PROJECT_ID = "basedosdados-dev" +SAVEFILE = Path(__file__).parent / "tmp_bases" / "test.csv" +SAVEPATH = Path(__file__).parent / "tmp_bases" +shutil.rmtree(SAVEPATH, ignore_errors=True) + + +def test_download_by_query(): + + download( + SAVEFILE, + query="select * from `basedosdados.br_ibge_pib.municipio` limit 10", + billing_project_id=TEST_PROJECT_ID, + index=False, + from_file=True, + ) + + assert SAVEFILE.exists() + + +def test_download_by_table(): + + download( + SAVEFILE, + dataset_id="br_ibge_pib", + table_id="municipio", + billing_project_id=TEST_PROJECT_ID, + limit=10, + from_file=True, + index=False, + ) + + assert SAVEFILE.exists() + + +def test_download_save_to_path(): + + download( + SAVEPATH, + dataset_id="br_ibge_pib", + table_id="municipio", + billing_project_id=TEST_PROJECT_ID, + from_file=True, + limit=10, + index=False, + ) + + assert (SAVEPATH / "municipio.csv").exists() + + +def test_download_no_query_or_table(): + + with pytest.raises(BaseDosDadosException): + download( + SAVEFILE, + limit=10, + ) + + +def test_download_pandas_kwargs(): + + download( + SAVEFILE, + dataset_id="br_ibge_pib", + table_id="municipio", + billing_project_id=TEST_PROJECT_ID, + from_file=True, + limit=10, + sep="|", + index=False, + ) + + assert SAVEFILE.exists() + + +def test_read_sql(): + + assert isinstance( + read_sql( + query="select * from `basedosdados.br_ibge_pib.municipio` limit 10", + billing_project_id=TEST_PROJECT_ID, + from_file=True, + ), + pd.DataFrame, + ) + + +def test_read_sql_no_billing_project_id(): + + with pytest.raises(BaseDosDadosNoBillingProjectIDException) as excinfo: + read_sql( + query="select * from `basedosdados.br_ibge_pib.municipio` limit 10", + ) + + assert "We are not sure which Google Cloud project should be billed." 
in str( + excinfo.value + ) + + +def test_read_sql_invalid_billing_project_id(): + + pattern = r"You are using an invalid `billing_project_id`" + + with pytest.raises(BaseDosDadosInvalidProjectIDException, match=pattern): + read_sql( + query="select * from `basedosdados.br_ibge_pib.municipio` limit 10", + billing_project_id="inexistent_project_id", + from_file=True, + ) + + +def test_read_sql_inexistent_project(): + + with pytest.raises(GenericGBQException) as excinfo: + read_sql( + query="select * from `asedosdados.br_ibge_pib.municipio` limit 10", + billing_project_id=TEST_PROJECT_ID, + from_file=True, + ) + + assert "Reason: 404 Not found: Project" in str(excinfo.value) + + +def test_read_sql_inexistent_dataset(): + + with pytest.raises(GenericGBQException) as excinfo: + read_sql( + query="select * from `basedosdados-dev.br_ibge_inexistent.municipio` limit 10", + billing_project_id=TEST_PROJECT_ID, + from_file=True, + ) + + assert "Reason: 404 Not found: Dataset" in str(excinfo.value) + + +def test_read_sql_inexistent_table(): + + with pytest.raises(GenericGBQException) as excinfo: + read_sql( + query="select * from `basedosdados.br_ibge_pib.inexistent` limit 10", + billing_project_id=TEST_PROJECT_ID, + from_file=True, + ) + + assert "Reason: 404 Not found: Table" in str(excinfo.value) + + +def test_read_sql_syntax_error(): + + with pytest.raises(GenericGBQException) as excinfo: + read_sql( + query="invalid_statement * from `basedosdados.br_ibge_pib.municipio` limit 10", + billing_project_id=TEST_PROJECT_ID, + from_file=True, + ) + + assert "Reason: 400 Syntax error" in str(excinfo.value) + + +def test_read_sql_out_of_bound_date(): + + read_sql( + query="select DATE('1000-01-01')", + billing_project_id=TEST_PROJECT_ID, + from_file=True, + ) + + +def test_read_table(): + + assert isinstance( + read_table( + dataset_id="br_ibge_pib", + table_id="municipio", + billing_project_id=TEST_PROJECT_ID, + from_file=True, + limit=10, + ), + pd.DataFrame, + ) + + +def test_list_datasets_simple_verbose(capsys): + + out = list_datasets( + query="trabalho", limit=10, with_description=False, verbose=True + ) + out, err = capsys.readouterr() # Capture prints + assert "dataset_id" in out + # check input error + with pytest.raises(ValueError): + search(query="trabalho", order_by="name") + + +def test_list_datasets_simple_list(): + + out = list_datasets( + query="", limit=12, with_description=False, verbose=False + ) + # check if function returns list + assert isinstance(out, list) + assert len(out) == 12 + + +def test_list_datasets_complete_list(): + + out = list_datasets( + query="trabalho", limit=12, with_description=True, verbose=False + ) + # check if function returns list + assert isinstance(out, list) + assert "dataset_id" in out[0].keys() + assert "description" in out[0].keys() + + +def test_list_datasets_complete_verbose(capsys): + + list_datasets( + query="trabalho", limit=10, with_description=True, verbose=True + ) + out, err = capsys.readouterr() # Capture prints + assert "dataset_id" in out + assert "description" in out + + +def test_list_dataset_tables_simple_verbose(capsys): + + list_dataset_tables(dataset_id="br-sp-alesp", with_description=False, verbose=True) + out, err = capsys.readouterr() # Capture prints + assert "table_id" in out + + +def test_list_dataset_tables_simple_list(): + + out = list_dataset_tables(dataset_id="br-sp-alesp", with_description=False, verbose=False) + + assert type(out) == list + assert len(out) > 0 + + +def test_list_dataset_tables_complete_verbose(capsys): + + 
list_dataset_tables(dataset_id="br-sp-alesp", with_description=True, verbose=True) + + out, err = capsys.readouterr() # Capture prints + assert "table_id" in out + assert "description" in out + + +def test_list_dataset_tables_complete_list(): + + out = list_dataset_tables(dataset_id="br-sp-alesp", with_description=True, verbose=False) + + assert type(out)==list + assert type(out[0])==dict + + +def test_get_dataset_description(capsys): + + get_dataset_description("br_ibge_censo_demografico", from_file=True) + out, err = capsys.readouterr() # Capture prints + assert len(out) > 0 + + +def test_get_dataset_description_verbose_false(): + out = get_dataset_description( + "br_ibge_censo_demografico", from_file=True, verbose=False + ) + assert type(out) == str + assert len(out) > 0 + + +def test_get_table_description(capsys): + get_table_description( + "br_ibge_censo_demografico", "setor_censitario_basico_2010", from_file=True + ) + out, err = capsys.readouterr() # Capture prints + assert len(out) > 0 + + +def test_get_table_description_verbose_false(): + out = get_table_description( + dataset_id="br_ibge_censo_demografico", + table_id="setor_censitario_basico_2010", + from_file=True, + verbose=False, + ) + assert type(out) == str + assert len(out) > 0 + + +def test_get_table_columns(capsys): + get_table_columns( + dataset_id="br_ibge_censo_demografico", + table_id="setor_censitario_basico_2010", + from_file=True, + ) + out, err = capsys.readouterr() # Capture prints + assert "name" in out + assert "field_type" in out + assert "description" in out + + +def test_get_table_columns_verbose_false(): + out = get_table_columns( + dataset_id="br_ibge_censo_demografico", + table_id="setor_censitario_basico_2010", + from_file=True, + verbose=False, + ) + assert type(out) == list + assert len(out) > 0 + + +def test_get_table_size(capsys): + get_table_size( + dataset_id="br_ibge_censo_demografico", + table_id="setor_censitario_basico_2010", + billing_project_id=TEST_PROJECT_ID, + from_file=True, + ) + out, err = capsys.readouterr() + assert "num_rows" in out + assert "size_mb" in out + + +def test_get_table_size_verbose_false(): + out = get_table_size( + dataset_id="br_ibge_censo_demografico", + table_id="setor_censitario_basico_2010", + billing_project_id=TEST_PROJECT_ID, + from_file=True, + verbose=False, + ) + assert type(out) == list + assert len(out) > 0 + +def test_search(): + out = search( + query='agua', + order_by='score' + ) + #check if function returns pd.DataFrame + assert isinstance(out, pd.DataFrame) + #check if there is duplicate tables in the result + assert out.id.nunique()==out.shape[0] + #check input error + with pytest.raises(ValueError): + search( + query='agua', + order_by='name' + ) + From 1defac9d4ef86301cfc3e4a362c9c7d4e140b76c Mon Sep 17 00:00:00 2001 From: lucascr91 Date: Sat, 29 Jan 2022 13:15:50 -0300 Subject: [PATCH 17/22] remove test_download.py --- python-package/tests/test_download.py | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/python-package/tests/test_download.py b/python-package/tests/test_download.py index 7c878e051..02604a350 100644 --- a/python-package/tests/test_download.py +++ b/python-package/tests/test_download.py @@ -257,7 +257,9 @@ def test_list_dataset_tables_simple_verbose(capsys): def test_list_dataset_tables_simple_list(): - out = list_dataset_tables(dataset_id="br-sp-alesp", with_description=False, verbose=False) + out = list_dataset_tables( + dataset_id="br-sp-alesp", with_description=False, verbose=False + ) assert 
type(out) == list assert len(out) > 0 @@ -274,10 +276,12 @@ def test_list_dataset_tables_complete_verbose(capsys): def test_list_dataset_tables_complete_list(): - out = list_dataset_tables(dataset_id="br-sp-alesp", with_description=True, verbose=False) + out = list_dataset_tables( + dataset_id="br-sp-alesp", with_description=True, verbose=False + ) - assert type(out)==list - assert type(out[0])==dict + assert type(out) == list + assert type(out[0]) == dict def test_get_dataset_description(capsys): From fbd5843d11080133d799ac0e8c62d26bf374564f Mon Sep 17 00:00:00 2001 From: lucascr91 Date: Sat, 29 Jan 2022 13:17:33 -0300 Subject: [PATCH 18/22] remove test_download.py --- python-package/tests/test_download.py | 382 -------------------------- 1 file changed, 382 deletions(-) delete mode 100644 python-package/tests/test_download.py diff --git a/python-package/tests/test_download.py b/python-package/tests/test_download.py deleted file mode 100644 index 02604a350..000000000 --- a/python-package/tests/test_download.py +++ /dev/null @@ -1,382 +0,0 @@ -from os import read -import pytest -from pathlib import Path -import pandas as pd -from pandas_gbq.gbq import GenericGBQException -import shutil -from basedosdados.download.download import search - -from basedosdados import ( - download, - read_sql, - read_table, - list_datasets, - list_dataset_tables, - get_dataset_description, - get_table_description, - get_table_columns, - get_table_size, -) -from basedosdados.exceptions import ( - BaseDosDadosException, - BaseDosDadosNoBillingProjectIDException, - BaseDosDadosInvalidProjectIDException, -) - - -TEST_PROJECT_ID = "basedosdados-dev" -SAVEFILE = Path(__file__).parent / "tmp_bases" / "test.csv" -SAVEPATH = Path(__file__).parent / "tmp_bases" -shutil.rmtree(SAVEPATH, ignore_errors=True) - - -def test_download_by_query(): - - download( - SAVEFILE, - query="select * from `basedosdados.br_ibge_pib.municipio` limit 10", - billing_project_id=TEST_PROJECT_ID, - index=False, - from_file=True, - ) - - assert SAVEFILE.exists() - - -def test_download_by_table(): - - download( - SAVEFILE, - dataset_id="br_ibge_pib", - table_id="municipio", - billing_project_id=TEST_PROJECT_ID, - limit=10, - from_file=True, - index=False, - ) - - assert SAVEFILE.exists() - - -def test_download_save_to_path(): - - download( - SAVEPATH, - dataset_id="br_ibge_pib", - table_id="municipio", - billing_project_id=TEST_PROJECT_ID, - from_file=True, - limit=10, - index=False, - ) - - assert (SAVEPATH / "municipio.csv").exists() - - -def test_download_no_query_or_table(): - - with pytest.raises(BaseDosDadosException): - download( - SAVEFILE, - limit=10, - ) - - -def test_download_pandas_kwargs(): - - download( - SAVEFILE, - dataset_id="br_ibge_pib", - table_id="municipio", - billing_project_id=TEST_PROJECT_ID, - from_file=True, - limit=10, - sep="|", - index=False, - ) - - assert SAVEFILE.exists() - - -def test_read_sql(): - - assert isinstance( - read_sql( - query="select * from `basedosdados.br_ibge_pib.municipio` limit 10", - billing_project_id=TEST_PROJECT_ID, - from_file=True, - ), - pd.DataFrame, - ) - - -def test_read_sql_no_billing_project_id(): - - with pytest.raises(BaseDosDadosNoBillingProjectIDException) as excinfo: - read_sql( - query="select * from `basedosdados.br_ibge_pib.municipio` limit 10", - ) - - assert "We are not sure which Google Cloud project should be billed." 
in str( - excinfo.value - ) - - -def test_read_sql_invalid_billing_project_id(): - - pattern = r"You are using an invalid `billing_project_id`" - - with pytest.raises(BaseDosDadosInvalidProjectIDException, match=pattern): - read_sql( - query="select * from `basedosdados.br_ibge_pib.municipio` limit 10", - billing_project_id="inexistent_project_id", - from_file=True, - ) - - -def test_read_sql_inexistent_project(): - - with pytest.raises(GenericGBQException) as excinfo: - read_sql( - query="select * from `asedosdados.br_ibge_pib.municipio` limit 10", - billing_project_id=TEST_PROJECT_ID, - from_file=True, - ) - - assert "Reason: 404 Not found: Project" in str(excinfo.value) - - -def test_read_sql_inexistent_dataset(): - - with pytest.raises(GenericGBQException) as excinfo: - read_sql( - query="select * from `basedosdados-dev.br_ibge_inexistent.municipio` limit 10", - billing_project_id=TEST_PROJECT_ID, - from_file=True, - ) - - assert "Reason: 404 Not found: Dataset" in str(excinfo.value) - - -def test_read_sql_inexistent_table(): - - with pytest.raises(GenericGBQException) as excinfo: - read_sql( - query="select * from `basedosdados.br_ibge_pib.inexistent` limit 10", - billing_project_id=TEST_PROJECT_ID, - from_file=True, - ) - - assert "Reason: 404 Not found: Table" in str(excinfo.value) - - -def test_read_sql_syntax_error(): - - with pytest.raises(GenericGBQException) as excinfo: - read_sql( - query="invalid_statement * from `basedosdados.br_ibge_pib.municipio` limit 10", - billing_project_id=TEST_PROJECT_ID, - from_file=True, - ) - - assert "Reason: 400 Syntax error" in str(excinfo.value) - - -def test_read_sql_out_of_bound_date(): - - read_sql( - query="select DATE('1000-01-01')", - billing_project_id=TEST_PROJECT_ID, - from_file=True, - ) - - -def test_read_table(): - - assert isinstance( - read_table( - dataset_id="br_ibge_pib", - table_id="municipio", - billing_project_id=TEST_PROJECT_ID, - from_file=True, - limit=10, - ), - pd.DataFrame, - ) - - -def test_list_datasets_simple_verbose(capsys): - - out = list_datasets( - query="trabalho", limit=10, with_description=False, verbose=True - ) - out, err = capsys.readouterr() # Capture prints - assert "dataset_id" in out - # check input error - with pytest.raises(ValueError): - search(query="trabalho", order_by="name") - - -def test_list_datasets_simple_list(): - - out = list_datasets( - query="", limit=12, with_description=False, verbose=False - ) - # check if function returns list - assert isinstance(out, list) - assert len(out) == 12 - - -def test_list_datasets_complete_list(): - - out = list_datasets( - query="trabalho", limit=12, with_description=True, verbose=False - ) - # check if function returns list - assert isinstance(out, list) - assert "dataset_id" in out[0].keys() - assert "description" in out[0].keys() - - -def test_list_datasets_complete_verbose(capsys): - - list_datasets( - query="trabalho", limit=10, with_description=True, verbose=True - ) - out, err = capsys.readouterr() # Capture prints - assert "dataset_id" in out - assert "description" in out - - -def test_list_dataset_tables_simple_verbose(capsys): - - list_dataset_tables(dataset_id="br-sp-alesp", with_description=False, verbose=True) - out, err = capsys.readouterr() # Capture prints - assert "table_id" in out - - -def test_list_dataset_tables_simple_list(): - - out = list_dataset_tables( - dataset_id="br-sp-alesp", with_description=False, verbose=False - ) - - assert type(out) == list - assert len(out) > 0 - - -def test_list_dataset_tables_complete_verbose(capsys): 
- - list_dataset_tables(dataset_id="br-sp-alesp", with_description=True, verbose=True) - - out, err = capsys.readouterr() # Capture prints - assert "table_id" in out - assert "description" in out - - -def test_list_dataset_tables_complete_list(): - - out = list_dataset_tables( - dataset_id="br-sp-alesp", with_description=True, verbose=False - ) - - assert type(out) == list - assert type(out[0]) == dict - - -def test_get_dataset_description(capsys): - - get_dataset_description("br_ibge_censo_demografico", from_file=True) - out, err = capsys.readouterr() # Capture prints - assert len(out) > 0 - - -def test_get_dataset_description_verbose_false(): - out = get_dataset_description( - "br_ibge_censo_demografico", from_file=True, verbose=False - ) - assert type(out) == str - assert len(out) > 0 - - -def test_get_table_description(capsys): - get_table_description( - "br_ibge_censo_demografico", "setor_censitario_basico_2010", from_file=True - ) - out, err = capsys.readouterr() # Capture prints - assert len(out) > 0 - - -def test_get_table_description_verbose_false(): - out = get_table_description( - dataset_id="br_ibge_censo_demografico", - table_id="setor_censitario_basico_2010", - from_file=True, - verbose=False, - ) - assert type(out) == str - assert len(out) > 0 - - -def test_get_table_columns(capsys): - get_table_columns( - dataset_id="br_ibge_censo_demografico", - table_id="setor_censitario_basico_2010", - from_file=True, - ) - out, err = capsys.readouterr() # Capture prints - assert "name" in out - assert "field_type" in out - assert "description" in out - - -def test_get_table_columns_verbose_false(): - out = get_table_columns( - dataset_id="br_ibge_censo_demografico", - table_id="setor_censitario_basico_2010", - from_file=True, - verbose=False, - ) - assert type(out) == list - assert len(out) > 0 - - -def test_get_table_size(capsys): - get_table_size( - dataset_id="br_ibge_censo_demografico", - table_id="setor_censitario_basico_2010", - billing_project_id=TEST_PROJECT_ID, - from_file=True, - ) - out, err = capsys.readouterr() - assert "num_rows" in out - assert "size_mb" in out - - -def test_get_table_size_verbose_false(): - out = get_table_size( - dataset_id="br_ibge_censo_demografico", - table_id="setor_censitario_basico_2010", - billing_project_id=TEST_PROJECT_ID, - from_file=True, - verbose=False, - ) - assert type(out) == list - assert len(out) > 0 - -def test_search(): - out = search( - query='agua', - order_by='score' - ) - #check if function returns pd.DataFrame - assert isinstance(out, pd.DataFrame) - #check if there is duplicate tables in the result - assert out.id.nunique()==out.shape[0] - #check input error - with pytest.raises(ValueError): - search( - query='agua', - order_by='name' - ) - From 10253f91fac8130004e859322fd2614c0d0b06ef Mon Sep 17 00:00:00 2001 From: lucascr91 Date: Sat, 29 Jan 2022 13:25:47 -0300 Subject: [PATCH 19/22] add tests metadata --- .../tests/test_download/test_metadata.py | 204 ++++++++++++++++++ 1 file changed, 204 insertions(+) create mode 100644 python-package/tests/test_download/test_metadata.py diff --git a/python-package/tests/test_download/test_metadata.py b/python-package/tests/test_download/test_metadata.py new file mode 100644 index 000000000..3605dc819 --- /dev/null +++ b/python-package/tests/test_download/test_metadata.py @@ -0,0 +1,204 @@ +from os import read +import pytest +from pathlib import Path +import pandas as pd +from pandas_gbq.gbq import GenericGBQException +import shutil +import requests + +from basedosdados import ( + 
list_datasets, + list_dataset_tables, + get_dataset_description, + get_table_description, + get_table_columns, + get_table_size, + search +) + + +TEST_PROJECT_ID = "basedosdados-dev" +SAVEFILE = Path(__file__).parent / "tmp_bases" / "test.csv" +SAVEPATH = Path(__file__).parent / "tmp_bases" +shutil.rmtree(SAVEPATH, ignore_errors=True) + +def test_list_datasets_simple_verbose(capsys): + + out = list_datasets( + query="trabalho", limit=10, with_description=False, verbose=True + ) + out, err = capsys.readouterr() # Capture prints + assert "dataset_id" in out + # check input error + with pytest.raises(ValueError): + search(query="trabalho", order_by="name") + + +def test_list_datasets_simple_list(): + + out = list_datasets(query="", limit=12, with_description=False, verbose=False) + # check if function returns list + assert isinstance(out, list) + assert len(out) == 12 + + +def test_list_datasets_complete_list(): + + out = list_datasets( + query="trabalho", limit=12, with_description=True, verbose=False + ) + # check if function returns list + assert isinstance(out, list) + assert "dataset_id" in out[0].keys() + assert "description" in out[0].keys() + + +def test_list_datasets_complete_verbose(capsys): + + list_datasets(query="trabalho", limit=10, with_description=True, verbose=True) + out, err = capsys.readouterr() # Capture prints + assert "dataset_id" in out + assert "description" in out + + +def test_list_dataset_tables_simple_verbose(capsys): + + list_dataset_tables(dataset_id="br_me_caged", with_description=False, verbose=True) + out, err = capsys.readouterr() # Capture prints + assert "table_id" in out + + +def test_list_dataset_tables_simple_list(): + + out = list_dataset_tables( + dataset_id="br_me_caged", with_description=False, verbose=False + ) + + assert type(out) == list + assert len(out) > 0 + + +def test_list_dataset_tables_complete_verbose(capsys): + + list_dataset_tables(dataset_id="br_me_caged", with_description=True, verbose=True) + + out, err = capsys.readouterr() # Capture prints + assert "table_id" in out + assert "description" in out + + +def test_list_dataset_tables_complete_list(): + + out = list_dataset_tables( + dataset_id="br_me_caged", with_description=True, verbose=False + ) + + assert type(out) == list + assert type(out[0]) == dict + + +def test_get_dataset_description(capsys): + + get_dataset_description("br_me_caged", verbose=True) + out, err = capsys.readouterr() # Capture prints + assert len(out) > 0 + + +def test_get_dataset_description_verbose_false(): + out = get_dataset_description("br_me_caged", verbose=False) + assert type(out) == str + assert len(out) > 0 + + +def test_get_table_description(capsys): + get_table_description("br_me_caged", "microdados_antigos") + out, err = capsys.readouterr() # Capture prints + assert len(out) > 0 + + +def test_get_table_description_verbose_false(): + out = get_table_description( + dataset_id="br_me_caged", + table_id="microdados_antigos", + verbose=False, + ) + assert type(out) == str + assert len(out) > 0 + + +def test_get_table_columns(capsys): + get_table_columns( + dataset_id="br_me_caged", + table_id="microdados_antigos", + ) + out, err = capsys.readouterr() # Capture prints + assert "name" in out + assert "description" in out + + +def test_get_table_columns_verbose_false(): + out = get_table_columns( + dataset_id="br_me_caged", + table_id="microdados_antigos", + verbose=False, + ) + assert type(out) == list + assert len(out) > 0 + + +def test_get_table_size(capsys): + get_table_size( + 
dataset_id="br_ibge_censo_demografico", + table_id="setor_censitario_basico_2010", + billing_project_id=TEST_PROJECT_ID, + from_file=True, + ) + out, err = capsys.readouterr() + assert "num_rows" in out + assert "size_mb" in out + + +def test_get_table_size_verbose_false(): + out = get_table_size( + dataset_id="br_ibge_censo_demografico", + table_id="setor_censitario_basico_2010", + billing_project_id=TEST_PROJECT_ID, + from_file=True, + verbose=False, + ) + assert type(out) == list + assert len(out) > 0 + + +def test_search(): + out = search(query="agua", order_by="score") + # check if function returns pd.DataFrame + assert isinstance(out, pd.DataFrame) + # check if there is duplicate tables in the result + assert out.id.nunique() == out.shape[0] + # check input error + with pytest.raises(ValueError): + search(query="agua", order_by="name") + +def test_get_table_size(capsys): + get_table_size( + dataset_id="br_ibge_censo_demografico", + table_id="setor_censitario_basico_2010", + billing_project_id=TEST_PROJECT_ID, + from_file=True, + ) + out, err = capsys.readouterr() + assert "num_rows" in out + assert "size_mb" in out + + +def test_get_table_size_verbose_false(): + out = get_table_size( + dataset_id="br_ibge_censo_demografico", + table_id="setor_censitario_basico_2010", + billing_project_id=TEST_PROJECT_ID, + from_file=True, + verbose=False, + ) + assert type(out) == list + assert len(out) > 0 \ No newline at end of file From e2b059297541719a346fbe7075b3fc25528071f3 Mon Sep 17 00:00:00 2001 From: lucascr91 Date: Sat, 29 Jan 2022 13:30:56 -0300 Subject: [PATCH 20/22] remove test_download.py --- python-package/tests/test_download.py | 352 ++++++++++++++++++++++++++ 1 file changed, 352 insertions(+) create mode 100644 python-package/tests/test_download.py diff --git a/python-package/tests/test_download.py b/python-package/tests/test_download.py new file mode 100644 index 000000000..3fa9222d7 --- /dev/null +++ b/python-package/tests/test_download.py @@ -0,0 +1,352 @@ +from os import read +import pytest +from pathlib import Path +import pandas as pd +from pandas_gbq.gbq import GenericGBQException +import shutil + +from basedosdados import ( + download, + read_sql, + read_table, + list_datasets, + list_dataset_tables, + get_dataset_description, + get_table_description, + get_table_columns, + get_table_size, +) +from basedosdados.exceptions import ( + BaseDosDadosException, + BaseDosDadosNoBillingProjectIDException, + BaseDosDadosInvalidProjectIDException, +) + + +TEST_PROJECT_ID = "basedosdados-dev" +SAVEFILE = Path(__file__).parent / "tmp_bases" / "test.csv" +SAVEPATH = Path(__file__).parent / "tmp_bases" +shutil.rmtree(SAVEPATH, ignore_errors=True) + + +def test_download_by_query(): + + download( + SAVEFILE, + query="select * from `basedosdados.br_ibge_pib.municipio` limit 10", + billing_project_id=TEST_PROJECT_ID, + index=False, + from_file=True, + ) + + assert SAVEFILE.exists() + + +def test_download_by_table(): + + download( + SAVEFILE, + dataset_id="br_ibge_pib", + table_id="municipio", + billing_project_id=TEST_PROJECT_ID, + limit=10, + from_file=True, + index=False, + ) + + assert SAVEFILE.exists() + + +def test_download_save_to_path(): + + download( + SAVEPATH, + dataset_id="br_ibge_pib", + table_id="municipio", + billing_project_id=TEST_PROJECT_ID, + from_file=True, + limit=10, + index=False, + ) + + assert (SAVEPATH / "municipio.csv").exists() + + +def test_download_no_query_or_table(): + + with pytest.raises(BaseDosDadosException): + download( + SAVEFILE, + limit=10, + ) + + +def 
test_download_pandas_kwargs(): + + download( + SAVEFILE, + dataset_id="br_ibge_pib", + table_id="municipio", + billing_project_id=TEST_PROJECT_ID, + from_file=True, + limit=10, + sep="|", + index=False, + ) + + assert SAVEFILE.exists() + + +def test_read_sql(): + + assert isinstance( + read_sql( + query="select * from `basedosdados.br_ibge_pib.municipio` limit 10", + billing_project_id=TEST_PROJECT_ID, + from_file=True, + ), + pd.DataFrame, + ) + + +def test_read_sql_no_billing_project_id(): + + with pytest.raises(BaseDosDadosNoBillingProjectIDException) as excinfo: + read_sql( + query="select * from `basedosdados.br_ibge_pib.municipio` limit 10", + ) + + assert "We are not sure which Google Cloud project should be billed." in str( + excinfo.value + ) + + +def test_read_sql_invalid_billing_project_id(): + + pattern = r"You are using an invalid `billing_project_id`" + + with pytest.raises(BaseDosDadosInvalidProjectIDException, match=pattern): + read_sql( + query="select * from `basedosdados.br_ibge_pib.municipio` limit 10", + billing_project_id="inexistent_project_id", + from_file=True, + ) + + +def test_read_sql_inexistent_project(): + + with pytest.raises(GenericGBQException) as excinfo: + read_sql( + query="select * from `asedosdados.br_ibge_pib.municipio` limit 10", + billing_project_id=TEST_PROJECT_ID, + from_file=True, + ) + + assert "Reason: 404 Not found: Project" in str(excinfo.value) + + +def test_read_sql_inexistent_dataset(): + + with pytest.raises(GenericGBQException) as excinfo: + read_sql( + query="select * from `basedosdados-dev.br_ibge_inexistent.municipio` limit 10", + billing_project_id=TEST_PROJECT_ID, + from_file=True, + ) + + assert "Reason: 404 Not found: Dataset" in str(excinfo.value) + + +def test_read_sql_inexistent_table(): + + with pytest.raises(GenericGBQException) as excinfo: + read_sql( + query="select * from `basedosdados.br_ibge_pib.inexistent` limit 10", + billing_project_id=TEST_PROJECT_ID, + from_file=True, + ) + + assert "Reason: 404 Not found: Table" in str(excinfo.value) + + +def test_read_sql_syntax_error(): + + with pytest.raises(GenericGBQException) as excinfo: + read_sql( + query="invalid_statement * from `basedosdados.br_ibge_pib.municipio` limit 10", + billing_project_id=TEST_PROJECT_ID, + from_file=True, + ) + + assert "Reason: 400 Syntax error" in str(excinfo.value) + + +def test_read_sql_out_of_bound_date(): + + read_sql( + query="select DATE('1000-01-01')", + billing_project_id=TEST_PROJECT_ID, + from_file=True, + ) + + +def test_read_table(): + + assert isinstance( + read_table( + dataset_id="br_ibge_pib", + table_id="municipio", + billing_project_id=TEST_PROJECT_ID, + from_file=True, + limit=10, + ), + pd.DataFrame, + ) + + +def test_list_datasets(capsys): + + list_datasets(from_file=True) + out, err = capsys.readouterr() # Capture prints + assert "dataset_id" in out + + +def test_list_datasets_complete(capsys): + + list_datasets(with_description=True, filter_by="ibge", from_file=True) + out, err = capsys.readouterr() # Capture prints + assert "dataset_id" in out + assert "description" in out + + +def test_list_datasets_all_descriptions(capsys): + + list_datasets(with_description=True, from_file=True) + out, err = capsys.readouterr() # Capture prints + assert len(out) > 0 + + +def test_list_datasets_verbose_false(): + + out = list_datasets(from_file=True, verbose=False) + assert type(out) == list + assert len(out) > 0 + + +def test_list_dataset_tables(capsys): + + list_dataset_tables(dataset_id="br_ibge_censo_demografico", from_file=True) + 
+    out, err = capsys.readouterr()  # Capture prints
+    assert "table_id" in out
+
+
+def test_list_dataset_tables_complete(capsys):
+
+    list_dataset_tables(
+        dataset_id="br_ibge_censo_demografico",
+        filter_by="renda",
+        with_description=True,
+        from_file=True,
+    )
+    out, err = capsys.readouterr()  # Capture prints
+    assert "table_id" in out
+    assert "description" in out
+    assert "renda" in out
+
+
+def test_list_dataset_tables_all_descriptions(capsys):
+    list_dataset_tables(
+        dataset_id="br_ibge_censo_demografico", with_description=True, from_file=True
+    )
+    out, err = capsys.readouterr()  # Capture prints
+    assert len(out) > 0
+
+
+def test_list_dataset_tables_verbose_false():
+
+    out = list_dataset_tables(
+        dataset_id="br_ibge_censo_demografico", from_file=True, verbose=False
+    )
+    assert type(out) == list
+    assert len(out) > 0
+
+
+def test_get_dataset_description(capsys):
+
+    get_dataset_description("br_ibge_censo_demografico", from_file=True)
+    out, err = capsys.readouterr()  # Capture prints
+    assert len(out) > 0
+
+
+def test_get_dataset_description_verbose_false():
+    out = get_dataset_description(
+        "br_ibge_censo_demografico", from_file=True, verbose=False
+    )
+    assert type(out) == str
+    assert len(out) > 0
+
+
+def test_get_table_description(capsys):
+    get_table_description(
+        "br_ibge_censo_demografico", "setor_censitario_basico_2010", from_file=True
+    )
+    out, err = capsys.readouterr()  # Capture prints
+    assert len(out) > 0
+
+
+def test_get_table_description_verbose_false():
+    out = get_table_description(
+        dataset_id="br_ibge_censo_demografico",
+        table_id="setor_censitario_basico_2010",
+        from_file=True,
+        verbose=False,
+    )
+    assert type(out) == str
+    assert len(out) > 0
+
+
+def test_get_table_columns(capsys):
+    get_table_columns(
+        dataset_id="br_ibge_censo_demografico",
+        table_id="setor_censitario_basico_2010",
+        from_file=True,
+    )
+    out, err = capsys.readouterr()  # Capture prints
+    assert "name" in out
+    assert "field_type" in out
+    assert "description" in out
+
+
+def test_get_table_columns_verbose_false():
+    out = get_table_columns(
+        dataset_id="br_ibge_censo_demografico",
+        table_id="setor_censitario_basico_2010",
+        from_file=True,
+        verbose=False,
+    )
+    assert type(out) == list
+    assert len(out) > 0
+
+
+def test_get_table_size(capsys):
+    get_table_size(
+        dataset_id="br_ibge_censo_demografico",
+        table_id="setor_censitario_basico_2010",
+        billing_project_id=TEST_PROJECT_ID,
+        from_file=True,
+    )
+    out, err = capsys.readouterr()
+    assert "num_rows" in out
+    assert "size_mb" in out
+
+
+def test_get_table_size_verbose_false():
+    out = get_table_size(
+        dataset_id="br_ibge_censo_demografico",
+        table_id="setor_censitario_basico_2010",
+        billing_project_id=TEST_PROJECT_ID,
+        from_file=True,
+        verbose=False,
+    )
+    assert type(out) == list
+    assert len(out) > 0

From cfea6c0a963a37d52d7dcc5b2e3755f3536eb8eb Mon Sep 17 00:00:00 2001
From: lucascr91
Date: Sat, 29 Jan 2022 16:50:57 -0300
Subject: [PATCH 21/22] remove unused imports

---
 python-package/basedosdados/download/metadata.py    | 2 +-
 python-package/tests/test_download/test_metadata.py | 1 -
 2 files changed, 1 insertion(+), 2 deletions(-)

diff --git a/python-package/basedosdados/download/metadata.py b/python-package/basedosdados/download/metadata.py
index 24dbc02cf..4f8464998 100644
--- a/python-package/basedosdados/download/metadata.py
+++ b/python-package/basedosdados/download/metadata.py
@@ -421,7 +421,7 @@ def search(query, order_by):
     # first loop identifies the number of tables in each dataset
     for dataset in json_response["result"]["datasets"]:
         tables_dfs = []
-        n_tables = len(dataset["resources"])
+        len(dataset["resources"])
         # second loop extracts tables' information for each dataset
         for table in dataset["resources"]:
             data_table = pd.DataFrame(
diff --git a/python-package/tests/test_download/test_metadata.py b/python-package/tests/test_download/test_metadata.py
index 3605dc819..3dd129862 100644
--- a/python-package/tests/test_download/test_metadata.py
+++ b/python-package/tests/test_download/test_metadata.py
@@ -1,4 +1,3 @@
-from os import read
 import pytest
 from pathlib import Path
 import pandas as pd

From 14a3fd8e12262e859b6ef31a51f48d92bd891bc9 Mon Sep 17 00:00:00 2001
From: lucascr91
Date: Tue, 22 Feb 2022 17:06:09 -0300
Subject: [PATCH 22/22] [infra] add _safe_fetch and get_table_size functions

---
 python-package/basedosdados/__init__.py      |   3 +-
 .../basedosdados/download/metadata.py        | 132 +++++-------------
 .../tests/test_download/test_metadata.py     |  50 ++-----
 3 files changed, 48 insertions(+), 137 deletions(-)

diff --git a/python-package/basedosdados/__init__.py b/python-package/basedosdados/__init__.py
index 58e6a68ca..fd197e9ab 100644
--- a/python-package/basedosdados/__init__.py
+++ b/python-package/basedosdados/__init__.py
@@ -21,4 +21,5 @@
     get_dataset_description,
     get_table_columns,
     get_table_size,
-)
+    search
+)
\ No newline at end of file
diff --git a/python-package/basedosdados/download/metadata.py b/python-package/basedosdados/download/metadata.py
index 4f8464998..9aae894ba 100644
--- a/python-package/basedosdados/download/metadata.py
+++ b/python-package/basedosdados/download/metadata.py
@@ -4,20 +4,22 @@
 from basedosdados.download.base import credentials
 
-
-def _get_header(text):
-    """Gets first paragraph of a text
-    Args:
-        text (str or None): Text to be split
-    Returns:
-        str: First paragraph
-    """
-
-    if isinstance(text, str):
-        return text.split("\n")[0]
-    elif text is None:
-        return ""
-
+def _safe_fetch(url: str):
+    """
+    Safely fetches urls and, if something goes wrong, informs the user of the possible cause
+    """
+    try:
+        response = requests.get(url)
+        response.raise_for_status()
+        return response
+    except requests.exceptions.HTTPError as errh:
+        print("Http Error:", errh)
+    except requests.exceptions.ConnectionError as errc:
+        print("Error Connecting:", errc)
+    except requests.exceptions.Timeout as errt:
+        print("Timeout Error:", errt)
+    except requests.exceptions.RequestException as err:
+        print("This url doesn't appear to exist:", err)
 
 def _fix_size(s, step=80):
@@ -44,14 +46,6 @@ def _print_output(df):
         print("-" * (step + 15))
     print()
 
-    # func = lambda lista, final, step: (
-    #     func(lista[1:],
-    #         (final + lista[0] + ' ')
-    #         if len(final.split('\n')[-1]) <= step
-    #         else final + '\n',
-    #         step
-    #     ) if len(lista) else final)
-
 
 def _handle_output(verbose, output_type, df, col_name=None):
     """Handles datasets and tables listing outputs based on user's choice.
@@ -110,12 +104,7 @@ def list_datasets(query, limit=10, with_description=False, verbose=True):
 
     url = f"https://basedosdados.org/api/3/action/bd_dataset_search?q={query}&page_size={limit}&resource_type=bdm_table"
 
-    # validate url
-    try:
-        response = requests.get(url)
-        response.raise_for_status()
-    except requests.exceptions.HTTPError as err:
-        return err
+    response = _safe_fetch(url)
 
     json_response = response.json()
 
@@ -171,14 +160,11 @@ def list_dataset_tables(
         stdout | list
     """
 
+    dataset_id = dataset_id.replace("-", "_")  # The bd_dataset_search endpoint returns dataset_id with a hyphen as separator, while the endpoint urls that take a dataset_id parameter use an underscore. See issue #1079
+
     url = f"https://basedosdados.org/api/3/action/bd_bdm_dataset_show?dataset_id={dataset_id}"
 
-    # validate url
-    try:
-        response = requests.get(url)
-        response.raise_for_status()
-    except requests.exceptions.HTTPError as err:
-        return err
+    response = _safe_fetch(url)
 
     json_response = response.json()
 
@@ -230,12 +216,7 @@ def get_dataset_description(
     """
     url = f"https://basedosdados.org/api/3/action/bd_bdm_dataset_show?dataset_id={dataset_id}"
 
-    # validate url
-    try:
-        response = requests.get(url)
-        response.raise_for_status()
-    except requests.exceptions.HTTPError as err:
-        return err
+    response = _safe_fetch(url)
 
     json_response = response.json()
 
@@ -269,12 +250,7 @@ def get_table_description(
 
     url = f"https://basedosdados.org/api/3/action/bd_bdm_table_show?dataset_id={dataset_id}&table_id={table_id}"
 
-    # validate url
-    try:
-        response = requests.get(url)
-        response.raise_for_status()
-    except requests.exceptions.HTTPError as err:
-        return err
+    response = _safe_fetch(url)
 
     json_response = response.json()
 
@@ -309,12 +285,7 @@ def get_table_columns(
 
     url = f"https://basedosdados.org/api/3/action/bd_bdm_table_show?dataset_id={dataset_id}&table_id={table_id}"
 
-    # validate url
-    try:
-        response = requests.get(url)
-        response.raise_for_status()
-    except requests.exceptions.HTTPError as err:
-        return err
+    response = _safe_fetch(url)
 
     json_response = response.json()
 
@@ -329,13 +300,9 @@ def get_table_columns(
 def get_table_size(
     dataset_id,
     table_id,
-    billing_project_id,
-    query_project_id="basedosdados",
-    from_file=False,
     verbose=True,
 ):
-    """Use a query to get the number of rows and size (in Mb) of a table query
-    from BigQuery. Prints information on screen in markdown friendly format.
+    """Use the table metadata to get the number of rows and size (in Mb) of a table.
 
     WARNING: this query may cost a lot depending on the table.
 
@@ -345,45 +312,24 @@ def get_table_size(
         table_id (str): Optional.
            Table id available in basedosdados.dataset_id.
            It should always come with dataset_id.
-        query_project_id (str): Optional.
-            Which project the table lives. You can change this if you want to query different projects.
-        billing_project_id (str): Optional.
-            Project that will be billed. Find your Project ID here https://console.cloud.google.com/projectselector2/home/dashboard
         verbose (bool): Optional.
            If set to True, information is printed to the screen. If set to False, data is returned as a `list` of `dict`s.
-    Example:
-        get_table_size(
-            dataset_id='br_ibge_censo2010',
-            table_id='pessoa_renda_setor_censitario',
-            billing_project_id='yourprojectid'
-        )
     """
-    billing_client = bigquery.Client(
-        credentials=credentials(from_file=from_file), project=billing_project_id
-    )
-
-    query = f"""SELECT COUNT(*) FROM {query_project_id}.{dataset_id}.{table_id}"""
-
-    job = billing_client.query(query, location="US")
-
-    num_rows = job.to_dataframe().loc[0, "f0_"]
+    url = f"https://basedosdados.org/api/3/action/bd_bdm_table_show?dataset_id={dataset_id}&table_id={table_id}"
 
-    size_mb = round(job.total_bytes_processed / 1024 / 1024, 2)
+    response = _safe_fetch(url)
 
-    table_data = pd.DataFrame(
-        [
-            {
-                "project_id": query_project_id,
-                "dataset_id": dataset_id,
-                "table_id": table_id,
-                "num_rows": num_rows,
-                "size_mb": size_mb,
-            }
-        ]
-    )
+    json_response = response.json()
 
-    return _handle_output(verbose=verbose, output_type="records", df=table_data)
+    size = json_response["result"]["size"]
 
+    if size is None:
+        print("Size not available")
+    else:
+        if verbose:
+            _print_output(pd.DataFrame(size))
+        else:
+            return size
 
 def search(query, order_by):
     """This function works as a wrapper to the `bd_dataset_search` website API
     endpoint.
@@ -408,12 +354,7 @@ def search(query, order_by):
 
     url = f"https://basedosdados.org/api/3/action/bd_dataset_search?q={query}&order_by={order_by}&resource_type=bdm_table"
 
-    # validate url
-    try:
-        response = requests.get(url)
-        response.raise_for_status()
-    except requests.exceptions.HTTPError as err:
-        print(err)
+    response = _safe_fetch(url)
 
     json_response = response.json()
 
@@ -434,5 +375,4 @@ def search(query, order_by):
     # append datasets' dataframes
     df = dataset_dfs[0].append(dataset_dfs[1:]).reset_index(drop=True)
 
-    return df
-
+    return df
\ No newline at end of file
diff --git a/python-package/tests/test_download/test_metadata.py b/python-package/tests/test_download/test_metadata.py
index 3dd129862..d6788de83 100644
--- a/python-package/tests/test_download/test_metadata.py
+++ b/python-package/tests/test_download/test_metadata.py
@@ -12,9 +12,10 @@
     get_table_description,
     get_table_columns,
     get_table_size,
-    search
+    search,
 )
 
+from basedosdados.download.metadata import _safe_fetch
 
 TEST_PROJECT_ID = "basedosdados-dev"
 SAVEFILE = Path(__file__).parent / "tmp_bases" / "test.csv"
@@ -144,31 +145,6 @@ def test_get_table_columns_verbose_false():
     assert type(out) == list
     assert len(out) > 0
 
-
-def test_get_table_size(capsys):
-    get_table_size(
-        dataset_id="br_ibge_censo_demografico",
-        table_id="setor_censitario_basico_2010",
-        billing_project_id=TEST_PROJECT_ID,
-        from_file=True,
-    )
-    out, err = capsys.readouterr()
-    assert "num_rows" in out
-    assert "size_mb" in out
-
-
-def test_get_table_size_verbose_false():
-    out = get_table_size(
-        dataset_id="br_ibge_censo_demografico",
-        table_id="setor_censitario_basico_2010",
-        billing_project_id=TEST_PROJECT_ID,
-        from_file=True,
-        verbose=False,
-    )
-    assert type(out) == list
-    assert len(out) > 0
-
 def test_search():
     out = search(query="agua", order_by="score")
     # check if function returns pd.DataFrame
     assert isinstance(out, pd.DataFrame)
     # check if there is duplicate tables in the result
     assert out.id.nunique() == out.shape[0]
     # check input error
     with pytest.raises(ValueError):
         search(query="agua", order_by="name")
 
+def test_get_table_size(capsys):
+    get_table_size(
dataset_id="br_ibge_censo_demografico", - table_id="setor_censitario_basico_2010", - billing_project_id=TEST_PROJECT_ID, - from_file=True, - verbose=False, - ) - assert type(out) == list - assert len(out) > 0 \ No newline at end of file + _safe_fetch("https://www.lkajsdhgfal.com.br") + out, err = capsys.readouterr() # Capture prints + assert "HTTPSConnection" in out + + response = _safe_fetch("https://basedosdados.org/api/3/action/bd_dataset_search?q=agua&page_size=10&resource_type=bdm_table") + assert type(response.json())==dict