From a480d4902e4d03a744e22c44dffcb32ae5f10450 Mon Sep 17 00:00:00 2001
From: Diego Oliveira
Date: Tue, 22 Feb 2022 17:35:01 -0300
Subject: [PATCH] [infra] change download functions to consume CKAN endpoints
 #1129 (#1130)

* [infra] add function to wrap bd_dataset_search endpoint
* Update download.py
* [infra] modify list_datasets function to consume CKAN endpoint
* [infra] fix list_dataset function to include limit and remove order_by
* [infra] change function list_dataset_tables to use CKAN endpoint
* [infra] apply PEP8 to list_dataset_tables and respective tests
* add get_dataset_description, get_table_description, get_table_columns
* [infra] fix dataset_config.yaml folder path (#1067)
* feat(infra) merge master
* fix files organization to match master
* remove download.py
* remove test_download
* Delete test_download.py
* remove test files
* remove test_download.py
* remove test_download.py
* remove test_download.py
* remove test_download.py
* add tests metadata
* remove test_download.py
* remove unused imports
* [infra] add _safe_fetch and get_table_size functions

Co-authored-by: lucascr91
---
 python-package/basedosdados/__init__.py      |   3 +-
 .../basedosdados/download/metadata.py        | 420 +++++++++---------
 .../tests/test_download/test_metadata.py     | 138 +++---
 3 files changed, 288 insertions(+), 273 deletions(-)

diff --git a/python-package/basedosdados/__init__.py b/python-package/basedosdados/__init__.py
index 58e6a68ca..fd197e9ab 100644
--- a/python-package/basedosdados/__init__.py
+++ b/python-package/basedosdados/__init__.py
@@ -21,4 +21,5 @@
     get_dataset_description,
     get_table_columns,
     get_table_size,
-)
+    search
+)
\ No newline at end of file
diff --git a/python-package/basedosdados/download/metadata.py b/python-package/basedosdados/download/metadata.py
index fe5d5dff9..9aae894ba 100644
--- a/python-package/basedosdados/download/metadata.py
+++ b/python-package/basedosdados/download/metadata.py
@@ -1,24 +1,25 @@
 from google.cloud import bigquery
 import pandas as pd
+import requests

 from basedosdados.download.base import credentials

-
-def _get_header(text):
-    """Gets first paragraph of a text
-
-    Args:
-        text (str or None): Text to be split
-
-    Returns:
-        str: First paragraph
+def _safe_fetch(url: str):
     """
-
-    if isinstance(text, str):
-        return text.split("\n")[0]
-    elif text is None:
-        return ""
-
+    Safely fetches urls and, if something goes wrong, informs the user of the possible cause
+    """
+    try:
+        response = requests.get(url)
+        response.raise_for_status()
+        return response
+    except requests.exceptions.HTTPError as errh:
+        print("Http Error:", errh)
+    except requests.exceptions.ConnectionError as errc:
+        print("Error Connecting:", errc)
+    except requests.exceptions.Timeout as errt:
+        print("Timeout Error:", errt)
+    # RequestException is the base class of the exceptions above, so it must
+    # come last or the specific handlers are unreachable
+    except requests.exceptions.RequestException as err:
+        print("This url doesn't appear to exist:", err)


 def _fix_size(s, step=80):
@@ -32,7 +33,6 @@

 def _print_output(df):
     """Prints dataframe contents as print blocks
-
     Args:
         df (pd.DataFrame): table to be printed
     """
@@ -46,14 +46,6 @@ def _print_output(df):
         print("-" * (step + 15))
         print()

-    # func = lambda lista, final, step: (
-    #     func(lista[1:],
-    #         (final + lista[0] + ' ')
-    #         if len(final.split('\n')[-1]) <= step
-    #         else final + '\n',
-    #         step
-    #     ) if len(lista) else final)
-

 def _handle_output(verbose, output_type, df, col_name=None):
     """Handles datasets and tables listing outputs based on user's choice.
@@ -91,234 +83,226 @@ def _handle_output(verbose, output_type, df, col_name=None):

     return None

-
-def list_datasets(
-    query_project_id="basedosdados",
-    filter_by=None,
-    with_description=False,
-    from_file=False,
-    verbose=True,
-):
-    """Fetch the dataset_id of datasets available at query_project_id. Prints information on
-    screen or returns it as a list.
+def list_datasets(query, limit=10, with_description=False, verbose=True):
+    """
+    This function uses the `bd_dataset_search` website API
+    endpoint to retrieve a list of available datasets.

     Args:
-        query_project_id (str): Optional.
-            Which project the table lives. You can change this you want to query different projects.
-        filter_by (str): Optional
-            String to be matched in dataset_id.
+        query (str):
+            String to search in datasets' metadata.
+        limit (int):
+            Field to limit the number of results.
         with_description (bool): Optional
             If True, fetch short dataset description for each dataset.
         verbose (bool): Optional.
             If set to True, information is printed to the screen. If set to False,
             a list object is returned.
-
-    Example:
-        list_datasets(
-            filter_by='sp',
-            with_description=True,
-        )
+    Returns:
+        list | stdout
     """
-    client = bigquery.Client(
-        credentials=credentials(from_file=from_file), project=query_project_id
-    )
-
-    datasets_list = list(client.list_datasets())
-
-    datasets = pd.DataFrame(
-        [dataset.dataset_id for dataset in datasets_list], columns=["dataset_id"]
-    )
-
-    if filter_by:
-
-        datasets = datasets.loc[datasets["dataset_id"].str.contains(filter_by)]
-
-    if with_description:
-
-        datasets["description"] = [
-            _get_header(client.get_dataset(dataset).description)
-            for dataset in datasets["dataset_id"]
+    url = f"https://basedosdados.org/api/3/action/bd_dataset_search?q={query}&page_size={limit}&resource_type=bdm_table"
+
+    response = _safe_fetch(url)
+
+    json_response = response.json()
+
+    # this dict has all the information we need to build the function's output
+    dataset_dict = {
+        "dataset_id": [
+            dataset["name"] for dataset in json_response["result"]["datasets"]
+        ],
+        "description": [
+            dataset["notes"] if "notes" in dataset.keys() else None
+            for dataset in json_response["result"]["datasets"]
+        ],
+    }
+
+    # select the desired output using dataset_id info. Note that the output is
+    # either a standardized string or a list
+    if verbose & (with_description == False):
+        return _print_output(pd.DataFrame.from_dict(dataset_dict)[["dataset_id"]])
+    elif verbose & with_description:
+        return _print_output(
+            pd.DataFrame.from_dict(dataset_dict)[["dataset_id", "description"]]
+        )
+    elif (verbose == False) & (with_description == False):
+        return dataset_dict["dataset_id"]
+    elif (verbose == False) & with_description:
+        return [
+            {
+                "dataset_id": dataset_dict["dataset_id"][k],
+                "description": dataset_dict["description"][k],
+            }
+            for k in range(len(dataset_dict["dataset_id"]))
         ]

-    return _handle_output(
-        verbose=verbose,
-        output_type="list",
-        df=datasets,
-        col_name="dataset_id",
-    )


 def list_dataset_tables(
     dataset_id,
-    query_project_id="basedosdados",
-    from_file=False,
-    filter_by=None,
     with_description=False,
     verbose=True,
 ):
-    """Fetch table_id for tables available at the specified dataset_id. Prints the information
-    on screen or returns it as a list.
+    """
+    Fetch table_id for tables available at the specified dataset_id. Prints the information on screen or returns it as a list.

     Args:
         dataset_id (str): Optional.
-            Dataset id available in basedosdados.
-        query_project_id (str): Optional.
-            Which project the table lives. You can change this you want to query different projects.
-        filter_by (str): Optional
-            String to be matched in the table_id.
+            Dataset id returned by the list_datasets function.
         with_description (bool): Optional
             If True, fetch short table descriptions for each table that match the search criteria.
         verbose (bool): Optional.
             If set to True, information is printed to the screen. If set to False,
             a list object is returned.
-    Example:
-        list_dataset_tables(
-            dataset_id='br_ibge_censo2010'
-            filter_by='renda',
-            with_description=True,
-        )
+    Returns:
+        stdout | list
     """
-    client = bigquery.Client(
-        credentials=credentials(from_file=from_file), project=query_project_id
-    )
-
-    dataset = client.get_dataset(dataset_id)
-
-    tables_list = list(client.list_tables(dataset))
-    tables = pd.DataFrame(
-        [table.table_id for table in tables_list], columns=["table_id"]
-    )
-
-    if filter_by:
-
-        tables = tables.loc[tables["table_id"].str.contains(filter_by)]
-
-    if with_description:
-
-        tables["description"] = [
-            _get_header(client.get_table(f"{dataset_id}.{table}").description)
-            for table in tables["table_id"]
+    # The dataset_id pattern in the bd_dataset_search endpoint response uses a
+    # hyphen as a separator, while the endpoint urls that take a dataset_id
+    # parameter use an underscore. See issue #1079
+    dataset_id = dataset_id.replace("-", "_")
+
+    url = f"https://basedosdados.org/api/3/action/bd_bdm_dataset_show?dataset_id={dataset_id}"
+
+    response = _safe_fetch(url)
+
+    json_response = response.json()
+
+    dataset = json_response["result"]
+
+    # this dict has all the information needed to build the function's output
+    table_dict = {
+        "table_id": [
+            dataset["resources"][k]["name"] for k in range(len(dataset["resources"]))
+        ],
+        "description": [
+            dataset["resources"][k]["description"]
+            for k in range(len(dataset["resources"]))
+        ],
+    }
+
+    # select the desired output using table_id info. Note that the output is
+    # either a standardized string or a list
+    if verbose & (with_description == False):
+        return _print_output(pd.DataFrame.from_dict(table_dict)[["table_id"]])
+    elif verbose & with_description:
+        return _print_output(
+            pd.DataFrame.from_dict(table_dict)[["table_id", "description"]]
+        )
+    elif (verbose == False) & (with_description == False):
+        return table_dict["table_id"]
+    elif (verbose == False) & with_description:
+        return [
+            {
+                "table_id": table_dict["table_id"][k],
+                "description": table_dict["description"][k],
+            }
+            for k in range(len(table_dict["table_id"]))
         ]

-    return _handle_output(
-        verbose=verbose,
-        output_type="list",
-        df=tables,
-        col_name="table_id",
-    )


 def get_dataset_description(
-    dataset_id=None,
-    query_project_id="basedosdados",
-    from_file=False,
+    dataset_id,
     verbose=True,
 ):
-    """Prints the full dataset description.
+    """
+    Prints the full dataset description.

     Args:
-        dataset_id (str): Optional.
-            Dataset id available in basedosdados.
-        query_project_id (str): Optional.
-            Which project the table lives. You can change this you want to query different projects.
+        dataset_id (str): Required.
+            Dataset id available in list_datasets.
         verbose (bool): Optional.
             If set to True, information is printed to the screen. If set to False,
             data is returned as a `str`.
+
+    Returns:
+        stdout | str
     """
+    url = f"https://basedosdados.org/api/3/action/bd_bdm_dataset_show?dataset_id={dataset_id}"

-    client = bigquery.Client(
-        credentials=credentials(from_file=from_file), project=query_project_id
-    )
+    response = _safe_fetch(url)

-    dataset = client.get_dataset(dataset_id)
+    json_response = response.json()

-    return _handle_output(verbose=verbose, output_type="str", df=dataset)
+    description = json_response["result"]["notes"]
+
+    if verbose:
+        print(description)
+    else:
+        return description


 def get_table_description(
-    dataset_id=None,
-    table_id=None,
-    query_project_id="basedosdados",
-    from_file=False,
+    dataset_id,
+    table_id,
     verbose=True,
 ):
-    """Prints the full table description.
+    """
+    Prints the full table description.

     Args:
-        dataset_id (str): Optional.
-            Dataset id available in basedosdados. It should always come with table_id.
-        table_id (str): Optional.
-            Table id available in basedosdados.dataset_id.
-            It should always come with dataset_id.
-        query_project_id (str): Optional.
-            Which project the table lives. You can change this you want to query different projects.
+        dataset_id (str): Required.
+            Dataset id available in list_datasets.
+        table_id (str): Required.
+            Table id available in list_dataset_tables.
         verbose (bool): Optional.
             If set to True, information is printed to the screen. If set to False,
             data is returned as a `str`.
+
+    Returns:
+        stdout | str
     """
+    url = f"https://basedosdados.org/api/3/action/bd_bdm_table_show?dataset_id={dataset_id}&table_id={table_id}"
+
+    response = _safe_fetch(url)

-    client = bigquery.Client(
-        credentials=credentials(from_file=from_file), project=query_project_id
-    )
+    json_response = response.json()

-    table = client.get_table(f"{dataset_id}.{table_id}")
+    description = json_response["result"]["description"]

-    return _handle_output(verbose=verbose, output_type="str", df=table)
+    if verbose:
+        print(description)
+    else:
+        return description


 def get_table_columns(
-    dataset_id=None,
-    table_id=None,
-    query_project_id="basedosdados",
-    from_file=False,
+    dataset_id,
+    table_id,
     verbose=True,
 ):
-    """Fetch the names, types and descriptions for the columns in the specified table. Prints
-    information on screen.
-
+    """
+    Fetch the names, types and descriptions for the columns in the specified table. Prints
+    information on screen.

     Args:
-        dataset_id (str): Optional.
-            Dataset id available in basedosdados. It should always come with table_id.
-        table_id (str): Optional.
-            Table id available in basedosdados.dataset_id.
-            It should always come with dataset_id.
-        query_project_id (str): Optional.
-            Which project the table lives. You can change this you want to query different projects.
+        dataset_id (str): Required.
+            Dataset id available in list_datasets.
+        table_id (str): Required.
+            Table id available in list_dataset_tables.
         verbose (bool): Optional.
             If set to True, information is printed to the screen. If set to False,
             data is returned as a `list` of `dict`s.
-    Example:
-        get_table_columns(
-            dataset_id='br_ibge_censo2010',
-            table_id='pessoa_renda_setor_censitario'
-        )
+
+    Returns:
+        stdout | list
     """
-    client = bigquery.Client(
-        credentials=credentials(from_file=from_file), project=query_project_id
-    )
+    url = f"https://basedosdados.org/api/3/action/bd_bdm_table_show?dataset_id={dataset_id}&table_id={table_id}"

-    table_ref = client.get_table(f"{dataset_id}.{table_id}")
+    response = _safe_fetch(url)

-    columns = [
-        (field.name, field.field_type, field.description) for field in table_ref.schema
-    ]
+    json_response = response.json()

-    description = pd.DataFrame(columns, columns=["name", "field_type", "description"])
+    columns = json_response["result"]["columns"]

-    return _handle_output(verbose=verbose, output_type="records", df=description)
+    if verbose:
+        _print_output(pd.DataFrame(columns))
+    else:
+        return columns


 def get_table_size(
     dataset_id,
     table_id,
-    billing_project_id,
-    query_project_id="basedosdados",
-    from_file=False,
     verbose=True,
 ):
-    """Use a query to get the number of rows and size (in Mb) of a table query
-    from BigQuery. Prints information on screen in markdown friendly format.
+    """Gets the number of rows and size (in Mb) of a table, when available, from its metadata.

     WARNING: this query may cost a lot depending on the table.

@@ -328,41 +312,67 @@ def get_table_size(
         table_id (str): Optional.
             Table id available in basedosdados.dataset_id.
             It should always come with dataset_id.
-        query_project_id (str): Optional.
-            Which project the table lives. You can change this you want to query different projects.
-        billing_project_id (str): Optional.
-            Project that will be billed. Find your Project ID here https://console.cloud.google.com/projectselector2/home/dashboard
         verbose (bool): Optional.
             If set to True, information is printed to the screen. If set to False,
             data is returned as a `list` of `dict`s.
-        Example:
-            get_table_size(
-                dataset_id='br_ibge_censo2010',
-                table_id='pessoa_renda_setor_censitario',
-                billing_project_id='yourprojectid'
-            )
     """
-    billing_client = bigquery.Client(
-        credentials=credentials(from_file=from_file), project=billing_project_id
-    )
+    url = f"https://basedosdados.org/api/3/action/bd_bdm_table_show?dataset_id={dataset_id}&table_id={table_id}"

-    query = f"""SELECT COUNT(*) FROM {query_project_id}.{dataset_id}.{table_id}"""
+    response = _safe_fetch(url)

-    job = billing_client.query(query, location="US")
+    json_response = response.json()

-    num_rows = job.to_dataframe().loc[0, "f0_"]
+    size = json_response["result"]["size"]

-    size_mb = round(job.total_bytes_processed / 1024 / 1024, 2)
+    if size is None:
+        print("Size not available")
+    else:
+        if verbose:
+            _print_output(pd.DataFrame(size))
+        else:
+            return size


+def search(query, order_by):
+    """This function works as a wrapper to the `bd_dataset_search` website API
+    endpoint.

-    table_data = pd.DataFrame(
-        [
-            {
-                "project_id": query_project_id,
-                "dataset_id": dataset_id,
-                "table_id": table_id,
-                "num_rows": num_rows,
-                "size_mb": size_mb,
-            }
-        ]
-    )

     Args:
         query (str):
             String to search in datasets and tables' metadata.
         order_by (str): score|popular|recent
             Field by which the results will be ordered.

+    Returns:
+        pd.DataFrame:
+            Response from the API presented as a pandas DataFrame. Each row is
+            a table. Each column is a field identifying the table.
     """

+    # validate order_by input
+    if order_by not in ["score", "popular", "recent"]:
+        raise ValueError(
+            f'order_by must be score, popular or recent. Received "{order_by}"'
+        )
-    return _handle_output(verbose=verbose, output_type="records", df=table_data)
+    url = f"https://basedosdados.org/api/3/action/bd_dataset_search?q={query}&order_by={order_by}&resource_type=bdm_table"
+
+    response = _safe_fetch(url)
+
+    json_response = response.json()
+
+    dataset_dfs = []
+    # the first loop iterates over the datasets returned by the API
+    for dataset in json_response["result"]["datasets"]:
+        tables_dfs = []
+        # the second loop extracts the tables' information for each dataset
+        for table in dataset["resources"]:
+            data_table = pd.DataFrame(
+                {k: str(table[k]) for k in list(table.keys())}, index=[0]
+            )
+            tables_dfs.append(data_table)
+        # append the tables' dataframes for each dataset
+        data_ds = tables_dfs[0].append(tables_dfs[1:]).reset_index(drop=True)
+        dataset_dfs.append(data_ds)
+    # append the datasets' dataframes
+    df = dataset_dfs[0].append(dataset_dfs[1:]).reset_index(drop=True)
+
+    return df
\ No newline at end of file
diff --git a/python-package/tests/test_download/test_metadata.py b/python-package/tests/test_download/test_metadata.py
index 048fcbd08..d6788de83 100644
--- a/python-package/tests/test_download/test_metadata.py
+++ b/python-package/tests/test_download/test_metadata.py
@@ -1,9 +1,9 @@
-from os import read
 import pytest
 from pathlib import Path
 import pandas as pd
 from pandas_gbq.gbq import GenericGBQException
 import shutil
+import requests

 from basedosdados import (
     list_datasets,
@@ -12,110 +12,114 @@
     get_table_description,
     get_table_columns,
     get_table_size,
+    search,
 )
+from basedosdados.download.metadata import _safe_fetch

 TEST_PROJECT_ID = "basedosdados-dev"
 SAVEFILE = Path(__file__).parent / "tmp_bases" / "test.csv"
 SAVEPATH = Path(__file__).parent / "tmp_bases"
 shutil.rmtree(SAVEPATH, ignore_errors=True)

+def test_list_datasets_simple_verbose(capsys):

-def test_list_datasets(capsys):
-
-    list_datasets(from_file=True)
+    out = list_datasets(
+        query="trabalho", limit=10, with_description=False, verbose=True
+    )
     out, err = capsys.readouterr()  # Capture prints
     assert "dataset_id" in out
+    # check input error
+    with pytest.raises(ValueError):
+        search(query="trabalho", order_by="name")

+def test_list_datasets_simple_list():

-def test_list_datasets_complete(capsys):
+    out = list_datasets(query="", limit=12, with_description=False, verbose=False)
+    # check if function returns list
+    assert isinstance(out, list)
+    assert len(out) == 12

-    list_datasets(with_description=True, filter_by="ibge", from_file=True)
+
+def test_list_datasets_complete_list():
+
+    out = list_datasets(
+        query="trabalho", limit=12, with_description=True, verbose=False
+    )
+    # check if function returns list
+    assert isinstance(out, list)
+    assert "dataset_id" in out[0].keys()
+    assert "description" in out[0].keys()
+
+
+def test_list_datasets_complete_verbose(capsys):
+
+    list_datasets(query="trabalho", limit=10, with_description=True, verbose=True)
     out, err = capsys.readouterr()  # Capture prints
     assert "dataset_id" in out
     assert "description" in out


-def test_list_datasets_all_descriptions(capsys):
+def test_list_dataset_tables_simple_verbose(capsys):

-    list_datasets(with_description=True, from_file=True)
+    list_dataset_tables(dataset_id="br_me_caged", with_description=False, verbose=True)
     out, err = capsys.readouterr()  # Capture prints
-    assert len(out) > 0
+    assert "table_id" in out


-def test_list_datasets_verbose_false():
+def test_list_dataset_tables_simple_list():
+
+    out = list_dataset_tables(
+        dataset_id="br_me_caged", with_description=False, verbose=False
+    )

-    out = list_datasets(from_file=True, verbose=False)
     assert type(out) == list
     assert len(out) > 0


-def test_list_dataset_tables(capsys):
-
-    list_dataset_tables(dataset_id="br_ibge_censo_demografico", from_file=True)
-    out, err = capsys.readouterr()  # Capture prints
-    assert "table_id" in out
+def test_list_dataset_tables_complete_verbose(capsys):

+    list_dataset_tables(dataset_id="br_me_caged", with_description=True, verbose=True)

-def test_list_dataset_tables_complete(capsys):
-
-    list_dataset_tables(
-        dataset_id="br_ibge_censo_demografico",
-        filter_by="renda",
-        with_description=True,
-        from_file=True,
-    )
     out, err = capsys.readouterr()  # Capture prints
     assert "table_id" in out
     assert "description" in out
-    assert "renda" in out


-def test_list_dataset_tables_all_descriptions(capsys):
-    list_dataset_tables(
-        dataset_id="br_ibge_censo_demografico", with_description=True, from_file=True
-    )
-    out, err = capsys.readouterr()  # Capture prints
-    assert len(out) > 0
-
-
-def test_list_dataset_tables_verbose_false():
+def test_list_dataset_tables_complete_list():

     out = list_dataset_tables(
-        dataset_id="br_ibge_censo_demografico", from_file=True, verbose=False
+        dataset_id="br_me_caged", with_description=True, verbose=False
     )
+
     assert type(out) == list
-    assert len(out) > 0
+    assert type(out[0]) == dict


 def test_get_dataset_description(capsys):

-    get_dataset_description("br_ibge_censo_demografico", from_file=True)
+    get_dataset_description("br_me_caged", verbose=True)
     out, err = capsys.readouterr()  # Capture prints
     assert len(out) > 0


 def test_get_dataset_description_verbose_false():

-    out = get_dataset_description(
-        "br_ibge_censo_demografico", from_file=True, verbose=False
-    )
+    out = get_dataset_description("br_me_caged", verbose=False)
     assert type(out) == str
     assert len(out) > 0


 def test_get_table_description(capsys):

-    get_table_description(
-        "br_ibge_censo_demografico", "setor_censitario_basico_2010", from_file=True
-    )
+    get_table_description("br_me_caged", "microdados_antigos")
     out, err = capsys.readouterr()  # Capture prints
     assert len(out) > 0


 def test_get_table_description_verbose_false():

     out = get_table_description(
-        dataset_id="br_ibge_censo_demografico",
-        table_id="setor_censitario_basico_2010",
-        from_file=True,
+        dataset_id="br_me_caged",
+        table_id="microdados_antigos",
         verbose=False,
     )
     assert type(out) == str
@@ -124,46 +128,46 @@ def test_get_table_description_verbose_false():

 def test_get_table_columns(capsys):

     get_table_columns(
-        dataset_id="br_ibge_censo_demografico",
-        table_id="setor_censitario_basico_2010",
-        from_file=True,
+        dataset_id="br_me_caged",
+        table_id="microdados_antigos",
     )
     out, err = capsys.readouterr()  # Capture prints
     assert "name" in out
-    assert "field_type" in out
     assert "description" in out


 def test_get_table_columns_verbose_false():

     out = get_table_columns(
-        dataset_id="br_ibge_censo_demografico",
-        table_id="setor_censitario_basico_2010",
-        from_file=True,
+        dataset_id="br_me_caged",
+        table_id="microdados_antigos",
         verbose=False,
     )
     assert type(out) == list
     assert len(out) > 0

+def test_search():
+    out = search(query="agua", order_by="score")
+    # check if function returns pd.DataFrame
+    assert isinstance(out, pd.DataFrame)
+    # check that there are no duplicate tables in the result
+    assert out.id.nunique() == out.shape[0]
+    # check input error
+    with pytest.raises(ValueError):
+        search(query="agua", order_by="name")

 def test_get_table_size(capsys):
     get_table_size(
         dataset_id="br_ibge_censo_demografico",
         table_id="setor_censitario_basico_2010",
-        billing_project_id=TEST_PROJECT_ID,
-        from_file=True,
     )
     out, err = capsys.readouterr()
-    assert "num_rows" in out
-    assert "size_mb" in out
+    assert "not available" in out

+def test__safe_fetch(capsys):

-def test_get_table_size_verbose_false():
-    out = get_table_size(
-        dataset_id="br_ibge_censo_demografico",
-        table_id="setor_censitario_basico_2010",
-        billing_project_id=TEST_PROJECT_ID,
-        from_file=True,
-        verbose=False,
-    )
-    assert type(out) == list
-    assert len(out) > 0
+    _safe_fetch("https://www.lkajsdhgfal.com.br")
+    out, err = capsys.readouterr()  # Capture prints
+    assert "HTTPSConnection" in out
+
+    response = _safe_fetch(
+        "https://basedosdados.org/api/3/action/bd_dataset_search?q=agua&page_size=10&resource_type=bdm_table"
+    )
+    assert type(response.json()) == dict
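
Reviewer note: below is a minimal usage sketch of the API surface this patch introduces,
for trying the new CKAN-backed functions locally. The dataset and table ids
("br_me_caged", "microdados_antigos") and the query strings are illustrative values
taken from the tests above, and the snippet assumes the exports added to __init__.py
in this patch:

    import basedosdados as bd

    # keyword search over dataset metadata; verbose=False returns a plain list
    datasets = bd.list_datasets(query="caged", limit=5, verbose=False)

    # tables of one dataset, with short descriptions, as a list of dicts
    tables = bd.list_dataset_tables(
        dataset_id="br_me_caged", with_description=True, verbose=False
    )

    # full descriptions and column metadata come from the same bd_bdm_* endpoints
    description = bd.get_dataset_description("br_me_caged", verbose=False)
    columns = bd.get_table_columns(
        dataset_id="br_me_caged", table_id="microdados_antigos", verbose=False
    )

    # search() wraps bd_dataset_search and returns one DataFrame row per table
    df = bd.search(query="agua", order_by="score")
    print(df.head())

None of these calls require GCP credentials, which is the point of the change: listing
and describing datasets now goes through the website's CKAN API instead of BigQuery.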