diff --git a/.trunk/configs/cspell.json b/.trunk/configs/cspell.json index c745caa..9667033 100644 --- a/.trunk/configs/cspell.json +++ b/.trunk/configs/cspell.json @@ -40,7 +40,8 @@ "dtypes", "isin", "pydantic", - "getfixturevalue" + "getfixturevalue", + "isna" ], "flagWords": ["hte"], "suggestionsTimeout": 5000 diff --git a/gcf_data_mapper/enums/family.py b/gcf_data_mapper/enums/family.py new file mode 100644 index 0000000..d4baff9 --- /dev/null +++ b/gcf_data_mapper/enums/family.py @@ -0,0 +1,33 @@ +from enum import Enum + + +class FamilyColumnsNames(Enum): + """The fields the GCF data mapper needs to parse family data/ metadata.""" + + APPROVED_REF = "ApprovedRef" + COUNTRIES = "Countries" + ENTITIES = "Entities" + FUNDING = "Funding" + PROJECT_URL = "ProjectURL" + PROJECTS_ID = "ProjectsID" + RESULT_AREAS = "ResultAreas" + SECTOR = "Sector" + THEME = "Theme" + + +class FamilyNestedColumnNames(Enum): + """The fields the GCF data mapper needs to parse nested family data/ metadata.""" + + AREA = "Area" + BUDGET = "BudgetUSDeq" + NAME = "Name" + REGION = "Region" + SOURCE = "Source" + TYPE = "Type" + + +class GCFProjectBudgetSource(Enum): + """The source of financing for the project's budget funding""" + + CO_FINANCING = "Co-Financing" + GCF = "GCF" diff --git a/gcf_data_mapper/parsers/family.py b/gcf_data_mapper/parsers/family.py index a0518c9..62c13dd 100644 --- a/gcf_data_mapper/parsers/family.py +++ b/gcf_data_mapper/parsers/family.py @@ -3,6 +3,129 @@ import click import pandas as pd +from gcf_data_mapper.enums.family import ( + FamilyColumnsNames, + FamilyNestedColumnNames, + GCFProjectBudgetSource, +) +from gcf_data_mapper.parsers.helpers import ( + arrays_contain_empty_values, + row_contains_columns_with_empty_values, + verify_required_fields_present, +) + + +def get_budgets(funding_list: list[dict], source: str) -> list[int]: + """Get the budget amount from the row based on the funding source. 
+ + :param list[dict] funding_list: A list of all the funding information, represented in dictionaries + :param str source: The funding source to retrieve the budget from. + + :return list[int]: A list of budget amounts corresponding to the source, + or [0] if the source is not found. + """ + + budget_key = FamilyNestedColumnNames.BUDGET.value + source_key = FamilyNestedColumnNames.SOURCE.value + + budgets = [ + funding[budget_key] for funding in funding_list if funding[source_key] == source + ] + + # Where we have projects which have been solely funded by the fund (GCF), or solely co-financed + # - so in instances where there will be no funding that match either the GCF or co-financing + # source value, we will map the `project_value_fund_spend` or the `project_value_co_financing` + # as an array with 0 i.e [0] + return budgets if budgets else [0] + + +def map_family_metadata(row: pd.Series) -> Optional[dict]: + """Map the metadata of a family based on the provided row. + + :param pd.Series row: The row containing family information. + :return Optional[dict]: A dictionary containing mapped metadata for the family.
+ """ + + countries = row.at[FamilyColumnsNames.COUNTRIES.value] + entities = row.at[FamilyColumnsNames.ENTITIES.value] + funding_sources = row.at[FamilyColumnsNames.FUNDING.value] + result_areas = row.at[FamilyColumnsNames.RESULT_AREAS.value] + + area_key = FamilyNestedColumnNames.AREA.value + name_key = FamilyNestedColumnNames.NAME.value + region_key = FamilyNestedColumnNames.REGION.value + type_key = FamilyNestedColumnNames.TYPE.value + + co_financing_budgets = get_budgets( + funding_sources, GCFProjectBudgetSource.CO_FINANCING.value + ) + gcf_budgets = get_budgets(funding_sources, GCFProjectBudgetSource.GCF.value) + + implementing_agencies = [entity[name_key] for entity in entities] + regions = [country[region_key] for country in countries] + areas = [result[area_key] for result in result_areas] + types = [result[type_key] for result in result_areas] + + # As we are filtering the budget information by source for gcf and co financing, we + # know there will be instances where only one type of funding exists so checking + # for empty/false values would create false positives, hence the exclusion from this + # check + if arrays_contain_empty_values( + [ + ("Implementing Agencies", implementing_agencies), + ("Regions", regions), + ("Result Areas", areas), + ("Result Types", types), + ], + row.at[FamilyColumnsNames.PROJECTS_ID.value], + ): + return None + + metadata = { + "approved_ref": [row.at[FamilyColumnsNames.APPROVED_REF.value]], + "implementing_agencies": list(set(implementing_agencies)), + "project_id": [row.at[FamilyColumnsNames.PROJECTS_ID.value]], + "project_url": [row.at[FamilyColumnsNames.PROJECT_URL.value]], + "project_value_fund_spend": gcf_budgets, + "project_value_co_financing": co_financing_budgets, + "regions": list(set(regions)), + "result_areas": list(set(areas)), + "result_types": list(set(types)), + "sector": [row.at[FamilyColumnsNames.SECTOR.value]], + "theme": [row.at[FamilyColumnsNames.THEME.value]], + } + + return metadata + + +def 
process_row( + row: pd.Series, projects_id: str, required_columns: list[str] +) -> Optional[dict]: + """Map the family data based on the provided row. + + :param pd.Series row: The row containing family information. + :param str projects_id: The id of the current project that is being reformatted/processed + :param list required_columns: The list of required columns that we need to extract the + data from in the project + :return Optional[dict]: A dictionary containing mapped data for the family entity. + The function will return None, if the row contains missing data from expected columns/fields + """ + + if pd.isna(projects_id) or bool(projects_id) is False: + click.echo("🛑 Skipping row as it does not contain a project id") + return None + + if row_contains_columns_with_empty_values(row, required_columns): + click.echo( + f"🛑 Skipping row as it contains empty column values: See Project ID {projects_id}" + ) + return None + + # TODO: Map family data + return { + "metadata": map_family_metadata(row), + } + def family(projects_data: pd.DataFrame, debug: bool) -> list[Optional[dict[str, Any]]]: """Map the GCF family info to new structure. @@ -14,7 +137,17 @@ def family(projects_data: pd.DataFrame, debug: bool) -> list[Optional[dict[str, the 'destination' format described in the GCF Data Mapper Google Sheet. 
""" + if debug: click.echo("📝 Wrangling GCF family data.") - return [] + mapped_families = [] + + required_fields = set(str(e.value) for e in FamilyColumnsNames) + verify_required_fields_present(projects_data, required_fields) + + for _, row in projects_data.iterrows(): + projects_id = row.at[FamilyColumnsNames.PROJECTS_ID.value] + mapped_families.append(process_row(row, projects_id, list(required_fields))) + + return mapped_families diff --git a/gcf_data_mapper/parsers/helpers.py b/gcf_data_mapper/parsers/helpers.py index d994e00..1f1f471 100644 --- a/gcf_data_mapper/parsers/helpers.py +++ b/gcf_data_mapper/parsers/helpers.py @@ -1,3 +1,4 @@ +import click import pandas as pd @@ -8,7 +9,6 @@ def verify_required_fields_present( :param pd.DataFrame data: The DataFrame to check. :param set[str] required_fields: The required DataFrame columns. - :param bool debug: Whether debug mode is on. :raise AttributeError if any of the required fields are missing. :return bool: True if the DataFrame contains the required fields. """ @@ -17,12 +17,55 @@ def verify_required_fields_present( if diff == set(): return True + # sets are naturally un-ordered, sorting them means we can test the error message reliably + sorted_diff = sorted(diff) + sorted_cols = sorted(cols) + raise AttributeError( - f"Required fields '{str(diff)}' not present in df columns '" - f"{cols if cols != set() else r'{}'}'" + f"Required fields {sorted_diff} not present in df columns {sorted_cols}" ) def check_required_column_value_not_na(row: pd.Series, column_enum) -> bool: """Check if the row contains valid document column values (not NA).""" return all(pd.notna(row[column.value]) for column in column_enum) + + +def row_contains_columns_with_empty_values( + row: pd.Series, required_columns: list[str] +) -> bool: + """Check that all required values in the given row are not empty (isna). + + :param pd.Series row: The row to check for isna values. 
+ :param list[str] required_columns: A list of column names that will be used to verify + isna values. + :return bool: True if the row contains columns with empty values, False if all + expected columns are populated + """ + + # Ensure we are working with a pandas Series by re-selecting the required columns as a Series + row_subset = pd.Series(row[required_columns], index=required_columns) + + if row_subset.isna().any(): + return True + return False + + +def arrays_contain_empty_values(list_values: list[tuple], id: str) -> bool: + """Checks the list in a tuple for empty (falsy) values; {}, [], None, '' + + :param list[tuple] list_values: A list of tuples containing the name and array of values + :param str id: The ID of the project to include in message that we echo to the terminal. + :return bool: True if any list contains empty values, False otherwise. + """ + arrays_with_empty_values = [ + name for name, array in list_values if any(not value for value in array) + ] + + if arrays_with_empty_values: + click.echo( + f"🛑 The following lists contain empty values: {', '.join(sorted(arrays_with_empty_values))}. Projects ID {id}" + ) + return True + + return False diff --git a/pyproject.toml b/pyproject.toml index e921ea0..77775af 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "gcf-data-mapper" -version = "0.1.9" +version = "0.1.10" description = "A CLI tool to wrangle GCF data into format recognised by the bulk-import tool."
authors = ["CPR-dev-team "] license = "Apache-2.0" diff --git a/tests/unit_tests/parsers/family/conftest.py b/tests/unit_tests/parsers/family/conftest.py new file mode 100644 index 0000000..f7df896 --- /dev/null +++ b/tests/unit_tests/parsers/family/conftest.py @@ -0,0 +1,61 @@ +import pandas as pd +import pytest + +from gcf_data_mapper.enums.family import FamilyColumnsNames, FamilyNestedColumnNames + + +@pytest.fixture() +def test_family_doc_df(): + yield pd.DataFrame( + [ + { + "ProjectsID": 12660, + "ApprovedRef": "FP003", + "ProjectName": "Enhancing resilience of coastal ecosystems and communities", + "Theme": "Adaptation", + "Sector": "Environment", + "ProjectURL": "https://www.climateaction.fund/project/FP003", + "Countries": [ + { + "CountryName": "Bangladesh", + "ISO3": "BGD", + "Region": "Asia", + }, + ], + "Entities": [ + { + "Name": "Green Innovations", + } + ], + "Funding": [ + { + "Source": "GCF", + "Budget": 9200000, + "BudgetUSDeq": 9200000, + }, + { + "ProjectBudgetID": 412, + "Source": "Co-Financing", + "Budget": 620000, + "BudgetUSDeq": 620000, + }, + ], + "ResultAreas": [ + { + "Area": "Coastal protection and restoration", + "Type": "Adaptation", + }, + ], + } + ] + ) + + +@pytest.fixture() +def required_family_columns(): + required_columns = [column.value for column in FamilyColumnsNames] + required_nested_family_columns = [ + column.value for column in FamilyNestedColumnNames + ] + + return required_columns, required_nested_family_columns diff --git a/tests/unit_tests/parsers/family/test_family.py b/tests/unit_tests/parsers/family/test_family.py index d4ce04c..ef532a5 100644 --- a/tests/unit_tests/parsers/family/test_family.py +++ b/tests/unit_tests/parsers/family/test_family.py @@ -1,9 +1,191 @@ +import pandas as pd import pytest -from gcf_data_mapper.parsers.family import family +from gcf_data_mapper.parsers.family import family, get_budgets, process_row -@pytest.mark.parametrize("debug", [True, False]) -def test_returns_empty(test_df, debug: 
bool): - family_data = family(test_df, debug) - assert family_data == [] +@pytest.fixture +def parsed_family_data(): + return [ + { + "metadata": { + "approved_ref": ["FP003"], + "implementing_agencies": ["Green Innovations"], + "project_id": [12660], + "project_url": ["https://www.climateaction.fund/project/FP003"], + "project_value_fund_spend": [9200000], + "project_value_co_financing": [620000], + "regions": ["Asia"], + "result_areas": ["Coastal protection and restoration"], + "result_types": ["Adaptation"], + "sector": ["Environment"], + "theme": ["Adaptation"], + } + } + ] + + +def test_returns_expected_family_data_structure( + test_family_doc_df: pd.DataFrame, parsed_family_data: list[dict] +): + family_data = family(test_family_doc_df, debug=True) + assert family_data != [] + assert len(family_data) == len(parsed_family_data) + assert family_data == parsed_family_data + + +def test_raises_error_on_validating_row_for_missing_columns(): + test_data_frame = pd.DataFrame( + [ + { + "ApprovedRef": "approved_ref", + "Entities": "Fake Entity", + "Funding": [{"Source": "GCF"}], + "ProjectURL": "www.fake-url.com", + "ProjectsID": 100, + "ResultAreas": [{"Area": "Coastal"}], + } + ] + ) + + expected_error_message = "Required fields ['Countries', 'Sector', 'Theme'] not present in df columns ['ApprovedRef', 'Entities', 'Funding', 'ProjectURL', 'ProjectsID', 'ResultAreas']" + with pytest.raises(AttributeError) as e: + family(test_data_frame, debug=True) + assert expected_error_message == str(e.value) + + +@pytest.mark.parametrize( + ("funding_list, source, expected_value"), + [ + ( + [ + { + "Source": "GCF", + "Budget": 1000, + "BudgetUSDeq": 2000, + }, + { + "Source": "Co-Financing", + "Budget": 1000, + "BudgetUSDeq": 2000, + }, + ], + "GCF", + [2000], + ), + ( + [ + { + "Source": "GCF", + "Budget": 1000, + "BudgetUSDeq": 2000, + }, + { + "Source": "Co-Financing", + "Budget": 1000, + "BudgetUSDeq": 2000, + }, + { + "Source": "Co-Financing", + "Budget": 2000, + 
"BudgetUSDeq": 4000, + }, + ], + "Co-Financing", + [2000, 4000], + ), + ( + [ + { + "Source": "Co-Financing", + "Budget": 1000, + "BudgetUSDeq": 2000, + }, + { + "Source": "Co-Financing", + "Budget": 2000, + "BudgetUSDeq": 4000, + }, + ], + "GCF", + [0], + ), + ], +) +def test_returns_expected_value_when_parsing_budget_data( + funding_list: list[dict], source: str, expected_value: list[int] +): + budgets = get_budgets(funding_list, source) + assert budgets == expected_value + + +@pytest.mark.parametrize( + ("test_ds,expected_return,error_message"), + [ + ( + pd.Series( + { + "ApprovedRef": pd.NA, + "Countries": pd.NA, + "Entities": pd.NA, + "Funding": [{"Source": "GCF"}], + "ProjectURL": "www.fake-url.com", + "ProjectsID": 100, + "ResultAreas": [{"Area": "Coastal"}], + "Sector": "TestSector", + "Theme": "TestTheme", + } + ), + None, + "🛑 Skipping row as it contains empty column values: See Project ID 100", + ), + ( + pd.Series( + { + "ApprovedRef": pd.NA, + "Countries": pd.NA, + "Entities": pd.NA, + "Funding": [{"Source": "GCF"}], + "ProjectURL": "www.fake-url.com", + "ProjectsID": pd.NA, + "ResultAreas": [{"Area": "Coastal"}], + "Sector": "TestSector", + "Theme": "TestTheme", + } + ), + None, + "🛑 Skipping row as it does not contain a project id", + ), + ( + pd.Series( + { + "ApprovedRef": pd.NA, + "Countries": pd.NA, + "Entities": pd.NA, + "Funding": [{"Source": "GCF"}], + "ProjectURL": "www.fake-url.com", + "ProjectsID": "", + "ResultAreas": [{"Area": "Coastal"}], + "Sector": "TestSector", + "Theme": "TestTheme", + } + ), + None, + "🛑 Skipping row as it does not contain a project id", + ), + ], +) +def test_skips_processing_row_if_row_contains_empty_values( + test_ds: pd.Series, + expected_return, + error_message: str, + capsys, + required_family_columns, +): + projects_id = test_ds.ProjectsID + + columns, _ = required_family_columns + expected_return = process_row(test_ds, projects_id, columns) + assert expected_return is None + captured = capsys.readouterr() + 
assert error_message == captured.out.strip() diff --git a/tests/unit_tests/parsers/helpers/test_helpers.py b/tests/unit_tests/parsers/helpers/test_helpers.py new file mode 100644 index 0000000..fd85245 --- /dev/null +++ b/tests/unit_tests/parsers/helpers/test_helpers.py @@ -0,0 +1,157 @@ +import pandas as pd +import pytest + +from gcf_data_mapper.parsers.helpers import ( + arrays_contain_empty_values, + row_contains_columns_with_empty_values, + verify_required_fields_present, +) + + +@pytest.mark.parametrize( + ("test_df", "expected_fields", "expected_error"), + [ + ( + pd.DataFrame( + { + "fruits": ["apple", "banana", "cherry"], + } + ), + set(["fruits", "vegetables"]), + "Required fields ['vegetables'] not present in df columns ['fruits']", + ), + ( + pd.DataFrame(), + set(["cars"]), + "Required fields ['cars'] not present in df columns []", + ), + ], +) +def test_returns_false_when_missing_fields( + test_df: pd.DataFrame, expected_fields: set[str], expected_error: str +): + with pytest.raises(AttributeError) as e: + verify_required_fields_present(test_df, expected_fields) + assert str(e.value) == expected_error + + +@pytest.mark.parametrize( + ("test_df", "expected_fields"), + [ + ( + pd.DataFrame( + { + "fruits": ["date", "elderberry", "fig"], + "vegetables": ["asparagus", "beetroot", "carrot"], + } + ), + set(["fruits", "vegetables"]), + ), + ( + pd.DataFrame( + { + "cars": ["Ford", "Renault", "Audi"], + } + ), + set(["cars"]), + ), + ], +) +def test_returns_true_when_no_missing_fields( + test_df: pd.DataFrame, expected_fields: set[str] +): + return_value = verify_required_fields_present(test_df, expected_fields) + assert return_value is True + + +@pytest.mark.parametrize( + ("test_ds,required_columns,expected_return"), + [ + ( + pd.Series({"Fruit": pd.NA, "Plant": pd.NA, "Tree": "Oak"}), + ["Fruit", "Plant"], + True, + ), + ( + pd.Series({"Fruit": "Apple", "Plant": "Mint", "Tree": "Rosemary"}), + ["Fruit", "Plant"], + False, + ), + ], +) +def 
test_checks_if_there_are_columns_with_empty_values_in_a_given_row( + test_ds: pd.Series, required_columns: list[str], expected_return: bool +): + result = row_contains_columns_with_empty_values(test_ds, required_columns) + assert result == expected_return + + +@pytest.mark.parametrize( + "list_values, project_id, expected_return", + [ + ( + [ + ("Fruits", ["Apple", "Mango"]), + ("Plants", ["Rosemary", "Mint"]), + ("Trees", ["Oak", "Sycamore"]), + ], + "P001", + False, # Function should return False when no empty values + ), + ( + [ + ("Fruits", ["Apple", "Mango"]), + ("Plants", ["", ""]), + ("Trees", ["Oak", "Sycamore"]), + ], + "P002", + True, # Function should return True when there are empty values + ), + ( + [ + ("Fruits", ["Apple", "Mango"]), + ("Plants", ["", ""]), + ("Trees", [""]), + ], + "P003", + True, + ), + ], +) +def test_check_arrays_for_empty_values( + list_values: list, project_id: str, expected_return: bool +): + result = arrays_contain_empty_values(list_values, project_id) + assert result == expected_return + assert type(result) is bool + + +@pytest.mark.parametrize( + "list_values, project_id, expected_output", + [ + ( + [ + ("Fruits", ["Apple", "Mango"]), + ("Plants", ["Rosemary", "Mint"]), + ("Trees", ["Oak", "Sycamore"]), + ], + "P001", + "", # If the array does not contain any empty values we don't expect an output + ), + ( + [ + ("Fruits", ["Apple", "Mango"]), + ("Plants", ["", ""]), + ("Trees", ["Oak", "Sycamore"]), + ], + "P002", + "🛑 The following lists contain empty values: Plants. 
Projects ID P002", + ), + ], +) +def test_check_arrays_for_empty_values_outputs_msg_to_the_cli( + list_values: list, project_id: str, expected_output: str, capsys +): + arrays_contain_empty_values(list_values, project_id) + captured = capsys.readouterr() + assert expected_output == captured.out.strip() diff --git a/tests/unit_tests/parsers/helpers/test_verify_required_fields_present.py b/tests/unit_tests/parsers/helpers/test_verify_required_fields_present.py deleted file mode 100644 index 06f81a9..0000000 --- a/tests/unit_tests/parsers/helpers/test_verify_required_fields_present.py +++ /dev/null @@ -1,60 +0,0 @@ -import pandas as pd -import pytest - -from gcf_data_mapper.parsers.helpers import verify_required_fields_present - - -@pytest.mark.parametrize( - ("test_df", "expected_fields", "expected_error"), - [ - ( - pd.DataFrame( - { - "fruits": ["apple", "banana", "cherry"], - } - ), - set(["fruits", "vegetables"]), - "Required fields '{'vegetables'}' not present in df columns '{'fruits'}'", - ), - ( - pd.DataFrame(), - set(["cars"]), - "Required fields '{'cars'}' not present in df columns '{}'", - ), - ], -) -def test_returns_false_when_missing_fields( - test_df: pd.DataFrame, expected_fields: set[str], expected_error: str -): - with pytest.raises(AttributeError) as e: - verify_required_fields_present(test_df, expected_fields) - assert str(e.value) == expected_error - - -@pytest.mark.parametrize( - ("test_df", "expected_fields"), - [ - ( - pd.DataFrame( - { - "fruits": ["date", "elderberry", "fig"], - "vegetables": ["asparagus", "beetroot", "carrot"], - } - ), - set(["fruits", "vegetables"]), - ), - ( - pd.DataFrame( - { - "cars": ["Ford", "Renault", "Audi"], - } - ), - set(["cars"]), - ), - ], -) -def test_returns_true_when_no_missing_fields( - test_df: pd.DataFrame, expected_fields: set[str] -): - return_value = verify_required_fields_present(test_df, expected_fields) - assert return_value is True