Feature/pdct 1419 map family metadata to metadata key in family json structure (#13)

* WIP: implement mapping for family metadata

* feat: add helper functions

checks whether a row has any required columns with empty values, and whether any required columns are missing from the row

* refactor: update helper function to display columns with empty values

add tests accordingly

* refactor: revert code to display a generic value error message

will revisit to try to fix the pyright typing errors

* feat: update family data mapper

 - add a function to check for value and key existence in nested objects
 - add private methods
 - tidy up docstrings
 - tidy up the function that gets budget information

* refactor: move function into helper file

it can be used more widely across the codebase

* refactor: update required columns to be a set instead of a list

* test: add test for family mapping

* Bump version to 0.1.9

* test: update family test data

* test: update conftest fixture to include fields that we interact with

* bump version to 0.1.10

* refactor: tidy up mapper to limit value errors being raised

instead of raising an error and stopping the tool from running, we can just write the error to stdout

* refactor: update naming to improve readability

* refactor: update our checks for false values

- remove the function that checks for values in nested objects
- we don't want the tool to stop running for malformed data, only when a
  key is missing
- so the helper function now checks for falsy/empty values in a list,
  echoes them out, and we take the report to the client
- we have gone back to Python's normal way of accessing keys in a dict,
  where a KeyError is raised if we try to access something that is not
  there (see the sketch below)
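A minimal sketch of the two behaviours described above (illustrative only, not part of this commit; the data is made up):

# falsy values inside a list are reported, but the tool keeps running
agencies = ["Green Innovations", ""]
if any(not value for value in agencies):
    print("🛑 The following lists contain empty values: Implementing Agencies")

# a missing key, however, surfaces immediately via plain dict access
funding = {"Source": "GCF"}
budget = funding["BudgetUSDeq"]  # raises KeyError: 'BudgetUSDeq'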

* refactor: rename variables and functions to make them more readable

* test: update tests

* feat: add required columns to conftest

* refactor: remove duplicate helper function

* test: fix failing test, update error message

* chore: add context to why we are returning [0] for budgets

---------

Co-authored-by: Osneil Drakes <[email protected]>
3 people authored Sep 9, 2024
1 parent 57dfbe5 commit 4665dd9
Showing 9 changed files with 621 additions and 71 deletions.
3 changes: 2 additions & 1 deletion .trunk/configs/cspell.json
@@ -40,7 +40,8 @@
"dtypes",
"isin",
"pydantic",
"getfixturevalue"
"getfixturevalue",
"isna"
],
"flagWords": ["hte"],
"suggestionsTimeout": 5000
33 changes: 33 additions & 0 deletions gcf_data_mapper/enums/family.py
@@ -0,0 +1,33 @@
from enum import Enum


class FamilyColumnsNames(Enum):
    """The fields the GCF data mapper needs to parse family data/metadata."""

    APPROVED_REF = "ApprovedRef"
    COUNTRIES = "Countries"
    ENTITIES = "Entities"
    FUNDING = "Funding"
    PROJECT_URL = "ProjectURL"
    PROJECTS_ID = "ProjectsID"
    RESULT_AREAS = "ResultAreas"
    SECTOR = "Sector"
    THEME = "Theme"


class FamilyNestedColumnNames(Enum):
    """The fields the GCF data mapper needs to parse nested family data/metadata."""

    AREA = "Area"
    BUDGET = "BudgetUSDeq"
    NAME = "Name"
    REGION = "Region"
    SOURCE = "Source"
    TYPE = "Type"


class GCFProjectBudgetSource(Enum):
    """The source of financing for the project's budget funding."""

    CO_FINANCING = "Co-Financing"
    GCF = "GCF"
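A brief usage sketch (not part of the diff) showing how these enum values are used to index into a project row; the row data here is made up:

import pandas as pd

from gcf_data_mapper.enums.family import FamilyColumnsNames, FamilyNestedColumnNames

row = pd.Series(
    {"ApprovedRef": "FP003", "Funding": [{"Source": "GCF", "BudgetUSDeq": 9200000}]}
)

approved_ref = row.at[FamilyColumnsNames.APPROVED_REF.value]  # "FP003"
budget_key = FamilyNestedColumnNames.BUDGET.value  # "BudgetUSDeq"
budgets = [funding[budget_key] for funding in row.at[FamilyColumnsNames.FUNDING.value]]  # [9200000]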
135 changes: 134 additions & 1 deletion gcf_data_mapper/parsers/family.py
@@ -3,6 +3,129 @@
import click
import pandas as pd

from gcf_data_mapper.enums.family import (
    FamilyColumnsNames,
    FamilyNestedColumnNames,
    GCFProjectBudgetSource,
)
from gcf_data_mapper.parsers.helpers import (
    arrays_contain_empty_values,
    row_contains_columns_with_empty_values,
    verify_required_fields_present,
)


def get_budgets(funding_list: list[dict], source: str) -> list[int]:
    """Get the budget amounts from the row based on the funding source.
    :param list[dict] funding_list: A list of all the funding information, represented as dictionaries.
    :param str source: The funding source to retrieve the budget from.
    :return list[int]: A list of budget amounts corresponding to the source,
        or [0] if the source is not found.
    """

    budget_key = FamilyNestedColumnNames.BUDGET.value
    source_key = FamilyNestedColumnNames.SOURCE.value

    budgets = [
        funding[budget_key] for funding in funding_list if funding[source_key] == source
    ]

    # Some projects are solely funded by the fund (GCF) and others are solely co-financed,
    # so there will be instances where no funding entry matches either the GCF or the
    # co-financing source value. In those cases we map `project_value_fund_spend` or
    # `project_value_co_financing` as an array containing 0, i.e. [0].
    return budgets if budgets else [0]


def map_family_metadata(row: pd.Series) -> Optional[dict]:
    """Map the metadata of a family based on the provided row.
    :param pd.Series row: The row containing family information.
    :return Optional[dict]: A dictionary containing mapped metadata for the family.
    """

    countries = row.at[FamilyColumnsNames.COUNTRIES.value]
    entities = row.at[FamilyColumnsNames.ENTITIES.value]
    funding_sources = row.at[FamilyColumnsNames.FUNDING.value]
    result_areas = row.at[FamilyColumnsNames.RESULT_AREAS.value]

    area_key = FamilyNestedColumnNames.AREA.value
    name_key = FamilyNestedColumnNames.NAME.value
    region_key = FamilyNestedColumnNames.REGION.value
    type_key = FamilyNestedColumnNames.TYPE.value

    co_financing_budgets = get_budgets(
        funding_sources, GCFProjectBudgetSource.CO_FINANCING.value
    )
    gcf_budgets = get_budgets(funding_sources, GCFProjectBudgetSource.GCF.value)

    implementing_agencies = [entity[name_key] for entity in entities]
    regions = [country[region_key] for country in countries]
    areas = [result[area_key] for result in result_areas]
    types = [result[type_key] for result in result_areas]

    # As we filter the budget information by source (GCF vs co-financing), we know
    # there will be instances where only one type of funding exists, so checking the
    # budgets for empty/false values would create false positives; hence their
    # exclusion from this check.
    if arrays_contain_empty_values(
        [
            ("Implementing Agencies", implementing_agencies),
            ("Regions", regions),
            ("Result Areas", areas),
            ("Result Types", types),
        ],
        row.at[FamilyColumnsNames.PROJECTS_ID.value],
    ):
        return None

    metadata = {
        "approved_ref": [row.at[FamilyColumnsNames.APPROVED_REF.value]],
        "implementing_agencies": list(set(implementing_agencies)),
        "project_id": [row.at[FamilyColumnsNames.PROJECTS_ID.value]],
        "project_url": [row.at[FamilyColumnsNames.PROJECT_URL.value]],
        "project_value_fund_spend": gcf_budgets,
        "project_value_co_financing": co_financing_budgets,
        "regions": list(set(regions)),
        "result_areas": list(set(areas)),
        "result_types": list(set(types)),
        "sector": [row.at[FamilyColumnsNames.SECTOR.value]],
        "theme": [row.at[FamilyColumnsNames.THEME.value]],
    }

    return metadata


def process_row(
    row: pd.Series, projects_id: str, required_columns: list[str]
) -> Optional[dict]:
    """Map the family data based on the provided row.
    :param pd.Series row: The row containing family information.
    :param str projects_id: The id of the current project that is being reformatted/processed.
    :param list[str] required_columns: The list of required columns that we need to extract
        the data from in the project.
    :return Optional[dict]: A dictionary containing mapped data for the family entity.
        The function will return None if the row is missing data in expected columns/fields.
    """

    if pd.isna(projects_id) or bool(projects_id) is False:
        click.echo("🛑 Skipping row as it does not contain a project id")
        return None

    if row_contains_columns_with_empty_values(row, required_columns):
        click.echo(
            f"🛑 Skipping row as it contains empty column values: See Project ID {projects_id}"
        )
        return None

    # TODO: Map family data
    return {
        "metadata": map_family_metadata(row),
    }


def family(projects_data: pd.DataFrame, debug: bool) -> list[Optional[dict[str, Any]]]:
"""Map the GCF family info to new structure.
@@ -14,7 +137,17 @@ def family(projects_data: pd.DataFrame, debug: bool) -> list[Optional[dict[str,
    the 'destination' format described in the GCF Data Mapper Google
    Sheet.
    """

    if debug:
        click.echo("📝 Wrangling GCF family data.")

    return []
    mapped_families = []

    required_fields = set(str(e.value) for e in FamilyColumnsNames)
    verify_required_fields_present(projects_data, required_fields)

    for _, row in projects_data.iterrows():
        projects_id = row.at[FamilyColumnsNames.PROJECTS_ID.value]
        mapped_families.append(process_row(row, projects_id, list(required_fields)))

    return mapped_families
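A usage sketch for the [0] fallback described in the get_budgets comment above (illustrative only; the funding data is invented):

from gcf_data_mapper.parsers.family import get_budgets

funding = [
    {"Source": "GCF", "BudgetUSDeq": 9200000},
    {"Source": "Co-Financing", "BudgetUSDeq": 620000},
]

assert get_budgets(funding, "GCF") == [9200000]
assert get_budgets(funding, "Co-Financing") == [620000]

# a project with no co-financing entries falls back to [0]
assert get_budgets([{"Source": "GCF", "BudgetUSDeq": 100}], "Co-Financing") == [0]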
49 changes: 46 additions & 3 deletions gcf_data_mapper/parsers/helpers.py
@@ -1,3 +1,4 @@
import click
import pandas as pd


@@ -8,7 +9,6 @@ def verify_required_fields_present(
    :param pd.DataFrame data: The DataFrame to check.
    :param set[str] required_fields: The required DataFrame columns.
    :param bool debug: Whether debug mode is on.
    :raise AttributeError if any of the required fields are missing.
    :return bool: True if the DataFrame contains the required fields.
    """
@@ -17,12 +17,55 @@
    if diff == set():
        return True

    # sets are naturally unordered; sorting them means we can test the error message reliably
    sorted_diff = sorted(diff)
    sorted_cols = sorted(cols)

    raise AttributeError(
        f"Required fields '{str(diff)}' not present in df columns '"
        f"{cols if cols != set() else r'{}'}'"
        f"Required fields {sorted_diff} not present in df columns {sorted_cols}"
    )


def check_required_column_value_not_na(row: pd.Series, column_enum) -> bool:
    """Check if the row contains valid document column values (not NA)."""
    return all(pd.notna(row[column.value]) for column in column_enum)


def row_contains_columns_with_empty_values(
    row: pd.Series, required_columns: list[str]
) -> bool:
    """Check that all required values in the given row are not empty (isna).
    :param pd.Series row: The row to check for isna values.
    :param list[str] required_columns: A list of column names that will be used to
        verify isna values.
    :return bool: True if the row contains columns with empty values, False if all
        expected columns are populated.
    """

    # Ensure we are working with a pandas Series by re-selecting the required columns as a Series
    row_subset = pd.Series(row[required_columns], index=required_columns)

    if row_subset.isna().any():
        return True
    return False


def arrays_contain_empty_values(list_values: list[tuple], id: str) -> bool:
    """Check the list in each tuple for empty (falsy) values: {}, [], None, ''.
    :param list[tuple] list_values: A list of tuples containing the name and array of values.
    :param str id: The ID of the project to include in the message that we echo to the terminal.
    :return bool: True if any list contains empty values, False otherwise.
    """
    arrays_with_empty_values = [
        name for name, array in list_values if any(not value for value in array)
    ]

    if arrays_with_empty_values:
        click.echo(
            f"🛑 The following lists contain empty values: {', '.join(sorted(arrays_with_empty_values))}. Projects ID {id}"
        )
        return True

    return False
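A short sketch of how these helpers behave; the two-argument call to verify_required_fields_present mirrors how family.py calls it above, and the data below is made up:

import pandas as pd

from gcf_data_mapper.parsers.helpers import (
    arrays_contain_empty_values,
    row_contains_columns_with_empty_values,
    verify_required_fields_present,
)

df = pd.DataFrame([{"ProjectsID": 1, "Sector": None}])

verify_required_fields_present(df, {"ProjectsID", "Sector"})  # True, both columns exist
# verify_required_fields_present(df, {"ProjectsID", "Theme"})  # would raise AttributeError

row_contains_columns_with_empty_values(df.iloc[0], ["ProjectsID", "Sector"])  # True, Sector is NA

arrays_contain_empty_values([("Regions", ["Asia", ""])], "1")  # echoes a warning, returns True
arrays_contain_empty_values([("Regions", ["Asia"])], "1")  # returns False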
2 changes: 1 addition & 1 deletion pyproject.toml
@@ -1,6 +1,6 @@
[tool.poetry]
name = "gcf-data-mapper"
version = "0.1.9"
version = "0.1.10"
description = "A CLI tool to wrangle GCF data into format recognised by the bulk-import tool."
authors = ["CPR-dev-team <[email protected]>"]
license = "Apache-2.0"
61 changes: 61 additions & 0 deletions tests/unit_tests/parsers/family/conftest.py
@@ -0,0 +1,61 @@
import pandas as pd
import pytest

from gcf_data_mapper.enums.family import FamilyColumnsNames, FamilyNestedColumnNames


@pytest.fixture()
def test_family_doc_df():
    yield pd.DataFrame(
        [
            {
                "ProjectsID": 12660,
                "ApprovedRef": "FP003",
                "ProjectName": "Enhancing resilience of coastal ecosystems and communities",
                "Theme": "Adaptation",
                "Sector": "Environment",
                "ProjectURL": "https://www.climateaction.fund/project/FP003",
                "Countries": [
                    {
                        "CountryName": "Bangladesh",
                        "ISO3": "BGD",
                        "Region": "Asia",
                    },
                ],
                "Entities": [
                    {
                        "Name": "Green Innovations",
                    }
                ],
                "Funding": [
                    {
                        "Source": "GCF",
                        "Budget": 9200000,
                        "BudgetUSDeq": 9200000,
                    },
                    {
                        "ProjectBudgetID": 412,
                        "Source": "Co-Financing",
                        "Budget": 620000,
                        "BudgetUSDeq": 620000,
                    },
                ],
                "ResultAreas": [
                    {
                        "Area": "Coastal protection and restoration",
                        "Type": "Adaptation",
                    },
                ],
            }
        ]
    )


@pytest.fixture()
def required_family_columns():
    required_columns = [column.value for column in FamilyColumnsNames]
    required_nested_family_columns = [
        column.value for column in FamilyNestedColumnNames
    ]

    return required_columns, required_nested_family_columns
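A hypothetical test using the fixture above (the commit's real tests are not shown in this diff; this is only a sketch of how the mapper behaves against the fixture data):

from gcf_data_mapper.parsers.family import family


def test_family_maps_metadata(test_family_doc_df):
    mapped = family(test_family_doc_df, debug=False)

    assert len(mapped) == 1
    metadata = mapped[0]["metadata"]
    assert metadata["approved_ref"] == ["FP003"]
    assert metadata["project_value_fund_spend"] == [9200000]
    assert metadata["project_value_co_financing"] == [620000]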