Feature/pdct 1419 map family metadata to metadata key in family json structure (#13)

* WIP: implement mapping for family metadata

* feat: add helper functions

checks whether a row has any required columns with empty values, and whether any required columns are missing from the row

* refactor: update helper function to display columns with empty values

add tests accordingly

* refactor: revert code to display a generic value error message

will revisit to try to fix the pyright typing errors

* feat: update family data mapper

 - add a function to check for value and key existence in nested objects
 - add private methods
 - tidy up docstrings
 - tidy up the function that gets budget information

* refactor: move function into helper file

it can be used more widely across the codebase

* refactor: update required columns to be a set instead of a list

* test: add test for family mapping

* Bump version to 0.1.9

* test: update family test data

* test: update conftest fixture to include fields that we interact with

* bump version to 0.1.10

* refactor: tidy up mapper to limit value errors being raised

instead of raising an error and stopping the tool from running, we can just write the error to stdout

* refactor: update naming to improve readability

* refactor: update our checks for false values

- remove the function that checks for values in nested objects
- we don't want the tool to stop running for malformed data, only when a
  key is missing
- so the helper function now checks for falsy/empty values in a list,
  echoes them out, and we take the report to the client
- we have gone back to Python's normal way of accessing keys in a dict,
  where a KeyError is raised if we try to access something that is not
  there (see the sketch below)
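A minimal sketch of the two behaviours described above (illustrative only, not part of this commit; the data is made up):

# falsy values inside a list are reported, but the tool keeps running
agencies = ["Green Innovations", ""]
if any(not value for value in agencies):
    print("🛑 The following lists contain empty values: Implementing Agencies")

# a missing key, however, surfaces immediately via plain dict access
funding = {"Source": "GCF"}
budget = funding["BudgetUSDeq"]  # raises KeyError: 'BudgetUSDeq'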

* refactor: rename variables and functions to make them more readable

* test: update tests

* feat: add required columns to conftest

* refactor: remove duplicate helper function

* test: fix failing test, update error message

* chore: add context to why we are returning [0] for budgets

---------

Co-authored-by: Osneil Drakes <[email protected]>
3 people authored Sep 9, 2024
1 parent 57dfbe5 commit 4665dd9
Showing 9 changed files with 621 additions and 71 deletions.
3 changes: 2 additions & 1 deletion .trunk/configs/cspell.json
@@ -40,7 +40,8 @@
"dtypes",
"isin",
"pydantic",
"getfixturevalue"
"getfixturevalue",
"isna"
],
"flagWords": ["hte"],
"suggestionsTimeout": 5000
33 changes: 33 additions & 0 deletions gcf_data_mapper/enums/family.py
@@ -0,0 +1,33 @@
from enum import Enum


class FamilyColumnsNames(Enum):
    """The fields the GCF data mapper needs to parse family data/metadata."""

    APPROVED_REF = "ApprovedRef"
    COUNTRIES = "Countries"
    ENTITIES = "Entities"
    FUNDING = "Funding"
    PROJECT_URL = "ProjectURL"
    PROJECTS_ID = "ProjectsID"
    RESULT_AREAS = "ResultAreas"
    SECTOR = "Sector"
    THEME = "Theme"


class FamilyNestedColumnNames(Enum):
    """The fields the GCF data mapper needs to parse nested family data/metadata."""

    AREA = "Area"
    BUDGET = "BudgetUSDeq"
    NAME = "Name"
    REGION = "Region"
    SOURCE = "Source"
    TYPE = "Type"


class GCFProjectBudgetSource(Enum):
    """The source of financing for the project's budget funding."""

    CO_FINANCING = "Co-Financing"
    GCF = "GCF"
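A brief usage sketch (not part of the diff) showing how these enum values are used to index into a project row; the row data here is made up:

import pandas as pd

from gcf_data_mapper.enums.family import FamilyColumnsNames, FamilyNestedColumnNames

row = pd.Series(
    {"ApprovedRef": "FP003", "Funding": [{"Source": "GCF", "BudgetUSDeq": 9200000}]}
)

approved_ref = row.at[FamilyColumnsNames.APPROVED_REF.value]  # "FP003"
budget_key = FamilyNestedColumnNames.BUDGET.value  # "BudgetUSDeq"
budgets = [funding[budget_key] for funding in row.at[FamilyColumnsNames.FUNDING.value]]  # [9200000]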
135 changes: 134 additions & 1 deletion gcf_data_mapper/parsers/family.py
@@ -3,6 +3,129 @@
import click
import pandas as pd

from gcf_data_mapper.enums.family import (
    FamilyColumnsNames,
    FamilyNestedColumnNames,
    GCFProjectBudgetSource,
)
from gcf_data_mapper.parsers.helpers import (
    arrays_contain_empty_values,
    row_contains_columns_with_empty_values,
    verify_required_fields_present,
)


def get_budgets(funding_list: list[dict], source: str) -> list[int]:
    """Get the budget amounts from the row based on the funding source.
    :param list[dict] funding_list: A list of all the funding information, represented as dictionaries.
    :param str source: The funding source to retrieve the budget from.
    :return list[int]: A list of budget amounts corresponding to the source,
        or [0] if the source is not found.
    """

    budget_key = FamilyNestedColumnNames.BUDGET.value
    source_key = FamilyNestedColumnNames.SOURCE.value

    budgets = [
        funding[budget_key] for funding in funding_list if funding[source_key] == source
    ]

    # Some projects are solely funded by the fund (GCF) and others are solely co-financed,
    # so there will be instances where no funding entry matches either the GCF or the
    # co-financing source value. In those cases we map `project_value_fund_spend` or
    # `project_value_co_financing` as an array containing 0, i.e. [0].
    return budgets if budgets else [0]


def map_family_metadata(row: pd.Series) -> Optional[dict]:
    """Map the metadata of a family based on the provided row.
    :param pd.Series row: The row containing family information.
    :return Optional[dict]: A dictionary containing mapped metadata for the family.
    """

    countries = row.at[FamilyColumnsNames.COUNTRIES.value]
    entities = row.at[FamilyColumnsNames.ENTITIES.value]
    funding_sources = row.at[FamilyColumnsNames.FUNDING.value]
    result_areas = row.at[FamilyColumnsNames.RESULT_AREAS.value]

    area_key = FamilyNestedColumnNames.AREA.value
    name_key = FamilyNestedColumnNames.NAME.value
    region_key = FamilyNestedColumnNames.REGION.value
    type_key = FamilyNestedColumnNames.TYPE.value

    co_financing_budgets = get_budgets(
        funding_sources, GCFProjectBudgetSource.CO_FINANCING.value
    )
    gcf_budgets = get_budgets(funding_sources, GCFProjectBudgetSource.GCF.value)

    implementing_agencies = [entity[name_key] for entity in entities]
    regions = [country[region_key] for country in countries]
    areas = [result[area_key] for result in result_areas]
    types = [result[type_key] for result in result_areas]

    # As we filter the budget information by source (GCF vs co-financing), we know
    # there will be instances where only one type of funding exists, so checking the
    # budgets for empty/false values would create false positives; hence their
    # exclusion from this check.
    if arrays_contain_empty_values(
        [
            ("Implementing Agencies", implementing_agencies),
            ("Regions", regions),
            ("Result Areas", areas),
            ("Result Types", types),
        ],
        row.at[FamilyColumnsNames.PROJECTS_ID.value],
    ):
        return None

    metadata = {
        "approved_ref": [row.at[FamilyColumnsNames.APPROVED_REF.value]],
        "implementing_agencies": list(set(implementing_agencies)),
        "project_id": [row.at[FamilyColumnsNames.PROJECTS_ID.value]],
        "project_url": [row.at[FamilyColumnsNames.PROJECT_URL.value]],
        "project_value_fund_spend": gcf_budgets,
        "project_value_co_financing": co_financing_budgets,
        "regions": list(set(regions)),
        "result_areas": list(set(areas)),
        "result_types": list(set(types)),
        "sector": [row.at[FamilyColumnsNames.SECTOR.value]],
        "theme": [row.at[FamilyColumnsNames.THEME.value]],
    }

    return metadata


def process_row(
    row: pd.Series, projects_id: str, required_columns: list[str]
) -> Optional[dict]:
    """Map the family data based on the provided row.
    :param pd.Series row: The row containing family information.
    :param str projects_id: The id of the current project that is being reformatted/processed.
    :param list[str] required_columns: The list of required columns that we need to extract
        the data from in the project.
    :return Optional[dict]: A dictionary containing mapped data for the family entity.
        The function will return None if the row is missing data in expected columns/fields.
    """

    if pd.isna(projects_id) or bool(projects_id) is False:
        click.echo("🛑 Skipping row as it does not contain a project id")
        return None

    if row_contains_columns_with_empty_values(row, required_columns):
        click.echo(
            f"🛑 Skipping row as it contains empty column values: See Project ID {projects_id}"
        )
        return None

    # TODO: Map family data
    return {
        "metadata": map_family_metadata(row),
    }


def family(projects_data: pd.DataFrame, debug: bool) -> list[Optional[dict[str, Any]]]:
"""Map the GCF family info to new structure.
@@ -14,7 +137,17 @@ def family(projects_data: pd.DataFrame, debug: bool) -> list[Optional[dict[str,
    the 'destination' format described in the GCF Data Mapper Google
    Sheet.
    """

    if debug:
        click.echo("📝 Wrangling GCF family data.")

    return []
    mapped_families = []

    required_fields = set(str(e.value) for e in FamilyColumnsNames)
    verify_required_fields_present(projects_data, required_fields)

    for _, row in projects_data.iterrows():
        projects_id = row.at[FamilyColumnsNames.PROJECTS_ID.value]
        mapped_families.append(process_row(row, projects_id, list(required_fields)))

    return mapped_families
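A usage sketch for the [0] fallback described in the get_budgets comment above (illustrative only; the funding data is invented):

from gcf_data_mapper.parsers.family import get_budgets

funding = [
    {"Source": "GCF", "BudgetUSDeq": 9200000},
    {"Source": "Co-Financing", "BudgetUSDeq": 620000},
]

assert get_budgets(funding, "GCF") == [9200000]
assert get_budgets(funding, "Co-Financing") == [620000]

# a project with no co-financing entries falls back to [0]
assert get_budgets([{"Source": "GCF", "BudgetUSDeq": 100}], "Co-Financing") == [0]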
49 changes: 46 additions & 3 deletions gcf_data_mapper/parsers/helpers.py
@@ -1,3 +1,4 @@
import click
import pandas as pd


@@ -8,7 +9,6 @@ def verify_required_fields_present(
    :param pd.DataFrame data: The DataFrame to check.
    :param set[str] required_fields: The required DataFrame columns.
    :param bool debug: Whether debug mode is on.
    :raise AttributeError if any of the required fields are missing.
    :return bool: True if the DataFrame contains the required fields.
    """
@@ -17,12 +17,55 @@
    if diff == set():
        return True

    # sets are naturally unordered; sorting them means we can test the error message reliably
    sorted_diff = sorted(diff)
    sorted_cols = sorted(cols)

    raise AttributeError(
        f"Required fields '{str(diff)}' not present in df columns '"
        f"{cols if cols != set() else r'{}'}'"
        f"Required fields {sorted_diff} not present in df columns {sorted_cols}"
    )


def check_required_column_value_not_na(row: pd.Series, column_enum) -> bool:
    """Check if the row contains valid document column values (not NA)."""
    return all(pd.notna(row[column.value]) for column in column_enum)


def row_contains_columns_with_empty_values(
    row: pd.Series, required_columns: list[str]
) -> bool:
    """Check that all required values in the given row are not empty (isna).
    :param pd.Series row: The row to check for isna values.
    :param list[str] required_columns: A list of column names that will be used to
        verify isna values.
    :return bool: True if the row contains columns with empty values, False if all
        expected columns are populated.
    """

    # Ensure we are working with a pandas Series by re-selecting the required columns as a Series
    row_subset = pd.Series(row[required_columns], index=required_columns)

    if row_subset.isna().any():
        return True
    return False


def arrays_contain_empty_values(list_values: list[tuple], id: str) -> bool:
    """Check the list in each tuple for empty (falsy) values: {}, [], None, ''.
    :param list[tuple] list_values: A list of tuples containing the name and array of values.
    :param str id: The ID of the project to include in the message that we echo to the terminal.
    :return bool: True if any list contains empty values, False otherwise.
    """
    arrays_with_empty_values = [
        name for name, array in list_values if any(not value for value in array)
    ]

    if arrays_with_empty_values:
        click.echo(
            f"🛑 The following lists contain empty values: {', '.join(sorted(arrays_with_empty_values))}. Projects ID {id}"
        )
        return True

    return False
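A short sketch of how these helpers behave; the two-argument call to verify_required_fields_present mirrors how family.py calls it above, and the data below is made up:

import pandas as pd

from gcf_data_mapper.parsers.helpers import (
    arrays_contain_empty_values,
    row_contains_columns_with_empty_values,
    verify_required_fields_present,
)

df = pd.DataFrame([{"ProjectsID": 1, "Sector": None}])

verify_required_fields_present(df, {"ProjectsID", "Sector"})  # True, both columns exist
# verify_required_fields_present(df, {"ProjectsID", "Theme"})  # would raise AttributeError

row_contains_columns_with_empty_values(df.iloc[0], ["ProjectsID", "Sector"])  # True, Sector is NA

arrays_contain_empty_values([("Regions", ["Asia", ""])], "1")  # echoes a warning, returns True
arrays_contain_empty_values([("Regions", ["Asia"])], "1")  # returns False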
2 changes: 1 addition & 1 deletion pyproject.toml
@@ -1,6 +1,6 @@
[tool.poetry]
name = "gcf-data-mapper"
version = "0.1.9"
version = "0.1.10"
description = "A CLI tool to wrangle GCF data into format recognised by the bulk-import tool."
authors = ["CPR-dev-team <[email protected]>"]
license = "Apache-2.0"
61 changes: 61 additions & 0 deletions tests/unit_tests/parsers/family/conftest.py
@@ -0,0 +1,61 @@
import pandas as pd
import pytest

from gcf_data_mapper.enums.family import FamilyColumnsNames, FamilyNestedColumnNames


@pytest.fixture()
def test_family_doc_df():
    yield pd.DataFrame(
        [
            {
                "ProjectsID": 12660,
                "ApprovedRef": "FP003",
                "ProjectName": "Enhancing resilience of coastal ecosystems and communities",
                "Theme": "Adaptation",
                "Sector": "Environment",
                "ProjectURL": "https://www.climateaction.fund/project/FP003",
                "Countries": [
                    {
                        "CountryName": "Bangladesh",
                        "ISO3": "BGD",
                        "Region": "Asia",
                    },
                ],
                "Entities": [
                    {
                        "Name": "Green Innovations",
                    }
                ],
                "Funding": [
                    {
                        "Source": "GCF",
                        "Budget": 9200000,
                        "BudgetUSDeq": 9200000,
                    },
                    {
                        "ProjectBudgetID": 412,
                        "Source": "Co-Financing",
                        "Budget": 620000,
                        "BudgetUSDeq": 620000,
                    },
                ],
                "ResultAreas": [
                    {
                        "Area": "Coastal protection and restoration",
                        "Type": "Adaptation",
                    },
                ],
            }
        ]
    )


@pytest.fixture()
def required_family_columns():
    required_columns = [column.value for column in FamilyColumnsNames]
    required_nested_family_columns = [
        column.value for column in FamilyNestedColumnNames
    ]

    return required_columns, required_nested_family_columns
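A hypothetical test using the fixture above (the commit's real tests are not shown in this diff; this is only a sketch of how the mapper behaves against the fixture data):

from gcf_data_mapper.parsers.family import family


def test_family_maps_metadata(test_family_doc_df):
    mapped = family(test_family_doc_df, debug=False)

    assert len(mapped) == 1
    metadata = mapped[0]["metadata"]
    assert metadata["approved_ref"] == ["FP003"]
    assert metadata["project_value_fund_spend"] == [9200000]
    assert metadata["project_value_co_financing"] == [620000]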