diff --git a/.gitignore b/.gitignore index 53cf831..69a6c41 100644 --- a/.gitignore +++ b/.gitignore @@ -174,3 +174,6 @@ plugins user_trunk.yaml user.yaml tmp + +# Output files +output.json diff --git a/.trunk/configs/cspell.json b/.trunk/configs/cspell.json index 3eaa41a..db61b43 100644 --- a/.trunk/configs/cspell.json +++ b/.trunk/configs/cspell.json @@ -35,7 +35,8 @@ "iloc", "iterrows", "notna", - "conftest" + "conftest", + "capsys" ], "flagWords": ["hte"], "suggestionsTimeout": 5000 diff --git a/Makefile b/Makefile index 94f4cd8..9a77ac6 100644 --- a/Makefile +++ b/Makefile @@ -41,4 +41,11 @@ build: ## Build the project poetry build test: ## Run tests using pytest - poetry run pytest -v + poetry run pytest -vvv + +test_coverage: ## Run tests using pytest with coverage + poetry run coverage run -m pytest -vvv tests + coverage report + +test_coverage_html: test_coverage ## Run tests using pytest with coverage and generate a HTML report + coverage report diff --git a/gcf_data_mapper/cli.py b/gcf_data_mapper/cli.py index 89185e7..566e4d0 100644 --- a/gcf_data_mapper/cli.py +++ b/gcf_data_mapper/cli.py @@ -1,3 +1,4 @@ +import json import os import sys from typing import Any, Optional @@ -111,6 +112,13 @@ def dump_output( if debug: click.echo(f"📝 Output file {click.format_filename(output_file)}") + try: + with open(output_file, "w+", encoding="utf-8") as f: + json.dump(mapped_data, f, ensure_ascii=False, indent=2) + except Exception as e: + click.echo(f"❌ Failed to dump JSON to file. Error: {e}.") + sys.exit(1) + if __name__ == "__main__": entrypoint() diff --git a/gcf_data_mapper/enums/event.py b/gcf_data_mapper/enums/event.py new file mode 100644 index 0000000..d65bb72 --- /dev/null +++ b/gcf_data_mapper/enums/event.py @@ -0,0 +1,40 @@ +from collections import namedtuple +from enum import Enum + +Event = namedtuple("event", ["name", "type", "column_name"]) + + +class EventColumnNames(Enum): + """The fields the GCF data mapper needs to parse event data.""" + + APPROVED = "ApprovalDate" + UNDER_IMPLEMENTATION = "StartDate" + COMPLETED = "DateCompletion" + APPROVED_REF = "ApprovedRef" + PROJECTS_ID = "ProjectsID" + + +class EventTypeNames(Enum): + """The GCF event type names (should map to the GCF taxonomy).""" + + APPROVED = "Approved" + UNDER_IMPLEMENTATION = "Under Implementation" + COMPLETED = "Completed" + + +class Events: + APPROVED = Event( + "approved", + EventTypeNames.APPROVED.value, + EventColumnNames.APPROVED.value, + ) + UNDER_IMPLEMENTATION = Event( + "under_implementation", + EventTypeNames.UNDER_IMPLEMENTATION.value, + EventColumnNames.UNDER_IMPLEMENTATION.value, + ) + COMPLETED = Event( + "completed", + EventTypeNames.COMPLETED.value, + EventColumnNames.COMPLETED.value, + ) diff --git a/gcf_data_mapper/parsers/event.py b/gcf_data_mapper/parsers/event.py index 8eea7eb..8cf98cf 100644 --- a/gcf_data_mapper/parsers/event.py +++ b/gcf_data_mapper/parsers/event.py @@ -1,16 +1,117 @@ -from enum import Enum from typing import Any, Optional import click import pandas as pd +from gcf_data_mapper.enums.event import Event, EventColumnNames, Events from gcf_data_mapper.parsers.helpers import verify_required_fields_present -class RequiredColumns(Enum): - APPROVED = "ApprovalDate" - UNDER_IMPLEMENTATION = "StartDate" - COMPLETED = "DateCompletion" +def append_event( + gcf_events: list, + event: Event, + row: pd.Series, + approved_ref: str, + projects_id: str, + n_value: int, +) -> None: + """Append an event to the master list that is passed in. 
+ + Remember, because lists are mutable in Python, any changes to the + list inside a function will be reflected outside of it as a + reference to the object is passed instead of just the value. + + :param list gcf_events: The list of GCF events. + :param Event event: The event to append. + :param pd.Series row: The row of data containing GCF event info. + Each row corresponds to a GCF 'family'. + :param str approved_ref: The FP number. + :param str projects_id: The GCF projects ID. + :param int n_value: The event number for the given GCF family. + """ + gcf_events.append( + { + "import_id": f"GCF.event.{approved_ref}_{projects_id}.n{n_value:04}", + "family_import_id": f"GCF.event.{approved_ref}.{projects_id}", + "event_title": event.type, + "date": row[event.column_name], + "event_type_value": event.type, + } + ) + + +def check_event_dates(row: pd.Series) -> dict[str, bool]: + """Check if the row contains valid event date values (not NA). + + :param pd.Series row: The row of data to check. + :return dict[str, bool]: A dict indicating the presence of each + event date. + """ + return { + Events.APPROVED.name: pd.notna(row.at[Events.APPROVED.column_name]), + Events.UNDER_IMPLEMENTATION.name: pd.notna( + row.at[Events.UNDER_IMPLEMENTATION.column_name] + ), + Events.COMPLETED.name: pd.notna(row.at[Events.COMPLETED.column_name]), + } + + +def initialise_event_counter( + event_counter: dict[str, int], family_import_id: str +) -> None: + """Initialise the event counter for a family_import_id if not present. + + Remember, because dictionaries are mutable in Python, any changes to + the dictionary inside a function will be reflected outside of it as + a reference to the object is passed instead of just the value. + + :param dict[str, int] event_counter: The event counter dictionary + containing each family_import_id as a key and its corresponding + counter of events. + :param str family_import_id: The family import ID to initialise an + event counter for. + """ + if family_import_id not in event_counter: + event_counter[family_import_id] = 0 + + +def process_event( + row: pd.Series, + gcf_events: list, + event_counter: dict, + approved_ref: str, + projects_id: str, +) -> None: + """Process a row to append events and update the event counter. + + :param pd.Series row: The row of data to process (corresponds to a + GCF family). + :param list gcf_events: The master list of already processed GCF + events. + :param dict event_counter: The event counter dictionary. + :param str approved_ref: The FP number. + :param str projects_id: The GCF projects ID. 
+ """ + family_import_id = f"GCF.event.{approved_ref}.{projects_id}" + initialise_event_counter(event_counter, family_import_id) + + event_dates = check_event_dates(row) + if not any(event_dates.values()): + click.echo(f"🛑 No event dates found for {approved_ref} {projects_id}.") + return + + for event_name, has_event in event_dates.items(): + if has_event: + event = getattr(Events, event_name.upper()) + append_event( + gcf_events, + event, + row, + approved_ref, + projects_id, + event_counter[family_import_id], + ) + event_counter[family_import_id] += 1 def event(projects_data: pd.DataFrame, debug: bool) -> list[Optional[dict[str, Any]]]: @@ -26,7 +127,15 @@ def event(projects_data: pd.DataFrame, debug: bool) -> list[Optional[dict[str, A if debug: click.echo("📝 Wrangling GCF event data.") - required_fields = set(str(e.value) for e in RequiredColumns) + required_fields = set(str(e.value) for e in EventColumnNames) verify_required_fields_present(projects_data, required_fields) - return [] + gcf_events = [] + event_counter = {} + + for _, row in projects_data.iterrows(): + approved_ref = row.at[EventColumnNames.APPROVED_REF.value] + projects_id = row.at[EventColumnNames.PROJECTS_ID.value] + process_event(row, gcf_events, event_counter, approved_ref, projects_id) + + return gcf_events diff --git a/gcf_data_mapper/read.py b/gcf_data_mapper/read.py index 02e0281..26ca694 100644 --- a/gcf_data_mapper/read.py +++ b/gcf_data_mapper/read.py @@ -29,9 +29,9 @@ def read_csv_pd( :param Optional[int] chunk_size: The number of lines to read into memory in each batch iteratively. Defaults to 10**4. - :return Optional[pd.DataFrame]: A Pandas DataFrame containing the - CSV data if the file is successfully found and parsed by the - Pandas CSV reader. Otherwise this function will return None. + :return pd.DataFrame: A Pandas DataFrame containing the CSV data if + the file is successfully found and parsed by the Pandas CSV + reader. Otherwise an empty DataFrame will be returned. """ # Should the path exist, read the CSV contents into memory iteratively in chunks of # 'chunk_size' (10**4 by default). @@ -50,30 +50,28 @@ def read_csv_pd( return dataset except Exception as e: - print(e) - click.echo("Error occurred reading CSV file using Pandas: %s" % file_path) + click.echo(f"❌ Error reading file {file_path}: {e}") return pd.DataFrame([]) -def read_json_pd(file_path: str): +def read_json_pd(file_path: str) -> pd.DataFrame: """Load the data from the specified JSON file into a Pandas DF. :param str file_path: The filepath passed by the user to the tool. - :return Optional[pd.DataFrame]: A Pandas DataFrame containing the - JSON data if the file is successfully found and parsed by the - Pandas JSON reader. Otherwise this function will return None. + :return pd.DataFrame: A Pandas DataFrame containing the CSV data if + the file is successfully found and parsed by the Pandas CSV + reader. Otherwise an empty DataFrame will be returned. 
""" + df = pd.DataFrame([]) try: with open(file_path, "r") as file: df = pd.json_normalize(json.load(file)) - return df except Exception as e: - print(e) - click.echo("Error occurred reading JSON file using Pandas: %s" % file_path) - return pd.DataFrame([]) + click.echo(f"❌ Error reading file {file_path}: {e}") + return df def read_into_pandas(file_path: str, debug: bool = False) -> pd.DataFrame: @@ -99,18 +97,16 @@ def read_into_pandas(file_path: str, debug: bool = False) -> pd.DataFrame: if file_extension not in [e.value for e in AllowedFileExtensions]: raise ValueError("Error reading file: File must be a valid json or csv file") - if os.path.getsize(file_path) == 0 and debug: - click.echo(f"File '{file_path}' is empty") - df = pd.DataFrame([]) - try: - if file_extension == AllowedFileExtensions.CSV.value: - return read_csv_pd(file_path) - if file_extension == AllowedFileExtensions.JSON.value: - return read_json_pd(file_path) - except Exception as e: - click.echo(f"Error reading file: {e}") - raise e + + if os.path.getsize(file_path) == 0: + return df + + if file_extension == AllowedFileExtensions.CSV.value: + df = read_csv_pd(file_path) + + elif file_extension == AllowedFileExtensions.JSON.value: + df = read_json_pd(file_path) return df diff --git a/poetry.lock b/poetry.lock index 78af7a0..d62a82f 100644 --- a/poetry.lock +++ b/poetry.lock @@ -25,6 +25,90 @@ files = [ {file = "colorama-0.4.6.tar.gz", hash = "sha256:08695f5cb7ed6e0531a20572697297273c47b8cae5a63ffc6d6ed5c201be6e44"}, ] +[[package]] +name = "coverage" +version = "7.6.1" +description = "Code coverage measurement for Python" +optional = false +python-versions = ">=3.8" +files = [ + {file = "coverage-7.6.1-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:b06079abebbc0e89e6163b8e8f0e16270124c154dc6e4a47b413dd538859af16"}, + {file = "coverage-7.6.1-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:cf4b19715bccd7ee27b6b120e7e9dd56037b9c0681dcc1adc9ba9db3d417fa36"}, + {file = "coverage-7.6.1-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e61c0abb4c85b095a784ef23fdd4aede7a2628478e7baba7c5e3deba61070a02"}, + {file = "coverage-7.6.1-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:fd21f6ae3f08b41004dfb433fa895d858f3f5979e7762d052b12aef444e29afc"}, + {file = "coverage-7.6.1-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:8f59d57baca39b32db42b83b2a7ba6f47ad9c394ec2076b084c3f029b7afca23"}, + {file = "coverage-7.6.1-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:a1ac0ae2b8bd743b88ed0502544847c3053d7171a3cff9228af618a068ed9c34"}, + {file = "coverage-7.6.1-cp310-cp310-musllinux_1_2_i686.whl", hash = "sha256:e6a08c0be454c3b3beb105c0596ebdc2371fab6bb90c0c0297f4e58fd7e1012c"}, + {file = "coverage-7.6.1-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:f5796e664fe802da4f57a168c85359a8fbf3eab5e55cd4e4569fbacecc903959"}, + {file = "coverage-7.6.1-cp310-cp310-win32.whl", hash = "sha256:7bb65125fcbef8d989fa1dd0e8a060999497629ca5b0efbca209588a73356232"}, + {file = "coverage-7.6.1-cp310-cp310-win_amd64.whl", hash = "sha256:3115a95daa9bdba70aea750db7b96b37259a81a709223c8448fa97727d546fe0"}, + {file = "coverage-7.6.1-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:7dea0889685db8550f839fa202744652e87c60015029ce3f60e006f8c4462c93"}, + {file = "coverage-7.6.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:ed37bd3c3b063412f7620464a9ac1314d33100329f39799255fb8d3027da50d3"}, + 
{file = "coverage-7.6.1-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d85f5e9a5f8b73e2350097c3756ef7e785f55bd71205defa0bfdaf96c31616ff"}, + {file = "coverage-7.6.1-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:9bc572be474cafb617672c43fe989d6e48d3c83af02ce8de73fff1c6bb3c198d"}, + {file = "coverage-7.6.1-cp311-cp311-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0c0420b573964c760df9e9e86d1a9a622d0d27f417e1a949a8a66dd7bcee7bc6"}, + {file = "coverage-7.6.1-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:1f4aa8219db826ce6be7099d559f8ec311549bfc4046f7f9fe9b5cea5c581c56"}, + {file = "coverage-7.6.1-cp311-cp311-musllinux_1_2_i686.whl", hash = "sha256:fc5a77d0c516700ebad189b587de289a20a78324bc54baee03dd486f0855d234"}, + {file = "coverage-7.6.1-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:b48f312cca9621272ae49008c7f613337c53fadca647d6384cc129d2996d1133"}, + {file = "coverage-7.6.1-cp311-cp311-win32.whl", hash = "sha256:1125ca0e5fd475cbbba3bb67ae20bd2c23a98fac4e32412883f9bcbaa81c314c"}, + {file = "coverage-7.6.1-cp311-cp311-win_amd64.whl", hash = "sha256:8ae539519c4c040c5ffd0632784e21b2f03fc1340752af711f33e5be83a9d6c6"}, + {file = "coverage-7.6.1-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:95cae0efeb032af8458fc27d191f85d1717b1d4e49f7cb226cf526ff28179778"}, + {file = "coverage-7.6.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:5621a9175cf9d0b0c84c2ef2b12e9f5f5071357c4d2ea6ca1cf01814f45d2391"}, + {file = "coverage-7.6.1-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:260933720fdcd75340e7dbe9060655aff3af1f0c5d20f46b57f262ab6c86a5e8"}, + {file = "coverage-7.6.1-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:07e2ca0ad381b91350c0ed49d52699b625aab2b44b65e1b4e02fa9df0e92ad2d"}, + {file = "coverage-7.6.1-cp312-cp312-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:c44fee9975f04b33331cb8eb272827111efc8930cfd582e0320613263ca849ca"}, + {file = "coverage-7.6.1-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:877abb17e6339d96bf08e7a622d05095e72b71f8afd8a9fefc82cf30ed944163"}, + {file = "coverage-7.6.1-cp312-cp312-musllinux_1_2_i686.whl", hash = "sha256:3e0cadcf6733c09154b461f1ca72d5416635e5e4ec4e536192180d34ec160f8a"}, + {file = "coverage-7.6.1-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:c3c02d12f837d9683e5ab2f3d9844dc57655b92c74e286c262e0fc54213c216d"}, + {file = "coverage-7.6.1-cp312-cp312-win32.whl", hash = "sha256:e05882b70b87a18d937ca6768ff33cc3f72847cbc4de4491c8e73880766718e5"}, + {file = "coverage-7.6.1-cp312-cp312-win_amd64.whl", hash = "sha256:b5d7b556859dd85f3a541db6a4e0167b86e7273e1cdc973e5b175166bb634fdb"}, + {file = "coverage-7.6.1-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:a4acd025ecc06185ba2b801f2de85546e0b8ac787cf9d3b06e7e2a69f925b106"}, + {file = "coverage-7.6.1-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:a6d3adcf24b624a7b778533480e32434a39ad8fa30c315208f6d3e5542aeb6e9"}, + {file = "coverage-7.6.1-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d0c212c49b6c10e6951362f7c6df3329f04c2b1c28499563d4035d964ab8e08c"}, + {file = "coverage-7.6.1-cp313-cp313-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:6e81d7a3e58882450ec4186ca59a3f20a5d4440f25b1cff6f0902ad890e6748a"}, + {file = 
"coverage-7.6.1-cp313-cp313-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:78b260de9790fd81e69401c2dc8b17da47c8038176a79092a89cb2b7d945d060"}, + {file = "coverage-7.6.1-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:a78d169acd38300060b28d600344a803628c3fd585c912cacc9ea8790fe96862"}, + {file = "coverage-7.6.1-cp313-cp313-musllinux_1_2_i686.whl", hash = "sha256:2c09f4ce52cb99dd7505cd0fc8e0e37c77b87f46bc9c1eb03fe3bc9991085388"}, + {file = "coverage-7.6.1-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:6878ef48d4227aace338d88c48738a4258213cd7b74fd9a3d4d7582bb1d8a155"}, + {file = "coverage-7.6.1-cp313-cp313-win32.whl", hash = "sha256:44df346d5215a8c0e360307d46ffaabe0f5d3502c8a1cefd700b34baf31d411a"}, + {file = "coverage-7.6.1-cp313-cp313-win_amd64.whl", hash = "sha256:8284cf8c0dd272a247bc154eb6c95548722dce90d098c17a883ed36e67cdb129"}, + {file = "coverage-7.6.1-cp313-cp313t-macosx_10_13_x86_64.whl", hash = "sha256:d3296782ca4eab572a1a4eca686d8bfb00226300dcefdf43faa25b5242ab8a3e"}, + {file = "coverage-7.6.1-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:502753043567491d3ff6d08629270127e0c31d4184c4c8d98f92c26f65019962"}, + {file = "coverage-7.6.1-cp313-cp313t-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:6a89ecca80709d4076b95f89f308544ec8f7b4727e8a547913a35f16717856cb"}, + {file = "coverage-7.6.1-cp313-cp313t-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:a318d68e92e80af8b00fa99609796fdbcdfef3629c77c6283566c6f02c6d6704"}, + {file = "coverage-7.6.1-cp313-cp313t-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:13b0a73a0896988f053e4fbb7de6d93388e6dd292b0d87ee51d106f2c11b465b"}, + {file = "coverage-7.6.1-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:4421712dbfc5562150f7554f13dde997a2e932a6b5f352edcce948a815efee6f"}, + {file = "coverage-7.6.1-cp313-cp313t-musllinux_1_2_i686.whl", hash = "sha256:166811d20dfea725e2e4baa71fffd6c968a958577848d2131f39b60043400223"}, + {file = "coverage-7.6.1-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:225667980479a17db1048cb2bf8bfb39b8e5be8f164b8f6628b64f78a72cf9d3"}, + {file = "coverage-7.6.1-cp313-cp313t-win32.whl", hash = "sha256:170d444ab405852903b7d04ea9ae9b98f98ab6d7e63e1115e82620807519797f"}, + {file = "coverage-7.6.1-cp313-cp313t-win_amd64.whl", hash = "sha256:b9f222de8cded79c49bf184bdbc06630d4c58eec9459b939b4a690c82ed05657"}, + {file = "coverage-7.6.1-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:6db04803b6c7291985a761004e9060b2bca08da6d04f26a7f2294b8623a0c1a0"}, + {file = "coverage-7.6.1-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:f1adfc8ac319e1a348af294106bc6a8458a0f1633cc62a1446aebc30c5fa186a"}, + {file = "coverage-7.6.1-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a95324a9de9650a729239daea117df21f4b9868ce32e63f8b650ebe6cef5595b"}, + {file = "coverage-7.6.1-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:b43c03669dc4618ec25270b06ecd3ee4fa94c7f9b3c14bae6571ca00ef98b0d3"}, + {file = "coverage-7.6.1-cp38-cp38-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:8929543a7192c13d177b770008bc4e8119f2e1f881d563fc6b6305d2d0ebe9de"}, + {file = "coverage-7.6.1-cp38-cp38-musllinux_1_2_aarch64.whl", hash = "sha256:a09ece4a69cf399510c8ab25e0950d9cf2b42f7b3cb0374f95d2e2ff594478a6"}, + {file = 
"coverage-7.6.1-cp38-cp38-musllinux_1_2_i686.whl", hash = "sha256:9054a0754de38d9dbd01a46621636689124d666bad1936d76c0341f7d71bf569"}, + {file = "coverage-7.6.1-cp38-cp38-musllinux_1_2_x86_64.whl", hash = "sha256:0dbde0f4aa9a16fa4d754356a8f2e36296ff4d83994b2c9d8398aa32f222f989"}, + {file = "coverage-7.6.1-cp38-cp38-win32.whl", hash = "sha256:da511e6ad4f7323ee5702e6633085fb76c2f893aaf8ce4c51a0ba4fc07580ea7"}, + {file = "coverage-7.6.1-cp38-cp38-win_amd64.whl", hash = "sha256:3f1156e3e8f2872197af3840d8ad307a9dd18e615dc64d9ee41696f287c57ad8"}, + {file = "coverage-7.6.1-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:abd5fd0db5f4dc9289408aaf34908072f805ff7792632250dcb36dc591d24255"}, + {file = "coverage-7.6.1-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:547f45fa1a93154bd82050a7f3cddbc1a7a4dd2a9bf5cb7d06f4ae29fe94eaf8"}, + {file = "coverage-7.6.1-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:645786266c8f18a931b65bfcefdbf6952dd0dea98feee39bd188607a9d307ed2"}, + {file = "coverage-7.6.1-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:9e0b2df163b8ed01d515807af24f63de04bebcecbd6c3bfeff88385789fdf75a"}, + {file = "coverage-7.6.1-cp39-cp39-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:609b06f178fe8e9f89ef676532760ec0b4deea15e9969bf754b37f7c40326dbc"}, + {file = "coverage-7.6.1-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:702855feff378050ae4f741045e19a32d57d19f3e0676d589df0575008ea5004"}, + {file = "coverage-7.6.1-cp39-cp39-musllinux_1_2_i686.whl", hash = "sha256:2bdb062ea438f22d99cba0d7829c2ef0af1d768d1e4a4f528087224c90b132cb"}, + {file = "coverage-7.6.1-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:9c56863d44bd1c4fe2abb8a4d6f5371d197f1ac0ebdee542f07f35895fc07f36"}, + {file = "coverage-7.6.1-cp39-cp39-win32.whl", hash = "sha256:6e2cd258d7d927d09493c8df1ce9174ad01b381d4729a9d8d4e38670ca24774c"}, + {file = "coverage-7.6.1-cp39-cp39-win_amd64.whl", hash = "sha256:06a737c882bd26d0d6ee7269b20b12f14a8704807a01056c80bb881a4b2ce6ca"}, + {file = "coverage-7.6.1-pp38.pp39.pp310-none-any.whl", hash = "sha256:e9a6e0eb86070e8ccaedfbd9d38fec54864f3125ab95419970575b42af7541df"}, + {file = "coverage-7.6.1.tar.gz", hash = "sha256:953510dfb7b12ab69d20135a0662397f077c59b1e6379a768e97c59d852ee51d"}, +] + +[package.extras] +toml = ["tomli"] + [[package]] name = "exceptiongroup" version = "1.2.2" @@ -293,4 +377,4 @@ files = [ [metadata] lock-version = "2.0" python-versions = ">=3.10" -content-hash = "4d1a9d7854c49ffe3b3edad00d8705e95759fe94efbf28c3961a6beecb5b4e5e" +content-hash = "1937058237d08991f0226177781a4725988661ceba88817795ba018d067b80ff" diff --git a/pyproject.toml b/pyproject.toml index 878fadc..6e45a21 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "gcf-data-mapper" -version = "0.1.7" +version = "0.1.8" description = "A CLI tool to wrangle GCF data into format recognised by the bulk-import tool." 
authors = ["CPR-dev-team "] license = "Apache-2.0" @@ -18,6 +18,9 @@ pytest = "^8.3.2" [tool.poetry.scripts] gcf_data_mapper = "gcf_data_mapper.cli:greet" +[tool.poetry.group.dev.dependencies] +coverage = "^7.6.1" + [build-system] requires = ["poetry-core"] build-backend = "poetry.core.masonry.api" diff --git a/tests/unit_tests/parsers/event/conftest.py b/tests/unit_tests/parsers/event/conftest.py index cb783e5..698a72c 100644 --- a/tests/unit_tests/parsers/event/conftest.py +++ b/tests/unit_tests/parsers/event/conftest.py @@ -27,5 +27,20 @@ def valid_data(): "ApprovalDate": ["some_approval"], "StartDate": ["some_start"], "DateCompletion": ["some_end"], + "ApprovedRef": ["an_approved_ref"], + "ProjectsID": ["a_project_id"], + } + ) + + +@pytest.fixture +def mock_row(): + return pd.Series( + { + "ApprovalDate": "2023-01-01", + "StartDate": None, + "DateCompletion": "2023-12-31", + "ApprovedRef": "FP123", + "ProjectsID": "PID456", } ) diff --git a/tests/unit_tests/parsers/event/test_append_event.py b/tests/unit_tests/parsers/event/test_append_event.py new file mode 100644 index 0000000..38d5e3f --- /dev/null +++ b/tests/unit_tests/parsers/event/test_append_event.py @@ -0,0 +1,37 @@ +import pandas as pd +import pytest + +from gcf_data_mapper.enums.event import Event +from gcf_data_mapper.parsers.event import append_event + + +@pytest.fixture +def mock_event(): + return Event(1, "Approved", "ApprovalDate") + + +@pytest.fixture +def mock_row(): + return pd.Series({"ApprovalDate": "2023-01-01"}) + + +def test_append_event_adds_event_to_list(mock_event, mock_row): + gcf_events = [] + append_event(gcf_events, mock_event, mock_row, "FP123", "PID456", 1) + assert len(gcf_events) == 1 + assert gcf_events[0]["import_id"] == "GCF.event.FP123_PID456.n0001" + + +def test_append_event_raises_key_error_for_missing_column(mock_event): + gcf_events = [] + row = pd.Series({"StartDate": "2023-01-01"}) + with pytest.raises(KeyError): + append_event(gcf_events, mock_event, row, "FP123", "PID456", 1) + + +def test_append_event_handles_none_date(mock_event): + gcf_events = [] + row = pd.Series({"ApprovalDate": None}) + append_event(gcf_events, mock_event, row, "FP123", "PID456", 1) + assert len(gcf_events) == 1 + assert gcf_events[0]["date"] is None diff --git a/tests/unit_tests/parsers/event/test_check_event_dates.py b/tests/unit_tests/parsers/event/test_check_event_dates.py new file mode 100644 index 0000000..4a723c3 --- /dev/null +++ b/tests/unit_tests/parsers/event/test_check_event_dates.py @@ -0,0 +1,47 @@ +import pandas as pd +import pytest + +from gcf_data_mapper.enums.event import Events +from gcf_data_mapper.parsers.event import check_event_dates + + +@pytest.fixture +def mock_row(): + return pd.Series( + { + Events.APPROVED.column_name: "2023-01-01", + Events.UNDER_IMPLEMENTATION.column_name: None, + Events.COMPLETED.column_name: "2023-12-31", + } + ) + + +def test_check_event_dates_returns_correct_flags(mock_row): + result = check_event_dates(mock_row) + assert result[Events.APPROVED.name] is True + assert result[Events.UNDER_IMPLEMENTATION.name] is False + assert result[Events.COMPLETED.name] is True + + +def test_check_event_dates_returns_false_for_missing_columns(): + row = pd.Series( + { + Events.APPROVED.column_name: None, + Events.UNDER_IMPLEMENTATION.column_name: None, + Events.COMPLETED.column_name: None, + } + ) + result = check_event_dates(row) + assert all(value is False for value in result.values()) + + +def test_check_event_dates_returns_false_for_all_na(): + row = pd.Series( + { + 
Events.APPROVED.column_name: None, + Events.UNDER_IMPLEMENTATION.column_name: None, + Events.COMPLETED.column_name: None, + } + ) + result = check_event_dates(row) + assert all(value is False for value in result.values()) diff --git a/tests/unit_tests/parsers/event/test_event.py b/tests/unit_tests/parsers/event/test_event.py index a7daa0d..072f360 100644 --- a/tests/unit_tests/parsers/event/test_event.py +++ b/tests/unit_tests/parsers/event/test_event.py @@ -1,13 +1,63 @@ +import pandas as pd import pytest from gcf_data_mapper.parsers.event import event -def test_returns_empty_when_cols_missing(required_cols_missing): +@pytest.fixture +def mock_projects_data(): + return pd.DataFrame( + { + "ApprovalDate": ["2023-01-01", None], + "StartDate": [None, "2023-06-01"], + "DateCompletion": ["2023-12-31", None], + "ApprovedRef": ["FP123", "FP124"], + "ProjectsID": ["PID456", "PID457"], + } + ) + + +def test_event_returns_correct_number_of_events(mock_projects_data): + result = event(mock_projects_data, debug=False) + assert len(result) == 3 + + +def test_event_raises_attribute_error_for_missing_fields(): + projects_data = pd.DataFrame({}) with pytest.raises(AttributeError): - event(required_cols_missing, debug=False) + event(projects_data, debug=False) + + +def test_event_logs_debug_message(mock_projects_data, capsys): + result = event(mock_projects_data, debug=True) + captured = capsys.readouterr() + assert "📝 Wrangling GCF event data." in captured.out + assert len(result) == 3 + + +def test_event_returns_empty_list_for_no_valid_dates(): + projects_data = pd.DataFrame( + { + "ApprovalDate": [None, None], + "StartDate": [None, None], + "DateCompletion": [None, None], + "ApprovedRef": ["FP123", "FP124"], + "ProjectsID": ["PID456", "PID457"], + } + ) + result = event(projects_data, debug=False) + assert len(result) == 0 -def test_success_with_valid_data(valid_data): - event_data = event(valid_data, debug=False) - assert event_data == [] +def test_event_handles_partial_valid_dates(): + projects_data = pd.DataFrame( + { + "ApprovalDate": ["2023-01-01", None], + "StartDate": [None, "2023-06-01"], + "DateCompletion": [None, "2023-12-31"], + "ApprovedRef": ["FP123", "FP124"], + "ProjectsID": ["PID456", "PID457"], + } + ) + result = event(projects_data, debug=False) + assert len(result) == 3 diff --git a/tests/unit_tests/parsers/event/test_initialise_event_counter.py b/tests/unit_tests/parsers/event/test_initialise_event_counter.py new file mode 100644 index 0000000..77efe2c --- /dev/null +++ b/tests/unit_tests/parsers/event/test_initialise_event_counter.py @@ -0,0 +1,19 @@ +from gcf_data_mapper.parsers.event import initialise_event_counter + + +def test_initialise_event_counter_adds_new_key(): + event_counter = {} + initialise_event_counter(event_counter, "GCF.event.FP123.PID456") + assert event_counter["GCF.event.FP123.PID456"] == 0 + + +def test_initialise_event_counter_does_not_overwrite_existing_key(): + event_counter = {"GCF.event.FP123.PID456": 5} + initialise_event_counter(event_counter, "GCF.event.FP123.PID456") + assert event_counter["GCF.event.FP123.PID456"] == 5 + + +def test_initialise_event_counter_handles_empty_key(): + event_counter = {} + initialise_event_counter(event_counter, "") + assert event_counter[""] == 0 diff --git a/tests/unit_tests/parsers/event/test_process_event.py b/tests/unit_tests/parsers/event/test_process_event.py new file mode 100644 index 0000000..174cddc --- /dev/null +++ b/tests/unit_tests/parsers/event/test_process_event.py @@ -0,0 +1,68 @@ +import pandas as pd 
+import pytest + +from gcf_data_mapper.enums.event import Events +from gcf_data_mapper.parsers.event import process_event + + +@pytest.fixture +def mock_row(): + return pd.Series( + { + Events.APPROVED.column_name: "2023-01-01", + Events.UNDER_IMPLEMENTATION.column_name: None, + Events.COMPLETED.column_name: "2023-12-31", + "ApprovedRef": "FP123", + "ProjectsID": "PID456", + } + ) + + +def test_process_event_adds_events_to_list(mock_row): + gcf_events = [] + event_counter = {} + process_event(mock_row, gcf_events, event_counter, "FP123", "PID456") + assert len(gcf_events) == 2 + assert event_counter["GCF.event.FP123.PID456"] == 2 + + +def test_process_event_handles_no_dates(): + row = pd.Series( + { + Events.APPROVED.column_name: None, + Events.UNDER_IMPLEMENTATION.column_name: None, + Events.COMPLETED.column_name: None, + "ApprovedRef": "FP123", + "ProjectsID": "PID456", + } + ) + gcf_events = [] + event_counter = {} + process_event(row, gcf_events, event_counter, "FP123", "PID456") + assert len(gcf_events) == 0 + assert event_counter["GCF.event.FP123.PID456"] == 0 + + +def test_process_event_handles_partial_dates(): + row = pd.Series( + { + Events.APPROVED.column_name: "2023-01-01", + Events.UNDER_IMPLEMENTATION.column_name: "2023-06-01", + Events.COMPLETED.column_name: None, + "ApprovedRef": "FP123", + "ProjectsID": "PID456", + } + ) + gcf_events = [] + event_counter = {} + process_event(row, gcf_events, event_counter, "FP123", "PID456") + assert len(gcf_events) == 2 + assert event_counter["GCF.event.FP123.PID456"] == 2 + + +def test_process_event_raises_key_error_for_missing_columns(): + row = pd.Series({"ApprovedRef": "FP123", "ProjectsID": "PID456"}) + gcf_events = [] + event_counter = {} + with pytest.raises(KeyError): + process_event(row, gcf_events, event_counter, "FP123", "PID456") diff --git a/tests/unit_tests/read/conftest.py b/tests/unit_tests/read/conftest.py index f9179ad..deaab5d 100644 --- a/tests/unit_tests/read/conftest.py +++ b/tests/unit_tests/read/conftest.py @@ -1,11 +1,4 @@ import os -import pytest - UNIT_TESTS_FOLDER = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) FIXTURES_FOLDER = os.path.join(UNIT_TESTS_FOLDER, "fixtures") - - -@pytest.fixture(scope="module") -def fixtures_folder(): - return FIXTURES_FOLDER diff --git a/tests/unit_tests/read/test_read.py b/tests/unit_tests/read/test_read.py index 3ea1b39..4d6014f 100644 --- a/tests/unit_tests/read/test_read.py +++ b/tests/unit_tests/read/test_read.py @@ -7,80 +7,6 @@ from tests.unit_tests.read.conftest import FIXTURES_FOLDER -def return_valid_csv_data(): - """ - Function which returns expected data structure of csv file. - """ - - csv_data = [ - { - "country": "Brazil", - "capital": "Brasilia", - "avg_temp_celsius": "21.5", - "annual_rainfall_mm": "1500", - "climate_zone": "Tropical", - }, - { - "country": "Canada", - "capital": "Ottawa", - "avg_temp_celsius": "6.3", - "annual_rainfall_mm": "940", - "climate_zone": "Continental", - }, - { - "country": "Egypt", - "capital": "Cairo", - "avg_temp_celsius": "22.1", - "annual_rainfall_mm": "25", - "climate_zone": "Desert", - }, - ] - return csv_data - - -def return_valid_json_data(): - """ - Function which returns expected data structure of json file. 
- """ - - json_data = [ - { - "country": "Brazil", - "capital": "Brasilia", - "climate_info": { - "avg_temp_celsius": 21.5, - "annual_rainfall_mm": 1500, - "climate_zone": "Tropical", - }, - "rivers": {"names": [{"egypt": "Nile"}, {"london": "Thames"}]}, - "natural_disasters": ["Floods", "Landslides"], - }, - { - "country": "Canada", - "capital": "Ottawa", - "climate_info": { - "avg_temp_celsius": 6.3, - "annual_rainfall_mm": 940, - "climate_zone": "Continental", - }, - "rivers": {"names": [{"egypt": "Nile"}, {"london": "Thames"}]}, - "natural_disasters": ["Blizzards", "Wildfires"], - }, - { - "country": "Egypt", - "capital": "Cairo", - "climate_info": { - "avg_temp_celsius": 26, - "annual_rainfall_mm": 80, - "climate_zone": "Desert", - }, - "rivers": {"names": [{"egypt": "Nile"}, {"london": "Thames"}]}, - "natural_disasters": ["Droughts"], - }, - ] - return json_data - - def test_valid_files_return_expected_output(): fam_data, doc_data = read( os.path.join(FIXTURES_FOLDER, "valid_climate_csv_data.csv"), diff --git a/tests/unit_tests/read/test_read_into_pandas.py b/tests/unit_tests/read/test_read_into_pandas.py index 537d4d8..e96cb12 100644 --- a/tests/unit_tests/read/test_read_into_pandas.py +++ b/tests/unit_tests/read/test_read_into_pandas.py @@ -49,8 +49,7 @@ def test_returns_empty_df_when_exception(filepath): ) def test_raises_when_file_not_exist(filepath): with pytest.raises(FileNotFoundError): - test_df = read_into_pandas(filepath) - assert test_df.empty is True + read_into_pandas(filepath) @pytest.mark.parametrize(