From 7a6454cee4f05768e7015a02fe99b0597a1ae903 Mon Sep 17 00:00:00 2001 From: Sam Greenbury Date: Thu, 14 Nov 2024 16:47:23 +0000 Subject: [PATCH 01/56] Add processed_path and id to config --- poetry.lock | 15 ++++++++-- pyproject.toml | 72 +++++++++++++++++++++------------------------- src/acbm/config.py | 31 ++++++++++++++++++++ 3 files changed, 76 insertions(+), 42 deletions(-) diff --git a/poetry.lock b/poetry.lock index 2f7d50b..60a73b0 100644 --- a/poetry.lock +++ b/poetry.lock @@ -1,4 +1,4 @@ -# This file is automatically @generated by Poetry 1.6.1 and should not be changed by hand. +# This file is automatically @generated by Poetry 1.8.3 and should not be changed by hand. [[package]] name = "annotated-types" @@ -1023,6 +1023,17 @@ widgetsnbextension = ">=4.0.12,<4.1.0" [package.extras] test = ["ipykernel", "jsonschema", "pytest (>=3.6.0)", "pytest-cov", "pytz"] +[[package]] +name = "jcs" +version = "0.2.1" +description = "JCS - JSON Canonicalization" +optional = false +python-versions = ">=3.6.2" +files = [ + {file = "jcs-0.2.1-py3-none-any.whl", hash = "sha256:e23a3e1de60f832d33cd811bb9c3b3be79219cdf95f63b88f0972732c3fa8476"}, + {file = "jcs-0.2.1.tar.gz", hash = "sha256:9f20360b2f3b0a410d65493b448f96306d80e37fb46283f3f4aa5db2c7c1472b"}, +] + [[package]] name = "jedi" version = "0.19.1" @@ -3754,4 +3765,4 @@ test = ["pytest", "pytest-cov"] [metadata] lock-version = "2.0" python-versions = "^3.10" -content-hash = "033376bf1b5aeadf8ad90831c1681f92c37f475a591039c67385b8a7497de7b0" +content-hash = "62358db191b9d2f0cd4348dcdc6fa886fbf472b830af5db703fdefc8c3babfce" diff --git a/pyproject.toml b/pyproject.toml index ca1429b..119ebc3 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,9 +4,7 @@ build-backend = "poetry.core.masonry.api" [tool.poetry] name = "acbm" version = "0.1.0" -authors = [ - "Hussein Mahfouz ", -] +authors = ["Hussein Mahfouz "] homepage = "https://github.com/alan-turing-institute/acbm" repository = 
"https://github.com/alan-turing-institute/acbm" license = "Apache-2.0" @@ -28,7 +26,7 @@ python = "^3.10" pytest = { version = ">=6", optional = true } pytest-cov = { version = ">=3", optional = true } pandas = "^2.2.0" -uatk-spc = {git = "https://github.com/alan-turing-institute/uatk-spc.git", subdirectory = "python"} +uatk-spc = { git = "https://github.com/alan-turing-institute/uatk-spc.git", subdirectory = "python" } geopandas = "^0.14.3" matplotlib = "^3.8.3" scikit-learn = "^1.4.1.post1" @@ -43,9 +41,10 @@ tomlkit = "^0.13.0" cml-pam = "0.3.2" gdal = "<=3.8.4" pandera = "^0.20.4" -osmox = {git = "https://github.com/arup-group/osmox"} +osmox = { git = "https://github.com/arup-group/osmox" } pyrosm = "^0.6.2" jsonschema = "^4.23.0" +jcs = "^0.2.1" [tool.poetry.dev-dependencies] pytest = ">= 6" @@ -62,22 +61,13 @@ ipykernel = "^6.29.4" minversion = "6.0" addopts = ["-ra", "--showlocals", "--strict-markers", "--strict-config"] xfail_strict = true -filterwarnings = [ - "error", -] +filterwarnings = ["error"] log_cli_level = "INFO" -testpaths = [ - "tests", -] +testpaths = ["tests"] [tool.coverage] run.source = ["acbm"] -port.exclude_lines = [ - 'pragma: no cover', - '\.\.\.', - 'if typing.TYPE_CHECKING:', -] - +port.exclude_lines = ['pragma: no cover', '\.\.\.', 'if typing.TYPE_CHECKING:'] [tool.ruff] @@ -86,29 +76,31 @@ exclude = [] line-length = 88 # how long you want lines to be [tool.ruff.format] -docstring-code-format = true # code snippets in docstrings will be formatted +docstring-code-format = true # code snippets in docstrings will be formatted [tool.ruff.lint] select = [ - "E", "F", "W", # flake8 - "B", # flake8-bugbear - "I", # isort - "ARG", # flake8-unused-arguments - "C4", # flake8-comprehensions - "EM", # flake8-errmsg - "ICN", # flake8-import-conventions - "ISC", # flake8-implicit-str-concat - "G", # flake8-logging-format - "PGH", # pygrep-hooks - "PIE", # flake8-pie - "PL", # pylint - "PT", # flake8-pytest-style - "RET", # flake8-return - "RUF", # 
Ruff-specific - "SIM", # flake8-simplify - "UP", # pyupgrade - "YTT", # flake8-2020 - "EXE", # flake8-executable + "E", + "F", + "W", # flake8 + "B", # flake8-bugbear + "I", # isort + "ARG", # flake8-unused-arguments + "C4", # flake8-comprehensions + "EM", # flake8-errmsg + "ICN", # flake8-import-conventions + "ISC", # flake8-implicit-str-concat + "G", # flake8-logging-format + "PGH", # pygrep-hooks + "PIE", # flake8-pie + "PL", # pylint + "PT", # flake8-pytest-style + "RET", # flake8-return + "RUF", # Ruff-specific + "SIM", # flake8-simplify + "UP", # pyupgrade + "YTT", # flake8-2020 + "EXE", # flake8-executable ] ignore = [ @@ -118,7 +110,7 @@ ignore = [ "G004", # Logging statement uses f-string, not necessary here ] unfixable = [ - "F401", # Would remove unused imports - "F841", # Would remove unused variables + "F401", # Would remove unused imports + "F841", # Would remove unused variables ] -flake8-unused-arguments.ignore-variadic-names = true # allow unused *args/**kwargs +flake8-unused-arguments.ignore-variadic-names = true # allow unused *args/**kwargs diff --git a/src/acbm/config.py b/src/acbm/config.py index 6cef753..ce9920a 100644 --- a/src/acbm/config.py +++ b/src/acbm/config.py @@ -1,7 +1,9 @@ import random from dataclasses import dataclass +from hashlib import sha256 from pathlib import Path +import jcs import numpy as np import tomlkit from pydantic import BaseModel, Field @@ -44,6 +46,35 @@ class Config(BaseModel): ) matching: MatchingParams = Field(description="Config: parameters for matching.") + @property + def id(self): + """Since config determines outputs, the SHA256 hash of the config can be used + as an identifier for outputs. + + See [popgetter](https://github.com/Urban-Analytics-Technology-Platform/popgetter/blob/7da293f4eb2d36480dbd137a27be623aa09449bf/python/popgetter/metadata.py#L83). 
+ """ + # Since the out paths are not too long, take first 10 chars + ID_LENGTH = 10 + + def serializable_vars(obj: object) -> dict: + variables = {} + # Check if variables are serializable + for key, val in vars(obj).items(): + try: + # Try to serialize + jcs.canonicalize(val) + # Store in dict if serializable + variables[key] = val + except Exception: + # If cannot serialize, continue + continue + + return sha256(jcs.canonicalize(serializable_vars(self))).hexdigest()[:ID_LENGTH] + + def processed_path(self) -> str: + """Returns full processed path.""" + return Path("data") / "processed" / self.id + @property def seed(self) -> int: return self.parameters.seed From 9b5704e25ca401e1e672cdee3e859140664fbb19 Mon Sep 17 00:00:00 2001 From: Sam Greenbury Date: Thu, 14 Nov 2024 17:23:41 +0000 Subject: [PATCH 02/56] Update paths in README --- README.md | 58 +++++++++++++++++++++++++++---------------------------- 1 file changed, 29 insertions(+), 29 deletions(-) diff --git a/README.md b/README.md index ab00af7..fe471db 100644 --- a/README.md +++ b/README.md @@ -89,37 +89,37 @@ The pipeline is a series of scripts that are run in sequence to generate the act │   │   │   ├── trip_eul_2002-2022.tab │   │   │   └── .tab │   │   ├── travel_times -│   │   | ├── oa -│   │   | | ├── travel_time_matrix.parquet -| | | └── msoa -│   │   | └── travel_time_matrix.parquet +│   │   │ ├── oa +│   │   │ | ├── travel_time_matrix.parquet +| | │ └── msoa +│   │   │ └── travel_time_matrix.parquet │   │   ├── ODWP01EW_OA.zip │   │   ├── ODWP15EW_MSOA_v1.zip -│   │   ├── spc_output -│   │   │   ├── >_people_hh.parquet (Generated in Script 1) -│   │   │   ├── >_people_tu.parquet (Generated in Script 1) -│   │   │   ├── raw -│   │   │   │   ├── _households.parquet -│   │   │   │   ├── _info_per_msoa.json -│   │   │   │   ├── .pb -│   │   │   │   ├── _people.parquet -│   │   │   │   ├── _time_use_diaries.parquet -│   │   │   │   ├── _venues.parquet -│   │   │   │   ├── README.md -│   ├── interim -│  
 │   ├── assigning (Generated in Script 3) -│   │   └── matching (Generated in Script 2) -│   └── processed -│   ├── acbm__ -│   │   ├── activities.csv -│   │   ├── households.csv -│   │   ├── legs.csv -│   │   ├── legs_with_locations.parquet -│   │   ├── people.csv -│   │   └── plans.xml -│   ├── plots -│   │   ├── assigning -│   │   └── validation +│   │   └── spc_output +│   │      └── raw +│   │         ├── _households.parquet +│   │         ├── _info_per_msoa.json +│   │         ├── .pb +│   │          ├── _people.parquet +│   │          ├── _time_use_diaries.parquet +│   │         ├── _venues.parquet +│   │         └── README.md +│   └── outputs +│ └- +│    │   +│ ├── interim +│ │      ├── >_people_hh.parquet (Generated in Script 1) +│ │      ├── assigning (Generated in Script 3) +│ │      └── matching (Generated in Script 2) +│    ├── activities.csv +│    ├── households.csv +│    ├── legs.csv +│    ├── legs_with_locations.parquet +│    ├── people.csv +│    ├── plans.xml +│    ├── plots +│    │   ├── assigning +│    │   └── validation ``` ## Step 1: Prepare Data Inputs From 0d811cf72481d0a5ff646a8cc93491b49f13f03b Mon Sep 17 00:00:00 2001 From: Sam Greenbury Date: Thu, 14 Nov 2024 18:02:22 +0000 Subject: [PATCH 03/56] Update paths in scripts --- scripts/1_prep_synthpop.py | 5 +++- scripts/2_match_households_and_individuals.py | 6 ++--- scripts/3.1_assign_primary_feasible_zones.py | 26 +++++++++---------- scripts/3.2.1_assign_primary_zone_edu.py | 22 +++++++++------- scripts/3.2.2_assign_primary_zone_work.py | 21 +++++++-------- scripts/3.2.3_assign_secondary_zone.py | 21 ++++++++------- scripts/3.3_assign_facility_all.py | 14 +++++----- src/acbm/config.py | 10 +++++-- 8 files changed, 68 insertions(+), 57 deletions(-) diff --git a/scripts/1_prep_synthpop.py b/scripts/1_prep_synthpop.py index 3df2dd8..14e97dd 100644 --- a/scripts/1_prep_synthpop.py +++ b/scripts/1_prep_synthpop.py @@ -1,3 +1,5 @@ +import os + from uatk_spc.builder import Builder import acbm @@ 
-23,8 +25,9 @@ def main(config_file): ) .build() ) + os.makedirs(acbm.root_path / config.interim_path, exist_ok=True) spc_people_hh.to_parquet( - acbm.root_path / f"data/external/spc_output/{region}_people_hh.parquet" + acbm.root_path / config.interim_path / f"{region}_people_hh.parquet" ) diff --git a/scripts/2_match_households_and_individuals.py b/scripts/2_match_households_and_individuals.py index 77e6316..cb4ea9e 100644 --- a/scripts/2_match_households_and_individuals.py +++ b/scripts/2_match_households_and_individuals.py @@ -31,7 +31,8 @@ def main(config_file): pd.set_option("display.max_columns", None) def get_interim_path( - file_name: str, path: str | Path = acbm.root_path / "data/interim/matching/" + file_name: str, + path: str | Path = acbm.root_path / config.interim_path / "matching", ) -> str: os.makedirs(path, exist_ok=True) return f"{path}/{file_name}" @@ -44,8 +45,7 @@ def get_interim_path( # Read in the spc data (parquet format) spc = pd.read_parquet( - acbm.root_path / "data/external/spc_output/" - f"{config.region}_people_hh.parquet" + acbm.root_path / config.interim_path / f"{config.region}_people_hh.parquet" ) logger.info("Filtering SPC data to specific columns") diff --git a/scripts/3.1_assign_primary_feasible_zones.py b/scripts/3.1_assign_primary_feasible_zones.py index 9858691..1af964b 100644 --- a/scripts/3.1_assign_primary_feasible_zones.py +++ b/scripts/3.1_assign_primary_feasible_zones.py @@ -1,3 +1,4 @@ +import os import pickle as pkl import geopandas as gpd @@ -23,6 +24,11 @@ def main(config_file): config = load_config(config_file) config.init_rng() + def get_interim_path(file_name: str) -> str: + path = acbm.root_path / config.interim_path / "assigning" + os.makedirs(path, exist_ok=True) + return f"{path}/{file_name}" + #### LOAD DATA #### # --- Activity chains @@ -103,7 +109,7 @@ def main(config_file): # save travel_times as parquet travel_times.to_parquet( - acbm.root_path / "data/interim/assigning/travel_time_estimates.parquet" + 
acbm.root_path / config.interim_path / "travel_time_estimates.parquet" ) # --- Intrazonal trip times @@ -122,7 +128,7 @@ def main(config_file): intrazone_times = intrazone_time(zones=boundaries, key_column=config.zone_id) # save intrazone_times to pickle - with open(acbm.root_path / "data/interim/assigning/intrazone_times.pkl", "wb") as f: + with open(get_interim_path("intrazone_times.pkl"), "wb") as f: pkl.dump(intrazone_times, f) logger.info("Intrazonal travel time estimates created") @@ -175,9 +181,7 @@ def main(config_file): predicate="within", ) # save as pickle - osm_data_gdf.to_pickle( - acbm.root_path / "data/interim/assigning/osm_poi_with_zones.pkl" - ) + osm_data_gdf.to_pickle(get_interim_path("osm_poi_with_zones.pkl")) activities_per_zone = get_activities_per_zone( zones=boundaries, @@ -186,9 +190,7 @@ def main(config_file): return_df=True, ) - activities_per_zone.to_parquet( - acbm.root_path / "data/interim/assigning/activities_per_zone.parquet" - ) + activities_per_zone.to_parquet(get_interim_path("activities_per_zone.parquet")) #### Get possible zones for each primary activity @@ -220,9 +222,7 @@ def main(config_file): logger.info("Saving feasible zones for education activities") # save possible_zones_school to dictionary - with open( - acbm.root_path / "data/interim/assigning/possible_zones_education.pkl", "wb" - ) as f: + with open(get_interim_path("possible_zones_education.pkl"), "wb") as f: pkl.dump(possible_zones_school, f) del possible_zones_school @@ -248,9 +248,7 @@ def main(config_file): logger.info("Saving feasible zones for work activities") # save possible_zones_work to dictionary - with open( - acbm.root_path / "data/interim/assigning/possible_zones_work.pkl", "wb" - ) as f: + with open(get_interim_path("possible_zones_work.pkl"), "wb") as f: pkl.dump(possible_zones_work, f) del possible_zones_work diff --git a/scripts/3.2.1_assign_primary_zone_edu.py b/scripts/3.2.1_assign_primary_zone_edu.py index f401b5f..4a49672 100644 --- 
a/scripts/3.2.1_assign_primary_zone_edu.py +++ b/scripts/3.2.1_assign_primary_zone_edu.py @@ -1,3 +1,5 @@ +import os + import geopandas as gpd import pandas as pd @@ -19,6 +21,12 @@ @acbm_cli def main(config_file): config = load_config(config_file) + + def get_interim_path(file_name: str) -> str: + path = acbm.root_path / config.interim_path / "assigning" + os.makedirs(path, exist_ok=True) + return f"{path}/{file_name}" + # TODO: consider if RNG seed needs to be distinct for different assignments config.init_rng() @@ -29,7 +37,7 @@ def main(config_file): # --- Possible zones for each activity (calculated in 3.1_assign_possible_zones.py) logger.info("Loading possible zones for each activity") possible_zones_school = pd.read_pickle( - acbm.root_path / "data/interim/assigning/possible_zones_education.pkl" + get_interim_path("possible_zones_education.pkl") ) # --- boundaries @@ -44,9 +52,7 @@ def main(config_file): # --- osm POI data logger.info("Loading OSM POI data") - osm_data_gdf = pd.read_pickle( - acbm.root_path / "data/interim/assigning/osm_poi_with_zones.pkl" - ) + osm_data_gdf = pd.read_pickle(get_interim_path("osm_poi_with_zones.pkl")) # Convert the DataFrame into a GeoDataFrame, and assign a coordinate reference system (CRS) logger.info("Converting OSM POI data to GeoDataFrame") @@ -95,14 +101,14 @@ def main(config_file): logger.info("Loading activities per zone") activities_per_zone = pd.read_parquet( - acbm.root_path / "data/interim/assigning/activities_per_zone.parquet" + get_interim_path("activities_per_zone.parquet") ) # --- travel time estimates logger.info("Loading travel time estimates") travel_time_estimates = pd.read_parquet( - acbm.root_path / "data/interim/assigning/travel_time_estimates.parquet" + get_interim_path("travel_time_estimates.parquet") ) #### ASSIGN TO ZONE FROM FEASIBLE ZONES #### @@ -164,9 +170,7 @@ def main(config_file): logger.info("Saving activity chains with assigned zones") - activity_chains_edu.to_pickle( - acbm.root_path / 
"data/interim/assigning/activity_chains_education.pkl" - ) + activity_chains_edu.to_pickle(get_interim_path("activity_chains_education.pkl")) if __name__ == "__main__": diff --git a/scripts/3.2.2_assign_primary_zone_work.py b/scripts/3.2.2_assign_primary_zone_work.py index 91b6f06..7b86e74 100644 --- a/scripts/3.2.2_assign_primary_zone_work.py +++ b/scripts/3.2.2_assign_primary_zone_work.py @@ -26,12 +26,15 @@ def main(config_file): config = load_config(config_file) config.init_rng() + def get_interim_path(file_name: str) -> str: + path = acbm.root_path / config.interim_path / "assigning" + os.makedirs(path, exist_ok=True) + return f"{path}/{file_name}" + #### LOAD DATA #### # --- Possible zones for each activity (calculated in 3.1_assign_possible_zones.py) - possible_zones_work = pd.read_pickle( - acbm.root_path / "data/interim/assigning/possible_zones_work.pkl" - ) + possible_zones_work = pd.read_pickle(get_interim_path("possible_zones_work.pkl")) # --- boundaries @@ -45,9 +48,7 @@ def main(config_file): # osm POI data - osm_data_gdf = pd.read_pickle( - acbm.root_path / "data/interim/assigning/osm_poi_with_zones.pkl" - ) + osm_data_gdf = pd.read_pickle(get_interim_path("osm_poi_with_zones.pkl")) # Convert the DataFrame into a GeoDataFrame, and assign a coordinate reference system (CRS) osm_data_gdf = gpd.GeoDataFrame(osm_data_gdf, geometry="geometry", crs="EPSG:4326") @@ -279,8 +280,8 @@ def main(config_file): ) # Define the output file path - os.makedirs(acbm.root_path / "data/processed/", exist_ok=True) - output_file_path = acbm.root_path / "data/processed/workzone_rmse_results.txt" + os.makedirs(acbm.root_path / config.output_path, exist_ok=True) + output_file_path = acbm.root_path / config.output_path / "workzone_rmse_results.txt" # Open the file in write mode with open(output_file_path, "w") as file: @@ -331,9 +332,7 @@ def main(config_file): # save the activity chains as a pickle - activity_chains_work.to_pickle( - acbm.root_path / 
"data/interim/assigning/activity_chains_work.pkl" - ) + activity_chains_work.to_pickle(get_interim_path("activity_chains_work.pkl")) if __name__ == "__main__": diff --git a/scripts/3.2.3_assign_secondary_zone.py b/scripts/3.2.3_assign_secondary_zone.py index b144555..a6c79aa 100644 --- a/scripts/3.2.3_assign_secondary_zone.py +++ b/scripts/3.2.3_assign_secondary_zone.py @@ -7,6 +7,8 @@ - For more info on the spacetime approach for secondary locaiton assignment, see https://www.tandfonline.com/doi/full/10.1080/23249935.2021.1982068 """ +import os + import geopandas as gpd import numpy as np import pandas as pd @@ -35,6 +37,11 @@ def main(config_file): config.init_rng() zone_id = config.zone_id + def get_interim_path(file_name: str) -> str: + path = acbm.root_path / config.interim_path / "assigning" + os.makedirs(path, exist_ok=True) + return f"{path}/{file_name}" + # --- Load in the data logger.info("Loading: activity chains") @@ -101,15 +108,11 @@ def merge_columns_from_other(df: pd.DataFrame, other: pd.DataFrame) -> pd.DataFr ) activity_chains_edu = merge_columns_from_other( - pd.read_pickle( - acbm.root_path / "data/interim/assigning/activity_chains_education.pkl" - ), + pd.read_pickle(get_interim_path("activity_chains_education.pkl")), activity_chains, ) activity_chains_work = merge_columns_from_other( - pd.read_pickle( - acbm.root_path / "data/interim/assigning/activity_chains_work.pkl" - ), + pd.read_pickle(get_interim_path("activity_chains_work.pkl")), activity_chains, ) @@ -315,7 +318,7 @@ def merge_columns_from_other(df: pd.DataFrame, other: pd.DataFrame) -> pd.DataFr ) else: travel_times = pd.read_parquet( - acbm.root_path / "data/interim/assigning/travel_time_estimates.parquet" + get_interim_path("travel_time_estimates.parquet") ) # Edit modes @@ -349,7 +352,7 @@ def merge_columns_from_other(df: pd.DataFrame, other: pd.DataFrame) -> pd.DataFr logger.info("Analysis (matrices): Step 3 - Calculating OD probabilities") activities_per_zone = pd.read_parquet( - 
acbm.root_path / "data/interim/assigning/activities_per_zone.parquet" + get_interim_path("activities_per_zone.parquet") ) # keep only rows that don't match primary activities @@ -449,7 +452,7 @@ def merge_columns_from_other(df: pd.DataFrame, other: pd.DataFrame) -> pd.DataFr # --- Save logger.info("Saving: Step 7 - Saving population") - write.to_csv(population, dir=(acbm.root_path / "data/processed/activities_pam")) + write.to_csv(population, dir=acbm.root_path / config.output_path) if __name__ == "__main__": diff --git a/scripts/3.3_assign_facility_all.py b/scripts/3.3_assign_facility_all.py index 30113cc..ac3b31e 100644 --- a/scripts/3.3_assign_facility_all.py +++ b/scripts/3.3_assign_facility_all.py @@ -18,9 +18,7 @@ def main(config_file): # --- Load data: activity chains logger.info("Loading activity chains") - activity_chains = pd.read_csv( - acbm.root_path / "data/processed/activities_pam/legs.csv" - ) + activity_chains = pd.read_csv(config.output_path / "legs.csv") activity_chains = activity_chains.drop(columns=["Unnamed: 0", "freq"]) # --- Preprocess: Split activity chains by activity purpose @@ -139,7 +137,7 @@ def main(config_file): logger.info("a. 
Adding eduction type as fallback") # load in activity chains spc_with_nts = pd.read_parquet( - acbm.root_path / "data/interim/matching/spc_with_nts_trips.parquet", + acbm.root_path / config.interim_path / "matching/spc_with_nts_trips.parquet", columns=["id", "education_type", "seq", "TripTotalTime", "TripDisIncSW"], ) # we get one row per id @@ -286,7 +284,7 @@ def main(config_file): lambda point: point if pd.isna(point) else point.wkt ) activity_chains_all.drop(columns=geom_cols).to_parquet( - acbm.root_path / "data/processed/activities_pam/legs_with_locations.parquet" + acbm.root_path / config.output_path / "legs_with_locations.parquet" ) # --- Plots @@ -318,7 +316,7 @@ def main(config_file): x_label="Reported Travel Distance (km)", y_label="Actual Distance - Euclidian (km)", title_prefix=f"Scatter plot of TripDisIncSW vs. Length for {activity_type}", - save_dir=acbm.root_path / "data/processed/plots/assigning/", + save_dir=acbm.root_path / config.output_path / "plots/assigning/", ) # Plot 2: Euclidian travel distance vs reported (NTS) travel TIME @@ -341,7 +339,7 @@ def main(config_file): x_label="Reported Travel TIme (min)", y_label="Actual Distance - Euclidian (km)", title_prefix="Scatter plot of TripTotalTime vs. Length", - save_dir=acbm.root_path / "data/processed/plots/assigning/", + save_dir=acbm.root_path / config.output_path / "plots/assigning/", ) # .... 
@@ -357,7 +355,7 @@ def main(config_file): bin_size=5000, boundaries=boundaries, sample_size=1000, - save_dir=acbm.root_path / "data/processed/plots/assigning/", + save_dir=acbm.root_path / config.output_path / "plots/assigning/", ) diff --git a/src/acbm/config.py b/src/acbm/config.py index ce9920a..5dc99e4 100644 --- a/src/acbm/config.py +++ b/src/acbm/config.py @@ -71,9 +71,15 @@ def serializable_vars(obj: object) -> dict: return sha256(jcs.canonicalize(serializable_vars(self))).hexdigest()[:ID_LENGTH] - def processed_path(self) -> str: + @property + def output_path(self) -> str: + """Returns full processed path.""" + return Path("data") / "outputs" / self.id + + @property + def interim_path(self) -> str: """Returns full processed path.""" - return Path("data") / "processed" / self.id + return Path("data") / "outputs" / self.id / "interim" @property def seed(self) -> int: From c2636fa9fb9baf0093593fac5e31adb892a1c972 Mon Sep 17 00:00:00 2001 From: Sam Greenbury Date: Thu, 14 Nov 2024 18:05:44 +0000 Subject: [PATCH 04/56] Write config to outputs --- scripts/0_preprocess_inputs.py | 7 +++++++ src/acbm/config.py | 4 ++++ 2 files changed, 11 insertions(+) diff --git a/scripts/0_preprocess_inputs.py b/scripts/0_preprocess_inputs.py index ed8aa19..6e84735 100644 --- a/scripts/0_preprocess_inputs.py +++ b/scripts/0_preprocess_inputs.py @@ -1,3 +1,5 @@ +import os + import geopandas as gpd import pandas as pd from uatk_spc import Reader @@ -12,6 +14,11 @@ @acbm_cli def main(config_file): config = load_config(config_file) + + # Write config to file + os.makedirs(config.output_path, exist_ok=True) + config.write(config.output_path / "config.toml") + config.init_rng() region = config.region # Pick a region with SPC output saved diff --git a/src/acbm/config.py b/src/acbm/config.py index 5dc99e4..12c624b 100644 --- a/src/acbm/config.py +++ b/src/acbm/config.py @@ -114,6 +114,10 @@ def init_rng(self): msg = f"config does not provide a rng seed with err: {err}" raise 
ValueError(msg) from err + def write(self, filepath: str | Path): + with open(filepath, "w") as f: + f.write(tomlkit.dumps(self.model_dump(exclude_none=True))) + def load_config(filepath: str | Path) -> Config: with open(filepath, "rb") as f: From 8a3b6e9ec8300da0afc4342c603eca8f760d7b20 Mon Sep 17 00:00:00 2001 From: Sam Greenbury Date: Thu, 14 Nov 2024 21:05:05 +0000 Subject: [PATCH 05/56] Fix and refactor loading travel times --- scripts/3.1_assign_primary_feasible_zones.py | 7 +++---- scripts/3.2.1_assign_primary_zone_edu.py | 6 ++---- scripts/3.2.3_assign_secondary_zone.py | 11 ++--------- src/acbm/utils.py | 14 ++++++++++++++ 4 files changed, 21 insertions(+), 17 deletions(-) diff --git a/scripts/3.1_assign_primary_feasible_zones.py b/scripts/3.1_assign_primary_feasible_zones.py index 1af964b..342fffa 100644 --- a/scripts/3.1_assign_primary_feasible_zones.py +++ b/scripts/3.1_assign_primary_feasible_zones.py @@ -82,12 +82,11 @@ def get_interim_path(file_name: str) -> str: # (`tst`/`TripStart` and `tet`/`TripEnd`) # TODO: move to config - travel_time_matrix_path = ( - acbm.root_path / "data/external/travel_times/oa/travel_time_matrix.parquet" - ) - if config.parameters.travel_times: logger.info("Loading travel time matrix") + travel_time_matrix_path = ( + acbm.root_path / "data/external/travel_times/oa/travel_time_matrix.parquet" + ) try: travel_times = pd.read_parquet(travel_time_matrix_path) print("Travel time matrix loaded successfully.") diff --git a/scripts/3.2.1_assign_primary_zone_edu.py b/scripts/3.2.1_assign_primary_zone_edu.py index 4a49672..49dab6d 100644 --- a/scripts/3.2.1_assign_primary_zone_edu.py +++ b/scripts/3.2.1_assign_primary_zone_edu.py @@ -16,6 +16,7 @@ from acbm.config import load_config from acbm.logger_config import assigning_primary_zones_logger as logger from acbm.preprocessing import add_location +from acbm.utils import get_travel_times @acbm_cli @@ -106,10 +107,7 @@ def get_interim_path(file_name: str) -> str: # --- travel time 
estimates logger.info("Loading travel time estimates") - - travel_time_estimates = pd.read_parquet( - get_interim_path("travel_time_estimates.parquet") - ) + travel_time_estimates = get_travel_times(config) #### ASSIGN TO ZONE FROM FEASIBLE ZONES #### diff --git a/scripts/3.2.3_assign_secondary_zone.py b/scripts/3.2.3_assign_secondary_zone.py index a6c79aa..3ea392d 100644 --- a/scripts/3.2.3_assign_secondary_zone.py +++ b/scripts/3.2.3_assign_secondary_zone.py @@ -29,6 +29,7 @@ from acbm.config import load_config from acbm.logger_config import assigning_secondary_zones_logger as logger from acbm.preprocessing import add_location +from acbm.utils import get_travel_times @acbm_cli @@ -311,15 +312,7 @@ def merge_columns_from_other(df: pd.DataFrame, other: pd.DataFrame) -> pd.DataFr logger.info("Analysis (matrices): Step 1 - Loading travel time data") # load in the travel times (path differs for estimated ones) - # TODO: improve / save in same directory / add paths to config - if config.parameters.travel_times: - travel_times = pd.read_parquet( - acbm.root_path / "data/external/travel_times/oa/travel_time_matrix.parquet" - ) - else: - travel_times = pd.read_parquet( - get_interim_path("travel_time_estimates.parquet") - ) + travel_times = get_travel_times(config) # Edit modes logger.info("Analysis (matrices): Step 2 - Editing modes") diff --git a/src/acbm/utils.py b/src/acbm/utils.py index c135002..30ff863 100644 --- a/src/acbm/utils.py +++ b/src/acbm/utils.py @@ -1,8 +1,12 @@ from datetime import datetime import numpy as np +import pandas as pd from sklearn.metrics import mean_squared_error +import acbm +from acbm.config import Config + def prepend_datetime(s: str, delimiter: str = "_") -> str: current_date = datetime.now().strftime("%Y-%m-%d") @@ -37,3 +41,13 @@ def calculate_rmse(predictions, targets): # Calculate and return RMSE return np.sqrt(mse) + + +def get_travel_times(config: Config) -> pd.DataFrame: + if config.parameters.travel_times: + return 
pd.read_parquet( + acbm.root_path / "data/external/travel_times/oa/travel_time_matrix.parquet" + ) + return pd.read_parquet( + acbm.root_path / config.interim_path / "travel_time_estimates.parquet" + ) From 8c0f4547313557c05def6e0779b3549377035e35 Mon Sep 17 00:00:00 2001 From: Sam Greenbury Date: Fri, 15 Nov 2024 17:30:54 +0000 Subject: [PATCH 06/56] Fix config ID --- src/acbm/config.py | 21 ++++----------------- 1 file changed, 4 insertions(+), 17 deletions(-) diff --git a/src/acbm/config.py b/src/acbm/config.py index 12c624b..f2be6c6 100644 --- a/src/acbm/config.py +++ b/src/acbm/config.py @@ -2,6 +2,7 @@ from dataclasses import dataclass from hashlib import sha256 from pathlib import Path +from typing import Tuple import jcs import numpy as np @@ -24,8 +25,8 @@ class Parameters(BaseModel): @dataclass(frozen=True) class MatchingParams(BaseModel): - required_columns: list[str] - optional_columns: list[str] + required_columns: Tuple[str, ...] + optional_columns: Tuple[str, ...] n_matches: int | None = None chunk_size: int = 50_000 @@ -55,21 +56,7 @@ def id(self): """ # Since the out paths are not too long, take first 10 chars ID_LENGTH = 10 - - def serializable_vars(obj: object) -> dict: - variables = {} - # Check if variables are serializable - for key, val in vars(obj).items(): - try: - # Try to serialize - jcs.canonicalize(val) - # Store in dict if serializable - variables[key] = val - except Exception: - # If cannot serialize, continue - continue - - return sha256(jcs.canonicalize(serializable_vars(self))).hexdigest()[:ID_LENGTH] + return sha256(jcs.canonicalize(self.model_dump())).hexdigest()[:ID_LENGTH] @property def output_path(self) -> str: From 6fea129ce12162dacb8d2d873e362f07a204ec8e Mon Sep 17 00:00:00 2001 From: Sam Greenbury Date: Mon, 18 Nov 2024 11:06:42 +0000 Subject: [PATCH 07/56] Fix activity chains to use config and update config paths * Update reading activity chains dataframe to use config path * Add boundaries and osmox filepaths to config 
--- scripts/0.1_run_osmox.py | 6 +++--- scripts/0_preprocess_inputs.py | 4 ++-- scripts/3.1_assign_primary_feasible_zones.py | 8 +++----- scripts/3.2.1_assign_primary_zone_edu.py | 8 ++++---- scripts/3.2.2_assign_primary_zone_work.py | 6 ++---- scripts/3.2.3_assign_secondary_zone.py | 6 ++---- scripts/3.3_assign_facility_all.py | 6 ++---- src/acbm/assigning/utils.py | 10 ++++++++-- src/acbm/config.py | 16 ++++++++++++++++ 9 files changed, 42 insertions(+), 28 deletions(-) diff --git a/scripts/0.1_run_osmox.py b/scripts/0.1_run_osmox.py index 16ba912..4288acf 100644 --- a/scripts/0.1_run_osmox.py +++ b/scripts/0.1_run_osmox.py @@ -12,15 +12,15 @@ def main(config_file): config = load_config(config_file) config.init_rng() - os.makedirs(acbm.root_path / "data/interim/osmox", exist_ok=True) - fp = get_data(config.region, directory=acbm.root_path / "data/interim/osmox") + os.makedirs(acbm.root_path / config.osmox_path, exist_ok=True) + fp = get_data(config.region, directory=acbm.root_path / config.osmox_path) subprocess.run( [ "osmox", "run", acbm.root_path / "osmox/config_osmox.json", fp, - f"data/interim/osmox/{config.region}", + config.osmox_path / config.region, "-f", "geoparquet", "-crs", diff --git a/scripts/0_preprocess_inputs.py b/scripts/0_preprocess_inputs.py index 6e84735..492f169 100644 --- a/scripts/0_preprocess_inputs.py +++ b/scripts/0_preprocess_inputs.py @@ -74,9 +74,9 @@ def main(config_file): logger.info( f"4. 
Saving the boundaries to {acbm.root_path / 'data/external/boundaries/'} path" ) - + os.makedirs((acbm.root_path / config.boundaries_filepath).parents[0], exist_ok=True) boundaries_filtered.to_file( - acbm.root_path / "data/external/boundaries/study_area_zones.geojson", + acbm.root_path / config.boundaries_filepath, driver="GeoJSON", ) diff --git a/scripts/3.1_assign_primary_feasible_zones.py b/scripts/3.1_assign_primary_feasible_zones.py index 342fffa..3d78fdb 100644 --- a/scripts/3.1_assign_primary_feasible_zones.py +++ b/scripts/3.1_assign_primary_feasible_zones.py @@ -33,7 +33,7 @@ def get_interim_path(file_name: str) -> str: # --- Activity chains logger.info("Loading activity chains") - activity_chains = activity_chains_for_assignment() + activity_chains = activity_chains_for_assignment(config) logger.info("Activity chains loaded") # Filter to a specific day of the week @@ -46,9 +46,7 @@ def get_interim_path(file_name: str) -> str: logger.info("Loading study area boundaries") - boundaries = gpd.read_file( - acbm.root_path / "data/external/boundaries/study_area_zones.geojson" - ) + boundaries = gpd.read_file(acbm.root_path / config.boundaries_filepath) logger.info("Study area boundaries loaded") @@ -152,7 +150,7 @@ def get_interim_path(file_name: str) -> str: # osm data osm_data = gpd.read_parquet( - acbm.root_path / f"data/interim/osmox/{config.region}_epsg_4326.parquet" + acbm.root_path / config.osmox_path / (config.region + "_epsg_4326.parquet") ) logger.info("Activity locations loaded") diff --git a/scripts/3.2.1_assign_primary_zone_edu.py b/scripts/3.2.1_assign_primary_zone_edu.py index 49dab6d..06dbbe3 100644 --- a/scripts/3.2.1_assign_primary_zone_edu.py +++ b/scripts/3.2.1_assign_primary_zone_edu.py @@ -44,9 +44,7 @@ def get_interim_path(file_name: str) -> str: # --- boundaries logger.info("Loading study area boundaries") - boundaries = gpd.read_file( - acbm.root_path / "data/external/boundaries/study_area_zones.geojson" - ) + boundaries = 
gpd.read_file(acbm.root_path / config.boundaries_filepath) logger.info("Study area boundaries loaded") @@ -62,7 +60,9 @@ def get_interim_path(file_name: str) -> str: # --- Activity chains logger.info("Loading activity chains") - activity_chains = activity_chains_for_assignment(columns=cols_for_assignment_edu()) + activity_chains = activity_chains_for_assignment( + config, columns=cols_for_assignment_edu() + ) activity_chains = activity_chains[ activity_chains["TravDay"] == config.parameters.nts_day_of_week ] diff --git a/scripts/3.2.2_assign_primary_zone_work.py b/scripts/3.2.2_assign_primary_zone_work.py index 7b86e74..74af395 100644 --- a/scripts/3.2.2_assign_primary_zone_work.py +++ b/scripts/3.2.2_assign_primary_zone_work.py @@ -40,9 +40,7 @@ def get_interim_path(file_name: str) -> str: logger.info("Loading study area boundaries") - boundaries = gpd.read_file( - acbm.root_path / "data/external/boundaries/study_area_zones.geojson" - ) + boundaries = gpd.read_file(acbm.root_path / config.boundaries_filepath) logger.info("Study area boundaries loaded") @@ -53,7 +51,7 @@ def get_interim_path(file_name: str) -> str: osm_data_gdf = gpd.GeoDataFrame(osm_data_gdf, geometry="geometry", crs="EPSG:4326") # --- Activity chains - activity_chains = activity_chains_for_assignment(cols_for_assignment_work()) + activity_chains = activity_chains_for_assignment(config, cols_for_assignment_work()) activity_chains = add_locations_to_activity_chains(activity_chains) activity_chains = activity_chains[ activity_chains["TravDay"] == config.parameters.nts_day_of_week diff --git a/scripts/3.2.3_assign_secondary_zone.py b/scripts/3.2.3_assign_secondary_zone.py index 3ea392d..bc8d241 100644 --- a/scripts/3.2.3_assign_secondary_zone.py +++ b/scripts/3.2.3_assign_secondary_zone.py @@ -46,7 +46,7 @@ def get_interim_path(file_name: str) -> str: # --- Load in the data logger.info("Loading: activity chains") - activity_chains = activity_chains_for_assignment() + activity_chains = 
activity_chains_for_assignment(config) activity_chains = activity_chains[ activity_chains["TravDay"] == config.parameters.nts_day_of_week ] @@ -55,9 +55,7 @@ def get_interim_path(file_name: str) -> str: logger.info("Preprocessing: Adding OA21CD to the data") - boundaries = gpd.read_file( - acbm.root_path / "data/external/boundaries/study_area_zones.geojson" - ) + boundaries = gpd.read_file(acbm.root_path / config.boundaries_filepath) logger.info("Study area boundaries loaded") diff --git a/scripts/3.3_assign_facility_all.py b/scripts/3.3_assign_facility_all.py index ac3b31e..1db59eb 100644 --- a/scripts/3.3_assign_facility_all.py +++ b/scripts/3.3_assign_facility_all.py @@ -43,15 +43,13 @@ def main(config_file): logger.info("Loading facility data") osm_data_gdf = gpd.read_parquet( - acbm.root_path / f"data/interim/boundaries/{config.region}_epsg_4326.parquet" + acbm.root_path / config.osmox_path / (config.region + "_epsg_4326.parquet") ) # --- Load data: Boundaries logger.info("Loading study area boundaries") - boundaries = gpd.read_file( - acbm.root_path / "data/external/osmox/study_area_zones.geojson" - ) + boundaries = gpd.read_file(acbm.root_path / config.boundaries_filepath) logger.info("Study area boundaries loaded") diff --git a/src/acbm/assigning/utils.py b/src/acbm/assigning/utils.py index 0413171..f16ae1e 100644 --- a/src/acbm/assigning/utils.py +++ b/src/acbm/assigning/utils.py @@ -5,6 +5,7 @@ import pandas as pd import acbm +from acbm.config import Config def cols_for_assignment_all() -> list[str]: @@ -41,13 +42,18 @@ def cols_for_assignment_work() -> list[str]: return cols_for_assignment_edu() -def activity_chains_for_assignment(columns: list[str] | None = None) -> pd.DataFrame: +def activity_chains_for_assignment( + config: Config, columns: list[str] | None = None +) -> pd.DataFrame: """Gets activity chains with subset of columns required for assignment.""" if columns is None: columns = cols_for_assignment_all() return pd.read_parquet( - 
acbm.root_path / "data/interim/matching/spc_with_nts_trips.parquet", + acbm.root_path + / config.interim_path + / "matching" + / "spc_with_nts_trips.parquet", columns=columns, ) diff --git a/src/acbm/config.py b/src/acbm/config.py index f2be6c6..c287759 100644 --- a/src/acbm/config.py +++ b/src/acbm/config.py @@ -58,6 +58,22 @@ def id(self): ID_LENGTH = 10 return sha256(jcs.canonicalize(self.model_dump())).hexdigest()[:ID_LENGTH] + @property + def boundaries_filepath(self) -> str: + """Returns full processed path.""" + return ( + Path("data") + / "outputs" + / self.id + / "boundaries" + / "study_area_zones.geojson" + ) + + @property + def osmox_path(self) -> str: + """Returns full processed path.""" + return Path("data") / "outputs" / self.id / "osmox" + @property def output_path(self) -> str: """Returns full processed path.""" From ef6dfe3ebeab4bb03b918e3a4674bbce7bdd232e Mon Sep 17 00:00:00 2001 From: Sam Greenbury Date: Wed, 20 Nov 2024 11:50:00 +0000 Subject: [PATCH 08/56] Fix comment --- src/acbm/config.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/acbm/config.py b/src/acbm/config.py index c287759..ed38056 100644 --- a/src/acbm/config.py +++ b/src/acbm/config.py @@ -54,7 +54,7 @@ def id(self): See [popgetter](https://github.com/Urban-Analytics-Technology-Platform/popgetter/blob/7da293f4eb2d36480dbd137a27be623aa09449bf/python/popgetter/metadata.py#L83). 
""" - # Since the out paths are not too long, take first 10 chars + # Take first 10 chars to enable paths to remain not too long ID_LENGTH = 10 return sha256(jcs.canonicalize(self.model_dump())).hexdigest()[:ID_LENGTH] From b8413ca1175469d2a551ea6a8eb36f6eff3e832f Mon Sep 17 00:00:00 2001 From: Sam Greenbury Date: Wed, 20 Nov 2024 12:06:43 +0000 Subject: [PATCH 09/56] Remove flake8-pytest-style, add config test --- pyproject.toml | 1 - tests/test_config.py | 14 ++++++++++++++ 2 files changed, 14 insertions(+), 1 deletion(-) create mode 100644 tests/test_config.py diff --git a/pyproject.toml b/pyproject.toml index 119ebc3..e6b7775 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -94,7 +94,6 @@ select = [ "PGH", # pygrep-hooks "PIE", # flake8-pie "PL", # pylint - "PT", # flake8-pytest-style "RET", # flake8-return "RUF", # Ruff-specific "SIM", # flake8-simplify diff --git a/tests/test_config.py b/tests/test_config.py new file mode 100644 index 0000000..1ffc880 --- /dev/null +++ b/tests/test_config.py @@ -0,0 +1,14 @@ +import pytest + +from acbm.config import ( + load_config, +) + + +@pytest.fixture +def config(): + return load_config("config/base.toml") + + +def test_id(config): + assert config.id == "324e59cde5" From f8c4c6f009ffaab40c4e0d79b7d62e8b883183ce Mon Sep 17 00:00:00 2001 From: Sam Greenbury Date: Wed, 20 Nov 2024 16:07:15 +0000 Subject: [PATCH 10/56] Move code estimating travel time out of else --- scripts/3.1_assign_primary_feasible_zones.py | 28 +++++++++++--------- 1 file changed, 15 insertions(+), 13 deletions(-) diff --git a/scripts/3.1_assign_primary_feasible_zones.py b/scripts/3.1_assign_primary_feasible_zones.py index 3d78fdb..04c2e81 100644 --- a/scripts/3.1_assign_primary_feasible_zones.py +++ b/scripts/3.1_assign_primary_feasible_zones.py @@ -79,9 +79,21 @@ def get_interim_path(file_name: str) -> str: # are compared to the travel times of the individual's actual trips from the nts # (`tst`/`TripStart` and `tet`/`TripEnd`) - # TODO: move to 
config + logger.info("Creating estimated travel times matrix") + # Create a new travel time matrix based on distances between zones + travel_time_estimates = zones_to_time_matrix( + zones=boundaries, id_col=config.zone_id, time_units="m" + ) + logger.info("Travel time estimates created") + + # save travel_time_etstimates as parquet + travel_time_estimates.to_parquet( + acbm.root_path / config.interim_path / "travel_time_estimates.parquet" + ) + if config.parameters.travel_times: logger.info("Loading travel time matrix") + # TODO: move to config travel_time_matrix_path = ( acbm.root_path / "data/external/travel_times/oa/travel_time_matrix.parquet" ) @@ -96,18 +108,8 @@ def get_interim_path(file_name: str) -> str: ) raise e else: - # If travel_times is not true or loading failed, create a new travel time matrix - logger.info("No travel time matrix found. Creating a new travel time matrix.") - # Create a new travel time matrix based on distances between zones - travel_times = zones_to_time_matrix( - zones=boundaries, id_col=config.zone_id, time_units="m" - ) - logger.info("Travel time estimates created") - # save travel_times as parquet - - travel_times.to_parquet( - acbm.root_path / config.interim_path / "travel_time_estimates.parquet" - ) + # If travel_times is not true, set travel_times as travel_times_estimates + travel_times = travel_time_estimates # --- Intrazonal trip times # From 2bc00bf4a084fbb386eaa0d0ee42ab602f538411 Mon Sep 17 00:00:00 2001 From: Sam Greenbury Date: Wed, 20 Nov 2024 16:49:38 +0000 Subject: [PATCH 11/56] Refactor config paths --- src/acbm/config.py | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) diff --git a/src/acbm/config.py b/src/acbm/config.py index ed38056..b552b4c 100644 --- a/src/acbm/config.py +++ b/src/acbm/config.py @@ -61,18 +61,17 @@ def id(self): @property def boundaries_filepath(self) -> str: """Returns full processed path.""" - return ( - Path("data") - / "outputs" - / self.id - / "boundaries" - / 
"study_area_zones.geojson" - ) + return self.output_path / "boundaries" / "study_area_zones.geojson" @property def osmox_path(self) -> str: """Returns full processed path.""" - return Path("data") / "outputs" / self.id / "osmox" + return self.output_path / "osmox" + + @property + def validation_plots_path(self) -> str: + """Returns full processed path.""" + return self.output_path / "plots" / "validation" @property def output_path(self) -> str: @@ -82,7 +81,7 @@ def output_path(self) -> str: @property def interim_path(self) -> str: """Returns full processed path.""" - return Path("data") / "outputs" / self.id / "interim" + return self.output_path / "interim" @property def seed(self) -> int: From 24970ec902a9268698d6f3be583ef667ba2910de Mon Sep 17 00:00:00 2001 From: Sam Greenbury Date: Wed, 20 Nov 2024 16:50:06 +0000 Subject: [PATCH 12/56] Update validation paths --- scripts/4_validation.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/scripts/4_validation.py b/scripts/4_validation.py index 059b248..6950a99 100644 --- a/scripts/4_validation.py +++ b/scripts/4_validation.py @@ -24,8 +24,7 @@ def main(config_file): # ----- Folder for validation plots logger.info("1. 
Creating folder for validation plots") - - validation_plots_path = acbm.root_path / "data/processed/plots/validation" + validation_plots_path = config.validation_plots_path os.makedirs(validation_plots_path, exist_ok=True) # ----- Reading in the data @@ -40,9 +39,9 @@ def main(config_file): legs_nts = legs_nts[legs_nts["TravDay"] == config.parameters.nts_day_of_week] # Model outputs - legs_acbm = pd.read_csv(acbm.root_path / "data/processed/activities_pam/legs.csv") + legs_acbm = pd.read_csv(acbm.root_path / config.output_path / "legs.csv") legs_acbm_geo = pd.read_parquet( - acbm.root_path / "data/processed/activities_pam/legs_with_locations.parquet" + acbm.root_path / config.output_path / "legs_with_locations.parquet" ) # ----- Preproccessing the data From 83a3f89d80ecfe6e16c9b9a535c5baf47b36c4af Mon Sep 17 00:00:00 2001 From: Sam Greenbury Date: Wed, 20 Nov 2024 17:19:47 +0000 Subject: [PATCH 13/56] Update base config and add use_estimates bool --- config/base.toml | 17 +++++++++-------- scripts/3.2.1_assign_primary_zone_edu.py | 1 + src/acbm/utils.py | 4 ++-- 3 files changed, 12 insertions(+), 10 deletions(-) diff --git a/config/base.toml b/config/base.toml index 6999ade..09d9697 100644 --- a/config/base.toml +++ b/config/base.toml @@ -1,9 +1,9 @@ [parameters] seed = 0 -region = "leeds" # this is used to query poi data from osm and to load in SPC data -number_of_households = 5000 # how many people from the SPC do we want to run the model for? Comment out if you want to run the analysis on the entire SPC populaiton -zone_id = "OA21CD" # "OA21CD": OA level, "MSOA11CD": MSOA level -travel_times = true # Only set to true if you have travel time matrix at the level specified in boundary_geography +region = "leeds" # this is used to query poi data from osm and to load in SPC data +number_of_households = 5000 # how many people from the SPC do we want to run the model for? 
Comment out if you want to run the analysis on the entire SPC populaiton +zone_id = "OA21CD" # "OA21CD": OA level, "MSOA11CD": MSOA level +travel_times = true # Only set to true if you have travel time matrix at the level specified in boundary_geography boundary_geography = "OA" # NTS years to use nts_years = [2019, 2021, 2022] @@ -16,7 +16,8 @@ nts_regions = [ 'West Midlands', 'East of England', 'South East', - 'South West'] + 'South West', +] # nts day of the week to use # 1: Monday, 2: Tuesday, 3: Wednesday, 4: Thursday, 5: Friday, 6: Saturday, 7: Sunday nts_day_of_week = 3 @@ -35,12 +36,12 @@ optional_columns = [ n_matches = 10 # What is the maximum number of NTS matches we want for each SPC household? [work_assignment] -commute_level = "MSOA" -use_percentages = true # if true, optimization problem will try to minimize percentage difference at OD level (not absolute numbers). Recommended to set it to true +commute_level = "OA" +use_percentages = true # if true, optimization problem will try to minimize percentage difference at OD level (not absolute numbers). 
Recommended to set it to true # weights to add for each objective in the optimization problem weight_max_dev = 0.2 weight_total_dev = 0.8 -max_zones = 8 # maximum number of feasible zones to include in the optimization problem (less zones makes problem smaller - so faster, but at the cost of a better solution) +max_zones = 8 # maximum number of feasible zones to include in the optimization problem (less zones makes problem smaller - so faster, but at the cost of a better solution) [postprocessing] pam_jitter = 30 diff --git a/scripts/3.2.1_assign_primary_zone_edu.py b/scripts/3.2.1_assign_primary_zone_edu.py index 06dbbe3..a53155d 100644 --- a/scripts/3.2.1_assign_primary_zone_edu.py +++ b/scripts/3.2.1_assign_primary_zone_edu.py @@ -107,6 +107,7 @@ def get_interim_path(file_name: str) -> str: # --- travel time estimates logger.info("Loading travel time estimates") + # TODO: check whether should use_estimates=True travel_time_estimates = get_travel_times(config) #### ASSIGN TO ZONE FROM FEASIBLE ZONES #### diff --git a/src/acbm/utils.py b/src/acbm/utils.py index 30ff863..def55c7 100644 --- a/src/acbm/utils.py +++ b/src/acbm/utils.py @@ -43,8 +43,8 @@ def calculate_rmse(predictions, targets): return np.sqrt(mse) -def get_travel_times(config: Config) -> pd.DataFrame: - if config.parameters.travel_times: +def get_travel_times(config: Config, use_estimates: bool = False) -> pd.DataFrame: + if config.parameters.travel_times and not use_estimates: return pd.read_parquet( acbm.root_path / "data/external/travel_times/oa/travel_time_matrix.parquet" ) From 1f4fcb8f4706008d6fcd68eaf732cc739f68faf4 Mon Sep 17 00:00:00 2001 From: Sam Greenbury Date: Wed, 20 Nov 2024 18:21:06 +0000 Subject: [PATCH 14/56] Fix test --- tests/test_config.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_config.py b/tests/test_config.py index 1ffc880..d0afe71 100644 --- a/tests/test_config.py +++ b/tests/test_config.py @@ -11,4 +11,4 @@ def config(): def 
test_id(config): - assert config.id == "324e59cde5" + assert config.id == "01d8ded073" From fe5ee8812a70a1b9ba340f4aaf2bdfc9e3bcae93 Mon Sep 17 00:00:00 2001 From: Sam Greenbury Date: Tue, 26 Nov 2024 15:44:23 +0000 Subject: [PATCH 15/56] Fix doc comments --- src/acbm/config.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/acbm/config.py b/src/acbm/config.py index b552b4c..4d7c2a3 100644 --- a/src/acbm/config.py +++ b/src/acbm/config.py @@ -60,22 +60,22 @@ def id(self): @property def boundaries_filepath(self) -> str: - """Returns full processed path.""" + """Returns boundaries path.""" return self.output_path / "boundaries" / "study_area_zones.geojson" @property def osmox_path(self) -> str: - """Returns full processed path.""" + """Returns osmox path.""" return self.output_path / "osmox" @property def validation_plots_path(self) -> str: - """Returns full processed path.""" + """Returns validation plots path.""" return self.output_path / "plots" / "validation" @property def output_path(self) -> str: - """Returns full processed path.""" + """Returns output path.""" return Path("data") / "outputs" / self.id @property From 095d20e56e6a4349c5f8561592fa662a07e7337a Mon Sep 17 00:00:00 2001 From: Sam Greenbury Date: Wed, 27 Nov 2024 14:28:49 +0000 Subject: [PATCH 16/56] Refactor paths into config properties --- config/base.toml | 2 + scripts/0.1_run_osmox.py | 8 +- scripts/0_preprocess_inputs.py | 27 +-- scripts/1_prep_synthpop.py | 17 +- scripts/2_match_households_and_individuals.py | 52 ++--- scripts/3.1_assign_primary_feasible_zones.py | 38 ++-- scripts/3.2.1_assign_primary_zone_edu.py | 26 +-- scripts/3.2.2_assign_primary_zone_work.py | 37 +-- scripts/3.2.3_assign_secondary_zone.py | 25 +-- scripts/3.3_assign_facility_all.py | 17 +- scripts/4_validation.py | 14 +- src/acbm/assigning/utils.py | 6 +- src/acbm/config.py | 211 +++++++++++++++++- src/acbm/preprocessing.py | 10 +- src/acbm/utils.py | 9 +- 15 files changed, 287 insertions(+), 
212 deletions(-) diff --git a/config/base.toml b/config/base.toml index 09d9697..1a737cd 100644 --- a/config/base.toml +++ b/config/base.toml @@ -46,3 +46,5 @@ max_zones = 8 # maximum number of feasible zones to include in the opti [postprocessing] pam_jitter = 30 pam_min_duration = 10 + +[paths] diff --git a/scripts/0.1_run_osmox.py b/scripts/0.1_run_osmox.py index 4288acf..d6cf0b9 100644 --- a/scripts/0.1_run_osmox.py +++ b/scripts/0.1_run_osmox.py @@ -1,9 +1,7 @@ -import os import subprocess from pyrosm import get_data -import acbm from acbm.cli import acbm_cli from acbm.config import load_config @@ -12,13 +10,13 @@ def main(config_file): config = load_config(config_file) config.init_rng() - os.makedirs(acbm.root_path / config.osmox_path, exist_ok=True) - fp = get_data(config.region, directory=acbm.root_path / config.osmox_path) + fp = get_data(config.region, directory=config.osmox_path) subprocess.run( [ "osmox", "run", - acbm.root_path / "osmox/config_osmox.json", + # TODO: add to lib as string + config.root_path / "osmox/config_osmox.json", fp, config.osmox_path / config.region, "-f", diff --git a/scripts/0_preprocess_inputs.py b/scripts/0_preprocess_inputs.py index 492f169..1eaf13f 100644 --- a/scripts/0_preprocess_inputs.py +++ b/scripts/0_preprocess_inputs.py @@ -1,10 +1,7 @@ -import os - import geopandas as gpd import pandas as pd from uatk_spc import Reader -import acbm from acbm.cli import acbm_cli from acbm.config import load_config from acbm.logger_config import preprocessing_logger as logger @@ -14,15 +11,11 @@ @acbm_cli def main(config_file): config = load_config(config_file) - + config.make_dirs() # Write config to file - os.makedirs(config.output_path, exist_ok=True) config.write(config.output_path / "config.toml") config.init_rng() - region = config.region - # Pick a region with SPC output saved - spc_path = acbm.root_path / "data/external/spc_output/raw/" # ----- BOUNDARIES logger.info("Preprocessing Boundary Layer") @@ -31,9 +24,7 @@ def 
main(config_file): logger.info("1. Reading in the boundary layer for the whole of England") - boundaries = gpd.read_file( - acbm.root_path / "data/external/boundaries/oa_england.geojson" - ) + boundaries = gpd.read_file(config.boundaries_filepath) boundaries = boundaries.to_crs(epsg=4326) @@ -52,16 +43,13 @@ def main(config_file): logger.info("3. Filtering boundaries to specified study area") # Step 1: Get zones from SPC (these will be 2011 MSOAs) - spc = Reader(spc_path, region, backend="pandas") + spc = Reader(config.spc_raw_path, config.region, backend="pandas") zones_in_region = list(spc.info_per_msoa.keys()) # Step 2: Filter boundaries to identified zones # a) get MSOA11CD to MSOA21CD lookup - msoa_lookup = pd.read_csv( - acbm.root_path - / "data/external/MSOA_2011_MSOA_2021_Lookup_for_England_and_Wales.csv" - ) + msoa_lookup = pd.read_csv(config.lookup_filepath) # Filter msoa_lookup to include only rows where MSOA11CD is in zones_in_region msoa_lookup_filtered = msoa_lookup[msoa_lookup["MSOA11CD"].isin(zones_in_region)] # Extract the corresponding MSOA21CD values @@ -71,12 +59,9 @@ def main(config_file): boundaries_filtered = boundaries[boundaries["MSOA21CD"].isin(msoa21cd_values)] ## Save the output as parquet - logger.info( - f"4. Saving the boundaries to {acbm.root_path / 'data/external/boundaries/'} path" - ) - os.makedirs((acbm.root_path / config.boundaries_filepath).parents[0], exist_ok=True) + logger.info(f"4. 
Saving the boundaries to {config.study_areas_filepath} path") boundaries_filtered.to_file( - acbm.root_path / config.boundaries_filepath, + config.study_areas_filepath, driver="GeoJSON", ) diff --git a/scripts/1_prep_synthpop.py b/scripts/1_prep_synthpop.py index 14e97dd..8979d3d 100644 --- a/scripts/1_prep_synthpop.py +++ b/scripts/1_prep_synthpop.py @@ -1,8 +1,5 @@ -import os - from uatk_spc.builder import Builder -import acbm from acbm.cli import acbm_cli from acbm.config import load_config @@ -11,24 +8,20 @@ def main(config_file): config = load_config(config_file) config.init_rng() - region = config.region - - # Pick a region with SPC output saved - path = acbm.root_path / "data/external/spc_output/raw/" + config.make_dirs() # Add people and households spc_people_hh = ( - Builder(path, region, backend="pandas", input_type="parquet") + Builder( + config.spc_raw_path, config.region, backend="pandas", input_type="parquet" + ) .add_households() .unnest( ["health", "employment", "details", "demographics"], rsuffix="_household" ) .build() ) - os.makedirs(acbm.root_path / config.interim_path, exist_ok=True) - spc_people_hh.to_parquet( - acbm.root_path / config.interim_path / f"{region}_people_hh.parquet" - ) + spc_people_hh.to_parquet(config.spc_combined_filepath) if __name__ == "__main__": diff --git a/scripts/2_match_households_and_individuals.py b/scripts/2_match_households_and_individuals.py index cb4ea9e..518c517 100644 --- a/scripts/2_match_households_and_individuals.py +++ b/scripts/2_match_households_and_individuals.py @@ -1,4 +1,3 @@ -import os import pickle as pkl from pathlib import Path @@ -7,7 +6,6 @@ # from joblib import Parallel, delayed # from tqdm import trange -import acbm from acbm.assigning.utils import cols_for_assignment_all from acbm.cli import acbm_cli from acbm.config import load_config @@ -27,15 +25,14 @@ def main(config_file): config = load_config(config_file) config.init_rng() + config.make_dirs() pd.set_option("display.max_columns", None) 
def get_interim_path( file_name: str, - path: str | Path = acbm.root_path / config.interim_path / "matching", - ) -> str: - os.makedirs(path, exist_ok=True) - return f"{path}/{file_name}" + ) -> Path: + return config.interim_path / "matching" / file_name # ## Step 1: Load in the datasets @@ -44,9 +41,7 @@ def get_interim_path( logger.info("Loading SPC data") # Read in the spc data (parquet format) - spc = pd.read_parquet( - acbm.root_path / config.interim_path / f"{config.region}_people_hh.parquet" - ) + spc = pd.read_parquet(config.spc_combined_filepath) logger.info("Filtering SPC data to specific columns") # select columns @@ -107,20 +102,14 @@ def get_interim_path( # #### PSU logger.info("Loading NTS data: PSU table") - path_psu = ( - acbm.root_path / "data/external/nts/UKDA-5340-tab/tab/psu_eul_2002-2022.tab" - ) + path_psu = config.psu_filepath psu = pd.read_csv(path_psu, sep="\t") # #### Individuals logger.info("Loading NTS data: individuals table") - path_individuals = ( - acbm.root_path - / "data/external/nts/UKDA-5340-tab/tab/individual_eul_2002-2022.tab" - ) nts_individuals = pd.read_csv( - path_individuals, + config.nts_individuals_filepath, sep="\t", usecols=[ "IndividualID", @@ -159,12 +148,8 @@ def get_interim_path( # #### Households logger.info("Loading NTS data: household table") - path_households = ( - acbm.root_path - / "data/external/nts/UKDA-5340-tab/tab/household_eul_2002-2022.tab" - ) nts_households = pd.read_csv( - path_households, + config.nts_households_filepath, sep="\t", usecols=[ "HouseholdID", @@ -201,11 +186,8 @@ def get_interim_path( # #### Trips logger.info("Loading NTS data: trips table") - path_trips = ( - acbm.root_path / "data/external/nts/UKDA-5340-tab/tab/trip_eul_2002-2022.tab" - ) nts_trips = pd.read_csv( - path_trips, + config.nts_trips_filepath, sep="\t", usecols=[ "TripID", @@ -524,9 +506,7 @@ def get_interim_path( # We use the 2011 rural urban classification to match the SPC to the NTS. 
The NTS has 2 columns that we can use to match to the SPC: `Settlement2011EW_B03ID` and `Settlement2011EW_B04ID`. The `Settlement2011EW_B03ID` column is more general (urban / rural only), while the `Settlement2011EW_B04ID` column is more specific. We stick to the more general column for now. # read the rural urban classification data - rural_urban = pd.read_csv( - acbm.root_path / "data/external/census_2011_rural_urban.csv", sep="," - ) + rural_urban = pd.read_csv(config.rural_urban_filepath) # merge the rural_urban data with the spc spc_edited = spc_edited.merge( @@ -1130,18 +1110,12 @@ def get_interim_path( ) # save the file as a parquet file - spc_edited_copy.to_parquet(get_interim_path("spc_with_nts_trips.parquet")) + spc_edited_copy.to_parquet(config.spc_with_nts_trips_filepath) # save the nts data for later use in validation - nts_individuals.to_parquet( - acbm.root_path / "data/external/nts/filtered/nts_individuals.parquet" - ) - nts_households.to_parquet( - acbm.root_path / "data/external/nts/filtered/nts_households.parquet" - ) - nts_trips.to_parquet( - acbm.root_path / "data/external/nts/filtered/nts_trips.parquet" - ) + nts_individuals.to_parquet(config.output_path / "nts_individuals.parquet") + nts_households.to_parquet(config.output_path / "nts_households.parquet") + nts_trips.to_parquet(config.output_path / "nts_trips.parquet") if __name__ == "__main__": diff --git a/scripts/3.1_assign_primary_feasible_zones.py b/scripts/3.1_assign_primary_feasible_zones.py index 04c2e81..acca185 100644 --- a/scripts/3.1_assign_primary_feasible_zones.py +++ b/scripts/3.1_assign_primary_feasible_zones.py @@ -1,10 +1,8 @@ -import os import pickle as pkl import geopandas as gpd import pandas as pd -import acbm from acbm.assigning.feasible_zones_primary import get_possible_zones from acbm.assigning.utils import ( activity_chains_for_assignment, @@ -23,11 +21,7 @@ def main(config_file): config = load_config(config_file) config.init_rng() - - def get_interim_path(file_name: 
str) -> str: - path = acbm.root_path / config.interim_path / "assigning" - os.makedirs(path, exist_ok=True) - return f"{path}/{file_name}" + config.make_dirs() #### LOAD DATA #### @@ -46,14 +40,16 @@ def get_interim_path(file_name: str) -> str: logger.info("Loading study area boundaries") - boundaries = gpd.read_file(acbm.root_path / config.boundaries_filepath) + boundaries = gpd.read_file(config.study_areas_filepath) logger.info("Study area boundaries loaded") # --- Assign activity home locations to boundaries zoning system logger.info("Assigning activity home locations to boundaries zoning system") - activity_chains = add_locations_to_activity_chains(activity_chains) + activity_chains = add_locations_to_activity_chains( + activity_chains, centroid_layer=pd.read_csv(config.centroid_layer_filepath) + ) # Convert the DataFrame into a GeoDataFrame, and assign a coordinate reference system (CRS) activity_chains = gpd.GeoDataFrame(activity_chains, geometry="location") @@ -87,18 +83,12 @@ def get_interim_path(file_name: str) -> str: logger.info("Travel time estimates created") # save travel_time_etstimates as parquet - travel_time_estimates.to_parquet( - acbm.root_path / config.interim_path / "travel_time_estimates.parquet" - ) + travel_time_estimates.to_parquet(config.travel_times_estimates_filepath) if config.parameters.travel_times: logger.info("Loading travel time matrix") - # TODO: move to config - travel_time_matrix_path = ( - acbm.root_path / "data/external/travel_times/oa/travel_time_matrix.parquet" - ) try: - travel_times = pd.read_parquet(travel_time_matrix_path) + travel_times = pd.read_parquet(config.travel_times_filepath) print("Travel time matrix loaded successfully.") except Exception as e: logger.info( @@ -127,7 +117,7 @@ def get_interim_path(file_name: str) -> str: intrazone_times = intrazone_time(zones=boundaries, key_column=config.zone_id) # save intrazone_times to pickle - with open(get_interim_path("intrazone_times.pkl"), "wb") as f: + with 
open(config.interim_path / "assigning" / "intrazone_times.pkl", "wb") as f: pkl.dump(intrazone_times, f) logger.info("Intrazonal travel time estimates created") @@ -151,9 +141,7 @@ def get_interim_path(file_name: str) -> str: logger.info("Loading activity locations") # osm data - osm_data = gpd.read_parquet( - acbm.root_path / config.osmox_path / (config.region + "_epsg_4326.parquet") - ) + osm_data = gpd.read_parquet(config.osm_path) logger.info("Activity locations loaded") # remove rows with activities = home OR transit @@ -180,7 +168,7 @@ def get_interim_path(file_name: str) -> str: predicate="within", ) # save as pickle - osm_data_gdf.to_pickle(get_interim_path("osm_poi_with_zones.pkl")) + osm_data_gdf.to_pickle(config.osm_poi_with_zones) activities_per_zone = get_activities_per_zone( zones=boundaries, @@ -189,7 +177,7 @@ def get_interim_path(file_name: str) -> str: return_df=True, ) - activities_per_zone.to_parquet(get_interim_path("activities_per_zone.parquet")) + activities_per_zone.to_parquet(config.activities_per_zone) #### Get possible zones for each primary activity @@ -221,7 +209,7 @@ def get_interim_path(file_name: str) -> str: logger.info("Saving feasible zones for education activities") # save possible_zones_school to dictionary - with open(get_interim_path("possible_zones_education.pkl"), "wb") as f: + with open(config.possible_zones_education, "wb") as f: pkl.dump(possible_zones_school, f) del possible_zones_school @@ -247,7 +235,7 @@ def get_interim_path(file_name: str) -> str: logger.info("Saving feasible zones for work activities") # save possible_zones_work to dictionary - with open(get_interim_path("possible_zones_work.pkl"), "wb") as f: + with open(config.possible_zones_work, "wb") as f: pkl.dump(possible_zones_work, f) del possible_zones_work diff --git a/scripts/3.2.1_assign_primary_zone_edu.py b/scripts/3.2.1_assign_primary_zone_edu.py index a53155d..1185622 100644 --- a/scripts/3.2.1_assign_primary_zone_edu.py +++ 
b/scripts/3.2.1_assign_primary_zone_edu.py @@ -1,9 +1,6 @@ -import os - import geopandas as gpd import pandas as pd -import acbm from acbm.assigning.select_zone_primary import ( fill_missing_zones, select_zone, @@ -23,11 +20,6 @@ def main(config_file): config = load_config(config_file) - def get_interim_path(file_name: str) -> str: - path = acbm.root_path / config.interim_path / "assigning" - os.makedirs(path, exist_ok=True) - return f"{path}/{file_name}" - # TODO: consider if RNG seed needs to be distinct for different assignments config.init_rng() @@ -37,21 +29,19 @@ def get_interim_path(file_name: str) -> str: # --- Possible zones for each activity (calculated in 3.1_assign_possible_zones.py) logger.info("Loading possible zones for each activity") - possible_zones_school = pd.read_pickle( - get_interim_path("possible_zones_education.pkl") - ) + possible_zones_school = pd.read_pickle(config.possible_zones_education) # --- boundaries logger.info("Loading study area boundaries") - boundaries = gpd.read_file(acbm.root_path / config.boundaries_filepath) + boundaries = gpd.read_file(config.study_areas_filepath) logger.info("Study area boundaries loaded") # --- osm POI data logger.info("Loading OSM POI data") - osm_data_gdf = pd.read_pickle(get_interim_path("osm_poi_with_zones.pkl")) + osm_data_gdf = pd.read_pickle(config.osm_poi_with_zones) # Convert the DataFrame into a GeoDataFrame, and assign a coordinate reference system (CRS) logger.info("Converting OSM POI data to GeoDataFrame") @@ -73,9 +63,7 @@ def get_interim_path(file_name: str) -> str: logger.info("Assigning activity home locations to boundaries zoning system") # Convert location column in activity_chains to spatial column - centroid_layer = pd.read_csv( - acbm.root_path / "data/external/centroids/Output_Areas_Dec_2011_PWC_2022.csv" - ) + centroid_layer = pd.read_csv(config.centroid_layer_filepath) activity_chains_edu = add_location( activity_chains_edu, "EPSG:27700", @@ -101,9 +89,7 @@ def 
get_interim_path(file_name: str) -> str: # --- activities per zone logger.info("Loading activities per zone") - activities_per_zone = pd.read_parquet( - get_interim_path("activities_per_zone.parquet") - ) + activities_per_zone = pd.read_parquet(config.activities_per_zone) # --- travel time estimates logger.info("Loading travel time estimates") @@ -169,7 +155,7 @@ def get_interim_path(file_name: str) -> str: logger.info("Saving activity chains with assigned zones") - activity_chains_edu.to_pickle(get_interim_path("activity_chains_education.pkl")) + activity_chains_edu.to_pickle(config.activity_chains_education) if __name__ == "__main__": diff --git a/scripts/3.2.2_assign_primary_zone_work.py b/scripts/3.2.2_assign_primary_zone_work.py index 74af395..d468fba 100644 --- a/scripts/3.2.2_assign_primary_zone_work.py +++ b/scripts/3.2.2_assign_primary_zone_work.py @@ -1,9 +1,6 @@ -import os - import geopandas as gpd import pandas as pd -import acbm from acbm.assigning.plots import ( plot_workzone_assignment_heatmap, plot_workzone_assignment_line, @@ -26,33 +23,30 @@ def main(config_file): config = load_config(config_file) config.init_rng() - def get_interim_path(file_name: str) -> str: - path = acbm.root_path / config.interim_path / "assigning" - os.makedirs(path, exist_ok=True) - return f"{path}/{file_name}" - #### LOAD DATA #### # --- Possible zones for each activity (calculated in 3.1_assign_possible_zones.py) - possible_zones_work = pd.read_pickle(get_interim_path("possible_zones_work.pkl")) + possible_zones_work = pd.read_pickle(config.possible_zones_work) # --- boundaries logger.info("Loading study area boundaries") - boundaries = gpd.read_file(acbm.root_path / config.boundaries_filepath) + boundaries = gpd.read_file(config.study_areas_filepath) logger.info("Study area boundaries loaded") # osm POI data - osm_data_gdf = pd.read_pickle(get_interim_path("osm_poi_with_zones.pkl")) + osm_data_gdf = pd.read_pickle(config.osm_poi_with_zones) # Convert the DataFrame into a 
GeoDataFrame, and assign a coordinate reference system (CRS) osm_data_gdf = gpd.GeoDataFrame(osm_data_gdf, geometry="geometry", crs="EPSG:4326") # --- Activity chains activity_chains = activity_chains_for_assignment(config, cols_for_assignment_work()) - activity_chains = add_locations_to_activity_chains(activity_chains) + activity_chains = add_locations_to_activity_chains( + activity_chains, centroid_layer=pd.read_csv(config.centroid_layer_filepath) + ) activity_chains = activity_chains[ activity_chains["TravDay"] == config.parameters.nts_day_of_week ] @@ -76,9 +70,7 @@ def get_interim_path(file_name: str) -> str: if commute_level == "MSOA": print("Step 1: Reading in the zipped csv file") - travel_demand = pd.read_csv( - acbm.root_path / "data/external/ODWP15EW_MSOA_v1.zip" - ) + travel_demand = pd.read_csv(config.travel_demand_filepath) print("Step 2: Creating commute_mode_dict") commute_mode_dict = { @@ -139,7 +131,7 @@ def get_interim_path(file_name: str) -> str: elif commute_level == "OA": print("Step 1: Reading in the zipped csv file") - travel_demand = pd.read_csv(acbm.root_path / "data/external/ODWP01EW_OA.zip") + travel_demand = pd.read_csv(config.travel_demand_filepath) print("Step 2: Filtering rows and dropping unnecessary columns") travel_demand_clipped = travel_demand[ @@ -277,12 +269,8 @@ def get_interim_path(file_name: str) -> str: )["demand_assigned"].transform(lambda x: (x / x.sum()) * 100) ) - # Define the output file path - os.makedirs(acbm.root_path / config.output_path, exist_ok=True) - output_file_path = acbm.root_path / config.output_path / "workzone_rmse_results.txt" - # Open the file in write mode - with open(output_file_path, "w") as file: + with open(config.workzone_rmse_results_path, "w") as file: # (1) RMSE for % of Total Demand predictions = workzone_assignment_opt["pct_of_total_demand_assigned"] targets = workzone_assignment_opt["pct_of_total_demand_actual"] @@ -316,7 +304,7 @@ def get_interim_path(file_name: str) -> str: n=10, 
selection_type="top", sort_by="actual", - save_dir=acbm.root_path / "data/processed/plots/assigning/", + save_dir=config.assigning_plots_path, ) # Plot the demand_actual and demand_assigned values as a heatmap for n origin_zones. @@ -325,12 +313,11 @@ def get_interim_path(file_name: str) -> str: n=20, selection_type="top", sort_by="assigned", - save_dir=acbm.root_path / "data/processed/plots/assigning/", + save_dir=config.assigning_plots_path, ) # save the activity chains as a pickle - - activity_chains_work.to_pickle(get_interim_path("activity_chains_work.pkl")) + activity_chains_work.to_pickle(config.activity_chains_work) if __name__ == "__main__": diff --git a/scripts/3.2.3_assign_secondary_zone.py b/scripts/3.2.3_assign_secondary_zone.py index bc8d241..4ad6231 100644 --- a/scripts/3.2.3_assign_secondary_zone.py +++ b/scripts/3.2.3_assign_secondary_zone.py @@ -7,8 +7,6 @@ - For more info on the spacetime approach for secondary locaiton assignment, see https://www.tandfonline.com/doi/full/10.1080/23249935.2021.1982068 """ -import os - import geopandas as gpd import numpy as np import pandas as pd @@ -16,7 +14,6 @@ from pam.planner.od import ODFactory, ODMatrix from pam.read import load_travel_diary -import acbm from acbm.assigning.select_zone_secondary import ( create_od_matrices, shift_and_fill_column, @@ -38,11 +35,6 @@ def main(config_file): config.init_rng() zone_id = config.zone_id - def get_interim_path(file_name: str) -> str: - path = acbm.root_path / config.interim_path / "assigning" - os.makedirs(path, exist_ok=True) - return f"{path}/{file_name}" - # --- Load in the data logger.info("Loading: activity chains") @@ -55,16 +47,14 @@ def get_interim_path(file_name: str) -> str: logger.info("Preprocessing: Adding OA21CD to the data") - boundaries = gpd.read_file(acbm.root_path / config.boundaries_filepath) + boundaries = gpd.read_file(config.study_areas_filepath) logger.info("Study area boundaries loaded") # --- Assign activity home locations to boundaries 
zoning system # Convert location column in activity_chains to spatial column - centroid_layer = pd.read_csv( - acbm.root_path / "data/external/centroids/Output_Areas_Dec_2011_PWC_2022.csv" - ) + centroid_layer = pd.read_csv(config.centroid_layer_filepath) activity_chains = add_location( activity_chains, "EPSG:27700", "EPSG:4326", centroid_layer, "OA11CD", "OA11CD" ) @@ -107,11 +97,12 @@ def merge_columns_from_other(df: pd.DataFrame, other: pd.DataFrame) -> pd.DataFr ) activity_chains_edu = merge_columns_from_other( - pd.read_pickle(get_interim_path("activity_chains_education.pkl")), + pd.read_pickle(config.activity_chains_education), activity_chains, ) activity_chains_work = merge_columns_from_other( - pd.read_pickle(get_interim_path("activity_chains_work.pkl")), + # TODO: update with config path + pd.read_pickle(config.activity_chains_work), activity_chains, ) @@ -342,9 +333,7 @@ def merge_columns_from_other(df: pd.DataFrame, other: pd.DataFrame) -> pd.DataFr # --- Calculate OD probabilities (probabilities of choosing a destination zone for an activity, given the origin zone) logger.info("Analysis (matrices): Step 3 - Calculating OD probabilities") - activities_per_zone = pd.read_parquet( - get_interim_path("activities_per_zone.parquet") - ) + activities_per_zone = pd.read_parquet(config.activities_per_zone) # keep only rows that don't match primary activities activities_per_zone = activities_per_zone[ @@ -443,7 +432,7 @@ def merge_columns_from_other(df: pd.DataFrame, other: pd.DataFrame) -> pd.DataFr # --- Save logger.info("Saving: Step 7 - Saving population") - write.to_csv(population, dir=acbm.root_path / config.output_path) + write.to_csv(population, dir=config.output_path) if __name__ == "__main__": diff --git a/scripts/3.3_assign_facility_all.py b/scripts/3.3_assign_facility_all.py index 1db59eb..89e7b3d 100644 --- a/scripts/3.3_assign_facility_all.py +++ b/scripts/3.3_assign_facility_all.py @@ -2,7 +2,6 @@ import pandas as pd from libpysal.weights import 
Queen -import acbm from acbm.assigning.plots import plot_desire_lines, plot_scatter_actual_reported from acbm.assigning.select_facility import map_activity_locations, select_facility from acbm.cli import acbm_cli @@ -42,14 +41,12 @@ def main(config_file): # --- Load data: POI locations logger.info("Loading facility data") - osm_data_gdf = gpd.read_parquet( - acbm.root_path / config.osmox_path / (config.region + "_epsg_4326.parquet") - ) + osm_data_gdf = gpd.read_parquet(config.osm_path) # --- Load data: Boundaries logger.info("Loading study area boundaries") - boundaries = gpd.read_file(acbm.root_path / config.boundaries_filepath) + boundaries = gpd.read_file(config.study_areas_filepath) logger.info("Study area boundaries loaded") @@ -135,7 +132,7 @@ def main(config_file): logger.info("a. Adding eduction type as fallback") # load in activity chains spc_with_nts = pd.read_parquet( - acbm.root_path / config.interim_path / "matching/spc_with_nts_trips.parquet", + config.spc_with_nts_trips_filepath, columns=["id", "education_type", "seq", "TripTotalTime", "TripDisIncSW"], ) # we get one row per id @@ -282,7 +279,7 @@ def main(config_file): lambda point: point if pd.isna(point) else point.wkt ) activity_chains_all.drop(columns=geom_cols).to_parquet( - acbm.root_path / config.output_path / "legs_with_locations.parquet" + config.output_path / "legs_with_locations.parquet" ) # --- Plots @@ -314,7 +311,7 @@ def main(config_file): x_label="Reported Travel Distance (km)", y_label="Actual Distance - Euclidian (km)", title_prefix=f"Scatter plot of TripDisIncSW vs. Length for {activity_type}", - save_dir=acbm.root_path / config.output_path / "plots/assigning/", + save_dir=config.output_path / "plots/assigning/", ) # Plot 2: Euclidian travel distance vs reported (NTS) travel TIME @@ -337,7 +334,7 @@ def main(config_file): x_label="Reported Travel TIme (min)", y_label="Actual Distance - Euclidian (km)", title_prefix="Scatter plot of TripTotalTime vs. 
Length", - save_dir=acbm.root_path / config.output_path / "plots/assigning/", + save_dir=config.output_path / "plots/assigning/", ) # .... @@ -353,7 +350,7 @@ def main(config_file): bin_size=5000, boundaries=boundaries, sample_size=1000, - save_dir=acbm.root_path / config.output_path / "plots/assigning/", + save_dir=config.output_path / "plots/assigning/", ) diff --git a/scripts/4_validation.py b/scripts/4_validation.py index 6950a99..4e4ea65 100644 --- a/scripts/4_validation.py +++ b/scripts/4_validation.py @@ -1,10 +1,7 @@ -import os - import matplotlib.pyplot as plt import pandas as pd import seaborn as sns -import acbm from acbm.cli import acbm_cli from acbm.config import load_config from acbm.logger_config import validation_logger as logger @@ -25,24 +22,19 @@ def main(config_file): logger.info("1. Creating folder for validation plots") validation_plots_path = config.validation_plots_path - os.makedirs(validation_plots_path, exist_ok=True) # ----- Reading in the data logger.info("2. 
Reading in the data") # NTS data - legs_nts = pd.read_parquet( - acbm.root_path / "data/external/nts/filtered/nts_trips.parquet" - ) + legs_nts = pd.read_parquet(config.output_path / "nts_trips.parquet") legs_nts = legs_nts[legs_nts["TravDay"] == config.parameters.nts_day_of_week] # Model outputs - legs_acbm = pd.read_csv(acbm.root_path / config.output_path / "legs.csv") - legs_acbm_geo = pd.read_parquet( - acbm.root_path / config.output_path / "legs_with_locations.parquet" - ) + legs_acbm = pd.read_csv(config.output_path / "legs.csv") + legs_acbm_geo = pd.read_parquet(config.output_path / "legs_with_locations.parquet") # ----- Preproccessing the data diff --git a/src/acbm/assigning/utils.py b/src/acbm/assigning/utils.py index f16ae1e..a9ebea4 100644 --- a/src/acbm/assigning/utils.py +++ b/src/acbm/assigning/utils.py @@ -4,7 +4,6 @@ import numpy as np import pandas as pd -import acbm from acbm.config import Config @@ -50,10 +49,7 @@ def activity_chains_for_assignment( columns = cols_for_assignment_all() return pd.read_parquet( - acbm.root_path - / config.interim_path - / "matching" - / "spc_with_nts_trips.parquet", + config.spc_with_nts_trips_filepath, columns=columns, ) diff --git a/src/acbm/config.py b/src/acbm/config.py index 4d7c2a3..9864dcd 100644 --- a/src/acbm/config.py +++ b/src/acbm/config.py @@ -1,3 +1,4 @@ +import os import random from dataclasses import dataclass from hashlib import sha256 @@ -7,7 +8,9 @@ import jcs import numpy as np import tomlkit -from pydantic import BaseModel, Field +from pydantic import BaseModel, Field, field_serializer + +import acbm @dataclass(frozen=True) @@ -40,12 +43,49 @@ class WorkAssignmentParams(BaseModel): commute_level: str | None = None +@dataclass(frozen=True) +class PathParams(BaseModel): + root_path: Path | None = acbm.root_path + output_path: Path | None = None + osm_path: Path | None = None + study_areas_filepath: Path | None = None + + @field_serializer( + "root_path", + "output_path", + "osm_path", + 
"study_areas_filepath", + check_fields=False, + ) + def serialize_filepath(self, filepath: Path | str | None) -> str | None: + return None if filepath is None else str(filepath.relative_to(self.root_path)) + + class Config(BaseModel): parameters: Parameters = Field(description="Config: parameters.") work_assignment: WorkAssignmentParams = Field( description="Config: parameters for work assignment." ) matching: MatchingParams = Field(description="Config: parameters for matching.") + paths: PathParams = Field(description="Path overrides.") + + def make_dirs(self): + """Makes all directories requried from config""" + os.makedirs(self.output_path, exist_ok=True) + os.makedirs(self.assigning_plots_path, exist_ok=True) + os.makedirs(self.validation_plots_path, exist_ok=True) + os.makedirs(self.activities_per_zone.parent, exist_ok=True) + os.makedirs(self.study_areas_filepath.parent, exist_ok=True) + os.makedirs(self.interim_path, exist_ok=True) + os.makedirs(self.travel_times_estimates_filepath.parent, exist_ok=True) + os.makedirs(self.spc_combined_filepath.parent, exist_ok=True) + os.makedirs(self.spc_with_nts_trips_filepath.parent, exist_ok=True) + os.makedirs(self.osmox_path, exist_ok=True) + os.makedirs(self.osm_path.parent, exist_ok=True) + + @property + def root_path(self) -> Path: + return acbm.root_path if self.paths.root_path is None else self.paths.root_path @property def id(self): @@ -59,14 +99,89 @@ def id(self): return sha256(jcs.canonicalize(self.model_dump())).hexdigest()[:ID_LENGTH] @property - def boundaries_filepath(self) -> str: + def boundaries_filepath(self) -> Path: + """Returns boundaries path.""" + return self.external_path / "boundaries" / "oa_england.geojson" + + @property + def lookup_filepath(self) -> Path: + return ( + self.external_path / "MSOA_2011_MSOA_2021_Lookup_for_England_and_Wales.csv" + ) + + @property + def spc_raw_path(self) -> Path: + return self.root_path / "data" / "external" / "spc_output" / "raw" + + @property + def 
spc_combined_filepath(self) -> Path: + return self.interim_path / f"{self.region}_people_hh.parquet" + + @property + def study_areas_filepath(self) -> Path: """Returns boundaries path.""" - return self.output_path / "boundaries" / "study_area_zones.geojson" + return ( + self.output_path / "boundaries" / "study_area_zones.geojson" + if self.paths.study_areas_filepath is None + else self.paths.study_areas_filepath + ) + + @property + def workzone_rmse_results_path(self) -> Path: + return self.root_path / self.output_path / "workzone_rmse_results.txt" @property - def osmox_path(self) -> str: + def osmox_path(self) -> Path: """Returns osmox path.""" - return self.output_path / "osmox" + return self.root_path / self.output_path / "osmox" + + @property + def osm_path(self) -> Path: + """Returns osm path.""" + return ( + self.root_path / self.osmox_path / (self.region + "_epsg_4326.parquet") + if self.paths.osm_path is None + else self.paths.osm_path + ) + + @property + def activities_per_zone(self) -> Path: + """Returns activities per zone filepath.""" + return self.interim_path / "assigning" / "activities_per_zone.parquet" + + @property + def possible_zones_education(self) -> Path: + """Returns possible zones for education filepath.""" + return self.interim_path / "assigning" / "possible_zones_education.pkl" + + @property + def possible_zones_work(self) -> Path: + """Returns possible zones for work filepath.""" + return self.interim_path / "assigning" / "possible_zones_work.pkl" + + @property + def osm_poi_with_zones(self) -> Path: + """Returns OSM POI with zones filepath.""" + return self.interim_path / "assigning" / "osm_poi_with_zones.pkl" + + @property + def activity_chains_education(self) -> Path: + """Returns activity chains (education) filepath.""" + return self.interim_path / "assigning" / "activity_chains_education.pkl" + + @property + def activity_chains_work(self) -> Path: + """Returns activity chains (work) filepath.""" + return self.interim_path / 
"assigning" / "activity_chains_work.pkl" + + @property + def centroids(self) -> Path: + return self.external_path / "centroids" / "Output_Areas_Dec_2011_PWC_2022.csv" + + @property + def assigning_plots_path(self) -> str: + """Returns assigning plots path.""" + return self.output_path / "plots" / "assigning" @property def validation_plots_path(self) -> str: @@ -76,13 +191,95 @@ def validation_plots_path(self) -> str: @property def output_path(self) -> str: """Returns output path.""" - return Path("data") / "outputs" / self.id + return ( + self.root_path / "data" / "outputs" / self.id + if self.paths.output_path is None + else self.paths.output_path + ) + + @property + def external_path(self) -> str: + """Returns external data path.""" + return self.root_path / "data" / "external" @property def interim_path(self) -> str: - """Returns full processed path.""" + """Returns interim data path.""" return self.output_path / "interim" + @property + def psu_filepath(self) -> Path: + return ( + self.external_path + / "nts" + / "UKDA-5340-tab" + / "tab" + / "psu_eul_2002-2022.tab" + ) + + @property + def nts_individuals_filepath(self) -> Path: + return ( + self.external_path + / "nts" + / "UKDA-5340-tab" + / "tab" + / "individual_eul_2002-2022.tab" + ) + + @property + def nts_households_filepath(self) -> Path: + return ( + self.external_path + / "nts" + / "UKDA-5340-tab" + / "tab" + / "household_eul_2002-2022.tab" + ) + + @property + def nts_trips_filepath(self) -> Path: + return ( + self.external_path + / "nts" + / "UKDA-5340-tab" + / "tab" + / "trip_eul_2002-2022.tab" + ) + + @property + def rural_urban_filepath(self) -> Path: + return self.external_path / "census_2011_rural_urban.csv" + + @property + def centroid_layer_filepath(self) -> Path: + return self.external_path / "centroids" / "Output_Areas_Dec_2011_PWC_2022.csv" + + @property + def travel_demand_filepath(self) -> Path: + if self.work_assignment.commute_level == "msoa": + return self.external_path / 
"ODWP15EW_MSOA_v1.zip" + return self.external_path / "ODWP01EW_OA.zip" + + @property + def travel_times_filepath(self) -> Path: + if self.work_assignment.commute_level == "msoa": + return ( + self.external_path + / "travel_times" + / "msoa" + / "travel_time_matrix.parquet" + ) + return self.external_path / "travel_times" / "oa" / "travel_time_matrix.parquet" + + @property + def travel_times_estimates_filepath(self) -> Path: + return self.interim_path / "assigning" / "travel_time_estimates.parquet" + + @property + def spc_with_nts_trips_filepath(self) -> Path: + return Path(self.interim_path / "matching" / "spc_with_nts_trips.parquet") + @property def seed(self) -> int: return self.parameters.seed diff --git a/src/acbm/preprocessing.py b/src/acbm/preprocessing.py index b5f3c5c..7c14286 100644 --- a/src/acbm/preprocessing.py +++ b/src/acbm/preprocessing.py @@ -6,8 +6,6 @@ from pyproj import Transformer from shapely.geometry import MultiPolygon, Point -import acbm - # ----- PREPROCESSING BOUNDARIES @@ -388,11 +386,9 @@ def get_new_coords(loc): return gpd.GeoDataFrame(merged_df, geometry="location", crs=target_crs) -def add_locations_to_activity_chains(activity_chains: pd.DataFrame) -> pd.DataFrame: - # Add location column as spatial column from OA centroids - centroid_layer = pd.read_csv( - acbm.root_path / "data/external/centroids/Output_Areas_Dec_2011_PWC_2022.csv" - ) +def add_locations_to_activity_chains( + activity_chains: pd.DataFrame, centroid_layer: pd.DataFrame +) -> pd.DataFrame: return add_location( activity_chains, "EPSG:27700", "EPSG:4326", centroid_layer, "OA11CD", "OA11CD" ) diff --git a/src/acbm/utils.py b/src/acbm/utils.py index def55c7..e9e95ed 100644 --- a/src/acbm/utils.py +++ b/src/acbm/utils.py @@ -4,7 +4,6 @@ import pandas as pd from sklearn.metrics import mean_squared_error -import acbm from acbm.config import Config @@ -45,9 +44,5 @@ def calculate_rmse(predictions, targets): def get_travel_times(config: Config, use_estimates: bool = False) -> 
pd.DataFrame: if config.parameters.travel_times and not use_estimates: - return pd.read_parquet( - acbm.root_path / "data/external/travel_times/oa/travel_time_matrix.parquet" - ) - return pd.read_parquet( - acbm.root_path / config.interim_path / "travel_time_estimates.parquet" - ) + return pd.read_parquet(config.travel_times_filepath) + return pd.read_parquet(config.travel_times_estimates_filepath) From f1279c53fa004b830e94007d4c50637220d6d9fb Mon Sep 17 00:00:00 2001 From: Sam Greenbury Date: Thu, 28 Nov 2024 11:14:11 +0000 Subject: [PATCH 17/56] Refactor logs to use config --- scripts/0.1_run_osmox.py | 8 ++-- scripts/0_preprocess_inputs.py | 11 ++---- scripts/1_prep_synthpop.py | 11 +++--- scripts/2_match_households_and_individuals.py | 10 ++--- scripts/3.1_assign_primary_feasible_zones.py | 8 ++-- scripts/3.2.1_assign_primary_zone_edu.py | 6 +-- scripts/3.2.2_assign_primary_zone_work.py | 7 ++-- scripts/3.2.3_assign_secondary_zone.py | 8 ++-- scripts/3.3_assign_facility_all.py | 7 ++-- scripts/4_validation.py | 7 ++-- src/acbm/config.py | 23 ++++++++++++ src/acbm/logger_config.py | 37 +++++++++---------- 12 files changed, 77 insertions(+), 66 deletions(-) diff --git a/scripts/0.1_run_osmox.py b/scripts/0.1_run_osmox.py index d6cf0b9..aa978ad 100644 --- a/scripts/0.1_run_osmox.py +++ b/scripts/0.1_run_osmox.py @@ -3,14 +3,16 @@ from pyrosm import get_data from acbm.cli import acbm_cli -from acbm.config import load_config +from acbm.config import load_and_setup_config @acbm_cli def main(config_file): - config = load_config(config_file) - config.init_rng() + config = load_and_setup_config(config_file) + logger = config.get_logger("preprocessing", __file__) + logger.info("Getting OSM data") fp = get_data(config.region, directory=config.osmox_path) + logger.info("Running osmox") subprocess.run( [ "osmox", diff --git a/scripts/0_preprocess_inputs.py b/scripts/0_preprocess_inputs.py index 1eaf13f..966e898 100644 --- a/scripts/0_preprocess_inputs.py +++ 
b/scripts/0_preprocess_inputs.py @@ -3,19 +3,14 @@ from uatk_spc import Reader from acbm.cli import acbm_cli -from acbm.config import load_config -from acbm.logger_config import preprocessing_logger as logger +from acbm.config import load_and_setup_config from acbm.preprocessing import edit_boundary_resolution @acbm_cli def main(config_file): - config = load_config(config_file) - config.make_dirs() - # Write config to file - config.write(config.output_path / "config.toml") - - config.init_rng() + config = load_and_setup_config(config_file) + logger = config.get_logger("preprocessing", __file__) # ----- BOUNDARIES logger.info("Preprocessing Boundary Layer") diff --git a/scripts/1_prep_synthpop.py b/scripts/1_prep_synthpop.py index 8979d3d..d9ed6f1 100644 --- a/scripts/1_prep_synthpop.py +++ b/scripts/1_prep_synthpop.py @@ -1,16 +1,15 @@ from uatk_spc.builder import Builder from acbm.cli import acbm_cli -from acbm.config import load_config +from acbm.config import load_and_setup_config @acbm_cli def main(config_file): - config = load_config(config_file) - config.init_rng() - config.make_dirs() + config = load_and_setup_config(config_file) + logger = config.get_logger("preprocessing", __file__) - # Add people and households + logger.info("Combine SPC people and houeshold data") spc_people_hh = ( Builder( config.spc_raw_path, config.region, backend="pandas", input_type="parquet" @@ -21,6 +20,8 @@ def main(config_file): ) .build() ) + + logger.info(f"Write combined SPC data to: {config.spc_combined_filepath}") spc_people_hh.to_parquet(config.spc_combined_filepath) diff --git a/scripts/2_match_households_and_individuals.py b/scripts/2_match_households_and_individuals.py index 518c517..5301ad2 100644 --- a/scripts/2_match_households_and_individuals.py +++ b/scripts/2_match_households_and_individuals.py @@ -4,12 +4,9 @@ import numpy as np import pandas as pd -# from joblib import Parallel, delayed -# from tqdm import trange from acbm.assigning.utils import 
cols_for_assignment_all from acbm.cli import acbm_cli -from acbm.config import load_config -from acbm.logger_config import matching_logger as logger +from acbm.config import load_and_setup_config from acbm.matching import MatcherExact, match_individuals from acbm.preprocessing import ( count_per_group, @@ -23,9 +20,8 @@ @acbm_cli def main(config_file): - config = load_config(config_file) - config.init_rng() - config.make_dirs() + config = load_and_setup_config(config_file) + logger = config.get_logger("matching", __file__) pd.set_option("display.max_columns", None) diff --git a/scripts/3.1_assign_primary_feasible_zones.py b/scripts/3.1_assign_primary_feasible_zones.py index acca185..2d30bf4 100644 --- a/scripts/3.1_assign_primary_feasible_zones.py +++ b/scripts/3.1_assign_primary_feasible_zones.py @@ -12,16 +12,14 @@ zones_to_time_matrix, ) from acbm.cli import acbm_cli -from acbm.config import load_config -from acbm.logger_config import assigning_primary_feasible_logger as logger +from acbm.config import load_and_setup_config from acbm.preprocessing import add_locations_to_activity_chains @acbm_cli def main(config_file): - config = load_config(config_file) - config.init_rng() - config.make_dirs() + config = load_and_setup_config(config_file) + logger = config.get_logger("assigning_primary_feasible", __file__) #### LOAD DATA #### diff --git a/scripts/3.2.1_assign_primary_zone_edu.py b/scripts/3.2.1_assign_primary_zone_edu.py index 1185622..dacef02 100644 --- a/scripts/3.2.1_assign_primary_zone_edu.py +++ b/scripts/3.2.1_assign_primary_zone_edu.py @@ -10,15 +10,15 @@ cols_for_assignment_edu, ) from acbm.cli import acbm_cli -from acbm.config import load_config -from acbm.logger_config import assigning_primary_zones_logger as logger +from acbm.config import load_and_setup_config from acbm.preprocessing import add_location from acbm.utils import get_travel_times @acbm_cli def main(config_file): - config = load_config(config_file) + config = 
load_and_setup_config(config_file) + logger = config.get_logger("assigning_primary_zone", __file__) # TODO: consider if RNG seed needs to be distinct for different assignments config.init_rng() diff --git a/scripts/3.2.2_assign_primary_zone_work.py b/scripts/3.2.2_assign_primary_zone_work.py index d468fba..bb833f0 100644 --- a/scripts/3.2.2_assign_primary_zone_work.py +++ b/scripts/3.2.2_assign_primary_zone_work.py @@ -12,16 +12,15 @@ filter_matrix_to_boundary, ) from acbm.cli import acbm_cli -from acbm.config import load_config -from acbm.logger_config import assigning_primary_zones_logger as logger +from acbm.config import load_and_setup_config from acbm.preprocessing import add_locations_to_activity_chains from acbm.utils import calculate_rmse @acbm_cli def main(config_file): - config = load_config(config_file) - config.init_rng() + config = load_and_setup_config(config_file) + logger = config.get_logger("assigning_primary_zone", __file__) #### LOAD DATA #### diff --git a/scripts/3.2.3_assign_secondary_zone.py b/scripts/3.2.3_assign_secondary_zone.py index 4ad6231..ed0084d 100644 --- a/scripts/3.2.3_assign_secondary_zone.py +++ b/scripts/3.2.3_assign_secondary_zone.py @@ -23,16 +23,16 @@ activity_chains_for_assignment, ) from acbm.cli import acbm_cli -from acbm.config import load_config -from acbm.logger_config import assigning_secondary_zones_logger as logger +from acbm.config import load_and_setup_config from acbm.preprocessing import add_location from acbm.utils import get_travel_times @acbm_cli def main(config_file): - config = load_config(config_file) - config.init_rng() + config = load_and_setup_config(config_file) + logger = config.get_logger("assigning_secondary_zone", __file__) + zone_id = config.zone_id # --- Load in the data diff --git a/scripts/3.3_assign_facility_all.py b/scripts/3.3_assign_facility_all.py index 89e7b3d..8f56f83 100644 --- a/scripts/3.3_assign_facility_all.py +++ b/scripts/3.3_assign_facility_all.py @@ -5,14 +5,13 @@ from 
acbm.assigning.plots import plot_desire_lines, plot_scatter_actual_reported from acbm.assigning.select_facility import map_activity_locations, select_facility from acbm.cli import acbm_cli -from acbm.config import load_config -from acbm.logger_config import assigning_facility_locations_logger as logger +from acbm.config import load_and_setup_config @acbm_cli def main(config_file): - config = load_config(config_file) - config.init_rng() + config = load_and_setup_config(config_file) + logger = config.get_logger("assigning_facility_locations", __file__) # --- Load data: activity chains logger.info("Loading activity chains") diff --git a/scripts/4_validation.py b/scripts/4_validation.py index 4e4ea65..ffe1327 100644 --- a/scripts/4_validation.py +++ b/scripts/4_validation.py @@ -3,8 +3,7 @@ import seaborn as sns from acbm.cli import acbm_cli -from acbm.config import load_config -from acbm.logger_config import validation_logger as logger +from acbm.config import load_and_setup_config from acbm.validating.plots import ( plot_activity_sequence_comparison, plot_comparison, @@ -15,8 +14,8 @@ @acbm_cli def main(config_file): - config = load_config(config_file) - config.init_rng() + config = load_and_setup_config(config_file) + logger = config.get_logger("validation", __file__) # ----- Folder for validation plots diff --git a/src/acbm/config.py b/src/acbm/config.py index 9864dcd..d1e8372 100644 --- a/src/acbm/config.py +++ b/src/acbm/config.py @@ -2,6 +2,7 @@ import random from dataclasses import dataclass from hashlib import sha256 +from logging import Logger from pathlib import Path from typing import Tuple @@ -11,6 +12,7 @@ from pydantic import BaseModel, Field, field_serializer import acbm +from acbm.logger_config import create_logger @dataclass(frozen=True) @@ -72,6 +74,7 @@ class Config(BaseModel): def make_dirs(self): """Makes all directories requried from config""" os.makedirs(self.output_path, exist_ok=True) + os.makedirs(self.logs_path, exist_ok=True) 
os.makedirs(self.assigning_plots_path, exist_ok=True) os.makedirs(self.validation_plots_path, exist_ok=True) os.makedirs(self.activities_per_zone.parent, exist_ok=True) @@ -98,6 +101,11 @@ def id(self): ID_LENGTH = 10 return sha256(jcs.canonicalize(self.model_dump())).hexdigest()[:ID_LENGTH] + @property + def logs_path(self) -> Path: + """Returns logs path.""" + return self.output_path / "logs" + @property def boundaries_filepath(self) -> Path: """Returns boundaries path.""" @@ -317,7 +325,22 @@ def write(self, filepath: str | Path): with open(filepath, "w") as f: f.write(tomlkit.dumps(self.model_dump(exclude_none=True))) + def get_logger(self, name: str, filename: str) -> Logger: + return create_logger(name, os.path.basename(filename), self.logs_path) + def load_config(filepath: str | Path) -> Config: + """Loads config from filepath.""" with open(filepath, "rb") as f: return Config.model_validate(tomlkit.load(f)) + + +def load_and_setup_config(filepath: str | Path) -> Config: + """Loads config from filepath, makes required dirs, writes config and inits RNG.""" + config = load_config(filepath) + config.make_dirs() + config_path = config.output_path / "config.toml" + if not os.path.isfile(config_path): + config.write(config_path) + config.init_rng() + return config diff --git a/src/acbm/logger_config.py b/src/acbm/logger_config.py index 87797ea..75f328d 100644 --- a/src/acbm/logger_config.py +++ b/src/acbm/logger_config.py @@ -1,6 +1,5 @@ import logging -import acbm from acbm.utils import prepend_datetime # # Configure the root logger @@ -19,13 +18,13 @@ ) -def create_logger(name, log_file): +def create_logger(name, log_file, logs_path): logger = logging.getLogger(name) logger.setLevel( logging.DEBUG ) # Ensure the logger captures all messages at DEBUG level and above if not logger.hasHandlers(): # Check if the logger already has handlers - file_handler = logging.FileHandler(acbm.logs_path / prepend_datetime(log_file)) + file_handler = logging.FileHandler(logs_path 
/ prepend_datetime(log_file)) file_handler.setLevel(logging.DEBUG) # Set to DEBUG for file output file_handler.setFormatter( logging.Formatter( @@ -39,20 +38,20 @@ def create_logger(name, log_file): return logger -# Create loggers for different modules -preprocessing_logger = create_logger("preprocessing", "preprocessing.log") -matching_logger = create_logger("matching", "matching.log") -assigning_primary_feasible_logger = create_logger( - "assigning_primary_feasible", "assigning_primary_feasible.log" -) -assigning_primary_zones_logger = create_logger( - "assigning_primary_zone", "assigning_primary_zone.log" -) -assigning_secondary_zones_logger = create_logger( - "assigning_secondary_zone", "assigning_secondary_zone.log" -) -assigning_facility_locations_logger = create_logger( - "assigning_facility_locations", "assigning_facility_locations.log" -) +# # Create loggers for different modules +# preprocessing_logger = create_logger("preprocessing", "preprocessing.log") +# matching_logger = create_logger("matching", "matching.log") +# assigning_primary_feasible_logger = create_logger( +# "assigning_primary_feasible", "assigning_primary_feasible.log" +# ) +# assigning_primary_zones_logger = create_logger( +# "assigning_primary_zone", "assigning_primary_zone.log" +# ) +# assigning_secondary_zones_logger = create_logger( +# "assigning_secondary_zone", "assigning_secondary_zone.log" +# ) +# assigning_facility_locations_logger = create_logger( +# "assigning_facility_locations", "assigning_facility_locations.log" +# ) -validation_logger = create_logger("validation", "validation.log") +# validation_logger = create_logger("validation", "validation.log") From 3e40d6aa4a8adabfd3da5452ba4c24f616f7ceef Mon Sep 17 00:00:00 2001 From: Sam Greenbury Date: Thu, 28 Nov 2024 11:23:34 +0000 Subject: [PATCH 18/56] Make PathParams optional --- config/base.toml | 2 -- src/acbm/config.py | 6 ++++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/config/base.toml 
b/config/base.toml index 1a737cd..09d9697 100644 --- a/config/base.toml +++ b/config/base.toml @@ -46,5 +46,3 @@ max_zones = 8 # maximum number of feasible zones to include in the opti [postprocessing] pam_jitter = 30 pam_min_duration = 10 - -[paths] diff --git a/src/acbm/config.py b/src/acbm/config.py index d1e8372..ad95a00 100644 --- a/src/acbm/config.py +++ b/src/acbm/config.py @@ -69,7 +69,7 @@ class Config(BaseModel): description="Config: parameters for work assignment." ) matching: MatchingParams = Field(description="Config: parameters for matching.") - paths: PathParams = Field(description="Path overrides.") + paths: PathParams | None = Field(description="Path overrides.", default=None) def make_dirs(self): """Makes all directories requried from config""" @@ -99,7 +99,9 @@ def id(self): """ # Take first 10 chars to enable paths to remain not too long ID_LENGTH = 10 - return sha256(jcs.canonicalize(self.model_dump())).hexdigest()[:ID_LENGTH] + return sha256(jcs.canonicalize(self.model_dump(exclude_none=True))).hexdigest()[ + :ID_LENGTH + ] @property def logs_path(self) -> Path: From f2edfb0a52d9dd3f89b65dbb6fd211538f3f4028 Mon Sep 17 00:00:00 2001 From: Sam Greenbury Date: Thu, 28 Nov 2024 11:24:10 +0000 Subject: [PATCH 19/56] Move prepend_datetime to logger_config module --- src/acbm/logger_config.py | 7 ++++++- src/acbm/utils.py | 7 ------- 2 files changed, 6 insertions(+), 8 deletions(-) diff --git a/src/acbm/logger_config.py b/src/acbm/logger_config.py index 75f328d..af01916 100644 --- a/src/acbm/logger_config.py +++ b/src/acbm/logger_config.py @@ -1,6 +1,11 @@ import logging +from datetime import datetime + + +def prepend_datetime(s: str, delimiter: str = "_") -> str: + current_date = datetime.now().strftime("%Y-%m-%d") + return f"{current_date}{delimiter}{s}" -from acbm.utils import prepend_datetime # # Configure the root logger # logging.basicConfig( diff --git a/src/acbm/utils.py b/src/acbm/utils.py index e9e95ed..41fbed7 100644 --- 
a/src/acbm/utils.py +++ b/src/acbm/utils.py @@ -1,5 +1,3 @@ -from datetime import datetime - import numpy as np import pandas as pd from sklearn.metrics import mean_squared_error @@ -7,11 +5,6 @@ from acbm.config import Config -def prepend_datetime(s: str, delimiter: str = "_") -> str: - current_date = datetime.now().strftime("%Y-%m-%d") - return f"{current_date}{delimiter}{s}" - - def calculate_rmse(predictions, targets): """ Calculate the Root Mean Squared Error (RMSE) between predictions and targets, From 53e3c108eeb9b17ed5ec2d518ff852b4491e2ec4 Mon Sep 17 00:00:00 2001 From: Sam Greenbury Date: Thu, 28 Nov 2024 11:44:52 +0000 Subject: [PATCH 20/56] Handle missing PathParams, revise log filename --- src/acbm/config.py | 20 +++++++++++++++----- src/acbm/logger_config.py | 2 +- 2 files changed, 16 insertions(+), 6 deletions(-) diff --git a/src/acbm/config.py b/src/acbm/config.py index ad95a00..9215802 100644 --- a/src/acbm/config.py +++ b/src/acbm/config.py @@ -88,7 +88,11 @@ def make_dirs(self): @property def root_path(self) -> Path: - return acbm.root_path if self.paths.root_path is None else self.paths.root_path + return ( + acbm.root_path + if self.paths is None or self.paths.root_path is None + else self.paths.root_path + ) @property def id(self): @@ -132,7 +136,7 @@ def study_areas_filepath(self) -> Path: """Returns boundaries path.""" return ( self.output_path / "boundaries" / "study_area_zones.geojson" - if self.paths.study_areas_filepath is None + if self.paths is None or self.paths.study_areas_filepath is None else self.paths.study_areas_filepath ) @@ -150,7 +154,7 @@ def osm_path(self) -> Path: """Returns osm path.""" return ( self.root_path / self.osmox_path / (self.region + "_epsg_4326.parquet") - if self.paths.osm_path is None + if self.paths is None or self.paths.osm_path is None else self.paths.osm_path ) @@ -203,7 +207,7 @@ def output_path(self) -> str: """Returns output path.""" return ( self.root_path / "data" / "outputs" / self.id - if 
self.paths.output_path is None + if self.paths is None or self.paths.output_path is None else self.paths.output_path ) @@ -328,7 +332,13 @@ def write(self, filepath: str | Path): f.write(tomlkit.dumps(self.model_dump(exclude_none=True))) def get_logger(self, name: str, filename: str) -> Logger: - return create_logger(name, os.path.basename(filename), self.logs_path) + stem = ".".join(Path(os.path.basename(filename)).name.split(".")[:-1]) + stem_with_log_suffix = stem + ".log" + return create_logger( + name, + stem_with_log_suffix, + self.logs_path, + ) def load_config(filepath: str | Path) -> Config: diff --git a/src/acbm/logger_config.py b/src/acbm/logger_config.py index af01916..05ddd23 100644 --- a/src/acbm/logger_config.py +++ b/src/acbm/logger_config.py @@ -29,7 +29,7 @@ def create_logger(name, log_file, logs_path): logging.DEBUG ) # Ensure the logger captures all messages at DEBUG level and above if not logger.hasHandlers(): # Check if the logger already has handlers - file_handler = logging.FileHandler(logs_path / prepend_datetime(log_file)) + file_handler = logging.FileHandler(logs_path / log_file) file_handler.setLevel(logging.DEBUG) # Set to DEBUG for file output file_handler.setFormatter( logging.Formatter( From 5af333155f340d62a293a6794f5820aa92fca2b5 Mon Sep 17 00:00:00 2001 From: Sam Greenbury Date: Thu, 28 Nov 2024 12:06:54 +0000 Subject: [PATCH 21/56] Refactor logging --- scripts/3.2.2_assign_primary_zone_work.py | 20 ++++++++-------- src/acbm/assigning/feasible_zones_primary.py | 25 ++++++++++---------- src/acbm/assigning/select_facility.py | 3 ++- src/acbm/assigning/select_zone_primary.py | 9 +++---- src/acbm/assigning/select_zone_secondary.py | 10 ++++---- src/acbm/assigning/select_zone_work.py | 3 ++- src/acbm/logger_config.py | 19 --------------- 7 files changed, 38 insertions(+), 51 deletions(-) diff --git a/scripts/3.2.2_assign_primary_zone_work.py b/scripts/3.2.2_assign_primary_zone_work.py index bb833f0..d00f95d 100644 --- 
a/scripts/3.2.2_assign_primary_zone_work.py +++ b/scripts/3.2.2_assign_primary_zone_work.py @@ -68,10 +68,10 @@ def main(config_file): # Clean the data if commute_level == "MSOA": - print("Step 1: Reading in the zipped csv file") + logger.info("Step 1: Reading in the zipped csv file") travel_demand = pd.read_csv(config.travel_demand_filepath) - print("Step 2: Creating commute_mode_dict") + logger.info("Step 2: Creating commute_mode_dict") commute_mode_dict = { "Bus, minibus or coach": "pt", "Driving a car or van": "car", @@ -86,12 +86,12 @@ def main(config_file): "Work mainly at or from home": "home", } - print("Step 3: Mapping commute mode to model mode") + logger.info("Step 3: Mapping commute mode to model mode") travel_demand["mode"] = travel_demand[ "Method used to travel to workplace (12 categories) label" ].map(commute_mode_dict) - print("Step 4: Filtering rows and dropping unnecessary columns") + logger.info("Step 4: Filtering rows and dropping unnecessary columns") travel_demand_clipped = travel_demand[ travel_demand["Place of work indicator (4 categories) code"].isin([1, 3]) ] @@ -106,7 +106,7 @@ def main(config_file): ] ) - print("Step 5: Renaming columns and grouping") + logger.info("Step 5: Renaming columns and grouping") travel_demand_clipped = travel_demand_clipped.rename( columns={ "Middle layer Super Output Areas code": "MSOA21CD_home", @@ -119,7 +119,7 @@ def main(config_file): .reset_index() ) - print("Step 6: Filtering matrix to boundary") + logger.info("Step 6: Filtering matrix to boundary") travel_demand_clipped = filter_matrix_to_boundary( boundary=boundaries, matrix=travel_demand_clipped, @@ -129,10 +129,10 @@ def main(config_file): ) elif commute_level == "OA": - print("Step 1: Reading in the zipped csv file") + logger.info("Step 1: Reading in the zipped csv file") travel_demand = pd.read_csv(config.travel_demand_filepath) - print("Step 2: Filtering rows and dropping unnecessary columns") + logger.info("Step 2: Filtering rows and dropping 
unnecessary columns") travel_demand_clipped = travel_demand[ travel_demand["Place of work indicator (4 categories) code"].isin([1, 3]) ] @@ -143,7 +143,7 @@ def main(config_file): ] ) - print("Step 3: Renaming columns and grouping") + logger.info("Step 3: Renaming columns and grouping") travel_demand_clipped = travel_demand_clipped.rename( columns={ "Output Areas code": "OA21CD_home", @@ -156,7 +156,7 @@ def main(config_file): .reset_index() ) - print("Step 4: Filtering matrix to boundary") + logger.info("Step 4: Filtering matrix to boundary") travel_demand_clipped = filter_matrix_to_boundary( boundary=boundaries, matrix=travel_demand_clipped, diff --git a/src/acbm/assigning/feasible_zones_primary.py b/src/acbm/assigning/feasible_zones_primary.py index 75eb52d..9eb18fb 100644 --- a/src/acbm/assigning/feasible_zones_primary.py +++ b/src/acbm/assigning/feasible_zones_primary.py @@ -1,3 +1,4 @@ +import logging from typing import Optional import geopandas as gpd @@ -13,11 +14,12 @@ zones_to_time_matrix, ) from acbm.config import Config -from acbm.logger_config import assigning_primary_feasible_logger as logger pandarallel.initialize(progress_bar=True) +logger = logging.getLogger("assigning_primary_feasible") + # --- Schemas for validation activity_chains_schema = DataFrameSchema( @@ -128,8 +130,8 @@ def get_possible_zones( travel_times = input_schemas["travel_times"].validate(travel_times, lazy=True) except SchemaErrors as e: - print("Validation failed with errors:") - print(e.failure_cases) # prints all the validation errors at once + logger.error("Validation failed with errors:") + logger.error(e.failure_cases) # prints all the validation errors at once return None if travel_times is None: @@ -139,7 +141,7 @@ def get_possible_zones( ) list_of_modes = activity_chains["mode"].unique() - print(f"Unique modes found in the dataset are: {list_of_modes}") + logger.info(f"Unique modes found in the dataset are: {list_of_modes}") # use map_day_to_wkday_binary to identify if 
activity is on a weekday or weekend activity_chains["weekday"] = activity_chains["TravDay"].apply( @@ -157,7 +159,7 @@ def get_possible_zones( # loop over the list of modes for mode in list_of_modes: - print(f"Processing mode: {mode}") + logger.info(f"Processing mode: {mode}") # filter the travel_times dataframe to only include rows with the current mode travel_times_filtered_mode = travel_times[travel_times["mode"] == mode] @@ -174,9 +176,9 @@ def get_possible_zones( and "weekday" in travel_times.columns ): for time_of_day in list_of_times_of_day: - print(f"Processing time of day: {time_of_day} | mode: {mode}") + logger.info(f"Processing time of day: {time_of_day} | mode: {mode}") for day_type in day_types: - print( + logger.info( f"Processing time of day: {time_of_day} | weekday: {day_type} | mode: {mode}" ) # filter the travel_times dataframe to only include rows with the current time_of_day and weekday @@ -184,10 +186,8 @@ def get_possible_zones( (travel_times_filtered_mode["weekday"] == day_type) & (travel_times_filtered_mode["time_of_day"] == time_of_day) ] - print( - "unique modes after filtering are", - travel_times_filtered_mode_time_day["mode"].unique(), - ) + unique_modes = travel_times_filtered_mode_time_day["mode"].unique() + logger.info(f"unique modes after filtering are: {unique_modes}") # filter the activity chains to the current mode, time_of_day and weekday activity_chains_filtered = activity_chains[ @@ -292,7 +292,6 @@ def _get_possible_zones( dict A dictionary with the origin zone as the key and a list of possible destination zones as the value """ - # get the travel time travel_time = activity["TripTotalTime"] # get the origin zone @@ -321,6 +320,7 @@ def _get_possible_zones( filtered_activities_per_zone[zone_id] ) ] + # how many zones are reachable? 
logger.debug( f"Activity {activity.id}: Number of zones with activity {activity_purpose} \ @@ -338,6 +338,7 @@ def _get_possible_zones( <= travel_time + time_tolerance * travel_time ) ] + logger.debug( f"Activity {activity.id}: Number of zones with activity {activity_purpose} within threshold of reported time {travel_time}: \ {len(travel_times_filtered_time)}" diff --git a/src/acbm/assigning/select_facility.py b/src/acbm/assigning/select_facility.py index 6d926f1..42bfa50 100644 --- a/src/acbm/assigning/select_facility.py +++ b/src/acbm/assigning/select_facility.py @@ -1,3 +1,4 @@ +import logging from typing import Optional, Tuple import geopandas as gpd @@ -5,7 +6,7 @@ import pandas as pd from shapely import Point -from acbm.logger_config import assigning_facility_locations_logger as logger +logger = logging.getLogger("assigning_facility_locations") def _select_facility( diff --git a/src/acbm/assigning/select_zone_primary.py b/src/acbm/assigning/select_zone_primary.py index 8a37d19..7626677 100644 --- a/src/acbm/assigning/select_zone_primary.py +++ b/src/acbm/assigning/select_zone_primary.py @@ -1,8 +1,9 @@ +import logging from typing import Optional import pandas as pd -from acbm.logger_config import assigning_primary_zones_logger as logger +logger = logging.getLogger("assigning_primary_zone") def select_zone( @@ -149,7 +150,7 @@ def select_zone( return selected_zone except KeyError: - logger.info(f"KeyError: Key {activity_id} in possible_zones has no values") + logger.error(f"KeyError: Key {activity_id} in possible_zones has no values") return "NA" @@ -240,11 +241,11 @@ def _get_zones_using_time_estimate( str The zone that has the estimated time closest to the given time. """ - acceptable_modes = ["car", "pt", "walk", "cycle"] if mode is not None and mode not in acceptable_modes: error_message = f"Invalid mode: {mode}. Mode must be one of {acceptable_modes}." 
+ logger.error(error_message) raise ValueError(error_message) # Convert to_zones to a set for faster lookup @@ -260,7 +261,7 @@ def _get_zones_using_time_estimate( # Check if the filtered dictionary is empty if not filtered_dict: # Handle the case where there are no travel time estimates for the target zone - print( + logger.info( f"No travel time estimates found for from_zone: {from_zone} to any of the to_zones: {to_zones}" ) return None diff --git a/src/acbm/assigning/select_zone_secondary.py b/src/acbm/assigning/select_zone_secondary.py index 4c53952..dd7dc3b 100644 --- a/src/acbm/assigning/select_zone_secondary.py +++ b/src/acbm/assigning/select_zone_secondary.py @@ -1,3 +1,4 @@ +import logging from copy import deepcopy import numpy as np @@ -5,7 +6,7 @@ import pandas as pd from pam.planner.choice_location import DiscretionaryTrips -from acbm.logger_config import assigning_secondary_zones_logger as logger +logger = logging.getLogger("assigning_secondary_zone") def set_home_ozone(data: pd.DataFrame, oact_col: str, ozone_col: str, hzone_col: str): @@ -152,17 +153,18 @@ def create_od_matrices( to_indices = df[zone_to].map(zone_index) for mode in modes: - print(f"Starting mode: {mode}") + logger.info(f"Starting mode: {mode}") mask = df[mode_column] == mode values = df[mask][value_column].fillna(fill_value) # Fill missing values od_matrices[mode][from_indices[mask], to_indices[mask]] = values - print(f"Finished mode: {mode}") + logger.info(f"Finished mode: {mode}") return od_matrices def update_population_plans( - population: pam.core.Population, od: pam.planner.od.ODFactory + population: pam.core.Population, + od: pam.planner.od.ODFactory, ) -> None: """ Update the plans in a population object using the DiscretionaryTrips planner diff --git a/src/acbm/assigning/select_zone_work.py b/src/acbm/assigning/select_zone_work.py index a8d4991..ff9257a 100644 --- a/src/acbm/assigning/select_zone_work.py +++ b/src/acbm/assigning/select_zone_work.py @@ -1,3 +1,4 @@ +import 
logging from dataclasses import dataclass, field from typing import Dict, List, Tuple @@ -5,7 +6,7 @@ import pandas as pd import pulp -from acbm.logger_config import assigning_primary_zones_logger as logger +logger = logging.getLogger("assigning_primary_zone") @dataclass diff --git a/src/acbm/logger_config.py b/src/acbm/logger_config.py index 05ddd23..1843232 100644 --- a/src/acbm/logger_config.py +++ b/src/acbm/logger_config.py @@ -41,22 +41,3 @@ def create_logger(name, log_file, logs_path): # avoid logs from being propagated to the root logger (so that they don't show in the notebook) logger.propagate = False return logger - - -# # Create loggers for different modules -# preprocessing_logger = create_logger("preprocessing", "preprocessing.log") -# matching_logger = create_logger("matching", "matching.log") -# assigning_primary_feasible_logger = create_logger( -# "assigning_primary_feasible", "assigning_primary_feasible.log" -# ) -# assigning_primary_zones_logger = create_logger( -# "assigning_primary_zone", "assigning_primary_zone.log" -# ) -# assigning_secondary_zones_logger = create_logger( -# "assigning_secondary_zone", "assigning_secondary_zone.log" -# ) -# assigning_facility_locations_logger = create_logger( -# "assigning_facility_locations", "assigning_facility_locations.log" -# ) - -# validation_logger = create_logger("validation", "validation.log") From 0ef173345677a425f8938cc6cd727c04db516810 Mon Sep 17 00:00:00 2001 From: Sam Greenbury Date: Thu, 28 Nov 2024 14:35:40 +0000 Subject: [PATCH 22/56] Fix case in config, add field validator for commute_level --- src/acbm/config.py | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/src/acbm/config.py b/src/acbm/config.py index 9215802..a02d010 100644 --- a/src/acbm/config.py +++ b/src/acbm/config.py @@ -9,7 +9,7 @@ import jcs import numpy as np import tomlkit -from pydantic import BaseModel, Field, field_serializer +from pydantic import BaseModel, Field, field_serializer, 
field_validator import acbm from acbm.logger_config import create_logger @@ -42,7 +42,11 @@ class WorkAssignmentParams(BaseModel): weight_max_dev: float weight_total_dev: float max_zones: int - commute_level: str | None = None + commute_level: str + + @field_validator("commute_level") + def validate_commute_level(commute_level: str) -> str: + return commute_level.upper() @dataclass(frozen=True) @@ -271,20 +275,20 @@ def centroid_layer_filepath(self) -> Path: @property def travel_demand_filepath(self) -> Path: - if self.work_assignment.commute_level == "msoa": + if self.work_assignment.commute_level == "MSOA": return self.external_path / "ODWP15EW_MSOA_v1.zip" return self.external_path / "ODWP01EW_OA.zip" @property def travel_times_filepath(self) -> Path: - if self.work_assignment.commute_level == "msoa": + if self.work_assignment.commute_level == "MSOA": return ( self.external_path / "travel_times" - / "msoa" + / "MSOA" / "travel_time_matrix.parquet" ) - return self.external_path / "travel_times" / "oa" / "travel_time_matrix.parquet" + return self.external_path / "travel_times" / "OA" / "travel_time_matrix.parquet" @property def travel_times_estimates_filepath(self) -> Path: From f34a2a2f3b74c8976db4167a03bce4a1f6799b38 Mon Sep 17 00:00:00 2001 From: Sam Greenbury Date: Thu, 28 Nov 2024 15:57:05 +0000 Subject: [PATCH 23/56] Update paths --- scripts/5_acbm_to_matsim_xml.py | 21 ++++++--------------- 1 file changed, 6 insertions(+), 15 deletions(-) diff --git a/scripts/5_acbm_to_matsim_xml.py b/scripts/5_acbm_to_matsim_xml.py index a514808..f0eef57 100644 --- a/scripts/5_acbm_to_matsim_xml.py +++ b/scripts/5_acbm_to_matsim_xml.py @@ -7,7 +7,6 @@ from pam.samplers.time import apply_jitter_to_plan from shapely import Point, wkt -import acbm from acbm.cli import acbm_cli from acbm.config import load_and_setup_config from acbm.postprocessing.matsim import ( @@ -28,19 +27,11 @@ def main(config_file): logger.info("1 - Loading data") - individuals = pd.read_csv( - 
acbm.root_path / "data/processed/activities_pam/people.csv" - ) - households = pd.read_csv( - acbm.root_path / "data/processed/activities_pam/households.csv" - ) - activities = pd.read_csv( - acbm.root_path / "data/processed/activities_pam/activities.csv" - ) - legs = pd.read_csv(acbm.root_path / "data/processed/activities_pam/legs.csv") - legs_geo = pd.read_parquet( - acbm.root_path / "data/processed/activities_pam/legs_with_locations.parquet" - ) + individuals = pd.read_csv(config.output_path / "people.csv") + households = pd.read_csv(config.output_path / "households.csv") + activities = pd.read_csv(config.output_path / "activities.csv") + legs = pd.read_csv(config.output_path / "legs.csv") + legs_geo = pd.read_parquet(config.output_path / "legs_with_locations.parquet") # ----- Clean the data @@ -154,7 +145,7 @@ def convert_to_point(value): write.write_matsim_population_v6( population=population, - path=acbm.root_path / "data/processed/activities_pam/plans.xml", + path=config.output_path / "plans.xml", ) From 628b553c5f8454d35fcf1525e116816d632174df Mon Sep 17 00:00:00 2001 From: Sam Greenbury Date: Thu, 28 Nov 2024 16:32:15 +0000 Subject: [PATCH 24/56] Revise order of args and returned tuple --- scripts/5_acbm_to_matsim_xml.py | 5 ++--- src/acbm/postprocessing/matsim.py | 6 +++--- 2 files changed, 5 insertions(+), 6 deletions(-) diff --git a/scripts/5_acbm_to_matsim_xml.py b/scripts/5_acbm_to_matsim_xml.py index f0eef57..2ace3ba 100644 --- a/scripts/5_acbm_to_matsim_xml.py +++ b/scripts/5_acbm_to_matsim_xml.py @@ -74,11 +74,10 @@ def main(config_file): }, inplace=True, ) - logger.info("2.4 - Remove people with missing location data ") - individuals, households, activities, legs, legs_geo = filter_no_location( - individuals, households, activities, legs, legs_geo + individuals, activities, legs, legs_geo, households = filter_no_location( + individuals, activities, legs, legs_geo, households ) log_row_count(individuals, "individuals", "2_filter_no_location", 
row_counts) diff --git a/src/acbm/postprocessing/matsim.py b/src/acbm/postprocessing/matsim.py index 248db15..6facb89 100644 --- a/src/acbm/postprocessing/matsim.py +++ b/src/acbm/postprocessing/matsim.py @@ -68,14 +68,14 @@ def filter_no_location( ---------- individuals : pd.DataFrame DataFrame containing individual data. - households : pd.DataFrame - DataFrame containing household data. activities : pd.DataFrame DataFrame containing activity data. legs : pd.DataFrame DataFrame containing legs data. legs_geo : pd.DataFrame DataFrame containing legs with geographic data. + households : pd.DataFrame + DataFrame containing household data. Returns ------- @@ -114,10 +114,10 @@ def filter_no_location( return ( individuals_cleaned, - households_cleaned, activities_cleaned, legs_cleaned, legs_geo_cleaned, + households_cleaned, ) From c8ef43adcb02be716a6a524c2288c1db4f73eeca Mon Sep 17 00:00:00 2001 From: Sam Greenbury Date: Thu, 28 Nov 2024 16:38:54 +0000 Subject: [PATCH 25/56] Fix test --- tests/test_config.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_config.py b/tests/test_config.py index d0afe71..9dc9cc2 100644 --- a/tests/test_config.py +++ b/tests/test_config.py @@ -11,4 +11,4 @@ def config(): def test_id(config): - assert config.id == "01d8ded073" + assert config.id == "11df8ad099" From ca1d92240061593b32a5bbfd52ce5c18e8afb98b Mon Sep 17 00:00:00 2001 From: Sam Greenbury Date: Fri, 29 Nov 2024 17:56:59 +0000 Subject: [PATCH 26/56] Fix doc comment --- src/acbm/preprocessing.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/acbm/preprocessing.py b/src/acbm/preprocessing.py index 5d31601..7601659 100644 --- a/src/acbm/preprocessing.py +++ b/src/acbm/preprocessing.py @@ -425,6 +425,8 @@ def add_locations_to_activity_chains( DataFrame containing the activity chains. target_crs : str The target CRS to reproject the locations to. + centroid_layer : pd.DataFrame + DataFrame containing zone centroids. 
Returns ------- From 2885ba3084140a3a476e31c4e828f40c88e27744 Mon Sep 17 00:00:00 2001 From: Sam Greenbury Date: Fri, 29 Nov 2024 18:04:08 +0000 Subject: [PATCH 27/56] Refactor loading/reprojecting boundaries with config --- scripts/0_preprocess_inputs.py | 5 +---- scripts/3.1_assign_primary_feasible_zones.py | 10 ++-------- scripts/3.2.1_assign_primary_zone_edu.py | 10 ++-------- scripts/3.2.2_assign_primary_zone_work.py | 11 ++--------- scripts/3.2.3_assign_secondary_zone.py | 13 +++++-------- scripts/3.3_assign_facility_all.py | 10 ++-------- src/acbm/config.py | 5 +++++ 7 files changed, 19 insertions(+), 45 deletions(-) diff --git a/scripts/0_preprocess_inputs.py b/scripts/0_preprocess_inputs.py index 6cfa4f4..994b2eb 100644 --- a/scripts/0_preprocess_inputs.py +++ b/scripts/0_preprocess_inputs.py @@ -1,4 +1,3 @@ -import geopandas as gpd import pandas as pd from uatk_spc import Reader @@ -19,9 +18,7 @@ def main(config_file): logger.info("1. Reading in the boundary layer for the whole of England") - boundaries = gpd.read_file(config.boundaries_filepath) - - boundaries = boundaries.to_crs(epsg=config.output_crs) + boundaries = config.get_boundaries() ## --- Dissolve boundaries if resolution is MSOA diff --git a/scripts/3.1_assign_primary_feasible_zones.py b/scripts/3.1_assign_primary_feasible_zones.py index 9cb26e6..f45d13c 100644 --- a/scripts/3.1_assign_primary_feasible_zones.py +++ b/scripts/3.1_assign_primary_feasible_zones.py @@ -37,14 +37,8 @@ def main(config_file): # --- Study area boundaries logger.info("Loading study area boundaries") - - boundaries = gpd.read_file(config.study_areas_filepath) - - logger.info("Study area boundaries loaded") - - # Reproject boundaries to the output CRS specified in the config - boundaries = boundaries.to_crs(f"epsg:{config.output_crs}") - logger.info(f"Boundaries reprojected to {config.output_crs}") + boundaries = config.get_boundaries() + logger.info(f"Study area boundaries loaded and reprojected to {config.output_crs}") 
# --- Assign activity home locations to boundaries zoning system diff --git a/scripts/3.2.1_assign_primary_zone_edu.py b/scripts/3.2.1_assign_primary_zone_edu.py index 26bf1b0..f543edb 100644 --- a/scripts/3.2.1_assign_primary_zone_edu.py +++ b/scripts/3.2.1_assign_primary_zone_edu.py @@ -33,14 +33,8 @@ def main(config_file): # --- boundaries logger.info("Loading study area boundaries") - - boundaries = gpd.read_file(config.study_areas_filepath) - - logger.info("Study area boundaries loaded") - - # Reproject boundaries to the output CRS specified in the config - boundaries = boundaries.to_crs(f"epsg:{config.output_crs}") - logger.info(f"Boundaries reprojected to {config.output_crs}") + boundaries = config.get_boundaries() + logger.info(f"Study area boundaries loaded and reprojected to {config.output_crs}") # --- osm POI data logger.info("Loading OSM POI data") diff --git a/scripts/3.2.2_assign_primary_zone_work.py b/scripts/3.2.2_assign_primary_zone_work.py index f5d92a1..eb66876 100644 --- a/scripts/3.2.2_assign_primary_zone_work.py +++ b/scripts/3.2.2_assign_primary_zone_work.py @@ -28,16 +28,9 @@ def main(config_file): possible_zones_work = pd.read_pickle(config.possible_zones_work) # --- boundaries - logger.info("Loading study area boundaries") - - boundaries = gpd.read_file(config.study_areas_filepath) - - logger.info("Study area boundaries loaded") - - # Reproject boundaries to the output CRS specified in the config - boundaries = boundaries.to_crs(f"epsg:{config.output_crs}") - logger.info(f"Boundaries reprojected to {config.output_crs}") + boundaries = config.get_boundaries() + logger.info(f"Study area boundaries loaded and reprojected to {config.output_crs}") # osm POI data diff --git a/scripts/3.2.3_assign_secondary_zone.py b/scripts/3.2.3_assign_secondary_zone.py index c216209..ac0a95b 100644 --- a/scripts/3.2.3_assign_secondary_zone.py +++ b/scripts/3.2.3_assign_secondary_zone.py @@ -43,16 +43,13 @@ def main(config_file): activity_chains["TravDay"] == 
config.parameters.nts_day_of_week ] + # TODO: remove obsolete comment # --- Add OA21CD to the data + # logger.info("Preprocessing: Adding OA21CD to the data") - logger.info("Preprocessing: Adding OA21CD to the data") - - boundaries = gpd.read_file(config.study_areas_filepath) - # Reproject boundaries to the output CRS specified in the config - boundaries = boundaries.to_crs(f"epsg:{config.output_crs}") - logger.info(f"Boundaries reprojected to {config.output_crs}") - - logger.info("Study area boundaries loaded") + logger.info("Loading study area boundaries") + boundaries = config.get_boundaries() + logger.info(f"Study area boundaries loaded and reprojected to {config.output_crs}") # --- Assign activity home locations to boundaries zoning system diff --git a/scripts/3.3_assign_facility_all.py b/scripts/3.3_assign_facility_all.py index 55702ce..813b00f 100644 --- a/scripts/3.3_assign_facility_all.py +++ b/scripts/3.3_assign_facility_all.py @@ -44,14 +44,8 @@ def main(config_file): # --- Load data: Boundaries logger.info("Loading study area boundaries") - - boundaries = gpd.read_file(config.study_areas_filepath) - - logger.info("Study area boundaries loaded") - - # Reproject boundaries to the output CRS specified in the config - boundaries = boundaries.to_crs(f"epsg:{config.output_crs}") - logger.info(f"Boundaries reprojected to {config.output_crs}") + boundaries = config.get_boundaries() + logger.info(f"Study area boundaries loaded and reprojected to {config.output_crs}") # --- Prepprocess: add zone column to POI data logger.info("Adding zone column to POI data") diff --git a/src/acbm/config.py b/src/acbm/config.py index 9a1cafa..e8e608b 100644 --- a/src/acbm/config.py +++ b/src/acbm/config.py @@ -6,6 +6,7 @@ from pathlib import Path from typing import Tuple +import geopandas as gpd import jcs import numpy as np import tomlkit @@ -358,6 +359,10 @@ def get_logger(self, name: str, filename: str) -> Logger: self.logs_path, ) + def get_boundaries(self) -> 
gpd.GeoDataFrame: + boundaries = gpd.read_file(self.boundaries_filepath) + return boundaries.to_crs(epsg=self.output_crs) + def load_config(filepath: str | Path) -> Config: """Loads config from filepath.""" From f35ba011c08f4433c2a3e6376f262557c3e9123e Mon Sep 17 00:00:00 2001 From: Sam Greenbury Date: Fri, 29 Nov 2024 18:04:51 +0000 Subject: [PATCH 28/56] Fix config test --- tests/test_config.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_config.py b/tests/test_config.py index 9dc9cc2..2bf69d6 100644 --- a/tests/test_config.py +++ b/tests/test_config.py @@ -11,4 +11,4 @@ def config(): def test_id(config): - assert config.id == "11df8ad099" + assert config.id == "a89b65de35" From 3d0733bab179028fcf594349ce53265c1a6a8bb5 Mon Sep 17 00:00:00 2001 From: Sam Greenbury Date: Fri, 29 Nov 2024 18:08:41 +0000 Subject: [PATCH 29/56] Update OSM path config with output CRS --- src/acbm/config.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/acbm/config.py b/src/acbm/config.py index e8e608b..d0dddf3 100644 --- a/src/acbm/config.py +++ b/src/acbm/config.py @@ -168,7 +168,9 @@ def osmox_path(self) -> Path: def osm_path(self) -> Path: """Returns osm path.""" return ( - self.root_path / self.osmox_path / (self.region + "_epsg_4326.parquet") + self.root_path + / self.osmox_path + / (self.region + f"_epsg_{self.output_crs}.parquet") if self.paths is None or self.paths.osm_path is None else self.paths.osm_path ) From fdb2304e910542b041ac12be41cd092cdc00ab3d Mon Sep 17 00:00:00 2001 From: Sam Greenbury Date: Fri, 29 Nov 2024 18:31:08 +0000 Subject: [PATCH 30/56] Replace print with logger, fix get_boundaries method --- src/acbm/config.py | 2 +- src/acbm/matching.py | 5 ++++- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/src/acbm/config.py b/src/acbm/config.py index d0dddf3..610dc45 100644 --- a/src/acbm/config.py +++ b/src/acbm/config.py @@ -362,7 +362,7 @@ def get_logger(self, name: str, filename: str) -> 
Logger: ) def get_boundaries(self) -> gpd.GeoDataFrame: - boundaries = gpd.read_file(self.boundaries_filepath) + boundaries = gpd.read_file(self.study_areas_filepath) return boundaries.to_crs(epsg=self.output_crs) diff --git a/src/acbm/matching.py b/src/acbm/matching.py index 0be0dfd..4c0a5b1 100644 --- a/src/acbm/matching.py +++ b/src/acbm/matching.py @@ -1,3 +1,4 @@ +import logging from collections import defaultdict from dataclasses import dataclass, field from typing import Dict, List @@ -8,6 +9,8 @@ # categorical (exact) matching - (for household level) +logger = logging.getLogger("matching") + @dataclass class MatcherExact: @@ -282,7 +285,7 @@ def match_individuals( if show_progress and i % 100 == 0: # Print the iteration number and the number of keys in the dict - print(f"Matching for household {i} out of: {len(matches_hh)}") + logger.info(f"Matching for household {i} out of: {len(matches_hh)}") # apply the matching match = match_psm(rows_df1, rows_df2, matching_columns) From 721cf547587dccaaeca90037fac9cb507c71ea90 Mon Sep 17 00:00:00 2001 From: Sam Greenbury Date: Sat, 30 Nov 2024 13:05:32 +0000 Subject: [PATCH 31/56] Refactor with population type --- scripts/5_acbm_to_matsim_xml.py | 70 ++++----- src/acbm/postprocessing/matsim.py | 251 ++++++++++++++++-------------- 2 files changed, 173 insertions(+), 148 deletions(-) diff --git a/scripts/5_acbm_to_matsim_xml.py b/scripts/5_acbm_to_matsim_xml.py index 30062c6..bb75dfb 100644 --- a/scripts/5_acbm_to_matsim_xml.py +++ b/scripts/5_acbm_to_matsim_xml.py @@ -1,7 +1,6 @@ from datetime import timedelta import geopandas as gpd -import pandas as pd from pam import write from pam.read import load_travel_diary from pam.samplers.time import apply_jitter_to_plan @@ -10,10 +9,9 @@ from acbm.cli import acbm_cli from acbm.config import load_and_setup_config from acbm.postprocessing.matsim import ( + Population, add_home_location_to_individuals, calculate_percentage_remaining, - filter_by_pid, - filter_no_location, 
log_row_count, ) @@ -27,18 +25,14 @@ def main(config_file): logger.info("1 - Loading data") - individuals = pd.read_csv(config.output_path / "people.csv") - households = pd.read_csv(config.output_path / "households.csv") - activities = pd.read_csv(config.output_path / "activities.csv") - legs = pd.read_csv(config.output_path / "legs.csv") - legs_geo = pd.read_parquet(config.output_path / "legs_with_locations.parquet") + population = Population.read(config) # ----- Clean the data logger.info("2 - Cleaning data") # rename age_years to age in individuals - individuals.rename(columns={"age_years": "age"}, inplace=True) + population.individuals.rename(columns={"age_years": "age"}, inplace=True) # We will be removing some rows in each planning operation. This function helps keep a # record of the number of rows in each table after each operation. @@ -47,31 +41,29 @@ def main(config_file): logger.info("2.1 - Record number of rows in each df before cleaning") - log_row_count(individuals, "individuals", "0_initial", row_counts) - log_row_count(households, "households", "0_initial", row_counts) - log_row_count(activities, "activities", "0_initial", row_counts) - log_row_count(legs, "legs", "0_initial", row_counts) - log_row_count(legs_geo, "legs_geo", "0_initial", row_counts) + log_row_count(population.individuals, "individuals", "0_initial", row_counts) + log_row_count(population.households, "households", "0_initial", row_counts) + log_row_count(population.activities, "activities", "0_initial", row_counts) + log_row_count(population.legs, "legs", "0_initial", row_counts) + log_row_count(population.legs_geo, "legs_geo", "0_initial", row_counts) logger.info("2.2 - Remove people that don't exist across all dfs") # When writing to matsim using pam, we get an error when a pid exists in one dataset # but not in the other. We will remove these people from the datasets. 
- individuals, activities, legs, legs_geo, households = filter_by_pid( - individuals, activities, legs, legs_geo, households - ) + population = population.filter_by_pid() - log_row_count(individuals, "individuals", "1_filter_by_pid", row_counts) - log_row_count(households, "households", "1_filter_by_pid", row_counts) - log_row_count(activities, "activities", "1_filter_by_pid", row_counts) - log_row_count(legs, "legs", "1_filter_by_pid", row_counts) - log_row_count(legs_geo, "legs_geo", "1_filter_by_pid", row_counts) + log_row_count(population.individuals, "individuals", "1_filter_by_pid", row_counts) + log_row_count(population.households, "households", "1_filter_by_pid", row_counts) + log_row_count(population.activities, "activities", "1_filter_by_pid", row_counts) + log_row_count(population.legs, "legs", "1_filter_by_pid", row_counts) + log_row_count(population.legs_geo, "legs_geo", "1_filter_by_pid", row_counts) logger.info("2.3 - Rename geometry columns (for PAM)") # TODO: Rename columns upstream in 3.3_assign_facility_all script - legs_geo.rename( + population.legs_geo.rename( columns={ "start_location_geometry_wkt": "start_loc", "end_location_geometry_wkt": "end_loc", @@ -80,15 +72,19 @@ def main(config_file): ) logger.info("2.4 - Remove people with missing location data ") - individuals, activities, legs, legs_geo, households = filter_no_location( - individuals, activities, legs, legs_geo, households - ) + population = population.filter_no_location() - log_row_count(individuals, "individuals", "2_filter_no_location", row_counts) - log_row_count(households, "households", "2_filter_no_location", row_counts) - log_row_count(activities, "activities", "2_filter_no_location", row_counts) - log_row_count(legs, "legs", "2_filter_no_location", row_counts) - log_row_count(legs_geo, "legs_geo", "2_filter_no_location", row_counts) + log_row_count( + population.individuals, "individuals", "2_filter_no_location", row_counts + ) + log_row_count( + population.households, 
"households", "2_filter_no_location", row_counts + ) + log_row_count( + population.activities, "activities", "2_filter_no_location", row_counts + ) + log_row_count(population.legs, "legs", "2_filter_no_location", row_counts) + log_row_count(population.legs_geo, "legs_geo", "2_filter_no_location", row_counts) logger.info("2.5 - Log number of rows in each df after cleaning") @@ -109,16 +105,20 @@ def convert_to_point(value): return wkt.loads(value) # Convert start_loc and end_loc to shapely point objects - legs_geo["start_loc"] = legs_geo["start_loc"].apply(convert_to_point) - legs_geo["end_loc"] = legs_geo["end_loc"].apply(convert_to_point) + population.legs_geo["start_loc"] = population.legs_geo["start_loc"].apply( + convert_to_point + ) + population.legs_geo["end_loc"] = population.legs_geo["end_loc"].apply( + convert_to_point + ) # Convert to GeoDataFrame with start_loc as the active geometry - legs_geo = gpd.GeoDataFrame(legs_geo, geometry="start_loc") + legs_geo = gpd.GeoDataFrame(population.legs_geo, geometry="start_loc") logger.info("3b - Add home location to individuals") # Apply - individuals_geo = add_home_location_to_individuals(legs_geo, individuals) + individuals_geo = add_home_location_to_individuals(legs_geo, population.individuals) logger.info("4 - Write to MATSim XML") diff --git a/src/acbm/postprocessing/matsim.py b/src/acbm/postprocessing/matsim.py index 2f9bc96..8f5c12e 100644 --- a/src/acbm/postprocessing/matsim.py +++ b/src/acbm/postprocessing/matsim.py @@ -1,124 +1,149 @@ +from dataclasses import dataclass +from typing import Self + import pandas as pd +from acbm.config import Config -def filter_by_pid( - individuals: pd.DataFrame, - activities: pd.DataFrame, - legs: pd.DataFrame, - legs_geo: pd.DataFrame, - households: pd.DataFrame, -) -> tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame, pd.DataFrame, pd.DataFrame]: - """ - Filter the input DataFrames to include only include people (pids) that exist in all - dfs - Parameters - ---------- 
+@dataclass +class Population: individuals: pd.DataFrame - Individuals DataFrame. + households: pd.DataFrame activities: pd.DataFrame - Activities DataFrame. - legs: pd.DataFrame: - Legs DataFrame. + legs: pd.DataFrame legs_geo: pd.DataFrame - Legs with geo DataFrame. - households: pd.DataFrame - Households DataFrame. - - Returns - ------- - tuple - A tuple containing the filtered DataFrames (individuals, activities, legs, legs_geo, households). - """ - # Identify common pids - common_pids = ( - set(individuals["pid"]) - .intersection(activities["pid"]) - .intersection(legs["pid"]) - .intersection(legs_geo["pid"]) - ) - - # Filter Individual Level DataFrames - individuals = individuals[individuals["pid"].isin(common_pids)] - activities = activities[activities["pid"].isin(common_pids)] - legs = legs[legs["pid"].isin(common_pids)] - legs_geo = legs_geo[legs_geo["pid"].isin(common_pids)] - - # Filter Household Level DataFrame - households = households[households["hid"].isin(individuals["hid"])] - - return individuals, activities, legs, legs_geo, households - - -def filter_no_location( - individuals: pd.DataFrame, - households: pd.DataFrame, - activities: pd.DataFrame, - legs: pd.DataFrame, - legs_geo: pd.DataFrame, -) -> tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame, pd.DataFrame, pd.DataFrame]: - """ - Cleans the provided DataFrames by removing rows without location data. Gets all pids - that have at least one row with missing location data, and removes all rows with - these pids. pids are geneerated from two sources: - - legs_geo with missing start_loc or end_loc - - individuals with missing hzone - Parameters - ---------- - individuals : pd.DataFrame - DataFrame containing individual data. - activities : pd.DataFrame - DataFrame containing activity data. - legs : pd.DataFrame - DataFrame containing legs data. - legs_geo : pd.DataFrame - DataFrame containing legs with geographic data. - households : pd.DataFrame - DataFrame containing household data. 
- - Returns - ------- - tuple - A tuple containing the cleaned DataFrames - (individuals_cleaned, households_cleaned, activities_cleaned, legs_cleaned, legs_geo_cleaned). - """ - # Identify rows in legs_geo where start_loc or end_loc are null - invalid_rows_legs_geo = legs_geo[ - legs_geo["start_loc"].isnull() | legs_geo["end_loc"].isnull() - ] - - # Extract the pid values associated with these rows - invalid_pids_legs_geo = invalid_rows_legs_geo["pid"].unique() - - # Identify rows in individuals where hzone is null - invalid_rows_individuals = individuals[individuals["hzone"].isnull()] - - # Extract the pid values associated with these rows - invalid_pids_individuals = invalid_rows_individuals["pid"].unique() - - # Combine the invalid pid values from both sources - invalid_pids = set(invalid_pids_legs_geo).union(set(invalid_pids_individuals)) - - # Remove rows with these pids from all DataFrames - individuals_cleaned = individuals[~individuals["pid"].isin(invalid_pids)] - activities_cleaned = activities[~activities["pid"].isin(invalid_pids)] - legs_cleaned = legs[~legs["pid"].isin(invalid_pids)] - legs_geo_cleaned = legs_geo[~legs_geo["pid"].isin(invalid_pids)] - - # Extract remaining hid values from individuals_cleaned - remaining_hids = individuals_cleaned["hid"].unique() - - # Filter households_cleaned to only include rows with hid values in remaining_hids - households_cleaned = households[households["hid"].isin(remaining_hids)] - - return ( - individuals_cleaned, - activities_cleaned, - legs_cleaned, - legs_geo_cleaned, - households_cleaned, - ) + @classmethod + def read(cls, config: Config) -> Self: + individuals = pd.read_csv(config.output_path / "people.csv") + households = pd.read_csv(config.output_path / "households.csv") + activities = pd.read_csv(config.output_path / "activities.csv") + legs = pd.read_csv(config.output_path / "legs.csv") + legs_geo = pd.read_parquet(config.output_path / "legs_with_locations.parquet") + return Population( + 
individuals=individuals, + households=households, + activities=activities, + legs=legs, + legs_geo=legs_geo, + ) + + def filter_by_pid(self) -> Self: + """ + Filter the input DataFrames to include only include people (pids) that exist in all + dfs + + Parameters + ---------- + individuals: pd.DataFrame + Individuals DataFrame. + activities: pd.DataFrame + Activities DataFrame. + legs: pd.DataFrame: + Legs DataFrame. + legs_geo: pd.DataFrame + Legs with geo DataFrame. + households: pd.DataFrame + Households DataFrame. + + Returns + ------- + tuple + A tuple containing the filtered DataFrames (individuals, activities, legs, legs_geo, households). + """ + # Identify common pids + common_pids = ( + set(self.individuals["pid"]) + .intersection(self.activities["pid"]) + .intersection(self.legs["pid"]) + .intersection(self.legs_geo["pid"]) + ) + + # Filter Individual Level DataFrames + individuals = self.individuals[self.individuals["pid"].isin(common_pids)] + activities = self.activities[self.activities["pid"].isin(common_pids)] + legs = self.legs[self.legs["pid"].isin(common_pids)] + legs_geo = self.legs_geo[self.legs_geo["pid"].isin(common_pids)] + + # Filter Household Level DataFrame + households = self.households[self.households["hid"].isin(individuals["hid"])] + + return Population( + individuals=individuals, + households=households, + activities=activities, + legs=legs, + legs_geo=legs_geo, + ) + + def filter_no_location(self) -> Self: + """ + Cleans the provided DataFrames by removing rows without location data. Gets all pids + that have at least one row with missing location data, and removes all rows with + these pids. pids are geneerated from two sources: + - legs_geo with missing start_loc or end_loc + - individuals with missing hzone + + Parameters + ---------- + individuals : pd.DataFrame + DataFrame containing individual data. + activities : pd.DataFrame + DataFrame containing activity data. + legs : pd.DataFrame + DataFrame containing legs data. 
+ legs_geo : pd.DataFrame + DataFrame containing legs with geographic data. + households : pd.DataFrame + DataFrame containing household data. + + Returns + ------- + tuple + A tuple containing the cleaned DataFrames + (individuals_cleaned, households_cleaned, activities_cleaned, legs_cleaned, legs_geo_cleaned). + """ + # Identify rows in legs_geo where start_loc or end_loc are null + invalid_rows_legs_geo = self.legs_geo[ + self.legs_geo["start_loc"].isnull() | self.legs_geo["end_loc"].isnull() + ] + + # Extract the pid values associated with these rows + invalid_pids_legs_geo = invalid_rows_legs_geo["pid"].unique() + + # Identify rows in individuals where hzone is null + invalid_rows_individuals = self.individuals[self.individuals["hzone"].isnull()] + + # Extract the pid values associated with these rows + invalid_pids_individuals = invalid_rows_individuals["pid"].unique() + + # Combine the invalid pid values from both sources + invalid_pids = set(invalid_pids_legs_geo).union(set(invalid_pids_individuals)) + + # Remove rows with these pids from all DataFrames + individuals_cleaned = self.individuals[ + ~self.individuals["pid"].isin(invalid_pids) + ] + activities_cleaned = self.activities[~self.activities["pid"].isin(invalid_pids)] + legs_cleaned = self.legs[~self.legs["pid"].isin(invalid_pids)] + legs_geo_cleaned = self.legs_geo[~self.legs_geo["pid"].isin(invalid_pids)] + + # Extract remaining hid values from individuals_cleaned + remaining_hids = individuals_cleaned["hid"].unique() + + # Filter households_cleaned to only include rows with hid values in remaining_hids + households_cleaned = self.households[ + self.households["hid"].isin(remaining_hids) + ] + + return Population( + individuals=individuals_cleaned, + households=households_cleaned, + activities=activities_cleaned, + legs=legs_cleaned, + legs_geo=legs_geo_cleaned, + ) def add_home_location_to_individuals( From 447496e894215eb83297637df2643a7d97855508 Mon Sep 17 00:00:00 2001 From: Sam Greenbury Date: 
Sat, 30 Nov 2024 13:11:16 +0000 Subject: [PATCH 32/56] Rename method --- scripts/0_preprocess_inputs.py | 2 +- scripts/3.1_assign_primary_feasible_zones.py | 2 +- scripts/3.2.1_assign_primary_zone_edu.py | 2 +- scripts/3.2.2_assign_primary_zone_work.py | 2 +- scripts/3.2.3_assign_secondary_zone.py | 2 +- scripts/3.3_assign_facility_all.py | 2 +- src/acbm/config.py | 6 +++--- 7 files changed, 9 insertions(+), 9 deletions(-) diff --git a/scripts/0_preprocess_inputs.py b/scripts/0_preprocess_inputs.py index 994b2eb..267de49 100644 --- a/scripts/0_preprocess_inputs.py +++ b/scripts/0_preprocess_inputs.py @@ -18,7 +18,7 @@ def main(config_file): logger.info("1. Reading in the boundary layer for the whole of England") - boundaries = config.get_boundaries() + boundaries = config.get_study_area_boundaries() ## --- Dissolve boundaries if resolution is MSOA diff --git a/scripts/3.1_assign_primary_feasible_zones.py b/scripts/3.1_assign_primary_feasible_zones.py index f45d13c..b61fa23 100644 --- a/scripts/3.1_assign_primary_feasible_zones.py +++ b/scripts/3.1_assign_primary_feasible_zones.py @@ -37,7 +37,7 @@ def main(config_file): # --- Study area boundaries logger.info("Loading study area boundaries") - boundaries = config.get_boundaries() + boundaries = config.get_study_area_boundaries() logger.info(f"Study area boundaries loaded and reprojected to {config.output_crs}") # --- Assign activity home locations to boundaries zoning system diff --git a/scripts/3.2.1_assign_primary_zone_edu.py b/scripts/3.2.1_assign_primary_zone_edu.py index f543edb..183a46f 100644 --- a/scripts/3.2.1_assign_primary_zone_edu.py +++ b/scripts/3.2.1_assign_primary_zone_edu.py @@ -33,7 +33,7 @@ def main(config_file): # --- boundaries logger.info("Loading study area boundaries") - boundaries = config.get_boundaries() + boundaries = config.get_study_area_boundaries() logger.info(f"Study area boundaries loaded and reprojected to {config.output_crs}") # --- osm POI data diff --git 
a/scripts/3.2.2_assign_primary_zone_work.py b/scripts/3.2.2_assign_primary_zone_work.py index eb66876..ede5a1d 100644 --- a/scripts/3.2.2_assign_primary_zone_work.py +++ b/scripts/3.2.2_assign_primary_zone_work.py @@ -29,7 +29,7 @@ def main(config_file): # --- boundaries logger.info("Loading study area boundaries") - boundaries = config.get_boundaries() + boundaries = config.get_study_area_boundaries() logger.info(f"Study area boundaries loaded and reprojected to {config.output_crs}") # osm POI data diff --git a/scripts/3.2.3_assign_secondary_zone.py b/scripts/3.2.3_assign_secondary_zone.py index ac0a95b..a87ea10 100644 --- a/scripts/3.2.3_assign_secondary_zone.py +++ b/scripts/3.2.3_assign_secondary_zone.py @@ -48,7 +48,7 @@ def main(config_file): # logger.info("Preprocessing: Adding OA21CD to the data") logger.info("Loading study area boundaries") - boundaries = config.get_boundaries() + boundaries = config.get_study_area_boundaries() logger.info(f"Study area boundaries loaded and reprojected to {config.output_crs}") # --- Assign activity home locations to boundaries zoning system diff --git a/scripts/3.3_assign_facility_all.py b/scripts/3.3_assign_facility_all.py index 813b00f..b46138c 100644 --- a/scripts/3.3_assign_facility_all.py +++ b/scripts/3.3_assign_facility_all.py @@ -44,7 +44,7 @@ def main(config_file): # --- Load data: Boundaries logger.info("Loading study area boundaries") - boundaries = config.get_boundaries() + boundaries = config.get_study_area_boundaries() logger.info(f"Study area boundaries loaded and reprojected to {config.output_crs}") # --- Prepprocess: add zone column to POI data diff --git a/src/acbm/config.py b/src/acbm/config.py index 610dc45..eef187a 100644 --- a/src/acbm/config.py +++ b/src/acbm/config.py @@ -361,9 +361,9 @@ def get_logger(self, name: str, filename: str) -> Logger: self.logs_path, ) - def get_boundaries(self) -> gpd.GeoDataFrame: - boundaries = gpd.read_file(self.study_areas_filepath) - return 
boundaries.to_crs(epsg=self.output_crs) + def get_study_area_boundaries(self) -> gpd.GeoDataFrame: + study_area = gpd.read_file(self.study_area_filepath) + return study_area.to_crs(epsg=self.output_crs) def load_config(filepath: str | Path) -> Config: From dfaaaabb5847c55fa025ecf181d51bb5c335f1fa Mon Sep 17 00:00:00 2001 From: Sam Greenbury Date: Sat, 30 Nov 2024 13:11:45 +0000 Subject: [PATCH 33/56] Rename field --- scripts/0_preprocess_inputs.py | 4 ++-- src/acbm/config.py | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/scripts/0_preprocess_inputs.py b/scripts/0_preprocess_inputs.py index 267de49..ebde65f 100644 --- a/scripts/0_preprocess_inputs.py +++ b/scripts/0_preprocess_inputs.py @@ -51,9 +51,9 @@ def main(config_file): boundaries_filtered = boundaries[boundaries["MSOA21CD"].isin(msoa21cd_values)] ## Save the output as parquet - logger.info(f"4. Saving the boundaries to {config.study_areas_filepath} path") + logger.info(f"4. Saving the boundaries to {config.study_area_filepath} path") boundaries_filtered.to_file( - config.study_areas_filepath, + config.study_area_filepath, driver="GeoJSON", ) diff --git a/src/acbm/config.py b/src/acbm/config.py index eef187a..5a6f41b 100644 --- a/src/acbm/config.py +++ b/src/acbm/config.py @@ -93,7 +93,7 @@ def make_dirs(self): os.makedirs(self.assigning_plots_path, exist_ok=True) os.makedirs(self.validation_plots_path, exist_ok=True) os.makedirs(self.activities_per_zone.parent, exist_ok=True) - os.makedirs(self.study_areas_filepath.parent, exist_ok=True) + os.makedirs(self.study_area_filepath.parent, exist_ok=True) os.makedirs(self.interim_path, exist_ok=True) os.makedirs(self.travel_times_estimates_filepath.parent, exist_ok=True) os.makedirs(self.spc_combined_filepath.parent, exist_ok=True) @@ -147,7 +147,7 @@ def spc_combined_filepath(self) -> Path: return self.interim_path / f"{self.region}_people_hh.parquet" @property - def study_areas_filepath(self) -> Path: + def study_area_filepath(self) -> 
Path: """Returns boundaries path.""" return ( self.output_path / "boundaries" / "study_area_zones.geojson" From ec45d94ec2ef2e006f2996a143942ae9c223e6bc Mon Sep 17 00:00:00 2001 From: Sam Greenbury Date: Sat, 30 Nov 2024 13:12:01 +0000 Subject: [PATCH 34/56] Remove todo --- src/acbm/config.py | 1 - 1 file changed, 1 deletion(-) diff --git a/src/acbm/config.py b/src/acbm/config.py index 5a6f41b..07f3acb 100644 --- a/src/acbm/config.py +++ b/src/acbm/config.py @@ -339,7 +339,6 @@ def destination_zone_id(cls, zone_id: str) -> str: def boundary_geography(self) -> str: return self.parameters.boundary_geography - # TODO: consider moving to method in config def init_rng(self): try: np.random.seed(self.seed) From 009d5ebeab7e8910e1719ebaf0b43ee6e6d6191b Mon Sep 17 00:00:00 2001 From: Sam Greenbury Date: Sat, 30 Nov 2024 13:23:59 +0000 Subject: [PATCH 35/56] Add python script for pipeline --- scripts/run_pipeline.py | 44 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 44 insertions(+) create mode 100755 scripts/run_pipeline.py diff --git a/scripts/run_pipeline.py b/scripts/run_pipeline.py new file mode 100755 index 0000000..4b0f20f --- /dev/null +++ b/scripts/run_pipeline.py @@ -0,0 +1,44 @@ +#!/usr/bin/env python + +import subprocess + +import click + + +def run_command(script_path, config_file): + command = ["python", script_path, "--config_file", config_file] + try: + subprocess.run(command, capture_output=False, text=True, check=True) + except subprocess.CalledProcessError as e: + print(f"Error: {e}") + print(f"Return code: {e.returncode}") + print(f"Output: {e.output}") + exit(1) + + +@click.command() +@click.option( + "--config_file", help="Filepath relative to repo root of config", type=str +) +@click.option( + "--skip-preprocess", help="Skip preprocess script", is_flag=True, default=False +) +@click.option("--skip-osmox", help="Skip osmox script", is_flag=True, default=False) +def main(config_file, skip_preprocess, skip_osmox): + if not skip_preprocess: + 
run_command("scripts/0_preprocess_inputs.py", config_file) + if not skip_osmox: + run_command("scripts/0.1_run_osmox.py", config_file) + run_command("scripts/1_prep_synthpop.py", config_file) + run_command("scripts/2_match_households_and_individuals.py", config_file) + run_command("scripts/3.1_assign_primary_feasible_zones.py", config_file) + run_command("scripts/3.2.1_assign_primary_zone_edu.py", config_file) + run_command("scripts/3.2.2_assign_primary_zone_work.py", config_file) + run_command("scripts/3.2.3_assign_secondary_zone.py", config_file) + run_command("scripts/3.3_assign_facility_all.py", config_file) + run_command("scripts/4_validation.py", config_file) + run_command("scripts/5_acbm_to_matsim_xml.py", config_file) + + +if __name__ == "__main__": + main() From 6d8ae6c95ce2f1dd938a1a314a76fce0a0db5338 Mon Sep 17 00:00:00 2001 From: Sam Greenbury Date: Sat, 30 Nov 2024 14:32:12 +0000 Subject: [PATCH 36/56] Fix preprocessing boundaries --- scripts/0_preprocess_inputs.py | 2 +- src/acbm/config.py | 4 ++++ 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/scripts/0_preprocess_inputs.py b/scripts/0_preprocess_inputs.py index ebde65f..50d2024 100644 --- a/scripts/0_preprocess_inputs.py +++ b/scripts/0_preprocess_inputs.py @@ -18,7 +18,7 @@ def main(config_file): logger.info("1. 
Reading in the boundary layer for the whole of England") - boundaries = config.get_study_area_boundaries() + boundaries = config.get_boundaries() ## --- Dissolve boundaries if resolution is MSOA diff --git a/src/acbm/config.py b/src/acbm/config.py index 07f3acb..c10eb0c 100644 --- a/src/acbm/config.py +++ b/src/acbm/config.py @@ -360,6 +360,10 @@ def get_logger(self, name: str, filename: str) -> Logger: self.logs_path, ) + def get_boundaries(self) -> gpd.GeoDataFrame: + boundaries = gpd.read_file(self.boundaries_filepath) + return boundaries.to_crs(epsg=self.output_crs) + def get_study_area_boundaries(self) -> gpd.GeoDataFrame: study_area = gpd.read_file(self.study_area_filepath) return study_area.to_crs(epsg=self.output_crs) From 96f8aca90bb84913adc992bbe42038cafa03ebc6 Mon Sep 17 00:00:00 2001 From: Sam Greenbury Date: Tue, 3 Dec 2024 13:53:42 +0000 Subject: [PATCH 37/56] Update CRS in osmox script --- scripts/0.1_run_osmox.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/scripts/0.1_run_osmox.py b/scripts/0.1_run_osmox.py index 0fb7830..bad7790 100644 --- a/scripts/0.1_run_osmox.py +++ b/scripts/0.1_run_osmox.py @@ -24,7 +24,9 @@ def main(config_file): "-f", "geoparquet", "-crs", - f"epsg:{config.output_crs}", + # TODO: check if this can be specified as the output CRS + # See: https://github.com/arup-group/osmox/blob/82602d411374ebc9fd33443f8f7c9816b63715ec/docs/osmox_run.md#L35-L38 + "epsg:27700", "-l", ], check=False, From eb468636178b77b71297eedcb6c07af7f2237068 Mon Sep 17 00:00:00 2001 From: Sam Greenbury Date: Tue, 3 Dec 2024 20:43:55 +0000 Subject: [PATCH 38/56] Add and revise CLI flags * Add flags for matching and assigning only * Revise config file flag to --config-file --- scripts/run_pipeline.py | 42 +++++++++++++++++++++++++++-------------- scripts/run_pipeline.sh | 22 ++++++++++----------- src/acbm/cli.py | 2 +- 3 files changed, 40 insertions(+), 26 deletions(-) diff --git a/scripts/run_pipeline.py b/scripts/run_pipeline.py 
index 4b0f20f..923aca6 100755 --- a/scripts/run_pipeline.py +++ b/scripts/run_pipeline.py @@ -6,7 +6,7 @@ def run_command(script_path, config_file): - command = ["python", script_path, "--config_file", config_file] + command = ["python", script_path, "--config-file", config_file] try: subprocess.run(command, capture_output=False, text=True, check=True) except subprocess.CalledProcessError as e: @@ -18,26 +18,40 @@ def run_command(script_path, config_file): @click.command() @click.option( - "--config_file", help="Filepath relative to repo root of config", type=str + "--config-file", help="Filepath relative to repo root of config", type=str ) @click.option( "--skip-preprocess", help="Skip preprocess script", is_flag=True, default=False ) @click.option("--skip-osmox", help="Skip osmox script", is_flag=True, default=False) -def main(config_file, skip_preprocess, skip_osmox): - if not skip_preprocess: +@click.option( + "--matching-only", + help="Only run the matching part of AcBM", + is_flag=True, + default=False, +) +@click.option( + "--assigning-only", + help="Only run the assigning part of AcBM", + is_flag=True, + default=False, +) +def main(config_file, skip_preprocess, skip_osmox, matching_only, assigning_only): + if not skip_preprocess and not assigning_only: run_command("scripts/0_preprocess_inputs.py", config_file) - if not skip_osmox: + if not skip_osmox and not matching_only and not assigning_only: run_command("scripts/0.1_run_osmox.py", config_file) - run_command("scripts/1_prep_synthpop.py", config_file) - run_command("scripts/2_match_households_and_individuals.py", config_file) - run_command("scripts/3.1_assign_primary_feasible_zones.py", config_file) - run_command("scripts/3.2.1_assign_primary_zone_edu.py", config_file) - run_command("scripts/3.2.2_assign_primary_zone_work.py", config_file) - run_command("scripts/3.2.3_assign_secondary_zone.py", config_file) - run_command("scripts/3.3_assign_facility_all.py", config_file) - 
run_command("scripts/4_validation.py", config_file) - run_command("scripts/5_acbm_to_matsim_xml.py", config_file) + if not assigning_only: + run_command("scripts/1_prep_synthpop.py", config_file) + run_command("scripts/2_match_households_and_individuals.py", config_file) + if not matching_only or assigning_only: + run_command("scripts/3.1_assign_primary_feasible_zones.py", config_file) + run_command("scripts/3.2.1_assign_primary_zone_edu.py", config_file) + run_command("scripts/3.2.2_assign_primary_zone_work.py", config_file) + run_command("scripts/3.2.3_assign_secondary_zone.py", config_file) + run_command("scripts/3.3_assign_facility_all.py", config_file) + run_command("scripts/4_validation.py", config_file) + run_command("scripts/5_acbm_to_matsim_xml.py", config_file) if __name__ == "__main__": diff --git a/scripts/run_pipeline.sh b/scripts/run_pipeline.sh index 025ed8c..0554b51 100755 --- a/scripts/run_pipeline.sh +++ b/scripts/run_pipeline.sh @@ -2,14 +2,14 @@ set -e -python scripts/0_preprocess_inputs.py --config_file $1 -python scripts/0.1_run_osmox.py --config_file $1 -python scripts/1_prep_synthpop.py --config_file $1 -python scripts/2_match_households_and_individuals.py --config_file $1 -python scripts/3.1_assign_primary_feasible_zones.py --config_file $1 -python scripts/3.2.1_assign_primary_zone_edu.py --config_file $1 -python scripts/3.2.2_assign_primary_zone_work.py --config_file $1 -python scripts/3.2.3_assign_secondary_zone.py --config_file $1 -python scripts/3.3_assign_facility_all.py --config_file $1 -python scripts/4_validation.py --config_file $1 -python scripts/5_acbm_to_matsim_xml.py --config_file $1 +python scripts/0_preprocess_inputs.py --config-file $1 +python scripts/0.1_run_osmox.py --config-file $1 +python scripts/1_prep_synthpop.py --config-file $1 +python scripts/2_match_households_and_individuals.py --config-file $1 +python scripts/3.1_assign_primary_feasible_zones.py --config-file $1 +python scripts/3.2.1_assign_primary_zone_edu.py 
--config-file $1 +python scripts/3.2.2_assign_primary_zone_work.py --config-file $1 +python scripts/3.2.3_assign_secondary_zone.py --config-file $1 +python scripts/3.3_assign_facility_all.py --config-file $1 +python scripts/4_validation.py --config-file $1 +python scripts/5_acbm_to_matsim_xml.py --config-file $1 diff --git a/src/acbm/cli.py b/src/acbm/cli.py index 053cb0e..6fb2d6e 100644 --- a/src/acbm/cli.py +++ b/src/acbm/cli.py @@ -7,7 +7,7 @@ def acbm_cli(c: Callable): @click.command() @click.option( - "--config_file", prompt="Filepath relative to repo root of config", type=str + "--config-file", prompt="Filepath relative to repo root of config", type=str ) def main(config_file): pd.options.mode.copy_on_write = True From 2807291fe7d110fddbfe6da4e5247ae67a8b45ac Mon Sep 17 00:00:00 2001 From: Sam Greenbury Date: Mon, 9 Dec 2024 16:00:12 +0000 Subject: [PATCH 39/56] Additional logging and tqdm for assigning work zones --- src/acbm/assigning/select_zone_work.py | 96 +++++++++++++++----------- 1 file changed, 57 insertions(+), 39 deletions(-) diff --git a/src/acbm/assigning/select_zone_work.py b/src/acbm/assigning/select_zone_work.py index ff9257a..5005842 100644 --- a/src/acbm/assigning/select_zone_work.py +++ b/src/acbm/assigning/select_zone_work.py @@ -5,6 +5,7 @@ import numpy as np import pandas as pd import pulp +from tqdm import tqdm logger = logging.getLogger("assigning_primary_zone") @@ -429,7 +430,9 @@ def select_work_zone_optimization( max_dev = pulp.LpVariable("max_dev", 0, None, pulp.LpContinuous) # Create binary variables for each person, origin, and destination - for person_id, origins in self.activities_to_assign.items(): + logger.info("Adding people to assignment problem") + for person_id, origins in tqdm(self.activities_to_assign.items()): + # logger.info(f"person id: {person_id}, origins: {origins}") for origin_id, feasible_zones in origins.items(): person_vars = [] # Select up to 10 zones for each person based on the actual flows (to limit number 
of variables) @@ -476,49 +479,64 @@ def select_work_zone_optimization( if person_vars: prob += pulp.lpSum(person_vars) == 1 - # Calculate assigned percentages and deviation for each origin-destination pair - for (from_zone, to_zone), percentage in self.percentages.items(): - # Create a variable for the absolute deviation (with lower bound of 0) - abs_dev = pulp.LpVariable( - f"abs_dev_{from_zone}_{to_zone}", 0, None, pulp.LpContinuous - ) - deviation_vars[(from_zone, to_zone)] = abs_dev - # Calculate the assigned flow for the origin-destination pair - assigned_flow = pulp.lpSum( - assignment_vars.get((person_id, from_zone, to_zone), 0) - for person_id, origins in self.activities_to_assign.items() - if (person_id, from_zone, to_zone) in assignment_vars - ) - # Calculate the assigned percentage based on the total flows from each origin zone - if from_zone in self.total_flows: - total_people = self.total_flows[from_zone] - assigned_percentage = assigned_flow / total_people - else: - assigned_percentage = 0 - logger.warning(f"Warning: Origin {from_zone} not found in total_flows.") - - if use_percentages: - # to satisfy both constraints, abs_dev will be a positive number (it is the larger of the two) - prob += assigned_percentage - percentage <= abs_dev - prob += percentage - assigned_percentage <= abs_dev - else: - prob += ( - assigned_flow - self.actual_flows[(from_zone, to_zone)] <= abs_dev + if weight_max_dev > 0.0: + logger.info("Added max deviations to assignment problem") + # Calculate assigned percentages and deviation for each origin-destination pair + for (from_zone, to_zone), percentage in tqdm(self.percentages.items()): + # Create a variable for the absolute deviation (with lower bound of 0) + abs_dev = pulp.LpVariable( + f"abs_dev_{from_zone}_{to_zone}", 0, None, pulp.LpContinuous ) - prob += ( - self.actual_flows[(from_zone, to_zone)] - assigned_flow <= abs_dev + deviation_vars[(from_zone, to_zone)] = abs_dev + # Calculate the assigned flow for the 
origin-destination pair + assigned_flow = pulp.lpSum( + assignment_vars.get((person_id, from_zone, to_zone), 0) + for person_id, origins in self.activities_to_assign.items() + if (person_id, from_zone, to_zone) in assignment_vars ) + # Calculate the assigned percentage based on the total flows from each origin zone + if from_zone in self.total_flows: + total_people = self.total_flows[from_zone] + assigned_percentage = assigned_flow / total_people + else: + assigned_percentage = 0 + logger.warning( + f"Warning: Origin {from_zone} not found in total_flows." + ) - # Update the maximum deviation variable: it is the maximum of all deviations - prob += max_dev >= abs_dev + if use_percentages: + # to satisfy both constraints, abs_dev will be a positive number (it is the larger of the two) + prob += assigned_percentage - percentage <= abs_dev + prob += percentage - assigned_percentage <= abs_dev + else: + prob += ( + assigned_flow - self.actual_flows[(from_zone, to_zone)] + <= abs_dev + ) + prob += ( + self.actual_flows[(from_zone, to_zone)] - assigned_flow + <= abs_dev + ) - # Weighted objective function - prob += ( - weight_max_dev * max_dev - + weight_total_dev * pulp.lpSum(deviation_vars.values()), - "WeightedObjective", - ) + # Update the maximum deviation variable: it is the maximum of all deviations + prob += max_dev >= abs_dev + + logger.info("Adding weighted objective to assignment problem") + # Weighted objective function + prob += ( + weight_max_dev * max_dev + + weight_total_dev * pulp.lpSum(deviation_vars.values()), + "WeightedObjective", + ) + else: + logger.info("Adding weighted objective to assignment problem") + # Weighted objective function + prob += ( + weight_total_dev * pulp.lpSum(deviation_vars.values()), + "WeightedObjective", + ) + logger.info("Solving assignment problem") prob.solve() if pulp.LpStatus[prob.status] != "Optimal": From 248925adf2828d02e1d9ed8747391d33039e047e Mon Sep 17 00:00:00 2001 From: Sam Greenbury Date: Tue, 10 Dec 2024 11:36:41 
+0000 Subject: [PATCH 40/56] Add tqdm, change logging to debug --- src/acbm/assigning/select_facility.py | 23 ++++++++++++----------- 1 file changed, 12 insertions(+), 11 deletions(-) diff --git a/src/acbm/assigning/select_facility.py b/src/acbm/assigning/select_facility.py index 42bfa50..3aafa42 100644 --- a/src/acbm/assigning/select_facility.py +++ b/src/acbm/assigning/select_facility.py @@ -5,6 +5,7 @@ import numpy as np import pandas as pd from shapely import Point +from tqdm import tqdm logger = logging.getLogger("assigning_facility_locations") @@ -67,7 +68,7 @@ def _select_facility( # Extract the destination zone from the input row destination_zone = row[row_destination_zone_col] if pd.isna(destination_zone): - logger.info(f"Activity {row.name}: Destination zone is NA") + logger.debug(f"Activity {row.name}: Destination zone is NA") # return {"id": np.nan, "geometry": np.nan} # TODO: check this replacement is correct return {row[unique_id_col]: (np.nan, np.nan)} @@ -82,13 +83,13 @@ def _select_facility( lambda x: row[row_activity_type_col] in x ) ] - logger.info( + logger.debug( f"Activity {row.name}: Found {len(facilities_valid)} matching facilities in zone {destination_zone}" ) # If no specific facilities found in the initial zone, and neighboring zones are provided, search in neighboring zones if facilities_valid.empty and neighboring_zones: - logger.info( + logger.debug( f"Activity {row.name}: No {row[row_activity_type_col]} facilities in {destination_zone}. 
Expanding search to neighboring zones" ) neighbors = neighboring_zones.get(destination_zone, []) @@ -100,13 +101,13 @@ def _select_facility( lambda x: row[row_activity_type_col] in x ) ] - logger.info( + logger.debug( f"Activity {row.name}: Found {len(facilities_valid)} matching facilities in neighboring zones" ) # If no specific facilities found and a fallback type is provided, attempt to find facilities matching the fallback type if facilities_valid.empty and fallback_type: - logger.info( + logger.debug( f"Activity {row.name}: No {row[row_activity_type_col]} facilities in zone {destination_zone} or neighboring zones, trying with {fallback_type}" ) # This should consider both the initial zone and neighboring zones if the previous step expanded the search @@ -115,20 +116,20 @@ def _select_facility( lambda x: fallback_type in x ) ] - logger.info( + logger.debug( f"Activity {row.name}: Found {len(facilities_valid)} matching facilities with type: {fallback_type}" ) # if no specific facilities found and fallback_to_random is True, take all facilities in the zone if facilities_valid.empty and fallback_to_random: - logger.info( + logger.debug( f"Activity {row.name}: No facilities in zone {destination_zone} with {gdf_facility_type_col} '{fallback_type or row[row_activity_type_col]}'. 
Sampling from all facilities in the zone" ) facilities_valid = facilities_in_zone # If no facilities found after all attempts, log the failure and return NaN if facilities_valid.empty: - logger.info( + logger.debug( f"Activity {row.name}: No facilities in zone {destination_zone} with {gdf_facility_type_col} '{fallback_type or row[row_activity_type_col]}'" ) return {row[unique_id_col]: (np.nan, np.nan)} @@ -147,11 +148,11 @@ def _select_facility( ) facilities_valid = facilities_valid.dropna(subset=["floor_area"]) facility = facilities_valid.sample(1, weights=facilities_valid["floor_area"]) - logger.info(f"Activity {row.name}: Sampled facility based on floor area)") + logger.debug(f"Activity {row.name}: Sampled facility based on floor area)") else: # Otherwise, randomly sample one facility from the valid facilities facility = facilities_valid.sample(1) - logger.info(f"Activity {row.name}: Sampled facility randomly") + logger.debug(f"Activity {row.name}: Sampled facility randomly") # Return the id and geometry of the selected facility return { @@ -205,7 +206,7 @@ def select_facility( selected_facilities = {} # Select a facility for each row in the DataFrame - for _, row in df.iterrows(): + for _, row in tqdm(df.iterrows(), total=df.shape[0]): selected_facility = _select_facility( row=row, unique_id_col=unique_id_col, From 8226ccd9b97561d0c8355159525fdace87f4c126 Mon Sep 17 00:00:00 2001 From: Sam Greenbury Date: Wed, 11 Dec 2024 17:31:19 +0000 Subject: [PATCH 41/56] Add multiprocessing to parallelize select_facility --- src/acbm/assigning/select_facility.py | 56 +++++++++++++++------------ 1 file changed, 32 insertions(+), 24 deletions(-) diff --git a/src/acbm/assigning/select_facility.py b/src/acbm/assigning/select_facility.py index 3aafa42..74f9d17 100644 --- a/src/acbm/assigning/select_facility.py +++ b/src/acbm/assigning/select_facility.py @@ -1,4 +1,6 @@ import logging +import multiprocessing +from multiprocessing import Pool from typing import Optional, Tuple 
import geopandas as gpd @@ -64,7 +66,7 @@ def _select_facility( {unique_id_col: (np.nan, np.nan)} if no suitable facility is found. """ # ----- Step 1. Find valid facilities in the destination zone - + pd.options.mode.copy_on_write = True # Extract the destination zone from the input row destination_zone = row[row_destination_zone_col] if pd.isna(destination_zone): @@ -202,29 +204,35 @@ def select_facility( dict[str, Tuple[str, Point ] | Tuple[float, float]]: Unique ID column as keys with selected facility ID and facility ID's geometry, or (np.nan, np.nan) """ - # Initialize a dictionary to store the selected facilities - selected_facilities = {} - - # Select a facility for each row in the DataFrame - for _, row in tqdm(df.iterrows(), total=df.shape[0]): - selected_facility = _select_facility( - row=row, - unique_id_col=unique_id_col, - facilities_gdf=facilities_gdf, - row_destination_zone_col=row_destination_zone_col, - row_activity_type_col=row_activity_type_col, - gdf_facility_zone_col=gdf_facility_zone_col, - gdf_facility_type_col=gdf_facility_type_col, - gdf_sample_col=gdf_sample_col, - neighboring_zones=neighboring_zones, - fallback_type=fallback_type, - fallback_to_random=fallback_to_random, - ) - - # Update the dictionary with the selected facility - selected_facilities.update(selected_facility) - - return selected_facilities + # TODO: update this to be configurable + n_threads = multiprocessing.cpu_count() + with Pool(n_threads) as p: + # Set to a large enough chunk size so that each thread + # has a sufficiently large amount of processing to do. 
+ chunk_size = 16_000 + d = {} + for start in tqdm(range(0, df.shape[0], chunk_size)): + chunk = df.iloc[start : start + chunk_size, :] + args = [ + ( + row, + unique_id_col, + facilities_gdf, + row_destination_zone_col, + gdf_facility_zone_col, + row_activity_type_col, + gdf_facility_type_col, + fallback_type, + fallback_to_random, + neighboring_zones, + gdf_sample_col, + ) + for _, row in chunk.iterrows() + ] + results = p.starmap(_select_facility, args) + for result in results: + d.update(result) + return d def map_activity_locations( From 4a3bfedf15241604e3075a855206906cddc3e14b Mon Sep 17 00:00:00 2001 From: Sam Greenbury Date: Thu, 12 Dec 2024 09:02:20 +0000 Subject: [PATCH 42/56] Update Pool argument and comment --- src/acbm/assigning/select_facility.py | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/src/acbm/assigning/select_facility.py b/src/acbm/assigning/select_facility.py index 74f9d17..0429a76 100644 --- a/src/acbm/assigning/select_facility.py +++ b/src/acbm/assigning/select_facility.py @@ -1,5 +1,4 @@ import logging -import multiprocessing from multiprocessing import Pool from typing import Optional, Tuple @@ -204,10 +203,9 @@ def select_facility( dict[str, Tuple[str, Point ] | Tuple[float, float]]: Unique ID column as keys with selected facility ID and facility ID's geometry, or (np.nan, np.nan) """ - # TODO: update this to be configurable - n_threads = multiprocessing.cpu_count() - with Pool(n_threads) as p: - # Set to a large enough chunk size so that each thread + # TODO: update this to be configurable, `None` is os.process_cpu_count() + with Pool(None) as p: + # Set to a large enough chunk size so that each process # has a sufficiently large amount of processing to do. 
chunk_size = 16_000 d = {} From fb3583984d49c310a1888d202e93a5b2a6adfa01 Mon Sep 17 00:00:00 2001 From: Sam Greenbury Date: Thu, 12 Dec 2024 09:09:13 +0000 Subject: [PATCH 43/56] Example config for Greater London --- config/greater-london.toml | 33 +++++++++++++++++++++++++++++++++ 1 file changed, 33 insertions(+) create mode 100644 config/greater-london.toml diff --git a/config/greater-london.toml b/config/greater-london.toml new file mode 100644 index 0000000..f6b8ba5 --- /dev/null +++ b/config/greater-london.toml @@ -0,0 +1,33 @@ +[parameters] +seed = 0 +region = "greater-london" +zone_id = "MSOA21CD" +travel_times = false +boundary_geography = "MSOA" +nts_years = [2019, 2021, 2022] +nts_regions = ["London"] +nts_day_of_week = 3 +output_crs = 4326 + +[work_assignment] +use_percentages = true +weight_max_dev = 0.0 +weight_total_dev = 1.0 +max_zones = 4 +commute_level = "MSOA" + +[matching] +required_columns = ["number_adults", "number_children"] +optional_columns = [ + "number_cars", + "num_pension_age", + "rural_urban_2_categories", + "employment_status", + "tenure_status", +] +n_matches = 10 +chunk_size = 50000 + +[postprocessing] +pam_jitter = 30 +pam_min_duration = 10 From e22a0da3b47ddae203299464e44b75f2b3584513 Mon Sep 17 00:00:00 2001 From: Sam Greenbury Date: Fri, 13 Dec 2024 09:50:42 +0000 Subject: [PATCH 44/56] Example config for Leeds --- config/leeds.toml | 42 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 42 insertions(+) create mode 100644 config/leeds.toml diff --git a/config/leeds.toml b/config/leeds.toml new file mode 100644 index 0000000..3e60054 --- /dev/null +++ b/config/leeds.toml @@ -0,0 +1,42 @@ +[parameters] +seed = 0 +region = "leeds" +zone_id = "OA21CD" +travel_times = false +boundary_geography = "OA" +nts_years = [2019, 2021, 2022] +nts_regions = [ + 'Yorkshire and the Humber', + 'North West', + 'North East', + 'East Midlands', + 'West Midlands', + 'East of England', + 'South East', + 'South West', +] +nts_day_of_week = 3 
+output_crs = 4326 + +[work_assignment] +use_percentages = false +weight_max_dev = 0.0 +weight_total_dev = 1.0 +max_zones = 4 +commute_level = "OA" + +[matching] +required_columns = ["number_adults", "number_children"] +optional_columns = [ + "number_cars", + "num_pension_age", + "rural_urban_2_categories", + "employment_status", + "tenure_status", +] +n_matches = 10 +chunk_size = 50000 + +[postprocessing] +pam_jitter = 30 +pam_min_duration = 10 From 1fae69afa6b346fe9b2fc1dfaf3a7dcb3e9f964e Mon Sep 17 00:00:00 2001 From: BZ-BowenZhang Date: Mon, 16 Dec 2024 01:45:12 +0000 Subject: [PATCH 45/56] Add validation notebook for between AcBM and Cencus --- notebooks/Validation_AcBM_with_Cencus.ipynb | 4406 +++++++++++++++++++ 1 file changed, 4406 insertions(+) create mode 100644 notebooks/Validation_AcBM_with_Cencus.ipynb diff --git a/notebooks/Validation_AcBM_with_Cencus.ipynb b/notebooks/Validation_AcBM_with_Cencus.ipynb new file mode 100644 index 0000000..2002cbb --- /dev/null +++ b/notebooks/Validation_AcBM_with_Cencus.ipynb @@ -0,0 +1,4406 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "import geopandas as gpd\n", + "import matplotlib.pyplot as plt\n", + "import numpy as np\n", + "import pandas as pd\n", + "from shapely.geometry import Point\n", + "from uatk_spc.builder import Builder\n", + "from uatk_spc.reader import Reader" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Input Data Paths" + ] + }, + { + "cell_type": "code", + "execution_count": 80, + "metadata": {}, + "outputs": [], + "source": [ + "geo_boundaries_path=\"../data/external/boundaries/oa_leeds.geojson\"\n", + "acbm_data_path=\"../data/outputs/36c777d445/legs_with_locations.parquet\"\n", + "Cencus21_OA_data_path=\"../data/external/cencus/ODWP01EW_OA.csv\"\n", + "Cencus21_MSOA_data_path=\"../data/external/cencus/ODWP01EW_MSOA.csv\"\n", + 
"#MSOA_boundaries_path=\"../data/external/boundaries/MSOA_DEC_2021_EW_NC_v3_2768211653661228621.geojson\"" + ] + }, + { + "cell_type": "code", + "execution_count": 89, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 89, + "metadata": {}, + "output_type": "execute_result" + }, + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "boundaries = gpd.read_file(geo_boundaries_path)\n", + "boundaries.plot()" + ] + }, + { + "cell_type": "code", + "execution_count": 90, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 90, + "metadata": {}, + "output_type": "execute_result" + }, + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "MSOA_boundaries=boundaries.dissolve(by='MSOA21CD')\n", + "MSOA_boundaries.plot()" + ] + }, + { + "cell_type": "code", + "execution_count": 91, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
geometryOBJECTIDOA21CDGlobalIDLSOA21CDLSOA21NMMSOA21NMLEP22CD1LEP22NM1LAD22CDLAD22NMrgn22cdrgn22nmctry22cdctry22nm
MSOA21CD
E02002330POLYGON ((439176.316 446904.131, 439048.115 44...55616E00058996d0e5f055-f45a-4c73-ba31-72a00babdfe1E01011702Leeds 001DLeeds 001E37000062Leeds City RegionE08000035LeedsE12000003Yorkshire and The HumberE92000001England
E02002331POLYGON ((439824.268 447781.097, 439846.716 44...55595E00058975a99aff6e-3846-46e8-a00a-493c3f470420E01011697Leeds 002ALeeds 002E37000062Leeds City RegionE08000035LeedsE12000003Yorkshire and The HumberE92000001England
E02002332POLYGON ((419465.000 445311.000, 419185.334 44...55004E00058339239f6484-5a0a-4e03-86cb-045ee4dd76bdE01011581Leeds 003ALeeds 003E37000062Leeds City RegionE08000035LeedsE12000003Yorkshire and The HumberE92000001England
E02002333POLYGON ((418604.775 444120.550, 418438.000 44...54998E00058333e1295d1f-a2f3-41d6-a7c8-937c7c36e003E01011572Leeds 004CLeeds 004E37000062Leeds City RegionE08000035LeedsE12000003Yorkshire and The HumberE92000001England
E02002334POLYGON ((444937.313 444639.906, 444928.976 44...55588E000589689df65ceb-2b55-471c-8e2d-aa803dfca3e5E01011709Leeds 005BLeeds 005E37000062Leeds City RegionE08000035LeedsE12000003Yorkshire and The HumberE92000001England
................................................
E02002437POLYGON ((427596.093 424322.312, 427618.813 42...54799E00058126b5da9cc9-e49f-4e9d-9f5b-a4d61278df1aE01011535Leeds 108ALeeds 108E37000062Leeds City RegionE08000035LeedsE12000003Yorkshire and The HumberE92000001England
E02006852POLYGON ((427455.473 436298.086, 427186.000 43...55515E00058891bd49b23f-805b-4dfa-8d58-e728f2ba239dE01011690Leeds 109CLeeds 109E37000062Leeds City RegionE08000035LeedsE12000003Yorkshire and The HumberE92000001England
E02006861POLYGON ((428682.183 435118.284, 428662.000 43...54333E0005762838dd690f-4fd2-4e5a-9abb-cae5f1745891E01011444Leeds 110CLeeds 110E37000062Leeds City RegionE08000035LeedsE12000003Yorkshire and The HumberE92000001England
E02006875POLYGON ((429164.000 432940.000, 429339.265 43...160254E00169780e25c17dd-793a-4a37-b838-1bcf74113e56E01033008Leeds 111ALeeds 111E37000062Leeds City RegionE08000035LeedsE12000003Yorkshire and The HumberE92000001England
E02006876POLYGON ((430650.424 430920.211, 430541.000 43...54471E00057776c09f4ae3-3dc7-417b-88bd-87f18eca3d64E01011470Leeds 112CLeeds 112E37000062Leeds City RegionE08000035LeedsE12000003Yorkshire and The HumberE92000001England
\n", + "

107 rows × 15 columns

\n", + "
" + ], + "text/plain": [ + " geometry OBJECTID \\\n", + "MSOA21CD \n", + "E02002330 POLYGON ((439176.316 446904.131, 439048.115 44... 55616 \n", + "E02002331 POLYGON ((439824.268 447781.097, 439846.716 44... 55595 \n", + "E02002332 POLYGON ((419465.000 445311.000, 419185.334 44... 55004 \n", + "E02002333 POLYGON ((418604.775 444120.550, 418438.000 44... 54998 \n", + "E02002334 POLYGON ((444937.313 444639.906, 444928.976 44... 55588 \n", + "... ... ... \n", + "E02002437 POLYGON ((427596.093 424322.312, 427618.813 42... 54799 \n", + "E02006852 POLYGON ((427455.473 436298.086, 427186.000 43... 55515 \n", + "E02006861 POLYGON ((428682.183 435118.284, 428662.000 43... 54333 \n", + "E02006875 POLYGON ((429164.000 432940.000, 429339.265 43... 160254 \n", + "E02006876 POLYGON ((430650.424 430920.211, 430541.000 43... 54471 \n", + "\n", + " OA21CD GlobalID LSOA21CD \\\n", + "MSOA21CD \n", + "E02002330 E00058996 d0e5f055-f45a-4c73-ba31-72a00babdfe1 E01011702 \n", + "E02002331 E00058975 a99aff6e-3846-46e8-a00a-493c3f470420 E01011697 \n", + "E02002332 E00058339 239f6484-5a0a-4e03-86cb-045ee4dd76bd E01011581 \n", + "E02002333 E00058333 e1295d1f-a2f3-41d6-a7c8-937c7c36e003 E01011572 \n", + "E02002334 E00058968 9df65ceb-2b55-471c-8e2d-aa803dfca3e5 E01011709 \n", + "... ... ... ... 
\n", + "E02002437 E00058126 b5da9cc9-e49f-4e9d-9f5b-a4d61278df1a E01011535 \n", + "E02006852 E00058891 bd49b23f-805b-4dfa-8d58-e728f2ba239d E01011690 \n", + "E02006861 E00057628 38dd690f-4fd2-4e5a-9abb-cae5f1745891 E01011444 \n", + "E02006875 E00169780 e25c17dd-793a-4a37-b838-1bcf74113e56 E01033008 \n", + "E02006876 E00057776 c09f4ae3-3dc7-417b-88bd-87f18eca3d64 E01011470 \n", + "\n", + " LSOA21NM MSOA21NM LEP22CD1 LEP22NM1 LAD22CD \\\n", + "MSOA21CD \n", + "E02002330 Leeds 001D Leeds 001 E37000062 Leeds City Region E08000035 \n", + "E02002331 Leeds 002A Leeds 002 E37000062 Leeds City Region E08000035 \n", + "E02002332 Leeds 003A Leeds 003 E37000062 Leeds City Region E08000035 \n", + "E02002333 Leeds 004C Leeds 004 E37000062 Leeds City Region E08000035 \n", + "E02002334 Leeds 005B Leeds 005 E37000062 Leeds City Region E08000035 \n", + "... ... ... ... ... ... \n", + "E02002437 Leeds 108A Leeds 108 E37000062 Leeds City Region E08000035 \n", + "E02006852 Leeds 109C Leeds 109 E37000062 Leeds City Region E08000035 \n", + "E02006861 Leeds 110C Leeds 110 E37000062 Leeds City Region E08000035 \n", + "E02006875 Leeds 111A Leeds 111 E37000062 Leeds City Region E08000035 \n", + "E02006876 Leeds 112C Leeds 112 E37000062 Leeds City Region E08000035 \n", + "\n", + " LAD22NM rgn22cd rgn22nm ctry22cd ctry22nm \n", + "MSOA21CD \n", + "E02002330 Leeds E12000003 Yorkshire and The Humber E92000001 England \n", + "E02002331 Leeds E12000003 Yorkshire and The Humber E92000001 England \n", + "E02002332 Leeds E12000003 Yorkshire and The Humber E92000001 England \n", + "E02002333 Leeds E12000003 Yorkshire and The Humber E92000001 England \n", + "E02002334 Leeds E12000003 Yorkshire and The Humber E92000001 England \n", + "... ... ... ... ... ... 
\n", + "E02002437 Leeds E12000003 Yorkshire and The Humber E92000001 England \n", + "E02006852 Leeds E12000003 Yorkshire and The Humber E92000001 England \n", + "E02006861 Leeds E12000003 Yorkshire and The Humber E92000001 England \n", + "E02006875 Leeds E12000003 Yorkshire and The Humber E92000001 England \n", + "E02006876 Leeds E12000003 Yorkshire and The Humber E92000001 England \n", + "\n", + "[107 rows x 15 columns]" + ] + }, + "execution_count": 91, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "MSOA_boundaries" + ] + }, + { + "cell_type": "code", + "execution_count": 92, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
MSOA21CD
OA21CD
E00056750E02002340
E00056751E02002338
E00056752E02002339
E00056753E02002339
E00056754E02002340
......
E00187149E02006876
E00187150E02002432
E00187151E02002337
E00187152E02006875
E00187153E02002404
\n", + "

2607 rows × 1 columns

\n", + "
" + ], + "text/plain": [ + " MSOA21CD\n", + "OA21CD \n", + "E00056750 E02002340\n", + "E00056751 E02002338\n", + "E00056752 E02002339\n", + "E00056753 E02002339\n", + "E00056754 E02002340\n", + "... ...\n", + "E00187149 E02006876\n", + "E00187150 E02002432\n", + "E00187151 E02002337\n", + "E00187152 E02006875\n", + "E00187153 E02002404\n", + "\n", + "[2607 rows x 1 columns]" + ] + }, + "execution_count": 92, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# generate OA to MSOA matching list\n", + "OA_to_MSOA_21 = boundaries[[\"OA21CD\", \"MSOA21CD\"]].drop_duplicates()\n", + "OA_to_MSOA_21.set_index(\"OA21CD\", inplace=True)\n", + "OA_to_MSOA_21" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "OA_to_MSOA_21" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Read in the AcBM datasets\n", + "\n", + "The dataset currently used is generated on 12/12/2024 for Leeds" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [], + "source": [ + "acbm_data=pd.read_parquet(\"../data/outputs/36c777d445/legs_with_locations.parquet\", engine='pyarrow')" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
pidhidozonedzonepurporigin activitydestination activitymodeseqtsttetdurationstart_location_idend_location_idstart_location_geometry_wktend_location_geometry_wkt
12115E00059028Noneworkhomeworkcar_passenger1.01900-01-01 06:30:001900-01-01 06:34:000:04:00NoneNoneNoneNone
15125E00057570E00058739workescortworkcar2.01900-01-01 06:34:001900-01-01 07:00:000:26:0013383664501381546612POINT (-1.5129058870428844 53.81732739735367)POINT (-1.474703422980641 53.85175150478472)
18136E00059028E00059102workhomeworkcar1.01900-01-01 08:30:001900-01-01 09:00:000:30:00None1397386050NonePOINT (-1.437978763907208 53.831371262924726)
30198E00059102E00056971workescortworkcar2.01900-01-01 08:00:001900-01-01 08:10:000:10:001671756810377084374POINT (-1.4220144369942826 53.84172919739339)POINT (-1.4471266040651745 53.87141888994728)
32198E00169783E00056971workotherworkwalk4.01900-01-01 13:30:001900-01-01 13:45:000:15:0060575406377084374POINT (-1.5479900578735037 53.80108316543812)POINT (-1.4471266040651745 53.87141888994728)
...................................................
1157754794606334832E00057790E00169583workhomeworkcar1.01900-01-01 07:20:001900-01-01 07:40:000:20:0012389004082671267350POINT (-1.5320639571262564 53.781896143736425)POINT (-1.5933444255448457 53.82849087561576)
1157788794621334839E00187098E00165696workhomeworkcar1.01900-01-01 08:35:001900-01-01 09:15:000:40:00876772744735155888POINT (-1.5290247468603333 53.789487589857636)POINT (-1.6533095476728066 53.87043507716729)
1157797794625334842E00057799E00057577workhomeworkcar1.01900-01-01 07:35:001900-01-01 07:50:000:15:0013208369481313992658POINT (-1.5213598592927677 53.76963697620853)POINT (-1.5186351332907104 53.816325638496004)
1157799794626334842E00057799Noneworkhomeworkcar_passenger1.01900-01-01 07:00:001900-01-01 07:20:000:20:001320836948NonePOINT (-1.5213598592927677 53.76963697620853)None
1157810794637334851E00170617E00056766workhomeworkcar1.01900-01-01 08:30:001900-01-01 09:05:000:35:00408724848523021654POINT (-1.5362161982189206 53.793278868899215)POINT (-1.7122020271735048 53.87390173302263)
\n", + "

140433 rows × 16 columns

\n", + "
" + ], + "text/plain": [ + " pid hid ozone dzone purp origin activity \\\n", + "12 11 5 E00059028 None work home \n", + "15 12 5 E00057570 E00058739 work escort \n", + "18 13 6 E00059028 E00059102 work home \n", + "30 19 8 E00059102 E00056971 work escort \n", + "32 19 8 E00169783 E00056971 work other \n", + "... ... ... ... ... ... ... \n", + "1157754 794606 334832 E00057790 E00169583 work home \n", + "1157788 794621 334839 E00187098 E00165696 work home \n", + "1157797 794625 334842 E00057799 E00057577 work home \n", + "1157799 794626 334842 E00057799 None work home \n", + "1157810 794637 334851 E00170617 E00056766 work home \n", + "\n", + " destination activity mode seq tst \\\n", + "12 work car_passenger 1.0 1900-01-01 06:30:00 \n", + "15 work car 2.0 1900-01-01 06:34:00 \n", + "18 work car 1.0 1900-01-01 08:30:00 \n", + "30 work car 2.0 1900-01-01 08:00:00 \n", + "32 work walk 4.0 1900-01-01 13:30:00 \n", + "... ... ... ... ... \n", + "1157754 work car 1.0 1900-01-01 07:20:00 \n", + "1157788 work car 1.0 1900-01-01 08:35:00 \n", + "1157797 work car 1.0 1900-01-01 07:35:00 \n", + "1157799 work car_passenger 1.0 1900-01-01 07:00:00 \n", + "1157810 work car 1.0 1900-01-01 08:30:00 \n", + "\n", + " tet duration start_location_id end_location_id \\\n", + "12 1900-01-01 06:34:00 0:04:00 None None \n", + "15 1900-01-01 07:00:00 0:26:00 1338366450 1381546612 \n", + "18 1900-01-01 09:00:00 0:30:00 None 1397386050 \n", + "30 1900-01-01 08:10:00 0:10:00 1671756810 377084374 \n", + "32 1900-01-01 13:45:00 0:15:00 60575406 377084374 \n", + "... ... ... ... ... 
\n", + "1157754 1900-01-01 07:40:00 0:20:00 1238900408 2671267350 \n", + "1157788 1900-01-01 09:15:00 0:40:00 876772744 735155888 \n", + "1157797 1900-01-01 07:50:00 0:15:00 1320836948 1313992658 \n", + "1157799 1900-01-01 07:20:00 0:20:00 1320836948 None \n", + "1157810 1900-01-01 09:05:00 0:35:00 408724848 523021654 \n", + "\n", + " start_location_geometry_wkt \\\n", + "12 None \n", + "15 POINT (-1.5129058870428844 53.81732739735367) \n", + "18 None \n", + "30 POINT (-1.4220144369942826 53.84172919739339) \n", + "32 POINT (-1.5479900578735037 53.80108316543812) \n", + "... ... \n", + "1157754 POINT (-1.5320639571262564 53.781896143736425) \n", + "1157788 POINT (-1.5290247468603333 53.789487589857636) \n", + "1157797 POINT (-1.5213598592927677 53.76963697620853) \n", + "1157799 POINT (-1.5213598592927677 53.76963697620853) \n", + "1157810 POINT (-1.5362161982189206 53.793278868899215) \n", + "\n", + " end_location_geometry_wkt \n", + "12 None \n", + "15 POINT (-1.474703422980641 53.85175150478472) \n", + "18 POINT (-1.437978763907208 53.831371262924726) \n", + "30 POINT (-1.4471266040651745 53.87141888994728) \n", + "32 POINT (-1.4471266040651745 53.87141888994728) \n", + "... ... 
\n", + "1157754 POINT (-1.5933444255448457 53.82849087561576) \n", + "1157788 POINT (-1.6533095476728066 53.87043507716729) \n", + "1157797 POINT (-1.5186351332907104 53.816325638496004) \n", + "1157799 None \n", + "1157810 POINT (-1.7122020271735048 53.87390173302263) \n", + "\n", + "[140433 rows x 16 columns]" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# filter the data to only include work trips\n", + "acbm_data_work=acbm_data[acbm_data[\"purp\"]==\"work\"]\n", + "acbm_data_work" + ] + }, + { + "cell_type": "code", + "execution_count": 133, + "metadata": {}, + "outputs": [], + "source": [ + "acbm_data_work=acbm_data_work.join(OA_to_MSOA_21, on=\"ozone\").rename(columns={\"MSOA21CD\": \"ozone_MSOA\"}).join(OA_to_MSOA_21, on=\"dzone\").rename(columns={\"MSOA21CD\": \"dzone_MSOA\"})" + ] + }, + { + "cell_type": "code", + "execution_count": 134, + "metadata": {}, + "outputs": [], + "source": [ + "# Filter the ozone & dzone is not none\n", + "acbm_data_work=acbm_data_work[~acbm_data_work[\"ozone\"].isna() & ~acbm_data_work[\"dzone\"].isna()]" + ] + }, + { + "cell_type": "code", + "execution_count": 135, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
pidhidozonedzonepurporigin activitydestination activitymodeseqtsttetdurationstart_location_idend_location_idstart_location_geometry_wktend_location_geometry_wktozone_MSOAdzone_MSOA
18136E00059028E00059102workhomeworkcar1.01900-01-01 08:30:001900-01-01 09:00:000:30:00None1397386050NonePOINT (-1.437978763907208 53.831371262924726)E02002330E02002351
35208E00059012E00169786workhomeworkcar1.01900-01-01 07:50:001900-01-01 08:35:000:45:00None952311802NonePOINT (-1.5385467208011925 53.79821785944038)E02002330E02006875
43249E00059045E00058296workhomeworkcar1.01900-01-01 08:10:001900-01-01 08:42:000:32:00None262780150NonePOINT (-1.5182841630529933 53.864789509699904)E02002330E02002341
452711E00059022E00056971workhomeworkwalk1.01900-01-01 07:30:001900-01-01 07:55:000:25:00None377084374NonePOINT (-1.4471266040651745 53.87141888994728)E02002330E02002359
573313E00059016E00058877workhomeworkcar1.01900-01-01 07:05:001900-01-01 07:45:000:40:00None294189496NonePOINT (-1.520049852688877 53.806563321006514)E02002330E02002393
.........................................................
1157752794605334832E00057790E00057239workhomeworkcar1.01900-01-01 07:20:001900-01-01 07:40:000:20:0012389004081166563594POINT (-1.5320639571262564 53.781896143736425)POINT (-1.5467417317175023 53.82775152739614)E02006876E02002363
1157754794606334832E00057790E00169583workhomeworkcar1.01900-01-01 07:20:001900-01-01 07:40:000:20:0012389004082671267350POINT (-1.5320639571262564 53.781896143736425)POINT (-1.5933444255448457 53.82849087561576)E02006876E02006852
1157788794621334839E00187098E00165696workhomeworkcar1.01900-01-01 08:35:001900-01-01 09:15:000:40:00876772744735155888POINT (-1.5290247468603333 53.789487589857636)POINT (-1.6533095476728066 53.87043507716729)E02006876E02002336
1157797794625334842E00057799E00057577workhomeworkcar1.01900-01-01 07:35:001900-01-01 07:50:000:15:0013208369481313992658POINT (-1.5213598592927677 53.76963697620853)POINT (-1.5186351332907104 53.816325638496004)E02006876E02002377
1157810794637334851E00170617E00056766workhomeworkcar1.01900-01-01 08:30:001900-01-01 09:05:000:35:00408724848523021654POINT (-1.5362161982189206 53.793278868899215)POINT (-1.7122020271735048 53.87390173302263)E02006876E02002338
\n", + "

102712 rows × 18 columns

\n", + "
" + ], + "text/plain": [ + " pid hid ozone dzone purp origin activity \\\n", + "18 13 6 E00059028 E00059102 work home \n", + "35 20 8 E00059012 E00169786 work home \n", + "43 24 9 E00059045 E00058296 work home \n", + "45 27 11 E00059022 E00056971 work home \n", + "57 33 13 E00059016 E00058877 work home \n", + "... ... ... ... ... ... ... \n", + "1157752 794605 334832 E00057790 E00057239 work home \n", + "1157754 794606 334832 E00057790 E00169583 work home \n", + "1157788 794621 334839 E00187098 E00165696 work home \n", + "1157797 794625 334842 E00057799 E00057577 work home \n", + "1157810 794637 334851 E00170617 E00056766 work home \n", + "\n", + " destination activity mode seq tst \\\n", + "18 work car 1.0 1900-01-01 08:30:00 \n", + "35 work car 1.0 1900-01-01 07:50:00 \n", + "43 work car 1.0 1900-01-01 08:10:00 \n", + "45 work walk 1.0 1900-01-01 07:30:00 \n", + "57 work car 1.0 1900-01-01 07:05:00 \n", + "... ... ... ... ... \n", + "1157752 work car 1.0 1900-01-01 07:20:00 \n", + "1157754 work car 1.0 1900-01-01 07:20:00 \n", + "1157788 work car 1.0 1900-01-01 08:35:00 \n", + "1157797 work car 1.0 1900-01-01 07:35:00 \n", + "1157810 work car 1.0 1900-01-01 08:30:00 \n", + "\n", + " tet duration start_location_id end_location_id \\\n", + "18 1900-01-01 09:00:00 0:30:00 None 1397386050 \n", + "35 1900-01-01 08:35:00 0:45:00 None 952311802 \n", + "43 1900-01-01 08:42:00 0:32:00 None 262780150 \n", + "45 1900-01-01 07:55:00 0:25:00 None 377084374 \n", + "57 1900-01-01 07:45:00 0:40:00 None 294189496 \n", + "... ... ... ... ... 
\n", + "1157752 1900-01-01 07:40:00 0:20:00 1238900408 1166563594 \n", + "1157754 1900-01-01 07:40:00 0:20:00 1238900408 2671267350 \n", + "1157788 1900-01-01 09:15:00 0:40:00 876772744 735155888 \n", + "1157797 1900-01-01 07:50:00 0:15:00 1320836948 1313992658 \n", + "1157810 1900-01-01 09:05:00 0:35:00 408724848 523021654 \n", + "\n", + " start_location_geometry_wkt \\\n", + "18 None \n", + "35 None \n", + "43 None \n", + "45 None \n", + "57 None \n", + "... ... \n", + "1157752 POINT (-1.5320639571262564 53.781896143736425) \n", + "1157754 POINT (-1.5320639571262564 53.781896143736425) \n", + "1157788 POINT (-1.5290247468603333 53.789487589857636) \n", + "1157797 POINT (-1.5213598592927677 53.76963697620853) \n", + "1157810 POINT (-1.5362161982189206 53.793278868899215) \n", + "\n", + " end_location_geometry_wkt ozone_MSOA dzone_MSOA \n", + "18 POINT (-1.437978763907208 53.831371262924726) E02002330 E02002351 \n", + "35 POINT (-1.5385467208011925 53.79821785944038) E02002330 E02006875 \n", + "43 POINT (-1.5182841630529933 53.864789509699904) E02002330 E02002341 \n", + "45 POINT (-1.4471266040651745 53.87141888994728) E02002330 E02002359 \n", + "57 POINT (-1.520049852688877 53.806563321006514) E02002330 E02002393 \n", + "... ... ... ... 
\n", + "1157752 POINT (-1.5467417317175023 53.82775152739614) E02006876 E02002363 \n", + "1157754 POINT (-1.5933444255448457 53.82849087561576) E02006876 E02006852 \n", + "1157788 POINT (-1.6533095476728066 53.87043507716729) E02006876 E02002336 \n", + "1157797 POINT (-1.5186351332907104 53.816325638496004) E02006876 E02002377 \n", + "1157810 POINT (-1.7122020271735048 53.87390173302263) E02006876 E02002338 \n", + "\n", + "[102712 rows x 18 columns]" + ] + }, + "execution_count": 135, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Filter the home to work trips comparing with the cencus data\n", + "acbm_data_work=acbm_data_work[acbm_data_work[\"origin activity\"]==\"home\"]\n", + "acbm_data_work" + ] + }, + { + "cell_type": "code", + "execution_count": 136, + "metadata": {}, + "outputs": [], + "source": [ + "# deduplicate the data for only keep one commuting trip for the same person\n", + "acbm_data_work_unique=acbm_data_work.drop_duplicates(\"pid\",keep=\"first\")" + ] + }, + { + "cell_type": "code", + "execution_count": 139, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/var/folders/jl/8nv_1f6915lct8qfk5b81hx80000gn/T/ipykernel_5022/1763523430.py:2: DeprecationWarning: DataFrameGroupBy.apply operated on the grouping columns. This behavior is deprecated, and in a future version of pandas the grouping columns will be excluded from the operation. Either pass `include_groups=False` to exclude the groupings or explicitly select the grouping columns after groupby to silence this warning.\n", + " OD_matrix_acbm_OA = acbm_data_work_unique.groupby(\"ozone\").apply(lambda x: x.value_counts(\"dzone\")).reset_index()\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
ozonedzonecount
0E00056750E001695784
1E00056750E000585644
2E00056750E001870752
3E00056750E000570682
4E00056750E000574032
............
56407E00187151E000568231
56408E00187151E000568221
56409E00187151E000567681
56410E00187151E001870801
56411E00187152E001871521
\n", + "

56412 rows × 3 columns

\n", + "
" + ], + "text/plain": [ + " ozone dzone count\n", + "0 E00056750 E00169578 4\n", + "1 E00056750 E00058564 4\n", + "2 E00056750 E00187075 2\n", + "3 E00056750 E00057068 2\n", + "4 E00056750 E00057403 2\n", + "... ... ... ...\n", + "56407 E00187151 E00056823 1\n", + "56408 E00187151 E00056822 1\n", + "56409 E00187151 E00056768 1\n", + "56410 E00187151 E00187080 1\n", + "56411 E00187152 E00187152 1\n", + "\n", + "[56412 rows x 3 columns]" + ] + }, + "execution_count": 139, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Generate the OD matrix from the ACBM data at OA level\n", + "OD_matrix_acbm_OA = acbm_data_work_unique.groupby(\"ozone\").apply(lambda x: x.value_counts(\"dzone\")).reset_index()\n", + "OD_matrix_acbm_OA" + ] + }, + { + "cell_type": "code", + "execution_count": 140, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/var/folders/jl/8nv_1f6915lct8qfk5b81hx80000gn/T/ipykernel_5022/4150659795.py:2: DeprecationWarning: DataFrameGroupBy.apply operated on the grouping columns. This behavior is deprecated, and in a future version of pandas the grouping columns will be excluded from the operation. Either pass `include_groups=False` to exclude the groupings or explicitly select the grouping columns after groupby to silence this warning.\n", + " OD_matrix_acbm_MSOA = acbm_data_work_unique.groupby(\"ozone_MSOA\").apply(lambda x: x.value_counts(\"dzone_MSOA\")).reset_index()\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
ozone_MSOAdzone_MSOAcount
0E02002330E02002359447
1E02002330E02002335131
2E02002330E0200235192
3E02002330E0200687554
4E02002330E0200239331
............
9202E02006876E020023621
9203E02006876E020023691
9204E02006876E020024231
9205E02006876E020023851
9206E02006876E020023641
\n", + "

9207 rows × 3 columns

\n", + "
" + ], + "text/plain": [ + " ozone_MSOA dzone_MSOA count\n", + "0 E02002330 E02002359 447\n", + "1 E02002330 E02002335 131\n", + "2 E02002330 E02002351 92\n", + "3 E02002330 E02006875 54\n", + "4 E02002330 E02002393 31\n", + "... ... ... ...\n", + "9202 E02006876 E02002362 1\n", + "9203 E02006876 E02002369 1\n", + "9204 E02006876 E02002423 1\n", + "9205 E02006876 E02002385 1\n", + "9206 E02006876 E02002364 1\n", + "\n", + "[9207 rows x 3 columns]" + ] + }, + "execution_count": 140, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Aggregate the OD matrix to MSOA level\n", + "OD_matrix_acbm_MSOA = acbm_data_work_unique.groupby(\"ozone_MSOA\").apply(lambda x: x.value_counts(\"dzone_MSOA\")).reset_index()\n", + "OD_matrix_acbm_MSOA" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Read in the Cencus 2021 Data" + ] + }, + { + "cell_type": "code", + "execution_count": 32, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
Middle layer Super Output Areas codeMiddle layer Super Output Areas labelMSOA of workplace codeMSOA of workplace labelPlace of work indicator (4 categories) codePlace of work indicator (4 categories) labelCount
0E02000001City of London 001-8Does not apply-8Does not apply2653
1E02000001City of London 001999999999Workplace is outside the UK2Other (including offshore installation, workin...35
2E02000001City of London 001E02000001City of London 0011Mainly working at or from home, No fixed place3871
3E02000001City of London 001E02000001City of London 0013Working in the UK but not working at or from home436
4E02000001City of London 001E02000016Barking and Dagenham 0153Working in the UK but not working at or from home2
........................
1856451W02000428Swansea 032W02000418Carmarthenshire 0273Working in the UK but not working at or from home2
1856452W02000428Swansea 032W02000422Cardiff 0483Working in the UK but not working at or from home5
1856453W02000428Swansea 032W02000423Cardiff 0493Working in the UK but not working at or from home1
1856454W02000428Swansea 032W02000428Swansea 0321Mainly working at or from home, No fixed place2128
1856455W02000428Swansea 032W02000428Swansea 0323Working in the UK but not working at or from home160
\n", + "

1856456 rows × 7 columns

\n", + "
" + ], + "text/plain": [ + " Middle layer Super Output Areas code \\\n", + "0 E02000001 \n", + "1 E02000001 \n", + "2 E02000001 \n", + "3 E02000001 \n", + "4 E02000001 \n", + "... ... \n", + "1856451 W02000428 \n", + "1856452 W02000428 \n", + "1856453 W02000428 \n", + "1856454 W02000428 \n", + "1856455 W02000428 \n", + "\n", + " Middle layer Super Output Areas label MSOA of workplace code \\\n", + "0 City of London 001 -8 \n", + "1 City of London 001 999999999 \n", + "2 City of London 001 E02000001 \n", + "3 City of London 001 E02000001 \n", + "4 City of London 001 E02000016 \n", + "... ... ... \n", + "1856451 Swansea 032 W02000418 \n", + "1856452 Swansea 032 W02000422 \n", + "1856453 Swansea 032 W02000423 \n", + "1856454 Swansea 032 W02000428 \n", + "1856455 Swansea 032 W02000428 \n", + "\n", + " MSOA of workplace label \\\n", + "0 Does not apply \n", + "1 Workplace is outside the UK \n", + "2 City of London 001 \n", + "3 City of London 001 \n", + "4 Barking and Dagenham 015 \n", + "... ... \n", + "1856451 Carmarthenshire 027 \n", + "1856452 Cardiff 048 \n", + "1856453 Cardiff 049 \n", + "1856454 Swansea 032 \n", + "1856455 Swansea 032 \n", + "\n", + " Place of work indicator (4 categories) code \\\n", + "0 -8 \n", + "1 2 \n", + "2 1 \n", + "3 3 \n", + "4 3 \n", + "... ... \n", + "1856451 3 \n", + "1856452 3 \n", + "1856453 3 \n", + "1856454 1 \n", + "1856455 3 \n", + "\n", + " Place of work indicator (4 categories) label Count \n", + "0 Does not apply 2653 \n", + "1 Other (including offshore installation, workin... 35 \n", + "2 Mainly working at or from home, No fixed place 3871 \n", + "3 Working in the UK but not working at or from home 436 \n", + "4 Working in the UK but not working at or from home 2 \n", + "... ... ... 
\n", + "1856451 Working in the UK but not working at or from home 2 \n", + "1856452 Working in the UK but not working at or from home 5 \n", + "1856453 Working in the UK but not working at or from home 1 \n", + "1856454 Mainly working at or from home, No fixed place 2128 \n", + "1856455 Working in the UK but not working at or from home 160 \n", + "\n", + "[1856456 rows x 7 columns]" + ] + }, + "execution_count": 32, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Read the census data at MSOA level\n", + "Cencus21_MSOA = pd.read_csv(Cencus21_MSOA_data_path)\n", + "\n", + "Cencus21_MSOA\n", + "\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Validate the flows at OA level" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
Output Areas codeOutput Areas labelOA of workplace codeOA of workplace labelPlace of work indicator (4 categories) codePlace of work indicator (4 categories) labelCount
0E00000001E00000001-8Does not apply-8Does not apply73
1E00000001E00000001E00000001E000000011Mainly working at or from home, No fixed place64
2E00000001E00000001E00004731E000047313Working in the UK but not working at or from home1
3E00000001E00000001E00006038E000060383Working in the UK but not working at or from home1
4E00000001E00000001E00013547E000135473Working in the UK but not working at or from home1
........................
10741180W00010697W00010697W00010425W000104253Working in the UK but not working at or from home1
10741181W00010697W00010697W00010563W000105633Working in the UK but not working at or from home1
10741182W00010697W00010697W00010623W000106233Working in the UK but not working at or from home1
10741183W00010697W00010697W00010671W000106713Working in the UK but not working at or from home1
10741184W00010697W00010697W00010697W000106971Mainly working at or from home, No fixed place44
\n", + "

10741185 rows × 7 columns

\n", + "
" + ], + "text/plain": [ + " Output Areas code Output Areas label OA of workplace code \\\n", + "0 E00000001 E00000001 -8 \n", + "1 E00000001 E00000001 E00000001 \n", + "2 E00000001 E00000001 E00004731 \n", + "3 E00000001 E00000001 E00006038 \n", + "4 E00000001 E00000001 E00013547 \n", + "... ... ... ... \n", + "10741180 W00010697 W00010697 W00010425 \n", + "10741181 W00010697 W00010697 W00010563 \n", + "10741182 W00010697 W00010697 W00010623 \n", + "10741183 W00010697 W00010697 W00010671 \n", + "10741184 W00010697 W00010697 W00010697 \n", + "\n", + " OA of workplace label Place of work indicator (4 categories) code \\\n", + "0 Does not apply -8 \n", + "1 E00000001 1 \n", + "2 E00004731 3 \n", + "3 E00006038 3 \n", + "4 E00013547 3 \n", + "... ... ... \n", + "10741180 W00010425 3 \n", + "10741181 W00010563 3 \n", + "10741182 W00010623 3 \n", + "10741183 W00010671 3 \n", + "10741184 W00010697 1 \n", + "\n", + " Place of work indicator (4 categories) label Count \n", + "0 Does not apply 73 \n", + "1 Mainly working at or from home, No fixed place 64 \n", + "2 Working in the UK but not working at or from home 1 \n", + "3 Working in the UK but not working at or from home 1 \n", + "4 Working in the UK but not working at or from home 1 \n", + "... ... ... 
\n", + "10741180 Working in the UK but not working at or from home 1 \n", + "10741181 Working in the UK but not working at or from home 1 \n", + "10741182 Working in the UK but not working at or from home 1 \n", + "10741183 Working in the UK but not working at or from home 1 \n", + "10741184 Mainly working at or from home, No fixed place 44 \n", + "\n", + "[10741185 rows x 7 columns]" + ] + }, + "execution_count": 33, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Read the census data at OA level\n", + "\n", + "Cencus21_OA = pd.read_csv(Cencus21_OA_data_path)\n", + "\n", + "Cencus21_OA\n" + ] + }, + { + "cell_type": "code", + "execution_count": 61, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
Output Areas codeOutput Areas labelOA of workplace codeOA of workplace labelPlace of work indicator (4 categories) codePlace of work indicator (4 categories) labelCount
3208071E00056750E00056750E00056750E000567501Mainly working at or from home, No fixed place56
3208072E00056750E00056750E00056752E000567523Working in the UK but not working at or from home2
3208073E00056750E00056750E00056753E000567533Working in the UK but not working at or from home8
3208074E00056750E00056750E00056766E000567663Working in the UK but not working at or from home2
3208075E00056750E00056750E00056784E000567843Working in the UK but not working at or from home1
........................
9992534E00187153E00187153E00187074E001870743Working in the UK but not working at or from home1
9992535E00187153E00187153E00187120E001871203Working in the UK but not working at or from home1
9992536E00187153E00187153E00187137E001871373Working in the UK but not working at or from home1
9992537E00187153E00187153E00187152E001871523Working in the UK but not working at or from home11
9992538E00187153E00187153E00187153E001871531Mainly working at or from home, No fixed place82
\n", + "

108789 rows × 7 columns

\n", + "
" + ], + "text/plain": [ + " Output Areas code Output Areas label OA of workplace code \\\n", + "3208071 E00056750 E00056750 E00056750 \n", + "3208072 E00056750 E00056750 E00056752 \n", + "3208073 E00056750 E00056750 E00056753 \n", + "3208074 E00056750 E00056750 E00056766 \n", + "3208075 E00056750 E00056750 E00056784 \n", + "... ... ... ... \n", + "9992534 E00187153 E00187153 E00187074 \n", + "9992535 E00187153 E00187153 E00187120 \n", + "9992536 E00187153 E00187153 E00187137 \n", + "9992537 E00187153 E00187153 E00187152 \n", + "9992538 E00187153 E00187153 E00187153 \n", + "\n", + " OA of workplace label Place of work indicator (4 categories) code \\\n", + "3208071 E00056750 1 \n", + "3208072 E00056752 3 \n", + "3208073 E00056753 3 \n", + "3208074 E00056766 3 \n", + "3208075 E00056784 3 \n", + "... ... ... \n", + "9992534 E00187074 3 \n", + "9992535 E00187120 3 \n", + "9992536 E00187137 3 \n", + "9992537 E00187152 3 \n", + "9992538 E00187153 1 \n", + "\n", + " Place of work indicator (4 categories) label Count \n", + "3208071 Mainly working at or from home, No fixed place 56 \n", + "3208072 Working in the UK but not working at or from home 2 \n", + "3208073 Working in the UK but not working at or from home 8 \n", + "3208074 Working in the UK but not working at or from home 2 \n", + "3208075 Working in the UK but not working at or from home 1 \n", + "... ... ... 
\n", + "9992534 Working in the UK but not working at or from home 1 \n", + "9992535 Working in the UK but not working at or from home 1 \n", + "9992536 Working in the UK but not working at or from home 1 \n", + "9992537 Working in the UK but not working at or from home 11 \n", + "9992538 Mainly working at or from home, No fixed place 82 \n", + "\n", + "[108789 rows x 7 columns]" + ] + }, + "execution_count": 61, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Filter the cencus data to case study area\n", + "\n", + "Cencus21_OA_case_study_area = Cencus21_OA[Cencus21_OA[\"Output Areas code\"].isin(boundaries[\"OA21CD\"].unique())]\n", + "Cencus21_OA_case_study_area = Cencus21_OA_case_study_area[Cencus21_OA_case_study_area[\"OA of workplace code\"].isin(boundaries[\"OA21CD\"].unique())]\n", + "Cencus21_OA_case_study_area" + ] + }, + { + "cell_type": "code", + "execution_count": 63, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
Output Areas codeOutput Areas labelOA of workplace codeOA of workplace labelPlace of work indicator (4 categories) codeCencus_countACBM_count
0E00056750E00056750E00056750E000567501560.0
1E00056750E00056750E00056752E00056752320.0
2E00056750E00056750E00056753E00056753380.0
3E00056750E00056750E00056766E00056766320.0
4E00056750E00056750E00056784E00056784310.0
........................
108784E00187153E00187153E00187074E00187074310.0
108785E00187153E00187153E00187120E00187120310.0
108786E00187153E00187153E00187137E00187137310.0
108787E00187153E00187153E00187152E001871523110.0
108788E00187153E00187153E00187153E001871531820.0
\n", + "

108789 rows × 7 columns

\n", + "
" + ], + "text/plain": [ + " Output Areas code Output Areas label OA of workplace code \\\n", + "0 E00056750 E00056750 E00056750 \n", + "1 E00056750 E00056750 E00056752 \n", + "2 E00056750 E00056750 E00056753 \n", + "3 E00056750 E00056750 E00056766 \n", + "4 E00056750 E00056750 E00056784 \n", + "... ... ... ... \n", + "108784 E00187153 E00187153 E00187074 \n", + "108785 E00187153 E00187153 E00187120 \n", + "108786 E00187153 E00187153 E00187137 \n", + "108787 E00187153 E00187153 E00187152 \n", + "108788 E00187153 E00187153 E00187153 \n", + "\n", + " OA of workplace label Place of work indicator (4 categories) code \\\n", + "0 E00056750 1 \n", + "1 E00056752 3 \n", + "2 E00056753 3 \n", + "3 E00056766 3 \n", + "4 E00056784 3 \n", + "... ... ... \n", + "108784 E00187074 3 \n", + "108785 E00187120 3 \n", + "108786 E00187137 3 \n", + "108787 E00187152 3 \n", + "108788 E00187153 1 \n", + "\n", + " Cencus_count ACBM_count \n", + "0 56 0.0 \n", + "1 2 0.0 \n", + "2 8 0.0 \n", + "3 2 0.0 \n", + "4 1 0.0 \n", + "... ... ... 
\n", + "108784 1 0.0 \n", + "108785 1 0.0 \n", + "108786 1 0.0 \n", + "108787 11 0.0 \n", + "108788 82 0.0 \n", + "\n", + "[108789 rows x 7 columns]" + ] + }, + "execution_count": 63, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# combine the cencus data with the ACBM data at OA level to compare the difference\n", + "\n", + "Combine_Df_OA = pd.merge(Cencus21_OA_case_study_area, OD_matrix_acbm_OA, left_on= [\"Output Areas code\",\"OA of workplace code\"], right_on=[\"ozone\",\"dzone\"], how=\"left\").fillna(0)\n", + "Combine_Df_OA.drop(columns=[\"ozone\",\"dzone\",'Place of work indicator (4 categories) label'],inplace=True)\n", + "Combine_Df_OA.rename(columns={\"Count\": \"Cencus_count\",\"count\": \"ACBM_count\"},inplace=True)\n", + "Combine_Df_OA\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Census counts: 326921 \n", + "Current ACBM counts: 86463\n", + "The R^2 value is: 0.005976117773372713\n", + "The RMSE value is: 11.364015603538766\n", + "The MAE value is: 2.8038680381288548\n" + ] + } + ], + "source": [ + "# calculate the R^2, RMSE and MAE values at OA level for the comparison\n", + " \n", + "sum(Combine_Df_OA[\"Cencus_count\"])\n", + "sum(Combine_Df_OA[\"ACBM_count\"])\n", + "print(\n", + " \"Census counts:\",\n", + " sum(Combine_Df_OA[\"Cencus_count\"]),\n", + " \"\\nCurrent ACBM counts:\",\n", + " int (sum(Combine_Df_OA[\"ACBM_count\"]),)\n", + " )\n", + "\n", + "r2 = np.corrcoef(Combine_Df_OA[\"Cencus_count\"], Combine_Df_OA[\"ACBM_count\"])[0, 1] ** 2\n", + "print(\"The R^2 value is: \", r2)\n", + "\n", + "rmse = np.sqrt(np.mean((Combine_Df_OA[\"Cencus_count\"] - Combine_Df_OA[\"ACBM_count\"]) ** 2))\n", + "print(\"The RMSE value is: \", rmse)\n", + "\n", + "mes = np.mean(np.abs(Combine_Df_OA[\"Cencus_count\"] - Combine_Df_OA[\"ACBM_count\"]))\n", + "print(\"The MAE value is: \", mes)\n", + "\n" + ] + 
}, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Validate the flows at MSOA level" + ] + }, + { + "cell_type": "code", + "execution_count": 65, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
Middle layer Super Output Areas codeMiddle layer Super Output Areas labelMSOA of workplace codeMSOA of workplace labelPlace of work indicator (4 categories) codePlace of work indicator (4 categories) labelCount
0E02000001City of London 001-8Does not apply-8Does not apply2653
1E02000001City of London 001999999999Workplace is outside the UK2Other (including offshore installation, workin...35
2E02000001City of London 001E02000001City of London 0011Mainly working at or from home, No fixed place3871
3E02000001City of London 001E02000001City of London 0013Working in the UK but not working at or from home436
4E02000001City of London 001E02000016Barking and Dagenham 0153Working in the UK but not working at or from home2
........................
1856451W02000428Swansea 032W02000418Carmarthenshire 0273Working in the UK but not working at or from home2
1856452W02000428Swansea 032W02000422Cardiff 0483Working in the UK but not working at or from home5
1856453W02000428Swansea 032W02000423Cardiff 0493Working in the UK but not working at or from home1
1856454W02000428Swansea 032W02000428Swansea 0321Mainly working at or from home, No fixed place2128
1856455W02000428Swansea 032W02000428Swansea 0323Working in the UK but not working at or from home160
\n", + "

1856456 rows × 7 columns

\n", + "
" + ], + "text/plain": [ + " Middle layer Super Output Areas code \\\n", + "0 E02000001 \n", + "1 E02000001 \n", + "2 E02000001 \n", + "3 E02000001 \n", + "4 E02000001 \n", + "... ... \n", + "1856451 W02000428 \n", + "1856452 W02000428 \n", + "1856453 W02000428 \n", + "1856454 W02000428 \n", + "1856455 W02000428 \n", + "\n", + " Middle layer Super Output Areas label MSOA of workplace code \\\n", + "0 City of London 001 -8 \n", + "1 City of London 001 999999999 \n", + "2 City of London 001 E02000001 \n", + "3 City of London 001 E02000001 \n", + "4 City of London 001 E02000016 \n", + "... ... ... \n", + "1856451 Swansea 032 W02000418 \n", + "1856452 Swansea 032 W02000422 \n", + "1856453 Swansea 032 W02000423 \n", + "1856454 Swansea 032 W02000428 \n", + "1856455 Swansea 032 W02000428 \n", + "\n", + " MSOA of workplace label \\\n", + "0 Does not apply \n", + "1 Workplace is outside the UK \n", + "2 City of London 001 \n", + "3 City of London 001 \n", + "4 Barking and Dagenham 015 \n", + "... ... \n", + "1856451 Carmarthenshire 027 \n", + "1856452 Cardiff 048 \n", + "1856453 Cardiff 049 \n", + "1856454 Swansea 032 \n", + "1856455 Swansea 032 \n", + "\n", + " Place of work indicator (4 categories) code \\\n", + "0 -8 \n", + "1 2 \n", + "2 1 \n", + "3 3 \n", + "4 3 \n", + "... ... \n", + "1856451 3 \n", + "1856452 3 \n", + "1856453 3 \n", + "1856454 1 \n", + "1856455 3 \n", + "\n", + " Place of work indicator (4 categories) label Count \n", + "0 Does not apply 2653 \n", + "1 Other (including offshore installation, workin... 35 \n", + "2 Mainly working at or from home, No fixed place 3871 \n", + "3 Working in the UK but not working at or from home 436 \n", + "4 Working in the UK but not working at or from home 2 \n", + "... ... ... 
\n", + "1856451 Working in the UK but not working at or from home 2 \n", + "1856452 Working in the UK but not working at or from home 5 \n", + "1856453 Working in the UK but not working at or from home 1 \n", + "1856454 Mainly working at or from home, No fixed place 2128 \n", + "1856455 Working in the UK but not working at or from home 160 \n", + "\n", + "[1856456 rows x 7 columns]" + ] + }, + "execution_count": 65, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Read the census data at OA level\n", + "\n", + "Cencus21_MSOA = pd.read_csv(Cencus21_MSOA_data_path)\n", + "\n", + "Cencus21_MSOA" + ] + }, + { + "cell_type": "code", + "execution_count": 68, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
Middle layer Super Output Areas codeMiddle layer Super Output Areas labelMSOA of workplace codeMSOA of workplace labelPlace of work indicator (4 categories) codePlace of work indicator (4 categories) labelCount
713972E02002330Leeds 001E02002330Leeds 0011Mainly working at or from home, No fixed place1572
713973E02002330Leeds 001E02002330Leeds 0013Working in the UK but not working at or from home30
713974E02002330Leeds 001E02002331Leeds 0023Working in the UK but not working at or from home366
713975E02002330Leeds 001E02002332Leeds 0033Working in the UK but not working at or from home6
713976E02002330Leeds 001E02002333Leeds 0043Working in the UK but not working at or from home2
........................
1719013E02006876Leeds 112E02006852Leeds 1093Working in the UK but not working at or from home6
1719014E02006876Leeds 112E02006861Leeds 1103Working in the UK but not working at or from home8
1719015E02006876Leeds 112E02006875Leeds 1113Working in the UK but not working at or from home471
1719016E02006876Leeds 112E02006876Leeds 1121Mainly working at or from home, No fixed place2243
1719017E02006876Leeds 112E02006876Leeds 1123Working in the UK but not working at or from home233
\n", + "

10403 rows × 7 columns

\n", + "
" + ], + "text/plain": [ + " Middle layer Super Output Areas code \\\n", + "713972 E02002330 \n", + "713973 E02002330 \n", + "713974 E02002330 \n", + "713975 E02002330 \n", + "713976 E02002330 \n", + "... ... \n", + "1719013 E02006876 \n", + "1719014 E02006876 \n", + "1719015 E02006876 \n", + "1719016 E02006876 \n", + "1719017 E02006876 \n", + "\n", + " Middle layer Super Output Areas label MSOA of workplace code \\\n", + "713972 Leeds 001 E02002330 \n", + "713973 Leeds 001 E02002330 \n", + "713974 Leeds 001 E02002331 \n", + "713975 Leeds 001 E02002332 \n", + "713976 Leeds 001 E02002333 \n", + "... ... ... \n", + "1719013 Leeds 112 E02006852 \n", + "1719014 Leeds 112 E02006861 \n", + "1719015 Leeds 112 E02006875 \n", + "1719016 Leeds 112 E02006876 \n", + "1719017 Leeds 112 E02006876 \n", + "\n", + " MSOA of workplace label Place of work indicator (4 categories) code \\\n", + "713972 Leeds 001 1 \n", + "713973 Leeds 001 3 \n", + "713974 Leeds 002 3 \n", + "713975 Leeds 003 3 \n", + "713976 Leeds 004 3 \n", + "... ... ... \n", + "1719013 Leeds 109 3 \n", + "1719014 Leeds 110 3 \n", + "1719015 Leeds 111 3 \n", + "1719016 Leeds 112 1 \n", + "1719017 Leeds 112 3 \n", + "\n", + " Place of work indicator (4 categories) label Count \n", + "713972 Mainly working at or from home, No fixed place 1572 \n", + "713973 Working in the UK but not working at or from home 30 \n", + "713974 Working in the UK but not working at or from home 366 \n", + "713975 Working in the UK but not working at or from home 6 \n", + "713976 Working in the UK but not working at or from home 2 \n", + "... ... ... 
\n", + "1719013 Working in the UK but not working at or from home 6 \n", + "1719014 Working in the UK but not working at or from home 8 \n", + "1719015 Working in the UK but not working at or from home 471 \n", + "1719016 Mainly working at or from home, No fixed place 2243 \n", + "1719017 Working in the UK but not working at or from home 233 \n", + "\n", + "[10403 rows x 7 columns]" + ] + }, + "execution_count": 68, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Filter the cencus data to case study area\n", + "\n", + "Cencus21_MSOA_case_study_area = Cencus21_MSOA[Cencus21_MSOA[\"Middle layer Super Output Areas code\"].isin(boundaries[\"MSOA21CD\"].unique())]\n", + "Cencus21_MSOA_case_study_area = Cencus21_MSOA_case_study_area[Cencus21_MSOA_case_study_area[\"MSOA of workplace code\"].isin(boundaries[\"MSOA21CD\"].unique())]\n", + "Cencus21_MSOA_case_study_area" + ] + }, + { + "cell_type": "code", + "execution_count": 97, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
OrigDestDist
0E02002330E020023300.000000
1E02002330E020023313532.892984
2E02002330E0200233220083.427194
3E02002330E0200233319168.244335
4E02002330E020023345888.497237
............
11444E02006876E020024377329.451871
11445E02006876E020068527882.022965
11446E02006876E020068615685.936850
11447E02006876E020068752992.168008
11448E02006876E020068760.000000
\n", + "

11449 rows × 3 columns

\n", + "
" + ], + "text/plain": [ + " Orig Dest Dist\n", + "0 E02002330 E02002330 0.000000\n", + "1 E02002330 E02002331 3532.892984\n", + "2 E02002330 E02002332 20083.427194\n", + "3 E02002330 E02002333 19168.244335\n", + "4 E02002330 E02002334 5888.497237\n", + "... ... ... ...\n", + "11444 E02006876 E02002437 7329.451871\n", + "11445 E02006876 E02006852 7882.022965\n", + "11446 E02006876 E02006861 5685.936850\n", + "11447 E02006876 E02006875 2992.168008\n", + "11448 E02006876 E02006876 0.000000\n", + "\n", + "[11449 rows x 3 columns]" + ] + }, + "execution_count": 97, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "distances = (\n", + " MSOA_boundaries.rename_axis(\"Orig\")\n", + " .centroid.apply(\n", + " lambda x: MSOA_boundaries.rename_axis(\"Dest\").centroid.distance(x)\n", + " )\n", + " .stack()\n", + " .reset_index()\n", + ")\n", + "distances.rename(columns={0: \"Dist\"}, inplace=True)\n", + "distances" + ] + }, + { + "cell_type": "code", + "execution_count": 153, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
Middle layer Super Output Areas codeMiddle layer Super Output Areas labelMSOA of workplace codeMSOA of workplace labelPlace of work indicator (4 categories) codeCencus_countACBM_count
0E02002330Leeds 001E02002330Leeds 001115720.0
1E02002330Leeds 001E02002330Leeds 0013300.0
2E02002330Leeds 001E02002331Leeds 00233660.0
3E02002330Leeds 001E02002332Leeds 003360.0
4E02002330Leeds 001E02002333Leeds 004320.0
........................
10398E02006876Leeds 112E02006852Leeds 109365.0
10399E02006876Leeds 112E02006861Leeds 110384.0
10400E02006876Leeds 112E02006875Leeds 111347188.0
10401E02006876Leeds 112E02006876Leeds 1121224329.0
10402E02006876Leeds 112E02006876Leeds 112323329.0
\n", + "

10403 rows × 7 columns

\n", + "
" + ], + "text/plain": [ + " Middle layer Super Output Areas code \\\n", + "0 E02002330 \n", + "1 E02002330 \n", + "2 E02002330 \n", + "3 E02002330 \n", + "4 E02002330 \n", + "... ... \n", + "10398 E02006876 \n", + "10399 E02006876 \n", + "10400 E02006876 \n", + "10401 E02006876 \n", + "10402 E02006876 \n", + "\n", + " Middle layer Super Output Areas label MSOA of workplace code \\\n", + "0 Leeds 001 E02002330 \n", + "1 Leeds 001 E02002330 \n", + "2 Leeds 001 E02002331 \n", + "3 Leeds 001 E02002332 \n", + "4 Leeds 001 E02002333 \n", + "... ... ... \n", + "10398 Leeds 112 E02006852 \n", + "10399 Leeds 112 E02006861 \n", + "10400 Leeds 112 E02006875 \n", + "10401 Leeds 112 E02006876 \n", + "10402 Leeds 112 E02006876 \n", + "\n", + " MSOA of workplace label Place of work indicator (4 categories) code \\\n", + "0 Leeds 001 1 \n", + "1 Leeds 001 3 \n", + "2 Leeds 002 3 \n", + "3 Leeds 003 3 \n", + "4 Leeds 004 3 \n", + "... ... ... \n", + "10398 Leeds 109 3 \n", + "10399 Leeds 110 3 \n", + "10400 Leeds 111 3 \n", + "10401 Leeds 112 1 \n", + "10402 Leeds 112 3 \n", + "\n", + " Cencus_count ACBM_count \n", + "0 1572 0.0 \n", + "1 30 0.0 \n", + "2 366 0.0 \n", + "3 6 0.0 \n", + "4 2 0.0 \n", + "... ... ... 
\n", + "10398 6 5.0 \n", + "10399 8 4.0 \n", + "10400 471 88.0 \n", + "10401 2243 29.0 \n", + "10402 233 29.0 \n", + "\n", + "[10403 rows x 7 columns]" + ] + }, + "execution_count": 153, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# combine the cencus data with the ACBM data at MSOA level to compare the difference\n", + "\n", + "Combine_Df_MSOA = pd.merge(Cencus21_MSOA_case_study_area, OD_matrix_acbm_MSOA, left_on= [\"Middle layer Super Output Areas code\",\"MSOA of workplace code\"], right_on=[\"ozone_MSOA\",\"dzone_MSOA\"], how=\"left\").fillna(0)\n", + "Combine_Df_MSOA.drop(columns=['Place of work indicator (4 categories) label',\"ozone_MSOA\",\"dzone_MSOA\"],inplace=True)\n", + "Combine_Df_MSOA.rename(columns={\"Count\": \"Cencus_count\",\"count\": \"ACBM_count\"},inplace=True)\n", + "Combine_Df_MSOA" + ] + }, + { + "cell_type": "code", + "execution_count": 154, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
Middle layer Super Output Areas codeMiddle layer Super Output Areas labelMSOA of workplace codeMSOA of workplace labelPlace of work indicator (4 categories) codeCencus_countACBM_countDist
0E02002330Leeds 001E02002330Leeds 001115720.00.000000
1E02002330Leeds 001E02002330Leeds 0013300.00.000000
2E02002330Leeds 001E02002331Leeds 00233660.03532.892984
3E02002330Leeds 001E02002332Leeds 003360.020083.427194
4E02002330Leeds 001E02002333Leeds 004320.019168.244335
...........................
10398E02006876Leeds 112E02006852Leeds 109365.07882.022965
10399E02006876Leeds 112E02006861Leeds 110384.05685.936850
10400E02006876Leeds 112E02006875Leeds 111347188.02992.168008
10401E02006876Leeds 112E02006876Leeds 1121224329.00.000000
10402E02006876Leeds 112E02006876Leeds 112323329.00.000000
\n", + "

10403 rows × 8 columns

\n", + "
" + ], + "text/plain": [ + " Middle layer Super Output Areas code \\\n", + "0 E02002330 \n", + "1 E02002330 \n", + "2 E02002330 \n", + "3 E02002330 \n", + "4 E02002330 \n", + "... ... \n", + "10398 E02006876 \n", + "10399 E02006876 \n", + "10400 E02006876 \n", + "10401 E02006876 \n", + "10402 E02006876 \n", + "\n", + " Middle layer Super Output Areas label MSOA of workplace code \\\n", + "0 Leeds 001 E02002330 \n", + "1 Leeds 001 E02002330 \n", + "2 Leeds 001 E02002331 \n", + "3 Leeds 001 E02002332 \n", + "4 Leeds 001 E02002333 \n", + "... ... ... \n", + "10398 Leeds 112 E02006852 \n", + "10399 Leeds 112 E02006861 \n", + "10400 Leeds 112 E02006875 \n", + "10401 Leeds 112 E02006876 \n", + "10402 Leeds 112 E02006876 \n", + "\n", + " MSOA of workplace label Place of work indicator (4 categories) code \\\n", + "0 Leeds 001 1 \n", + "1 Leeds 001 3 \n", + "2 Leeds 002 3 \n", + "3 Leeds 003 3 \n", + "4 Leeds 004 3 \n", + "... ... ... \n", + "10398 Leeds 109 3 \n", + "10399 Leeds 110 3 \n", + "10400 Leeds 111 3 \n", + "10401 Leeds 112 1 \n", + "10402 Leeds 112 3 \n", + "\n", + " Cencus_count ACBM_count Dist \n", + "0 1572 0.0 0.000000 \n", + "1 30 0.0 0.000000 \n", + "2 366 0.0 3532.892984 \n", + "3 6 0.0 20083.427194 \n", + "4 2 0.0 19168.244335 \n", + "... ... ... ... 
\n", + "10398 6 5.0 7882.022965 \n", + "10399 8 4.0 5685.936850 \n", + "10400 471 88.0 2992.168008 \n", + "10401 2243 29.0 0.000000 \n", + "10402 233 29.0 0.000000 \n", + "\n", + "[10403 rows x 8 columns]" + ] + }, + "execution_count": 154, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Join the distances to the combined dataframe\n", + "Combine_Df_MSOA = pd.merge(Combine_Df_MSOA, distances, left_on=[\"Middle layer Super Output Areas code\", \"MSOA of workplace code\"], right_on=[\"Orig\", \"Dest\"], how=\"left\")\n", + "Combine_Df_MSOA.drop(columns=[\"Orig\", \"Dest\"], inplace=True)\n", + "Combine_Df_MSOA" + ] + }, + { + "cell_type": "code", + "execution_count": 155, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Census counts: 326885 \n", + "Current ACBM counts: 98431\n", + "The R^2 value is: 0.02268671717234933\n", + "The RMSE value is: 172.01178789952573\n", + "The MAE value is: 26.782082091704314\n" + ] + } + ], + "source": [ + "# calculate the R^2, RMSE and MAE values at MSOA level for the comparison\n", + "\n", + "sum(Combine_Df_MSOA[\"Cencus_count\"])\n", + "sum(Combine_Df_MSOA[\"ACBM_count\"])\n", + "print(\n", + " \"Census counts:\",\n", + " sum(Combine_Df_MSOA[\"Cencus_count\"]),\n", + " \"\\nCurrent ACBM counts:\",\n", + " int (sum(Combine_Df_MSOA[\"ACBM_count\"]),)\n", + " )\n", + "\n", + "r2 = np.corrcoef(Combine_Df_MSOA[\"Cencus_count\"], Combine_Df_MSOA[\"ACBM_count\"])[0, 1] ** 2\n", + "print(\"The R^2 value is: \", r2)\n", + "\n", + "rmse = np.sqrt(np.mean((Combine_Df_MSOA[\"Cencus_count\"] - Combine_Df_MSOA[\"ACBM_count\"]) ** 2))\n", + "print(\"The RMSE value is: \", rmse)\n", + "\n", + "mes = np.mean(np.abs(Combine_Df_MSOA[\"Cencus_count\"] - Combine_Df_MSOA[\"ACBM_count\"]))\n", + "print(\"The MAE value is: \", mes)\n" + ] + }, + { + "cell_type": "code", + "execution_count": 156, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
Middle layer Super Output Areas codeMiddle layer Super Output Areas labelMSOA of workplace codeMSOA of workplace labelPlace of work indicator (4 categories) codeCencus_countACBM_countDist
0E02002330Leeds 001E02002330Leeds 001115720.00.000000
1E02002330Leeds 001E02002330Leeds 0013300.00.000000
2E02002330Leeds 001E02002331Leeds 00233660.03532.892984
3E02002330Leeds 001E02002332Leeds 003360.020083.427194
4E02002330Leeds 001E02002333Leeds 004320.019168.244335
...........................
10398E02006876Leeds 112E02006852Leeds 109365.07882.022965
10399E02006876Leeds 112E02006861Leeds 110384.05685.936850
10400E02006876Leeds 112E02006875Leeds 111347188.02992.168008
10401E02006876Leeds 112E02006876Leeds 1121224329.00.000000
10402E02006876Leeds 112E02006876Leeds 112323329.00.000000
\n", + "

10403 rows × 8 columns

\n", + "
" + ], + "text/plain": [ + " Middle layer Super Output Areas code \\\n", + "0 E02002330 \n", + "1 E02002330 \n", + "2 E02002330 \n", + "3 E02002330 \n", + "4 E02002330 \n", + "... ... \n", + "10398 E02006876 \n", + "10399 E02006876 \n", + "10400 E02006876 \n", + "10401 E02006876 \n", + "10402 E02006876 \n", + "\n", + " Middle layer Super Output Areas label MSOA of workplace code \\\n", + "0 Leeds 001 E02002330 \n", + "1 Leeds 001 E02002330 \n", + "2 Leeds 001 E02002331 \n", + "3 Leeds 001 E02002332 \n", + "4 Leeds 001 E02002333 \n", + "... ... ... \n", + "10398 Leeds 112 E02006852 \n", + "10399 Leeds 112 E02006861 \n", + "10400 Leeds 112 E02006875 \n", + "10401 Leeds 112 E02006876 \n", + "10402 Leeds 112 E02006876 \n", + "\n", + " MSOA of workplace label Place of work indicator (4 categories) code \\\n", + "0 Leeds 001 1 \n", + "1 Leeds 001 3 \n", + "2 Leeds 002 3 \n", + "3 Leeds 003 3 \n", + "4 Leeds 004 3 \n", + "... ... ... \n", + "10398 Leeds 109 3 \n", + "10399 Leeds 110 3 \n", + "10400 Leeds 111 3 \n", + "10401 Leeds 112 1 \n", + "10402 Leeds 112 3 \n", + "\n", + " Cencus_count ACBM_count Dist \n", + "0 1572 0.0 0.000000 \n", + "1 30 0.0 0.000000 \n", + "2 366 0.0 3532.892984 \n", + "3 6 0.0 20083.427194 \n", + "4 2 0.0 19168.244335 \n", + "... ... ... ... \n", + "10398 6 5.0 7882.022965 \n", + "10399 8 4.0 5685.936850 \n", + "10400 471 88.0 2992.168008 \n", + "10401 2243 29.0 0.000000 \n", + "10402 233 29.0 0.000000 \n", + "\n", + "[10403 rows x 8 columns]" + ] + }, + "execution_count": 156, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "Combine_Df_MSOA" + ] + }, + { + "cell_type": "code", + "execution_count": 163, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
Middle layer Super Output Areas codeMiddle layer Super Output Areas labelMSOA of workplace codeMSOA of workplace labelPlace of work indicator (4 categories) codeCencus_countACBM_countDistdiffdiff_abs
0E02002330Leeds 001E02002330Leeds 001115720.00.0000001572.01572.0
1E02002330Leeds 001E02002330Leeds 0013300.00.00000030.030.0
2E02002330Leeds 001E02002331Leeds 00233660.03532.892984366.0366.0
3E02002330Leeds 001E02002332Leeds 003360.020083.4271946.06.0
4E02002330Leeds 001E02002333Leeds 004320.019168.2443352.02.0
.................................
10398E02006876Leeds 112E02006852Leeds 109365.07882.0229651.01.0
10399E02006876Leeds 112E02006861Leeds 110384.05685.9368504.04.0
10400E02006876Leeds 112E02006875Leeds 111347188.02992.168008383.0383.0
10401E02006876Leeds 112E02006876Leeds 1121224329.00.0000002214.02214.0
10402E02006876Leeds 112E02006876Leeds 112323329.00.000000204.0204.0
\n", + "

10403 rows × 10 columns

\n", + "
" + ], + "text/plain": [ + " Middle layer Super Output Areas code \\\n", + "0 E02002330 \n", + "1 E02002330 \n", + "2 E02002330 \n", + "3 E02002330 \n", + "4 E02002330 \n", + "... ... \n", + "10398 E02006876 \n", + "10399 E02006876 \n", + "10400 E02006876 \n", + "10401 E02006876 \n", + "10402 E02006876 \n", + "\n", + " Middle layer Super Output Areas label MSOA of workplace code \\\n", + "0 Leeds 001 E02002330 \n", + "1 Leeds 001 E02002330 \n", + "2 Leeds 001 E02002331 \n", + "3 Leeds 001 E02002332 \n", + "4 Leeds 001 E02002333 \n", + "... ... ... \n", + "10398 Leeds 112 E02006852 \n", + "10399 Leeds 112 E02006861 \n", + "10400 Leeds 112 E02006875 \n", + "10401 Leeds 112 E02006876 \n", + "10402 Leeds 112 E02006876 \n", + "\n", + " MSOA of workplace label Place of work indicator (4 categories) code \\\n", + "0 Leeds 001 1 \n", + "1 Leeds 001 3 \n", + "2 Leeds 002 3 \n", + "3 Leeds 003 3 \n", + "4 Leeds 004 3 \n", + "... ... ... \n", + "10398 Leeds 109 3 \n", + "10399 Leeds 110 3 \n", + "10400 Leeds 111 3 \n", + "10401 Leeds 112 1 \n", + "10402 Leeds 112 3 \n", + "\n", + " Cencus_count ACBM_count Dist diff diff_abs \n", + "0 1572 0.0 0.000000 1572.0 1572.0 \n", + "1 30 0.0 0.000000 30.0 30.0 \n", + "2 366 0.0 3532.892984 366.0 366.0 \n", + "3 6 0.0 20083.427194 6.0 6.0 \n", + "4 2 0.0 19168.244335 2.0 2.0 \n", + "... ... ... ... ... ... 
\n", + "10398 6 5.0 7882.022965 1.0 1.0 \n", + "10399 8 4.0 5685.936850 4.0 4.0 \n", + "10400 471 88.0 2992.168008 383.0 383.0 \n", + "10401 2243 29.0 0.000000 2214.0 2214.0 \n", + "10402 233 29.0 0.000000 204.0 204.0 \n", + "\n", + "[10403 rows x 10 columns]" + ] + }, + "execution_count": 163, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "Combine_Df_MSOA['diff'] = Combine_Df_MSOA['Cencus_count'] - Combine_Df_MSOA['ACBM_count']\n", + "Combine_Df_MSOA['diff_abs'] = abs(Combine_Df_MSOA['diff'])\n", + "Combine_Df_MSOA" + ] + }, + { + "cell_type": "code", + "execution_count": 157, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "ACBM average travel distance 7941.936440896207 \n", + " Census average travel distance 2553.8742505636333\n" + ] + } + ], + "source": [ + "# average travel distance calcuation\n", + "\n", + "Census_distance = sum(Combine_Df_MSOA[\"Cencus_count\"] * Combine_Df_MSOA[\"Dist\"]) / sum(\n", + " Combine_Df_MSOA[\"Cencus_count\"]\n", + ")\n", + "\n", + "ACBM_distance = sum(Combine_Df_MSOA[\"ACBM_count\"] * Combine_Df_MSOA[\"Dist\"]) / sum(\n", + " Combine_Df_MSOA[\"ACBM_count\"]\n", + ")\n", + "\n", + "print(\n", + " \"ACBM average travel distance\",\n", + " ACBM_distance,\n", + " \"\\n Census average travel distance\",\n", + " Census_distance,\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 166, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
Cencus_countACBM_countdiffdiff_absdiff_percentagediff_per_abs
Middle layer Super Output Areas code
E0200233024851057.01428.02892.00.5746481.163783
E020023312426836.01590.02746.00.6554001.131904
E020023322412823.01589.02317.00.6587890.960614
E0200233330911204.01887.03003.00.6104820.971530
E0200233428951035.01860.03110.00.6424871.074266
.....................
E0200243725481010.01538.02032.00.6036110.797488
E0200685255081528.03980.04888.00.7225850.887436
E020068614196934.03262.03846.00.7774070.916587
E020068753814971.02843.03749.00.7454120.982958
E020068763997826.03171.03575.00.7933450.894421
\n", + "

107 rows × 6 columns

\n", + "
" + ], + "text/plain": [ + " Cencus_count ACBM_count diff \\\n", + "Middle layer Super Output Areas code \n", + "E02002330 2485 1057.0 1428.0 \n", + "E02002331 2426 836.0 1590.0 \n", + "E02002332 2412 823.0 1589.0 \n", + "E02002333 3091 1204.0 1887.0 \n", + "E02002334 2895 1035.0 1860.0 \n", + "... ... ... ... \n", + "E02002437 2548 1010.0 1538.0 \n", + "E02006852 5508 1528.0 3980.0 \n", + "E02006861 4196 934.0 3262.0 \n", + "E02006875 3814 971.0 2843.0 \n", + "E02006876 3997 826.0 3171.0 \n", + "\n", + " diff_abs diff_percentage diff_per_abs \n", + "Middle layer Super Output Areas code \n", + "E02002330 2892.0 0.574648 1.163783 \n", + "E02002331 2746.0 0.655400 1.131904 \n", + "E02002332 2317.0 0.658789 0.960614 \n", + "E02002333 3003.0 0.610482 0.971530 \n", + "E02002334 3110.0 0.642487 1.074266 \n", + "... ... ... ... \n", + "E02002437 2032.0 0.603611 0.797488 \n", + "E02006852 4888.0 0.722585 0.887436 \n", + "E02006861 3846.0 0.777407 0.916587 \n", + "E02006875 3749.0 0.745412 0.982958 \n", + "E02006876 3575.0 0.793345 0.894421 \n", + "\n", + "[107 rows x 6 columns]" + ] + }, + "execution_count": 166, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "sum_dj = Combine_Df_MSOA.groupby(\"Middle layer Super Output Areas code\").agg(\n", + " {\"Cencus_count\": \"sum\", \"ACBM_count\": \"sum\",'diff': 'sum','diff_abs': 'sum'}\n", + ")\n", + "sum_dj[\"diff_percentage\"] = sum_dj[\"diff\"] / sum_dj[\"Cencus_count\"]\n", + "sum_dj[\"diff_per_abs\"] = abs(sum_dj[\"diff_abs\"]) / sum_dj[\"Cencus_count\"]\n", + "\n", + "\n", + "sum_dj" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "- • absolute map flow differences:\n", + "\n", + "$$\n", + "Gi=\\sum\\limits_{j}(|(T_{ij}-T^{obs}_{ij})|)\n", + "$$\n" + ] + }, + { + "cell_type": "code", + "execution_count": 165, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 165, + "metadata": {}, + "output_type": "execute_result" + 
}, + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "fig, ax = plt.subplots(figsize=(10, 8))\n", + "plt.axis(\"off\")\n", + "plt.title(\"Absolute value difference in flow between SPC and Census\")\n", + "MSOA_boundaries.join(sum_dj, on=\"MSOA21CD\").plot(\n", + " column=\"diff_abs\", ax=ax, legend=True, cmap=\"Reds\"\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "- • absolute map percentage differences:\n", + "\n", + "$$\n", + "Gi=\\sum\\limits_{j}(|(T_{ij}-T^{obs}_{ij})|)/\\sum\\limits_{lk} T^{obs}_{lk}\n", + "$$\n" + ] + }, + { + "cell_type": "code", + "execution_count": 168, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 168, + "metadata": {}, + "output_type": "execute_result" + }, + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "fig, ax = plt.subplots(figsize=(10, 8))\n", + "plt.axis(\"off\")\n", + "plt.title(\"Absolute percentage difference in flow between SPC and Census\")\n", + "MSOA_boundaries.join(sum_dj, on=\"MSOA21CD\").plot(\n", + " column=\"diff_per_abs\", ax=ax, legend=True, cmap=\"Reds\"\n", + ")" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": ".venv", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.9" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} From 9930a17ef46d4ac24e029b088c79ad725ffd287f Mon Sep 17 00:00:00 2001 From: Sam Greenbury Date: Fri, 13 Dec 2024 18:14:19 +0000 Subject: [PATCH 46/56] Add todo --- src/acbm/assigning/select_facility.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/acbm/assigning/select_facility.py b/src/acbm/assigning/select_facility.py index 0429a76..28a9e0d 100644 --- a/src/acbm/assigning/select_facility.py +++ b/src/acbm/assigning/select_facility.py @@ -65,6 +65,7 @@ def _select_facility( {unique_id_col: (np.nan, np.nan)} if no suitable facility is found. """ # ----- Step 1. 
Find valid facilities in the destination zone + # Added to enable multiprocessing pd.options.mode.copy_on_write = True # Extract the destination zone from the input row destination_zone = row[row_destination_zone_col] @@ -204,6 +205,7 @@ def select_facility( keys with selected facility ID and facility ID's geometry, or (np.nan, np.nan) """ # TODO: update this to be configurable, `None` is os.process_cpu_count() + # TODO: check if this is deterministic for a given seed (or pass seed to pool) with Pool(None) as p: # Set to a large enough chunk size so that each process # has a sufficiently large amount of processing to do. @@ -212,6 +214,7 @@ def select_facility( for start in tqdm(range(0, df.shape[0], chunk_size)): chunk = df.iloc[start : start + chunk_size, :] args = [ + # TODO: add seed (derived from global seed as an argument) ( row, unique_id_col, From 1a90abcb5cae35548329649a2cd04a2d697baede Mon Sep 17 00:00:00 2001 From: Sam Greenbury Date: Fri, 13 Dec 2024 18:14:48 +0000 Subject: [PATCH 47/56] Add Leeds config with travel times --- config/leeds_with_travel_times.toml | 42 +++++++++++++++++++++++++++++ 1 file changed, 42 insertions(+) create mode 100644 config/leeds_with_travel_times.toml diff --git a/config/leeds_with_travel_times.toml b/config/leeds_with_travel_times.toml new file mode 100644 index 0000000..8c3f920 --- /dev/null +++ b/config/leeds_with_travel_times.toml @@ -0,0 +1,42 @@ +[parameters] +seed = 0 +region = "leeds" +zone_id = "OA21CD" +travel_times = true +boundary_geography = "OA" +nts_years = [2019, 2021, 2022] +nts_regions = [ + 'Yorkshire and the Humber', + 'North West', + 'North East', + 'East Midlands', + 'West Midlands', + 'East of England', + 'South East', + 'South West', +] +nts_day_of_week = 3 +output_crs = 4326 + +[work_assignment] +use_percentages = false +weight_max_dev = 0.0 +weight_total_dev = 1.0 +max_zones = 4 +commute_level = "OA" + +[matching] +required_columns = ["number_adults", "number_children"] +optional_columns = [ + 
"number_cars", + "num_pension_age", + "rural_urban_2_categories", + "employment_status", + "tenure_status", +] +n_matches = 10 +chunk_size = 50000 + +[postprocessing] +pam_jitter = 30 +pam_min_duration = 10 From 265a6c4d1dd66fd3594630303502ca8245bdb704 Mon Sep 17 00:00:00 2001 From: Sam Greenbury Date: Sun, 15 Dec 2024 18:08:29 +0000 Subject: [PATCH 48/56] Add manual download fallback if pyrosm fails --- scripts/0.1_run_osmox.py | 17 ++++++++++++++++- 1 file changed, 16 insertions(+), 1 deletion(-) diff --git a/scripts/0.1_run_osmox.py b/scripts/0.1_run_osmox.py index bad7790..7bb1163 100644 --- a/scripts/0.1_run_osmox.py +++ b/scripts/0.1_run_osmox.py @@ -1,5 +1,7 @@ +import os import subprocess +import requests from pyrosm import get_data from acbm.cli import acbm_cli @@ -11,7 +13,20 @@ def main(config_file): config = load_and_setup_config(config_file) logger = config.get_logger("preprocessing", __file__) logger.info("Getting OSM data") - fp = get_data(config.region, directory=config.osmox_path) + try: + fp = get_data(config.region, directory=config.osmox_path) + except Exception as e: + logger.error(e) + logger.info(f"Trying manual download of region: {config.region}") + url = f"http://download.geofabrik.de/europe/united-kingdom/england/{config.region}-latest.osm.pbf" + filename = url.split("/")[-1] + fp = os.path.join(config.osmox_path, filename) + response = requests.get(url, stream=True) + response.raise_for_status() + with open(fp, "wb") as file: + for chunk in response.iter_content(chunk_size=8192): + file.write(chunk) + logger.info(f"Region ({config.region}) successfully downloaded to: {fp}") logger.info("Running osmox") subprocess.run( [ From 54bae53a8a7b12d19a2d8bdc4257b8d01fde3b94 Mon Sep 17 00:00:00 2001 From: Sam Greenbury Date: Mon, 16 Dec 2024 18:22:39 +0000 Subject: [PATCH 49/56] Add travel times plots --- scripts/3.3_assign_facility_all.py | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) diff --git a/scripts/3.3_assign_facility_all.py 
b/scripts/3.3_assign_facility_all.py index b46138c..21d329a 100644 --- a/scripts/3.3_assign_facility_all.py +++ b/scripts/3.3_assign_facility_all.py @@ -6,6 +6,7 @@ from acbm.assigning.select_facility import map_activity_locations, select_facility from acbm.cli import acbm_cli from acbm.config import load_and_setup_config +from acbm.utils import get_travel_times @acbm_cli @@ -336,6 +337,28 @@ def main(config_file): save_dir=config.output_path / "plots/assigning/", ) + # Add travel times + tte = get_travel_times(config) + activity_chains_all = activity_chains_all.merge( + tte[[tte.columns[0], tte.columns[1], "mode", "time"]], + left_on=["ozone", "dzone", "mode"], + right_on=[tte.columns[0], tte.columns[1], "mode"], + how="left", + ) + # Iterate over each unique activity type and create a plot + for activity_type in unique_activity_types: + plot_scatter_actual_reported( + activities=activity_chains_all, + activity_type=activity_type, + activity_type_col="destination activity", + x_col="TripTotalTime", + y_col="time", + x_label="Reported Travel TIme (min)", + y_label="Modelled time (min)", + crs=f"EPSG:{config.output_crs}", + title_prefix="Scatter plot of TripTotalTime vs. Modelled time", + save_dir=config.output_path / "plots/assigning/", + ) # .... 
# Plot 3: Desire lines between start and end locations From eda975bd36cb934739a55529a5aa38a5f7baf6f1 Mon Sep 17 00:00:00 2001 From: Sam Greenbury Date: Mon, 16 Dec 2024 20:18:57 +0000 Subject: [PATCH 50/56] Revise CRS for plots from config --- src/acbm/assigning/plots.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/acbm/assigning/plots.py b/src/acbm/assigning/plots.py index 26d7d51..7c47a17 100644 --- a/src/acbm/assigning/plots.py +++ b/src/acbm/assigning/plots.py @@ -291,7 +291,7 @@ def plot_desire_lines( ) # convert crs to metric - activity_chains_plot = activity_chains_plot.to_crs(epsg=3857) + activity_chains_plot = activity_chains_plot.to_crs(crs=crs) # calculate the length of the line_geometry in meters activity_chains_plot["length"] = activity_chains_plot["line_geometry"].length @@ -429,7 +429,7 @@ def plot_scatter_actual_reported( activity_chains_plot = activity_chains_plot.set_geometry("line_geometry", crs=crs) # convert crs to metric - activity_chains_plot = activity_chains_plot.to_crs(epsg=3857) + activity_chains_plot = activity_chains_plot.to_crs(crs=crs) # calculate the length of the line_geometry in meters activity_chains_plot["length"] = activity_chains_plot["line_geometry"].length From a13616388fd3c669de7c905f50606c283fb7b1ff Mon Sep 17 00:00:00 2001 From: Sam Greenbury Date: Mon, 16 Dec 2024 20:19:21 +0000 Subject: [PATCH 51/56] Add y_scale parameter --- scripts/3.3_assign_facility_all.py | 1 + src/acbm/assigning/plots.py | 5 +++-- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/scripts/3.3_assign_facility_all.py b/scripts/3.3_assign_facility_all.py index 21d329a..2099b9b 100644 --- a/scripts/3.3_assign_facility_all.py +++ b/scripts/3.3_assign_facility_all.py @@ -358,6 +358,7 @@ def main(config_file): crs=f"EPSG:{config.output_crs}", title_prefix="Scatter plot of TripTotalTime vs. Modelled time", save_dir=config.output_path / "plots/assigning/", + y_scale=1.0, ) # .... 
diff --git a/src/acbm/assigning/plots.py b/src/acbm/assigning/plots.py index 7c47a17..80a7bd1 100644 --- a/src/acbm/assigning/plots.py +++ b/src/acbm/assigning/plots.py @@ -390,6 +390,7 @@ def plot_scatter_actual_reported( crs: str, save_dir: str | Path | None = None, display: bool = False, + y_scale: float = 1 / 1000, ): """ Plots scatter plots with trend lines for different modes in activity chains. @@ -451,11 +452,11 @@ def plot_scatter_actual_reported( # Plot the scatter plot ax = axs[i] ax.scatter( - subset_mode[x_col], subset_mode[y_col] / 1000, alpha=0.1, lw=0 + subset_mode[x_col], subset_mode[y_col] * y_scale, alpha=0.1, lw=0 ) # Use a single color for all plots # Calculate and plot the trend line - z = np.polyfit(subset_mode[x_col], subset_mode[y_col] / 1000, 1) + z = np.polyfit(subset_mode[x_col], subset_mode[y_col] * y_scale, 1) p = np.poly1d(z) ax.plot(subset_mode[x_col], p(subset_mode[x_col]), "r--") From c779bcd507ea500a0e01dda86e834c0f116cbdeb Mon Sep 17 00:00:00 2001 From: Sam Greenbury Date: Tue, 17 Dec 2024 21:31:32 +0000 Subject: [PATCH 52/56] Add tolerance parameters --- scripts/3.1_assign_primary_feasible_zones.py | 8 ++++++-- src/acbm/config.py | 2 ++ 2 files changed, 8 insertions(+), 2 deletions(-) diff --git a/scripts/3.1_assign_primary_feasible_zones.py b/scripts/3.1_assign_primary_feasible_zones.py index b61fa23..613e06f 100644 --- a/scripts/3.1_assign_primary_feasible_zones.py +++ b/scripts/3.1_assign_primary_feasible_zones.py @@ -200,7 +200,9 @@ def main(config_file): zone_id=config.zone_id, filter_by_activity=True, activity_col="education_type", - time_tolerance=0.3, + time_tolerance=config.parameters.tolerance_edu + if config.parameters.tolerance_edu is not None + else 0.3, ) logger.info("Saving feasible zones for education activities") @@ -225,7 +227,9 @@ def main(config_file): zone_id=config.zone_id, filter_by_activity=True, activity_col="dact", - time_tolerance=0.3, + time_tolerance=config.parameters.tolerance_work + if 
config.parameters.tolerance_work is not None + else 0.3, ) logger.info("Saving feasible zones for work activities") diff --git a/src/acbm/config.py b/src/acbm/config.py index c10eb0c..2cfa99f 100644 --- a/src/acbm/config.py +++ b/src/acbm/config.py @@ -28,6 +28,8 @@ class Parameters(BaseModel): nts_regions: list[str] nts_day_of_week: int output_crs: int + tolerance_work: float | None = None + tolerance_edu: float | None = None @dataclass(frozen=True) From 098a8473eec449ddc1319d039ed0b2ef11ee2571 Mon Sep 17 00:00:00 2001 From: Sam Greenbury Date: Tue, 14 Jan 2025 18:28:35 +0000 Subject: [PATCH 53/56] Update config --- config/base.toml | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/config/base.toml b/config/base.toml index da41aa4..410dc11 100644 --- a/config/base.toml +++ b/config/base.toml @@ -48,3 +48,15 @@ max_zones = 8 # maximum number of feasible zones to include in the opti [postprocessing] pam_jitter = 30 pam_min_duration = 10 +# for get_pt_subscription: everyone above this age has a subscription (pensioners get free travel) +# TODO: more sophisticated approach +pt_subscription_age = 66 +# to define if a person is a student: +# everyone below this age is a student +student_age_base = 16 +# everyone below this age that has at least one "education" activity is a student +student_age_upper = 30 +# everyone who uses one of the modes below is classified as a passenger (isPassenger = True) +modes_passenger = ['car_passenger', 'taxi'] +# yearly state pension: for getting hhlIncome of pensioners +state_pension = 11502 From b3d5581c3f5f771b95bae902decea0d79122ed4b Mon Sep 17 00:00:00 2001 From: Sam Greenbury Date: Tue, 14 Jan 2025 18:29:16 +0000 Subject: [PATCH 54/56] Fix test --- tests/test_config.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_config.py b/tests/test_config.py index 2bf69d6..5ac5b86 100644 --- a/tests/test_config.py +++ b/tests/test_config.py @@ -11,4 +11,4 @@ def config(): def test_id(config): - assert
config.id == "a89b65de35" + assert config.id == "e4589718b2" From 8e4a2887f8810657faacbde35942488cadc5caba Mon Sep 17 00:00:00 2001 From: Sam Greenbury Date: Thu, 30 Jan 2025 17:18:52 +0000 Subject: [PATCH 55/56] Remove prepend_datetime --- src/acbm/logger_config.py | 7 ------- 1 file changed, 7 deletions(-) diff --git a/src/acbm/logger_config.py b/src/acbm/logger_config.py index 1843232..8229077 100644 --- a/src/acbm/logger_config.py +++ b/src/acbm/logger_config.py @@ -1,11 +1,4 @@ import logging -from datetime import datetime - - -def prepend_datetime(s: str, delimiter: str = "_") -> str: - current_date = datetime.now().strftime("%Y-%m-%d") - return f"{current_date}{delimiter}{s}" - # # Configure the root logger # logging.basicConfig( From 256ebde3e5a112dbfc1506b3463dcfc7de5836c5 Mon Sep 17 00:00:00 2001 From: Sam Greenbury <50113363+sgreenbury@users.noreply.github.com> Date: Thu, 30 Jan 2025 17:40:29 +0000 Subject: [PATCH 56/56] Revise comment --- scripts/0.1_run_osmox.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/scripts/0.1_run_osmox.py b/scripts/0.1_run_osmox.py index 7bb1163..05eb39d 100644 --- a/scripts/0.1_run_osmox.py +++ b/scripts/0.1_run_osmox.py @@ -39,7 +39,9 @@ def main(config_file): "-f", "geoparquet", "-crs", - # TODO: check if this can be specified as the output CRS + # For the distances to be accurate, needs to be same CRS as OSM data for the region. + # However, distances from osmox are currently not used in the pipeline so any CRS will work. + # In general, the CRS is transformed in the pipeline when this data is used. # See: https://github.com/arup-group/osmox/blob/82602d411374ebc9fd33443f8f7c9816b63715ec/docs/osmox_run.md#L35-L38 "epsg:27700", "-l",