Merge pull request #71 from Urban-Analytics-Technology-Platform/53-paths
* Revise the filepath structure for pipeline outputs, add `@property` methods to the config that provide the paths, and refactor scripts to use them

* Add a `Population` class for reading pipeline outputs (see the sketch after this list)

* Fallback download option when pyrosm fails (e.g. for West Yorkshire)

* Configurable time tolerance

* Rewrite of the run pipeline script in Python

* Multiprocessing for script 3.3
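
The `Population` class itself is not shown in this summary. Purely as a hypothetical illustration of reading pipeline outputs under the revised layout (the output file names come from the README tree below; the class shape and method names are assumptions, not the actual acbm API):

```python
from pathlib import Path

import pandas as pd


class Population:
    """Hypothetical reader for outputs under data/outputs/<config_id>/ (illustrative only)."""

    def __init__(self, outputs_dir: str | Path, config_id: str):
        self.root = Path(outputs_dir) / config_id

    @property
    def people(self) -> pd.DataFrame:
        return pd.read_csv(self.root / "people.csv")

    @property
    def households(self) -> pd.DataFrame:
        return pd.read_csv(self.root / "households.csv")

    @property
    def legs(self) -> pd.DataFrame:
        return pd.read_csv(self.root / "legs.csv")

    @property
    def legs_with_locations(self) -> pd.DataFrame:
        return pd.read_parquet(self.root / "legs_with_locations.parquet")


# Usage sketch; the config id is left as a placeholder
pop = Population("data/outputs", "<config_id>")
print(pop.people.head())
```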
sgreenbury authored Jan 30, 2025
2 parents 1cec3f7 + 256ebde commit cdb1389
Showing 36 changed files with 5,523 additions and 703 deletions.
58 changes: 29 additions & 29 deletions README.md
@@ -89,37 +89,37 @@ The pipeline is a series of scripts that are run in sequence to generate the act
│   │   │   ├── trip_eul_2002-2022.tab
│   │   │   └── <other_nts_tables>.tab
│   │   ├── travel_times
│   │   │   ├── oa
│   │   │   │   └── travel_time_matrix.parquet
│   │   │   └── msoa
│   │   │       └── travel_time_matrix.parquet
│   │   ├── oa
│   │   │   └── travel_time_matrix.parquet
│   │   └── msoa
│   │       └── travel_time_matrix.parquet
│   │   ├── ODWP01EW_OA.zip
│   │   ├── ODWP15EW_MSOA_v1.zip
│   │   └── spc_output
│   │       ├── <region>_people_hh.parquet (Generated in Script 1)
│   │       ├── <region>_people_tu.parquet (Generated in Script 1)
│   │       └── raw
│   │           ├── <region>_households.parquet
│   │           ├── <region>_info_per_msoa.json
│   │           ├── <region>.pb
│   │           ├── <region>_people.parquet
│   │           ├── <region>_time_use_diaries.parquet
│   │           ├── <region>_venues.parquet
│   │           └── README.md
│   ├── interim
│   │   ├── assigning (Generated in Script 3)
│   │   └── matching (Generated in Script 2)
│   ├── processed
│   │   └── acbm_<config_name>_<date>
│   │       ├── activities.csv
│   │       ├── households.csv
│   │       ├── legs.csv
│   │       ├── legs_with_locations.parquet
│   │       ├── people.csv
│   │       └── plans.xml
│   ├── plots
│   │   ├── assigning
│   │   └── validation
│   │   └── spc_output
│   │       └── raw
│   │           ├── <region>_households.parquet
│   │           ├── <region>_info_per_msoa.json
│   │           ├── <region>.pb
│   │           ├── <region>_people.parquet
│   │           ├── <region>_time_use_diaries.parquet
│   │           ├── <region>_venues.parquet
│   │           └── README.md
│   ├── outputs
│   │   └── <config_id>
│   │       ├── interim
│   │       │   ├── <region>_people_hh.parquet (Generated in Script 1)
│   │       │   ├── assigning (Generated in Script 3)
│   │       │   └── matching (Generated in Script 2)
│   │       ├── activities.csv
│   │       ├── households.csv
│   │       ├── legs.csv
│   │       ├── legs_with_locations.parquet
│   │       ├── people.csv
│   │       ├── plans.xml
│   │       └── plots
│   │           ├── assigning
│   │           └── validation
```
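
The commit message above mentions `@property` methods on the config that provide these paths. A minimal sketch of that idea, assuming a simple wrapper class (the names here are illustrative, not the actual acbm API):

```python
from pathlib import Path


class ConfigPaths:
    """Sketch of config-provided output paths; attribute and property names are assumptions."""

    def __init__(self, config_id: str, data_dir: str | Path = "data"):
        self.id = config_id
        self.data_dir = Path(data_dir)

    @property
    def output_path(self) -> Path:
        return self.data_dir / "outputs" / self.id

    @property
    def interim_path(self) -> Path:
        return self.output_path / "interim"

    @property
    def plots_path(self) -> Path:
        return self.output_path / "plots"


paths = ConfigPaths("<config_id>")
print(paths.interim_path)  # data/outputs/<config_id>/interim
```

Scripts can then build every read and write from these properties instead of hard-coding the previous `data/interim` and `data/processed` locations.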

## Step 1: Prepare Data Inputs
12 changes: 12 additions & 0 deletions config/base.toml
@@ -48,3 +48,15 @@ max_zones = 8 # maximum number of feasible zones to include in the opti
[postprocessing]
pam_jitter = 30
pam_min_duration = 10
# for get_pt_subscription: everyone above this age has a subscription (pensioners get free travel)
# TODO: more sophisticated approach
pt_subscription_age = 66
# to define if a person is a student:
# everyone below this age is a student
student_age_base = 16
# everyone below this age that has at least one "education" activity is a student
student_age_upper = 30
# everyone who uses one of the modes below is classified as a passenger (isPassenger = True)
modes_passenger = ['car_passenger', 'taxi']
# yearly state pension: for getting hhlIncome of pensioners
state_pension = 11502
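
As a rough illustration of how these postprocessing parameters could be applied to an individual (the function and field names are assumptions, not the pipeline's actual code):

```python
# Hypothetical values mirroring the config above
PT_SUBSCRIPTION_AGE = 66
STUDENT_AGE_BASE = 16
STUDENT_AGE_UPPER = 30
MODES_PASSENGER = ["car_passenger", "taxi"]


def classify_person(age: int, has_education_activity: bool, modes: list[str]) -> dict:
    """Illustrative classification following the commented rules above."""
    return {
        # everyone above pt_subscription_age gets a subscription (free travel for pensioners)
        "hasPTSubscription": age > PT_SUBSCRIPTION_AGE,
        # under student_age_base: always a student;
        # up to student_age_upper: a student only if they have an "education" activity
        "isStudent": age < STUDENT_AGE_BASE
        or (age < STUDENT_AGE_UPPER and has_education_activity),
        # anyone travelling by one of modes_passenger is flagged as a passenger
        "isPassenger": any(m in MODES_PASSENGER for m in modes),
    }


print(classify_person(70, False, ["pt"]))
# {'hasPTSubscription': True, 'isStudent': False, 'isPassenger': False}
```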
33 changes: 33 additions & 0 deletions config/greater-london.toml
@@ -0,0 +1,33 @@
[parameters]
seed = 0
region = "greater-london"
zone_id = "MSOA21CD"
travel_times = false
boundary_geography = "MSOA"
nts_years = [2019, 2021, 2022]
nts_regions = ["London"]
nts_day_of_week = 3
output_crs = 4326

[work_assignment]
use_percentages = true
weight_max_dev = 0.0
weight_total_dev = 1.0
max_zones = 4
commute_level = "MSOA"

[matching]
required_columns = ["number_adults", "number_children"]
optional_columns = [
"number_cars",
"num_pension_age",
"rural_urban_2_categories",
"employment_status",
"tenure_status",
]
n_matches = 10
chunk_size = 50000

[postprocessing]
pam_jitter = 30
pam_min_duration = 10
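
These region configs are plain TOML; a minimal sketch of reading one with the standard library (the project itself depends on `tomlkit`, but the resulting data has the same shape):

```python
import tomllib  # stdlib in Python 3.11+; on 3.10 the `tomli` package provides the same API

with open("config/greater-london.toml", "rb") as f:
    config = tomllib.load(f)

params = config["parameters"]
print(params["region"], params["zone_id"], params["nts_years"])
# greater-london MSOA21CD [2019, 2021, 2022]
```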
42 changes: 42 additions & 0 deletions config/leeds.toml
@@ -0,0 +1,42 @@
[parameters]
seed = 0
region = "leeds"
zone_id = "OA21CD"
travel_times = false
boundary_geography = "OA"
nts_years = [2019, 2021, 2022]
nts_regions = [
'Yorkshire and the Humber',
'North West',
'North East',
'East Midlands',
'West Midlands',
'East of England',
'South East',
'South West',
]
nts_day_of_week = 3
output_crs = 4326

[work_assignment]
use_percentages = false
weight_max_dev = 0.0
weight_total_dev = 1.0
max_zones = 4
commute_level = "OA"

[matching]
required_columns = ["number_adults", "number_children"]
optional_columns = [
"number_cars",
"num_pension_age",
"rural_urban_2_categories",
"employment_status",
"tenure_status",
]
n_matches = 10
chunk_size = 50000

[postprocessing]
pam_jitter = 30
pam_min_duration = 10
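
The Leeds configs also fix `seed = 0`; a generic sketch of how a seed like this keeps random sampling reproducible (not acbm's actual code, and the zone codes are made up):

```python
import numpy as np

seed = 0  # [parameters].seed from config/leeds.toml
rng = np.random.default_rng(seed)

# e.g. reproducibly sample two candidate zones
candidates = ["E00056750", "E00056751", "E00056752", "E00056753"]
print(rng.choice(candidates, size=2, replace=False))
```

Running this twice with the same seed gives the same draw.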
42 changes: 42 additions & 0 deletions config/leeds_with_travel_times.toml
@@ -0,0 +1,42 @@
[parameters]
seed = 0
region = "leeds"
zone_id = "OA21CD"
travel_times = true
boundary_geography = "OA"
nts_years = [2019, 2021, 2022]
nts_regions = [
'Yorkshire and the Humber',
'North West',
'North East',
'East Midlands',
'West Midlands',
'East of England',
'South East',
'South West',
]
nts_day_of_week = 3
output_crs = 4326

[work_assignment]
use_percentages = false
weight_max_dev = 0.0
weight_total_dev = 1.0
max_zones = 4
commute_level = "OA"

[matching]
required_columns = ["number_adults", "number_children"]
optional_columns = [
"number_cars",
"num_pension_age",
"rural_urban_2_categories",
"employment_status",
"tenure_status",
]
n_matches = 10
chunk_size = 50000

[postprocessing]
pam_jitter = 30
pam_min_duration = 10
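
This file differs from `config/leeds.toml` only in `travel_times = true`, which — judging by the name and the `travel_times` folder in the README tree — switches the pipeline to a precomputed travel time matrix. A hedged sketch of what that switch might look like (paths follow the README tree; the function and fallback behaviour are assumptions):

```python
from pathlib import Path

import pandas as pd


def load_travel_times(
    external_dir: Path, commute_level: str, use_matrix: bool
) -> pd.DataFrame | None:
    """Illustrative only: return the precomputed matrix when travel_times is enabled."""
    if not use_matrix:
        # travel_times = false: no matrix is read and the pipeline
        # presumably estimates times by other means
        return None
    path = (
        external_dir / "travel_times" / commute_level.lower() / "travel_time_matrix.parquet"
    )
    return pd.read_parquet(path)


tt = load_travel_times(Path("data/external"), commute_level="OA", use_matrix=True)
```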
4,406 changes: 4,406 additions & 0 deletions notebooks/Validation_AcBM_with_Cencus.ipynb

Large diffs are not rendered by default.

15 changes: 13 additions & 2 deletions poetry.lock

Some generated files are not rendered by default.

71 changes: 31 additions & 40 deletions pyproject.toml
@@ -4,9 +4,7 @@ build-backend = "poetry.core.masonry.api"
[tool.poetry]
name = "acbm"
version = "0.1.0"
authors = [
"Hussein Mahfouz <[email protected]>",
]
authors = ["Hussein Mahfouz <[email protected]>"]
homepage = "https://github.com/alan-turing-institute/acbm"
repository = "https://github.com/alan-turing-institute/acbm"
license = "Apache-2.0"
@@ -28,7 +26,7 @@ python = "^3.10"
pytest = { version = ">=6", optional = true }
pytest-cov = { version = ">=3", optional = true }
pandas = "^2.2.0"
uatk-spc = {git = "https://github.com/alan-turing-institute/uatk-spc.git", subdirectory = "python"}
uatk-spc = { git = "https://github.com/alan-turing-institute/uatk-spc.git", subdirectory = "python" }
geopandas = "^0.14.3"
matplotlib = "^3.8.3"
scikit-learn = "^1.4.1.post1"
@@ -43,9 +41,10 @@ tomlkit = "^0.13.0"
cml-pam = "0.3.2"
gdal = "<=3.8.4"
pandera = "^0.20.4"
osmox = {git = "https://github.com/arup-group/osmox"}
osmox = { git = "https://github.com/arup-group/osmox" }
pyrosm = "^0.6.2"
jsonschema = "^4.23.0"
jcs = "^0.2.1"

[tool.poetry.dev-dependencies]
pytest = ">= 6"
@@ -62,22 +61,13 @@ ipykernel = "^6.29.4"
minversion = "6.0"
addopts = ["-ra", "--showlocals", "--strict-markers", "--strict-config"]
xfail_strict = true
filterwarnings = [
"error",
]
filterwarnings = ["error"]
log_cli_level = "INFO"
testpaths = [
"tests",
]
testpaths = ["tests"]

[tool.coverage]
run.source = ["acbm"]
port.exclude_lines = [
'pragma: no cover',
'\.\.\.',
'if typing.TYPE_CHECKING:',
]

port.exclude_lines = ['pragma: no cover', '\.\.\.', 'if typing.TYPE_CHECKING:']


[tool.ruff]
@@ -86,29 +76,30 @@ exclude = []
line-length = 88 # how long you want lines to be

[tool.ruff.format]
docstring-code-format = true # code snippets in docstrings will be formatted
docstring-code-format = true # code snippets in docstrings will be formatted

[tool.ruff.lint]
select = [
"E", "F", "W", # flake8
"B", # flake8-bugbear
"I", # isort
"ARG", # flake8-unused-arguments
"C4", # flake8-comprehensions
"EM", # flake8-errmsg
"ICN", # flake8-import-conventions
"ISC", # flake8-implicit-str-concat
"G", # flake8-logging-format
"PGH", # pygrep-hooks
"PIE", # flake8-pie
"PL", # pylint
"PT", # flake8-pytest-style
"RET", # flake8-return
"RUF", # Ruff-specific
"SIM", # flake8-simplify
"UP", # pyupgrade
"YTT", # flake8-2020
"EXE", # flake8-executable
"E",
"F",
"W", # flake8
"B", # flake8-bugbear
"I", # isort
"ARG", # flake8-unused-arguments
"C4", # flake8-comprehensions
"EM", # flake8-errmsg
"ICN", # flake8-import-conventions
"ISC", # flake8-implicit-str-concat
"G", # flake8-logging-format
"PGH", # pygrep-hooks
"PIE", # flake8-pie
"PL", # pylint
"RET", # flake8-return
"RUF", # Ruff-specific
"SIM", # flake8-simplify
"UP", # pyupgrade
"YTT", # flake8-2020
"EXE", # flake8-executable
]

ignore = [
@@ -118,7 +109,7 @@ ignore = [
"G004", # Logging statement uses f-string, not necessary here
]
unfixable = [
"F401", # Would remove unused imports
"F841", # Would remove unused variables
"F401", # Would remove unused imports
"F841", # Would remove unused variables
]
flake8-unused-arguments.ignore-variadic-names = true # allow unused *args/**kwargs
flake8-unused-arguments.ignore-variadic-names = true # allow unused *args/**kwargs