From 84a927312509ac568ca5ca480a7851a012627c6c Mon Sep 17 00:00:00 2001 From: Hussein Mahfouz <45176416+Hussein-Mahfouz@users.noreply.github.com> Date: Fri, 13 Dec 2024 18:49:19 +0100 Subject: [PATCH 01/28] adjusted distances --- config/base.toml | 7 ++++ scripts/3.1_assign_primary_feasible_zones.py | 10 +++++- scripts/3.3_assign_facility_all.py | 12 +++++-- scripts/4_validation.py | 2 ++ scripts/run_pipeline.sh | 10 +++--- src/acbm/assigning/feasible_zones_primary.py | 17 +++++++-- src/acbm/assigning/plots.py | 19 +++++++++++ src/acbm/assigning/utils.py | 36 ++++++++++++++++++++ src/acbm/config.py | 11 +++++- src/acbm/validating/utils.py | 23 +++++++++++-- 10 files changed, 133 insertions(+), 14 deletions(-) diff --git a/config/base.toml b/config/base.toml index da41aa4..acd93c6 100644 --- a/config/base.toml +++ b/config/base.toml @@ -37,6 +37,13 @@ optional_columns = [ ] n_matches = 10 # What is the maximum number of NTS matches we want for each SPC household? +[feasible_assignment] +# detour factor when converting euclidian distance to actual travel distance +detour_factor = 1.56 +# decay rate when converting euclidian to travel distance (0.0001 is a good value) +# actual_distance = distance * (1 + ((detour_factor - 1) * np.exp(-decay_rate * distance))) +decay_rate = 0.0001 + [work_assignment] commute_level = "OA" use_percentages = true # if true, optimization problem will try to minimize percentage difference at OD level (not absolute numbers). 
Recommended to set it to true diff --git a/scripts/3.1_assign_primary_feasible_zones.py b/scripts/3.1_assign_primary_feasible_zones.py index 85ba982..11c9a83 100644 --- a/scripts/3.1_assign_primary_feasible_zones.py +++ b/scripts/3.1_assign_primary_feasible_zones.py @@ -82,7 +82,11 @@ def main(config_file): logger.info("Creating estimated travel times matrix") # Create a new travel time matrix based on distances between zones travel_time_estimates = zones_to_time_matrix( - zones=boundaries, id_col=config.zone_id, time_units="m" + zones=boundaries, + id_col=config.zone_id, + time_units="m", + detour_factor=config.feasible_assignment.detour_factor, + decay_rate=config.feasible_assignment.decay_rate, ) logger.info("Travel time estimates created") @@ -223,6 +227,8 @@ def main(config_file): filter_by_activity=True, activity_col="education_type", time_tolerance=0.3, + detour_factor=config.feasible_assignment.detour_factor, + decay_rate=config.feasible_assignment.decay_rate, ) logger.info("Saving feasible zones for education activities") @@ -250,6 +256,8 @@ def main(config_file): filter_by_activity=True, activity_col="dact", time_tolerance=0.3, + detour_factor=config.feasible_assignment.detour_factor, + decay_rate=config.feasible_assignment.decay_rate, ) logger.info("Saving feasible zones for work activities") diff --git a/scripts/3.3_assign_facility_all.py b/scripts/3.3_assign_facility_all.py index f0d3097..cae6a79 100644 --- a/scripts/3.3_assign_facility_all.py +++ b/scripts/3.3_assign_facility_all.py @@ -321,7 +321,10 @@ def main(config_file): x_col="TripDisIncSW", y_col="length", x_label="Reported Travel Distance (km)", - y_label="Actual Distance - Euclidian (km)", + y_label="Actual Distance - Estimated (km)", + detour_factor=config.feasible_assignment.detour_factor, + decay_rate=config.feasible_assignment.decay_rate, + x_axis_max=50, crs=f"EPSG:{config.output_crs}", title_prefix=f"Scatter plot of TripDisIncSW vs. 
Length for {activity_type}", save_dir=acbm.root_path / "data/processed/plots/assigning/", @@ -344,8 +347,11 @@ def main(config_file): activity_type_col="destination activity", x_col="TripTotalTime", y_col="length", - x_label="Reported Travel TIme (min)", - y_label="Actual Distance - Euclidian (km)", + x_label="Reported Travel Time (min)", + y_label="Actual Distance - Estimated (km)", + detour_factor=config.feasible_assignment.detour_factor, + decay_rate=config.feasible_assignment.decay_rate, + x_axis_max=180, crs=f"EPSG:{config.output_crs}", title_prefix="Scatter plot of TripTotalTime vs. Length", save_dir=acbm.root_path / "data/processed/plots/assigning/", diff --git a/scripts/4_validation.py b/scripts/4_validation.py index 1dbb5b3..a8e2080 100644 --- a/scripts/4_validation.py +++ b/scripts/4_validation.py @@ -227,6 +227,8 @@ def main(config_file): start_wkt_col="start_location_geometry_wkt", end_wkt_col="end_location_geometry_wkt", crs_epsg=config.output_crs, + detour_factor=1.56, + decay_rate=0.0001, ) # Plot: Aggregate diff --git a/scripts/run_pipeline.sh b/scripts/run_pipeline.sh index 9da0845..8944c2b 100755 --- a/scripts/run_pipeline.sh +++ b/scripts/run_pipeline.sh @@ -5,11 +5,11 @@ set -e # python scripts/0_preprocess_inputs.py --config_file $1 # python scripts/0.1_run_osmox.py --config_file $1 # python scripts/1_prep_synthpop.py --config_file $1 -python scripts/2_match_households_and_individuals.py --config_file $1 -python scripts/3.1_assign_primary_feasible_zones.py --config_file $1 -python scripts/3.2.1_assign_primary_zone_edu.py --config_file $1 -python scripts/3.2.2_assign_primary_zone_work.py --config_file $1 -python scripts/3.2.3_assign_secondary_zone.py --config_file $1 +# python scripts/2_match_households_and_individuals.py --config_file $1 +# python scripts/3.1_assign_primary_feasible_zones.py --config_file $1 +# python scripts/3.2.1_assign_primary_zone_edu.py --config_file $1 +# python scripts/3.2.2_assign_primary_zone_work.py --config_file $1 
+# python scripts/3.2.3_assign_secondary_zone.py --config_file $1 python scripts/3.3_assign_facility_all.py --config_file $1 python scripts/4_validation.py --config_file $1 python scripts/5_acbm_to_matsim_xml.py --config_file $1 diff --git a/src/acbm/assigning/feasible_zones_primary.py b/src/acbm/assigning/feasible_zones_primary.py index b451257..92c4d9e 100644 --- a/src/acbm/assigning/feasible_zones_primary.py +++ b/src/acbm/assigning/feasible_zones_primary.py @@ -75,6 +75,8 @@ def get_possible_zones( travel_times: Optional[pd.DataFrame] = None, filter_by_activity: bool = False, time_tolerance: float = 0.2, + detour_factor: float = 1.56, + decay_rate: float = 0.0001, ) -> dict: """ Get possible zones for all activity chains in the dataset. This function loops over the travel_times dataframe and filters by mode, time of day and weekday/weekend. @@ -104,6 +106,13 @@ def get_possible_zones( activity chain's travel time (which is stored in "TripTotalTime"). Allowable travel_times are those that fall in the range of: travel_time_reported * (1 - time_tolerance) <= travel_time_reported <= travel_time_reported * (1 + time_tolerance) Default = 0.2 + detour_factor: float + The detour factor used to calculate travel distances between zones from euclidian distances. + Default = 1.56 + decay_rate: float + The decay rate used to calculate travel times between zones from travel distances. Detours make up a smaller portion of + longer distance trips. 
Default = 0.0001 + Returns ------- @@ -135,7 +144,11 @@ def get_possible_zones( if travel_times is None: logger.info("Travel time matrix not provided: Creating travel times estimates") travel_times = zones_to_time_matrix( - zones=boundaries, id_col=zone_id, time_units="m" + zones=boundaries, + id_col=zone_id, + time_units="m", + detour_factor=detour_factor, + decay_rate=decay_rate, ) list_of_modes = activity_chains["mode"].unique() @@ -224,7 +237,7 @@ def get_possible_zones( travel_times_filtered_mode_time_day = travel_times_filtered_mode # if mode = car_passenger, we compare to the travel time for car (we don't # have travel times for car_passenger) - if mode == "car_passenger": + if mode in ("car_passenger", "taxi"): activity_chains_filtered = activity_chains[ (activity_chains["mode"] == "car") ] diff --git a/src/acbm/assigning/plots.py b/src/acbm/assigning/plots.py index 26d7d51..da3ef0d 100644 --- a/src/acbm/assigning/plots.py +++ b/src/acbm/assigning/plots.py @@ -11,6 +11,8 @@ from matplotlib import patches as mpatches from shapely.geometry import LineString +from acbm.assigning.utils import _adjust_distance + def plot_workzone_assignment_line( assignment_results: pd.DataFrame, @@ -387,7 +389,10 @@ def plot_scatter_actual_reported( title_prefix: str, activity_type: str, activity_type_col: str, + detour_factor: float, + decay_rate: float, crs: str, + x_axis_max: float, save_dir: str | Path | None = None, display: bool = False, ): @@ -404,7 +409,10 @@ def plot_scatter_actual_reported( - title_prefix: Prefix for the plot titles. - activity_type: Type of activity to plot. - activity_type_col: Column name for the activity type. + - detour_factor: Detour factor for the distance adjustment. (see `_adjust_distance`) + - decay_rate: Decay rate for the distance adjustment. - crs: Coordinate reference system (CRS) of the activities data. + - x_axis_max: Maximum value for the x-axis. 
(use to remove very high reported distances / times) - save_dir: Directory to save the plots. If None, plots are not saved. - display: Whether to display plots by calling `plt.show()`. """ @@ -433,6 +441,13 @@ def plot_scatter_actual_reported( # calculate the length of the line_geometry in meters activity_chains_plot["length"] = activity_chains_plot["line_geometry"].length + # Create a column for estimated distance + activity_chains_plot["length"] = activity_chains_plot["length"].apply( + lambda d: _adjust_distance( + d, detour_factor=detour_factor, decay_rate=decay_rate + ) + ) + # Calculate the number of rows and columns for the subplots. It is a function of the number of modes nrows = math.ceil(len(activity_chains_plot["mode"].unique()) / 2) ncols = 2 @@ -459,6 +474,10 @@ def plot_scatter_actual_reported( p = np.poly1d(z) ax.plot(subset_mode[x_col], p(subset_mode[x_col]), "r--") + # Set x-axis limit + if x_axis_max: + ax.set_xlim(0, x_axis_max) + ax.set_title(f"{title_prefix} for mode: {mode}") ax.set_xlabel(x_label) ax.set_ylabel(y_label) diff --git a/src/acbm/assigning/utils.py b/src/acbm/assigning/utils.py index 27d9ce7..3b2214e 100644 --- a/src/acbm/assigning/utils.py +++ b/src/acbm/assigning/utils.py @@ -284,10 +284,39 @@ def _get_activities_per_zone_df(activities_per_zone: dict) -> pd.DataFrame: return pd.concat(activities_per_zone.values()) +def _adjust_distance( + distance: float, + detour_factor: float, + decay_rate: float, +) -> float: + """ + Adjusts euclidian distances by adding a detour factor. We use minkowski distance + and a decay rate, as longer detour makes up a smaller proportion of the total + distance as the distance increases. + + Parameters + ---------- + distance : float + The original distance. + detour_factor : float + The detour factor to be applied. + decay_rate : float + The decay rate to be applied. + + Returns + ------- + float + The adjusted distance. 
+ """ + return distance * (1 + ((detour_factor - 1) * np.exp(-decay_rate * distance))) + + def zones_to_time_matrix( zones: gpd.GeoDataFrame, time_units: str, id_col: Optional[str] = None, + detour_factor: float = 1.56, + decay_rate: float = 0.0001, ) -> pd.DataFrame: """ Calculates the distance matrix between the centroids of the given zones and returns it as a DataFrame. The matrix also adds @@ -305,6 +334,8 @@ def zones_to_time_matrix( The name of the column in the zones GeoDataFrame to use as the ID. If None, the index values are used. Default is None. time_units: str, optional The units to use for the travel time. Options are 's' for seconds and 'm' for minutes. + detour_factor: float, optional + The detour factor to apply to the distance. Default is 1.56. Returns ------- pd.DataFrame @@ -321,6 +352,11 @@ def zones_to_time_matrix( distances = distances.stack().reset_index() distances.columns = [f"{id_col}_from", f"{id_col}_to", "distance"] + # adjust distance column by adding a detour factor + distances["distance"] = distances["distance"].apply( + lambda d: _adjust_distance(d, detour_factor, decay_rate) + ) + # replace the index values with the id_col values if id_col is specified if id_col is not None: # create a mapping from index to id_col values diff --git a/src/acbm/config.py b/src/acbm/config.py index eaca0d6..931e5ae 100644 --- a/src/acbm/config.py +++ b/src/acbm/config.py @@ -29,6 +29,12 @@ class MatchingParams(BaseModel): chunk_size: int = 50_000 +@dataclass(frozen=True) +class FeasibleAssignmentParams(BaseModel): + detour_factor: float + decay_rate: float + + @dataclass(frozen=True) class WorkAssignmentParams(BaseModel): use_percentages: bool @@ -51,10 +57,13 @@ class Postprocessing(BaseModel): class Config(BaseModel): parameters: Parameters = Field(description="Config: parameters.") + matching: MatchingParams = Field(description="Config: parameters for matching.") + feasible_assignment: FeasibleAssignmentParams = Field( + description="Config: 
parameters for assignment of feasible zones." + ) work_assignment: WorkAssignmentParams = Field( description="Config: parameters for work assignment." ) - matching: MatchingParams = Field(description="Config: parameters for matching.") postprocessing: Postprocessing = Field( description="Config: parameters for postprocessing." ) diff --git a/src/acbm/validating/utils.py b/src/acbm/validating/utils.py index 3ea2b40..dfc999f 100644 --- a/src/acbm/validating/utils.py +++ b/src/acbm/validating/utils.py @@ -2,6 +2,8 @@ import pandas as pd from shapely import wkt +from acbm.assigning.utils import _adjust_distance + def process_sequences( df: pd.DataFrame, @@ -76,6 +78,8 @@ def calculate_od_distances( end_wkt_col: str, crs_epsg: int, projected_epsg: int = 3857, + detour_factor: float = 1.56, + decay_rate: float = 0.0001, ) -> pd.DataFrame: """ Calculate distances between start and end geometries in a DataFrame. @@ -94,6 +98,11 @@ def calculate_od_distances( projected_epsg: int EPSG code for the projected CRS (default is 3857). We need a metric crs to calculte distances in meters. + detour_factor: float + Factor to adjust the estimated distance. + decay_rate: float + Decay rate for the distance adjustment. Detours are a smaller proportion of + the direct distance for longer trips. 
Returns ------- @@ -120,7 +129,17 @@ def calculate_od_distances( gdf = gdf.to_crs(epsg=projected_epsg) end_gdf = end_gdf.to_crs(epsg=projected_epsg) - # Calculate the distance between start and end geometries (in km) - gdf["distance"] = round(gdf.geometry.distance(end_gdf.geometry) / 1000, 1) + # Calculate the distance between start and end geometries (in m) + gdf["distance"] = gdf.geometry.distance(end_gdf.geometry) + + # Estimate actual travel distance + gdf["distance"] = gdf["distance"].apply( + lambda d: _adjust_distance( + d, detour_factor=detour_factor, decay_rate=decay_rate + ) + ) + + # convert distance to km + gdf["distance"] = round(gdf["distance"] / 1000, 1) return gdf From a588c678f2fd7a3a6e1c4f0fb9af4b67b3cba4ab Mon Sep 17 00:00:00 2001 From: Hussein Mahfouz <45176416+Hussein-Mahfouz@users.noreply.github.com> Date: Fri, 13 Dec 2024 19:31:49 +0100 Subject: [PATCH 02/28] update _adjust_distance --- scripts/3.1_assign_primary_feasible_zones.py | 4 ++-- scripts/run_pipeline.sh | 10 +++++----- src/acbm/assigning/utils.py | 2 +- 3 files changed, 8 insertions(+), 8 deletions(-) diff --git a/scripts/3.1_assign_primary_feasible_zones.py b/scripts/3.1_assign_primary_feasible_zones.py index 11c9a83..d1984c6 100644 --- a/scripts/3.1_assign_primary_feasible_zones.py +++ b/scripts/3.1_assign_primary_feasible_zones.py @@ -226,7 +226,7 @@ def main(config_file): zone_id=config.zone_id, filter_by_activity=True, activity_col="education_type", - time_tolerance=0.3, + time_tolerance=0.2, detour_factor=config.feasible_assignment.detour_factor, decay_rate=config.feasible_assignment.decay_rate, ) @@ -255,7 +255,7 @@ def main(config_file): zone_id=config.zone_id, filter_by_activity=True, activity_col="dact", - time_tolerance=0.3, + time_tolerance=0.2, detour_factor=config.feasible_assignment.detour_factor, decay_rate=config.feasible_assignment.decay_rate, ) diff --git a/scripts/run_pipeline.sh b/scripts/run_pipeline.sh index 8944c2b..9da0845 100755 --- 
a/scripts/run_pipeline.sh +++ b/scripts/run_pipeline.sh @@ -5,11 +5,11 @@ set -e # python scripts/0_preprocess_inputs.py --config_file $1 # python scripts/0.1_run_osmox.py --config_file $1 # python scripts/1_prep_synthpop.py --config_file $1 -# python scripts/2_match_households_and_individuals.py --config_file $1 -# python scripts/3.1_assign_primary_feasible_zones.py --config_file $1 -# python scripts/3.2.1_assign_primary_zone_edu.py --config_file $1 -# python scripts/3.2.2_assign_primary_zone_work.py --config_file $1 -# python scripts/3.2.3_assign_secondary_zone.py --config_file $1 +python scripts/2_match_households_and_individuals.py --config_file $1 +python scripts/3.1_assign_primary_feasible_zones.py --config_file $1 +python scripts/3.2.1_assign_primary_zone_edu.py --config_file $1 +python scripts/3.2.2_assign_primary_zone_work.py --config_file $1 +python scripts/3.2.3_assign_secondary_zone.py --config_file $1 python scripts/3.3_assign_facility_all.py --config_file $1 python scripts/4_validation.py --config_file $1 python scripts/5_acbm_to_matsim_xml.py --config_file $1 diff --git a/src/acbm/assigning/utils.py b/src/acbm/assigning/utils.py index 3b2214e..fb6d7cc 100644 --- a/src/acbm/assigning/utils.py +++ b/src/acbm/assigning/utils.py @@ -308,7 +308,7 @@ def _adjust_distance( float The adjusted distance. 
""" - return distance * (1 + ((detour_factor - 1) * np.exp(-decay_rate * distance))) + return distance * ((1 + (detour_factor - 1)) * np.exp(-decay_rate * distance)) def zones_to_time_matrix( From 9fa6b0f48657c124ec883647f887778afe4ec939 Mon Sep 17 00:00:00 2001 From: Hussein Mahfouz <45176416+Hussein-Mahfouz@users.noreply.github.com> Date: Fri, 13 Dec 2024 19:34:01 +0100 Subject: [PATCH 03/28] undo update _adjust_distance --- src/acbm/assigning/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/acbm/assigning/utils.py b/src/acbm/assigning/utils.py index fb6d7cc..3b2214e 100644 --- a/src/acbm/assigning/utils.py +++ b/src/acbm/assigning/utils.py @@ -308,7 +308,7 @@ def _adjust_distance( float The adjusted distance. """ - return distance * ((1 + (detour_factor - 1)) * np.exp(-decay_rate * distance)) + return distance * (1 + ((detour_factor - 1) * np.exp(-decay_rate * distance))) def zones_to_time_matrix( From 40d2e2f4714d03678996cdbebc7135d636836107 Mon Sep 17 00:00:00 2001 From: Hussein Mahfouz <45176416+Hussein-Mahfouz@users.noreply.github.com> Date: Fri, 13 Dec 2024 19:38:16 +0100 Subject: [PATCH 04/28] edit intrazone --- src/acbm/assigning/utils.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/acbm/assigning/utils.py b/src/acbm/assigning/utils.py index 3b2214e..2dd37c1 100644 --- a/src/acbm/assigning/utils.py +++ b/src/acbm/assigning/utils.py @@ -483,7 +483,8 @@ def intrazone_time(zones: gpd.GeoDataFrame, key_column: str) -> dict: # Calculate the area of each zone zones["area"] = zones["geometry"].area # Calculate the average distance within each zone - zones["average_dist"] = np.sqrt(zones["area"]) / 2 + # sqrt(area) / 2 would be radius + zones["average_dist"] = np.sqrt(zones["area"]) / 1.5 # Mode speeds in m/s mode_speeds_mps = { From 3e521eac1b7a6271d5403e9b272515d7eaf9733d Mon Sep 17 00:00:00 2001 From: Sam Greenbury Date: Tue, 7 Jan 2025 16:37:34 +0000 Subject: [PATCH 05/28] Fix missing function args --- 
scripts/3.3_assign_facility_all.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/scripts/3.3_assign_facility_all.py b/scripts/3.3_assign_facility_all.py index ba73bdf..1d197c3 100644 --- a/scripts/3.3_assign_facility_all.py +++ b/scripts/3.3_assign_facility_all.py @@ -361,6 +361,9 @@ def main(config_file): y_col="time", x_label="Reported Travel TIme (min)", y_label="Modelled time (min)", + detour_factor=config.feasible_assignment.detour_factor, + decay_rate=config.feasible_assignment.decay_rate, + x_axis_max=180, crs=f"EPSG:{config.output_crs}", title_prefix="Scatter plot of TripTotalTime vs. Modelled time", save_dir=config.output_path / "plots/assigning/", From 5b6cfbbb8c53694c1b5849ad8a409f9670d743aa Mon Sep 17 00:00:00 2001 From: Hussein Mahfouz <45176416+Hussein-Mahfouz@users.noreply.github.com> Date: Fri, 10 Jan 2025 15:38:11 +0100 Subject: [PATCH 06/28] config for reference --- config/base_all_msoa.toml | 78 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 78 insertions(+) create mode 100644 config/base_all_msoa.toml diff --git a/config/base_all_msoa.toml b/config/base_all_msoa.toml new file mode 100644 index 0000000..4593485 --- /dev/null +++ b/config/base_all_msoa.toml @@ -0,0 +1,78 @@ +[parameters] +seed = 0 + # this is used to query poi data from osm and to load in SPC data +region = "leeds" +# how many people from the SPC do we want to run the model for? 
Comment out if you want to run the analysis on the entire SPC populaiton +number_of_households = 25000 +# "OA21CD": OA level, "MSOA11CD": MSOA level +zone_id = "MSOA21CD" + # Only set to true if you have travel time matrix at the level specified in boundary_geography +travel_times = false +boundary_geography = "MSOA" +# NTS years to use +nts_years = [2019, 2021, 2022] +# NTS regions to use +nts_regions = [ + 'Yorkshire and the Humber', + 'North West', + 'North East', + 'East Midlands', + 'West Midlands', + 'East of England', + 'South East', + 'South West'] +# nts day of the week to use +# 1: Monday, 2: Tuesday, 3: Wednesday, 4: Thursday, 5: Friday, 6: Saturday, 7: Sunday +nts_day_of_week = 3 +# what crs do we want the output to be in? (just add the number, e.g. 3857) +output_crs = 3857 + +[matching] +# for optional and required columns, see the [iterative_match_categorical](https://github.com/Urban-Analytics-Technology-Platform/acbm/blob/ca181c54d7484ebe44706ff4b43c26286b22aceb/src/acbm/matching.py#L110) function +# Do not add any column not listed below. You can only move a column from optional to require (or vise versa) +required_columns = [ + "number_adults", + "number_children", + "num_pension_age", +] +optional_columns = [ + "number_cars", + "rural_urban_2_categories", + "employment_status", + "tenure_status", +] +# What is the maximum number of NTS matches we want for each SPC household? +n_matches = 10 + +[feasible_assignment] +# detour factor when converting euclidian distance to actual travel distance +detour_factor = 1.56 +# decay rate when converting euclidian to travel distance (0.0001 is a good value) +# actual_distance = distance * (1 + ((detour_factor - 1) * np.exp(-decay_rate * distance))) +decay_rate = 0.0001 + +[work_assignment] +commute_level = "MSOA" +# if true, optimization problem will try to minimize percentage difference at OD level (not absolute numbers). 
Recommended to set it to true +use_percentages = true +# weights to add for each objective in the optimization problem +weight_max_dev = 0.2 +weight_total_dev = 0.8 +# maximum number of feasible zones to include in the optimization problem (less zones makes problem smaller - so faster, but at the cost of a better solution) +max_zones = 10 + +[postprocessing] +pam_jitter = 30 +pam_min_duration = 10 +# for get_pt_subscription: everyone above this age has a subscription (pensioners get free travel) +# TODO: more sophisticated approach +pt_subscription_age = 66 +# to define if a person is a student: +# eveyone below this age is a student +student_age_base = 16 +# everyone below this age that has at least one "education" activity is a student +student_age_upper = 30 +# eveyone who uses one of the modes below is classified as a passenger (isPassenger = True) +modes_passenger = ['car_passenger', 'taxi'] +# yearly state pension: for getting hhlIncome of pensioners +state_pension = 11502 From 0e06e29d9cf9494d935102111feccb58203e348b Mon Sep 17 00:00:00 2001 From: Sam Greenbury Date: Tue, 14 Jan 2025 18:33:11 +0000 Subject: [PATCH 07/28] Update config --- config/base.toml | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/config/base.toml b/config/base.toml index acd93c6..692c9b5 100644 --- a/config/base.toml +++ b/config/base.toml @@ -55,3 +55,15 @@ max_zones = 8 # maximum number of feasible zones to include in the opti [postprocessing] pam_jitter = 30 pam_min_duration = 10 +# for get_pt_subscription: everyone above this age has a subscription (pensioners get free travel) +# TODO: more sophisticated approach +pt_subscription_age = 66 +# to define if a person is a student: +# eveyone below this age is a student +student_age_base = 16 +# everyone below this age that has at least one "education" activity is a student +student_age_upper = 30 +# eveyone who uses one of the modes below is classified as a passenger (isPassenger = True) +modes_passenger = 
['car_passenger', 'taxi'] +# yearly state pension: for getting hhlIncome of pensioners +state_pension = 11502 From 9c0003ab677c78ee1692c6e20a0918975138de6c Mon Sep 17 00:00:00 2001 From: Sam Greenbury Date: Tue, 14 Jan 2025 18:38:54 +0000 Subject: [PATCH 08/28] Fix test --- tests/test_config.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_config.py b/tests/test_config.py index 2bf69d6..759816e 100644 --- a/tests/test_config.py +++ b/tests/test_config.py @@ -11,4 +11,4 @@ def config(): def test_id(config): - assert config.id == "a89b65de35" + assert config.id == "0ebb8c3ee7" From 6164a281421f4c617441aca63ab61c669ec4c1b0 Mon Sep 17 00:00:00 2001 From: Sam Greenbury Date: Tue, 7 Jan 2025 21:41:17 +0000 Subject: [PATCH 09/28] Adjust matching to account for common travel day --- config/base.toml | 2 +- scripts/2_match_households_and_individuals.py | 13 ++++++++- scripts/3.1_assign_primary_feasible_zones.py | 2 +- scripts/3.2.1_assign_primary_zone_edu.py | 2 +- scripts/3.2.2_assign_primary_zone_work.py | 2 +- scripts/3.2.3_assign_secondary_zone.py | 2 +- scripts/4_validation.py | 2 +- src/acbm/assigning/feasible_zones_primary.py | 2 +- src/acbm/config.py | 2 +- src/acbm/utils.py | 27 +++++++++++++++++++ tests/test_utils.py | 21 +++++++++++++++ 11 files changed, 68 insertions(+), 9 deletions(-) create mode 100644 tests/test_utils.py diff --git a/config/base.toml b/config/base.toml index 692c9b5..a8b8357 100644 --- a/config/base.toml +++ b/config/base.toml @@ -20,7 +20,7 @@ nts_regions = [ ] # nts day of the week to use # 1: Monday, 2: Tuesday, 3: Wednesday, 4: Thursday, 5: Friday, 6: Saturday, 7: Sunday -nts_day_of_week = 3 +nts_days_of_week = [3] # what crs do we want the output to be in? (just add the number, e.g. 
3857) output_crs = 3857 diff --git a/scripts/2_match_households_and_individuals.py b/scripts/2_match_households_and_individuals.py index b86ca93..a3b7496 100644 --- a/scripts/2_match_households_and_individuals.py +++ b/scripts/2_match_households_and_individuals.py @@ -16,6 +16,7 @@ transform_by_group, truncate_values, ) +from acbm.utils import households_with_common_travel_days @acbm_cli @@ -237,8 +238,18 @@ def get_interim_path( nts_households = nts_filter_by_region(nts_households, psu, regions) nts_trips = nts_filter_by_region(nts_trips, psu, regions) - # Create dictionaries of key value pairs + # Ensure that the households have at least one day in `nts_days_of_week` that + # all household members have trips for + hids = households_with_common_travel_days( + nts_trips, config.parameters.nts_days_of_week + ) + # Subset individuals and households given filtering of trips + nts_trips = nts_trips[nts_trips["HouseholdID"].isin(hids)] + nts_individuals = nts_individuals[nts_individuals["HouseholdID"].isin(hids)] + nts_households = nts_households[nts_households["HouseholdID"].isin(hids)] + + # Create dictionaries of key value pairs """ guide to the dictionaries: diff --git a/scripts/3.1_assign_primary_feasible_zones.py b/scripts/3.1_assign_primary_feasible_zones.py index 392be8a..4da27c7 100644 --- a/scripts/3.1_assign_primary_feasible_zones.py +++ b/scripts/3.1_assign_primary_feasible_zones.py @@ -31,7 +31,7 @@ def main(config_file): # Filter to a specific day of the week logger.info("Filtering activity chains to a specific day of the week") activity_chains = activity_chains[ - activity_chains["TravDay"] == config.parameters.nts_day_of_week + activity_chains["TravDay"].isin(config.parameters.nts_days_of_week) ] # --- Study area boundaries diff --git a/scripts/3.2.1_assign_primary_zone_edu.py b/scripts/3.2.1_assign_primary_zone_edu.py index 183a46f..58a3822 100644 --- a/scripts/3.2.1_assign_primary_zone_edu.py +++ b/scripts/3.2.1_assign_primary_zone_edu.py @@ -54,7 
+54,7 @@ def main(config_file): config, columns=cols_for_assignment_edu() ) activity_chains = activity_chains[ - activity_chains["TravDay"] == config.parameters.nts_day_of_week + activity_chains["TravDay"].isin(config.parameters.nts_days_of_week) ] logger.info("Filtering activity chains for trip purpose: education") diff --git a/scripts/3.2.2_assign_primary_zone_work.py b/scripts/3.2.2_assign_primary_zone_work.py index 1c5e98b..98256d1 100644 --- a/scripts/3.2.2_assign_primary_zone_work.py +++ b/scripts/3.2.2_assign_primary_zone_work.py @@ -45,7 +45,7 @@ def main(config_file): activity_chains = activity_chains_for_assignment(config, cols_for_assignment_work()) activity_chains = activity_chains[ - activity_chains["TravDay"] == config.parameters.nts_day_of_week + activity_chains["TravDay"].isin(config.parameters.nts_days_of_week) ] logger.info("Filtering activity chains for trip purpose: work") diff --git a/scripts/3.2.3_assign_secondary_zone.py b/scripts/3.2.3_assign_secondary_zone.py index a87ea10..66f1145 100644 --- a/scripts/3.2.3_assign_secondary_zone.py +++ b/scripts/3.2.3_assign_secondary_zone.py @@ -40,7 +40,7 @@ def main(config_file): activity_chains = activity_chains_for_assignment(config) activity_chains = activity_chains[ - activity_chains["TravDay"] == config.parameters.nts_day_of_week + activity_chains["TravDay"].isin(config.parameters.nts_days_of_week) ] # TODO: remove obsolete comment diff --git a/scripts/4_validation.py b/scripts/4_validation.py index ba246ce..ab64025 100644 --- a/scripts/4_validation.py +++ b/scripts/4_validation.py @@ -29,7 +29,7 @@ def main(config_file): # NTS data legs_nts = pd.read_parquet(config.output_path / "nts_trips.parquet") - legs_nts = legs_nts[legs_nts["TravDay"] == config.parameters.nts_day_of_week] + legs_nts = legs_nts[legs_nts["TravDay"].isin(config.parameters.nts_days_of_week)] # Model outputs legs_acbm = pd.read_csv(config.output_path / "legs.csv") diff --git a/src/acbm/assigning/feasible_zones_primary.py 
b/src/acbm/assigning/feasible_zones_primary.py index 9ef9f26..9c3eb85 100644 --- a/src/acbm/assigning/feasible_zones_primary.py +++ b/src/acbm/assigning/feasible_zones_primary.py @@ -25,7 +25,7 @@ activity_chains_schema = DataFrameSchema( { "mode": Column(str), - "TravDay": Column(pa.Float, Check.isin([1, 2, 3, 4, 5, 6, 7]), nullable=True), + # "TravDay": Column(pa.Float, Check.isin([1, 2, 3, 4, 5, 6, 7]), nullable=True), "tst": Column(pa.Float, Check.less_than_or_equal_to(1440), nullable=True), "TripTotalTime": Column(pa.Float, nullable=True), # TODO: add more columns ... diff --git a/src/acbm/config.py b/src/acbm/config.py index 477add7..4495700 100644 --- a/src/acbm/config.py +++ b/src/acbm/config.py @@ -26,7 +26,7 @@ class Parameters(BaseModel): boundary_geography: str nts_years: list[int] nts_regions: list[str] - nts_day_of_week: int + nts_days_of_week: list[int] output_crs: int tolerance_work: float | None = None tolerance_edu: float | None = None diff --git a/src/acbm/utils.py b/src/acbm/utils.py index 41fbed7..df8bdfc 100644 --- a/src/acbm/utils.py +++ b/src/acbm/utils.py @@ -39,3 +39,30 @@ def get_travel_times(config: Config, use_estimates: bool = False) -> pd.DataFram if config.parameters.travel_times and not use_estimates: return pd.read_parquet(config.travel_times_filepath) return pd.read_parquet(config.travel_times_estimates_filepath) + + +def households_with_common_travel_days( + nts_trips: pd.DataFrame, days: list[int] +) -> list[int]: + return ( + nts_trips.groupby(["HouseholdID", "IndividualID"])["TravDay"] + .apply(list) + .map(set) + .to_frame() + .groupby(["HouseholdID"])["TravDay"] + .apply( + lambda sets_of_days: set.intersection(*sets_of_days) + if set.intersection(*sets_of_days) + else None + ) + .to_frame()["TravDay"] + .apply( + lambda common_days: [day for day in common_days if day in days] + if common_days is not None + else [] + ) + .apply(lambda common_days: common_days if common_days else pd.NA) + .dropna() + 
.reset_index()["HouseholdID"] + .to_list() + ) diff --git a/tests/test_utils.py b/tests/test_utils.py new file mode 100644 index 0000000..0a90147 --- /dev/null +++ b/tests/test_utils.py @@ -0,0 +1,21 @@ +import pandas as pd +import pytest + +from acbm.utils import households_with_common_travel_days + + +@pytest.fixture +def nts_trips(): + return pd.DataFrame.from_dict( + { + "IndividualID": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10], + "HouseholdID": [1, 1, 1, 2, 2, 2, 3, 3, 3, 3], + "TravDay": [1, 1, 1, 2, 3, 2, 3, 3, 3, 3], + } + ) + + +def test_households_with_common_travel_days(nts_trips): + assert households_with_common_travel_days(nts_trips, [1]) == [1] + assert households_with_common_travel_days(nts_trips, [1, 2]) == [1] + assert households_with_common_travel_days(nts_trips, [1, 3]) == [1, 3] From 0cea61ffc671df25c628d747c0c115446ed7311b Mon Sep 17 00:00:00 2001 From: Sam Greenbury Date: Wed, 8 Jan 2025 16:22:50 +0000 Subject: [PATCH 10/28] Add 'DayID' to outputs --- src/acbm/assigning/utils.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/acbm/assigning/utils.py b/src/acbm/assigning/utils.py index d866d9e..efbd21e 100644 --- a/src/acbm/assigning/utils.py +++ b/src/acbm/assigning/utils.py @@ -18,6 +18,7 @@ def cols_for_assignment_all() -> list[str]: "age_years", "TripDisIncSW", "tet", + "DayID", ] From 88f4911febefc80e4d562f9461d9ad975afdd510 Mon Sep 17 00:00:00 2001 From: Sam Greenbury Date: Wed, 8 Jan 2025 17:43:24 +0000 Subject: [PATCH 11/28] Fix missing subset of nts_trips --- scripts/2_match_households_and_individuals.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/scripts/2_match_households_and_individuals.py b/scripts/2_match_households_and_individuals.py index a3b7496..ab4f8fb 100644 --- a/scripts/2_match_households_and_individuals.py +++ b/scripts/2_match_households_and_individuals.py @@ -245,7 +245,10 @@ def get_interim_path( ) # Subset individuals and households given filtering of trips - nts_trips = 
nts_trips[nts_trips["HouseholdID"].isin(hids)] + nts_trips = nts_trips[ + nts_trips["HouseholdID"].isin(hids) + & nts_trips["TravDay"].isin(config.parameters.nts_days_of_week) + ] nts_individuals = nts_individuals[nts_individuals["HouseholdID"].isin(hids)] nts_households = nts_households[nts_households["HouseholdID"].isin(hids)] From e0fc2f9135a398b114f5b3b82acea33086d16ef4 Mon Sep 17 00:00:00 2001 From: Sam Greenbury Date: Wed, 8 Jan 2025 17:46:37 +0000 Subject: [PATCH 12/28] Add interim output with chosen random day for households --- scripts/3.1_assign_primary_feasible_zones.py | 21 ++++++++++++++++---- scripts/3.2.1_assign_primary_zone_edu.py | 5 +---- scripts/3.2.2_assign_primary_zone_work.py | 8 +++----- scripts/3.2.3_assign_secondary_zone.py | 5 +---- src/acbm/assigning/utils.py | 18 +++++++++++++---- 5 files changed, 36 insertions(+), 21 deletions(-) diff --git a/scripts/3.1_assign_primary_feasible_zones.py b/scripts/3.1_assign_primary_feasible_zones.py index 4da27c7..809c1b3 100644 --- a/scripts/3.1_assign_primary_feasible_zones.py +++ b/scripts/3.1_assign_primary_feasible_zones.py @@ -1,6 +1,7 @@ import pickle as pkl import geopandas as gpd +import numpy as np import pandas as pd from acbm.assigning.feasible_zones_primary import get_possible_zones @@ -28,11 +29,23 @@ def main(config_file): activity_chains = activity_chains_for_assignment(config) logger.info("Activity chains loaded") - # Filter to a specific day of the week logger.info("Filtering activity chains to a specific day of the week") - activity_chains = activity_chains[ - activity_chains["TravDay"].isin(config.parameters.nts_days_of_week) - ] + + # Generate random sample of days by household + activity_chains.merge( + activity_chains.groupby(["household"])["TravDay"] + .apply(np.unique) + .apply(np.random.choice) + .rename("ChosenTravDay"), + left_on=["household", "TravDay"], + right_on=["household", "ChosenTravDay"], + how="inner", + )[["id", "household", 
"TravDay"]].drop_duplicates().to_parquet( + config.output_path / "interim" / "assigning" / "chosen_trav_day.parquet" + ) + + # Filter to chosen day + activity_chains = activity_chains_for_assignment(config, subset_to_chosen_day=True) # --- Study area boundaries diff --git a/scripts/3.2.1_assign_primary_zone_edu.py b/scripts/3.2.1_assign_primary_zone_edu.py index 58a3822..d540eec 100644 --- a/scripts/3.2.1_assign_primary_zone_edu.py +++ b/scripts/3.2.1_assign_primary_zone_edu.py @@ -51,11 +51,8 @@ def main(config_file): logger.info("Loading activity chains") activity_chains = activity_chains_for_assignment( - config, columns=cols_for_assignment_edu() + config, columns=cols_for_assignment_edu(), subset_to_chosen_day=True ) - activity_chains = activity_chains[ - activity_chains["TravDay"].isin(config.parameters.nts_days_of_week) - ] logger.info("Filtering activity chains for trip purpose: education") activity_chains_edu = activity_chains[activity_chains["dact"] == "education"] diff --git a/scripts/3.2.2_assign_primary_zone_work.py b/scripts/3.2.2_assign_primary_zone_work.py index 98256d1..2cd561b 100644 --- a/scripts/3.2.2_assign_primary_zone_work.py +++ b/scripts/3.2.2_assign_primary_zone_work.py @@ -42,11 +42,9 @@ def main(config_file): # --- Activity chains logger.info("Loading activity chains") - - activity_chains = activity_chains_for_assignment(config, cols_for_assignment_work()) - activity_chains = activity_chains[ - activity_chains["TravDay"].isin(config.parameters.nts_days_of_week) - ] + activity_chains = activity_chains_for_assignment( + config, cols_for_assignment_work(), subset_to_chosen_day=True + ) logger.info("Filtering activity chains for trip purpose: work") activity_chains_work = activity_chains[activity_chains["dact"] == "work"] diff --git a/scripts/3.2.3_assign_secondary_zone.py b/scripts/3.2.3_assign_secondary_zone.py index 66f1145..0f8b045 100644 --- a/scripts/3.2.3_assign_secondary_zone.py +++ b/scripts/3.2.3_assign_secondary_zone.py @@ -38,10 
+38,7 @@ def main(config_file): # --- Load in the data logger.info("Loading: activity chains") - activity_chains = activity_chains_for_assignment(config) - activity_chains = activity_chains[ - activity_chains["TravDay"].isin(config.parameters.nts_days_of_week) - ] + activity_chains = activity_chains_for_assignment(config, subset_to_chosen_day=True) # TODO: remove obsolete comment # --- Add OA21CD to the data diff --git a/src/acbm/assigning/utils.py b/src/acbm/assigning/utils.py index efbd21e..53ce213 100644 --- a/src/acbm/assigning/utils.py +++ b/src/acbm/assigning/utils.py @@ -11,7 +11,6 @@ def cols_for_assignment_all() -> list[str]: """Gets activity chains with subset of columns required for assignment.""" return [ *cols_for_assignment_edu(), - "household", "oact", "nts_ind_id", "nts_hh_id", @@ -25,12 +24,13 @@ def cols_for_assignment_all() -> list[str]: def cols_for_assignment_edu() -> list[str]: """Gets activity chains with subset of columns required for assignment.""" return [ + "id", + "household", "TravDay", "OA11CD", "dact", "mode", "tst", - "id", "seq", "TripTotalTime", "education_type", @@ -43,16 +43,26 @@ def cols_for_assignment_work() -> list[str]: def activity_chains_for_assignment( - config: Config, columns: list[str] | None = None + config: Config, columns: list[str] | None = None, subset_to_chosen_day: bool = False ) -> pd.DataFrame: """Gets activity chains with subset of columns required for assignment.""" if columns is None: columns = cols_for_assignment_all() - return pd.read_parquet( + activity_chains = pd.read_parquet( config.spc_with_nts_trips_filepath, columns=columns, ) + if not subset_to_chosen_day: + return activity_chains + + return activity_chains.merge( + pd.read_parquet( + config.output_path / "interim" / "assigning" / "chosen_trav_day.parquet" + ), + on=["id", "household", "TravDay"], + how="inner", + ) def _map_time_to_day_part( From ed45b922b3de2294b9971b4d10ae0a2fb3e32058 Mon Sep 17 00:00:00 2001 From: Sam Greenbury Date: Thu, 9 
Jan 2025 18:40:49 +0000 Subject: [PATCH 13/28] Add todo --- scripts/2_match_households_and_individuals.py | 1 + 1 file changed, 1 insertion(+) diff --git a/scripts/2_match_households_and_individuals.py b/scripts/2_match_households_and_individuals.py index ab4f8fb..09b2eb2 100644 --- a/scripts/2_match_households_and_individuals.py +++ b/scripts/2_match_households_and_individuals.py @@ -234,6 +234,7 @@ def get_interim_path( regions = config.parameters.nts_regions + # TODO: Currently this only seems to work for 2019, check this nts_individuals = nts_filter_by_region(nts_individuals, psu, regions) nts_households = nts_filter_by_region(nts_households, psu, regions) nts_trips = nts_filter_by_region(nts_trips, psu, regions) From ce5a261d320f96a81dd6f3cacc8c99167929ce4a Mon Sep 17 00:00:00 2001 From: Sam Greenbury Date: Tue, 14 Jan 2025 18:13:34 +0000 Subject: [PATCH 14/28] Handle missing data and revise test for common trav days --- src/acbm/utils.py | 2 ++ tests/test_utils.py | 24 +++++++++++++++++++++--- 2 files changed, 23 insertions(+), 3 deletions(-) diff --git a/src/acbm/utils.py b/src/acbm/utils.py index df8bdfc..91469e1 100644 --- a/src/acbm/utils.py +++ b/src/acbm/utils.py @@ -59,6 +59,8 @@ def households_with_common_travel_days( .apply( lambda common_days: [day for day in common_days if day in days] if common_days is not None + and common_days != {pd.NA} + and common_days != {np.nan} else [] ) .apply(lambda common_days: common_days if common_days else pd.NA) diff --git a/tests/test_utils.py b/tests/test_utils.py index 0a90147..565ce31 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -8,9 +8,26 @@ def nts_trips(): return pd.DataFrame.from_dict( { - "IndividualID": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10], - "HouseholdID": [1, 1, 1, 2, 2, 2, 3, 3, 3, 3], - "TravDay": [1, 1, 1, 2, 3, 2, 3, 3, 3, 3], + "IndividualID": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16], + "HouseholdID": [1, 1, 1, 2, 2, 2, 3, 3, 3, 3, 4, 4, 4, 5, 5, 5], + "TravDay": [ + 1, + 
1, + 1, + 2, + 3, + 2, + 3, + 3, + 3, + 3, + pd.NA, + pd.NA, + pd.NA, + pd.NA, + pd.NA, + 4, + ], } ) @@ -19,3 +36,4 @@ def test_households_with_common_travel_days(nts_trips): assert households_with_common_travel_days(nts_trips, [1]) == [1] assert households_with_common_travel_days(nts_trips, [1, 2]) == [1] assert households_with_common_travel_days(nts_trips, [1, 3]) == [1, 3] + assert households_with_common_travel_days(nts_trips, [1, 3, 4]) == [1, 3] From ced648d280316467d465308703ad3847f05987fb Mon Sep 17 00:00:00 2001 From: Sam Greenbury Date: Tue, 14 Jan 2025 18:42:12 +0000 Subject: [PATCH 15/28] Fix test --- tests/test_config.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_config.py b/tests/test_config.py index 759816e..0d2372e 100644 --- a/tests/test_config.py +++ b/tests/test_config.py @@ -11,4 +11,4 @@ def config(): def test_id(config): - assert config.id == "0ebb8c3ee7" + assert config.id == "464741c848" From 5f17c5656760be59aec800523a9052165fa14a5b Mon Sep 17 00:00:00 2001 From: Sam Greenbury Date: Wed, 15 Jan 2025 17:07:30 +0000 Subject: [PATCH 16/28] Add chosen travel day modelling using pwkstat --- src/acbm/assigning/utils.py | 90 +++++++++++++++++++++++++++++++++++++ src/acbm/config.py | 5 +++ src/acbm/utils.py | 8 ++-- 3 files changed, 99 insertions(+), 4 deletions(-) diff --git a/src/acbm/assigning/utils.py b/src/acbm/assigning/utils.py index 53ce213..b099463 100644 --- a/src/acbm/assigning/utils.py +++ b/src/acbm/assigning/utils.py @@ -3,6 +3,7 @@ import geopandas as gpd import numpy as np import pandas as pd +import polars as pl from acbm.config import Config @@ -573,3 +574,92 @@ def replace_intrazonal_travel_time( # Return the modified DataFrame return travel_times_copy + + +def get_chosen_day(config: Config) -> pd.DataFrame: + """Gets the chosen day for population given config.""" + acs = pl.DataFrame(activity_chains_for_assignment(config)) + work_days = ( + acs.filter(pl.col("dact").eq("work")) + .group_by("id") 
+ .agg(pl.col("TravDay").unique()) + .select(["id", pl.col("TravDay").list.drop_nulls().list.sample(n=1)]) + .explode("TravDay") + .rename({"TravDay": "TravDayWork"}) + ) + non_work_days = ( + acs.filter(~pl.col("dact").eq("work")) + .group_by("id") + .agg(pl.col("TravDay").unique()) + .select(["id", pl.col("TravDay").list.drop_nulls().list.sample(n=1)]) + .explode("TravDay") + .rename({"TravDay": "TravDayNonWork"}) + ) + + any_days = ( + acs.group_by("id") + .agg(pl.col("TravDay").unique()) + .select(["id", pl.col("TravDay").list.drop_nulls()]) + .select( + [ + "id", + pl.when(pl.col("TravDay").list.len() > 0) + # Note: this has to be set to with_replacement despite non-empty check + .then(pl.col("TravDay").list.sample(n=1, with_replacement=True)) + .otherwise(None), + ] + ) + .explode("TravDay") + .rename({"TravDay": "TravDayAny"}) + ).sort("id") + + # Combine day choices for different conditions + acs_combine = ( + acs.join(work_days, on="id", how="left", coalesce=True) + .join(non_work_days, on="id", how="left", coalesce=True) + .join(any_days, on="id", how="left", coalesce=True) + .join( + pl.scan_parquet(config.spc_combined_filepath) + .select(["id", "pwkstat"]) + .collect(), + on="id", + ) + ) + + # Choose a day given pwkstat + acs_combine = acs_combine.with_columns( + [ + # If pwkstat = 1 (full time) + # and a work travel day is available + pl.when(pl.col("pwkstat").eq(1) & pl.col("TravDayWork").is_not_null()) + .then(pl.col("TravDayWork")) + .otherwise( + # If pwkstat = 1 (full time) + # and a work travel day is NOT available + pl.when(pl.col("pwkstat").eq(1) & pl.col("TravDayWork").is_null()) + .then(pl.col("TravDayAny")) + .otherwise( + # If pwkstat = 2 (part time) + # and a work travel day is available + # and a non-work travel day is available + pl.when( + pl.col("pwkstat").eq(2) + & pl.col("TravDayWork").is_not_null() + & pl.col("TravDayNonWork").is_not_null() + ) + .then( + # Sample either TravDayWork or TravDayNonWork + # stochastically given config 
+ pl.col("TravDayWork") + # TODO: update from config + if np.random.random() < 1 + else pl.col("TravDayNonWork") + ) + .otherwise(pl.col("TravDayAny")) + ) + ) + .alias("ChosenTravDay") + ] + ) + + return acs_combine.select(["id", "ChosenTravDay"]).unique().sort("id").to_pandas() diff --git a/src/acbm/config.py b/src/acbm/config.py index 4495700..be157be 100644 --- a/src/acbm/config.py +++ b/src/acbm/config.py @@ -9,6 +9,7 @@ import geopandas as gpd import jcs import numpy as np +import polars as pl import tomlkit from pydantic import BaseModel, Field, field_serializer, field_validator @@ -30,6 +31,8 @@ class Parameters(BaseModel): output_crs: int tolerance_work: float | None = None tolerance_edu: float | None = None + common_household_day: bool = False + part_time_work_prob: float = 0.7 @dataclass(frozen=True) @@ -359,6 +362,8 @@ def init_rng(self): try: np.random.seed(self.seed) random.seed(self.seed) + pl.set_random_seed(self.seed) + except Exception as err: msg = f"config does not provide a rng seed with err: {err}" raise ValueError(msg) from err diff --git a/src/acbm/utils.py b/src/acbm/utils.py index 91469e1..42a328a 100644 --- a/src/acbm/utils.py +++ b/src/acbm/utils.py @@ -42,14 +42,14 @@ def get_travel_times(config: Config, use_estimates: bool = False) -> pd.DataFram def households_with_common_travel_days( - nts_trips: pd.DataFrame, days: list[int] + nts_trips: pd.DataFrame, days: list[int], hid="HouseholdID", pid="IndividualID" ) -> list[int]: return ( - nts_trips.groupby(["HouseholdID", "IndividualID"])["TravDay"] + nts_trips.groupby([hid, pid])["TravDay"] .apply(list) .map(set) .to_frame() - .groupby(["HouseholdID"])["TravDay"] + .groupby([hid])["TravDay"] .apply( lambda sets_of_days: set.intersection(*sets_of_days) if set.intersection(*sets_of_days) @@ -65,6 +65,6 @@ def households_with_common_travel_days( ) .apply(lambda common_days: common_days if common_days else pd.NA) .dropna() - .reset_index()["HouseholdID"] + .reset_index()[hid] .to_list() ) 
From 4b6db8c4f619ad7aad41e667bf513d76eb41ba1d Mon Sep 17 00:00:00 2001 From: Sam Greenbury Date: Wed, 15 Jan 2025 18:09:47 +0000 Subject: [PATCH 17/28] Revise subsetting of households given trav day config --- scripts/2_match_households_and_individuals.py | 16 +++++-- scripts/3.1_assign_primary_feasible_zones.py | 12 +---- src/acbm/assigning/utils.py | 25 +++++++++- src/acbm/config.py | 2 +- src/acbm/utils.py | 34 ++++++++++++++ tests/test_config.py | 2 +- tests/test_utils.py | 46 +++++++++++++++++-- 7 files changed, 117 insertions(+), 20 deletions(-) diff --git a/scripts/2_match_households_and_individuals.py b/scripts/2_match_households_and_individuals.py index 09b2eb2..fd7d37e 100644 --- a/scripts/2_match_households_and_individuals.py +++ b/scripts/2_match_households_and_individuals.py @@ -16,7 +16,10 @@ transform_by_group, truncate_values, ) -from acbm.utils import households_with_common_travel_days +from acbm.utils import ( + households_with_common_travel_days, + households_with_travel_days_in_nts_weeks, +) @acbm_cli @@ -241,9 +244,14 @@ def get_interim_path( # Ensure that the households have at least one day in `nts_days_of_week` that # all household members have trips for - hids = households_with_common_travel_days( - nts_trips, config.parameters.nts_days_of_week - ) + if config.parameters.common_household_day: + hids = households_with_common_travel_days( + nts_trips, config.parameters.nts_days_of_week + ) + else: + hids = households_with_travel_days_in_nts_weeks( + nts_trips, config.parameters.nts_days_of_week + ) # Subset individuals and households given filtering of trips nts_trips = nts_trips[ diff --git a/scripts/3.1_assign_primary_feasible_zones.py b/scripts/3.1_assign_primary_feasible_zones.py index 809c1b3..6da9d26 100644 --- a/scripts/3.1_assign_primary_feasible_zones.py +++ b/scripts/3.1_assign_primary_feasible_zones.py @@ -1,13 +1,13 @@ import pickle as pkl import geopandas as gpd -import numpy as np import pandas as pd from 
acbm.assigning.feasible_zones_primary import get_possible_zones from acbm.assigning.utils import ( activity_chains_for_assignment, get_activities_per_zone, + get_chosen_day, intrazone_time, replace_intrazonal_travel_time, zones_to_time_matrix, @@ -32,15 +32,7 @@ def main(config_file): logger.info("Filtering activity chains to a specific day of the week") # Generate random sample of days by household - activity_chains.merge( - activity_chains.groupby(["household"])["TravDay"] - .apply(np.unique) - .apply(np.random.choice) - .rename("ChosenTravDay"), - left_on=["household", "TravDay"], - right_on=["household", "ChosenTravDay"], - how="inner", - )[["id", "household", "TravDay"]].drop_duplicates().to_parquet( + get_chosen_day(config).to_parquet( config.output_path / "interim" / "assigning" / "chosen_trav_day.parquet" ) diff --git a/src/acbm/assigning/utils.py b/src/acbm/assigning/utils.py index b099463..020a5c4 100644 --- a/src/acbm/assigning/utils.py +++ b/src/acbm/assigning/utils.py @@ -579,6 +579,23 @@ def replace_intrazonal_travel_time( def get_chosen_day(config: Config) -> pd.DataFrame: """Gets the chosen day for population given config.""" acs = pl.DataFrame(activity_chains_for_assignment(config)) + + if config.parameters.common_household_day: + return ( + acs.join( + acs.group_by("household") + .agg(pl.col("TravDay").unique().sample(1, with_replacement=True)) + .explode("TravDay"), + on=["household", "TravDay"], + how="inner", + ) + .select(["id", "TravDay"]) + .unique() + .sort("id") + .to_pandas() + ) + + # For any TravDay and modelling increased households work_days = ( acs.filter(pl.col("dact").eq("work")) .group_by("id") @@ -662,4 +679,10 @@ def get_chosen_day(config: Config) -> pd.DataFrame: ] ) - return acs_combine.select(["id", "ChosenTravDay"]).unique().sort("id").to_pandas() + return ( + acs_combine.select(["id", "ChosenTravDay"]) + .unique() + .rename({"ChosenTravDay": "TravDay"}) + .sort("id") + .to_pandas() + ) diff --git a/src/acbm/config.py 
b/src/acbm/config.py index be157be..ed3daf4 100644 --- a/src/acbm/config.py +++ b/src/acbm/config.py @@ -31,7 +31,7 @@ class Parameters(BaseModel): output_crs: int tolerance_work: float | None = None tolerance_edu: float | None = None - common_household_day: bool = False + common_household_day: bool = True part_time_work_prob: float = 0.7 diff --git a/src/acbm/utils.py b/src/acbm/utils.py index 42a328a..5703840 100644 --- a/src/acbm/utils.py +++ b/src/acbm/utils.py @@ -1,5 +1,6 @@ import numpy as np import pandas as pd +import polars as pl from sklearn.metrics import mean_squared_error from acbm.config import Config @@ -68,3 +69,36 @@ def households_with_common_travel_days( .reset_index()[hid] .to_list() ) + + +def households_with_travel_days_in_nts_weeks( + nts_trips: pd.DataFrame, days: list[int], hid="HouseholdID", pid="IndividualID" +) -> list[int]: + return ( + pl.DataFrame(nts_trips) + .group_by([hid, pid]) + .agg(pl.col("TravDay").unique()) + .select( + [ + hid, + pid, + pl.col("TravDay").list.drop_nulls().list.set_intersection(pl.lit(days)), + ] + ) + .select( + [ + hid, + pid, + pl.when(pl.col("TravDay").list.len().eq(0)) + .then(None) + .otherwise(pl.col("TravDay")) + .alias("TravDay"), + ] + ) + .group_by(hid) + .agg(pl.col("TravDay").list.len().ne(0).all()) + .filter(pl.col("TravDay").eq(True)) + .get_column(hid) + .sort() + .to_list() + ) diff --git a/tests/test_config.py b/tests/test_config.py index 0d2372e..878afd3 100644 --- a/tests/test_config.py +++ b/tests/test_config.py @@ -11,4 +11,4 @@ def config(): def test_id(config): - assert config.id == "464741c848" + assert config.id == "5709d35b6f" diff --git a/tests/test_utils.py b/tests/test_utils.py index 565ce31..5da7ef9 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -1,15 +1,39 @@ import pandas as pd import pytest -from acbm.utils import households_with_common_travel_days +from acbm.utils import ( + households_with_common_travel_days, + households_with_travel_days_in_nts_weeks, +) 
@pytest.fixture def nts_trips(): return pd.DataFrame.from_dict( { - "IndividualID": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16], - "HouseholdID": [1, 1, 1, 2, 2, 2, 3, 3, 3, 3, 4, 4, 4, 5, 5, 5], + "IndividualID": [ + 1, + 2, + 3, + 4, + 5, + 6, + 7, + 8, + 9, + 10, + 11, + 12, + 13, + 14, + 15, + 16, + 17, + 18, + 19, + 20, + ], + "HouseholdID": [1, 1, 1, 2, 2, 2, 3, 3, 3, 3, 4, 4, 4, 5, 5, 5, 6, 6, 7, 7], "TravDay": [ 1, 1, @@ -27,6 +51,10 @@ def nts_trips(): pd.NA, pd.NA, 4, + 4, + 5, + 5, + pd.NA, ], } ) @@ -37,3 +65,15 @@ def test_households_with_common_travel_days(nts_trips): assert households_with_common_travel_days(nts_trips, [1, 2]) == [1] assert households_with_common_travel_days(nts_trips, [1, 3]) == [1, 3] assert households_with_common_travel_days(nts_trips, [1, 3, 4]) == [1, 3] + + +def test_households_with_travel_days_in_nts_weeks(nts_trips): + assert households_with_travel_days_in_nts_weeks(nts_trips, [1]) == [1] + assert households_with_travel_days_in_nts_weeks(nts_trips, [1, 2]) == [1] + assert households_with_travel_days_in_nts_weeks(nts_trips, [1, 3]) == [1, 3] + assert households_with_travel_days_in_nts_weeks(nts_trips, [1, 3, 4]) == [1, 3] + assert households_with_travel_days_in_nts_weeks(nts_trips, [1, 3, 4, 5]) == [ + 1, + 3, + 6, + ] From 1f5e0b94604b2fbca13b42123695e3a7e395f8e6 Mon Sep 17 00:00:00 2001 From: Sam Greenbury Date: Wed, 15 Jan 2025 18:19:08 +0000 Subject: [PATCH 18/28] Fix test --- tests/test_config.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_config.py b/tests/test_config.py index 878afd3..7f12069 100644 --- a/tests/test_config.py +++ b/tests/test_config.py @@ -11,4 +11,4 @@ def config(): def test_id(config): - assert config.id == "5709d35b6f" + assert config.id == "21e42c9d68" From 4ce4b1a3b4cbe394bfda7657850c017c81d27349 Mon Sep 17 00:00:00 2001 From: Sam Greenbury Date: Wed, 15 Jan 2025 18:35:59 +0000 Subject: [PATCH 19/28] Add logging for NTS filtering --- 
scripts/2_match_households_and_individuals.py | 10 +++++-- src/acbm/preprocessing.py | 27 +++++++++++++++++-- 2 files changed, 33 insertions(+), 4 deletions(-) diff --git a/scripts/2_match_households_and_individuals.py b/scripts/2_match_households_and_individuals.py index fd7d37e..4025cbb 100644 --- a/scripts/2_match_households_and_individuals.py +++ b/scripts/2_match_households_and_individuals.py @@ -226,22 +226,28 @@ def get_interim_path( logger.info("Filtering NTS data by specified year(s)") + logger.info(f"Total NTS households: {nts_households.shape[0]:,.0f}") years = config.parameters.nts_years nts_individuals = nts_filter_by_year(nts_individuals, psu, years) nts_households = nts_filter_by_year(nts_households, psu, years) nts_trips = nts_filter_by_year(nts_trips, psu, years) + logger.info( + f"Total NTS households (after year filtering): {nts_households.shape[0]:,.0f}" + ) # #### Filter by geography - # regions = config.parameters.nts_regions - # TODO: Currently this only seems to work for 2019, check this nts_individuals = nts_filter_by_region(nts_individuals, psu, regions) nts_households = nts_filter_by_region(nts_households, psu, regions) nts_trips = nts_filter_by_region(nts_trips, psu, regions) + logger.info( + f"Total NTS households (after region filtering): {nts_households.shape[0]:,.0f}" + ) + # Ensure that the households have at least one day in `nts_days_of_week` that # all household members have trips for if config.parameters.common_household_day: diff --git a/src/acbm/preprocessing.py b/src/acbm/preprocessing.py index 7601659..d6d0bdb 100644 --- a/src/acbm/preprocessing.py +++ b/src/acbm/preprocessing.py @@ -70,11 +70,10 @@ def nts_filter_by_year( data: pandas DataFrame The NTS data to be filtered - psu: pandas DataFrame - The Primary Sampling Unit table in the NTS. 
It has the year years: list The chosen year(s) """ + # return data.loc[data["SurveyYear"].isin(years)] # Check that all values of 'years' exist in the 'SurveyYear' column of 'psu' # Get the unique years in the 'SurveyYear' column of 'psu' @@ -111,6 +110,7 @@ def nts_filter_by_region( # 1. Create a column in the PSU table with the region names # Dictionary of the regions in the NTS and how they are coded + # PSUGOR_B02ID but does not have values for 2021 and 2022 region_dict = { -10.0: "DEAD", -9.0: "DNA", @@ -127,8 +127,31 @@ def nts_filter_by_region( 10.0: "Wales", 11.0: "Scotland", } + # PSUStatsReg_B01ID but does not have values for 2021 and 2022 + # region_dict = { + # -10.0: "DEAD", + # -9.0: "DNA", + # -8.0: "NA", + # 1.0: "Northern, Metropolitan", + # 2.0: "Northern, Non-metropolitan", + # 3.0: "Yorkshire / Humberside, Metropolitan", + # 4.0: "Yorkshire / Humberside, Non-metropolitan", + # 5.0: "East Midlands", + # 6.0: "East Anglia", + # 7.0: "South East (excluding London Boroughs)", + # 8.0: "London Boroughs", + # 9.0: "South West", + # 10.0: "West Midlands, Metropolitan", + # 11.0: "West Midlands, Non-metropolitan", + # 12.0: "North West, Metropolitan", + # 13.0: "North West, Non-metropolitan", + # 14.0: "Wales", + # 15.0: "Scotland", + # } + # In the PSU table, create a column with the region names psu["region_name"] = psu["PSUGOR_B02ID"].map(region_dict) + # psu["region_name"] = psu["PSUStatsReg_B01ID"].map(region_dict) # 2. 
Check that all values of 'years' exist in the 'SurveyYear' column of 'psu' From 46946d17d335b218ee82db13b726a3696a24ca8e Mon Sep 17 00:00:00 2001 From: Sam Greenbury Date: Wed, 15 Jan 2025 18:36:52 +0000 Subject: [PATCH 20/28] Fix merge columns --- src/acbm/assigning/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/acbm/assigning/utils.py b/src/acbm/assigning/utils.py index 020a5c4..614612e 100644 --- a/src/acbm/assigning/utils.py +++ b/src/acbm/assigning/utils.py @@ -61,7 +61,7 @@ def activity_chains_for_assignment( pd.read_parquet( config.output_path / "interim" / "assigning" / "chosen_trav_day.parquet" ), - on=["id", "household", "TravDay"], + on=["id", "TravDay"], how="inner", ) From 43014932a25a55a05c94d6139da045b6a378514e Mon Sep 17 00:00:00 2001 From: Sam Greenbury Date: Wed, 15 Jan 2025 21:54:06 +0000 Subject: [PATCH 21/28] Add matching for remaining individuals --- scripts/2_match_households_and_individuals.py | 15 ++++- src/acbm/matching.py | 59 +++++++++++++++++++ 2 files changed, 73 insertions(+), 1 deletion(-) diff --git a/scripts/2_match_households_and_individuals.py b/scripts/2_match_households_and_individuals.py index 4025cbb..546977d 100644 --- a/scripts/2_match_households_and_individuals.py +++ b/scripts/2_match_households_and_individuals.py @@ -7,7 +7,7 @@ from acbm.assigning.utils import cols_for_assignment_all from acbm.cli import acbm_cli from acbm.config import load_and_setup_config -from acbm.matching import MatcherExact, match_individuals +from acbm.matching import MatcherExact, match_individuals, match_remaining_individuals from acbm.preprocessing import ( count_per_group, nts_filter_by_region, @@ -953,6 +953,19 @@ def get_interim_path( show_progress=True, ) + # match remaining individuals + remaining_ids = spc_edited.loc[ + ~spc_edited.index.isin(matches_ind.keys()), "id" + ].to_list() + matches_remaining_ind = match_remaining_individuals( + df1=spc_edited, + df2=nts_individuals, + 
matching_columns=["age_group", "sex"], + remaining_ids=remaining_ids, + show_progress=True, + ) + matches_ind.update(matches_remaining_ind) + # save random sample with open( get_interim_path("matches_ind_level_categorical_random_sample.pkl"), "wb" diff --git a/src/acbm/matching.py b/src/acbm/matching.py index 4c0a5b1..50b05e2 100644 --- a/src/acbm/matching.py +++ b/src/acbm/matching.py @@ -294,3 +294,62 @@ def match_individuals( matches.update(match) return matches + + +def match_remaining_individuals( + df1: pd.DataFrame, + df2: pd.DataFrame, + matching_columns: list, + remaining_ids: list[int], + show_progress: bool = False, +) -> dict: + """ + Match the remaining unmatched individuals in df1 to individuals in df2. + The remaining individuals are processed in fixed-size chunks, and each + chunk is matched against df2 via propensity score matching. + + Parameters + ---------- + df1: pandas DataFrame + The first DataFrame to be matched on + df2: pandas DataFrame + The second DataFrame to be matched with + matching_columns: list + The columns to be used for the matching + remaining_ids: list[int] + The ids of individuals in df1 that still need to be matched + show_progress: bool + Whether to print the progress of the matching to the console + + Returns + ------- + matches: dict + A dictionary with the matched row indices from the two DataFrames {df1: df2} + + """ + # Initialize an empty dict to store the matches + matches = {} + + # loop over the remaining individuals in fixed-size chunks + # note: chunking keeps each call to the matching function to a + # manageable subset of df1, while df2 is reused in full for + # every chunk.
+ df1_remaining = df1.loc[df1["id"].isin(remaining_ids)] + chunk_size = 1000 + for i, rows_df1 in df1_remaining.groupby( + np.arange(len(df1_remaining)) // chunk_size + ): + rows_df2 = df2 + if show_progress: + # Print the iteration number and the number of keys in the dict + print( + f"Matching remaining individuals, {i * chunk_size} out of: {len(remaining_ids)}" + ) + + # apply the matching + match = match_psm(rows_df1, rows_df2, matching_columns) + + # append the results to the main dict + matches.update(match) + + return matches From be2eb6b8f5176079fea5d9b49de8393a6c48b87e Mon Sep 17 00:00:00 2001 From: Sam Greenbury Date: Wed, 15 Jan 2025 21:55:25 +0000 Subject: [PATCH 22/28] Revise region variable to 'PSUStatsReg_B01ID' --- src/acbm/preprocessing.py | 67 ++++++++++++++++++++------------------- 1 file changed, 34 insertions(+), 33 deletions(-) diff --git a/src/acbm/preprocessing.py b/src/acbm/preprocessing.py index d6d0bdb..4d8aa7e 100644 --- a/src/acbm/preprocessing.py +++ b/src/acbm/preprocessing.py @@ -111,47 +111,48 @@ def nts_filter_by_region( # Dictionary of the regions in the NTS and how they are coded # PSUGOR_B02ID but does not have values for 2021 and 2022 - region_dict = { - -10.0: "DEAD", - -9.0: "DNA", - -8.0: "NA", - 1.0: "North East", - 2.0: "North West", - 3.0: "Yorkshire and the Humber", - 4.0: "East Midlands", - 5.0: "West Midlands", - 6.0: "East of England", - 7.0: "London", - 8.0: "South East", - 9.0: "South West", - 10.0: "Wales", - 11.0: "Scotland", - } - # PSUStatsReg_B01ID but does not have values for 2021 and 2022 # region_dict = { # -10.0: "DEAD", # -9.0: "DNA", # -8.0: "NA", - # 1.0: "Northern, Metropolitan", - # 2.0: "Northern, Non-metropolitan", - # 3.0: "Yorkshire / Humberside, Metropolitan", - # 4.0: "Yorkshire / Humberside, Non-metropolitan", - # 5.0: "East Midlands", - # 6.0: "East Anglia", - # 7.0: "South East (excluding London Boroughs)", - # 8.0: "London Boroughs", + # 1.0: "North East", + # 2.0: "North West", + # 3.0: 
"Yorkshire and the Humber", + # 4.0: "East Midlands", + # 5.0: "West Midlands", + # 6.0: "East of England", + # 7.0: "London", + # 8.0: "South East", # 9.0: "South West", - # 10.0: "West Midlands, Metropolitan", - # 11.0: "West Midlands, Non-metropolitan", - # 12.0: "North West, Metropolitan", - # 13.0: "North West, Non-metropolitan", - # 14.0: "Wales", - # 15.0: "Scotland", + # 10.0: "Wales", + # 11.0: "Scotland", # } + # PSUStatsReg_B01ID but does not have values for 2021 and 2022 + region_dict = { + -10.0: "DEAD", + -9.0: "DNA", + -8.0: "NA", + 1.0: "Northern, Metropolitan", + 2.0: "Northern, Non-metropolitan", + 3.0: "Yorkshire / Humberside, Metropolitan", + 4.0: "Yorkshire / Humberside, Non-metropolitan", + 5.0: "East Midlands", + 6.0: "East Anglia", + 7.0: "South East (excluding London Boroughs)", + 8.0: "London Boroughs", + 9.0: "South West", + 10.0: "West Midlands, Metropolitan", + 11.0: "West Midlands, Non-metropolitan", + 12.0: "North West, Metropolitan", + 13.0: "North West, Non-metropolitan", + 14.0: "Wales", + 15.0: "Scotland", + } + # In the PSU table, create a column with the region names - psu["region_name"] = psu["PSUGOR_B02ID"].map(region_dict) - # psu["region_name"] = psu["PSUStatsReg_B01ID"].map(region_dict) + # psu["region_name"] = psu["PSUGOR_B02ID"].map(region_dict) + psu["region_name"] = psu["PSUStatsReg_B01ID"].map(region_dict) # 2. 
Check that all values of 'years' exist in the 'SurveyYear' column of 'psu' From 748260a1d0231e47112b3efb2b4eefe4599e8177 Mon Sep 17 00:00:00 2001 From: Sam Greenbury Date: Wed, 15 Jan 2025 22:23:04 +0000 Subject: [PATCH 23/28] Fix logging --- src/acbm/matching.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/acbm/matching.py b/src/acbm/matching.py index 50b05e2..595fda6 100644 --- a/src/acbm/matching.py +++ b/src/acbm/matching.py @@ -342,7 +342,7 @@ def match_remaining_individuals( rows_df2 = df2 if show_progress: # Print the iteration number and the number of keys in the dict - print( + logger.info( f"Matching remaining individuals, {i * chunk_size} out of: {len(remaining_ids)}" ) From 5105dab073ea2108d42854de46b2bd1bcee7a41c Mon Sep 17 00:00:00 2001 From: Sam Greenbury <50113363+sgreenbury@users.noreply.github.com> Date: Thu, 16 Jan 2025 18:04:37 +0000 Subject: [PATCH 24/28] Remove obsolete comment --- src/acbm/preprocessing.py | 1 - 1 file changed, 1 deletion(-) diff --git a/src/acbm/preprocessing.py b/src/acbm/preprocessing.py index 4d8aa7e..6b0a0ab 100644 --- a/src/acbm/preprocessing.py +++ b/src/acbm/preprocessing.py @@ -73,7 +73,6 @@ def nts_filter_by_year( years: list The chosen year(s) """ - # return data.loc[data["SurveyYear"].isin(years)] # Check that all values of 'years' exist in the 'SurveyYear' column of 'psu' # Get the unique years in the 'SurveyYear' column of 'psu' From 32dfe63e59a0a0a956ea4c3e2c99d7d091ddac2e Mon Sep 17 00:00:00 2001 From: Sam Greenbury Date: Thu, 16 Jan 2025 18:23:17 +0000 Subject: [PATCH 25/28] Uncomment code --- src/acbm/assigning/feasible_zones_primary.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/acbm/assigning/feasible_zones_primary.py b/src/acbm/assigning/feasible_zones_primary.py index 9c3eb85..9ef9f26 100644 --- a/src/acbm/assigning/feasible_zones_primary.py +++ b/src/acbm/assigning/feasible_zones_primary.py @@ -25,7 +25,7 @@ activity_chains_schema = DataFrameSchema( { 
"mode": Column(str), - # "TravDay": Column(pa.Float, Check.isin([1, 2, 3, 4, 5, 6, 7]), nullable=True), + "TravDay": Column(pa.Float, Check.isin([1, 2, 3, 4, 5, 6, 7]), nullable=True), "tst": Column(pa.Float, Check.less_than_or_equal_to(1440), nullable=True), "TripTotalTime": Column(pa.Float, nullable=True), # TODO: add more columns ... From 6ebd5be1a4370503a6c339b26addbc4bb42e30ca Mon Sep 17 00:00:00 2001 From: Sam Greenbury Date: Thu, 16 Jan 2025 20:37:30 +0000 Subject: [PATCH 26/28] Limit the number of processes to proportion of cpu_count() --- src/acbm/assigning/select_facility.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/acbm/assigning/select_facility.py b/src/acbm/assigning/select_facility.py index 28a9e0d..485544e 100644 --- a/src/acbm/assigning/select_facility.py +++ b/src/acbm/assigning/select_facility.py @@ -1,5 +1,5 @@ import logging -from multiprocessing import Pool +from multiprocessing import Pool, cpu_count from typing import Optional, Tuple import geopandas as gpd @@ -204,9 +204,9 @@ def select_facility( dict[str, Tuple[str, Point ] | Tuple[float, float]]: Unique ID column as keys with selected facility ID and facility ID's geometry, or (np.nan, np.nan) """ - # TODO: update this to be configurable, `None` is os.process_cpu_count() # TODO: check if this is deterministic for a given seed (or pass seed to pool) - with Pool(None) as p: + # TODO: update to be configurable + with Pool(max(int(cpu_count() * 0.75), 1)) as p: # Set to a large enough chunk size so that each process # has a sufficiently large amount of processing to do. 
chunk_size = 16_000 From 1dd281b5936cb2f6b3624947723054cd8b594f2f Mon Sep 17 00:00:00 2001 From: Sam Greenbury Date: Mon, 20 Jan 2025 22:06:48 +0000 Subject: [PATCH 27/28] Only consider working not from home --- scripts/3.2.2_assign_primary_zone_work.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/scripts/3.2.2_assign_primary_zone_work.py b/scripts/3.2.2_assign_primary_zone_work.py index 2cd561b..30ae0b9 100644 --- a/scripts/3.2.2_assign_primary_zone_work.py +++ b/scripts/3.2.2_assign_primary_zone_work.py @@ -98,7 +98,7 @@ def main(config_file): logger.info("Step 4: Filtering rows and dropping unnecessary columns") travel_demand_clipped = travel_demand[ - travel_demand["Place of work indicator (4 categories) code"].isin([1, 3]) + travel_demand["Place of work indicator (4 categories) code"].isin([3]) ] travel_demand_clipped = travel_demand_clipped.drop( columns=[ @@ -139,7 +139,7 @@ def main(config_file): logger.info("Step 2: Filtering rows and dropping unnecessary columns") travel_demand_clipped = travel_demand[ - travel_demand["Place of work indicator (4 categories) code"].isin([1, 3]) + travel_demand["Place of work indicator (4 categories) code"].isin([3]) ] travel_demand_clipped = travel_demand_clipped.drop( columns=[ From a07c03570763d913d5d9ba502dd534bbd1c4c998 Mon Sep 17 00:00:00 2001 From: Sam Greenbury Date: Wed, 22 Jan 2025 18:46:28 +0000 Subject: [PATCH 28/28] Add scaling --- scripts/3.2.2_assign_primary_zone_work.py | 4 +++- src/acbm/assigning/select_zone_work.py | 5 +++-- 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/scripts/3.2.2_assign_primary_zone_work.py b/scripts/3.2.2_assign_primary_zone_work.py index 30ae0b9..fbf667f 100644 --- a/scripts/3.2.2_assign_primary_zone_work.py +++ b/scripts/3.2.2_assign_primary_zone_work.py @@ -200,7 +200,9 @@ def main(config_file): #### ASSIGN TO ZONE FROM FEASIBLE ZONES #### zone_assignment = WorkZoneAssignment( - activities_to_assign=possible_zones_work, 
actual_flows=travel_demand_dict_nomode + activities_to_assign=possible_zones_work, + actual_flows=travel_demand_dict_nomode, + scaling=config.parameters.part_time_work_prob, ) assignments_df = zone_assignment.select_work_zone_optimization( diff --git a/src/acbm/assigning/select_zone_work.py b/src/acbm/assigning/select_zone_work.py index 5005842..cb4c7c0 100644 --- a/src/acbm/assigning/select_zone_work.py +++ b/src/acbm/assigning/select_zone_work.py @@ -51,6 +51,7 @@ class WorkZoneAssignment: remaining_flows: Dict[Tuple[str, str], int] = field(init=False) total_flows: Dict[str, int] = field(init=False) percentages: Dict[Tuple[str, str], float] = field(init=False) + scaling: float = field(init=True) def __post_init__(self): """ @@ -75,9 +76,9 @@ def _calculate_total_flows(self) -> Dict[str, int]: total_flows = {} for (from_zone, _), flow in self.actual_flows.items(): if from_zone in total_flows: - total_flows[from_zone] += flow + total_flows[from_zone] += int(flow * self.scaling) else: - total_flows[from_zone] = flow + total_flows[from_zone] = int(flow * self.scaling) return total_flows def _calculate_percentages(self) -> Dict[Tuple[str, str], float]: