From 186adbda7d86f1c8ef477c311e736102b10f4ffc Mon Sep 17 00:00:00 2001
From: Daniel Olsen <daniel@breakthroughenergy.org>
Date: Thu, 28 Oct 2021 16:47:45 -0700
Subject: [PATCH 1/3] chore: add demand-related constants and population data
 paths to const.py

---
 ATTRIBUTION.md                          | 32 +++++++++++++++++++++++++
 prereise/gather/griddata/hifld/const.py |  6 +++++
 2 files changed, 38 insertions(+)

diff --git a/ATTRIBUTION.md b/ATTRIBUTION.md
index ecadf61af..557cc6b20 100644
--- a/ATTRIBUTION.md
+++ b/ATTRIBUTION.md
@@ -447,4 +447,36 @@ The data are used to populate the power generation profile of U.S. transmission
 ##### Note
 Public use (see https://hifld-geoplatform.opendata.arcgis.com/datasets/geoplatform::power-plants/about). Users are advised to read the data set's metadata thoroughly to understand appropriate use and data limitations.
 
+
+##### Source
+* Name: U.S. Zips
+* Author: Pareto Software, LLC
+* Description: Data on United States ZIP codes
+* Source: https://simplemaps.com
+* Exact source location: https://simplemaps.com/data/us-zips
+* Note: version 1.78, accessed 2021-10-27.
+
+##### Destination
+* Modifications to source file(s): None
+* Location: https://besciences.blob.core.windows.net/datasets/geo_data/uszips.csv
+
+##### General Purpose
+The data are used to estimate the population served by each substation, which is then used to distribute demand.
+
+
+##### Source
+* Name: U.S. Counties
+* Author: Pareto Software, LLC
+* Description: Data on United States counties
+* Source: https://simplemaps.com
+* Exact source location: https://simplemaps.com/data/us-counties
+* Note: version 1.71, accessed 2021-10-27.
+
+##### Destination
+* Modifications to source file(s): None
+* Location: https://besciences.blob.core.windows.net/datasets/geo_data/uscounties.csv
+
+##### General Purpose
+The data are used to estimate the population served by each substation, which is then used to distribute demand.
+
 ---
diff --git a/prereise/gather/griddata/hifld/const.py b/prereise/gather/griddata/hifld/const.py
index f2012551f..b5a89ba38 100644
--- a/prereise/gather/griddata/hifld/const.py
+++ b/prereise/gather/griddata/hifld/const.py
@@ -111,6 +111,8 @@
     "epa_needs": "https://besciences.blob.core.windows.net/datasets/EPA_NEEDS/needs-v620_06-30-21-2_active.csv",
     "substations": "https://besciences.blob.core.windows.net/datasets/hifld/Electric_Substations_Jul2020.csv",
     "transmission_lines": "https://besciences.blob.core.windows.net/datasets/hifld/Electric_Power_Transmission_Lines_Jul2020.geojson.zip",
+    "us_counties": "https://besciences.blob.core.windows.net/datasets/geo_data/uscounties.csv",
+    "us_zips": "https://besciences.blob.core.windows.net/datasets/geo_data/uszips.csv",
 }
 eia_epa_crosswalk_path = "https://raw.githubusercontent.com/Breakthrough-Energy/camd-eia-crosswalk/master/epa_eia_crosswalk.csv"
 
@@ -386,6 +388,7 @@
     "Solar Thermal without Energy Storage": 0,
 }
 
+
 # These lines were manually identified based on a combination of: their 'TYPE'
 # classification, their substation names, and their geographical paths. The capacities
 # for each line were compiled from a variety of public sources.
@@ -402,3 +405,6 @@
     310053: 400,  # Trans-Bay Cable
     311958: 5,  # Alamogordo Solar Energy Center
 }
+
+substation_load_share = 0.5  # share of a ZIP's non-plant substations assigned load
+demand_per_person = 2.01e-3  # demand per person (presumably MW; confirm units)

From 2c79b0028bf070ee977ea51c5b2195652829c552 Mon Sep 17 00:00:00 2001
From: Daniel Olsen <daniel@breakthroughenergy.org>
Date: Tue, 2 Nov 2021 10:52:18 -0700
Subject: [PATCH 2/3] feat: add functions to load county and ZIP code data

---
 .../gather/griddata/hifld/data_access/load.py  | 18 ++++++++++++++++++
 1 file changed, 18 insertions(+)

diff --git a/prereise/gather/griddata/hifld/data_access/load.py b/prereise/gather/griddata/hifld/data_access/load.py
index f18c12a95..feb4dedd8 100644
--- a/prereise/gather/griddata/hifld/data_access/load.py
+++ b/prereise/gather/griddata/hifld/data_access/load.py
@@ -231,3 +231,21 @@ def get_zone(path):
     :return: (*pandas.DataFrame*) -- information related to load zone
     """
     return pd.read_csv(path, index_col="zone_id")
+
+
+def get_us_counties(path):
+    """Load the county data table from a CSV file.
+
+    :param str path: location of the CSV file, either a local path or a URL.
+    :return: (*pandas.DataFrame*) -- county data, indexed by 'county_fips'.
+    """
+    return pd.read_csv(filepath_or_buffer=path).set_index(keys="county_fips")
+
+
+def get_us_zips(path):
+    """Load the ZIP code data table from a CSV file.
+
+    :param str path: location of the CSV file, either a local path or a URL.
+    :return: (*pandas.DataFrame*) -- ZIP code data, indexed by 'zip' (as strings).
+    """
+    return pd.read_csv(path, dtype={"zip": "string"}).set_index(keys="zip")

From b7115abb676e2d9263defa4d6f712bf3d29f252f Mon Sep 17 00:00:00 2001
From: Daniel Olsen <daniel@breakthroughenergy.org>
Date: Thu, 28 Oct 2021 16:54:03 -0700
Subject: [PATCH 3/3] feat: add function to distribute demand to buses by
 population

---
 .../griddata/hifld/data_process/demand.py     | 76 +++++++++++++++++++
 1 file changed, 76 insertions(+)
 create mode 100644 prereise/gather/griddata/hifld/data_process/demand.py

diff --git a/prereise/gather/griddata/hifld/data_process/demand.py b/prereise/gather/griddata/hifld/data_process/demand.py
new file mode 100644
index 000000000..408eeef34
--- /dev/null
+++ b/prereise/gather/griddata/hifld/data_process/demand.py
@@ -0,0 +1,76 @@
+import pandas as pd
+
+from prereise.gather.griddata.hifld import const
+from prereise.gather.griddata.hifld.data_access.load import get_us_counties, get_us_zips
+
+
+def assign_demand_to_buses(substations, branch, plant, bus):
+    """Using data on population by county and ZIP code, assign demand to substations,
+    then to the lowest-voltage bus within each substation.
+    This demand parameter is added inplace as a 'Pd' column to the ``bus`` data frame.
+    Columns 'pop_ZIP' and 'pop_county' are also added inplace to ``substations``.
+
+    :param pandas.DataFrame substations: substation data, with 'ZIP' & 'COUNTYFIPS'.
+    :param pandas.DataFrame branch: branch data, with 'SUB_1_ID', 'SUB_2_ID', 'rateA'.
+    :param pandas.DataFrame plant: plant data, with a 'sub_id' column.
+    :param pandas.DataFrame bus: bus data, with 'sub_id' and 'baseKV' columns.
+    """
+    # Load data
+    zip_data = get_us_zips(const.blob_paths["us_zips"])
+    county_data = get_us_counties(const.blob_paths["us_counties"])
+
+    # Determine each substation's transmission capacity, then sort for selection
+    filtered_branch = branch.query("SUB_1_ID != SUB_2_ID")
+    # Sum only 'rateA', rather than aggregating every column of the branch table
+    from_cap = filtered_branch.groupby("SUB_1_ID")["rateA"].sum()
+    to_cap = filtered_branch.groupby("SUB_2_ID")["rateA"].sum()
+    sub_cap = from_cap.add(to_cap, fill_value=0)
+    # Sort substations by their capacities for later ordered selection
+    sorted_subs = substations.loc[sub_cap.sort_values(ascending=False).index].copy()
+
+    # Determine for each ZIP, how much demand to assign to each load substation
+    # Assume here that generator substations don't have load attached to them
+    filtered_subs = sorted_subs.loc[~sorted_subs.index.isin(plant["sub_id"])]
+    subs_per_zip = filtered_subs.value_counts("ZIP")
+    zip_load_substations = subs_per_zip * const.substation_load_share
+    zip_load_substations = zip_load_substations.round().clip(lower=1)
+    zip_assigned_population = (zip_data["population"] / zip_load_substations).dropna()
+    # Select the N substations per ZIP with greatest transmission capacity, via a rank
+    # mask: unlike pd.concat over groups, this also works when no substation qualifies
+    rank_within_zip = filtered_subs.groupby("ZIP").cumcount()
+    subs_allowed = filtered_subs["ZIP"].map(zip_load_substations)
+    load_substations = filtered_subs.loc[rank_within_zip < subs_allowed]
+    substations["pop_ZIP"] = load_substations["ZIP"].map(zip_assigned_population)
+
+    # Assign remaining county population to substations with load already,
+    # plus the most connected substation in any county without a load substation.
+    load_subs_from_zips = substations.query("pop_ZIP > 0")
+    load_subs_per_county = load_subs_from_zips.value_counts("COUNTYFIPS")
+    county_pop = county_data["population"]
+
+    # Select the one substation per missing county with greatest transmission capacity
+    # (groupby.head, unlike pd.concat over groups, tolerates having no such county)
+    counties_without_load_subs = set(county_pop.index) - set(load_subs_per_county.index)
+    subs_in_counties_without_load_subs = sorted_subs.loc[
+        sorted_subs["COUNTYFIPS"].isin(counties_without_load_subs)
+    ]
+    added_load_subs = subs_in_counties_without_load_subs.groupby("COUNTYFIPS").head(1)
+    load_subs = pd.concat([load_subs_from_zips, added_load_subs])
+    load_subs_per_county = load_subs_per_county.reindex(county_pop.index).fillna(1)
+
+    # Distribute population remaining after ZIP distribution to identified load buses
+    distributed_pop = load_subs.groupby("COUNTYFIPS")["pop_ZIP"].sum()
+    remaining_pop = county_pop - distributed_pop.reindex(county_pop.index).fillna(0)
+    remaining_pop_per_sub = remaining_pop.clip(lower=0) / load_subs_per_county
+    # We may still miss some population, since there may be a county without any
+    # substations, but we should cover the vast majority.
+    substations["pop_county"] = load_subs["COUNTYFIPS"].map(remaining_pop_per_sub)
+
+    # Translate population to demand
+    total_pop = substations["pop_ZIP"].fillna(0) + substations["pop_county"].fillna(0)
+    sub_demand = total_pop * const.demand_per_person
+
+    # Attach each substation's demand to its lowest-voltage bus; all other buses get
+    # zero demand
+    load_buses = bus.sort_values("baseKV").groupby("sub_id").head(1)
+    bus["Pd"] = load_buses["sub_id"].map(sub_demand).reindex(bus.index).fillna(0)