Fix pdp gen failure from different institution ids

There's a cohort schema check that ensures there's a single institution for all the data, but the pdp generation script creates a new institution id per record. This creates the institution id once and uses it for both cohort and course data. Also fixed what's probably a copy paste error, test_raw_cohort_record -> test_raw_course_record in test_raw_course.py
datakind · Dec 30, 2024 · 113fab5 · 113fab5
1 parent 488c442
commit 113fab5
Show file tree

Hide file tree

Showing 5 changed files with 27 additions and 10 deletions.
diff --git a/scripts/generate_synthetic_pdp_datasets.py b/scripts/generate_synthetic_pdp_datasets.py
@@ -19,13 +19,15 @@ def main():
     FAKER.add_provider(pdp.raw_cohort.Provider)
     FAKER.add_provider(pdp.raw_course.Provider)
 
+    # institution_id must be the same for all records.
+    institution_id = FAKER.numerify("#####!")
     cohort_records = [
-        FAKER.raw_cohort_record(normalize_col_names=args.normalize_col_names)
+        FAKER.raw_cohort_record(normalize_col_names=args.normalize_col_names, institution_id=institution_id)
         for _ in range(args.num_students)
     ]
     course_records = [
         FAKER.raw_course_record(
-            cohort_record, normalize_col_names=args.normalize_col_names
+            cohort_record, normalize_col_names=args.normalize_col_names, institution_id=institution_id
         )
         for cohort_record in cohort_records
         for _ in range(

diff --git a/src/student_success_tool/generation/pdp/raw_cohort.py b/src/student_success_tool/generation/pdp/raw_cohort.py
@@ -12,6 +12,7 @@ def raw_cohort_record(
         min_cohort_yr: int = 2010,
         max_cohort_yr: t.Optional[int] = None,
         normalize_col_names: bool = False,
+        institution_id: int = 12345,
     ) -> dict[str, object]:
         # some fields are inputs to others; compute them first, accordingly
         enrollment_type = self.enrollment_type()
@@ -24,7 +25,7 @@ def raw_cohort_record(
         # TODO: handle other cases, e.g. gateway course attempted/completed/grades
         record = {
             "Student GUID": self.student_guid(),
-            "Institution ID": self.institution_id(),
+            "Institution ID": institution_id,
             "Cohort": self.cohort(min_yr=min_cohort_yr, max_yr=max_cohort_yr),
             "Cohort Term": self.cohort_term(),
             "Student Age": self.student_age(),

diff --git a/src/student_success_tool/generation/pdp/raw_course.py b/src/student_success_tool/generation/pdp/raw_course.py
@@ -8,7 +8,7 @@
 
 class Provider(BaseProvider):
     def raw_course_record(
-        self, cohort_record: t.Optional[dict] = None, normalize_col_names: bool = False
+        self, cohort_record: t.Optional[dict] = None, normalize_col_names: bool = False, institution_id: int = 12345
     ) -> dict[str, object]:
         # use existing values where records overlap
         if cohort_record is not None:
@@ -18,7 +18,6 @@ def raw_course_record(
             race = cr.get("race", cr["Race"])
             ethnicity = cr.get("ethnicity", cr["Ethnicity"])
             gender = cr.get("gender", cr["Gender"])
-            institution_id = cr.get("institution_id", cr["Institution ID"])
             cohort = cr.get("cohort", cr["Cohort"])
             cohort_term = cr.get("cohort_term", cr["Cohort Term"])
             _has_enrollment_other_inst: bool = (
@@ -34,7 +33,6 @@ def raw_course_record(
             race = self.race()
             ethnicity = self.ethnicity()
             gender = self.gender()
-            institution_id = self.institution_id()
             cohort = self.cohort()
             cohort_term = self.cohort_term()
             _has_enrollment_other_inst: bool = self.generator.random.random() < 0.25  # type: ignore
@@ -109,9 +107,6 @@ def raw_course_record(
     def student_guid(self) -> str:
         return self.numerify("#####!")  # type: ignore
 
-    def institution_id(self) -> str:
-        return self.numerify("#####!")  # type: ignore
-
     def student_age(self) -> str:
         return self.random_element(["20 AND YOUNGER", ">20 - 24", "OLDER THAN 24"])
 

diff --git a/tests/generation/pdp/test_raw_cohort.py b/tests/generation/pdp/test_raw_cohort.py
@@ -24,3 +24,13 @@ def test_raw_cohort_record(min_cohort_yr, max_cohort_yr, normalize_col_names):
         df_obs = pd.DataFrame([obs])
         obs_valid = RawPDPCohortDataSchema.validate(df_obs, lazy=True)
         assert isinstance(obs_valid, pd.DataFrame)  # => data passed validation
+        print(df_obs)
+
+
+def test_multiple_raw_cohort_records():
+    cohort_records = [
+        FAKER.raw_cohort_record(normalize_col_names=True) for _ in range(10)
+    ]
+    df_cohort = pd.DataFrame(cohort_records)
+    obs_valid = RawPDPCohortDataSchema.validate(df_cohort, lazy=True)
+    assert isinstance(obs_valid, pd.DataFrame)  # => data passed validation
diff --git a/tests/generation/pdp/test_raw_course.py b/tests/generation/pdp/test_raw_course.py
@@ -13,10 +13,19 @@
     ["normalize_col_names"],
     [(False,), (True,)],
 )
-def test_raw_cohort_record(normalize_col_names):
+def test_raw_course_record(normalize_col_names):
     obs = FAKER.raw_course_record(normalize_col_names=normalize_col_names)
     assert obs and isinstance(obs, dict)
     if normalize_col_names is True:
         df_obs = pd.DataFrame([obs])
         obs_valid = RawPDPCourseDataSchema.validate(df_obs, lazy=True)
         assert isinstance(obs_valid, pd.DataFrame)  # => data passed validation
+
+
+def test_multiple_raw_course_records():
+    course_records = [
+        FAKER.raw_course_record(normalize_col_names=True) for _ in range(10)
+    ]
+    df_obs = pd.DataFrame(course_records)
+    obs_valid = RawPDPCourseDataSchema.validate(df_obs, lazy=False)
+    assert isinstance(obs_valid, pd.DataFrame)  # => data passed validation