diff --git a/scripts/generate_synthetic_pdp_datasets.py b/scripts/generate_synthetic_pdp_datasets.py index 8700bf1f..a06d61ad 100644 --- a/scripts/generate_synthetic_pdp_datasets.py +++ b/scripts/generate_synthetic_pdp_datasets.py @@ -19,13 +19,15 @@ def main(): FAKER.add_provider(pdp.raw_cohort.Provider) FAKER.add_provider(pdp.raw_course.Provider) + # institution_id must be the same for all records. + institution_id = FAKER.numerify("#####!") cohort_records = [ - FAKER.raw_cohort_record(normalize_col_names=args.normalize_col_names) + FAKER.raw_cohort_record(normalize_col_names=args.normalize_col_names, institution_id=institution_id) for _ in range(args.num_students) ] course_records = [ FAKER.raw_course_record( - cohort_record, normalize_col_names=args.normalize_col_names + cohort_record, normalize_col_names=args.normalize_col_names, institution_id=institution_id ) for cohort_record in cohort_records for _ in range( diff --git a/src/student_success_tool/generation/pdp/raw_cohort.py b/src/student_success_tool/generation/pdp/raw_cohort.py index dd078ee2..9197bad7 100644 --- a/src/student_success_tool/generation/pdp/raw_cohort.py +++ b/src/student_success_tool/generation/pdp/raw_cohort.py @@ -12,6 +12,7 @@ def raw_cohort_record( min_cohort_yr: int = 2010, max_cohort_yr: t.Optional[int] = None, normalize_col_names: bool = False, + institution_id: int = 12345, ) -> dict[str, object]: # some fields are inputs to others; compute them first, accordingly enrollment_type = self.enrollment_type() @@ -24,7 +25,7 @@ def raw_cohort_record( # TODO: handle other cases, e.g. gateway course attempted/completed/grades record = { "Student GUID": self.student_guid(), - "Institution ID": self.institution_id(), + "Institution ID": institution_id, "Cohort": self.cohort(min_yr=min_cohort_yr, max_yr=max_cohort_yr), "Cohort Term": self.cohort_term(), "Student Age": self.student_age(), diff --git a/src/student_success_tool/generation/pdp/raw_course.py b/src/student_success_tool/generation/pdp/raw_course.py index 0b3db5df..c6d61082 100644 --- a/src/student_success_tool/generation/pdp/raw_course.py +++ b/src/student_success_tool/generation/pdp/raw_course.py @@ -8,7 +8,7 @@ class Provider(BaseProvider): def raw_course_record( - self, cohort_record: t.Optional[dict] = None, normalize_col_names: bool = False + self, cohort_record: t.Optional[dict] = None, normalize_col_names: bool = False, institution_id: int = 12345 ) -> dict[str, object]: # use existing values where records overlap if cohort_record is not None: @@ -18,7 +18,6 @@ def raw_course_record( race = cr.get("race", cr["Race"]) ethnicity = cr.get("ethnicity", cr["Ethnicity"]) gender = cr.get("gender", cr["Gender"]) - institution_id = cr.get("institution_id", cr["Institution ID"]) cohort = cr.get("cohort", cr["Cohort"]) cohort_term = cr.get("cohort_term", cr["Cohort Term"]) _has_enrollment_other_inst: bool = ( @@ -34,7 +33,6 @@ def raw_course_record( race = self.race() ethnicity = self.ethnicity() gender = self.gender() - institution_id = self.institution_id() cohort = self.cohort() cohort_term = self.cohort_term() _has_enrollment_other_inst: bool = self.generator.random.random() < 0.25 # type: ignore @@ -109,9 +107,6 @@ def raw_course_record( def student_guid(self) -> str: return self.numerify("#####!") # type: ignore - def institution_id(self) -> str: - return self.numerify("#####!") # type: ignore - def student_age(self) -> str: return self.random_element(["20 AND YOUNGER", ">20 - 24", "OLDER THAN 24"]) diff --git a/tests/generation/pdp/test_raw_cohort.py b/tests/generation/pdp/test_raw_cohort.py index 3ffe1269..b37b31cc 100644 --- a/tests/generation/pdp/test_raw_cohort.py +++ b/tests/generation/pdp/test_raw_cohort.py @@ -24,3 +24,13 @@ def test_raw_cohort_record(min_cohort_yr, max_cohort_yr, normalize_col_names): df_obs = pd.DataFrame([obs]) obs_valid = RawPDPCohortDataSchema.validate(df_obs, lazy=True) assert isinstance(obs_valid, pd.DataFrame) # => data passed validation + print(df_obs) + + +def test_multiple_raw_cohort_records(): + cohort_records = [ + FAKER.raw_cohort_record(normalize_col_names=True) for _ in range(10) + ] + df_cohort = pd.DataFrame(cohort_records) + obs_valid = RawPDPCohortDataSchema.validate(df_cohort, lazy=True) + assert isinstance(obs_valid, pd.DataFrame) # => data passed validation \ No newline at end of file diff --git a/tests/generation/pdp/test_raw_course.py b/tests/generation/pdp/test_raw_course.py index 35c320f4..0d29a153 100644 --- a/tests/generation/pdp/test_raw_course.py +++ b/tests/generation/pdp/test_raw_course.py @@ -13,10 +13,19 @@ ["normalize_col_names"], [(False,), (True,)], ) -def test_raw_cohort_record(normalize_col_names): +def test_raw_course_record(normalize_col_names): obs = FAKER.raw_course_record(normalize_col_names=normalize_col_names) assert obs and isinstance(obs, dict) if normalize_col_names is True: df_obs = pd.DataFrame([obs]) obs_valid = RawPDPCourseDataSchema.validate(df_obs, lazy=True) assert isinstance(obs_valid, pd.DataFrame) # => data passed validation + + +def test_multiple_raw_course_records(): + course_records = [ + FAKER.raw_course_record(normalize_col_names=True) for _ in range(10) + ] + df_obs = pd.DataFrame(course_records) + obs_valid = RawPDPCourseDataSchema.validate(df_obs, lazy=False) + assert isinstance(obs_valid, pd.DataFrame) # => data passed validation