Skip to content

Commit

Permalink
Fix pdp gen failure from different institution ids
Browse files Browse the repository at this point in the history
There's a cohort schema check that ensures there's a single institution
for all the data, but the PDP generation script created a new
institution id per record. This commit creates the institution id once
and uses it for both cohort and course data.

Also fixed what's probably a copy-paste error: renamed
test_raw_cohort_record to test_raw_course_record in test_raw_course.py.
  • Loading branch information
ZakMiller committed Dec 30, 2024
1 parent 488c442 commit 113fab5
Show file tree
Hide file tree
Showing 5 changed files with 27 additions and 10 deletions.
6 changes: 4 additions & 2 deletions scripts/generate_synthetic_pdp_datasets.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,13 +19,15 @@ def main():
FAKER.add_provider(pdp.raw_cohort.Provider)
FAKER.add_provider(pdp.raw_course.Provider)

# institution_id must be the same for all records.
institution_id = FAKER.numerify("#####!")
cohort_records = [
FAKER.raw_cohort_record(normalize_col_names=args.normalize_col_names)
FAKER.raw_cohort_record(normalize_col_names=args.normalize_col_names, institution_id=institution_id)
for _ in range(args.num_students)
]
course_records = [
FAKER.raw_course_record(
cohort_record, normalize_col_names=args.normalize_col_names
cohort_record, normalize_col_names=args.normalize_col_names, institution_id=institution_id
)
for cohort_record in cohort_records
for _ in range(
Expand Down
3 changes: 2 additions & 1 deletion src/student_success_tool/generation/pdp/raw_cohort.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@ def raw_cohort_record(
min_cohort_yr: int = 2010,
max_cohort_yr: t.Optional[int] = None,
normalize_col_names: bool = False,
institution_id: int = 12345,
) -> dict[str, object]:
# some fields are inputs to others; compute them first, accordingly
enrollment_type = self.enrollment_type()
Expand All @@ -24,7 +25,7 @@ def raw_cohort_record(
# TODO: handle other cases, e.g. gateway course attempted/completed/grades
record = {
"Student GUID": self.student_guid(),
"Institution ID": self.institution_id(),
"Institution ID": institution_id,
"Cohort": self.cohort(min_yr=min_cohort_yr, max_yr=max_cohort_yr),
"Cohort Term": self.cohort_term(),
"Student Age": self.student_age(),
Expand Down
7 changes: 1 addition & 6 deletions src/student_success_tool/generation/pdp/raw_course.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@

class Provider(BaseProvider):
def raw_course_record(
self, cohort_record: t.Optional[dict] = None, normalize_col_names: bool = False
self, cohort_record: t.Optional[dict] = None, normalize_col_names: bool = False, institution_id: int = 12345
) -> dict[str, object]:
# use existing values where records overlap
if cohort_record is not None:
Expand All @@ -18,7 +18,6 @@ def raw_course_record(
race = cr.get("race", cr["Race"])
ethnicity = cr.get("ethnicity", cr["Ethnicity"])
gender = cr.get("gender", cr["Gender"])
institution_id = cr.get("institution_id", cr["Institution ID"])
cohort = cr.get("cohort", cr["Cohort"])
cohort_term = cr.get("cohort_term", cr["Cohort Term"])
_has_enrollment_other_inst: bool = (
Expand All @@ -34,7 +33,6 @@ def raw_course_record(
race = self.race()
ethnicity = self.ethnicity()
gender = self.gender()
institution_id = self.institution_id()
cohort = self.cohort()
cohort_term = self.cohort_term()
_has_enrollment_other_inst: bool = self.generator.random.random() < 0.25 # type: ignore
Expand Down Expand Up @@ -109,9 +107,6 @@ def raw_course_record(
def student_guid(self) -> str:
return self.numerify("#####!") # type: ignore

def institution_id(self) -> str:
return self.numerify("#####!") # type: ignore

def student_age(self) -> str:
return self.random_element(["20 AND YOUNGER", ">20 - 24", "OLDER THAN 24"])

Expand Down
10 changes: 10 additions & 0 deletions tests/generation/pdp/test_raw_cohort.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,3 +24,13 @@ def test_raw_cohort_record(min_cohort_yr, max_cohort_yr, normalize_col_names):
df_obs = pd.DataFrame([obs])
obs_valid = RawPDPCohortDataSchema.validate(df_obs, lazy=True)
assert isinstance(obs_valid, pd.DataFrame) # => data passed validation
print(df_obs)


def test_multiple_raw_cohort_records():
    """Check that ten generated cohort records validate together as one frame."""
    rows = []
    for _ in range(10):
        rows.append(FAKER.raw_cohort_record(normalize_col_names=True))
    validated = RawPDPCohortDataSchema.validate(pd.DataFrame(rows), lazy=True)
    # validate() handing back a DataFrame => data passed validation
    assert isinstance(validated, pd.DataFrame)
11 changes: 10 additions & 1 deletion tests/generation/pdp/test_raw_course.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,10 +13,19 @@
["normalize_col_names"],
[(False,), (True,)],
)
def test_raw_cohort_record(normalize_col_names):
def test_raw_course_record(normalize_col_names):
obs = FAKER.raw_course_record(normalize_col_names=normalize_col_names)
assert obs and isinstance(obs, dict)
if normalize_col_names is True:
df_obs = pd.DataFrame([obs])
obs_valid = RawPDPCourseDataSchema.validate(df_obs, lazy=True)
assert isinstance(obs_valid, pd.DataFrame) # => data passed validation


def test_multiple_raw_course_records():
    """Check that ten generated course records validate together as one frame."""
    course_records = [
        FAKER.raw_course_record(normalize_col_names=True) for _ in range(10)
    ]
    df_obs = pd.DataFrame(course_records)
    # lazy=True matches the other schema checks in these tests and collects
    # all schema violations into one error rather than stopping at the first.
    obs_valid = RawPDPCourseDataSchema.validate(df_obs, lazy=True)
    assert isinstance(obs_valid, pd.DataFrame)  # => data passed validation

0 comments on commit 113fab5

Please sign in to comment.