-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathcompas.R
91 lines (74 loc) · 2.24 KB
/
compas.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
library(groundhog)
groundhog.library("
tidymodels
fs
RSQLite
dbplyr
tidyverse
", "2024-07-04")
################################################################################
# Setup the environment.
source("setup.R")
DATASET <- "compas"
################################################################################
# Load and clean the COMPAS dataset.
conn <- dbConnect(RSQLite::SQLite(), path("data", "compas", "compas.db"))
df <- tbl(conn, "compas") %>%
filter(type_of_assessment == "Risk of Recidivism") %>%
left_join(
tbl(conn, "people"),
by = c("person_id" = "id"),
) %>%
select(
race,
score = raw_score,
outcome = is_recid
) %>%
# NOTE: The original dataset has a few missing compas scores.
filter(outcome != -1) %>%
collect() %>%
mutate(
race = factor(case_when(
race == "African-American" ~ "b",
race == "Hispanic" ~ "h",
race == "Caucasian" ~ "w"
), levels = c("b", "h", "w")),
outcome = factor(as.logical(outcome), levels = c(TRUE, FALSE))
) %>%
drop_na(race)
################################################################################
# Model risk.
# Recalibrate the COMPAS scores using a logistic regression model.
m_risk <- logistic_reg() %>%
set_engine("glm") %>%
fit(outcome ~ race * score, data = df)
df <- predict(m_risk, df, type = "prob") %>%
bind_cols(df) %>%
select(
outcome,
risk = .pred_TRUE,
race
)
rm(m_risk)
################################################################################
# Generate the data needed for the monotonicity plots.
# Convenience function for calculating the proportion of people belonging to
# each group at a given risk level.
density_ratio <- function(df, n_buckets) {
df %>%
group_by(bucket = ntile(risk, {{ n_buckets }})) %>%
summarize(
p_b = mean(race == "b"),
p_h = mean(race == "h"),
p_w = mean(race == "w"),
r_b = p_b / (p_b + p_w),
r_h = p_h / (p_h + p_w)
)
}
source("monotonicity.R")
################################################################################
# Run the simulation.
source("simulation.R")
################################################################################
# Perform model checks.
source("model_checks.R")