The de-identification approach outlined in this research does result in a loss of both accuracy and precision for the purpose of inferring meaningful locations. However, we argue that this loss is relatively inconsequential for many human mobility analyses and is a worthwhile trade-off vis-a-vis the ethical use of this dataset and any potential harm that can result from its use.
To evaluate this trade-off, we have applied each of the four algorithms on both original and de-identified datasets. More than 80% of the users are labelled with the same home location in both datasets, and the median ‘error’ for mismatches is less than 2 kilometres for most algorithms (i.e. a neighbouring hexagon is selected instead).
grids <- read_sf(here("analysis/data/derived_data/spatial_hex_grid.shp"))
df_anonymized <- readRDS(here("analysis/data/derived_data/sg_tweets_anonymized_20200929.rds"))
u_ids <- df_anonymized %>%
dplyr::select(u_id, u_id_anonymized) %>%
distinct(u_id, u_id_anonymized)
#identified home locations from original dataset
raw_hmlc <- readRDS(here::here("analysis/data/raw_data/hm750_hmlc.rds")) %>% mutate(name = "HMLC")
#online social network recipe
raw_osna <- readRDS(here::here("analysis/data/raw_data/hm750_osna.rds")) %>% mutate(name = "OSNA")
#mobile positioning data
raw_apdm <- readRDS(here::here("analysis/data/raw_data/hm750_apdm.rds")) %>% mutate(name = "APDM")
#frequency
raw_freq <- readRDS(here::here("analysis/data/raw_data/hm750_freq.rds")) %>% mutate(name = "FREQ")
hm_original <- bind_rows(raw_hmlc, raw_apdm, raw_osna, raw_freq)
#identified home locations from de-identified dataset
hm_hmlc <- read_csv(here::here("analysis/data/derived_data/hm_hmlc.csv"))
#online social network recipe
hm_osna <- read_csv(here::here("analysis/data/derived_data/hm_osna.csv"))
#mobile positioning data
hm_apdm <- read_csv(here::here("analysis/data/derived_data/hm_apdm.csv"))
#frequency
hm_freq <- read_csv(here::here("analysis/data/derived_data/hm_freq.csv"))
hm_anonymized <- bind_rows(hm_hmlc, hm_apdm, hm_osna, hm_freq)
cal_shared_users <- function(raw_hm, anonymized_hm){
df_joined <- raw_hm %>%
dplyr::select(-name) %>%
left_join(., u_ids) %>%
mutate(home = as.numeric(home)) %>%
left_join(., anonymized_hm, by = c("u_id_anonymized" = "u_id")) %>%
drop_na()
recipe_nm <- unique(raw_hm$name)
tibble(
recipe = recipe_nm,
n_shared_users = nrow(df_joined),
n_matched = df_joined %>% filter(home.x == home.y) %>% nrow()
)
}
df_matched <- map2_df(list(raw_apdm, raw_freq, raw_hmlc, raw_osna), list(hm_apdm, hm_freq, hm_hmlc, hm_osna), function(x, y) cal_shared_users(x, y)) %>%
mutate(pct_matched = round(n_matched/n_shared_users, 4) * 100)
df_matched %>%
rename(Recipe = recipe, `# of shared users` = n_shared_users, `# of matched home locations` = n_matched, `% of matched home locations` = pct_matched) %>%
knitr::kable(., caption = 'Comparison between original and de-identified datasets')
Recipe | # of shared users | # of matched home locations | % of matched home locations |
---|---|---|---|
APDM | 32047 | 28944 | 90.32 |
FREQ | 38963 | 36458 | 93.57 |
HMLC | 24712 | 23013 | 93.12 |
OSNA | 107094 | 86961 | 81.20 |
Comparison between original and de-identified datasets
cal_dist_diff <- function(raw_hm, anonymized_hm){
df_sub <- raw_hm %>%
dplyr::select(-name) %>%
left_join(., u_ids) %>%
mutate(home = as.numeric(home)) %>%
left_join(., anonymized_hm, by = c("u_id_anonymized" = "u_id")) %>%
drop_na() %>%
filter(home.x != home.y)
geom_a <- df_sub %>%
left_join(., grids, by = c("home.x" = "grid_id")) %>%
st_as_sf() %>%
st_centroid()
geom_b <- df_sub %>%
left_join(., grids, by = c("home.y" = "grid_id")) %>%
st_as_sf() %>%
st_centroid()
df_sub %>%
mutate(diff_dist = st_distance(geom_a, geom_b, by_element = TRUE),
diff_dist = round(as.numeric(diff_dist)/1000, 2)) %>%
mutate(diff_median = median(diff_dist))
}
dist_hmlc <- cal_dist_diff(raw_hmlc, hm_hmlc)
dist_apmd <- cal_dist_diff(raw_apdm, hm_apdm)
dist_osna <- cal_dist_diff(raw_osna, hm_osna)
dist_freq <- cal_dist_diff(raw_freq, hm_freq)
bind_rows(dist_hmlc, dist_apmd, dist_osna, dist_freq) %>%
group_by(name) %>%
dplyr::summarise(median_dist = median(diff_dist)) %>%
rename(Recipe = name, `Median shifted distance (km)` = median_dist) %>%
knitr::kable(.,
caption = 'Median shifted distance of identified home locations')
Recipe | Median shifted distance (km) |
---|---|
APDM | 0.75 |
FREQ | 1.50 |
HMLC | 0.75 |
OSNA | 2.70 |
Median shifted distance of identified home locations
bind_rows(dist_hmlc, dist_apmd, dist_osna, dist_freq) %>%
ggplot(., aes(diff_dist, color = name)) +
geom_density(alpha = 0.3) +
scale_x_continuous(breaks = seq(0, 50, 5)) +
labs(x = "Distance (km)", y = "Density", title = "", color = "Recipe") +
theme_classic() +
theme(title = element_text(size = 10),
legend.position = c(0.93, 0.9))