Skip to content

Latest commit

 

History

History
143 lines (119 loc) · 5.75 KB

05-appendix-comparison-between-raw-and-deidentified-datasets.md

File metadata and controls

143 lines (119 loc) · 5.75 KB

Comparison between original and de-identified datasets

The de-identification approach outlined in this research does result in a loss of both accuracy and precision for the purpose of inferring meaningful locations. However, we argue that this loss is relatively inconsequential for many human mobility analyses and is a worthwhile trade-off vis-a-vis the ethical use of this dataset and any potential harm that can result from its use.

To evaluate this trade-off, we have applied each of the four algorithms on both original and de-identified datasets. More than 80% of the users are labelled with the same home location in both datasets, and the median ‘error’ for mismatches is less than 2 kilometres for most algorithms (i.e. a neighbouring hexagon is selected instead).

grids <- read_sf(here("analysis/data/derived_data/spatial_hex_grid.shp")) 
df_anonymized <- readRDS(here("analysis/data/derived_data/sg_tweets_anonymized_20200929.rds"))
u_ids <- df_anonymized %>% 
  dplyr::select(u_id, u_id_anonymized) %>% 
  distinct(u_id, u_id_anonymized)


#identified home locations from original dataset
raw_hmlc <- readRDS(here::here("analysis/data/raw_data/hm750_hmlc.rds")) %>% mutate(name = "HMLC")
#online social network recipe
raw_osna <- readRDS(here::here("analysis/data/raw_data/hm750_osna.rds")) %>% mutate(name = "OSNA")
#mobile positioning data
raw_apdm <- readRDS(here::here("analysis/data/raw_data/hm750_apdm.rds")) %>% mutate(name = "APDM")
#frequency
raw_freq <- readRDS(here::here("analysis/data/raw_data/hm750_freq.rds")) %>% mutate(name = "FREQ")
hm_original <- bind_rows(raw_hmlc, raw_apdm, raw_osna, raw_freq)


#identified home locations from de-identified dataset
hm_hmlc <- read_csv(here::here("analysis/data/derived_data/hm_hmlc.csv")) 
#online social network recipe
hm_osna <- read_csv(here::here("analysis/data/derived_data/hm_osna.csv"))
#mobile positioning data
hm_apdm <- read_csv(here::here("analysis/data/derived_data/hm_apdm.csv")) 
#frequency
hm_freq <- read_csv(here::here("analysis/data/derived_data/hm_freq.csv")) 
hm_anonymized <- bind_rows(hm_hmlc, hm_apdm, hm_osna, hm_freq)

Percentage of matched identified home locations

cal_shared_users <- function(raw_hm, anonymized_hm){
  df_joined <- raw_hm %>% 
    dplyr::select(-name) %>% 
    left_join(., u_ids) %>% 
    mutate(home = as.numeric(home)) %>%
    left_join(., anonymized_hm, by = c("u_id_anonymized" = "u_id")) %>% 
    drop_na()
  recipe_nm <- unique(raw_hm$name)
  tibble(
    recipe = recipe_nm,
    n_shared_users = nrow(df_joined), 
    n_matched  = df_joined %>% filter(home.x == home.y) %>% nrow()
  )
}

df_matched <- map2_df(list(raw_apdm, raw_freq, raw_hmlc, raw_osna), list(hm_apdm, hm_freq, hm_hmlc, hm_osna), function(x, y) cal_shared_users(x, y)) %>% 
  mutate(pct_matched = round(n_matched/n_shared_users, 4) * 100)
df_matched %>% 
  rename(Recipe = recipe, `# of shared users` = n_shared_users, `# of matched home locations` = n_matched, `% of matched home locations` = pct_matched) %>% 
  knitr::kable(., caption = 'Comparison between original and de-identified datasets')
Recipe # of shared users # of matched home locations % of matched home locations
APDM 32047 28944 90.32
FREQ 38963 36458 93.57
HMLC 24712 23013 93.12
OSNA 107094 86961 81.20

Comparison between original and de-identified datasets

Median shifted distance of identified home locations

cal_dist_diff <- function(raw_hm, anonymized_hm){
  df_sub <- raw_hm %>% 
    dplyr::select(-name) %>% 
    left_join(., u_ids) %>% 
    mutate(home = as.numeric(home)) %>%
    left_join(., anonymized_hm, by = c("u_id_anonymized" = "u_id")) %>% 
    drop_na() %>% 
    filter(home.x != home.y)  
  
  geom_a <- df_sub %>% 
    left_join(., grids, by = c("home.x" = "grid_id")) %>% 
    st_as_sf() %>% 
    st_centroid()
  
  
  geom_b <- df_sub %>% 
    left_join(., grids, by = c("home.y" = "grid_id")) %>% 
    st_as_sf() %>% 
    st_centroid()
  
  df_sub %>% 
    mutate(diff_dist = st_distance(geom_a, geom_b, by_element = TRUE),
           diff_dist = round(as.numeric(diff_dist)/1000, 2)) %>% 
    mutate(diff_median = median(diff_dist))
}

dist_hmlc <- cal_dist_diff(raw_hmlc, hm_hmlc)
dist_apmd <- cal_dist_diff(raw_apdm, hm_apdm)
dist_osna <- cal_dist_diff(raw_osna, hm_osna)
dist_freq <- cal_dist_diff(raw_freq, hm_freq)

bind_rows(dist_hmlc, dist_apmd, dist_osna, dist_freq) %>% 
  group_by(name) %>% 
  dplyr::summarise(median_dist = median(diff_dist)) %>% 
  rename(Recipe = name, `Median shifted distance (km)` = median_dist) %>% 
  knitr::kable(., 
    caption = 'Median shifted distance of identified home locations') 
Recipe Median shifted distance (km)
APDM 0.75
FREQ 1.50
HMLC 0.75
OSNA 2.70

Median shifted distance of identified home locations

bind_rows(dist_hmlc, dist_apmd, dist_osna, dist_freq) %>% 
  ggplot(., aes(diff_dist, color = name)) +
  geom_density(alpha = 0.3) + 
  scale_x_continuous(breaks = seq(0, 50, 5)) +
  labs(x = "Distance (km)", y = "Density", title = "", color = "Recipe") + 
  theme_classic() + 
  theme(title = element_text(size = 10), 
        legend.position = c(0.93, 0.9))