-
Notifications
You must be signed in to change notification settings - Fork 3
/
Copy pathAnalyzeImageHeterogeneity_Tutorial.R
251 lines (208 loc) · 9.86 KB
/
AnalyzeImageHeterogeneity_Tutorial.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
#!/usr/bin/env Rscript
{
################################
# Image heterogeneity tutorial using causalimages
################################
# clean environment
rm(list = ls()); options( error = NULL )
# remote install latest version of the package
# devtools::install_github(repo = "cjerzak/causalimages-software/causalimages")
# local install for development team
# install.packages("~/Documents/causalimages-software/causalimages",repos = NULL, type = "source",force = F)
# build backend you haven't ready:
# causalimages::BuildBackend()
# run code if downloading data for the first time
download_folder <- "~/Downloads/UgandaAnalysis.zip"
reSaveTfRecords <- T
if( reDownloadRawData <- F ){
# specify uganda data URL
uganda_data_url <- "https://dl.dropboxusercontent.com/s/xy8xvva4i46di9d/Public%20Replication%20Data%2C%20YOP%20Experiment.zip?dl=0"
download_folder <- "~/Downloads/UgandaAnalysis.zip"
# download into new directory
download.file( uganda_data_url, destfile = download_folder)
# unzip and list files
unzip(download_folder, exdir = "~/Downloads/UgandaAnalysis")
}
# load in package
library( causalimages )
# set new wd
setwd(sprintf('%s/Public Replication Data, YOP Experiment/',
gsub(download_folder,pattern="\\.zip",replace="")))
# see directory contents
list.files()
# images saved here
list.files( "./Uganda2000_processed" )
# individual-level data
UgandaDataProcessed <- read.csv( "./UgandaDataProcessed.csv" )
# unit-level covariates (many covariates are subject to missingness!)
dim( UgandaDataProcessed )
table( UgandaDataProcessed$age )
# approximate longitude + latitude for units
head( cbind(UgandaDataProcessed$geo_long, UgandaDataProcessed$geo_lat) )
# image keys of units (use for referencing satellite images)
UgandaDataProcessed$geo_long_lat_key
# an experimental outcome
UgandaDataProcessed$Yobs
# treatment variable
UgandaDataProcessed$Wobs
# information on keys linking to satellite images for all of Uganda
# (not just experimental context, use for constructing transportability maps)
UgandaGeoKeyMat <- read.csv( "./UgandaGeoKeyMat.csv" )
# set outcome
UgandaDataProcessed$Yobs <- UgandaDataProcessed$human_capital_index_e
# drop observations with NAs in key variables
# (you can also use a multiple imputation strategy)
UgandaDataProcessed <- UgandaDataProcessed[!is.na(UgandaDataProcessed$Yobs) &
!is.na(UgandaDataProcessed$Wobs) &
!is.na(UgandaDataProcessed$geo_lat) , ]
# sanity checks
{
# write a function that reads in images as saved and process them into an array
NBANDS <- 3L
imageHeight <- imageWidth <- 351L # pixel height/width
acquireImageRep <- function( keys ){
# keys <- unique(UgandaDataProcessed$geo_long_lat_key)[1:5]
# initialize an array shell to hold image slices
array_shell <- array(NA, dim = c(1L, imageHeight, imageWidth, NBANDS))
# dim(array_shell)
# iterate over keys:
# -- images are referenced to keys
# -- keys are referenced to units (to allow for duplicate images uses)
array_ <- sapply(keys, function(key_) {
# iterate over all image bands (NBANDS = 3 for RBG images)
for (band_ in 1:NBANDS) {
# place the image in the correct place in the array
array_shell[,,,band_] <-
as.matrix(data.table::fread(
input = sprintf("./Uganda2000_processed/GeoKey%s_BAND%s.csv", key_, band_), header = FALSE)[-1,])
}
return(array_shell)
}, simplify = "array")
# return the array in the format c(nBatch, imageWidth, imageHeight, nChannels)
# ensure that the dimensions are correctly ordered for further processing
if(length(keys) > 1){ array_ <- aperm(array_[1,,,,], c(4, 1, 2, 3) ) }
if(length(keys) == 1){
array_ <- aperm(array_, c(1,5, 2, 3, 4))
array_ <- array(array_, dim(array_)[-1])
}
return(array_)
}
# try out the function
# note: some units are co-located in same area (hence, multiple observations per image key)
ImageBatch <- acquireImageRep( UgandaDataProcessed$geo_long_lat_key[ check_indices <- c(1, 20, 50, 101) ])
acquireImageRep( UgandaDataProcessed$geo_long_lat_key[ check_indices[1:2] ] )
# sanity checks in the analysis of earth observation data are essential
# check that images are centered around correct location
causalimages::image2( as.array(ImageBatch)[1,,,1] )
UgandaDataProcessed$geo_long[check_indices[1]]
UgandaDataProcessed$geo_lat[check_indices[1]]
# check against google maps to confirm correctness
# https://www.google.com/maps/place/1%C2%B018'16.4%22N+34%C2%B005'15.1%22E/@1.3111951,34.0518834,10145m/data=!3m1!1e3!4m4!3m3!8m2!3d1.3045556!4d34.0875278?entry=ttu
# scramble data (important for reading into causalimages::WriteTfRecord
# to ensure no systematic biases in data sequence with model training
set.seed(144L); UgandaDataProcessed <- UgandaDataProcessed[sample(1:nrow(UgandaDataProcessed)),]
}
# Image heterogeneity example
if(T == ){
# write a tf records repository
# whenever changes are made to the input data to AnalyzeImageHeterogeneity, WriteTfRecord() should be re-run
# to ensure correct ordering of data
tfrecord_loc <- "~/Downloads/UgandaExample.tfrecord"
if( reSaveTfRecords ){
causalimages::WriteTfRecord(
file = tfrecord_loc,
uniqueImageKeys = unique(UgandaDataProcessed$geo_long_lat_key),
acquireImageFxn = acquireImageRep )
}
for(ImageModelClass in c("VisionTransformer","CNN")){
for(optimizeImageRep in c(T, F)){
print(sprintf("Image hetero analysis & optimizeImageRep: %s",optimizeImageRep))
# perform image heterogeneity analysis (toy example)
ImageHeterogeneityResults <- causalimages::AnalyzeImageHeterogeneity(
# data inputs
obsW = UgandaDataProcessed$Wobs,
obsY = UgandaDataProcessed$Yobs,
X = matrix(rnorm(length(UgandaDataProcessed$Yobs)*10),ncol=10),
imageKeysOfUnits = UgandaDataProcessed$geo_long_lat_key,
file = tfrecord_loc, # location of tf record (use absolute file paths)
lat = UgandaDataProcessed$geo_lat, # not required but helpful for dealing with redundant locations in EO data
long = UgandaDataProcessed$geo_long, # not required but helpful for dealing with redundant locations in EO data
# inputs to control where visual results are saved as PDF or PNGs
# (these image grids are large and difficult to display in RStudio's interactive mode)
plotResults = T,
figuresPath = "~/Downloads/HeteroTutorial", # where to write analysis figures
figuresTag = "HeterogeneityImTutorial",plotBands = 1L:3L,
# optional arguments for generating transportability maps
# here, we leave those NULL for simplicity
transportabilityMat = NULL, #
# other modeling options
imageModelClass = ImageModelClass,
nSGD = 5L, # make this larger for real applications (e.g., 2000L)
nDepth_ImageRep = ifelse(optimizeImageRep, yes = 1L, no = 1L),
nWidth_ImageRep = as.integer(2L^6),
optimizeImageRep = optimizeImageRep,
batchSize = 8L, # make this larger for real application (e.g., 50L)
kClust_est = 2 # vary depending on problem. Usually < 5
)
try(dev.off(), T)
}
}
}
# image sequence example
if(T == T){
acquireVideoRep <- function(keys) {
# Get image data as an array from disk
tmp <- acquireImageRep(keys)
# Expand dimensions: we create a new dimension at the start
tmp <- array(tmp, dim = c(1, dim(tmp)))
# Transpose dimensions to get the required order
tmp <- aperm(tmp, c(2, 1, 3, 4, 5))
# Swap image dimensions to see variability across time
tmp_ <- aperm(tmp, c(1, 2, 4, 3, 5))
# Concatenate along the second axis
tmp <- abind::abind(tmp, tmp_, along = 2)
return(tmp)
}
# write the tf records repository
tfrecord_loc_imSeq <- "~/Downloads/UgandaExampleVideo.tfrecord"
if(reSaveTfRecords){
causalimages::WriteTfRecord( file = tfrecord_loc_imSeq,
uniqueImageKeys = unique(UgandaDataProcessed$geo_long_lat_key),
acquireImageFxn = acquireVideoRep, writeVideo = T )
}
for(ImageModelClass in (c("VisionTransformer","CNN"))){
for(optimizeImageRep in c(T, F)){
print(sprintf("Image seq hetero analysis & optimizeImageRep: %s",optimizeImageRep))
# Note: optimizeImageRep = T breaks with video on METAL framework
VideoHeterogeneityResults <- causalimages::AnalyzeImageHeterogeneity(
# data inputs
obsW = UgandaDataProcessed$Wobs,
obsY = UgandaDataProcessed$Yobs,
imageKeysOfUnits = UgandaDataProcessed$geo_long_lat_key,
file = tfrecord_loc_imSeq, # location of tf record (absolute paths are safest)
dataType = "video",
lat = UgandaDataProcessed$geo_lat, # not required but helpful for dealing with redundant locations in EO data
long = UgandaDataProcessed$geo_long, # not required but helpful for dealing with redundant locations in EO data
# inputs to control where visual results are saved as PDF or PNGs
# (these image grids are large and difficult to display in RStudio's interactive mode)
plotResults = T,
figuresPath = "~/Downloads/HeteroTutorial",
plotBands = 1L:3L, figuresTag = "HeterogeneityImSeqTutorial",
# optional arguments for generating transportability maps
# here, we leave those NULL for simplicity
transportabilityMat = NULL, #
# other modeling options
imageModelClass = ImageModelClass,
nSGD = 5L, # make this larger for real applications (e.g., 2000L)
nDepth_ImageRep = ifelse(optimizeImageRep, yes = 1L, no = 1L),
nWidth_ImageRep = as.integer(2L^5),
optimizeImageRep = optimizeImageRep,
kClust_est = 2, # vary depending on problem. Usually < 5
batchSize = 8L, # make this larger for real application (e.g., 50L)
strides = 2L )
try(dev.off(), T)
}
}
}
causalimages::print2("Done with image heterogeneity tutorial!")
}