convert_lda_topic_to_hlc_class.py
#!/usr/bin/env python
# Convert the probabilities of latent topics from a trained LDA model to
# probabilities of targeted harmonized land cover classes.
#
# Zhan Li, [email protected]
# Created: Fri May 3 22:48:40 PDT 2019
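#
# Example invocation (the file names below are hypothetical placeholders for
# illustration only, not files that ship with this script):
#
#   python convert_lda_topic_to_hlc_class.py \
#       -m lda_model.joblib -w lda_vocab.joblib \
#       -A agreed_words_to_hlc.csv --t2h topic2hlc.csv \
#       -t doc_topic_probs.csv --hlc_prob doc_hlc_probs.csv \
#       -N 1000 -n 100 -r 5 --seed 42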
import argparse
import logging, logging.config
import numpy as np
import pandas as pd
from sklearn.externals import joblib
import scipy.stats as spsta
import scipy.optimize as spopt
import scipy.sparse as spspa
from numba import jit, float64
LOGGING = {
    "version" : 1,
    "formatters" : {
        "default" : {
            "format" : "%(asctime)s %(levelname)s %(message)s",
        },
    },
    "handlers" : {
        "console" : {
            "class" : "logging.StreamHandler",
            "level" : "DEBUG",
            "formatter" : "default",
            "stream" : "ext://sys.stdout",
        },
    },
    "root" : {
        "handlers" : ["console"],
        "level" : "DEBUG",
    },
}
logging.config.dictConfig(LOGGING)
logger = logging.getLogger('lda-topic2hlc-class')
def getCmdArgs():
    p = argparse.ArgumentParser(description="Estimate probabilities of harmonized land cover (HLC) classes from the probabilities of latent topics of a trained LDA model")

    p.add_argument("--lda_model", "-m", dest="lda_model_file", metavar="LDA_MODEL_JOBLIB", required=True, help="A trained LDA model that has been pickled by joblib.")
    p.add_argument("--lda_vocab", "-w", dest="lda_vocab_file", metavar="LDA_VOCAB_WORDS_JOBLIB", required=True, help="List of the LDA model's vocabulary words that has been pickled by joblib.")
    p.add_argument("--lda_topics", "-t", dest="lda_topics_csv", metavar="LDA_TOPIC_PROB_CSV", required=False, help="An optional CSV file that lists topic probabilities of documents inferred by the given LDA model, with rows being documents, columns being topics, and the first row being the header. The first few columns may be extra values that index documents, but all the remaining columns must be topic probabilities matching the number of topics in the given LDA model file. If provided, the topic probabilities will be converted to HLC class probabilities and saved in a CSV file. Otherwise, only the topic-to-HLC conversion matrix will be estimated and saved.")
    p.add_argument("--agreed_word", "-A", dest="agreed_words_csv", metavar="LUT_WORD_COMB2HLC", required=True, help="A look-up table of agreed vocabulary words (source classes) to targeted HLC classes, with each row being agreed words followed by their corresponding HLC class in the last column, and the first row being the header.")
    p.add_argument("--t2h", dest="t2h_csv", metavar="TOPIC2HLC_CSV", required=True, help="Output CSV file that stores the topic-to-HLC matrix estimated from the LUT of agreed words and HLC classes.")
    p.add_argument("--hlc_prob", dest="hlc_prob_csv", metavar="HLC_PROB_CSV", required=False, help="Output CSV file that stores the HLC class probabilities estimated from topic probabilities if a CSV file of topic probabilities is given.")
    p.add_argument("--N_factor", "-N", dest="N_factor", metavar="WORD_COUNT_PER_DOC", required=False, type=int, default=1000, help="Word count per pseudo document. Default: 1000.")
    p.add_argument("--n_resample", "-n", dest="n_resample", metavar="RESAMPLE_COUNT_PER_HLC", required=False, type=int, default=100, help="Resample count per HLC class. Default: 100.")
    p.add_argument("--replicate", "-r", dest="n_replicates", metavar="TIMES_TO_REPEAT_ESTIMATION", required=False, type=int, default=1, help="Number of times to repeat the estimation. The estimation of the topic-to-HLC matrix and HLC class probabilities inherits randomness from the pseudo-document generation and the initial values of the optimization. Run multiple times to obtain a stable consensus. Default: 1.")
    p.add_argument("--seed", dest="random_seed", metavar="SEED_FOR_RANDOM_GENERATOR", required=False, type=int, default=None, help="A seed for the random generator to reproduce the same pseudo random numbers across runs. Default: None, do not seed the generator.")

    cmdargs = p.parse_args()

    if (cmdargs.lda_topics_csv is not None and cmdargs.hlc_prob_csv is None):
        raise RuntimeError("CSV of topic probabilities is given but no output CSV for estimated HLC class probabilities!")
    if (cmdargs.lda_topics_csv is None and cmdargs.hlc_prob_csv is not None):
        raise RuntimeError("CSV of topic probabilities is missing but an output CSV for estimated HLC class probabilities is given!")

    return cmdargs
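
# The numba-jitted functions below define the constrained weighted least-squares
# problem solved in main(): minimize sum(wt * (A.dot(x) - b)**2) over the
# flattened topic-to-HLC matrix x, subject to B.dot(x) == 1 (the HLC class
# probabilities of each topic sum to one) and 0 <= x <= 1 (expressed both as
# bounds and as the two inequality constraints). The *Jac functions return the
# analytic Jacobians of the constraints for the SLSQP solver.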
@jit(float64(float64[:], float64[:,:], float64[:], float64[:]), nopython=True, parallel=True)
def objectiveFunc(x, A, b, wt):
    return np.sum((A.dot(x) - b)**2 * wt)

@jit(float64[:](float64[:], float64[:, :]), nopython=True, parallel=True)
def eqConsFunc(x, B):
    return B.dot(x) - np.ones(B.shape[0])

@jit(float64[:, :](float64[:], float64[:, :]), nopython=True, parallel=True)
def eqConsJac(x, B):
    return B

@jit(float64[:](float64[:]), nopython=True, parallel=True)
def ineqCons1Func(x):
    return x

@jit(float64[:, :](float64[:]), nopython=True, parallel=True)
def ineqCons1Jac(x):
    return np.identity(x.size)

@jit(float64[:](float64[:]), nopython=True, parallel=True)
def ineqCons2Func(x):
    return 1 - x

@jit(float64[:, :](float64[:]), nopython=True, parallel=True)
def ineqCons2Jac(x):
    return -1 * np.identity(x.size)
def main(cmdargs):
    model_file = cmdargs.lda_model_file
    vocab_file = cmdargs.lda_vocab_file
    lda_lut_csv = cmdargs.lda_topics_csv
    agreed_words_csv = cmdargs.agreed_words_csv
    t2h_csv = cmdargs.t2h_csv
    hlc_prob_csv = cmdargs.hlc_prob_csv
    N_factor = cmdargs.N_factor
    n_per_hlc = cmdargs.n_resample
    n_replicates = cmdargs.n_replicates
    random_seed = cmdargs.random_seed

    if random_seed is not None:
        np.random.seed(random_seed)
    # Read agreed word combinations and their HLC class
    hlc_conc_lut = pd.read_csv(agreed_words_csv, header=0)
    # Load model and vocabulary
    lda_model = joblib.load(model_file)
    lda_vocab = joblib.load(vocab_file)

    # An array of HLC classes
    hlc_code_list = np.unique(hlc_conc_lut.iloc[:, -1])
    if np.issubsctype(hlc_code_list.dtype, np.number):
        hc_dtype = np.int
    else:
        hc_dtype = hlc_code_list.dtype
    hlc_code_list = hlc_code_list.astype(hc_dtype)

    # Number of word sources (maps) and their names
    n_maps = hlc_conc_lut.shape[1] - 1
    map_names = list(hlc_conc_lut.columns[0:-1])
    hlc_conc_lut = hlc_conc_lut.set_index(map_names)

    # Check that all the given agreed words are in the LDA vocabulary
    vocab_index = pd.Index(lda_vocab)
    for val in map_names:
        indexer = vocab_index.get_indexer(hlc_conc_lut.index.get_level_values(val))
        if np.sum(indexer == -1) > 0:
            raise RuntimeError("Some agreed words in the column {0:s} are not in the LDA vocabulary!".format(val))
    # Read topic probabilities of documents if given
    if lda_lut_csv is not None:
        lda_lut_df = pd.read_csv(lda_lut_csv, header=0)
    # Create pseudo documents from agreed word combinations
    train_y = np.zeros((hlc_conc_lut.shape[0], len(hlc_code_list)))
    index = pd.Index(hlc_code_list).get_indexer(hlc_conc_lut.iloc[:, -1].astype(hc_dtype))
    train_y[np.array(range(0, train_y.shape[0])), index] = 1
    train_y = pd.DataFrame(np.hstack([train_y,
                                      hlc_conc_lut.iloc[:, -1].values[:, np.newaxis],
                                      np.ones((train_y.shape[0], 1))]),
                           index=hlc_conc_lut.index, columns=list(hlc_code_list)+["hlc", "weight"])
    df = train_y.reset_index()

    csv_suffix_fmtstr = "_{{0:0{0:d}d}}.csv".format(len(str(n_replicates)))
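
    # Each replicate below generates a fresh set of pseudo documents: the agreed
    # word combinations are bootstrap-resampled (n_resample rows per HLC class),
    # each resampled combination receives random mixing proportions over the word
    # sources drawn from a Dirichlet distribution, and the proportions are scaled
    # by N_factor into integer word counts to build a document-word matrix that
    # the LDA model transforms into topic probabilities.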
    for i in range(n_replicates):
        # Bootstrap-resample agreed word combinations to create more pseudo docs.
        train_y = []
        for val in hlc_code_list:
            sflag = df["hlc"].astype(hc_dtype) == val
            balls = df.loc[sflag, :].values.tolist()
            x = np.array([tuple(val) for val in balls], dtype=[(str(i), type(val)) for i, val in enumerate(balls[0])])
            train_y.append(np.asarray(np.random.choice(x, size=n_per_hlc, replace=True).tolist()))
        train_y = np.vstack(train_y)
        train_y = np.hstack([train_y, np.zeros((train_y.shape[0], n_maps))])
        prop_names = [str(val)+"_prop" for val in map_names]
        train_y = pd.DataFrame(train_y, columns=list(df.columns)+prop_names)
        train_y.loc[:, prop_names] = np.random.dirichlet(np.ones(n_maps)*1./n_maps, size=train_y.shape[0])

        dw_mat = np.zeros((train_y.shape[0], len(lda_vocab)))
        for mn, pn in zip(map_names, prop_names):
            indexer = vocab_index.get_indexer(train_y[mn])
            dw_mat[np.array(range(dw_mat.shape[0])), indexer] = (train_y[pn] * N_factor).astype(int)
        train_X = pd.DataFrame(lda_model.transform(dw_mat),
                               index=train_y.index,
                               columns=["prob_topic_{0:d}".format(i+1) for i in range(lda_model.n_components)])
        train_data = pd.concat([train_y, train_X], axis=1, join="inner")
        # Set up the optimization problem to estimate the T2H matrix
        n_topics = lda_model.n_components
        indptr = np.array(list(range(len(hlc_code_list)+1)), dtype=int)
        indices = np.array(list(range(len(hlc_code_list))), dtype=int)
        # For a BSR sparse matrix, data has the dimensions [nblocks, nrows_of_a_block, ncols_of_a_block]
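        # One BSR matrix per pseudo document: block row c holds the document's
        # topic probabilities in block column c, so row (doc, c) of the stacked
        # matrix A dotted with the flattened T2H vector x gives
        # sum_k P(topic k | doc) * x[c, k], the predicted probability of HLC
        # class c; b is the one-hot HLC membership of each pseudo document and
        # wt its weight repeated once per class.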
        bsrmat_list = []
        tpcols = ["prob_topic_{0:d}".format(i+1) for i in range(n_topics)]
        for idx in train_data.index:
            data = np.tile(train_data.loc[idx, tpcols].values, (len(hlc_code_list), 1, 1))
            bsrmat_list.append(spspa.bsr_matrix((data, indices, indptr), shape=(len(hlc_code_list), len(hlc_code_list)*n_topics)))
        A = spspa.vstack(bsrmat_list, format="bsr").toarray()
        b = train_data.loc[:, hlc_code_list].stack().values
        wt = train_data.loc[:, "weight"].values
        wt = np.repeat(wt, len(hlc_code_list))
        B = spspa.hstack([spspa.identity(n_topics)]*len(hlc_code_list)).toarray()

        # Use random samples from a Dirichlet distribution to initialize x0.
        x0 = np.random.dirichlet(np.ones(len(hlc_code_list))*1./len(hlc_code_list), size=n_topics).T
        x0 = pd.DataFrame(x0, index=hlc_code_list, columns=list(range(n_topics)))
        x0.index.names = ["hlc"]
        x0.columns.names = ["topic"]
        x0 = x0.stack().reorder_levels(["hlc", "topic"]).values.squeeze()
        # Start optimization
        bounds = spopt.Bounds(0, 1)
        eq_cons = [{'type': 'eq',
                    'fun': eqConsFunc,
                    'jac': eqConsJac,
                    'args': (B,)}]
        ineq_cons = [{'type': 'ineq',
                      'fun': ineqCons1Func,
                      'jac': ineqCons1Jac},
                     {'type': 'ineq',
                      'fun': ineqCons2Func,
                      'jac': ineqCons2Jac}]
        opt_out = spopt.minimize(objectiveFunc, x0, args=(A, b, wt), method="SLSQP",
                                 bounds=bounds, constraints=eq_cons+ineq_cons,
                                 options=dict(maxiter=200))
        t2h_mat = opt_out.x.reshape((-1, n_topics)).T
        t2h_mat = pd.DataFrame(t2h_mat, columns=hlc_code_list)
        # Write output
        if n_replicates == 1:
            cur_t2h_csv = t2h_csv
        else:
            # Insert the replicate index before the ".csv" suffix; str.rstrip
            # would strip characters rather than the suffix, so trim explicitly.
            base = t2h_csv[:-4] if t2h_csv.endswith(".csv") else t2h_csv
            cur_t2h_csv = base + csv_suffix_fmtstr.format(i+1)
        with open(cur_t2h_csv, "w") as outfobj:
            outfobj.write(("# opt.success = {0:s}, " \
                           + "opt.status = {1:d}, " \
                           + "opt.message = {2:s}, " \
                           + "opt.nit = {3:d}\n").format(str(opt_out.success),
                                                         opt_out.status, opt_out.message, opt_out.nit))
            t2h_mat.to_csv(outfobj, mode="a")
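
        # Converting document topic probabilities to HLC class probabilities is
        # a single matrix product: the (documents x topics) probabilities times
        # the (topics x HLC classes) T2H matrix estimated above.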
        if hlc_prob_csv is not None and lda_lut_csv is not None:
            # Now estimate the HLC class probabilities if requested
            tmp = np.matmul(lda_lut_df.iloc[:, -n_topics:].values, t2h_mat.values)
            hlc_prob_lut = pd.concat([lda_lut_df.iloc[:, 0:-n_topics],
                                      pd.DataFrame(tmp, index=lda_lut_df.index, columns=hlc_code_list)],
                                     axis=1)
            if n_replicates == 1:
                cur_hlc_prob_csv = hlc_prob_csv
            else:
                base = hlc_prob_csv[:-4] if hlc_prob_csv.endswith(".csv") else hlc_prob_csv
                cur_hlc_prob_csv = base + csv_suffix_fmtstr.format(i+1)
            with open(cur_hlc_prob_csv, "w") as outfobj:
                outfobj.write("# t2h_matrix = {0:s}\n".format(cur_t2h_csv))
                hlc_prob_lut.to_csv(outfobj, mode="a")

if __name__ == "__main__":
    cmdargs = getCmdArgs()
    main(cmdargs)