-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy path: dimensionality_reduction.py
106 lines (76 loc) · 3.05 KB
/
dimensionality_reduction.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
import argparse
import numpy as np
from sklearn import decomposition
from embedding import load_embedding
def main():
    """CLI entry point.

    Loads an embedding, reduces its dimensionality with the selected method
    (PCA, truncated SVD, or the DRA pipeline), and exports the result.

    Flags:
        -i/--embedding     path of the input embedding (required)
        -m/--method        'PCA', 'tSVD' or 'DRA' (default 'DRA')
        -o/--output        path of the reduced embedding (required)
        -n/--reduce_to     target dimensionality (default 300)
        -b/--do_in_batches use incremental (batched) estimators
        -nb/--batch_size   batch size for incremental estimators (default 1024)
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('-i', '--embedding', required=True)
    parser.add_argument('-m', '--method', choices=['PCA', 'tSVD', 'DRA'], default='DRA')
    parser.add_argument('-o', '--output', required=True)
    parser.add_argument('-n', '--reduce_to', type=int, default=300)
    parser.add_argument('-b', '--do_in_batches', action='store_true')
    parser.add_argument('-nb', '--batch_size', type=int, default=1024)
    args = parser.parse_args()

    emb = load_embedding(args.embedding, lower=False, length_normalize=False, delete_duplicates=True)

    if args.method == 'PCA':
        if args.do_in_batches:
            # BUG FIX: the original called PPA_batches(vectors, reduce_to, batch_size),
            # which neither matches PPA_batches' (matrix, batch_size) signature nor
            # reduces dimensionality. The batched counterpart of PCA is PCA_batches.
            emb.vectors = PCA_batches(emb.vectors, args.reduce_to, args.batch_size)
        else:
            emb.vectors = PCA(emb.vectors, args.reduce_to)
    elif args.method == 'tSVD':
        emb.vectors = T_SVD(emb.vectors, args.reduce_to)
    elif args.method == 'DRA':
        if args.do_in_batches:
            emb.vectors = DRA_batches(emb.vectors, args.reduce_to, args.batch_size)
        else:
            emb.vectors = DRA(emb.vectors, args.reduce_to)
    else:
        # Unreachable in practice: argparse's `choices` already rejects other values.
        raise ValueError(str(args.method) + ' reduction method not supported. Reduction method supported: PCA, tSVD, DRA')

    emb.export(args.output)
def PPA(matrix):
    """Post-Processing Algorithm: remove the top-7 principal components.

    Centers the rows of `matrix`, fits a full PCA, and subtracts from each
    row its projection onto the first 7 principal directions (the dominant
    components of word embeddings carry little lexical information — see
    "All-but-the-Top", Mu & Viswanath 2018).

    Args:
        matrix: 2-D array of shape (n_samples, dim).

    Returns:
        np.ndarray of the same shape with the top components projected out.
    """
    dim = matrix.shape[1]
    # BUG FIX: np.mean(matrix) was a scalar grand mean; the algorithm calls
    # for subtracting the per-dimension mean vector (axis=0).
    X_train = matrix - np.mean(matrix, axis=0)
    pca = decomposition.PCA(n_components=dim)
    # fit() is enough — the original computed fit_transform() into an unused X_fit.
    pca.fit(X_train)
    top = pca.components_[:7]
    # Components are orthonormal, so removing the 7 projections sequentially
    # (as the original loop did) equals one batched subtraction.
    return X_train - (X_train @ top.T) @ top
def PPA_batches(matrix, batch_size):
    """Batched Post-Processing Algorithm: remove the top-7 principal components.

    Same contract as PPA(), but fits an IncrementalPCA in mini-batches so
    very large matrices do not need a full in-memory decomposition.
    NOTE(review): IncrementalPCA requires batch_size >= n_components
    (= matrix.shape[1]) — confirm callers respect this.

    Args:
        matrix: 2-D array of shape (n_samples, dim).
        batch_size: mini-batch size for the incremental fit.

    Returns:
        np.ndarray of the same shape with the top components projected out.
    """
    dim = matrix.shape[1]
    # BUG FIX: np.mean(matrix) was a scalar grand mean; the algorithm calls
    # for subtracting the per-dimension mean vector (axis=0).
    X_train = matrix - np.mean(matrix, axis=0)
    pca = decomposition.IncrementalPCA(n_components=dim, batch_size=batch_size)
    # fit() is enough — the original computed fit_transform() into an unused X_fit.
    pca.fit(X_train)
    top = pca.components_[:7]
    # Components are orthonormal, so one batched subtraction equals the
    # original's sequential per-component removal loop.
    return X_train - (X_train @ top.T) @ top
def PCA(matrix, n):
    """Reduce `matrix` to `n` dimensions with PCA.

    sklearn's PCA centers the data internally, so the original's manual
    mean subtraction (flagged redundant by its own comment — and a scalar
    grand mean at that, which cannot change the centered result) is removed.

    Args:
        matrix: 2-D array of shape (n_samples, dim).
        n: target dimensionality (n <= dim).

    Returns:
        np.ndarray of shape (n_samples, n).
    """
    pca = decomposition.PCA(n_components=n)
    return pca.fit_transform(matrix)
def PCA_batches(matrix, n, batch_size):
    """Reduce `matrix` to `n` dimensions with incremental (batched) PCA.

    IncrementalPCA centers the data internally, so the original's manual
    mean subtraction (flagged redundant by its own comment) is removed.

    Args:
        matrix: 2-D array of shape (n_samples, dim).
        n: target dimensionality (n <= dim, and n <= batch_size).
        batch_size: mini-batch size for the incremental fit.

    Returns:
        np.ndarray of shape (n_samples, n).
    """
    pca = decomposition.IncrementalPCA(n_components=n, batch_size=batch_size)
    return pca.fit_transform(matrix)
def T_SVD(matrix, n):
    """Reduce `matrix` to `n` dimensions via truncated SVD (no centering)."""
    reducer = decomposition.TruncatedSVD(n_components=n)
    return reducer.fit_transform(matrix)
def DRA(matrix, n):
    """Dimensionality Reduction Algorithm: PPA, then PCA to `n` dims, then PPA again."""
    denoised = PPA(matrix)
    reduced = PCA(denoised, n)
    return PPA(reduced)
def DRA_batches(matrix, n, batch_size):
    """Batched DRA: PPA, then PCA to `n` dims, then PPA again, all in mini-batches."""
    denoised = PPA_batches(matrix, batch_size)
    reduced = PCA_batches(denoised, n, batch_size)
    return PPA_batches(reduced, batch_size)
# Run the CLI only when executed as a script, not on import.
if __name__ == '__main__':
    main()