# Co-occurrence_computation
import itertools
import time


def parseLine(tree, tree_number):
    """Parses a line that describes how the instances are distributed over the leaves of a single tree.

    Parameters
    ----------
    tree : str
        comma-separated string in which the i-th field is the leaf that instance i reached in this tree
    tree_number : int
        index of the tree this line belongs to

    Returns
    -------
    list
        list of pairs: <key = (tree number, leaf number), value = instance number>
    """
    parts = tree.split(',')
    ans = []
    for i in range(0, len(parts)):
        ans.append(((tree_number, parts[i]), i))
    return ans
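
# Illustrative sketch (not part of the original file): for the line "3,3,7" of
# tree 0, parseLine("3,3,7", 0) returns [((0, '3'), 0), ((0, '3'), 1), ((0, '7'), 2)],
# i.e. instances 0 and 1 share leaf 3 and instance 2 sits alone in leaf 7.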


def combinations(row):
    """Returns all the pairs that can be formed from the instances that fell into the same leaf.

    Parameters
    ----------
    row : tuple
        pair <key = (tree number, leaf number), value = list of instances that fell into that leaf>

    Returns
    -------
    list
        list of edges (pairs of instances)
    """
    l = sorted(row[1])
    return [v for v in itertools.combinations(l, 2)]
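
# Illustrative sketch (not part of the original file): for the row
# ((0, '3'), [1, 0]) the sorted instance list is [0, 1], so combinations(row)
# returns [(0, 1)] - a single candidate edge between instances 0 and 1.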


def addToEdge(edge):
    """Strips the tuple symbols from an edge and appends the fields that are required later by the .gw output.

    Parameters
    ----------
    edge : tuple
        pair of instances (instance1, instance2)

    Returns
    -------
    str
        "instance1 instance2 0 |{}|"
    """
    e = str(edge).replace(",", "")
    e = e.replace("(", "")
    e = e.replace(")", "")
    return e + " 0 |{}|"
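
# Illustrative sketch (not part of the original file): addToEdge((0, 1))
# returns the string "0 1 0 |{}|", which is the edge-line layout used in the
# .gw output written below.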


def main(list_paths, datasets_names_list, output1, output2):
    """Computes the co-occurrence score for each pair of instances.

    It then removes the pairs with the weakest co-occurrence scores (the bottom 10%).
    The remaining pairs are written to two different files: output1 and output2.

    Parameters
    ----------
    list_paths : list
        list of the datasets' paths
    datasets_names_list : list
        the datasets' names, ordered like list_paths
    output1 : str
        path prefix for the first output file (.gw)
    output2 : str
        path prefix for the second output file (.in)
    """
    dataset_names_dic = {}
    for num in range(0, len(datasets_names_list)):
        dataset_names_dic[num] = datasets_names_list[num]
    counter_data_set = 0
    for path in list_paths:
        dataset_name = dataset_names_dic[counter_data_set]
        counter_data_set = counter_data_set + 1
        # Load the file that describes the forest: one line per tree, one comma-separated field per instance
        data = sc.textFile(path)
        temp = []
        counter = 1
        for i in data.collect():
            instances_number = len(i.split(","))  # Find the number of instances
            temp.append((i, counter))  # Add a line (tree) number to each line
            counter = counter + 1
        data = sc.parallelize(temp)
        # Group instances by leaf: <key = (tree number, leaf number), value = list of instances>
        a = data.flatMap(lambda tp: parseLine(tp[0], tp[1])).groupByKey().map(lambda x: (x[0], list(x[1]))).collect()
        a = sc.parallelize(a)
        # Count how many leaves each pair of instances shares and sort the pairs by that count, descending
        selected_edge = a.map(lambda x: combinations(x)).flatMap(lambda x: x).map(lambda x: (x, 1)).reduceByKey(lambda tp1, tp2: tp1 + tp2).sortBy(lambda a: -a[1])
        edges_amount_before_filter = len(selected_edge.collect())
        # Keep only the 90% of pairs with the strongest co-occurrence scores
        percentage = 0.9
        wish_number_of_edges = edges_amount_before_filter * percentage
        d = sc.parallelize(selected_edge.take(int(wish_number_of_edges)))
        d = d.map(lambda x: x[0])
        gw_begin = ""
        for i in range(0, instances_number):
            gw_begin = gw_begin + "|{}|" + "\n"  # One node line per instance, as required by the .gw (LEDA) output
        edge_gw = d.map(lambda edge: addToEdge(edge))
        s_gw = str(edge_gw.collect())
        s_gw = s_gw.replace(" '", "")
        s_gw = s_gw.replace("'", "")
        s_gw = s_gw.replace(",", "\n")
        s_gw = s_gw.replace("[", "")
        s_gw = s_gw.replace("]", "")
        print("s_gw")
        # Header line of the .in file: number of nodes (instances) followed by the number of edges
        s_with_count = "" + str(instances_number) + " " + str(len(d.collect())) + "\n"
        s = str(d.collect())
        s = s.replace("),", "\n")
        s = s.replace(" (", "")
        s = s.replace(",", "")
        s = s.replace("[", "")
        s = s.replace("]", "")
        s = s.replace("(", "")
        s = s.replace(")", "")
        dbutils.fs.put(output1 + dataset_name + ".gw", gw_begin + s_gw)
        dbutils.fs.put(output2 + dataset_name + ".in", s_with_count + s)
        del data
# The output files are required later by the graphlet correlation distance method.
# Format of output1 (.gw file):
#   LEDA graph format.
# Format of output2 (.in file):
#   The input file describes the network in a simple text format.
#   The first line contains two integers n and e - the number of nodes and edges.
#   The following e lines describe undirected edges with space-separated ids of their endpoints.
#   Node ids should be between 0 and n-1 (see example.in).
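
# Illustrative sketch (hypothetical values, not taken from the original repo) of
# the two outputs for a dataset with 3 instances whose surviving edges are
# (0, 1) and (0, 2):
#
#   <output1>dataset.gw        <output2>dataset.in
#   |{}|                       3 2
#   |{}|                       0 1
#   |{}|                       0 2
#   0 1 0 |{}|
#   0 2 0 |{}|
#
# A minimal, hypothetical invocation sketch (paths and names are placeholders;
# `sc` and `dbutils` are assumed to be provided by the Databricks runtime):
# main(["/mnt/forests/iris_forest.csv"], ["iris"], "/mnt/out/gw/", "/mnt/out/in/")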