# generate_data.py: generate training/test spreading data for TensorFlow models
import argparse
import os
import sys
from tqdm import tqdm
import tensorflow as tf
import networkx as nx  # used via nx.number_of_nodes below
import random          # used via random.choice below
from dataset_graph_rep import *
from utils import *
import time
import numpy as np
import pickle


# Helpers for wrapping raw values in tf.train.Feature protos (TFRecord format).
def _int64_feature(value):
    return tf.train.Feature(int64_list=tf.train.Int64List(value=[value]))


def _floats_feature(value):
    if isinstance(value, list):
        return tf.train.Feature(float_list=tf.train.FloatList(value=value))
    else:
        return tf.train.Feature(float_list=tf.train.FloatList(value=[value]))


def _bytes_feature(value):
    return tf.train.Feature(bytes_list=tf.train.BytesList(value=[value]))
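

# A minimal sketch (not from the original script) of how the helpers above
# would feed the TFRecord path that is currently commented out inside
# create_dataset(); 'ts' and 'source' are the per-trial values built there,
# and integer node ids are assumed:
#
#   example = tf.train.Example(features=tf.train.Features(feature={
#       'timestamps': _floats_feature([float(t) for t in ts]),
#       'label': _int64_feature(int(source)),
#   }))
#   writer = tf.python_io.TFRecordWriter(filename)  # tf.io.TFRecordWriter in TF2
#   writer.write(example.SerializeToString())
#   writer.close()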


def create_dataset(G, theta, trials, name, run=1, regular_degree=None):
    ''' Creates a dataset by spreading messages over graph G.
    Inputs:
        G                graph object
        theta            number of corrupt connections per node
        trials           number of spreading trials to simulate
        name             filename to write to
        run              the index of the dataset; only used if the dataset is
                         too large to fit in memory
        regular_degree   the degree of the tree, if the graph is a regular tree
    '''
    run_prefix = 'run' + str(run) + '_'
    filename = os.path.join('data', run_prefix + name)
    # filename = os.path.join('data/' + name + str(run))
    label_filename = os.path.join('data', run_prefix + name + '_labels')
    print('Writing', filename)
    # writer = tf.python_io.TFRecordWriter(filename)
    timestamps = []
    labels = []
    # start_timescale = 10
    num_nodes = nx.number_of_nodes(G)
    for trial in tqdm(range(trials)):
        # If the graph is a regular tree, restrict sources to interior nodes
        # (degree >= regular_degree); otherwise any node can be the source.
        if regular_degree is None:
            nodes = G.nodes()
        else:
            nodes = [n for n in G.nodes() if G.degree(n) >= regular_degree]
        source = random.choice(nodes)
        # Spread the message
        G.spread_message(source, num_corrupt_cnx=theta)
        # Normalize all the timestamp vectors to the first reporting time
        source_time = min(G.adversary_timestamps.values())
        ts = [t - source_time for t in G.adversary_timestamps.values()]
        timestamps += [ts]
        # labels += [[int(i == source) for i in range(num_nodes)]]
        labels += [source]
    with open(filename, 'wb') as f:
        np.save(f, np.array(timestamps))
    with open(label_filename, 'wb') as f:
        np.save(f, np.array(labels))
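

# Reading a generated dataset back (a sketch, not in the original script);
# file names follow the run_prefix scheme above, e.g. run 1 of 'train':
#
#   with open('data/run1_train', 'rb') as f:
#       timestamps = np.load(f)   # one row of normalized timestamps per trial
#   with open('data/run1_train_labels', 'rb') as f:
#       labels = np.load(f)       # true source node for each trial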


if __name__ == '__main__':
    theta = 1
    check_ml = True
    run = 1
    # filename = 'data/bitcoin.gexf'         # Bitcoin snapshot graph
    # filename = 'data/tree_4.gexf'          # regular tree of degree 4
    # filename = 'data/tree_5.gexf'          # regular tree of degree 5
    filename = 'data/random_regular.gexf'    # 100 node random regular graph
    if filename == 'data/tree_4.gexf':
        regular_degree = 4
    elif filename == 'data/tree_5.gexf':
        regular_degree = 5
    else:
        regular_degree = None
    args = parse_arguments()
    spreading_time = 20
    G = DataGraphDiffusion(filename, spreading_time=spreading_time)
    train_trials = args.trials  # We'll separate out the validation set later
    test_trials = train_trials // 10  # integer division: 10% as many test trials
    # Simulate the trials and write the results to .npy files.
    print('Creating training data')
    create_dataset(G, theta, train_trials, 'train', run=run, regular_degree=regular_degree)
    # print('Creating validation data')
    # create_dataset(G, theta, validation_trials, 'validation')
    print('Creating test data')
    create_dataset(G, theta, test_trials, 'test', run=run, regular_degree=regular_degree)
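
# Example invocation (assuming parse_arguments() in utils exposes a
# '--trials' flag; the flag name is a guess based on the use of args.trials):
#
#   python generate_data.py --trials 5000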