-
Notifications
You must be signed in to change notification settings - Fork 12
/
Copy path08buildcharts.py
138 lines (110 loc) · 4.32 KB
/
08buildcharts.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
#!/usr/bin/python
# input: histogram filename (JSON) and an output directory
# output: per-feature significance charts (PNG) and matching CSV files for home runs and strikeouts
import csv
import json
import os.path
import re
import sys
from common import UNK
def build_chart(feature_name, data, category_names, output_dir):
    """Render a two-series bar chart for one feature and save it as a PNG.

    feature_name   -- drawn as the chart title and used as the root of the
                      output file name
    data           -- the bar series handed to the chart; series 0 is filled
                      red and listed as "Home Run" in the legend, series 1 is
                      filled blue and listed as "Strikeout"
    category_names -- labels for the category (x) axis
    output_dir     -- directory the image is saved into

    The value axis is fixed to the range 0..2.
    """
    # reportlab is imported lazily so that merely importing this module does
    # not require it.
    from reportlab.graphics.shapes import Drawing, String
    from reportlab.graphics.charts.barcharts import VerticalBarChart
    from reportlab.graphics.charts.legends import Legend
    from reportlab.graphics.charts.textlabels import Label
    from reportlab.lib import colors

    # build chart and save it
    d = Drawing(800, 600)

    # Shapes are added in the same order as before (title, chart, legend,
    # axis labels) so the drawing order is unchanged.
    title = String(200, 180, feature_name)
    d.add(title, name='title')

    chart = VerticalBarChart()
    chart.width = d.width - 100
    chart.height = d.height - 75
    chart.x = 40
    chart.y = 40
    chart.data = data
    chart.categoryAxis.categoryNames = category_names
    chart.valueAxis.valueMin = 0
    chart.valueAxis.valueMax = 2
    chart.bars[0].fillColor = colors.red
    chart.bars[1].fillColor = colors.blue
    d.add(chart)

    # Centre the title horizontally near the top edge of the drawing.
    title.x = d.width / 2
    title.y = d.height - 30
    title.textAnchor = 'middle'
    title.fontSize = 24

    legend = Legend()
    d.add(legend, name='Legend')
    # Reuse the bar colours so the legend swatches always match the series.
    legend.colorNamePairs = [
        (chart.bars[i].fillColor, name)
        for i, name in enumerate(["Home Run", "Strikeout"])
    ]
    legend.fontName = 'Times-Roman'
    legend.fontSize = 16
    legend.x = d.width - 80
    legend.y = d.height - 25
    legend.dxTextSpace = 5
    legend.dy = 5
    legend.dx = 5
    legend.deltay = 5
    legend.alignment = 'right'

    x_label = Label()
    d.add(x_label, name='XLabel')
    x_label.fontName = 'Times-Roman'
    x_label.fontSize = 12
    x_label.x = chart.x + chart.width / 2
    x_label.y = 15
    x_label.textAnchor = 'middle'
    x_label._text = "Value"

    y_label = Label()
    d.add(y_label, name='YLabel')
    y_label.fontName = 'Times-Roman'
    y_label.fontSize = 12
    y_label.x = 10
    y_label.y = chart.y + chart.height / 2
    y_label.angle = 90  # rotated to read along the value axis
    y_label.textAnchor = 'middle'
    y_label._text = "Likelihood Index"

    d.save(fnRoot=os.path.join(output_dir, feature_name), formats=['png'])
def build_csv(feature_name, data, category_names, output_dir):
    """Write a feature's two data series to ``<output_dir>/<feature_name>.csv``.

    The file holds three rows: a header made of an empty cell followed by
    ``category_names``, an "HR" row holding ``data[0]`` and a "K" row holding
    ``data[1]``.  ``category_names``, ``data[0]`` and ``data[1]`` must be
    lists, since each is concatenated onto a one-element list.
    """
    csv_path = os.path.join(output_dir, feature_name + ".csv")
    # Close the file deterministically so every row is flushed to disk before
    # returning, even if writing a row raises.
    with open(csv_path, "w") as csv_file:
        out_csv = csv.writer(csv_file)
        out_csv.writerow([""] + category_names)
        out_csv.writerow(["HR"] + data[0])
        out_csv.writerow(["K"] + data[1])
def dump_feature(feature_name, value_dict, output_dir):
    """Compute the HR/K likelihood indexes for one feature and write them out.

    ``value_dict`` maps each feature value to a ``{label: count}`` dict.  For
    every value holding more than 0.05% of the feature's samples, the share
    of "HR" and of "K" samples within that value is divided by the label's
    share across the whole feature.  The resulting series are passed to
    ``build_chart`` and ``build_csv`` together with the value names.

    ``value_dict`` is modified in place: the ``UNK`` entry is removed, and so
    is the "[0-5)" entry when the feature is "game_temp".  If no samples are
    left after that, the feature is skipped with a message on stderr and
    nothing is written.

    Raises ``KeyError`` if "HR" or "K" never occurs anywhere in the feature,
    because the index is undefined without a baseline.
    """
    # drop insignificant features
    value_dict.pop(UNK, None)
    # HACK: apparently game_temp = 0 is the same as <UNK>
    if feature_name == "game_temp":
        value_dict.pop("[0-5)", None)

    # first pass, sum up the labels
    label_counts = {}
    for counts_by_label in value_dict.values():
        for label, count in counts_by_label.items():
            label_counts[label] = label_counts.get(label, 0) + count
    num_samples = sum(label_counts.values())
    if not num_samples:
        # Nothing to compare against: the baseline below would be undefined.
        sys.stderr.write(
            "skipping %s: no samples left to chart\n" % feature_name)
        return

    predictor_labels = ["HR", "K"]
    # float() keeps the division exact under Python 2 as well.
    baseline_likelihood = dict(
        (label, float(label_counts[label]) / num_samples)
        for label in predictor_labels
    )

    def keyfn(x):
        # gah, pesky numeric values: plain integers, "[lo-hi)" buckets
        # (ordered by lo) and open-ended "N+" values all sort numerically.
        numeric_text = x
        if x.startswith("[") and x.endswith(")"):
            bucket = re.match(r"\[(-?\d+)-\d+\)", x)
            if bucket:
                numeric_text = bucket.group(1)
        elif x.endswith("+"):
            numeric_text = x[:-1]
        # The leading tag puts numeric keys ahead of textual ones.  That is
        # the order Python 2 gave for mixed int/str keys, and it also works
        # on Python 3, where int and str cannot be compared directly.
        try:
            return (0, int(numeric_text))
        except ValueError:
            return (1, x)

    # (HR-index, K-index) for each significant value
    data = [[] for _ in predictor_labels]
    category_names = []
    # "significant" features have > 0.05% of the total number of samples
    min_samples = 0.0005 * num_samples
    for fval in sorted(value_dict, key=keyfn):
        counts_by_label = value_dict[fval]
        value_total = sum(counts_by_label.values())
        if value_total <= min_samples:
            continue
        category_names.append(fval)
        for series, label in zip(data, predictor_labels):
            # A label that never occurred for this value counts as zero.
            likelihood = float(counts_by_label.get(label, 0)) / value_total
            series.append(likelihood / baseline_likelihood[label])

    build_chart(feature_name, data, category_names, output_dir)
    build_csv(feature_name, data, category_names, output_dir)
def main():
    """Build the chart and CSV for every feature in a histogram file.

    Expects exactly two command-line arguments: the path of the histogram
    JSON file and the output directory.  With any other argument count the
    script exits with a usage message instead of an unpacking error.
    """
    args = sys.argv[1:]
    if len(args) != 2:
        sys.exit("usage: %s <histogram.json> <output_dir>" % sys.argv[0])
    histogram_fn, output_dir = args

    # Close the input file as soon as it has been parsed.
    with open(histogram_fn) as histogram_file:
        histogram = json.load(histogram_file)

    for feature, value_dict in histogram.items():
        dump_feature(feature, value_dict, output_dir)
# Run the chart build only when this file is executed as a script.
if __name__ == "__main__":
    main()