-
Notifications
You must be signed in to change notification settings - Fork 13
/
Copy pathsequentiality.py
executable file
·106 lines (83 loc) · 2.98 KB
/
sequentiality.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
import math
import sys
import matplotlib
matplotlib.use('Agg') # Avoid tkinter dependency
import matplotlib.pyplot as plt
from . import charset as cset
def string_sequentiality(string, charset, plot_scatterplot=False):
    """
    Computes how much a string contains sequence of consecutive or distance-fixed characters

    For every character, the absolute code-point distance to each of the
    preceding ``floor(ln(len(string)))`` characters is counted. The counts are
    normalised to frequencies and compared, as a mean squared error, with the
    distribution returned by ``cset.get_char_distance_distribution(charset)``.
    Only the distances present in the charset distribution contribute.

    :param string: the string
    :param charset: a charset as a string
    :param plot_scatterplot: optional boolean, if true plots a scatterplot
        of both distributions on the current pyplot figure
    :return: sequentiality index, 0 (low sequentiality) to 1 (high sequentiality);
        always 0 for strings of at most two characters
    :rtype: float
    """
    string_length = len(string)
    if string_length <= 2:
        return 0

    # The look-back window grows with the natural log of the length; it is at
    # least 1 here because string_length >= 3. int() keeps range() happy on
    # interpreters where math.floor returns a float.
    window_size = int(math.floor(math.log(string_length)))

    pair_count = 0
    buckets = {}
    for j in range(1, string_length):
        current = ord(string[j])
        for i in range(max(j - window_size, 0), j):
            distance = math.fabs(current - ord(string[i]))
            buckets[distance] = buckets.get(distance, 0) + 1
            pair_count += 1

    # Normalise the histogram. Dividing by a float forces true division, so
    # the frequencies are not truncated to 0 where "/" on two ints floors.
    total = float(pair_count)
    buckets = {distance: count / total for distance, count in buckets.items()}

    # Mean squared error against the charset distribution. Each term is
    # divided before being accumulated, so the summation order is unchanged.
    charset_buckets = cset.get_char_distance_distribution(charset)
    bucket_count = len(charset_buckets)
    mse = 0
    for distance, expected in charset_buckets.items():
        mse += (buckets.get(distance, 0) - expected) ** 2 / bucket_count

    if plot_scatterplot:
        subplot = plt.subplot(111)
        subplot.set_xlabel("Average distance from other characters")
        subplot.set_ylabel("% of chars at distance x from the others")

        handles = []
        series = (
            (charset_buckets, 'r', 'charset'),
            (buckets, 'g', 'string'),
        )
        for distribution, color, label in series:
            # An empty distribution has nothing to draw and would otherwise
            # leave a None entry in the legend handles.
            if not distribution:
                continue
            xs = list(distribution.keys())
            ys = [frequency * 100 for frequency in distribution.values()]
            handles.append(
                plt.scatter(xs, ys, alpha=0.6, color=color, label=label))
        if handles:
            plt.legend(handles=handles)
        plt.show()

    return mse
def weighted_sequentiality(string, charset):
    """
    Scale the sequentiality of a string by the length of the string, so that
    a long run such as ABCDEFGHIJKLMNO counts for more than a short one
    such as ABC.

    :param string: the string to score
    :param charset: charset passed through unchanged to string_sequentiality
    :return: string_sequentiality(string, charset) multiplied by len(string)
    """
    score = string_sequentiality(string, charset)
    length = len(string)
    return score * length
def multiple_string_sequentiality(item_charset_dict):
    """
    Calculates the sequentiality for a collection of strings

    :param item_charset_dict: either a dict mapping string -> charset, or an
        iterable of hashable (string, charset) pairs
    :return: dictionary of sequentiality values, keyed by the string for dict
        input and by the (string, charset) pair itself for pair input
    :rtype: dict
    """
    if isinstance(item_charset_dict, dict):
        # Iterating a dict yields only its keys; indexing a key with [0] and
        # [1] would pick the first two characters of the string instead of
        # the string and its charset. Walk the items explicitly.
        return {
            string: string_sequentiality(string, charset)
            for string, charset in item_charset_dict.items()
        }

    # Pair input: the pair itself stays the key of the result.
    return {
        item: string_sequentiality(item[0], item[1])
        for item in item_charset_dict
    }
def main(argv):
    """
    Command line entry point: print the sequentiality index of argv[1].

    :param argv: argument vector; expected to hold exactly the program name
        followed by the string to analyse
    :return: None
    """
    # Guard clause: any other argument count only gets the usage line.
    if len(argv) != 2:
        print("Usage: python {0} string_to_be_computed".format(argv[0]))
        return

    target = argv[1]
    charset = cset.get_narrower_charset(target)
    # The third argument switches on the scatterplot.
    index = string_sequentiality(target, charset, True)
    print("Sequentiality index: {0}".format(index))


if __name__ == '__main__':
    main(sys.argv)