-
Notifications
You must be signed in to change notification settings - Fork 3
/
Copy pathprepare.py
105 lines (92 loc) · 5.98 KB
/
prepare.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
import random
import json
import glob
import pickle
from collections import Counter
# this freezes my computer. let's load 1 at a time instead
# data = list()
# for file in glob.glob("TinyStories_all_data/*"):
# with open(file) as o:
# # print(type(json.load(o)))
# data.extend(json.load(o))
thrown_out = 0  # running total of stories rejected by process_story()

# Typographic characters mapped to plain-ASCII equivalents, applied in order.
# The left curly single quote (U+2018) is deliberately NOT mapped: stories
# containing it are meant to be culled by the non-ASCII check.
_ASCII_REPLACEMENTS = (
    ('\u2013', '-'),        # en dash
    (' \u2014 ', ' - '),    # spaced em dash; must run before the bare em dash
    ('\u2014', ' - '),      # bare em dash
    ('\u2026', '...'),      # horizontal ellipsis
    ('\u201c', '"'),        # left curly double quote
    ('\u201d', '"'),        # right curly double quote
    ('\u2019', '\''),       # right curly single quote / apostrophe
)

# Printable-ASCII characters whose presence rejects a story outright.
# Notes on why individual characters are culled:
#   ` is usually used as strange punctuation, or as `` ''
#   / is for Tom/Lily stories, "he/she"
#   * is usually in *emphasis*, but often incorrect, or in *** separating parts of a story
#   & is as an abbrev, or ’s
#   ~ is rare and not used well
#   # is wrong: hashtags, and rarely as numbers
#   % has one mistake and the rest are ok; cleaned anyway
#   [ is used wrong
#   _ is used poorly
#   = is sometimes used for addition but more often for mistakes
#   + is for addition, abbreviation, and A+, but has some mistakes
#   ( ) is about 80% correctly used
#   offenders seen in the data include <i>, <End of Story>, <|im_start|>
# $ is intentionally not listed: it is usually used correctly.
_BANNED_CHARS = frozenset('|</`*=_&@~#%[]+()')

# Many inputs are empty stories or story fragments; some legitimate stories
# below 200 characters are lost to this cutoff as well.
_MIN_STORY_LENGTH = 100

# A surviving story has to end on one of these characters.
_VALID_ENDINGS = ('.', '!', '"', '?')


def process_story(par):
    """Normalize one story; return it, or "" if it fails a quality filter.

    Normalization strips surrounding whitespace, removes backslashes and
    rewrites the characters in _ASCII_REPLACEMENTS to ASCII.

    The story is rejected when, after normalization, it
      * contains any character other than newline outside code points 32..127,
      * contains any character from _BANNED_CHARS,
      * is shorter than _MIN_STORY_LENGTH characters, or
      * does not end with one of _VALID_ENDINGS.

    Args:
        par: raw story text.

    Returns:
        The normalized story, or "" when it was rejected. Every rejection
        also increments the module-level ``thrown_out`` counter by one.
    """
    global thrown_out
    par = par.strip()
    par = par.replace('\\', '')
    # NOTE(review): as written this swaps a single space for a single space,
    # i.e. it does nothing. It may originally have collapsed double spaces or
    # replaced a non-breaking space -- confirm against the upstream file.
    par = par.replace(' ', ' ')
    for old, new in _ASCII_REPLACEMENTS:
        par = par.replace(old, new)
    # in the future, we might try replacing 160 and 180. nah, it doesn't
    # matter: only 1282 paragraphs are thrown out for it.
    # Known data quirks that are NOT filtered: grammar mistakes such as
    # missing commas, double punctuation like ?". and "\ " (a space after a \).

    rejected = (
        # anything but newline outside 32..127 (control chars, non-ASCII)
        any(ch != '\n' and not 32 <= ord(ch) <= 127 for ch in par)
        or not _BANNED_CHARS.isdisjoint(par)
        # the length check also guards endswith() against the empty string
        or len(par) < _MIN_STORY_LENGTH
        or not par.endswith(_VALID_ENDINGS)
    )
    if rejected:
        thrown_out += 1
        return ""
    return par
total_data = []  # every story that survived cleaning, across all input files


def clean(d):
    """Run the GPT-4 stories of one loaded file through process_story.

    Args:
        d: iterable of records, each indexable by 'source' and 'story'.

    Survivors (non-empty results of process_story) are appended to the
    module-level ``total_data`` list, in input order. Returns None.
    """
    # the generators present are only GPT-3.5 and GPT-4 (no mistakes there);
    # only the GPT-4 stories are kept
    survivors = []
    for record in d:
        if record['source'] != 'GPT-4':
            continue
        story = process_story(record['story'])
        if story != '':  # '' marks a story that was thrown out
            survivors.append(story)
    total_data.extend(survivors)
# Load and clean one file at a time; holding every raw file in memory at once
# froze the machine.
# for file in ["TinyStories_all_data/data00.json"]:
# sorted(): glob returns paths in arbitrary order, sorting keeps the output
# list reproducible from run to run.
for file in sorted(glob.glob("TinyStories_all_data/*")):
    # JSON text is UTF-8; do not depend on the platform's default encoding,
    # the stories contain plenty of non-ASCII characters.
    with open(file, encoding="utf-8") as o:
        # print(type(json.load(o)))
        data = json.load(o)
    clean(data)
# summary once every file has been processed
print("thrown out:", thrown_out)
print("survived:", len(total_data))
# print("uniques", len(set(total_data)))
# print("duplicates", [item for item, count in Counter(total_data).items() if count > 1]) # no duplicates
# after some removals: Counter({' ': 426859449, 'e': 204912889, 'a': 150822107, 't': 139517558, 'o': 112349865, 'h': 105934005, 'n': 100530370, 'd': 95421243, 'i': 94879636, 's': 84200324, 'r': 75921863, 'l': 67910605, 'y': 51200735, 'm': 44606524, '.': 44303626, 'w': 43890889, 'u': 41336733, 'p': 33429250, 'g': 31785986, 'c': 30360697, 'b': 26834430, 'f': 24930515, ',': 23967379, 'k': 20172791, 'T': 20115680, '"': 12313202, 'v': 11770115, '\n': 9128807, 'S': 8571627, 'H': 6240920, 'I': 5084215, 'O': 4663978, "'": 4138731, 'L': 4092379, '!': 3870671, 'B': 3417017, 'x': 3397591, 'M': 3275768, 'A': 2511284, 'W': 1795287, 'j': 1644565, '?': 1400451, 'Y': 1313533, 'z': 1015289, 'J': 906218, 'F': 825037, 'D': 770337, 'C': 609437, 'q': 596899, 'N': 483349, 'E': 465002, 'K': 337323, 'P': 301036, 'G': 252938, 'R': 226506, '-': 193971, ':': 119893, 'Z': 67185, 'V': 42153, 'U': 31080, '3': 24147, ';': 11440, 'Q': 9834, 'X': 3447, '1': 3416, '0': 2392, '2': 1941, '5': 1672, '4': 770, '/': 472, '9': 433, '`': 366, '8': 356, '6': 317, '7': 273, ')': 199, '(': 194, '$': 155, '_': 102, '*': 88, '&': 44, '=': 43, '+': 27, '[': 18, ']': 18, '%': 13, '#': 9, '~': 5, '@': 2})
# c = Counter()
# for s in total_data:
# c += Counter(s)
# print(c)
# all_chars = set()
# for i in t:
# all_chars = all_chars.union(set(i))
# print("all chars", all_chars)
# all chars {'ó', '\xad', '\t', '❤', '¡', '(', '€', 'r', 'V', '把', 'T', '自', '\u3000', 'Z', '在', '恩', '\u2005', ';', '"', 'ā', '‘', '\u202a', 'p', 'â', '\u2009', '。', ')', '應', '了', '會', 'S', 'h', 'D', 'u', '兒', '剛', '5', 'd', '\ue000', 'İ', '/', 'ᴇ', '9', '¢', '’', '‑', 'è', '當', '\uf04a', 'É', '給', 'N', 'j', '米', '_', '\\', '…', '~', '‐', '―', '獨', '\u200e', 'ᴜ', '度', 'n', 'P', '�', 'a', 'O', 'ñ', '—', 'f', '🎓', '>', 'l', 'Y', 'ᴀ', 's', '\xa0', 'í', '\x92', 'G', '童', 'ғ', 'c', 'E', '7', '£', 'W', '兩', '\u2028', '!', '很', '興', '−', '留', 'á', 'K', 'ᴢ', 'F', '8', '們', '保', 'ʏ', '´', 'J', '但', '又', 'I', '奮', '$', '🤩', '4', '過', '她', '\u200c', '+', 'L', 'o', '3', '是', '🌴', '️', '個', '=', '0', 'b', 'ᴅ', 'q', '己', '·', '天', '#', 'ᴏ', 'A', '1', '他', '*', '莉', '─', '艾', 'R', 'ᴡ', '}', '難', 'U', 'X', '[', '和', '™', '»', ' ', '裡', 'y', 'à', ']', 'C', '?', '些', '整', '¿', '6', ':', '🍌', 'ᴄ', 'ᴛ', 'x', 'ɪ', 'm', 'g', '田', 'e', 'M', '„', ',', '\u200a', '°', '<', '\u2029', '{', '.', '”', '`', 'ö', '的', 'ß', "'", 'Q', 'ú', '%', '分', 'v', '–', 'і', 'ê', 'B', '“', '\ufeff', 'ʙ', 'i', 'z', '«', '@', '-', '一', 'ï', 't', 'w', '巴', '到', '玉', 'k', 'œ', ',', '‚', 'H', 'é', 'ʜ', '&', '§', '2', '\u200b', '這', '答', '高', '|', '時'}
# print("all chars used:", set(''.join(t))) # maybe this caused my computer to crash?
# random.choice() raises an opaque IndexError on an empty list; stop with a
# clear message instead, still before any existing output file is overwritten.
if not total_data:
    raise SystemExit("no stories survived cleaning; is TinyStories_all_data/ present?")
# print one random survivor as a quick sanity check of the cleaning
print(random.choice(total_data))
# persist the cleaned stories as a pickled list of strings
with open("cleaned_data.pkl", "wb") as file:
    pickle.dump(total_data, file)