# ex13HW.py
# Exercise 13.1. Write a program that reads a file, breaks each line into words,
# strips whitespace and punctuation from the words, and converts them to
# lowercase.
import string

def clean_and_return(line):
    """Lowercase a line, strip punctuation and digits, and split it into words."""
    line = line.lower()
    # delete every punctuation and digit character in one pass
    table = str.maketrans('', '', string.punctuation + string.digits)
    return line.translate(table).split()
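# A quick sanity check of clean_and_return (the sample line is just an
# illustration, not from any particular book):
# >>> clean_and_return("Hello, World! It's 1815.")
# ['hello', 'world', 'its']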
# Exercise 13.2. Go to Project Gutenberg (gutenberg.org) and download your
# favorite out-of-copyright book in plain text format.
# Modify your program from the previous exercise to read the book you
# downloaded, skip over the header information at the beginning of the file,
# and process the rest of the words as before.
# Then modify the program to count the total number of words in the book, and
# the number of times each word is used.
# Print the number of different words used in the book.
# Compare different books by different authors, written in different eras.
# Which author uses the most extensive vocabulary?
def analyzefile(name):
    """Build a histogram mapping each word in the file to its count."""
    d = {}
    with open('./' + name, 'r') as f:
        # skip past the Project Gutenberg header
        # (572 lines in the particular book this was written against)
        for i in range(572):
            f.readline()
        for line in f:
            for word in clean_and_return(line):
                d[word] = d.get(word, 0) + 1
    return d
def find_biggest(d):
    """Print and return the most frequent word in the histogram."""
    biggest = max(d, key=d.get)
    print(biggest, d[biggest])
    return biggest
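# Usage sketch for comparing vocabularies (the file name is an assumption;
# use whichever Gutenberg texts you downloaded):
# hist = analyzefile('emma.txt')
# print(sum(hist.values()))   # total number of words in the book
# print(len(hist))            # number of different words used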
# Exercise 13.3. Modify the program from the previous exercise to print the 20
# most frequently-used words in the book.
def top20(hist):
    """Print the 20 most frequent words, leaving the original dict intact."""
    d = dict(hist)      # work on a copy so the caller's histogram survives
    for i in range(20):
        b = find_biggest(d)
        del d[b]        # remove the winner so the next pass finds the runner-up
# Exercise 13.4. Modify the previous program to read a word list and then
# print all the words in the book that are not in the word list. How many of
# them are typos? How many of them are common words that should be in the word
# list, and how many of them are really obscure?
def not_english(bookdic, englishdic):
    """Print every word from the book that is missing from the word list."""
    import time
    for x in bookdic:
        if x not in englishdic:
            print(x)
            time.sleep(1)   # pause so the words can be read as they scroll by
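# A minimal helper for reading the word list, assuming a file named
# 'words.txt' with one word per line (both the name and the format are
# assumptions); a set makes the membership tests above fast.
def load_wordlist(name='words.txt'):
    """Read a word list into a set for fast membership tests."""
    with open(name, 'r') as f:
        return set(line.strip() for line in f)
# e.g. not_english(analyzefile('emma.txt'), load_wordlist())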
# Exercise 13.5. Write a function named choose_from_hist that takes a histogram
# and returns a random value from the histogram, chosen with probability in
# proportion to frequency.
# This assumes you have a dict histogram of 't'; otherwise it's simply
# random.choice(t).
def choose_from_hist(d):
    """Return a random key, weighted in proportion to its count."""
    import random
    pool = []
    for x in d:
        pool.extend([x] * d[x])   # each word appears once per occurrence
    return random.choice(pool)
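# A quick check that the weighting behaves as expected (the toy histogram is
# just an illustration): 'b' should come up roughly twice as often as 'a'.
# >>> from collections import Counter
# >>> Counter(choose_from_hist({'a': 1, 'b': 2}) for i in range(3000))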
# Exercise 13.7. This algorithm works, but it is not very efficient; each time
# you choose a random word, it rebuilds the list, which is as big as the original
# book. An obvious improvement is to build the list once and then make multiple
# selections, but the list is still big.
# Write a program that uses this algorithm to choose a random word from the book.
def random_word(d):
    """Choose a word weighted by frequency without building the huge
    repeated list from Exercise 13.5: draw a random position into the
    total count, then walk the histogram until it is used up."""
    import random
    if isinstance(d, list):          # also accept a plain word list
        hist = {}
        for x in d:
            hist[x] = hist.get(x, 0) + 1
        d = hist
    target = random.randint(1, sum(d.values()))
    for x in d:
        target -= d[x]
        if target <= 0:
            return x
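# One possible further speedup (not part of the original solution):
# precompute the cumulative counts once, then each pick is a binary search
# instead of a linear scan over the whole histogram.
def random_word_fast(d):
    """Choose a weighted random word via cumulative counts and bisect."""
    import bisect
    import random
    words = list(d)
    cumulative = []
    total = 0
    for w in words:
        total += d[w]
        cumulative.append(total)   # strictly increasing running totals
    i = bisect.bisect_left(cumulative, random.randint(1, total))
    return words[i]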
# Exercise 13.8. Markov analysis:
# 1. Write a program to read a text from a file and perform Markov analysis.
def markov(name):
    """Map every two-word prefix in the text to the list of two-word
    suffixes that follow it."""
    with open('./' + name, 'r') as f:
        words = clean_and_return(f.read())
    d = {}
    for i in range(len(words) - 3):
        prefix = (words[i], words[i + 1])
        d.setdefault(prefix, []).append([words[i + 2], words[i + 3]])
    return d
# 2. Add a function to the previous program to generate random text based on
# the Markov analysis.
def write(tup, d):
    """Print random text by repeatedly following the Markov map from tup."""
    import random
    import time
    if tup not in d:       # dead end: this prefix never occurred in the text
        return
    nxt = random.choice(d[tup])
    print(nxt)
    time.sleep(1)          # slow the output down so it can be read
    write(tuple(nxt), d)   # note: recursion depth caps the output length
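# Usage sketch (the file name is an assumption; any prefix that occurs in
# the map works as a seed, and write() stops when it reaches a prefix with
# no recorded suffixes):
# d = markov('emma.txt')
# write(next(iter(d)), d)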
# Exercise 13.9.
# The 'rank' of a word is its position in a list of words sorted by frequency:
# the most common word has rank 1, the second most common has rank 2, etc.
# Zipf's law describes a relationship between the ranks and frequencies of
# words in natural languages (http://en.wikipedia.org/wiki/Zipf's_law).
# Specifically, it predicts that the frequency f of the word with rank r is:
# f = c * r**(-s)
# where s and c are parameters that depend on the language and the text.
# If you take the logarithm of both sides of this equation, you get:
# log f = log c - s log r
# So if you plot log f versus log r, you should get a straight line with
# slope -s and intercept log c.
# Write a program that reads a text from a file, counts word frequencies, and
# prints one line for each word, in descending order of frequency, with log f
# and log r. Use the graphing program of your choice to plot the results and
# check whether they form a straight line. Can you estimate the value of s?
def sortdic(d):
    """Return (log r, log f) lists for the words in d, sorted by descending
    frequency, ready for plotting."""
    import math
    ls = sorted(d.items(), key=lambda x: x[1], reverse=True)
    logf = []
    logr = []
    for i in range(len(ls)):
        logf.append(math.log(ls[i][1]))
        logr.append(math.log(i + 1))
    return (logr, logf)
# An earlier, cruder estimate of s from alternating sums of the log values:
#     if i%2==0:
#         logf+=math.log(ls[i][1])
#         logr-=math.log(i+1)
#     else:
#         logf-=math.log(ls[i][1])
#         logr+=math.log(i+1)
#     return (logf,logr)   # given this, s = 0.8071824484324789
def findk(d):
    """Estimate the intercept log c by averaging log f + s*log r over the
    ranked words, using the value of s estimated above."""
    import math
    ls = sorted(d.items(), key=lambda x: x[1], reverse=True)
    s = 0.8071824484324789
    k = 0
    for i in range(len(ls) - 1):
        k += math.log(ls[i][1]) + s * math.log(i + 1)
    return k / (len(ls) - 1)   # k = 7.268894074998243 for the book used here
# Fitted line: log f = 7.268894074998243 - 0.8071824484324789 * log r
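# A plotting sketch for eyeballing Zipf's law, assuming matplotlib is
# available (the library choice is mine, not part of the assignment):
def plot_zipf(d):
    """Plot log f versus log r; a straight line of slope -s supports Zipf's law."""
    import matplotlib.pyplot as plt
    logr, logf = sortdic(d)
    plt.plot(logr, logf)
    plt.xlabel('log r (rank)')
    plt.ylabel('log f (frequency)')
    plt.show()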