forked from rahgoar/DataScience_OttawaU_2019
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy path: corpus_work1.py
33 lines (22 loc) · 851 Bytes
/
corpus_work1.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
"""Tokenize Jane Austen's "Emma" from the NLTK Gutenberg corpus and print
basic statistics: total token count, vocabulary size, and how often the
name 'Emma' occurs (as a substring, as a token, and via nltk.Text)."""
import nltk
from nltk import regexp_tokenize
from nltk.corpus import gutenberg  # Docs from project gutenberg.org

# nltk.download('gutenberg')  # uncomment on first run to fetch the corpus data

files_en = gutenberg.fileids()  # All Gutenberg corpus file ids (kept for exploration)
# gutenberg.raw() returns the full document text without leaving a file
# handle open (the original open().read() never closed the stream).
doc_en = gutenberg.raw('austen-emma.txt')

# Verbose-mode token pattern: abbreviations like "U.S.A.", hyphenated words,
# currency/percent numbers, ellipses, and single punctuation characters.
# FIX: the original class ended in `:-_`, a character RANGE from ':' to '_'
# that accidentally matched <, =, >, @, \, ^ and more; the '-' is moved to
# the end of the class so it matches a literal hyphen instead.
pattern = r'''(?x) ([A-Z]\.)+ | \w+(-\w+)* | \$?\d+(\.\d+)?%? | \.\.\. | [][.,;"'?():_`-]'''
tokens_en = regexp_tokenize(doc_en, pattern)

en = nltk.Text(tokens_en)
print(len(en.tokens))        # document length in tokens
print(len(set(en.tokens)))   # vocabulary size (unique tokens)
en.vocab()                   # builds the frequency distribution (result unused here)
# en.plot(50)

print(doc_en.count('Emma'))     # raw substring count over the whole text
print(tokens_en.count('Emma'))  # occurrences as a standalone token
print(en.count('Emma'))         # same count via the nltk.Text wrapper
# en.dispersion_plot(['Emma', 'Frank', 'Jane'])
# en.concordance('Emma', lines=5)
# Find similar words:
# en.similar('Emma')
# en.similar('Frank')
# en.collocations()