main_spider.py
import re
import string

import scrapy
import spacy
from bs4 import BeautifulSoup
class MainSpider(scrapy.Spider):
    name = "scraper"

    # Load the spaCy model once when the spider class is created,
    # rather than on every parsed page.
    nlp = spacy.load('en_core_web_sm')

    def start_requests(self):
        urls = [
            'http://blog.rei.com/camp/how-to-introduce-your-indoorsy-friend-to-the-outdoors/',
            'http://www.cnn.com/2013/06/10/politics/edward-snowden-profile/',
            'http://www.amazon.com/Cuisinart-CPT-122-Compact-2-SliceToaster/dp/B009GQ034C/ref=sr_1_1?s=kitchen&ie=UTF8&qid=1431620315&sr=1-1&keywords=toaster'
        ]
        for url in urls:
            yield scrapy.Request(url=url, callback=self.parse)

    def parse(self, response):
        # Name output files after the domain, e.g. "blog.rei.com.txt".
        page = response.url.split("/")[2]
        filename = '%s.txt' % page
        soup = BeautifulSoup(response.text, 'lxml')

        # Collect headline text (h1-h6) as sentence-terminated fragments.
        headers = soup.find_all(re.compile('^h[1-6]$'))
        headers_text = ''
        for headline in headers:
            headers_text += headline.text.strip() + '. '

        # Remove extraneous elements: scripts, styles, links, images,
        # videos, ads and form widgets.
        for element in soup(["script", "style", "a", "span", "i", "input", "textarea", "img"]):
            element.decompose()  # rip it out

        # Get the visible text and clean it up.
        text = soup.get_text()
        lines = (line.strip() for line in text.splitlines())
        # Break lines holding multiple headlines (separated by runs of
        # spaces) into one phrase per line.
        chunks = (phrase.strip() for line in lines for phrase in line.split("  "))
        # Drop blank lines.
        text = '\n'.join(chunk for chunk in chunks if chunk)

        doc = self.nlp(text)

        # Build a key-topic frequency table using lemmatization and the
        # dependency parse: nominal subjects and proper nouns count extra.
        output_list = {}
        for token in doc:
            if token.dep_ == 'nsubj' or token.tag_ == 'NNP':
                if (token.text not in string.punctuation
                        and token.tag_ not in ('PRP', 'DT', 'WP')
                        and token.text.isalpha()):
                    in_text = token.text
                    weight = 1
                    if token.tag_ == 'NN':
                        in_text = token.lemma_
                    if token.dep_ == 'nsubj':
                        weight += 1
                    if token.tag_ == 'NNP':
                        weight += 1
                    if in_text not in output_list:
                        output_list[in_text] = 0
                    output_list[in_text] += weight

        # Sort by weight and keep the top tenth of topics (at least five).
        final_list = sorted(output_list, key=output_list.__getitem__, reverse=True)
        tags = final_list[:max(len(final_list) // 10, 5)]
        self.log('Tags for %s: %s' % (page, tags))

        # Save the scraped text to "domainname.txt" and the tags to
        # "domainname_tags.txt".
        with open(filename, 'w') as f:
            f.write(text)
        tagfile = '%s_tags.txt' % page
        with open(tagfile, 'w') as f:
            f.write(','.join(tags))
        self.log('Saved file %s' % filename)
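
# Usage sketch (assuming the dependencies below are installed; package and
# model names are the standard ones, not pinned by this file):
#
#   pip install scrapy spacy beautifulsoup4 lxml
#   python -m spacy download en_core_web_sm
#   scrapy runspider main_spider.py
#
# Each crawled page should then produce "domain.txt" with the cleaned text
# and "domain_tags.txt" with the comma-separated top topics.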