# json_from_hackernews.py — download comments to articles from HackerNews.
import argparse
from datetime import datetime
import json
import requests
from bs4 import BeautifulSoup
def _build_arg_parser():
    """Build the CLI parser: one optional flag selecting the output path."""
    cli = argparse.ArgumentParser(
        description="download comments to articles from HackerNews")
    cli.add_argument(
        "-output_file",
        default="hackernews_utterances.jsons",
        type=str,
        help="the one-JSON-per-line serialized file")
    return cli


# Parsed command-line options for the rest of the script.
args = _build_arg_parser().parse_args()
# Fetch the HackerNews homepage and collect every comment-thread URL
# ("item?id=..." hrefs). A set deduplicates repeated links to the same story.
response = requests.request("GET", "https://news.ycombinator.com/")
soup = BeautifulSoup(response.text, 'html.parser')
article_comment_links = set()
for link in soup.find_all('a'):
    # Bug fix: <a> tags without an href attribute make .get('href') return
    # None, and the original crashed with AttributeError on .startswith().
    href = link.get('href') or ''
    if href.startswith('item?id='):
        article_comment_links.add(
            "https://news.ycombinator.com/" + href)
print(
    'found {0} articles from the HackerNews homepage'.format(
        len(article_comment_links)))
# Walk each article's comment page and append every comment to the output
# file as one JSON object per line.
comment_count = 0
for article_link in article_comment_links:
    print('examining ' + article_link)
    response = requests.request("GET", article_link)
    soup = BeautifulSoup(response.text, 'html.parser')
    # Resolve the story's external URL; if several "storylink" anchors
    # exist the last one wins (matches the original behavior).
    story_url = ''
    for story_link in soup.find_all("a", class_="storylink"):
        print('story:', story_link.get('href'))
        story_url = story_link.get('href')
    # "a+" appends, so re-running the script accumulates into the same file.
    # The with-statement already closes the file; the original's trailing
    # writer.close() was redundant and has been removed.
    with open(args.output_file, "a+") as writer:
        for comment_box in soup.find_all("td", class_="default"):
            print(comment_box.contents)
            try:
                comment = comment_box.contents[2]
                # Flatten newlines, then strip the trailing "reply" link text.
                clean_comment = comment.get_text().replace(
                    '\n', ' ').rsplit(
                    'reply', maxsplit=1)[0].strip()
                # NOTE(review): these hard-coded child indexes assume HN's
                # exact comment markup; boxes that do not match (e.g.
                # collapsed or deleted comments) are skipped instead of
                # crashing the whole scrape, as the original did.
                username = comment_box.contents[0].contents[0].contents[1].get_text()
                # relative time can later be processed with https://github.com/comtravo/ctparse
                relative_time = comment_box.contents[0].contents[0].contents[3].get_text()
            except (IndexError, AttributeError):
                continue
            writer.write(json.dumps({"text": clean_comment,
                                     "scrape_timestamp": datetime.now().isoformat()[:19] + 'Z',
                                     "relative_time": relative_time,
                                     "source": article_link,
                                     "username": username,
                                     "story_url": story_url}))
            writer.write('\n')
            print('-----')
            comment_count += 1
# Final summary. Bug fix: the original passed len(article_link) -- the
# character count of the *last* URL string -- where the number of article
# links, len(article_comment_links), was clearly intended.
print(
    'Done! Processed {0} comments from {1} article links'.format(
        comment_count,
        len(article_comment_links)))