-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathbillboard_scrapper.py
116 lines (96 loc) · 3.9 KB
/
billboard_scrapper.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
import requests
from bs4 import BeautifulSoup
class Billboard_Entry:
    '''
    One song entry taken from a billboard chart.
    Stores the rank, title and artist it was constructed with.
    Its string form is "[rank] title by artist".
    '''

    def __init__(self, rank, title, artist):
        self.rank, self.title, self.artist = rank, title, artist

    def __str__(self):
        return f'[{self.rank}] {self.title} by {self.artist}'

    def pretty_print(self):
        '''
        Returns (does not print) a shortened "title by artist" string in which
        the artist is cut down to the text before its first space, e.g. for
        use as a search query on a webpage.
        '''
        # Everything before the first space; the whole string if there is none.
        leading_word, _, _ = self.artist.partition(' ')
        return f'{self.title} by {leading_word}'
class Billboard:
    '''
    Billboard object containing all the billboard entries.

    entries    -- the sequence of entries passed to the constructor.
    chart_name -- the first element of entries, or None when entries is empty.
    '''

    def __init__(self, entries):
        self.entries = entries
        # A scrape that matched nothing yields an empty list; indexing it
        # unconditionally would raise IndexError before anything is printed.
        self.chart_name = entries[0] if entries else None

    def __str__(self):
        # One line per entry, each terminated by a newline (empty string when
        # there are no entries).
        return ''.join(str(entry) + '\n' for entry in self.entries)
class Billboard_Error(Exception):
    '''
    Exception type signalling that an entry could not be parsed properly.
    '''
def fetch_billboard(chart='hot-100'):
    '''
    Fetches a billboard chart from https://www.billboard.com.

    chart is the chart's slug, i.e. the part of the URL that follows
    '/charts/'. Defaults to 'hot-100'.

    Returns a list of Billboard_Entry objects in the order they appear on the
    page. The chart title is not part of the returned list.

    Raises Billboard_Error when an entry found on the page cannot be parsed.
    Errors raised by requests (including timeouts) propagate unchanged.

    TODO:
    The Hot-100 page seems to have changed and therefore turned 'unscrapable'. Needs to be updated.
    '''
    url = 'https://www.billboard.com/charts/' + chart
    # Without a timeout requests may wait indefinitely on a stalled server.
    response = requests.get(url,
                            headers={"Accept": "text/html"},
                            timeout=30)

    # Parses HTML string from billboard.com with bs4
    parsed_html = BeautifulSoup(response.text, 'html.parser')

    # Failures expected while pulling fields out of an entry: missing element
    # or attribute, non-numeric rank, unexpected attribute type.
    parse_errors = (AttributeError, IndexError, KeyError, TypeError, ValueError)

    # Collects every entry object created from the info fetched
    entries = []

    if chart == 'hot-100':
        # Main 100-Billboard chart has a different DOM structure than the rest:
        # rank, title and artist live in three parallel lists of elements.
        ranks = parsed_html.find_all(class_='chart-element__rank__number')
        titles = parsed_html.find_all(class_='chart-element__information__song')
        artists = parsed_html.find_all(class_='chart-element__information__artist')
        elements = parsed_html.find_all(class_='chart-list__element')
        for i, element in enumerate(elements):
            try:
                rank = int(ranks[i].get_text().strip())
                title = titles[i].get_text().strip()
                artist = artists[i].get_text().strip() or 'Unknown'
            except parse_errors as err:
                # Report the offending element itself so the failure is traceable.
                message = "Failed to parse entry: " + str(element)
                raise Billboard_Error(message) from err
            entries.append(Billboard_Entry(rank, title, artist))
    else:
        # All other charts are seemingly structured the same: each item carries
        # its data in data-* attributes.
        for item in parsed_html.find_all(class_='chart-list-item'):
            try:
                rank = int(item['data-rank'].strip())
                title = item['data-title'].strip()
                artist = item['data-artist'].strip() or 'Unknown'
            except parse_errors as err:
                message = "Failed to parse entry"
                raise Billboard_Error(message) from err
            entries.append(Billboard_Entry(rank, title, artist))

    return entries
if __name__ == '__main__':
    # Script entry point: fetch the default chart and print its entries.
    print(Billboard(fetch_billboard()))