-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy path: scraping.py
89 lines (69 loc) · 3.28 KB
/
scraping.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
from random import randint
from time import sleep
from time import time
from warnings import warn

from bs4 import BeautifulSoup
from IPython.display import clear_output
from requests import get

import pandas as pd
# Result-page indexes ("1".."4") used for URL pagination (4 pages per year).
pages = [str(i) for i in range(1, 5)]
# Search movies released between 2000 and 2017 (range end is exclusive).
years_url = [str(i) for i in range(2000, 2018)]

# Accumulators for the scraped fields -- one entry per movie kept.
names = []
years = []
imdb_ratings = []
meta_scores = []
votes = []

start_time = time()  # reference point for the request-rate monitor
request_count = 0    # number of HTTP requests issued so far

# For every year between 2000 and 2017 ...
for year_url in years_url:
    # ... and every result page between 1 and 4.
    for page in pages:
        # HTTP GET against the IMDB advanced-search endpoint.
        response = get('https://www.imdb.com/search/title/?release_date='
                       + year_url + '&sort=num_votes,desc&page=' + page)
        # Pause 8-15 seconds so we do not hammer the server.
        sleep(randint(8, 15))

        # FIX: the original incremented `request`, which was never defined
        # (the counter was declared as `requests`) -> NameError at runtime.
        request_count += 1
        sleep(randint(1, 3))  # extra 1-3 s jitter between requests
        elapsed_time = time() - start_time  # seconds since the first request
        print("Requests{}; Frequency{} requests/s".format(
            request_count, request_count / elapsed_time))
        clear_output(wait=True)

        # A status code other than 200 means the request failed or was
        # refused -- emit a warning so the run can be diagnosed.
        if response.status_code != 200:
            # FIX: `warn` was used without ever being imported
            # (now provided by `from warnings import warn`).
            warn('Request:{}; status_code:{}'.format(
                request_count, response.status_code))

        # FIX: parse and extract INSIDE the loop. In the original layout the
        # parsing ran only after both loops finished, so data was extracted
        # from the last response alone and every other page was discarded.
        html_soup = BeautifulSoup(response.text, 'html.parser')
        movies_container = html_soup.find_all(
            'div', class_="lister-item mode-advanced")

        # Extract the fields of interest from each movie container.
        for container in movies_container:
            # Only movies that actually carry a Metascore are kept, so the
            # five lists stay aligned row-for-row.
            if container.find('div', class_="ratings-metascore") is not None:
                # Movie title.
                names.append(container.h3.a.text)
                # Release year (raw text, e.g. "(2005)").
                year = container.h3.find(
                    'span',
                    class_='lister-item-year text-muted unbold').get_text()
                years.append(year)
                # IMDB rating as a float.
                imdb_ratings.append(float(container.strong.text))
                # Metascore as an int.
                meta_score = container.find(
                    'div',
                    class_='inline-block ratings-metascore').span.get_text()
                meta_scores.append(int(meta_score))
                # Number of votes (held in the tag's data-value attribute).
                vote = container.find('span', attrs={"name": "nv"})['data-value']
                votes.append(int(vote))

# Assemble everything scraped into a single DataFrame for inspection.
data_movies = pd.DataFrame({
    "movie": names,
    "year": years,
    "imdb_ratings": imdb_ratings,
    "meta_scores": meta_scores,
    "votes": votes,
})
print("Data visualisation :")
print(data_movies)