# anime_dataset.py
# Fetches anime metadata from the Jikan API and saves it to a CSV file.
import csv
import json
import os
import re
import time

import requests
output_file = 'anime_dataset.csv'
start_id = 1  # Starting anime ID
end_id = 100  # Ending anime ID

API_URL = 'https://api.jikan.moe/v4/anime/'  # base url for API
MAX_RETRIES = 5        # extra attempts after the first request
RETRY_DELAY = 1.0      # seconds; multiplied by the attempt number (linear backoff)
REQUEST_TIMEOUT = 30   # seconds before a single HTTP request is abandoned
BATCH_SIZE = 200       # rows buffered in memory between writes to the CSV

# Column order of the CSV; build_record() produces its keys in this same order.
FIELDNAMES = [
    'animeID', 'Name', 'English name', 'Japanese name', 'Score', 'Genres',
    'Synopsis', 'Type', 'Episodes', 'Aired', 'Premiered', 'Status',
    'Producers', 'Licensors', 'Studios', 'Source', 'Duration', 'Rating',
    'Rank', 'Popularity', 'Favorites', 'Scored By', 'Members',
]

# Bracketed notes inside a synopsis, e.g. "[Written by ...]".
BRACKET_NOTE = re.compile(r'\[.*?\]')


def join_names(entries):
    """Return the 'name' of every entry joined by ', '.

    A missing or null list yields an empty string instead of raising.
    """
    return ', '.join(entry['name'] for entry in entries or [])


def clean_synopsis(synopsis):
    """Strip bracketed notes and surrounding whitespace; None becomes ''."""
    if synopsis is None:
        return ""
    return BRACKET_NOTE.sub('', synopsis).strip()


def format_premiered(season, year):
    """Combine season and year into the 'Premiered' value.

    Returns None when both are missing. When only one is present it is
    returned on its own; the previous code raised TypeError for a null
    season with a non-null year.
    """
    parts = [str(part) for part in (season, year) if part is not None]
    return ' '.join(parts) if parts else None


def build_record(anime_id, data):
    """Map the 'data' object of one API response to a CSV row dict.

    Keys absent from `data` become None (or '' for the joined and cleaned
    text columns), so a sparse response never raises.
    """
    aired = data.get('aired') or {}
    return {
        'animeID': anime_id,
        'Name': data.get('title'),
        'English name': data.get('title_english'),
        'Japanese name': data.get('title_japanese'),
        'Score': data.get('score'),
        'Genres': join_names(data.get('genres')),
        'Synopsis': clean_synopsis(data.get('synopsis')),
        'Type': data.get('type'),
        'Episodes': data.get('episodes'),
        'Aired': aired.get('string'),
        'Premiered': format_premiered(data.get('season'), data.get('year')),
        'Status': data.get('status'),
        'Producers': join_names(data.get('producers')),
        'Licensors': join_names(data.get('licensors')),
        'Studios': join_names(data.get('studios')),
        'Source': data.get('source'),
        'Duration': data.get('duration'),
        'Rating': data.get('rating'),
        'Rank': data.get('rank'),
        'Popularity': data.get('popularity'),
        'Favorites': data.get('favorites'),
        'Scored By': data.get('scored_by'),
        'Members': data.get('members'),
    }


def fetch_anime(anime_id):
    """Request one anime from the API and return the decoded JSON payload.

    Makes one request plus up to MAX_RETRIES retries, sleeping a little
    longer before each retry. Returns None when the ID does not exist
    (HTTP 404) or when every attempt failed. A 200 response whose body is
    not valid JSON yields an empty dict so the caller reports it as
    invalid data.
    """
    url = API_URL + str(anime_id)
    for attempt in range(MAX_RETRIES + 1):
        if attempt:
            # Back off before retrying; immediate retries tend to fail the
            # same way (NOTE(review): tune against the API's rate limits).
            time.sleep(RETRY_DELAY * attempt)
        try:
            page = requests.get(url, timeout=REQUEST_TIMEOUT)
        except requests.RequestException:
            # Connection error or timeout: treat like a failed attempt
            # instead of aborting the run and losing buffered rows.
            continue
        if page.status_code == 200:
            try:
                return page.json()
            except ValueError:
                return {}
        if page.status_code == 404:
            # Treated as "no anime with this ID"; retrying would not help.
            return None
    print('Skipping anime {}: request failed'.format(anime_id))
    return None


def append_rows(path, rows):
    """Append `rows` to the CSV at `path`.

    The header row is written only when the file does not exist yet or is
    empty, so the file always has exactly one header, regardless of how
    many batches or sessions append to it.
    """
    if not rows:
        return
    needs_header = not os.path.exists(path) or os.path.getsize(path) == 0
    with open(path, 'a', newline='', encoding='utf-8') as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=FIELDNAMES)
        if needs_header:
            writer.writeheader()
        writer.writerows(rows)


def main():
    """Fetch anime start_id..end_id and append them to output_file."""
    count = 0    # keep count of anime for the current session
    animes = []  # rows waiting to be written

    for anime_id in range(start_id, end_id + 1):
        payload = fetch_anime(anime_id)
        if payload is None:
            continue

        data = payload.get('data') if isinstance(payload, dict) else None
        if not isinstance(data, dict):
            print('Skipping anime {}: Invalid data'.format(anime_id))
            continue

        animes.append(build_record(anime_id, data))
        count += 1

        # Flush periodically so a late failure loses at most one batch.
        if count % BATCH_SIZE == 0:
            print('{} anime processed, writing to file'.format(count))
            append_rows(output_file, animes)
            animes.clear()

    # Writing the remaining dataset to a CSV file
    if animes:
        print('Writing the final dataset to a CSV file')
        append_rows(output_file, animes)

    print('Total', count, 'anime data fetched. Done.')


if __name__ == '__main__':
    main()