This repository was archived by the owner on Jun 10, 2022. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 13
/
Copy pathget_memes.py
66 lines (51 loc) · 1.96 KB
/
get_memes.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
import os
import sys
import json
import requests
# Maximum number of memes to process, read from the first command-line
# argument. This runs at module load time: a missing argument raises
# IndexError and a non-integer argument raises ValueError.
MAX_DOCS = int(sys.argv[1])
# URL of the JSON file that get_json() downloads.
JSON_URL = "https://jina-examples-datasets.s3.amazonaws.com/memes/memes.json"
# Directory handed to get_json() and prep_docs() as their output_dir.
OUTPUT_DIR = "./data"
def get_json(url, output_dir):
    """Download the JSON file at ``url`` into ``output_dir`` unless already present.

    The file is stored as ``memes.json`` inside ``output_dir``. When that file
    already exists nothing is downloaded and nothing is modified.

    Args:
        url: Address of the JSON file to fetch.
        output_dir: Directory that receives ``memes.json``; created if missing.

    Returns:
        None. A response with a status other than 200 is reported on stdout
        and no file is written.
    """
    target = os.path.join(output_dir, "memes.json")
    if os.path.isfile(target):
        # A copy from an earlier run is already on disk: skip the download.
        return

    # exist_ok avoids the check-then-create race of isdir() + makedirs().
    os.makedirs(output_dir, exist_ok=True)

    print(f"Downloading {url} to '{output_dir}' directory")
    # Without a timeout a stalled connection would block the script forever.
    r = requests.get(url, allow_redirects=True, timeout=30)
    if r.status_code != 200:
        # Say why the file is missing instead of failing silently here and
        # leaving the caller with an unexplained missing-file error later.
        print(f"Download of {url} failed with HTTP status {r.status_code}; '{target}' was not written")
        return

    with open(target, "wb") as file:
        file.write(r.content)
def prep_docs(input_file, max_docs, output_dir, random_seed=1337, shuffle=True):
    """Download up to ``max_docs`` meme images listed in ``input_file``.

    ``input_file`` is a JSON list of templates. Each template has a ``name``
    and a ``generated_memes`` list whose entries carry an ``image_url``. Every
    meme entry is tagged with its template's name under the ``template`` key,
    all entries are flattened into one list, optionally shuffled, and the
    first ``max_docs`` images are saved into ``output_dir``. Images already
    present in ``output_dir`` are not downloaded again.

    Args:
        input_file: Path of the JSON file to read.
        max_docs: Number of memes (after the optional shuffle) to process.
        output_dir: Directory that receives the images; created if missing.
        random_seed: Seed for the shuffle, so a given seed always yields the
            same order.
        shuffle: Whether to shuffle the memes before taking the first
            ``max_docs``.

    Returns:
        None. Per-image failures are printed and skipped.
    """
    print(f"Preparing {max_docs} Documents")

    print(f"Processing {input_file}")
    # JSON is UTF-8 by specification; do not rely on the platform default.
    with open(input_file, "r", encoding="utf-8") as file:
        raw_json = json.load(file)

    memes = []
    for template in raw_json:
        for meme in template["generated_memes"]:
            meme["template"] = template["name"]
        memes.extend(template["generated_memes"])

    if shuffle:
        import random

        # A private generator gives the same order as seeding the module-level
        # one, without resetting random state for the rest of the process.
        random.Random(random_seed).shuffle(memes)

    # Write through explicit paths instead of os.chdir(), which would change
    # the working directory of the whole process as a hidden side effect.
    os.makedirs(output_dir, exist_ok=True)

    counter = 1  # 1-based progress index; advances once an image is on disk
    for meme in memes[:max_docs]:
        image_url = meme["image_url"]
        # Presumably image_url is protocol-relative ("//host/path"), hence the
        # bare scheme prefix -- confirm against the dataset.
        url = f"http:{image_url}"
        filename = image_url.split("/")[-1]
        path = os.path.join(output_dir, filename)

        if os.path.isfile(path):
            print(f"{filename} already downloaded, skipping")
            counter += 1
            continue

        print(f"Downloading {filename} - {counter}/{max_docs}")
        try:
            # Without a timeout one stalled server would hang the whole run.
            r = requests.get(url, allow_redirects=True, timeout=30)
            if r.status_code == 200:
                with open(path, "wb") as file:
                    file.write(r.content)
                counter += 1
            else:
                print(f"Error on {filename} (HTTP {r.status_code}), skipping.")
        except Exception as exc:
            # Downloads are best-effort, so any ordinary failure skips just
            # this image. Unlike a bare ``except:``, this still lets
            # KeyboardInterrupt and SystemExit stop the script.
            print(f"Error on {filename}, skipping. ({exc})")
# Fetch the meme index (get_json skips the download when the file is already
# on disk), then download the images it lists.
get_json(url=JSON_URL, output_dir=OUTPUT_DIR)
# Build the index path from OUTPUT_DIR, the directory get_json() saves
# memes.json into, rather than repeating a hard-coded "data/memes.json" that
# would silently go stale if OUTPUT_DIR changed.
prep_docs(
    os.path.join(OUTPUT_DIR, "memes.json"),
    max_docs=MAX_DOCS,
    output_dir=OUTPUT_DIR,
    shuffle=True,
)