-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathfetch_bible.py
433 lines (380 loc) · 19.6 KB
/
fetch_bible.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
fetch_bible.py — Downloads chapters from BibleGateway in parallel (asyncio + aiohttp)
and saves them as structured JSON.
v 1.4.2 (17-Apr-2025)
=====================
* --book is optional ⇒ omitted = Full Bible (66 books)
* Complete BOOK_INFO (alias→code+order) and CHAPS (chapters per book)
* Creates directories before downloading; clear error if no permissions
* Default concurrency 10 with progress bar per book
"""
import argparse
import asyncio
import html
import json
import re
import sys
from pathlib import Path
from typing import Dict, List, Tuple
import aiohttp
from bs4 import BeautifulSoup, element
from tqdm.asyncio import tqdm
# Request headers attached to every page fetch made by fetch_html().
HEADERS = {
    # Client identification string sent to the server.
    "User-Agent": "Mozilla/5.0 (compatible; Sion-AI/1.4.2)",
    # Ask for UTF-8; responses are decoded as UTF-8 by the fetcher.
    "Accept-Charset": "utf-8",
}
# ---------------------------------------------------------------------------
# Table: any name → (folder code, canonical order)
# ---------------------------------------------------------------------------
# Lookup filled in below: lower-case alias -> (folder code, canonical order).
BOOK_INFO: Dict[str, Tuple[str, int]] = {}

# One row per book, in canonical order: (order, folder code, accepted aliases).
# The first alias of each row is the one other code title-cases for display,
# so keep it as the preferred name when adding aliases.
_names_orders = [
    (1, "gen", ["genesis", "génesis", "gen"]),
    (2, "exo", ["exodus", "éxodo", "exo"]),
    (3, "lev", ["leviticus", "levítico", "lev"]),
    (4, "num", ["numbers", "números", "num"]),
    (5, "deu", ["deuteronomy", "deuteronomio", "deu"]),
    (6, "jos", ["joshua", "josué", "jos"]),
    (7, "jdg", ["judges", "jueces", "jdg"]),
    (8, "rut", ["ruth", "rut"]),
    (9, "1sa", ["1 samuel", "first samuel", "1sa"]),
    (10, "2sa", ["2 samuel", "second samuel", "2sa"]),
    (11, "1ki", ["1 kings", "1 reyes", "1ki"]),
    (12, "2ki", ["2 kings", "2 reyes", "2ki"]),
    (13, "1ch", ["1 chronicles", "1 cronicas", "1 crónicas", "1ch"]),
    (14, "2ch", ["2 chronicles", "2 cronicas", "2 crónicas", "2ch"]),
    (15, "ezr", ["ezra", "esdras", "ezr"]),
    (16, "neh", ["nehemiah", "nehemias", "nehemías", "neh"]),
    (17, "est", ["esther", "ester", "est"]),
    (18, "job", ["job"]),
    (19, "psa", ["psalms", "salmos", "psa"]),
    (20, "pro", ["proverbs", "proverbios", "pro"]),
    (21, "ecc", ["ecclesiastes", "eclesiastes", "eclesiastés", "ecc"]),
    (22, "sng", ["song of solomon", "song of songs", "cantares", "cantar de los cantares", "sng"]),
    (23, "isa", ["isaiah", "isaias", "isaías", "isa"]),
    (24, "jer", ["jeremiah", "jeremias", "jeremías", "jer"]),
    (25, "lam", ["lamentations", "lamentaciones", "lam"]),
    (26, "ezk", ["ezekiel", "ezequiel", "ezk"]),
    (27, "dan", ["daniel", "dan"]),
    (28, "hos", ["hosea", "oseas", "hos"]),
    (29, "jol", ["joel", "jol"]),
    (30, "amo", ["amos", "amós", "amo"]),
    (31, "oba", ["obadiah", "abdias", "abdías", "oba"]),
    (32, "jon", ["jonah", "jonas", "jonás", "jon"]),
    (33, "mic", ["micah", "miqueas", "mic"]),
    (34, "nam", ["nahum", "nam"]),
    (35, "hab", ["habakkuk", "habacuc", "hab"]),
    (36, "zep", ["zephaniah", "sofonias", "sofonías", "zep"]),
    (37, "hag", ["haggai", "hageo", "hag"]),
    (38, "zec", ["zechariah", "zacarias", "zacarías", "zec"]),
    (39, "mal", ["malachi", "malaquias", "malaquías", "mal"]),
    (40, "mat", ["matthew", "mateo", "mat"]),
    (41, "mrk", ["mark", "marcos", "mrk"]),
    (42, "luk", ["luke", "lucas", "luk"]),
    (43, "jhn", ["john", "juan", "jhn"]),
    (44, "act", ["acts", "hechos", "act"]),
    (45, "rom", ["romans", "romanos", "rom"]),
    (46, "1co", ["1 corinthians", "1 corintios", "1co"]),
    (47, "2co", ["2 corinthians", "2 corintios", "2co"]),
    (48, "gal", ["galatians", "galatas", "gálatas", "gal"]),
    (49, "eph", ["ephesians", "efesios", "eph"]),
    (50, "php", ["philippians", "filipenses", "php"]),
    (51, "col", ["colossians", "colosenses", "col"]),
    (52, "1th", ["1 thessalonians", "1 tesalonicenses", "1th"]),
    (53, "2th", ["2 thessalonians", "2 tesalonicenses", "2th"]),
    (54, "1ti", ["1 timothy", "1 timoteo", "1ti"]),
    (55, "2ti", ["2 timothy", "2 timoteo", "2ti"]),
    (56, "tit", ["titus", "tito", "tit"]),
    (57, "phm", ["philemon", "filemon", "filemón", "phm"]),
    (58, "heb", ["hebrews", "hebreos", "heb"]),
    (59, "jas", ["james", "santiago", "jas"]),
    (60, "1pe", ["1 peter", "1 pedro", "1pe"]),
    (61, "2pe", ["2 peter", "2 pedro", "2pe"]),
    (62, "1jn", ["1 john", "1 juan", "1jn"]),
    (63, "2jn", ["2 john", "2 juan", "2jn"]),
    (64, "3jn", ["3 john", "3 juan", "3jn"]),
    (65, "jud", ["jude", "judas", "jud"]),
    (66, "rev", ["revelation", "apocalipsis", "revelacion", "revelación", "rev"]),
]

# Flatten the rows into the alias lookup. Insertion order follows the table,
# so the first alias of a book is also the first key found for its code.
for _order, _code, _aliases in _names_orders:
    BOOK_INFO.update(dict.fromkeys(_aliases, (_code, _order)))
# Number of chapters in each book, keyed by the folder code used in BOOK_INFO.
# Entries are listed in canonical order.
CHAPS = {
    # Canonical order 1-39
    "gen": 50, "exo": 40, "lev": 27, "num": 36, "deu": 34, "jos": 24,
    "jdg": 21, "rut": 4, "1sa": 31, "2sa": 24, "1ki": 22, "2ki": 25,
    "1ch": 29, "2ch": 36, "ezr": 10, "neh": 13, "est": 10, "job": 42,
    "psa": 150, "pro": 31, "ecc": 12, "sng": 8, "isa": 66, "jer": 52,
    "lam": 5, "ezk": 48, "dan": 12, "hos": 14, "jol": 3, "amo": 9,
    "oba": 1, "jon": 4, "mic": 7, "nam": 3, "hab": 3, "zep": 3,
    "hag": 2, "zec": 14, "mal": 4,
    # Canonical order 40-66
    "mat": 28, "mrk": 16, "luk": 24, "jhn": 21, "act": 28, "rom": 16,
    "1co": 16, "2co": 13, "gal": 6, "eph": 6, "php": 4, "col": 4,
    "1th": 5, "2th": 3, "1ti": 6, "2ti": 4, "tit": 3, "phm": 1,
    "heb": 13, "jas": 5, "1pe": 5, "2pe": 3, "1jn": 5, "2jn": 1,
    "3jn": 1, "jud": 1, "rev": 22,
}
# ---------------------------------------------------------------------------
async def fetch_html(session: aiohttp.ClientSession, url: str) -> str:
    """Fetch *url* and return the response body decoded as UTF-8.

    Args:
        session: Open aiohttp session used to issue the GET request.
        url: Absolute URL to download.

    Returns:
        The response body as text.

    Raises:
        aiohttp.ClientResponseError: If the server answers with an HTTP
            error status (raised by ``raise_for_status``).
        asyncio.TimeoutError: If the whole request takes longer than 30 s.
    """
    # aiohttp expects a ClientTimeout object here; passing a bare number is
    # the deprecated legacy form. total=30 keeps the same 30-second budget.
    timeout = aiohttp.ClientTimeout(total=30)
    async with session.get(url, headers=HEADERS, timeout=timeout) as response:
        response.raise_for_status()
        return await response.text(encoding="utf-8")
def parse_chapter_html(html_text: str, book_code: str, version: str, chapter: int) -> dict:
    """Extract the verses of one chapter from a passage page.

    Args:
        html_text: Decoded HTML of the passage page.
        book_code: Book code; upper-cased into the ``book`` field.
        version: Version code; stored unchanged in the ``rev`` field.
        chapter: Chapter number; stored as a string in the ``chapter`` field.

    Returns:
        A dict with keys ``book``, ``rev``, ``chapter`` and ``verses``.
        ``verses`` is a list of ``{"verse", "text", "readableText"}`` dicts
        numbered from 1 to the highest verse number found; numbers that had
        no marker in the page get empty text. The list is empty when no
        verse marker was found at all.

    Raises:
        ValueError: If the page has no ``div`` with a ``passage-content`` class.
    """
    # html_text is already str: from_encoding only applies to bytes input and
    # makes BeautifulSoup emit a warning when given together with str markup.
    soup = BeautifulSoup(html_text, "html.parser")
    container = soup.find("div", class_=re.compile(r"^passage-content"))
    if container is None:
        # Missing container -> the chapter page does not exist or is empty.
        raise ValueError("passage-content not found")

    # --- Pre-processing: drop footnotes, cross-references and headings ---
    for unwanted in container.find_all(
        ["sup", "div", "h3"],
        class_=["footnote", "footnotes", "crossreference", "crossrefs"],
    ):
        unwanted.decompose()
    for heading in container.find_all("h3"):
        # Section titles are removed even when they carry no class.
        heading.decompose()

    verses_data: Dict[str, List[str]] = {}  # verse number -> text fragments
    current_verse_num = None
    # Tag of the marker that opened the current verse. Its own text (the
    # printed verse or chapter number) must not end up in the verse text.
    marker_tag = None

    # Walk every descendant in document order.
    for node in container.descendants:
        if isinstance(node, element.Tag):
            css_classes = node.get("class", [])
            verse_num = None
            if node.name == "sup" and "versenum" in css_classes:
                label = node.get_text(strip=True)
                # isdecimal() (not isdigit()) so int() below cannot fail;
                # str(int(...)) normalises labels such as "01".
                if label.isdecimal():
                    verse_num = str(int(label))
            elif node.name == "span" and "chapternum" in css_classes:
                # The chapter number stands in for the verse-1 marker.
                if node.get_text(strip=True).isdecimal():
                    verse_num = "1"

            if verse_num is not None:
                current_verse_num = verse_num
                verses_data.setdefault(verse_num, [])
                marker_tag = node
                continue

            if node.name == "br" and current_verse_num is not None:
                fragments = verses_data[current_verse_num]
                # Collapse consecutive line breaks into a single newline.
                if not fragments or fragments[-1] != "\n":
                    fragments.append("\n")

        elif isinstance(node, element.NavigableString):
            if current_verse_num is None:
                continue  # text before the first verse marker is ignored
            if marker_tag is not None:
                # Skip the marker's own text by position rather than by value:
                # a chapter marker prints the chapter number (e.g. "5") while
                # the verse it opens is "1", so a value comparison would leak
                # the chapter number into verse 1.
                if any(parent is marker_tag for parent in node.parents):
                    continue
                # Document order: once outside the marker we never re-enter it.
                marker_tag = None
            verses_data[current_verse_num].append(str(node))

    # --- Format output ---
    verses = []
    if verses_data:
        max_verse = max(map(int, verses_data))
        # Emit every number up to the maximum so gaps show up as empty verses.
        for number in range(1, max_verse + 1):
            num_str = str(number)
            raw_text = html.unescape("".join(verses_data.get(num_str, [])))
            verses.append({
                "verse": num_str,
                "text": raw_text.strip(),
                # Single-line form: all whitespace runs collapsed to one space.
                "readableText": " ".join(raw_text.split()),
            })

    return {
        "book": book_code.upper(),
        "rev": version,
        "chapter": str(chapter),
        "verses": verses,
    }
def parse_range(arg: str, max_chaps: int) -> List[int]:
    """Expand a chapter specification into a list of chapter numbers.

    Args:
        arg: Either ``"all"`` (case-insensitive, surrounding whitespace
            ignored) or a comma-separated list of numbers and ``a-b`` ranges,
            e.g. ``"1-5,8,10"``.
        max_chaps: Highest valid chapter number for the book.

    Returns:
        Chapter numbers within ``1..max_chaps`` in the order first requested,
        without duplicates. Malformed or reversed parts are skipped with a
        warning on stderr; out-of-range numbers are dropped silently.
    """
    spec = arg.strip()
    if spec.lower() == "all":
        return list(range(1, max_chaps + 1))

    requested: List[int] = []
    for part in spec.split(","):
        part = part.strip()
        if not part:
            continue
        if "-" in part:
            try:
                start, end = map(int, part.split("-"))
            except ValueError:
                print(f"Warning: Invalid range format '{part}', skipping.", file=sys.stderr)
                continue
            if start > end:
                print(f"Warning: Invalid range '{part}', skipping.", file=sys.stderr)
                continue
            # Clamp before expanding so an absurd bound such as "1-999999999"
            # cannot build a huge throw-away list.
            requested.extend(range(max(start, 1), min(end, max_chaps) + 1))
        else:
            try:
                requested.append(int(part))
            except ValueError:
                print(f"Warning: Invalid chapter number '{part}', skipping.", file=sys.stderr)

    # dict.fromkeys removes duplicates while keeping first-seen order, so an
    # overlapping spec like "1-3,2" does not download a chapter twice.
    return list(dict.fromkeys(n for n in requested if 1 <= n <= max_chaps))
async def grab(
    sem: asyncio.Semaphore,
    session: aiohttp.ClientSession,
    version: str,
    book_name: str,
    book_code: str,
    chap: int,
    out_dir: Path,
):
    """Fetch, parse and save one chapter as ``<book_code>.<chap:03>.json``.

    Every failure is reported on stderr and turned into a ``None`` result so
    that one bad chapter never interrupts the remaining downloads.

    Args:
        sem: Semaphore bounding the number of chapters processed at once.
        session: Open aiohttp session used for the request.
        version: Version code placed in the URL and in the saved data.
        book_name: Book name placed in the URL search term.
        book_code: Book code used in the file name and the saved data.
        chap: Chapter number.
        out_dir: Directory receiving the JSON file (created if missing).

    Returns:
        ``chap`` on success, ``None`` if the chapter was skipped.
    """
    from urllib.parse import quote  # local import keeps this block self-contained

    # Percent-encode both query values: book names contain spaces
    # ("1 Samuel") and the version comes straight from the command line.
    search_term = quote(f"{book_name} {chap}", safe="")
    version_param = quote(version, safe="")
    url = f"https://www.biblegateway.com/passage/?search={search_term}&version={version_param}"

    async with sem:
        try:
            html_text = await fetch_html(session, url)
            data = parse_chapter_html(html_text, book_code, version, chap)
        except ValueError as e:  # page without the expected passage container
            print(f"Error parsing {book_name} {chap} ({version}): {e}. Skipping.", file=sys.stderr)
            return None
        except aiohttp.ClientResponseError as e:  # HTTP error status
            print(f"HTTP Error fetching {book_name} {chap} ({version}): {e.status} {e.message}. Skipping.", file=sys.stderr)
            return None
        except asyncio.TimeoutError:
            print(f"Timeout fetching {book_name} {chap} ({version}). Skipping.", file=sys.stderr)
            return None
        except Exception as e:
            # Deliberate catch-all: any other network/parse failure is logged
            # and skipped so the rest of the book keeps downloading.
            print(f"Error processing {book_name} {chap} ({version}): {e}. Skipping.", file=sys.stderr)
            return None

        if not data["verses"]:
            print(f"Warning: No verses found for {book_name} {chap} ({version}). Skipping file save.", file=sys.stderr)
            return None

        # A failed write is handled like any other per-chapter failure instead
        # of escaping and aborting every other pending download.
        try:
            out_dir.mkdir(parents=True, exist_ok=True)
            (out_dir / f"{book_code}.{chap:03}.json").write_text(
                json.dumps(data, ensure_ascii=False, indent=2),
                encoding="utf-8",
            )
        except OSError as e:
            print(f"Error saving {book_name} {chap} ({version}): {e}. Skipping.", file=sys.stderr)
            return None

    return chap
async def download_book(
    version: str,
    book_name: str,
    book_code: str,
    order: int,
    chapters: str,
    dest_root: Path,
    concurrency: int,
):
    """Download the requested chapters of one book.

    Files are written to ``<dest_root>/<VERSION>/<order:02d>_<book_code>/``.
    Problems (unknown book code, no valid chapters, directory cannot be
    created) are reported on stderr and the book is skipped.

    Args:
        version: Version code; upper-cased for the directory and the URL.
        book_name: Book name used in messages and as fallback display name.
        book_code: Book code used to look up the chapter count in CHAPS.
        order: Canonical position of the book, used in the directory name.
        chapters: Chapter specification understood by ``parse_range``.
        dest_root: Root directory of the download tree.
        concurrency: Maximum simultaneous chapter downloads; values below 1
            are treated as 1.
    """
    max_chaps = CHAPS.get(book_code)
    if max_chaps is None:
        print(f"Error: Chapter count not found for book code '{book_code}'. Skipping book.", file=sys.stderr)
        return

    chapter_list = parse_range(chapters, max_chaps)
    if not chapter_list:
        print(f"No valid chapters specified for {book_name}. Skipping book.", file=sys.stderr)
        return

    # Display/URL name: first alias registered for this code, title-cased.
    # The default avoids StopIteration (a RuntimeError inside a coroutine)
    # should the code ever be missing from BOOK_INFO.
    first_alias = next(
        (alias for alias, (code, _) in BOOK_INFO.items() if code == book_code),
        book_name,
    )
    pretty_book_name = first_alias.title()

    version_code = version.upper()
    out_dir = dest_root / version_code / f"{order:02d}_{book_code}"

    # Create the output directory up front to fail fast on permission errors.
    try:
        out_dir.mkdir(parents=True, exist_ok=True)
    except PermissionError as e:
        print(f"🚫 Permission denied creating directory {out_dir}: {e}. Skipping book '{pretty_book_name}'.", file=sys.stderr)
        return
    except OSError as e:
        print(f"🚫 OS Error creating directory {out_dir}: {e}. Skipping book '{pretty_book_name}'.", file=sys.stderr)
        return

    # Semaphore(0) would block every task forever and a negative value raises
    # ValueError, so always allow at least one download at a time.
    sem = asyncio.Semaphore(max(1, concurrency))
    successful_downloads = 0

    async with aiohttp.ClientSession() as session:
        tasks = [
            grab(sem, session, version_code, pretty_book_name, book_code, c, out_dir)
            for c in chapter_list
        ]
        print(f"Downloading {pretty_book_name} ({version_code})...")
        with tqdm(total=len(tasks), desc=pretty_book_name, unit="chap") as pbar:
            for future in asyncio.as_completed(tasks):
                # grab() returns the chapter number on success, None on skip.
                if await future is not None:
                    successful_downloads += 1
                pbar.update(1)

    print(f"Finished {pretty_book_name}: {successful_downloads}/{len(tasks)} chapters downloaded successfully.")
def normalise_book(user_input: str) -> Tuple[str, str, int]:
    """Resolve a user-supplied book name to its canonical identity.

    Matching ignores case, surrounding whitespace and repeated internal
    whitespace, so ``"  1   Samuel "`` resolves like ``"1 samuel"``.

    Args:
        user_input: Book name or alias as typed by the user.

    Returns:
        ``(pretty_name, code, order)`` where ``pretty_name`` is the first
        alias registered for the book, title-cased.

    Raises:
        SystemExit: If the name matches no alias in BOOK_INFO.
    """
    # split()/join collapses every whitespace run to a single space, which
    # also covers the leading/trailing strip.
    key = " ".join(user_input.lower().split())
    if key not in BOOK_INFO:
        raise SystemExit(f"⚠️ Book '{user_input}' not recognized.")

    code, order = BOOK_INFO[key]
    # BOOK_INFO keeps insertion order, so the first match is the first alias.
    first_alias = next(alias for alias, info in BOOK_INFO.items() if info == (code, order))
    return first_alias.title(), code, order
async def main():
    """Parse the command line and download one book or the whole Bible.

    With ``--book`` only that book is downloaded (honouring ``--chapters``);
    without it all 66 books are downloaded in canonical order with every
    chapter. Exits with an error message when the destination directory
    cannot be created, the book is unknown, or ``--concurrency`` is below 1.
    """
    # ----------------- CLI Setup -----------------
    parser = argparse.ArgumentParser(description="Download Bible chapters from BibleGateway in parallel.")
    # The help text reflects what the code does: the value is upper-cased
    # below and that form is used for both the directory and the URL.
    parser.add_argument("--version", required=True, help="Bible version code (e.g., PDT, NTV, RVR1960). Case-insensitive: upper-cased for both the output directory and the request URL.")
    parser.add_argument("--book", help="Book name (e.g., Genesis, Psalms). Omit to download the entire Bible (all 66 books).")
    parser.add_argument("--chapters", default="all", help="Chapter range (e.g., '1-5,8,10' or 'all'). Default is 'all'.")
    parser.add_argument("--dest", default="./data", help="Root directory for downloads (default: ./data).")
    parser.add_argument("--concurrency", type=int, default=10, help="Number of simultaneous downloads (default: 10).")
    args = parser.parse_args()

    # A semaphore of 0 would stall every download and a negative one raises,
    # so reject such values with a normal usage error.
    if args.concurrency < 1:
        parser.error("--concurrency must be at least 1.")

    dest_root = Path(args.dest).expanduser().resolve()
    # Create the root destination early so permission problems surface first.
    try:
        dest_root.mkdir(exist_ok=True, parents=True)
    except PermissionError as e:
        raise SystemExit(f"🚫 Permission denied to create root destination directory {dest_root}: {e}") from e
    except OSError as e:
        raise SystemExit(f"🚫 OS Error creating root destination directory {dest_root}: {e}") from e

    version_code = args.version.upper()

    if args.book:
        # Single book. normalise_book() raises SystemExit(message) for an
        # unknown name; letting it propagate prints the message to stderr and
        # exits with status 1, the same outcome as catching and re-exiting.
        pretty_book_name, book_code, order = normalise_book(args.book)
        await download_book(version_code, pretty_book_name, book_code, order, args.chapters, dest_root, args.concurrency)
    else:
        # Full Bible, one book after another in table order.
        print(f"📖 Downloading full Bible ({version_code})...")
        for order, book_code, aliases in _names_orders:
            # First alias, title-cased, is the standard display name.
            pretty_book_name = aliases[0].title()
            await download_book(version_code, pretty_book_name, book_code, order, "all", dest_root, args.concurrency)

    print("✅ Download finished.")
if __name__ == "__main__":
    # Script entry point: run the async CLI and turn Ctrl+C into a short
    # message plus a non-zero exit status instead of a traceback.
    try:
        asyncio.run(main())
    except KeyboardInterrupt:
        print("Download interrupted by user.", file=sys.stderr)
        raise SystemExit(1)