|
1 | 1 | from __future__ import annotations
|
2 | 2 |
|
3 |
| -import asyncio |
4 | 3 | import gzip
|
5 | 4 | import os
|
6 | 5 | import re
|
7 | 6 | from datetime import datetime
|
8 | 7 | from functools import lru_cache, partial, wraps
|
9 | 8 | from multiprocessing import Manager
|
10 | 9 | from multiprocessing.context import TimeoutError
|
11 |
| -from multiprocessing.pool import MapResult, Pool |
| 10 | +from multiprocessing.pool import MapResult, Pool, ThreadPool |
12 | 11 | from queue import Empty, Queue
|
13 | 12 | from typing import (
|
14 | 13 | Any,
|
15 | 14 | Callable,
|
16 |
| - Coroutine, |
17 | 15 | Generic,
|
18 | 16 | Iterator,
|
19 | 17 | List,
|
|
27 | 25 | cast,
|
28 | 26 | )
|
29 | 27 |
|
30 |
| -import aiohttp |
31 | 28 | import dill
|
32 | 29 | import more_itertools
|
| 30 | +import requests |
33 | 31 | from dateutil.rrule import MONTHLY, rrule
|
34 |
| -from tqdm.asyncio import tqdm |
| 32 | +from tqdm import tqdm |
35 | 33 | from typing_extensions import ParamSpec
|
36 | 34 |
|
37 | 35 | from fundus.publishers.base_objects import PublisherEnum
|
@@ -147,24 +145,22 @@ def _get_warc_paths(self, start: datetime, end: datetime) -> List[str]:
|
147 | 145 | f"{self.server_address}crawl-data/CC-NEWS/{date.strftime('%Y/%m')}/warc.paths.gz" for date in date_sequence
|
148 | 146 | ]
|
149 | 147 |
|
150 |
| - async def load_warc_paths_from(url: str) -> List[str]: |
151 |
| - async with aiohttp.ClientSession(raise_for_status=True) as session: |
152 |
| - async with session.get(url) as response: |
153 |
| - return gzip.decompress(await response.read()).decode("utf-8").split() |
| 148 | + with tqdm(total=len(urls), desc="Loading WARC Paths", leave=False) as bar: |
154 | 149 |
|
155 |
| - load_warc_paths: Coroutine[Any, Any, List[List[str]]] = tqdm.gather( |
156 |
| - *[load_warc_paths_from(url) for url in urls], |
157 |
| - total=len(urls), |
158 |
| - desc="Loading WARC paths", |
159 |
| - leave=False, |
160 |
| - ) |
| 150 | + def load_paths(url: str) -> List[str]: |
| 151 | + with requests.Session() as session: |
| 152 | + paths = gzip.decompress(session.get(url).content).decode("utf-8").split() |
| 153 | + bar.update() |
| 154 | + return paths |
161 | 155 |
|
162 |
| - try: |
163 |
| - event_loop = asyncio.get_running_loop() |
164 |
| - except RuntimeError: |
165 |
| - nested_warc_paths = asyncio.run(load_warc_paths) |
166 |
| - else: |
167 |
| - nested_warc_paths = event_loop.run_until_complete(load_warc_paths) |
| 156 | + if self.processes == 0: |
| 157 | + nested_warc_paths = [load_paths(url) for url in urls] |
| 158 | + else: |
| 159 | + # use two threads per process, default two threads per core |
| 160 | + max_number_of_threads = self.processes * 2 |
| 161 | + |
| 162 | + with ThreadPool(processes=min(len(urls), max_number_of_threads)) as pool: |
| 163 | + nested_warc_paths = pool.map(load_paths, urls) |
168 | 164 |
|
169 | 165 | warc_paths: Iterator[str] = more_itertools.flatten(nested_warc_paths)
|
170 | 166 |
|
|
0 commit comments