# random_website.py
# (Web-view header residue removed; the hosting page noted: "generated from opentensor/bittensor-subnet-template".)
import random
from typing import Optional
import trafilatura
from loguru import logger
# from duckduckgo_search import DDGS
from prompting.base.duckduckgo_patch import PatchedDDGS
from prompting.datasets.utils import ENGLISH_WORDS
from shared.base import BaseDataset, Context, DatasetEntry
from shared.settings import shared_settings
# Maximum number of characters of extracted website text kept per entry
# (used as the slice bound in DDGDataset.extract_website_content).
MAX_CHARS = 5000
class DDGDatasetEntry(DatasetEntry):
    """A single sample produced from a DuckDuckGo search.

    Holds the random query that was searched together with the URL and the
    extracted text of the web page chosen from the results.
    """

    # The space-separated random words that were submitted as the query.
    search_term: str
    # URL of the selected search result. Annotated Optional because the
    # declared default is None (the previous plain `str` annotation
    # contradicted that default).
    website_url: Optional[str] = None
    # Text extracted from `website_url`; None when nothing has been set.
    website_content: Optional[str] = None
class DDGDataset(BaseDataset):
    """Dataset that builds entries from DuckDuckGo searches on random words.

    Each entry is created by searching for three random words drawn from
    ``ENGLISH_WORDS``, taking the first search result and extracting the
    text of that page.
    """

    # Not read by any method visible in this class (sampling uses the
    # module-level ENGLISH_WORDS). Annotated Optional to match its None default.
    english_words: Optional[list[str]] = None

    def search_random_term(self, retries: int = 3) -> tuple[Optional[str], Optional[list[dict[str, str]]]]:
        """Search DuckDuckGo for a random three-word query.

        A new random query is drawn on every attempt. Attempts that raise or
        that return no results are skipped.

        Args:
            retries: Maximum number of queries to try.

        Returns:
            ``(query, results)`` for the first attempt that yields a non-empty
            result list, or ``(None, None)`` when every attempt fails.
        """
        ddg = PatchedDDGS(proxy=shared_settings.PROXY_URL, verify=False)
        for _ in range(retries):
            random_words = " ".join(random.sample(ENGLISH_WORDS, 3))
            try:
                # Only the search call is guarded; the result check lives
                # outside so unrelated errors are not mislabelled.
                results = list(ddg.text(random_words))
            except Exception as ex:
                logger.error(f"Failed to get search results from DuckDuckGo: {ex}")
                continue
            if results:
                return random_words, results
        return None, None

    @staticmethod
    def extract_website_content(url: str) -> Optional[str]:
        """Download ``url`` and return its extracted text.

        Args:
            url: Address of the page to fetch.

        Returns:
            The extracted text truncated to ``MAX_CHARS`` characters, or
            ``None`` when the download yields nothing, extraction yields
            nothing, or an exception is raised (the exception is logged).
        """
        try:
            website = trafilatura.fetch_url(url)
            if not website:
                # Nothing was downloaded, so there is nothing to extract.
                return None
            extracted = trafilatura.extract(website)
        except Exception as ex:
            logger.error(f"Failed to extract content from website {url}: {ex}")
            return None
        return extracted[:MAX_CHARS] if extracted else None

    def next(self) -> Optional[DDGDatasetEntry]:
        """Produce one dataset entry from a fresh random search.

        Only the first search result is used.

        Returns:
            A ``DDGDatasetEntry`` on success, or ``None`` when the search
            returns nothing, the first result has no ``"href"``, or no content
            could be extracted from the page.
        """
        search_term, results = self.search_random_term(retries=5)
        if not results:
            return None

        # Guard against a result without a link instead of raising KeyError.
        website_url = results[0].get("href")
        if not website_url:
            logger.error(f"Search result for '{search_term}' has no 'href' field")
            return None

        website_content = self.extract_website_content(website_url)
        if not website_content:
            logger.error(f"Failed to extract content from website {website_url}")
            return None

        return DDGDatasetEntry(search_term=search_term, website_url=website_url, website_content=website_content)

    def get(self) -> Optional[DDGDatasetEntry]:
        """Return a new entry; equivalent to :meth:`next`."""
        return self.next()

    def random(self) -> Context:
        """Return a new entry; equivalent to :meth:`next`.

        NOTE(review): the annotation says ``Context`` but the value returned
        is whatever :meth:`next` yields (a ``DDGDatasetEntry`` or ``None``).
        The signature is kept unchanged here; confirm against the base class.
        """
        return self.next()