Implement module scoped logging #361

Merged 5 commits on Apr 24, 2024
Changes from 3 commits
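In short, the PR moves call sites from the single shared logger import to a module-scoped logger. A sketch of the before/after pattern that the file diffs below repeat (the message text is illustrative, not taken from the PR):

# before: one logger shared across the whole package
from fundus.logging import basic_logger
basic_logger.error("something went wrong")

# after: one logger per module, named after the module itself
from fundus.logging import create_logger
logger = create_logger(__name__)
logger.error("something went wrong")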
10 changes: 6 additions & 4 deletions scripts/generate_parser_test_files.py
@@ -6,15 +6,17 @@
from tqdm import tqdm

from fundus import Crawler, PublisherCollection
from fundus.logging import basic_logger
from fundus.logging import create_logger
from fundus.publishers.base_objects import PublisherEnum
from fundus.scraping.article import Article
from fundus.scraping.filter import RequiresAll
from fundus.scraping.html import WebSource
from fundus.scraping.scraper import BaseScraper, WebScraper
from fundus.scraping.scraper import BaseScraper
from tests.test_parser import attributes_required_to_cover
from tests.utility import HTMLTestFile, get_test_case_json, load_html_test_file_mapping

logger = create_logger(__name__)


def get_test_article(enum: PublisherEnum, url: Optional[str] = None) -> Optional[Article]:
if url is not None:
@@ -84,7 +86,7 @@ def main() -> None:
# sort args.attributes for consistency
arguments.attributes = sorted(set(arguments.attributes) or attributes_required_to_cover)

basic_logger.setLevel(WARN)
logger.setLevel(WARN)

publishers: List[PublisherEnum] = (
list(PublisherCollection)
@@ -107,7 +109,7 @@

if arguments.overwrite or not html_mapping.get(publisher.parser.latest_version):
if not (article := get_test_article(publisher, url)):
basic_logger.error(f"Couldn't get article for {publisher.name}. Skipping")
logger.error(f"Couldn't get article for {publisher.name}. Skipping")
continue
html = HTMLTestFile(
url=article.html.responded_url,
27 changes: 27 additions & 0 deletions src/fundus/logging.py
@@ -0,0 +1,27 @@
import logging

__all__ = ["set_log_level", "create_logger"]

_loggers = []

_stream_handler = logging.StreamHandler()
_formatter = logging.Formatter("%(asctime)s - %(name)s - %(levelname)s - %(message)s")
_stream_handler.setFormatter(_formatter)

logging.basicConfig(
level=logging.ERROR,
handlers=[_stream_handler],
)


def create_logger(name: str) -> logging.Logger:
logger = logging.getLogger(name)
logger.setLevel(logging.ERROR)
logger.addHandler(_stream_handler)
_loggers.append(logger)
return logger


def set_log_level(level: int):
for logger in _loggers:
logger.setLevel(level)
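The new module above is the entire logging API after this PR: create_logger(__name__) returns a per-module logger attached to the shared stream handler, and set_log_level adjusts every logger created that way at once. A minimal usage sketch, assuming only what the diff shows (the run() function, the messages, and the DEBUG level are illustrative):

import logging

from fundus.logging import create_logger, set_log_level

# One logger per module, scoped by __name__, replacing the old shared basic_logger.
logger = create_logger(__name__)


def run() -> None:
    logger.debug("hidden at the default ERROR level")
    logger.error("visible at the default ERROR level")


# Raise verbosity for every logger previously created via create_logger.
set_log_level(logging.DEBUG)
run()

Note that set_log_level only touches loggers registered through create_logger; a logger obtained directly from logging.getLogger is not tracked in _loggers and keeps its own level.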
3 changes: 0 additions & 3 deletions src/fundus/logging/__init__.py

This file was deleted.

9 changes: 0 additions & 9 deletions src/fundus/logging/logger.py

This file was deleted.

6 changes: 4 additions & 2 deletions src/fundus/scraping/article.py
@@ -8,10 +8,12 @@
import more_itertools
from colorama import Fore, Style

from fundus.logging.logger import basic_logger
from fundus.logging import create_logger
from fundus.parser import ArticleBody
from fundus.scraping.html import HTML

logger = create_logger(__name__)


@dataclass(frozen=True)
class Article:
@@ -54,7 +56,7 @@ def lang(self) -> Optional[str]:
try:
language = langdetect.detect(self.plaintext)
except langdetect.LangDetectException:
basic_logger.debug(f"Unable to detect language for article '{self.html.responded_url}'")
logger.debug(f"Unable to detect language for article '{self.html.responded_url}'")

# use @lang attribute of <html> tag as fallback
if not language or language == langdetect.detector_factory.Detector.UNKNOWN_LANG:
8 changes: 5 additions & 3 deletions src/fundus/scraping/crawler.py
@@ -35,7 +35,7 @@
from tqdm import tqdm
from typing_extensions import ParamSpec, TypeAlias

from fundus.logging import basic_logger
from fundus.logging import create_logger
from fundus.publishers.base_objects import PublisherCollectionMeta, PublisherEnum
from fundus.scraping.article import Article
from fundus.scraping.delay import Delay
@@ -45,6 +45,8 @@
from fundus.scraping.session import session_handler
from fundus.scraping.url import URLSource

logger = create_logger(__name__)

_T = TypeVar("_T")
_P = ParamSpec("_P")

@@ -190,15 +192,15 @@ def build_extraction_filter() -> Optional[ExtractionFilter]:
)
)
if missing_attributes := extraction_filter.required_attributes - supported_attributes:
basic_logger.warning(
logger.warning(
f"The required attribute(s) `{', '.join(missing_attributes)}` "
f"is(are) not supported by {publisher.publisher_name}. Skipping publisher"
)
else:
fitting_publishers.append(publisher)

if not fitting_publishers:
basic_logger.error(
logger.error(
f"Could not find any fitting publishers for required attributes "
f"`{', '.join(extraction_filter.required_attributes)}`"
)
28 changes: 15 additions & 13 deletions src/fundus/scraping/html.py
@@ -10,7 +10,7 @@
from fastwarc import ArchiveIterator, WarcRecord, WarcRecordType
from requests import ConnectionError, HTTPError

from fundus.logging import basic_logger
from fundus.logging import create_logger
from fundus.publishers.base_objects import PublisherEnum
from fundus.scraping.delay import Delay
from fundus.scraping.filter import URLFilter
@@ -29,6 +29,8 @@
from fundus.scraping.session import session_handler
from fundus.scraping.url import URLSource

logger = create_logger(__name__)


@dataclass(frozen=True)
class HTML:
@@ -97,11 +99,11 @@ def filter_url(u: str) -> bool:
timestamp = time.time()

if not validators.url(url):
basic_logger.debug(f"Skipped requested URL '{url}' because the URL is malformed")
logger.debug(f"Skipped requested URL '{url}' because the URL is malformed")
continue

if filter_url(url):
basic_logger.debug(f"Skipped requested URL '{url}' because of URL filter")
logger.debug(f"Skipped requested URL '{url}' because of URL filter")
continue

session = session_handler.get_session()
@@ -114,23 +116,23 @@ def filter_url(u: str) -> bool:
response = session.get(url, headers=self.request_header)

except (HTTPError, ConnectionError) as error:
basic_logger.info(f"Skipped requested URL '{url}' because of '{error}'")
logger.info(f"Skipped requested URL '{url}' because of '{error}'")
if isinstance(error, HTTPError) and error.response.status_code >= 500:
basic_logger.info(f"Skipped {self.publisher} due to server errors: '{error}'")
logger.info(f"Skipped {self.publisher} due to server errors: '{error}'")
continue

except Exception as error:
basic_logger.warning(f"Warning! Skipped requested URL '{url}' because of an unexpected error {error}")
logger.warning(f"Warning! Skipped requested URL '{url}' because of an unexpected error {error}")
continue

else:
if filter_url(str(response.url)):
basic_logger.debug(f"Skipped responded URL '{str(response.url)}' because of URL filter")
logger.debug(f"Skipped responded URL '{str(response.url)}' because of URL filter")
continue
html = response.text

if response.history:
basic_logger.info(f"Got redirected {len(response.history)} time(s) from {url} -> {response.url}")
logger.info(f"Got redirected {len(response.history)} time(s) from {url} -> {response.url}")

source_info = (
WebSourceInfo(self.publisher, type(self.url_source).__name__, self.url_source.url)
@@ -167,20 +169,20 @@ def extract_content(record: WarcRecord) -> Optional[str]:
encoding: Optional[str] = chardet.detect(warc_body)["encoding"]

if encoding is not None:
basic_logger.debug(
logger.debug(
f"Trying to decode record {record.record_id!r} from {target_url!r} "
f"using detected encoding {encoding}."
)

try:
return str(warc_body, encoding=encoding)
except UnicodeDecodeError:
basic_logger.warning(
logger.warning(
f"Couldn't decode record {record.record_id!r} from {target_url!r} with "
f"original charset {record.http_charset!r} using detected charset {encoding!r}."
)
else:
basic_logger.warning(
logger.warning(
f"Couldn't detect charset for record {record.record_id!r} from {target_url!r} "
f"with invalid original charset {record.http_charset!r}."
)
@@ -194,7 +196,7 @@ def extract_content(record: WarcRecord) -> Optional[str]:
target_url = str(warc_record.headers["WARC-Target-URI"])

if url_filter is not None and url_filter(target_url):
basic_logger.debug(f"Skipped WARC record with target URI {target_url!r} because of URL filter")
logger.debug(f"Skipped WARC record with target URI {target_url!r} because of URL filter")
continue

publisher_domain: str = urlparse(target_url).netloc
@@ -205,7 +207,7 @@
publisher = self._publisher_mapping[publisher_domain]

if publisher.url_filter is not None and publisher.url_filter(target_url):
basic_logger.debug(
logger.debug(
f"Skipped WARC record with target URI {target_url!r} because of "
f"publisher specific URL filter"
)
14 changes: 7 additions & 7 deletions src/fundus/scraping/scraper.py
@@ -2,7 +2,7 @@

import more_itertools

from fundus.logging import basic_logger
from fundus.logging import create_logger
from fundus.parser import ParserProxy
from fundus.publishers.base_objects import PublisherEnum
from fundus.scraping.article import Article
@@ -15,6 +15,8 @@
from fundus.scraping.html import CCNewsSource, HTMLSource, WebSource
from fundus.scraping.url import URLSource

logger = create_logger(__name__)


class BaseScraper:
def __init__(self, *sources: HTMLSource, parser_mapping: Dict[str, ParserProxy]):
@@ -37,27 +39,25 @@ def scrape(
except Exception as err:
if error_handling == "raise":
error_message = f"Run into an error processing article '{html.requested_url}'"
basic_logger.error(error_message)
logger.error(error_message)
err.args = (str(err) + "\n\n" + error_message,)
raise err
elif error_handling == "catch":
yield Article(html=html, exception=err)
elif error_handling == "suppress":
basic_logger.info(f"Skipped article at '{html.requested_url}' because of: {err!r}")
logger.info(f"Skipped article at '{html.requested_url}' because of: {err!r}")
else:
raise ValueError(f"Unknown value '{error_handling}' for parameter <error_handling>'")

else:
if extraction_filter and (filter_result := extraction_filter(extraction)):
if isinstance(filter_result, FilterResultWithMissingAttributes):
basic_logger.debug(
logger.debug(
f"Skipped article at '{html.requested_url}' because attribute(s) "
f"{', '.join(filter_result.missing_attributes)!r} is(are) missing"
)
else:
basic_logger.debug(
f"Skipped article at '{html.requested_url}' because of extraction filter"
)
logger.debug(f"Skipped article at '{html.requested_url}' because of extraction filter")
else:
article = Article.from_extracted(html=html, extracted=extraction)
yield article
10 changes: 6 additions & 4 deletions src/fundus/scraping/session.py
@@ -5,7 +5,9 @@
import requests.adapters
from typing_extensions import Self

from fundus.logging import basic_logger
from fundus.logging import create_logger

logger = create_logger(__name__)

_default_header = {"user-agent": "Fundus"}

@@ -40,14 +42,14 @@ def _session_factory(self) -> requests.Session:
A new requests.Session
"""

basic_logger.debug("Creating new session")
logger.debug("Creating new session")
session = requests.Session()

def _response_log(response: requests.Response, *args, **kwargs) -> None:
history = response.history
previous_status_codes = [f"({response.status_code})" for response in history] if history else []
status_code_chain = " -> ".join(previous_status_codes + [f"({response.status_code})"])
basic_logger.debug(
logger.debug(
f"{status_code_chain} <{response.request.method} {response.url!r}> "
f"took {response.elapsed.total_seconds()} second(s)"
)
@@ -92,7 +94,7 @@ def close_current_session(self) -> None:
"""
if self.session is not None:
session = self.get_session()
basic_logger.debug(f"Close session {session}")
logger.debug(f"Close session {session}")
session.close()
self.session = None

14 changes: 8 additions & 6 deletions src/fundus/scraping/url.py
@@ -12,10 +12,12 @@
from lxml.etree import XPath
from requests import ConnectionError, HTTPError

from fundus.logging import basic_logger
from fundus.logging import create_logger
from fundus.scraping.filter import URLFilter, inverse
from fundus.scraping.session import _default_header, session_handler

logger = create_logger(__name__)


class _ArchiveDecompressor:
def __init__(self):
@@ -45,7 +47,7 @@ def __post_init__(self):
if not self._request_header:
self._request_header = _default_header
if not validators.url(self.url, strict_query=False):
basic_logger.error(f"{type(self).__name__} initialized with invalid URL {self.url}")
logger.error(f"{type(self).__name__} initialized with invalid URL {self.url}")

def set_header(self, request_header: Dict[str, str]) -> None:
self._request_header = request_header
@@ -78,7 +80,7 @@ def __iter__(self) -> Iterator[str]:
html = response.text
rss_feed = feedparser.parse(html)
if exception := rss_feed.get("bozo_exception"):
basic_logger.warning(f"Warning! Couldn't parse rss feed '{self.url}' because of {exception}")
logger.warning(f"Warning! Couldn't parse rss feed '{self.url}' because of {exception}")
return
else:
for url in (entry["link"] for entry in rss_feed["entries"]):
@@ -99,17 +101,17 @@
def yield_recursive(sitemap_url: str) -> Iterator[str]:
session = session_handler.get_session()
if not validators.url(sitemap_url):
basic_logger.info(f"Skipped sitemap '{sitemap_url}' because the URL is malformed")
logger.info(f"Skipped sitemap '{sitemap_url}' because the URL is malformed")
try:
response = session.get(url=sitemap_url, headers=self._request_header)
except (HTTPError, ConnectionError) as error:
basic_logger.warning(f"Warning! Couldn't reach sitemap '{sitemap_url}' because of {error}")
logger.warning(f"Warning! Couldn't reach sitemap '{sitemap_url}' because of {error}")
return
content = response.content
if (content_type := response.headers.get("content-type")) in self._decompressor.supported_file_formats:
content = self._decompressor.decompress(content, content_type)
if not content:
basic_logger.warning(f"Warning! Empty sitemap at '{sitemap_url}'")
logger.warning(f"Warning! Empty sitemap at '{sitemap_url}'")
return
tree = lxml.html.fromstring(content)
urls = [node.text_content() for node in self._url_selector(tree)]