Implement module scoped logging #361

Merged 5 commits on Apr 24, 2024
Changes from 3 commits
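In short, the PR moves call sites from the single shared logger import to a module-scoped logger. A sketch of the before/after pattern that the file diffs below repeat (the message text is illustrative, not taken from the PR):

# before: one logger shared across the whole package
from fundus.logging import basic_logger
basic_logger.error("something went wrong")

# after: one logger per module, named after the module itself
from fundus.logging import create_logger
logger = create_logger(__name__)
logger.error("something went wrong")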
10 changes: 6 additions & 4 deletions scripts/generate_parser_test_files.py
@@ -6,15 +6,17 @@
from tqdm import tqdm

from fundus import Crawler, PublisherCollection
from fundus.logging import basic_logger
from fundus.logging import create_logger
from fundus.publishers.base_objects import PublisherEnum
from fundus.scraping.article import Article
from fundus.scraping.filter import RequiresAll
from fundus.scraping.html import WebSource
from fundus.scraping.scraper import BaseScraper, WebScraper
from fundus.scraping.scraper import BaseScraper
from tests.test_parser import attributes_required_to_cover
from tests.utility import HTMLTestFile, get_test_case_json, load_html_test_file_mapping

logger = create_logger(__name__)


def get_test_article(enum: PublisherEnum, url: Optional[str] = None) -> Optional[Article]:
if url is not None:
@@ -84,7 +86,7 @@ def main() -> None:
# sort args.attributes for consistency
arguments.attributes = sorted(set(arguments.attributes) or attributes_required_to_cover)

basic_logger.setLevel(WARN)
logger.setLevel(WARN)

publishers: List[PublisherEnum] = (
list(PublisherCollection)
@@ -107,7 +109,7 @@

if arguments.overwrite or not html_mapping.get(publisher.parser.latest_version):
if not (article := get_test_article(publisher, url)):
basic_logger.error(f"Couldn't get article for {publisher.name}. Skipping")
logger.error(f"Couldn't get article for {publisher.name}. Skipping")
continue
html = HTMLTestFile(
url=article.html.responded_url,
27 changes: 27 additions & 0 deletions src/fundus/logging.py
@@ -0,0 +1,27 @@
import logging

__all__ = ["set_log_level", "create_logger"]

_loggers = []

_stream_handler = logging.StreamHandler()
_formatter = logging.Formatter("%(asctime)s - %(name)s - %(levelname)s - %(message)s")
_stream_handler.setFormatter(_formatter)

logging.basicConfig(
level=logging.ERROR,
handlers=[_stream_handler],
)


def create_logger(name: str) -> logging.Logger:
logger = logging.getLogger(name)
logger.setLevel(logging.ERROR)
logger.addHandler(_stream_handler)
_loggers.append(logger)
return logger


def set_log_level(level: int):
for logger in _loggers:
logger.setLevel(level)
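The new module above is the entire logging API after this PR: create_logger(__name__) returns a per-module logger attached to the shared stream handler, and set_log_level adjusts every logger created that way at once. A minimal usage sketch, assuming only what the diff shows (the run() function, the messages, and the DEBUG level are illustrative):

import logging

from fundus.logging import create_logger, set_log_level

# One logger per module, scoped by __name__, replacing the old shared basic_logger.
logger = create_logger(__name__)


def run() -> None:
    logger.debug("hidden at the default ERROR level")
    logger.error("visible at the default ERROR level")


# Raise verbosity for every logger previously created via create_logger.
set_log_level(logging.DEBUG)
run()

Note that set_log_level only touches loggers registered through create_logger; a logger obtained directly from logging.getLogger is not tracked in _loggers and keeps its own level.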
3 changes: 0 additions & 3 deletions src/fundus/logging/__init__.py

This file was deleted.

9 changes: 0 additions & 9 deletions src/fundus/logging/logger.py

This file was deleted.

6 changes: 4 additions & 2 deletions src/fundus/scraping/article.py
@@ -8,10 +8,12 @@
import more_itertools
from colorama import Fore, Style

from fundus.logging.logger import basic_logger
from fundus.logging import create_logger
from fundus.parser import ArticleBody
from fundus.scraping.html import HTML

logger = create_logger(__name__)


@dataclass(frozen=True)
class Article:
@@ -54,7 +56,7 @@ def lang(self) -> Optional[str]:
try:
language = langdetect.detect(self.plaintext)
except langdetect.LangDetectException:
basic_logger.debug(f"Unable to detect language for article '{self.html.responded_url}'")
logger.debug(f"Unable to detect language for article '{self.html.responded_url}'")

# use @lang attribute of <html> tag as fallback
if not language or language == langdetect.detector_factory.Detector.UNKNOWN_LANG:
8 changes: 5 additions & 3 deletions src/fundus/scraping/crawler.py
@@ -35,7 +35,7 @@
from tqdm import tqdm
from typing_extensions import ParamSpec, TypeAlias

from fundus.logging import basic_logger
from fundus.logging import create_logger
from fundus.publishers.base_objects import PublisherCollectionMeta, PublisherEnum
from fundus.scraping.article import Article
from fundus.scraping.delay import Delay
@@ -45,6 +45,8 @@
from fundus.scraping.session import session_handler
from fundus.scraping.url import URLSource

logger = create_logger(__name__)

_T = TypeVar("_T")
_P = ParamSpec("_P")

@@ -190,15 +192,15 @@ def build_extraction_filter() -> Optional[ExtractionFilter]:
)
)
if missing_attributes := extraction_filter.required_attributes - supported_attributes:
basic_logger.warning(
logger.warning(
f"The required attribute(s) `{', '.join(missing_attributes)}` "
f"is(are) not supported by {publisher.publisher_name}. Skipping publisher"
)
else:
fitting_publishers.append(publisher)

if not fitting_publishers:
basic_logger.error(
logger.error(
f"Could not find any fitting publishers for required attributes "
f"`{', '.join(extraction_filter.required_attributes)}`"
)
28 changes: 15 additions & 13 deletions src/fundus/scraping/html.py
@@ -10,7 +10,7 @@
from fastwarc import ArchiveIterator, WarcRecord, WarcRecordType
from requests import ConnectionError, HTTPError

from fundus.logging import basic_logger
from fundus.logging import create_logger
from fundus.publishers.base_objects import PublisherEnum
from fundus.scraping.delay import Delay
from fundus.scraping.filter import URLFilter
@@ -29,6 +29,8 @@
from fundus.scraping.session import session_handler
from fundus.scraping.url import URLSource

logger = create_logger(__name__)


@dataclass(frozen=True)
class HTML:
@@ -97,11 +99,11 @@ def filter_url(u: str) -> bool:
timestamp = time.time()

if not validators.url(url):
basic_logger.debug(f"Skipped requested URL '{url}' because the URL is malformed")
logger.debug(f"Skipped requested URL '{url}' because the URL is malformed")
continue

if filter_url(url):
basic_logger.debug(f"Skipped requested URL '{url}' because of URL filter")
logger.debug(f"Skipped requested URL '{url}' because of URL filter")
continue

session = session_handler.get_session()
@@ -114,23 +116,23 @@ def filter_url(u: str) -> bool:
response = session.get(url, headers=self.request_header)

except (HTTPError, ConnectionError) as error:
basic_logger.info(f"Skipped requested URL '{url}' because of '{error}'")
logger.info(f"Skipped requested URL '{url}' because of '{error}'")
if isinstance(error, HTTPError) and error.response.status_code >= 500:
basic_logger.info(f"Skipped {self.publisher} due to server errors: '{error}'")
logger.info(f"Skipped {self.publisher} due to server errors: '{error}'")
continue

except Exception as error:
basic_logger.warning(f"Warning! Skipped requested URL '{url}' because of an unexpected error {error}")
logger.warning(f"Warning! Skipped requested URL '{url}' because of an unexpected error {error}")
continue

else:
if filter_url(str(response.url)):
basic_logger.debug(f"Skipped responded URL '{str(response.url)}' because of URL filter")
logger.debug(f"Skipped responded URL '{str(response.url)}' because of URL filter")
continue
html = response.text

if response.history:
basic_logger.info(f"Got redirected {len(response.history)} time(s) from {url} -> {response.url}")
logger.info(f"Got redirected {len(response.history)} time(s) from {url} -> {response.url}")

source_info = (
WebSourceInfo(self.publisher, type(self.url_source).__name__, self.url_source.url)
@@ -167,20 +169,20 @@ def extract_content(record: WarcRecord) -> Optional[str]:
encoding: Optional[str] = chardet.detect(warc_body)["encoding"]

if encoding is not None:
basic_logger.debug(
logger.debug(
f"Trying to decode record {record.record_id!r} from {target_url!r} "
f"using detected encoding {encoding}."
)

try:
return str(warc_body, encoding=encoding)
except UnicodeDecodeError:
basic_logger.warning(
logger.warning(
f"Couldn't decode record {record.record_id!r} from {target_url!r} with "
f"original charset {record.http_charset!r} using detected charset {encoding!r}."
)
else:
basic_logger.warning(
logger.warning(
f"Couldn't detect charset for record {record.record_id!r} from {target_url!r} "
f"with invalid original charset {record.http_charset!r}."
)
@@ -194,7 +196,7 @@ def extract_content(record: WarcRecord) -> Optional[str]:
target_url = str(warc_record.headers["WARC-Target-URI"])

if url_filter is not None and url_filter(target_url):
basic_logger.debug(f"Skipped WARC record with target URI {target_url!r} because of URL filter")
logger.debug(f"Skipped WARC record with target URI {target_url!r} because of URL filter")
continue

publisher_domain: str = urlparse(target_url).netloc
@@ -205,7 +207,7 @@
publisher = self._publisher_mapping[publisher_domain]

if publisher.url_filter is not None and publisher.url_filter(target_url):
basic_logger.debug(
logger.debug(
f"Skipped WARC record with target URI {target_url!r} because of "
f"publisher specific URL filter"
)
14 changes: 7 additions & 7 deletions src/fundus/scraping/scraper.py
@@ -2,7 +2,7 @@

import more_itertools

from fundus.logging import basic_logger
from fundus.logging import create_logger
from fundus.parser import ParserProxy
from fundus.publishers.base_objects import PublisherEnum
from fundus.scraping.article import Article
@@ -15,6 +15,8 @@
from fundus.scraping.html import CCNewsSource, HTMLSource, WebSource
from fundus.scraping.url import URLSource

logger = create_logger(__name__)


class BaseScraper:
def __init__(self, *sources: HTMLSource, parser_mapping: Dict[str, ParserProxy]):
@@ -37,27 +39,25 @@ def scrape(
except Exception as err:
if error_handling == "raise":
error_message = f"Run into an error processing article '{html.requested_url}'"
basic_logger.error(error_message)
logger.error(error_message)
err.args = (str(err) + "\n\n" + error_message,)
raise err
elif error_handling == "catch":
yield Article(html=html, exception=err)
elif error_handling == "suppress":
basic_logger.info(f"Skipped article at '{html.requested_url}' because of: {err!r}")
logger.info(f"Skipped article at '{html.requested_url}' because of: {err!r}")
else:
raise ValueError(f"Unknown value '{error_handling}' for parameter <error_handling>'")

else:
if extraction_filter and (filter_result := extraction_filter(extraction)):
if isinstance(filter_result, FilterResultWithMissingAttributes):
basic_logger.debug(
logger.debug(
f"Skipped article at '{html.requested_url}' because attribute(s) "
f"{', '.join(filter_result.missing_attributes)!r} is(are) missing"
)
else:
basic_logger.debug(
f"Skipped article at '{html.requested_url}' because of extraction filter"
)
logger.debug(f"Skipped article at '{html.requested_url}' because of extraction filter")
else:
article = Article.from_extracted(html=html, extracted=extraction)
yield article
10 changes: 6 additions & 4 deletions src/fundus/scraping/session.py
@@ -5,7 +5,9 @@
import requests.adapters
from typing_extensions import Self

from fundus.logging import basic_logger
from fundus.logging import create_logger

logger = create_logger(__name__)

_default_header = {"user-agent": "Fundus"}

@@ -40,14 +42,14 @@ def _session_factory(self) -> requests.Session:
A new requests.Session
"""

basic_logger.debug("Creating new session")
logger.debug("Creating new session")
session = requests.Session()

def _response_log(response: requests.Response, *args, **kwargs) -> None:
history = response.history
previous_status_codes = [f"({response.status_code})" for response in history] if history else []
status_code_chain = " -> ".join(previous_status_codes + [f"({response.status_code})"])
basic_logger.debug(
logger.debug(
f"{status_code_chain} <{response.request.method} {response.url!r}> "
f"took {response.elapsed.total_seconds()} second(s)"
)
@@ -92,7 +94,7 @@ def close_current_session(self) -> None:
"""
if self.session is not None:
session = self.get_session()
basic_logger.debug(f"Close session {session}")
logger.debug(f"Close session {session}")
session.close()
self.session = None

14 changes: 8 additions & 6 deletions src/fundus/scraping/url.py
@@ -12,10 +12,12 @@
from lxml.etree import XPath
from requests import ConnectionError, HTTPError

from fundus.logging import basic_logger
from fundus.logging import create_logger
from fundus.scraping.filter import URLFilter, inverse
from fundus.scraping.session import _default_header, session_handler

logger = create_logger(__name__)


class _ArchiveDecompressor:
def __init__(self):
@@ -45,7 +47,7 @@ def __post_init__(self):
if not self._request_header:
self._request_header = _default_header
if not validators.url(self.url, strict_query=False):
basic_logger.error(f"{type(self).__name__} initialized with invalid URL {self.url}")
logger.error(f"{type(self).__name__} initialized with invalid URL {self.url}")

def set_header(self, request_header: Dict[str, str]) -> None:
self._request_header = request_header
@@ -78,7 +80,7 @@ def __iter__(self) -> Iterator[str]:
html = response.text
rss_feed = feedparser.parse(html)
if exception := rss_feed.get("bozo_exception"):
basic_logger.warning(f"Warning! Couldn't parse rss feed '{self.url}' because of {exception}")
logger.warning(f"Warning! Couldn't parse rss feed '{self.url}' because of {exception}")
return
else:
for url in (entry["link"] for entry in rss_feed["entries"]):
@@ -99,17 +101,17 @@
def yield_recursive(sitemap_url: str) -> Iterator[str]:
session = session_handler.get_session()
if not validators.url(sitemap_url):
basic_logger.info(f"Skipped sitemap '{sitemap_url}' because the URL is malformed")
logger.info(f"Skipped sitemap '{sitemap_url}' because the URL is malformed")
try:
response = session.get(url=sitemap_url, headers=self._request_header)
except (HTTPError, ConnectionError) as error:
basic_logger.warning(f"Warning! Couldn't reach sitemap '{sitemap_url}' because of {error}")
logger.warning(f"Warning! Couldn't reach sitemap '{sitemap_url}' because of {error}")
return
content = response.content
if (content_type := response.headers.get("content-type")) in self._decompressor.supported_file_formats:
content = self._decompressor.decompress(content, content_type)
if not content:
basic_logger.warning(f"Warning! Empty sitemap at '{sitemap_url}'")
logger.warning(f"Warning! Empty sitemap at '{sitemap_url}'")
return
tree = lxml.html.fromstring(content)
urls = [node.text_content() for node in self._url_selector(tree)]