diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml new file mode 100644 index 0000000..c5420cd --- /dev/null +++ b/.pre-commit-config.yaml @@ -0,0 +1,19 @@ +repos: +- repo: https://github.com/PyCQA/isort + rev: 5.13.1 + hooks: + - id: isort +- repo: https://github.com/psf/black + rev: 24.3.0 + hooks: + - id: black +- repo: https://github.com/pycqa/flake8 + rev: 7.0.0 + hooks: + - id: flake8 +- repo: https://github.com/adamchainz/blacken-docs + rev: 1.16.0 + hooks: + - id: blacken-docs + additional_dependencies: + - black==24.3.0 diff --git a/docs/asyncio_api.rst b/docs/asyncio_api.rst index 2c6e568..c9a1e4a 100644 --- a/docs/asyncio_api.rst +++ b/docs/asyncio_api.rst @@ -14,9 +14,11 @@ You can use the method ``request_raw`` to perform individual requests: client = AsyncZyteAPI(api_key="YOUR_API_KEY") + async def single_request(url): return await client.get({"url": url, "browserHtml": True}) + response = asyncio.run(single_request("https://books.toscrape.com")) .. tip:: You can skip the ``api_key`` parameter if you :ref:`use an environment @@ -34,12 +36,10 @@ parallel, using multiple connections: from zyte_api import AsyncZyteAPI, create_session from zyte_api.aio.errors import RequestError + async def extract_from(urls, n_conn): client = AsyncZyteAPI(n_conn=n_conn) - requests = [ - {"url": url, "browserHtml": True} - for url in urls - ] + requests = [{"url": url, "browserHtml": True} for url in urls] async with create_session(n_conn) as session: res_iter = client.iter(requests, session=session) for fut in res_iter: @@ -51,6 +51,7 @@ parallel, using multiple connections: print(e, file=sys.stderr) raise + urls = ["https://toscrape.com", "https://books.toscrape.com"] asyncio.run(extract_from(urls, n_conn=15)) diff --git a/docs/conf.py b/docs/conf.py index 4fa4cfd..5002ec5 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -12,19 +12,22 @@ # import os import sys -sys.path.insert(0, os.path.abspath('../')) + +import sphinx_rtd_theme + +sys.path.insert(0, os.path.abspath("../")) # -- Project information ----------------------------------------------------- -project = u'python-zyte-api' -copyright = u'2021, Zyte Group Ltd' -author = u'Zyte Group Ltd' +project = "python-zyte-api" +copyright = "2021, Zyte Group Ltd" +author = "Zyte Group Ltd" # The short X.Y version -version = u'' +version = "" # The full version, including alpha/beta/rc tags -release = u'0.4.8' +release = "0.4.8" # -- General configuration --------------------------------------------------- @@ -37,37 +40,37 @@ # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom # ones. extensions = [ - 'sphinx.ext.autodoc', - 'sphinx.ext.intersphinx', - 'sphinx.ext.ifconfig', - 'sphinx.ext.viewcode', - 'sphinx.ext.githubpages', - 'sphinx.ext.autosummary', + "sphinx.ext.autodoc", + "sphinx.ext.intersphinx", + "sphinx.ext.ifconfig", + "sphinx.ext.viewcode", + "sphinx.ext.githubpages", + "sphinx.ext.autosummary", ] # Add any paths that contain templates here, relative to this directory. -templates_path = ['_templates'] +templates_path = ["_templates"] # The suffix(es) of source filenames. # You can specify multiple suffix as a list of string: # # source_suffix = ['.rst', '.md'] -source_suffix = '.rst' +source_suffix = ".rst" # The master toctree document. -master_doc = 'index' +master_doc = "index" # The language for content autogenerated by Sphinx. Refer to documentation # for a list of supported languages. # # This is also used if you do content translation via gettext catalogs. # Usually you set "language" from the command line for these cases. -language = 'en' +language = "en" # List of patterns, relative to source directory, that match files and # directories to ignore when looking for source files. # This pattern also affects html_static_path and html_extra_path. -exclude_patterns = [u'_build', 'Thumbs.db', '.DS_Store'] +exclude_patterns = ["_build", "Thumbs.db", ".DS_Store"] # The name of the Pygments (syntax highlighting) style to use. pygments_style = None @@ -78,12 +81,11 @@ # The theme to use for HTML and HTML Help pages. See the documentation for # a list of builtin themes. # -html_theme = 'sphinx_rtd_theme' +html_theme = "sphinx_rtd_theme" # Add any paths that contain custom themes here, relative to this directory. # Add path to the RTD explicitly to robustify builds (otherwise might # fail in a clean Debian build env) -import sphinx_rtd_theme html_theme_path = [sphinx_rtd_theme.get_html_theme_path()] # Theme options are theme-specific and customize the look and feel of a theme @@ -111,7 +113,7 @@ # -- Options for HTMLHelp output --------------------------------------------- # Output file base name for HTML help builder. -htmlhelp_basename = 'python-zyte-apidoc' +htmlhelp_basename = "python-zyte-apidoc" # -- Options for LaTeX output ------------------------------------------------ @@ -120,15 +122,12 @@ # The paper size ('letterpaper' or 'a4paper'). # # 'papersize': 'letterpaper', - # The font size ('10pt', '11pt' or '12pt'). # # 'pointsize': '10pt', - # Additional stuff for the LaTeX preamble. # # 'preamble': '', - # Latex figure (float) alignment # # 'figure_align': 'htbp', @@ -138,8 +137,13 @@ # (source start file, target name, title, # author, documentclass [howto, manual, or own class]). latex_documents = [ - (master_doc, 'python-zyte-api.tex', u'python-zyte-api Documentation', - u'Zyte Group Ltd', 'manual'), + ( + master_doc, + "python-zyte-api.tex", + "python-zyte-api Documentation", + "Zyte Group Ltd", + "manual", + ), ] @@ -148,8 +152,7 @@ # One entry per manual page. List of tuples # (source start file, name, description, authors, manual section). man_pages = [ - (master_doc, 'python-zyte-api', u'python-zyte-api Documentation', - [author], 1) + (master_doc, "python-zyte-api", "python-zyte-api Documentation", [author], 1) ] @@ -159,9 +162,15 @@ # (source start file, target name, title, author, # dir menu entry, description, category) texinfo_documents = [ - (master_doc, 'python-zyte-api', u'python-zyte-api Documentation', - author, 'python-zyte-api', 'One line description of project.', - 'Miscellaneous'), + ( + master_doc, + "python-zyte-api", + "python-zyte-api Documentation", + author, + "python-zyte-api", + "One line description of project.", + "Miscellaneous", + ), ] @@ -180,22 +189,31 @@ # epub_uid = '' # A list of files that should not be packed into the epub file. -epub_exclude_files = ['search.html'] +epub_exclude_files = ["search.html"] # -- Extension configuration ------------------------------------------------- # -- Options for intersphinx extension --------------------------------------- intersphinx_mapping = { - 'python': ('https://docs.python.org/3', None, ), - 'aiohttp': ('https://docs.aiohttp.org/en/stable/', None, ), - 'tenacity': ('https://tenacity.readthedocs.io/en/latest/', None, ), + "python": ( + "https://docs.python.org/3", + None, + ), + "aiohttp": ( + "https://docs.aiohttp.org/en/stable/", + None, + ), + "tenacity": ( + "https://tenacity.readthedocs.io/en/latest/", + None, + ), } autodoc_default_options = { # 'special-members': '__init__,__call__', # 'undoc-members': True, - 'exclude-members': '__weakref__' + "exclude-members": "__weakref__" } add_module_names = False diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 0000000..830e253 --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,6 @@ +[tool.isort] +profile = "black" +multi_line_output = 3 + +[tool.black] +target-version = ["py38", "py39", "py310", "py311", "py312"] diff --git a/setup.cfg b/setup.cfg new file mode 100644 index 0000000..5efe5f2 --- /dev/null +++ b/setup.cfg @@ -0,0 +1,12 @@ +[flake8] +ignore = + # Style issues handled by black. + E501, + E203, + W503, + +per-file-ignores = + # F401: Ignore "imported but unused" errors in __init__ files, as those + # imports are there to expose submodule functions so they can be imported + # directly from that module + zyte_api/__init__.py:F401 \ No newline at end of file diff --git a/setup.py b/setup.py index 658699a..7383672 100755 --- a/setup.py +++ b/setup.py @@ -1,48 +1,49 @@ #!/usr/bin/env python import os -from setuptools import setup, find_packages + +from setuptools import find_packages, setup def get_version(): about = {} here = os.path.abspath(os.path.dirname(__file__)) - with open(os.path.join(here, 'zyte_api/__version__.py')) as f: + with open(os.path.join(here, "zyte_api/__version__.py")) as f: exec(f.read(), about) - return about['__version__'] + return about["__version__"] setup( - name='zyte-api', + name="zyte-api", version=get_version(), - description='Python interface to Zyte API', - long_description=open('README.rst').read() + "\n\n" + open('CHANGES.rst').read(), - long_description_content_type='text/x-rst', - author='Zyte Group Ltd', - author_email='opensource@zyte.com', - url='https://github.com/zytedata/python-zyte-api', - packages=find_packages(exclude=['tests', 'examples']), - entry_points = { - 'console_scripts': ['zyte-api=zyte_api.__main__:_main'], + description="Python interface to Zyte API", + long_description=open("README.rst").read() + "\n\n" + open("CHANGES.rst").read(), + long_description_content_type="text/x-rst", + author="Zyte Group Ltd", + author_email="opensource@zyte.com", + url="https://github.com/zytedata/python-zyte-api", + packages=find_packages(exclude=["tests", "examples"]), + entry_points={ + "console_scripts": ["zyte-api=zyte_api.__main__:_main"], }, install_requires=[ - 'aiohttp >= 3.8.0', - 'attrs', - 'brotli', - 'runstats', - 'tenacity', - 'tqdm', - 'w3lib >= 2.1.1', + "aiohttp >= 3.8.0", + "attrs", + "brotli", + "runstats", + "tenacity", + "tqdm", + "w3lib >= 2.1.1", ], classifiers=[ - 'Development Status :: 3 - Alpha', - 'Intended Audience :: Developers', - 'License :: OSI Approved :: BSD License', - 'Natural Language :: English', - 'Operating System :: OS Independent', - 'Programming Language :: Python :: 3', - 'Programming Language :: Python :: 3.8', - 'Programming Language :: Python :: 3.9', - 'Programming Language :: Python :: 3.10', - 'Programming Language :: Python :: 3.11', + "Development Status :: 3 - Alpha", + "Intended Audience :: Developers", + "License :: OSI Approved :: BSD License", + "Natural Language :: English", + "Operating System :: OS Independent", + "Programming Language :: Python :: 3", + "Programming Language :: Python :: 3.8", + "Programming Language :: Python :: 3.9", + "Programming Language :: Python :: 3.10", + "Programming Language :: Python :: 3.11", ], ) diff --git a/tests/mockserver.py b/tests/mockserver.py index ea8fffb..8f4330d 100644 --- a/tests/mockserver.py +++ b/tests/mockserver.py @@ -4,18 +4,14 @@ import sys import time from base64 import b64encode -from contextlib import asynccontextmanager from importlib import import_module from subprocess import PIPE, Popen -from typing import Any, Dict, Optional +from typing import Any, Dict from urllib.parse import urlparse -from pytest_twisted import ensureDeferred from twisted.internet import reactor -from twisted.internet.defer import Deferred -from twisted.internet.task import deferLater from twisted.web.resource import Resource -from twisted.web.server import NOT_DONE_YET, Site +from twisted.web.server import Site def get_ephemeral_port(): diff --git a/tests/test_async.py b/tests/test_async.py index 0d15df0..eb93054 100644 --- a/tests/test_async.py +++ b/tests/test_async.py @@ -1,10 +1,8 @@ -from types import GeneratorType +import pytest from zyte_api import AsyncZyteAPI from zyte_api.apikey import NoApiKey -import pytest - def test_api_key(): AsyncZyteAPI(api_key="a") @@ -15,8 +13,13 @@ def test_api_key(): @pytest.mark.asyncio async def test_get(mockserver): client = AsyncZyteAPI(api_key="a", api_url=mockserver.urljoin("/")) - expected_result = {"url": "https://a.example", "httpResponseBody": "PGh0bWw+PGJvZHk+SGVsbG88aDE+V29ybGQhPC9oMT48L2JvZHk+PC9odG1sPg=="} - actual_result = await client.get({"url": "https://a.example", "httpResponseBody": True}) + expected_result = { + "url": "https://a.example", + "httpResponseBody": "PGh0bWw+PGJvZHk+SGVsbG88aDE+V29ybGQhPC9oMT48L2JvZHk+PC9odG1sPg==", + } + actual_result = await client.get( + {"url": "https://a.example", "httpResponseBody": True} + ) assert actual_result == expected_result @@ -29,9 +32,15 @@ async def test_iter(mockserver): {"url": "https://b.example", "httpResponseBody": True}, ] expected_results = [ - {"url": "https://a.example", "httpResponseBody": "PGh0bWw+PGJvZHk+SGVsbG88aDE+V29ybGQhPC9oMT48L2JvZHk+PC9odG1sPg=="}, + { + "url": "https://a.example", + "httpResponseBody": "PGh0bWw+PGJvZHk+SGVsbG88aDE+V29ybGQhPC9oMT48L2JvZHk+PC9odG1sPg==", + }, Exception, - {"url": "https://b.example", "httpResponseBody": "PGh0bWw+PGJvZHk+SGVsbG88aDE+V29ybGQhPC9oMT48L2JvZHk+PC9odG1sPg=="}, + { + "url": "https://b.example", + "httpResponseBody": "PGh0bWw+PGJvZHk+SGVsbG88aDE+V29ybGQhPC9oMT48L2JvZHk+PC9odG1sPg==", + }, ] actual_results = [] for future in client.iter(queries): diff --git a/tests/test_client.py b/tests/test_client.py index cd4c214..ae34726 100644 --- a/tests/test_client.py +++ b/tests/test_client.py @@ -12,11 +12,11 @@ USER_AGENT, ), ( - f'scrapy-zyte-api/0.11.1 {USER_AGENT}', - f'scrapy-zyte-api/0.11.1 {USER_AGENT}', + f"scrapy-zyte-api/0.11.1 {USER_AGENT}", + f"scrapy-zyte-api/0.11.1 {USER_AGENT}", ), ), ) def test_user_agent(user_agent, expected): - client = AsyncClient(api_key='123', api_url='http:\\test', user_agent=user_agent) + client = AsyncClient(api_key="123", api_url="http:\\test", user_agent=user_agent) assert client.user_agent == expected diff --git a/tests/test_main.py b/tests/test_main.py index 0087774..85573af 100644 --- a/tests/test_main.py +++ b/tests/test_main.py @@ -1,7 +1,7 @@ import json import os from json import JSONDecodeError -from unittest.mock import Mock, patch, AsyncMock +from unittest.mock import AsyncMock, Mock, patch import pytest diff --git a/tox.ini b/tox.ini index fca46c5..fec688d 100644 --- a/tox.ini +++ b/tox.ini @@ -31,3 +31,7 @@ deps = basepython = python3 commands = sphinx-build -W -b html . {envtmpdir}/html + +[testenv:pre-commit] +deps = pre-commit +commands = pre-commit run --all-files --show-diff-on-failure diff --git a/zyte_api/__main__.py b/zyte_api/__main__.py index 3e03e70..b81dc63 100644 --- a/zyte_api/__main__.py +++ b/zyte_api/__main__.py @@ -10,12 +10,9 @@ import tqdm from tenacity import retry_if_exception -from zyte_api.aio.client import ( - create_session, - AsyncClient, -) +from zyte_api.aio.client import AsyncClient, create_session from zyte_api.aio.retry import RetryFactory, _is_throttling_error -from zyte_api.constants import ENV_VARIABLE, API_URL +from zyte_api.constants import API_URL, ENV_VARIABLE from zyte_api.utils import _guess_intype @@ -167,7 +164,7 @@ def _main(program_name="zyte-api"): p.add_argument( "--store-errors", help="when set to true, it includes all types of responses, and when set to false," - " it includes only error-free responses in the output.", + " it includes only error-free responses in the output.", ) args = p.parse_args() logging.basicConfig(stream=sys.stderr, level=getattr(logging, args.loglevel)) diff --git a/zyte_api/__version__.py b/zyte_api/__version__.py index 5bf52d5..a3a9bd5 100644 --- a/zyte_api/__version__.py +++ b/zyte_api/__version__.py @@ -1 +1 @@ -__version__ = '0.4.8' +__version__ = "0.4.8" diff --git a/zyte_api/_async.py b/zyte_api/_async.py index 8156bde..6553c5c 100644 --- a/zyte_api/_async.py +++ b/zyte_api/_async.py @@ -1,19 +1,18 @@ import asyncio import time from functools import partial -from typing import Any, Dict, Iterator, List, Optional, TYPE_CHECKING, Union +from typing import TYPE_CHECKING, Any, Dict, Iterator, List, Optional import aiohttp from tenacity import AsyncRetrying +from ._utils import _AIO_API_TIMEOUT from .aio.errors import RequestError from .aio.retry import zyte_api_retrying from .apikey import get_apikey -from .constants import API_URL, API_TIMEOUT +from .constants import API_URL from .stats import AggStats, ResponseStats from .utils import USER_AGENT, _process_query -from ._utils import _AIO_API_TIMEOUT, create_session - if TYPE_CHECKING: _ResponseFuture = asyncio.Future[Dict[str, Any]] @@ -22,11 +21,9 @@ def _post_func(session): - """ Return a function to send a POST request """ + """Return a function to send a POST request""" if session is None: - return partial(aiohttp.request, - method='POST', - timeout=_AIO_API_TIMEOUT) + return partial(aiohttp.request, method="POST", timeout=_AIO_API_TIMEOUT) else: return session.post @@ -52,7 +49,7 @@ async def get( self, query: dict, *, - endpoint: str = 'extract', + endpoint: str = "extract", session=None, handle_retries=True, retrying: Optional[AsyncRetrying] = None, @@ -60,10 +57,7 @@ async def get( retrying = retrying or self.retrying post = _post_func(session) auth = aiohttp.BasicAuth(self.api_key) - headers = { - 'User-Agent': self.user_agent, - 'Accept-Encoding': 'br' - } + headers = {"User-Agent": self.user_agent, "Accept-Encoding": "br"} response_stats = [] start_global = time.perf_counter() @@ -125,7 +119,7 @@ def iter( self, queries: List[dict], *, - endpoint: str = 'extract', + endpoint: str = "extract", session: Optional[aiohttp.ClientSession] = None, handle_retries=True, retrying: Optional[AsyncRetrying] = None, diff --git a/zyte_api/_utils.py b/zyte_api/_utils.py index e547fb5..116206f 100644 --- a/zyte_api/_utils.py +++ b/zyte_api/_utils.py @@ -3,7 +3,6 @@ from .constants import API_TIMEOUT - # 120 seconds is probably too long, but we are concerned about the case with # many concurrent requests and some processing logic running in the same reactor, # thus, saturating the CPU. This will make timeouts more likely. @@ -11,9 +10,8 @@ def create_session(connection_pool_size=100, **kwargs) -> aiohttp.ClientSession: - """ Create a session with parameters suited for Zyte API """ - kwargs.setdefault('timeout', _AIO_API_TIMEOUT) + """Create a session with parameters suited for Zyte API""" + kwargs.setdefault("timeout", _AIO_API_TIMEOUT) if "connector" not in kwargs: - kwargs["connector"] = TCPConnector(limit=connection_pool_size, - force_close=True) + kwargs["connector"] = TCPConnector(limit=connection_pool_size, force_close=True) return aiohttp.ClientSession(**kwargs) diff --git a/zyte_api/aio/__init__.py b/zyte_api/aio/__init__.py index b69b052..9833d38 100644 --- a/zyte_api/aio/__init__.py +++ b/zyte_api/aio/__init__.py @@ -1,3 +1,3 @@ """ Asyncio client for Zyte API -""" \ No newline at end of file +""" diff --git a/zyte_api/aio/client.py b/zyte_api/aio/client.py index 68f2530..c2336db 100644 --- a/zyte_api/aio/client.py +++ b/zyte_api/aio/client.py @@ -4,23 +4,20 @@ import asyncio import time -from functools import partial -from typing import Optional, Iterator, List +from typing import Iterator, List, Optional from warnings import warn import aiohttp -from aiohttp import TCPConnector from tenacity import AsyncRetrying -from .errors import RequestError -from .retry import zyte_api_retrying from .._async import _post_func -from .._utils import _AIO_API_TIMEOUT as AIO_API_TIMEOUT, create_session +from .._utils import create_session # noqa: F401 from ..apikey import get_apikey -from ..constants import API_URL, API_TIMEOUT +from ..constants import API_URL from ..stats import AggStats, ResponseStats from ..utils import USER_AGENT, _process_query - +from .errors import RequestError +from .retry import zyte_api_retrying warn( ( @@ -33,13 +30,15 @@ class AsyncClient: - def __init__(self, *, - api_key=None, - api_url=API_URL, - n_conn=15, - retrying: Optional[AsyncRetrying] = None, - user_agent: Optional[str] = None, - ): + def __init__( + self, + *, + api_key=None, + api_url=API_URL, + n_conn=15, + retrying: Optional[AsyncRetrying] = None, + user_agent: Optional[str] = None, + ): self.api_key = get_apikey(api_key) self.api_url = api_url self.n_conn = n_conn @@ -47,19 +46,19 @@ def __init__(self, *, self.retrying = retrying or zyte_api_retrying self.user_agent = user_agent or USER_AGENT - async def request_raw(self, query: dict, *, - endpoint: str = 'extract', - session=None, - handle_retries=True, - retrying: Optional[AsyncRetrying] = None, - ): + async def request_raw( + self, + query: dict, + *, + endpoint: str = "extract", + session=None, + handle_retries=True, + retrying: Optional[AsyncRetrying] = None, + ): retrying = retrying or self.retrying post = _post_func(session) auth = aiohttp.BasicAuth(self.api_key) - headers = { - 'User-Agent': self.user_agent, - 'Accept-Encoding': 'br' - } + headers = {"User-Agent": self.user_agent, "Accept-Encoding": "br"} response_stats = [] start_global = time.perf_counter() @@ -117,13 +116,14 @@ async def request(): return result - def request_parallel_as_completed(self, - queries: List[dict], - *, - endpoint: str = 'extract', - session: Optional[aiohttp.ClientSession] = None, - ) -> Iterator[asyncio.Future]: - """ Send multiple requests to Zyte API in parallel. + def request_parallel_as_completed( + self, + queries: List[dict], + *, + endpoint: str = "extract", + session: Optional[aiohttp.ClientSession] = None, + ) -> Iterator[asyncio.Future]: + """Send multiple requests to Zyte API in parallel. Return an `asyncio.as_completed` iterator. ``queries`` is a list of requests to process (dicts). @@ -136,8 +136,6 @@ def request_parallel_as_completed(self, async def _request(query): async with sem: - return await self.request_raw(query, - endpoint=endpoint, - session=session) + return await self.request_raw(query, endpoint=endpoint, session=session) return asyncio.as_completed([_request(query) for query in queries]) diff --git a/zyte_api/aio/errors.py b/zyte_api/aio/errors.py index 8b3005c..cf12e80 100644 --- a/zyte_api/aio/errors.py +++ b/zyte_api/aio/errors.py @@ -9,10 +9,11 @@ class RequestError(ClientResponseError): - """ Exception which is raised when Request-level error is returned. + """Exception which is raised when Request-level error is returned. In contrast with ClientResponseError, it allows to inspect response content. """ + def __init__(self, *args, **kwargs): self.response_content = kwargs.pop("response_content") self.request_id = kwargs.pop("request_id", None) @@ -25,6 +26,8 @@ def parsed(self): return ParsedError.from_body(self.response_content) def __str__(self): - return f"RequestError: {self.status}, message={self.message}, " \ - f"headers={self.headers}, body={self.response_content}, " \ - f"request_id={self.request_id}" + return ( + f"RequestError: {self.status}, message={self.message}, " + f"headers={self.headers}, body={self.response_content}, " + f"request_id={self.request_id}" + ) diff --git a/zyte_api/aio/retry.py b/zyte_api/aio/retry.py index 1f8b4fb..39299b5 100644 --- a/zyte_api/aio/retry.py +++ b/zyte_api/aio/retry.py @@ -9,22 +9,24 @@ from aiohttp import client_exceptions from tenacity import ( + AsyncRetrying, + RetryCallState, + after_log, + before_log, + before_sleep_log, + retry_base, + retry_if_exception, + stop_after_attempt, + stop_after_delay, wait_chain, wait_fixed, - wait_random_exponential, wait_random, - stop_after_attempt, - stop_after_delay, - retry_if_exception, - RetryCallState, - before_sleep_log, - after_log, AsyncRetrying, before_log, retry_base, + wait_random_exponential, ) from tenacity.stop import stop_never from .errors import RequestError - logger = logging.getLogger(__name__) @@ -62,6 +64,7 @@ class RetryFactory: """ Build custom retry configuration """ + retry_condition: retry_base = ( retry_if_exception(_is_throttling_error) | retry_if_exception(_is_network_error) @@ -71,19 +74,18 @@ class RetryFactory: throttling_wait = wait_chain( # always wait 20-40s first wait_fixed(20) + wait_random(0, 20), - # wait 20-40s again wait_fixed(20) + wait_random(0, 20), - # wait from 30 to 630s, with full jitter and exponentially # increasing max wait time - wait_fixed(30) + wait_random_exponential(multiplier=1, max=600) + wait_fixed(30) + wait_random_exponential(multiplier=1, max=600), ) # connection errors, other client and server failures network_error_wait = ( # wait from 3s to ~1m - wait_random(3, 7) + wait_random_exponential(multiplier=1, max=55) + wait_random(3, 7) + + wait_random_exponential(multiplier=1, max=55) ) temporary_download_error_wait = network_error_wait throttling_stop = stop_never diff --git a/zyte_api/apikey.py b/zyte_api/apikey.py index f9b0f80..c1cc70b 100644 --- a/zyte_api/apikey.py +++ b/zyte_api/apikey.py @@ -10,11 +10,13 @@ class NoApiKey(Exception): def get_apikey(key: Optional[str] = None) -> str: - """ Return API key, probably loading it from an environment variable """ + """Return API key, probably loading it from an environment variable""" if key is not None: return key try: return os.environ[ENV_VARIABLE] except KeyError: - raise NoApiKey("API key not found. Please set {} " - "environment variable.".format(ENV_VARIABLE)) + raise NoApiKey( + "API key not found. Please set {} " + "environment variable.".format(ENV_VARIABLE) + ) diff --git a/zyte_api/constants.py b/zyte_api/constants.py index 926577d..a433302 100644 --- a/zyte_api/constants.py +++ b/zyte_api/constants.py @@ -1,10 +1,10 @@ # -*- coding: utf-8 -*- # Name of the environment variable with the API key -ENV_VARIABLE = 'ZYTE_API_KEY' +ENV_VARIABLE = "ZYTE_API_KEY" # API URL -API_URL = 'https://api.zyte.com/v1/' +API_URL = "https://api.zyte.com/v1/" # Default timeout that server uses. Client timeouts should be larger than that. API_TIMEOUT = 200 diff --git a/zyte_api/errors.py b/zyte_api/errors.py index b608bf1..8088b54 100644 --- a/zyte_api/errors.py +++ b/zyte_api/errors.py @@ -6,13 +6,14 @@ @attr.s(auto_attribs=True) class ParsedError: - """ Parsed error from Zyte API """ + """Parsed error from Zyte API""" + response_body: bytes data: Optional[dict] parse_error: Optional[str] @classmethod - def from_body(cls, response_body: bytes) -> 'ParsedError': + def from_body(cls, response_body: bytes) -> "ParsedError": data = None parse_error = None @@ -25,12 +26,8 @@ def from_body(cls, response_body: bytes) -> 'ParsedError': except (json.JSONDecodeError, UnicodeDecodeError) as _: # noqa: F841 parse_error = "bad_json" - return cls( - response_body=response_body, - data=data, - parse_error=parse_error - ) + return cls(response_body=response_body, data=data, parse_error=parse_error) @property def type(self) -> Optional[str]: - return (self.data or {}).get('type', None) + return (self.data or {}).get("type", None) diff --git a/zyte_api/stats.py b/zyte_api/stats.py index 1789ee5..42c7b6a 100644 --- a/zyte_api/stats.py +++ b/zyte_api/stats.py @@ -1,8 +1,8 @@ # -*- coding: utf-8 -*- -from typing import Optional -from collections import Counter import functools import time +from collections import Counter +from typing import Optional import attr from runstats import Statistics @@ -17,6 +17,7 @@ def wrapper(*args, **kwargs): return meth(*args, **kwargs) except ZeroDivisionError: return 0 + return wrapper @@ -26,9 +27,13 @@ def __init__(self): self.time_total_stats = Statistics() self.n_success = 0 # number of successful results returned to the user - self.n_fatal_errors = 0 # number of errors returned to the user, after all retries + self.n_fatal_errors = ( + 0 # number of errors returned to the user, after all retries + ) - self.n_attempts = 0 # total amount of requests made to Zyte API, including retries + self.n_attempts = ( + 0 # total amount of requests made to Zyte API, including retries + ) self.n_429 = 0 # number of 429 (throttling) responses self.n_errors = 0 # number of errors, including errors which were retried @@ -46,25 +51,29 @@ def __str__(self): self.error_ratio(), self.n_success, self.n_processed, - self.success_ratio() + self.success_ratio(), ) def summary(self): return ( - "\n" + - "Summary\n" + - "-------\n" + - "Mean connection time: {:0.2f}\n".format(self.time_connect_stats.mean()) + - "Mean response time: {:0.2f}\n".format(self.time_total_stats.mean()) + - "Throttle ratio: {:0.1%}\n".format(self.throttle_ratio()) + - "Attempts: {}\n".format(self.n_attempts) + - "Errors: {:0.1%}, fatal: {}, non fatal: {}\n".format( + "\n" + + "Summary\n" + + "-------\n" + + "Mean connection time: {:0.2f}\n".format( + self.time_connect_stats.mean() + ) + + "Mean response time: {:0.2f}\n".format(self.time_total_stats.mean()) + + "Throttle ratio: {:0.1%}\n".format(self.throttle_ratio()) + + "Attempts: {}\n".format(self.n_attempts) + + "Errors: {:0.1%}, fatal: {}, non fatal: {}\n".format( self.error_ratio(), self.n_fatal_errors, - self.n_errors - self.n_fatal_errors) + - "Successful URLs: {} of {}\n".format( - self.n_success, self.n_processed) + - "Success ratio: {:0.1%}\n".format(self.success_ratio()) + self.n_errors - self.n_fatal_errors, + ) + + "Successful URLs: {} of {}\n".format( + self.n_success, self.n_processed + ) + + "Success ratio: {:0.1%}\n".format(self.success_ratio()) ) @zero_on_division_error @@ -81,7 +90,7 @@ def success_ratio(self): @property def n_processed(self): - """ Total number of processed URLs """ + """Total number of processed URLs""" return self.n_success + self.n_fatal_errors diff --git a/zyte_api/utils.py b/zyte_api/utils.py index 7767a9b..1b24a60 100644 --- a/zyte_api/utils.py +++ b/zyte_api/utils.py @@ -5,7 +5,7 @@ from .__version__ import __version__ -USER_AGENT = f'python-zyte-api/{__version__}' +USER_AGENT = f"python-zyte-api/{__version__}" def _guess_intype(file_name, lines): @@ -16,7 +16,7 @@ def _guess_intype(file_name, lines): if extension == "txt": return "txt" - if re.search(r'^\s*\{', lines[0]): + if re.search(r"^\s*\{", lines[0]): return "jl" return "txt"