fix: fix CurlImpersonateHttpClient cookies handler #946

Merged · 9 commits · Feb 5, 2025
31 changes: 31 additions & 0 deletions src/crawlee/http_clients/_curl_impersonate.py
@@ -5,7 +5,9 @@
from crawlee._utils.docs import docs_group

try:
    from curl_cffi import CurlInfo
    from curl_cffi.requests import AsyncSession
    from curl_cffi.requests.cookies import Cookies, CurlMorsel
    from curl_cffi.requests.exceptions import ProxyError as CurlProxyError
    from curl_cffi.requests.exceptions import RequestException as CurlRequestError
    from curl_cffi.requests.impersonate import DEFAULT_CHROME as CURL_DEFAULT_CHROME
@@ -26,6 +28,8 @@
if TYPE_CHECKING:
    from collections.abc import Iterable

    from curl_cffi import Curl
    from curl_cffi.requests import Request as CurlRequest
    from curl_cffi.requests import Response

    from crawlee import Request
@@ -35,6 +39,16 @@
    from crawlee.statistics import Statistics


class _EmptyCookies(Cookies):
    @override
    def get_cookies_for_curl(self, request: CurlRequest) -> list[CurlMorsel]:
        return []

    @override
    def update_cookies_from_curl(self, morsels: list[CurlMorsel]) -> None:
        return None


class _CurlImpersonateResponse:
    """Adapter class for `curl_cffi.requests.Response` to conform to the `HttpResponse` protocol."""

@@ -151,6 +165,10 @@ async def crawl(
            self._ignore_http_error_status_codes,
        )

        if self._persist_cookies_per_session and session and response.curl:
            response_cookies = self._get_cookies(response.curl)
            session.cookies.update(response_cookies)

        request.loaded_url = response.url

        return HttpCrawlingResult(
@@ -194,6 +212,10 @@ async def send_request(
            self._ignore_http_error_status_codes,
        )

        if self._persist_cookies_per_session and session and response.curl:
            response_cookies = self._get_cookies(response.curl)
            session.cookies.update(response_cookies)

        return _CurlImpersonateResponse(response)

    def _get_client(self, proxy_url: str | None) -> AsyncSession:
@@ -217,6 +239,7 @@ def _get_client(self, proxy_url: str | None) -> AsyncSession:

        # Create and store the new session with the specified kwargs.
        self._client_by_proxy_url[proxy_url] = AsyncSession(**kwargs)
        self._client_by_proxy_url[proxy_url].cookies = _EmptyCookies()

        return self._client_by_proxy_url[proxy_url]
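Since `_get_client` caches one `AsyncSession` per proxy URL, every crawlee `Session` routed through the same proxy shares a single client instance, hence the need for the inert jar installed above. A hypothetical check (not part of the PR) illustrating the sharing:

    http_client = CurlImpersonateHttpClient()
    client_a = http_client._get_client('http://proxy.example:8080')
    client_b = http_client._get_client('http://proxy.example:8080')
    assert client_a is client_b  # same cached AsyncSession for the same proxy
    assert isinstance(client_a.cookies, _EmptyCookies)  # with a no-op cookie jar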

@@ -230,3 +253,11 @@ def _is_proxy_error(error: CurlRequestError) -> bool:
            return True

        return False

    @staticmethod
    def _get_cookies(curl: Curl) -> dict[str, str]:
        cookies = {}
        for curl_cookie in curl.getinfo(CurlInfo.COOKIELIST):  # type: ignore[union-attr]
            curl_morsel = CurlMorsel.from_curl_format(curl_cookie)  # type: ignore[arg-type]
            cookies[curl_morsel.name] = curl_morsel.value
        return cookies
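`CurlInfo.COOKIELIST` exposes libcurl's in-memory cookie list as one Netscape-format line per cookie, and `CurlMorsel.from_curl_format` parses such a line back into a morsel. A rough illustration (the sample line is an assumption about the format, not taken from the PR):

    from curl_cffi.requests.cookies import CurlMorsel

    # Netscape cookie-list fields, tab-separated: domain, include-subdomains
    # flag, path, secure flag, expiry (0 = session cookie), name, value.
    raw_line = b'httpbin.org\tFALSE\t/\tFALSE\t0\ta\t1'

    morsel = CurlMorsel.from_curl_format(raw_line)
    print(morsel.name, morsel.value)  # -> a 1

`_get_cookies` then flattens these morsels into a plain name-to-value dict, which is what `Session.cookies.update()` expects.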
78 changes: 76 additions & 2 deletions tests/unit/crawlers/_http/test_http_crawler.py
@@ -9,7 +9,7 @@
import respx
from httpx import Response

-from crawlee._request import Request
+from crawlee import ConcurrencySettings, Request
from crawlee.crawlers import HttpCrawler
from crawlee.http_clients import CurlImpersonateHttpClient, HttpxHttpClient
from crawlee.sessions import SessionPool
Expand Down Expand Up @@ -183,7 +183,15 @@ async def test_handles_server_error(
assert server['500_endpoint'].called


async def test_stores_cookies(httpbin: URL) -> None:
@pytest.mark.parametrize(
'http_client_class',
[
pytest.param(CurlImpersonateHttpClient, id='curl'),
pytest.param(HttpxHttpClient, id='httpx'),
],
)
async def test_stores_cookies(http_client_class: type[BaseHttpClient], httpbin: URL) -> None:
http_client = http_client_class()
visit = Mock()
track_session_usage = Mock()

@@ -192,6 +200,7 @@ async def test_stores_cookies(httpbin: URL) -> None:
        # /cookies/set might redirect us to a page that we can't access - no problem, we only care about cookies
        ignore_http_error_status_codes=[401],
        session_pool=session_pool,
        http_client=http_client,
    )

    @crawler.router.default_handler
@@ -410,3 +419,68 @@ def mark_request_execution(request: Request) -> Response:  # noqa: ARG001  # Unused
    await crawler.run([Request.from_url(url=test_url)])

    assert execution_order == ['pre-navigation-hook 1', 'pre-navigation-hook 2', 'request', 'final handler']


@pytest.mark.parametrize(
    'http_client_class',
    [
        pytest.param(CurlImpersonateHttpClient, id='curl'),
        pytest.param(HttpxHttpClient, id='httpx'),
    ],
)
async def test_isolation_cookies(http_client_class: type[BaseHttpClient], httpbin: URL) -> None:
    http_client = http_client_class()
    sessions_ids: list[str] = []
    sessions_cookies: dict[str, dict[str, str]] = {}
    response_cookies: dict[str, dict[str, str]] = {}

    crawler = HttpCrawler(
        session_pool=SessionPool(max_pool_size=1),
        http_client=http_client,
        concurrency_settings=ConcurrencySettings(max_concurrency=1),
    )

    @crawler.router.default_handler
    async def handler(context: HttpCrawlingContext) -> None:
        if not context.session:
            return

        sessions_ids.append(context.session.id)

        if context.request.unique_key not in {'1', '2'}:
            return

        sessions_cookies[context.session.id] = context.session.cookies
        response_data = json.loads(context.http_response.read())
        response_cookies[context.session.id] = response_data.get('cookies')

        if context.request.user_data.get('retire_session'):
            context.session.retire()

    await crawler.run(
        [
            # The first request sets the cookie in the session.
            str(httpbin.with_path('/cookies/set').extend_query(a=1)),
            # The second request checks the cookies in the session and then retires it.
            Request.from_url(str(httpbin.with_path('/cookies')), unique_key='1', user_data={'retire_session': True}),
            # The third request uses a new session, to make sure it does not pick up another session's cookies.
            Request.from_url(str(httpbin.with_path('/cookies')), unique_key='2'),
        ]
    )

    assert len(sessions_cookies) == 2
    assert len(response_cookies) == 2

    assert sessions_ids[0] == sessions_ids[1]

    cookie_session_id = sessions_ids[0]
    clean_session_id = sessions_ids[2]

    assert cookie_session_id != clean_session_id

    # The cookies set by the first request must match in both the response and the session store.
    assert sessions_cookies[cookie_session_id] == response_cookies[cookie_session_id] == {'a': '1'}

    # For a clean session, the cookie should be absent from both the session store and the response.
    # This confirms that no cookies leak through the shared HTTP client.
    assert sessions_cookies[clean_session_id] == response_cookies[clean_session_id] == {}
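For context, a minimal standalone sketch of the behavior the new test locks in (a hypothetical script, not part of the PR, assuming the public API used in the tests above): with the fixed client, cookies set by a server land on the crawlee `Session`, so retiring the session discards them instead of leaking them through the shared `AsyncSession`.

    import asyncio

    from crawlee.crawlers import HttpCrawler, HttpCrawlingContext
    from crawlee.http_clients import CurlImpersonateHttpClient
    from crawlee.sessions import SessionPool


    async def main() -> None:
        crawler = HttpCrawler(
            http_client=CurlImpersonateHttpClient(),
            session_pool=SessionPool(max_pool_size=1),
        )

        @crawler.router.default_handler
        async def handler(context: HttpCrawlingContext) -> None:
            if context.session:
                # After the fix, the 'a=1' cookie is stored here, not in the
                # shared curl_cffi cookie jar.
                print(context.session.id, context.session.cookies)

        await crawler.run(['https://httpbin.org/cookies/set?a=1'])


    asyncio.run(main())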