diff --git a/src/crawlee/http_clients/_curl_impersonate.py b/src/crawlee/http_clients/_curl_impersonate.py
index 488a27f88a..a55b2269f2 100644
--- a/src/crawlee/http_clients/_curl_impersonate.py
+++ b/src/crawlee/http_clients/_curl_impersonate.py
@@ -5,7 +5,9 @@
 from crawlee._utils.docs import docs_group
 
 try:
+    from curl_cffi import CurlInfo
     from curl_cffi.requests import AsyncSession
+    from curl_cffi.requests.cookies import Cookies, CurlMorsel
     from curl_cffi.requests.exceptions import ProxyError as CurlProxyError
     from curl_cffi.requests.exceptions import RequestException as CurlRequestError
     from curl_cffi.requests.impersonate import DEFAULT_CHROME as CURL_DEFAULT_CHROME
@@ -26,6 +28,8 @@
 if TYPE_CHECKING:
     from collections.abc import Iterable
 
+    from curl_cffi import Curl
+    from curl_cffi.requests import Request as CurlRequest
     from curl_cffi.requests import Response
 
     from crawlee import Request
@@ -35,6 +39,16 @@
     from crawlee.statistics import Statistics
 
 
+class _EmptyCookies(Cookies):
+    @override
+    def get_cookies_for_curl(self, request: CurlRequest) -> list[CurlMorsel]:
+        return []
+
+    @override
+    def update_cookies_from_curl(self, morsels: list[CurlMorsel]) -> None:
+        return None
+
+
 class _CurlImpersonateResponse:
     """Adapter class for `curl_cffi.requests.Response` to conform to the `HttpResponse` protocol."""
 
@@ -151,6 +165,10 @@ async def crawl(
             self._ignore_http_error_status_codes,
         )
 
+        if self._persist_cookies_per_session and session and response.curl:
+            response_cookies = self._get_cookies(response.curl)
+            session.cookies.update(response_cookies)
+
         request.loaded_url = response.url
 
         return HttpCrawlingResult(
@@ -194,6 +212,10 @@ async def send_request(
             self._ignore_http_error_status_codes,
         )
 
+        if self._persist_cookies_per_session and session and response.curl:
+            response_cookies = self._get_cookies(response.curl)
+            session.cookies.update(response_cookies)
+
         return _CurlImpersonateResponse(response)
 
     def _get_client(self, proxy_url: str | None) -> AsyncSession:
@@ -217,6 +239,9 @@ def _get_client(self, proxy_url: str | None) -> AsyncSession:
 
         # Create and store the new session with the specified kwargs.
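+        # Its cookie jar is then replaced with a no-op implementation, so cookies are
+        # kept on the Crawlee session instead of shared by all sessions on this client.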
         self._client_by_proxy_url[proxy_url] = AsyncSession(**kwargs)
+        self._client_by_proxy_url[proxy_url].cookies = _EmptyCookies()
 
         return self._client_by_proxy_url[proxy_url]
 
@@ -230,3 +255,11 @@ def _is_proxy_error(error: CurlRequestError) -> bool:
             return True
 
         return False
+
+    @staticmethod
+    def _get_cookies(curl: Curl) -> dict[str, str]:
+        cookies = {}
+        for curl_cookie in curl.getinfo(CurlInfo.COOKIELIST):  # type: ignore[union-attr]
+            curl_morsel = CurlMorsel.from_curl_format(curl_cookie)  # type: ignore[arg-type]
+            cookies[curl_morsel.name] = curl_morsel.value
+        return cookies
diff --git a/tests/unit/crawlers/_http/test_http_crawler.py b/tests/unit/crawlers/_http/test_http_crawler.py
index ac125cb7a7..17153a0433 100644
--- a/tests/unit/crawlers/_http/test_http_crawler.py
+++ b/tests/unit/crawlers/_http/test_http_crawler.py
@@ -9,7 +9,7 @@
 import respx
 from httpx import Response
 
-from crawlee._request import Request
+from crawlee import ConcurrencySettings, Request
 from crawlee.crawlers import HttpCrawler
 from crawlee.http_clients import CurlImpersonateHttpClient, HttpxHttpClient
 from crawlee.sessions import SessionPool
@@ -183,7 +183,15 @@
     assert server['500_endpoint'].called
 
 
-async def test_stores_cookies(httpbin: URL) -> None:
+@pytest.mark.parametrize(
+    'http_client_class',
+    [
+        pytest.param(CurlImpersonateHttpClient, id='curl'),
+        pytest.param(HttpxHttpClient, id='httpx'),
+    ],
+)
+async def test_stores_cookies(http_client_class: type[BaseHttpClient], httpbin: URL) -> None:
+    http_client = http_client_class()
     visit = Mock()
     track_session_usage = Mock()
 
@@ -192,6 +200,7 @@
         # /cookies/set might redirect us to a page that we can't access - no problem, we only care about cookies
         ignore_http_error_status_codes=[401],
         session_pool=session_pool,
+        http_client=http_client,
     )
 
     @crawler.router.default_handler
@@ -410,3 +419,68 @@
     await crawler.run([Request.from_url(url=test_url)])
 
     assert execution_order == ['pre-navigation-hook 1', 'pre-navigation-hook 2', 'request', 'final handler']
+
+
+@pytest.mark.parametrize(
+    'http_client_class',
+    [
+        pytest.param(CurlImpersonateHttpClient, id='curl'),
+        pytest.param(HttpxHttpClient, id='httpx'),
+    ],
+)
+async def test_isolation_cookies(http_client_class: type[BaseHttpClient], httpbin: URL) -> None:
+    http_client = http_client_class()
+    sessions_ids: list[str] = []
+    sessions_cookies: dict[str, dict[str, str]] = {}
+    response_cookies: dict[str, dict[str, str]] = {}
+
+    crawler = HttpCrawler(
+        session_pool=SessionPool(max_pool_size=1),
+        http_client=http_client,
+        concurrency_settings=ConcurrencySettings(max_concurrency=1),
+    )
+
+    @crawler.router.default_handler
+    async def handler(context: HttpCrawlingContext) -> None:
+        if not context.session:
+            return
+
+        sessions_ids.append(context.session.id)
+
+        if context.request.unique_key not in {'1', '2'}:
+            return
+
+        sessions_cookies[context.session.id] = context.session.cookies
+        response_data = json.loads(context.http_response.read())
+        response_cookies[context.session.id] = response_data.get('cookies')
+
+        if context.request.user_data.get('retire_session'):
+            context.session.retire()
+
+    await crawler.run(
+        [
+            # The first request sets a cookie in the session
+            str(httpbin.with_path('/cookies/set').extend_query(a=1)),
+            # With the second request, we check the session's cookies and retire the session
+            Request.from_url(str(httpbin.with_path('/cookies')), unique_key='1', user_data={'retire_session': True}),
+            # The third request is made with a new session to make sure it does not use another session's cookies
+            Request.from_url(str(httpbin.with_path('/cookies')), unique_key='2'),
+        ]
+    )
+
+    assert len(sessions_cookies) == 2
+    assert len(response_cookies) == 2
+
+    assert sessions_ids[0] == sessions_ids[1]
+
+    cookie_session_id = sessions_ids[0]
+    clean_session_id = sessions_ids[2]
+
+    assert cookie_session_id != clean_session_id
+
+    # The cookies set initially must match in both the response and the session store
+    assert sessions_cookies[cookie_session_id] == response_cookies[cookie_session_id] == {'a': '1'}
+
+    # For a clean session, the cookie should be in neither the session store nor the response.
+    # This way we can be sure that no cookies leak through the HTTP client.
+    assert sessions_cookies[clean_session_id] == response_cookies[clean_session_id] == {}
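
Reviewer note: below is a minimal sketch (not part of the patch) of the behavior this change enforces, mirroring `test_isolation_cookies`. It assumes the public `HttpCrawler` API used in the tests above, that the `persist_cookies_per_session` flag checked in `crawl()` and `send_request()` is enabled (its usual default), and an illustrative httpbin endpoint.

```python
import asyncio

from crawlee.crawlers import HttpCrawler, HttpCrawlingContext
from crawlee.http_clients import CurlImpersonateHttpClient
from crawlee.sessions import SessionPool


async def main() -> None:
    # Single-session pool: both requests below are served by the same Crawlee session.
    crawler = HttpCrawler(
        http_client=CurlImpersonateHttpClient(),
        session_pool=SessionPool(max_pool_size=1),
    )

    @crawler.router.default_handler
    async def handler(context: HttpCrawlingContext) -> None:
        # With this patch, cookies returned by the server are stored on the Crawlee
        # session (session.cookies), not in curl_cffi's client-wide cookie jar.
        if context.session:
            context.log.info(f'session={context.session.id} cookies={context.session.cookies}')

    await crawler.run(
        [
            'https://httpbin.org/cookies/set?a=1',  # server sets cookie a=1
            'https://httpbin.org/cookies',  # echoes the cookies the client sent
        ]
    )


if __name__ == '__main__':
    asyncio.run(main())
```

Previously, a cookie set during one session's request could be replayed for every other session sharing the same `AsyncSession`, because curl_cffi kept it in a client-level jar; with `_EmptyCookies` installed, only the Crawlee session's own cookies determine what is sent.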