fix: fix CurlImpersonateHttpClient cookies handler (#946)
### Description

- Fix cookie handling so it matches the behavior of `HttpxHttpClient` (see the sketch below).

### Issues

- #933
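
Concretely: the shared `curl_cffi` `AsyncSession` kept a single client-level cookie jar, so crawlee sessions sharing one client could see each other's cookies (#933). The fix installs a no-op jar (`_EmptyCookies`) on the client and copies each response's cookies onto the crawlee `Session` instead, as `HttpxHttpClient` already does. A minimal sketch of the resulting behavior, using hypothetical stand-ins rather than the crawlee API:

# Hypothetical stand-ins; only the shape of the fix is real (see the diff below).

class StubSession:
    """Stands in for crawlee's Session: each session owns its own cookies."""

    def __init__(self) -> None:
        self.cookies: dict[str, str] = {}


def persist_response_cookies(session: StubSession, response_cookies: dict[str, str]) -> None:
    # Mirrors `session.cookies.update(self._get_cookies(response.curl))` from the
    # diff: cookies land on the session, never on the shared client.
    session.cookies.update(response_cookies)


first, second = StubSession(), StubSession()
persist_response_cookies(first, {'a': '1'})  # cookie set by a response in the first session

assert first.cookies == {'a': '1'}
assert second.cookies == {}  # a fresh session sees none of them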
Mantisus authored Feb 5, 2025
1 parent eae3a33 commit ed415c4
Showing 2 changed files with 107 additions and 2 deletions.
31 changes: 31 additions & 0 deletions src/crawlee/http_clients/_curl_impersonate.py
@@ -5,7 +5,9 @@
from crawlee._utils.docs import docs_group

try:
    from curl_cffi import CurlInfo
    from curl_cffi.requests import AsyncSession
    from curl_cffi.requests.cookies import Cookies, CurlMorsel
    from curl_cffi.requests.exceptions import ProxyError as CurlProxyError
    from curl_cffi.requests.exceptions import RequestException as CurlRequestError
    from curl_cffi.requests.impersonate import DEFAULT_CHROME as CURL_DEFAULT_CHROME
@@ -26,6 +28,8 @@
if TYPE_CHECKING:
    from collections.abc import Iterable

    from curl_cffi import Curl
    from curl_cffi.requests import Request as CurlRequest
    from curl_cffi.requests import Response

    from crawlee import Request
@@ -35,6 +39,16 @@
    from crawlee.statistics import Statistics


class _EmptyCookies(Cookies):
    """No-op cookie jar: the shared client stores and sends nothing; cookies are persisted per crawlee `Session` instead."""

    @override
    def get_cookies_for_curl(self, request: CurlRequest) -> list[CurlMorsel]:
        return []

    @override
    def update_cookies_from_curl(self, morsels: list[CurlMorsel]) -> None:
        return None


class _CurlImpersonateResponse:
    """Adapter class for `curl_cffi.requests.Response` to conform to the `HttpResponse` protocol."""

@@ -151,6 +165,10 @@ async def crawl(
            self._ignore_http_error_status_codes,
        )

        if self._persist_cookies_per_session and session and response.curl:
            response_cookies = self._get_cookies(response.curl)
            session.cookies.update(response_cookies)

        request.loaded_url = response.url

        return HttpCrawlingResult(
@@ -194,6 +212,10 @@ async def send_request(
            self._ignore_http_error_status_codes,
        )

        if self._persist_cookies_per_session and session and response.curl:
            response_cookies = self._get_cookies(response.curl)
            session.cookies.update(response_cookies)

        return _CurlImpersonateResponse(response)

    def _get_client(self, proxy_url: str | None) -> AsyncSession:
@@ -217,6 +239,7 @@ def _get_client(self, proxy_url: str | None) -> AsyncSession:

        # Create and store the new session with the specified kwargs.
        self._client_by_proxy_url[proxy_url] = AsyncSession(**kwargs)
        self._client_by_proxy_url[proxy_url].cookies = _EmptyCookies()

        return self._client_by_proxy_url[proxy_url]

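Every cached `AsyncSession` gets the no-op jar at creation, so cookie persistence is controlled only through the client option. A hedged usage sketch, assuming `persist_cookies_per_session` is the standard base-client option (default `True`):

from crawlee.crawlers import HttpCrawler
from crawlee.http_clients import CurlImpersonateHttpClient

# With per-session persistence disabled, response cookies are not copied onto
# the crawlee session either, so subsequent requests carry no cookies at all.
http_client = CurlImpersonateHttpClient(persist_cookies_per_session=False)
crawler = HttpCrawler(http_client=http_client)
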
@@ -230,3 +253,11 @@ def _is_proxy_error(error: CurlRequestError) -> bool:
            return True

        return False

    @staticmethod
    def _get_cookies(curl: Curl) -> dict[str, str]:
        """Collect the cookies accumulated on the curl handle into a name-value dict."""
        cookies = {}
        for curl_cookie in curl.getinfo(CurlInfo.COOKIELIST):  # type: ignore[union-attr]
            curl_morsel = CurlMorsel.from_curl_format(curl_cookie)  # type: ignore[arg-type]
            cookies[curl_morsel.name] = curl_morsel.value
        return cookies
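
For reference, `CurlInfo.COOKIELIST` yields one tab-separated, Netscape-format line per cookie (domain, include-subdomains flag, path, secure flag, expiry, name, value). A small illustration of the parsing step above, assuming `CurlMorsel.from_curl_format` accepts such a line as raw bytes (which the `type: ignore[arg-type]` hints at):

from curl_cffi.requests.cookies import CurlMorsel

# One cookie in curl's Netscape cookie-list format (tab-separated fields):
# domain, include-subdomains, path, secure, expiry, name, value.
cookie_line = b'httpbin.org\tFALSE\t/\tFALSE\t0\ta\t1'

morsel = CurlMorsel.from_curl_format(cookie_line)
assert (morsel.name, morsel.value) == ('a', '1')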
78 changes: 76 additions & 2 deletions tests/unit/crawlers/_http/test_http_crawler.py
@@ -9,7 +9,7 @@
import respx
from httpx import Response

-from crawlee._request import Request
from crawlee import ConcurrencySettings, Request
from crawlee.crawlers import HttpCrawler
from crawlee.http_clients import CurlImpersonateHttpClient, HttpxHttpClient
from crawlee.sessions import SessionPool
@@ -183,7 +183,15 @@ async def test_handles_server_error(
    assert server['500_endpoint'].called


-async def test_stores_cookies(httpbin: URL) -> None:
@pytest.mark.parametrize(
    'http_client_class',
    [
        pytest.param(CurlImpersonateHttpClient, id='curl'),
        pytest.param(HttpxHttpClient, id='httpx'),
    ],
)
async def test_stores_cookies(http_client_class: type[BaseHttpClient], httpbin: URL) -> None:
    http_client = http_client_class()
    visit = Mock()
    track_session_usage = Mock()

@@ -192,6 +200,7 @@ async def test_stores_cookies(httpbin: URL) -> None:
        # /cookies/set might redirect us to a page that we can't access - no problem, we only care about cookies
        ignore_http_error_status_codes=[401],
        session_pool=session_pool,
        http_client=http_client,
    )

    @crawler.router.default_handler
@@ -410,3 +419,68 @@ def mark_request_execution(request: Request) -> Response:  # noqa: ARG001 # Unused
    await crawler.run([Request.from_url(url=test_url)])

    assert execution_order == ['pre-navigation-hook 1', 'pre-navigation-hook 2', 'request', 'final handler']


@pytest.mark.parametrize(
    'http_client_class',
    [
        pytest.param(CurlImpersonateHttpClient, id='curl'),
        pytest.param(HttpxHttpClient, id='httpx'),
    ],
)
async def test_isolation_cookies(http_client_class: type[BaseHttpClient], httpbin: URL) -> None:
    http_client = http_client_class()
    sessions_ids: list[str] = []
    sessions_cookies: dict[str, dict[str, str]] = {}
    response_cookies: dict[str, dict[str, str]] = {}

    crawler = HttpCrawler(
        session_pool=SessionPool(max_pool_size=1),
        http_client=http_client,
        concurrency_settings=ConcurrencySettings(max_concurrency=1),
    )

    @crawler.router.default_handler
    async def handler(context: HttpCrawlingContext) -> None:
        if not context.session:
            return

        sessions_ids.append(context.session.id)

        if context.request.unique_key not in {'1', '2'}:
            return

        sessions_cookies[context.session.id] = context.session.cookies
        response_data = json.loads(context.http_response.read())
        response_cookies[context.session.id] = response_data.get('cookies')

        if context.request.user_data.get('retire_session'):
            context.session.retire()

    await crawler.run(
        [
            # The first request sets the cookie in the session
            str(httpbin.with_path('/cookies/set').extend_query(a=1)),
            # With the second request, we check the cookies in the session and then retire it
            Request.from_url(str(httpbin.with_path('/cookies')), unique_key='1', user_data={'retire_session': True}),
            # The third request is made with a new session to make sure it does not use another session's cookies
            Request.from_url(str(httpbin.with_path('/cookies')), unique_key='2'),
        ]
    )

    assert len(sessions_cookies) == 2
    assert len(response_cookies) == 2

    assert sessions_ids[0] == sessions_ids[1]

    cookie_session_id = sessions_ids[0]
    clean_session_id = sessions_ids[2]

    assert cookie_session_id != clean_session_id

    # The cookies set by the first request must match in both the response and the session store
    assert sessions_cookies[cookie_session_id] == response_cookies[cookie_session_id] == {'a': '1'}

    # For a clean session, the cookie should not be in the session store or in the response
    # This way we can be sure that no cookies are being leaked through the http client
    assert sessions_cookies[clean_session_id] == response_cookies[clean_session_id] == {}
