diff --git a/.github/workflows/build.yaml b/.github/workflows/build.yaml index 3dcd9256..64292bb2 100644 --- a/.github/workflows/build.yaml +++ b/.github/workflows/build.yaml @@ -50,4 +50,3 @@ jobs: - uses: pypa/gh-action-pypi-publish@v1.5.0 with: password: ${{ secrets.PYPI_TOKEN }} - diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml index 39dc500a..c54eae5c 100644 --- a/.github/workflows/test.yaml +++ b/.github/workflows/test.yaml @@ -6,7 +6,6 @@ on: push: branches: - main - - master - bugfix/* - feature/* jobs: diff --git a/Makefile b/Makefile index de35bd0a..1a670d8f 100644 --- a/Makefile +++ b/Makefile @@ -1,7 +1,7 @@ .ONESHELL: SHELL := bash -VERSION := 0.5.4 -CURL_VERSION := curl-7.84.0 +VERSION := 0.6.0b6 +CURL_VERSION := curl-8.1.1 .preprocessed: curl_cffi/include/curl/curl.h .so_downloaded touch .preprocessed @@ -15,7 +15,7 @@ $(CURL_VERSION): tar -xf $(CURL_VERSION).tar.xz curl-impersonate-$(VERSION)/chrome/patches: $(CURL_VERSION) - curl -L "https://github.com/lwthiker/curl-impersonate/archive/refs/tags/v$(VERSION).tar.gz" \ + curl -L "https://github.com/yifeikong/curl-impersonate/archive/refs/tags/v$(VERSION).tar.gz" \ -o "curl-impersonate-$(VERSION).tar.gz" tar -xf curl-impersonate-$(VERSION).tar.gz @@ -28,7 +28,7 @@ curl_cffi/include/curl/curl.h: curl-impersonate-$(VERSION)/chrome/patches cp -R include/curl/* ../curl_cffi/include/curl/ .so_downloaded: - python preprocess/download_so.py + python preprocess/download_so.py $(VERSION) touch .so_downloaded preprocess: .preprocessed diff --git a/README-zh.md b/README-zh.md index 5b27a5ea..95b32d34 100644 --- a/README-zh.md +++ b/README-zh.md @@ -14,6 +14,16 @@ TLS 或者 JA3 指纹。如果你莫名其妙地被某个网站封锁了,可 - 预编译,不需要再自己机器上再弄一遍。 - 支持 `asyncio`,并且每个请求都可以换代理。 - 支持 http 2.0,requests 不支持。 +- 支持 websocket。 + +|库|requests|aiohttp|httpx|pycurl|curl_cffi| +|---|---|---|---|---|---| +|http2|❌|❌|✅|✅|✅| +|sync|✅|❌|✅|✅|✅| +|async|❌|✅|✅|❌|✅| +|websocket|❌|✅|❌|❌|✅| +|指纹|❌|❌|❌|❌|✅| +|速度|🐇|🐇🐇|🐇|🐇🐇|🐇🐇| ## 安装 @@ -23,8 +33,14 @@ TLS 或者 JA3 指纹。如果你莫名其妙地被某个网站封锁了,可 在其他小众平台,你可能需要先编译并安装 `curl-impersonate` 并且设置 `LD_LIBRARY_PATH` 这些 环境变量。 +安装测试版: + + pip install curl_cffi --pre + ## 使用 +尽量模仿比较新的浏览器,不要直接从下边的例子里复制 `chrome110` 去用。 + ### 类 requests ```python @@ -59,7 +75,9 @@ print(r.json()) # {'cookies': {'foo': 'bar'}} ``` -支持模拟的浏览器版本,和 [curl-impersonate](https://github.com/lwthiker/curl-impersonate) 一致: +支持模拟的浏览器版本,和我 [fork](https://github.com/yifeikong/curl-impersonate) 的 [curl-impersonate](https://github.com/lwthiker/curl-impersonate) 一致: + +不过只支持类似 Chrome 的浏览器。Firefox 的支持进展可以查看 #55 - chrome99 - chrome100 @@ -67,11 +85,15 @@ print(r.json()) - chrome104 - chrome107 - chrome110 +- chrome116 +- chrome119 +- chrome120 - chrome99_android - edge99 - edge101 - safari15_3 - safari15_5 +- safari17_2_ios ### asyncio @@ -102,6 +124,22 @@ async with AsyncSession() as s: results = await asyncio.gather(*tasks) ``` +### WebSockets + +```python +from curl_cffi.requests import Session, WebSocket + +def on_message(ws: WebSocket, message): + print(message) + +with Session() as s: + ws = s.ws_connect( + "wss://api.gemini.com/v1/marketdata/BTCUSD", + on_message=on_message, + ) + ws.run_forever() +``` + ### 类 curl 另外,你还可以使用类似 curl 的底层 API: @@ -125,7 +163,10 @@ print(body.decode()) 更多细节请查看 [英文文档](https://curl-cffi.readthedocs.io)。 -如果你用 scrapy 的话,可以参考这个中间件:[tieyongjie/scrapy-fingerprint](https://github.com/tieyongjie/scrapy-fingerprint) +如果你用 scrapy 的话,可以参考这些中间件: + +- [tieyongjie/scrapy-fingerprint](https://github.com/tieyongjie/scrapy-fingerprint) +- [jxlil/scrapy-impersonate](https://github.com/jxlil/scrapy-impersonate) 有问题和建议请优先提 issue,中英文均可,也可以加 [TG 群](https://t.me/+lL9n33eZp480MGM1) 或微信群讨论: @@ -136,6 +177,7 @@ print(body.decode()) - 该项目 fork 自:[multippt/python_curl_cffi](https://github.com/multippt/python_curl_cffi), MIT 协议发布。 - Headers/Cookies 代码来自 [httpx](https://github.com/encode/httpx/blob/master/httpx/_models.py), BSD 协议发布。 - Asyncio 支持是受 Tornado 的 curl http client 启发而做。 +- WebSocket API 的设计来自 [websocket_client](https://github.com/websocket-client/websocket-client)。 ## 赞助 diff --git a/README.md b/README.md index d0b6ec39..b3d57d6e 100644 --- a/README.md +++ b/README.md @@ -17,12 +17,14 @@ website for no obvious reason, you can give this package a try. - Pre-compiled, so you don't have to compile on your machine. - Supports `asyncio` with proxy rotation on each request. - Supports http 2.0, which requests does not. +- Supports websocket. |library|requests|aiohttp|httpx|pycurl|curl_cffi| |---|---|---|---|---|---| |http2|❌|❌|✅|✅|✅| |sync|✅|❌|✅|✅|✅| |async|❌|✅|✅|❌|✅| +|websocket|❌|✅|❌|❌|✅| |fingerprints|❌|❌|❌|❌|✅| |speed|🐇|🐇🐇|🐇|🐇🐇|🐇🐇| @@ -40,6 +42,8 @@ To install beta releases: ## Usage +Use the latest impersonate versions, do NOT copy `chrome110` here without changing. + ### requests-like ```python @@ -74,7 +78,9 @@ print(r.json()) # {'cookies': {'foo': 'bar'}} ``` -Supported impersonate versions, as supported by [curl-impersonate](https://github.com/lwthiker/curl-impersonate): +Supported impersonate versions, as supported by my [fork](https://github.com/yifeikong/curl-impersonate) of [curl-impersonate](https://github.com/lwthiker/curl-impersonate): + +However, only Chrome-like browsers are supported. Firefox support is tracked in #55 - chrome99 - chrome100 @@ -82,11 +88,15 @@ Supported impersonate versions, as supported by [curl-impersonate](https://githu - chrome104 - chrome107 - chrome110 +- chrome116 +- chrome119 +- chrome120 - chrome99_android - edge99 - edge101 - safari15_3 - safari15_5 +- safari17_2_ios ### asyncio @@ -117,6 +127,22 @@ async with AsyncSession() as s: results = await asyncio.gather(*tasks) ``` +### WebSockets + +```python +from curl_cffi.requests import Session, WebSocket + +def on_message(ws: WebSocket, message): + print(message) + +with Session() as s: + ws = s.ws_connect( + "wss://api.gemini.com/v1/marketdata/BTCUSD", + on_message=on_message, + ) + ws.run_forever() +``` + ### curl-like Alternatively, you can use the low-level curl-like API: @@ -140,13 +166,17 @@ print(body.decode()) See the [docs](https://curl-cffi.readthedocs.io) for more details. -If you are using scrapy, check out this middleware: [tieyongjie/scrapy-fingerprint](https://github.com/tieyongjie/scrapy-fingerprint) +If you are using scrapy, check out these middlewares: + +- [tieyongjie/scrapy-fingerprint](https://github.com/tieyongjie/scrapy-fingerprint) +- [jxlil/scrapy-impersonate](https://github.com/jxlil/scrapy-impersonate) ## Acknowledgement - Originally forked from [multippt/python_curl_cffi](https://github.com/multippt/python_curl_cffi), which is under the MIT license. - Headers/Cookies files are copied from [httpx](https://github.com/encode/httpx/blob/master/httpx/_models.py), which is under the BSD license. - Asyncio support is inspired by Tornado's curl http client. +- The WebSocket API is inspired by [websocket_client](https://github.com/websocket-client/websocket-client) ## Sponsor diff --git a/bump_version.sh b/bump_version.sh new file mode 100755 index 00000000..9591d5e2 --- /dev/null +++ b/bump_version.sh @@ -0,0 +1,12 @@ +#!/bin/bash + +VERSION=$1 + +# Makefile +gsed "s/^VERSION := .*/VERSION := ${VERSION}/g" -i Makefile + +# curl_cffi/__version__.py +gsed "s/^__version__ = .*/__version__ = \"${VERSION}\"/g" -i curl_cffi/__version__.py + +# pyproject.toml +gsed "s/^version = .*/version = \"${VERSION}\"/g" -i pyproject.toml diff --git a/curl_cffi/__version__.py b/curl_cffi/__version__.py index b0cfd8d3..c3bfd383 100644 --- a/curl_cffi/__version__.py +++ b/curl_cffi/__version__.py @@ -7,5 +7,5 @@ # __description__ = metadata.metadata("curl_cffi")["Summary"] # __version__ = metadata.version("curl_cffi") __description__ = "libcurl ffi bindings for Python, with impersonation support" -__version__ = "0.5.10" +__version__ = "0.6.0b6" __curl_version__ = Curl().version().decode() diff --git a/curl_cffi/const.py b/curl_cffi/const.py index c29b76ad..f81068e3 100644 --- a/curl_cffi/const.py +++ b/curl_cffi/const.py @@ -106,7 +106,7 @@ class CurlOpt(IntEnum): SSL_CTX_DATA = 10000 + 109 FTP_CREATE_MISSING_DIRS = 0 + 110 PROXYAUTH = 0 + 111 - FTP_RESPONSE_TIMEOUT = 0 + 112 + SERVER_RESPONSE_TIMEOUT = 0 + 112 IPRESOLVE = 0 + 113 MAXFILESIZE = 0 + 114 INFILESIZE_LARGE = 30000 + 115 @@ -303,14 +303,21 @@ class CurlOpt(IntEnum): MIME_OPTIONS = 0 + 315 SSH_HOSTKEYFUNCTION = 20000 + 316 SSH_HOSTKEYDATA = 10000 + 317 - HTTPBASEHEADER = 10000 + 318 - SSL_SIG_HASH_ALGS = 10000 + 319 - SSL_ENABLE_ALPS = 0 + 320 - SSL_CERT_COMPRESSION = 10000 + 321 - SSL_ENABLE_TICKET = 0 + 322 - HTTP2_PSEUDO_HEADERS_ORDER = 10000 + 323 - HTTP2_NO_SERVER_PUSH = 0 + 324 - SSL_PERMUTE_EXTENSIONS = 0 + 325 + PROTOCOLS_STR = 10000 + 318 + REDIR_PROTOCOLS_STR = 10000 + 319 + WS_OPTIONS = 0 + 320 + CA_CACHE_TIMEOUT = 0 + 321 + QUICK_EXIT = 0 + 322 + HTTPBASEHEADER = 10000 + 323 + SSL_SIG_HASH_ALGS = 10000 + 324 + SSL_ENABLE_ALPS = 0 + 325 + SSL_CERT_COMPRESSION = 10000 + 326 + SSL_ENABLE_TICKET = 0 + 327 + HTTP2_PSEUDO_HEADERS_ORDER = 10000 + 328 + HTTP2_SETTINGS = 10000 + 329 + SSL_PERMUTE_EXTENSIONS = 0 + 330 + HTTP2_WINDOW_UPDATE = 0 + 331 + ECH = 10000 + 332 if locals().get("WRITEDATA"): FILE = locals().get("WRITEDATA") @@ -328,22 +335,16 @@ class CurlInfo(IntEnum): NAMELOOKUP_TIME = 0x300000 + 4 CONNECT_TIME = 0x300000 + 5 PRETRANSFER_TIME = 0x300000 + 6 - SIZE_UPLOAD = 0x300000 + 7 SIZE_UPLOAD_T = 0x600000 + 7 - SIZE_DOWNLOAD = 0x300000 + 8 SIZE_DOWNLOAD_T = 0x600000 + 8 - SPEED_DOWNLOAD = 0x300000 + 9 SPEED_DOWNLOAD_T = 0x600000 + 9 - SPEED_UPLOAD = 0x300000 + 10 SPEED_UPLOAD_T = 0x600000 + 10 HEADER_SIZE = 0x200000 + 11 REQUEST_SIZE = 0x200000 + 12 SSL_VERIFYRESULT = 0x200000 + 13 FILETIME = 0x200000 + 14 FILETIME_T = 0x600000 + 14 - CONTENT_LENGTH_DOWNLOAD = 0x300000 + 15 CONTENT_LENGTH_DOWNLOAD_T = 0x600000 + 15 - CONTENT_LENGTH_UPLOAD = 0x300000 + 16 CONTENT_LENGTH_UPLOAD_T = 0x600000 + 16 STARTTRANSFER_TIME = 0x300000 + 17 CONTENT_TYPE = 0x100000 + 18 @@ -357,7 +358,6 @@ class CurlInfo(IntEnum): NUM_CONNECTS = 0x200000 + 26 SSL_ENGINES = 0x400000 + 27 COOKIELIST = 0x400000 + 28 - LASTSOCKET = 0x200000 + 29 FTP_ENTRY_PATH = 0x100000 + 30 REDIRECT_URL = 0x100000 + 31 PRIMARY_IP = 0x100000 + 32 @@ -371,12 +371,10 @@ class CurlInfo(IntEnum): PRIMARY_PORT = 0x200000 + 40 LOCAL_IP = 0x100000 + 41 LOCAL_PORT = 0x200000 + 42 - TLS_SESSION = 0x400000 + 43 ACTIVESOCKET = 0x500000 + 44 TLS_SSL_PTR = 0x400000 + 45 HTTP_VERSION = 0x200000 + 46 PROXY_SSL_VERIFYRESULT = 0x200000 + 47 - PROTOCOL = 0x200000 + 48 SCHEME = 0x100000 + 49 TOTAL_TIME_T = 0x600000 + 50 NAMELOOKUP_TIME_T = 0x600000 + 51 @@ -492,7 +490,7 @@ class CurlECode(IntEnum): TFTP_UNKNOWNID = 72 REMOTE_FILE_EXISTS = 73 TFTP_NOSUCHUSER = 74 - CONV_FAILED = 75 + OBSOLETE75 = 75 OBSOLETE76 = 76 SSL_CACERT_BADFILE = 77 REMOTE_FILE_NOT_FOUND = 78 @@ -517,6 +515,7 @@ class CurlECode(IntEnum): PROXY = 97 SSL_CLIENTCERT = 98 UNRECOVERABLE_POLL = 99 + ECH_REQUIRED = 100 class CurlHttpVersion(IntEnum): @@ -527,3 +526,12 @@ class CurlHttpVersion(IntEnum): V2TLS = 4 # use version 2 for HTTPS, version 1.1 for HTTP */ V2_PRIOR_KNOWLEDGE = 5 # please use HTTP 2 without HTTP/1.1 Upgrade */ V3 = 30 # Makes use of explicit HTTP/3 without fallback. + + +class CurlWsFlag(IntEnum): + TEXT = (1<<0) + BINARY = (1<<1) + CONT = (1<<2) + CLOSE = (1<<3) + PING = (1<<4) + OFFSET = (1<<5) diff --git a/curl_cffi/curl.py b/curl_cffi/curl.py index 48e4d096..672dcd0b 100644 --- a/curl_cffi/curl.py +++ b/curl_cffi/curl.py @@ -6,7 +6,7 @@ import certifi from ._wrapper import ffi, lib # type: ignore -from .const import CurlHttpVersion, CurlInfo, CurlOpt +from .const import CurlHttpVersion, CurlInfo, CurlOpt, CurlWsFlag DEFAULT_CACERT = certifi.where() @@ -112,6 +112,10 @@ def _set_error_buffer(self): self.setopt(CurlOpt.VERBOSE, 1) lib._curl_easy_setopt(self._curl, CurlOpt.DEBUGFUNCTION, lib.debug_function) + def debug(self): + self.setopt(CurlOpt.VERBOSE, 1) + lib._curl_easy_setopt(self._curl, CurlOpt.DEBUGFUNCTION, lib.debug_function) + def __del__(self): self.close() @@ -341,3 +345,26 @@ def close(self): self._curl = None ffi.release(self._error_buffer) self._resolve = ffi.NULL + + def ws_recv(self, n: int = 1024): + buffer = ffi.new("char[]", n) + n_recv = ffi.new("int *") + p_frame = ffi.new("struct curl_ws_frame **") + + ret = lib.curl_ws_recv(self._curl, buffer, n, n_recv, p_frame) + self._check_error(ret, "WS_RECV") + + # Frame meta explained: https://curl.se/libcurl/c/curl_ws_meta.html + frame = p_frame[0] + + return ffi.buffer(buffer)[: n_recv[0]], frame + + def ws_send(self, payload: bytes, flags: CurlWsFlag = CurlWsFlag.BINARY) -> int: + n_sent = ffi.new("int *") + buffer = ffi.from_buffer(payload) + ret = lib.curl_ws_send(self._curl, buffer, len(buffer), n_sent, 0, flags) + self._check_error(ret, "WS_SEND") + return n_sent[0] + + def ws_close(self): + self.ws_send(b"", CurlWsFlag.CLOSE) diff --git a/curl_cffi/ffi/cdef.c b/curl_cffi/ffi/cdef.c index 25b47289..b5dc98ba 100644 --- a/curl_cffi/ffi/cdef.c +++ b/curl_cffi/ffi/cdef.c @@ -46,3 +46,14 @@ struct CURLMsg *curl_multi_info_read(void* curlm, int *msg_in_queue); extern "Python" void socket_function(void *curl, int sockfd, int what, void *clientp, void *socketp); extern "Python" void timer_function(void *curlm, int timeout_ms, void *clientp); +// websocket +struct curl_ws_frame { + int age; /* zero */ + int flags; /* See the CURLWS_* defines */ + long offset; /* the offset of this data into the frame */ + long bytesleft; /* number of pending bytes left of the payload */ + ...; +}; + +int curl_ws_recv(void *curl, void *buffer, int buflen, int *recv, struct curl_ws_frame **meta); +int curl_ws_send(void *curl, void *buffer, int buflen, int *sent, int fragsize, unsigned int sendflags); diff --git a/curl_cffi/requests/__init__.py b/curl_cffi/requests/__init__.py index 24beb15f..80ab213f 100644 --- a/curl_cffi/requests/__init__.py +++ b/curl_cffi/requests/__init__.py @@ -15,6 +15,7 @@ "Headers", "Request", "Response", + "WebSocket", ] from functools import partial @@ -27,6 +28,7 @@ from .errors import RequestsError from .headers import Headers, HeaderTypes from .session import AsyncSession, BrowserType, Session +from .websockets import WebSocket # ThreadType = Literal["eventlet", "gevent", None] diff --git a/curl_cffi/requests/exceptions.py b/curl_cffi/requests/exceptions.py new file mode 100644 index 00000000..efbda06f --- /dev/null +++ b/curl_cffi/requests/exceptions.py @@ -0,0 +1,3 @@ +from .errors import RequestsError + +RequestsException = RequestsError diff --git a/curl_cffi/requests/session.py b/curl_cffi/requests/session.py index 29418f69..a28c4163 100644 --- a/curl_cffi/requests/session.py +++ b/curl_cffi/requests/session.py @@ -19,6 +19,7 @@ from .errors import RequestsError from .headers import Headers, HeaderTypes from .models import Request, Response +from .websockets import WebSocket try: import gevent @@ -40,15 +41,27 @@ class BrowserType(str, Enum): chrome104 = "chrome104" chrome107 = "chrome107" chrome110 = "chrome110" + chrome116 = "chrome116" + chrome119 = "chrome119" + chrome120 = "chrome120" chrome99_android = "chrome99_android" safari15_3 = "safari15_3" safari15_5 = "safari15_5" + safari17_2_ios = "safari17_2_ios" + + chrome = "chrome120" @classmethod def has(cls, item): return item in cls.__members__ +class BrowserSpec: + """A more structured way of selecting browsers""" + + # TODO + + def _update_url_params(url: str, params: Dict) -> str: """Add GET params to provided URL being aware of existing. @@ -239,7 +252,7 @@ def _set_curl_options( h.update(headers) # remove Host header if it's unnecessary, otherwise curl maybe confused. - # Host header will be automatically add by curl if it's not present. + # Host header will be automatically added by curl if it's not present. # https://github.com/yifeikong/curl_cffi/issues/119 host_header = h.get("Host") if host_header is not None: @@ -575,6 +588,31 @@ def stream(self, *args, **kwargs): finally: rsp.close() + def ws_connect( + self, + url, + *args, + on_message: Optional[Callable[[WebSocket, str], None]] = None, + on_error: Optional[Callable[[WebSocket, str], None]] = None, + on_open: Optional[Callable] = None, + on_close: Optional[Callable] = None, + **kwargs, + ): + self._set_curl_options(self.curl, "GET", url, *args, **kwargs) + + # https://curl.se/docs/websocket.html + self.curl.setopt(CurlOpt.CONNECT_ONLY, 2) + self.curl.perform() + + return WebSocket( + self, + self.curl, + on_message=on_message, + on_error=on_error, + on_open=on_open, + on_close=on_close, + ) + def request( self, method: str, @@ -748,16 +786,20 @@ def __init__( ``` """ super().__init__(**kwargs) - self.loop = loop + self._loop = loop self._acurl = async_curl self.max_clients = max_clients self._closed = False self.init_pool() + @property + def loop(self): + if self._loop is None: + self._loop = asyncio.get_running_loop() + return self._loop + @property def acurl(self): - if self.loop is None: - self.loop = asyncio.get_running_loop() if self._acurl is None: self._acurl = AsyncCurl(loop=self.loop) return self._acurl @@ -818,6 +860,14 @@ async def stream(self, *args, **kwargs): finally: await rsp.aclose() + async def ws_connect(self, url, *args, **kwargs): + curl = await self.pop_curl() + # curl.debug() + self._set_curl_options(curl, "GET", url, *args, **kwargs) + curl.setopt(CurlOpt.CONNECT_ONLY, 2) # https://curl.se/docs/websocket.html + await self.loop.run_in_executor(None, curl.perform) + return WebSocket(self, curl) + async def request( self, method: str, diff --git a/curl_cffi/requests/websockets.py b/curl_cffi/requests/websockets.py new file mode 100644 index 00000000..9fbbbd8a --- /dev/null +++ b/curl_cffi/requests/websockets.py @@ -0,0 +1,109 @@ +from typing import Callable, Optional, Tuple +import asyncio +from curl_cffi.const import CurlECode, CurlWsFlag +from curl_cffi.curl import CurlError + + +ON_MESSAGE_T = Callable[["WebSocket", bytes], None] +ON_ERROR_T = Callable[["WebSocket", CurlError], None] +ON_OPEN_T = Callable[["WebSocket"], None] +ON_CLOSE_T = Callable[["WebSocket"], None] + + +class WebSocket: + def __init__( + self, + session, + curl, + on_message: Optional[ON_MESSAGE_T] = None, + on_error: Optional[ON_ERROR_T] = None, + on_open: Optional[ON_OPEN_T] = None, + on_close: Optional[ON_CLOSE_T] = None, + ): + self.session = session + self.curl = curl + self.on_message = on_message + self.on_error = on_error + self.on_open = on_open + self.on_close = on_close + self.keep_running = True + self._loop = None + + def recv_fragment(self): + return self.curl.ws_recv() + + def recv(self) -> Tuple[bytes, int]: + """ + Receive a frame as bytes. + + libcurl split frames into fragments, so we have to collect all the chunks for + a frame. + """ + chunks = [] + flags = 0 + # TODO use select here + while True: + try: + chunk, frame = self.curl.ws_recv() + flags = frame.flags + chunks.append(chunk) + if frame.bytesleft == 0 and flags & CurlWsFlag.CONT == 0: + break + except CurlError as e: + if e.code == CurlECode.AGAIN: + pass + else: + raise + + return b"".join(chunks), flags + + def send(self, payload: bytes, flags: CurlWsFlag = CurlWsFlag.BINARY): + """Send a data frame""" + return self.curl.ws_send(payload, flags) + + def run_forever(self): + """ + libcurl automatically handles pings and pongs. + + ref: https://curl.se/libcurl/c/libcurl-ws.html + """ + if self.on_open: + self.on_open(self) + try: + # Keep reading the messages and invoke callbacks + while self.keep_running: + try: + msg, flags = self.recv() + if self.on_message: + self.on_message(self, msg) + if flags & CurlWsFlag.CLOSE: + self.keep_running = False + except CurlError as e: + if self.on_error: + self.on_error(self, e) + finally: + if self.on_close: + self.on_close(self) + + def close(self, msg: bytes = b""): + # FIXME how to reset, or can a curl handle connect to two websockets? + self.send(msg, CurlWsFlag.CLOSE) + self.keep_running = False + self.curl.close() + + @property + def loop(self): + if self._loop is None: + self._loop = asyncio.get_running_loop() + return self._loop + + async def arecv(self): + return await self.loop.run_in_executor(None, self.recv) + + async def asend(self, payload: bytes, flags: CurlWsFlag = CurlWsFlag.BINARY): + return await self.loop.run_in_executor(None, self.send, payload, flags) + + async def aclose(self): + await self.loop.run_in_executor(None, self.close) + self.curl.reset() + self.session.push_curl(self.curl) diff --git a/examples/websockets/long_running.py b/examples/websockets/long_running.py new file mode 100644 index 00000000..a539e0be --- /dev/null +++ b/examples/websockets/long_running.py @@ -0,0 +1,41 @@ +from curl_cffi.requests import Session, WebSocket + + +msg_count = 0 + + +def on_message(ws: WebSocket, message): + global msg_count + + print("------------------------------------------------------") + print(message) + print("======================================================") + + msg_count += 1 + if msg_count >= 100: + ws.close() + + +def on_error(ws: WebSocket, error): + print(error) + + +def on_open(ws: WebSocket): + print("For websockets, you need to set $wss_proxy environment variable!\n" + "$https_proxy will not work!") + print(">>> Websocket open!") + + +def on_close(ws: WebSocket): + print("<<< Websocket closed!") + + +with Session() as s: + ws = s.ws_connect( + "wss://api.gemini.com/v1/marketdata/BTCUSD", + on_open=on_open, + on_close=on_close, + on_message=on_message, + on_error=on_error, + ) + ws.run_forever() diff --git a/examples/websockets/long_running_async.py b/examples/websockets/long_running_async.py new file mode 100644 index 00000000..8b69218c --- /dev/null +++ b/examples/websockets/long_running_async.py @@ -0,0 +1,38 @@ +""" +WIP: this has not been implemented yet. +""" +import asyncio +from curl_cffi import requests + + +async def on_message(ws, message): + print(message) + + +async def on_error(ws, error): + print(error) + + +async def on_open(ws): + print("For websockets, you need to set $wss_proxy environment variable!\n" + "$https_proxy will not work!") + print(">>> Websocket open!") + + +async def on_close(ws): + print("<<< Websocket closed!") + + +async def main(): + async with requests.AsyncSession() as s: + ws = await s.ws_connect( + "wss://api.gemini.com/v1/marketdata/BTCUSD", + on_open=on_open, + on_close=on_close, + on_message=on_message, + on_error=on_error, + ) + ws.run_forever() + + +asyncio.run(main()) diff --git a/examples/websockets/short_running.py b/examples/websockets/short_running.py new file mode 100644 index 00000000..486f2a34 --- /dev/null +++ b/examples/websockets/short_running.py @@ -0,0 +1,21 @@ +import asyncio +from curl_cffi import requests + +URL = "ws://echo.websocket.events" + +with requests.Session() as s: + w = s.ws_connect(URL) + w.send(b"Foo") + reply = w.recv() + print(reply) + + +async def async_examples(): + async with requests.AsyncSession() as s: + w = await s.ws_connect(URL) + await w.asend(b"Bar") + reply = await w.arecv() + print(reply) + + +asyncio.run(async_examples()) diff --git a/preprocess/download_so.py b/preprocess/download_so.py index 727ec43e..9e9d7750 100644 --- a/preprocess/download_so.py +++ b/preprocess/download_so.py @@ -6,7 +6,7 @@ uname = platform.uname() -VERSION = "0.5.4" +VERSION = sys.argv[1] if uname.system == "Windows": LIBDIR = "./lib" @@ -32,17 +32,14 @@ def reporthook(blocknum, blocksize, totalsize): if uname.machine == "arm64": # TODO Download my own build of libcurl-impersonate for M1 Mac url = "" - filename = "./curl-impersonate.tar.gz" else: - url = f"https://github.com/lwthiker/curl-impersonate/releases/download/v{VERSION}/libcurl-impersonate-v{VERSION}.{uname.machine}-macos.tar.gz" - filename = "./curl-impersonate.tar.gz" + url = f"https://github.com/yifeikong/curl-impersonate/releases/download/v{VERSION}/libcurl-impersonate-v{VERSION}.{uname.machine}-macos.tar.gz" elif uname.system == "Windows": - url = f"https://github.com/yifeikong/curl-impersonate-win/releases/download/v{VERSION}/curl-impersonate-chrome.tar.gz" - filename = "./curl-impersonate.tar.gz" + url = f"https://github.com/yifeikong/curl-impersonate/releases/download/v{VERSION}/libcurl-impersonate-v{VERSION}.AMD64-win32.tar.gz" else: - url = f"https://github.com/lwthiker/curl-impersonate/releases/download/v{VERSION}/libcurl-impersonate-v{VERSION}.{uname.machine}-linux-gnu.tar.gz" - filename = "./curl-impersonate.tar.gz" + url = f"https://github.com/yifeikong/curl-impersonate/releases/download/v{VERSION}/libcurl-impersonate-v{VERSION}.{uname.machine}-linux-gnu.tar.gz" +filename = "./curl-impersonate.tar.gz" if url: print(f"Download libcurl-impersonate-chrome from {url}") urlretrieve(url, filename, (None if os.getenv("GITHUB_ACTIONS") else reporthook)) diff --git a/preprocess/generate_consts.py b/preprocess/generate_consts.py index 33813b0f..c456754a 100644 --- a/preprocess/generate_consts.py +++ b/preprocess/generate_consts.py @@ -68,3 +68,13 @@ f.write(" V2TLS = 4 # use version 2 for HTTPS, version 1.1 for HTTP */\n") f.write(" V2_PRIOR_KNOWLEDGE = 5 # please use HTTP 2 without HTTP/1.1 Upgrade */\n") f.write(" V3 = 30 # Makes use of explicit HTTP/3 without fallback.\n") + f.write("\n\n") + + + f.write("class CurlWsFlag(IntEnum):\n") + f.write(" TEXT = (1<<0)\n") + f.write(" BINARY = (1<<1)\n") + f.write(" CONT = (1<<2)\n") + f.write(" CLOSE = (1<<3)\n") + f.write(" PING = (1<<4)\n") + f.write(" OFFSET = (1<<5)\n") diff --git a/pyproject.toml b/pyproject.toml index ac8c38e0..d5831cad 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "curl_cffi" -version = "0.5.10" +version = "0.6.0b6" authors = [{ name = "Yifei Kong", email = "kong@yifei.me" }] description = "libcurl ffi bindings for Python, with impersonation support" license = { file = "LICENSE" } @@ -46,6 +46,7 @@ dev = [ "trio-typing==0.7.0", "trustme==0.9.0", "uvicorn==0.18.3", + "websockets==11.0.3", ] build = [ "cibuildwheel", @@ -63,6 +64,7 @@ test = [ "trustme==0.9.0", "uvicorn==0.18.3", "proxy.py==2.4.3", + "websockets==11.0.3", ] diff --git a/tests/unittest/conftest.py b/tests/unittest/conftest.py index e5635f77..c69a7da0 100644 --- a/tests/unittest/conftest.py +++ b/tests/unittest/conftest.py @@ -4,6 +4,7 @@ import threading import time import typing +import websockets from asyncio import sleep from collections import defaultdict from urllib.parse import parse_qs @@ -559,6 +560,29 @@ async def watch_restarts(self): # pragma: nocover await self.startup() +async def echo(websocket): + while True: + name = (await websocket.recv()).decode() + # print(f"<<< {name}") + + await websocket.send(name) + # print(f">>> {name}") + + +class TestWebsocketServer: + def __init__(self, port): + self.url = f"ws://127.0.0.1:{port}" + self.port = port + + def run(self): + async def serve(port): + # GitHub actions only likes 127, not localhost, wtf... + async with websockets.serve(echo, "127.0.0.1", port): + await asyncio.Future() # run forever + + asyncio.run(serve(self.port)) + + def serve_in_thread(server: Server): thread = threading.Thread(target=server.run) thread.start() @@ -571,6 +595,19 @@ def serve_in_thread(server: Server): thread.join() +@pytest.fixture(scope="session") +def ws_server(): + server = TestWebsocketServer(port=8964) + thread = threading.Thread(target=server.run, daemon=True) + thread.start() + try: + time.sleep(2) # FIXME find a reliable way to check the server is up + yield server + finally: + pass + # thread.join() + + @pytest.fixture(scope="session") def server(): config = Config(app=app, lifespan="off", loop="asyncio") diff --git a/tests/unittest/test_smoke.py b/tests/unittest/test_smoke.py new file mode 100644 index 00000000..40f46fbc --- /dev/null +++ b/tests/unittest/test_smoke.py @@ -0,0 +1,20 @@ +# Simple smoke test to real world websites +from curl_cffi import requests + + +URLS = [ + "https://www.zhihu.com", + "https://www.weibo.com", +] + + +def test_without_impersonate(): + for url in URLS: + r = requests.get(url) + assert r.status_code == 200 + + +def test_with_impersonate(): + for url in URLS: + r = requests.get(url, impersonate=requests.BrowserType.chrome) + assert r.status_code == 200 diff --git a/tests/unittest/test_websockets.py b/tests/unittest/test_websockets.py new file mode 100644 index 00000000..8b26423b --- /dev/null +++ b/tests/unittest/test_websockets.py @@ -0,0 +1,29 @@ +from curl_cffi.requests import Session + + +def test_websocket(ws_server): + with Session() as s: + s.ws_connect(ws_server.url) + + +def test_hello(ws_server): + with Session() as s: + ws = s.ws_connect(ws_server.url) + ws.send(b"Foo me once") + content, _ = ws.recv() + assert content == b"Foo me once" + + +def test_hello_twice(ws_server): + with Session() as s: + w = s.ws_connect(ws_server.url) + + w.send(b"Bar") + reply, _ = w.recv() + print(reply) + + for _ in range(10): + w.send(b"Bar") + reply, _ = w.recv() + assert reply == b"Bar" + print(reply)