From 1de1cd9ae18d89e88805816b7fdea4e1438cd39f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E4=BD=86=E4=B8=BA=E5=90=9B=E6=95=85?= Date: Mon, 6 Jan 2025 08:59:57 +0800 Subject: [PATCH] fix: fix decoding error(utf-8,gbk,iso-8859 and other charsets) in readability (issue #2435) (#2449) --- apps/main/package.json | 1 + apps/main/src/lib/readability.ts | 35 +++++++++++++++++++++----------- pnpm-lock.yaml | 30 +++++++++++++++++++++++++++ 3 files changed, 54 insertions(+), 12 deletions(-) diff --git a/apps/main/package.json b/apps/main/package.json index 4df5592b97..d7aab971b1 100644 --- a/apps/main/package.json +++ b/apps/main/package.json @@ -31,6 +31,7 @@ "@openpanel/web": "1.0.1", "@sentry/electron": "5.7.0", "builder-util-runtime": "9.2.10", + "chardet": "^2.0.0", "cookie-es": "^1.2.2", "dompurify": "~3.2.2", "electron-context-menu": "4.0.4", diff --git a/apps/main/src/lib/readability.ts b/apps/main/src/lib/readability.ts index 3dfa9e177e..a67d15ac0f 100644 --- a/apps/main/src/lib/readability.ts +++ b/apps/main/src/lib/readability.ts @@ -1,5 +1,6 @@ import { Readability } from "@mozilla/readability" import { name, version } from "@pkg" +import chardet from "chardet" import DOMPurify from "dompurify" import { parseHTML } from "linkedom" import { fetch } from "ofetch" @@ -21,24 +22,34 @@ function sanitizeHTMLString(dirtyDocumentString: string) { return sanitizedDocumentString } +/** + * Decodes the response body of a `fetch` request into a string, ensuring proper character set handling. + * @throws Will return "Failed to decode response content." if the decoding process encounters any errors. + */ +async function decodeResponseBodyChars(res: Response) { + // Read the response body as an ArrayBuffer + const buffer = await res.arrayBuffer() + // Step 1: Get charset from Content-Type header + const contentType = res.headers.get("content-type") + const httpCharset = contentType?.match(/charset=([\w-]+)/i)?.[1] + // Step 2: Use charset from Content-Type header or fall back to chardet + const detectedCharset = httpCharset || chardet.detect(Buffer.from(buffer)) || "utf-8" + // Step 3: Decode the response body using the detected charset + try { + const decodedText = new TextDecoder(detectedCharset, { fatal: false }).decode(buffer) + return decodedText + } catch { + return "Failed to decode response content." + } +} + export async function readability(url: string) { const dirtyDocumentString = await fetch(url, { headers: { "User-Agent": userAgents, Accept: "text/html", }, - }).then(async (res) => { - const contentType = res.headers.get("content-type") - // text/html; charset=GBK - if (!contentType) return res.text() - const charset = contentType.match(/charset=([a-zA-Z-\d]+)/)?.[1] - if (charset) { - const blob = await res.blob() - const buffer = await blob.arrayBuffer() - return new TextDecoder(charset).decode(buffer) - } - return res.text() - }) + }).then(decodeResponseBodyChars) const sanitizedDocumentString = sanitizeHTMLString(dirtyDocumentString) const baseUrl = new URL(url).origin diff --git a/pnpm-lock.yaml b/pnpm-lock.yaml index 943839e19d..891cf0c6b3 100644 --- a/pnpm-lock.yaml +++ b/pnpm-lock.yaml @@ -318,6 +318,9 @@ importers: builder-util-runtime: specifier: 9.2.10 version: 9.2.10 + chardet: + specifier: ^2.0.0 + version: 2.0.0 cookie-es: specifier: ^1.2.2 version: 1.2.2 @@ -5079,24 +5082,28 @@ packages: engines: {node: '>= 10'} cpu: [arm64] os: [linux] + libc: [glibc] '@resvg/resvg-js-linux-arm64-musl@2.6.2': resolution: {integrity: sha512-3h3dLPWNgSsD4lQBJPb4f+kvdOSJHa5PjTYVsWHxLUzH4IFTJUAnmuWpw4KqyQ3NA5QCyhw4TWgxk3jRkQxEKg==} engines: {node: '>= 10'} cpu: [arm64] os: [linux] + libc: [musl] '@resvg/resvg-js-linux-x64-gnu@2.6.2': resolution: {integrity: sha512-IVUe+ckIerA7xMZ50duAZzwf1U7khQe2E0QpUxu5MBJNao5RqC0zwV/Zm965vw6D3gGFUl7j4m+oJjubBVoftw==} engines: {node: '>= 10'} cpu: [x64] os: [linux] + libc: [glibc] '@resvg/resvg-js-linux-x64-musl@2.6.2': resolution: {integrity: sha512-UOf83vqTzoYQO9SZ0fPl2ZIFtNIz/Rr/y+7X8XRX1ZnBYsQ/tTb+cj9TE+KHOdmlTFBxhYzVkP2lRByCzqi4jQ==} engines: {node: '>= 10'} cpu: [x64] os: [linux] + libc: [musl] '@resvg/resvg-js-win32-arm64-msvc@2.6.2': resolution: {integrity: sha512-7C/RSgCa+7vqZ7qAbItfiaAWhyRSoD4l4BQAbVDqRRsRgY+S+hgS3in0Rxr7IorKUpGE69X48q6/nOAuTJQxeQ==} @@ -5207,51 +5214,61 @@ packages: resolution: {integrity: sha512-QAg11ZIt6mcmzpNE6JZBpKfJaKkqTm1A9+y9O+frdZJEuhQxiugM05gnCWiANHj4RmbgeVJpTdmKRmH/a+0QbA==} cpu: [arm] os: [linux] + libc: [glibc] '@rollup/rollup-linux-arm-musleabihf@4.28.1': resolution: {integrity: sha512-dRP9PEBfolq1dmMcFqbEPSd9VlRuVWEGSmbxVEfiq2cs2jlZAl0YNxFzAQS2OrQmsLBLAATDMb3Z6MFv5vOcXg==} cpu: [arm] os: [linux] + libc: [musl] '@rollup/rollup-linux-arm64-gnu@4.28.1': resolution: {integrity: sha512-uGr8khxO+CKT4XU8ZUH1TTEUtlktK6Kgtv0+6bIFSeiSlnGJHG1tSFSjm41uQ9sAO/5ULx9mWOz70jYLyv1QkA==} cpu: [arm64] os: [linux] + libc: [glibc] '@rollup/rollup-linux-arm64-musl@4.28.1': resolution: {integrity: sha512-QF54q8MYGAqMLrX2t7tNpi01nvq5RI59UBNx+3+37zoKX5KViPo/gk2QLhsuqok05sSCRluj0D00LzCwBikb0A==} cpu: [arm64] os: [linux] + libc: [musl] '@rollup/rollup-linux-loongarch64-gnu@4.28.1': resolution: {integrity: sha512-vPul4uodvWvLhRco2w0GcyZcdyBfpfDRgNKU+p35AWEbJ/HPs1tOUrkSueVbBS0RQHAf/A+nNtDpvw95PeVKOA==} cpu: [loong64] os: [linux] + libc: [glibc] '@rollup/rollup-linux-powerpc64le-gnu@4.28.1': resolution: {integrity: sha512-pTnTdBuC2+pt1Rmm2SV7JWRqzhYpEILML4PKODqLz+C7Ou2apEV52h19CR7es+u04KlqplggmN9sqZlekg3R1A==} cpu: [ppc64] os: [linux] + libc: [glibc] '@rollup/rollup-linux-riscv64-gnu@4.28.1': resolution: {integrity: sha512-vWXy1Nfg7TPBSuAncfInmAI/WZDd5vOklyLJDdIRKABcZWojNDY0NJwruY2AcnCLnRJKSaBgf/GiJfauu8cQZA==} cpu: [riscv64] os: [linux] + libc: [glibc] '@rollup/rollup-linux-s390x-gnu@4.28.1': resolution: {integrity: sha512-/yqC2Y53oZjb0yz8PVuGOQQNOTwxcizudunl/tFs1aLvObTclTwZ0JhXF2XcPT/zuaymemCDSuuUPXJJyqeDOg==} cpu: [s390x] os: [linux] + libc: [glibc] '@rollup/rollup-linux-x64-gnu@4.28.1': resolution: {integrity: sha512-fzgeABz7rrAlKYB0y2kSEiURrI0691CSL0+KXwKwhxvj92VULEDQLpBYLHpF49MSiPG4sq5CK3qHMnb9tlCjBw==} cpu: [x64] os: [linux] + libc: [glibc] '@rollup/rollup-linux-x64-musl@4.28.1': resolution: {integrity: sha512-xQTDVzSGiMlSshpJCtudbWyRfLaNiVPXt1WgdWTwWz9n0U12cI2ZVtWe/Jgwyv/6wjL7b66uu61Vg0POWVfz4g==} cpu: [x64] os: [linux] + libc: [musl] '@rollup/rollup-win32-arm64-msvc@4.28.1': resolution: {integrity: sha512-wSXmDRVupJstFP7elGMgv+2HqXelQhuNf+IS4V+nUpNVi/GUiBgDmfwD0UGN3pcAnWsgKG3I52wMOBnk1VHr/A==} @@ -6704,6 +6721,9 @@ packages: character-reference-invalid@2.0.1: resolution: {integrity: sha512-iBZ4F4wRbyORVsu0jPV7gXkOsGYjGHPmAyv+HiHG8gi5PtC9KI2j1+v8/tlibRvjoWX027ypmG/n0HtO5t7unw==} + chardet@2.0.0: + resolution: {integrity: sha512-xVgPpulCooDjY6zH4m9YW3jbkaBe3FKIAvF5sj5t7aBNsVl2ljIE+xwJ4iNgiDZHFQvNIpjdKdVOQvvk5ZfxbQ==} + charenc@0.0.2: resolution: {integrity: sha512-yrLQ/yVUFXkzg7EDQsPieE/53+0RlaWTs+wBrvW36cyilJ2SaDWfl4Yj7MtLTXleV9uEKefbAGUPv2/iWSooRA==} @@ -10217,48 +10237,56 @@ packages: engines: {node: '>= 12.0.0'} cpu: [arm64] os: [linux] + libc: [glibc] lightningcss-linux-arm64-gnu@1.28.2: resolution: {integrity: sha512-nhfjYkfymWZSxdtTNMWyhFk2ImUm0X7NAgJWFwnsYPOfmtWQEapzG/DXZTfEfMjSzERNUNJoQjPAbdqgB+sjiw==} engines: {node: '>= 12.0.0'} cpu: [arm64] os: [linux] + libc: [glibc] lightningcss-linux-arm64-musl@1.27.0: resolution: {integrity: sha512-rCGBm2ax7kQ9pBSeITfCW9XSVF69VX+fm5DIpvDZQl4NnQoMQyRwhZQm9pd59m8leZ1IesRqWk2v/DntMo26lg==} engines: {node: '>= 12.0.0'} cpu: [arm64] os: [linux] + libc: [musl] lightningcss-linux-arm64-musl@1.28.2: resolution: {integrity: sha512-1SPG1ZTNnphWvAv8RVOymlZ8BDtAg69Hbo7n4QxARvkFVCJAt0cgjAw1Fox0WEhf4PwnyoOBaVH0Z5YNgzt4dA==} engines: {node: '>= 12.0.0'} cpu: [arm64] os: [linux] + libc: [musl] lightningcss-linux-x64-gnu@1.27.0: resolution: {integrity: sha512-Dk/jovSI7qqhJDiUibvaikNKI2x6kWPN79AQiD/E/KeQWMjdGe9kw51RAgoWFDi0coP4jinaH14Nrt/J8z3U4A==} engines: {node: '>= 12.0.0'} cpu: [x64] os: [linux] + libc: [glibc] lightningcss-linux-x64-gnu@1.28.2: resolution: {integrity: sha512-ZhQy0FcO//INWUdo/iEdbefntTdpPVQ0XJwwtdbBuMQe+uxqZoytm9M+iqR9O5noWFaxK+nbS2iR/I80Q2Ofpg==} engines: {node: '>= 12.0.0'} cpu: [x64] os: [linux] + libc: [glibc] lightningcss-linux-x64-musl@1.27.0: resolution: {integrity: sha512-QKjTxXm8A9s6v9Tg3Fk0gscCQA1t/HMoF7Woy1u68wCk5kS4fR+q3vXa1p3++REW784cRAtkYKrPy6JKibrEZA==} engines: {node: '>= 12.0.0'} cpu: [x64] os: [linux] + libc: [musl] lightningcss-linux-x64-musl@1.28.2: resolution: {integrity: sha512-alb/j1NMrgQmSFyzTbN1/pvMPM+gdDw7YBuQ5VSgcFDypN3Ah0BzC2dTZbzwzaMdUVDszX6zH5MzjfVN1oGuww==} engines: {node: '>= 12.0.0'} cpu: [x64] os: [linux] + libc: [musl] lightningcss-win32-arm64-msvc@1.27.0: resolution: {integrity: sha512-/wXegPS1hnhkeG4OXQKEMQeJd48RDC3qdh+OA8pCuOPCyvnm/yEayrJdJVqzBsqpy1aJklRCVxscpFur80o6iQ==} @@ -21595,6 +21623,8 @@ snapshots: character-reference-invalid@2.0.1: {} + chardet@2.0.0: {} + charenc@0.0.2: {} check-error@2.1.1: {}