Skip to content

Commit

Permalink
fix: fix decoding error(utf-8,gbk,iso-8859 and other charsets) in rea…
Browse files Browse the repository at this point in the history
…dability (issue #2435) (#2449)
  • Loading branch information
PrinOrange authored Jan 6, 2025
1 parent 1e8196a commit 1de1cd9
Show file tree
Hide file tree
Showing 3 changed files with 54 additions and 12 deletions.
1 change: 1 addition & 0 deletions apps/main/package.json
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@
"@openpanel/web": "1.0.1",
"@sentry/electron": "5.7.0",
"builder-util-runtime": "9.2.10",
"chardet": "^2.0.0",
"cookie-es": "^1.2.2",
"dompurify": "~3.2.2",
"electron-context-menu": "4.0.4",
Expand Down
35 changes: 23 additions & 12 deletions apps/main/src/lib/readability.ts
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
import { Readability } from "@mozilla/readability"
import { name, version } from "@pkg"
import chardet from "chardet"
import DOMPurify from "dompurify"
import { parseHTML } from "linkedom"
import { fetch } from "ofetch"
Expand All @@ -21,24 +22,34 @@ function sanitizeHTMLString(dirtyDocumentString: string) {
return sanitizedDocumentString
}

/**
 * Decodes the body of a `fetch` response into a string using the best
 * available character set.
 *
 * Charset candidates are tried in this order:
 *  1. the `charset` parameter of the `Content-Type` header (bare or quoted),
 *  2. the encoding sniffed from the raw bytes by `chardet`,
 *  3. UTF-8 as the last resort.
 *
 * A candidate is skipped when the runtime's `TextDecoder` does not recognise
 * its label, so an unknown or misspelled charset no longer replaces the page
 * content with an error message. Malformed byte sequences are decoded
 * leniently (replaced with U+FFFD) rather than raising.
 *
 * @param res - The response whose body should be decoded. Its body is consumed.
 * @returns The decoded document text.
 * @throws Propagates the rejection of `res.arrayBuffer()` (for example when
 * the body has already been read).
 */
async function decodeResponseBodyChars(res: Response): Promise<string> {
  // Read the raw bytes once; every decoding attempt below reuses them.
  const buffer = await res.arrayBuffer()

  // Returns the decoded text, or undefined when `label` is not a charset the
  // runtime knows (the TextDecoder constructor throws RangeError in that case).
  const tryDecode = (label: string): string | undefined => {
    try {
      return new TextDecoder(label, { fatal: false }).decode(buffer)
    } catch {
      return undefined
    }
  }

  // Step 1: charset declared in the Content-Type header. The value may be
  // wrapped in quotes, e.g. `text/html; charset="gbk"`.
  const contentType = res.headers.get("content-type")
  const httpCharset = contentType?.match(/charset\s*=\s*["']?([\w:-]+)/i)?.[1]
  if (httpCharset) {
    const decoded = tryDecode(httpCharset)
    if (decoded !== undefined) return decoded
  }

  // Step 2: no usable header charset, so sniff the encoding from the bytes.
  const sniffedCharset = chardet.detect(Buffer.from(buffer))
  if (sniffedCharset) {
    const decoded = tryDecode(sniffedCharset)
    if (decoded !== undefined) return decoded
  }

  // Step 3: UTF-8 is always supported, so this decode cannot fail on the label.
  return new TextDecoder("utf-8", { fatal: false }).decode(buffer)
}

export async function readability(url: string) {
const dirtyDocumentString = await fetch(url, {
headers: {
"User-Agent": userAgents,
Accept: "text/html",
},
}).then(async (res) => {
const contentType = res.headers.get("content-type")
// text/html; charset=GBK
if (!contentType) return res.text()
const charset = contentType.match(/charset=([a-zA-Z-\d]+)/)?.[1]
if (charset) {
const blob = await res.blob()
const buffer = await blob.arrayBuffer()
return new TextDecoder(charset).decode(buffer)
}
return res.text()
})
}).then(decodeResponseBodyChars)

const sanitizedDocumentString = sanitizeHTMLString(dirtyDocumentString)
const baseUrl = new URL(url).origin
Expand Down
30 changes: 30 additions & 0 deletions pnpm-lock.yaml

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

0 comments on commit 1de1cd9

Please sign in to comment.