Skip to content

Commit

Permalink
feat: Pre-generate encode trie (#776)
Browse files Browse the repository at this point in the history
  • Loading branch information
fb55 authored Apr 2, 2022
1 parent b7b24d6 commit 3ef75e1
Show file tree
Hide file tree
Showing 7 changed files with 133 additions and 45 deletions.
1 change: 0 additions & 1 deletion .prettierignore
Original file line number Diff line number Diff line change
@@ -1,5 +1,4 @@
node_modules/
coverage/
lib/
src/maps/
maps/
7 changes: 3 additions & 4 deletions package.json
Original file line number Diff line number Diff line change
Expand Up @@ -60,12 +60,11 @@
"format:prettier": "npm run prettier -- --write",
"prettier": "prettier '**/*.{ts,md,json,yml}'",
"build": "npm run build:cjs && npm run build:esm",
"build:cjs": "tsc && cp -r src/maps lib",
"build:esm": "tsc --module esnext --target es2019 --outDir lib/esm && npm rum build:esm:fixup && echo '{\"type\":\"module\"}' > lib/esm/package.json",
"build:esm:fixup": "sed -i.b '1s|\".*json\"|\"../maps/entities-encode.json\" assert {type:\"json\"}|' lib/esm/encode-trie.js && rm lib/esm/encode-trie.js.b",
"build:cjs": "tsc",
"build:esm": "tsc --module esnext --target es2019 --outDir lib/esm && echo '{\"type\":\"module\"}' > lib/esm/package.json",
"build:docs": "typedoc --hideGenerator src/index.ts",
"build:trie": "ts-node scripts/write-decode-map.ts",
"build:encode-map": "jq -c 'to_entries | reverse | map( {(.value) : .key } ) | sort | add' maps/entities.json > src/maps/entities-encode.json",
"build:encode-trie": "ts-node scripts/write-encode-map.ts",
"prepare": "npm run build"
},
"repository": {
Expand Down
97 changes: 97 additions & 0 deletions scripts/write-encode-map.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,97 @@
/* eslint-disable node/no-unsupported-features/es-builtins */

import htmlMap from "../maps/entities.json";
import { writeFileSync } from "fs";

/**
 * A node of the in-memory encode trie built by `getTrie`.
 *
 * Nodes are keyed in their parent map by a UTF-16 char code of the decoded
 * text; a node may carry a value, child nodes, or both.
 */
interface TrieNode {
/** The value, if the node has a value. */
v?: string;
/** A map with the next nodes, if there are any. */
n?: Map<number, TrieNode>;
}

// Build the trie from the entity map and turn it into TypeScript source text.
const htmlTrie = getTrie(htmlMap);
const serialized = serializeTrie(htmlTrie);

// Fix the type of the first map to refer to trie nodes.
const typedTrie = serialized.replace("<number,string>", "<number,EncodeTrieNode>");

// Location of the generated module, relative to this script.
const outputPath = `${__dirname}/../src/generated/encode-html.ts`;

// Full text of the generated module: a type alias followed by the default export.
const generatedSource = `// Generated using scripts/write-encode-map.ts
type EncodeTrieNode =
| string
| { v?: string; n: number | Map<number, EncodeTrieNode>; o?: string };
// prettier-ignore
export default ${typedTrie};
`;

writeFileSync(outputPath, generatedSource);

console.log("Done!");

/**
 * Builds a trie keyed by the UTF-16 char codes of each decoded string.
 *
 * @param map Record whose keys are entity names and whose values are the
 *   decoded strings those names stand for.
 * @returns The root level of the trie. The node reached by walking all char
 *   codes of a decoded string stores the entity name in `v`; when several
 *   names share a decoded string, the first one encountered is kept.
 */
function getTrie(map: Record<string, string>): Map<number, TrieNode> {
  const root = new Map<number, TrieNode>();

  for (const [entity, decoded] of Object.entries(map)) {
    const lastPos = decoded.length - 1;
    let level = root;

    // Walk (creating as needed) one branch per char code before the last.
    for (let pos = 0; pos < lastPos; pos += 1) {
      const code = decoded.charCodeAt(pos);
      let branch = level.get(code);
      if (branch == null) {
        branch = {};
        level.set(code, branch);
      }
      if (branch.n == null) {
        branch.n = new Map<number, TrieNode>();
      }
      level = branch.n;
    }

    // The final char code owns the value; never overwrite an earlier name.
    const lastCode = decoded.charCodeAt(lastPos);
    let leaf = level.get(lastCode);
    if (leaf == null) {
      leaf = {};
      level.set(lastCode, leaf);
    }
    if (leaf.v == null) {
      leaf.v = entity;
    }
  }

  return root;
}

/**
 * Formats an entity name as a quoted string literal for the generated file.
 *
 * @param value Entity name without the leading `&` and trailing `;`.
 * @returns The text `"&name;"`, including the surrounding double quotes.
 * @throws If `value` is `null` or `undefined`.
 */
function wrapValue(value: string | undefined): string {
  if (value == null) {
    throw new Error("unexpected null");
  }

  return '"&' + value + ';"';
}

/**
 * Serializes one level of the encode trie into TypeScript source text.
 *
 * Entries are emitted in ascending key order. Each entry is either
 * `[key,"&name;"]` for a node without children, or `[key,{...}]` for a node
 * with children, where the object holds an optional `v` (the node's own
 * value) and the branch encoded as described in the body.
 *
 * @param trie Trie level to serialize.
 * @returns Source text of the form `new Map<number,string>([...])`.
 * @throws If a node has neither a value nor children, if a branch map is
 *   empty, if the only child of a branch has no value, or if the only child
 *   of a branch has children of its own (the compact form cannot hold them,
 *   so they would otherwise be dropped from the output without notice).
 */
function serializeTrie(trie: Map<number, TrieNode>): string {
  const sorted = Array.from(trie.entries()).sort((a, b) => a[0] - b[0]);

  const items = sorted.map(([key, node]) => {
    const children = node.n;

    // Leaf: the entry is just the wrapped value (wrapValue rejects null).
    if (!children) {
      return `[${key},${wrapValue(node.v)}]`;
    }

    const fields: string[] = [];
    if (node.v != null) {
      fields.push(`v:${wrapValue(node.v)}`);
    }

    /*
     * We encode branches as either a number with an `o` (other) value,
     * or as a map.
     *
     * We use a map if more than one character can follow this one.
     */
    if (children.size > 1) {
      fields.push(`n:${serializeTrie(children)}`);
    } else {
      const [only] = Array.from(children);
      if (only === undefined) {
        throw new Error("unexpected empty branch");
      }
      const [cond, other] = only;
      if (other.n) {
        // `n:<code>,o:<value>` has no slot for grandchildren; fail loudly.
        throw new Error(
          `single-child branch ${key},${cond} has nested children, which the n/o form cannot represent`
        );
      }
      fields.push(`n:${cond},o:${wrapValue(other.v)}`);
    }

    return `[${key},{${fields.join(",")}}]`;
  });

  return `new Map<number,string>([${items.join(",")}])`;
}
61 changes: 22 additions & 39 deletions src/encode-trie.ts
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
import htmlMap from "./maps/entities-encode.json";
import htmlTrie from "./generated/encode-html";

const enum Surrogate {
Mask = 0b1111_1100_0000_0000,
Expand All @@ -23,8 +23,6 @@ export const getCodePoint =
0x10000
: c.charCodeAt(index);

const htmlTrie = getTrie(htmlMap);

export function encodeHTMLTrieRe(regExp: RegExp, str: string): string {
let ret = "";
let lastIdx = 0;
Expand All @@ -33,20 +31,31 @@ export function encodeHTMLTrieRe(regExp: RegExp, str: string): string {
while ((match = regExp.exec(str)) !== null) {
const i = match.index;
const char = str.charCodeAt(i);
const next = htmlTrie.get(char);
let next = htmlTrie.get(char);

if (next != null) {
if (typeof next !== "string") {
// We are in a branch. Try to match the next char.
if (i + 1 < str.length) {
const value =
typeof next.n === "number"
? next.n === str.charCodeAt(i + 1)
? next.o
: null
: next.n.get(str.charCodeAt(i + 1));

if (next) {
if (next.next != null && i + 1 < str.length) {
const value = next.next.get(str.charCodeAt(i + 1))?.value;
if (value != null) {
ret += str.substring(lastIdx, i) + value;
regExp.lastIndex += 1;
lastIdx = i + 2;
continue;
if (value) {
ret += str.substring(lastIdx, i) + value;
lastIdx = regExp.lastIndex += 1;
continue;
}
}

// If we have a character without a value, use a numeric entity.
next = next.v ?? `&#x${char.toString(16)};`;
}

ret += str.substring(lastIdx, i) + next.value;
ret += str.substring(lastIdx, i) + next;
lastIdx = i + 1;
} else {
ret += `${str.substring(lastIdx, i)}&#x${getCodePoint(
Expand All @@ -60,29 +69,3 @@ export function encodeHTMLTrieRe(regExp: RegExp, str: string): string {

return ret + str.substr(lastIdx);
}

/**
 * A node of the trie produced by `getTrie`, keyed in its parent map by a
 * UTF-16 char code of the decoded text.
 */
export interface TrieNode {
/** The encoded entity text (`&name;`) for the string ending at this node, if any. */
value?: string;
/** Child nodes keyed by the next char code, if any longer strings continue from here. */
next?: Map<number, TrieNode>;
}

/**
 * Builds a lookup trie from a decoded-text → entity-name record.
 *
 * @param map Record whose keys are decoded strings and whose values are the
 *   entity names (without `&` and `;`) that encode them.
 * @returns The root level of the trie, keyed by UTF-16 char code. The node
 *   reached by walking every char code of a key stores `&name;` in `value`;
 *   a value that is already present is never overwritten.
 */
export function getTrie(map: Record<string, string>): Map<number, TrieNode> {
  const root = new Map<number, TrieNode>();

  // Fetches the node for `code` on `level`, creating it when missing.
  const nodeAt = (level: Map<number, TrieNode>, code: number): TrieNode => {
    let node = level.get(code);
    if (node == null) {
      node = {};
      level.set(code, node);
    }
    return node;
  };

  Object.keys(map).forEach((decoded) => {
    const tail = decoded.length - 1;
    let level = root;

    // Descend through every char code except the last one.
    for (let pos = 0; pos < tail; pos += 1) {
      const branch = nodeAt(level, decoded.charCodeAt(pos));
      if (branch.next == null) {
        branch.next = new Map<number, TrieNode>();
      }
      level = branch.next;
    }

    // The last char code carries the encoded entity.
    const leaf = nodeAt(level, decoded.charCodeAt(tail));
    if (leaf.value == null) {
      leaf.value = `&${map[decoded]};`;
    }
  });

  return root;
}
3 changes: 3 additions & 0 deletions src/encode.spec.ts
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,9 @@ describe("Encode->decode test", () => {
expect(decoded).toBe(char);
}
});

it("should encode trailing parts of entities", () =>
expect(entities.encodeHTML("\ud835")).toBe("&#xd835;"));
});

describe("encodeNonAsciiHTML", () => {
Expand Down
8 changes: 8 additions & 0 deletions src/generated/encode-html.ts

Large diffs are not rendered by default.

1 change: 0 additions & 1 deletion src/maps/entities-encode.json

This file was deleted.

0 comments on commit 3ef75e1

Please sign in to comment.