Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat(decode): Add EntityDecoder class #1136

Merged
merged 30 commits into from
Apr 12, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
30 commits
Select commit Hold shift + click to select a range
d90eade
feat: Add `EntityDecoder` class
fb55 Mar 27, 2023
f82785d
Add comments
fb55 Mar 27, 2023
ac1ebf7
Rename `reset` to `startEntity`, move `isAttribute`
fb55 Mar 27, 2023
6fb8f25
Update indices
fb55 Mar 28, 2023
832bf9d
Update decode.ts
fb55 Mar 28, 2023
02af4c8
Add `EntityDecoderMode`
fb55 Apr 1, 2023
cfb8239
Fix numeric entities
fb55 Apr 1, 2023
71b6867
Emit code points instead of strings
fb55 Apr 1, 2023
1c69162
Add `this.treeCurrent`
fb55 Apr 2, 2023
f19a8a4
Merge `codepoint` and `resultIdx`
fb55 Apr 2, 2023
7dac658
Add `consumed` to callback
fb55 Apr 2, 2023
3593b3b
Add tests from htmlparser2
fb55 Apr 2, 2023
96f320b
Remove unnecessary `this.treeCurrent`
fb55 Apr 2, 2023
0731486
Support attribute decoding mode
fb55 Apr 2, 2023
3d9c577
Simplify named entity data emission
fb55 Apr 2, 2023
bd1af64
s/treeIdx/treeIndex
fb55 Apr 3, 2023
1a1b373
s/strIdx/offset
fb55 Apr 3, 2023
de6b9f5
s/lastIdx/lastIndex
fb55 Apr 3, 2023
b93c7af
Remove TODO
fb55 Apr 3, 2023
7b3a210
Add entity error handling
fb55 Apr 5, 2023
ac06454
Add more tests, docs
fb55 Apr 5, 2023
36f8645
Keep the `DecodingMode` enum
fb55 Apr 5, 2023
b49cf05
Simplify level & mode handling
fb55 Apr 5, 2023
9c3ed1b
Fix entities in attributes followed by letters
fb55 Apr 6, 2023
726e9f9
Emit second code points with `consumed=0`
fb55 Apr 6, 2023
308e84c
Change order of errors
fb55 Apr 6, 2023
ec5e9c6
Revert "Emit second code points with `consumed=0`"
fb55 Apr 6, 2023
aef3927
Add consumed count to `absenceOfDigitsInNumericCharacterReference`
fb55 Apr 8, 2023
39bf309
Improve test coverage
fb55 Apr 8, 2023
f1bf302
Document functions
fb55 Apr 8, 2023
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
237 changes: 237 additions & 0 deletions src/decode.spec.ts
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@ describe("Decode test", () => {
{ input: ":", output: ":" },
{ input: ":", output: ":" },
{ input: ":", output: ":" },
{ input: "&#", output: "&#" },
{ input: "&>", output: "&>" },
{ input: "id=770&#anchor", output: "id=770&#anchor" },
];
Expand Down Expand Up @@ -42,4 +43,240 @@ describe("Decode test", () => {

// Covers the case referenced as #852 in the title: a semicolon-less `&nbsp`
// immediately followed by `<` is expected to yield U+00A0 and keep the `<`.
it("should parse &nbsp followed by < (#852)", () => {
    const decoded = entities.decodeHTML("&nbsp<");
    expect(decoded).toBe("\u00a0<");
});

// The second reference has no terminating semicolon; the expected output shows
// it resolving to the shorter legacy match (`×`) followed by the literal "bar".
it("should decode trailing legacy entities", () => {
    const input = "&timesbar;&timesbar";
    const decoded = entities.decodeHTML(input);
    expect(decoded).toBe("⨱×bar");
});

// A single named reference whose expected expansion is more than one code
// point (a base character followed by a combining character).
it("should decode multi-byte entities", () => {
    const decoded = entities.decodeHTML("&NotGreaterFullEqual;");
    expect(decoded).toBe("≧̸");
});

// In attribute mode a semicolon-less legacy reference is only expected to be
// decoded when nothing follows it; when followed by a letter, a digit or `=`
// the input is expected back unchanged.
it("should not decode legacy entities followed by text in attribute mode", () => {
    // [input, expected] pairs checked through `decodeHTML` with an explicit mode.
    const explicitModeCases: [string, string][] = [
        ["&not", "¬"],
        ["&noti", "&noti"],
        ["&not=", "&not="],
    ];
    for (const [input, expected] of explicitModeCases) {
        expect(
            entities.decodeHTML(input, entities.DecodingMode.Attribute)
        ).toBe(expected);
    }

    // The dedicated attribute helper must leave each of these inputs untouched.
    for (const untouched of ["&notp", "&notP", "&not3"]) {
        expect(entities.decodeHTMLAttribute(untouched)).toBe(untouched);
    }
});
});

/**
 * Tests for the streaming `EntityDecoder`.
 *
 * As exercised by these tests: `write(str, offset)` is expected to return -1
 * while the entity is still incomplete and a consumed-character count once it
 * has been emitted; `end()` is expected to flush a pending entity (or return 0
 * when nothing can be emitted). The emit callback is asserted to receive
 * `(codePoint, consumed)`.
 */
describe("EntityDecoder", () => {
    const AMPERSAND = "&".charCodeAt(0);
    const COLON = ":".charCodeAt(0);

    /** Creates a decoder over the HTML decode tree together with its mock emit callback. */
    const createDecoder = () => {
        const emit = jest.fn();
        const decoder = new entities.EntityDecoder(
            entities.htmlDecodeTree,
            emit
        );
        return { emit, decoder };
    };

    it("should decode decimal entities", () => {
        const { emit, decoder } = createDecoder();

        // The entity is split across two chunks; only the second completes it.
        const afterFirstChunk = decoder.write("&#5", 1);
        expect(afterFirstChunk).toBe(-1);
        const afterSecondChunk = decoder.write("8;", 0);
        expect(afterSecondChunk).toBe(5);

        expect(emit).toHaveBeenCalledTimes(1);
        expect(emit).toHaveBeenCalledWith(COLON, 5);
    });

    it("should decode hex entities", () => {
        const { emit, decoder } = createDecoder();

        const consumed = decoder.write("&#x3a;", 1);
        expect(consumed).toBe(6);

        expect(emit).toHaveBeenCalledTimes(1);
        expect(emit).toHaveBeenCalledWith(COLON, 6);
    });

    it("should decode named entities", () => {
        const { emit, decoder } = createDecoder();

        const consumed = decoder.write("&amp;", 1);
        expect(consumed).toBe(5);

        expect(emit).toHaveBeenCalledTimes(1);
        expect(emit).toHaveBeenCalledWith(AMPERSAND, 5);
    });

    it("should decode legacy entities", () => {
        const { emit, decoder } = createDecoder();
        decoder.startEntity(entities.DecodingMode.Legacy);

        // Without a semicolon nothing is emitted until the input is ended.
        expect(decoder.write("&amp", 1)).toBe(-1);
        expect(emit).toHaveBeenCalledTimes(0);

        expect(decoder.end()).toBe(4);
        expect(emit).toHaveBeenCalledTimes(1);
        expect(emit).toHaveBeenCalledWith(AMPERSAND, 4);
    });

    it("should decode named entity written character by character", () => {
        const { emit, decoder } = createDecoder();

        // Each single-character chunk before the semicolon must report "incomplete".
        Array.from("amp").forEach((chunk) => {
            expect(decoder.write(chunk, 0)).toBe(-1);
        });
        expect(decoder.write(";", 0)).toBe(5);

        expect(emit).toHaveBeenCalledTimes(1);
        expect(emit).toHaveBeenCalledWith(AMPERSAND, 5);
    });

    it("should decode numeric entity written character by character", () => {
        const { emit, decoder } = createDecoder();

        // Each single-character chunk before the semicolon must report "incomplete".
        Array.from("#x3a").forEach((chunk) => {
            expect(decoder.write(chunk, 0)).toBe(-1);
        });
        expect(decoder.write(";", 0)).toBe(6);

        expect(emit).toHaveBeenCalledTimes(1);
        expect(emit).toHaveBeenCalledWith(COLON, 6);
    });

    it("should not fail if nothing is written", () => {
        const { emit, decoder } = createDecoder();

        expect(decoder.end()).toBe(0);
        expect(emit).toHaveBeenCalledTimes(0);
    });

    describe("errors", () => {
        /**
         * Creates a decoder wired to a fresh set of mock error handlers, in
         * addition to the mock emit callback.
         */
        const createDecoderWithErrors = () => {
            const errorHandlers = {
                missingSemicolonAfterCharacterReference: jest.fn(),
                absenceOfDigitsInNumericCharacterReference: jest.fn(),
                validateNumericCharacterReference: jest.fn(),
            };
            const emit = jest.fn();
            const decoder = new entities.EntityDecoder(
                entities.htmlDecodeTree,
                emit,
                errorHandlers
            );
            return { errorHandlers, emit, decoder };
        };

        it("should produce an error for a named entity without a semicolon", () => {
            const { errorHandlers, emit, decoder } = createDecoderWithErrors();
            const { missingSemicolonAfterCharacterReference: missingSemicolon } =
                errorHandlers;

            // Properly terminated entity: no error expected.
            decoder.startEntity(entities.DecodingMode.Legacy);
            expect(decoder.write("&amp;", 1)).toBe(5);
            expect(emit).toHaveBeenCalledTimes(1);
            expect(emit).toHaveBeenCalledWith(AMPERSAND, 5);
            expect(missingSemicolon).toHaveBeenCalledTimes(0);

            // Same entity without the semicolon: emitted on `end()`, with an error.
            decoder.startEntity(entities.DecodingMode.Legacy);
            expect(decoder.write("&amp", 1)).toBe(-1);
            expect(decoder.end()).toBe(4);

            expect(emit).toHaveBeenCalledTimes(2);
            expect(emit).toHaveBeenLastCalledWith(AMPERSAND, 4);
            expect(missingSemicolon).toHaveBeenCalledTimes(1);
        });

        it("should produce an error for a numeric entity without a semicolon", () => {
            const { errorHandlers, emit, decoder } = createDecoderWithErrors();
            const {
                missingSemicolonAfterCharacterReference: missingSemicolon,
                absenceOfDigitsInNumericCharacterReference: absenceOfDigits,
                validateNumericCharacterReference: validateNumeric,
            } = errorHandlers;

            decoder.startEntity(entities.DecodingMode.Legacy);
            expect(decoder.write("&#x3a", 1)).toBe(-1);
            expect(decoder.end()).toBe(5);

            expect(emit).toHaveBeenCalledTimes(1);
            expect(emit).toHaveBeenCalledWith(0x3a, 5);
            expect(missingSemicolon).toHaveBeenCalledTimes(1);
            expect(absenceOfDigits).toHaveBeenCalledTimes(0);
            expect(validateNumeric).toHaveBeenCalledTimes(1);
            expect(validateNumeric).toHaveBeenCalledWith(0x3a);
        });

        it("should produce an error for numeric entities without digits", () => {
            const { errorHandlers, emit, decoder } = createDecoderWithErrors();
            const {
                missingSemicolonAfterCharacterReference: missingSemicolon,
                absenceOfDigitsInNumericCharacterReference: absenceOfDigits,
                validateNumericCharacterReference: validateNumeric,
            } = errorHandlers;

            decoder.startEntity(entities.DecodingMode.Legacy);
            expect(decoder.write("&#", 1)).toBe(-1);
            expect(decoder.end()).toBe(0);

            // Nothing is emitted; only the "no digits" error fires, with the
            // consumed count as its argument.
            expect(emit).toHaveBeenCalledTimes(0);
            expect(missingSemicolon).toHaveBeenCalledTimes(0);
            expect(absenceOfDigits).toHaveBeenCalledTimes(1);
            expect(absenceOfDigits).toHaveBeenCalledWith(2);
            expect(validateNumeric).toHaveBeenCalledTimes(0);
        });

        it("should produce an error for hex entities without digits", () => {
            const { errorHandlers, emit, decoder } = createDecoderWithErrors();
            const {
                missingSemicolonAfterCharacterReference: missingSemicolon,
                absenceOfDigitsInNumericCharacterReference: absenceOfDigits,
                validateNumericCharacterReference: validateNumeric,
            } = errorHandlers;

            decoder.startEntity(entities.DecodingMode.Legacy);
            expect(decoder.write("&#x", 1)).toBe(-1);
            expect(decoder.end()).toBe(0);

            // Nothing is emitted; only the "no digits" error fires.
            expect(emit).toHaveBeenCalledTimes(0);
            expect(missingSemicolon).toHaveBeenCalledTimes(0);
            expect(absenceOfDigits).toHaveBeenCalledTimes(1);
            expect(validateNumeric).toHaveBeenCalledTimes(0);
        });
    });
});
Loading
Loading