From defe27baa0fbd707f935c28a1f14eefeb823e688 Mon Sep 17 00:00:00 2001 From: Andrew Duthie Date: Tue, 29 Jan 2019 12:04:59 -0500 Subject: [PATCH] Block API: Parse entity only when valid character reference (#13512) * Block API: Rename IdentityEntityParser as DecodeEntityParser * Block API: Parse entity only when valid character reference --- packages/blocks/CHANGELOG.md | 4 ++ packages/blocks/src/api/test/validation.js | 49 +++++++++++-- packages/blocks/src/api/validation.js | 81 ++++++++++++++++++++-- 3 files changed, 123 insertions(+), 11 deletions(-) diff --git a/packages/blocks/CHANGELOG.md b/packages/blocks/CHANGELOG.md index 4d607751d8a31b..e4a1b5977bcc01 100644 --- a/packages/blocks/CHANGELOG.md +++ b/packages/blocks/CHANGELOG.md @@ -4,6 +4,10 @@ - Blocks' `transforms` will receive `innerBlocks` as the second argument (or an array of each block's respective `innerBlocks` for a multi-transform). +### Bug Fixes + +- Block validation will now correctly validate character references, resolving some issues where a standalone ampersand `&` followed later in markup by a character reference (e.g. `&`) could wrongly mark a block as being invalid. ([#13512](https://github.com/WordPress/gutenberg/pull/13512)) + ## 6.0.5 (2019-01-03) ## 6.0.4 (2018-12-12) diff --git a/packages/blocks/src/api/test/validation.js b/packages/blocks/src/api/test/validation.js index babcad7081ae0e..fcfb61bff6ae2c 100644 --- a/packages/blocks/src/api/test/validation.js +++ b/packages/blocks/src/api/test/validation.js @@ -2,7 +2,8 @@ * Internal dependencies */ import { - IdentityEntityParser, + isValidCharacterReference, + DecodeEntityParser, getTextPiecesSplitOnWhitespace, getTextWithCollapsedWhitespace, getMeaningfulAttributePairs, @@ -41,13 +42,39 @@ describe( 'validation', () => { } ); } ); - describe( 'IdentityEntityParser', () => { + describe( 'isValidCharacterReference', () => { + it( 'returns true for a named character reference', () => { + const result = isValidCharacterReference( 'blk12' ); + + expect( result ).toBe( true ); + } ); + + it( 'returns true for a decimal character reference', () => { + const result = isValidCharacterReference( '#33' ); + + expect( result ).toBe( true ); + } ); + + it( 'returns true for a hexadecimal character reference', () => { + const result = isValidCharacterReference( '#xC6' ); + + expect( result ).toBe( true ); + } ); + + it( 'returns false for an invalid character reference', () => { + const result = isValidCharacterReference( ' Test

Test &' ); + + expect( result ).toBe( false ); + } ); + } ); + + describe( 'DecodeEntityParser', () => { it( 'can be constructed', () => { - expect( new IdentityEntityParser() instanceof IdentityEntityParser ).toBe( true ); + expect( new DecodeEntityParser() instanceof DecodeEntityParser ).toBe( true ); } ); it( 'returns parse as decoded value', () => { - expect( new IdentityEntityParser().parse( 'quot' ) ).toBe( '"' ); + expect( new DecodeEntityParser().parse( 'quot' ) ).toBe( '"' ); } ); } ); @@ -397,6 +424,20 @@ describe( 'validation', () => { expect( isEquivalent ).toBe( true ); } ); + it( 'should account for character reference validity', () => { + // Regression: Previously the validator would wrongly evaluate the + // segment of text ` Test

Test &` as a character + // reference, as it's between an opening `&` and terminating `;`. + // + // See: https://github.com/WordPress/gutenberg/issues/12448 + const isEquivalent = isEquivalentHTML( + '

Test & Test

Test & Test

', + '

Test & Test

Test & Test

' + ); + + expect( isEquivalent ).toBe( true ); + } ); + it( 'should return false when more tokens in first', () => { const isEquivalent = isEquivalentHTML( '
Hello
', diff --git a/packages/blocks/src/api/validation.js b/packages/blocks/src/api/validation.js index 4ccf580b8ffdd9..452c8f832a43fc 100644 --- a/packages/blocks/src/api/validation.js +++ b/packages/blocks/src/api/validation.js @@ -154,24 +154,91 @@ const TEXT_NORMALIZATIONS = [ ]; /** - * Subsitute EntityParser class for `simple-html-tokenizer` which bypasses - * entity substitution in favor of validator's internal normalization. + * Regular expression matching a named character reference. In lieu of bundling + * a full set of references, the pattern covers the minimal necessary to test + * positively against the full set. + * + * "The ampersand must be followed by one of the names given in the named + * character references section, using the same case." + * + * Tested aginst "12.5 Named character references": + * + * ``` + * const references = [ ...document.querySelectorAll( + * '#named-character-references-table tr[id^=entity-] td:first-child' + * ) ].map( ( code ) => code.textContent ) + * references.every( ( reference ) => /^[\da-z]+$/i.test( reference ) ) + * ``` + * + * @link https://html.spec.whatwg.org/multipage/syntax.html#character-references + * @link https://html.spec.whatwg.org/multipage/named-characters.html#named-character-references + * + * @type {RegExp} + */ +const REGEXP_NAMED_CHARACTER_REFERENCE = /^[\da-z]+$/i; + +/** + * Regular expression matching a decimal character reference. + * + * "The ampersand must be followed by a U+0023 NUMBER SIGN character (#), + * followed by one or more ASCII digits, representing a base-ten integer" + * + * @link https://html.spec.whatwg.org/multipage/syntax.html#character-references + * + * @type {RegExp} + */ +const REGEXP_DECIMAL_CHARACTER_REFERENCE = /^#\d+$/; + +/** + * Regular expression matching a hexadecimal character reference. + * + * "The ampersand must be followed by a U+0023 NUMBER SIGN character (#), which + * must be followed by either a U+0078 LATIN SMALL LETTER X character (x) or a + * U+0058 LATIN CAPITAL LETTER X character (X), which must then be followed by + * one or more ASCII hex digits, representing a hexadecimal integer" + * + * @link https://html.spec.whatwg.org/multipage/syntax.html#character-references + * + * @type {RegExp} + */ +const REGEXP_HEXADECIMAL_CHARACTER_REFERENCE = /^#x[\da-f]+$/i; + +/** + * Returns true if the given string is a valid character reference segment, or + * false otherwise. The text should be stripped of `&` and `;` demarcations. + * + * @param {string} text Text to test. + * + * @return {boolean} Whether text is valid character reference. + */ +export function isValidCharacterReference( text ) { + return ( + REGEXP_NAMED_CHARACTER_REFERENCE.test( text ) || + REGEXP_DECIMAL_CHARACTER_REFERENCE.test( text ) || + REGEXP_HEXADECIMAL_CHARACTER_REFERENCE.test( text ) + ); +} + +/** + * Subsitute EntityParser class for `simple-html-tokenizer` which uses the + * implementation of `decodeEntities` from `html-entities`, in order to avoid + * bundling a massive named character reference. * * @see https://github.com/tildeio/simple-html-tokenizer/tree/master/src/entity-parser.ts */ -export class IdentityEntityParser { +export class DecodeEntityParser { /** * Returns a substitute string for an entity string sequence between `&` * and `;`, or undefined if no substitution should occur. * - * In this implementation, undefined is always returned. - * * @param {string} entity Entity fragment discovered in HTML. * * @return {?string} Entity substitute value. */ parse( entity ) { - return decodeEntities( '&' + entity + ';' ); + if ( isValidCharacterReference( entity ) ) { + return decodeEntities( '&' + entity + ';' ); + } } } @@ -451,7 +518,7 @@ export function getNextNonWhitespaceToken( tokens ) { */ function getHTMLTokens( html ) { try { - return new Tokenizer( new IdentityEntityParser() ).tokenize( html ); + return new Tokenizer( new DecodeEntityParser() ).tokenize( html ); } catch ( e ) { log.warning( 'Malformed HTML detected: %s', html ); }