Skip to content

Commit

Permalink
Ignore attributes with illegal chars in name (fixes #23)
Browse files Browse the repository at this point in the history
This is necessary because the method "DOMElement::setAttribute"
throws an exception for invalid names, so DOM elements
can't contain these attributes.
  • Loading branch information
miso-belica committed Feb 21, 2014
1 parent 95f3cf8 commit 8f95f4a
Show file tree
Hide file tree
Showing 3 changed files with 36 additions and 6 deletions.
8 changes: 7 additions & 1 deletion src/HTML5/Parser/DOMTreeBuilder.php
Original file line number Diff line number Diff line change
Expand Up @@ -234,7 +234,13 @@ public function startTag($name, $attributes = array(), $selfClosing = FALSE) {
$aName = Elements::normalizeMathMlAttribute($aName);
}

$ele->setAttribute($aName, $aVal);
try {
$ele->setAttribute($aName, $aVal);
}
catch(\DOMException $e) {
$this->parseError("Illegal attribute name for tag $name. Ignoring: $aName");
continue;
}

// This is necessary on a non-DTD schema, like HTML5.
if ($aName == 'id') {
Expand Down
25 changes: 21 additions & 4 deletions src/HTML5/Parser/Tokenizer.php
Original file line number Diff line number Diff line change
Expand Up @@ -414,16 +414,33 @@ protected function attribute(&$attributes) {
$name = $this->scanner->current();
$this->scanner->next();
}
if (preg_match('/[\'\"]/', $name)) {
//if (strspn($name, '\'\"')) {

$isValidAttribute = TRUE;
// Attribute names can contain most Unicode characters for HTML5.
// But the method "DOMElement::setAttribute" throws an exception
// because of its own internal restrictions, so these have to be filtered.
// see issue #23: https://github.com/Masterminds/html5-php/issues/23
// and http://www.w3.org/TR/2011/WD-html5-20110525/syntax.html#syntax-attribute-name
if (preg_match("/[\x1-\x2C\\/\x3B-\x40\x5B-\x5E\x60\x7B-\x7F]/u", $name)) {
$this->parseError("Unexpected characters in attribute name: %s", $name);
$isValidAttribute = FALSE;
}
// There is no limitation for 1st character in HTML5.
// But method "DOMElement::setAttribute" is throwing exception for the
// characters below so they have to be filtered.
// see issue #23: https://github.com/Masterminds/html5-php/issues/23
// and http://www.w3.org/TR/2011/WD-html5-20110525/syntax.html#syntax-attribute-name
else if (preg_match("/^[0-9.-]/u", $name)) {
$this->parseError("Unexpected character at the begining of attribute name: %s", $name);
$isValidAttribute = FALSE;
}
// 8.1.2.3
$this->scanner->whitespace();

$val = $this->attributeValue();
//return array($name, $val);
$attributes[$name] = $val;
if($isValidAttribute) {
$attributes[$name] = $val;
}
return TRUE;
}

Expand Down
9 changes: 8 additions & 1 deletion test/HTML5/Parser/TokenizerTest.php
Original file line number Diff line number Diff line change
Expand Up @@ -363,11 +363,18 @@ public function testTagAttributes() {
// This will emit an entity lookup failure for &red.
"<foo a='blue&red'>" => array('foo', array('a' => 'blue&red'), FALSE),
"<foo a='blue&&amp;&red'>" => array('foo', array('a' => 'blue&&&red'), FALSE),
'<foo b"="baz">' => array('foo', array('b"' => 'baz'), FALSE),
'<foo bar=>' => array('foo', array('bar' => NULL), FALSE),
'<foo bar="oh' => array('foo', array('bar' => 'oh'), FALSE),
'<foo bar=oh">' => array('foo', array('bar' => 'oh"'), FALSE),

// these attributes are ignored because of current implementation
// of method "DOMElement::setAttribute"
// see issue #23: https://github.com/Masterminds/html5-php/issues/23
'<foo b"="baz">' => array('foo', array(), FALSE),
'<foo 2abc="baz">' => array('foo', array(), FALSE),
'<foo ?="baz">' => array('foo', array(), FALSE),
'<foo foo?bar="baz">' => array('foo', array(), FALSE),

);
foreach ($bad as $test => $expects) {
$events = $this->parse($test);
Expand Down

0 comments on commit 8f95f4a

Please sign in to comment.