diff --git a/.editorconfig b/.editorconfig index a96bcae..7974b6a 100644 --- a/.editorconfig +++ b/.editorconfig @@ -17,7 +17,7 @@ indent_style = space indent_size = 2 # don't add newlines to test files -[spec/examples/*] +[tests/*] indent_style = tabs trim_trailing_whitespace = false insert_final_newline = false diff --git a/CHANGELOG.md b/CHANGELOG.md index aab5930..2f39868 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -9,7 +9,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [1.0.0] - TBC ### Changed - **Important:** Changed namespace from `\Html2Text\Html2Text` to `\Soundasleep\Html2text` [#45](https://github.com/soundasleep/html2text/issues/45) -- Convert non-breaking spaces to regular spaces to prevent output issues +- Treat non-breaking spaces consistently: never include them in output text [#64](https://github.com/soundasleep/html2text/pull/64) - Optimise/improve newline & whitespace handling [#47](https://github.com/soundasleep/html2text/pull/47) - Upgrade PHP support to PHP 7.3+ - Upgrade PHPUnit to 7.x diff --git a/README.md b/README.md index 25705c0..be735dc 100644 --- a/README.md +++ b/README.md @@ -33,10 +33,12 @@ Hello, World! This is some e-mail content. Even though it has whitespace and newlines, the e-mail converter will handle it correctly. Even mismatched tags. 
+ A div Another div A div within a div + [A link](http://foo.com) ``` diff --git a/src/Html2Text.php b/src/Html2Text.php index 4926a12..0dc34f9 100644 --- a/src/Html2Text.php +++ b/src/Html2Text.php @@ -60,6 +60,13 @@ static function fixNewlines($text) { return $text; } + static function nbspCodes() { + return array( + "\xc2\xa0", + "\u{00a0}", + ); + } + /** * Remove leading or trailing spaces and excess empty lines from provided multiline text * @@ -80,7 +87,7 @@ static function processWhitespaceNewlines($text) { // convert non-breaking spaces to regular spaces to prevent output issues, // do it here so they do NOT get removed with other leading spaces, as they // are sometimes used for indentation - $text = str_replace("\xc2\xa0", " ", $text); + $text = str_replace(static::nbspCodes(), " ", $text); // remove trailing whitespace $text = rtrim($text); @@ -151,7 +158,7 @@ static function isOfficeDocument($html) { } static function isWhitespace($text) { - return strlen(trim($text, "\n\r\t ")) === 0; + return strlen(trim(str_replace(static::nbspCodes(), " ", $text), "\n\r\t ")) === 0; } static function nextChildName($node) { @@ -163,11 +170,14 @@ static function nextChildName($node) { break; } } + if ($nextNode instanceof \DOMElement) { break; } + $nextNode = $nextNode->nextSibling; } + $nextName = null; if (($nextNode instanceof \DOMElement || $nextNode instanceof \DOMText) && $nextNode != null) { $nextName = strtolower($nextNode->nodeName); @@ -177,28 +187,29 @@ static function nextChildName($node) { } static function iterateOverNode($node, $prevName = null, $in_pre = false, $is_office_document = false) { - if ($node instanceof \DOMText) { // Replace whitespace characters with a space (equivilant to \s) if ($in_pre) { - $text = "\n" . trim($node->wholeText, "\n\r\t ") . "\n"; + $text = "\n" . trim(str_replace(static::nbspCodes(), " ", $node->wholeText), "\n\r\t ") . 
"\n"; + // Remove trailing whitespace only $text = preg_replace("/[ \t]*\n/im", "\n", $text); + // armor newlines with \r. return str_replace("\n", "\r", $text); + } else { - $text = preg_replace("/[\\t\\n\\f\\r ]+/im", " ", $node->wholeText); + $text = str_replace(static::nbspCodes(), " ", $node->wholeText); + $text = preg_replace("/[\\t\\n\\f\\r ]+/im", " ", $text); + if (!static::isWhitespace($text) && ($prevName == 'p' || $prevName == 'div')) { return "\n" . $text; } return $text; } } - if ($node instanceof \DOMDocumentType) { - // ignore - return ""; - } - if ($node instanceof \DOMProcessingInstruction) { + + if ($node instanceof \DOMDocumentType || $node instanceof \DOMProcessingInstruction) { // ignore return ""; } @@ -254,6 +265,7 @@ static function iterateOverNode($node, $prevName = null, $in_pre = false, $is_of $name = 'br'; break; } + // add two lines $output = "\n\n"; break; @@ -293,7 +305,7 @@ static function iterateOverNode($node, $prevName = null, $in_pre = false, $is_of $parts = array(); $trailing_whitespace = 0; - while($n != null) { + while ($n != null) { $text = static::iterateOverNode($n, $previousSiblingName, $in_pre || $name == 'pre', $is_office_document); @@ -317,7 +329,7 @@ static function iterateOverNode($node, $prevName = null, $in_pre = false, $is_of } // Remove trailing whitespace, important for the br check below - while($trailing_whitespace-- > 0) { + while ($trailing_whitespace-- > 0) { array_pop($parts); } diff --git a/tests/nbsp.txt b/tests/nbsp.txt index efdde8c..b09619d 100644 --- a/tests/nbsp.txt +++ b/tests/nbsp.txt @@ -1 +1 @@ -hello world & people < > &NBSP; \ No newline at end of file +hello world & people < > &NBSP; \ No newline at end of file