Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Strip out any `&nbsp;` in converted text #64

Merged
merged 2 commits into from
Feb 13, 2019
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .editorconfig
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@ indent_style = space
indent_size = 2

# don't add newlines to test files
[spec/examples/*]
[tests/*]
indent_style = tabs
trim_trailing_whitespace = false
insert_final_newline = false
2 changes: 1 addition & 1 deletion CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
## [1.0.0] - TBC
### Changed
- **Important:** Changed namespace from `\Html2Text\Html2Text` to `\Soundasleep\Html2text` [#45](https://github.com/soundasleep/html2text/issues/45)
- Convert non-breaking spaces to regular spaces to prevent output issues
- Treat non-breaking spaces consistently: never include them in output text [#64](https://github.com/soundasleep/html2text/pull/64)
- Optimise/improve newline & whitespace handling [#47](https://github.com/soundasleep/html2text/pull/47)
- Upgrade PHP support to PHP 7.3+
- Upgrade PHPUnit to 7.x
Expand Down
2 changes: 2 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -33,10 +33,12 @@ Hello, World!
This is some e-mail content. Even though it has whitespace and newlines, the e-mail converter will handle it correctly.

Even mismatched tags.

A div
Another div
A div
within a div

[A link](http://foo.com)
```

Expand Down
36 changes: 24 additions & 12 deletions src/Html2Text.php
Original file line number Diff line number Diff line change
Expand Up @@ -60,6 +60,13 @@ static function fixNewlines($text) {
return $text;
}

/**
 * Byte sequences that represent a non-breaking space (U+00A0) in the text
 * being converted. Callers pass this list to str_replace() to turn
 * non-breaking spaces into regular spaces.
 *
 * Only the UTF-8 encoding is listed. PHP has no "\uXXXX" escape, so the
 * string "\u00a0" is the six literal characters backslash-u-0-0-a-0 and
 * would match ordinary text rather than a non-breaking space. The valid
 * escape "\u{00a0}" yields exactly the same bytes as "\xc2\xa0".
 *
 * @return string[] search strings suitable as the first argument of str_replace()
 */
static function nbspCodes() {
    return array(
        "\xc2\xa0", // U+00A0 NO-BREAK SPACE, UTF-8 encoded (same bytes as "\u{00a0}")
    );
}

/**
* Remove leading or trailing spaces and excess empty lines from provided multiline text
*
Expand All @@ -80,7 +87,7 @@ static function processWhitespaceNewlines($text) {
// convert non-breaking spaces to regular spaces to prevent output issues,
// do it here so they do NOT get removed with other leading spaces, as they
// are sometimes used for indentation
$text = str_replace("\xc2\xa0", " ", $text);
$text = str_replace(static::nbspCodes(), " ", $text);

// remove trailing whitespace
$text = rtrim($text);
Expand Down Expand Up @@ -151,7 +158,7 @@ static function isOfficeDocument($html) {
}

static function isWhitespace($text) {
return strlen(trim($text, "\n\r\t ")) === 0;
return strlen(trim(str_replace(static::nbspCodes(), " ", $text), "\n\r\t ")) === 0;
}

static function nextChildName($node) {
Expand All @@ -163,11 +170,14 @@ static function nextChildName($node) {
break;
}
}

if ($nextNode instanceof \DOMElement) {
break;
}

$nextNode = $nextNode->nextSibling;
}

$nextName = null;
if (($nextNode instanceof \DOMElement || $nextNode instanceof \DOMText) && $nextNode != null) {
$nextName = strtolower($nextNode->nodeName);
Expand All @@ -177,28 +187,29 @@ static function nextChildName($node) {
}

static function iterateOverNode($node, $prevName = null, $in_pre = false, $is_office_document = false) {

if ($node instanceof \DOMText) {
// Replace whitespace characters with a space (equivalent to \s)
if ($in_pre) {
$text = "\n" . trim($node->wholeText, "\n\r\t ") . "\n";
$text = "\n" . trim(str_replace(static::nbspCodes(), " ", $node->wholeText), "\n\r\t ") . "\n";

// Remove trailing whitespace only
$text = preg_replace("/[ \t]*\n/im", "\n", $text);

// armor newlines with \r.
return str_replace("\n", "\r", $text);

} else {
$text = preg_replace("/[\\t\\n\\f\\r ]+/im", " ", $node->wholeText);
$text = str_replace(static::nbspCodes(), " ", $node->wholeText);
$text = preg_replace("/[\\t\\n\\f\\r ]+/im", " ", $text);

if (!static::isWhitespace($text) && ($prevName == 'p' || $prevName == 'div')) {
return "\n" . $text;
}
return $text;
}
}
if ($node instanceof \DOMDocumentType) {
// ignore
return "";
}
if ($node instanceof \DOMProcessingInstruction) {

if ($node instanceof \DOMDocumentType || $node instanceof \DOMProcessingInstruction) {
// ignore
return "";
}
Expand Down Expand Up @@ -254,6 +265,7 @@ static function iterateOverNode($node, $prevName = null, $in_pre = false, $is_of
$name = 'br';
break;
}

// add two lines
$output = "\n\n";
break;
Expand Down Expand Up @@ -293,7 +305,7 @@ static function iterateOverNode($node, $prevName = null, $in_pre = false, $is_of
$parts = array();
$trailing_whitespace = 0;

while($n != null) {
while ($n != null) {

$text = static::iterateOverNode($n, $previousSiblingName, $in_pre || $name == 'pre', $is_office_document);

Expand All @@ -317,7 +329,7 @@ static function iterateOverNode($node, $prevName = null, $in_pre = false, $is_of
}

// Remove trailing whitespace, important for the br check below
while($trailing_whitespace-- > 0) {
while ($trailing_whitespace-- > 0) {
array_pop($parts);
}

Expand Down
2 changes: 1 addition & 1 deletion tests/nbsp.txt
Original file line number Diff line number Diff line change
@@ -1 +1 @@
hello world & people < > &NBSP;
hello world & people < > &NBSP;