Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Allow a hash of options to be passed to html2text #65

Merged
merged 3 commits into from
Feb 14, 2019
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
17 changes: 17 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -64,6 +64,23 @@ $text = \Soundasleep\Html2Text::convert($html);

You can also include the supplied `html2text.php` and use `$text = convert_html_to_text($html);` instead.

### Options

| Option | Default | Description |
|--------|---------|-------------|
| **ignore_errors** | `false` | Set to `true` to ignore any XML parsing errors. |
| **drop_links** | `false` | Set to `true` to not render links as `[My Link](http://foo.com)`, but rather just `My Link`. |

Pass along options as a second argument to `convert`, for example:

```php
$options = array(
'ignore_errors' => true,
// other options go here
);
$text = \Soundasleep\Html2Text::convert($html, $options);
```

## Tests

Some very basic tests are provided in the `tests/` directory. Run them with `composer install && vendor/bin/phpunit`.
Expand Down
47 changes: 38 additions & 9 deletions src/Html2Text.php
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,13 @@

class Html2Text {

/**
 * Returns the baseline option set that convert() merges caller-supplied
 * options over; its keys are also the only option names convert() accepts.
 *
 * @return array option name => default value
 */
public static function defaultOptions() {
    $defaults = array();
    $defaults['ignore_errors'] = false;
    $defaults['drop_links'] = false;

    return $defaults;
}

/**
* Tries to convert the given HTML into a plain text format - best suited for
* e-mail display, etc.
Expand All @@ -19,7 +26,21 @@ class Html2Text {
* @return string the HTML converted, as best as possible, to text
* @throws Html2TextException if the HTML could not be loaded as a {@link \DOMDocument}
*/
public static function convert($html, $ignore_error = false) {
public static function convert($html, $options = array()) {

if ($options === false || $options === true) {
// Using old style (< 1.0) of passing in options
$options = array('ignore_errors' => $options);
}

$options = array_merge(static::defaultOptions(), $options);

// check all options are valid
foreach ($options as $key => $value) {
if (!in_array($key, array_keys(static::defaultOptions()))) {
throw new \InvalidArgumentException("Unknown html2text option '$key'");
}
}

$is_office_document = static::isOfficeDocument($html);

Expand All @@ -33,9 +54,9 @@ public static function convert($html, $ignore_error = false) {
$html = mb_convert_encoding($html, "HTML-ENTITIES", "UTF-8");
}

$doc = static::getDocument($html, $ignore_error);
$doc = static::getDocument($html, $options['ignore_errors']);

$output = static::iterateOverNode($doc, null, false, $is_office_document);
$output = static::iterateOverNode($doc, null, false, $is_office_document, $options);

// process output for whitespace/newlines
$output = static::processWhitespaceNewlines($output);
Expand Down Expand Up @@ -186,7 +207,7 @@ static function nextChildName($node) {
return $nextName;
}

static function iterateOverNode($node, $prevName = null, $in_pre = false, $is_office_document = false) {
static function iterateOverNode($node, $prevName = null, $in_pre = false, $is_office_document = false, $options) {
if ($node instanceof \DOMText) {
// Replace whitespace characters with a space (equivalent to \s)
if ($in_pre) {
Expand Down Expand Up @@ -307,7 +328,7 @@ static function iterateOverNode($node, $prevName = null, $in_pre = false, $is_of

while ($n != null) {

$text = static::iterateOverNode($n, $previousSiblingName, $in_pre || $name == 'pre', $is_office_document);
$text = static::iterateOverNode($n, $previousSiblingName, $in_pre || $name == 'pre', $is_office_document, $options);

// Pass current node name to next child, as previousSibling does not appear to get populated
if ($n instanceof \DOMDocumentType
Expand Down Expand Up @@ -391,19 +412,27 @@ static function iterateOverNode($node, $prevName = null, $in_pre = false, $is_of
if ($href == null) {
// it doesn't link anywhere
if ($node->getAttribute("name") != null) {
$output = "[$output]";
if ($options['drop_links']) {
$output = "$output";
} else {
$output = "[$output]";
}
}
} else {
if ($href == $output || $href == "mailto:$output" || $href == "http://$output" || $href == "https://$output") {
// link to the same address: just use link
$output;
$output = "$output";
} else {
// replace it
if ($output) {
$output = "[$output]($href)";
if ($options['drop_links']) {
$output = "$output";
} else {
$output = "[$output]($href)";
}
} else {
// empty string
$output = $href;
$output = "$href";
}
}
}
Expand Down
42 changes: 33 additions & 9 deletions tests/Html2TextTest.php
Original file line number Diff line number Diff line change
Expand Up @@ -4,16 +4,20 @@

class Html2TextTest extends \PHPUnit\Framework\TestCase {

function doTest($test, $ignoreXmlError = false) {
/**
 * Runs a fixture test whose expected-output file shares the input's base name.
 *
 * @param string $test base name used for both the input and the expected-result fixture
 * @param array|bool $options forwarded unchanged to doTestWithResults()
 */
function doTest($test, $options = array()) {
    $expectedName = $test;

    return $this->doTestWithResults($test, $expectedName, $options);
}

function doTestWithResults($test, $result, $options = array()) {
$this->assertTrue(file_exists(__DIR__ . "/$test.html"), "File '$test.html' did not exist");
$this->assertTrue(file_exists(__DIR__ . "/$test.txt"), "File '$test.txt' did not exist");
$this->assertTrue(file_exists(__DIR__ . "/$result.txt"), "File '$result.txt' did not exist");
$input = file_get_contents(__DIR__ . "/$test.html");
$expected = \Soundasleep\Html2Text::fixNewlines(file_get_contents(__DIR__ . "/$test.txt"));
$expected = \Soundasleep\Html2Text::fixNewlines(file_get_contents(__DIR__ . "/$result.txt"));

$output = \Soundasleep\Html2Text::convert($input, $ignoreXmlError);
$output = \Soundasleep\Html2Text::convert($input, $options);

if ($output != $expected) {
file_put_contents(__DIR__ . "/$test.output", $output);
file_put_contents(__DIR__ . "/$result.output", $output);
}
$this->assertEquals($output, $expected);
}
Expand Down Expand Up @@ -103,14 +107,34 @@ function testHugeMsoffice() {
}

/**
* @expectedException PHPUnit\Framework\Error\Warning
*/
* @expectedException PHPUnit\Framework\Error\Warning
*/
function testInvalidXML() {
$this->doTest("invalid", false);
$this->doTest("invalid", array('ignore_errors' => false));
}

/**
 * Converts the "invalid" fixture with the ignore_errors option switched on.
 */
function testInvalidXMLIgnore() {
    $options = array('ignore_errors' => true);
    $this->doTest("invalid", $options);
}

/**
 * Converts the "invalid" fixture passing a bare boolean as the second
 * argument instead of an options array.
 */
function testInvalidXMLIgnoreOldSyntax() {
    // backwards compatibility: the legacy convert(text, bool) call form must still work
    $legacyIgnoreErrors = true;
    $this->doTest("invalid", $legacyIgnoreErrors);
}

}
/**
 * Supplying an option name that is not among the defaults must be rejected.
 *
 * @expectedException InvalidArgumentException
 */
function testInvalidOption() {
    $options = array('invalid_option' => true);
    $this->doTest("basic", $options);
}

/**
 * Converts the "basic" fixture with drop_links enabled and compares the
 * output against the basic.no-links.txt fixture.
 */
function testBasicDropLinks() {
    $options = array('drop_links' => true);
    $this->doTestWithResults("basic", "basic.no-links", $options);
}

/**
 * Converts the "anchors" fixture with drop_links enabled and compares the
 * output against the anchors.no-links.txt fixture.
 */
function testAnchorsDropLinks() {
    $options = array('drop_links' => true);
    $this->doTestWithResults("anchors", "anchors.no-links", $options);
}

}
5 changes: 5 additions & 0 deletions tests/anchors.no-links.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
A document without any HTML open/closing tags.
---------------------------------------------------------------
We try and use the representation given by common browsers of the HTML document, so that it looks similar when converted to plain text. visit foo.com - or http://www.foo.com link

An anchor which will not appear
15 changes: 15 additions & 0 deletions tests/basic.no-links.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
Hello, World!

This is some e-mail content. Even though it has whitespace and newlines, the e-mail converter will handle it correctly.

Even mismatched tags.

A div
Another div
A div
within a div

Another line
Yet another line

A link