diff --git a/README.md b/README.md
index be735dc..fbe1e67 100644
--- a/README.md
+++ b/README.md
@@ -64,6 +64,23 @@ $text = \Soundasleep\Html2Text::convert($html);
 
 You can also include the supplied `html2text.php` and use `$text = convert_html_to_text($html);` instead.
 
+### Options
+
+| Option | Default | Description |
+|--------|---------|-------------|
+| **ignore_errors** | `false` | Set to `true` to ignore any XML parsing errors. |
+| **drop_links** | `false` | Set to `true` to not render links as `[My Link](http://foo.com)`, but rather just `My Link`. |
+
+Pass along options as a second argument to `convert`, for example:
+
+```php
+$options = array(
+  'ignore_errors' => true,
+  // other options go here
+);
+$text = \Soundasleep\Html2Text::convert($html, $options);
+```
+
 ## Tests
 
 Some very basic tests are provided in the `tests/` directory. Run them with `composer install && vendor/bin/phpunit`.
diff --git a/src/Html2Text.php b/src/Html2Text.php
index 0dc34f9..0670173 100644
--- a/src/Html2Text.php
+++ b/src/Html2Text.php
@@ -4,6 +4,21 @@
 
 class Html2Text {
 
+	/**
+	 * Returns the options understood by {@link convert()}, with their defaults.
+	 *
+	 * - ignore_errors: when true, XML parsing errors are ignored
+	 * - drop_links: when true, links are rendered as their text only
+	 *
+	 * @return array map of option name to default value
+	 */
+	public static function defaultOptions() {
+		return array(
+			'ignore_errors' => false,
+			'drop_links' => false,
+		);
+	}
+
 	/**
 	 * Tries to convert the given HTML into a plain text format - best suited for
 	 * e-mail display, etc.
@@ -19,7 +34,22 @@ class Html2Text {
 	 * @return string the HTML converted, as best as possible, to text
 	 * @throws Html2TextException if the HTML could not be loaded as a {@link \DOMDocument}
+	 * @throws \InvalidArgumentException if an unknown option is supplied
 	 */
-	public static function convert($html, $ignore_error = false) {
+	public static function convert($html, $options = array()) {
+
+		if ($options === false || $options === true) {
+			// Using old style (< 1.0) of passing in options
+			$options = array('ignore_errors' => $options);
+		}
+
+		$options = array_merge(static::defaultOptions(), $options);
+
+		// check all options are valid (strict key lookup, so stray integer keys such as 0 are rejected too)
+		foreach ($options as $key => $value) {
+			if (!array_key_exists($key, static::defaultOptions())) {
+				throw new \InvalidArgumentException("Unknown html2text option '$key'");
+			}
+		}
 
 		$is_office_document = static::isOfficeDocument($html);
 
@@ -33,9 +63,9 @@ public static function convert($html, $ignore_error = false) {
 			$html = mb_convert_encoding($html, "HTML-ENTITIES", "UTF-8");
 		}
 
-		$doc = static::getDocument($html, $ignore_error);
+		$doc = static::getDocument($html, $options['ignore_errors']);
 
-		$output = static::iterateOverNode($doc, null, false, $is_office_document);
+		$output = static::iterateOverNode($doc, null, false, $is_office_document, $options);
 
 		// process output for whitespace/newlines
 		$output = static::processWhitespaceNewlines($output);
@@ -186,7 +216,7 @@ static function nextChildName($node) {
 		return $nextName;
 	}
 
-	static function iterateOverNode($node, $prevName = null, $in_pre = false, $is_office_document = false) {
+	static function iterateOverNode($node, $prevName = null, $in_pre = false, $is_office_document = false, $options = array()) {
 		if ($node instanceof \DOMText) {
 			// Replace whitespace characters with a space (equivilant to \s)
 			if ($in_pre) {
@@ -307,7 +337,7 @@ static function iterateOverNode($node, $prevName = null, $in_pre = false, $is_of
 
 			while ($n != null) {
 
-				$text = static::iterateOverNode($n, $previousSiblingName, $in_pre || $name == 'pre', $is_office_document);
+				$text = static::iterateOverNode($n, $previousSiblingName, $in_pre || $name == 'pre', $is_office_document, $options);
 
 				// Pass current node name to next child, as previousSibling does not appear to get populated
 				if ($n instanceof \DOMDocumentType
@@ -391,19 +421,27 @@ static function iterateOverNode($node, $prevName = null, $in_pre = false, $is_of
 				if ($href == null) {
 					// it doesn't link anywhere
 					if ($node->getAttribute("name") != null) {
-						$output = "[$output]";
+						if (!empty($options['drop_links'])) {
+							$output = "$output";
+						} else {
+							$output = "[$output]";
+						}
 					}
 				} else {
 					if ($href == $output || $href == "mailto:$output" || $href == "http://$output" || $href == "https://$output") {
 						// link to the same address: just use link
-						$output;
+						$output = "$output";
 					} else {
 						// replace it
 						if ($output) {
-							$output = "[$output]($href)";
+							if (!empty($options['drop_links'])) {
+								$output = "$output";
+							} else {
+								$output = "[$output]($href)";
+							}
 						} else {
 							// empty string
-							$output = $href;
+							$output = "$href";
 						}
 					}
 				}
diff --git a/tests/Html2TextTest.php b/tests/Html2TextTest.php
index 83f4c03..fe33c57 100644
--- a/tests/Html2TextTest.php
+++ b/tests/Html2TextTest.php
@@ -4,16 +4,25 @@
 
 class Html2TextTest extends \PHPUnit\Framework\TestCase {
 
-	function doTest($test, $ignoreXmlError = false) {
+	function doTest($test, $options = array()) {
+		return $this->doTestWithResults($test, $test, $options);
+	}
+
+	/**
+	 * Converts tests/$test.html with the given options and compares the
+	 * output against tests/$result.txt. On mismatch the actual output is
+	 * written to tests/$result.output.
+	 */
+	function doTestWithResults($test, $result, $options = array()) {
 		$this->assertTrue(file_exists(__DIR__ . "/$test.html"), "File '$test.html' did not exist");
-		$this->assertTrue(file_exists(__DIR__ . "/$test.txt"), "File '$test.txt' did not exist");
+		$this->assertTrue(file_exists(__DIR__ . "/$result.txt"), "File '$result.txt' did not exist");
 		$input = file_get_contents(__DIR__ . "/$test.html");
-		$expected = \Soundasleep\Html2Text::fixNewlines(file_get_contents(__DIR__ . "/$test.txt"));
+		$expected = \Soundasleep\Html2Text::fixNewlines(file_get_contents(__DIR__ . "/$result.txt"));
 
-		$output = \Soundasleep\Html2Text::convert($input, $ignoreXmlError);
+		$output = \Soundasleep\Html2Text::convert($input, $options);
 
 		if ($output != $expected) {
-			file_put_contents(__DIR__ . "/$test.output", $output);
+			file_put_contents(__DIR__ . "/$result.output", $output);
 		}
-		$this->assertEquals($output, $expected);
+		$this->assertEquals($expected, $output);
 	}
@@ -103,14 +112,34 @@ function testHugeMsoffice() {
 	}
 
 	/**
-     * @expectedException PHPUnit\Framework\Error\Warning
-     */
+	 * @expectedException PHPUnit\Framework\Error\Warning
+	 */
 	function testInvalidXML() {
-		$this->doTest("invalid", false);
+		$this->doTest("invalid", array('ignore_errors' => false));
 	}
 
 	function testInvalidXMLIgnore() {
+		$this->doTest("invalid", array('ignore_errors' => true));
+	}
+
+	function testInvalidXMLIgnoreOldSyntax() {
+		// for BC, allow old #convert(text, bool) syntax
 		$this->doTest("invalid", true);
 	}
 
+	/**
+	 * @expectedException InvalidArgumentException
+	 */
+	function testInvalidOption() {
+		$this->doTest("basic", array('invalid_option' => true));
+	}
+
+	function testBasicDropLinks() {
+		$this->doTestWithResults("basic", "basic.no-links", array('drop_links' => true));
+	}
+
+	function testAnchorsDropLinks() {
+		$this->doTestWithResults("anchors", "anchors.no-links", array('drop_links' => true));
+	}
+
 }
diff --git a/tests/anchors.no-links.txt b/tests/anchors.no-links.txt
new file mode 100644
index 0000000..b3dff43
--- /dev/null
+++ b/tests/anchors.no-links.txt
@@ -0,0 +1,5 @@
+A document without any HTML open/closing tags.
+---------------------------------------------------------------
+We try and use the representation given by common browsers of the HTML document, so that it looks similar when converted to plain text. visit foo.com - or http://www.foo.com link
+
+An anchor which will not appear
\ No newline at end of file
diff --git a/tests/basic.no-links.txt b/tests/basic.no-links.txt
new file mode 100644
index 0000000..8e5b8e4
--- /dev/null
+++ b/tests/basic.no-links.txt
@@ -0,0 +1,15 @@
+Hello, World!
+
+This is some e-mail content. Even though it has whitespace and newlines, the e-mail converter will handle it correctly.
+
+Even mismatched tags.
+
+A div
+Another div
+A div
+within a div
+
+Another line
+Yet another line
+
+A link
\ No newline at end of file