Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Image ALT tags, conversion options, footer URLs and silent DOM. #24

Closed
wants to merge 4 commits into from
Closed
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
74 changes: 69 additions & 5 deletions src/Html2Text.php
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,23 @@

class Html2Text {

/**
* Replaces URLs from <a> tags with a reference number (eg: "[1]") and moves the URL itself to
* the end of the document. Makes the resulting text much easier to read if your HTML contains
* many long URLs. The downside is that the user has to scroll to the bottom of the document in
* order to find (and click on) the URL. It's a trade-off, and a decision you can make per
* document.
*/
const OPT_FOOTER_URLS = 1;

/**
* If you use the OPT_FOOTER_URLS option, this variable will keep track of which indexes point
* to which URLs, so they can be inserted at the end of the converted text.
* @var array Associative array, where the key is a URL, and the value is an associative array
* of properties (currently "index" and "text").
*/
static $_indexedUrls = array();

/**
* Tries to convert the given HTML into a plain text format - best suited for
* e-mail display, etc.
Expand All @@ -30,28 +47,53 @@ class Html2Text {
* </ul>
*
* @param string html the input HTML
* @param array An array of options of the Html2Text::OPT_* variety

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Should start with @param array options An array ... instead. Variable name is missing.

* @return string the HTML converted, as best as possible, to text
* @throws Html2TextException if the HTML could not be loaded as a {@link DOMDocument}
*/
static function convert($html) {
static function convert($html,$options=array()) {

// reset
Html2Text::$_indexedUrls = array();

// DOMDocument doesn't support empty value and throws an error
if (!$html) {
return '';
}

// replace &nbsp; with spaces
$html = str_replace("&nbsp;", " ", $html);

$html = static::fixNewlines($html);

$doc = new \DOMDocument();
if (!$doc->loadHTML($html)) {
$doc->strictErrorChecking = FALSE;
$doc->recover = TRUE;
$doc->xmlStandalone = true;

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It would be nicer to add a comment to each of the above 3 lines, explaining why it is used.

$prevValue = libxml_use_internal_errors(true); //prevent $doc to trhow any warnings

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Typo in the comment: trhow

$loaded = $doc->loadHTML($html,LIBXML_NOWARNING | LIBXML_NOERROR | LIBXML_NONET);
libxml_use_internal_errors($prevValue); //restore original setting

if (!$loaded) {
throw new Html2TextException("Could not load HTML - badly formed?", $html);
}

$output = static::iterateOverNode($doc);
$output = static::iterateOverNode($doc,$options);

// remove leading and trailing spaces on each line
$output = preg_replace("/[ \t]*\n[ \t]*/im", "\n", $output);

// remove leading and trailing whitespace
$output = trim($output);

// if they want URLs at the end of the document instead of inline, append them here
if (in_array(static::OPT_FOOTER_URLS,$options) && Html2Text::$_indexedUrls) {

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

As per the project's convention, there should be one space after commas.

$output .= "\n\n------\n\n";

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

$output .= "\n\n"; might be simpler and better.

foreach (Html2Text::$_indexedUrls as $url=>$info) {
$output .= "[".$info['index']."] ".($info['text']?$info['text']." ":"").$url."\n";
}
}

return $output;
}

Expand Down Expand Up @@ -106,7 +148,7 @@ static function prevChildName($node) {
return $nextName;
}

static function iterateOverNode($node) {
static function iterateOverNode($node,&$options) {

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why the &?

if ($node instanceof \DOMText) {
// Replace whitespace characters with a space (equivalent to \s)
return preg_replace("/[\\t\\n\\f\\r ]+/im", " ", $node->wholeText);
Expand Down Expand Up @@ -163,6 +205,10 @@ static function iterateOverNode($node) {
$output = "- ";
break;

case "img":
$output = $node->getAttribute("alt");
break;

default:
// print out contents of unknown tags
$output = "";
Expand All @@ -176,7 +222,7 @@ static function iterateOverNode($node) {
for ($i = 0; $i < $node->childNodes->length; $i++) {
$n = $node->childNodes->item($i);

$text = static::iterateOverNode($n);
$text = static::iterateOverNode($n,$options);

$output .= $text;
}
Expand Down Expand Up @@ -226,6 +272,8 @@ static function iterateOverNode($node) {
if ($href == $output || $href == "mailto:$output" || $href == "http://$output" || $href == "https://$output") {
// link to the same address: just use link
$output;
} elseif (in_array(static::OPT_FOOTER_URLS,$options)) {
$output = $output."[".static::_indexUrl($href,$output)."]";
} else {
// replace it
$output = "[$output]($href)";
Expand All @@ -251,4 +299,20 @@ static function iterateOverNode($node) {
return $output;
}

/**
* Accepts a URL (and optionally the link text) and returns a unique index number for that URL.
* @param string $url The URL you want an index number for.
* @param string $text The text of the link (associated with the above URL).
* @return integer The index number that will refer to the URL passed.
*/
static function _indexUrl($url,$text=null) {

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

How about private static function indexUrl? IMHO, underscore in method name is an unhelpful convention in PHP.

if (!isset(static::$_indexedUrls[$url])) {
static::$_indexedUrls[$url] = array(
'index'=>count(static::$_indexedUrls),

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This way, indexes start with 0. I guess they should start with 1.

'text'=>$text,
);
}
return static::$_indexedUrls[$url]['index'];
}

}