-
Notifications
You must be signed in to change notification settings - Fork 135
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Image ALT tags, conversion options, footer URLs and silent DOM. #24
Changes from all commits
3d37921
e63bd59
403ab68
0ba3018
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -19,6 +19,23 @@ | |
|
||
class Html2Text { | ||
|
||
/** | ||
* Replaces URLs from <a> tags with a reference number (eg: "[1]") and moves the URL itself to | ||
* the end of the document. Makes the resulting text much easier to read if your HTML contains | ||
* many long URLs. The downside being the user has to scroll to the bottom of the document in | ||
* order to find (and click on) the URL. It's a trade off and a decision you can make per | ||
* document. | ||
*/ | ||
const OPT_FOOTER_URLS = 1; | ||
|
||
/** | ||
* If you use the OPT_FOOTER_URLS option, this variable will keep track of which indexes point | ||
* to which URLs, so they can be inserted at the end of the converted text. | ||
* @var array Associative array, where the key is a URL, and the value is an associative array | ||
* of properties (currently "index" and "text"). | ||
*/ | ||
static $_indexedUrls = array(); | ||
|
||
/** | ||
* Tries to convert the given HTML into a plain text format - best suited for | ||
* e-mail display, etc. | ||
|
@@ -30,28 +47,53 @@ class Html2Text { | |
* </ul> | ||
* | ||
* @param string html the input HTML | ||
* @param array An array of options of the Html2Text::OPT_* variety | ||
* @return string the HTML converted, as best as possible, to text | ||
* @throws Html2TextException if the HTML could not be loaded as a {@link DOMDocument} | ||
*/ | ||
static function convert($html) { | ||
static function convert($html,$options=array()) { | ||
|
||
// reset | ||
Html2Text::$_indexedUrls = array(); | ||
|
||
// DOMDocument doesn't support empty value and throws an error | ||
if (!$html) { | ||
return ''; | ||
} | ||
|
||
// replace with spaces | ||
$html = str_replace(" ", " ", $html); | ||
|
||
$html = static::fixNewlines($html); | ||
|
||
$doc = new \DOMDocument(); | ||
if (!$doc->loadHTML($html)) { | ||
$doc->strictErrorChecking = FALSE; | ||
$doc->recover = TRUE; | ||
$doc->xmlStandalone = true; | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. It would be nicer to add a comment to each of the above 3 lines, explaining why it is used. |
||
$prevValue = libxml_use_internal_errors(true); //prevent $doc to trhow any warnings | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Typo in the comment: "trhow" should be "throw". |
||
$loaded = $doc->loadHTML($html,LIBXML_NOWARNING | LIBXML_NOERROR | LIBXML_NONET); | ||
libxml_use_internal_errors($prevValue); //restore original setting | ||
|
||
if (!$loaded) { | ||
throw new Html2TextException("Could not load HTML - badly formed?", $html); | ||
} | ||
|
||
$output = static::iterateOverNode($doc); | ||
$output = static::iterateOverNode($doc,$options); | ||
|
||
// remove leading and trailing spaces on each line | ||
$output = preg_replace("/[ \t]*\n[ \t]*/im", "\n", $output); | ||
|
||
// remove leading and trailing whitespace | ||
$output = trim($output); | ||
|
||
// if they want URLs at the end of the document instead of inline, append them here | ||
if (in_array(static::OPT_FOOTER_URLS,$options) && Html2Text::$_indexedUrls) { | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. As per the project's convention, there should be one space after commas. |
||
$output .= "\n\n------\n\n"; | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more.
|
||
foreach (Html2Text::$_indexedUrls as $url=>$info) { | ||
$output .= "[".$info['index']."] ".($info['text']?$info['text']." ":"").$url."\n"; | ||
} | ||
} | ||
|
||
return $output; | ||
} | ||
|
||
|
@@ -106,7 +148,7 @@ static function prevChildName($node) { | |
return $nextName; | ||
} | ||
|
||
static function iterateOverNode($node) { | ||
static function iterateOverNode($node,&$options) { | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Why the `&` (pass by reference) on `$options`? |
||
if ($node instanceof \DOMText) { | ||
// Replace whitespace characters with a space (equivilant to \s) | ||
return preg_replace("/[\\t\\n\\f\\r ]+/im", " ", $node->wholeText); | ||
|
@@ -163,6 +205,10 @@ static function iterateOverNode($node) { | |
$output = "- "; | ||
break; | ||
|
||
case "img": | ||
$output = $node->getAttribute("alt"); | ||
break; | ||
|
||
default: | ||
// print out contents of unknown tags | ||
$output = ""; | ||
|
@@ -176,7 +222,7 @@ static function iterateOverNode($node) { | |
for ($i = 0; $i < $node->childNodes->length; $i++) { | ||
$n = $node->childNodes->item($i); | ||
|
||
$text = static::iterateOverNode($n); | ||
$text = static::iterateOverNode($n,$options); | ||
|
||
$output .= $text; | ||
} | ||
|
@@ -226,6 +272,8 @@ static function iterateOverNode($node) { | |
if ($href == $output || $href == "mailto:$output" || $href == "http://$output" || $href == "https://$output") { | ||
// link to the same address: just use link | ||
$output; | ||
} elseif (in_array(static::OPT_FOOTER_URLS,$options)) { | ||
$output = $output."[".static::_indexUrl($href,$output)."]"; | ||
} else { | ||
// replace it | ||
$output = "[$output]($href)"; | ||
|
@@ -251,4 +299,20 @@ static function iterateOverNode($node) { | |
return $output; | ||
} | ||
|
||
/** | ||
* Accepts a URL (and optionally the link text) and returns a unique index number for that URL. | ||
* @param string $url The URL you want an index number for. | ||
* @param string $text The text of the link (associated with the above URL). | ||
* @return integer The index number that will refer to the URL passed. | ||
*/ | ||
static function _indexUrl($url,$text=null) { | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. How about |
||
if (!isset(static::$_indexedUrls[$url])) { | ||
static::$_indexedUrls[$url] = array( | ||
'index'=>count(static::$_indexedUrls), | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. This way, indexes start at 0. I guess they should start at 1. |
||
'text'=>$text, | ||
); | ||
} | ||
return static::$_indexedUrls[$url]['index']; | ||
} | ||
|
||
} |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Should start with
@param array options An array ...
instead. Variable name is missing.