diff --git a/src/Configuration.php b/src/Configuration.php
index 6d1f03f..446c155 100644
--- a/src/Configuration.php
+++ b/src/Configuration.php
@@ -13,80 +13,23 @@ class Configuration
{
use LoggerAwareTrait;
- /**
- * @var int
- */
- protected $maxTopCandidates = 5;
-
- /**
- * @var int
- */
- protected $charThreshold = 500;
-
- /**
- * @var bool
- */
- protected $articleByLine = false;
-
- /**
- * @var bool
- */
- protected $stripUnlikelyCandidates = true;
-
- /**
- * @var bool
- */
- protected $cleanConditionally = true;
-
- /**
- * @var bool
- */
- protected $weightClasses = true;
-
- /**
- * @var bool
- */
- protected $fixRelativeURLs = false;
-
- /**
- * @var bool
- */
- protected $substituteEntities = false;
-
- /**
- * @var bool
- */
- protected $normalizeEntities = false;
-
- /**
- * @var bool
- */
- protected $summonCthulhu = false;
-
- /**
- * @var string
- */
- protected $originalURL = 'http://fakehost';
-
- /**
- * @var string
- */
- protected $parser = 'html5';
-
- /**
- * @var bool
- */
- protected $keepClasses = false;
-
- /**
- * @var bool
- */
- protected $disableJSONLD = false;
+ protected int $maxTopCandidates = 5;
+ protected int $charThreshold = 500;
+ protected bool $articleByLine = false;
+ protected bool $stripUnlikelyCandidates = true;
+ protected bool $cleanConditionally = true;
+ protected bool $weightClasses = true;
+ protected bool $fixRelativeURLs = false;
+ protected bool $substituteEntities = false;
+ protected bool $normalizeEntities = false;
+ protected bool $summonCthulhu = false;
+ protected string $originalURL = 'http://fakehost';
+ protected string $parser = 'html5';
+ protected bool $keepClasses = false;
+ protected bool $disableJSONLD = false;
/**
* Configuration constructor.
- *
- * @param array $params
*/
public function __construct(array $params = [])
{
@@ -100,10 +43,8 @@ public function __construct(array $params = [])
/**
* Returns an array-representation of configuration.
- *
- * @return array
*/
- public function toArray()
+ public function toArray(): array
{
$out = [];
foreach ($this as $key => $value) {
@@ -117,9 +58,9 @@ public function toArray()
}
/**
- * @return LoggerInterface
+ * Get logger.
*/
- public function getLogger()
+ public function getLogger(): LoggerInterface
{
// If no logger has been set, just return a null logger
if ($this->logger === null) {
@@ -130,11 +71,9 @@ public function getLogger()
}
/**
- * @param LoggerInterface $logger
- *
- * @return Configuration
+ * Set logger.
*/
- public function setLogger(LoggerInterface $logger)
+ public function setLogger(LoggerInterface $logger): Configuration
{
$this->logger = $logger;
@@ -142,19 +81,17 @@ public function setLogger(LoggerInterface $logger)
}
/**
- * @return int
+ * Get max top candidates.
*/
- public function getMaxTopCandidates()
+ public function getMaxTopCandidates(): int
{
return $this->maxTopCandidates;
}
/**
- * @param int $maxTopCandidates
- *
- * @return $this
+ * Set max top candidates.
*/
- public function setMaxTopCandidates($maxTopCandidates)
+ public function setMaxTopCandidates(int $maxTopCandidates): Configuration
{
$this->maxTopCandidates = $maxTopCandidates;
@@ -162,19 +99,17 @@ public function setMaxTopCandidates($maxTopCandidates)
}
/**
- * @return int
+ * Get char threshold.
*/
- public function getCharThreshold()
+ public function getCharThreshold(): int
{
return $this->charThreshold;
}
/**
- * @param int $charThreshold
- *
- * @return $this
+ * Set char threshold.
*/
- public function setCharThreshold($charThreshold)
+ public function setCharThreshold(int $charThreshold): Configuration
{
$this->charThreshold = $charThreshold;
@@ -182,19 +117,17 @@ public function setCharThreshold($charThreshold)
}
/**
- * @return bool
+ * Get article by line.
*/
- public function getArticleByLine()
+ public function getArticleByLine(): bool
{
return $this->articleByLine;
}
/**
- * @param bool $articleByLine
- *
- * @return $this
+ * Set article by line.
*/
- public function setArticleByLine($articleByLine)
+ public function setArticleByLine(bool $articleByLine): Configuration
{
$this->articleByLine = $articleByLine;
@@ -202,19 +135,17 @@ public function setArticleByLine($articleByLine)
}
/**
- * @return bool
+ * Get strip unlikely candidates.
*/
- public function getStripUnlikelyCandidates()
+ public function getStripUnlikelyCandidates(): bool
{
return $this->stripUnlikelyCandidates;
}
/**
* @param bool $stripUnlikelyCandidates
- *
- * @return $this
*/
- public function setStripUnlikelyCandidates($stripUnlikelyCandidates)
+ public function setStripUnlikelyCandidates(bool $stripUnlikelyCandidates): Configuration
{
$this->stripUnlikelyCandidates = $stripUnlikelyCandidates;
@@ -222,19 +153,17 @@ public function setStripUnlikelyCandidates($stripUnlikelyCandidates)
}
/**
- * @return bool
+ * Get clean conditionally.
*/
- public function getCleanConditionally()
+ public function getCleanConditionally(): bool
{
return $this->cleanConditionally;
}
/**
- * @param bool $cleanConditionally
- *
- * @return $this
+ * Set clean conditionally.
*/
- public function setCleanConditionally($cleanConditionally)
+ public function setCleanConditionally(bool $cleanConditionally): Configuration
{
$this->cleanConditionally = $cleanConditionally;
@@ -242,19 +171,17 @@ public function setCleanConditionally($cleanConditionally)
}
/**
- * @return bool
+ * Get weight classes.
*/
- public function getWeightClasses()
+ public function getWeightClasses(): bool
{
return $this->weightClasses;
}
/**
- * @param bool $weightClasses
- *
- * @return $this
+ * Set weight classes.
*/
- public function setWeightClasses($weightClasses)
+ public function setWeightClasses(bool $weightClasses): Configuration
{
$this->weightClasses = $weightClasses;
@@ -262,19 +189,17 @@ public function setWeightClasses($weightClasses)
}
/**
- * @return bool
+ * Get fix relative URLs.
*/
- public function getFixRelativeURLs()
+ public function getFixRelativeURLs(): bool
{
return $this->fixRelativeURLs;
}
/**
- * @param bool $fixRelativeURLs
- *
- * @return $this
+ * Set fix relative URLs.
*/
- public function setFixRelativeURLs($fixRelativeURLs)
+ public function setFixRelativeURLs(bool $fixRelativeURLs): Configuration
{
$this->fixRelativeURLs = $fixRelativeURLs;
@@ -282,19 +207,17 @@ public function setFixRelativeURLs($fixRelativeURLs)
}
/**
- * @return bool
+ * Get substitute entities.
*/
- public function getSubstituteEntities()
+ public function getSubstituteEntities(): bool
{
return $this->substituteEntities;
}
/**
- * @param bool $substituteEntities
- *
- * @return $this
+ * Set substitute entities.
*/
- public function setSubstituteEntities($substituteEntities)
+ public function setSubstituteEntities(bool $substituteEntities): Configuration
{
$this->substituteEntities = $substituteEntities;
@@ -302,19 +225,17 @@ public function setSubstituteEntities($substituteEntities)
}
/**
- * @return bool
+ * Get normalize entities.
*/
- public function getNormalizeEntities()
+ public function getNormalizeEntities(): bool
{
return $this->normalizeEntities;
}
/**
- * @param bool $normalizeEntities
- *
- * @return $this
+ * Set normalize entities.
*/
- public function setNormalizeEntities($normalizeEntities)
+ public function setNormalizeEntities(bool $normalizeEntities): Configuration
{
$this->normalizeEntities = $normalizeEntities;
@@ -322,19 +243,17 @@ public function setNormalizeEntities($normalizeEntities)
}
/**
- * @return string
+ * Get original URL.
*/
- public function getOriginalURL()
+ public function getOriginalURL(): string
{
return $this->originalURL;
}
/**
- * @param string $originalURL
- *
- * @return $this
+ * Set original URL.
*/
- public function setOriginalURL($originalURL)
+ public function setOriginalURL(string $originalURL): Configuration
{
$this->originalURL = $originalURL;
@@ -342,19 +261,17 @@ public function setOriginalURL($originalURL)
}
/**
- * @return string
+ * Get parser.
*/
- public function getParser()
+ public function getParser(): string
{
return $this->parser;
}
/**
- * @param string $parser
- *
- * @return $this
+ * Set parser.
*/
- public function setParser($parser)
+ public function setParser(string $parser): Configuration
{
$this->parser = $parser;
@@ -362,19 +279,17 @@ public function setParser($parser)
}
/**
- * @return bool
+ * Get keep classes.
*/
- public function getKeepClasses()
+ public function getKeepClasses(): bool
{
return $this->keepClasses;
}
/**
- * @param bool $keepClasses
- *
- * @return $this
+ * Set keep classes.
*/
- public function setKeepClasses($keepClasses)
+ public function setKeepClasses(bool $keepClasses): Configuration
{
$this->keepClasses = $keepClasses;
@@ -382,19 +297,17 @@ public function setKeepClasses($keepClasses)
}
/**
- * @return bool
+ * Get disable JSON-LD.
*/
- public function getDisableJSONLD()
+ public function getDisableJSONLD(): bool
{
return $this->disableJSONLD;
}
/**
- * @param bool $disableJSONLD
- *
- * @return $this
+ * Set disable JSON-LD.
*/
- public function setDisableJSONLD($disableJSONLD)
+ public function setDisableJSONLD(bool $disableJSONLD): Configuration
{
$this->disableJSONLD = $disableJSONLD;
@@ -402,19 +315,17 @@ public function setDisableJSONLD($disableJSONLD)
}
/**
- * @return bool
+ * Get summon Cthulhu.
*/
- public function getSummonCthulhu()
+ public function getSummonCthulhu(): bool
{
return $this->summonCthulhu;
}
/**
- * @param bool $summonCthulhu
- *
- * @return $this
+ * Set summon Cthulhu.
*/
- public function setSummonCthulhu($summonCthulhu)
+ public function setSummonCthulhu(bool $summonCthulhu): Configuration
{
$this->summonCthulhu = $summonCthulhu;
diff --git a/src/Nodes/DOM/DOMElement.php b/src/Nodes/DOM/DOMElement.php
index b0da84f..0493c41 100644
--- a/src/Nodes/DOM/DOMElement.php
+++ b/src/Nodes/DOM/DOMElement.php
@@ -10,12 +10,10 @@ class DOMElement extends \DOMElement
/**
* Returns the child elements of this element.
- *
- * To get all child nodes, including non-element nodes like text and comment nodes, use childNodes.
*
- * @return DOMNodeList
+ * To get all child nodes, including non-element nodes like text and comment nodes, use childNodes.
*/
- public function children()
+ public function children(): DOMNodeList
{
$newList = new DOMNodeList();
foreach ($this->childNodes as $node) {
@@ -29,18 +27,10 @@ public function children()
/**
* Returns the Element immediately prior to the specified one in its parent's children list, or null if the specified element is the first one in the list.
*
- * @see https://wiki.php.net/rfc/dom_living_standard_api
- * @return DOMElement|null
+ * @deprecated Use the native previousElementSibling property instead (available since PHP 8.0).
*/
- public function previousElementSibling()
+ public function previousElementSibling(): ?DOMElement
{
- $previous = $this->previousSibling;
- while ($previous) {
- if ($previous->nodeType === XML_ELEMENT_NODE) {
- return $previous;
- }
- $previous = $previous->previousSibling;
- }
- return null;
+ return $this->previousElementSibling;
}
}
diff --git a/src/Nodes/DOM/DOMNodeList.php b/src/Nodes/DOM/DOMNodeList.php
index a718c00..8a85d3b 100644
--- a/src/Nodes/DOM/DOMNodeList.php
+++ b/src/Nodes/DOM/DOMNodeList.php
@@ -40,11 +40,9 @@ public function __get($name)
}
/**
- * @param DOMNode|DOMElement|DOMComment $node
- *
- * @return DOMNodeList
+ * Add node to the list.
*/
- public function add($node)
+ public function add(DOMNode|DOMElement|DOMComment $node): DOMNodeList
{
$this->items[] = $node;
$this->length++;
@@ -53,17 +51,15 @@ public function add($node)
}
/**
- * @param int $offset
- *
- * @return DOMNode|DOMElement|DOMComment
+ * Get the node at the given offset.
*/
- public function item(int $offset)
+ public function item(int $offset): DOMNode|DOMElement|DOMComment
{
return $this->items[$offset];
}
/**
- * @return int|void
+ * Number of items.
*/
public function count(): int
{
diff --git a/src/Nodes/NodeTrait.php b/src/Nodes/NodeTrait.php
index 5598877..264bec5 100644
--- a/src/Nodes/NodeTrait.php
+++ b/src/Nodes/NodeTrait.php
@@ -70,18 +70,16 @@ trait NodeTrait
/**
* initialized getter.
- *
- * @return bool
*/
- public function isInitialized()
+ public function isInitialized(): bool
{
return $this->initialized;
}
/**
- * @return bool
+ * Check if this is a data table.
*/
- public function isReadabilityDataTable()
+ public function isReadabilityDataTable(): bool
{
/*
* This is a workaround that I'd like to remove in the future.
@@ -99,11 +97,10 @@ public function isReadabilityDataTable()
}
/**
- * @param bool $param
+ * Set data table flag.
*/
- public function setReadabilityDataTable($param)
+ public function setReadabilityDataTable(bool $param): void
{
- // Can't be "true" because DOMDocument casts it to "1"
$this->setAttribute('readabilityDataTable', $param ? '1' : '0');
// $this->readabilityDataTable = $param;
}
@@ -113,11 +110,9 @@ public function setReadabilityDataTable($param)
*
* @ TODO: I don't like the weightClasses param. How can we get the config here?
*
- * @param $weightClasses bool Weight classes?
- *
- * @return static
+ * @param bool $weightClasses Weight classes?
*/
- public function initializeNode($weightClasses)
+ public function initializeNode(bool $weightClasses): static
{
if (!$this->isInitialized()) {
$contentScore = 0;
@@ -166,12 +161,8 @@ public function initializeNode($weightClasses)
/**
* Override for native getAttribute method. Some nodes have the getAttribute method, some don't, so we need
* to check first the existence of the attributes property.
- *
- * @param $attributeName string Attribute to retrieve
- *
- * @return string
*/
- public function getAttribute($attributeName): string
+ public function getAttribute(string $attributeName): string
{
if (!is_null($this->attributes)) {
return parent::getAttribute($attributeName);
@@ -183,13 +174,9 @@ public function getAttribute($attributeName): string
/**
* Override for native hasAttribute.
*
- * @param $attributeName
- *
- * @return bool
- *
* @see getAttribute
*/
- public function hasAttribute($attributeName): bool
+ public function hasAttribute(string $attributeName): bool
{
if (!is_null($this->attributes)) {
return parent::hasAttribute($attributeName);
@@ -202,10 +189,8 @@ public function hasAttribute($attributeName): bool
* Get the ancestors of the current node.
*
* @param int|bool $maxLevel Max amount of ancestors to get. False for all of them
- *
- * @return array
*/
- public function getNodeAncestors($maxLevel = 3): array
+ public function getNodeAncestors(int|bool $maxLevel = 3): array
{
$ancestors = [];
$level = 0;
@@ -226,8 +211,6 @@ public function getNodeAncestors($maxLevel = 3): array
/**
* Returns all links from the current element.
- *
- * @return array
*/
public function getAllLinks(): array
{
@@ -237,10 +220,8 @@ public function getAllLinks(): array
/**
* Get the density of links as a percentage of the content
* This is the amount of text that is inside a link divided by the total text in the node.
- *
- * @return int
*/
- public function getLinkDensity()
+ public function getLinkDensity(): int
{
$textLength = mb_strlen($this->getTextContent(true));
if ($textLength === 0) {
@@ -265,10 +246,8 @@ public function getLinkDensity()
/**
* Calculates the weight of the class/id of the current element.
- *
- * @return int
*/
- public function getClassWeight()
+ public function getClassWeight(): int
{
$weight = 0;
@@ -303,10 +282,8 @@ public function getClassWeight()
* Returns the full text of the node.
*
* @param bool $normalize Normalize white space?
- *
- * @return string
*/
- public function getTextContent($normalize = true)
+ public function getTextContent(bool $normalize = true): string
{
$nodeValue = trim($this->textContent);
if ($normalize) {
@@ -318,10 +295,8 @@ public function getTextContent($normalize = true)
/**
* Return an array indicating how many rows and columns this table has.
- *
- * @return array
*/
- public function getRowAndColumnCount()
+ public function getRowAndColumnCount(): array
{
$rows = $columns = 0;
$trs = $this->getElementsByTagName('tr');
@@ -346,13 +321,8 @@ public function getRowAndColumnCount()
/**
* Creates a new node based on the text content of the original node.
- *
- * @param $originalNode DOMNode
- * @param $tagName string
- *
- * @return DOMElement
*/
- public function createNode($originalNode, $tagName)
+ public function createNode(DOMNode $originalNode, string $tagName): DOMElement
{
$text = $originalNode->getTextContent(false);
$newNode = $originalNode->ownerDocument->createElement($tagName, $text);
@@ -363,14 +333,8 @@ public function createNode($originalNode, $tagName)
/**
* Check if a given node has one of its ancestor tag name matching the
* provided one.
- *
- * @param string $tagName
- * @param int $maxDepth
- * @param callable $filterFn
- *
- * @return bool
*/
- public function hasAncestorTag($tagName, $maxDepth = 3, ?callable $filterFn = null)
+ public function hasAncestorTag(string $tagName, int $maxDepth = 3, ?callable $filterFn = null): bool
{
$depth = 0;
$node = $this;
@@ -394,12 +358,8 @@ public function hasAncestorTag($tagName, $maxDepth = 3, ?callable $filterFn = nu
/**
* Check if this node has only whitespace and a single element with given tag
* or if it contains no element with given tag or more than 1 element.
- *
- * @param $tag string Name of tag
- *
- * @return bool
*/
- public function hasSingleTagInsideElement($tag)
+ public function hasSingleTagInsideElement(string $tag): bool
{
// There should be exactly 1 element child with given tag
if (count($children = NodeUtility::filterTextNodes($this->childNodes)) !== 1 || $children->item(0)->nodeName !== $tag) {
@@ -420,10 +380,8 @@ public function hasSingleTagInsideElement($tag)
/**
* Check if the current element has a single child block element.
* Block elements are the ones defined in the divToPElements array.
- *
- * @return bool
*/
- public function hasSingleChildBlockElement()
+ public function hasSingleChildBlockElement(): bool
{
$result = false;
if ($this->hasChildNodes()) {
@@ -443,10 +401,8 @@ public function hasSingleChildBlockElement()
/**
* Determines if a node has no content or it is just a bunch of dividing lines and/or whitespace.
- *
- * @return bool
*/
- public function isElementWithoutContent()
+ public function isElementWithoutContent(): bool
{
return $this instanceof DOMElement &&
mb_strlen(preg_replace(NodeUtility::$regexps['onlyWhitespace'], '', $this->textContent)) === 0 &&
@@ -469,11 +425,9 @@ public function isElementWithoutContent()
/**
* Determine if a node qualifies as phrasing content.
- * https://developer.mozilla.org/en-US/docs/Web/Guide/HTML/Content_categories#Phrasing_content.
- *
- * @return bool
+ * https://developer.mozilla.org/en-US/docs/Web/Guide/HTML/Content_categories#Phrasing_content
*/
- public function isPhrasingContent()
+ public function isPhrasingContent(): bool
{
return $this->nodeType === XML_TEXT_NODE || in_array($this->nodeName, $this->phrasing_elems) !== false ||
(!is_null($this->childNodes) &&
@@ -487,10 +441,8 @@ public function isPhrasingContent()
/**
* In the original JS project they check if the node has the style display=none, which unfortunately
* in our case we have no way of knowing that. So we just check for the attribute hidden or "display: none".
- *
- * @return bool
*/
- public function isProbablyVisible()
+ public function isProbablyVisible(): bool
{
return !preg_match('/display:( )?none/i', $this->getAttribute('style')) &&
!$this->hasAttribute('hidden') &&
@@ -499,9 +451,9 @@ public function isProbablyVisible()
}
/**
- * @return bool
+ * Check if node is whitespace.
*/
- public function isWhitespace()
+ public function isWhitespace(): bool
{
return ($this->nodeType === XML_TEXT_NODE && mb_strlen(trim($this->textContent)) === 0) ||
($this->nodeType === XML_ELEMENT_NODE && $this->nodeName === 'br');
@@ -524,10 +476,8 @@ public function isWhitespace()
* used only when the results of the search are going to be used to remove the nodes.
*
* @param string $tag
- *
- * @return \Generator
*/
- public function shiftingAwareGetElementsByTagName($tag)
+ public function shiftingAwareGetElementsByTagName(string $tag): \Generator
{
$nodes = $this->getElementsByTagName($tag);
$count = $nodes->length;
@@ -549,10 +499,8 @@ public function shiftingAwareGetElementsByTagName($tag)
/**
* Mimics JS's firstElementChild property. PHP only has firstChild which could be any type of DOMNode. Use this
* function to get the first one that is an DOMElement node.
- *
- * @return DOMElement|null
*/
- public function getFirstElementChild()
+ public function getFirstElementChild(): ?DOMElement
{
if ($this->childNodes instanceof \Traversable) {
foreach ($this->childNodes as $node) {
diff --git a/src/Nodes/NodeUtility.php b/src/Nodes/NodeUtility.php
index 24775a4..3df4c2f 100644
--- a/src/Nodes/NodeUtility.php
+++ b/src/Nodes/NodeUtility.php
@@ -40,21 +40,17 @@ class NodeUtility
'b64DataUrl' => '/^data:\s*([^\s;,]+)\s*;\s*base64\s*,/i',
// See: https://schema.org/Article
'jsonLdArticleTypes' => '/^Article|AdvertiserContentArticle|NewsArticle|AnalysisNewsArticle|AskPublicNewsArticle|BackgroundNewsArticle|OpinionNewsArticle|ReportageNewsArticle|ReviewNewsArticle|Report|SatiricalArticle|ScholarlyArticle|MedicalScholarlyArticle|SocialMediaPosting|BlogPosting|LiveBlogPosting|DiscussionForumPosting|TechArticle|APIReference$/'
-
+
];
/**
* Finds the next node, starting from the given node, and ignoring
* whitespace in between. If the given node is an element, the same node is
* returned.
- *
- * Imported from the Element class on league\html-to-markdown.
*
- * @param $node
- *
- * @return DOMNode
+ * Imported from the Element class on league\html-to-markdown.
*/
- public static function nextNode($node)
+ public static function nextNode(DOMNode $node): DOMNode
{
$next = $node;
while ($next
@@ -69,14 +65,8 @@ public static function nextNode($node)
/**
* Changes the node tag name. Since tagName on DOMElement is a read only value, this must be done creating a new
* element with the new tag name and importing it to the main DOMDocument.
- *
- * @param DOMNode|DOMElement $node
- * @param string $value
- * @param bool $importAttributes
- *
- * @return DOMNode
*/
- public static function setNodeTag($node, $value, $importAttributes = true)
+ public static function setNodeTag(DOMNode|DOMElement $node, string $value, bool $importAttributes = true): DOMNode
{
$new = new DOMDocument('1.0', 'utf-8');
$new->appendChild($new->createElement($value));
@@ -104,12 +94,8 @@ public static function setNodeTag($node, $value, $importAttributes = true)
/**
* Removes the current node and returns the next node to be parsed (child, sibling or parent).
- *
- * @param DOMNode|DOMElement $node
- *
- * @return DOMNode
*/
- public static function removeAndGetNext($node)
+ public static function removeAndGetNext(DOMNode|DOMElement $node): DOMNode
{
$nextNode = self::getNextNode($node, true);
$node->parentNode->removeChild($node);
@@ -119,12 +105,8 @@ public static function removeAndGetNext($node)
/**
* Remove the selected node.
- *
- * @param $node DOMElement
- *
- * @return void
- **/
- public static function removeNode($node)
+ */
+ public static function removeNode(DOMElement $node): void
{
$parent = $node->parentNode;
if ($parent) {
@@ -135,13 +117,8 @@ public static function removeNode($node)
/**
* Returns the next node. First checks for children (if the flag allows it), then for siblings, and finally
* for parents.
- *
- * @param DOMNode|DOMElement|DOMDocument $originalNode
- * @param bool $ignoreSelfAndKids
- *
- * @return DOMNode
*/
- public static function getNextNode($originalNode, $ignoreSelfAndKids = false)
+ public static function getNextNode(DOMNode|DOMElement|DOMDocument $originalNode, bool $ignoreSelfAndKids = false): DOMNode
{
/*
* Traverse the DOM from node to node, starting at the node passed in.
@@ -173,12 +150,8 @@ public static function getNextNode($originalNode, $ignoreSelfAndKids = false)
/**
* Remove all empty DOMNodes from DOMNodeLists.
- *
- * @param \DOMNodeList $list
- *
- * @return DOMNodeList
*/
- public static function filterTextNodes(\DOMNodeList $list)
+ public static function filterTextNodes(\DOMNodeList $list): DOMNodeList
{
$newList = new DOMNodeList();
foreach ($list as $node) {
diff --git a/src/Readability.php b/src/Readability.php
index 9b1aa0d..54e8fc8 100644
--- a/src/Readability.php
+++ b/src/Readability.php
@@ -18,100 +18,74 @@ class Readability
{
/**
* Main DOMDocument where all the magic happens.
- *
- * @var DOMDocument
*/
- protected $dom;
+ protected DOMDocument $dom;
/**
* Title of the article.
- *
- * @var string|null
*/
- protected $title = null;
+ protected ?string $title = null;
/**
* Final DOMDocument with the fully parsed HTML.
- *
- * @var DOMDocument|null
*/
- protected $content = null;
+ protected ?DOMDocument $content = null;
/**
* Excerpt of the article.
- *
- * @var string|null
*/
- protected $excerpt = null;
+ protected ?string $excerpt = null;
/**
* Main image of the article.
- *
- * @var string|null
*/
- protected $image = null;
+ protected ?string $image = null;
/**
* Author of the article. Extracted from the byline tags and other social media properties.
- *
- * @var string|null
*/
- protected $author = null;
+ protected ?string $author = null;
/**
* Website name.
- *
- * @var string|null
*/
- protected $siteName = null;
+ protected ?string $siteName = null;
/**
* Direction of the text.
- *
- * @var string|null
*/
- protected $direction = null;
+ protected ?string $direction = null;
/**
* Base URI
* HTML5PHP doesn't appear to store it in the baseURI property like PHP's DOMDocument does when parsing with libxml
- *
- * @var string|null
*/
- protected $baseURI = null;
+ protected ?string $baseURI = null;
/**
* Configuration object.
- *
- * @var Configuration
*/
- private $configuration;
+ private Configuration $configuration;
/**
* Logger object.
- *
- * @var LoggerInterface
*/
- private $logger;
+ private LoggerInterface $logger;
/**
* JSON-LD
- *
- * @var array
*/
- private $jsonld = [];
+ private array $jsonld = [];
/**
* Collection of attempted text extractions.
- *
- * @var array
*/
- private $attempts = [];
+ private array $attempts = [];
/**
- * @var array
+ * Default tags to score.
*/
- private $defaultTagsToScore = [
+ private array $defaultTagsToScore = [
'section',
'h2',
'h3',
@@ -124,14 +98,14 @@ class Readability
];
/**
- * @var array
+ * Unlikely roles.
*/
- private $unlikelyRoles = ['menu', 'menubar', 'complementary', 'navigation', 'alert', 'alertdialog', 'dialog'];
+ private array $unlikelyRoles = ['menu', 'menubar', 'complementary', 'navigation', 'alert', 'alertdialog', 'dialog'];
/**
- * @var array
+ * Alter to DIV exceptions.
*/
- private $alterToDIVExceptions = [
+ private array $alterToDIVExceptions = [
'div',
'article',
'section',
@@ -139,9 +113,9 @@ class Readability
];
/**
- * @var array
+ * Map of HTML entity names to the characters they represent.
*/
- private $htmlEscapeMap = [
+ private array $htmlEscapeMap = [
'lt' => '<',
'gt' => '>',
'amp' => '&',
@@ -151,8 +125,6 @@ class Readability
/**
* Readability constructor.
- *
- * @param Configuration $configuration
*/
public function __construct(Configuration $configuration)
{
@@ -163,13 +135,9 @@ public function __construct(Configuration $configuration)
/**
* Main parse function.
*
- * @param $html|null
- *
* @throws ParseException
- *
- * @return bool
*/
- public function parse(?string $html = null)
+ public function parse(?string $html = null): bool
{
$this->logger->info('*** Starting parse process...');
@@ -296,8 +264,6 @@ public function parse(?string $html = null)
* Previous versions of Readability used this method one time and cloned the DOM to keep a backup. This caused bugs
* because cloning the DOM object keeps a relation between the clone and the original one, doing changes in both
* objects and ruining the backup.
- *
- * @param string $html
*/
public function loadHTML(string $html): void
{
@@ -357,7 +323,6 @@ public function loadHTML(string $html): void
* Try to extract metadata from JSON-LD object.
* For now, only Schema.org objects of type Article or its subtypes are supported.
*
- * @param DOMDocument $dom
* @return array with any metadata that could be extracted (possibly none)
*/
private function getJSONLD(DOMDocument $dom): array
@@ -436,7 +401,7 @@ private function getJSONLD(DOMDocument $dom): array
/**
* Tries to guess relevant info from metadata of the html. Sets the results in the Readability properties.
*/
- private function getMetadata()
+ private function getMetadata(): void
{
$this->logger->debug('[Metadata] Retrieving metadata...');
@@ -597,7 +562,7 @@ public function getImages(): array
* Tries to get the main article image. Will only update the metadata if the getMetadata function couldn't
* find a correct image.
*/
- public function getMainImage()
+ public function getMainImage(): void
{
$imgUrl = false;
@@ -630,15 +595,11 @@ public function getMainImage()
/**
* Remove unnecessary nested elements
- *
- * @param DOMDocument $article
- *
- * @return void
*/
- private function simplifyNestedElements(DOMDocument $article)
+ private function simplifyNestedElements(DOMDocument $article): void
{
$node = $article;
-
+
while ($node) {
if ($node->parentNode && in_array($node->nodeName, ['div', 'section']) && !($node->hasAttribute('id') && strpos($node->getAttribute('id'), 'readability') === 0)) {
/** @var DOMElement $node */
@@ -655,17 +616,15 @@ private function simplifyNestedElements(DOMDocument $article)
continue;
}
}
-
+
$node = NodeUtility::getNextNode($node);
}
}
/**
* Returns the title of the html. Prioritizes the title from the metadata against the title tag.
- *
- * @return string|null
*/
- private function getArticleTitle()
+ private function getArticleTitle(): ?string
{
$originalTitle = null;
@@ -766,12 +725,8 @@ private function getArticleTitle()
/**
* Convert URI to an absolute URI.
- *
- * @param $uri string URI to convert
- *
- * @return string
*/
- private function toAbsoluteURI($uri)
+ private function toAbsoluteURI(string $uri): string
{
list($pathBase, $scheme, $prePath) = $this->getPathInfo($this->configuration->getOriginalURL());
@@ -813,11 +768,9 @@ private function toAbsoluteURI($uri)
/**
* Returns full path info of an URL.
*
- * @param string $url
- *
* @return array [$pathBase, $scheme, $prePath]
*/
- public function getPathInfo($url)
+ public function getPathInfo(string $url): array
{
// Check for base URLs
if ($this->baseURI !== null) {
@@ -840,12 +793,8 @@ public function getPathInfo($url)
/**
* Gets nodes from the root element.
- *
- * @param $node DOMNode|DOMText
- *
- * @return array
*/
- private function getNodes($node)
+ private function getNodes(DOMNode|DOMText $node): array
{
$this->logger->info('[Get Nodes] Retrieving nodes...');
@@ -990,7 +939,8 @@ private function getNodes($node)
*
* @return int 1 = same text, 0 = completely different text
*/
- private function textSimilarity(string $textA, string $textB) {
+ private function textSimilarity(string $textA, string $textB): int
+ {
$tokensA = array_filter(preg_split(NodeUtility::$regexps['tokenize'], mb_strtolower($textA)));
$tokensB = array_filter(preg_split(NodeUtility::$regexps['tokenize'], mb_strtolower($textB)));
if (!count($tokensA) || !count($tokensB)) {
@@ -1005,13 +955,8 @@ private function textSimilarity(string $textA, string $textB) {
/**
* Checks if the node is a byline.
- *
- * @param DOMNode $node
- * @param string $matchString
- *
- * @return bool
*/
- private function checkByline($node, $matchString)
+ private function checkByline(DOMNode $node, string $matchString): bool
{
if (!$this->configuration->getArticleByLine()) {
return false;
@@ -1039,12 +984,8 @@ private function checkByline($node, $matchString)
/**
* Checks the validity of a byLine. Based on string length.
- *
- * @param string $text
- *
- * @return bool
*/
- private function isValidByline($text)
+ private function isValidByline(string $text): bool
{
if (gettype($text) == 'string') {
$byline = trim($text);
@@ -1057,11 +998,9 @@ private function isValidByline($text)
/**
* Converts some of the common HTML entities in string to their corresponding characters.
- *
- * @param string $str - a string to unescape.
- * @return string without HTML entity.
*/
- private function unescapeHtmlEntities($str) {
+ private function unescapeHtmlEntities(string $str): string
+ {
if (!$str) {
return $str;
}
@@ -1086,10 +1025,9 @@ private function unescapeHtmlEntities($str) {
/**
* Check if node is image, or if node contains exactly only one image
* whether as a direct child or as its descendants.
- *
- * @param DOMElement $node
*/
- private function isSingleImage(DOMElement $node) {
+ private function isSingleImage(DOMElement $node): bool
+ {
if ($node->tagName === 'img') {
return true;
}
@@ -1106,10 +1044,9 @@ private function isSingleImage(DOMElement $node) {
* element. Replace the first image with the image from inside the