diff --git a/src/Configuration.php b/src/Configuration.php index 6d1f03f..446c155 100644 --- a/src/Configuration.php +++ b/src/Configuration.php @@ -13,80 +13,23 @@ class Configuration { use LoggerAwareTrait; - /** - * @var int - */ - protected $maxTopCandidates = 5; - - /** - * @var int - */ - protected $charThreshold = 500; - - /** - * @var bool - */ - protected $articleByLine = false; - - /** - * @var bool - */ - protected $stripUnlikelyCandidates = true; - - /** - * @var bool - */ - protected $cleanConditionally = true; - - /** - * @var bool - */ - protected $weightClasses = true; - - /** - * @var bool - */ - protected $fixRelativeURLs = false; - - /** - * @var bool - */ - protected $substituteEntities = false; - - /** - * @var bool - */ - protected $normalizeEntities = false; - - /** - * @var bool - */ - protected $summonCthulhu = false; - - /** - * @var string - */ - protected $originalURL = 'http://fakehost'; - - /** - * @var string - */ - protected $parser = 'html5'; - - /** - * @var bool - */ - protected $keepClasses = false; - - /** - * @var bool - */ - protected $disableJSONLD = false; + protected int $maxTopCandidates = 5; + protected int $charThreshold = 500; + protected bool $articleByLine = false; + protected bool $stripUnlikelyCandidates = true; + protected bool $cleanConditionally = true; + protected bool $weightClasses = true; + protected bool $fixRelativeURLs = false; + protected bool $substituteEntities = false; + protected bool $normalizeEntities = false; + protected bool $summonCthulhu = false; + protected string $originalURL = 'http://fakehost'; + protected string $parser = 'html5'; + protected bool $keepClasses = false; + protected bool $disableJSONLD = false; /** * Configuration constructor. - * - * @param array $params */ public function __construct(array $params = []) { @@ -100,10 +43,8 @@ public function __construct(array $params = []) /** * Returns an array-representation of configuration. - * - * @return array */ - public function toArray() + public function toArray(): array { $out = []; foreach ($this as $key => $value) { @@ -117,9 +58,9 @@ public function toArray() } /** - * @return LoggerInterface + * Get logger. */ - public function getLogger() + public function getLogger(): LoggerInterface { // If no logger has been set, just return a null logger if ($this->logger === null) { @@ -130,11 +71,9 @@ public function getLogger() } /** - * @param LoggerInterface $logger - * - * @return Configuration + * Set logger. */ - public function setLogger(LoggerInterface $logger) + public function setLogger(LoggerInterface $logger): Configuration { $this->logger = $logger; @@ -142,19 +81,17 @@ public function setLogger(LoggerInterface $logger) } /** - * @return int + * Get max top candidates. */ - public function getMaxTopCandidates() + public function getMaxTopCandidates(): int { return $this->maxTopCandidates; } /** - * @param int $maxTopCandidates - * - * @return $this + * Set max top candidates. */ - public function setMaxTopCandidates($maxTopCandidates) + public function setMaxTopCandidates(int $maxTopCandidates): Configuration { $this->maxTopCandidates = $maxTopCandidates; @@ -162,19 +99,17 @@ public function setMaxTopCandidates($maxTopCandidates) } /** - * @return int + * Get char threshold. */ - public function getCharThreshold() + public function getCharThreshold(): int { return $this->charThreshold; } /** - * @param int $charThreshold - * - * @return $this + * Set char threshold. */ - public function setCharThreshold($charThreshold) + public function setCharThreshold(int $charThreshold): Configuration { $this->charThreshold = $charThreshold; @@ -182,19 +117,17 @@ public function setCharThreshold($charThreshold) } /** - * @return bool + * Get article by line. */ - public function getArticleByLine() + public function getArticleByLine(): bool { return $this->articleByLine; } /** - * @param bool $articleByLine - * - * @return $this + * Set article by line. */ - public function setArticleByLine($articleByLine) + public function setArticleByLine(bool $articleByLine): Configuration { $this->articleByLine = $articleByLine; @@ -202,19 +135,17 @@ public function setArticleByLine($articleByLine) } /** - * @return bool + * Get strip unlikely candidates. */ - public function getStripUnlikelyCandidates() + public function getStripUnlikelyCandidates(): bool { return $this->stripUnlikelyCandidates; } /** * @param bool $stripUnlikelyCandidates - * - * @return $this */ - public function setStripUnlikelyCandidates($stripUnlikelyCandidates) + public function setStripUnlikelyCandidates(bool $stripUnlikelyCandidates): Configuration { $this->stripUnlikelyCandidates = $stripUnlikelyCandidates; @@ -222,19 +153,17 @@ public function setStripUnlikelyCandidates($stripUnlikelyCandidates) } /** - * @return bool + * Get clean conditionally. */ - public function getCleanConditionally() + public function getCleanConditionally(): bool { return $this->cleanConditionally; } /** - * @param bool $cleanConditionally - * - * @return $this + * Set clean conditionally. */ - public function setCleanConditionally($cleanConditionally) + public function setCleanConditionally(bool $cleanConditionally): Configuration { $this->cleanConditionally = $cleanConditionally; @@ -242,19 +171,17 @@ public function setCleanConditionally($cleanConditionally) } /** - * @return bool + * Get weight classes. */ - public function getWeightClasses() + public function getWeightClasses(): bool { return $this->weightClasses; } /** - * @param bool $weightClasses - * - * @return $this + * Set weight classes. */ - public function setWeightClasses($weightClasses) + public function setWeightClasses(bool $weightClasses): Configuration { $this->weightClasses = $weightClasses; @@ -262,19 +189,17 @@ public function setWeightClasses($weightClasses) } /** - * @return bool + * Get fix relative URLs. */ - public function getFixRelativeURLs() + public function getFixRelativeURLs(): bool { return $this->fixRelativeURLs; } /** - * @param bool $fixRelativeURLs - * - * @return $this + * Set fix relative URLs. */ - public function setFixRelativeURLs($fixRelativeURLs) + public function setFixRelativeURLs(bool $fixRelativeURLs): Configuration { $this->fixRelativeURLs = $fixRelativeURLs; @@ -282,19 +207,17 @@ public function setFixRelativeURLs($fixRelativeURLs) } /** - * @return bool + * Get substitute entities. */ - public function getSubstituteEntities() + public function getSubstituteEntities(): bool { return $this->substituteEntities; } /** - * @param bool $substituteEntities - * - * @return $this + * Set substitute entities. */ - public function setSubstituteEntities($substituteEntities) + public function setSubstituteEntities(bool $substituteEntities): Configuration { $this->substituteEntities = $substituteEntities; @@ -302,19 +225,17 @@ public function setSubstituteEntities($substituteEntities) } /** - * @return bool + * Get normalize entities. */ - public function getNormalizeEntities() + public function getNormalizeEntities(): bool { return $this->normalizeEntities; } /** - * @param bool $normalizeEntities - * - * @return $this + * Set normalize entities. */ - public function setNormalizeEntities($normalizeEntities) + public function setNormalizeEntities(bool $normalizeEntities): Configuration { $this->normalizeEntities = $normalizeEntities; @@ -322,19 +243,17 @@ public function setNormalizeEntities($normalizeEntities) } /** - * @return string + * Get original URL. */ - public function getOriginalURL() + public function getOriginalURL(): string { return $this->originalURL; } /** - * @param string $originalURL - * - * @return $this + * Set original URL. */ - public function setOriginalURL($originalURL) + public function setOriginalURL(string $originalURL): Configuration { $this->originalURL = $originalURL; @@ -342,19 +261,17 @@ public function setOriginalURL($originalURL) } /** - * @return string + * Get parser. */ - public function getParser() + public function getParser(): string { return $this->parser; } /** - * @param string $parser - * - * @return $this + * Set parser. */ - public function setParser($parser) + public function setParser(string $parser): Configuration { $this->parser = $parser; @@ -362,19 +279,17 @@ public function setParser($parser) } /** - * @return bool + * Get keep classes. */ - public function getKeepClasses() + public function getKeepClasses(): bool { return $this->keepClasses; } /** - * @param bool $keepClasses - * - * @return $this + * Set keep classes. */ - public function setKeepClasses($keepClasses) + public function setKeepClasses(bool $keepClasses): Configuration { $this->keepClasses = $keepClasses; @@ -382,19 +297,17 @@ public function setKeepClasses($keepClasses) } /** - * @return bool + * Get disable JSON-LD. */ - public function getDisableJSONLD() + public function getDisableJSONLD(): bool { return $this->disableJSONLD; } /** - * @param bool $disableJSONLD - * - * @return $this + * Set disable JSON-LD. */ - public function setDisableJSONLD($disableJSONLD) + public function setDisableJSONLD(bool $disableJSONLD): Configuration { $this->disableJSONLD = $disableJSONLD; @@ -402,19 +315,17 @@ public function setDisableJSONLD($disableJSONLD) } /** - * @return bool + * Get summon Cthulhu. */ - public function getSummonCthulhu() + public function getSummonCthulhu(): bool { return $this->summonCthulhu; } /** - * @param bool $summonCthulhu - * - * @return $this + * Set summon Cthulhu. */ - public function setSummonCthulhu($summonCthulhu) + public function setSummonCthulhu(bool $summonCthulhu): Configuration { $this->summonCthulhu = $summonCthulhu; diff --git a/src/Nodes/DOM/DOMElement.php b/src/Nodes/DOM/DOMElement.php index b0da84f..0493c41 100644 --- a/src/Nodes/DOM/DOMElement.php +++ b/src/Nodes/DOM/DOMElement.php @@ -10,12 +10,10 @@ class DOMElement extends \DOMElement /** * Returns the child elements of this element. - * - * To get all child nodes, including non-element nodes like text and comment nodes, use childNodes. * - * @return DOMNodeList + * To get all child nodes, including non-element nodes like text and comment nodes, use childNodes. */ - public function children() + public function children(): DOMNodeList { $newList = new DOMNodeList(); foreach ($this->childNodes as $node) { @@ -29,18 +27,10 @@ public function children() /** * Returns the Element immediately prior to the specified one in its parent's children list, or null if the specified element is the first one in the list. * - * @see https://wiki.php.net/rfc/dom_living_standard_api - * @return DOMElement|null + * @deprecated Use previousElementSibling instead - introduced in PHP 8.0. */ - public function previousElementSibling() + public function previousElementSibling(): ?DOMElement { - $previous = $this->previousSibling; - while ($previous) { - if ($previous->nodeType === XML_ELEMENT_NODE) { - return $previous; - } - $previous = $previous->previousSibling; - } - return null; + return $this->previousElementSibling; } } diff --git a/src/Nodes/DOM/DOMNodeList.php b/src/Nodes/DOM/DOMNodeList.php index a718c00..8a85d3b 100644 --- a/src/Nodes/DOM/DOMNodeList.php +++ b/src/Nodes/DOM/DOMNodeList.php @@ -40,11 +40,9 @@ public function __get($name) } /** - * @param DOMNode|DOMElement|DOMComment $node - * - * @return DOMNodeList + * Add node to the list. */ - public function add($node) + public function add(DOMNode|DOMElement|DOMComment $node): DOMNodeList { $this->items[] = $node; $this->length++; @@ -53,17 +51,15 @@ public function add($node) } /** - * @param int $offset - * - * @return DOMNode|DOMElement|DOMComment + * Get node. */ - public function item(int $offset) + public function item(int $offset): DOMNode|DOMElement|DOMComment { return $this->items[$offset]; } /** - * @return int|void + * Number of items. */ public function count(): int { diff --git a/src/Nodes/NodeTrait.php b/src/Nodes/NodeTrait.php index 5598877..264bec5 100644 --- a/src/Nodes/NodeTrait.php +++ b/src/Nodes/NodeTrait.php @@ -70,18 +70,16 @@ trait NodeTrait /** * initialized getter. - * - * @return bool */ - public function isInitialized() + public function isInitialized(): bool { return $this->initialized; } /** - * @return bool + * Check if this is a data table. */ - public function isReadabilityDataTable() + public function isReadabilityDataTable(): bool { /* * This is a workaround that I'd like to remove in the future. @@ -99,11 +97,10 @@ public function isReadabilityDataTable() } /** - * @param bool $param + * Set data table flag. */ - public function setReadabilityDataTable($param) + public function setReadabilityDataTable(bool $param): void { - // Can't be "true" because DOMDocument casts it to "1" $this->setAttribute('readabilityDataTable', $param ? '1' : '0'); // $this->readabilityDataTable = $param; } @@ -113,11 +110,9 @@ public function setReadabilityDataTable($param) * * @ TODO: I don't like the weightClasses param. How can we get the config here? * - * @param $weightClasses bool Weight classes? - * - * @return static + * @param bool $weightClasses Weight classes? */ - public function initializeNode($weightClasses) + public function initializeNode(bool $weightClasses): static { if (!$this->isInitialized()) { $contentScore = 0; @@ -166,12 +161,8 @@ public function initializeNode($weightClasses) /** * Override for native getAttribute method. Some nodes have the getAttribute method, some don't, so we need * to check first the existence of the attributes property. - * - * @param $attributeName string Attribute to retrieve - * - * @return string */ - public function getAttribute($attributeName): string + public function getAttribute(string $attributeName): string { if (!is_null($this->attributes)) { return parent::getAttribute($attributeName); @@ -183,13 +174,9 @@ public function getAttribute($attributeName): string /** * Override for native hasAttribute. * - * @param $attributeName - * - * @return bool - * * @see getAttribute */ - public function hasAttribute($attributeName): bool + public function hasAttribute(string $attributeName): bool { if (!is_null($this->attributes)) { return parent::hasAttribute($attributeName); @@ -202,10 +189,8 @@ public function hasAttribute($attributeName): bool * Get the ancestors of the current node. * * @param int|bool $maxLevel Max amount of ancestors to get. False for all of them - * - * @return array */ - public function getNodeAncestors($maxLevel = 3): array + public function getNodeAncestors(int|bool $maxLevel = 3): array { $ancestors = []; $level = 0; @@ -226,8 +211,6 @@ public function getNodeAncestors($maxLevel = 3): array /** * Returns all links from the current element. - * - * @return array */ public function getAllLinks(): array { @@ -237,10 +220,8 @@ public function getAllLinks(): array /** * Get the density of links as a percentage of the content * This is the amount of text that is inside a link divided by the total text in the node. - * - * @return int */ - public function getLinkDensity() + public function getLinkDensity(): int { $textLength = mb_strlen($this->getTextContent(true)); if ($textLength === 0) { @@ -265,10 +246,8 @@ public function getLinkDensity() /** * Calculates the weight of the class/id of the current element. - * - * @return int */ - public function getClassWeight() + public function getClassWeight(): int { $weight = 0; @@ -303,10 +282,8 @@ public function getClassWeight() * Returns the full text of the node. * * @param bool $normalize Normalize white space? - * - * @return string */ - public function getTextContent($normalize = true) + public function getTextContent(bool $normalize = true): string { $nodeValue = trim($this->textContent); if ($normalize) { @@ -318,10 +295,8 @@ public function getTextContent($normalize = true) /** * Return an array indicating how many rows and columns this table has. - * - * @return array */ - public function getRowAndColumnCount() + public function getRowAndColumnCount(): array { $rows = $columns = 0; $trs = $this->getElementsByTagName('tr'); @@ -346,13 +321,8 @@ public function getRowAndColumnCount() /** * Creates a new node based on the text content of the original node. - * - * @param $originalNode DOMNode - * @param $tagName string - * - * @return DOMElement */ - public function createNode($originalNode, $tagName) + public function createNode(DOMNode $originalNode, string $tagName): DOMElement { $text = $originalNode->getTextContent(false); $newNode = $originalNode->ownerDocument->createElement($tagName, $text); @@ -363,14 +333,8 @@ public function createNode($originalNode, $tagName) /** * Check if a given node has one of its ancestor tag name matching the * provided one. - * - * @param string $tagName - * @param int $maxDepth - * @param callable $filterFn - * - * @return bool */ - public function hasAncestorTag($tagName, $maxDepth = 3, ?callable $filterFn = null) + public function hasAncestorTag(string $tagName, int $maxDepth = 3, ?callable $filterFn = null): bool { $depth = 0; $node = $this; @@ -394,12 +358,8 @@ public function hasAncestorTag($tagName, $maxDepth = 3, ?callable $filterFn = nu /** * Check if this node has only whitespace and a single element with given tag * or if it contains no element with given tag or more than 1 element. - * - * @param $tag string Name of tag - * - * @return bool */ - public function hasSingleTagInsideElement($tag) + public function hasSingleTagInsideElement(string $tag): bool { // There should be exactly 1 element child with given tag if (count($children = NodeUtility::filterTextNodes($this->childNodes)) !== 1 || $children->item(0)->nodeName !== $tag) { @@ -420,10 +380,8 @@ public function hasSingleTagInsideElement($tag) /** * Check if the current element has a single child block element. * Block elements are the ones defined in the divToPElements array. - * - * @return bool */ - public function hasSingleChildBlockElement() + public function hasSingleChildBlockElement(): bool { $result = false; if ($this->hasChildNodes()) { @@ -443,10 +401,8 @@ public function hasSingleChildBlockElement() /** * Determines if a node has no content or it is just a bunch of dividing lines and/or whitespace. - * - * @return bool */ - public function isElementWithoutContent() + public function isElementWithoutContent(): bool { return $this instanceof DOMElement && mb_strlen(preg_replace(NodeUtility::$regexps['onlyWhitespace'], '', $this->textContent)) === 0 && @@ -469,11 +425,9 @@ public function isElementWithoutContent() /** * Determine if a node qualifies as phrasing content. - * https://developer.mozilla.org/en-US/docs/Web/Guide/HTML/Content_categories#Phrasing_content. - * - * @return bool + * https://developer.mozilla.org/en-US/docs/Web/Guide/HTML/Content_categories#Phrasing_content */ - public function isPhrasingContent() + public function isPhrasingContent(): bool { return $this->nodeType === XML_TEXT_NODE || in_array($this->nodeName, $this->phrasing_elems) !== false || (!is_null($this->childNodes) && @@ -487,10 +441,8 @@ public function isPhrasingContent() /** * In the original JS project they check if the node has the style display=none, which unfortunately * in our case we have no way of knowing that. So we just check for the attribute hidden or "display: none". - * - * @return bool */ - public function isProbablyVisible() + public function isProbablyVisible(): bool { return !preg_match('/display:( )?none/i', $this->getAttribute('style')) && !$this->hasAttribute('hidden') && @@ -499,9 +451,9 @@ public function isProbablyVisible() } /** - * @return bool + * Check if node is whitespace. */ - public function isWhitespace() + public function isWhitespace(): bool { return ($this->nodeType === XML_TEXT_NODE && mb_strlen(trim($this->textContent)) === 0) || ($this->nodeType === XML_ELEMENT_NODE && $this->nodeName === 'br'); @@ -524,10 +476,8 @@ public function isWhitespace() * used only when the results of the search are going to be used to remove the nodes. * * @param string $tag - * - * @return \Generator */ - public function shiftingAwareGetElementsByTagName($tag) + public function shiftingAwareGetElementsByTagName(string $tag): \Generator { $nodes = $this->getElementsByTagName($tag); $count = $nodes->length; @@ -549,10 +499,8 @@ public function shiftingAwareGetElementsByTagName($tag) /** * Mimics JS's firstElementChild property. PHP only has firstChild which could be any type of DOMNode. Use this * function to get the first one that is an DOMElement node. - * - * @return DOMElement|null */ - public function getFirstElementChild() + public function getFirstElementChild(): ?DOMElement { if ($this->childNodes instanceof \Traversable) { foreach ($this->childNodes as $node) { diff --git a/src/Nodes/NodeUtility.php b/src/Nodes/NodeUtility.php index 24775a4..3df4c2f 100644 --- a/src/Nodes/NodeUtility.php +++ b/src/Nodes/NodeUtility.php @@ -40,21 +40,17 @@ class NodeUtility 'b64DataUrl' => '/^data:\s*([^\s;,]+)\s*;\s*base64\s*,/i', // See: https://schema.org/Article 'jsonLdArticleTypes' => '/^Article|AdvertiserContentArticle|NewsArticle|AnalysisNewsArticle|AskPublicNewsArticle|BackgroundNewsArticle|OpinionNewsArticle|ReportageNewsArticle|ReviewNewsArticle|Report|SatiricalArticle|ScholarlyArticle|MedicalScholarlyArticle|SocialMediaPosting|BlogPosting|LiveBlogPosting|DiscussionForumPosting|TechArticle|APIReference$/' - + ]; /** * Finds the next node, starting from the given node, and ignoring * whitespace in between. If the given node is an element, the same node is * returned. - * - * Imported from the Element class on league\html-to-markdown. * - * @param $node - * - * @return DOMNode + * Imported from the Element class on league\html-to-markdown. */ - public static function nextNode($node) + public static function nextNode(DOMNode $node): DOMNode { $next = $node; while ($next @@ -69,14 +65,8 @@ public static function nextNode($node) /** * Changes the node tag name. Since tagName on DOMElement is a read only value, this must be done creating a new * element with the new tag name and importing it to the main DOMDocument. - * - * @param DOMNode|DOMElement $node - * @param string $value - * @param bool $importAttributes - * - * @return DOMNode */ - public static function setNodeTag($node, $value, $importAttributes = true) + public static function setNodeTag(DOMNode|DOMElement $node, string $value, bool $importAttributes = true): DOMNode { $new = new DOMDocument('1.0', 'utf-8'); $new->appendChild($new->createElement($value)); @@ -104,12 +94,8 @@ public static function setNodeTag($node, $value, $importAttributes = true) /** * Removes the current node and returns the next node to be parsed (child, sibling or parent). - * - * @param DOMNode|DOMElement $node - * - * @return DOMNode */ - public static function removeAndGetNext($node) + public static function removeAndGetNext(DOMNode|DOMElement $node): DOMNode { $nextNode = self::getNextNode($node, true); $node->parentNode->removeChild($node); @@ -119,12 +105,8 @@ public static function removeAndGetNext($node) /** * Remove the selected node. - * - * @param $node DOMElement - * - * @return void - **/ - public static function removeNode($node) + */ + public static function removeNode(DOMElement $node): void { $parent = $node->parentNode; if ($parent) { @@ -135,13 +117,8 @@ public static function removeNode($node) /** * Returns the next node. First checks for children (if the flag allows it), then for siblings, and finally * for parents. - * - * @param DOMNode|DOMElement|DOMDocument $originalNode - * @param bool $ignoreSelfAndKids - * - * @return DOMNode */ - public static function getNextNode($originalNode, $ignoreSelfAndKids = false) + public static function getNextNode(DOMNode|DOMElement|DOMDocument $originalNode, bool $ignoreSelfAndKids = false): DOMNode { /* * Traverse the DOM from node to node, starting at the node passed in. @@ -173,12 +150,8 @@ public static function getNextNode($originalNode, $ignoreSelfAndKids = false) /** * Remove all empty DOMNodes from DOMNodeLists. - * - * @param \DOMNodeList $list - * - * @return DOMNodeList */ - public static function filterTextNodes(\DOMNodeList $list) + public static function filterTextNodes(\DOMNodeList $list): DOMNodeList { $newList = new DOMNodeList(); foreach ($list as $node) { diff --git a/src/Readability.php b/src/Readability.php index 9b1aa0d..54e8fc8 100644 --- a/src/Readability.php +++ b/src/Readability.php @@ -18,100 +18,74 @@ class Readability { /** * Main DOMDocument where all the magic happens. - * - * @var DOMDocument */ - protected $dom; + protected DOMDocument $dom; /** * Title of the article. - * - * @var string|null */ - protected $title = null; + protected ?string $title = null; /** * Final DOMDocument with the fully parsed HTML. - * - * @var DOMDocument|null */ - protected $content = null; + protected ?DOMDocument $content = null; /** * Excerpt of the article. - * - * @var string|null */ - protected $excerpt = null; + protected ?string $excerpt = null; /** * Main image of the article. - * - * @var string|null */ - protected $image = null; + protected ?string $image = null; /** * Author of the article. Extracted from the byline tags and other social media properties. - * - * @var string|null */ - protected $author = null; + protected ?string $author = null; /** * Website name. - * - * @var string|null */ - protected $siteName = null; + protected ?string $siteName = null; /** * Direction of the text. - * - * @var string|null */ - protected $direction = null; + protected ?string $direction = null; /** * Base URI * HTML5PHP doesn't appear to store it in the baseURI property like PHP's DOMDocument does when parsing with libxml - * - * @var string|null */ - protected $baseURI = null; + protected ?string $baseURI = null; /** * Configuration object. - * - * @var Configuration */ - private $configuration; + private Configuration $configuration; /** * Logger object. - * - * @var LoggerInterface */ - private $logger; + private LoggerInterface $logger; /** * JSON-LD - * - * @var array */ - private $jsonld = []; + private array $jsonld = []; /** * Collection of attempted text extractions. - * - * @var array */ - private $attempts = []; + private array $attempts = []; /** - * @var array + * Default tags to score. */ - private $defaultTagsToScore = [ + private array $defaultTagsToScore = [ 'section', 'h2', 'h3', @@ -124,14 +98,14 @@ class Readability ]; /** - * @var array + * Unlikely roles. */ - private $unlikelyRoles = ['menu', 'menubar', 'complementary', 'navigation', 'alert', 'alertdialog', 'dialog']; + private array $unlikelyRoles = ['menu', 'menubar', 'complementary', 'navigation', 'alert', 'alertdialog', 'dialog']; /** - * @var array + * Alter to DIV exceptions. */ - private $alterToDIVExceptions = [ + private array $alterToDIVExceptions = [ 'div', 'article', 'section', @@ -139,9 +113,9 @@ class Readability ]; /** - * @var array + * HTML escape map. */ - private $htmlEscapeMap = [ + private array $htmlEscapeMap = [ 'lt' => '<', 'gt' => '>', 'amp' => '&', @@ -151,8 +125,6 @@ class Readability /** * Readability constructor. - * - * @param Configuration $configuration */ public function __construct(Configuration $configuration) { @@ -163,13 +135,9 @@ public function __construct(Configuration $configuration) /** * Main parse function. * - * @param $html|null - * * @throws ParseException - * - * @return bool */ - public function parse(?string $html = null) + public function parse(?string $html = null): bool { $this->logger->info('*** Starting parse process...'); @@ -296,8 +264,6 @@ public function parse(?string $html = null) * Previous versions of Readability used this method one time and cloned the DOM to keep a backup. This caused bugs * because cloning the DOM object keeps a relation between the clone and the original one, doing changes in both * objects and ruining the backup. - * - * @param string $html */ public function loadHTML(string $html): void { @@ -357,7 +323,6 @@ public function loadHTML(string $html): void * Try to extract metadata from JSON-LD object. * For now, only Schema.org objects of type Article or its subtypes are supported. * - * @param DOMDocument $dom * @return array with any metadata that could be extracted (possibly none) */ private function getJSONLD(DOMDocument $dom): array @@ -436,7 +401,7 @@ private function getJSONLD(DOMDocument $dom): array /** * Tries to guess relevant info from metadata of the html. Sets the results in the Readability properties. */ - private function getMetadata() + private function getMetadata(): void { $this->logger->debug('[Metadata] Retrieving metadata...'); @@ -597,7 +562,7 @@ public function getImages(): array * Tries to get the main article image. Will only update the metadata if the getMetadata function couldn't * find a correct image. */ - public function getMainImage() + public function getMainImage(): void { $imgUrl = false; @@ -630,15 +595,11 @@ public function getMainImage() /** * Remove unnecessary nested elements - * - * @param DOMDocument $article - * - * @return void */ - private function simplifyNestedElements(DOMDocument $article) + private function simplifyNestedElements(DOMDocument $article): void { $node = $article; - + while ($node) { if ($node->parentNode && in_array($node->nodeName, ['div', 'section']) && !($node->hasAttribute('id') && strpos($node->getAttribute('id'), 'readability') === 0)) { /** @var DOMElement $node */ @@ -655,17 +616,15 @@ private function simplifyNestedElements(DOMDocument $article) continue; } } - + $node = NodeUtility::getNextNode($node); } } /** * Returns the title of the html. Prioritizes the title from the metadata against the title tag. - * - * @return string|null */ - private function getArticleTitle() + private function getArticleTitle(): ?string { $originalTitle = null; @@ -766,12 +725,8 @@ private function getArticleTitle() /** * Convert URI to an absolute URI. - * - * @param $uri string URI to convert - * - * @return string */ - private function toAbsoluteURI($uri) + private function toAbsoluteURI(string $uri): string { list($pathBase, $scheme, $prePath) = $this->getPathInfo($this->configuration->getOriginalURL()); @@ -813,11 +768,9 @@ private function toAbsoluteURI($uri) /** * Returns full path info of an URL. * - * @param string $url - * * @return array [$pathBase, $scheme, $prePath] */ - public function getPathInfo($url) + public function getPathInfo(string $url): array { // Check for base URLs if ($this->baseURI !== null) { @@ -840,12 +793,8 @@ public function getPathInfo($url) /** * Gets nodes from the root element. - * - * @param $node DOMNode|DOMText - * - * @return array */ - private function getNodes($node) + private function getNodes(DOMNode|DOMText $node): array { $this->logger->info('[Get Nodes] Retrieving nodes...'); @@ -990,7 +939,8 @@ private function getNodes($node) * * @return int 1 = same text, 0 = completely different text */ - private function textSimilarity(string $textA, string $textB) { + private function textSimilarity(string $textA, string $textB): int + { $tokensA = array_filter(preg_split(NodeUtility::$regexps['tokenize'], mb_strtolower($textA))); $tokensB = array_filter(preg_split(NodeUtility::$regexps['tokenize'], mb_strtolower($textB))); if (!count($tokensA) || !count($tokensB)) { @@ -1005,13 +955,8 @@ private function textSimilarity(string $textA, string $textB) { /** * Checks if the node is a byline. - * - * @param DOMNode $node - * @param string $matchString - * - * @return bool */ - private function checkByline($node, $matchString) + private function checkByline(DOMNode $node, string $matchString): bool { if (!$this->configuration->getArticleByLine()) { return false; @@ -1039,12 +984,8 @@ private function checkByline($node, $matchString) /** * Checks the validity of a byLine. Based on string length. - * - * @param string $text - * - * @return bool */ - private function isValidByline($text) + private function isValidByline(string $text): bool { if (gettype($text) == 'string') { $byline = trim($text); @@ -1057,11 +998,9 @@ private function isValidByline($text) /** * Converts some of the common HTML entities in string to their corresponding characters. - * - * @param string $str - a string to unescape. - * @return string without HTML entity. */ - private function unescapeHtmlEntities($str) { + private function unescapeHtmlEntities(string $str): string + { if (!$str) { return $str; } @@ -1086,10 +1025,9 @@ private function unescapeHtmlEntities($str) { /** * Check if node is image, or if node contains exactly only one image * whether as a direct child or as its descendants. - * - * @param DOMElement $node */ - private function isSingleImage(DOMElement $node) { + private function isSingleImage(DOMElement $node): bool + { if ($node->tagName === 'img') { return true; } @@ -1106,10 +1044,9 @@ private function isSingleImage(DOMElement $node) { * element. Replace the first image with the image from inside the