From f6d91bd07f724c7343ca9b52a9656c8c4a6c8fa6 Mon Sep 17 00:00:00 2001 From: Jan Tojnar Date: Sun, 17 Mar 2024 00:49:46 +0100 Subject: [PATCH 1/6] Remove pointless `stdClass` `DOMNode::$childNodes` always contained `DOMNodeList`. --- src/Readability.php | 5 ----- 1 file changed, 5 deletions(-) diff --git a/src/Readability.php b/src/Readability.php index 836a333..ff628cd 100644 --- a/src/Readability.php +++ b/src/Readability.php @@ -1231,11 +1231,6 @@ protected function grabArticle(?\DOMElement $page = null) $parentOfTopCandidate = $topCandidate->parentNode; $siblingNodes = $parentOfTopCandidate->childNodes; - if (0 === $siblingNodes->length) { - $siblingNodes = new \stdClass(); - $siblingNodes->length = 0; - } - for ($s = 0, $sl = $siblingNodes->length; $s < $sl; ++$s) { $siblingNode = $siblingNodes->item($s); $siblingNodeName = $siblingNode->nodeName; From 76908b19bab81dbe6aa9fadc33abb45fdf444f16 Mon Sep 17 00:00:00 2001 From: Jan Tojnar Date: Sun, 17 Mar 2024 03:17:21 +0100 Subject: [PATCH 2/6] Extract `for`-iterated items into variables This simplifies the code a bit and will make it slightly easier in case we decide to switch to `foreach` iteration. --- src/Readability.php | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/src/Readability.php b/src/Readability.php index ff628cd..6996e50 100644 --- a/src/Readability.php +++ b/src/Readability.php @@ -719,8 +719,9 @@ public function cleanHeaders(\DOMElement $e): void $headers = $e->getElementsByTagName('h' . $headerIndex); for ($i = $headers->length - 1; $i >= 0; --$i) { - if ($this->getWeight($headers->item($i)) < 0 || $this->getLinkDensity($headers->item($i)) > 0.33) { - $headers->item($i)->parentNode->removeChild($headers->item($i)); + $header = $headers->item($i); + if ($this->getWeight($header) < 0 || $this->getLinkDensity($header) > 0.33) { + $header->parentNode->removeChild($header); } } } @@ -812,12 +813,14 @@ protected function prepDocument(): void // Remove all style tags in head. $styleTags = $this->dom->getElementsByTagName('style'); for ($i = $styleTags->length - 1; $i >= 0; --$i) { - $styleTags->item($i)->parentNode->removeChild($styleTags->item($i)); + $styleTag = $styleTags->item($i); + $styleTag->parentNode->removeChild($styleTag); } $linkTags = $this->dom->getElementsByTagName('link'); for ($i = $linkTags->length - 1; $i >= 0; --$i) { - $linkTags->item($i)->parentNode->removeChild($linkTags->item($i)); + $linkTag = $linkTags->item($i); + $linkTag->parentNode->removeChild($linkTag); } } From fbc9d6788941c1b843f851bffec2dc283b1c7c29 Mon Sep 17 00:00:00 2001 From: Jan Tojnar Date: Sun, 17 Mar 2024 03:23:04 +0100 Subject: [PATCH 3/6] Remove dead iteration code This was forgotten in b580cf216d9001f82c866bb9a6c8bcad1cc862d8. --- src/Readability.php | 5 ----- 1 file changed, 5 deletions(-) diff --git a/src/Readability.php b/src/Readability.php index 6996e50..54f5569 100644 --- a/src/Readability.php +++ b/src/Readability.php @@ -1079,11 +1079,6 @@ protected function grabArticle(?\DOMElement $page = null) } } - $candidates = $xpath->query('.//*[not(self::body) and (@class or @id or @style) and ((number(@readability) < 40) or not(@readability))]', $page->documentElement); - - for ($c = $candidates->length - 1; $c >= 0; --$c) { - $node = $candidates->item($c); - } unset($candidates); } From ed7d059625af8ee8c016d69c2b54facbdd3c6bcb Mon Sep 17 00:00:00 2001 From: Jan Tojnar Date: Sun, 17 Mar 2024 00:29:39 +0100 Subject: [PATCH 4/6] Iterate node lists with `foreach` `DOMNodeList` implements `Traversable`. There are some `for` loops left but we cannot simply replace those: PHP follows the DOM specification, which requires that `NodeList` objects in the DOM are live. As a result, any operation that removes a node list member node from its parent (such as `removeChild`, `replaceChild` or `appendChild`) will cause the next node in the iterator to be skipped. We could work around that by converting those node lists to static arrays using `iterator_to_array` but not sure if it is worth it. --- src/Readability.php | 28 ++++++++++++++-------------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/src/Readability.php b/src/Readability.php index 54f5569..4b47daa 100644 --- a/src/Readability.php +++ b/src/Readability.php @@ -302,8 +302,7 @@ public function addFootnotes(\DOMElement $articleContent): void $articleLinks = $articleContent->getElementsByTagName('a'); $linkCount = 0; - for ($i = 0; $i < $articleLinks->length; ++$i) { - $articleLink = $articleLinks->item($i); + foreach ($articleLinks as $articleLink) { $footnoteLink = $articleLink->cloneNode(true); $refLink = $this->dom->createElement('a'); $footnote = $this->dom->createElement('li'); @@ -383,8 +382,8 @@ public function prepArticle(\DOMNode $articleContent): void // Remove service data-candidate attribute. $elems = $xpath->query('.//*[@data-candidate]', $articleContent); - for ($i = $elems->length - 1; $i >= 0; --$i) { - $elems->item($i)->removeAttribute('data-candidate'); + foreach ($elems as $elem) { + $elem->removeAttribute('data-candidate'); } // Clean out junk from the article content. @@ -520,11 +519,12 @@ public function getLinkDensity(\DOMElement $e, bool $excludeExternal = false): f $textLength = mb_strlen($this->getInnerText($e, true, true)); $linkLength = 0; - for ($dRe = $this->domainRegExp, $i = 0, $il = $links->length; $i < $il; ++$i) { - if ($excludeExternal && $dRe && !preg_match($dRe, $links->item($i)->getAttribute('href'))) { + $dRe = $this->domainRegExp; + foreach ($links as $link) { + if ($excludeExternal && $dRe && !preg_match($dRe, $link->getAttribute('href'))) { continue; } - $linkLength += mb_strlen($this->getInnerText($links->item($i))); + $linkLength += mb_strlen($this->getInnerText($link)); } if ($textLength > 0 && $linkLength > 0) { @@ -640,15 +640,15 @@ public function cleanConditionally(\DOMElement $e, string $tag): void $embedCount = 0; $embeds = $node->getElementsByTagName('embed'); - for ($ei = 0, $il = $embeds->length; $ei < $il; ++$ei) { - if (preg_match($this->regexps['media'], $embeds->item($ei)->getAttribute('src'))) { + foreach ($embeds as $embed) { + if (preg_match($this->regexps['media'], $embed->getAttribute('src'))) { ++$embedCount; } } $embeds = $node->getElementsByTagName('iframe'); - for ($ei = 0, $il = $embeds->length; $ei < $il; ++$ei) { - if (preg_match($this->regexps['media'], $embeds->item($ei)->getAttribute('src'))) { + foreach ($embeds as $embed) { + if (preg_match($this->regexps['media'], $embed->getAttribute('src'))) { ++$embedCount; } } @@ -1018,15 +1018,15 @@ protected function grabArticle(?\DOMElement $page = null) * A score is determined by things like number of commas, class names, etc. * Maybe eventually link density. */ - for ($pt = 0, $scored = \count($nodesToScore); $pt < $scored; ++$pt) { - $ancestors = $this->getAncestors($nodesToScore[$pt], 5); + foreach ($nodesToScore as $nodeToScore) { + $ancestors = $this->getAncestors($nodeToScore, 5); // No parent node? Move on... if (0 === \count($ancestors)) { continue; } - $innerText = $this->getInnerText($nodesToScore[$pt]); + $innerText = $this->getInnerText($nodeToScore); // If this paragraph is less than MIN_PARAGRAPH_LENGTH (default:20) characters, don't even count it. if (mb_strlen($innerText) < self::MIN_PARAGRAPH_LENGTH) { From 5166622b8c7f082c0aba8d81be8059d7a2eb243b Mon Sep 17 00:00:00 2001 From: Jan Tojnar Date: Mon, 18 Mar 2024 23:27:52 +0100 Subject: [PATCH 5/6] Iterate even more node lists with foreach --- src/Readability.php | 33 ++++++++++----------------------- 1 file changed, 10 insertions(+), 23 deletions(-) diff --git a/src/Readability.php b/src/Readability.php index 4b47daa..e836b81 100644 --- a/src/Readability.php +++ b/src/Readability.php @@ -374,8 +374,7 @@ public function prepArticle(\DOMNode $articleContent): void * which is what they were before. */ $elems = $xpath->query('.//p[@data-readability-styled]', $articleContent); - for ($i = $elems->length - 1; $i >= 0; --$i) { - $e = $elems->item($i); + foreach (iterator_to_array($elems) as $e) { $e->parentNode->replaceChild($articleContent->ownerDocument->createTextNode($e->textContent), $e); } } @@ -415,9 +414,7 @@ public function prepArticle(\DOMNode $articleContent): void // Remove extra paragraphs. $articleParagraphs = $articleContent->getElementsByTagName('p'); - for ($i = $articleParagraphs->length - 1; $i >= 0; --$i) { - $item = $articleParagraphs->item($i); - + foreach (iterator_to_array($articleParagraphs) as $item) { $imgCount = $item->getElementsByTagName('img')->length; $embedCount = $item->getElementsByTagName('embed')->length; $objectCount = $item->getElementsByTagName('object')->length; @@ -573,10 +570,8 @@ public function clean(\DOMElement $e, string $tag): void $targetList = $e->getElementsByTagName($tag); $isEmbed = ('audio' === $tag || 'video' === $tag || 'iframe' === $tag || 'object' === $tag || 'embed' === $tag); - for ($y = $targetList->length - 1; $y >= 0; --$y) { + foreach (iterator_to_array($targetList) as $currentItem) { // Allow youtube and vimeo videos through as people usually want to see those. - $currentItem = $targetList->item($y); - if ($isEmbed) { $attributeValues = $currentItem->getAttribute('src') . ' ' . $currentItem->getAttribute('href'); @@ -586,7 +581,7 @@ public function clean(\DOMElement $e, string $tag): void } // Then check the elements inside this element for the same. - if (preg_match($this->regexps['media'], $targetList->item($y)->getInnerHTML())) { + if (preg_match($this->regexps['media'], $currentItem->getInnerHTML())) { continue; } } @@ -607,7 +602,6 @@ public function cleanConditionally(\DOMElement $e, string $tag): void } $tagsList = $e->getElementsByTagName($tag); - $curTagsLength = $tagsList->length; /* * Gather counts for other typical elements embedded within. @@ -615,8 +609,7 @@ public function cleanConditionally(\DOMElement $e, string $tag): void * * TODO: Consider taking into account original contentScore here. */ - for ($i = $curTagsLength - 1; $i >= 0; --$i) { - $node = $tagsList->item($i); + foreach (iterator_to_array($tagsList) as $node) { $weight = $this->getWeight($node); $contentScore = ($node->hasAttribute('readability')) ? (int) $node->getAttribute('readability') : 0; $this->logger->debug('Start conditional cleaning of ' . $node->getNodePath() . ' (class=' . $node->getAttribute('class') . '; id=' . $node->getAttribute('id') . ')' . (($node->hasAttribute('readability')) ? (' with score ' . $node->getAttribute('readability')) : '')); @@ -718,8 +711,7 @@ public function cleanHeaders(\DOMElement $e): void for ($headerIndex = 1; $headerIndex < 3; ++$headerIndex) { $headers = $e->getElementsByTagName('h' . $headerIndex); - for ($i = $headers->length - 1; $i >= 0; --$i) { - $header = $headers->item($i); + foreach (iterator_to_array($headers) as $header) { if ($this->getWeight($header) < 0 || $this->getLinkDensity($header) > 0.33) { $header->parentNode->removeChild($header); } @@ -812,14 +804,12 @@ protected function prepDocument(): void // Remove all style tags in head. $styleTags = $this->dom->getElementsByTagName('style'); - for ($i = $styleTags->length - 1; $i >= 0; --$i) { - $styleTag = $styleTags->item($i); + foreach (iterator_to_array($styleTags) as $styleTag) { $styleTag->parentNode->removeChild($styleTag); } $linkTags = $this->dom->getElementsByTagName('link'); - for ($i = $linkTags->length - 1; $i >= 0; --$i) { - $linkTag = $linkTags->item($i); + foreach (iterator_to_array($linkTags) as $linkTag) { $linkTag->parentNode->removeChild($linkTag); } } @@ -1070,8 +1060,7 @@ protected function grabArticle(?\DOMElement $page = null) if ($this->flagIsActive(self::FLAG_STRIP_UNLIKELYS) && $xpath) { $candidates = $xpath->query('.//*[(self::footer and count(//footer)<2) or (self::aside and count(//aside)<2)]', $page->documentElement); - for ($c = $candidates->length - 1; $c >= 0; --$c) { - $node = $candidates->item($c); + foreach (iterator_to_array($candidates) as $node) { // node should be readable but not inside of an article otherwise it's probably non-readable block if ($node->hasAttribute('readability') && (int) $node->getAttributeNode('readability')->value < 40 && ($node->parentNode ? 0 !== strcasecmp($node->parentNode->tagName, 'article') : true)) { $this->logger->debug('Removing unlikely candidate (using note) ' . $node->getNodePath() . ' by "' . $node->tagName . '" with readability ' . ($node->hasAttribute('readability') ? (int) $node->getAttributeNode('readability')->value : 0)); @@ -1092,9 +1081,7 @@ protected function grabArticle(?\DOMElement $page = null) $candidates = $xpath->query('.//*[@data-candidate]', $page->documentElement); $this->logger->debug('Candidates: ' . $candidates->length); - for ($c = $candidates->length - 1; $c >= 0; --$c) { - $item = $candidates->item($c); - + foreach (iterator_to_array($candidates) as $item) { // Scale the final candidates score based on link density. Good content should have a // relatively small link density (5% or less) and be mostly unaffected by this operation. // If not for this we would have used XPath to find maximum @readability. From 01e9b26afe0c4a9b2860249a0ac7052302b8e8f4 Mon Sep 17 00:00:00 2001 From: Jan Tojnar Date: Sun, 17 Mar 2024 02:16:11 +0100 Subject: [PATCH 6/6] Iterate more node lists with foreach MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit PHP follows the DOM specification, which requires that NodeList objects in the DOM are live. As a result, any operation that removes a NodeList node from its parent (e.g. removeChild, replaceChild or appendChild) will cause the next node in the iterator to be skipped. Let’s convert those node lists to arrays to make them static to allow us to use foreach and drop the index decrementing. This will make the code a bit clearer at the cost of slightly more work being performed. --- src/Readability.php | 13 ++----------- 1 file changed, 2 insertions(+), 11 deletions(-) diff --git a/src/Readability.php b/src/Readability.php index e836b81..7378446 100644 --- a/src/Readability.php +++ b/src/Readability.php @@ -903,8 +903,7 @@ protected function grabArticle(?\DOMElement $page = null) $allElements = $page->getElementsByTagName('*'); - for ($nodeIndex = 0; $allElements->item($nodeIndex); ++$nodeIndex) { - $node = $allElements->item($nodeIndex); + foreach (iterator_to_array($allElements) as $node) { $tagName = $node->tagName; $nodeContent = $node->getInnerHTML(); @@ -917,7 +916,6 @@ protected function grabArticle(?\DOMElement $page = null) if (!$this->isNodeVisible($node)) { $this->logger->debug('Removing invisible node ' . $node->getNodePath()); $node->parentNode->removeChild($node); - --$nodeIndex; continue; } @@ -930,7 +928,6 @@ protected function grabArticle(?\DOMElement $page = null) ) { $this->logger->debug('Removing unlikely candidate (using conf) ' . $node->getNodePath() . ' by "' . $unlikelyMatchString . '"'); $node->parentNode->removeChild($node); - --$nodeIndex; continue; } @@ -949,7 +946,6 @@ protected function grabArticle(?\DOMElement $page = null) $newNode->setInnerHtml($nodeContent); $node->parentNode->replaceChild($newNode, $node); - --$nodeIndex; $nodesToScore[] = $newNode; } catch (\Exception $e) { $this->logger->error('Could not alter div/article to p, reverting back to div: ' . $e->getMessage()); @@ -1216,8 +1212,7 @@ protected function grabArticle(?\DOMElement $page = null) $parentOfTopCandidate = $topCandidate->parentNode; $siblingNodes = $parentOfTopCandidate->childNodes; - for ($s = 0, $sl = $siblingNodes->length; $s < $sl; ++$s) { - $siblingNode = $siblingNodes->item($s); + foreach (iterator_to_array($siblingNodes) as $siblingNode) { $siblingNodeName = $siblingNode->nodeName; $append = false; $this->logger->debug('Looking at sibling node: ' . $siblingNode->getNodePath() . ((\XML_ELEMENT_NODE === $siblingNode->nodeType && $siblingNode->hasAttribute('readability')) ? (' with score ' . $siblingNode->getAttribute('readability')) : '')); @@ -1260,13 +1255,9 @@ protected function grabArticle(?\DOMElement $page = null) } catch (\Exception $e) { $this->logger->debug('Could not alter siblingNode "' . $siblingNodeName . '" to "div", reverting to original.'); $nodeToAppend = $siblingNode; - --$s; - --$sl; } } else { $nodeToAppend = $siblingNode; - --$s; - --$sl; } // To ensure a node does not interfere with readability styles, remove its classnames & ids.