From 14871f80669d3f299407ce4d0e35e67fa90bc1a5 Mon Sep 17 00:00:00 2001 From: Andres Rey Date: Wed, 5 Sep 2018 21:21:11 +0100 Subject: Update initial parsing and add isWhitespace trait function. --- src/Nodes/NodeTrait.php | 10 ++++++++-- src/Readability.php | 25 ++++++++++++++++++++++++- 2 files changed, 32 insertions(+), 3 deletions(-) diff --git a/src/Nodes/NodeTrait.php b/src/Nodes/NodeTrait.php index b4ca746..7661e15 100644 --- a/src/Nodes/NodeTrait.php +++ b/src/Nodes/NodeTrait.php @@ -473,9 +473,9 @@ trait NodeTrait */ public function isPhrasingContent() { - return $this->nodeType === XML_TEXT_NODE || !in_array($this->tagName, $this->phrasing_elems) || + return $this->nodeType === XML_TEXT_NODE || !in_array($this->nodeName , $this->phrasing_elems) || (!is_null($this->childNodes) && - ($this->tagName === 'a' || $this->tagName === 'del' || $this->tagName === 'ins') && + ($this->nodeName === 'a' || $this->nodeName === 'del' || $this->nodeName === 'ins') && array_reduce(iterator_to_array($this->childNodes), function ($carry, $node) { return $carry || $node->isPhrasingContent(); }) @@ -493,4 +493,10 @@ trait NodeTrait return !$this->hasAttribute('hidden'); } + + public function isWhitespace() + { + return ($this->nodeType === XML_TEXT_NODE && mb_strlen(trim($this->textContent)) === 0) || + ($this->nodeType === XML_ELEMENT_NODE && $this->nodeName === 'br'); + } } diff --git a/src/Readability.php b/src/Readability.php index f913d80..3e168fe 100644 --- a/src/Readability.php +++ b/src/Readability.php @@ -671,13 +671,36 @@ class Readability // Turn all divs that don't have children block level elements into p's if ($node->nodeName === 'div') { + // Put phrasing content into paragraphs. + $p = null; + $childNode = $node->firstChild; + while ($childNode) { + $nextSibling = $childNode->nextSibling; + if ($childNode->isPhrasingContent()) { + if ($p !== null) { + $p->appendChild($childNode); + } else if (!$childNode->isWhitespace()) { + $p = $this->dom->createElement('p'); + $node->replaceChild($p, $childNode); + $p->appendChild($childNode); + } + } else if ($p !== null) { + while ($p->lastChild && $p->lastChild->isWhitespace()) { + $p->removeChild($p->lastChild); + } + $p = null; + } + $childNode = $nextSibling; + } + + /* * Sites like http://mobile.slate.com encloses each paragraph with a DIV * element. DIVs with only a P element inside and no text content can be * safely converted into plain P elements to avoid confusing the scoring * algorithm with DIVs with are, in practice, paragraphs. */ - if ($node->hasSingleTagInsideElement('p')) { + if ($node->hasSingleTagInsideElement('p') && $node->getLinkDensity() < 0.25) { $this->logger->debug(sprintf('[Get Nodes] Found DIV with a single P node, removing DIV. Node content is: \'%s\'', substr($node->nodeValue, 0, 128))); $pNode = $node->getChildren(true)[0]; $node->parentNode->replaceChild($pNode, $node); -- cgit v1.2.3