summary | refs | log | tree | commit | diff
path: root/src
diff options
context:
space:
mode:
author: Andres Rey <[email protected]> 2018-09-05 21:21:11 +0100
committer: Andres Rey <[email protected]> 2018-09-05 21:21:11 +0100
commit: 14871f80669d3f299407ce4d0e35e67fa90bc1a5 (patch)
tree: 91f6ebdaed0d7e2c9214b65bc5aed1c11189e6a5 /src
parent: f2db151e599f699ee432fcadf4ae872426202dea (diff)
Update initial parsing and add isWhitespace trait function.
Diffstat (limited to 'src')
-rw-r--r-- src/Nodes/NodeTrait.php | 10
-rw-r--r-- src/Readability.php | 25
2 files changed, 32 insertions, 3 deletions
diff --git a/src/Nodes/NodeTrait.php b/src/Nodes/NodeTrait.php
index b4ca746..7661e15 100644
--- a/src/Nodes/NodeTrait.php
+++ b/src/Nodes/NodeTrait.php
@@ -473,9 +473,9 @@ trait NodeTrait
*/
public function isPhrasingContent()
{
- return $this->nodeType === XML_TEXT_NODE || !in_array($this->tagName, $this->phrasing_elems) ||
+ return $this->nodeType === XML_TEXT_NODE || !in_array($this->nodeName , $this->phrasing_elems) ||
(!is_null($this->childNodes) &&
- ($this->tagName === 'a' || $this->tagName === 'del' || $this->tagName === 'ins') &&
+ ($this->nodeName === 'a' || $this->nodeName === 'del' || $this->nodeName === 'ins') &&
array_reduce(iterator_to_array($this->childNodes), function ($carry, $node) {
return $carry || $node->isPhrasingContent();
})
@@ -493,4 +493,10 @@ trait NodeTrait
return !$this->hasAttribute('hidden');
}
+
+ public function isWhitespace()
+ {
+ return ($this->nodeType === XML_TEXT_NODE && mb_strlen(trim($this->textContent)) === 0) ||
+ ($this->nodeType === XML_ELEMENT_NODE && $this->nodeName === 'br');
+ }
}
diff --git a/src/Readability.php b/src/Readability.php
index f913d80..3e168fe 100644
--- a/src/Readability.php
+++ b/src/Readability.php
@@ -671,13 +671,36 @@ class Readability
// Turn all divs that don't have children block level elements into p's
if ($node->nodeName === 'div') {
+ // Put phrasing content into paragraphs.
+ $p = null;
+ $childNode = $node->firstChild;
+ while ($childNode) {
+ $nextSibling = $childNode->nextSibling;
+ if ($childNode->isPhrasingContent()) {
+ if ($p !== null) {
+ $p->appendChild($childNode);
+ } else if (!$childNode->isWhitespace()) {
+ $p = $this->dom->createElement('p');
+ $node->replaceChild($p, $childNode);
+ $p->appendChild($childNode);
+ }
+ } else if ($p !== null) {
+ while ($p->lastChild && $p->lastChild->isWhitespace()) {
+ $p->removeChild($p->lastChild);
+ }
+ $p = null;
+ }
+ $childNode = $nextSibling;
+ }
+
+
/*
* Sites like http://mobile.slate.com encloses each paragraph with a DIV
* element. DIVs with only a P element inside and no text content can be
* safely converted into plain P elements to avoid confusing the scoring
* algorithm with DIVs with are, in practice, paragraphs.
*/
- if ($node->hasSingleTagInsideElement('p')) {
+ if ($node->hasSingleTagInsideElement('p') && $node->getLinkDensity() < 0.25) {
$this->logger->debug(sprintf('[Get Nodes] Found DIV with a single P node, removing DIV. Node content is: \'%s\'', substr($node->nodeValue, 0, 128)));
$pNode = $node->getChildren(true)[0];
$node->parentNode->replaceChild($pNode, $node);