From 6f91f4f21f9bc88cc15f038febde4eb3b0c292bf Mon Sep 17 00:00:00 2001
From: Andres Rey
Date: Thu, 15 Dec 2016 16:20:40 +0000
Subject: Added prepDocument function.

---
 src/HTMLParser.php | 105 +++++++++++++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 104 insertions(+), 1 deletion(-)

diff --git a/src/HTMLParser.php b/src/HTMLParser.php
index e005f5b..fbf3455 100644
--- a/src/HTMLParser.php
+++ b/src/HTMLParser.php
@@ -128,6 +128,8 @@ class HTMLParser
 
         $this->removeScripts();
 
+        $this->prepDocument();
+
         // In case we need the original HTML to create a fake top candidate
         $this->backupdom = clone $this->dom;
 
@@ -221,6 +223,107 @@ class HTMLParser
         }
     }
 
+    /**
+     * Prepares the document for parsing.
+     *
+     * Collapses every chain of two or more <br> elements into a single <p>
+     * and moves the nodes following the chain into that <p>, stopping at the
+     * next <br> chain or at the end of the sibling list.
+     */
+    private function prepDocument()
+    {
+        /*
+         * getElementsByTagName() returns a live list, so take a snapshot
+         * first: removing or replacing <br> nodes while iterating the live
+         * list can make the loop skip elements.
+         */
+        $brs = iterator_to_array($this->dom->getElementsByTagName('br'));
+
+        foreach ($brs as $br) {
+            /** @var \DOMNode $br */
+            if ($br->parentNode === null) {
+                // Already removed as part of an earlier <br> chain.
+                continue;
+            }
+
+            $next = $br->nextSibling;
+
+            /*
+             * Whether 2 or more <br> elements have been found and replaced with a
+             * <p> block.
+             */
+            $replaced = false;
+
+            /*
+             * If we find a <br> chain, remove the <br>s until we hit another element
+             * or non-whitespace. This leaves behind the first <br> in the chain
+             * (which will be replaced with a <p> later).
+             */
+            while (($next = $this->nextElement($next)) && ($next->nodeName === 'br')) {
+                $replaced = true;
+                $brSibling = $next->nextSibling;
+                $next->parentNode->removeChild($next);
+                $next = $brSibling;
+            }
+
+            /*
+             * If we removed a <br> chain, replace the remaining <br> with a <p>. Add
+             * all sibling nodes as children of the <p> until we hit another <br>
+             * chain.
+             */
+
+            if ($replaced) {
+                $p = $this->dom->createElement('p');
+                $br->parentNode->replaceChild($p, $br);
+
+                $next = $p->nextSibling;
+                while ($next) {
+                    /*
+                     * If we've hit another <br><br>, we're done adding children to
+                     * this <p>. The lookahead starts at the node after $next,
+                     * because nextElement() returns its argument unchanged when
+                     * that argument is already an element.
+                     */
+                    if ($next->nodeName === 'br') {
+                        $nextElem = $this->nextElement($next->nextSibling);
+                        if ($nextElem && $nextElem->nodeName === 'br') {
+                            break;
+                        }
+                    }
+
+                    // Otherwise, make this node a child of the new <p>.
+                    $sibling = $next->nextSibling;
+                    $p->appendChild($next);
+                    $next = $sibling;
+                }
+            }
+        }
+    }
+
+    /**
+     * Finds the next element, starting at the given node.
+     *
+     * Non-element nodes whose text is empty or whitespace-only are skipped.
+     * The walk stops at the first element node or at the first node holding
+     * non-whitespace text.
+     *
+     * @param \DOMNode|null $node Node to start from (it is itself a candidate).
+     *
+     * @return \DOMNode|null The node the walk stopped at, or null when the
+     *                       siblings ran out.
+     */
+    private function nextElement($node)
+    {
+        $next = $node;
+        while ($next
+            && $next->nodeType !== XML_ELEMENT_NODE
+            && trim($next->textContent) === '') {
+            $next = $next->nextSibling;
+        }
+
+        return $next;
+    }
+
     /**
      * Tries to guess relevant info from metadata of the html.
      *
@@ -366,8 +469,8 @@ class HTMLParser
             } else {
                 // EXPERIMENTAL
                 foreach ($node->getChildren() as $child) {
+                    /** @var Readability $child */
                     if ($child->isText()) {
-                        /** @var Readability $child */
                         // Check if there's actual content on the node.
                         if (trim($child->getTextContent())) {
                             $newNode = $node->createNode($child, 'p');
-- 
cgit v1.2.3