diff options
author | Andres Rey <[email protected]> | 2016-12-15 16:20:40 +0000 |
---|---|---|
committer | Andres Rey <[email protected]> | 2016-12-15 16:20:40 +0000 |
commit | 6f91f4f21f9bc88cc15f038febde4eb3b0c292bf (patch) | |
tree | 69a868f83d9ddd31b8be4d8ff63fa709fc8fd4ea | |
parent | 0ea923c85532ed6ff25c582f7281b958c2f0daa4 (diff) |
Added prepDocument function.
-rw-r--r-- | src/HTMLParser.php | 72 |
1 files changed, 71 insertions, 1 deletions
diff --git a/src/HTMLParser.php b/src/HTMLParser.php
index e005f5b..fbf3455 100644
--- a/src/HTMLParser.php
+++ b/src/HTMLParser.php
@@ -128,6 +128,8 @@ class HTMLParser
 
         $this->removeScripts();
 
+        $this->prepDocument();
+
         // In case we need the original HTML to create a fake top candidate
         $this->backupdom = clone $this->dom;
 
@@ -221,6 +223,74 @@ class HTMLParser
         }
     }
 
+    /**
+     * Prepares the document for parsing.
+     *
+     * Every chain of two or more <br> elements is collapsed into one <p>, and
+     * the siblings that follow are moved into it up to the next <br> chain.
+     */
+    private function prepDocument()
+    {
+        // Snapshot the live node list because the loop below changes the DOM;
+        // a <br> that was already removed has no siblings, so it is skipped.
+        $brs = iterator_to_array($this->dom->getElementsByTagName('br'), false);
+
+        foreach ($brs as $br) {
+            /** @var \DOMNode $br */
+            $replaced = false;
+
+            // Remove the <br>s that follow, looking past whitespace-only nodes,
+            // until another element or non-whitespace is hit. The first <br> of
+            // the chain is left behind (it is replaced with a <p> below).
+            $next = $this->nextElement($br->nextSibling);
+            while ($next && $next->nodeName === 'br') {
+                $replaced = true;
+                $brSibling = $next->nextSibling;
+                $next->parentNode->removeChild($next);
+                $next = $this->nextElement($brSibling);
+            }
+
+            if (!$replaced) {
+                continue;
+            }
+
+            // Replace the remaining <br> with a <p> and fill it with its siblings.
+            $p = $this->dom->createElement('p');
+            $br->parentNode->replaceChild($p, $br);
+
+            $next = $p->nextSibling;
+            while ($next) {
+                // Stop at the next <br><br> chain; a lone <br> stays in this <p>.
+                if ($next->nodeName === 'br') {
+                    $nextElem = $this->nextElement($next->nextSibling);
+                    if ($nextElem && $nextElem->nodeName === 'br') {
+                        break;
+                    }
+                }
+
+                $sibling = $next->nextSibling;
+                $p->appendChild($next);
+                $next = $sibling;
+            }
+        }
+    }
+
+    /**
+     * Returns $node or its first following sibling that is an element or holds
+     * non-whitespace text, or null when only whitespace nodes remain.
+     */
+    private function nextElement($node)
+    {
+        $next = $node;
+        while ($next
+            && $next->nodeType !== XML_ELEMENT_NODE
+            && trim($next->textContent) === '') {
+            $next = $next->nextSibling;
+        }
+
+        return $next;
+    }
+
     /**
      * Tries to guess relevant info from metadata of the html.
      *
@@ -366,8 +436,8 @@ class HTMLParser
                 } else {
                     // EXPERIMENTAL
                     foreach ($node->getChildren() as $child) {
+                        /** @var Readability $child */
                         if ($child->isText()) {
-                            /** @var Readability $child */
                             // Check if there's actual content on the node.
                             if (trim($child->getTextContent())) {
                                 $newNode = $node->createNode($child, 'p');