summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author Andres Rey <[email protected]> 2016-12-15 16:20:40 +0000
committer Andres Rey <[email protected]> 2016-12-15 16:20:40 +0000
commit 6f91f4f21f9bc88cc15f038febde4eb3b0c292bf (patch)
tree 69a868f83d9ddd31b8be4d8ff63fa709fc8fd4ea
parent 0ea923c85532ed6ff25c582f7281b958c2f0daa4 (diff)
Added prepDocument function.
-rw-r--r-- src/HTMLParser.php 72
1 files changed, 71 insertions, 1 deletions
diff --git a/src/HTMLParser.php b/src/HTMLParser.php
index e005f5b..fbf3455 100644
--- a/src/HTMLParser.php
+++ b/src/HTMLParser.php
@@ -128,6 +128,8 @@ class HTMLParser
$this->removeScripts();
+ $this->prepDocument();
+
// In case we need the original HTML to create a fake top candidate
$this->backupdom = clone $this->dom;
@@ -221,6 +223,74 @@ class HTMLParser
}
}
+ /**
+  * Prepares the document for parsing.
+  *
+  * Collapses every chain of two or more consecutive <br> elements into a
+  * single <p> element and moves the siblings that follow the chain into
+  * that <p>, stopping when the next <br> chain (or the end of the parent)
+  * is reached.
+  *
+  * @return void
+  */
+ private function prepDocument()
+ {
+     /*
+      * getElementsByTagName() returns a live list that is re-indexed as soon
+      * as a <br> is removed or replaced, which makes a plain foreach skip
+      * elements. Iterate over a snapshot instead.
+      */
+     $brs = iterator_to_array($this->dom->getElementsByTagName('br'), false);
+
+     foreach ($brs as $br) {
+         /** @var \DOMNode $br */
+
+         // A <br> removed as part of an earlier chain is no longer in the tree.
+         if ($br->parentNode === null) {
+             continue;
+         }
+
+         $next = $br->nextSibling;
+
+         /*
+          * Whether 2 or more <br> elements have been found and replaced with a
+          * <p> block.
+          */
+         $replaced = false;
+
+         /*
+          * If we find a <br> chain, remove the <br>s until we hit another element
+          * or non-whitespace. This leaves behind the first <br> in the chain
+          * (which will be replaced with a <p> later).
+          */
+         while (($next = $this->nextElement($next)) && ($next->nodeName === 'br')) {
+             $replaced = true;
+             // Remember the follower before detaching, removal clears nextSibling.
+             $brSibling = $next->nextSibling;
+             $next->parentNode->removeChild($next);
+             $next = $brSibling;
+         }
+
+         /*
+          * If we removed a <br> chain, replace the remaining <br> with a <p>. Add
+          * all sibling nodes as children of the <p> until we hit another <br>
+          * chain.
+          */
+         if ($replaced) {
+             $p = $this->dom->createElement('p');
+             $br->parentNode->replaceChild($p, $br);
+
+             $next = $p->nextSibling;
+             while ($next) {
+                 // If we've hit another <br><br>, we're done adding children to this <p>.
+                 if ($next->nodeName === 'br') {
+                     /*
+                      * Look past the current <br>: starting the search at $next
+                      * itself would always find that same <br> and end the
+                      * paragraph at every single line break.
+                      */
+                     $nextElem = $this->nextElement($next->nextSibling);
+                     if ($nextElem && $nextElem->nodeName === 'br') {
+                         break;
+                     }
+                 }
+
+                 // Otherwise, make this node a child of the new <p>.
+                 $sibling = $next->nextSibling;
+                 $p->appendChild($next);
+                 $next = $sibling;
+             }
+         }
+     }
+ }
+
+ /**
+  * Returns the first meaningful node at or after the given node.
+  *
+  * Starting with $node itself, walks forward through the siblings and skips
+  * every node that is not an element and whose text content is empty or
+  * whitespace only. The walk stops at the first element, at the first
+  * non-element node that carries non-whitespace content, or at the end of
+  * the sibling list.
+  *
+  * @param \DOMNode|null $node Node to start from; null is returned unchanged.
+  *
+  * @return \DOMNode|null The node found, or null when the siblings run out.
+  */
+ private function nextElement($node)
+ {
+     $next = $node;
+     while ($next
+         && $next->nodeType !== XML_ELEMENT_NODE
+         && trim($next->textContent) === '') {
+         $next = $next->nextSibling;
+     }
+
+     return $next;
+ }
+
/**
* Tries to guess relevant info from metadata of the html.
*
@@ -366,8 +436,8 @@ class HTMLParser
} else {
// EXPERIMENTAL
foreach ($node->getChildren() as $child) {
+ /** @var Readability $child */
if ($child->isText()) {
- /** @var Readability $child */
// Check if there's actual content on the node.
if (trim($child->getTextContent())) {
$newNode = $node->createNode($child, 'p');