summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author: Andres Rey <[email protected]> 2018-09-01 20:50:25 +0100
committer: Andres Rey <[email protected]> 2018-09-01 20:50:25 +0100
commit: d8471fc68e363ac884591cf9ab9e334773f97cae (patch)
tree: 48c83d9c520d035455e4dc5384e2e0b4955db2ae
parent: e1247d6d044d02605a3b306187f44fe9795db908 (diff)
Import the isPhrasingContent function. The recursive loop there should be checked to confirm it actually does what it should, and whether there is a better way to optimize it.
-rw-r--r-- src/Nodes/NodeTrait.php 32
-rw-r--r-- src/Readability.php 4
2 files changed, 36 insertions, 0 deletions
diff --git a/src/Nodes/NodeTrait.php b/src/Nodes/NodeTrait.php
index 13611c9..9f24abc 100644
--- a/src/Nodes/NodeTrait.php
+++ b/src/Nodes/NodeTrait.php
@@ -51,6 +51,21 @@ trait NodeTrait
];
/**
+ * Lowercase tag names of phrasing-content elements; isPhrasingContent() checks a node's tagName against this list.
+ * The commented out elements also qualify as phrasing content but tend to be removed by readability when put into paragraphs, so we ignore them here.
+ *
+ * @var string[]
+ */
+ private $phrasing_elems = [
+ // Deliberately excluded: 'canvas', 'iframe', 'svg', 'video',
+ 'abbr', 'audio', 'b', 'bdo', 'br', 'button', 'cite', 'code', 'data',
+ 'datalist', 'dfn', 'em', 'embed', 'i', 'img', 'input', 'kbd', 'label',
+ 'mark', 'math', 'meter', 'noscript', 'object', 'output', 'progress', 'q',
+ 'ruby', 'samp', 'script', 'select', 'small', 'span', 'strong', 'sub',
+ 'sup', 'textarea', 'time', 'var', 'wbr'
+ ];
+
+ /**
* initialized getter.
*
* @return bool
@@ -431,4 +446,21 @@ trait NodeTrait
);
}
+
+ /**
+ * Determine if a node qualifies as phrasing content: a text node, an element listed in
+ * $phrasing_elems, or an a/del/ins element whose child nodes are all phrasing content.
+ * https://developer.mozilla.org/en-US/docs/Web/Guide/HTML/Content_categories#Phrasing_content
+ * @return bool True when this node is phrasing content, false otherwise.
+ */
+ public function isPhrasingContent()
+ {
+ return $this->nodeType === XML_TEXT_NODE || in_array($this->tagName, $this->phrasing_elems, true) ||
+ (!is_null($this->childNodes) &&
+ ($this->tagName === 'a' || $this->tagName === 'del' || $this->tagName === 'ins') &&
+ array_reduce(iterator_to_array($this->childNodes), function ($carry, $node) {
+ return $carry && $node->isPhrasingContent(); // Short-circuits once any child fails.
+ }, true) // Seeded with true: every child must qualify, and a node without children passes.
+ );
+ }
}
diff --git a/src/Readability.php b/src/Readability.php
index 3f4d807..e0c9abf 100644
--- a/src/Readability.php
+++ b/src/Readability.php
@@ -831,6 +831,10 @@ class Readability
}
}
+ if (!$next->isPhrasingContent()) {
+ break; // Only phrasing content may be moved into the new <p>; stop at the first node that is not.
+ }
+
$this->logger->debug('[PrepDocument] Replacing BR with a P node...');
// Otherwise, make this node a child of the new <p>.