From d8471fc68e363ac884591cf9ab9e334773f97cae Mon Sep 17 00:00:00 2001
From: Andres Rey
Date: Sat, 1 Sep 2018 20:50:25 +0100
Subject: Import the isPhrasingContent function. Might want to check the recursive loop there if it's actually doing what it should and if there's a better way to optimize it

---
 src/Nodes/NodeTrait.php | 52 ++++++++++++++++++++++++++++++++++++++++++++++++++++
 src/Readability.php     |  4 ++++
 2 files changed, 56 insertions(+)

diff --git a/src/Nodes/NodeTrait.php b/src/Nodes/NodeTrait.php
index 13611c9..9f24abc 100644
--- a/src/Nodes/NodeTrait.php
+++ b/src/Nodes/NodeTrait.php
@@ -50,6 +50,21 @@ trait NodeTrait
         'select',
     ];
 
+    /**
+     * The commented out elements qualify as phrasing content but tend to be
+     * removed by readability when put into paragraphs, so we ignore them here.
+     *
+     * @var array
+     */
+    private $phrasing_elems = [
+        // 'CANVAS', 'IFRAME', 'SVG', 'VIDEO',
+        'abbr', 'audio', 'b', 'bdo', 'br', 'button', 'cite', 'code', 'data',
+        'datalist', 'dfn', 'em', 'embed', 'i', 'img', 'input', 'kbd', 'label',
+        'mark', 'math', 'meter', 'noscript', 'object', 'output', 'progress', 'q',
+        'ruby', 'samp', 'script', 'select', 'small', 'span', 'strong', 'sub',
+        'sup', 'textarea', 'time', 'var', 'wbr'
+    ];
+
     /**
      * initialized getter.
      *
@@ -431,4 +446,41 @@ trait NodeTrait
 
             );
     }
+
+    /**
+     * Determine if a node qualifies as phrasing content.
+     * https://developer.mozilla.org/en-US/docs/Web/Guide/HTML/Content_categories#Phrasing_content
+     *
+     * A node qualifies when it is a text node, when its tag is listed in
+     * $phrasing_elems, or when it is an <a>, <del> or <ins> element whose
+     * children all qualify.
+     *
+     * @return bool
+     */
+    public function isPhrasingContent()
+    {
+        if ($this->nodeType === XML_TEXT_NODE) {
+            return true;
+        }
+
+        // nodeName is defined on every DOMNode, while tagName only exists on elements.
+        $name = $this->nodeName;
+
+        if (in_array($name, $this->phrasing_elems, true)) {
+            return true;
+        }
+
+        if (!in_array($name, ['a', 'del', 'ins'], true) || is_null($this->childNodes)) {
+            return false;
+        }
+
+        // These wrappers are phrasing content only when every child is; stop at the first one that is not.
+        foreach ($this->childNodes as $child) {
+            if (!$child->isPhrasingContent()) {
+                return false;
+            }
+        }
+
+        return true;
+    }
 }
diff --git a/src/Readability.php b/src/Readability.php
index 3f4d807..e0c9abf 100644
--- a/src/Readability.php
+++ b/src/Readability.php
@@ -831,6 +831,10 @@ class Readability
                             }
                         }
 
+                        if (!$next->isPhrasingContent()) {
+                            break;
+                        }
+
                         $this->logger->debug('[PrepDocument] Replacing BR with a P node...');
 
                         // Otherwise, make this node a child of the new <p>.
-- 
cgit v1.2.3