summaryrefslogtreecommitdiff
path: root/src
diff options
context:
space:
mode:
authorAndres Rey <[email protected]>2017-11-05 12:09:10 +0000
committerAndres Rey <[email protected]>2017-11-05 12:09:10 +0000
commit701748db8c5d373cc297257915d497d3ae61b2d1 (patch)
tree6e7f48cb959e88c494800d5c230ce73a2b5a90c9 /src
parent38449c7a88a67d5d5bd178c70c069ffb6a1554ad (diff)
Add isElementWithoutContent function
Diffstat (limited to 'src')
-rw-r--r--src/HTMLParser.php10
-rw-r--r--src/Readability.php33
2 files changed, 35 insertions, 8 deletions
diff --git a/src/HTMLParser.php b/src/HTMLParser.php
index 3b2b657..14dfa59 100644
--- a/src/HTMLParser.php
+++ b/src/HTMLParser.php
@@ -651,6 +651,16 @@ class HTMLParser
}
}
+ // Remove DIV, SECTION, HEADER, and H1-H6 nodes without any content (e.g. text, image, video, or iframe).
+ if (($node->tagNameEqualsTo('div') || $node->tagNameEqualsTo('section') || $node->tagNameEqualsTo('header') ||
+ $node->tagNameEqualsTo('h1') || $node->tagNameEqualsTo('h2') || $node->tagNameEqualsTo('h3') ||
+ $node->tagNameEqualsTo('h4') || $node->tagNameEqualsTo('h5') || $node->tagNameEqualsTo('h6') ) &&
+ $node->isElementWithoutContent()) {
+ $node = $node->removeAndGetNext($node);
+ continue;
+ }
+
+
if (in_array(strtolower($node->getTagName()), $this->defaultTagsToScore)) {
$elementsToScore[] = $node;
}
diff --git a/src/Readability.php b/src/Readability.php
index 4ee85e1..5e0d4b4 100644
--- a/src/Readability.php
+++ b/src/Readability.php
@@ -13,7 +13,7 @@ use League\HTMLToMarkdown\Element;
class Readability extends Element implements ReadabilityInterface
{
/**
- * @var \DOMNode
+ * @var \DOMNode|\DOMElement
*/
protected $node;
@@ -50,7 +50,7 @@ class Readability extends Element implements ReadabilityInterface
* An if must be added before calling the getAttribute function, because if we reach the DOMDocument
* by getting the node parents we'll get a undefined function fatal error
*/
- if (method_exists($node, 'getAttribute')) {
+ if (method_exists($node, 'getAttribute')) {
if ($node->hasAttribute('data-readability')) {
// Node was initialized previously. Restoring score and setting flag.
$this->initialized = true;
@@ -250,7 +250,7 @@ class Readability extends Element implements ReadabilityInterface
{
// Check if the setAttribute method exists, as some elements lack of it (and calling it anyway throws an exception)
if (method_exists($this->node, 'setAttribute')) {
- $this->contentScore = (float) $score;
+ $this->contentScore = (float)$score;
// Set score in an attribute of the tag to prevent losing it while creating new Readability objects.
$this->node->setAttribute('data-readability', $this->contentScore);
@@ -283,7 +283,7 @@ class Readability extends Element implements ReadabilityInterface
* element with the new tag name and importing it to the main DOMDocument.
*
* @param string $value
- * @param bool $importAttributes
+ * @param bool $importAttributes
*/
public function setNodeTag($value, $importAttributes = false)
{
@@ -340,7 +340,7 @@ class Readability extends Element implements ReadabilityInterface
* for parents.
*
* @param Readability $originalNode
- * @param bool $ignoreSelfAndKids
+ * @param bool $ignoreSelfAndKids
*
* @return Readability
*/
@@ -416,7 +416,7 @@ class Readability extends Element implements ReadabilityInterface
* Creates a new node based on the text content of the original node.
*
* @param Readability $originalNode
- * @param string $tagName
+ * @param string $tagName
*
* @return Readability
*/
@@ -463,8 +463,8 @@ class Readability extends Element implements ReadabilityInterface
* provided one.
*
* @param Readability $node
- * @param string $tagName
- * @param int $maxDepth
+ * @param string $tagName
+ * @param int $maxDepth
*
* @return bool
*/
@@ -486,6 +486,8 @@ class Readability extends Element implements ReadabilityInterface
}
/**
+ * Returns the children of the current node
+ *
* @param bool $filterEmptyDOMText Filter empty DOMText nodes?
*
* @return array
@@ -504,4 +506,19 @@ class Readability extends Element implements ReadabilityInterface
return $ret;
}
+
+
+    /**
+     * Determines whether the wrapped node is an element without meaningful content: it has no text
+     * other than whitespace, and no children other than BR/HR dividing lines.
+     *
+     * DOMDocument exposes text as child nodes (DOMText), so whitespace-only text children are left
+     * out of the child count. Otherwise markup such as "<div> <br> </div>" would be reported as
+     * having content because of the whitespace nodes surrounding the BR.
+     *
+     * @return bool True when the node is a DOMElement holding only whitespace and/or BR/HR elements.
+     */
+    public function isElementWithoutContent()
+    {
+        if (!($this->node instanceof \DOMElement) || mb_strlen(trim($this->node->textContent)) !== 0) {
+            return false;
+        }
+
+        // textContent is empty after trimming, so every DOMText child here is whitespace-only.
+        $nonTextChildren = 0;
+        foreach ($this->node->childNodes as $child) {
+            if (!($child instanceof \DOMText)) {
+                $nonTextChildren++;
+            }
+        }
+
+        // getElementsByTagName counts matches at any depth, not only direct children.
+        $dividers = $this->node->getElementsByTagName('br')->length
+            + $this->node->getElementsByTagName('hr')->length;
+
+        return $nonTextChildren === $dividers;
+    }
}