diff options
-rw-r--r-- | src/HTMLParser.php | 18 | ||||
-rw-r--r-- | src/Readability.php | 30 |
2 files changed, 43 insertions, 5 deletions
diff --git a/src/HTMLParser.php b/src/HTMLParser.php index 21bb88f..5a684a5 100644 --- a/src/HTMLParser.php +++ b/src/HTMLParser.php @@ -438,7 +438,8 @@ class HTMLParser $append = false; // TODO Check if this comparison working as expected - if ($sibling === $topCandidate) { + // On the original js project it was a simple $sibling == $topCandidate comparison. + if ($this->compareNodes($sibling, $topCandidate)) { $append = true; } else { $contentBonus = 0; @@ -473,7 +474,7 @@ class HTMLParser // $sibling->setNodeName('div'); } - $import = $articleContent->importNode($sibling->getDOMNode()); + $import = $articleContent->importNode($sibling->getDOMNode(), true); $articleContent->appendChild($import); } } @@ -504,4 +505,17 @@ class HTMLParser } return false; } + + private function compareNodes($node1, $node2) + { + if ($node1->getTagName() !== $node2->getTagName()) { + return false; + } + + if ($node1->getTextContent() !== $node2->getTextContent()) { + return false; + } + + return true; + } } diff --git a/src/Readability.php b/src/Readability.php index a9c1592..39e1a28 100644 --- a/src/Readability.php +++ b/src/Readability.php @@ -38,6 +38,22 @@ class Readability extends Element implements ReadabilityInterface public function __construct(\DOMNode $node) { parent::__construct($node); + + if (get_class($node) !== 'DOMText') { + /* + * Restore the score if the object has been already scored. + * + * And it must be added before calling the getAttribute function, because if we reach the DOMDocument + * by getting the node parents we'll get an undefined function fatal error + */ + $score = 0; + + if (get_class($node) !== 'DOMDocument') { + $score = $node->getAttribute('readability'); + } + + $this->setContentScore(($score) ? $score : 0); + } } /** @@ -240,10 +256,18 @@ class Readability extends Element implements ReadabilityInterface */ public function setContentScore($score) { - // To prevent the -0 value - $this->contentScore = ($score === (double)-0) ?
0 : $score; + if (get_class($this->node) !== 'DOMDocument') { - return $this->contentScore; + // To prevent the -0 value + $this->contentScore = ($score === (double)-0) ? 0 : $score; + + // Set score in an attribute of the tag to prevent losing it while creating new Readability objects. + $this->node->setAttribute('readability', $this->contentScore); + + return $this->contentScore; + } + + return 0; } /** |