summaryrefslogtreecommitdiff
path: root/src/HTMLParser.php
diff options
context:
space:
mode:
authorAndres Rey <[email protected]>2016-11-02 09:55:00 +0000
committerAndres Rey <[email protected]>2016-11-02 09:55:00 +0000
commit9238420ea7bfb6975610739b3685005b9ac8e5e6 (patch)
tree5bd66e3e94b61e141b7089b6cf6d5560b139c30d /src/HTMLParser.php
parentf46b43d0b5351af668c1c7229a0cddaef3ba5582 (diff)
Added removeAndGetNext but needs some work. Also articleByLine
Diffstat (limited to 'src/HTMLParser.php')
-rw-r--r--src/HTMLParser.php93
1 files changed, 66 insertions, 27 deletions
diff --git a/src/HTMLParser.php b/src/HTMLParser.php
index 75e7c02..2bfb671 100644
--- a/src/HTMLParser.php
+++ b/src/HTMLParser.php
@@ -68,6 +68,7 @@ class HTMLParser
{
$defaults = array(
'maxTopCandidates' => 5, // Max amount of top level candidates
+ 'articleByLine' => null,
);
$this->environment = Environment::createDefaultEnvironment($defaults);
@@ -101,7 +102,10 @@ class HTMLParser
throw new \InvalidArgumentException('Invalid HTML was provided');
}
- $root = new Readability($root);
+ // TODO: Check if this is correct. Originally the body was sent as root but this caused problems because
+ // the script wasn't able to find nextSiblings to scan the dom tree. Now we are sending the first child.
+ // Is this correct?
+ $root = new Readability($root->firstChild);
$this->getNodes($root);
@@ -225,38 +229,46 @@ class HTMLParser
/**
* Gets nodes from the root element.
*
- * @param $node ReadabilityInterface
+ * @param $node Readability
*/
- private function getNodes(ReadabilityInterface $node)
+ private function getNodes(Readability $node)
{
- $matchString = $node->getAttribute('class') . ' ' . $node->getAttribute('id');
-
- // Avoid elements that are unlikely to have any useful information.
- if (
- preg_match($this->regexps['unlikelyCandidates'], $matchString) &&
- !preg_match($this->regexps['okMaybeItsACandidate'], $matchString) &&
- !$node->tagNameEqualsTo('body') &&
- !$node->tagNameEqualsTo('a')
- ) {
- return;
- }
+ while ($node) {
+
+ $matchString = $node->getAttribute('class') . ' ' . $node->getAttribute('id');
- // Loop over the element if it has children
- if ($node->hasChildren()) {
- foreach ($node->getChildren() as $child) {
- $this->getNodes($child);
+ // Check to see if this node is a byline, and remove it if it is.
+ if ($this->checkByline($node, $matchString)) {
+ $node = $node->removeAndGetNext($node);
+ continue;
}
- }
- // Check for nodes that have only on P node as a child and convert them to a single P node
- if ($node->hasSinglePNode()) {
- $pNode = $node->getChildren();
- $node = $pNode[0];
- }
- // If there's any info on the node, add it to the elements to score in the next step.
- if (trim($node->getValue())) {
- $this->elementsToScore[] = $node;
+ // Avoid elements that are unlikely to have any useful information.
+ if (
+ preg_match($this->regexps['unlikelyCandidates'], $matchString) &&
+ !preg_match($this->regexps['okMaybeItsACandidate'], $matchString) &&
+ !$node->tagNameEqualsTo('body') &&
+ !$node->tagNameEqualsTo('a')
+ ) {
+ $node = $node->removeAndGetNext($node);
+ continue;
+ }
+
+
+ // Check for nodes that have only on P node as a child and convert them to a single P node
+ if ($node->hasSinglePNode()) {
+ $pNode = $node->getChildren();
+ $node = $pNode[0];
+ }
+
+ // If there's any info on the node, add it to the elements to score in the next step.
+ if (trim($node->getValue())) {
+ $this->elementsToScore[] = $node;
+ }
+
+ // TODO Unfuck this:
+ $node = $node->removeAndGetNext($node);
}
}
@@ -370,6 +382,8 @@ class HTMLParser
* tree.
*/
+ // TODO, while calling getParent, the new object should carry its own score.
+ // Should be calculated when gets created or we must nest all Readability objects to carry their own score?
$parentOfTopCandidate = $topCandidate->getParent();
$lastScore = $topCandidate->getContentScore();
@@ -451,4 +465,29 @@ class HTMLParser
}
$test = 1;
}
+
+ private function checkByline($node, $matchString)
+ {
+ if (!$this->getConfig()->getOption('articleByLine')) {
+ return false;
+ }
+
+ $rel = $node->getAttribute('rel');
+
+ if ($rel === 'author' || preg_match($this->regexps['byline'], $matchString) && $this->isValidByline($node->getTextContent())) {
+ $this->metadata['byline'] = trim($node->getTextContent());
+ return true;
+ }
+
+ return false;
+ }
+
+ private function isValidByline($text)
+ {
+ if (gettype($text) == 'string') {
+ $byline = trim($text);
+ return (strlen($byline) > 0) && (strlen($text) < 100);
+ }
+ return false;
+ }
}