summaryrefslogtreecommitdiff
path: root/src
diff options
context:
space:
mode:
Diffstat (limited to 'src')
-rw-r--r--src/HTMLParser.php293
-rw-r--r--src/Readability.php36
2 files changed, 291 insertions, 38 deletions
diff --git a/src/HTMLParser.php b/src/HTMLParser.php
index 6f76a25..27dd8e5 100644
--- a/src/HTMLParser.php
+++ b/src/HTMLParser.php
@@ -17,6 +17,7 @@ class HTMLParser
private $dom = null;
/**
+ * TODO Make this an object? Instead of a dumb array
* @var array
*/
private $metadata = [];
@@ -25,7 +26,7 @@ class HTMLParser
* @var array
*/
private $regexps = [
- 'unlikelyCandidates' => '/banner|combx|comment|community|disqus|extra|foot|header|menu|modal|related|remark|rss|share|shoutbox|sidebar|skyscraper|sponsor|ad-break|agegate|pagination|pager|popup/i',
+ 'unlikelyCandidates' => '/banner|breadcrumbs|combx|comment|community|cover-wrap|disqus|extra|foot|header|legends|menu|modal|related|remark|replies|rss|shoutbox|sidebar|skyscraper|social|sponsor|supplemental|ad-break|agegate|pagination|pager|popup|yom-remote/i',
'okMaybeItsACandidate' => '/and|article|body|column|main|shadow/i',
'extraneous' => '/print|archive|comment|discuss|e[\-]?mail|share|reply|all|login|sign|single|utility/i',
'byline' => '/byline|author|dateline|writtenby|p-author/i',
@@ -184,6 +185,7 @@ class HTMLParser
'image' => isset($this->metadata['image']) ? $this->metadata['image'] : null,
'article' => $result,
'html' => $result->C14N(),
+ 'dir' => isset($this->metadata['articleDir']) ? $this->metadata['articleDir'] : null,
];
}
@@ -536,16 +538,83 @@ class HTMLParser
*/
private function getTitle()
{
+ // Start from the metadata title, falling back to the document's <title> tag,
+ // then try to strip site names / section prefixes from it.
+ $originalTitle = null;
+
if (isset($this->metadata['title'])) {
- return $this->metadata['title'];
+ $originalTitle = $this->metadata['title'];
+ } else {
+ $titleTag = $this->dom->getElementsByTagName('title');
+ if ($titleTag->length > 0) {
+ $originalTitle = $titleTag->item(0)->nodeValue;
+ }
}
- $title = $this->dom->getElementsByTagName('title');
- if ($title->length > 0) {
- return $title->item(0)->nodeValue;
+ if ($originalTitle === null) {
+ return null;
+ }
+
+ $curTitle = $originalTitle;
+ $titleHadHierarchicalSeparators = false;
+
+ /*
+ * If there's a separator in the title, first remove the final part
+ *
+ * Sanity warning: if you eval this match in PHPStorm's "Evaluate expression" box, it will return false
+ * I can assure you it works properly if you let the code run.
+ *
+ * The patterns use the `u` modifier because the character class contains the multi-byte "»":
+ * in byte mode the class would match a single byte of it, so " » " could never match and a
+ * replacement could cut a UTF-8 character in half.
+ */
+ if (preg_match('/ [\|\-\\\\\/>»] /iu', $curTitle)) {
+ $titleHadHierarchicalSeparators = (bool)preg_match('/ [\\\\\/>»] /u', $curTitle);
+ $curTitle = preg_replace('/(.*)[\|\-\\\\\/>»] .*/iu', '$1', $originalTitle);
+
+ // If the resulting title is too short (fewer than 3 words), remove
+ // the first part instead:
+ if (count(preg_split('/\s+/', $curTitle)) < 3) {
+ $curTitle = preg_replace('/[^\|\-\\\\\/>»]*[\|\-\\\\\/>»](.*)/iu', '$1', $originalTitle);
+ }
+ } else if (strpos($curTitle, ': ') !== false) {
+ // Check if we have a heading containing this exact string, so we
+ // could assume it's the full title.
+ $match = false;
+ for ($i = 1; $i <= 2; $i++) {
+ foreach ($this->dom->getElementsByTagName('h' . $i) as $hTag) {
+ if ($hTag->nodeValue === $curTitle) {
+ $match = true;
+ }
+ }
+ }
+
+ // If we don't, let's extract the title out of the original title string.
+ if (!$match) {
+ $curTitle = substr($originalTitle, strrpos($originalTitle, ':') + 1);
+
+ // If the title is now too short, try the first colon instead:
+ if (count(preg_split('/\s+/', $curTitle)) < 3) {
+ $curTitle = substr($originalTitle, strpos($originalTitle, ':') + 1);
+ }
+ }
+ } else if (mb_strlen($curTitle) > 150 || mb_strlen($curTitle) < 15) {
+ // Suspiciously long or short title: prefer the page's only <h1>, if there is exactly one.
+ $hOnes = $this->dom->getElementsByTagName('h1');
+
+ if ($hOnes->length === 1) {
+ $curTitle = $hOnes->item(0)->nodeValue;
+ }
+ }
+
+ $curTitle = trim($curTitle);
+
+ /*
+ * If we now have 4 words or fewer as our title, and either no
+ * 'hierarchical' separators (\, /, > or ») were found in the original
+ * title or we decreased the number of words by more than 1 word, use
+ * the original title.
+ */
+ $curTitleWordCount = count(preg_split('/\s+/', $curTitle));
+
+ // The word count of the separator-free original must be count()ed before subtracting 1;
+ // it is only evaluated when hierarchical separators were found (short-circuit).
+ if ($curTitleWordCount <= 4 &&
+ (!$titleHadHierarchicalSeparators ||
+ $curTitleWordCount !== count(preg_split('/\s+/', preg_replace('/[\|\-\\\\\/>»]+/u', '', $originalTitle))) - 1)) {
+ $curTitle = $originalTitle;
}
- return null;
+ return $curTitle;
}
/**
@@ -589,6 +658,16 @@ class HTMLParser
}
}
+ // Remove DIV, SECTION, HEADER, and H1-H6 nodes without any content (e.g. text, image, video, or iframe).
+ if (($node->tagNameEqualsTo('div') || $node->tagNameEqualsTo('section') || $node->tagNameEqualsTo('header') ||
+ $node->tagNameEqualsTo('h1') || $node->tagNameEqualsTo('h2') || $node->tagNameEqualsTo('h3') ||
+ $node->tagNameEqualsTo('h4') || $node->tagNameEqualsTo('h5') || $node->tagNameEqualsTo('h6')) &&
+ $node->isElementWithoutContent()) {
+ $node = $node->removeAndGetNext($node);
+ continue;
+ }
+
+
if (in_array(strtolower($node->getTagName()), $this->defaultTagsToScore)) {
$elementsToScore[] = $node;
}
@@ -612,12 +691,9 @@ class HTMLParser
// EXPERIMENTAL
foreach ($node->getChildren() as $child) {
/** @var Readability $child */
- if ($child->isText()) {
- // Check if there's actual content on the node.
- if (trim($child->getTextContent())) {
- $newNode = $node->createNode($child, 'p');
- $child->replaceChild($newNode);
- }
+ if ($child->isText() && mb_strlen(trim($child->getTextContent())) > 0) {
+ $newNode = $node->createNode($child, 'p');
+ $child->replaceChild($newNode);
}
}
}
@@ -737,6 +813,7 @@ class HTMLParser
$topCandidate = isset($topCandidates[0]) ? $topCandidates[0] : null;
$neededToCreateTopCandidate = false;
+ $parentOfTopCandidate = null;
/*
* If we still have no top candidate, just use the body as a last resort.
@@ -745,8 +822,6 @@ class HTMLParser
if ($topCandidate === null || $topCandidate->tagNameEqualsTo('body')) {
// Move all of the page's children into topCandidate
- $neededToCreateTopCandidate = true;
-
$topCandidate = new DOMDocument('1.0', 'utf-8');
$topCandidate->encoding = 'UTF-8';
$topCandidate->appendChild($topCandidate->createElement('div', ''));
@@ -765,6 +840,31 @@ class HTMLParser
//TODO on the original code, $topCandidate is added to the page variable, which holds the whole HTML
// Should be done this here also? (line 823 in readability.js)
} elseif ($topCandidate) {
+ // Find a better top candidate node if it contains (at least three) nodes which belong to the `topCandidates` array
+ // and whose scores are quite close to the current `topCandidate` node's score.
+ $alternativeCandidateAncestors = [];
+ for ($i = 0; $i < count($topCandidates) - 1; $i++) {
+ if ($topCandidates[$i]->getContentScore() / $topCandidate->getContentScore() >= 0.75) {
+ $alternativeCandidateAncestors[$i] = $topCandidates[$i]->getNodeAncestors(5);
+ }
+ }
+
+ $MINIMUM_TOPCANDIDATES = 3;
+ if (count($alternativeCandidateAncestors) >= $MINIMUM_TOPCANDIDATES) {
+ $parentOfTopCandidate = $topCandidate->getParent();
+ while (!$parentOfTopCandidate->tagNameEqualsTo('body')) {
+ $listsContainingThisAncestor = 0;
+ for ($ancestorIndex = 0; $ancestorIndex < count($alternativeCandidateAncestors) && $listsContainingThisAncestor < $MINIMUM_TOPCANDIDATES; $ancestorIndex++) {
+ $listsContainingThisAncestor += (int)in_array($parentOfTopCandidate, $alternativeCandidateAncestors[$ancestorIndex]);
+ }
+ if ($listsContainingThisAncestor >= $MINIMUM_TOPCANDIDATES) {
+ $topCandidate = $parentOfTopCandidate;
+ break;
+ }
+ $parentOfTopCandidate = $parentOfTopCandidate->getParent();
+ }
+ }
+
/*
* Because of our bonus system, parents of candidates might have scores
* themselves. They get half of the node. There won't be nodes with higher
@@ -781,8 +881,8 @@ class HTMLParser
// The scores shouldn't get too low.
$scoreThreshold = $lastScore / 3;
- while ($parentOfTopCandidate) {
- /* @var Readability $parentOfTopCandidate */
+ /* @var Readability $parentOfTopCandidate */
+ while (!$parentOfTopCandidate->tagNameEqualsTo('body')) {
$parentScore = $parentOfTopCandidate->getContentScore();
if ($parentScore < $scoreThreshold) {
break;
@@ -796,6 +896,14 @@ class HTMLParser
$lastScore = $parentOfTopCandidate->getContentScore();
$parentOfTopCandidate = $parentOfTopCandidate->getParent();
}
+
+ // If the top candidate is the only child, use parent instead. This will help sibling
+ // joining logic when adjacent content is actually located in parent's sibling node.
+ $parentOfTopCandidate = $topCandidate->getParent();
+ while (!$parentOfTopCandidate->tagNameEqualsTo('body') && count($parentOfTopCandidate->getChildren()) === 1) {
+ $topCandidate = $parentOfTopCandidate;
+ $parentOfTopCandidate = $topCandidate->getParent();
+ }
}
/*
@@ -808,7 +916,9 @@ class HTMLParser
$articleContent->createElement('div');
$siblingScoreThreshold = max(10, $topCandidate->getContentScore() * 0.2);
- $siblings = $topCandidate->getParent()->getChildren();
+ // Keep potential top candidate's parent node to try to get text direction of it later.
+ $parentOfTopCandidate = $topCandidate->getParent();
+ $siblings = $parentOfTopCandidate->getChildren();
$hasContent = false;
@@ -866,6 +976,16 @@ class HTMLParser
$articleContent = $this->prepArticle($articleContent);
if ($hasContent) {
+ // Find out text direction from ancestors of final top candidate.
+ $ancestors = array_merge([$parentOfTopCandidate, $topCandidate], $parentOfTopCandidate->getNodeAncestors());
+ foreach ($ancestors as $ancestor) {
+ $articleDir = $ancestor->getAttribute('dir');
+ if ($articleDir) {
+ $this->metadata['articleDir'] = $articleDir;
+ break;
+ }
+ }
+
return $articleContent;
} else {
return false;
@@ -881,8 +1001,14 @@ class HTMLParser
*/
public function prepArticle(DOMDocument $article)
{
+ // Check for data tables before we continue, to avoid removing items in
+ // those tables, which will often be isolated even though they're
+ // visually linked to other content-ful elements (text, images, etc.).
+ $this->_markDataTables($article);
+
// Clean out junk from the article content
$this->_cleanConditionally($article, 'form');
+ $this->_cleanConditionally($article, 'fieldset');
$this->_clean($article, 'object');
$this->_clean($article, 'embed');
$this->_clean($article, 'h1');
@@ -891,13 +1017,29 @@ class HTMLParser
// Readability.js cleans styles on prepDocument but we do it here.
$this->_clean($article, 'style');
- // If there is only one h2, they are probably using it as a header
- // and not a subheader, so remove it since we already have a header.
- if ($article->getElementsByTagName('h2')->length === 1) {
- $this->_clean($article, 'h2');
+ /*
+ * If there is only one h2 and its text content substantially equals article title,
+ * they are probably using it as a header and not a subheader,
+ * so remove it since we already extract the title separately.
+ */
+ $h2 = $article->getElementsByTagName('h2');
+ if ($h2->length === 1) {
+ $lengthSimilarRate = (mb_strlen($h2->item(0)->textContent) - mb_strlen($this->metadata['title'])) / mb_strlen($this->metadata['title']);
+ if (abs($lengthSimilarRate) < 0.5 &&
+ ($lengthSimilarRate > 0 ?
+ strpos($h2->item(0)->textContent, $this->metadata['title']) !== false :
+ strpos($this->metadata['title'], $h2->item(0)->textContent) !== false
+ )
+ ) {
+ $this->_clean($article, 'h2');
+ }
}
$this->_clean($article, 'iframe');
+ $this->_clean($article, 'input');
+ $this->_clean($article, 'textarea');
+ $this->_clean($article, 'select');
+ $this->_clean($article, 'button');
$this->_cleanHeaders($article);
// Do these last as the previous stuff may have removed junk
@@ -924,6 +1066,98 @@ class HTMLParser
}
/**
+ * Look for 'data' (as opposed to 'layout') tables, for which we use
+ * similar checks as
+ * https://dxr.mozilla.org/mozilla-central/rev/71224049c0b52ab190564d3ea0eab089a159a4cf/accessible/html/HTMLTableAccessible.cpp#920
+ *
+ * TODO To be moved to Readability. WARNING: check if we actually keep the "readabilityDataTable" param and
+ * maybe switch to a readability data-tag?
+ *
+ * @param DOMDocument $article
+ *
+ * @return void
+ */
+ public function _markDataTables(DOMDocument $article)
+ {
+ // Flags every <table> in $article as a data table (true) or a layout table (false).
+ // NOTE(review): the flag is stored as a dynamic property on the DOMElement object;
+ // confirm the consumer reads it from the same object (see the TODO in the docblock).
+ $tables = $article->getElementsByTagName('table');
+ foreach ($tables as $table) {
+ /** @var \DOMElement $table */
+ // Explicit hints first: role="presentation" and datatable="0" both mean "layout".
+ $role = $table->getAttribute('role');
+ if ($role === 'presentation') {
+ $table->readabilityDataTable = false;
+ continue;
+ }
+ // Strict comparison: only the literal "0" opts out, not other numeric strings such as "00" or "0.0".
+ $datatable = $table->getAttribute('datatable');
+ if ($datatable === '0') {
+ $table->readabilityDataTable = false;
+ continue;
+ }
+ // A non-empty summary attribute marks a data table.
+ $summary = $table->getAttribute('summary');
+ if ($summary) {
+ $table->readabilityDataTable = true;
+ continue;
+ }
+
+ // So does a <caption> that has at least one child node.
+ $caption = $table->getElementsByTagName('caption');
+ if ($caption->length > 0 && $caption->item(0)->childNodes->length > 0) {
+ $table->readabilityDataTable = true;
+ continue;
+ }
+
+ // If the table has a descendant with any of these tags, consider a data table:
+ foreach (['col', 'colgroup', 'tfoot', 'thead', 'th'] as $dataTableDescendants) {
+ if ($table->getElementsByTagName($dataTableDescendants)->length > 0) {
+ $table->readabilityDataTable = true;
+ // Skip the remaining checks for this table (continues the outer loop).
+ continue 2;
+ }
+ }
+
+ // Nested tables indicate a layout table:
+ if ($table->getElementsByTagName('table')->length > 0) {
+ $table->readabilityDataTable = false;
+ continue;
+ }
+
+ // Many rows or many columns: treat as data.
+ $sizeInfo = $this->_getRowAndColumnCount($table);
+ if ($sizeInfo['rows'] >= 10 || $sizeInfo['columns'] > 4) {
+ $table->readabilityDataTable = true;
+ continue;
+ }
+ // Now just go by size entirely:
+ $table->readabilityDataTable = $sizeInfo['rows'] * $sizeInfo['columns'] > 10;
+ }
+ }
+
+ /**
+ * Return an array indicating how many rows and columns this table has.
+ * @param \DOMElement $table
+ *
+ * @return array
+ */
+ public function _getRowAndColumnCount(\DOMElement $table)
+ {
+ // 'rows' is the sum of the row spans; 'columns' is the widest row measured in column spans.
+ $rows = $columns = 0;
+ $trs = $table->getElementsByTagName('tr');
+ foreach ($trs as $tr) {
+ /** @var \DOMElement $tr */
+ // getAttribute() returns '' when the attribute is missing; a missing or zero span counts as 1.
+ // (PHP's `||` yields a boolean, so `$rowspan || 1` would always add exactly 1.)
+ $rowspan = (int)$tr->getAttribute('rowspan');
+ $rows += ($rowspan ?: 1);
+
+ // Now look for column-related info
+ $columnsInThisRow = 0;
+ $cells = $tr->getElementsByTagName('td');
+ foreach ($cells as $cell) {
+ /** @var \DOMElement $cell */
+ $colspan = (int)$cell->getAttribute('colspan');
+ $columnsInThisRow += ($colspan ?: 1);
+ }
+ $columns = max($columns, $columnsInThisRow);
+ }
+
+ return ['rows' => $rows, 'columns' => $columns];
+ }
+
+ /**
* TODO To be moved to Readability.
*
* @param DOMDocument $article
@@ -992,6 +1226,12 @@ class HTMLParser
$node = $DOMNodeList->item($length - 1 - $i);
$node = new Readability($node);
+
+ // First check if we're in a data table, in which case don't remove us.
+ if ($node->hasAncestorTag($node, 'table', -1) && isset($node->readabilityDataTable)) {
+ continue;
+ }
+
$weight = $node->getClassWeight();
if ($weight < 0) {
@@ -1025,11 +1265,10 @@ class HTMLParser
$contentLength = mb_strlen($node->getTextContent(true));
$haveToRemove =
- // Make an exception for elements with no p's and exactly 1 img.
- ($img > $p && $node->hasAncestorTag($node, 'figure')) ||
+ ($img > 1 && $p / $img < 0.5 && !$node->hasAncestorTag($node, 'figure')) ||
(!$isList && $li > $p) ||
($input > floor($p / 3)) ||
- (!$isList && $contentLength < 25 && ($img === 0 || $img > 2)) ||
+ (!$isList && $contentLength < 25 && ($img === 0 || $img > 2) && !$node->hasAncestorTag($node, 'figure')) ||
(!$isList && $weight < 25 && $linkDensity > 0.2) ||
($weight >= 25 && $linkDensity > 0.5) ||
(($embedCount === 1 && $contentLength < 75) || $embedCount > 1);
@@ -1047,8 +1286,8 @@ class HTMLParser
*
* TODO To be moved to Readability
*
- * @param Element
- * @param string tag to clean
+ * @param $article DOMDocument
+ * @param $tag string tag to clean
*
* @return void
**/
@@ -1126,7 +1365,7 @@ class HTMLParser
* Checks if the node is a byline.
*
* @param Readability $node
- * @param string $matchString
+ * @param string $matchString
*
* @return bool
*/
diff --git a/src/Readability.php b/src/Readability.php
index c55e0ad..b0a5830 100644
--- a/src/Readability.php
+++ b/src/Readability.php
@@ -13,7 +13,7 @@ use League\HTMLToMarkdown\Element;
class Readability extends Element implements ReadabilityInterface
{
/**
- * @var \DOMNode
+ * @var \DOMNode|\DOMElement
*/
protected $node;
@@ -50,9 +50,6 @@ class Readability extends Element implements ReadabilityInterface
* An if must be added before calling the getAttribute function, because if we reach the DOMDocument
* by getting the node parents we'll get a undefined function fatal error
*/
- $score = 0;
-
- // Check if the getAttribute method exists, as some elements lack of it (and calling it anyway throws an exception)
if (method_exists($node, 'getAttribute')) {
if ($node->hasAttribute('data-readability')) {
// Node was initialized previously. Restoring score and setting flag.
@@ -253,7 +250,7 @@ class Readability extends Element implements ReadabilityInterface
{
// Check if the setAttribute method exists, as some elements lack of it (and calling it anyway throws an exception)
if (method_exists($this->node, 'setAttribute')) {
- $this->contentScore = (float) $score;
+ $this->contentScore = (float)$score;
// Set score in an attribute of the tag to prevent losing it while creating new Readability objects.
$this->node->setAttribute('data-readability', $this->contentScore);
@@ -286,7 +283,7 @@ class Readability extends Element implements ReadabilityInterface
* element with the new tag name and importing it to the main DOMDocument.
*
* @param string $value
- * @param bool $importAttributes
+ * @param bool $importAttributes
*/
public function setNodeTag($value, $importAttributes = false)
{
@@ -343,7 +340,7 @@ class Readability extends Element implements ReadabilityInterface
* for parents.
*
* @param Readability $originalNode
- * @param bool $ignoreSelfAndKids
+ * @param bool $ignoreSelfAndKids
*
* @return Readability
*/
@@ -419,7 +416,7 @@ class Readability extends Element implements ReadabilityInterface
* Creates a new node based on the text content of the original node.
*
* @param Readability $originalNode
- * @param string $tagName
+ * @param string $tagName
*
* @return Readability
*/
@@ -466,8 +463,8 @@ class Readability extends Element implements ReadabilityInterface
* provided one.
*
* @param Readability $node
- * @param string $tagName
- * @param int $maxDepth
+ * @param string $tagName
+ * @param int $maxDepth
*
* @return bool
*/
@@ -475,7 +472,7 @@ class Readability extends Element implements ReadabilityInterface
{
$depth = 0;
while ($node->getParent()) {
- if ($depth > $maxDepth) {
+ if ($maxDepth > 0 && $depth > $maxDepth) {
return false;
}
if ($node->getParent()->tagNameEqualsTo($tagName)) {
@@ -489,6 +486,8 @@ class Readability extends Element implements ReadabilityInterface
}
/**
+ * Returns the children of the current node
+ *
* @param bool $filterEmptyDOMText Filter empty DOMText nodes?
*
* @return array
@@ -507,4 +506,19 @@ class Readability extends Element implements ReadabilityInterface
return $ret;
}
+
+
+ /**
+ * Determines if a node has no content or it is just a bunch of dividing lines and/or whitespace
+ *
+ * @return bool
+ */
+ public function isElementWithoutContent()
+ {
+ // Only elements can qualify, and any non-whitespace text counts as content.
+ if (!($this->node instanceof \DOMElement) || mb_strlen(trim($this->node->textContent)) !== 0) {
+ return false;
+ }
+
+ // Count element children only: whitespace text nodes and comments between <br>/<hr>
+ // tags must not make an otherwise empty element look like it has content.
+ $elementChildren = 0;
+ foreach ($this->node->childNodes as $child) {
+ if ($child instanceof \DOMElement) {
+ $elementChildren++;
+ }
+ }
+
+ $dividers = $this->node->getElementsByTagName('br')->length + $this->node->getElementsByTagName('hr')->length;
+
+ // Empty, or made up solely of dividing lines.
+ return $elementChildren === 0 || $elementChildren === $dividers;
+ }
}