diff options
Diffstat (limited to 'src')
-rw-r--r-- | src/HTMLParser.php | 293 | ||||
-rw-r--r-- | src/Readability.php | 36 |
2 files changed, 291 insertions, 38 deletions
diff --git a/src/HTMLParser.php b/src/HTMLParser.php index 6f76a25..27dd8e5 100644 --- a/src/HTMLParser.php +++ b/src/HTMLParser.php @@ -17,6 +17,7 @@ class HTMLParser private $dom = null; /** + * TODO Make this an object? Instead of a dumb array * @var array */ private $metadata = []; @@ -25,7 +26,7 @@ class HTMLParser * @var array */ private $regexps = [ - 'unlikelyCandidates' => '/banner|combx|comment|community|disqus|extra|foot|header|menu|modal|related|remark|rss|share|shoutbox|sidebar|skyscraper|sponsor|ad-break|agegate|pagination|pager|popup/i', + 'unlikelyCandidates' => '/banner|breadcrumbs|combx|comment|community|cover-wrap|disqus|extra|foot|header|legends|menu|modal|related|remark|replies|rss|shoutbox|sidebar|skyscraper|social|sponsor|supplemental|ad-break|agegate|pagination|pager|popup|yom-remote/i', 'okMaybeItsACandidate' => '/and|article|body|column|main|shadow/i', 'extraneous' => '/print|archive|comment|discuss|e[\-]?mail|share|reply|all|login|sign|single|utility/i', 'byline' => '/byline|author|dateline|writtenby|p-author/i', @@ -184,6 +185,7 @@ class HTMLParser 'image' => isset($this->metadata['image']) ? $this->metadata['image'] : null, 'article' => $result, 'html' => $result->C14N(), + 'dir' => isset($this->metadata['articleDir']) ? $this->metadata['articleDir'] : null, ]; } @@ -536,16 +538,83 @@ class HTMLParser */ private function getTitle() { + $originalTitle = null; + if (isset($this->metadata['title'])) { - return $this->metadata['title']; + $originalTitle = $this->metadata['title']; + } else { + $titleTag = $this->dom->getElementsByTagName('title'); + if ($titleTag->length > 0) { + $originalTitle = $titleTag->item(0)->nodeValue; + } } - $title = $this->dom->getElementsByTagName('title'); - if ($title->length > 0) { - return $title->item(0)->nodeValue; + if ($originalTitle === null) { + return null; + } + + $curTitle = $originalTitle; + $titleHadHierarchicalSeparators = false; + + /* + * If there's a separator in the title, first remove the final part + * + * Sanity warning: if you eval this match in PHPStorm's "Evaluate expression" box, it will return false + * I can assure you it works properly if you let the code run. + */ + if (preg_match('/ [\|\-\\\\\/>»] /i', $curTitle)) { + $titleHadHierarchicalSeparators = (bool)preg_match('/ [\\\\\/>»] /', $curTitle); + $curTitle = preg_replace('/(.*)[\|\-\\\\\/>»] .*/i', '$1', $originalTitle); + + // If the resulting title is too short (3 words or fewer), remove + // the first part instead: + if (count(preg_split('/\s+/', $curTitle)) < 3) { + $curTitle = preg_replace('/[^\|\-\\\\\/>»]*[\|\-\\\\\/>»](.*)/i', '$1', $originalTitle); + } + } else if (strpos($curTitle, ': ') !== false) { + // Check if we have an heading containing this exact string, so we + // could assume it's the full title. + $match = false; + for ($i = 1; $i <= 2; $i++) { + foreach ($this->dom->getElementsByTagName('h' . $i) as $hTag) { + if ($hTag->nodeValue === $curTitle) { + $match = true; + } + } + } + + // If we don't, let's extract the title out of the original title string. + if (!$match) { + $curTitle = substr($originalTitle, strrpos($originalTitle, ':') + 1); + + // If the title is now too short, try the first colon instead: + if (count(preg_split('/\s+/', $curTitle)) < 3) + $curTitle = substr($originalTitle, strpos($originalTitle, ':') + 1); + } + } else if (mb_strlen($curTitle) > 150 || mb_strlen($curTitle) < 15) { + $hOnes = $this->dom->getElementsByTagName('h1'); + + if ($hOnes->length === 1) { + $curTitle = $hOnes->item(0)->nodeValue; + } + } + + $curTitle = trim($curTitle); + + /* + * If we now have 4 words or fewer as our title, and either no + * 'hierarchical' separators (\, /, > or ») were found in the original + * title or we decreased the number of words by more than 1 word, use + * the original title. + */ + $curTitleWordCount = count(preg_split('/\s+/', $curTitle)); + + if ($curTitleWordCount <= 4 && + (!$titleHadHierarchicalSeparators || $curTitleWordCount !== preg_split('/\s+/', preg_replace('/[\|\-\\\\\/>»]+/', '', $originalTitle)) - 1)) { + $curTitle = $originalTitle; } - return null; + return $curTitle; } /** @@ -589,6 +658,16 @@ class HTMLParser } } + // Remove DIV, SECTION, and HEADER nodes without any content(e.g. text, image, video, or iframe). + if (($node->tagNameEqualsTo('div') || $node->tagNameEqualsTo('section') || $node->tagNameEqualsTo('header') || + $node->tagNameEqualsTo('h1') || $node->tagNameEqualsTo('h2') || $node->tagNameEqualsTo('h3') || + $node->tagNameEqualsTo('h4') || $node->tagNameEqualsTo('h5') || $node->tagNameEqualsTo('h6')) && + $node->isElementWithoutContent()) { + $node = $node->removeAndGetNext($node); + continue; + } + + if (in_array(strtolower($node->getTagName()), $this->defaultTagsToScore)) { $elementsToScore[] = $node; } @@ -612,12 +691,9 @@ class HTMLParser // EXPERIMENTAL foreach ($node->getChildren() as $child) { /** @var Readability $child */ - if ($child->isText()) { - // Check if there's actual content on the node. - if (trim($child->getTextContent())) { - $newNode = $node->createNode($child, 'p'); - $child->replaceChild($newNode); - } + if ($child->isText() && mb_strlen(trim($child->getTextContent())) > 0) { + $newNode = $node->createNode($child, 'p'); + $child->replaceChild($newNode); } } } @@ -737,6 +813,7 @@ class HTMLParser $topCandidate = isset($topCandidates[0]) ? $topCandidates[0] : null; $neededToCreateTopCandidate = false; + $parentOfTopCandidate = null; /* * If we still have no top candidate, just use the body as a last resort. @@ -745,8 +822,6 @@ class HTMLParser if ($topCandidate === null || $topCandidate->tagNameEqualsTo('body')) { // Move all of the page's children into topCandidate - $neededToCreateTopCandidate = true; - $topCandidate = new DOMDocument('1.0', 'utf-8'); $topCandidate->encoding = 'UTF-8'; $topCandidate->appendChild($topCandidate->createElement('div', '')); @@ -765,6 +840,31 @@ class HTMLParser //TODO on the original code, $topCandidate is added to the page variable, which holds the whole HTML // Should be done this here also? (line 823 in readability.js) } elseif ($topCandidate) { + // Find a better top candidate node if it contains (at least three) nodes which belong to `topCandidates` array + // and whose scores are quite closed with current `topCandidate` node. + $alternativeCandidateAncestors = []; + for ($i = 0; $i < count($topCandidates) - 1; $i++) { + if ($topCandidates[$i]->getContentScore() / $topCandidate->getContentScore() >= 0.75) { + $alternativeCandidateAncestors[$i] = $topCandidates[$i]->getNodeAncestors(5); + } + } + + $MINIMUM_TOPCANDIDATES = 3; + if (count($alternativeCandidateAncestors) >= $MINIMUM_TOPCANDIDATES) { + $parentOfTopCandidate = $topCandidate->getParent(); + while (!$parentOfTopCandidate->tagNameEqualsTo('body')) { + $listsContainingThisAncestor = 0; + for ($ancestorIndex = 0; $ancestorIndex < count($alternativeCandidateAncestors) && $listsContainingThisAncestor < $MINIMUM_TOPCANDIDATES; $ancestorIndex++) { + $listsContainingThisAncestor += (int)in_array($parentOfTopCandidate, $alternativeCandidateAncestors[$ancestorIndex]); + } + if ($listsContainingThisAncestor >= $MINIMUM_TOPCANDIDATES) { + $topCandidate = $parentOfTopCandidate; + break; + } + $parentOfTopCandidate = $parentOfTopCandidate->getParent(); + } + } + /* * Because of our bonus system, parents of candidates might have scores * themselves. They get half of the node. There won't be nodes with higher @@ -781,8 +881,8 @@ class HTMLParser // The scores shouldn't get too low. $scoreThreshold = $lastScore / 3; - while ($parentOfTopCandidate) { - /* @var Readability $parentOfTopCandidate */ + /* @var Readability $parentOfTopCandidate */ + while (!$parentOfTopCandidate->tagNameEqualsTo('body')) { $parentScore = $parentOfTopCandidate->getContentScore(); if ($parentScore < $scoreThreshold) { break; @@ -796,6 +896,14 @@ class HTMLParser $lastScore = $parentOfTopCandidate->getContentScore(); $parentOfTopCandidate = $parentOfTopCandidate->getParent(); } + + // If the top candidate is the only child, use parent instead. This will help sibling + // joining logic when adjacent content is actually located in parent's sibling node. + $parentOfTopCandidate = $topCandidate->getParent(); + while (!$parentOfTopCandidate->tagNameEqualsTo('body') && count($parentOfTopCandidate->getChildren()) === 1) { + $topCandidate = $parentOfTopCandidate; + $parentOfTopCandidate = $topCandidate->getParent(); + } } /* @@ -808,7 +916,9 @@ class HTMLParser $articleContent->createElement('div'); $siblingScoreThreshold = max(10, $topCandidate->getContentScore() * 0.2); - $siblings = $topCandidate->getParent()->getChildren(); + // Keep potential top candidate's parent node to try to get text direction of it later. + $parentOfTopCandidate = $topCandidate->getParent(); + $siblings = $parentOfTopCandidate->getChildren(); $hasContent = false; @@ -866,6 +976,16 @@ class HTMLParser $articleContent = $this->prepArticle($articleContent); if ($hasContent) { + // Find out text direction from ancestors of final top candidate. + $ancestors = array_merge([$parentOfTopCandidate, $topCandidate], $parentOfTopCandidate->getNodeAncestors()); + foreach ($ancestors as $ancestor) { + $articleDir = $ancestor->getAttribute('dir'); + if ($articleDir) { + $this->metadata['articleDir'] = $articleDir; + break; + } + } + return $articleContent; } else { return false; @@ -881,8 +1001,14 @@ class HTMLParser */ public function prepArticle(DOMDocument $article) { + // Check for data tables before we continue, to avoid removing items in + // those tables, which will often be isolated even though they're + // visually linked to other content-ful elements (text, images, etc.). + $this->_markDataTables($article); + // Clean out junk from the article content $this->_cleanConditionally($article, 'form'); + $this->_cleanConditionally($article, 'fieldset'); $this->_clean($article, 'object'); $this->_clean($article, 'embed'); $this->_clean($article, 'h1'); @@ -891,13 +1017,29 @@ class HTMLParser // Readability.js cleans styles on prepDocument but we do it here. $this->_clean($article, 'style'); - // If there is only one h2, they are probably using it as a header - // and not a subheader, so remove it since we already have a header. - if ($article->getElementsByTagName('h2')->length === 1) { - $this->_clean($article, 'h2'); + /* + * If there is only one h2 and its text content substantially equals article title, + * they are probably using it as a header and not a subheader, + * so remove it since we already extract the title separately. + */ + $h2 = $article->getElementsByTagName('h2'); + if ($h2->length === 1) { + $lengthSimilarRate = (mb_strlen($h2->item(0)->textContent) - mb_strlen($this->metadata['title'])) / mb_strlen($this->metadata['title']); + if (abs($lengthSimilarRate) < 0.5 && + ($lengthSimilarRate > 0 ? + strpos($h2->item(0)->textContent, $this->metadata['title']) !== false : + strpos($this->metadata['title'], $h2->item(0)->textContent) !== false + ) + ) { + $this->_clean($article, 'h2'); + } } $this->_clean($article, 'iframe'); + $this->_clean($article, 'input'); + $this->_clean($article, 'textarea'); + $this->_clean($article, 'select'); + $this->_clean($article, 'button'); $this->_cleanHeaders($article); // Do these last as the previous stuff may have removed junk @@ -924,6 +1066,98 @@ class HTMLParser } /** + * Look for 'data' (as opposed to 'layout') tables, for which we use + * similar checks as + * https://dxr.mozilla.org/mozilla-central/rev/71224049c0b52ab190564d3ea0eab089a159a4cf/accessible/html/HTMLTableAccessible.cpp#920 + * + * TODO To be moved to Readability. WARNING: check if we actually keep the "readabilityDataTable" param and + * maybe switch to a readability data-tag? + * + * @param DOMDocument $article + * + * @return void + */ + public function _markDataTables(DOMDocument $article) + { + $tables = $article->getElementsByTagName('table'); + foreach ($tables as $table) { + /** @var \DOMElement $table */ + $role = $table->getAttribute('role'); + if ($role === "presentation") { + $table->readabilityDataTable = false; + continue; + } + $datatable = $table->getAttribute('datatable'); + if ($datatable == '0') { + $table->readabilityDataTable = false; + continue; + } + $summary = $table->getAttribute('summary'); + if ($summary) { + $table->readabilityDataTable = true; + continue; + } + + $caption = $table->getElementsByTagName('caption'); + if ($caption->length > 0 && $caption->item(0)->childNodes->length > 0) { + $table->readabilityDataTable = true; + continue; + } + + // If the table has a descendant with any of these tags, consider a data table: + foreach (['col', 'colgroup', 'tfoot', 'thead', 'th'] as $dataTableDescendants) { + if ($table->getElementsByTagName($dataTableDescendants)->length > 0) { + $table->readabilityDataTable = true; + continue 2; + } + } + + // Nested tables indicate a layout table: + if ($table->getElementsByTagName('table')->length > 0) { + $table->readabilityDataTable = false; + continue; + } + + $sizeInfo = $this->_getRowAndColumnCount($table); + if ($sizeInfo['rows'] >= 10 || $sizeInfo['columns'] > 4) { + $table->readabilityDataTable = true; + continue; + } + // Now just go by size entirely: + $table->readabilityDataTable = $sizeInfo['rows'] * $sizeInfo['columns'] > 10; + } + } + + /** + * Return an array indicating how many rows and columns this table has. + * @param \DOMElement $table + * + * @return array + */ + public function _getRowAndColumnCount(\DOMElement $table) + { + $rows = $columns = 0; + $trs = $table->getElementsByTagName('tr'); + foreach ($trs as $tr) { + /** @var \DOMElement $tr */ + $rowspan = $tr->getAttribute('rowspan'); + $rows += ($rowspan || 1); + + // Now look for column-related info + $columnsInThisRow = 0; + $cells = $tr->getElementsByTagName('td'); + foreach ($cells as $cell) { + /** @var \DOMElement $cell */ + $colspan = $cell->getAttribute('colspan'); + $columnsInThisRow += ($colspan || 1); + } + $columns = max($columns, $columnsInThisRow); + } + + return ['rows' => $rows, 'columns' => $columns]; + } + + /** * TODO To be moved to Readability. * * @param DOMDocument $article @@ -992,6 +1226,12 @@ class HTMLParser $node = $DOMNodeList->item($length - 1 - $i); $node = new Readability($node); + + // First check if we're in a data table, in which case don't remove us. + if ($node->hasAncestorTag($node, 'table', -1) && isset($node->readabilityDataTable)) { + continue; + } + $weight = $node->getClassWeight(); if ($weight < 0) { @@ -1025,11 +1265,10 @@ class HTMLParser $contentLength = mb_strlen($node->getTextContent(true)); $haveToRemove = - // Make an exception for elements with no p's and exactly 1 img. - ($img > $p && $node->hasAncestorTag($node, 'figure')) || + ($img > 1 && $p / $img < 0.5 && !$node->hasAncestorTag($node, 'figure')) || (!$isList && $li > $p) || ($input > floor($p / 3)) || - (!$isList && $contentLength < 25 && ($img === 0 || $img > 2)) || + (!$isList && $contentLength < 25 && ($img === 0 || $img > 2) && !$node->hasAncestorTag($node, 'figure')) || (!$isList && $weight < 25 && $linkDensity > 0.2) || ($weight >= 25 && $linkDensity > 0.5) || (($embedCount === 1 && $contentLength < 75) || $embedCount > 1); @@ -1047,8 +1286,8 @@ class HTMLParser * * TODO To be moved to Readability * - * @param Element - * @param string tag to clean + * @param $article DOMDocument + * @param $tag string tag to clean * * @return void **/ @@ -1126,7 +1365,7 @@ class HTMLParser * Checks if the node is a byline. * * @param Readability $node - * @param string $matchString + * @param string $matchString * * @return bool */ diff --git a/src/Readability.php b/src/Readability.php index c55e0ad..b0a5830 100644 --- a/src/Readability.php +++ b/src/Readability.php @@ -13,7 +13,7 @@ use League\HTMLToMarkdown\Element; class Readability extends Element implements ReadabilityInterface { /** - * @var \DOMNode + * @var \DOMNode|\DOMElement */ protected $node; @@ -50,9 +50,6 @@ class Readability extends Element implements ReadabilityInterface * An if must be added before calling the getAttribute function, because if we reach the DOMDocument * by getting the node parents we'll get a undefined function fatal error */ - $score = 0; - - // Check if the getAttribute method exists, as some elements lack of it (and calling it anyway throws an exception) if (method_exists($node, 'getAttribute')) { if ($node->hasAttribute('data-readability')) { // Node was initialized previously. Restoring score and setting flag. @@ -253,7 +250,7 @@ class Readability extends Element implements ReadabilityInterface { // Check if the setAttribute method exists, as some elements lack of it (and calling it anyway throws an exception) if (method_exists($this->node, 'setAttribute')) { - $this->contentScore = (float) $score; + $this->contentScore = (float)$score; // Set score in an attribute of the tag to prevent losing it while creating new Readability objects. $this->node->setAttribute('data-readability', $this->contentScore); @@ -286,7 +283,7 @@ class Readability extends Element implements ReadabilityInterface * element with the new tag name and importing it to the main DOMDocument. * * @param string $value - * @param bool $importAttributes + * @param bool $importAttributes */ public function setNodeTag($value, $importAttributes = false) { @@ -343,7 +340,7 @@ class Readability extends Element implements ReadabilityInterface * for parents. * * @param Readability $originalNode - * @param bool $ignoreSelfAndKids + * @param bool $ignoreSelfAndKids * * @return Readability */ @@ -419,7 +416,7 @@ class Readability extends Element implements ReadabilityInterface * Creates a new node based on the text content of the original node. * * @param Readability $originalNode - * @param string $tagName + * @param string $tagName * * @return Readability */ @@ -466,8 +463,8 @@ class Readability extends Element implements ReadabilityInterface * provided one. * * @param Readability $node - * @param string $tagName - * @param int $maxDepth + * @param string $tagName + * @param int $maxDepth * * @return bool */ @@ -475,7 +472,7 @@ class Readability extends Element implements ReadabilityInterface { $depth = 0; while ($node->getParent()) { - if ($depth > $maxDepth) { + if ($maxDepth > 0 && $depth > $maxDepth) { return false; } if ($node->getParent()->tagNameEqualsTo($tagName)) { @@ -489,6 +486,8 @@ class Readability extends Element implements ReadabilityInterface } /** + * Returns the children of the current node + * * @param bool $filterEmptyDOMText Filter empty DOMText nodes? * * @return array @@ -507,4 +506,19 @@ class Readability extends Element implements ReadabilityInterface return $ret; } + + + /** + * Determines if a node has no content or it is just a bunch of dividing lines and/or whitespace + * + * @return bool + */ + public function isElementWithoutContent() + { + return ($this->node instanceof \DOMElement && + mb_strlen(trim($this->node->textContent)) === 0 && + ($this->node->childNodes->length === 0 || + $this->node->childNodes->length === $this->node->getElementsByTagName('br')->length + $this->node->getElementsByTagName('hr')->length + )); + } } |