summaryrefslogtreecommitdiff
path: root/src
diff options
context:
space:
mode:
authorAndres Rey <[email protected]>2017-11-07 00:01:00 +0000
committerAndres Rey <[email protected]>2017-11-07 00:01:00 +0000
commitc247dc2e631ce135a0334a6af9b1956235a3cc2d (patch)
treee25024e4978f166aa233fa8cd0257d3632072d1e /src
parent857736c6e90fbe2da8cb870855e84f04e8e67aee (diff)
Mark datatables and avoid removing them during cleaning
Diffstat (limited to 'src')
-rw-r--r--src/HTMLParser.php103
1 files changed, 103 insertions, 0 deletions
diff --git a/src/HTMLParser.php b/src/HTMLParser.php
index fa8a609..bf999f2 100644
--- a/src/HTMLParser.php
+++ b/src/HTMLParser.php
@@ -995,6 +995,11 @@ class HTMLParser
*/
public function prepArticle(DOMDocument $article)
{
+ // Check for data tables before we continue, to avoid removing items in
+ // those tables, which will often be isolated even though they're
+ // visually linked to other content-ful elements (text, images, etc.).
+ $this->_markDataTables($article);
+
// Clean out junk from the article content
$this->_cleanConditionally($article, 'form');
$this->_cleanConditionally($article, 'fieldset');
@@ -1055,6 +1060,98 @@ class HTMLParser
}
/**
+     * Flag every <table> in the article as a 'data' table or a 'layout' table.
+     *
+     * The heuristics are similar to the checks used in
+     * https://dxr.mozilla.org/mozilla-central/rev/71224049c0b52ab190564d3ea0eab089a159a4cf/accessible/html/HTMLTableAccessible.cpp#920
+     *
+     * The verdict is stored on each table element as the boolean property
+     * "readabilityDataTable" (true = data table, false = layout table).
+     *
+     * TODO To be moved to Readability. WARNING: check if we actually keep the "readabilityDataTable" param and
+     * maybe switch to a readability data-tag?
+     *
+     * NOTE(review): the flag is a dynamic property on the PHP object wrapping the node. Confirm that it is
+     * still present when the same node is fetched again later; if it is not, that is another reason to move
+     * to the data attribute mentioned above.
+     *
+     * @param DOMDocument $article
+     *
+     * @return void
+     */
+    public function _markDataTables(DOMDocument $article)
+    {
+        foreach ($article->getElementsByTagName('table') as $table) {
+            /** @var \DOMElement $table */
+            $table->readabilityDataTable = $this->_isDataTable($table);
+        }
+    }
+
+    /**
+     * Decide whether a single table holds data (true) or is only used for layout (false).
+     *
+     * Checks run from the most explicit signal to the weakest one; the first that matches wins.
+     *
+     * @param \DOMElement $table
+     *
+     * @return bool
+     */
+    private function _isDataTable(\DOMElement $table)
+    {
+        // An explicit presentation role always means layout.
+        if ($table->getAttribute('role') === 'presentation') {
+            return false;
+        }
+
+        // datatable="0" is an explicit opt-out. Compare strictly: with "==" PHP compares numeric
+        // strings by value, so "00" or "0.0" would match as well.
+        if ($table->getAttribute('datatable') === '0') {
+            return false;
+        }
+
+        // Any non-empty summary marks a data table. Test against '' rather than truthiness,
+        // because the string "0" is falsy in PHP.
+        if ($table->getAttribute('summary') !== '') {
+            return true;
+        }
+
+        // A caption that actually has content marks a data table.
+        $captions = $table->getElementsByTagName('caption');
+        if ($captions->length > 0 && $captions->item(0)->childNodes->length > 0) {
+            return true;
+        }
+
+        // If the table has a descendant with any of these tags, consider it a data table.
+        foreach (['col', 'colgroup', 'tfoot', 'thead', 'th'] as $tagName) {
+            if ($table->getElementsByTagName($tagName)->length > 0) {
+                return true;
+            }
+        }
+
+        // Nested tables indicate a layout table.
+        if ($table->getElementsByTagName('table')->length > 0) {
+            return false;
+        }
+
+        // No structural hint left: go by size. Long or wide tables are data tables...
+        $sizeInfo = $this->_getRowAndColumnCount($table);
+        if ($sizeInfo['rows'] >= 10 || $sizeInfo['columns'] > 4) {
+            return true;
+        }
+
+        // ...and so is anything with more than 10 cells overall.
+        return $sizeInfo['rows'] * $sizeInfo['columns'] > 10;
+    }
+
+    /**
+     * Return an array indicating how many rows and columns this table has.
+     *
+     * Each <tr> descendant contributes its "rowspan" value to the row count, and each <td> inside it
+     * contributes its "colspan" value to that row's width. The column count is the width of the widest
+     * row. A span attribute that is missing, non-numeric or lower than 1 counts as 1.
+     * Only <td> cells are counted; <th> cells are not.
+     *
+     * @param \DOMElement $table
+     *
+     * @return array Associative array with the integer keys 'rows' and 'columns'
+     */
+    public function _getRowAndColumnCount(\DOMElement $table)
+    {
+        $rows = 0;
+        $columns = 0;
+
+        foreach ($table->getElementsByTagName('tr') as $tr) {
+            /** @var \DOMElement $tr */
+            // In PHP "$rowspan || 1" evaluates to a boolean (always adding 1), so the span has to be
+            // parsed explicitly. getAttribute() returns '' when the attribute is absent, which casts to 0.
+            $rows += max(1, (int) $tr->getAttribute('rowspan'));
+
+            // Now look for column-related info
+            $columnsInThisRow = 0;
+            foreach ($tr->getElementsByTagName('td') as $cell) {
+                /** @var \DOMElement $cell */
+                $columnsInThisRow += max(1, (int) $cell->getAttribute('colspan'));
+            }
+
+            $columns = max($columns, $columnsInThisRow);
+        }
+
+        return ['rows' => $rows, 'columns' => $columns];
+    }
+
+ /**
* TODO To be moved to Readability.
*
* @param DOMDocument $article
@@ -1123,6 +1220,12 @@ class HTMLParser
$node = $DOMNodeList->item($length - 1 - $i);
$node = new Readability($node);
+
+ // First check if we're in a data table, in which case don't remove us.
+ if ($node->hasAncestorTag($node, 'table', -1) && isset($node->readabilityDataTable)) {
+ continue;
+ }
+
$weight = $node->getClassWeight();
if ($weight < 0) {