Readability.php 65 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
712781279128012811282128312841285128612871288128912901291129212931294129512961297129812991300130113021303130413051306130713081309131013111312131313141315131613171318131913201321132213231324132513261327132813291330133113321333133413351336133713381339134013411342134313441345134613471348134913501351135213531354135513561357135813591360136113621363136413651366136713681369137013711372137313741375137613771378137913801381138213831384138513861387138813891390139113921393139413951396139713981399140014011402140314041405140614071408140914101411141214131414141514161417141814191420142114221423142414251426142714281429143014311432143314341435143614371438143914401441144214431444144514461447144814491450145114521453145414551456145714581459146014611462146314641465146614671468146914701471147214731474147514761477147814791480148114821483148414851486148714881489149014911492149314941495149614971498149915001501150215031504150515061507150815091510151115121513151415151516151715181519152015211522152315241525152615271528152915301531153215331534153515361537153815391540154115421543154415451546154715481549155015511552155315541555155615571558155915601561156215631564156515661567156815691570157115721573157415751576157715781579158015811582158315841585158615871588158915901591159215931594159515961597159815991600160116021603160416051606160716081609161016111612161316141615161616171618161916201621162216231624162516261627162816291630163116321633163416351636163716381639164016411642164316441645164616471648164916501651165216531654165516561657165816591660166116621663166416651666166716681669167016711672167316741675167616771678167916801681168216831684168516861687168816891690169116921693169416951696169716981699170017011702170317041705170617071708170917101711171217131714171517161717171817191720172117221723172417251726172717281729173017311732173317341735173617371738173917401741174217431744174517461747174817491750175117521753175417551756175717581759176017611762176317641765176617671768176917701771177217731774
  1. <?php
  2. namespace andreskrey\Readability;
  3. use andreskrey\Readability\Nodes\DOM\DOMDocument;
  4. use andreskrey\Readability\Nodes\DOM\DOMElement;
  5. use andreskrey\Readability\Nodes\DOM\DOMNode;
  6. use andreskrey\Readability\Nodes\DOM\DOMText;
  7. use andreskrey\Readability\Nodes\NodeUtility;
  8. use Psr\Log\LoggerInterface;
  9. /**
  10. * Class Readability.
  11. */
  12. class Readability
  13. {
  /**
   * DOM of the document currently being processed. Assigned by parse() from loadHTML().
   *
   * @var DOMDocument
   */
  protected $dom;

  /**
   * Article title. Starts out as null.
   *
   * @var string|null
   */
  protected $title = null;

  /**
   * Document holding the extracted article. Starts out as null.
   *
   * @var DOMDocument|null
   */
  protected $content = null;

  /**
   * Short summary of the article. Starts out as null.
   *
   * @var string|null
   */
  protected $excerpt = null;

  /**
   * URL of the main image of the article. Starts out as null.
   *
   * @var string|null
   */
  protected $image = null;

  /**
   * Article author, taken from metadata or from a byline node. Starts out as null.
   *
   * @var string|null
   */
  protected $author = null;

  /**
   * Text direction of the article. Starts out as null.
   *
   * @var string|null
   */
  protected $direction = null;

  /**
   * Settings driving the parser (flags, thresholds, original URL, logger).
   *
   * @var Configuration
   */
  private $configuration;

  /**
   * Logger obtained from the configuration in the constructor.
   *
   * @var LoggerInterface
   */
  private $logger;

  /**
   * Extraction attempts made by parse(); each entry is
   * ['articleContent' => ..., 'textLength' => int].
   *
   * @var array
   */
  private $attempts = [];

  /**
   * Tag names that getNodes() always queues for scoring.
   *
   * @var array
   */
  private $defaultTagsToScore = ['section', 'h2', 'h3', 'h4', 'h5', 'h6', 'p', 'td', 'pre'];

  /**
   * Tag names exempt from being altered to DIV.
   * NOTE(review): the code using this list is outside this excerpt - confirm the exact semantics there.
   *
   * @var array
   */
  private $alterToDIVExceptions = ['div', 'article', 'section', 'p'];
  /**
   * Creates a parser bound to the given configuration.
   *
   * The logger is read from the configuration once and kept on the instance.
   *
   * @param Configuration $configuration Settings (and logger provider) used for every parse.
   */
  public function __construct(Configuration $configuration)
  {
      $this->logger = $configuration->getLogger();
      $this->configuration = $configuration;
  }
  /**
   * Main parse function.
   *
   * Loads the HTML, extracts the metadata and main image, then runs the node selection / rating
   * pipeline. When the extracted text is shorter than the configured character threshold the
   * document is reloaded and the pipeline re-run with, in turn, StripUnlikelyCandidates,
   * WeightClasses and CleanConditionally switched off. If every attempt falls short, the longest
   * attempt is used.
   *
   * NOTE: the relaxed flags are written to the Configuration object and are not restored afterwards.
   *
   * @param string $html HTML source of the page.
   *
   * @throws ParseException When there is no usable <body>, or no attempt produced any text.
   *
   * @return bool Always true on success; failures are reported through ParseException.
   */
  public function parse($html)
  {
      $this->logger->info('*** Starting parse process...');

      // Attempts recorded by a previous parse() call on this instance must not leak into this run.
      $this->attempts = [];

      $this->dom = $this->loadHTML($html);

      // Checking for minimum HTML to work with.
      if (!($root = $this->dom->getElementsByTagName('body')->item(0)) || !$root->firstChild) {
          $this->logger->emergency('No body tag present or body tag empty');
          throw new ParseException('Invalid or incomplete HTML.');
      }

      $this->getMetadata();
      $this->getMainImage();

      while (true) {
          $root = $root->firstChild;
          $elementsToScore = $this->getNodes($root);
          $this->logger->debug(sprintf('Elements to score: \'%s\'', count($elementsToScore)));

          $result = $this->rateNodes($elementsToScore);

          /*
           * Now that we've gone through the full algorithm, check whether we got any meaningful
           * content. If we didn't, re-run with fewer flags set: the sieve approach gives a higher
           * likelihood of finding the -right- content.
           *
           * rateNodes() is defined outside this excerpt; a falsy result is treated as an attempt
           * of length 0 instead of dereferencing it.
           */
          $length = $result
              ? mb_strlen(preg_replace(NodeUtility::$regexps['onlyWhitespace'], '', $result->textContent))
              : 0;
          $threshold = $this->configuration->getCharThreshold();

          // Despite the wording of the message, $length counts non-whitespace characters.
          $this->logger->info(sprintf('[Parsing] Article parsed. Amount of words: %s. Current threshold is: %s', $length, $threshold));

          if ($result && $length >= $threshold) {
              break;
          }

          // Start the next attempt from a pristine DOM: the passes above mutate the document.
          $this->dom = $this->loadHTML($html);
          $root = $this->dom->getElementsByTagName('body')->item(0);

          // Remember this attempt so the longest one can be picked if nothing reaches the threshold.
          $this->attempts[] = ['articleContent' => $result, 'textLength' => $length];

          if ($this->configuration->getStripUnlikelyCandidates()) {
              $this->logger->debug('[Parsing] Threshold not met, trying again setting StripUnlikelyCandidates as false');
              $this->configuration->setStripUnlikelyCandidates(false);
          } elseif ($this->configuration->getWeightClasses()) {
              $this->logger->debug('[Parsing] Threshold not met, trying again setting WeightClasses as false');
              $this->configuration->setWeightClasses(false);
          } elseif ($this->configuration->getCleanConditionally()) {
              $this->logger->debug('[Parsing] Threshold not met, trying again setting CleanConditionally as false');
              $this->configuration->setCleanConditionally(false);
          } else {
              $this->logger->debug('[Parsing] Threshold not met, searching across attempts for some content.');

              // No luck after removing flags, just use the longest text found during the different
              // loops. The comparator must return an integer (a boolean is not a valid result),
              // hence the subtraction: longest attempt first.
              usort($this->attempts, static function ($a, $b) {
                  return $b['textLength'] - $a['textLength'];
              });

              // But first check if we actually have something
              if (!$this->attempts[0]['textLength']) {
                  $this->logger->emergency('[Parsing] Could not parse text, giving up :(');
                  throw new ParseException('Could not parse text.');
              }

              $this->logger->debug('[Parsing] Threshold not met, but found some content in previous attempts.');
              $result = $this->attempts[0]['articleContent'];
              break;
          }
      }

      $result = $this->postProcessContent($result);

      // If we haven't found an excerpt in the article's metadata, use the article's
      // first paragraph as the excerpt. This can be used for displaying a preview of
      // the article's content.
      if (!$this->getExcerpt()) {
          $this->logger->debug('[Parsing] No excerpt text found on metadata, extracting first p node and using it as excerpt.');
          $paragraphs = $result->getElementsByTagName('p');
          if ($paragraphs->length > 0) {
              $this->setExcerpt(trim($paragraphs->item(0)->textContent));
          }
      }

      $this->setContent($result);
      $this->logger->info('*** Parse successful :)');

      return true;
  }
  /**
   * Creates a DOM Document object and loads the provided HTML on it.
   *
   * Used for the first load of Readability and subsequent reloads (when disabling flags and rescanning the text).
   * Previous versions of Readability loaded the HTML once and cloned the DOM to keep a backup. This caused bugs
   * because the clone kept a relation with the original document, so changes showed up in both objects and
   * ruined the backup. Reloading from the string avoids that.
   *
   * Honours three configuration switches: SubstituteEntities, NormalizeEntities and SummonCthulhu
   * (regex-based removal of <script> blocks before parsing).
   *
   * @param string $html
   *
   * @return DOMDocument Document with scripts removed and prepDocument() applied.
   */
  private function loadHTML($html)
  {
      $this->logger->debug('[Loading] Loading HTML...');

      // To avoid throwing a gazillion of errors on malformed HTMLs
      libxml_use_internal_errors(true);

      $dom = new DOMDocument('1.0', 'utf-8');

      if (!$this->configuration->getSubstituteEntities()) {
          // Keep the original HTML entities
          $dom->substituteEntities = false;
      }

      if ($this->configuration->getNormalizeEntities()) {
          $this->logger->debug('[Loading] Normalized entities via mb_convert_encoding.');
          // Replace every non-ASCII character with a numeric HTML entity. Useful to fix html with
          // mixed content. mb_convert_encoding(..., 'HTML-ENTITIES') did this job before, but that
          // pseudo-encoding is deprecated since PHP 8.2; numeric entities decode to the same text.
          $html = mb_encode_numericentity($html, [0x80, 0x10FFFF, 0, 0x1FFFFF], 'UTF-8');
      }

      if ($this->configuration->getSummonCthulhu()) {
          $this->logger->debug('[Loading] Removed script tags via regex H̶͈̩̟̬̱͠E̡̨̬͔̳̜͢͠ ̡̧̯͉̩͙̩̹̞̠͎͈̹̥̠͞ͅͅC̶͉̞̘̖̝̗͓̬̯͍͉̤̬͢͢͞Ò̟̘͉͖͎͉̱̭̣̕M̴̯͈̻̱̱̣̗͈̠̙̲̥͘͞E̷̛͙̼̲͍͕̹͍͇̗̻̬̮̭̱̥͢Ş̛̟͔̙̜̤͇̮͍̙̝̀͘');
          // Case-insensitive so <SCRIPT> blocks are caught as well.
          $withoutScripts = preg_replace('/<script\b[^>]*>([\s\S]*?)<\/script>/i', '', $html);
          // preg_replace() returns null on a PCRE failure (e.g. backtrack limit). Keep the HTML in
          // that case; removeScripts() below still drops the script elements from the DOM.
          if ($withoutScripts !== null) {
              $html = $withoutScripts;
          }
      }

      // Prepend the XML tag to avoid having issues with special characters. Should be harmless.
      $dom->loadHTML('<?xml encoding="UTF-8">' . $html);
      $dom->encoding = 'UTF-8';

      // Internal error collection is on: flush the buffer so repeated loads do not pile up errors.
      libxml_clear_errors();

      $this->removeScripts($dom);
      $this->prepDocument($dom);

      $this->logger->debug('[Loading] Loaded HTML successfully.');

      return $dom;
  }
  /**
   * Tries to guess relevant info from the <meta> tags of the html and stores the results
   * (title, author, excerpt, image) in the Readability properties.
   *
   * Every <meta> tag contributes to a map of "normalized key => content":
   *  - `property` is a space-separated list; each "prefix:name" token it contains (for example
   *    "og:title") becomes a key. Only the full token is stored, never its prefix or name alone.
   *  - `name` is a single value and is only looked at when `property` yielded nothing; dots are
   *    converted to colons ("dc.title" becomes "dc:title") and empty contents are skipped.
   *
   * For each field the first key present in its priority list wins. When no title is found in
   * the metadata, getArticleTitle() is used as a fallback.
   */
  private function getMetadata()
  {
      $this->logger->debug('[Metadata] Retrieving metadata...');

      $values = [];
      // property is a space-separated list of values
      $propertyPattern = '/\s*(dc|dcterm|og|twitter)\s*:\s*(author|creator|description|title|image)\s*/i';
      // name is a single value
      $namePattern = '/^\s*(?:(dc|dcterm|og|twitter|weibo:(article|webpage))\s*[\.:]\s*)?(author|creator|description|title|image)\s*$/i';

      foreach ($this->dom->getElementsByTagName('meta') as $meta) {
          /* @var DOMNode $meta */
          $elementName = $meta->getAttribute('name');
          $elementProperty = $meta->getAttribute('property');
          $content = $meta->getAttribute('content');
          $propertyMatched = false;

          if ($elementProperty && preg_match_all($propertyPattern, $elementProperty, $matches)) {
              $propertyMatched = true;
              // $matches[0] lists the complete "prefix:name" tokens; the capture groups
              // (prefix, name) are deliberately ignored so they do not become keys on their own.
              foreach ($matches[0] as $token) {
                  // Convert to lowercase, and remove any whitespace so we can match below.
                  $name = preg_replace('/\s/', '', mb_strtolower($token));
                  $values[$name] = trim($content);
              }
          }

          if (!$propertyMatched && $elementName && $content && preg_match($namePattern, $elementName)) {
              // Convert to lowercase, remove any whitespace, and convert dots
              // to colons so we can match below.
              $name = preg_replace(['/\s/', '/\./'], ['', ':'], mb_strtolower($elementName));
              $values[$name] = trim($content);
          }
      }

      // Returns the value of the first candidate key present in $values, or null when none is.
      $firstAvailable = static function (array $candidates) use ($values) {
          foreach ($candidates as $candidate) {
              if (isset($values[$candidate])) {
                  return $values[$candidate];
              }
          }

          return null;
      };

      // get title
      $title = $firstAvailable([
          'dc:title',
          'dcterm:title',
          'og:title',
          'weibo:article:title',
          'weibo:webpage:title',
          'title',
          'twitter:title',
      ]);
      $this->setTitle($title !== null ? trim($title) : null);

      if (!$this->getTitle()) {
          $this->setTitle($this->getArticleTitle());
      }

      // get author
      $this->setAuthor($firstAvailable([
          'dc:creator',
          'dcterm:creator',
          'author',
      ]));

      // get description
      $this->setExcerpt($firstAvailable([
          'dc:description',
          'dcterm:description',
          'og:description',
          'weibo:article:description',
          'weibo:webpage:description',
          'description',
          'twitter:description',
      ]));

      // get main image
      $this->setImage($firstAvailable([
          'og:image',
          'twitter:image',
      ]));
  }
  /**
   * Collects the image URLs of the parsed article: the main image first (when there is one),
   * followed by the `src` of every <img> found in the document returned by getDOMDocument().
   *
   * URLs are made absolute when the FixRelativeURLs option is on. Empty entries and duplicates
   * are dropped from the final list; the array keys of the surviving entries are kept as they were.
   *
   * @return array
   */
  public function getImages()
  {
      $sources = [];

      $mainImage = $this->getImage();
      if ($mainImage) {
          $sources[] = $mainImage;
      }

      $document = $this->getDOMDocument();
      if ($document == null) {
          // Nothing parsed yet: the main image (if any) is all there is.
          return $sources;
      }

      foreach ($document->getElementsByTagName('img') as $img) {
          $src = $img->getAttribute('src');
          if ($src) {
              $sources[] = $src;
          }
      }

      if ($this->configuration->getFixRelativeURLs()) {
          $sources = array_map([$this, 'toAbsoluteURI'], $sources);
      }

      return array_unique(array_filter($sources));
  }
  /**
   * Tries to get the main article image and stores it through setImage().
   *
   * The image found by getMetadata() is kept when there is one. Otherwise the first
   * <link rel="img_src"> or <link rel="image_src"> carrying an href is used. The URL is converted
   * to an absolute one only when the FixRelativeURLs option is on, but it is stored either way.
   * Nothing is returned: read the result with getImage().
   */
  public function getMainImage()
  {
      $imgUrl = false;

      if ($this->getImage() !== null) {
          $imgUrl = $this->getImage();
      }

      if (!$imgUrl) {
          foreach ($this->dom->getElementsByTagName('link') as $link) {
              /** @var \DOMElement $link */
              /*
               * The rel attribute must be either img_src or image_src, and the href attribute,
               * which should hold the image url, must exist.
               */
              if (in_array($link->getAttribute('rel'), ['img_src', 'image_src'], true) && $link->hasAttribute('href')) {
                  $imgUrl = $link->getAttribute('href');
                  break;
              }
          }
      }

      if (empty($imgUrl)) {
          return;
      }

      if ($this->configuration->getFixRelativeURLs()) {
          $imgUrl = $this->toAbsoluteURI($imgUrl);
      }

      // Store the image even when URL fixing is off; otherwise an image found in a <link> tag
      // would be thrown away.
      $this->setImage($imgUrl);
  }
  /**
   * Returns the title of the html. Prioritizes the title from the metadata against the title tag.
   *
   * The raw title is then cleaned up:
   *  - "Article | Site" style titles (separators |, -, \, /, > or » surrounded by spaces) lose
   *    their last part, or their first part when what is left has fewer than 3 words;
   *  - "Site: Article" style titles are cut at a colon unless an h1/h2 holds the exact title;
   *  - very long (> 150 chars) or very short (< 15 chars) titles are replaced by the only h1
   *    of the document, when there is exactly one.
   * If the cleaned title ends up with 4 words or fewer and the clean-up looks too aggressive,
   * the original title is returned instead.
   *
   * @return string|null Null when neither the metadata nor a <title> tag provides a title.
   */
  private function getArticleTitle()
  {
      $originalTitle = null;

      if ($this->getTitle()) {
          $originalTitle = $this->getTitle();
      } else {
          $this->logger->debug('[Metadata] Could not find title in metadata, searching for the title tag...');
          $titleTag = $this->dom->getElementsByTagName('title');
          if ($titleTag->length > 0) {
              $this->logger->info(sprintf('[Metadata] Using title tag as article title: \'%s\'', $titleTag->item(0)->nodeValue));
              $originalTitle = $titleTag->item(0)->nodeValue;
          }
      }

      if ($originalTitle === null) {
          return null;
      }

      // Number of whitespace-separated chunks in a string.
      $countWords = static function ($text) {
          return count(preg_split('/\s+/', $text));
      };

      $curTitle = $originalTitle = trim($originalTitle);
      $titleHadHierarchicalSeparators = false;

      /*
       * If there's a separator in the title, first remove the final part.
       *
       * Every pattern containing "»" carries the "u" modifier: without it the two bytes of that
       * character are treated as two separate characters, " » " is never recognised and other
       * multibyte characters can be cut in half.
       */
      if (preg_match('/ [\|\-\\\\\/>»] /iu', $curTitle)) {
          $titleHadHierarchicalSeparators = (bool)preg_match('/ [\\\\\/>»] /u', $curTitle);
          $curTitle = preg_replace('/(.*)[\|\-\\\\\/>»] .*/iu', '$1', $originalTitle);

          $this->logger->info(sprintf('[Metadata] Found hierarchical separators in title, new title is: \'%s\'', $curTitle));

          // If the resulting title is too short (fewer than 3 words), remove
          // the first part instead:
          if ($countWords($curTitle) < 3) {
              $curTitle = preg_replace('/[^\|\-\\\\\/>»]*[\|\-\\\\\/>»](.*)/iu', '$1', $originalTitle);
              $this->logger->info(sprintf('[Metadata] Title too short, using the first part of the title instead: \'%s\'', $curTitle));
          }
      } elseif (strpos($curTitle, ': ') !== false) {
          // Check if we have an heading containing this exact string, so we
          // could assume it's the full title.
          $match = false;
          for ($i = 1; $i <= 2 && !$match; $i++) {
              foreach ($this->dom->getElementsByTagName('h' . $i) as $hTag) {
                  // Trim texts to avoid having false negatives when the title is surrounded by spaces or tabs
                  if (trim($hTag->nodeValue) === trim($curTitle)) {
                      $match = true;
                      break;
                  }
              }
          }

          // If we don't, let's extract the title out of the original title string.
          if (!$match) {
              $curTitle = substr($originalTitle, strrpos($originalTitle, ':') + 1);

              $this->logger->info(sprintf('[Metadata] Title has a colon in the middle, new title is: \'%s\'', $curTitle));

              // If the title is now too short, try the first colon instead:
              if ($countWords($curTitle) < 3) {
                  $curTitle = substr($originalTitle, strpos($originalTitle, ':') + 1);
                  $this->logger->info(sprintf('[Metadata] Title too short, using the first part of the title instead: \'%s\'', $curTitle));
              } elseif ($countWords(substr($originalTitle, 0, strpos($originalTitle, ':'))) > 5) {
                  // But if we have too many words before the colon there's something weird
                  // with the titles and the H tags so let's just use the original title instead.
                  // The words are counted on the original title: $curTitle is the part after the
                  // colon, so testing it would make this branch unreachable.
                  $curTitle = $originalTitle;
              }
          }
      } elseif (mb_strlen($curTitle) > 150 || mb_strlen($curTitle) < 15) {
          $hOnes = $this->dom->getElementsByTagName('h1');

          if ($hOnes->length === 1) {
              $curTitle = $hOnes->item(0)->nodeValue;
              $this->logger->info(sprintf('[Metadata] Using title from an H1 node: \'%s\'', $curTitle));
          }
      }

      $curTitle = trim($curTitle);

      /*
       * If we now have 4 words or fewer as our title, and either no
       * 'hierarchical' separators (\, /, > or ») were found in the original
       * title or we decreased the number of words by more than 1 word, use
       * the original title.
       */
      $curTitleWordCount = $countWords($curTitle);
      $originalTitleWordCount = $countWords(preg_replace('/[\|\-\\\\\/>»]+/u', '', $originalTitle)) - 1;

      if ($curTitleWordCount <= 4 &&
          (!$titleHadHierarchicalSeparators || $curTitleWordCount !== $originalTitleWordCount)) {
          $curTitle = $originalTitle;

          $this->logger->info(sprintf('[Metadata] Cleaned title is too short, using the original title: \'%s\'', $curTitle));
      }

      return $curTitle;
  }
  /**
   * Convert URI to an absolute URI, using the original URL of the configuration as reference.
   *
   * URIs that already carry a scheme, and hash-only URIs ("#anchor"), are returned untouched.
   * Those two cases are checked first, so the base path information is only computed when it
   * is actually needed.
   *
   * @param $uri string URI to convert
   *
   * @return string
   */
  private function toAbsoluteURI($uri)
  {
      // If this is already an absolute URI, return it.
      if (preg_match('/^[a-zA-Z][a-zA-Z0-9\+\-\.]*:/', $uri)) {
          return $uri;
      }

      // Ignore hash URIs:
      if (substr($uri, 0, 1) === '#') {
          return $uri;
      }

      list($pathBase, $scheme, $prePath) = $this->getPathInfo($this->configuration->getOriginalURL());

      // Scheme-rooted relative URI.
      if (substr($uri, 0, 2) === '//') {
          return $scheme . '://' . substr($uri, 2);
      }

      // Prepath-rooted relative URI.
      if (substr($uri, 0, 1) === '/') {
          return $prePath . $uri;
      }

      // Dotslash relative URI.
      if (strpos($uri, './') === 0) {
          return $pathBase . substr($uri, 2);
      }

      // Standard relative URI; add entire path. pathBase already includes a
      // trailing "/".
      return $pathBase . $uri;
  }
  /**
   * Returns the pieces needed to resolve relative URIs found in the document.
   *
   *  - $pathBase: URL of the "directory" relative URIs are resolved against. The directory of
   *    $url is its path up to and including the last "/", so "http://host/a/b/" resolves against
   *    "/a/b/" and "http://host/page.html" against "/".
   *  - $scheme:   scheme of $pathBase.
   *  - $prePath:  scheme, host and (when present) port of $pathBase, without trailing slash.
   *
   * A base URI declared by the document ($this->dom->baseURI) changes $pathBase:
   *  - an absolute base ("http://cdn.example.com/assets/") replaces the document URL;
   *  - a base starting with "/" replaces the whole path of the document URL;
   *  - any other base is appended to the directory of the document URL.
   *
   * @param string $url URL of the document.
   *
   * @return array [$pathBase, $scheme, $prePath]
   */
  public function getPathInfo($url)
  {
      // scheme://host[:port] of an URL.
      $originOf = static function ($address) {
          $port = parse_url($address, PHP_URL_PORT);

          return parse_url($address, PHP_URL_SCHEME) . '://' . parse_url($address, PHP_URL_HOST) . ($port ? ':' . $port : '');
      };

      // Path of an URL up to and including its last slash ("/" when there is no path).
      // dirname() is not suitable here: it yields "/" for root level documents (leading to a
      // double slash once the separator is appended) and drops the last segment of URLs that
      // end with a slash.
      $directoryOf = static function ($address) {
          $path = (string)parse_url($address, PHP_URL_PATH);
          $lastSlash = strrpos($path, '/');

          return $lastSlash === false ? '/' : substr($path, 0, $lastSlash + 1);
      };

      $baseURI = $this->dom->baseURI;

      if ($baseURI === null || $baseURI === '') {
          // No base declared: resolve against the directory of the document itself.
          $pathBase = $originOf($url) . $directoryOf($url);
      } elseif (preg_match('/^[a-zA-Z][a-zA-Z0-9\+\-\.]*:\/\//', $baseURI)) {
          // Absolute base: it stands on its own, the document URL is irrelevant.
          $pathBase = $originOf($baseURI) . $directoryOf($baseURI);
      } elseif (substr($baseURI, 0, 1) === '/') {
          // URLs starting with '/' override completely the path of the document URL
          $pathBase = $originOf($url) . $baseURI;
      } else {
          // Otherwise just append the base to the directory of the document
          $pathBase = $originOf($url) . $directoryOf($url) . rtrim($baseURI, '/') . '/';
      }

      $scheme = parse_url($pathBase, PHP_URL_SCHEME);
      $prePath = $originOf($pathBase);

      return [$pathBase, $scheme, $prePath];
  }
  /**
   * Walks the document starting at the given node, prunes the noise and collects the nodes
   * that should be scored.
   *
   * Each node reached through NodeUtility::getNextNode() goes through these steps, in order:
   *  1. comment nodes, nodes that are not probably visible, the byline (see checkByline()) and,
   *     when the StripUnlikelyCandidates option is on, unlikely candidates are removed;
   *  2. empty div, section, header, h1-h6 and p elements are removed;
   *  3. nodes whose tag is listed in $defaultTagsToScore are queued for scoring;
   *  4. in divs, runs of phrasing content are wrapped in <p> elements; then a div holding a
   *     single <p> (and a link density below 0.25) is replaced by that <p>, and a div without
   *     block level children is renamed to <p>. Both resulting paragraphs are queued as well.
   *
   * The DOM is modified in place.
   *
   * @param $node DOMNode|DOMText Node to start from.
   *
   * @return array Nodes to score.
   */
  private function getNodes($node)
  {
      $this->logger->info('[Get Nodes] Retrieving nodes...');

      $stripUnlikelyCandidates = $this->configuration->getStripUnlikelyCandidates();

      $elementsToScore = [];

      // Tags that are dropped when they hold no content (e.g. text, image, video, or iframe).
      $removableWhenEmpty = ['div', 'section', 'header', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'p'];

      // Short, multibyte-safe preview of a node's text for the log messages: cutting by bytes
      // could split a character in two and leave invalid UTF-8 in the log.
      $preview = static function ($text) {
          return mb_substr($text, 0, 128);
      };

      /*
       * First, node prepping. Trash nodes that look cruddy (like ones with the
       * class name "comment", etc), and turn divs into P tags where they have been
       * used inappropriately (as in, where they contain no other block level elements.)
       */
      while ($node) {
          // Remove DOMComments nodes as we don't need them and mess up children counting
          if ($node->nodeType === XML_COMMENT_NODE) {
              $this->logger->debug(sprintf('[Get Nodes] Found comment node, removing... Node content was: \'%s\'', $preview($node->nodeValue)));
              $node = NodeUtility::removeAndGetNext($node);
              continue;
          }

          $matchString = $node->getAttribute('class') . ' ' . $node->getAttribute('id');

          if (!$node->isProbablyVisible()) {
              $this->logger->debug(sprintf('[Get Nodes] Removing hidden node... Match string was: \'%s\'', $matchString));
              $node = NodeUtility::removeAndGetNext($node);
              continue;
          }

          // Check to see if this node is a byline, and remove it if it is.
          if ($this->checkByline($node, $matchString)) {
              $this->logger->debug(sprintf('[Get Nodes] Found byline, removing... Node content was: \'%s\'', $preview($node->nodeValue)));
              $node = NodeUtility::removeAndGetNext($node);
              continue;
          }

          // Remove unlikely candidates
          if ($stripUnlikelyCandidates) {
              if (
                  preg_match(NodeUtility::$regexps['unlikelyCandidates'], $matchString) &&
                  !preg_match(NodeUtility::$regexps['okMaybeItsACandidate'], $matchString) &&
                  $node->nodeName !== 'body' &&
                  $node->nodeName !== 'a'
              ) {
                  $this->logger->debug(sprintf('[Get Nodes] Removing unlikely candidate. Node content was: \'%s\'', $preview($node->nodeValue)));
                  $node = NodeUtility::removeAndGetNext($node);
                  continue;
              }
          }

          // Remove DIV, SECTION, HEADER, heading and P nodes without any content.
          if (in_array($node->nodeName, $removableWhenEmpty, true) && $node->isElementWithoutContent()) {
              $this->logger->debug(sprintf('[Get Nodes] Removing empty \'%s\' node.', $node->nodeName));
              $node = NodeUtility::removeAndGetNext($node);
              continue;
          }

          if (in_array(strtolower($node->nodeName), $this->defaultTagsToScore, true)) {
              $this->logger->debug(sprintf('[Get Nodes] Adding node to score list, node content is: \'%s\'', $preview($node->nodeValue)));
              $elementsToScore[] = $node;
          }

          // Turn all divs that don't have children block level elements into p's
          if ($node->nodeName === 'div') {
              // Put phrasing content into paragraphs.
              $p = null;
              $childNode = $node->firstChild;
              while ($childNode) {
                  // Read the sibling first: the child may be moved into a new <p> below.
                  $nextSibling = $childNode->nextSibling;
                  if ($childNode->isPhrasingContent()) {
                      if ($p !== null) {
                          $p->appendChild($childNode);
                      } elseif (!$childNode->isWhitespace()) {
                          // First meaningful phrasing node of a run: open a paragraph in its place.
                          $p = $this->dom->createElement('p');
                          $node->replaceChild($p, $childNode);
                          $p->appendChild($childNode);
                      }
                  } elseif ($p !== null) {
                      // A non phrasing node closes the run: trim the trailing whitespace of the paragraph.
                      while ($p->lastChild && $p->lastChild->isWhitespace()) {
                          $p->removeChild($p->lastChild);
                      }
                      $p = null;
                  }
                  $childNode = $nextSibling;
              }

              /*
               * Sites like http://mobile.slate.com encloses each paragraph with a DIV
               * element. DIVs with only a P element inside and no text content can be
               * safely converted into plain P elements to avoid confusing the scoring
               * algorithm with DIVs with are, in practice, paragraphs.
               */
              if ($node->hasSingleTagInsideElement('p') && $node->getLinkDensity() < 0.25) {
                  $this->logger->debug(sprintf('[Get Nodes] Found DIV with a single P node, removing DIV. Node content is: \'%s\'', $preview($node->nodeValue)));
                  $pNode = $node->getChildren(true)[0];
                  $node->parentNode->replaceChild($pNode, $node);
                  $node = $pNode;
                  $elementsToScore[] = $node;
              } elseif (!$node->hasSingleChildBlockElement()) {
                  $this->logger->debug(sprintf('[Get Nodes] Found DIV without block level children, converting to a P node. Node content is: \'%s\'', $preview($node->nodeValue)));
                  $node = NodeUtility::setNodeTag($node, 'p');
                  $elementsToScore[] = $node;
              }
          }

          $node = NodeUtility::getNextNode($node);
      }

      return $elementsToScore;
  }
  /**
   * Checks if the node is a byline and, if so, stores its text as the author.
   *
   * A node qualifies when it has rel="author" or its class/id string matches the byline pattern,
   * AND its text is a valid byline (see isValidByline()). The length check applies to both cases,
   * so a rel="author" node that is empty or holds a long text is not taken as the byline.
   *
   * Nothing is checked when the ArticleByLine option is off or when an author is already known.
   *
   * @param DOMNode $node
   * @param string $matchString Class and id of the node, joined by a space.
   *
   * @return bool True when the node was recognised as the byline.
   */
  private function checkByline($node, $matchString)
  {
      if (!$this->configuration->getArticleByLine()) {
          return false;
      }

      /*
       * Check if the byline is already set
       */
      if ($this->getAuthor()) {
          return false;
      }

      $rel = $node->getAttribute('rel');

      // Explicit grouping: "&&" binds tighter than "||", so without the parentheses the validity
      // check would only guard the pattern match and not the rel="author" case.
      if (($rel === 'author' || preg_match(NodeUtility::$regexps['byline'], $matchString)) && $this->isValidByline($node->getTextContent())) {
          $this->logger->info(sprintf('[Metadata] Found article author: \'%s\'', $node->getTextContent()));
          $this->setAuthor(trim($node->getTextContent()));

          return true;
      }

      return false;
  }
  /**
   * Tells whether a text is plausible as a byline, judging only by its length.
   *
   * @param string $text Candidate byline text.
   *
   * @return bool True for a string whose trimmed length is between 1 and 99 characters;
   *              false for anything else, non-string values included.
   */
  private function isValidByline($text)
  {
      if (!is_string($text)) {
          return false;
      }

      $length = mb_strlen(trim($text));

      return $length > 0 && $length < 100;
  }
  /**
   * Hands every <script> and <noscript> element of the document to
   * NodeUtility::removeNode().
   *
   * @param DOMDocument $dom Document to strip.
   */
  private function removeScripts(DOMDocument $dom)
  {
      foreach (['script', 'noscript'] as $tagName) {
          // Work on a snapshot: the list returned by getElementsByTagName() is live and
          // would shift under the loop as nodes are removed.
          $matches = iterator_to_array($dom->getElementsByTagName($tagName));
          array_walk($matches, function ($element) {
              NodeUtility::removeNode($element);
          });
      }
  }
  /**
   * Normalises the raw document before nodes are collected and scored.
   *
   * Two passes are made:
   *  - every chain of two or more <br> elements is collapsed into a single <p> that
   *    takes over the phrasing content following the chain;
   *  - every <font> element is renamed to <span>.
   *
   * @param DOMDocument $dom Document to prepare; it is modified in place.
   */
  private function prepDocument(DOMDocument $dom)
  {
      $this->logger->info('[PrepDocument] Preparing document for parsing...');

      foreach ($dom->shiftingAwareGetElementsByTagName('br') as $br) {
          // Remove every <br> that NodeUtility::nextElement() reports right after this
          // one. The first <br> of the chain is left in place and replaced below.
          $chainFound = false;
          $cursor = NodeUtility::nextElement($br->nextSibling);
          while ($cursor && $cursor->nodeName === 'br') {
              $this->logger->debug('[PrepDocument] Removing chain of BR nodes...');
              $chainFound = true;
              $afterRemoved = $cursor->nextSibling;
              $cursor->parentNode->removeChild($cursor);
              $cursor = NodeUtility::nextElement($afterRemoved);
          }

          // A lone <br> is left untouched.
          if (!$chainFound) {
              continue;
          }

          // Swap the surviving <br> for a <p> and move the following siblings into it.
          $paragraph = $dom->createElement('p');
          $br->parentNode->replaceChild($paragraph, $br);

          $candidate = $paragraph->nextSibling;
          while ($candidate) {
              // Another <br><br> chain ends this paragraph...
              $startsNewChain = $candidate->nodeName === 'br'
                  && ($afterBr = NodeUtility::nextElement($candidate->nextSibling))
                  && $afterBr->nodeName === 'br';

              // ...and so does anything that is not phrasing content.
              if ($startsNewChain || !$candidate->isPhrasingContent()) {
                  break;
              }

              $this->logger->debug('[PrepDocument] Replacing BR with a P node...');

              // Remember the successor before the node is moved out of the sibling list.
              $upcoming = $candidate->nextSibling;
              $paragraph->appendChild($candidate);
              $candidate = $upcoming;
          }

          // Trim trailing whitespace nodes from the new paragraph.
          while ($paragraph->lastChild && $paragraph->lastChild->isWhitespace()) {
              $paragraph->removeChild($paragraph->lastChild);
          }

          // A <p> directly inside a <p> is avoided by turning the outer one into a <div>.
          if ($paragraph->parentNode->tagName === 'p') {
              NodeUtility::setNodeTag($paragraph->parentNode, 'div');
          }
      }

      // Replace font tags with span, walking the list from its end towards its start.
      $fonts = $dom->getElementsByTagName('font');
      for ($index = $fonts->length - 1; $index >= 0; $index--) {
          $this->logger->debug('[PrepDocument] Converting font tag into a span tag.');
          NodeUtility::setNodeTag($fonts->item($index), 'span');
      }
  }
  /**
   * Scores the given nodes, picks the best container and builds the article from it.
   *
   * The method works in four stages:
   *  1. Each node with enough text gets a content score, which is added to all of its
   *     ancestors with a divider that grows with the distance.
   *  2. Every scored ancestor is scaled by (1 - link density) and the best ones are kept
   *     in a ranked list capped at the configured maximum.
   *  3. The top candidate is refined by moving up the tree when several close runners-up
   *     share an ancestor, when a parent scores higher, or when it is an only child.
   *     If there is no candidate, or it is the body, a copy of the body's children
   *     wrapped in a new DIV is used instead.
   *  4. The top candidate and the siblings that look related are imported into a new
   *     document, which is passed through prepArticle().
   *
   * @param array $nodes Nodes to score.
   *
   * @return DOMDocument|bool The prepared article, or false when no node was appended.
   */
  private function rateNodes($nodes)
  {
      $this->logger->info('[Rating] Rating nodes...');

      $candidates = [];

      /** @var DOMElement $node */
      foreach ($nodes as $node) {
          if (is_null($node->parentNode)) {
              continue;
          }

          // Fetched once; reused by the length check and both score components below.
          $innerText = $node->getTextContent(true);

          // Discard nodes with less than 25 characters.
          if (mb_strlen($innerText) < 25) {
              continue;
          }

          $ancestors = $node->getNodeAncestors();

          // Exclude nodes with no ancestor
          if (count($ancestors) === 0) {
              continue;
          }

          // Start with a point for the paragraph itself as a base.
          $contentScore = 1;

          // Add one point per comma-separated segment of the text.
          $contentScore += count(explode(',', $innerText));

          // For every 100 characters in this paragraph, add another point. Up to 3 points.
          $contentScore += min(floor(mb_strlen($innerText) / 100), 3);

          $this->logger->debug(sprintf('[Rating] Node score %s, content: \'%s\'', $contentScore, substr($node->nodeValue, 0, 128)));

          /** @var $ancestor DOMElement */
          foreach ($ancestors as $level => $ancestor) {
              $this->logger->debug('[Rating] Found ancestor, initializing and adding it as a candidate...');
              if (!$ancestor->isInitialized()) {
                  $ancestor->initializeNode($this->configuration->getWeightClasses());
                  $candidates[] = $ancestor;
              }

              /*
               * Node score divider:
               *  - parent:             1 (no division)
               *  - grandparent:        2
               *  - great grandparent+: ancestor level * 3
               */
              if ($level === 0) {
                  $scoreDivider = 1;
              } elseif ($level === 1) {
                  $scoreDivider = 2;
              } else {
                  $scoreDivider = $level * 3;
              }

              $ancestor->contentScore = $ancestor->contentScore + ($contentScore / $scoreDivider);

              $this->logger->debug(sprintf('[Rating] Ancestor score %s, value: \'%s\'', $ancestor->contentScore, substr($ancestor->nodeValue, 0, 128)));
          }
      }

      /*
       * After we've calculated scores, loop through all of the possible
       * candidate nodes we found and keep a ranked list of the best ones.
       */
      $maxTopCandidates = $this->configuration->getMaxTopCandidates();
      $topCandidates = [];
      foreach ($candidates as $candidate) {
          /*
           * Scale the final candidates score based on link density. Good content
           * should have a relatively small link density (5% or less) and be mostly
           * unaffected by this operation.
           */
          $candidate->contentScore = $candidate->contentScore * (1 - $candidate->getLinkDensity());

          // Insert the candidate before the first entry it beats, or into the first free slot.
          for ($i = 0; $i < $maxTopCandidates; $i++) {
              $aTopCandidate = isset($topCandidates[$i]) ? $topCandidates[$i] : null;

              if (!$aTopCandidate || $candidate->contentScore > $aTopCandidate->contentScore) {
                  array_splice($topCandidates, $i, 0, [$candidate]);
                  if (count($topCandidates) > $maxTopCandidates) {
                      array_pop($topCandidates);
                  }
                  break;
              }
          }
      }

      $topCandidate = isset($topCandidates[0]) ? $topCandidates[0] : null;

      /*
       * If we still have no top candidate, just use the body as a last resort.
       * We also have to copy the body node so it is something we can modify.
       */
      if ($topCandidate === null || $topCandidate->nodeName === 'body') {
          $this->logger->info('[Rating] No top candidate found or top candidate is the body tag. Moving all child nodes to a new DIV node.');

          // Copy all of the page's children into a DIV living in a fresh document.
          $fallbackDocument = new DOMDocument('1.0', 'utf-8');
          $fallbackDocument->encoding = 'UTF-8';
          $fallbackDocument->appendChild($fallbackDocument->createElement('div', ''));

          // A document without a <body> simply yields an empty DIV.
          $body = $this->dom->getElementsByTagName('body')->item(0);
          if ($body !== null) {
              $kids = $body->childNodes;
              // Walked by index, as the original code notes a foreach does not work here.
              for ($i = 0; $i < $kids->length; $i++) {
                  $import = $fallbackDocument->importNode($kids->item($i), true);
                  $fallbackDocument->firstChild->appendChild($import);
              }
          }

          // The candidate must be the DIV element, not the document that owns it.
          $topCandidate = $fallbackDocument->firstChild;
      } else {
          $this->logger->info(sprintf('[Rating] Found top candidate, score: %s', $topCandidate->contentScore));

          // Find a better top candidate node if it contains (at least three) nodes which belong to
          // the `topCandidates` array and whose scores are quite close to the current `topCandidate`.
          $alternativeCandidateAncestors = [];
          $topCandidatesCount = count($topCandidates);
          for ($i = 1; $i < $topCandidatesCount; $i++) {
              // A top candidate may end up with a zero content score. max() replaces it
              // with a low value like 0.1 to avoid dividing by zero.
              if ($topCandidates[$i]->contentScore / max($topCandidate->contentScore, 0.1) >= 0.75) {
                  $alternativeCandidateAncestors[] = $topCandidates[$i]->getNodeAncestors(false);
              }
          }

          $minimumTopCandidates = 3;
          $alternativesCount = count($alternativeCandidateAncestors);
          if ($alternativesCount >= $minimumTopCandidates) {
              $parentOfTopCandidate = $topCandidate->parentNode;

              // Stop at the body, and also once the walk leaves element nodes (e.g. reaches the document).
              while ($parentOfTopCandidate->nodeName !== 'body' && $parentOfTopCandidate->nodeType === XML_ELEMENT_NODE) {
                  $listsContainingThisAncestor = 0;
                  for ($ancestorIndex = 0; $ancestorIndex < $alternativesCount && $listsContainingThisAncestor < $minimumTopCandidates; $ancestorIndex++) {
                      // Strict mode compares node identity instead of loosely comparing objects.
                      $listsContainingThisAncestor += (int)in_array($parentOfTopCandidate, $alternativeCandidateAncestors[$ancestorIndex], true);
                  }
                  if ($listsContainingThisAncestor >= $minimumTopCandidates) {
                      $topCandidate = $parentOfTopCandidate;
                      break;
                  }
                  $parentOfTopCandidate = $parentOfTopCandidate->parentNode;
              }
          }

          /*
           * Because of our bonus system, parents of candidates might have scores
           * themselves. They get half of the node. There won't be nodes with higher
           * scores than our topCandidate, but if we see the score going *up* in the first
           * few steps up the tree, that's a decent sign that there might be more content
           * lurking in other places that we want to unify in. The sibling stuff
           * below does some of that - but only if we've looked high enough up the DOM
           * tree.
           */
          $parentOfTopCandidate = $topCandidate->parentNode;
          $lastScore = $topCandidate->contentScore;

          // The scores shouldn't get too low.
          $scoreThreshold = $lastScore / 3;

          /* @var DOMElement $parentOfTopCandidate */
          // Same element-only guard as the walk above, so the loop never steps past the document.
          while ($parentOfTopCandidate->nodeName !== 'body' && $parentOfTopCandidate->nodeType === XML_ELEMENT_NODE) {
              $parentScore = $parentOfTopCandidate->contentScore;
              if ($parentScore < $scoreThreshold) {
                  break;
              }

              if ($parentScore > $lastScore) {
                  // Alright! We found a better parent to use.
                  $topCandidate = $parentOfTopCandidate;
                  $this->logger->info('[Rating] Found a better top candidate.');
                  break;
              }
              $lastScore = $parentOfTopCandidate->contentScore;
              $parentOfTopCandidate = $parentOfTopCandidate->parentNode;
          }

          // If the top candidate is the only child, use parent instead. This will help sibling
          // joining logic when adjacent content is actually located in parent's sibling node.
          $parentOfTopCandidate = $topCandidate->parentNode;
          while (
              $parentOfTopCandidate->nodeName !== 'body'
              && $parentOfTopCandidate->nodeType === XML_ELEMENT_NODE
              && count($parentOfTopCandidate->getChildren(true)) === 1
          ) {
              $topCandidate = $parentOfTopCandidate;
              $parentOfTopCandidate = $topCandidate->parentNode;
          }
      }

      /*
       * Now that we have the top candidate, look through its siblings for content
       * that might also be related. Things like preambles, content split by ads
       * that we removed, etc.
       */
      $this->logger->info('[Rating] Creating final article content document...');

      // The original also called createElement('div') here and discarded the result; that no-op is gone.
      $articleContent = new DOMDocument('1.0', 'utf-8');

      $siblingScoreThreshold = max(10, $topCandidate->contentScore * 0.2);

      // Keep potential top candidate's parent node to try to get text direction of it later.
      $parentOfTopCandidate = $topCandidate->parentNode;
      $siblings = $parentOfTopCandidate->getChildren();

      $hasContent = false;

      $this->logger->info('[Rating] Adding top candidate siblings...');

      /** @var DOMElement $sibling */
      foreach ($siblings as $sibling) {
          $append = false;

          if ($sibling === $topCandidate) {
              $this->logger->debug('[Rating] Sibling is equal to the top candidate, adding to the final article...');
              $append = true;
          } else {
              $contentBonus = 0;

              // Give a bonus if sibling nodes and top candidates have the exact same classname
              if ($sibling->getAttribute('class') === $topCandidate->getAttribute('class') && $topCandidate->getAttribute('class') !== '') {
                  $contentBonus += $topCandidate->contentScore * 0.2;
              }

              if ($sibling->contentScore + $contentBonus >= $siblingScoreThreshold) {
                  $append = true;
              } elseif ($sibling->nodeName === 'p') {
                  $linkDensity = $sibling->getLinkDensity();
                  $nodeContent = $sibling->getTextContent(true);

                  // Long paragraphs with few links, or short link-free ones that contain a
                  // sentence end, are taken as content as well.
                  if (mb_strlen($nodeContent) > 80 && $linkDensity < 0.25) {
                      $append = true;
                  } elseif ($nodeContent && mb_strlen($nodeContent) < 80 && $linkDensity === 0 && preg_match('/\.( |$)/', $nodeContent)) {
                      $append = true;
                  }
              }
          }

          if ($append) {
              $this->logger->debug(sprintf('[Rating] Appending sibling to final article, content is: \'%s\'', substr($sibling->nodeValue, 0, 128)));

              $hasContent = true;

              if (!in_array(strtolower($sibling->nodeName), $this->alterToDIVExceptions)) {
                  /*
                   * We have a node that isn't a common block level element, like a form or td tag.
                   * Turn it into a div so it doesn't get filtered out later by accident.
                   */
                  $sibling = NodeUtility::setNodeTag($sibling, 'div');
              }

              /*
               * importNode() copies the sibling, so the array being iterated is not
               * disturbed by the append.
               */
              $import = $articleContent->importNode($sibling, true);
              $articleContent->appendChild($import);
          }
      }

      $articleContent = $this->prepArticle($articleContent);

      if (!$hasContent) {
          return false;
      }

      // Find out text direction from ancestors of final top candidate.
      $ancestors = array_merge([$parentOfTopCandidate, $topCandidate], $parentOfTopCandidate->getNodeAncestors());
      foreach ($ancestors as $ancestor) {
          $articleDir = $ancestor->getAttribute('dir');
          if ($articleDir) {
              $this->setDirection($articleDir);
              $this->logger->debug(sprintf('[Rating] Found article direction: %s', $articleDir));
              break;
          }
      }

      return $articleContent;
  }
  /**
   * Cleans up the final article.
   *
   * In order: presentational attributes and <style> elements are removed, data tables
   * are flagged, and unwanted elements are removed (some unconditionally, some through
   * the conditional cleaner). Descendants whose class/id contain "share" are dropped,
   * and a lone <h2> that repeats the title is removed. Finally, <br> elements directly
   * followed by a <p> are removed and single-cell tables are replaced by their cell.
   *
   * @param DOMDocument $article Article document; it is modified in place.
   *
   * @return DOMDocument The same document instance that was passed in.
   */
  public function prepArticle(DOMDocument $article)
  {
      $this->logger->info('[PrepArticle] Preparing final article...');

      $this->_cleanStyles($article);
      $this->_clean($article, 'style');

      // Check for data tables before we continue, to avoid removing items in
      // those tables, which will often be isolated even though they're
      // visually linked to other content-ful elements (text, images, etc.).
      $this->_markDataTables($article);

      // Clean out junk from the article content
      $this->_cleanConditionally($article, 'form');
      $this->_cleanConditionally($article, 'fieldset');
      foreach (['object', 'embed', 'h1', 'footer', 'link', 'aside'] as $tag) {
          $this->_clean($article, $tag);
      }

      // Clean out elements that have "share" in their id/class combinations from the final top
      // candidates. The top candidates themselves are kept even if they have "share".
      foreach ($article->childNodes as $child) {
          $this->_cleanMatchedNodes($child, '/share/i');
      }

      /*
       * If there is only one h2 and its text content substantially equals article title,
       * they are probably using it as a header and not a subheader,
       * so remove it since we already extract the title separately.
       */
      $h2 = $article->getElementsByTagName('h2');
      if ($h2->length === 1) {
          $headingText = $h2->item(0)->textContent;
          // getTitle() may return null; the string functions below expect a string.
          $title = (string)$this->getTitle();
          $titleLength = mb_strlen($title);

          $lengthSimilarRate = (mb_strlen($headingText) - $titleLength) / max($titleLength, 1);

          if (abs($lengthSimilarRate) < 0.5) {
              // The longer of the two texts has to contain the shorter one.
              if ($lengthSimilarRate > 0) {
                  $titlesMatch = strpos($headingText, $title) !== false;
              } else {
                  $titlesMatch = strpos($title, $headingText) !== false;
              }
              if ($titlesMatch) {
                  $this->logger->info('[PrepArticle] Found title repeated in an H2 node, removing...');
                  $this->_clean($article, 'h2');
              }
          }
      }

      foreach (['iframe', 'input', 'textarea', 'select', 'button'] as $tag) {
          $this->_clean($article, $tag);
      }
      $this->_cleanHeaders($article);

      // Do these last as the previous stuff may have removed junk
      // that will affect these
      $this->_cleanConditionally($article, 'table');
      $this->_cleanConditionally($article, 'ul');
      $this->_cleanConditionally($article, 'div');

      $this->_cleanExtraParagraphs($article);

      foreach (iterator_to_array($article->getElementsByTagName('br')) as $br) {
          $next = $br->nextSibling;
          if ($next && $next->nodeName === 'p') {
              $this->logger->debug('[PrepArticle] Removing br node next to a p node.');
              $br->parentNode->removeChild($br);
          }
      }

      // Remove single-cell tables
      foreach ($article->shiftingAwareGetElementsByTagName('table') as $table) {
          /*
           * The single element is fetched with getChildren(true)[0], the lookup this file
           * already pairs with hasSingleTagInsideElement() elsewhere. firstChild and
           * childNodes[0] may be a whitespace text node sitting before the element.
           * NOTE(review): assumes getChildren(true) returns element children only - confirm.
           */
          /** @var DOMNode $table */
          $tbody = $table->hasSingleTagInsideElement('tbody') ? $table->getChildren(true)[0] : $table;
          if (!$tbody->hasSingleTagInsideElement('tr')) {
              continue;
          }

          $row = $tbody->getChildren(true)[0];
          if (!$row->hasSingleTagInsideElement('td')) {
              continue;
          }

          $cell = $row->getChildren(true)[0];

          // A cell holding only phrasing content becomes a <p>, anything else a <div>.
          $onlyPhrasingContent = true;
          foreach (iterator_to_array($cell->childNodes) as $cellChild) {
              if (!$cellChild->isPhrasingContent()) {
                  $onlyPhrasingContent = false;
                  break;
              }
          }

          $cell = NodeUtility::setNodeTag($cell, $onlyPhrasingContent ? 'p' : 'div');
          $table->parentNode->replaceChild($cell, $table);
      }

      return $article;
  }
  /**
   * Flags every table in the article as a 'data' table or a 'layout' table.
   *
   * The checks are similar to those in
   * https://dxr.mozilla.org/mozilla-central/rev/71224049c0b52ab190564d3ea0eab089a159a4cf/accessible/html/HTMLTableAccessible.cpp#920.
   * The first rule that applies decides, and the verdict is stored on the table through
   * setReadabilityDataTable():
   *  - role="presentation" or datatable="0": layout;
   *  - a non-empty summary attribute, or a first <caption> with child nodes: data;
   *  - any col, colgroup, tfoot, thead or th descendant: data;
   *  - a nested table: layout;
   *  - 10 or more rows, or more than 4 columns: data;
   *  - otherwise data only when rows * columns is greater than 10.
   *
   * @param DOMDocument $article Article whose tables are inspected.
   *
   * @return void
   */
  public function _markDataTables(DOMDocument $article)
  {
      foreach ($article->getElementsByTagName('table') as $table) {
          /** @var DOMElement $table */
          if ($table->getAttribute('role') === 'presentation') {
              $table->setReadabilityDataTable(false);
              continue;
          }

          // Strict comparison on purpose: `==` compares two numeric strings as numbers,
          // so values such as "00" or "0.0" were also taken for "0".
          if ($table->getAttribute('datatable') === '0') {
              $table->setReadabilityDataTable(false);
              continue;
          }

          if ($table->getAttribute('summary')) {
              $table->setReadabilityDataTable(true);
              continue;
          }

          $caption = $table->getElementsByTagName('caption');
          if ($caption->length > 0 && $caption->item(0)->childNodes->length > 0) {
              $table->setReadabilityDataTable(true);
              continue;
          }

          // If the table has a descendant with any of these tags, consider a data table:
          foreach (['col', 'colgroup', 'tfoot', 'thead', 'th'] as $dataTableDescendant) {
              if ($table->getElementsByTagName($dataTableDescendant)->length > 0) {
                  $table->setReadabilityDataTable(true);
                  // Move on to the next table, not just the next tag name.
                  continue 2;
              }
          }

          // Nested tables indicate a layout table:
          if ($table->getElementsByTagName('table')->length > 0) {
              $table->setReadabilityDataTable(false);
              continue;
          }

          $sizeInfo = $table->getRowAndColumnCount();
          if ($sizeInfo['rows'] >= 10 || $sizeInfo['columns'] > 4) {
              $table->setReadabilityDataTable(true);
              continue;
          }

          // Now just go by size entirely:
          $table->setReadabilityDataTable($sizeInfo['rows'] * $sizeInfo['columns'] > 10);
      }
  }
  /**
   * Removes the `style` attribute and the deprecated presentational attributes from a
   * node and, recursively, from everything below it.
   *
   * <svg> elements are skipped together with their whole subtree. `width` and `height`
   * are removed only from table, th, td, hr and pre elements.
   *
   * @param $node DOMDocument|DOMNode Root of the subtree to clean.
   **/
  public function _cleanStyles($node)
  {
      $hasTagName = property_exists($node, 'tagName');

      if ($hasTagName && $node->tagName === 'svg') {
          return;
      }

      // Nodes that cannot carry attributes are only traversed.
      if (method_exists($node, 'removeAttribute')) {
          $presentationalAttributes = [
              'align', 'background', 'bgcolor', 'border', 'cellpadding', 'cellspacing',
              'frame', 'hspace', 'rules', 'style', 'valign', 'vspace',
          ];
          foreach ($presentationalAttributes as $attributeName) {
              $node->removeAttribute($attributeName);
          }

          if ($hasTagName && in_array($node->tagName, ['table', 'th', 'td', 'hr', 'pre'])) {
              foreach (['width', 'height'] as $sizeAttribute) {
                  $node->removeAttribute($sizeAttribute);
              }
          }
      }

      for ($child = $node->firstChild; $child !== null; $child = $child->nextSibling) {
          $this->_cleanStyles($child);
      }
  }
  /**
   * Removes the nodes following $node whose "class id" string matches a pattern.
   *
   * The walk starts at NodeUtility::getNextNode($node) and ends at the node returned by
   * NodeUtility::getNextNode($node, true), so $node itself is never tested.
   * NOTE(review): the end marker is presumably the first node outside $node's subtree -
   * confirm against NodeUtility::getNextNode().
   *
   * @param $node DOMElement Node whose following nodes are checked.
   * @param $regex string Pattern tested against "<class> <id>" of each visited node.
   *
   * @return void
   **/
  public function _cleanMatchedNodes($node, $regex)
  {
      $stopAt = NodeUtility::getNextNode($node, true);
      $current = NodeUtility::getNextNode($node);

      while ($current && $current !== $stopAt) {
          $class = $current->getAttribute('class');
          $id = $current->getAttribute('id');

          if (!preg_match($regex, sprintf('%s %s', $class, $id))) {
              $current = NodeUtility::getNextNode($current);
              continue;
          }

          $this->logger->debug(sprintf('Removing matched node with regex: \'%s\', node class was: \'%s\', id: \'%s\'', $regex, $class, $id));
          $current = NodeUtility::removeAndGetNext($current);
      }
  }
  /**
   * Removes the paragraphs that hold neither media nor visible text.
   *
   * A <p> is removed when it has no img, embed, object or iframe descendant and its
   * text is empty once the 'onlyWhitespace' pattern has been stripped from it.
   *
   * @param DOMDocument $article Article document; it is modified in place.
   *
   * @return void
   */
  public function _cleanExtraParagraphs(DOMDocument $article)
  {
      $paragraphs = $article->getElementsByTagName('p');

      // Walk the live list from its end so removals do not disturb the entries still to come.
      for ($index = $paragraphs->length - 1; $index >= 0; $index--) {
          $paragraph = $paragraphs->item($index);

          // At this point, nasty iframes have been removed, only embedded video ones remain.
          $mediaCount = 0;
          foreach (['img', 'embed', 'object', 'iframe'] as $mediaTag) {
              $mediaCount += $paragraph->getElementsByTagName($mediaTag)->length;
          }
          if ($mediaCount !== 0) {
              continue;
          }

          /*
           * Compare with '' explicitly. A plain truthiness test also treats the text "0"
           * as empty, which deleted paragraphs such as <p>0</p>. A failed replacement
           * (null) leaves the paragraph in place.
           */
          $visibleText = preg_replace(NodeUtility::$regexps['onlyWhitespace'], '', $paragraph->textContent);
          if ($visibleText !== '') {
              continue;
          }

          $this->logger->debug(sprintf('[PrepArticle] Removing extra paragraph. Text content was: \'%s\'', substr($paragraph->textContent, 0, 128)));
          $paragraph->parentNode->removeChild($paragraph);
      }
  }
  /**
   * Removes the elements of a given tag that look like junk rather than content.
   *
   * Does nothing when conditional cleaning is disabled in the configuration. Elements
   * inside a table flagged as a data table are always kept. An element is removed when
   * its class weight is negative, or when it has fewer than 10 commas and one of the
   * heuristics below (image/paragraph ratio, list items, inputs, text length, link
   * density, video embeds) fires.
   *
   * @param DOMDocument $article Article document; it is modified in place.
   * @param string $tag Tag to clean conditionally
   *
   * @return void
   */
  public function _cleanConditionally(DOMDocument $article, $tag)
  {
      if (!$this->configuration->getCleanConditionally()) {
          return;
      }

      $isList = in_array($tag, ['ul', 'ol']);

      $isDataTable = function ($ancestor) {
          return $ancestor->isReadabilityDataTable();
      };

      /*
       * Traverse backwards so nodes can be removed along the way
       * without affecting the traversal.
       */
      $elements = $article->getElementsByTagName($tag);
      for ($index = $elements->length - 1; $index >= 0; $index--) {
          /** @var $node DOMElement */
          $node = $elements->item($index);

          // First check if we're in a data table, in which case don't remove us.
          if ($node->hasAncestorTag('table', -1, $isDataTable)) {
              continue;
          }

          $weight = $this->configuration->getWeightClasses() ? $node->getClassWeight() : 0;

          if ($weight < 0) {
              $this->logger->debug(sprintf('[PrepArticle] Removing tag \'%s\' with 0 or less weight', $tag));
              NodeUtility::removeNode($node);
              continue;
          }

          // Elements with plenty of commas are taken as real text and kept.
          if (substr_count($node->getTextContent(), ',') >= 10) {
              continue;
          }

          /*
           * Not very many commas: gather counts for other typical elements embedded
           * within and remove the element on ominous signs.
           */
          $paragraphCount = $node->getElementsByTagName('p')->length;
          $imageCount = $node->getElementsByTagName('img')->length;
          $listItemScore = $node->getElementsByTagName('li')->length - 100;
          $inputCount = $node->getElementsByTagName('input')->length;

          // Only embeds whose serialised markup matches the 'videos' pattern are counted.
          $videoEmbedCount = 0;
          foreach ($node->getElementsByTagName('embed') as $embedNode) {
              if (preg_match(NodeUtility::$regexps['videos'], $embedNode->C14N())) {
                  $videoEmbedCount++;
              }
          }

          $linkDensity = $node->getLinkDensity();
          $contentLength = mb_strlen($node->getTextContent(true));

          $haveToRemove =
              ($imageCount > 1 && $paragraphCount / $imageCount < 0.5 && !$node->hasAncestorTag('figure')) ||
              (!$isList && $listItemScore > $paragraphCount) ||
              ($inputCount > floor($paragraphCount / 3)) ||
              (!$isList && $contentLength < 25 && ($imageCount === 0 || $imageCount > 2) && !$node->hasAncestorTag('figure')) ||
              (!$isList && $weight < 25 && $linkDensity > 0.2) ||
              ($weight >= 25 && $linkDensity > 0.5) ||
              (($videoEmbedCount === 1 && $contentLength < 75) || $videoEmbedCount > 1);

          if ($haveToRemove) {
              $this->logger->debug(sprintf('[PrepArticle] Removing tag \'%s\'.', $tag));
              NodeUtility::removeNode($node);
          }
      }
  }
  /**
   * Removes every element of the given tag from the article.
   *
   * For object, embed and iframe tags an element is spared when its attribute values,
   * or its serialised markup, match the 'videos' pattern.
   *
   * @param $article DOMDocument Article document; it is modified in place.
   * @param $tag string tag to clean
   *
   * @return void
   **/
  public function _clean(DOMDocument $article, $tag)
  {
      $isEmbed = in_array($tag, ['object', 'embed', 'iframe']);

      // Walk the live list from its end so removals do not disturb the entries still to come.
      $elements = $article->getElementsByTagName($tag);
      for ($index = $elements->length - 1; $index >= 0; $index--) {
          $element = $elements->item($index);

          if ($isEmbed) {
              $attributeValues = implode('|', array_map(function ($attribute) {
                  return $attribute->nodeValue;
              }, iterator_to_array($element->attributes, false)));

              // Check the element's own attributes first, then everything inside it.
              if (
                  preg_match(NodeUtility::$regexps['videos'], $attributeValues)
                  || preg_match(NodeUtility::$regexps['videos'], $element->C14N())
              ) {
                  continue;
              }
          }

          $this->logger->debug(sprintf('[PrepArticle] Removing node \'%s\'.', $element->tagName));
          NodeUtility::removeNode($element);
      }
  }
  /**
   * Removes the h1 and h2 headers whose class weight is negative.
   *
   * The weight is only computed when class weighting is enabled in the configuration;
   * otherwise it is 0 and nothing is removed.
   *
   * @param DOMDocument $article Article document; it is modified in place.
   *
   * @return void
   **/
  public function _cleanHeaders(DOMDocument $article)
  {
      foreach (['h1', 'h2'] as $headerTag) {
          // Iterate over a snapshot. The list returned by getElementsByTagName() is live,
          // so removing a header while walking it made the loop skip the header after it.
          $headers = iterator_to_array($article->getElementsByTagName($headerTag));

          /** @var $header DOMElement */
          foreach ($headers as $header) {
              $weight = 0;
              if ($this->configuration->getWeightClasses()) {
                  $weight = $header->getClassWeight();
              }

              if ($weight < 0) {
                  $this->logger->debug(sprintf('[PrepArticle] Removing H node with 0 or less weight. Content was: \'%s\'', substr($header->nodeValue, 0, 128)));
                  NodeUtility::removeNode($header);
              }
          }
      }
  }
  /**
   * Strips the class attribute from a node and from every node below it.
   *
   * Readability.js filters out the classes the algorithm adds itself. No classes are
   * added here, so no such filter is needed.
   *
   * @param DOMDocument|DOMNode $node Root of the subtree to clean.
   *
   * @return void
   **/
  public function _cleanClasses($node)
  {
      if ($node->getAttribute('class') !== '') {
          $node->removeAttribute('class');
      }

      $child = $node->firstChild;
      while ($child !== null) {
          $this->_cleanClasses($child);
          $child = $child->nextSibling;
      }
  }
  /**
   * Applies the final touches to the extracted article.
   *
   * When the configuration asks for relative URLs to be fixed:
   *  - links with a `javascript:` href are replaced by their text content;
   *  - every other non-empty href is passed through toAbsoluteURI();
   *  - each image gets a `src` built from the first non-empty value among src,
   *    data-src, data-original, data-orig and data-url.
   * The class attributes are then removed from the whole document in every case.
   *
   * @param DOMDocument $article Article document; it is modified in place.
   *
   * @return DOMDocument The same document instance that was passed in.
   */
  public function postProcessContent(DOMDocument $article)
  {
      $this->logger->info('[PostProcess] PostProcessing content...');

      // Readability cannot open relative uris so we convert them to absolute uris.
      if ($this->configuration->getFixRelativeURLs()) {
          foreach (iterator_to_array($article->getElementsByTagName('a')) as $link) {
              /** @var DOMElement $link */
              $href = $link->getAttribute('href');
              if (!$href) {
                  continue;
              }

              if (strpos($href, 'javascript:') !== 0) {
                  $this->logger->debug(sprintf('[PostProcess] Converting link to absolute URI: \'%s\'', substr($href, 0, 128)));
                  $link->setAttribute('href', $this->toAbsoluteURI($href));
                  continue;
              }

              // Replace links with javascript: URIs with text content, since
              // they won't work after scripts have been removed from the page.
              $this->logger->debug(sprintf('[PostProcess] Removing \'javascript:\' link. Content is: \'%s\'', substr($link->textContent, 0, 128)));
              $text = $article->createTextNode($link->textContent);
              $link->parentNode->replaceChild($text, $link);
          }

          foreach ($article->getElementsByTagName('img') as $img) {
              /** @var DOMElement $img */
              // Take the first attribute, in this order, that holds a usable value.
              $src = false;
              foreach (['src', 'data-src', 'data-original', 'data-orig', 'data-url'] as $sourceAttribute) {
                  $value = $img->getAttribute($sourceAttribute);
                  if ($value) {
                      $src = $value;
                      break;
                  }
              }

              if ($src) {
                  $this->logger->debug(sprintf('[PostProcess] Converting image URL to absolute URI: \'%s\'', substr($src, 0, 128)));
                  $img->setAttribute('src', $this->toAbsoluteURI($src));
              }
          }
      }

      $this->_cleanClasses($article);

      return $article;
  }
  /**
   * Renders the article as a single HTML string: the title wrapped in an
   * <h1> element, immediately followed by the serialised content.
   *
   * NOTE(review): the title is interpolated as-is, without HTML escaping —
   * confirm that this is what consumers of the string expect.
   *
   * @return string
   */
  public function __toString()
  {
      $heading = $this->getTitle();
      $body = $this->getContent();

      return sprintf('<h1>%s</h1>%s', $heading, $body);
  }
  /**
   * Returns the value currently held in the title property.
   *
   * @return string|null
   */
  public function getTitle()
  {
      return $this->title;
  }
  /**
   * Stores the given value in the title property.
   *
   * @param string $title Title to store
   */
  protected function setTitle($title)
  {
      $this->title = $title;
  }
  /**
   * Returns the content document serialised through C14N(), or null when
   * the content property does not hold a DOMDocument.
   *
   * @return string|null
   */
  public function getContent()
  {
      if (!($this->content instanceof DOMDocument)) {
          return null;
      }

      return $this->content->C14N();
  }
  /**
   * Returns the content property as-is, without serialising it.
   *
   * @return DOMDocument|null
   */
  public function getDOMDocument()
  {
      return $this->content;
  }
  /**
   * Stores the given document in the content property.
   *
   * @param DOMDocument $content Document to store
   */
  protected function setContent(DOMDocument $content)
  {
      $this->content = $content;
  }
  /**
   * Returns the value currently held in the excerpt property.
   *
   * @return string|null
   */
  public function getExcerpt()
  {
      return $this->excerpt;
  }
  /**
   * Stores the given value in the excerpt property.
   *
   * @param string|null $excerpt Excerpt to store
   */
  public function setExcerpt($excerpt)
  {
      $this->excerpt = $excerpt;
  }
  /**
   * Returns the value currently held in the image property.
   *
   * @return string|null
   */
  public function getImage()
  {
      return $this->image;
  }
  /**
   * Stores the given value in the image property.
   *
   * @param string $image Image value to store
   */
  protected function setImage($image)
  {
      $this->image = $image;
  }
  /**
   * Returns the value currently held in the author property.
   *
   * @return string|null
   */
  public function getAuthor()
  {
      return $this->author;
  }
  /**
   * Stores the given value in the author property.
   *
   * @param string $author Author to store
   */
  protected function setAuthor($author)
  {
      $this->author = $author;
  }
  /**
   * Returns the value currently held in the direction property.
   *
   * @return string|null
   */
  public function getDirection()
  {
      return $this->direction;
  }
  /**
   * Stores the given value in the direction property.
   *
   * @param string|null $direction Direction to store
   */
  public function setDirection($direction)
  {
      $this->direction = $direction;
  }
  1502. }