diff options
author | Andres Rey <[email protected]> | 2016-12-28 16:23:07 -0300 |
---|---|---|
committer | Andres Rey <[email protected]> | 2016-12-28 16:23:07 -0300 |
commit | 550fe35fbf3bbefa3257ddb7ec32afaea48b6726 (patch) | |
tree | 8a245374b4b33374b9dc0592cbb58da2038eaff9 /src | |
parent | 1b0ac19d31473b728edfb2e72d3ec0cf1ebac35f (diff) |
Removed the private var title since it wasn't used
Diffstat (limited to 'src')
-rw-r--r-- | src/HTMLParser.php | 13 |
1 files changed, 6 insertions, 7 deletions
```diff
diff --git a/src/HTMLParser.php b/src/HTMLParser.php
index a8c28ff..6f7afe7 100644
--- a/src/HTMLParser.php
+++ b/src/HTMLParser.php
@@ -29,11 +29,6 @@ class HTMLParser
     /**
      * @var array
      */
-    private $title = [];
-
-    /**
-     * @var array
-     */
     private $regexps = [
         'unlikelyCandidates' => '/banner|combx|comment|community|disqus|extra|foot|header|menu|modal|related|remark|rss|share|shoutbox|sidebar|skyscraper|sponsor|ad-break|agegate|pagination|pager|popup/i',
         'okMaybeItsACandidate' => '/and|article|body|column|main|shadow/i',
@@ -137,7 +132,7 @@ class HTMLParser
 
         $this->metadata = $this->getMetadata();
 
-        $this->title = $this->getTitle();
+        $this->metadata['title'] = $this->getTitle();
 
         // Checking for minimum HTML to work with.
         if (!($root = $this->dom->getElementsByTagName('body')->item(0))) {
@@ -162,7 +157,11 @@ class HTMLParser
 
         // TODO Better way to count resulting text. Textcontent usually has alt titles and that stuff
         // that doesn't really count to the quality of the result.
-        if ($result && mb_strlen($result->textContent) < 500) {
+        $length = 0;
+        foreach($result->getElementsByTagName('p') as $p){
+            $length += mb_strlen($p->textContent);
+        }
+        if ($result && mb_strlen(preg_replace('/\s/', '', $result->textContent)) < 500) {
             $root = $this->backupdom->getElementsByTagName('body')->item(0);
 
             if ($this->getConfig()->getOption('stripUnlikelyCandidates')) {
```