diff options
author | Andres Rey <[email protected]> | 2017-02-12 20:13:02 +0000 |
---|---|---|
committer | Andres Rey <[email protected]> | 2017-02-12 20:13:02 +0000 |
commit | ae573a6fe5d369c014b427b813fdc0bdfc20de54 (patch) | |
tree | fe2f250421f3ff39986bf7565b6a92fcdb822f51 /src | |
parent | 550fe35fbf3bbefa3257ddb7ec32afaea48b6726 (diff) |
Extract top image when og:image and twitter:image are missing on the HTML
Diffstat (limited to 'src')
-rw-r--r-- | src/HTMLParser.php | 32 |
1 file changed, 31 insertions, 1 deletion
```diff
diff --git a/src/HTMLParser.php b/src/HTMLParser.php
index 6f7afe7..5866bd9 100644
--- a/src/HTMLParser.php
+++ b/src/HTMLParser.php
@@ -132,6 +132,8 @@ class HTMLParser
 
         $this->metadata = $this->getMetadata();
 
+        $this->metadata['image'] = $this->getMainImage();
+
         $this->metadata['title'] = $this->getTitle();
 
         // Checking for minimum HTML to work with.
@@ -158,7 +160,7 @@ class HTMLParser
             // TODO Better way to count resulting text. Textcontent usually has alt titles and that stuff
             // that doesn't really count to the quality of the result.
             $length = 0;
-            foreach($result->getElementsByTagName('p') as $p){
+            foreach ($result->getElementsByTagName('p') as $p) {
                 $length += mb_strlen($p->textContent);
             }
             if ($result && mb_strlen(preg_replace('/\s/', '', $result->textContent)) < 500) {
@@ -431,12 +433,40 @@ class HTMLParser
 
         if (array_key_exists('og:image', $values) || array_key_exists('twitter:image', $values)) {
             $metadata['image'] = ($values['og:image']) ? $values['og:image'] : $values['twitter:image'];
+        } else {
+            $metadata['image'] = null;
         }
 
         return $metadata;
     }
 
     /**
+     * Tries to get the main article image. Will only update the metadata if the getMetadata function couldn't
+     * find a correct image.
+     *
+     * @return bool|string URL of the top image or false if unsuccessful.
+     */
+    public function getMainImage()
+    {
+        if ($this->metadata['image'] !== null) {
+            return $this->metadata['image'];
+        }
+
+        foreach ($this->dom->getElementsByTagName('link') as $link) {
+            /** @var \DOMElement $link */
+            /*
+             * Check for the rel attribute, then check if the rel attribute is either img_src or image_src, and
+             * finally check for the existence of the href attribute, which should hold the image url.
+             */
+            if ($link->hasAttribute('rel') && ($link->getAttribute('rel') === 'img_src' || $link->getAttribute('rel') === 'image_src') && $link->hasAttribute('href')){
+                return $link->getAttribute('href');
+            }
+        }
+
+        return false;
+    }
+
+    /**
      * Get the density of links as a percentage of the content
      * This is the amount of text that is inside a link divided by the total text in the node.
      *
```