summary | refs | log | tree | commit | diff
path: root/src
diff options
context:
space:
mode:
author Andres Rey <[email protected]> 2017-02-12 20:13:02 +0000
committer Andres Rey <[email protected]> 2017-02-12 20:13:02 +0000
commit ae573a6fe5d369c014b427b813fdc0bdfc20de54 (patch)
tree fe2f250421f3ff39986bf7565b6a92fcdb822f51 /src
parent 550fe35fbf3bbefa3257ddb7ec32afaea48b6726 (diff)
Extract top image when og:image and twitter:image are missing on the HTML
Diffstat (limited to 'src')
-rw-r--r-- src/HTMLParser.php 32
1 files changed, 31 insertions, 1 deletions
diff --git a/src/HTMLParser.php b/src/HTMLParser.php
index 6f7afe7..5866bd9 100644
--- a/src/HTMLParser.php
+++ b/src/HTMLParser.php
@@ -132,6 +132,8 @@ class HTMLParser
$this->metadata = $this->getMetadata();
+ $this->metadata['image'] = $this->getMainImage();
+
$this->metadata['title'] = $this->getTitle();
// Checking for minimum HTML to work with.
@@ -158,7 +160,7 @@ class HTMLParser
// TODO Better way to count resulting text. Textcontent usually has alt titles and that stuff
// that doesn't really count to the quality of the result.
$length = 0;
- foreach($result->getElementsByTagName('p') as $p){
+ foreach ($result->getElementsByTagName('p') as $p) {
$length += mb_strlen($p->textContent);
}
if ($result && mb_strlen(preg_replace('/\s/', '', $result->textContent)) < 500) {
@@ -431,12 +433,40 @@ class HTMLParser
if (array_key_exists('og:image', $values) || array_key_exists('twitter:image', $values)) {
$metadata['image'] = ($values['og:image']) ? $values['og:image'] : $values['twitter:image'];
+ } else {
+ $metadata['image'] = null;
}
return $metadata;
}
/**
+ * Tries to get the main article image. Returns $this->metadata['image'] when it is already set (not null);
+ * otherwise falls back to the href of the first <link> element whose rel contains img_src or image_src.
+ *
+ * @return bool|string URL of the top image or false if unsuccessful.
+ */
+ public function getMainImage()
+ {
+     if (isset($this->metadata['image'])) {
+         return $this->metadata['image'];
+     }
+
+     foreach ($this->dom->getElementsByTagName('link') as $link) {
+         /** @var \DOMElement $link */
+         // rel is a space-separated, case-insensitive token list; getAttribute() returns '' when it is absent.
+         $rels = preg_split('/\s+/', strtolower(trim($link->getAttribute('rel'))));
+         $href = trim($link->getAttribute('href'));
+         // Skip links with a matching rel but no usable URL, so a later candidate can still be returned.
+         if ($href !== '' && array_intersect(['img_src', 'image_src'], $rels) !== []) {
+             return $href;
+         }
+     }
+
+     return false;
+ }
+
+ /**
* Get the density of links as a percentage of the content
* This is the amount of text that is inside a link divided by the total text in the node.
*