From 8fd4d10b08265088801670a228b011adad57cd97 Mon Sep 17 00:00:00 2001 From: Andres Rey Date: Mon, 26 Dec 2016 16:46:47 -0300 Subject: Updated the getMetadata function --- src/HTMLParser.php | 60 +++++++++++++++++++++++++++++++++++++++--------------- 1 file changed, 44 insertions(+), 16 deletions(-) (limited to 'src') diff --git a/src/HTMLParser.php b/src/HTMLParser.php index 7ee0594..d5308f9 100644 --- a/src/HTMLParser.php +++ b/src/HTMLParser.php @@ -377,30 +377,57 @@ class HTMLParser */ private function getMetadata() { - $metadata = []; + $metadata = $values = []; + // Match "description", or Twitter's "twitter:description" (Cards) + // in name attribute. + $namePattern = '/^\s*((twitter)\s*:\s*)?(description|title)\s*$/i'; + + // Match Facebook's Open Graph title & description properties. + $propertyPattern = '/^\s*og\s*:\s*(description|title)\s*$/i'; + foreach ($this->dom->getElementsByTagName('meta') as $meta) { /* @var Readability $meta */ - $name = $meta->getAttribute('name'); - $property = $meta->getAttribute('property'); - - // Select either name or property - $item = ($name ? $name : $property); + $elementName = $meta->getAttribute('name'); + $elementProperty = $meta->getAttribute('property'); - if ($item == 'og:title' || $item == 'twitter:title') { - $metadata['title'] = $meta->getAttribute('content'); + if (in_array('author', [$elementName, $elementProperty])) { + $metadata['byline'] = $meta->getAttribute('content'); + continue; } - if ($item == 'og:description' || $item == 'twitter:description') { - $metadata['excerpt'] = $meta->getAttribute('content'); + $name = null; + if (preg_match($namePattern, $elementName)) { + $name = $elementName; + } elseif (preg_match($propertyPattern, $elementProperty)) { + $name = $elementProperty; } - if ($item == 'author') { - $metadata['byline'] = $meta->getAttribute('content'); + if ($name) { + $content = $meta->getAttribute('content'); + if ($content) { + // Convert to lowercase and remove any whitespace + // so we can match below. + $name = preg_replace('/\s/', '', strtolower($name)); + $values[$name] = trim($content); + } } + } + if (array_key_exists('description', $values)) { + $metadata['excerpt'] = $values['description']; + } elseif (array_key_exists('og:description', $values)) { + // Use facebook open graph description. + $metadata['excerpt'] = $values['og:description']; + } elseif (array_key_exists('twitter:description', $values)) { + // Use twitter cards description. + $metadata['excerpt'] = $values['twitter:description']; + } - if ($item == 'og:image' || $item == 'twitter:image') { - $metadata['image'] = $meta->getAttribute('content'); - } + if (array_key_exists('og:title', $values)) { + // Use facebook open graph title. + $metadata['title'] = $values['og:title']; + } elseif (array_key_exists('twitter:title', $values)) { + // Use twitter cards title. + $metadata['title'] = $values['twitter:title']; } return $metadata; @@ -458,6 +485,7 @@ class HTMLParser * Gets nodes from the root element. * * @param $node Readability + * @return array */ private function getNodes(Readability $node) { @@ -1030,7 +1058,7 @@ class HTMLParser * Checks if the node is a byline. * * @param Readability $node - * @param string $matchString + * @param string $matchString * * @return bool */ -- cgit v1.2.3