summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author: Titouan Galopin <[email protected]> 2018-10-22 23:49:05 +0200
committer: Titouan Galopin <[email protected]> 2018-11-02 21:44:20 +0100
commit: 80b8e9177f587365535e9dd6bff45addad9c2bb1 (patch)
tree: 7083457d7e3a48a8b2b74f31b59313eea1ca7f45
parent: fb50d43c6c640683d78651535ba46c06e452c628 (diff)
Add more extensions on composer.json, improve phpdocs and remove dead code
-rw-r--r-- README.md 2
-rw-r--r-- composer.json 1
-rw-r--r-- src/HTML5.php 17
-rw-r--r-- src/HTML5/Parser/DOMTreeBuilder.php 27
-rw-r--r-- src/HTML5/Parser/FileInputStream.php 5
-rw-r--r-- src/HTML5/Parser/StringInputStream.php 27
-rw-r--r-- src/HTML5/Parser/Tokenizer.php 45
-rw-r--r-- src/HTML5/Parser/TreeBuildingRules.php 1
-rw-r--r-- src/HTML5/Parser/UTF8Utils.php 18
9 files changed, 105 insertions, 38 deletions
diff --git a/README.md b/README.md
index 5fe0936..862579f 100644
--- a/README.md
+++ b/README.md
@@ -75,8 +75,6 @@ print $html5->saveHTML($dom);
// Or save it to a file:
$html5->save($dom, 'out.html');
-
-?>
```
The `$dom` created by the parser is a full `DOMDocument` object. And the
diff --git a/composer.json b/composer.json
index 1841346..ec37882 100644
--- a/composer.json
+++ b/composer.json
@@ -20,6 +20,7 @@
}
],
"require" : {
+ "ext-ctype": "*",
"ext-dom": "*",
"ext-libxml" : "*",
"php" : ">=5.3.0"
diff --git a/src/HTML5.php b/src/HTML5.php
index d5beff6..6c57553 100644
--- a/src/HTML5.php
+++ b/src/HTML5.php
@@ -2,6 +2,7 @@
namespace Masterminds;
use Masterminds\HTML5\Parser\FileInputStream;
+use Masterminds\HTML5\Parser\InputStream;
use Masterminds\HTML5\Parser\StringInputStream;
use Masterminds\HTML5\Parser\DOMTreeBuilder;
use Masterminds\HTML5\Parser\Scanner;
@@ -160,8 +161,13 @@ class HTML5
*
* Lower-level loading function. This requires an input stream instead
* of a string, file, or resource.
+ *
+ * @param InputStream $input
+ * @param array $options
+ *
+ * @return \DOMDocument
*/
- public function parse(\Masterminds\HTML5\Parser\InputStream $input, array $options = array())
+ public function parse(InputStream $input, array $options = array())
{
$this->errors = array();
$options = array_merge($this->getOptions(), $options);
@@ -180,8 +186,15 @@ class HTML5
*
* Lower-level loading function. This requires an input stream instead
* of a string, file, or resource.
+ *
+ * @param InputStream $input
+ * The input data to parse in the form of a InputStream instance.
+ * @param array $options
+ * An array of options
+ *
+ * @return \DOMDocumentFragment
*/
- public function parseFragment(\Masterminds\HTML5\Parser\InputStream $input, array $options = array())
+ public function parseFragment(InputStream $input, array $options = array())
{
$options = array_merge($this->getOptions(), $options);
$events = new DOMTreeBuilder(true, $options);
diff --git a/src/HTML5/Parser/DOMTreeBuilder.php b/src/HTML5/Parser/DOMTreeBuilder.php
index 865c5f0..7f0e16a 100644
--- a/src/HTML5/Parser/DOMTreeBuilder.php
+++ b/src/HTML5/Parser/DOMTreeBuilder.php
@@ -136,6 +136,7 @@ class DOMTreeBuilder implements EventHandler
protected $stack = array();
protected $current; // Pointer in the tag hierarchy.
+ protected $rules;
protected $doc;
protected $frag;
@@ -216,7 +217,7 @@ class DOMTreeBuilder implements EventHandler
*
* @see http://www.w3.org/TR/2012/CR-html5-20121217/syntax.html#concept-frag-parse-context
*
- * @return \DOMFragmentDocumentFragment
+ * @return \DOMDocumentFragment
*/
public function fragment()
{
@@ -650,15 +651,19 @@ class DOMTreeBuilder implements EventHandler
/**
* Automatically climb the tree and close the closest node with the matching $tag.
+ *
+ * @param string $tagName
+ *
+ * @return bool
*/
- protected function autoclose($tag)
+ protected function autoclose($tagName)
{
$working = $this->current;
do {
if ($working->nodeType != XML_ELEMENT_NODE) {
return false;
}
- if ($working->tagName == $tag) {
+ if ($working->tagName == $tagName) {
$this->current = $working->parentNode;
return true;
@@ -672,12 +677,16 @@ class DOMTreeBuilder implements EventHandler
*
* If $this->current or anything above $this->current matches the given tag
* name, this returns true.
+ *
+ * @param string $tagName
+ *
+ * @return bool
*/
- protected function isAncestor($tagname)
+ protected function isAncestor($tagName)
{
$candidate = $this->current;
while ($candidate->nodeType === XML_ELEMENT_NODE) {
- if ($candidate->tagName == $tagname) {
+ if ($candidate->tagName == $tagName) {
return true;
}
$candidate = $candidate->parentNode;
@@ -688,9 +697,13 @@ class DOMTreeBuilder implements EventHandler
/**
* Returns true if the immediate parent element is of the given tagname.
+ *
+ * @param string $tagName
+ *
+ * @return bool
*/
- protected function isParent($tagname)
+ protected function isParent($tagName)
{
- return $this->current->tagName == $tagname;
+ return $this->current->tagName == $tagName;
}
}
diff --git a/src/HTML5/Parser/FileInputStream.php b/src/HTML5/Parser/FileInputStream.php
index e58006a..fbf006d 100644
--- a/src/HTML5/Parser/FileInputStream.php
+++ b/src/HTML5/Parser/FileInputStream.php
@@ -19,8 +19,9 @@ class FileInputStream extends StringInputStream implements InputStream
/**
* Load a file input stream.
*
- * @param string $data
- * The file or url path to load.
+ * @param string $data The file or url path to load.
+ * @param string $encoding The encoding to use for the data.
+ * @param string $debug A fprintf format to use to echo the data on stdout.
*/
public function __construct($data, $encoding = 'UTF-8', $debug = '')
{
diff --git a/src/HTML5/Parser/StringInputStream.php b/src/HTML5/Parser/StringInputStream.php
index 4cac3c2..0973941 100644
--- a/src/HTML5/Parser/StringInputStream.php
+++ b/src/HTML5/Parser/StringInputStream.php
@@ -65,22 +65,21 @@ class StringInputStream implements InputStream
/**
* Create a new InputStream wrapper.
*
- * @param $data Data
- * to parse
+ * @param string $data Data to parse
+ * @param string $encoding The encoding to use for the data.
+ * @param string $debug A fprintf format to use to echo the data on stdout.
*/
public function __construct($data, $encoding = 'UTF-8', $debug = '')
{
$data = UTF8Utils::convertToUTF8($data, $encoding);
- if ($debug)
+ if ($debug) {
fprintf(STDOUT, $debug, $data, strlen($data));
+ }
- // There is good reason to question whether it makes sense to
- // do this here, since most of these checks are done during
- // parsing, and since this check doesn't actually *do* anything.
+ // There is good reason to question whether it makes sense to
+ // do this here, since most of these checks are done during
+ // parsing, and since this check doesn't actually *do* anything.
$this->errors = UTF8Utils::checkForIllegalCodepoints($data);
- // if (!empty($e)) {
- // throw new ParseError("UTF-8 encoding issues: " . implode(', ', $e));
- // }
$data = $this->replaceLinefeeds($data);
@@ -95,7 +94,11 @@ class StringInputStream implements InputStream
protected function replaceLinefeeds($data)
{
/*
- * U+000D CARRIAGE RETURN (CR) characters and U+000A LINE FEED (LF) characters are treated specially. Any CR characters that are followed by LF characters must be removed, and any CR characters not followed by LF characters must be converted to LF characters. Thus, newlines in HTML DOMs are represented by LF characters, and there are never any CR characters in the input to the tokenization stage.
+ * U+000D CARRIAGE RETURN (CR) characters and U+000A LINE FEED (LF) characters are treated specially.
+ * Any CR characters that are followed by LF characters must be removed, and any CR characters not
+ * followed by LF characters must be converted to LF characters. Thus, newlines in HTML DOMs are
+ * represented by LF characters, and there are never any CR characters in the input to the tokenization
+ * stage.
*/
$crlfTable = array(
"\0" => "\xEF\xBF\xBD",
@@ -126,7 +129,7 @@ class StringInputStream implements InputStream
*/
public function getCurrentLine()
{
- return currentLine();
+ return $this->currentLine();
}
/**
@@ -281,6 +284,8 @@ class StringInputStream implements InputStream
* substring.
* @param int $max
* The max number of chars to read.
+ *
+ * @return string
*/
public function charsWhile($bytes, $max = null)
{
diff --git a/src/HTML5/Parser/Tokenizer.php b/src/HTML5/Parser/Tokenizer.php
index 1f6868b..9645f83 100644
--- a/src/HTML5/Parser/Tokenizer.php
+++ b/src/HTML5/Parser/Tokenizer.php
@@ -431,6 +431,12 @@ class Tokenizer
/**
* Parse attributes from inside of a tag.
+ *
+ * @param string[] $attributes
+ *
+ * @return bool
+ *
+ * @throws ParseError
*/
protected function attribute(&$attributes)
{
@@ -489,6 +495,8 @@ class Tokenizer
/**
* Consume an attribute value.
* 8.2.4.37 and after.
+ *
+ * @return string|null
*/
protected function attributeValue()
{
@@ -590,6 +598,8 @@ class Tokenizer
* Prepend any leading characters. This essentially
* negates the need to backtrack, but it's sort of
* a hack.
+ *
+ * @return bool
*/
protected function bogusComment($leading = '')
{
@@ -614,6 +624,8 @@ class Tokenizer
* Read a comment.
*
* Expects the first tok to be inside of the comment.
+ *
+ * @return bool
*/
protected function comment()
{
@@ -645,6 +657,8 @@ class Tokenizer
/**
* Check if the scanner has reached the end of a comment.
+ *
+ * @return bool
*/
protected function isCommentEnd()
{
@@ -679,6 +693,8 @@ class Tokenizer
* not Quirksmode is enabled on the event handler.
*
* @todo This method is a little long. Should probably refactor.
+ *
+ * @return bool
*/
protected function doctype()
{
@@ -701,13 +717,9 @@ class Tokenizer
return $this->eof();
}
- $doctypeName = '';
-
// NULL char: convert.
if ($tok === "\0") {
$this->parseError("Unexpected null character in DOCTYPE.");
- $doctypeName .= UTF8::FFFD;
- $tok = $this->scanner->next();
}
$stop = " \n\f>";
@@ -792,6 +804,7 @@ class Tokenizer
* @param string $stopchars
* Characters (in addition to a close-quote) that should stop the string.
* E.g. sometimes '>' is higher precedence than '"' or "'".
+ *
* @return mixed String if one is found (quotations omitted)
*/
protected function quotedString($stopchars)
@@ -813,6 +826,8 @@ class Tokenizer
/**
* Handle a CDATA section.
+ *
+ * @return bool
*/
protected function cdataSection()
{
@@ -856,6 +871,8 @@ class Tokenizer
* treated as "bogus comments". However, since we're not a user
* agent, we allow them. We consume until ?> and then issue a
* EventListener::processingInstruction() event.
+ *
+ * @return bool
*/
protected function processingInstruction()
{
@@ -900,6 +917,10 @@ class Tokenizer
/**
* Read from the input stream until we get to the desired sequene
* or hit the end of the input stream.
+ *
+ * @param string $sequence
+ *
+ * @return string
*/
protected function readUntilSequence($sequence)
{
@@ -935,6 +956,11 @@ class Tokenizer
* Example: $this->sequenceMatches('</script>') will
* see if the input stream is at the start of a
* '</script>' string.
+ *
+ * @param string $sequence
+ * @param bool $caseSensitive
+ *
+ * @return bool
*/
protected function sequenceMatches($sequence, $caseSensitive = true)
{
@@ -976,6 +1002,8 @@ class Tokenizer
* Add text to the temporary buffer.
*
* @see flushBuffer()
+ *
+ * @param string $str
*/
protected function buffer($str)
{
@@ -987,6 +1015,10 @@ class Tokenizer
*
* A parse error always returns false because it never consumes any
* characters.
+ *
+ * @param string $msg
+ *
+ * @return string
*/
protected function parseError($msg)
{
@@ -1009,9 +1041,11 @@ class Tokenizer
* Returns false if the entity could not be found. If $inAttribute is set
* to true, a bare & will be returned as-is.
*
- * @param boolean $inAttribute
+ * @param bool $inAttribute
* Set to true if the text is inside of an attribute value.
* false otherwise.
+ *
+ * @return bool|string
*/
protected function decodeCharacterReference($inAttribute = false)
{
@@ -1023,7 +1057,6 @@ class Tokenizer
// Next char after &.
$tok = $this->scanner->next();
- $entity = '';
$start = $this->scanner->position();
if ($tok == false) {
diff --git a/src/HTML5/Parser/TreeBuildingRules.php b/src/HTML5/Parser/TreeBuildingRules.php
index 6236208..d092872 100644
--- a/src/HTML5/Parser/TreeBuildingRules.php
+++ b/src/HTML5/Parser/TreeBuildingRules.php
@@ -127,7 +127,6 @@ class TreeBuildingRules
protected function closeIfCurrentMatches($ele, $current, $match)
{
- $tname = $current->tagName;
if (in_array($current->tagName, $match)) {
$current->parentNode->appendChild($ele);
} else {
diff --git a/src/HTML5/Parser/UTF8Utils.php b/src/HTML5/Parser/UTF8Utils.php
index 44affb6..451c155 100644
--- a/src/HTML5/Parser/UTF8Utils.php
+++ b/src/HTML5/Parser/UTF8Utils.php
@@ -26,6 +26,9 @@ TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
+
+use Masterminds\HTML5\Exception;
+
/**
* UTF-8 Utilities
*/
@@ -44,6 +47,10 @@ class UTF8Utils
* MB, libxml, and finally a custom counter.
*
* @todo Move this to a general utility class.
+ *
+ * @param string $string
+ *
+ * @return int
*/
public static function countChars($string)
{
@@ -73,6 +80,8 @@ class UTF8Utils
* The data to convert.
* @param string $encoding
* A valid encoding. Examples: http://www.php.net/manual/en/mbstring.supported-encodings.php
+ *
+ * @return string
*/
public static function convertToUTF8($data, $encoding = 'UTF-8')
{
@@ -107,7 +116,6 @@ class UTF8Utils
// - Incomplete sequences generate a warning.
$data = @iconv($encoding, 'UTF-8//IGNORE', $data);
} else {
- // we can make a conforming native implementation
throw new Exception('Not implemented, please install mbstring or iconv');
}
@@ -124,16 +132,12 @@ class UTF8Utils
/**
* Checks for Unicode code points that are not valid in a document.
*
- * @param string $data
- * A string to analyze.
+ * @param string $data A string to analyze.
+ *
* @return array An array of (string) error messages produced by the scanning.
*/
public static function checkForIllegalCodepoints($data)
{
- if (! function_exists('preg_match_all')) {
- throw\Exception('The PCRE library is not loaded or is not available.');
- }
-
// Vestigal error handling.
$errors = array();