Parser.php 9.3 KB


  1. <?php
  2. /**
  3. * Part of Text_LanguageDetect
  4. *
  5. * PHP version 5
  6. *
  7. * @category Text
  8. * @package Text_LanguageDetect
  9. * @author Nicholas Pisarro <[email protected]>
  10. * @copyright 2006 Nicholas Pisarro
  11. * @license BSD http://www.opensource.org/licenses/bsd-license.php
  12. * @link http://pear.php.net/package/Text_LanguageDetect/
  13. */
  14. /**
  15. * This class represents a text sample to be parsed.
  16. *
  17. * This separates the analysis of a text sample from the primary LanguageDetect
  18. * class. After a new profile has been built, the data can be retrieved using
  19. * the accessor functions.
  20. *
  21. * This class is intended to be used by the Text_LanguageDetect class, not
  22. * end-users.
  23. *
  24. * @category Text
  25. * @package Text_LanguageDetect
  26. * @author Nicholas Pisarro <[email protected]>
  27. * @copyright 2006 Nicholas Pisarro
  28. * @license BSD http://www.opensource.org/licenses/bsd-license.php
  29. * @version Release: @package_version@
  30. * @link http://pear.php.net/package/Text_LanguageDetect/
  31. */
  class Text_LanguageDetect_Parser extends Text_LanguageDetect
  {
      /**
       * The piece of text being parsed
       *
       * @var string
       */
      protected $_string;

      /**
       * Trigram frequencies of the sample (trigram => occurrence count)
       *
       * Filled by analyze() when trigram counting is enabled.
       *
       * @var array
       */
      protected $_trigram = array();

      /**
       * Unused legacy property
       *
       * This class only ever reads and writes $_trigram; the declaration is
       * kept so that subclasses referencing the old name do not break.
       *
       * @var array
       * @deprecated use $_trigram
       */
      protected $_trigrams = array();

      /**
       * Trigram ranks of the sample, as produced by _arr_rank()
       *
       * @var array
       */
      protected $_trigram_ranks = array();

      /**
       * Unicode blocks of the sample (block name => character count)
       *
       * @var array
       */
      protected $_unicode_blocks = array();

      /**
       * Whether the parser should compile the unicode ranges
       *
       * @var bool
       */
      protected $_compile_unicode = false;

      /**
       * Whether the parser should compile trigrams
       *
       * @var bool
       */
      protected $_compile_trigram = false;

      /**
       * Whether the trigram parser should pad the beginning of the string
       *
       * @var bool
       */
      protected $_trigram_pad_start = false;

      /**
       * Whether the unicode parser should skip non-alphabetical ascii chars
       *
       * @var bool
       */
      protected $_unicode_skip_symbols = true;

      /**
       * Constructor
       *
       * Only stores the sample; no parsing happens until analyze() is called.
       *
       * @param string $string string to be parsed
       */
      public function __construct($string)
      {
          $this->_string = $string;
      }

      /**
       * PHP 4 constructor for backwards compatibility.
       *
       * @param string $string string to be parsed
       *
       * @return void
       */
      public function Text_LanguageDetect_Parser($string)
      {
          self::__construct($string);
      }

      /**
       * Returns true if a string is suitable for parsing
       *
       * A string qualifies when it is longer than 3 bytes and contains at
       * least one non-whitespace character.
       *
       * @param string $str input string to test
       *
       * @return bool true if acceptable, false if not
       */
      public static function validateString($str)
      {
          return !empty($str)
              && strlen($str) > 3
              && preg_match('/\S/', $str) === 1;
      }

      /**
       * Turn on/off trigram counting
       *
       * @param bool $bool true for on, false for off
       *
       * @return void
       */
      public function prepareTrigram($bool = true)
      {
          $this->_compile_trigram = $bool;
      }

      /**
       * Turn on/off unicode block counting
       *
       * @param bool $bool true for on, false for off
       *
       * @return void
       */
      public function prepareUnicode($bool = true)
      {
          $this->_compile_unicode = $bool;
      }

      /**
       * Turn on/off padding the beginning of the sample string
       *
       * @param bool $bool true for on, false for off
       *
       * @return void
       */
      public function setPadStart($bool = true)
      {
          $this->_trigram_pad_start = $bool;
      }

      /**
       * Should the unicode block counter skip non-alphabetical ascii chars?
       *
       * @param bool $bool true for on, false for off
       *
       * @return void
       */
      public function setUnicodeSkipSymbols($bool = true)
      {
          $this->_unicode_skip_symbols = $bool;
      }

      /**
       * Returns the trigram ranks for the text sample
       *
       * @return array Trigram ranks in the text sample
       */
      public function getTrigramRanks()
      {
          return $this->_trigram_ranks;
      }

      /**
       * Return the trigram frequency table
       *
       * Only used in testing to make sure the parser is working
       *
       * @return array Trigram frequencies in the text sample
       */
      public function getTrigramFreqs()
      {
          return $this->_trigram;
      }

      /**
       * Returns the array of unicode blocks
       *
       * @return array Unicode blocks in the text sample
       */
      public function getUnicodeBlocks()
      {
          return $this->_unicode_blocks;
      }

      /**
       * Executes the parsing operation
       *
       * Be sure to call the set*() functions to set options and the
       * prepare*() functions first to tell it what kind of data to compute
       *
       * Afterwards the get*() functions can be used to access the compiled
       * information.
       *
       * Counts are added to whatever is already stored, so calling this
       * method twice on the same instance accumulates the totals.
       *
       * @return void
       */
      public function analyze()
      {
          // Byte length on purpose: $byte_counter is compared against it and
          // is expected to be advanced by _next_char() (nothing else in the
          // loop below moves it).
          $len          = strlen($this->_string);
          $byte_counter = 0;

          // unicode startup
          if ($this->_compile_unicode) {
              $blocks        = $this->_read_unicode_block_db();
              $block_count   = count($blocks);
              $unicode_chars = array();
          }

          // trigram startup
          if ($this->_compile_trigram) {
              // Both slots start out blank, so the very first character read
              // does not produce a trigram (blank-blank-x is rejected below).
              $a = ' ';
              $b = ' ';

              // kludge
              // When start padding is off, peek at the first two characters
              // to learn which leading trigram (" xy") the main loop is going
              // to count, so it can be taken back out after parsing.
              if (!$this->_trigram_pad_start) {
                  $a = $this->_next_char($this->_string, $byte_counter, true);

                  if ($a != ' ') {
                      $b = $this->_next_char(
                          $this->_string, $byte_counter, true
                      );
                      $dropone = " $a$b";
                  }

                  // rewind: the peek must not consume any input
                  $byte_counter = 0;
                  $a = ' ';
                  $b = ' ';
              }
          }

          while ($byte_counter < $len) {
              $char = $this->_next_char($this->_string, $byte_counter, true);

              // language trigram detection
              if ($this->_compile_trigram) {
                  // reject trigrams holding two adjacent blanks
                  // (loose comparisons kept: the characters come from a
                  // parent-class helper whose return type is not visible here)
                  if (!($b == ' ' && ($a == ' ' || $char == ' '))) {
                      $trigram = $a . $b . $char;

                      if (isset($this->_trigram[$trigram])) {
                          $this->_trigram[$trigram]++;
                      } else {
                          $this->_trigram[$trigram] = 1;
                      }
                  }

                  // slide the two-character window forward
                  $a = $b;
                  $b = $char;
              }

              // unicode block detection
              if ($this->_compile_unicode) {
                  // Optionally ignore single-byte characters outside A-Z and
                  // a-z. The apostrophe is never ignored since it's included
                  // in the language models.
                  if ($this->_unicode_skip_symbols
                      && strlen($char) == 1
                      && ($char < 'A' || $char > 'z'
                      || ($char > 'Z' && $char < 'a'))
                      && $char != "'"
                  ) {
                      continue;
                  }

                  // tally every distinct character; blocks are resolved once
                  // per distinct character after the loop
                  if (isset($unicode_chars[$char])) {
                      $unicode_chars[$char]++;
                  } else {
                      $unicode_chars[$char] = 1;
                  }
              }

              // todo: add byte detection here
          }

          // unicode cleanup
          if ($this->_compile_unicode) {
              foreach ($unicode_chars as $utf8_char => $count) {
                  $search_result = $this->_unicode_block_name(
                      $this->_utf8char2unicode($utf8_char), $blocks, $block_count
                  );

                  // -1 signals that no block matched; otherwise element 2 of
                  // the result is used as the block name
                  if ($search_result != -1) {
                      $block_name = $search_result[2];
                  } else {
                      $block_name = '[Malformatted]';
                  }

                  if (isset($this->_unicode_blocks[$block_name])) {
                      $this->_unicode_blocks[$block_name] += $count;
                  } else {
                      $this->_unicode_blocks[$block_name] = $count;
                  }
              }
          }

          // trigram cleanup
          if ($this->_compile_trigram) {
              // pad the end
              if ($b != ' ') {
                  $tail = "$a$b ";

                  if (isset($this->_trigram[$tail])) {
                      $this->_trigram[$tail]++;
                  } else {
                      $this->_trigram[$tail] = 1;
                  }
              }

              // perl compatibility; Language::Guess does not pad the beginning
              // kludge
              // Only touch the entry when it was actually counted: for very
              // short samples the peeked trigram never enters the table, and
              // decrementing a missing key would insert a bogus null entry.
              if (isset($dropone, $this->_trigram[$dropone])) {
                  if ($this->_trigram[$dropone] == 1) {
                      unset($this->_trigram[$dropone]);
                  } else {
                      $this->_trigram[$dropone]--;
                  }
              }

              if (!empty($this->_trigram)) {
                  $this->_trigram_ranks = $this->_arr_rank($this->_trigram);
              } else {
                  $this->_trigram_ranks = array();
              }
          }
      }
  }
  314. /* vim: set expandtab tabstop=4 shiftwidth=4 softtabstop=4: */