Finalseg.php 7.1 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268
  1. <?php
  2. /**
  3. * Finalseg.php
  4. *
  5. * PHP version 5
  6. *
  7. * @category PHP
  8. * @package /src/class/
  9. * @author Fukuball Lin <fukuball@gmail.com>
  10. * @license MIT Licence
  11. * @version GIT: <fukuball/jieba-php>
  12. * @link https://github.com/fukuball/jieba-php
  13. */
  14. namespace Fukuball\Jieba;
  15. /**
  16. * Finalseg
  17. *
  18. * @category PHP
  19. * @package /src/class/
  20. * @author Fukuball Lin <fukuball@gmail.com>
  21. * @license MIT Licence
  22. * @version Release: <0.16>
  23. * @link https://github.com/fukuball/jieba-php
  24. */
  25. class Finalseg
  26. {
  27. public static $prob_start = array();
  28. public static $prob_trans = array();
  29. public static $prob_emit = array();
  30. /**
  31. * Static method init
  32. *
  33. * @param array $options # other options
  34. *
  35. * @return void
  36. */
  37. public static function init($options = array())
  38. {
  39. $defaults = array(
  40. 'mode'=>'default'
  41. );
  42. $options = array_merge($defaults, $options);
  43. self::$prob_start = self::loadModel(dirname(dirname(__FILE__)).'/model/prob_start.json');
  44. self::$prob_trans = self::loadModel(dirname(dirname(__FILE__)).'/model/prob_trans.json');
  45. self::$prob_emit = self::loadModel(dirname(dirname(__FILE__)).'/model/prob_emit.json');
  46. }// end function init
  47. /**
  48. * Static method loadModel
  49. *
  50. * @param string $f_name # input f_name
  51. * @param array $options # other options
  52. *
  53. * @return void
  54. */
  55. public static function loadModel($f_name, $options = array())
  56. {
  57. $defaults = array(
  58. 'mode'=>'default'
  59. );
  60. $options = array_merge($defaults, $options);
  61. return json_decode(file_get_contents($f_name), true);
  62. }// end function loadModel
  63. /**
  64. * Static method viterbi
  65. *
  66. * @param string $sentence # input sentence
  67. * @param array $options # other options
  68. *
  69. * @return array $viterbi
  70. */
  71. public static function viterbi($sentence, $options = array())
  72. {
  73. $defaults = array(
  74. 'mode'=>'default'
  75. );
  76. $options = array_merge($defaults, $options);
  77. $obs = $sentence;
  78. $states = array('B', 'M', 'E', 'S');
  79. $V = array();
  80. $V[0] = array();
  81. $path = array();
  82. foreach ($states as $key => $state) {
  83. $y = $state;
  84. $c = mb_substr($obs, 0, 1, 'UTF-8');
  85. $prob_emit = 0.0;
  86. if (isset(self::$prob_emit[$y][$c])) {
  87. $prob_emit = self::$prob_emit[$y][$c];
  88. } else {
  89. $prob_emit = MIN_FLOAT;
  90. }
  91. $V[0][$y] = self::$prob_start[$y] + $prob_emit;
  92. $path[$y] = $y;
  93. }
  94. for ($t=1; $t<mb_strlen($obs, 'UTF-8'); $t++) {
  95. $c = mb_substr($obs, $t, 1, 'UTF-8');
  96. $V[$t] = array();
  97. $newpath = array();
  98. foreach ($states as $key => $state) {
  99. $y = $state;
  100. $temp_prob_array = array();
  101. foreach ($states as $key => $state0) {
  102. $y0 = $state0;
  103. $prob_trans = 0.0;
  104. if (isset(self::$prob_trans[$y0][$y])) {
  105. $prob_trans = self::$prob_trans[$y0][$y];
  106. } else {
  107. $prob_trans = MIN_FLOAT;
  108. }
  109. $prob_emit = 0.0;
  110. if (isset(self::$prob_emit[$y][$c])) {
  111. $prob_emit = self::$prob_emit[$y][$c];
  112. } else {
  113. $prob_emit = MIN_FLOAT;
  114. }
  115. $temp_prob_array[$y0] = $V[$t-1][$y0] + $prob_trans + $prob_emit;
  116. }
  117. arsort($temp_prob_array);
  118. $max_prob = reset($temp_prob_array);
  119. $max_key = key($temp_prob_array);
  120. $V[$t][$y] = $max_prob;
  121. if (is_array($path[$max_key])) {
  122. $newpath[$y] = array();
  123. foreach ($path[$max_key] as $key => $path_value) {
  124. $newpath[$y][] = $path_value;
  125. }
  126. $newpath[$y][] = $y;
  127. } else {
  128. $newpath[$y] = array($path[$max_key], $y);
  129. }
  130. }
  131. $path = $newpath;
  132. }
  133. $es_states = array('E','S');
  134. $temp_prob_array = array();
  135. $len = mb_strlen($obs, 'UTF-8');
  136. foreach ($es_states as $key => $state) {
  137. $y = $state;
  138. $temp_prob_array[$y] = $V[$len-1][$y];
  139. }
  140. arsort($temp_prob_array);
  141. $prob = reset($temp_prob_array);
  142. $state = key($temp_prob_array);
  143. return array("prob"=>$prob, "pos_list"=>$path[$state]);
  144. }// end function viterbi
  145. /**
  146. * Static method __cut
  147. *
  148. * @param string $sentence # input sentence
  149. * @param array $options # other options
  150. *
  151. * @return array $words
  152. */
  153. public static function __cut($sentence, $options = array())
  154. {
  155. $defaults = array(
  156. 'mode'=>'default'
  157. );
  158. $options = array_merge($defaults, $options);
  159. $words = array();
  160. $viterbi_array = self::viterbi($sentence);
  161. $prob = $viterbi_array['prob'];
  162. $pos_list = $viterbi_array['pos_list'];
  163. $begin = 0;
  164. $next = 0;
  165. $len = mb_strlen($sentence, 'UTF-8');
  166. for ($i=0; $i<$len; $i++) {
  167. $char = mb_substr($sentence, $i, 1, 'UTF-8');
  168. $pos = $pos_list[$i];
  169. if ($pos=='B') {
  170. $begin = $i;
  171. } elseif ($pos=='E') {
  172. $words[] = mb_substr($sentence, $begin, (($i+1)-$begin), 'UTF-8');
  173. $next = $i+1;
  174. } elseif ($pos=='S') {
  175. $words[] = $char;
  176. $next = $i+1;
  177. }
  178. }
  179. if ($next<$len) {
  180. $words[] = mb_substr($sentence, $next, null, 'UTF-8');
  181. }
  182. return $words;
  183. }// end function __cut
  184. /**
  185. * Static method cut
  186. *
  187. * @param string $sentence # input sentence
  188. * @param array $options # other options
  189. *
  190. * @return array $seg_list
  191. */
  192. public static function cut($sentence, $options = array())
  193. {
  194. $defaults = array(
  195. 'mode'=>'default'
  196. );
  197. $options = array_merge($defaults, $options);
  198. $seg_list = array();
  199. $re_cjk_pattern = '([\x{3040}-\x{309F}]+)|([\x{30A0}-\x{30FF}]+)|([\x{4E00}-\x{9FA5}]+)|([\x{AC00}-\x{D7AF}]+)';
  200. $re_skip_pattern = '([a-zA-Z0-9+#&=\._\r\n]+)';
  201. preg_match_all(
  202. '/('.$re_cjk_pattern.'|'.$re_skip_pattern.')/u',
  203. $sentence,
  204. $matches,
  205. PREG_PATTERN_ORDER
  206. );
  207. $blocks = $matches[0];
  208. foreach ($blocks as $blk) {
  209. if (preg_match('/'.$re_cjk_pattern.'/u', $blk)) {
  210. $words = self::__cut($blk);
  211. foreach ($words as $word) {
  212. $seg_list[] = $word;
  213. }
  214. } else {
  215. $seg_list[] = $blk;
  216. }// end else (preg_match('/'.$re_han_pattern.'/u', $blk))
  217. }// end foreach ($blocks as $blk)
  218. return $seg_list;
  219. }// end function cut
  220. }