Posseg.php 20 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654
  1. <?php
  2. /**
  3. * Posseg.php
  4. *
  5. * PHP version 5
  6. *
  7. * @category PHP
  8. * @package /src/class/
  9. * @author Fukuball Lin <fukuball@gmail.com>
  10. * @license MIT Licence
  11. * @version GIT: <fukuball/jieba-php>
  12. * @link https://github.com/fukuball/jieba-php
  13. */
  14. namespace Fukuball\Jieba;
  15. /**
  16. * Posseg
  17. *
  18. * @category PHP
  19. * @package /src/class/
  20. * @author Fukuball Lin <fukuball@gmail.com>
  21. * @license MIT Licence
  22. * @version Release: <0.16>
  23. * @link https://github.com/fukuball/jieba-php
  24. */
  25. class Posseg
  26. {
  27. public static $prob_start = array();
  28. public static $prob_trans = array();
  29. public static $prob_emit = array();
  30. public static $char_state = array();
  31. public static $word_tag = array();
  32. public static $pos_tag_readable = array();
  /**
   * Static method init
   *
   * Loads the four POS model files from ../model/pos/, fills self::$word_tag
   * from the main dictionary named by Jieba::$dictname and from every path in
   * Jieba::$user_dictname, and fills self::$pos_tag_readable from
   * ../dict/pos_tag_readable.txt.
   *
   * @param array $options # other options (accepted for interface
   *                       compatibility; not read by this method)
   *
   * @return void
   *
   * @throws \RuntimeException when a dictionary file cannot be opened
   */
  public static function init($options = array())
  {
      $base_dir = dirname(dirname(__FILE__));

      self::$prob_start = self::loadModel($base_dir.'/model/pos/prob_start.json');
      self::$prob_trans = self::loadModel($base_dir.'/model/pos/prob_trans.json');
      self::$prob_emit = self::loadModel($base_dir.'/model/pos/prob_emit.json');
      self::$char_state = self::loadModel($base_dir.'/model/pos/char_state.json');

      // Dictionary lines are "word freq tag": column 0 is the key, column 2 the tag.
      if (Jieba::$dictname != '') {
          $main_tags = self::readDictColumns($base_dir.'/dict/'.Jieba::$dictname, 0, 2);
          foreach ($main_tags as $word => $tag) {
              self::$word_tag[$word] = $tag;
          }
      }

      // User dictionaries are applied afterwards so their tags override the main ones.
      foreach (Jieba::$user_dictname as $user_dict_path) {
          $user_tags = self::readDictColumns($user_dict_path, 0, 2);
          foreach ($user_tags as $word => $tag) {
              self::$word_tag[$word] = $tag;
          }
      }

      // Readable-tag lines are "tag meaning".
      $readable = self::readDictColumns($base_dir.'/dict/pos_tag_readable.txt', 0, 1);
      foreach ($readable as $tag => $meaning) {
          self::$pos_tag_readable[$tag] = $meaning;
      }
  }// end function init

  /**
   * Reads a space-separated text file into a map of one column to another.
   *
   * Lines that are blank or do not contain both requested columns are
   * skipped, so they never produce undefined-offset notices or null entries.
   * When a key occurs more than once the last line wins.
   *
   * @param string $path        # file to read
   * @param int    $key_index   # zero-based column used as the array key
   * @param int    $value_index # zero-based column used as the array value
   *
   * @return array $pairs
   *
   * @throws \RuntimeException when the file cannot be opened
   */
  private static function readDictColumns($path, $key_index, $value_index)
  {
      $handle = fopen($path, 'r');
      if ($handle === false) {
          // Without this guard fgets() would be called on a non-resource.
          throw new \RuntimeException('Posseg: unable to open dictionary file: '.$path);
      }

      $pairs = array();
      $needed_columns = max($key_index, $value_index) + 1;

      while (($line = fgets($handle)) !== false) {
          $columns = explode(' ', trim($line));
          if (count($columns) < $needed_columns || $columns[$key_index] === '') {
              continue;
          }
          $pairs[$columns[$key_index]] = $columns[$value_index];
      }

      fclose($handle);

      return $pairs;
  }// end function readDictColumns
  /**
   * Static method loadModel
   *
   * Reads a JSON file and returns its content decoded into associative arrays.
   *
   * @param string $f_name  # path of the JSON model file
   * @param array  $options # other options (accepted for interface
   *                        compatibility; not read by this method)
   *
   * @return array $model decoded JSON (objects become associative arrays)
   *
   * @throws \RuntimeException when the file cannot be read or is not valid JSON
   */
  public static function loadModel($f_name, $options = array())
  {
      $json = file_get_contents($f_name);
      if ($json === false) {
          throw new \RuntimeException('Posseg: unable to read model file: '.$f_name);
      }

      $model = json_decode($json, true);

      // json_decode() also returns null for a literal "null" document, so the
      // error code is what distinguishes a decoding failure.
      if ($model === null && json_last_error() !== JSON_ERROR_NONE) {
          throw new \RuntimeException('Posseg: invalid JSON in model file: '.$f_name);
      }

      return $model;
  }// end function loadModel
  /**
   * Static method getTopStates
   *
   * Returns the $top_k entries of $t_state_v with the highest values, ordered
   * from highest to lowest, keeping each entry's original key.
   *
   * @param array $t_state_v # map of state => score
   * @param int   $top_k     # maximum number of entries to return
   * @param array $options   # other options (not read by this method)
   *
   * @return array $top_states
   */
  public static function getTopStates($t_state_v, $top_k = 4, $options = array())
  {
      // The array is passed by value, so sorting does not affect the caller.
      arsort($t_state_v);

      // preserve_keys = true: without it array_slice() renumbers integer keys,
      // which would discard the state identifiers callers read via array_keys().
      $top_states = array_slice($t_state_v, 0, $top_k, true);

      return $top_states;
  }// end function getTopStates
  /**
   * Static method viterbi
   *
   * Runs the Viterbi algorithm over the characters of $sentence using
   * self::$prob_start, self::$prob_trans, self::$prob_emit and
   * self::$char_state, and returns the best-scoring state sequence.
   * Scores are combined by addition; a missing transition or emission entry
   * is scored as the MIN_FLOAT constant.
   *
   * @param string $sentence # input sentence
   * @param array  $options  # other options (not read by this method)
   *
   * @return array $viterbi array('prob' => best score, 'pos_list' => one
   *                        state label per character)
   */
  public static function viterbi($sentence, $options = array())
  {
      $obs = $sentence;
      $obs_length = mb_strlen($obs, 'UTF-8');
      $states = self::$char_state;
      $all_states = array_keys(self::$prob_trans);

      $V = array(0 => array());
      $mem_path = array(0 => array());

      // Initialisation: candidate states for the first character.
      $c = mb_substr($obs, 0, 1, 'UTF-8');
      if (isset($states[$c]) && !empty($states[$c])) {
          $c_states = $states[$c];
      } else {
          $c_states = $all_states;
      }

      foreach ($c_states as $y) {
          $prob_emit = isset(self::$prob_emit[$y][$c]) ? self::$prob_emit[$y][$c] : MIN_FLOAT;
          $V[0][$y] = self::$prob_start[$y] + $prob_emit;
          $mem_path[0][$y] = '';
      }

      for ($t = 1; $t < $obs_length; $t++) {
          $c = mb_substr($obs, $t, 1, 'UTF-8');
          $V[$t] = array();
          $mem_path[$t] = array();

          // Predecessors: states reached at t-1 that have outgoing transitions.
          $prev_states = array();
          foreach (array_keys($mem_path[$t-1]) as $mem_path_state) {
              if (count(self::$prob_trans[$mem_path_state]) > 0) {
                  $prev_states[] = $mem_path_state;
              }
          }

          // Every state reachable from at least one predecessor.
          $prev_states_expect_next = array();
          foreach ($prev_states as $prev_state) {
              $prev_states_expect_next = array_unique(
                  array_merge(
                      $prev_states_expect_next,
                      array_keys(self::$prob_trans[$prev_state])
                  )
              );
          }

          $obs_states = isset($states[$c]) ? $states[$c] : $all_states;
          $obs_states = array_intersect($obs_states, $prev_states_expect_next);
          if (count($obs_states) == 0) {
              // No candidate is reachable: fall back to every state.
              $obs_states = $all_states;
          }

          foreach ($obs_states as $y) {
              // The emission score depends only on ($y, $c), not on the predecessor.
              $prob_emit = isset(self::$prob_emit[$y][$c]) ? self::$prob_emit[$y][$c] : MIN_FLOAT;

              $temp_prob_array = array();
              foreach ($prev_states as $y0) {
                  $prob_trans = isset(self::$prob_trans[$y0][$y]) ? self::$prob_trans[$y0][$y] : MIN_FLOAT;
                  $temp_prob_array[$y0] = $V[$t-1][$y0] + $prob_trans + $prob_emit;
              }

              // Highest-scoring predecessor ends up first after the sort.
              arsort($temp_prob_array);
              $V[$t][$y] = reset($temp_prob_array);
              $mem_path[$t][$y] = key($temp_prob_array);
          }
      }

      // Termination: pick the best state at the last position.
      $final_scores = end($V);
      $last = array();
      foreach (array_keys(end($mem_path)) as $y) {
          $last[$y] = $final_scores[$y];
      }
      arsort($last);
      $return_prob = reset($last);
      $return_prob_key = key($last);

      // Backtrack through the stored predecessors.
      $route = ($obs_length > 0) ? array_fill(0, $obs_length, 'None') : array();
      for ($i = $obs_length - 1; $i >= 0; $i--) {
          $route[$i] = $return_prob_key;
          $return_prob_key = $mem_path[$i][$return_prob_key];
      }

      return array('prob'=>$return_prob, 'pos_list'=>$route);
  }// end function viterbi
  /**
   * Static method __cut
   *
   * Segments $sentence with self::viterbi() and returns word/tag pairs.
   * Each state label in the Viterbi result is a string of the form
   * "('B', 'n')": the first element is the position marker (B, E or S are
   * acted upon here) and the second is the tag. A word is emitted at every
   * E (spanning back to the last B) and at every S; characters left after
   * the last emitted word are returned as one trailing word.
   *
   * @param string $sentence # input sentence
   * @param array  $options  # other options (not read by this method)
   *
   * @return array $words list of array('word' => string, 'tag' => string)
   *
   * @throws \UnexpectedValueException when a state label cannot be parsed
   */
  public static function __cut($sentence, $options = array('HMM' => true))
  {
      $words = array();

      $viterbi_array = self::viterbi($sentence);
      $pos_list = $viterbi_array['pos_list'];

      $begin = 0;
      $next = 0;
      $len = mb_strlen($sentence, 'UTF-8');

      for ($i = 0; $i < $len; $i++) {
          list($pos, $tag) = self::parsePosState($pos_list[$i]);

          if ($pos == 'B') {
              $begin = $i;
          } elseif ($pos == 'E') {
              $words[] = array(
                  'word'=>mb_substr($sentence, $begin, (($i+1)-$begin), 'UTF-8'),
                  'tag'=>$tag
              );
              $next = $i+1;
          } elseif ($pos == 'S') {
              $words[] = array(
                  'word'=>mb_substr($sentence, $i, 1, 'UTF-8'),
                  'tag'=>$tag
              );
              $next = $i+1;
          }
      }

      if ($next < $len) {
          list(, $tag) = self::parsePosState($pos_list[$next]);
          // An explicit length is used instead of null, which older PHP
          // versions treat as 0.
          $words[] = array(
              'word'=>mb_substr($sentence, $next, $len - $next, 'UTF-8'),
              'tag'=>$tag
          );
      }

      return $words;
  }// end function __cut

  /**
   * Splits a state label such as "('B', 'n')" into its two quoted elements.
   *
   * This replaces evaluating the label as PHP source with eval().
   *
   * @param string $state # state label taken from the model
   *
   * @return array array(position marker, tag)
   *
   * @throws \UnexpectedValueException when fewer than two quoted elements
   *                                   are found
   */
  private static function parsePosState($state)
  {
      if (preg_match_all('/([\'"])(.*?)\1/', (string) $state, $matches) >= 2) {
          return array($matches[2][0], $matches[2][1]);
      }

      throw new \UnexpectedValueException('Posseg: malformed state label: '.$state);
  }// end function parsePosState
  /**
   * Static method __cutDetail
   *
   * Splits $sentence into runs of Han characters, runs of
   * letters/digits/selected symbols, and runs of full-width punctuation.
   * Han runs are segmented with self::__cut(); a run that starts with a
   * digit or dot is tagged 'm'; a run that starts with a letter or digit is
   * tagged 'eng'; punctuation runs are tagged 'w'. Anything else is dropped.
   *
   * @param string $sentence # input sentence
   * @param array  $options  # other options (not read by this method)
   *
   * @return array $words list of array('word' => string, 'tag' => string)
   */
  public static function __cutDetail($sentence, $options = array())
  {
      $words = array();

      $re_han_pattern = '([\x{4E00}-\x{9FA5}]+)';
      $re_skip_pattern = '([a-zA-Z0-9+#&=\._\r\n]+)';
      $re_punctuation_pattern = '([\x{ff5e}\x{ff01}\x{ff08}\x{ff09}\x{300e}'.
                                '\x{300c}\x{300d}\x{300f}\x{3001}\x{ff1a}\x{ff1b}'.
                                '\x{ff0c}\x{ff1f}\x{3002}]+)';
      $re_eng_pattern = '[a-zA-Z0-9]+';
      $re_num_pattern = '[\.0-9]+';

      $found = preg_match_all(
          '/('.$re_han_pattern.'|'.$re_skip_pattern.'|'.$re_punctuation_pattern.')/u',
          $sentence,
          $matches,
          PREG_PATTERN_ORDER
      );

      // false (e.g. invalid UTF-8) or 0 matches: nothing to tag.
      if (!$found) {
          return $words;
      }

      foreach ($matches[0] as $blk) {
          if (preg_match('/'.$re_han_pattern.'/u', $blk)) {
              foreach (self::__cut($blk) as $blk_word) {
                  $words[] = $blk_word;
              }
          } elseif (preg_match('/'.$re_skip_pattern.'/u', $blk)) {
              // Anchored at the start of the block: an unanchored test would
              // tag any block that merely contains a digit (e.g. "abc123") as 'm'.
              if (preg_match('/^'.$re_num_pattern.'/u', $blk)) {
                  $words[] = array('word' => $blk, 'tag' => 'm');
              } elseif (preg_match('/^'.$re_eng_pattern.'/u', $blk)) {
                  $words[] = array('word' => $blk, 'tag' => 'eng');
              }
          } elseif (preg_match('/'.$re_punctuation_pattern.'/u', $blk)) {
              $words[] = array('word' => $blk, 'tag' => 'w');
          }
      }

      return $words;
  }// end function __cutDetail
  /**
   * Static method __cutDAG
   *
   * Segments $sentence along the route that Jieba::calc() stores in
   * Jieba::$route. Multi-character route words are tagged from
   * self::$word_tag ('x' when unknown). Consecutive single-character route
   * steps are collected in a buffer, which is tagged as a unit whenever a
   * multi-character word follows and once more at the end of the sentence.
   *
   * @param string $sentence # input sentence
   * @param array  $options  # other options (not read by this method)
   *
   * @return array $words list of array('word' => string, 'tag' => string)
   */
  public static function __cutDAG($sentence, $options = array())
  {
      $words = array();

      $N = mb_strlen($sentence, 'UTF-8');
      $DAG = Jieba::getDAG($sentence);

      Jieba::calc($sentence, $DAG);

      $x = 0;
      $buf = '';

      while ($x < $N) {
          // The first key of the route entry at $x is used as the index of
          // the word's last character.
          $current_route_keys = array_keys(Jieba::$route[$x]);
          $y = $current_route_keys[0]+1;
          $l_word = mb_substr($sentence, $x, ($y-$x), 'UTF-8');

          if (($y-$x) == 1) {
              $buf = $buf.$l_word;
          } else {
              foreach (self::tagBufferedChars($buf) as $buffered_word) {
                  $words[] = $buffered_word;
              }
              $buf = '';

              $words[] = array(
                  'word' => $l_word,
                  'tag' => isset(self::$word_tag[$l_word]) ? self::$word_tag[$l_word] : 'x'
              );
          }
          $x = $y;
      }

      // Flush whatever single characters remain after the last route word.
      foreach (self::tagBufferedChars($buf) as $buffered_word) {
          $words[] = $buffered_word;
      }

      return $words;
  }// end function __cutDAG

  /**
   * Tags a buffer of consecutive single-character route steps.
   *
   * An empty buffer yields nothing. A one-character buffer is looked up in
   * self::$word_tag. A longer buffer that is not a key of Jieba::$FREQ is
   * passed to self::__cutDetail(); one that is a key is split into
   * characters which are tagged individually. Unknown entries get tag 'x'.
   *
   * @param string $buf # buffered characters
   *
   * @return array list of array('word' => string, 'tag' => string)
   */
  private static function tagBufferedChars($buf)
  {
      $buf_length = mb_strlen($buf, 'UTF-8');

      if ($buf_length == 0) {
          return array();
      }

      if ($buf_length == 1) {
          return array(
              array(
                  'word' => $buf,
                  'tag' => isset(self::$word_tag[$buf]) ? self::$word_tag[$buf] : 'x'
              )
          );
      }

      if (! isset(Jieba::$FREQ[$buf])) {
          return self::__cutDetail($buf);
      }

      $tagged = array();
      foreach (preg_split('//u', $buf, -1, PREG_SPLIT_NO_EMPTY) as $char) {
          $tagged[] = array(
              'word' => $char,
              'tag' => isset(self::$word_tag[$char]) ? self::$word_tag[$char] : 'x'
          );
      }

      return $tagged;
  }// end function tagBufferedChars
  /**
   * Static method __cutDAGNoHMM
   *
   * Walks the route stored in Jieba::$route after Jieba::calc() and tags each
   * route word from self::$word_tag ('x' when unknown). Route words that
   * contain a Latin letter, '+' or '#' are concatenated into a buffer, which
   * is emitted as a single word before the next other word and at the end.
   *
   * @param string $sentence # input sentence
   * @param array  $options  # other options
   *
   * @return array $words list of array('word' => string, 'tag' => string)
   */
  public static function __cutDAGNoHMM($sentence, $options = array())
  {
      $options = array_merge(array('mode'=>'default'), $options);

      $tagged = array();
      $total = mb_strlen($sentence, 'UTF-8');
      $graph = Jieba::getDAG($sentence);

      Jieba::calc($sentence, $graph);

      $pending = '';
      $re_eng_pattern = '[a-zA-Z+#]+';

      for ($start = 0; $start < $total; $start = $stop) {
          $route_keys = array_keys(Jieba::$route[$start]);
          $stop = $route_keys[0] + 1;
          $piece = mb_substr($sentence, $start, $stop - $start, 'UTF-8');

          if (preg_match('/'.$re_eng_pattern.'/u', $piece)) {
              // Keep collecting; the buffer is emitted later as one word.
              $pending .= $piece;
              continue;
          }

          if (mb_strlen($pending, 'UTF-8') > 0) {
              $tagged[] = array(
                  'word' => $pending,
                  'tag' => isset(self::$word_tag[$pending]) ? self::$word_tag[$pending] : 'x'
              );
              $pending = '';
          }

          $tagged[] = array(
              'word' => $piece,
              'tag' => isset(self::$word_tag[$piece]) ? self::$word_tag[$piece] : 'x'
          );
      }

      if (mb_strlen($pending, 'UTF-8') > 0) {
          $tagged[] = array(
              'word' => $pending,
              'tag' => isset(self::$word_tag[$pending]) ? self::$word_tag[$pending] : 'x'
          );
      }

      return $tagged;
  }// end function __cutDAGNoHMM
  /**
   * Static method cut
   *
   * Splits $sentence into runs of Han characters, runs of
   * letters/digits/'+'/'#'/CR/LF, and runs of full-width punctuation. Han
   * runs are segmented with self::__cutDAG() when the 'HMM' option is truthy
   * and with self::__cutDAGNoHMM() otherwise. A run that starts with a digit
   * is tagged 'm', a run that starts with a letter, '+' or '#' is tagged
   * 'eng', and punctuation runs are tagged 'w'. Anything else is dropped.
   *
   * @param string $sentence # input sentence
   * @param array  $options  # other options; 'HMM' (bool, default true)
   *                         selects the segmenter. A non-array value is
   *                         treated as "no options given".
   *
   * @return array $seg_list list of array('word' => string, 'tag' => string)
   */
  public static function cut($sentence, $options = array('HMM' => true))
  {
      // 'HMM' is part of the defaults so that an options array without that
      // key keeps the documented default instead of reading an undefined index.
      $defaults = array(
          'mode'=>'default',
          'HMM'=>true
      );

      if (!is_array($options)) {
          $options = array();
      }
      $options = array_merge($defaults, $options);

      $seg_list = array();

      $re_han_pattern = '([\x{4E00}-\x{9FA5}]+)';
      $re_skip_pattern = '([a-zA-Z0-9+#\r\n]+)';
      $re_punctuation_pattern = '([\x{ff5e}\x{ff01}\x{ff08}\x{ff09}\x{300e}'.
                                '\x{300c}\x{300d}\x{300f}\x{3001}\x{ff1a}\x{ff1b}'.
                                '\x{ff0c}\x{ff1f}\x{3002}]+)';
      $re_eng_pattern = '[a-zA-Z+#]+';
      $re_num_pattern = '[0-9]+';

      $found = preg_match_all(
          '/('.$re_han_pattern.'|'.$re_skip_pattern.'|'.$re_punctuation_pattern.')/u',
          $sentence,
          $matches,
          PREG_PATTERN_ORDER
      );

      // false (e.g. invalid UTF-8) or 0 matches: nothing to tag.
      if (!$found) {
          return $seg_list;
      }

      foreach ($matches[0] as $blk) {
          if (preg_match('/'.$re_han_pattern.'/u', $blk)) {
              if ($options['HMM']) {
                  $words = self::__cutDAG($blk);
              } else {
                  $words = self::__cutDAGNoHMM($blk);
              }
              foreach ($words as $word) {
                  $seg_list[] = $word;
              }
          } elseif (preg_match('/'.$re_skip_pattern.'/u', $blk)) {
              // Anchored at the start of the block: an unanchored test would
              // tag any block that merely contains a digit (e.g. "abc123") as 'm'.
              if (preg_match('/^'.$re_num_pattern.'/u', $blk)) {
                  $seg_list[] = array('word' => $blk, 'tag' => 'm');
              } elseif (preg_match('/^'.$re_eng_pattern.'/u', $blk)) {
                  $seg_list[] = array('word' => $blk, 'tag' => 'eng');
              }
          } elseif (preg_match('/'.$re_punctuation_pattern.'/u', $blk)) {
              $seg_list[] = array('word' => $blk, 'tag' => 'w');
          }
      }

      return $seg_list;
  }// end function cut
  /**
   * Static method posTagReadable
   *
   * Returns a copy of $seg_list in which every entry has an extra
   * 'tag_readable' element, looked up in self::$pos_tag_readable by the
   * entry's 'tag'. Entries whose tag is missing or has no readable name get
   * 'tag_readable' => null.
   *
   * @param array $seg_list # list of array('word' => string, 'tag' => string)
   * @param array $options  # other options (not read by this method)
   *
   * @return array $new_seg_list
   */
  public static function posTagReadable($seg_list, $options = array())
  {
      $new_seg_list = array();

      foreach ($seg_list as $seg) {
          $readable = null;
          // isset() guards both a missing 'tag' element and an unknown tag,
          // so neither triggers an undefined-index notice.
          if (isset($seg['tag']) && isset(self::$pos_tag_readable[$seg['tag']])) {
              $readable = self::$pos_tag_readable[$seg['tag']];
          }
          $seg['tag_readable'] = $readable;
          $new_seg_list[] = $seg;
      }

      return $new_seg_list;
  }// end function posTagReadable
  559. }