123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268 |
- <?php
- /**
- * Finalseg.php
- *
- * PHP version 5
- *
- * @category PHP
- * @package /src/class/
- * @author Fukuball Lin <fukuball@gmail.com>
- * @license MIT Licence
- * @version GIT: <fukuball/jieba-php>
- * @link https://github.com/fukuball/jieba-php
- */
- namespace Fukuball\Jieba;
- /**
- * Finalseg
- *
- * @category PHP
- * @package /src/class/
- * @author Fukuball Lin <fukuball@gmail.com>
- * @license MIT Licence
- * @version Release: <0.16>
- * @link https://github.com/fukuball/jieba-php
- */
class Finalseg
{
    /**
     * Model tables filled by init(). All three are decoded JSON maps whose
     * values are combined by addition in viterbi(), i.e. they are used as
     * log-space scores.
     *
     * $prob_start[state]            start score of a state
     * $prob_trans[from][to]         transition score between states
     * $prob_emit[state][character]  emission score of a character in a state
     */
    public static $prob_start = array();
    public static $prob_trans = array();
    public static $prob_emit = array();

    /**
     * Static method init
     *
     * Loads the three model tables from the "model" directory that sits next
     * to this file's parent directory.
     *
     * @param array $options # other options (accepted for API compatibility, currently unused)
     *
     * @return void
     *
     * @throws \RuntimeException when a model file cannot be read or decoded
     */
    public static function init($options = array())
    {
        $model_dir = dirname(dirname(__FILE__)).'/model/';

        self::$prob_start = self::loadModel($model_dir.'prob_start.json');
        self::$prob_trans = self::loadModel($model_dir.'prob_trans.json');
        self::$prob_emit = self::loadModel($model_dir.'prob_emit.json');
    }// end function init

    /**
     * Static method loadModel
     *
     * Reads a JSON file and returns it decoded as an associative array.
     *
     * @param string $f_name  # path of the JSON model file
     * @param array  $options # other options (accepted for API compatibility, currently unused)
     *
     * @return array decoded model
     *
     * @throws \RuntimeException when the file cannot be read or does not
     *                           decode to an array
     */
    public static function loadModel($f_name, $options = array())
    {
        $content = file_get_contents($f_name);
        if ($content === false) {
            throw new \RuntimeException('Unable to read model file: '.$f_name);
        }

        $model = json_decode($content, true);
        // json_decode() yields null on malformed input; a scalar is not a
        // usable model either, so both are rejected instead of being stored.
        if (!is_array($model)) {
            throw new \RuntimeException('Model file is not a valid JSON map: '.$f_name);
        }

        return $model;
    }// end function loadModel

    /**
     * Static method viterbi
     *
     * Finds the best-scoring B/M/E/S state sequence for the characters of
     * $sentence using the loaded model tables. The final state is restricted
     * to 'E' or 'S'.
     *
     * Relies on the global constant MIN_FLOAT, which is not defined in this
     * file and must exist before this method is called; it is used as the
     * score of any transition or emission missing from the model.
     *
     * @param string $sentence # input sentence (UTF-8)
     * @param array  $options  # other options (accepted for API compatibility, currently unused)
     *
     * @return array array("prob" => float, "pos_list" => list of states, one
     *               per character); an empty sentence yields prob 0.0 and an
     *               empty pos_list
     */
    public static function viterbi($sentence, $options = array())
    {
        $obs = $sentence;
        $len = mb_strlen($obs, 'UTF-8');

        // Nothing to decode: avoid reading the non-existent column $V[-1].
        if ($len < 1) {
            return array("prob"=>0.0, "pos_list"=>array());
        }

        $states = array('B', 'M', 'E', 'S');
        $V = array(array());
        $path = array();

        // Initial column: start score plus emission of the first character.
        $c = mb_substr($obs, 0, 1, 'UTF-8');
        foreach ($states as $y) {
            $emit = isset(self::$prob_emit[$y][$c]) ? self::$prob_emit[$y][$c] : MIN_FLOAT;
            $V[0][$y] = self::$prob_start[$y] + $emit;
            // Paths are always lists so that pos_list has one shape for
            // every sentence length.
            $path[$y] = array($y);
        }

        for ($t = 1; $t < $len; $t++) {
            $c = mb_substr($obs, $t, 1, 'UTF-8');
            $V[$t] = array();
            $newpath = array();

            foreach ($states as $y) {
                // The emission score does not depend on the previous state.
                $emit = isset(self::$prob_emit[$y][$c]) ? self::$prob_emit[$y][$c] : MIN_FLOAT;

                $best_prev = null;
                $best_prob = null;
                foreach ($states as $y0) {
                    $trans = isset(self::$prob_trans[$y0][$y]) ? self::$prob_trans[$y0][$y] : MIN_FLOAT;
                    $candidate = $V[$t-1][$y0] + $trans + $emit;
                    // Strict ">" keeps the first maximum in state order, so
                    // ties are resolved the same way on every PHP version.
                    if ($best_prev === null || $candidate > $best_prob) {
                        $best_prev = $y0;
                        $best_prob = $candidate;
                    }
                }

                $V[$t][$y] = $best_prob;
                $newpath[$y] = $path[$best_prev];
                $newpath[$y][] = $y;
            }

            $path = $newpath;
        }

        // A sentence may only end in 'E' or 'S'; 'E' wins a tie.
        $state = 'E';
        $prob = $V[$len-1]['E'];
        if ($V[$len-1]['S'] > $prob) {
            $state = 'S';
            $prob = $V[$len-1]['S'];
        }

        return array("prob"=>$prob, "pos_list"=>$path[$state]);
    }// end function viterbi

    /**
     * Static method __cut
     *
     * Splits $sentence into words according to the state sequence returned
     * by viterbi(): 'B'..'E' spans form one word, 'S' is a one-character
     * word, and any characters after the last emitted word are returned as a
     * trailing word.
     *
     * @param string $sentence # input sentence (UTF-8)
     * @param array  $options  # other options (accepted for API compatibility, currently unused)
     *
     * @return array $words
     */
    public static function __cut($sentence, $options = array())
    {
        $words = array();
        $len = mb_strlen($sentence, 'UTF-8');
        if ($len < 1) {
            return $words;
        }

        $viterbi_array = self::viterbi($sentence);
        $pos_list = $viterbi_array['pos_list'];

        $begin = 0;
        $next = 0;
        for ($i = 0; $i < $len; $i++) {
            $pos = $pos_list[$i];
            if ($pos === 'B') {
                $begin = $i;
            } elseif ($pos === 'E') {
                $words[] = mb_substr($sentence, $begin, ($i + 1) - $begin, 'UTF-8');
                $next = $i + 1;
            } elseif ($pos === 'S') {
                $words[] = mb_substr($sentence, $i, 1, 'UTF-8');
                $next = $i + 1;
            }
        }

        // Characters left over after the last completed word.
        if ($next < $len) {
            $words[] = mb_substr($sentence, $next, $len - $next, 'UTF-8');
        }

        return $words;
    }// end function __cut

    /**
     * Static method cut
     *
     * Extracts runs of hiragana, katakana, CJK ideographs (U+4E00-U+9FA5) or
     * hangul, and runs of ASCII letters/digits and the symbols listed in the
     * skip pattern. Script runs are segmented with __cut(); the other runs
     * are returned unchanged. Characters matching neither pattern are
     * dropped.
     *
     * @param string $sentence # input sentence (UTF-8)
     * @param array  $options  # other options (accepted for API compatibility, currently unused)
     *
     * @return array $seg_list
     */
    public static function cut($sentence, $options = array())
    {
        $seg_list = array();

        $re_cjk_pattern = '([\x{3040}-\x{309F}]+)|([\x{30A0}-\x{30FF}]+)|([\x{4E00}-\x{9FA5}]+)|([\x{AC00}-\x{D7AF}]+)';
        $re_skip_pattern = '([a-zA-Z0-9+#&=\._\r\n]+)';

        $found = preg_match_all(
            '/('.$re_cjk_pattern.'|'.$re_skip_pattern.')/u',
            $sentence,
            $matches,
            PREG_PATTERN_ORDER
        );
        // false (PCRE error, e.g. malformed UTF-8) or 0 (no match): nothing
        // to segment.
        if (!$found) {
            return $seg_list;
        }

        foreach ($matches[0] as $blk) {
            if (preg_match('/'.$re_cjk_pattern.'/u', $blk)) {
                foreach (self::__cut($blk) as $word) {
                    $seg_list[] = $word;
                }
            } else {
                $seg_list[] = $blk;
            }
        }// end foreach ($matches[0] as $blk)

        return $seg_list;
    }// end function cut
}
|