123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654 |
- <?php
- /**
- * Posseg.php
- *
- * PHP version 5
- *
- * @category PHP
- * @package /src/class/
- * @author Fukuball Lin <fukuball@gmail.com>
- * @license MIT Licence
- * @version GIT: <fukuball/jieba-php>
- * @link https://github.com/fukuball/jieba-php
- */
- namespace Fukuball\Jieba;
/**
 * Posseg
 *
 * Part-of-speech tagger. Words found by the Jieba dictionary are tagged from
 * a word => tag table; runs of characters the dictionary cannot group are
 * tagged with a hidden Markov model decoded by the Viterbi algorithm.
 *
 * All state is held in public static properties populated by init().
 *
 * @category PHP
 * @package  /src/class/
 * @author   Fukuball Lin <fukuball@gmail.com>
 * @license  MIT Licence
 * @version  Release: <0.16>
 * @link     https://github.com/fukuball/jieba-php
 */
class Posseg
{
    // HMM parameters, decoded from the JSON files under model/pos/.
    // Keys are state strings of the form "('B', 'n')" (position, tag).
    public static $prob_start = array();
    public static $prob_trans = array();
    public static $prob_emit = array();
    // character => list of state strings allowed for that character
    public static $char_state = array();
    // word => POS tag, read from the main and user dictionaries
    public static $word_tag = array();
    // POS tag => human readable meaning, read from pos_tag_readable.txt
    public static $pos_tag_readable = array();

    // Shared by cut() and __cutDetail(): runs of CJK ideographs and runs of
    // full-width punctuation.
    private static $re_han_pattern = '([\x{4E00}-\x{9FA5}]+)';
    private static $re_punctuation_pattern = '([\x{ff5e}\x{ff01}\x{ff08}\x{ff09}\x{300e}\x{300c}\x{300d}\x{300f}\x{3001}\x{ff1a}\x{ff1b}\x{ff0c}\x{ff1f}\x{3002}]+)';

    /**
     * Static method init
     *
     * Loads the HMM model files, the word => tag table (main dictionary
     * followed by every user dictionary, later files overriding earlier
     * ones) and the tag => meaning table.
     *
     * @param array $options # other options (currently unused)
     *
     * @return void
     */
    public static function init($options = array())
    {
        $options = array_merge(array('mode' => 'default'), $options);

        $base_dir = dirname(dirname(__FILE__));
        $model_dir = $base_dir.'/model/pos/';

        self::$prob_start = self::loadModel($model_dir.'prob_start.json');
        self::$prob_trans = self::loadModel($model_dir.'prob_trans.json');
        self::$prob_emit = self::loadModel($model_dir.'prob_emit.json');
        self::$char_state = self::loadModel($model_dir.'char_state.json');

        // Dictionary lines are "word freq tag": the tag is the third column.
        if (Jieba::$dictname != '') {
            self::loadColumnFile($base_dir.'/dict/'.Jieba::$dictname, 2, self::$word_tag);
        }
        foreach (Jieba::$user_dictname as $user_dict_path) {
            self::loadColumnFile($user_dict_path, 2, self::$word_tag);
        }

        // Readable-tag lines are "tag meaning": the meaning is the second column.
        self::loadColumnFile($base_dir.'/dict/pos_tag_readable.txt', 1, self::$pos_tag_readable);
    }// end function init

    /**
     * Static method loadColumnFile
     *
     * Reads a space separated text file line by line and stores
     * first-column => $value_column-th column into $target.
     *
     * Blank lines and lines that do not have the requested column are
     * skipped, so a short line never overwrites an existing entry. A file
     * that cannot be opened is skipped as well (fopen() has already raised
     * its own warning at that point).
     *
     * @param string $path         # file to read
     * @param int    $value_column # zero-based index of the value column
     * @param array  &$target      # map that receives the entries
     *
     * @return void
     */
    private static function loadColumnFile($path, $value_column, array &$target)
    {
        $handle = fopen($path, 'r');
        if ($handle === false) {
            return;
        }

        while (($line = fgets($handle)) !== false) {
            $fields = explode(' ', trim($line));
            if ($fields[0] === '' || !isset($fields[$value_column])) {
                continue;
            }
            $target[$fields[0]] = $fields[$value_column];
        }

        fclose($handle);
    }// end function loadColumnFile

    /**
     * Static method loadModel
     *
     * @param string $f_name  # path of a JSON model file
     * @param array  $options # other options (currently unused)
     *
     * @return array|null decoded JSON as associative arrays; null when the
     *                    file cannot be read or is not valid JSON
     */
    public static function loadModel($f_name, $options = array())
    {
        $options = array_merge(array('mode' => 'default'), $options);

        return json_decode(file_get_contents($f_name), true);
    }// end function loadModel

    /**
     * Static method getTopStates
     *
     * Returns the $top_k highest scoring entries of a state => score map,
     * ordered from best to worst, keys preserved.
     *
     * @param array $t_state_v # input state => score map
     * @param int   $top_k     # number of entries to keep
     * @param array $options   # other options (currently unused)
     *
     * @return array $top_states
     */
    public static function getTopStates($t_state_v, $top_k = 4, $options = array())
    {
        arsort($t_state_v);

        return array_slice($t_state_v, 0, $top_k);
    }// end function getTopStates

    /**
     * Static method minFloat
     *
     * Score used for transitions/emissions that are missing from the model.
     * Uses the global MIN_FLOAT constant when it is defined.
     * NOTE(review): the -3.14e100 fallback is presumably the same value the
     * project defines for MIN_FLOAT - confirm against Jieba.php.
     *
     * @return float
     */
    private static function minFloat()
    {
        return defined('MIN_FLOAT') ? MIN_FLOAT : -3.14e100;
    }// end function minFloat

    /**
     * Static method parseState
     *
     * Splits a state string such as "('B', 'n')" into array('B', 'n')
     * without evaluating it as PHP code.
     *
     * @param string $state # state string taken from the model
     *
     * @return array list of the quoted items, at least two entries
     *
     * @throws \UnexpectedValueException when fewer than two quoted items are found
     */
    private static function parseState($state)
    {
        $found = preg_match_all('/([\'"])(.*?)\1/', (string) $state, $matches);
        if ($found === false || $found < 2) {
            throw new \UnexpectedValueException('Malformed POS state: '.var_export($state, true));
        }

        return $matches[2];
    }// end function parseState

    /**
     * Static method lookupTag
     *
     * @param string $word # word to look up
     *
     * @return string tag from the word => tag table, 'x' when unknown
     */
    private static function lookupTag($word)
    {
        return isset(self::$word_tag[$word]) ? self::$word_tag[$word] : 'x';
    }// end function lookupTag

    /**
     * Static method viterbi
     *
     * Finds the most probable state sequence for the characters of
     * $sentence. Scores are added, not multiplied, so the model values are
     * treated as log-probabilities.
     *
     * @param string $sentence # input sentence
     * @param array  $options  # other options (currently unused)
     *
     * @return array array('prob' => best score, 'pos_list' => one state
     *               string per character)
     */
    public static function viterbi($sentence, $options = array())
    {
        $options = array_merge(array('mode' => 'default'), $options);

        $obs = $sentence;
        $obs_length = mb_strlen($obs, 'UTF-8');
        $min_float = self::minFloat();
        $states = self::$char_state;
        $all_states = array_keys(self::$prob_trans);

        $V = array(0 => array());        // $V[$t][$state] = best score ending in $state at $t
        $mem_path = array(0 => array()); // $mem_path[$t][$state] = predecessor of $state at $t

        // First character: start probability + emission.
        $c = mb_substr($obs, 0, 1, 'UTF-8');
        $c_states = !empty($states[$c]) ? $states[$c] : $all_states;
        foreach ($c_states as $y) {
            $prob_emit = isset(self::$prob_emit[$y][$c]) ? self::$prob_emit[$y][$c] : $min_float;
            $V[0][$y] = self::$prob_start[$y] + $prob_emit;
            $mem_path[0][$y] = '';
        }

        for ($t = 1; $t < $obs_length; $t++) {
            $c = mb_substr($obs, $t, 1, 'UTF-8');
            $V[$t] = array();
            $mem_path[$t] = array();

            // Previous states that have outgoing transitions, plus the set of
            // states reachable from any of them.
            $prev_states = array();
            $reachable = array();
            foreach (array_keys($mem_path[$t - 1]) as $prev_state) {
                if (isset(self::$prob_trans[$prev_state])
                    && count(self::$prob_trans[$prev_state]) > 0
                ) {
                    $prev_states[] = $prev_state;
                    foreach (self::$prob_trans[$prev_state] as $next_state => $unused) {
                        $reachable[$next_state] = true;
                    }
                }
            }

            // Candidate states for this character, restricted to reachable
            // ones; fall back to every state when nothing is left.
            $candidates = isset($states[$c]) ? $states[$c] : $all_states;
            $obs_states = array();
            foreach ($candidates as $candidate) {
                if (isset($reachable[$candidate])) {
                    $obs_states[] = $candidate;
                }
            }
            if (count($obs_states) == 0) {
                $obs_states = $all_states;
            }

            foreach ($obs_states as $y) {
                // The emission score does not depend on the predecessor.
                $prob_emit = isset(self::$prob_emit[$y][$c]) ? self::$prob_emit[$y][$c] : $min_float;

                $temp_prob_array = array();
                foreach ($prev_states as $y0) {
                    $prob_trans = isset(self::$prob_trans[$y0][$y])
                        ? self::$prob_trans[$y0][$y]
                        : $min_float;
                    $temp_prob_array[$y0] = $V[$t - 1][$y0] + $prob_trans + $prob_emit;
                }

                // arsort + reset/key is kept on purpose: it decides which
                // predecessor wins when several scores are equal.
                arsort($temp_prob_array);
                $V[$t][$y] = reset($temp_prob_array);
                $mem_path[$t][$y] = key($temp_prob_array);
            }
        }

        // Best final state.
        $final_scores = end($V);
        $last = array();
        foreach (array_keys(end($mem_path)) as $y) {
            $last[$y] = $final_scores[$y];
        }
        arsort($last);
        $return_prob = reset($last);
        $return_prob_key = key($last);

        // Walk the predecessor links backwards to rebuild the path.
        $route = array();
        for ($t = 0; $t < $obs_length; $t++) {
            $route[] = 'None';
        }
        for ($i = $obs_length - 1; $i >= 0; $i--) {
            $route[$i] = $return_prob_key;
            $return_prob_key = $mem_path[$i][$return_prob_key];
        }

        return array('prob' => $return_prob, 'pos_list' => $route);
    }// end function viterbi

    /**
     * Static method __cut
     *
     * Segments and tags $sentence with the HMM only. The position part of
     * each decoded state drives the segmentation: 'B' opens a word, 'E'
     * closes it, 'S' is a single-character word. Characters left over after
     * the last closed word are emitted as one final word.
     *
     * @param string $sentence # input sentence
     * @param array  $options  # other options (currently unused)
     *
     * @return array $words list of array('word' => ..., 'tag' => ...)
     *
     * @throws \UnexpectedValueException when a decoded state is malformed
     */
    public static function __cut($sentence, $options = array('HMM' => true))
    {
        $options = array_merge(array('mode' => 'default'), $options);

        $words = array();
        $viterbi_array = self::viterbi($sentence);
        $pos_list = $viterbi_array['pos_list'];

        $begin = 0;
        $next = 0;
        $len = mb_strlen($sentence, 'UTF-8');

        for ($i = 0; $i < $len; $i++) {
            $state = self::parseState($pos_list[$i]);
            $pos = $state[0];
            $tag = $state[1];

            if ($pos == 'B') {
                $begin = $i;
            } elseif ($pos == 'E') {
                $words[] = array(
                    'word' => mb_substr($sentence, $begin, ($i + 1) - $begin, 'UTF-8'),
                    'tag' => $tag
                );
                $next = $i + 1;
            } elseif ($pos == 'S') {
                $words[] = array(
                    'word' => mb_substr($sentence, $i, 1, 'UTF-8'),
                    'tag' => $tag
                );
                $next = $i + 1;
            }
        }

        if ($next < $len) {
            $state = self::parseState($pos_list[$next]);
            $words[] = array(
                'word' => mb_substr($sentence, $next, $len - $next, 'UTF-8'),
                'tag' => $state[1]
            );
        }

        return $words;
    }// end function __cut

    /**
     * Static method splitBlocks
     *
     * Splits $sentence into the consecutive runs matched by any of the three
     * patterns; text matched by none of them is discarded.
     *
     * @param string $sentence        # input sentence
     * @param string $re_skip_pattern # pattern for the non-Han "skip" runs
     *
     * @return array list of matched runs in order of appearance
     */
    private static function splitBlocks($sentence, $re_skip_pattern)
    {
        preg_match_all(
            '/('.self::$re_han_pattern.'|'.$re_skip_pattern.'|'.self::$re_punctuation_pattern.')/u',
            $sentence,
            $matches,
            PREG_PATTERN_ORDER
        );

        return $matches[0];
    }// end function splitBlocks

    /**
     * Static method tagNonHanBlock
     *
     * Tags a block that is not a run of Han characters: 'm' for numbers,
     * 'eng' for latin text, 'w' for punctuation.
     *
     * NOTE(review): the number/english checks are unanchored, so a block
     * such as "abc123" is tagged 'm' because it contains a digit. This is
     * the existing behaviour and is kept as is - confirm whether a
     * start-anchored match was intended.
     *
     * @param string $blk             # block to classify
     * @param string $re_skip_pattern # pattern for "skip" runs
     * @param string $re_num_pattern  # pattern for numbers
     * @param string $re_eng_pattern  # pattern for latin text
     *
     * @return array|null array('word' => ..., 'tag' => ...) or null when the
     *                    block matches none of the categories
     */
    private static function tagNonHanBlock($blk, $re_skip_pattern, $re_num_pattern, $re_eng_pattern)
    {
        if (preg_match('/'.$re_skip_pattern.'/u', $blk)) {
            if (preg_match('/'.$re_num_pattern.'/u', $blk)) {
                return array('word' => $blk, 'tag' => 'm');
            }
            if (preg_match('/'.$re_eng_pattern.'/u', $blk)) {
                return array('word' => $blk, 'tag' => 'eng');
            }

            return null;
        }

        if (preg_match('/'.self::$re_punctuation_pattern.'/u', $blk)) {
            return array('word' => $blk, 'tag' => 'w');
        }

        return null;
    }// end function tagNonHanBlock

    /**
     * Static method __cutDetail
     *
     * Tags text the dictionary could not group: Han runs go through the HMM
     * (__cut), other runs are classified as number, latin text or
     * punctuation.
     *
     * @param string $sentence # input sentence
     * @param array  $options  # other options (currently unused)
     *
     * @return array $words list of array('word' => ..., 'tag' => ...)
     */
    public static function __cutDetail($sentence, $options = array())
    {
        $options = array_merge(array('mode' => 'default'), $options);

        $re_skip_pattern = '([a-zA-Z0-9+#&=\._\r\n]+)';
        $re_eng_pattern = '[a-zA-Z0-9]+';
        $re_num_pattern = '[\.0-9]+';

        $words = array();
        foreach (self::splitBlocks($sentence, $re_skip_pattern) as $blk) {
            if (preg_match('/'.self::$re_han_pattern.'/u', $blk)) {
                foreach (self::__cut($blk) as $blk_word) {
                    $words[] = $blk_word;
                }
                continue;
            }

            $pair = self::tagNonHanBlock($blk, $re_skip_pattern, $re_num_pattern, $re_eng_pattern);
            if ($pair !== null) {
                $words[] = $pair;
            }
        }

        return $words;
    }// end function __cutDetail

    /**
     * Static method tagBuffer
     *
     * Tags a buffer of consecutive single-character route steps collected
     * by __cutDAG(): a one-character buffer is looked up directly, a buffer
     * unknown to Jieba::$FREQ is re-segmented by __cutDetail(), and a buffer
     * known to Jieba::$FREQ is emitted character by character.
     *
     * @param string $buf # non-empty buffer
     *
     * @return array list of array('word' => ..., 'tag' => ...)
     */
    private static function tagBuffer($buf)
    {
        if (mb_strlen($buf, 'UTF-8') == 1) {
            return array(array('word' => $buf, 'tag' => self::lookupTag($buf)));
        }

        if (!isset(Jieba::$FREQ[$buf])) {
            return self::__cutDetail($buf);
        }

        $pairs = array();
        foreach (preg_split('//u', $buf, -1, PREG_SPLIT_NO_EMPTY) as $char) {
            $pairs[] = array('word' => $char, 'tag' => self::lookupTag($char));
        }

        return $pairs;
    }// end function tagBuffer

    /**
     * Static method __cutDAG
     *
     * Segments $sentence along the route computed by Jieba::calc(). Multi
     * character steps are tagged from the word => tag table; consecutive
     * single character steps are buffered and handed to tagBuffer().
     *
     * @param string $sentence # input sentence
     * @param array  $options  # other options (currently unused)
     *
     * @return array $words list of array('word' => ..., 'tag' => ...)
     */
    public static function __cutDAG($sentence, $options = array())
    {
        $options = array_merge(array('mode' => 'default'), $options);

        $words = array();
        $N = mb_strlen($sentence, 'UTF-8');
        $DAG = Jieba::getDAG($sentence);
        Jieba::calc($sentence, $DAG);

        $x = 0;
        $buf = '';
        while ($x < $N) {
            // The first key of the route entry is the index of the last
            // character of the step that starts at $x.
            $current_route_keys = array_keys(Jieba::$route[$x]);
            $y = $current_route_keys[0] + 1;
            $l_word = mb_substr($sentence, $x, $y - $x, 'UTF-8');

            if (($y - $x) == 1) {
                $buf .= $l_word;
            } else {
                if ($buf !== '') {
                    foreach (self::tagBuffer($buf) as $pair) {
                        $words[] = $pair;
                    }
                    $buf = '';
                }
                $words[] = array('word' => $l_word, 'tag' => self::lookupTag($l_word));
            }

            $x = $y;
        }

        if ($buf !== '') {
            foreach (self::tagBuffer($buf) as $pair) {
                $words[] = $pair;
            }
        }

        return $words;
    }// end function __cutDAG

    /**
     * Static method __cutDAGNoHMM
     *
     * Segments $sentence along the route computed by Jieba::calc() without
     * using the HMM. Steps containing latin letters, '+' or '#' are merged
     * into one buffered word; every word is tagged from the word => tag
     * table ('x' when unknown).
     *
     * @param string $sentence # input sentence
     * @param array  $options  # other options (currently unused)
     *
     * @return array $words list of array('word' => ..., 'tag' => ...)
     */
    public static function __cutDAGNoHMM($sentence, $options = array())
    {
        $options = array_merge(array('mode' => 'default'), $options);

        $words = array();
        $N = mb_strlen($sentence, 'UTF-8');
        $DAG = Jieba::getDAG($sentence);
        Jieba::calc($sentence, $DAG);

        $re_eng_pattern = '[a-zA-Z+#]+';
        $x = 0;
        $buf = '';
        while ($x < $N) {
            $current_route_keys = array_keys(Jieba::$route[$x]);
            $y = $current_route_keys[0] + 1;
            $l_word = mb_substr($sentence, $x, $y - $x, 'UTF-8');

            if (preg_match('/'.$re_eng_pattern.'/u', $l_word)) {
                $buf .= $l_word;
            } else {
                if ($buf !== '') {
                    $words[] = array('word' => $buf, 'tag' => self::lookupTag($buf));
                    $buf = '';
                }
                $words[] = array('word' => $l_word, 'tag' => self::lookupTag($l_word));
            }

            $x = $y;
        }

        if ($buf !== '') {
            $words[] = array('word' => $buf, 'tag' => self::lookupTag($buf));
        }

        return $words;
    }// end function __cutDAGNoHMM

    /**
     * Static method cut
     *
     * Public entry point: splits $sentence into Han runs, latin/number runs
     * and punctuation runs and tags each of them. Han runs are segmented
     * with __cutDAG() when the 'HMM' option is truthy and with
     * __cutDAGNoHMM() otherwise.
     *
     * @param string $sentence # input sentence
     * @param array  $options  # 'HMM' => bool (defaults to true, also when
     *                           the caller passes options without that key)
     *
     * @return array $seg_list list of array('word' => ..., 'tag' => ...)
     */
    public static function cut($sentence, $options = array('HMM' => true))
    {
        $defaults = array(
            'mode' => 'default',
            'HMM' => true
        );
        // A non-array argument is treated as "no options" instead of being
        // hidden behind error suppression.
        $options = array_merge($defaults, is_array($options) ? $options : array());

        $re_skip_pattern = '([a-zA-Z0-9+#\r\n]+)';
        $re_eng_pattern = '[a-zA-Z+#]+';
        $re_num_pattern = '[0-9]+';

        $seg_list = array();
        foreach (self::splitBlocks($sentence, $re_skip_pattern) as $blk) {
            if (preg_match('/'.self::$re_han_pattern.'/u', $blk)) {
                $words = $options['HMM'] ? self::__cutDAG($blk) : self::__cutDAGNoHMM($blk);
                foreach ($words as $word) {
                    $seg_list[] = $word;
                }
                continue;
            }

            $pair = self::tagNonHanBlock($blk, $re_skip_pattern, $re_num_pattern, $re_eng_pattern);
            if ($pair !== null) {
                $seg_list[] = $pair;
            }
        }

        return $seg_list;
    }// end function cut

    /**
     * Static method posTagReadable
     *
     * Adds a 'tag_readable' entry to every segment, taken from the
     * tag => meaning table; null when the tag is not in the table.
     *
     * @param array $seg_list # list of array('word' => ..., 'tag' => ...)
     * @param array $options  # other options (currently unused)
     *
     * @return array $new_seg_list
     */
    public static function posTagReadable($seg_list, $options = array())
    {
        $options = array_merge(array('mode' => 'default'), $options);

        $new_seg_list = array();
        foreach ($seg_list as $seg) {
            $tag = $seg['tag'];
            $seg['tag_readable'] = isset(self::$pos_tag_readable[$tag])
                ? self::$pos_tag_readable[$tag]
                : null;
            $new_seg_list[] = $seg;
        }

        return $new_seg_list;
    }// end function posTagReadable
}
|