JiebaCache.php 15 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511
  1. <?php
/**
 * JiebaCache.php
 *
 * PHP version 5
 *
 * @category PHP
 * @package  /src/class/
 * @author   Fukuball Lin <fukuball@gmail.com>
 * @license  MIT Licence
 * @version  GIT: <fukuball/jieba-php>
 * @link     https://github.com/fukuball/jieba-php
 */
  14. namespace Fukuball\Jieba;
  15. use Fukuball\Tebru\MultiArray;
  16. define("MIN_FLOAT", -3.14e+100);
  17. /**
  18. * Jieba
  19. *
  20. * @category PHP
  21. * @package /src/class/
  22. * @author Fukuball Lin <fukuball@gmail.com>
  23. * @license MIT Licence
  24. * @version Release: <0.16>
  25. * @link https://github.com/fukuball/jieba-php
  26. */
  27. class JiebaCache
  28. {
  29. public static $total = 0.0;
  30. public static $trie = array();
  31. public static $FREQ = array();
  32. public static $min_freq = 0.0;
  33. public static $route = array();
  34. public static $dictname;
  35. public static $user_dictname=array();
  36. /**
  37. * Static method init
  38. *
  39. * @param array $options # other options
  40. *
  41. * @return void
  42. */
  43. public static function init($options = array())
  44. {
  45. $defaults = array(
  46. 'mode'=>'default',
  47. 'dict'=>'normal'
  48. );
  49. $options = array_merge($defaults, $options);
  50. if ($options['mode']=='test') {
  51. echo "Building Trie...\n";
  52. }
  53. if ($options['dict']=='small') {
  54. $f_name = "dict.small.txt";
  55. self::$dictname="dict.small.txt";
  56. } elseif ($options['dict']=='big') {
  57. $f_name = "dict.big.txt";
  58. self::$dictname="dict.big.txt";
  59. } else {
  60. $f_name = "dict.txt";
  61. self::$dictname="dict.txt";
  62. }
  63. $t1 = microtime(true);
  64. self::$trie = Jieba::genTrie(dirname(dirname(__FILE__))."/dict/".$f_name);
  65. if ($options['mode']=='test') {
  66. echo "loading model cost ".(microtime(true) - $t1)." seconds.\n";
  67. echo "Trie has been built succesfully.\n";
  68. }
  69. }// end function init
  70. /**
  71. * Static method calc
  72. *
  73. * @param string $sentence # input sentence
  74. * @param array $DAG # DAG
  75. * @param array $options # other options
  76. *
  77. * @return array self::$route
  78. */
  79. public static function calc($sentence, $DAG, $options = array())
  80. {
  81. $N = mb_strlen($sentence, 'UTF-8');
  82. self::$route = array();
  83. self::$route[$N] = array($N => 1.0);
  84. for ($i=($N-1); $i>=0; $i--) {
  85. $candidates = array();
  86. foreach ($DAG[$i] as $x) {
  87. $w_c = mb_substr($sentence, $i, (($x+1)-$i), 'UTF-8');
  88. $previous_freq = current(self::$route[$x+1]);
  89. if (isset(self::$FREQ[$w_c])) {
  90. $current_freq = (float) $previous_freq + self::$FREQ[$w_c];
  91. } else {
  92. $current_freq = (float) $previous_freq + self::$min_freq;
  93. }
  94. $candidates[$x] = $current_freq;
  95. }
  96. arsort($candidates);
  97. $max_prob = reset($candidates);
  98. $max_key = key($candidates);
  99. self::$route[$i] = array($max_key => $max_prob);
  100. }
  101. return self::$route;
  102. }// end function calc
  103. /**
  104. * Static method genTrie
  105. *
  106. * @param string $f_name # input f_name
  107. * @param array $options # other options
  108. *
  109. * @return array self::$trie
  110. */
  111. public static function genTrie($f_name, $options = array())
  112. {
  113. $defaults = array(
  114. 'mode'=>'default'
  115. );
  116. // 配置缓存文件
  117. $cachepath = dirname($f_name).'/cache/';
  118. if (!file_exists($cachepath)) {
  119. mkdir($cachepath);
  120. }
  121. $triecachefile = 'trie.cache';
  122. $freqcachefile = 'freq.cache';
  123. $totalcachefile = 'total.cache';
  124. $minfreqcachefile = 'minfreq.cache';
  125. $flag1 = file_exists($cachepath.$triecachefile) && file_exists($cachepath.$freqcachefile);
  126. $flag2 = file_exists($cachepath.$totalcachefile) && file_exists($cachepath.$minfreqcachefile);
  127. $flag = $flag1 && $flag2;
  128. if ($flag) {
  129. // 读取缓存文件
  130. $triecache = fopen($cachepath.$triecachefile, 'r');
  131. $triesize = filesize($cachepath.$triecachefile);
  132. self::$trie = unserialize(fread($triecache, $triesize));
  133. $freqcache = fopen($cachepath.$freqcachefile, 'r');
  134. $freqsize = filesize($cachepath.$freqcachefile);
  135. self::$FREQ = unserialize(fread($freqcache, $freqsize));
  136. $totalcache = fopen($cachepath.$totalcachefile, 'r');
  137. $totalsize = filesize($cachepath.$totalcachefile);
  138. self::$total = unserialize(fread($totalcache, $totalsize));
  139. $minfreqcache = fopen($cachepath.$minfreqcachefile, 'r');
  140. $minfreqsize = filesize($cachepath.$minfreqcachefile);
  141. self::$min_freq = unserialize(fread($minfreqcache, $minfreqsize));
  142. } else {
  143. // 建立树并缓存
  144. $options = array_merge($defaults, $options);
  145. self::$trie = new MultiArray(file_get_contents($f_name.'.json'));
  146. self::$trie->cache = new MultiArray(file_get_contents($f_name.'.cache.json'));
  147. $content = fopen($f_name, "r");
  148. while (($line = fgets($content)) !== false) {
  149. $explode_line = explode(" ", trim($line));
  150. $word = $explode_line[0];
  151. $freq = $explode_line[1];
  152. $tag = $explode_line[2];
  153. $freq = (float) $freq;
  154. self::$FREQ[$word] = $freq;
  155. self::$total += $freq;
  156. //$l = mb_strlen($word, 'UTF-8');
  157. //$word_c = array();
  158. //for ($i=0; $i<$l; $i++) {
  159. // $c = mb_substr($word, $i, 1, 'UTF-8');
  160. // $word_c[] = $c;
  161. //}
  162. //$word_c_key = implode('.', $word_c);
  163. //self::$trie->set($word_c_key, array("end"=>""));
  164. }
  165. fclose($content);
  166. foreach (self::$FREQ as $key => $value) {
  167. self::$FREQ[$key] = log($value / self::$total);
  168. }
  169. self::$min_freq = min(self::$FREQ);
  170. // 缓存文件
  171. $triecache = fopen($cachepath.$triecachefile, 'w');
  172. $triecontent = serialize(self::$trie);
  173. fwrite($triecache, $triecontent);
  174. fclose($triecache);
  175. $freqcache = fopen($cachepath.$freqcachefile, 'w');
  176. $freqcontent = serialize(self::$FREQ);
  177. fwrite($freqcache, $freqcontent);
  178. fclose($freqcache);
  179. $totalcache = fopen($cachepath.$totalcachefile, 'w');
  180. $totalcontent = serialize(self::$total);
  181. fwrite($totalcache, $totalcontent);
  182. fclose($totalcache);
  183. $minfreqcache = fopen($cachepath.$minfreqcachefile, 'w');
  184. $minfreqcontent = serialize(self::$min_freq);
  185. fwrite($minfreqcache, $minfreqcontent);
  186. fclose($minfreqcache);
  187. }
  188. return self::$trie;
  189. }// end function genTrie
  190. /**
  191. * Static method loadUserDict
  192. *
  193. * @param string $f_name # input f_name
  194. * @param array $options # other options
  195. *
  196. * @return array self::$trie
  197. */
  198. public static function loadUserDict($f_name, $options = array())
  199. {
  200. self::$user_dictname[] = $f_name;
  201. $content = fopen($f_name, "r");
  202. while (($line = fgets($content)) !== false) {
  203. $explode_line = explode(" ", trim($line));
  204. $word = $explode_line[0];
  205. $freq = $explode_line[1];
  206. $tag = $explode_line[2];
  207. $freq = (float) $freq;
  208. self::$total += $freq;
  209. self::$FREQ[$word] = log($freq / self::$total);
  210. $l = mb_strlen($word, 'UTF-8');
  211. $word_c = array();
  212. for ($i=0; $i<$l; $i++) {
  213. $c = mb_substr($word, $i, 1, 'UTF-8');
  214. $word_c[] = $c;
  215. }
  216. $word_c_key = implode('.', $word_c);
  217. self::$trie->set($word_c_key, array("end"=>""));
  218. }
  219. fclose($content);
  220. return self::$trie;
  221. }// end function loadUserDict
  222. /**
  223. * Static method __cutAll
  224. *
  225. * @param string $sentence # input sentence
  226. * @param array $options # other options
  227. *
  228. * @return array $words
  229. */
  230. public static function __cutAll($sentence, $options = array())
  231. {
  232. $defaults = array(
  233. 'mode'=>'default'
  234. );
  235. $options = array_merge($defaults, $options);
  236. $words = array();
  237. $DAG = self::getDAG($sentence);
  238. $old_j = -1;
  239. foreach ($DAG as $k => $L) {
  240. if (count($L) == 1 && $k > $old_j) {
  241. $word = mb_substr($sentence, $k, (($L[0]-$k)+1), 'UTF-8');
  242. $words[] = $word;
  243. $old_j = $L[0];
  244. } else {
  245. foreach ($L as $j) {
  246. if ($j > $k) {
  247. $word = mb_substr($sentence, $k, ($j-$k)+1, 'UTF-8');
  248. $words[] = $word;
  249. $old_j = $j;
  250. }
  251. }
  252. }
  253. }
  254. return $words;
  255. }// end function __cutAll
  256. /**
  257. * Static method getDAG
  258. *
  259. * @param string $sentence # input sentence
  260. * @param array $options # other options
  261. *
  262. * @return array $DAG
  263. */
  264. public static function getDAG($sentence, $options = array())
  265. {
  266. $defaults = array(
  267. 'mode'=>'default'
  268. );
  269. $options = array_merge($defaults, $options);
  270. $N = mb_strlen($sentence, 'UTF-8');
  271. $i = 0;
  272. $j = 0;
  273. $DAG = array();
  274. $word_c = array();
  275. while ($i < $N) {
  276. $c = mb_substr($sentence, $j, 1, 'UTF-8');
  277. if (count($word_c)==0) {
  278. $next_word_key = $c;
  279. } else {
  280. $next_word_key = implode('.', $word_c).'.'.$c;
  281. }
  282. if (self::$trie->exists($next_word_key)) {
  283. $word_c[] = $c;
  284. $next_word_key_value = self::$trie->get($next_word_key);
  285. if ($next_word_key_value == array("end"=>"")
  286. || isset($next_word_key_value["end"])
  287. || isset($next_word_key_value[0]["end"])
  288. ) {
  289. if (!isset($DAG[$i])) {
  290. $DAG[$i] = array();
  291. }
  292. $DAG[$i][] = $j;
  293. }
  294. $j += 1;
  295. if ($j >= $N) {
  296. $word_c = array();
  297. $i += 1;
  298. $j = $i;
  299. }
  300. } else {
  301. $word_c = array();
  302. $i += 1;
  303. $j = $i;
  304. }
  305. }
  306. for ($i=0; $i<$N; $i++) {
  307. if (!isset($DAG[$i])) {
  308. $DAG[$i] = array($i);
  309. }
  310. }
  311. return $DAG;
  312. }// end function getDAG
  313. /**
  314. * Static method __cutDAG
  315. *
  316. * @param string $sentence # input sentence
  317. * @param array $options # other options
  318. *
  319. * @return array $words
  320. */
  321. public static function __cutDAG($sentence, $options = array())
  322. {
  323. $defaults = array(
  324. 'mode'=>'default'
  325. );
  326. $options = array_merge($defaults, $options);
  327. $words = array();
  328. $N = mb_strlen($sentence, 'UTF-8');
  329. $DAG = self::getDAG($sentence);
  330. self::calc($sentence, $DAG);
  331. $x = 0;
  332. $buf = '';
  333. while ($x < $N) {
  334. $current_route_keys = array_keys(self::$route[$x]);
  335. $y = $current_route_keys[0]+1;
  336. $l_word = mb_substr($sentence, $x, ($y-$x), 'UTF-8');
  337. if (($y-$x)==1) {
  338. $buf = $buf.$l_word;
  339. } else {
  340. if (mb_strlen($buf, 'UTF-8')>0) {
  341. if (mb_strlen($buf, 'UTF-8')==1) {
  342. $words[] = $buf;
  343. $buf = '';
  344. } else {
  345. $regognized = Finalseg::cut($buf);
  346. foreach ($regognized as $key => $word) {
  347. $words[] = $word;
  348. }
  349. $buf = '';
  350. }
  351. }
  352. $words[] = $l_word;
  353. }
  354. $x = $y;
  355. }
  356. if (mb_strlen($buf, 'UTF-8')>0) {
  357. if (mb_strlen($buf, 'UTF-8')==1) {
  358. $words[] = $buf;
  359. } else {
  360. $regognized = Finalseg::cut($buf);
  361. foreach ($regognized as $key => $word) {
  362. $words[] = $word;
  363. }
  364. }
  365. }
  366. return $words;
  367. }// end function __cutDAG
  368. /**
  369. * Static method cut
  370. *
  371. * @param string $sentence # input sentence
  372. * @param boolean $cut_all # cut_all or not
  373. * @param array $options # other options
  374. *
  375. * @return array $seg_list
  376. */
  377. public static function cut($sentence, $cut_all = false, $options = array())
  378. {
  379. $defaults = array(
  380. 'mode'=>'default'
  381. );
  382. $options = array_merge($defaults, $options);
  383. $seg_list = array();
  384. $re_han_pattern = '([\x{4E00}-\x{9FA5}]+)';
  385. $re_skip_pattern = '([a-zA-Z0-9+#\r\n]+)';
  386. preg_match_all(
  387. '/('.$re_han_pattern.'|'.$re_skip_pattern.')/u',
  388. $sentence,
  389. $matches,
  390. PREG_PATTERN_ORDER
  391. );
  392. $blocks = $matches[0];
  393. foreach ($blocks as $blk) {
  394. if (preg_match('/'.$re_han_pattern.'/u', $blk)) {
  395. if ($cut_all) {
  396. $words = Jieba::__cutAll($blk);
  397. } else {
  398. $words = Jieba::__cutDAG($blk);
  399. }
  400. foreach ($words as $word) {
  401. $seg_list[] = $word;
  402. }
  403. } else {
  404. $seg_list[] = $blk;
  405. }// end else (preg_match('/'.$re_han_pattern.'/u', $blk))
  406. }// end foreach ($blocks as $blk)
  407. return $seg_list;
  408. }// end function cut
  409. /**
  410. * Static method cutForSearch
  411. *
  412. * @param string $sentence # input sentence
  413. * @param array $options # other options
  414. *
  415. * @return array $seg_list
  416. */
  417. public static function cutForSearch($sentence, $options = array())
  418. {
  419. $defaults = array(
  420. 'mode'=>'default'
  421. );
  422. $options = array_merge($defaults, $options);
  423. $seg_list = array();
  424. $cut_seg_list = Jieba::cut($sentence);
  425. foreach ($cut_seg_list as $w) {
  426. $len = mb_strlen($w, 'UTF-8');
  427. if ($len>2) {
  428. for ($i=0; $i<($len-1); $i++) {
  429. $gram2 = mb_substr($w, $i, 2, 'UTF-8');
  430. if (isset(self::$FREQ[$gram2])) {
  431. $seg_list[] = $gram2;
  432. }
  433. }
  434. }
  435. if (mb_strlen($w, 'UTF-8')>3) {
  436. for ($i=0; $i<($len-2); $i++) {
  437. $gram3 = mb_substr($w, $i, 3, 'UTF-8');
  438. if (isset(self::$FREQ[$gram3])) {
  439. $seg_list[] = $gram3;
  440. }
  441. }
  442. }
  443. $seg_list[] = $w;
  444. }
  445. return $seg_list;
  446. }// end function cutForSearch
  447. }// end of class JiebaCache