JiebaTest.php 6.6 KB


  1. <?php
  2. use Fukuball\Jieba\Jieba;
  3. use Fukuball\Jieba\Finalseg;
  4. use Fukuball\Jieba\JiebaAnalyse;
  5. use Fukuball\Jieba\Posseg;
  6. use PHPUnit\Framework\TestCase;
  /**
   * Regression tests for the Jieba word segmenter and its companion classes
   * (Finalseg, JiebaAnalyse and Posseg).
   *
   * NOTE(review): several tests call Jieba::cut() / Finalseg::cut() /
   * Posseg::cut() without initialising the library in the same method, so
   * they appear to rely on the init tests having run earlier in the same
   * process - confirm before reordering or running tests in isolation.
   */
  class JiebaTest extends TestCase
  {
      /**
       * Jieba::init() must leave a positive value in Jieba::$total.
       */
      public function testJiebaInit()
      {
          Jieba::init();

          $this->assertGreaterThan(0, Jieba::$total);
      }

      /**
       * Finalseg::init() must populate Finalseg::$prob_start with 4 entries.
       */
      public function testFinalsegInit()
      {
          Finalseg::init();

          // assertCount reports the actual size on failure, unlike comparing
          // a pre-computed count() with assertEquals.
          $this->assertCount(4, Finalseg::$prob_start);
      }

      /**
       * JiebaAnalyse::init() must leave a positive JiebaAnalyse::$max_idf.
       */
      public function testJiebaAnalyseInit()
      {
          Jieba::init();
          JiebaAnalyse::init();

          $this->assertGreaterThan(0, JiebaAnalyse::$max_idf);
      }

      /**
       * Posseg::init() must populate Posseg::$prob_start with 256 entries.
       */
      public function testPossegInit()
      {
          Posseg::init();

          $this->assertCount(256, Posseg::$prob_start);
      }

      /**
       * Jieba::cut() in its default mode must split each sample sentence
       * into the expected word list.
       */
      public function testJiebaCut()
      {
          $expected = [
              '怜香惜玉',
              '也',
              '得',
              '要',
              '看',
              '对象',
              '啊',
              '!',
          ];
          $this->assertEquals($expected, Jieba::cut('怜香惜玉也得要看对象啊!'));

          $expected = [
              '我',
              '来到',
              '北京',
              '清华大学',
          ];
          $this->assertEquals($expected, Jieba::cut('我来到北京清华大学'));

          $expected = [
              '他',
              '来到',
              '了',
              '网易',
              '杭研',
              '大厦',
          ];
          $this->assertEquals($expected, Jieba::cut('他来到了网易杭研大厦'));
      }

      /**
       * Jieba::cut() called with `true` as its second argument must return
       * the overlapping word list expected for the sample sentence.
       */
      public function testJiebaCutAll()
      {
          $expected = [
              '我',
              '来到',
              '北京',
              '清华',
              '清华大学',
              '华大',
              '大学',
          ];

          $this->assertEquals($expected, Jieba::cut('我来到北京清华大学', true));
      }

      /**
       * Jieba::cutForSearch() must return the expected word list, including
       * the shorter words nested inside the longer ones.
       */
      public function testJiebaCutForSearch()
      {
          $expected = [
              '小',
              '明',
              '硕士',
              '毕业',
              '于',
              '中国',
              '科学',
              '学院',
              '科学院',
              '中国科学院',
              '计算',
              '计算所',
              ',',
              '后',
              '在',
              '日本',
              '京都',
              '大学',
              '日本京都大学',
              '深造',
          ];

          // Spelled with the canonical casing; the previous "cutForSEarch"
          // only resolved because PHP method names are case-insensitive.
          $actual = Jieba::cutForSearch('小明硕士毕业于中国科学院计算所,后在日本京都大学深造');

          $this->assertEquals($expected, $actual);
      }

      /**
       * Finalseg::cut() must return the expected word list for the sample
       * sentence; the expected list does not contain the trailing "!".
       */
      public function testFinalsegCut()
      {
          $expected = [
              '怜香惜',
              '玉',
              '也',
              '得',
              '要',
              '看',
              '对象',
              '啊',
          ];

          $this->assertEquals($expected, Finalseg::cut('怜香惜玉也得要看对象啊!'));
      }

      /**
       * JiebaAnalyse::extractTags() must return the nine expected keywords
       * and weights for the bundled lyric fixture.
       */
      public function testExtractTags()
      {
          $expected = [
              '所謂' => 1.0102620424985915,
              '是否' => 0.7386504806253521,
              '一般' => 0.60759968349154936,
              '沒有' => 0.33675401416619716,
              '肌迫' => 0.33675401416619716,
              '雖然' => 0.33675401416619716,
              '退縮' => 0.33675401416619716,
              '矯作' => 0.33675401416619716,
              '怯懦' => 0.27109891642140843,
          ];
          $top_k = 9;
          $fixture = dirname(__DIR__) . '/src/dict/lyric.txt';

          // No second argument: that parameter is the boolean include-path
          // flag, not an fopen()-style mode, so the former "r" was coerced
          // to true and silently enabled include_path lookup.
          $content = file_get_contents($fixture);

          // Fail with a clear message when the fixture cannot be read rather
          // than handing `false` to the extractor.
          $this->assertNotFalse($content, 'Unable to read fixture: ' . $fixture);

          $tags = JiebaAnalyse::extractTags($content, $top_k);

          $this->assertEquals($expected, $tags);
      }

      /**
       * After Jieba::loadUserDict() loads the bundled user dictionary,
       * Jieba::cut() must return the expected word list for the sample
       * sentence.
       */
      public function testLoadUserDict()
      {
          $expected = [
              '李小福',
              '是',
              '创新办',
              '主任',
              '也',
              '是',
              '云计算',
              '方面',
              '的',
              '专家',
          ];

          Jieba::loadUserDict(dirname(__DIR__) . '/src/dict/user_dict.txt');

          $this->assertEquals($expected, Jieba::cut('李小福是创新办主任也是云计算方面的专家'));
      }

      /**
       * Posseg::cut() must return one word/tag pair per token, in sentence
       * order, for the mixed Chinese/Latin sample.
       */
      public function testPossegCut()
      {
          $expected = [
              ['word' => '这', 'tag' => 'r'],
              ['word' => '是', 'tag' => 'v'],
              ['word' => '一个', 'tag' => 'm'],
              ['word' => '伸手不见五指', 'tag' => 'i'],
              ['word' => '的', 'tag' => 'uj'],
              ['word' => '黑夜', 'tag' => 'n'],
              ['word' => '。', 'tag' => 'w'],
              ['word' => '我', 'tag' => 'r'],
              ['word' => '叫', 'tag' => 'v'],
              ['word' => '孙悟空', 'tag' => 'nr'],
              ['word' => ',', 'tag' => 'w'],
              ['word' => '我', 'tag' => 'r'],
              ['word' => '爱', 'tag' => 'v'],
              ['word' => '北京', 'tag' => 'ns'],
              ['word' => ',', 'tag' => 'w'],
              ['word' => '我', 'tag' => 'r'],
              ['word' => '爱', 'tag' => 'v'],
              ['word' => 'Python', 'tag' => 'eng'],
              ['word' => '和', 'tag' => 'c'],
              ['word' => 'C++', 'tag' => 'eng'],
              ['word' => '。', 'tag' => 'w'],
          ];

          $actual = Posseg::cut('这是一个伸手不见五指的黑夜。我叫孙悟空,我爱北京,我爱Python和C++。');

          $this->assertEquals($expected, $actual);
      }
  }