CollectorService.php 12 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347
  1. <?php
  2. namespace App\JsonRpc;
  3. use App\Model\ArticleData;
  4. use App\Model\OldModel\Article as OldArticle;
  5. use App\Model\Article;
  6. use App\Model\Rule;
  7. use App\Model\Web;
  8. use Hyperf\DbConnection\Db;
  9. use Hyperf\RpcServer\Annotation\RpcService;
  10. use App\Tools\Result;
  11. use QL\QueryList;
  12. use Swoole\Coroutine;
  13. #[RpcService(name: "CollectorService", protocol: "jsonrpc-http", server: "jsonrpc-http")]
  14. class CollectorService implements CollectorServiceInterface
  15. {
  /**
   * Register a new website.
   *
   * A website is identified by its `name`: when a row with the same name is
   * already stored, nothing is inserted and an error result is returned.
   * On insert, `created_at` is stamped with the current Asia/Shanghai time.
   *
   * @param array $data Column => value map handed to Web::insert(); must contain a non-empty `name`.
   * @return array Payload built by Result::success() / Result::error().
   */
  public function addWeb(array $data): array
  {
      $name = $data['name'] ?? null;
      if ($name === null || $name === '') {
          return Result::error('网站名称不能为空!');
      }

      if (Web::where(['name' => $name])->first() !== null) {
          return Result::error('此网站已存在,不可重复添加!');
      }

      // Format the timestamp with an explicit timezone instead of calling
      // date_default_timezone_set(), which would change the default timezone
      // of the whole worker process as a side effect of one request.
      $data['created_at'] = (new \DateTimeImmutable('now', new \DateTimeZone('Asia/Shanghai')))
          ->format('Y-m-d H:i:s');

      if (empty(Web::insert($data))) {
          return Result::error('添加失败');
      }

      return Result::success('添加成功');
  }
  /**
   * List websites, optionally filtered by a name keyword.
   *
   * With `keyWord` set, returns every website whose name contains the keyword
   * (SQL LIKE %keyword%); without it, returns all websites. Each case reports
   * its own error message when nothing is found.
   *
   * @param array $data Optional key `keyWord`: substring to search for in the website name.
   * @return array Payload built by Result::success() (the matching rows) or Result::error().
   */
  public function getWeb(array $data): array
  {
      if (isset($data['keyWord'])) {
          // The search result must be what gets returned; it used to be stored in
          // a separate variable, so a successful search still ended in an error.
          $web = Web::where([
              ['name', 'like', '%' . $data['keyWord'] . '%'],
          ])->get();

          // get() yields a collection object, which empty() never treats as
          // empty, so emptiness is checked with isEmpty().
          if ($web->isEmpty()) {
              return Result::error('未查找到相关网站!');
          }

          return Result::success($web);
      }

      $web = Web::get();
      if ($web->isEmpty()) {
          return Result::error('您还未添加网站,请先去添加!');
      }

      return Result::success($web);
  }
  /**
   * Update an existing website.
   *
   * Looks the website up by `id` first; the whole `$data` array is then passed
   * to update() for that row.
   *
   * @param array $data Column => value map; `id` selects the row to update.
   * @return array Result::error() when the id is unknown or update() returns an
   *               empty value, otherwise Result::success() carrying update()'s
   *               return value (presumably the affected-row count).
   */
  public function upWeb(array $data): array
  {
      $existing = Web::where('id', $data['id'])->first();
      if (empty($existing)) {
          return Result::error('请输入正确的网站id!');
      }

      $affected = Web::where('id', $data['id'])->update($data);

      return empty($affected)
          ? Result::error('无法修改!')
          : Result::success($affected);
  }
  /**
   * Delete a website by id.
   *
   * @param array $data Must contain `id`, the website row to delete.
   * @return array Result::error() when the id is unknown or delete() returns an
   *               empty value, otherwise Result::success() carrying delete()'s
   *               return value (presumably the number of deleted rows).
   */
  public function delWeb(array $data): array
  {
      $existing = Web::where('id', $data['id'])->first();
      if (empty($existing)) {
          return Result::error('请输入正确的网站id!');
      }

      $deleted = Web::where('id', $data['id'])->delete();

      return empty($deleted)
          ? Result::error('无法删除!')
          : Result::success($deleted);
  }
  /**
   * Run the crawl described by one collection rule.
   *
   * Loads the rule joined with its website (name, url, type) and dispatches on
   * the website type:
   *  - type 1: HTML list pages. The list-page URLs are built by addUrlArr()
   *    and each one is scraped by ruleCollection().
   *  - type 2: JSON API. The rule's stored `parameter` JSON is posted to the
   *    rule's `first_url` page by page via foreachCurl().
   * The rule's `status` column is set to 1 before the crawl starts and to 2
   * once it has finished. Any other website type does nothing and still
   * returns success.
   *
   * @param array $data `id` is the rule id (required); `admin_user_id` is
   *                    forwarded to the articles that get inserted.
   * @return array Payload built by Result::success() / Result::error().
   */
  public function sendCrawler(array $data): array
  {
      $ruleId = $data['id'] ?? null;
      if ($ruleId === null || $ruleId === '') {
          return Result::error('请输入正确的规则id!');
      }

      // Look up the rule together with the website it belongs to.
      $rule = Rule::where(['rule.id' => $ruleId])
          ->leftJoin('web', 'rule.web_id', 'web.id')
          ->select('rule.*', 'web.name as web_name', 'web.url as web_url', 'web.type as web_type')
          ->first();
      if ($rule === null) {
          // first() returns null for an unknown id; calling toArray() on it would be fatal.
          return Result::error('规则不存在!');
      }

      $info = $rule->toArray();
      $adminUserId = $data['admin_user_id'] ?? '';

      switch ($info['web_type']) {
          case 1:
              Rule::where(['id' => $ruleId])->update(['status' => 1]);

              // NOTE(review): the author is hard-coded for every article of this crawl type.
              $job = array_merge($data, [
                  'copyfrom' => $info['web_name'],
                  'author' => '刘德华',
                  'first_url' => $info['first_url'],
                  'second_start' => $info['second_start'],
                  'second_num' => $info['second_num'],
                  'second_end' => $info['second_end'],
                  'end_pagenum' => $info['end_pagenum'],
                  'rule_id' => $ruleId,
                  'admin_user_id' => $adminUserId,
              ]);

              foreach ($this->addUrlArr($job) as $listUrl) {
                  $this->ruleCollection($listUrl, $job);
              }

              Rule::where(['id' => $ruleId])->update(['status' => 2]);
              break;

          case 2:
              // Validate the stored request parameters before touching the status,
              // so a malformed rule is not left at status 1.
              $params = json_decode((string) ($info['parameter'] ?? ''), true);
              if (!is_array($params)) {
                  return Result::error('规则参数格式错误!');
              }

              Rule::where(['id' => $ruleId])->update(['status' => 1]);

              // Both codes are sent as one-element arrays; surrounding "[" / "]"
              // characters in the stored value are stripped first.
              foreach (['webSiteCode', 'channelCode'] as $key) {
                  $value = $params[$key] ?? '';
                  $params[$key] = is_array($value) ? $value : [trim((string) $value, '[]')];
              }

              $this->foreachCurl($info['first_url'], $params, [
                  'web_url' => $info['web_url'],
                  'copyfrom' => $info['web_name'],
                  'admin_user_id' => $adminUserId,
                  'rule_id' => $ruleId,
              ]);

              Rule::where(['id' => $ruleId])->update(['status' => 2]);
              break;
      }

      return Result::success([]);
  }
  /**
   * Build the list of list-page URLs to crawl.
   *
   * The result always starts with `first_url`, followed by
   * `second_start . $i . second_end` for $i = 1 .. end_pagenum - 2.
   * When `end_pagenum` is 2 or less (including 0, negative or non-numeric
   * values) only `first_url` is returned; the earlier loop never terminated
   * for values below 2.
   *
   * The earlier version also called Result::pageExists() for every generated
   * URL but only dumped the answer; that probe is dropped because its result
   * never influenced the list.
   *
   * NOTE(review): the upper bound (end_pagenum - 2) is kept as it was; whether
   * the last page should be included cannot be decided from this file.
   *
   * @param array $data Keys read: `first_url`, `second_start`, `second_end`, `end_pagenum`.
   * @return array List of URL strings.
   */
  public function addUrlArr($data)
  {
      $urls = [$data['first_url']];

      $lastIndex = (int) ($data['end_pagenum'] ?? 0) - 2;
      for ($i = 1; $i <= $lastIndex; $i++) {
          $urls[] = $data['second_start'] . $i . $data['second_end'];
      }

      return $urls;
  }
  /**
   * Scrape one list page and store every article it links to.
   *
   * The list page is parsed for `.list1 li` items (anchor text and href).
   * Each link is fetched in turn, its `.news-details` block is parsed for the
   * `h1` title and the `.TRS_UEDITOR` HTML body, and the result is handed to
   * insertArticleData().
   *
   * Relative links are resolved against the directory of the list page by
   * dropping the link's first character (so "./a/b.html" becomes
   * "<dir>/a/b.html"); absolute http(s) links are used unchanged. Items
   * without a link are skipped.
   *
   * NOTE(review): the CSS selectors are hard-coded rather than taken from the rule.
   *
   * @param string $url  List-page URL.
   * @param array  $data Crawl context; `copyfrom`, `author`, `admin_user_id`
   *                     and `rule_id` are forwarded with every article.
   * @return void
   */
  public function ruleCollection($url,$data)
  {
      $listItems = QueryList::get($url)
          ->rules([
              'title' => ['a', 'text'],
              'link' => ['a', 'href'],
          ])
          ->range('.list1 li')
          ->query()
          ->getData()
          ->toArray();
      if (!$listItems) {
          return;
      }

      // Directory part of the list-page URL: everything before the last "/" segment.
      $segments = explode('/', $url);
      array_pop($segments);
      $baseUrl = implode('/', $segments);

      $copyfrom = $data['copyfrom'] ?? '';

      foreach ($listItems as $item) {
          $link = trim((string) ($item['link'] ?? ''));
          if ($link === '') {
              continue;
          }

          $detailUrl = preg_match('#^https?://#i', $link) === 1
              ? $link
              : $baseUrl . substr($link, 1);

          $details = QueryList::get($detailUrl)
              ->rules([
                  'title' => ['h1', 'text'],
                  'content' => ['.TRS_UEDITOR', 'html'],
              ])
              ->range('.news-details')
              ->query()
              ->getData()
              ->toArray();

          foreach ($details as $detail) {
              $title = $detail['title'] ?? '';

              // A fresh payload per article, so values from one article cannot
              // leak into the next one.
              $this->insertArticleData(array_merge($data, [
                  'fromurl' => $detailUrl,
                  'newUrlStr' => $detailUrl,
                  'title' => $title,
                  'content' => $detail['content'] ?? '',
                  'introduce' => $title,
                  'keyword' => $title,
                  'copyfrom' => $copyfrom,
                  // The page carries no separate source, so the website name is used.
                  'source' => $copyfrom,
                  'admin_user_id' => $data['admin_user_id'] ?? '',
                  'rule_id' => $data['rule_id'] ?? '',
              ]));
          }
      }
  }
  /**
   * Store one crawled article (Article row plus its ArticleData content row).
   *
   * Articles are de-duplicated by title: when an Article with the same title
   * already exists, nothing is inserted. Both inserts run inside a single
   * transaction. Failures are logged and swallowed so that one bad article
   * does not abort the surrounding crawl.
   *
   * @param array $data Keys read: `title`, `newUrlStr`, `copyfrom`, `author`,
   *                    `source` (falls back to `copyfrom` when empty),
   *                    `admin_user_id`, `rule_id`, `content`.
   * @return void
   */
  public function insertArticleData($data=[])
  {
      if (!$data) {
          error_log('没有数据可以插入:');
          return;
      }

      $title = $data['title'] ?? '';
      $copyfrom = $data['copyfrom'] ?? '';
      $source = $data['source'] ?? '';

      Db::beginTransaction();
      try {
          $existing = Article::where(['title' => $title])->first();
          if (empty($existing)) {
              $articleId = Article::insertGetId([
                  'fromurl' => $data['newUrlStr'] ?? '',
                  'oldtitle' => $title,
                  'title' => $title,
                  'copyfrom' => $copyfrom,
                  'author' => $data['author'] ?? '',
                  'introduce' => $title,
                  'keyword' => $title,
                  'source' => $source !== '' ? $source : $copyfrom,
                  'admin_user_id' => $data['admin_user_id'] ?? '',
                  'rule_id' => $data['rule_id'] ?? '',
              ]);

              ArticleData::insertGetId([
                  'article_id' => $articleId,
                  'content' => $data['content'] ?? '',
              ]);
          }
          Db::commit();
      } catch (\Throwable $e) {
          // Catch \Throwable, not just \Exception: a TypeError or other \Error
          // would otherwise skip the rollback and leave the transaction open.
          Db::rollBack();
          error_log('插入失败:' . $e->getMessage());
      }
  }
  /**
   * Crawl a paginated JSON API and store every returned article.
   *
   * Posts `$parames` to `$wecUrl`, inserts each entry of `data.results` via
   * insertArticleData(), then increments `$parames['current']` and repeats
   * until a page comes back without results.
   *
   * Pages are fetched in a loop. The earlier recursive version requested every
   * page after the first twice (one request whose response was discarded, then
   * again in the recursive call) and grew the call stack by one frame per page.
   *
   * NOTE(review): CURLOPT_HEADER is enabled, so decoding the response as JSON
   * only works if Result::http_post() strips the header block — confirm.
   * NOTE(review): the author is hard-coded for every article of this crawl type.
   *
   * @param string $wecUrl  API endpoint.
   * @param array  $parames Request parameters; `current` is the page number to start from.
   * @param array  $other   Context: `web_url` (prefix for article URLs), `copyfrom`,
   *                        `admin_user_id`, `rule_id`.
   * @param int    $page    Kept for backward compatibility; neither read nor written.
   * @return void
   */
  public function foreachCurl($wecUrl,$parames,$other,&$page=1)
  {
      $options = [
          CURLOPT_HEADER => true, // include the response headers in the output
          CURLOPT_TIMEOUT => 30,  // request timeout in seconds
      ];

      while (true) {
          $reply = Result::http_post($wecUrl, $parames, $options);
          $decoded = json_decode((string) ($reply['response'] ?? ''), true);
          $rows = is_array($decoded) ? ($decoded['data']['results'] ?? null) : null;

          // An empty or malformed page ends the crawl.
          if (!is_array($rows) || $rows === []) {
              return;
          }

          foreach ($rows as $row) {
              $source = $row['source'] ?? [];
              // `urls` is itself a JSON string; its `common` entry is the article path.
              $urls = json_decode((string) ($source['urls'] ?? ''), true);
              $path = is_array($urls) ? ($urls['common'] ?? '') : '';

              $this->insertArticleData([
                  'newUrlStr' => ($other['web_url'] ?? '') . $path,
                  'title' => $source['title'] ?? '',
                  'source' => $source['contentSource'] ?? '',
                  'copyfrom' => $other['copyfrom'] ?? '',
                  'content' => $source['content']['content'] ?? '',
                  'admin_user_id' => $other['admin_user_id'] ?? '',
                  'rule_id' => $other['rule_id'] ?? '',
                  'author' => '冯蕊',
              ]);
          }

          // Move on to the next page.
          $parames['current'] = (int) ($parames['current'] ?? 0) + 1;
      }
  }
  322. }