$data['name'] ]; $isweb = Web::where($where)->first(); if(empty($isweb)){ date_default_timezone_set('Asia/Shanghai'); $time = time(); $catetime = date('Y-m-d H:i:s', $time); $data['created_at'] = $catetime; $web = Web::insert($data); }else{ return Result::error('此网站已存在,不可重复添加!'); } if(empty($web)){ return Result::error('添加失败'); } return Result::success('添加成功'); }

    /**
     * List all websites, or look one up by a name keyword.
     *
     * @param array $data Optional key `keyWord`: substring matched against `name` with LIKE.
     * @return array|mixed Whatever Result::success() / Result::error() produce.
     */
    public function getWeb(array $data): array
    {
        if (isset($data['keyWord'])) {
            // Fix: the match used to be stored in a differently named variable, so the
            // later empty($web) check ran on an undefined variable and every successful
            // search was reported as "no websites added yet".
            // NOTE(review): only the first match is returned (first()), as before —
            // confirm whether the caller expects a list here.
            $web = Web::where([
                ['name', 'like', '%' . $data['keyWord'] . '%'],
            ])->first();

            if (empty($web)) {
                return Result::error('未查找到相关网站!');
            }

            return Result::success($web);
        }

        $web = Web::get();

        // empty() on an object is always false, so inspect the rows instead
        // (same idiom addRule() uses for its existence checks).
        if (empty($web->toArray())) {
            return Result::error('您还未添加网站,请先去添加!');
        }

        return Result::success($web);
    }

    /**
     * Update a website row.
     *
     * @param array $data Must contain `id`; the whole array is passed to update().
     * @return array|mixed Success carries the value returned by update().
     */
    public function upWeb(array $data): array
    {
        $web = Web::where('id', $data['id'])->first();
        if (empty($web)) {
            return Result::error('请输入正确的网站id!');
        }

        $id = Web::where('id', $data['id'])->update($data);
        if (empty($id)) {
            return Result::error('无法修改!');
        }

        return Result::success($id);
    }

    /**
     * Delete a website row.
     *
     * @param array $data Must contain `id`.
     * @return array|mixed Success carries the value returned by delete().
     */
    public function delWeb(array $data): array
    {
        $web = Web::where('id', $data['id'])->first();
        if (empty($web)) {
            return Result::error('请输入正确的网站id!');
        }

        $id = Web::where('id', $data['id'])->delete();
        if (empty($id)) {
            return Result::error('无法删除!');
        }

        return Result::success($id);
    }

    /**
     * Add a crawl rule (task) for a website.
     *
     * Only the fields relevant to the given `type` are persisted:
     * type 1 stores the paginated-list fields, type 2 stores `parameter`,
     * any other type stores `diy_rule`.
     *
     * @param array $data Requires `web_id`, `name`, `type` and the fields of the chosen type;
     *                    `source`, `writer_class`, `writer` are optional (ignored for type 3).
     * @return array|mixed Success carries the id returned by insertGetId().
     */
    public function addRule(array $data): array
    {
        $web = Web::where('id', $data['web_id'])->get();
        if (empty($web->toArray())) {
            return Result::error('请输入正确的网站id!');
        }

        // Rule names must be unique.
        $sameName = Rule::where('name', $data['name'])->get();
        if (!empty($sameName->toArray())) {
            return Result::error('此任务已存在!');
        }

        // Parameters for several types may arrive together; keep only those of the requested type.
        switch ($data['type']) {
            case 1:
                $rule = [
                    'name' => $data['name'],
                    'web_id' => $data['web_id'],
                    'first_url' => $data['first_url'],
                    'second_start' => $data['second_start'],
                    'second_num' => $data['second_num'],
                    'second_end' => $data['second_end'],
                    'end_pagenum' => $data['end_pagenum'],
                    'start' => $data['start'],
                    'title' => $data['title'],
                    'content' => $data['content'],
                ];
                break;
            case 2:
                $rule = [
                    'name' => $data['name'],
                    'web_id' => $data['web_id'],
                    'first_url' => $data['first_url'],
                    'parameter' => $data['parameter'],
                    'start' => $data['start'],
                    'title' => $data['title'],
                    'content' => $data['content'],
                ];
                break;
            default:
                $rule = [
                    'name' => $data['name'],
                    'web_id' => $data['web_id'],
                    'diy_rule' => $data['diy_rule'],
                ];
                break;
        }

        if ($data['type'] != 3) {
            if (!empty($data['source'])) {
                $rule['source'] = $data['source'];
            }
            if (isset($data['writer_class'])) {
                $rule['writer_class'] = $data['writer_class'];
            }
            if (isset($data['writer'])) {
                $rule['writer'] = $data['writer'];
            }
        }

        date_default_timezone_set('Asia/Shanghai');
        $result = Rule::insertGetId($rule);

        return Result::success($result);
    }

    /**
     * List crawl rules page by page, optionally filtered by website and/or name keyword.
     *
     * @param array $data Requires `page` and `pageSize`; optional `web_id` and `keyWord`.
     * @return array|mixed Success carries ['rep' => rows (with arts count), 'count' => total matches].
     */
    public function getRule(array $data): array
    {
        $where = [];

        if (isset($data['web_id'])) {
            $web = Web::where('id', $data['web_id'])->get();
            if (empty($web->toArray())) {
                return Result::error('请输入正确的网站id!');
            }
            // Restrict to the rules of the website the user navigated from.
            $where[] = ['web_id', '=', $data['web_id']];
        }

        if (isset($data['keyWord'])) {
            // Fix: append instead of overwrite, so a keyword search inside one website
            // keeps the web_id restriction.
            $where[] = ['name', 'like', '%' . $data['keyWord'] . '%'];
        }

        // Count first: when nothing matches there is no need to fetch a page.
        $count = Rule::where($where)->count();
        if ($count == 0) {
            return Result::error('暂无相关规则任务!');
        }

        $rep = Rule::withCount(relations: 'arts')
            ->where($where)
            ->limit($data['pageSize'])
            ->orderBy("created_at", "desc")
            ->offset(($data['page'] - 1) * $data['pageSize'])
            ->get();

        return Result::success([
            'rep' => $rep->toArray(),
            'count' => $count,
        ]);
    }

    /**
     * Fetch a single crawl rule by id.
     *
     * @param array $data Must contain `id`.
     * @return array|mixed
     */
    public function getOneRule(array $data): array
    {
        $result = Rule::where('id', $data['id'])->first();
        if (empty($result)) {
            return Result::error('请输入正确的规则任务id!');
        }

        return Result::success($result);
    }

    /**
     * Update a crawl rule. The `type` key is never written; names stay unique.
     *
     * @param array $data Must contain `id`; remaining keys (except `type`) are passed to update().
     * @return array|mixed Success carries the value returned by update().
     */
    public function upRule(array $data): array
    {
        $rule = Rule::where('id', $data['id'])->select('id')->first();
        unset($data['type']);

        if (empty($rule)) {
            return Result::error('请输入正确的规则任务id!');
        }

        // Only check for a name clash when a name is actually being submitted;
        // reading $data['name'] unconditionally raised an undefined-index warning.
        if (isset($data['name'])) {
            $clash = Rule::where('id', '!=', $rule['id'])
                ->where('name', $data['name'])
                ->select('name')
                ->first();
            if (!empty($clash)) {
                return Result::error('已存在此任务规则名称!');
            }
        }

        $result = Rule::where('id', $data['id'])->update($data);

        return Result::success($result);
    }

    /**
     * Hand a crawl request to the message producer.
     *
     * Wraps $data in a GatherProducer message and produces it through the
     * container's Producer. Always reports success, whatever produce() returns.
     *
     * @param array $data Payload forwarded unchanged to GatherProducer.
     * @return array
     */
    public function sendCrawler(array $data): array
    {
        var_dump("接收到的数据:", $data);

        $message = new GatherProducer($data);
        $producer = ContextApplicationContext::getContainer()->get(Producer::class);
        $produced = $producer->produce($message);

        var_dump("生产者:", $produced);

        return Result::success([]);
    }

    /**
     * Run the crawl for one rule, dispatching on the website type.
     *
     * Website type 1 crawls list pages with the rule's selectors; type 2 pages
     * through a JSON interface. The rule's status is set to 1 before and 2 after the crawl.
     *
     * @param array $data Requires `id` (rule id) and `admin_user_id`.
     * @return array
     */
    public function goCrawler(array $data): array
    {
        // Load the rule together with the website it belongs to.
        $info = Rule::where(['rule.id' => $data['id']])
            ->leftJoin('web', 'rule.web_id', 'web.id')
            ->select("rule.*", "web.name as web_name", "web.url as web_url", "web.type as web_type")
            ->first();

        // Fix: first() yields nothing for an unknown id; calling toArray() on it was a fatal error.
        if (empty($info)) {
            return Result::error('请输入正确的规则任务id!');
        }
        $info = $info->toArray();

        switch ($info['web_type']) {
            case 1:
                var_dump("===========规则采集======", $info);
                Rule::where(['id' => $data['id']])->update(['status' => 1]);

                // Extra values the article insert needs.
                $info['admin_user_id'] = $data['admin_user_id'];
                $info['rule_id'] = $data['id'];
                $info['copyfrom'] = $info['web_name'];
                $info['author'] = $info['writer'];

                foreach ($this->addUrlArr($info) as $listUrl) {
                    $this->ruleCollection($listUrl, $info);
                }

                Rule::where(['id' => $data['id']])->update(['status' => 2]);
                break;
            case 2:
                Rule::where(['id' => $data['id']])->update(['status' => 1]);

                $wecUrl = $info['first_url'];
                $parames = json_decode($info['parameter'], true);
                // Strip surrounding square brackets from the stored value and wrap it in an array.
                $parames['webSiteCode'] = [trim($parames['webSiteCode'], "[]")];
                $parames['channelCode'] = [trim($parames['channelCode'], "[]")];

                $other = [
                    'web_url' => $info['web_url'],
                    'copyfrom' => $info['web_name'],
                    'admin_user_id' => $data['admin_user_id'],
                    'rule_id' => $data['id'],
                    'writer' => $info['writer'],
                ];

                var_dump("=======开始接口采集====", $parames);
                $this->foreachCurl($wecUrl, $parames, $other);

                Rule::where(['id' => $data['id']])->update(['status' => 2]);
                break;
        }

        return Result::success([]);
    }

    /**
     * Build the list of list-page URLs to crawl for a rule.
     *
     * The list starts with `first_url`, followed by
     * `second_start . $i . second_end` for $i = 1 .. end_pagenum - 2.
     *
     * @param array $data Rule row; reads `first_url`, `second_start`, `second_end`, `end_pagenum`.
     * @return array List of URLs.
     */
    public function addUrlArr($data)
    {
        $arrList = [$data['first_url']];

        // Fix: the previous while-loop stopped only when its counter reached
        // end_pagenum - 1, which never happens when end_pagenum is 0 or missing
        // (target -1), so it looped forever. A bounded loop yields the same
        // URLs for every valid value and terminates for the others.
        // NOTE(review): the old loop also called Result::pageExists($url) on each
        // iteration and discarded the result; that call is dropped here — confirm
        // it has no side effect anyone relies on.
        $limit = intval($data['end_pagenum']) - 1;
        for ($i = 1; $i < $limit; $i++) {
            $arrList[] = $data['second_start'] . $i . $data['second_end'];
        }

        return $arrList;
    }

    /**
     * Crawl one list page and store every article it links to.
     *
     * @param string $url  List-page URL.
     * @param array  $info Rule row plus crawl context; reads `start`, `web_url`, `con_url`,
     *                     `con_start`, `title`, `content`, `copyfrom`, `source`,
     *                     `admin_user_id`, `rule_id`, `author`.
     * @return void
     */
    public function ruleCollection($url, $info)
    {
        var_dump("采集参数:", $url, $info['start']);

        $dataList = QueryList::get($url)
            ->rules([
                'title' => ['a:eq(0)', 'text'],
                'link' => ['a:eq(0)', 'href'],
            ])
            ->range($info['start'])
            ->query()
            ->getData();
        var_dump("采集的内容:", $dataList);

        // Selectors for the detail page; they are the same for every link.
        $rules = [];
        if (!empty($info['title'])) {
            $rules['title'] = [$info['title'], 'text'];
        }
        if (!empty($info['content'])) {
            $rules['content'] = [$info['content'], 'html'];
        }
        // Range selector of the detail page.
        $detailRange = $info['con_start'] ?? '';

        foreach ($dataList->toArray() as $item) {
            $link = $item['link'] ?? '';
            if ($link === '') {
                // No href was extracted for this entry; nothing to follow.
                continue;
            }

            // Links that already contain the site's own URL are used as-is. Other links
            // that contain "http" are treated as third-party redirects and skipped; the
            // rest are treated as relative and prefixed with con_url.
            $newUrlStr = $link;
            if (!str_contains($link, $info['web_url'])) {
                // "https" always contains "http", so one check covers both schemes.
                if (str_contains($link, 'http')) {
                    continue;
                }
                $newUrlStr = ($info['con_url'] ?? '') . $link;
            }

            var_dump("打印规则:", $rules, "详情起始:", $detailRange);
            $detailData = QueryList::get($newUrlStr)
                ->rules($rules)
                ->range($detailRange)
                ->query()
                ->getData()
                ->toArray();
            var_dump("内容详情:", $detailData, $newUrlStr);

            foreach ($detailData as $val) {
                // A selector may be missing from $rules, so default every extracted field.
                $title = $val['title'] ?? '';

                $this->insertArticleData([
                    'fromurl' => $newUrlStr,
                    'title' => $title,
                    'content' => $val['content'] ?? '',
                    'newUrlStr' => $newUrlStr,
                    'introduce' => $title,
                    'keyword' => $title,
                    'copyfrom' => $info['copyfrom'],
                    'source' => $info['source'] ?? $info['copyfrom'],
                    'admin_user_id' => $info['admin_user_id'] ?? '',
                    'rule_id' => $info['rule_id'] ?? '',
                    'author' => $info['author'] ?? '',
                ]);
            }
        }
    }

    /**
     * Store one crawled article (Article row + ArticleData row) unless an
     * article with the same title already exists. Failures are logged, not thrown.
     *
     * @param array $data Reads `title`, `newUrlStr`, `copyfrom`, `author`, `source`,
     *                    `admin_user_id`, `rule_id`, `content`.
     * @return void
     */
    public function insertArticleData($data = [])
    {
        if (!$data) {
            var_dump("没有数据可以插入:");
            return;
        }

        Db::beginTransaction();
        try {
            $articleInfo = Article::where(['title' => $data['title']])->first();

            if (empty($articleInfo)) {
                $hasSource = isset($data['source']) && $data['source'] != '';

                $article_id = Article::insertGetId([
                    'fromurl' => $data['newUrlStr'],
                    'oldtitle' => $data['title'],
                    'title' => $data['title'],
                    'copyfrom' => $data['copyfrom'],
                    'author' => $data['author'],
                    'introduce' => $data['title'],
                    'keyword' => $data['title'],
                    'source' => $hasSource ? $data['source'] : $data['copyfrom'],
                    'admin_user_id' => $data['admin_user_id'],
                    'rule_id' => $data['rule_id'],
                ]);

                ArticleData::insertGetId([
                    'article_id' => $article_id,
                    'content' => $data['content'],
                ]);
            }

            Db::commit();
        } catch (\Throwable $e) {
            // Fix: catch \Throwable rather than \Exception so that an \Error
            // (e.g. a TypeError) also rolls back instead of leaving the transaction open.
            Db::rollBack();
            var_dump("插入失败:", $e->getMessage());
        }
    }

    /**
     * Page through a JSON list interface and store every returned article.
     *
     * Posts $parames to $wecUrl, inserts each entry of data.results, then
     * increments `current` and repeats until a page comes back without results.
     *
     * @param string $wecUrl  Interface URL.
     * @param array  $parames POST parameters; `current` is the page number.
     * @param array  $other   Context: `web_url`, `copyfrom`, `admin_user_id`, `rule_id`, `writer`.
     * @param int    $page    Unused; kept (by reference) for backward compatibility.
     * @return void
     */
    public function foreachCurl($wecUrl, $parames, $other, &$page = 1)
    {
        $options = [
            CURLOPT_HEADER => true, // include the response headers in the response
            CURLOPT_TIMEOUT => 30,  // request timeout of 30 seconds
        ];

        // Fix: the previous version fetched the next page once into an unused
        // variable and then again inside the recursive call, so every page was
        // requested twice. A plain loop requests each page once and avoids one
        // stack frame per page.
        while (true) {
            $response = Result::http_post($wecUrl, $parames, $options);
            $result = json_decode((string) ($response['response'] ?? ''), true);

            // ?? also covers an undecodable body (json_decode() returning null).
            $dataList = $result['data']['results'] ?? [];
            if (!is_array($dataList) || $dataList === []) {
                return;
            }

            foreach ($dataList as $val) {
                $urls = json_decode((string) ($val['source']['urls'] ?? ''), true);

                $this->insertArticleData([
                    'newUrlStr' => $other['web_url'] . ($urls['common'] ?? ''),
                    'title' => $val['source']['title'] ?? '',
                    'source' => $val['source']['contentSource'] ?? '',
                    'copyfrom' => $other['copyfrom'] ?? '',
                    'content' => $val['source']['content']['content'] ?? '',
                    'admin_user_id' => $other['admin_user_id'] ?? '',
                    'rule_id' => $other['rule_id'] ?? '',
                    'author' => $other['writer'] ?? '',
                ]);
            }

            $parames['current'] = intval($parames['current'] ?? 0) + 1;
        }
    }
}