123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553 |
- <?php
- namespace App\JsonRpc;
- use App\Amqp\Producer\GatherProducer;
- use App\Model\ArticleData;
- use App\Model\OldModel\Article as OldArticle;
- use App\Model\Article;
- use App\Model\Rule;
- use App\Model\Web;
- use Hyperf\Amqp\Producer;
- use Hyperf\Context\ApplicationContext as ContextApplicationContext;
- use Hyperf\DbConnection\Db;
- use Hyperf\Di\Annotation\Inject;
- use Hyperf\RpcServer\Annotation\RpcService;
- use App\Tools\Result;
- use QL\QueryList;
- use Swoole\Coroutine;
- //use App\Service\GatherQueueService;
- #[RpcService(name: "CollectorService", protocol: "jsonrpc-http", server: "jsonrpc-http")]
- class CollectorService implements CollectorServiceInterface
- {
- // #[Inject]
- // protected GatherQueueService $Gservice;
/**
 * Register a new website.
 *
 * A row with the same `name` blocks the insert. Otherwise the payload is
 * stamped with a `created_at` value and inserted as received.
 *
 * @param array $data Website fields; `name` is read for the duplicate check
 *                    and the whole array is handed to the insert.
 * @return array Result::error() when the name is taken or the insert fails,
 *               Result::success() otherwise.
 */
public function addWeb(array $data): array
{
    $existing = Web::where(['name' => $data['name']])->first();
    if (!empty($existing)) {
        return Result::error('此网站已存在,不可重复添加!');
    }

    // The default timezone is switched before the creation time is formatted.
    date_default_timezone_set('Asia/Shanghai');
    $data['created_at'] = date('Y-m-d H:i:s', time());

    $inserted = Web::insert($data);
    if (empty($inserted)) {
        return Result::error('添加失败');
    }

    return Result::success('添加成功');
}
/**
 * List websites, optionally filtered by a name keyword.
 *
 * @param array $data Optional `keyWord`: substring matched against `name`
 *                    with a LIKE filter.
 * @return array Result::success() carrying the matching websites, or
 *               Result::error() when there is nothing to return.
 */
public function getWeb(array $data): array
{
    $searching = isset($data['keyWord']);

    // Both branches feed the same variable. Previously the search result was
    // stored under a different name, so a successful search still fell into
    // the "no websites" error below.
    $webs = $searching
        ? Web::where([['name', 'like', '%' . $data['keyWord'] . '%']])->get()
        : Web::get();

    // get() yields a collection object, which empty() never reports as empty,
    // so the collection itself is asked.
    if ($webs->isEmpty()) {
        return Result::error($searching ? '未查找到相关网站!' : '您还未添加网站,请先去添加!');
    }

    return Result::success($webs);
}
/**
 * Update a website row.
 *
 * @param array $data Must contain `id`; the whole array is passed to update().
 * @return array Result::error() when the id matches no row or update()
 *               reports nothing changed, otherwise Result::success() with the
 *               value update() returned.
 */
public function upWeb(array $data): array
{
    $target = Web::where('id', $data['id'])->first();
    if (empty($target)) {
        return Result::error('请输入正确的网站id!');
    }

    $affected = Web::where('id', $data['id'])->update($data);
    if (empty($affected)) {
        return Result::error('无法修改!');
    }

    return Result::success($affected);
}
/**
 * Delete a website row.
 *
 * @param array $data Must contain `id`.
 * @return array Result::error() when the id matches no row or delete()
 *               reports nothing removed, otherwise Result::success() with the
 *               value delete() returned.
 */
public function delWeb(array $data): array
{
    $target = Web::where('id', $data['id'])->first();
    if (empty($target)) {
        return Result::error('请输入正确的网站id!');
    }

    $removed = Web::where('id', $data['id'])->delete();
    if (empty($removed)) {
        return Result::error('无法删除!');
    }

    return Result::success($removed);
}
/**
 * Create a crawl rule for an existing website.
 *
 * The incoming payload may carry fields for several rule types at once; only
 * the fields belonging to the requested `type` are copied into the new row.
 *
 * @param array $data Rule fields. `web_id`, `name` and `type` are always read;
 *                    the remaining keys depend on `type` (1, 2, or anything
 *                    else for a `diy_rule` row).
 * @return array Result::error() when the website is unknown or the rule name
 *               is taken, otherwise Result::success() with the new row id.
 */
public function addRule(array $data): array
{
    $site = Web::where('id', $data['web_id'])->get();
    if (empty($site->toArray())) {
        return Result::error('请输入正确的网站id!');
    }

    // Rule names must be unique.
    $sameName = Rule::where('name', $data['name'])->get();
    if (!empty($sameName->toArray())) {
        return Result::error('此任务已存在!');
    }

    // Loose comparison is kept on purpose so that "1" and 1 select the same arm.
    $type = $data['type'];
    $rule = match (true) {
        $type == 1 => [
            'name' => $data['name'],
            'web_id' => $data['web_id'],
            'first_url' => $data['first_url'],
            'second_start' => $data['second_start'],
            'second_num' => $data['second_num'],
            'second_end' => $data['second_end'],
            'end_pagenum' => $data['end_pagenum'],
            'start' => $data['start'],
            'title' => $data['title'],
            'content' => $data['content'],
        ],
        $type == 2 => [
            'name' => $data['name'],
            'web_id' => $data['web_id'],
            'first_url' => $data['first_url'],
            'parameter' => $data['parameter'],
            'start' => $data['start'],
            'title' => $data['title'],
            'content' => $data['content'],
        ],
        default => [
            'name' => $data['name'],
            'web_id' => $data['web_id'],
            'diy_rule' => $data['diy_rule'],
        ],
    };

    // Optional attribution fields are copied for every type except 3.
    if (!empty($data['source']) && $data['type'] != 3) {
        $rule['source'] = $data['source'];
    }
    if (isset($data['writer_class']) && $data['type'] != 3) {
        $rule['writer_class'] = $data['writer_class'];
    }
    if (isset($data['writer']) && $data['type'] != 3) {
        $rule['writer'] = $data['writer'];
    }

    date_default_timezone_set('Asia/Shanghai');
    $newId = Rule::insertGetId($rule);

    return Result::success($newId);
}
/**
 * Page through crawl rules, optionally filtered by website and/or name keyword.
 *
 * @param array $data `page` and `pageSize` drive the pagination. Optional
 *                    `web_id` restricts the list to one website; optional
 *                    `keyWord` is matched against the rule name with LIKE.
 * @return array Result::error() for an unknown `web_id` or when no rule
 *               matches; otherwise Result::success() with `rep` (the page of
 *               rules, each with its `arts` relation count) and `count` (the
 *               total number of matching rules).
 */
public function getRule(array $data): array
{
    $where = [];

    if (isset($data['web_id'])) {
        $web = Web::where('id', $data['web_id'])->get();
        if (empty($web->toArray())) {
            return Result::error('请输入正确的网站id!');
        }
        $where[] = ['web_id', '=', $data['web_id']];
    }

    if (isset($data['keyWord'])) {
        // Appended rather than assigned: assigning here used to discard the
        // web_id condition whenever both filters were supplied.
        $where[] = ['name', 'like', '%' . $data['keyWord'] . '%'];
    }

    // Count first so an empty result does not cost a second query.
    $count = Rule::where($where)->count();
    if ($count == 0) {
        return Result::error('暂无相关规则任务!');
    }

    $rep = Rule::withCount(relations:'arts')
        ->where($where)
        ->limit($data['pageSize'])
        ->orderBy("created_at", "desc")
        ->offset(($data['page'] - 1) * $data['pageSize'])
        ->get();

    return Result::success([
        'rep' => $rep->toArray(),
        'count' => $count,
    ]);
}
-
/**
 * Fetch a single crawl rule by id.
 *
 * @param array $data Must contain `id`.
 * @return array Result::error() when no rule has that id, otherwise
 *               Result::success() with the rule.
 */
public function getOneRule(array $data): array
{
    $rule = Rule::where('id', $data['id'])->first();

    return empty($rule)
        ? Result::error('请输入正确的规则任务id!')
        : Result::success($rule);
}
/**
 * Update a crawl rule.
 *
 * `type` is stripped from the payload before it is written, and the new name
 * must not collide with any other rule.
 *
 * @param array $data Must contain `id` and `name`; every remaining key except
 *                    `type` is passed to update().
 * @return array Result::error() for an unknown id or a name already used by
 *               another rule, otherwise Result::success() with the value
 *               update() returned.
 */
public function upRule(array $data): array
{
    $current = Rule::where('id', $data['id'])->select('id')->first();
    unset($data['type']);

    if (empty($current)) {
        return Result::error('请输入正确的规则任务id!');
    }

    $clash = Rule::where('id', '!=', $current['id'])
        ->where('name', $data['name'])
        ->select('name')
        ->first();
    if (!empty($clash)) {
        return Result::error('已存在此任务规则名称!');
    }

    $affected = Rule::where('id', $data['id'])->update($data);

    return Result::success($affected);
}
/**
 * Queue a crawl request by publishing it as a GatherProducer message.
 *
 * @param array $data Payload handed unchanged to GatherProducer.
 * @return array Result::success([]) when the producer reports the message as
 *               published, Result::error() otherwise.
 */
public function sendCrawler(array $data): array
{
    var_dump("接收到的数据:", $data);

    $message = new GatherProducer($data);
    $producer = ContextApplicationContext::getContainer()->get(Producer::class);
    $published = $producer->produce($message);
    var_dump("生产者:", $published);

    // The producer's result used to be ignored, so a message that was never
    // published was still reported to the caller as queued.
    if (!$published) {
        return Result::error('采集任务发送失败!');
    }

    return Result::success([]);
}
/**
 * Run the crawl described by one rule.
 *
 * The rule is loaded together with its website; the website's `type` selects
 * the strategy: 1 crawls list/detail pages through ruleCollection(), 2 pages
 * through a JSON endpoint through foreachCurl(). Any other type does nothing.
 * The rule's `status` column is set to 1 before the crawl and to 2 afterwards
 * (presumably "running" / "finished" — confirm against the schema).
 *
 * @param array $data Must contain `id` (rule id) and `admin_user_id`.
 * @return array Result::error() when the rule does not exist or its stored
 *               `parameter` JSON cannot be decoded; otherwise
 *               Result::success([]).
 */
public function goCrawler(array $data): array
{
    $info = Rule::where(['rule.id' => $data['id']])
        ->leftJoin('web', 'rule.web_id', 'web.id')
        ->select("rule.*", "web.name as web_name", "web.url as web_url", "web.type as web_type")
        ->first();
    // first() yields null for an unknown id; calling toArray() on it was fatal.
    if (empty($info)) {
        return Result::error('请输入正确的规则任务id!');
    }
    $info = $info->toArray();
    var_dump("规则信息:", $info);

    switch ($info['web_type']) {
        case 1:
            var_dump("===========规则采集======", $info);
            try {
                Rule::where(['id' => $data['id']])->update(['status' => 1]);
                // Extra fields the collectors expect alongside the rule columns.
                $info['admin_user_id'] = $data['admin_user_id'];
                $info['rule_id'] = $data['id'];
                $info['copyfrom'] = $info['web_name'];
                $info['author'] = $info['writer'];
                $urlList = $this->addUrlArr($info);
                if ($urlList) {
                    foreach ($urlList as $val) {
                        $this->ruleCollection($val, $info);
                    }
                }
            } catch (\Exception $e) {
                var_dump("采集失败报错:", $e->getMessage());
            } finally {
                // Runs on success, on the handled exception above, and on any
                // other throwable, so the rule can never stay at status 1.
                Rule::where(['id' => $data['id']])->update(['status' => 2]);
            }
            break;
        case 2:
            $wecUrl = $info['first_url'];
            $parames = json_decode((string) $info['parameter'], true);
            if (!is_array($parames)) {
                // Nothing sensible can be posted without decodable parameters.
                return Result::error('接口采集参数格式错误!');
            }
            // The stored codes may be wrapped in literal brackets; the
            // brackets are stripped and the bare code is sent as a
            // one-element list.
            $parames['webSiteCode'] = [trim($parames['webSiteCode'] ?? '', "[]")];
            $parames['channelCode'] = [trim($parames['channelCode'] ?? '', "[]")];
            $other = [
                'web_url' => $info['web_url'],
                'copyfrom' => $info['web_name'],
                'admin_user_id' => $data['admin_user_id'],
                'rule_id' => $data['id'],
                'writer' => $info['writer'],
            ];
            Rule::where(['id' => $data['id']])->update(['status' => 1]);
            var_dump("=======开始接口采集====", $parames);
            try {
                $this->foreachCurl($wecUrl, $parames, $other);
            } finally {
                // A failure still propagates to the caller, but the status is
                // reset first instead of being left at 1.
                Rule::where(['id' => $data['id']])->update(['status' => 2]);
            }
            break;
    }

    return Result::success([]);
}
/**
 * Build the list of list-page URLs a rule should crawl.
 *
 * The result starts with `first_url`, followed by
 * `second_start . i . second_end` for i = 1 .. end_pagenum - 2.
 *
 * @param array $data Rule data; reads `first_url`, `second_start`,
 *                    `second_end` and `end_pagenum`.
 * @return array List of URLs; always contains at least `first_url`.
 */
public function addUrlArr($data)
{
    $arrList = [$data['first_url']];
    $lastPage = intval($data['end_pagenum']) - 1;

    for ($i = 1; ; $i++) {
        $url = $data['second_start'] . $i . $data['second_end'];
        // NOTE(review): the probe's result has never been used; the call is
        // kept in case it has effects this method cannot see. Confirm whether
        // it was meant to end the loop at the first missing page.
        Result::pageExists($url);

        // ">=" instead of "==": with end_pagenum below 1 the old equality
        // test could never match and the loop grew the list forever.
        if ($i >= $lastPage) {
            break;
        }
        $arrList[] = $url;
    }

    return $arrList;
}
/**
 * Crawl one list page and store every article it links to.
 *
 * Each item inside the `start` range of the list page yields the text and
 * href of its first anchor. Links on the site's own domain are followed
 * as-is, relative links are prefixed with `con_url`, and absolute links to
 * other hosts are skipped. Every detail page is parsed with the rule's
 * `title` / `content` selectors and handed to insertArticleData().
 *
 * @param string $url  List page to crawl.
 * @param array  $info Rule data; reads `start`, `web_url`, `con_url`,
 *                     `con_start`, `title`, `content`, `copyfrom`, `source`,
 *                     `admin_user_id`, `rule_id` and `author`.
 * @return void
 */
public function ruleCollection($url, $info)
{
    $list = QueryList::get($url);
    $dataList = $list->rules([
        'title' => ['a:eq(0)', 'text'],
        'link' => ['a:eq(0)', 'href'],
    ])->range($info['start'])->query()->getData();
    var_dump("采集的列表:", $dataList);

    $dataList = $dataList->toArray();
    if (!$dataList) {
        return;
    }

    // The detail-page selectors are the same for every item, so they are
    // built once instead of once per link.
    $rules = [];
    if (!empty($info['title'])) {
        $rules['title'] = [$info['title'], 'text'];
    }
    if (!empty($info['content'])) {
        $rules['content'] = [$info['content'], 'html'];
    }
    $detailRange = $info['con_start'] ?? '';

    foreach ($dataList as $item) {
        $link = $item['link'] ?? '';
        if ($link === '') {
            // No href: following it would only re-fetch the bare prefix URL.
            continue;
        }

        $newUrlStr = $link;
        if (!str_contains($link, $info['web_url'])) {
            // An absolute URL that is not on this site is a third-party jump.
            if (str_contains($link, 'http')) {
                continue;
            }
            $newUrlStr = $info['con_url'] . $link;
        }
        var_dump("详情地址:", $newUrlStr);

        $detailContent = QueryList::get($newUrlStr);
        var_dump("打印规则:", $rules, "详情起始:", $info['con_start'] ?? null);
        $detailData = $detailContent->rules($rules)->range($detailRange)->query()->getData();
        $detailData = $detailData->toArray();
        var_dump("内容详情:", $detailData, $newUrlStr);
        if (!$detailData) {
            continue;
        }

        foreach ($detailData as $val) {
            // title/content are absent when the rule defines no selector for
            // them; they are read null-safely instead of raising a warning.
            $this->insertArticleData([
                'fromurl' => $newUrlStr,
                'title' => $val['title'] ?? null,
                'content' => $val['content'] ?? null,
                'newUrlStr' => $newUrlStr,
                'introduce' => $val['title'] ?? '',
                'keyword' => $val['title'] ?? '',
                'copyfrom' => $info['copyfrom'],
                'source' => $info['source'] ?? $info['copyfrom'],
                'admin_user_id' => $info['admin_user_id'] ?? '',
                'rule_id' => $info['rule_id'] ?? '',
                'author' => $info['author'] ?? '',
            ]);
        }
    }
}
/**
 * Store one crawled article unless an article with the same title exists.
 *
 * The Article row and its ArticleData content row are written inside one
 * transaction. Failures are rolled back and dumped; they are not rethrown.
 *
 * @param array $data Reads `title`, `newUrlStr`, `copyfrom`, `author`,
 *                    `source`, `admin_user_id`, `rule_id` and `content`.
 * @return void
 */
public function insertArticleData($data=[])
{
    if (!$data) {
        var_dump("没有数据可以插入:");
        return;
    }

    Db::beginTransaction();
    try {
        // The title is the de-duplication key.
        $articleInfo = Article::where(['title' => $data['title']])->first();
        if (empty($articleInfo)) {
            $article_id = Article::insertGetId([
                'fromurl' => $data['newUrlStr'],
                'oldtitle' => $data['title'],
                'title' => $data['title'],
                'copyfrom' => $data['copyfrom'],
                'author' => $data['author'],
                'introduce' => $data['title'],
                'keyword' => $data['title'],
                // Falls back to the site name when no explicit source is given.
                'source' => isset($data['source']) && $data['source'] != '' ? $data['source'] : $data['copyfrom'],
                'admin_user_id' => $data['admin_user_id'],
                'rule_id' => $data['rule_id'],
            ]);
            ArticleData::insertGetId([
                'article_id' => $article_id,
                'content' => $data['content'],
            ]);
        }
        Db::commit();
    } catch (\Throwable $e) {
        // \Throwable rather than \Exception: an \Error raised mid-way used to
        // skip the rollback and leave the transaction open.
        Db::rollBack();
        var_dump("插入失败:", $e->getMessage());
    }
}
/**
 * Page through a JSON list endpoint and store every returned article.
 *
 * Starting from `$parames['current']`, each page is POSTed to `$wecUrl`; the
 * entries under `data.results` are handed to insertArticleData(), then
 * `current` is advanced by one. Paging stops at the first page that has no
 * results.
 *
 * @param string $wecUrl  Endpoint to POST to.
 * @param array  $parames Request parameters; `current` is the page number.
 * @param array  $other   Context for the stored rows; reads `web_url`,
 *                        `copyfrom`, `admin_user_id`, `rule_id` and `writer`.
 * @param int    $page    Kept so existing callers keep working; it is neither
 *                        read nor written.
 * @return void
 */
public function foreachCurl($wecUrl,$parames,$other,&$page=1)
{
    $options = [
        CURLOPT_HEADER => true,
        CURLOPT_TIMEOUT => 30,
    ];

    // A loop instead of one recursive call per page: the depth no longer
    // grows with the number of pages, and each page is requested exactly once
    // (the old look-ahead request fetched every page a second time and threw
    // the response away).
    while (true) {
        $response = Result::http_post($wecUrl, $parames, $options);
        $result = json_decode($response['response'], true);

        $dataList = $result['data']['results'] ?? [];
        if (!is_array($dataList) || $dataList === []) {
            break;
        }

        foreach ($dataList as $val) {
            // `urls` is itself a JSON string; its `common` entry is the path
            // appended to the site URL.
            $urls = json_decode($val['source']['urls'], true);
            $newUrlStr = $other['web_url'] . ($urls['common'] ?? '');

            $this->insertArticleData([
                'newUrlStr' => $newUrlStr,
                'title' => $val['source']['title'] ?? '',
                'source' => $val['source']['contentSource'] ?? '',
                'copyfrom' => $other['copyfrom'] ?? '',
                'content' => $val['source']['content']['content'] ?? '',
                'admin_user_id' => $other['admin_user_id'] ?? '',
                'rule_id' => $other['rule_id'] ?? '',
                'author' => $other['writer'] ?? '',
            ]);
        }

        $parames['current'] = intval($parames['current']) + 1;
    }
}
- }
|