Przeglądaj źródła

获取并搜索任务规则

15313670163 4 miesięcy temu
rodzic
commit
67c0cf45a1
1 zmienionych plików z 240 dodań i 10 usunięć
  1. 240 10
      app/JsonRpc/CollectorService.php

+ 240 - 10
app/JsonRpc/CollectorService.php

@@ -1,22 +1,25 @@
 <?php
 namespace App\JsonRpc;
 
+use App\Model\ArticleData;
 use App\Model\OldModel\Article as OldArticle;
 use App\Model\OldModel\ArticleData as OldArticleData;
 use App\Model\OldModel\Category;
 use App\Model\Article;
-use App\Model\Web;
 use App\Model\Rule;
-use App\Model\ArticleData;
+use App\Model\Web;
 use Hyperf\DbConnection\Db;
 use Hyperf\RpcServer\Annotation\RpcService;
 use App\Tools\Result;
+use QL\QueryList;
+use Swoole\Coroutine;
 
 use function Hyperf\Support\retry;
 
 #[RpcService(name: "CollectorService", protocol: "jsonrpc-http", server: "jsonrpc-http")]
 class CollectorService implements CollectorServiceInterface
 {
+
     /**
      * 添加网站
      * @param array $data
@@ -218,6 +221,7 @@ class CollectorService implements CollectorServiceInterface
      */
     public function getRule(array $data): array
     {
+        $where = [];
         if(isset($data['web_id'])){
             $web = Web::where('id',$data['web_id'])->get();
             if(empty($web->toArray())){
@@ -235,7 +239,13 @@ class CollectorService implements CollectorServiceInterface
                 ['name','like','%'.$data['keyWord'].'%']
             ];
         }
-        $rep = Rule::withCount(relations:'arts')->where($where)->limit($data['pageSize'])->orderBy("created_at","desc")->offset(($data['page']-1)*$data['pageSize'])->get();
+        // var_dump("===============",$where);
+        if(empty($where)){
+            $rep = Rule::withCount(relations:'arts')->limit($data['pageSize'])->orderBy("created_at","desc")->offset(($data['page']-1)*$data['pageSize'])->get();
+        }else{
+            $rep = Rule::withCount(relations:'arts')->where($where)->limit($data['pageSize'])->orderBy("created_at","desc")->offset(($data['page']-1)*$data['pageSize'])->get();
+        }
+        
         $count = Rule::where($where)->count();
         if($count==0){
             return Result::error('暂无相关规则任务!');
@@ -245,7 +255,7 @@ class CollectorService implements CollectorServiceInterface
             'rep' => $rep->toArray(),
             'count' => $count
         ];
-        return Result::success($data);
+        return Result::success($data);
     }
      /**
      * 获取某个任务规则
@@ -332,13 +342,233 @@ class CollectorService implements CollectorServiceInterface
      */
     public function sendCrawler(array $data): array
     {
-        $result =  Article::get();
-        $b = OldArticle::get();
-        $a = [
-            'old'=>$b,
-            'new'=>$result
+        // A rule id is required; without one the lookup below cannot match a row.
+        $ruleId = $data['id'] ?? null;
+        if ($ruleId === null) {
+            return Result::error('暂无相关规则任务!');
+        }
+        $adminUserId = $data['admin_user_id'] ?? '';
+
+        // Load the rule together with the name, url and type of the web site it belongs to.
+        $where = [
+            'rule.id' => $ruleId,
         ];
-        return  Result::success($a);
+        $info = Rule::where($where)->leftJoin('web','rule.web_id','web.id')
+            ->select("rule.*","web.name as web_name","web.url as web_url","web.type as web_type")
+            ->first();
+        // first() yields null for an unknown id; calling toArray() on null would be fatal.
+        if (empty($info)) {
+            return Result::error('暂无相关规则任务!');
+        }
+        $info = $info->toArray();
+
+        // Loose switch on purpose: web_type may arrive from the database as int or numeric string.
+        switch ($info['web_type']) {
+            case 1:
+                // HTML list pages: mark the rule with status 1 before crawling and status 2 afterwards.
+                Rule::where(['id' => $ruleId])->update(['status' => 1]);
+                $task = array_merge($data, [
+                    'copyfrom' => $info['web_name'],
+                    'author' => '刘德华',
+                    'first_url' => $info['first_url'],
+                    'second_start' => $info['second_start'],
+                    'second_num' => $info['second_num'],
+                    'second_end' => $info['second_end'],
+                    'end_pagenum' => $info['end_pagenum'],
+                    'rule_id' => $ruleId,
+                    'admin_user_id' => $adminUserId,
+                ]);
+                foreach ($this->addUrlArr($task) as $listUrl) {
+                    $this->ruleCollection($listUrl, $task);
+                }
+                Rule::where(['id' => $ruleId])->update(['status' => 2]);
+                break;
+            case 2:
+                // JSON endpoint: first_url is the endpoint (e.g. https://www.ndcpa.gov.cn/queryList)
+                // and the rule's "parameter" column holds the JSON-encoded request payload.
+                $wecUrl = $info['first_url'];
+                $parames = json_decode($info['parameter'], true);
+                // The stored codes are wrapped in "[...]"; strip the brackets and send
+                // one-element arrays instead (e.g. ['jbkzzx'], ['c100008']).
+                $parames['webSiteCode'] = [trim($parames['webSiteCode'] ?? '', "[]")];
+                $parames['channelCode'] = [trim($parames['channelCode'] ?? '', "[]")];
+                $other = [
+                    'web_url' => $info['web_url'],
+                    'copyfrom' => $info['web_name'],
+                    'admin_user_id' => $adminUserId,
+                    'rule_id' => $ruleId,
+                ];
+                $this->foreachCurl($wecUrl, $parames, $other);
+                break;
+        }
+        // Any other web_type falls through with no work done and still reports success.
+        return  Result::success([]);
+    }
+
+    /**
+     * Collect every list-page URL that should be crawled for a rule.
+     *
+     * The result always starts with $data['first_url']. It is followed by
+     * $data['second_start'] . $i . $data['second_end'] for $i = 1 .. end_pagenum - 2,
+     * which is the same range the previous loop produced.
+     *
+     * @param array $data reads first_url, second_start, second_end and end_pagenum
+     * @return array list of URL strings
+     */
+    public function addUrlArr($data)
+    {
+        $arrList = [$data['first_url']];
+
+        // The former while-loop stopped only when $i equalled end_pagenum - 1, so an
+        // end_pagenum of 1 or less never terminated. A bounded for-loop yields the same
+        // URLs for valid input and simply nothing extra otherwise.
+        // The per-page Result::pageExists() probe was dropped: its result was only dumped.
+        $stop = intval($data['end_pagenum']) - 1;
+        for ($i = 1; $i < $stop; $i++) {
+            $arrList[] = $data['second_start'].$i.$data['second_end'];
+        }
+
+        return $arrList;
+    }
+
+    /**
+     * Crawl one list page and store every article it links to.
+     *
+     * Reads the anchor text and href of each `.list1 li` entry on $url, fetches the
+     * linked detail page, extracts `h1` (title) and `.TRS_UEDITOR` (content) inside
+     * `.news-details`, and passes each result to insertArticleData().
+     *
+     * @param string $url  list page to crawl
+     * @param array  $data rule context forwarded to insertArticleData(); copyfrom, author,
+     *                     admin_user_id and rule_id are read from it
+     * @return void
+     */
+    public function ruleCollection($url,$data)
+    {
+        $listRows = QueryList::get($url)->rules([
+            'title' => ['a', 'text'],
+            'link'  => ['a', 'href'],
+        ])->range('.list1 li')->query()->getData()->toArray();
+
+        // Directory part of the list URL: everything before the last "/".
+        $baseUrl = implode('/', array_slice(explode('/', $url), 0, -1));
+
+        foreach ($listRows as $row) {
+            // An entry without an href has nothing to follow.
+            $link = $row['link'] ?? '';
+            if ($link === '') {
+                continue;
+            }
+            // Drop the first character of the href before appending it to the directory.
+            // NOTE(review): presumably hrefs look like "./path/page.html" - confirm.
+            $newUrlStr = $baseUrl.substr($link, 1);
+
+            $detailRows = QueryList::get($newUrlStr)->rules([
+                'title'=>['h1','text'],
+                'content'=>['.TRS_UEDITOR','html'],
+            ])->range(".news-details")->query()->getData()->toArray();
+
+            foreach ($detailRows as $val) {
+                $title = $val['title'] ?? '';
+                // source stays empty here: insertArticleData() substitutes copyfrom for an
+                // empty source, which is what the old (dead) "??" fallback was aiming for.
+                $article = array_merge($data, [
+                    'fromurl' => $newUrlStr,
+                    'title' => $title,
+                    'content' => $val['content'] ?? '',
+                    'newUrlStr' => $newUrlStr,
+                    'source' => '',
+                    'introduce' => $title,
+                    'keyword' => $title,
+                    'admin_user_id' => $data['admin_user_id'] ?? '',
+                    'rule_id' => $data['rule_id'] ?? '',
+                ]);
+                $this->insertArticleData($article);
+            }
+        }
+    }
+
+    /**
+     * Store one collected article and its body, skipping titles that already exist.
+     *
+     * The Article row and the ArticleData row are written inside one transaction.
+     * Failures are rolled back and reported on stdout instead of being re-thrown, so a
+     * single bad article does not abort the caller's loop.
+     *
+     * @param array $data reads title, content, newUrlStr, copyfrom, author, source,
+     *                    admin_user_id and rule_id
+     * @return void
+     */
+    public function insertArticleData($data=[])
+    {
+        if (empty($data)) {
+            var_dump("没有数据可以插入:");
+            return;
+        }
+
+        Db::beginTransaction();
+        try {
+            // The title is the de-duplication key: an existing title means nothing to insert.
+            $articleInfo = Article::where(['title' => $data['title']])->first();
+            if (empty($articleInfo)) {
+                // An empty or missing source falls back to the originating site name.
+                $source = (string) ($data['source'] ?? '');
+                $article_id = Article::insertGetId([
+                    'fromurl' => $data['newUrlStr'],
+                    'oldtitle' => $data['title'],
+                    'title' => $data['title'],
+                    'copyfrom' => $data['copyfrom'],
+                    'author' => $data['author'] ?? '',
+                    'introduce' => $data['title'],
+                    'keyword' => $data['title'],
+                    'source' => $source !== '' ? $source : $data['copyfrom'],
+                    'admin_user_id' => $data['admin_user_id'] ?? '',
+                    'rule_id' => $data['rule_id'] ?? '',
+                ]);
+                ArticleData::insertGetId([
+                    'article_id' => $article_id,
+                    'content' => $data['content'],
+                ]);
+            }
+            Db::commit();
+        } catch (\Throwable $e) {
+            // Catch \Throwable rather than \Exception: a TypeError or other \Error would
+            // otherwise skip the rollback and leave the transaction open.
+            Db::rollBack();
+            var_dump("插入失败:",$e->getMessage());
+        }
+    }
+
+    /**
+     * Page through a JSON list endpoint and store every article it returns.
+     *
+     * POSTs $parames to $wecUrl, passes each entry of data.results to
+     * insertArticleData(), then increments $parames['current'] and repeats until a page
+     * comes back without results.
+     *
+     * @param string $wecUrl  endpoint to POST to
+     * @param array  $parames request payload; 'current' is used as the page counter
+     * @param array  $other   reads web_url, copyfrom, admin_user_id and rule_id
+     * @param int    $page    unused; kept only so existing callers keep working
+     * @return void
+     */
+    public function foreachCurl($wecUrl,$parames,$other,&$page=1)
+    {
+        $options = [
+            CURLOPT_HEADER => true, // include the response headers in the output
+            CURLOPT_TIMEOUT => 30, // request timeout in seconds
+        ];
+
+        // Iterative paging: the recursive version grew the call stack with every page and
+        // issued a second request per page whose response was thrown away.
+        while (true) {
+            $response = Result::http_post($wecUrl,$parames,$options);
+            $result = json_decode((string) ($response['response'] ?? ''), true);
+
+            // A failed decode or an empty page ends the crawl.
+            $dataList = $result['data']['results'] ?? [];
+            if (!is_array($dataList) || count($dataList) === 0) {
+                break;
+            }
+
+            foreach ($dataList as $val) {
+                // source.urls is itself a JSON string; its "common" entry is appended to the site url.
+                $urls = json_decode((string) ($val['source']['urls'] ?? ''), true);
+                $newUrlStr = $other['web_url'].($urls['common'] ?? '');
+                $this->insertArticleData([
+                    'newUrlStr'=>$newUrlStr,
+                    'title'=>$val['source']['title']??'',
+                    'source'=>$val['source']['contentSource']??'',
+                    'copyfrom'=>$other['copyfrom']??'',
+                    'content'=>$val['source']['content']['content']??'',
+                    'admin_user_id'=>$other['admin_user_id']??'',
+                    'rule_id'=>$other['rule_id']??'',
+                    'author'=>'冯蕊'
+                ]);
+            }
+
+            $parames['current'] = intval($parames['current'] ?? 0) + 1;
+        }
     }
     /**
      * 获取并搜索资讯