rkljw 4 hónapja
szülő
commit
f6d8984ee2

+ 49 - 49
app/JsonRpc/CollectorService.php

@@ -15,15 +15,15 @@ use Hyperf\RpcServer\Annotation\RpcService;
 use App\Tools\Result;
 use App\Tools\Result;
 use QL\QueryList;
 use QL\QueryList;
 use Swoole\Coroutine;
 use Swoole\Coroutine;
-use App\Service\GatherQueueService;
+//use App\Service\GatherQueueService;
 
 
 
 
 
 
 #[RpcService(name: "CollectorService", protocol: "jsonrpc-http", server: "jsonrpc-http")]
 #[RpcService(name: "CollectorService", protocol: "jsonrpc-http", server: "jsonrpc-http")]
 class CollectorService implements CollectorServiceInterface
 class CollectorService implements CollectorServiceInterface
 {
 {
-    #[Inject]
-    protected GatherQueueService $Gservice;
+//    #[Inject]
+//    protected GatherQueueService $Gservice;
     /**
     /**
      * 添加网站
      * 添加网站
      * @param array $data
      * @param array $data
@@ -310,31 +310,23 @@ class CollectorService implements CollectorServiceInterface
             ->select("rule.*","web.name as web_name","web.url as web_url","web.type as web_type")
             ->select("rule.*","web.name as web_name","web.url as web_url","web.type as web_type")
             ->first();
             ->first();
         $info = $info->toArray();
         $info = $info->toArray();
-
+//        var_dump("规则信息:",$info);
         switch ($info['web_type']){
         switch ($info['web_type']){
             case 1:
             case 1:
                 var_dump("===========规则采集======",$info);
                 var_dump("===========规则采集======",$info);
                 Rule::where(['id'=>$data['id']])->update(['status'=>1]);
                 Rule::where(['id'=>$data['id']])->update(['status'=>1]);
-                $data['copyfrom'] = $info['web_name'];
-                $data['author'] = $info['writer'];;
-                $data['first_url'] = $info['first_url'];
-                $data['second_start'] = $info['second_start'];
-                $data['second_num'] = $info['second_num'];
-                $data['second_end'] = $info['second_end'];
-                $data['end_pagenum']= $info['end_pagenum'];
-                $data['rule_id']= $data['id'];
-                $data['admin_user_id']= $data['admin_user_id'];
-                $data['start'] = $info['start']??'';
-                $data['title'] = $info['title']??'';
-                $data['content'] = $info['content']??'';
-                var_dump("++++++++++++++++++");
-//                $data['newUrlStr'] =
-                $urlList = $this->addUrlArr($data);
-                var_dump("采集列表:",$urlList);
+                //添加几个值
+                $info['admin_user_id'] = $data['admin_user_id'];
+                $info['rule_id'] = $data['id'];
+                $info['copyfrom'] = $info['web_name'];
+                $info['author'] = $info['writer'];
+//                var_dump("++++++++++++++++++");
+                $urlList = $this->addUrlArr($info);
+//                var_dump("采集列表:",$urlList);
                 if($urlList){
                 if($urlList){
                     foreach ($urlList as $val){
                     foreach ($urlList as $val){
-//                        var_dump("单列表地址:",$val);
-                        $this->ruleCollection($val,$data);
+
+                        $this->ruleCollection($val,$info);
                     }
                     }
                 }
                 }
                 Rule::where(['id'=>$data['id']])->update(['status'=>2]);
                 Rule::where(['id'=>$data['id']])->update(['status'=>2]);
@@ -377,7 +369,7 @@ class CollectorService implements CollectorServiceInterface
                 $i++;
                 $i++;
                 $url = $data['second_start'].$i.$data['second_end'];
                 $url = $data['second_start'].$i.$data['second_end'];
                 $respon1 = Result::pageExists($url);
                 $respon1 = Result::pageExists($url);
-                var_dump("采集地址:",$respon1,$url);
+
 //                Coroutine::sleep(2);
 //                Coroutine::sleep(2);
                 if ($i==intval($data['end_pagenum'])-1 || intval($data['end_pagenum'])-1==0) {
                 if ($i==intval($data['end_pagenum'])-1 || intval($data['end_pagenum'])-1==0) {
                     $exit = true;
                     $exit = true;
@@ -386,8 +378,6 @@ class CollectorService implements CollectorServiceInterface
                     array_push($arrList,$url);
                     array_push($arrList,$url);
                 }
                 }
             }
             }
-
-
         return $arrList;
         return $arrList;
     }
     }
 
 
@@ -395,62 +385,72 @@ class CollectorService implements CollectorServiceInterface
      * 按照规则采集数据
      * 按照规则采集数据
      * @return void
      * @return void
      */
      */
-    public function ruleCollection($url,$data)
+    public function ruleCollection($url,$info)
     {
     {
-        var_dump("采集参数:",$url,$data);
+        var_dump("采集参数:",$url,$info['start']);
         $list = QueryList::get($url);
         $list = QueryList::get($url);
         $dataList = $list->rules([
         $dataList = $list->rules([
-            'title' => ['a', 'text'],
-            'link'  => ['a', 'href'],
-        ])->range($data['start'])->query()->getData();
+            'title' => ['a:eq(0)', 'text'],
+            'link'  => ['a:eq(0)', 'href'],
+        ])->range($info['start'])->query()->getData();
         var_dump("采集的内容:",$dataList);
         var_dump("采集的内容:",$dataList);
-//        var_dump("====",$dataList);die;
         $firstUrlArr =  explode("/", $url);
         $firstUrlArr =  explode("/", $url);
         array_pop($firstUrlArr);
         array_pop($firstUrlArr);
         $firstUrlArr = implode('/',$firstUrlArr);
         $firstUrlArr = implode('/',$firstUrlArr);
-
         $dataList = $dataList->toArray();
         $dataList = $dataList->toArray();
-//        var_dump($dataList);die;
+
         if($dataList){
         if($dataList){
             foreach ($dataList as $tiem){
             foreach ($dataList as $tiem){
-                $newUrl =  substr($tiem['link'], 1);
-                $newUrlStr = $firstUrlArr.$newUrl;
+                //检测采集的url是否存在网站域名 。存在就继续,不存在就检测是否是三方跳转
+                $newUrlStr = $tiem['link'];
+                if (strpos($tiem['link'], $info['web_url']) === false) {
+                    $array = ['http','https'];
+                    $link = $tiem['link'];
+                    $found = array_filter($array, function($item) use ($link) {
+                        return str_contains($link, $item);
+                    });
+                    if(count($found)>0){
+                        continue;
+                    }
+                    $newUrlStr = $info['con_url'].$tiem['link'];
+                }
+
+//                $detailContent = QueryList::get($newUrlStr);
                 $detailContent = QueryList::get($newUrlStr);
                 $detailContent = QueryList::get($newUrlStr);
                 $rules = [];
                 $rules = [];
-                if($data['title']){
-                    $rules['title'] = [$data['title'],'text'];
+                if($info['title']){
+                    $rules['title'] = [$info['title'],'text'];
                 }
                 }
-                if($data['content']){
-                    $rules['content'] = [$data['content'],'html'];
+                if($info['content']){
+                    $rules['content'] = [$info['content'],'html'];
                 }
                 }
                 //详情页范围
                 //详情页范围
-                $detailRange = '.news-details';
-                var_dump("打印规则:",$rules);
+                $detailRange = $info['con_start']??'';
+                var_dump("打印规则:",$rules,"详情起始:", $info['con_start']);
                 $detailData = $detailContent->rules($rules)->range($detailRange)->query()->getData();
                 $detailData = $detailContent->rules($rules)->range($detailRange)->query()->getData();
-
                 $detailData = $detailData->toArray();
                 $detailData = $detailData->toArray();
                 var_dump("内容详情:",$detailData,$newUrlStr);
                 var_dump("内容详情:",$detailData,$newUrlStr);
                 if($detailData){
                 if($detailData){
                     foreach ($detailData as $val){
                     foreach ($detailData as $val){
 //                        var_dump("进没进foreach:",$newUrlStr,$val);
 //                        var_dump("进没进foreach:",$newUrlStr,$val);
+                        $data = [];
                         $data['fromurl'] = $newUrlStr;
                         $data['fromurl'] = $newUrlStr;
                         $data['title'] = $val['title'];
                         $data['title'] = $val['title'];
                         $data['content'] = $val['content'];
                         $data['content'] = $val['content'];
                         $data['newUrlStr'] = $newUrlStr;
                         $data['newUrlStr'] = $newUrlStr;
                         $data['introduce'] = $val['title']??'';
                         $data['introduce'] = $val['title']??'';
                         $data['keyword'] = $val['title']??'';
                         $data['keyword'] = $val['title']??'';
-                        $data['copyfrom'] = $data['copyfrom'];
-                        $data['source'] = $data['source']??$data['copyfrom'];
-                        $data['admin_user_id'] = $data['admin_user_id']??'';
-                        $data['rule_id'] = $data['rule_id']??'';
-//                        $data['copyfrom'] = $data['copyfrom'];
-//                        var_dump("要插入的数据:",$data);
+                        $data['copyfrom'] = $info['copyfrom'];
+                        $data['source'] = $info['source']??$info['copyfrom'];
+                        $data['admin_user_id'] = $info['admin_user_id']??'';
+                        $data['rule_id'] = $info['rule_id']??'';
+                        $data['author'] = $info['author']??'';
+
                         $this->insertArticleData($data);
                         $this->insertArticleData($data);
                     }
                     }
                 }
                 }
 
 
             }
             }
-//
         }
         }
     }
     }
 
 

A különbségek nem kerülnek megjelenítésre, a fájl túl nagy
+ 0 - 0
runtime/container/scan.cache


+ 1 - 1
runtime/hyperf.pid

@@ -1 +1 @@
-14350
+63188

A különbségek nem kerülnek megjelenítésre, a fájl túl nagy
+ 1419 - 5
runtime/logs/hyperf.log


Nem az összes módosított fájl került megjelenítésre, mert túl sok fájl változott