|
@@ -15,15 +15,15 @@ use Hyperf\RpcServer\Annotation\RpcService;
|
|
use App\Tools\Result;
|
|
use App\Tools\Result;
|
|
use QL\QueryList;
|
|
use QL\QueryList;
|
|
use Swoole\Coroutine;
|
|
use Swoole\Coroutine;
|
|
-use App\Service\GatherQueueService;
|
|
|
|
|
|
+//use App\Service\GatherQueueService;
|
|
|
|
|
|
|
|
|
|
|
|
|
|
#[RpcService(name: "CollectorService", protocol: "jsonrpc-http", server: "jsonrpc-http")]
|
|
#[RpcService(name: "CollectorService", protocol: "jsonrpc-http", server: "jsonrpc-http")]
|
|
class CollectorService implements CollectorServiceInterface
|
|
class CollectorService implements CollectorServiceInterface
|
|
{
|
|
{
|
|
- #[Inject]
|
|
|
|
- protected GatherQueueService $Gservice;
|
|
|
|
|
|
+// #[Inject]
|
|
|
|
+// protected GatherQueueService $Gservice;
|
|
/**
|
|
/**
|
|
* 添加网站
|
|
* 添加网站
|
|
* @param array $data
|
|
* @param array $data
|
|
@@ -310,31 +310,23 @@ class CollectorService implements CollectorServiceInterface
|
|
->select("rule.*","web.name as web_name","web.url as web_url","web.type as web_type")
|
|
->select("rule.*","web.name as web_name","web.url as web_url","web.type as web_type")
|
|
->first();
|
|
->first();
|
|
$info = $info->toArray();
|
|
$info = $info->toArray();
|
|
-
|
|
|
|
|
|
+// var_dump("规则信息:",$info);
|
|
switch ($info['web_type']){
|
|
switch ($info['web_type']){
|
|
case 1:
|
|
case 1:
|
|
var_dump("===========规则采集======",$info);
|
|
var_dump("===========规则采集======",$info);
|
|
Rule::where(['id'=>$data['id']])->update(['status'=>1]);
|
|
Rule::where(['id'=>$data['id']])->update(['status'=>1]);
|
|
- $data['copyfrom'] = $info['web_name'];
|
|
|
|
- $data['author'] = $info['writer'];;
|
|
|
|
- $data['first_url'] = $info['first_url'];
|
|
|
|
- $data['second_start'] = $info['second_start'];
|
|
|
|
- $data['second_num'] = $info['second_num'];
|
|
|
|
- $data['second_end'] = $info['second_end'];
|
|
|
|
- $data['end_pagenum']= $info['end_pagenum'];
|
|
|
|
- $data['rule_id']= $data['id'];
|
|
|
|
- $data['admin_user_id']= $data['admin_user_id'];
|
|
|
|
- $data['start'] = $info['start']??'';
|
|
|
|
- $data['title'] = $info['title']??'';
|
|
|
|
- $data['content'] = $info['content']??'';
|
|
|
|
- var_dump("++++++++++++++++++");
|
|
|
|
-// $data['newUrlStr'] =
|
|
|
|
- $urlList = $this->addUrlArr($data);
|
|
|
|
- var_dump("采集列表:",$urlList);
|
|
|
|
|
|
+ //添加几个值
|
|
|
|
+ $info['admin_user_id'] = $data['admin_user_id'];
|
|
|
|
+ $info['rule_id'] = $data['id'];
|
|
|
|
+ $info['copyfrom'] = $info['web_name'];
|
|
|
|
+ $info['author'] = $info['writer'];
|
|
|
|
+// var_dump("++++++++++++++++++");
|
|
|
|
+ $urlList = $this->addUrlArr($info);
|
|
|
|
+// var_dump("采集列表:",$urlList);
|
|
if($urlList){
|
|
if($urlList){
|
|
foreach ($urlList as $val){
|
|
foreach ($urlList as $val){
|
|
-// var_dump("单列表地址:",$val);
|
|
|
|
- $this->ruleCollection($val,$data);
|
|
|
|
|
|
+
|
|
|
|
+ $this->ruleCollection($val,$info);
|
|
}
|
|
}
|
|
}
|
|
}
|
|
Rule::where(['id'=>$data['id']])->update(['status'=>2]);
|
|
Rule::where(['id'=>$data['id']])->update(['status'=>2]);
|
|
@@ -377,7 +369,7 @@ class CollectorService implements CollectorServiceInterface
|
|
$i++;
|
|
$i++;
|
|
$url = $data['second_start'].$i.$data['second_end'];
|
|
$url = $data['second_start'].$i.$data['second_end'];
|
|
$respon1 = Result::pageExists($url);
|
|
$respon1 = Result::pageExists($url);
|
|
- var_dump("采集地址:",$respon1,$url);
|
|
|
|
|
|
+
|
|
// Coroutine::sleep(2);
|
|
// Coroutine::sleep(2);
|
|
if ($i==intval($data['end_pagenum'])-1 || intval($data['end_pagenum'])-1==0) {
|
|
if ($i==intval($data['end_pagenum'])-1 || intval($data['end_pagenum'])-1==0) {
|
|
$exit = true;
|
|
$exit = true;
|
|
@@ -386,8 +378,6 @@ class CollectorService implements CollectorServiceInterface
|
|
array_push($arrList,$url);
|
|
array_push($arrList,$url);
|
|
}
|
|
}
|
|
}
|
|
}
|
|
-
|
|
|
|
-
|
|
|
|
return $arrList;
|
|
return $arrList;
|
|
}
|
|
}
|
|
|
|
|
|
@@ -395,62 +385,72 @@ class CollectorService implements CollectorServiceInterface
|
|
* 按照规则采集数据
|
|
* 按照规则采集数据
|
|
* @return void
|
|
* @return void
|
|
*/
|
|
*/
|
|
public function ruleCollection($url,$info)
{
    // Crawl one list page and store every article it links to.
    //
    //   $url  - address of the list page to fetch.
    //   $info - rule settings merged with site fields. Keys read here:
    //           start, web_url, con_url, con_start, title, content,
    //           copyfrom, source, admin_user_id, rule_id, author.
    //
    // Every record extracted from a detail page is passed to
    // insertArticleData(). Exceptions raised by QueryList while fetching are
    // not caught here and propagate to the caller.

    // Read optional settings once, with defaults, so a missing key cannot
    // raise an "undefined array key" warning in the middle of a crawl.
    $listRange   = $info['start'] ?? '';
    $detailRange = $info['con_start'] ?? '';
    $webUrl      = (string) ($info['web_url'] ?? '');
    $conUrl      = (string) ($info['con_url'] ?? '');
    $copyfrom    = $info['copyfrom'] ?? '';

    var_dump("采集参数:",$url,$listRange);

    // List page: take the first anchor's text and href inside each range match.
    $dataList = QueryList::get($url)
        ->rules([
            'title' => ['a:eq(0)', 'text'],
            'link'  => ['a:eq(0)', 'href'],
        ])
        ->range($listRange)
        ->query()
        ->getData();
    var_dump("采集的内容:",$dataList);
    $dataList = $dataList->toArray();

    // Detail-page selectors do not depend on the link, so build them once.
    $rules = [];
    if (!empty($info['title'])) {
        $rules['title'] = [$info['title'], 'text'];
    }
    if (!empty($info['content'])) {
        $rules['content'] = [$info['content'], 'html'];
    }

    foreach ($dataList as $tiem) {
        $link = trim((string) ($tiem['link'] ?? ''));
        if ($link === '') {
            // No href on this row: there is no detail page to fetch.
            continue;
        }

        // A link that already contains the site address is used as-is.
        // An empty site address is treated as "not contained", because
        // str_contains() returns true for an empty needle.
        $newUrlStr = $link;
        if ($webUrl === '' || !str_contains($link, $webUrl)) {
            // Anything carrying its own scheme (http:, https:, mailto:,
            // javascript:, ...) or a protocol-relative "//host" prefix points
            // away from the site, so skip it. Only the start of the link is
            // tested, so a relative path that merely contains "http" is kept.
            if (preg_match('#^(?:[a-z][a-z0-9+.\-]*:|//)#i', $link) === 1) {
                continue;
            }
            // Relative link: prefix it with the configured detail base URL.
            $newUrlStr = $conUrl . $link;
        }

        var_dump("打印规则:",$rules,"详情起始:", $detailRange);
        $detailData = QueryList::get($newUrlStr)
            ->rules($rules)
            ->range($detailRange)
            ->query()
            ->getData()
            ->toArray();
        var_dump("内容详情:",$detailData,$newUrlStr);

        foreach ($detailData as $val) {
            // Either selector may be absent from $rules, so guard both reads.
            $title = $val['title'] ?? '';

            // Build a fresh record per article so nothing leaks between rows.
            $data = [
                'fromurl'       => $newUrlStr,
                'title'         => $title,
                'content'       => $val['content'] ?? '',
                'newUrlStr'     => $newUrlStr,
                'introduce'     => $title,
                'keyword'       => $title,
                'copyfrom'      => $copyfrom,
                'source'        => $info['source'] ?? $copyfrom,
                'admin_user_id' => $info['admin_user_id'] ?? '',
                'rule_id'       => $info['rule_id'] ?? '',
                'author'        => $info['author'] ?? '',
            ];

            $this->insertArticleData($data);
        }
    }
}
|
|
|
|
|