|
@@ -1,22 +1,25 @@
|
|
|
<?php
|
|
|
namespace App\JsonRpc;
|
|
|
|
|
|
+use App\Model\ArticleData;
|
|
|
use App\Model\OldModel\Article as OldArticle;
|
|
|
use App\Model\OldModel\ArticleData as OldArticleData;
|
|
|
use App\Model\OldModel\Category;
|
|
|
use App\Model\Article;
|
|
|
-use App\Model\Web;
|
|
|
use App\Model\Rule;
|
|
|
-use App\Model\ArticleData;
|
|
|
+use App\Model\Web;
|
|
|
use Hyperf\DbConnection\Db;
|
|
|
use Hyperf\RpcServer\Annotation\RpcService;
|
|
|
use App\Tools\Result;
|
|
|
+use QL\QueryList;
|
|
|
+use Swoole\Coroutine;
|
|
|
|
|
|
use function Hyperf\Support\retry;
|
|
|
|
|
|
#[RpcService(name: "CollectorService", protocol: "jsonrpc-http", server: "jsonrpc-http")]
|
|
|
class CollectorService implements CollectorServiceInterface
|
|
|
{
|
|
|
+
|
|
|
/**
|
|
|
* 添加网站
|
|
|
* @param array $data
|
|
@@ -218,6 +221,7 @@ class CollectorService implements CollectorServiceInterface
|
|
|
*/
|
|
|
public function getRule(array $data): array
|
|
|
{
|
|
|
+ $where = [];
|
|
|
if(isset($data['web_id'])){
|
|
|
$web = Web::where('id',$data['web_id'])->get();
|
|
|
if(empty($web->toArray())){
|
|
@@ -235,7 +239,13 @@ class CollectorService implements CollectorServiceInterface
|
|
|
['name','like','%'.$data['keyWord'].'%']
|
|
|
];
|
|
|
}
|
|
|
- $rep = Rule::withCount(relations:'arts')->where($where)->limit($data['pageSize'])->orderBy("created_at","desc")->offset(($data['page']-1)*$data['pageSize'])->get();
|
|
|
+ // var_dump("===============",$where);
|
|
|
+ if(empty($where)){
|
|
|
+ $rep = Rule::withCount(relations:'arts')->limit($data['pageSize'])->orderBy("created_at","desc")->offset(($data['page']-1)*$data['pageSize'])->get();
|
|
|
+ }else{
|
|
|
+ $rep = Rule::withCount(relations:'arts')->where($where)->limit($data['pageSize'])->orderBy("created_at","desc")->offset(($data['page']-1)*$data['pageSize'])->get();
|
|
|
+ }
|
|
|
+
|
|
|
$count = Rule::where($where)->count();
|
|
|
if($count==0){
|
|
|
return Result::error('暂无相关规则任务!');
|
|
@@ -245,7 +255,7 @@ class CollectorService implements CollectorServiceInterface
|
|
|
'rep' => $rep->toArray(),
|
|
|
'count' => $count
|
|
|
];
|
|
|
- return Result::success($data);
|
|
|
+ return Result::success($data)
|
|
|
}
|
|
|
/**
|
|
|
* 获取某个任务规则
|
|
@@ -332,13 +342,233 @@ class CollectorService implements CollectorServiceInterface
|
|
|
*/
|
|
|
public function sendCrawler(array $data): array
{
    $ruleId = $data['id'] ?? null;

    // Load the rule together with the name / url / type of the web site it belongs to.
    $info = Rule::where(['rule.id' => $ruleId])
        ->leftJoin('web', 'rule.web_id', 'web.id')
        ->select("rule.*", "web.name as web_name", "web.url as web_url", "web.type as web_type")
        ->first();

    // first() yields null when no rule matches; calling toArray() on it would be fatal.
    if (empty($info)) {
        return Result::error('规则不存在!');
    }
    $info = $info->toArray();
    $adminUserId = $data['admin_user_id'] ?? '';

    switch ($info['web_type']) {
        case 1:
            // HTML list pages: status 1 is written before crawling and status 2 afterwards.
            // NOTE(review): if crawling throws, the rule stays at status 1 — confirm whether
            // a failure status should be written instead.
            Rule::where(['id' => $ruleId])->update(['status' => 1]);

            $data['copyfrom'] = $info['web_name'];
            $data['author'] = '刘德华';
            $data['first_url'] = $info['first_url'];
            $data['second_start'] = $info['second_start'];
            $data['second_num'] = $info['second_num'];
            $data['second_end'] = $info['second_end'];
            $data['end_pagenum'] = $info['end_pagenum'];
            $data['rule_id'] = $ruleId;
            $data['admin_user_id'] = $adminUserId;

            $urlList = $this->addUrlArr($data);
            if ($urlList) {
                foreach ($urlList as $listUrl) {
                    $this->ruleCollection($listUrl, $data);
                }
            }

            Rule::where(['id' => $ruleId])->update(['status' => 2]);
            break;

        case 2:
            // JSON API source: the rule stores the request parameters as a JSON string.
            $parames = json_decode((string) $info['parameter'], true);
            if (!is_array($parames)) {
                return Result::error('规则参数格式错误!');
            }
            // The codes are stored wrapped in "[...]"; strip the brackets and send one-element arrays.
            $parames['webSiteCode'] = [trim((string) ($parames['webSiteCode'] ?? ''), "[]")];
            $parames['channelCode'] = [trim((string) ($parames['channelCode'] ?? ''), "[]")];

            $other = [
                'web_url' => $info['web_url'],
                'copyfrom' => $info['web_name'],
                'admin_user_id' => $adminUserId,
                'rule_id' => $ruleId,
            ];
            $this->foreachCurl($info['first_url'], $parames, $other);
            break;
    }

    // Any other web_type is a no-op and still reports success, as before.
    return Result::success([]);
}
|
|
|
+
|
|
|
/**
 * Build the list of list-page URLs to crawl for one rule.
 *
 * The first entry is always $data['first_url']. It is followed by
 * $data['second_start'] . $i . $data['second_end'] for $i = 1 .. end_pagenum - 2,
 * i.e. the same pages the earlier loop collected whenever end_pagenum >= 2.
 *
 * @param array $data must contain first_url, second_start, second_end and end_pagenum
 * @return array list of URL strings, never empty
 */
public function addUrlArr($data)
{
    $urls = [$data['first_url']];

    // Exclusive upper bound of the page counter.
    $stop = intval($data['end_pagenum']) - 1;

    // Bounded on purpose: an "exit only when $i equals the bound" loop never terminates
    // when end_pagenum is <= 1 or not numeric, because $i starts above the bound.
    // NOTE(review): no page-existence probe is made here; its result was never used to
    // decide which URLs are kept.
    for ($i = 1; $i < $stop; $i++) {
        $urls[] = $data['second_start'] . $i . $data['second_end'];
    }

    return $urls;
}
|
|
|
+
|
|
|
/**
 * Crawl one list page and store every article it links to.
 *
 * Reads the `a` text/href of each `.list1 li` entry on $url, fetches the linked
 * detail page, extracts `h1` (title) and `.TRS_UEDITOR` (content html) inside
 * `.news-details`, and hands each result to insertArticleData().
 *
 * @param string $url  list page to crawl
 * @param array  $data rule context (copyfrom, author, admin_user_id, rule_id, ...) merged into every article
 * @return void
 */
public function ruleCollection($url,$data)
{
    $items = QueryList::get($url)
        ->rules([
            'title' => ['a', 'text'],
            'link' => ['a', 'href'],
        ])
        ->range('.list1 li')
        ->query()
        ->getData()
        ->toArray();
    if (!$items) {
        return;
    }

    // Directory part of the list URL (everything before the last "/").
    $segments = explode("/", $url);
    array_pop($segments);
    $baseUrl = implode('/', $segments);

    foreach ($items as $item) {
        $link = $item['link'] ?? '';
        if (!is_string($link) || $link === '') {
            // Nothing to follow for this list entry.
            continue;
        }

        // Absolute links are used as they are. Any other link has its first character dropped
        // and is appended to the list directory.
        // NOTE(review): this presumably targets "./path" style hrefs — confirm for other sites.
        $detailUrl = preg_match('#^https?://#i', $link) === 1
            ? $link
            : $baseUrl . substr($link, 1);

        $details = QueryList::get($detailUrl)
            ->rules([
                'title' => ['h1', 'text'],
                'content' => ['.TRS_UEDITOR', 'html'],
            ])
            ->range(".news-details")
            ->query()
            ->getData()
            ->toArray();
        if (!$details) {
            continue;
        }

        foreach ($details as $detail) {
            $title = $detail['title'] ?? '';
            $article = array_merge($data, [
                'fromurl' => $detailUrl,
                'title' => $title,
                'content' => $detail['content'] ?? '',
                'newUrlStr' => $detailUrl,
                // Left empty so insertArticleData() falls back to copyfrom.
                'source' => '',
                'introduce' => $title,
                'keyword' => $title,
                'copyfrom' => $data['copyfrom'] ?? '',
                'admin_user_id' => $data['admin_user_id'] ?? '',
                'rule_id' => $data['rule_id'] ?? '',
            ]);
            $this->insertArticleData($article);
        }
    }
}
|
|
|
+
|
|
|
/**
 * Store one crawled article and its content, skipping titles that already exist.
 *
 * Both inserts run in a single transaction. A failure that is an \Exception is
 * rolled back and reported through error_log() without interrupting the caller;
 * any other \Throwable is rolled back and then re-thrown.
 *
 * @param array $data reads title, newUrlStr, copyfrom, author, source (optional),
 *                    admin_user_id, rule_id and content
 * @return void
 */
public function insertArticleData($data=[])
{
    if (!$data) {
        // Nothing to insert.
        return;
    }

    Db::beginTransaction();
    try {
        $existing = Article::where(['title' => $data['title']])->first();
        if (empty($existing)) {
            $articleId = Article::insertGetId([
                'fromurl' => $data['newUrlStr'],
                'oldtitle' => $data['title'],
                'title' => $data['title'],
                'copyfrom' => $data['copyfrom'],
                'author' => $data['author'],
                'introduce' => $data['title'],
                'keyword' => $data['title'],
                // An empty or missing source falls back to the site name.
                'source' => isset($data['source']) && $data['source'] != '' ? $data['source'] : $data['copyfrom'],
                'admin_user_id' => $data['admin_user_id'],
                'rule_id' => $data['rule_id'],
            ]);
            ArticleData::insertGetId([
                'article_id' => $articleId,
                'content' => $data['content'],
            ]);
        }
        Db::commit();
    } catch (\Throwable $e) {
        // Roll back for every failure type so that a PHP Error (e.g. TypeError) cannot
        // leave the transaction open on the connection.
        Db::rollBack();
        if (!$e instanceof \Exception) {
            throw $e;
        }
        error_log('插入失败:' . $e->getMessage());
    }
}
|
|
|
+
|
|
|
/**
 * Page through a JSON list API and store every returned article.
 *
 * Posts $parames to $wecUrl, inserts each entry of data.results through
 * insertArticleData(), then increases $parames['current'] by one and repeats
 * until a page has no results. Each page is requested exactly once.
 *
 * NOTE(review): there is no upper bound on the number of pages; an endpoint that
 * ignores `current` would keep this running — consider a cap.
 *
 * @param string $wecUrl  endpoint to POST to
 * @param array  $parames request parameters; `current` is used as the page counter
 * @param array  $other   reads web_url, copyfrom, admin_user_id and rule_id
 * @param int    $page    unused; kept so existing callers keep working
 * @return void
 */
public function foreachCurl($wecUrl,$parames,$other,&$page=1)
{
    $options = [
        CURLOPT_HEADER => true, // set to true to include the header information in the response
        CURLOPT_TIMEOUT => 30, // request timeout of 30 seconds
    ];

    // Iterative rather than recursive so a long listing cannot exhaust the call stack.
    while (true) {
        $response = Result::http_post($wecUrl, $parames, $options);
        $decoded = json_decode((string) ($response['response'] ?? ''), true);
        $rows = is_array($decoded) ? ($decoded['data']['results'] ?? null) : null;
        if (!is_array($rows) || $rows === []) {
            // Undecodable or empty page: nothing more to collect.
            return;
        }

        foreach ($rows as $row) {
            // `urls` is itself a JSON string; its `common` entry is appended to the site url.
            $rawUrls = $row['source']['urls'] ?? null;
            $urls = is_string($rawUrls) ? json_decode($rawUrls, true) : null;
            $detailUrl = $other['web_url'] . ($urls['common'] ?? '');

            $this->insertArticleData([
                'newUrlStr' => $detailUrl,
                'title' => $row['source']['title'] ?? '',
                'source' => $row['source']['contentSource'] ?? '',
                'copyfrom' => $other['copyfrom'] ?? '',
                'content' => $row['source']['content']['content'] ?? '',
                'admin_user_id' => $other['admin_user_id'] ?? '',
                'rule_id' => $other['rule_id'] ?? '',
                'author' => '冯蕊',
            ]);
        }

        $parames['current'] = intval($parames['current'] ?? 0) + 1;
    }
}
|
|
|
/**
|
|
|
* 获取并搜索资讯
|