test
You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
 
 
 
 
 
 

97 lines
4.2 KiB

<?php
defined('IN_IA') or exit('Access Denied');
require_once PATH_CORE . "library/querylist/QueryList.class.php";
use QL\QueryList;
/**
 * Scrapes a remote article page and repackages it for local storage.
 *
 * NOTE(review): the selectors (#activity-name, #js_content, msg_cdn_url)
 * suggest the target is a WeChat public-account article page — confirm
 * against the callers.
 */
class GatherArticle
{
    /**
     * Fetch $url, extract the article fields with QueryList and rewrite the
     * embedded images / iframes in the article body.
     *
     * Every <img> tag in the body is replaced by a tag whose src goes through
     * the site's "utility/wxcode/image" route (presumably an image proxy —
     * verify), and every <iframe> tag is replaced by a v.qq.com player that
     * uses the extracted video id.
     *
     * @param string $url Address of the article page to scrape.
     *
     * @return array {
     *     title    => text of #activity-name (null when nothing was extracted),
     *     contents => rewritten HTML of #js_content ('' when nothing was extracted),
     *     desc     => value of the page's "var msg_desc" script variable,
     *     pic      => iserializer()'d list of image URLs when the body holds
     *                 exactly three images, otherwise null,
     *     vid      => video id taken from the .video_iframe data-src,
     *     thumb    => site root + image route + "var msg_cdn_url" value,
     *     nickname => text of .profile_nickname
     * }
     */
    public function get_caiji($url)
    {
        global $_W;
        load()->func('file');
        load()->func('communication');

        // Relative path of the image route; the fetched image URL is appended as "attach".
        $imageRoute = 'web/index.php?c=utility&a=wxcode&do=image&attach=';

        $response = ihttp_request($url, '', array('CURLOPT_REFERER' => 'http://www.qq.com'));
        // A failed request carries no 'content' key; fall back to an empty
        // document instead of reading an undefined index.
        $body = (is_array($response) && isset($response['content'])) ? (string) $response['content'] : '';
        // Strip the decoy comment that contains fake <body>/<head>/<html> tags.
        $html = str_replace('<!--headTrap<body></body><head></head><html></html>-->', '', $body);

        // Returns the text found between the first $start marker and the next
        // $end marker, or '' when $start does not occur in $text.
        $between = function ($text, $start, $end) {
            $parts = explode($start, (string) $text);
            if (!isset($parts[1])) {
                return '';
            }
            $tail = explode($end, $parts[1]);
            return $tail[0];
        };

        // Scraping rules: field => array(selector, attribute, tag filter, callback).
        $rules = array(
            // Article title.
            'title' => array('#activity-name', 'text'),
            // Article body as raw HTML (no tag filter is applied here).
            'content' => array('#js_content', 'html'),
            'nickname' => array('.profile_nickname', 'text'),
            // Video id: the value of the "vid" query parameter in data-src.
            'video' => array('.video_iframe', 'data-src', '', function ($video) use ($between) {
                return $between($video, 'vid=', '&');
            }),
            // Cover image: value of the inline script variable msg_cdn_url,
            // prefixed with the image route.
            'logo' => array(':contains(msg_cdn_url)', 'text', '', function ($logo) use ($between, $imageRoute) {
                return $imageRoute . $between($logo, 'var msg_cdn_url = "', '";');
            }),
            // Description: value of the inline script variable msg_desc.
            'desc' => array(':contains(msg_cdn_url)', 'text', '', function ($desc) use ($between) {
                return $between($desc, 'var msg_desc = "', '";');
            }),
        );

        $ql = QueryList::Query($html, $rules, 'body', 'UTF-8');
        $rows = $ql->getData();
        $article = (is_array($rows) && isset($rows[0]) && is_array($rows[0])) ? $rows[0] : array();
        // Make sure every field exists so the lookups below never hit an undefined index.
        $article = $article + array(
            'title' => null,
            'content' => null,
            'nickname' => null,
            'video' => null,
            'logo' => null,
            'desc' => null,
        );

        // If Chinese text comes out garbled, convert the body from gb2312 to
        // utf-8 with iconv() at this point.
        $contents = (string) $article['content'];

        // Re-point every <img> at the local image route.
        $pics = array();
        if (preg_match_all('/<\s*img\s+[^>]*?src\s*=\s*(\'|\")(.*?)\\1[^>]*?\/?\s*>/i', $contents, $match)) {
            foreach ($match[0] as $key => $tag) {
                $media = tomedia($_W['siteroot'] . $imageRoute . $match[2][$key]);
                $pics[] = $media;
                // No width attribute: the image size is never measured here, so
                // the tag is left to render at its natural size.
                $contents = str_replace($tag, '<img src="' . $media . '" />', $contents);
            }
        }

        // Swap every <iframe> opening tag for a v.qq.com player bound to the extracted video id.
        $vid = $article['video'];
        if (preg_match_all('/<\s*iframe\s+[^>]*?src\s*=\s*(\'|\")(.*?)\\1[^>]*?\/?\s*>/i', $contents, $match)) {
            $player = "<iframe border='0' width='100%' height='250' src='http://v.qq.com/iframe/player.html?vid={$vid}&tiny=0&auto=0' allowfullscreen></iframe>";
            $contents = str_replace($match[0], $player, $contents);
        }

        // Only a body with exactly three images produces a picture list.
        $pic = (count($pics) === 3) ? iserializer($pics) : null;

        return array(
            'title' => $article['title'],
            'contents' => $contents,
            'desc' => $article['desc'],
            'pic' => $pic,
            'vid' => $vid,
            'thumb' => $_W['siteroot'] . $article['logo'],
            'nickname' => $article['nickname'],
        );
    }
}