记录2年前写的wordpress 半手工采集php代码

这是我上一版博客后期采用的采集方法，整理出来分享给大家。

一共两个文件：caijiClass.php（核心类）和 index.php（调用脚本）。

第1个文件caijiClass.php

<?php
/**
 * Semi-manual scraper client that pushes scraped articles to a WordPress site.
 *
 * The class talks to three custom endpoints under /wp-admin/ (etpost.php,
 * etchk.php, etimg.php). Those endpoints are not part of this file; everything
 * said about them here is limited to the request parameters this class sends.
 *
 * NOTE(review): the account, password and shared secret are hard-coded below
 * and are sent over plain http:// - move them to configuration before reuse.
 */
class caiji{
	private $url='zlkb.net'; // target WordPress domain (host only, no scheme)
	private $user='admin'; // account, sent as "log"
	private $pwd='admin'; // password, sent as "pwd"
	private $keyword='dc#4fsadfasfok@!$'; // shared secret, sent as "vercode"
	private $post_url;  // endpoint used to publish an article
	private $check_url; // endpoint used to check whether a title already exists
	private $img_url;   // endpoint used to submit image mapping lists
	private $img_path;  // public URL prefix of the current year/month upload folder
	private $imgfiles;  // local directory that stores image mapping files

	/**
	 * Builds the endpoint URLs, the upload URL prefix and the cache directory path.
	 */
	function __construct(){
		// Define PATH only once: a second `new caiji()` would otherwise raise
		// a "constant already defined" warning.
		if(!defined('PATH')){
			define('PATH', realpath(dirname(__FILE__)));
		}
		$base="http://{$this->url}";
		$this->post_url="{$base}/wp-admin/etpost.php";
		$this->check_url="{$base}/wp-admin/etchk.php";
		$this->img_url="{$base}/wp-admin/etimg.php";
		// date('m') is already zero-padded, so no extra str_pad() is needed.
		$this->img_path="{$base}/wp-content/uploads/".date('Y').'/'.date('m').'/';
		$this->imgfiles=PATH.'/cache/';
	}

	/**
	 * Publishes one scraped article.
	 *
	 * @param string $title     article title
	 * @param string $content   article HTML
	 * @param mixed  $cateid    category id on the target blog
	 * @param string $url       source URL of the article; used to resolve relative
	 *                          image paths and to replace the source host in the content
	 * @param int    $publicimg 1 (default) submits the image mapping files right away, 0 does not
	 * @return mixed raw response of the publish endpoint (string, or false when the
	 *               request failed), or array('code'=>0, ...) when check() did not
	 *               report the title as new
	 */
	public function post($title,$content,$cateid,$url,$publicimg=1){
		// Keep a raw backup of the scraped article in the current working directory.
		$backup=array(
			'title'=>$title,
			'content'=>$content,
			'cateid'=>$cateid,
			'url'=>$url
		);
		file_put_contents(time().'.txt',json_encode($backup));
		sleep(1); // presumably keeps the time()-based backup names unique between calls

		// Check for a duplicate first, so no image work is done for an
		// article that is not going to be published anyway.
		if(!$this->check($title)){
			return array('code'=>0,'title'=>$title,'msg'=>'已经存在');
		}

		// Point <img> tags at this blog and remember where the originals live.
		$content=$this->relocateImages($content,$url);
		if($publicimg){
			$this->img();
		}

		// Replace the source host with this blog's host in the remaining content.
		$host=$url ? parse_url($url,PHP_URL_HOST) : null;
		if($host){
			$content=str_replace($host,$this->url,$content);
		}

		$params=array(
			'log'=>$this->user,
			'pwd'=>$this->pwd,
			'post_title'=>$title,
			'content'=>$content,
			'vercode'=>$this->keyword,
			'post_status'=>'publish',
			'post_category'=>array(1=>$cateid),
		);
		return $this->post_url_contents($this->post_url,$params);
	}

	/**
	 * Rewrites gif/jpg/jpeg/png <img> sources to the blog's upload folder and
	 * stores an old-URL => new-URL mapping file in the cache directory.
	 *
	 * @param string $content article HTML
	 * @param string $url     source URL of the article (base for relative paths)
	 * @return string content with rewritten image URLs
	 */
	private function relocateImages($content,$url){
		// Alternation instead of the former character classes, which matched
		// single characters only (e.g. any tag starting with "<i" or "<m").
		$pattern='/<img(?:\s[^>]*?)?\ssrc\s*=\s*[\'"]([^\'"<>]+?\.(?:gif|jpe?g|png))[\'"]/i';
		if(!preg_match_all($pattern,$content,$match) OR empty($match[1])){
			return $content;
		}
		$replace=array();
		$imgs=array();
		foreach(array_unique($match[1]) AS $src){
			$newimg=$this->img_path.basename($src);
			$replace[$src]=$newimg;
			$imgs[]=array('old'=>$this->absoluteUrl($src,$url),'new'=>$newimg);
		}
		// strtr() replaces in one pass (longest key first), so a URL that was
		// already rewritten is never rewritten a second time.
		$content=strtr($content,$replace);

		if(!is_dir($this->imgfiles)){
			mkdir($this->imgfiles,0755,true);
		}
		// One file per article; a unique suffix replaces FILE_APPEND, which
		// could glue two JSON documents together into an unreadable file.
		$mapfile=$this->imgfiles.date('YmdHis').'_'.uniqid().'.txt';
		file_put_contents($mapfile,json_encode($imgs),LOCK_EX);
		return $content;
	}

	/**
	 * Resolves an image reference against the page it was found on.
	 * "../" segments are not normalised.
	 *
	 * @param string $src  image URL as written in the HTML
	 * @param string $base URL of the page
	 * @return string absolute URL, or $src unchanged when it cannot be resolved
	 */
	private function absoluteUrl($src,$base){
		$src=(string)$src;
		if($src===''){
			return $src;
		}
		$parts=parse_url($src);
		if(!is_array($parts) OR isset($parts['scheme'])){
			return $src;
		}
		$b=$base ? parse_url($base) : false;
		if(!is_array($b) OR empty($b['scheme']) OR empty($b['host'])){
			return $src;
		}
		if(isset($parts['host'])){
			// protocol-relative reference such as //cdn.example.com/a.png
			return $b['scheme'].':'.$src;
		}
		$origin=$b['scheme'].'://'.$b['host'].(isset($b['port']) ? ':'.$b['port'] : '');
		if($src[0]==='/'){
			return $origin.$src;
		}
		// Path-relative reference: resolve against the directory of the page.
		$path=isset($b['path']) ? $b['path'] : '/';
		$slash=strrpos($path,'/');
		$dir=($slash===false) ? '/' : substr($path,0,$slash+1);
		return $origin.$dir.$src;
	}

	/**
	 * Processes the image mapping files found in the cache directory.
	 *
	 * NOTE(review): mapping files are not removed after processing, so every
	 * call handles all of them again (capped by getFile()) - confirm that the
	 * image endpoint tolerates repeats, or delete files once handled.
	 *
	 * @param int $local non-zero downloads the images into the cache directory;
	 *                   0 (default) posts each mapping list to the image endpoint
	 * @return array raw endpoint responses, one per submitted mapping file
	 */
	public function img($local=0){
		$result=array();
		foreach($this->getFile($this->imgfiles) AS $f){
			// Only *.txt mapping files matter; images downloaded in local
			// mode live in the same directory and must not be read as JSON.
			if(!$f OR substr($f,-4)!=='.txt'){
				continue;
			}
			$raw=file_get_contents($this->imgfiles.$f);
			$imgs=($raw===false) ? null : json_decode($raw,TRUE);
			if(empty($imgs) OR !is_array($imgs)){
				continue;
			}
			if($local){
				foreach($imgs AS $img){
					if(!is_array($img) OR empty($img['old'])){
						continue;
					}
					$data=file_get_contents($img['old']);
					if($data===false){
						continue; // do not leave an empty file behind for a failed download
					}
					file_put_contents($this->imgfiles.basename($img['old']),$data);
				}
			}else{
				$params=array(
					'vercode'=>$this->keyword,
					'imgs'=>$imgs,
				);
				$result[]=$this->post_url_contents($this->img_url,$params);
			}
		}
		return $result;
	}

	/**
	 * Asks the check endpoint about a title.
	 *
	 * @param string $title
	 * @return bool TRUE only when the endpoint answers "[no]"; any other answer,
	 *              including a failed request, yields FALSE
	 */
	private function check($title){
		$checkparams=array(
			'post_title'=>$title,
			'vercode'=>$this->keyword
		);
		$checked=$this->post_url_contents($this->check_url,$checkparams);
		return is_string($checked) AND trim($checked)==='[no]';
	}

	/**
	 * Lists up to 100 entries of a directory whose name contains a dot that is
	 * not the first character (this skips ".", "..", dot-files and names
	 * without an extension).
	 *
	 * @param string $dir
	 * @return array file names; empty when the directory cannot be read
	 */
	private function getFile($dir) {
		$fileArray=array();
		if(!is_dir($dir)){
			return $fileArray;
		}
		$handle=opendir($dir);
		if($handle===false){
			return $fileArray;
		}
		while(count($fileArray)<100 AND false!==($file=readdir($handle))){
			if(strpos($file,'.')>0){
				$fileArray[]=$file;
			}
		}
		closedir($handle);
		return $fileArray;
	}

	/**
	 * Sends a form-encoded POST request.
	 *
	 * TLS certificate verification is left at cURL's defaults: the request
	 * carries the account password and the shared secret, and the endpoint
	 * URLs built by the constructor are http:// anyway.
	 *
	 * @param string $url
	 * @param array  $params
	 * @return mixed response body, or false on failure
	 */
	private function post_url_contents($url,$params){
		$oCurl=curl_init();
		curl_setopt($oCurl,CURLOPT_URL,$url);
		curl_setopt($oCurl,CURLOPT_RETURNTRANSFER,1);
		curl_setopt($oCurl,CURLOPT_POST,true);
		curl_setopt($oCurl,CURLOPT_POSTFIELDS,http_build_query($params));
		curl_setopt($oCurl,CURLOPT_CONNECTTIMEOUT,15); // do not hang forever on an unreachable host
		$sContent=curl_exec($oCurl);
		curl_close($oCurl);
		return $sContent;
	}

	/**
	 * Fetches a page with GET and returns its HTML after cleanHtml().
	 *
	 * @param string $url
	 * @param mixed  $params optional array of query parameters
	 * @return string cleaned HTML, '' when the request failed
	 */
	public function get_url_contents($url,$params=''){
		if(is_array($params) AND !empty($params)){
			// URL-encode the values and respect an existing query string.
			$glue=(strpos($url,'?')===false) ? '?' : '&';
			$url.=$glue.http_build_query($params,'','&');
		}

		// Random address sent in the forwarding headers.
		$ip=rand(1,255).".".rand(1,255).".".rand(1,255).".".rand(1,255);
		$headerArr=array(
			'Accept-Language:zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3',
			'User-Agent:Mozilla/5.0 (Windows NT 6.1; rv:38.0) Gecko/20100101 Firefox/38.0',
			'X-FORWARDED-FOR:'.$ip,
			'CLIENT-IP:'.$ip,
		);

		$ch=curl_init();
		curl_setopt($ch,CURLOPT_REFERER,"http://www.baidu.com/");
		curl_setopt($ch,CURLOPT_AUTOREFERER,1); // set Referer automatically on redirects
		curl_setopt($ch,CURLOPT_HTTPHEADER,$headerArr);
		curl_setopt($ch,CURLOPT_URL,$url);
		curl_setopt($ch,CURLOPT_TIMEOUT,30); // overall time limit in seconds
		curl_setopt($ch,CURLOPT_HEADER,0); // do not include response headers in the output
		curl_setopt($ch,CURLOPT_RETURNTRANSFER,1); // return the body instead of printing it
		curl_setopt($ch,CURLOPT_FOLLOWLOCATION,1);
		$html=curl_exec($ch);
		curl_close($ch);

		if($html===false){
			return '';
		}
		return $this->cleanHtml($html);
	}

	/**
	 * Collapses whitespace and strips document-level, active and styling markup.
	 *
	 * Rules run in the listed order. Tag names end with \b so that, for
	 * example, the "head" rule no longer removes <header> tags.
	 *
	 * @param string $html
	 * @return string
	 */
	private function cleanHtml($html){
		$rules=array(
			'/\s+/'                                    => ' ',  // collapse whitespace, including line breaks
			'/< +/'                                    => '<',  // "<   tag" => "<tag"
			'/<!--.*?-->/s'                            => '',   // comments
			'/<!.*?>/s'                                => '',   // DOCTYPE and similar declarations
			'/<\/?html\b.*?>/si'                       => '',
			'/<\/?head\b.*?>/si'                       => '',
			'/<\/?meta\b.*?>/si'                       => '',
			'/<\/?body\b.*?>/si'                       => '',
			'/<\/?link\b.*?>/si'                       => '',
			'/<\/?form\b.*?>/si'                       => '',
			'/cookie/si'                               => 'COOKIE',
			'/<applet\b.*?>.*?<\/applet\b.*?>/si'      => '',
			'/<\/?applet\b.*?>/si'                     => '',
			'/<style\b.*?>.*?<\/style\b.*?>/si'        => '',
			'/<\/?style\b.*?>/si'                      => '',
			'/(style=")[^"]*"/i'                       => '$1"', // empty inline style attributes
			'/<title\b.*?>.*?<\/title\b.*?>/si'        => '',
			'/<\/?title\b.*?>/si'                      => '',
			'/<object\b.*?>.*?<\/object\b.*?>/si'      => '',
			'/<\/?object\b.*?>/si'                     => '',
			'/<noframes\b.*?>.*?<\/noframes\b.*?>/si'  => '',
			'/<\/?noframes\b.*?>/si'                   => '',
			'/<i?frame(?:set)?\b.*?>.*?<\/i?frame(?:set)?\b.*?>/si' => '',
			'/<\/?i?frame(?:set)?\b.*?>/si'            => '',
			'/<script\b.*?>.*?<\/script\b.*?>/si'      => '',
			'/<\/?script\b.*?>/si'                     => '',
			'/javascript/si'                           => 'Javascript',
			'/vbscript/si'                             => 'Vbscript',
			'/on([a-z]+)\s*=/si'                       => 'On\\1=', // rewrite on*= so it is no longer a lowercase handler attribute
			'/&#/'                                     => '&＃',
			'/\s*(<br\s*\/?\s*>\s*){2,}/im'            => '\\1\\1', // at most two consecutive <br>
			'/(<p>(\s|\s*<br\s*\/?\s*>\s*)*<\/p>)+/im' => '',       // empty paragraphs
		);
		$clean=preg_replace(array_keys($rules),array_values($rules),$html);
		// preg_replace() returns null on a PCRE error.
		return ($clean===null) ? '' : $clean;
	}
}

<?php
/**
 * Semi-manual scraper client that pushes scraped articles to a WordPress site.
 *
 * The class talks to three custom endpoints under /wp-admin/ (etpost.php,
 * etchk.php, etimg.php). Those endpoints are not part of this file; everything
 * said about them here is limited to the request parameters this class sends.
 *
 * NOTE(review): the account, password and shared secret are hard-coded below
 * and are sent over plain http:// - move them to configuration before reuse.
 */
class caiji{
	private $url='zlkb.net'; // target WordPress domain (host only, no scheme)
	private $user='admin'; // account, sent as "log"
	private $pwd='admin'; // password, sent as "pwd"
	private $keyword='dc#4fsadfasfok@!$'; // shared secret, sent as "vercode"
	private $post_url;  // endpoint used to publish an article
	private $check_url; // endpoint used to check whether a title already exists
	private $img_url;   // endpoint used to submit image mapping lists
	private $img_path;  // public URL prefix of the current year/month upload folder
	private $imgfiles;  // local directory that stores image mapping files

	/**
	 * Builds the endpoint URLs, the upload URL prefix and the cache directory path.
	 */
	function __construct(){
		// Define PATH only once: a second `new caiji()` would otherwise raise
		// a "constant already defined" warning.
		if(!defined('PATH')){
			define('PATH', realpath(dirname(__FILE__)));
		}
		$base="http://{$this->url}";
		$this->post_url="{$base}/wp-admin/etpost.php";
		$this->check_url="{$base}/wp-admin/etchk.php";
		$this->img_url="{$base}/wp-admin/etimg.php";
		// date('m') is already zero-padded, so no extra str_pad() is needed.
		$this->img_path="{$base}/wp-content/uploads/".date('Y').'/'.date('m').'/';
		$this->imgfiles=PATH.'/cache/';
	}

	/**
	 * Publishes one scraped article.
	 *
	 * @param string $title     article title
	 * @param string $content   article HTML
	 * @param mixed  $cateid    category id on the target blog
	 * @param string $url       source URL of the article; used to resolve relative
	 *                          image paths and to replace the source host in the content
	 * @param int    $publicimg 1 (default) submits the image mapping files right away, 0 does not
	 * @return mixed raw response of the publish endpoint (string, or false when the
	 *               request failed), or array('code'=>0, ...) when check() did not
	 *               report the title as new
	 */
	public function post($title,$content,$cateid,$url,$publicimg=1){
		// Keep a raw backup of the scraped article in the current working directory.
		$backup=array(
			'title'=>$title,
			'content'=>$content,
			'cateid'=>$cateid,
			'url'=>$url
		);
		file_put_contents(time().'.txt',json_encode($backup));
		sleep(1); // presumably keeps the time()-based backup names unique between calls

		// Check for a duplicate first, so no image work is done for an
		// article that is not going to be published anyway.
		if(!$this->check($title)){
			return array('code'=>0,'title'=>$title,'msg'=>'已经存在');
		}

		// Point <img> tags at this blog and remember where the originals live.
		$content=$this->relocateImages($content,$url);
		if($publicimg){
			$this->img();
		}

		// Replace the source host with this blog's host in the remaining content.
		$host=$url ? parse_url($url,PHP_URL_HOST) : null;
		if($host){
			$content=str_replace($host,$this->url,$content);
		}

		$params=array(
			'log'=>$this->user,
			'pwd'=>$this->pwd,
			'post_title'=>$title,
			'content'=>$content,
			'vercode'=>$this->keyword,
			'post_status'=>'publish',
			'post_category'=>array(1=>$cateid),
		);
		return $this->post_url_contents($this->post_url,$params);
	}

	/**
	 * Rewrites gif/jpg/jpeg/png <img> sources to the blog's upload folder and
	 * stores an old-URL => new-URL mapping file in the cache directory.
	 *
	 * @param string $content article HTML
	 * @param string $url     source URL of the article (base for relative paths)
	 * @return string content with rewritten image URLs
	 */
	private function relocateImages($content,$url){
		// Alternation instead of the former character classes, which matched
		// single characters only (e.g. any tag starting with "<i" or "<m").
		$pattern='/<img(?:\s[^>]*?)?\ssrc\s*=\s*[\'"]([^\'"<>]+?\.(?:gif|jpe?g|png))[\'"]/i';
		if(!preg_match_all($pattern,$content,$match) OR empty($match[1])){
			return $content;
		}
		$replace=array();
		$imgs=array();
		foreach(array_unique($match[1]) AS $src){
			$newimg=$this->img_path.basename($src);
			$replace[$src]=$newimg;
			$imgs[]=array('old'=>$this->absoluteUrl($src,$url),'new'=>$newimg);
		}
		// strtr() replaces in one pass (longest key first), so a URL that was
		// already rewritten is never rewritten a second time.
		$content=strtr($content,$replace);

		if(!is_dir($this->imgfiles)){
			mkdir($this->imgfiles,0755,true);
		}
		// One file per article; a unique suffix replaces FILE_APPEND, which
		// could glue two JSON documents together into an unreadable file.
		$mapfile=$this->imgfiles.date('YmdHis').'_'.uniqid().'.txt';
		file_put_contents($mapfile,json_encode($imgs),LOCK_EX);
		return $content;
	}

	/**
	 * Resolves an image reference against the page it was found on.
	 * "../" segments are not normalised.
	 *
	 * @param string $src  image URL as written in the HTML
	 * @param string $base URL of the page
	 * @return string absolute URL, or $src unchanged when it cannot be resolved
	 */
	private function absoluteUrl($src,$base){
		$src=(string)$src;
		if($src===''){
			return $src;
		}
		$parts=parse_url($src);
		if(!is_array($parts) OR isset($parts['scheme'])){
			return $src;
		}
		$b=$base ? parse_url($base) : false;
		if(!is_array($b) OR empty($b['scheme']) OR empty($b['host'])){
			return $src;
		}
		if(isset($parts['host'])){
			// protocol-relative reference such as //cdn.example.com/a.png
			return $b['scheme'].':'.$src;
		}
		$origin=$b['scheme'].'://'.$b['host'].(isset($b['port']) ? ':'.$b['port'] : '');
		if($src[0]==='/'){
			return $origin.$src;
		}
		// Path-relative reference: resolve against the directory of the page.
		$path=isset($b['path']) ? $b['path'] : '/';
		$slash=strrpos($path,'/');
		$dir=($slash===false) ? '/' : substr($path,0,$slash+1);
		return $origin.$dir.$src;
	}

	/**
	 * Processes the image mapping files found in the cache directory.
	 *
	 * NOTE(review): mapping files are not removed after processing, so every
	 * call handles all of them again (capped by getFile()) - confirm that the
	 * image endpoint tolerates repeats, or delete files once handled.
	 *
	 * @param int $local non-zero downloads the images into the cache directory;
	 *                   0 (default) posts each mapping list to the image endpoint
	 * @return array raw endpoint responses, one per submitted mapping file
	 */
	public function img($local=0){
		$result=array();
		foreach($this->getFile($this->imgfiles) AS $f){
			// Only *.txt mapping files matter; images downloaded in local
			// mode live in the same directory and must not be read as JSON.
			if(!$f OR substr($f,-4)!=='.txt'){
				continue;
			}
			$raw=file_get_contents($this->imgfiles.$f);
			$imgs=($raw===false) ? null : json_decode($raw,TRUE);
			if(empty($imgs) OR !is_array($imgs)){
				continue;
			}
			if($local){
				foreach($imgs AS $img){
					if(!is_array($img) OR empty($img['old'])){
						continue;
					}
					$data=file_get_contents($img['old']);
					if($data===false){
						continue; // do not leave an empty file behind for a failed download
					}
					file_put_contents($this->imgfiles.basename($img['old']),$data);
				}
			}else{
				$params=array(
					'vercode'=>$this->keyword,
					'imgs'=>$imgs,
				);
				$result[]=$this->post_url_contents($this->img_url,$params);
			}
		}
		return $result;
	}

	/**
	 * Asks the check endpoint about a title.
	 *
	 * @param string $title
	 * @return bool TRUE only when the endpoint answers "[no]"; any other answer,
	 *              including a failed request, yields FALSE
	 */
	private function check($title){
		$checkparams=array(
			'post_title'=>$title,
			'vercode'=>$this->keyword
		);
		$checked=$this->post_url_contents($this->check_url,$checkparams);
		return is_string($checked) AND trim($checked)==='[no]';
	}

	/**
	 * Lists up to 100 entries of a directory whose name contains a dot that is
	 * not the first character (this skips ".", "..", dot-files and names
	 * without an extension).
	 *
	 * @param string $dir
	 * @return array file names; empty when the directory cannot be read
	 */
	private function getFile($dir) {
		$fileArray=array();
		if(!is_dir($dir)){
			return $fileArray;
		}
		$handle=opendir($dir);
		if($handle===false){
			return $fileArray;
		}
		while(count($fileArray)<100 AND false!==($file=readdir($handle))){
			if(strpos($file,'.')>0){
				$fileArray[]=$file;
			}
		}
		closedir($handle);
		return $fileArray;
	}

	/**
	 * Sends a form-encoded POST request.
	 *
	 * TLS certificate verification is left at cURL's defaults: the request
	 * carries the account password and the shared secret, and the endpoint
	 * URLs built by the constructor are http:// anyway.
	 *
	 * @param string $url
	 * @param array  $params
	 * @return mixed response body, or false on failure
	 */
	private function post_url_contents($url,$params){
		$oCurl=curl_init();
		curl_setopt($oCurl,CURLOPT_URL,$url);
		curl_setopt($oCurl,CURLOPT_RETURNTRANSFER,1);
		curl_setopt($oCurl,CURLOPT_POST,true);
		curl_setopt($oCurl,CURLOPT_POSTFIELDS,http_build_query($params));
		curl_setopt($oCurl,CURLOPT_CONNECTTIMEOUT,15); // do not hang forever on an unreachable host
		$sContent=curl_exec($oCurl);
		curl_close($oCurl);
		return $sContent;
	}

	/**
	 * Fetches a page with GET and returns its HTML after cleanHtml().
	 *
	 * @param string $url
	 * @param mixed  $params optional array of query parameters
	 * @return string cleaned HTML, '' when the request failed
	 */
	public function get_url_contents($url,$params=''){
		if(is_array($params) AND !empty($params)){
			// URL-encode the values and respect an existing query string.
			$glue=(strpos($url,'?')===false) ? '?' : '&';
			$url.=$glue.http_build_query($params,'','&');
		}

		// Random address sent in the forwarding headers.
		$ip=rand(1,255).".".rand(1,255).".".rand(1,255).".".rand(1,255);
		$headerArr=array(
			'Accept-Language:zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3',
			'User-Agent:Mozilla/5.0 (Windows NT 6.1; rv:38.0) Gecko/20100101 Firefox/38.0',
			'X-FORWARDED-FOR:'.$ip,
			'CLIENT-IP:'.$ip,
		);

		$ch=curl_init();
		curl_setopt($ch,CURLOPT_REFERER,"http://www.baidu.com/");
		curl_setopt($ch,CURLOPT_AUTOREFERER,1); // set Referer automatically on redirects
		curl_setopt($ch,CURLOPT_HTTPHEADER,$headerArr);
		curl_setopt($ch,CURLOPT_URL,$url);
		curl_setopt($ch,CURLOPT_TIMEOUT,30); // overall time limit in seconds
		curl_setopt($ch,CURLOPT_HEADER,0); // do not include response headers in the output
		curl_setopt($ch,CURLOPT_RETURNTRANSFER,1); // return the body instead of printing it
		curl_setopt($ch,CURLOPT_FOLLOWLOCATION,1);
		$html=curl_exec($ch);
		curl_close($ch);

		if($html===false){
			return '';
		}
		return $this->cleanHtml($html);
	}

	/**
	 * Collapses whitespace and strips document-level, active and styling markup.
	 *
	 * Rules run in the listed order. Tag names end with \b so that, for
	 * example, the "head" rule no longer removes <header> tags.
	 *
	 * @param string $html
	 * @return string
	 */
	private function cleanHtml($html){
		$rules=array(
			'/\s+/'                                    => ' ',  // collapse whitespace, including line breaks
			'/< +/'                                    => '<',  // "<   tag" => "<tag"
			'/<!--.*?-->/s'                            => '',   // comments
			'/<!.*?>/s'                                => '',   // DOCTYPE and similar declarations
			'/<\/?html\b.*?>/si'                       => '',
			'/<\/?head\b.*?>/si'                       => '',
			'/<\/?meta\b.*?>/si'                       => '',
			'/<\/?body\b.*?>/si'                       => '',
			'/<\/?link\b.*?>/si'                       => '',
			'/<\/?form\b.*?>/si'                       => '',
			'/cookie/si'                               => 'COOKIE',
			'/<applet\b.*?>.*?<\/applet\b.*?>/si'      => '',
			'/<\/?applet\b.*?>/si'                     => '',
			'/<style\b.*?>.*?<\/style\b.*?>/si'        => '',
			'/<\/?style\b.*?>/si'                      => '',
			'/(style=")[^"]*"/i'                       => '$1"', // empty inline style attributes
			'/<title\b.*?>.*?<\/title\b.*?>/si'        => '',
			'/<\/?title\b.*?>/si'                      => '',
			'/<object\b.*?>.*?<\/object\b.*?>/si'      => '',
			'/<\/?object\b.*?>/si'                     => '',
			'/<noframes\b.*?>.*?<\/noframes\b.*?>/si'  => '',
			'/<\/?noframes\b.*?>/si'                   => '',
			'/<i?frame(?:set)?\b.*?>.*?<\/i?frame(?:set)?\b.*?>/si' => '',
			'/<\/?i?frame(?:set)?\b.*?>/si'            => '',
			'/<script\b.*?>.*?<\/script\b.*?>/si'      => '',
			'/<\/?script\b.*?>/si'                     => '',
			'/javascript/si'                           => 'Javascript',
			'/vbscript/si'                             => 'Vbscript',
			'/on([a-z]+)\s*=/si'                       => 'On\\1=', // rewrite on*= so it is no longer a lowercase handler attribute
			'/&#/'                                     => '&＃',
			'/\s*(<br\s*\/?\s*>\s*){2,}/im'            => '\\1\\1', // at most two consecutive <br>
			'/(<p>(\s|\s*<br\s*\/?\s*>\s*)*<\/p>)+/im' => '',       // empty paragraphs
		);
		$clean=preg_replace(array_keys($rules),array_values($rules),$html);
		// preg_replace() returns null on a PCRE error.
		return ($clean===null) ? '' : $clean;
	}
}

第2个文件index.php

<?php
// Driver script: walks the list pages of one source category, extracts each
// article body and hands it to caiji::post().
require_once('caijiClass.php');
$c=new caiji();
//$c->img(1);exit();

// Category id on the target blog that receives the posts.
// Ids used on earlier runs: 5 = php, 408 = hosting, 81 = wordpress, 214 = news.
$cateid='51';//linux

// List-page rule: capture 1 = article URL, 2 = title attribute, 3 = link text.
$cate_reg = '/<h2><a href=\"(.*?)\" rel=\"bookmark\" title=\"(.*?)\".*?>(.*?)<\/a><\/h2>/';

// Article-body rule for the current source site. Rules kept from other sites:
//   '/<div.*?class=\"entry-content\".*?>(.*?)<footer class=\"entry-meta\".*?>/'
//   '/<div class=\"single-content\".*?>.*?<div class=\"ad-pc ad-site\".*?>(.*?)<div class=\"ad-pc ad-site\".*?>/'
$reg='/<div id=\"singlead\">.*?<\/div>(.*?)<div class=\"clear\">/';

	for($i=1;$i<=6;$i++){
		// Fetch one list page of the source category.
		$cate_url="http://www.sijitao.net/category/os/page/{$i}/";
		$cate_html= $c->get_url_contents($cate_url,array());
		preg_match_all($cate_reg , $cate_html , $cate_matches);
		//print_r( $cate_matches);exit();

		if(!empty($cate_matches[1]) AND !empty($cate_matches[2])){
			foreach($cate_matches[1] AS $key=>$url){
				// The first list match is skipped, as in the original script.
				if($key<1){
					continue;
				}
				$title=isset($cate_matches[3][$key])?$cate_matches[3][$key]:'';

				// Fetch the article; get_url_contents() already cleans the HTML.
				$html= $c->get_url_contents($url,array());

				$content='';
				if(preg_match($reg , $html , $matches)){
					// Remove stray closing </div> tags at both ends. trim($content,'</div>')
					// treated its argument as a character list and also ate the
					// "<", ">", "/", "d", "i", "v" of real tags such as <p> or <div>.
					$content=preg_replace('/^(?:\s*<\/div>)+\s*|\s*(?:<\/div>\s*)+$/i','',trim($matches[1]));
				}

				if($content AND $title){
					$c->post($title,$content,$cateid,$url,1);
					//print_r(array('title'=>$title,'content'=>$content,'r'=>$r));
				}
				//exit();
				sleep(1);
			}
		}
		// NOTE(review): this exit() ends the script after the first list page, so
		// pages 2-6 of the loop are never fetched - confirm whether it is intentional.
		exit();
	}
 
 
/***************************************www.free521.com**************************************************************
	for($i=9;$i<=10;$i++){
		//根据分类列表采集
		$cate_url="http://www.free521.com/category/news/page/{$i}/";   //列表规则 
		$cate_reg = '/<h2 class=\"entry-title\".*?><a href=\"(.*?)\" rel=\"bookmark\".*?>(.*?)<\/a><\/h2>/';//列表规则 
		$cate_html= $c->get_url_contents($cate_url,array());
		preg_match_all($cate_reg , $cate_html , $cate_matches); 
		if(isset($cate_matches[1]) And !empty($cate_matches[1]) AND isset($cate_matches[2]) And !empty($cate_matches[2])){
			foreach($cate_matches[1] AS $key=>$url){
 
				$title=isset($cate_matches[2][$key])?$cate_matches[2][$key]:'';
				//开始采集数据并进行初步过滤
				$html= $c->get_url_contents($url,array());
 
 
				//删除
				//$html=preg_replace("/<div class=\"ad300\"><\/div>/","",$html);	
				$reg = '/<div.*?class=\"entry-content\".*?>(.*?)<footer class=\"entry-meta\".*?>/';//列表规则 
				$reg = '/<div class=\"single-content\".*?>.*?<div class=\"ad-pc ad-site\".*?>(.*?)<div class=\"ad-pc ad-site\".*?>/';//列表规则 
 
				preg_match_all($reg , $html , $matches);
 
				$content=$matches[1][0];
				$content=trim($content);
				$content=trim($content,'</div>');
 
 
				print_r(array('title'=>$title,'content'=>$content));
				if($content AND $title){
					$c->post($title,$content,$cateid,$url,1);
					//print_r(array('title'=>$title,'content'=>$content,'r'=>$r));
				}
				//exit();
				sleep(1);
			}
		}
		exit();
	}
******************************************************************************************************/
 
/***************************************www.widuu.com**************************************************************
	for($i=9;$i<=11;$i++){
		//根据分类列表采集
		$cate_url="http://www.widuu.com/archives/category/wdphp/page/{$i}/";  //列表规则 
		$cate_reg = '#<h2><a href="(.*?)" title="(.*?)">(.*?)</a></h2>#';//列表规则 
		$cate_html= get_url_contents($cate_url,array());
		//$cate_html=mb_convert_encoding($cate_html, 'utf-8', 'gbk');
		preg_match_all($cate_reg , $cate_html , $cate_matches); 
 
		if(isset($cate_matches[1]) And !empty($cate_matches[1]) AND isset($cate_matches[2]) And !empty($cate_matches[2])){
			foreach($cate_matches[1] AS $key=>$url){
				$title=isset($cate_matches[3][$key])?$cate_matches[3][$key]:'';
 
				//开始采集数据并进行初步过滤
				$html= get_url_contents($url,array());
				//$html=mb_convert_encoding($html, 'utf-8', 'gbk');
				$html=preg_replace("/\s+/", " ", $html); //过滤多余回车 
				$html=preg_replace("/<[ ]+/si","<",$html); //过滤<__("<"号后面带空格) 
				$html=preg_replace("/<\!--.*?-->/si","",$html); //注释 
				$html=preg_replace("/<(\!.*?)>/si","",$html); //过滤DOCTYPE 
				$html=preg_replace("/<(\/?html.*?)>/si","",$html); //过滤html标签 
				$html=preg_replace("/<(\/?head.*?)>/si","",$html); //过滤head标签 
				$html=preg_replace("/<(\/?meta.*?)>/si","",$html); //过滤meta标签 
				$html=preg_replace("/<(\/?body.*?)>/si","",$html); //过滤body标签 
				$html=preg_replace("/<(\/?link.*?)>/si","",$html); //过滤link标签 
				$html=preg_replace("/<(\/?form.*?)>/si","",$html); //过滤form标签 
				$html=preg_replace("/cookie/si","COOKIE",$html); //过滤COOKIE标签 
				$html=preg_replace("/<(applet.*?)>(.*?)<(\/applet.*?)>/si","",$html); //过滤applet标签 
				$html=preg_replace("/<(\/?applet.*?)>/si","",$html); //过滤applet标签 
				$html=preg_replace("/<(style.*?)>(.*?)<(\/style.*?)>/si","",$html); //过滤style标签 
				$html=preg_replace("/<(\/?style.*?)>/si","",$html); //过滤style标签 
				$html=preg_replace('/(.*?style=\\").*?(\\".*?)/i','$1$2',$html);
				$html=preg_replace("/<(title.*?)>(.*?)<(\/title.*?)>/si","",$html); //过滤title标签 
				$html=preg_replace("/<(\/?title.*?)>/si","",$html); //过滤title标签 
				$html=preg_replace("/<(object.*?)>(.*?)<(\/object.*?)>/si","",$html); //过滤object标签 
				$html=preg_replace("/<(\/?objec.*?)>/si","",$html); //过滤object标签 
				$html=preg_replace("/<(noframes.*?)>(.*?)<(\/noframes.*?)>/si","",$html); //过滤noframes标签 
				$html=preg_replace("/<(\/?noframes.*?)>/si","",$html); //过滤noframes标签 
				$html=preg_replace("/<(i?frame.*?)>(.*?)<(\/i?frame.*?)>/si","",$html); //过滤frame标签 
				$html=preg_replace("/<(\/?i?frame.*?)>/si","",$html); //过滤frame标签 
				$html=preg_replace("/<(script.*?)>(.*?)<(\/script.*?)>/si","",$html); //过滤script标签 
				$html=preg_replace("/<(\/?script.*?)>/si","",$html); //过滤script标签 
				$html=preg_replace("/javascript/si","Javascript",$html); //过滤script标签 
				$html=preg_replace("/vbscript/si","Vbscript",$html); //过滤script标签 
				$html=preg_replace("/on([a-z]+)\s*=/si","On\\1=",$html); //过滤script标签 
				$html=preg_replace("/&#/si","&＃",$html); //过滤script标签，如javAsCript:alert( 			
				$html=preg_replace("/[\t\n\r]+/","",$html);
				$html = preg_replace(array('/\s*(<br\s*\/?\s*>\s*){2,}/im','/(<p>(\s|\s*<br\s*\/?\s*>\s*)*<\/p>)+/im'),array('\\1\\1',''),$html);
 
				//删除
				$html=preg_replace("/<div class=\"ad300\"><\/div>/","",$html);	
				$reg = '/<div class=\"content\".*?>(.*?)<\/div>/';//列表规则 
				$reg = '/<article class=\"article-content\".*?>(.*?)<\/article>/';//列表规则 
 
				preg_match_all($reg , $html , $matches);
 
				$content=$matches[1][0];
				if($content AND $title){
					//print_r(array('title'=>$title,'content'=>$content));
					echo $c->post($title,$content,$cateid,$url);
				}
				sleep(1);
			}
		}
	}
******************************************************************************************************/
 
 
/***************************************www.xker.com***************************************************************
	//根据分类列表采集
    $cate_url="http://www.xker.com/tag.php?/DirectAdmin/";  //列表规则 
	$cate_reg = '#<dd><a href="(.*?)">(.*?)</a><span>(.*?)</span></dd>#';//列表规则 
	$cate_html= get_url_contents($cate_url,array());
	$cate_html=mb_convert_encoding($cate_html, 'utf-8', 'gbk');
	preg_match_all($cate_reg , $cate_html , $cate_matches); 
 
	if(isset($cate_matches[1]) And !empty($cate_matches[1]) AND isset($cate_matches[2]) And !empty($cate_matches[2])){
		foreach($cate_matches[1] AS $key=>$url){
			$title=isset($cate_matches[2][$key])?$cate_matches[2][$key]:'';
 
			//开始采集数据并进行初步过滤
			$html= get_url_contents($url,array());
			$html=mb_convert_encoding($html, 'utf-8', 'gbk');
			$html=preg_replace("/\s+/", " ", $html); //过滤多余回车 
			$html=preg_replace("/<[ ]+/si","<",$html); //过滤<__("<"号后面带空格) 
			$html=preg_replace("/<\!--.*?-->/si","",$html); //注释 
			$html=preg_replace("/<(\!.*?)>/si","",$html); //过滤DOCTYPE 
			$html=preg_replace("/<(\/?html.*?)>/si","",$html); //过滤html标签 
			$html=preg_replace("/<(\/?head.*?)>/si","",$html); //过滤head标签 
			$html=preg_replace("/<(\/?meta.*?)>/si","",$html); //过滤meta标签 
			$html=preg_replace("/<(\/?body.*?)>/si","",$html); //过滤body标签 
			$html=preg_replace("/<(\/?link.*?)>/si","",$html); //过滤link标签 
			$html=preg_replace("/<(\/?form.*?)>/si","",$html); //过滤form标签 
			$html=preg_replace("/cookie/si","COOKIE",$html); //过滤COOKIE标签 
			$html=preg_replace("/<(applet.*?)>(.*?)<(\/applet.*?)>/si","",$html); //过滤applet标签 
			$html=preg_replace("/<(\/?applet.*?)>/si","",$html); //过滤applet标签 
			$html=preg_replace("/<(style.*?)>(.*?)<(\/style.*?)>/si","",$html); //过滤style标签 
			$html=preg_replace("/<(\/?style.*?)>/si","",$html); //过滤style标签 
			$html=preg_replace('/(.*?style=\\").*?(\\".*?)/i','$1$2',$html);
			$html=preg_replace("/<(title.*?)>(.*?)<(\/title.*?)>/si","",$html); //过滤title标签 
			$html=preg_replace("/<(\/?title.*?)>/si","",$html); //过滤title标签 
			$html=preg_replace("/<(object.*?)>(.*?)<(\/object.*?)>/si","",$html); //过滤object标签 
			$html=preg_replace("/<(\/?objec.*?)>/si","",$html); //过滤object标签 
			$html=preg_replace("/<(noframes.*?)>(.*?)<(\/noframes.*?)>/si","",$html); //过滤noframes标签 
			$html=preg_replace("/<(\/?noframes.*?)>/si","",$html); //过滤noframes标签 
			$html=preg_replace("/<(i?frame.*?)>(.*?)<(\/i?frame.*?)>/si","",$html); //过滤frame标签 
			$html=preg_replace("/<(\/?i?frame.*?)>/si","",$html); //过滤frame标签 
			$html=preg_replace("/<(script.*?)>(.*?)<(\/script.*?)>/si","",$html); //过滤script标签 
			$html=preg_replace("/<(\/?script.*?)>/si","",$html); //过滤script标签 
			$html=preg_replace("/javascript/si","Javascript",$html); //过滤script标签 
			$html=preg_replace("/vbscript/si","Vbscript",$html); //过滤script标签 
			$html=preg_replace("/on([a-z]+)\s*=/si","On\\1=",$html); //过滤script标签 
			$html=preg_replace("/&#/si","&＃",$html); //过滤script标签，如javAsCript:alert( 			
			$html=preg_replace("/[\t\n\r]+/","",$html);
			$html = preg_replace(array('/\s*(<br\s*\/?\s*>\s*){2,}/im','/(<p>(\s|\s*<br\s*\/?\s*>\s*)*<\/p>)+/im'),array('\\1\\1',''),$html);
 
			//删除
			$html=preg_replace("/<div class=\"ad300\"><\/div>/","",$html);	
			$reg = '/<div class=\"content\".*?>(.*?)<\/div>/';//列表规则 
			preg_match_all($reg , $html , $matches);
			$content=$matches[1][0];
			if($content AND $title){
				//print_r(array('title'=>$title,'content'=>$content));
				echo $c->post($title,$content,$cateid,$url);
			}
			sleep(1);
		}
	}
******************************************************************************************************/
 
 
/***************************************www.cloudvps.cc***************************************************************
$domain='www.cloudvps.cc';//目标域名
$start=1;
$end=1;
$cat=8;//目标分类ID
$cateid='51';//本站分类ID
//采集
for($i=$start;$i<=$end;$i++){
	//根据分类列表采集
    $cate_url="http://{$domain}/category-{$cat}_{$i}.html";  //列表规则 
	$cate_reg = '#<h2><a href="(.*?)" title="(.*?)">(.*?)</a></h2>#';//列表规则 
	$reg = '#<h1 class="article-title"><a href="(.*?)">(.*?)</a></h1>#';//内容 
 
	$cate_html= get_url_contents($cate_url,array());
	preg_match_all($cate_reg , $cate_html , $cate_matches); 
 
	if(isset($cate_matches[1]) And !empty($cate_matches[1]) AND isset($cate_matches[2]) And !empty($cate_matches[2])){
		foreach($cate_matches[1] AS $url){
			//采集文章
			$html= get_url_contents($url,array());
			preg_match_all($reg , $html , $matches);
			$html_start=strpos($html,'<p>'); 
			$html_end= strrpos($html,'</article>'); 
			if($html_start>0  AND $html_end>$html_start){
				$content=substr($html,$html_start,$html_end-$html_start);//从指定位置开始截取字符串，可以指定截取的长度。
				$title=isset($matches[2][0])?$matches[2][0]:'';
				if($content AND $title){
					echo $c->post($title,$content,$cateid,$url);
				}
				sleep(1);
			}
		}
	}
}
******************************************************************************************************/

<?php
/**
 * index.php - driver script for the semi-manual WordPress scraper.
 *
 * Walks the paginated category listing of a source blog, pulls each article's
 * URL and title out of the listing HTML with a regular expression, downloads
 * the article through $c->get_url_contents(), cuts the body out of the page
 * with a second regular expression and hands the result to caiji::post().
 *
 * To scrape a different site, change $cate_url, $cate_reg and $reg below.
 * Rule sets used for earlier sites are archived in the block comments that
 * follow the active loop.
 */
include('caijiClass.php');

$c = new caiji();

// Uncomment to run only $c->img(1) and stop.
//$c->img(1);exit();

// Category ID on this site, passed to caiji::post() as $cateid.
// IDs used so far: '5' php, '408' hosting, '81' wordpress, '214' news, '51' linux.
$cateid = '51';

for ($i = 1; $i <= 6; $i++) {
    // Category listing page to scrape.
    $cate_url = "http://www.sijitao.net/category/os/page/{$i}/";

    // Listing rule: capture 1 = article URL, 2 = title attribute, 3 = link text.
    $cate_reg = '/<h2><a href=\"(.*?)\" rel=\"bookmark\" title=\"(.*?)\".*?>(.*?)<\/a><\/h2>/';

    $cate_html = $c->get_url_contents($cate_url, array());
    preg_match_all($cate_reg, $cate_html, $cate_matches);
    //print_r($cate_matches);exit();

    if (!empty($cate_matches[1]) && !empty($cate_matches[2])) {
        foreach ($cate_matches[1] as $key => $url) {
            // The first listing match is skipped.
            // NOTE(review): presumably a pinned or duplicated entry on the
            // source listing - confirm.
            if ($key > 0) {
                $title = isset($cate_matches[3][$key]) ? $cate_matches[3][$key] : '';

                // Fetch the article page.
                $html = $c->get_url_contents($url, array());
                //$html=preg_replace("/<div class=\"ad300\"><\/div>/","",$html);

                /* Content rules used for other page layouts (kept for reference):
                $reg = '/<div.*?class=\"entry-content\".*?>(.*?)<footer class=\"entry-meta\".*?>/';
                $reg = '/<div class=\"single-content\".*?>.*?<div class=\"ad-pc ad-site\".*?>(.*?)<div class=\"ad-pc ad-site\".*?>/';
                */

                // Content rule: capture 1 = article body.
                // NOTE(review): the pattern has no "s" modifier, so "." does not
                // cross line breaks; this only matches when the fetched HTML
                // carries the body on a single line - confirm what
                // get_url_contents() returns.
                $reg = '/<div id=\"singlead\">.*?<\/div>(.*?)<div class=\"clear\">/';
                preg_match_all($reg, $html, $matches);
                //print_r($matches);exit();

                // Use the first capture only when the rule matched; a page that
                // does not match yields '' and is skipped below instead of
                // raising an undefined-offset notice.
                $content = isset($matches[1][0]) ? trim($matches[1][0]) : '';

                // Drop stray closing </div> tags left at either end by the rule.
                // (trim($content, '</div>') treats its argument as a character
                // set, so it also ate a leading "<div" or the last character of
                // a closing tag such as the final paragraph's.)
                $content = trim((string) preg_replace('#^(?:\s*</div>)+|(?:\s*</div>)+$#i', '', $content));

                if ($content && $title) {
                    $c->post($title, $content, $cateid, $url, 1);
                }
                //exit();

                // Pause between articles.
                sleep(1);
            }
        }
    }

    // NOTE(review): this exit() ends the script after the first listing page,
    // so pages 2-6 are never requested. Presumably deliberate for the
    // semi-manual workflow - confirm before removing.
    exit();
}

/***************************************www.free521.com**************************************************************
for($i=9;$i<=10;$i++){
    // scrape from the category listing page
    $cate_url="http://www.free521.com/category/news/page/{$i}/";
    // listing rule
    $cate_reg = '/<h2 class=\"entry-title\".*?><a href=\"(.*?)\" rel=\"bookmark\".*?>(.*?)<\/a><\/h2>/';// listing rule
    $cate_html= $c->get_url_contents($cate_url,array());
    preg_match_all($cate_reg , $cate_html , $cate_matches);
    if(isset($cate_matches[1]) And !empty($cate_matches[1]) AND isset($cate_matches[2]) And !empty($cate_matches[2])){
        foreach($cate_matches[1] AS $key=>$url){
            $title=isset($cate_matches[2][$key])?$cate_matches[2][$key]:'';
            // fetch the article and apply preliminary filtering
            $html= $c->get_url_contents($url,array());
            // remove
            //$html=preg_replace("/<div class=\"ad300\"><\/div>/","",$html);
            $reg = '/<div.*?class=\"entry-content\".*?>(.*?)<footer class=\"entry-meta\".*?>/';// content rule
            $reg = '/<div class=\"single-content\".*?>.*?<div class=\"ad-pc ad-site\".*?>(.*?)<div class=\"ad-pc ad-site\".*?>/';// content rule
            preg_match_all($reg , $html , $matches);
            $content=$matches[1][0];
            $content=trim($content);
            $content=trim($content,'</div>');
            print_r(array('title'=>$title,'content'=>$content));
            if($content AND $title){
                $c->post($title,$content,$cateid,$url,1);
                //print_r(array('title'=>$title,'content'=>$content,'r'=>$r));
            }
            //exit();
            sleep(1);
        }
    }
    exit();
}
******************************************************************************************************/

/***************************************www.widuu.com**************************************************************
for($i=9;$i<=11;$i++){
    // scrape from the category listing page
    $cate_url="http://www.widuu.com/archives/category/wdphp/page/{$i}/";
    // listing rule
    $cate_reg = '#<h2><a href="(.*?)" title="(.*?)">(.*?)</a></h2>#';// listing rule
    $cate_html= get_url_contents($cate_url,array());
    //$cate_html=mb_convert_encoding($cate_html, 'utf-8', 'gbk');
    preg_match_all($cate_reg , $cate_html , $cate_matches);
    if(isset($cate_matches[1]) And !empty($cate_matches[1]) AND isset($cate_matches[2]) And !empty($cate_matches[2])){
        foreach($cate_matches[1] AS $key=>$url){
            $title=isset($cate_matches[3][$key])?$cate_matches[3][$key]:'';
            // fetch the article and apply preliminary filtering
            $html= get_url_contents($url,array());
            //$html=mb_convert_encoding($html, 'utf-8', 'gbk');
            $html=preg_replace("/\s+/", " ", $html); // collapse extra line breaks / whitespace
            $html=preg_replace("/<[ ]+/si","<",$html); // strip spaces that follow "<"
            $html=preg_replace("/<\!--.*?-->/si","",$html); // HTML comments
            $html=preg_replace("/<(\!.*?)>/si","",$html); // strip DOCTYPE
            $html=preg_replace("/<(\/?html.*?)>/si","",$html); // strip html tags
            $html=preg_replace("/<(\/?head.*?)>/si","",$html); // strip head tags
            $html=preg_replace("/<(\/?meta.*?)>/si","",$html); // strip meta tags
            $html=preg_replace("/<(\/?body.*?)>/si","",$html); // strip body tags
            $html=preg_replace("/<(\/?link.*?)>/si","",$html); // strip link tags
            $html=preg_replace("/<(\/?form.*?)>/si","",$html); // strip form tags
            $html=preg_replace("/cookie/si","COOKIE",$html); // neutralise "cookie"
            $html=preg_replace("/<(applet.*?)>(.*?)<(\/applet.*?)>/si","",$html); // strip applet elements
            $html=preg_replace("/<(\/?applet.*?)>/si","",$html); // strip applet tags
            $html=preg_replace("/<(style.*?)>(.*?)<(\/style.*?)>/si","",$html); // strip style elements
            $html=preg_replace("/<(\/?style.*?)>/si","",$html); // strip style tags
            $html=preg_replace('/(.*?style=\\").*?(\\".*?)/i','$1$2',$html);
            $html=preg_replace("/<(title.*?)>(.*?)<(\/title.*?)>/si","",$html); // strip title elements
            $html=preg_replace("/<(\/?title.*?)>/si","",$html); // strip title tags
            $html=preg_replace("/<(object.*?)>(.*?)<(\/object.*?)>/si","",$html); // strip object elements
            $html=preg_replace("/<(\/?objec.*?)>/si","",$html); // strip object tags
            $html=preg_replace("/<(noframes.*?)>(.*?)<(\/noframes.*?)>/si","",$html); // strip noframes elements
            $html=preg_replace("/<(\/?noframes.*?)>/si","",$html); // strip noframes tags
            $html=preg_replace("/<(i?frame.*?)>(.*?)<(\/i?frame.*?)>/si","",$html); // strip frame elements
            $html=preg_replace("/<(\/?i?frame.*?)>/si","",$html); // strip frame tags
            $html=preg_replace("/<(script.*?)>(.*?)<(\/script.*?)>/si","",$html); // strip script elements
            $html=preg_replace("/<(\/?script.*?)>/si","",$html); // strip script tags
            $html=preg_replace("/javascript/si","Javascript",$html); // neutralise script URLs
            $html=preg_replace("/vbscript/si","Vbscript",$html); // neutralise script URLs
            $html=preg_replace("/on([a-z]+)\s*=/si","On\\1=",$html); // neutralise inline event handlers
            $html=preg_replace("/&#/si","&＃",$html); // neutralise entity-encoded script, e.g. javAsCript:alert(
            $html=preg_replace("/[\t\n\r]+/","",$html);
            $html = preg_replace(array('/\s*(<br\s*\/?\s*>\s*){2,}/im','/(<p>(\s|\s*<br\s*\/?\s*>\s*)*<\/p>)+/im'),array('\\1\\1',''),$html);
            // remove
            $html=preg_replace("/<div class=\"ad300\"><\/div>/","",$html);
            $reg = '/<div class=\"content\".*?>(.*?)<\/div>/';// content rule
            $reg = '/<article class=\"article-content\".*?>(.*?)<\/article>/';// content rule
            preg_match_all($reg , $html , $matches);
            $content=$matches[1][0];
            if($content AND $title){
                //print_r(array('title'=>$title,'content'=>$content));
                echo $c->post($title,$content,$cateid,$url);
            }
            sleep(1);
        }
    }
}
******************************************************************************************************/

/***************************************www.xker.com***************************************************************
// scrape from the category listing page
$cate_url="http://www.xker.com/tag.php?/DirectAdmin/";
// listing rule
$cate_reg = '#<dd><a href="(.*?)">(.*?)</a><span>(.*?)</span></dd>#';// listing rule
$cate_html= get_url_contents($cate_url,array());
$cate_html=mb_convert_encoding($cate_html, 'utf-8', 'gbk');
preg_match_all($cate_reg , $cate_html , $cate_matches);
if(isset($cate_matches[1]) And !empty($cate_matches[1]) AND isset($cate_matches[2]) And !empty($cate_matches[2])){
    foreach($cate_matches[1] AS $key=>$url){
        $title=isset($cate_matches[2][$key])?$cate_matches[2][$key]:'';
        // fetch the article and apply preliminary filtering
        $html= get_url_contents($url,array());
        $html=mb_convert_encoding($html, 'utf-8', 'gbk');
        $html=preg_replace("/\s+/", " ", $html); // collapse extra line breaks / whitespace
        $html=preg_replace("/<[ ]+/si","<",$html); // strip spaces that follow "<"
        $html=preg_replace("/<\!--.*?-->/si","",$html); // HTML comments
        $html=preg_replace("/<(\!.*?)>/si","",$html); // strip DOCTYPE
        $html=preg_replace("/<(\/?html.*?)>/si","",$html); // strip html tags
        $html=preg_replace("/<(\/?head.*?)>/si","",$html); // strip head tags
        $html=preg_replace("/<(\/?meta.*?)>/si","",$html); // strip meta tags
        $html=preg_replace("/<(\/?body.*?)>/si","",$html); // strip body tags
        $html=preg_replace("/<(\/?link.*?)>/si","",$html); // strip link tags
        $html=preg_replace("/<(\/?form.*?)>/si","",$html); // strip form tags
        $html=preg_replace("/cookie/si","COOKIE",$html); // neutralise "cookie"
        $html=preg_replace("/<(applet.*?)>(.*?)<(\/applet.*?)>/si","",$html); // strip applet elements
        $html=preg_replace("/<(\/?applet.*?)>/si","",$html); // strip applet tags
        $html=preg_replace("/<(style.*?)>(.*?)<(\/style.*?)>/si","",$html); // strip style elements
        $html=preg_replace("/<(\/?style.*?)>/si","",$html); // strip style tags
        $html=preg_replace('/(.*?style=\\").*?(\\".*?)/i','$1$2',$html);
        $html=preg_replace("/<(title.*?)>(.*?)<(\/title.*?)>/si","",$html); // strip title elements
        $html=preg_replace("/<(\/?title.*?)>/si","",$html); // strip title tags
        $html=preg_replace("/<(object.*?)>(.*?)<(\/object.*?)>/si","",$html); // strip object elements
        $html=preg_replace("/<(\/?objec.*?)>/si","",$html); // strip object tags
        $html=preg_replace("/<(noframes.*?)>(.*?)<(\/noframes.*?)>/si","",$html); // strip noframes elements
        $html=preg_replace("/<(\/?noframes.*?)>/si","",$html); // strip noframes tags
        $html=preg_replace("/<(i?frame.*?)>(.*?)<(\/i?frame.*?)>/si","",$html); // strip frame elements
        $html=preg_replace("/<(\/?i?frame.*?)>/si","",$html); // strip frame tags
        $html=preg_replace("/<(script.*?)>(.*?)<(\/script.*?)>/si","",$html); // strip script elements
        $html=preg_replace("/<(\/?script.*?)>/si","",$html); // strip script tags
        $html=preg_replace("/javascript/si","Javascript",$html); // neutralise script URLs
        $html=preg_replace("/vbscript/si","Vbscript",$html); // neutralise script URLs
        $html=preg_replace("/on([a-z]+)\s*=/si","On\\1=",$html); // neutralise inline event handlers
        $html=preg_replace("/&#/si","&＃",$html); // neutralise entity-encoded script, e.g. javAsCript:alert(
        $html=preg_replace("/[\t\n\r]+/","",$html);
        $html = preg_replace(array('/\s*(<br\s*\/?\s*>\s*){2,}/im','/(<p>(\s|\s*<br\s*\/?\s*>\s*)*<\/p>)+/im'),array('\\1\\1',''),$html);
        // remove
        $html=preg_replace("/<div class=\"ad300\"><\/div>/","",$html);
        $reg = '/<div class=\"content\".*?>(.*?)<\/div>/';// content rule
        preg_match_all($reg , $html , $matches);
        $content=$matches[1][0];
        if($content AND $title){
            //print_r(array('title'=>$title,'content'=>$content));
            echo $c->post($title,$content,$cateid,$url);
        }
        sleep(1);
    }
}
******************************************************************************************************/
/***************************************www.cloudvps.cc***************************************************************
$domain='www.cloudvps.cc';// target domain
$start=1;
$end=1;
$cat=8;// category ID on the target site
$cateid='51';// category ID on this site
// scrape
for($i=$start;$i<=$end;$i++){
    // scrape from the category listing page
    $cate_url="http://{$domain}/category-{$cat}_{$i}.html";
    // listing rule
    $cate_reg = '#<h2><a href="(.*?)" title="(.*?)">(.*?)</a></h2>#';// listing rule
    $reg = '#<h1 class="article-title"><a href="(.*?)">(.*?)</a></h1>#';// content
    $cate_html= get_url_contents($cate_url,array());
    preg_match_all($cate_reg , $cate_html , $cate_matches);
    if(isset($cate_matches[1]) And !empty($cate_matches[1]) AND isset($cate_matches[2]) And !empty($cate_matches[2])){
        foreach($cate_matches[1] AS $url){
            // fetch the article
            $html= get_url_contents($url,array());
            preg_match_all($reg , $html , $matches);
            $html_start=strpos($html,'<p>');
            $html_end= strrpos($html,'</article>');
            if($html_start>0 AND $html_end>$html_start){
                $content=substr($html,$html_start,$html_end-$html_start);// cut the substring starting at the given offset, with the given length
                $title=isset($matches[2][0])?$matches[2][0]:'';
                if($content AND $title){
                    echo $c->post($title,$content,$cateid,$url);
                }
                sleep(1);
            }
        }
    }
}
******************************************************************************************************/

资料空白

good good study , day day up!

记录2年前写的wordpress 半手工采集php代码