/**
 * Crawler prototype.
 *
 * Fetches $url, hands the fetched content to $callback, then follows the
 * <a href> links found in it, level by level, until $depth levels of pages
 * have been fetched. Each distinct URL is fetched (and passed to the
 * callback) at most once per call; pages are visited breadth-first.
 *
 * Fetching is delegated to the `http` class, which is not defined in this
 * block; the value returned by `(new http($url))->get()->exec()` is passed to
 * the callback unchanged, and is only scanned for links when it is a
 * non-empty string.
 *
 * @author Lee
 *
 * @param string   $url      Initial URL.
 * @param callable $callback Business-logic callback; receives the fetched content.
 * @param int      $depth    Number of link levels to fetch (default 3). Values <= 0 fetch nothing.
 *
 * @return void
 */
function crawl($url, $callback, $depth = 3)
{
    if ($depth <= 0) {
        return;
    }

    // Fragments never change the fetched document, so they are ignored when
    // deciding whether a URL has already been seen.
    $startKey = (string) $url;
    $hashPos = strpos($startKey, '#');
    if ($hashPos !== false) {
        $startKey = (string) substr($startKey, 0, $hashPos);
    }

    $visited = array($startKey => true);
    $queue = array(array($url, $depth));

    // Index-based FIFO: the queue grows while it is being walked.
    for ($i = 0; $i < count($queue); $i++) {
        list($current, $remaining) = $queue[$i];

        $http = new http($current);
        $content = $http->get()->exec();

        // Business processing.
        call_user_func($callback, $content);

        // This page consumed one level; stop descending once none are left.
        $remaining--;
        if ($remaining <= 0 || !is_string($content) || $content === '') {
            continue;
        }

        foreach (crawl_extract_links($content) as $href) {
            $link = crawl_resolve_url($current, $href);
            if ($link === null || isset($visited[$link])) {
                continue;
            }
            $visited[$link] = true;
            $queue[] = array($link, $remaining);
        }
    }
}

/**
 * Returns the raw href values of all <a> tags in an HTML string.
 *
 * @internal Helper for crawl().
 *
 * @param string $html Markup to scan.
 *
 * @return array List of href attribute values, in document order (may contain duplicates).
 */
function crawl_extract_links($html)
{
    // Only real anchor tags (<a followed by whitespace), any letter case, and
    // never looking past the end of the tag for the href attribute.
    $pattern = '/<a(?=\s)[^>]*?\shref\s*=\s*["\']?([^"\'\s>]*)/i';
    if (!preg_match_all($pattern, $html, $matches)) {
        return array();
    }

    return $matches[1];
}

/**
 * Resolves an href found on page $base into an absolute http(s) URL.
 *
 * @internal Helper for crawl().
 *
 * @param string $base Absolute URL of the page the link was found on.
 * @param string $href Raw href attribute value.
 *
 * @return string|null Absolute URL without fragment, or null when the link
 *                     should not be followed (empty, fragment-only, non-http
 *                     scheme, or $base cannot be parsed into a host).
 */
function crawl_resolve_url($base, $href)
{
    // Attribute values are HTML-encoded (e.g. "&amp;" inside query strings).
    $href = trim(html_entity_decode((string) $href, ENT_QUOTES, 'UTF-8'));

    $hashPos = strpos($href, '#');
    if ($hashPos !== false) {
        $href = (string) substr($href, 0, $hashPos);
    }
    if ($href === '') {
        return null;
    }

    // Already absolute: follow only http/https, skip mailto:, javascript:, etc.
    if (preg_match('/^([a-z][a-z0-9+.\-]*):/i', $href, $m)) {
        $linkScheme = strtolower($m[1]);

        return ($linkScheme === 'http' || $linkScheme === 'https') ? $href : null;
    }

    $parts = parse_url($base);
    if (!is_array($parts) || empty($parts['host'])) {
        return null;
    }

    $scheme = isset($parts['scheme']) ? $parts['scheme'] : 'http';

    // Protocol-relative link: inherits only the scheme.
    if (strpos($href, '//') === 0) {
        return $scheme . ':' . $href;
    }

    $authority = $scheme . '://';
    if (isset($parts['user'])) {
        $authority .= $parts['user'];
        if (isset($parts['pass'])) {
            $authority .= ':' . $parts['pass'];
        }
        $authority .= '@';
    }
    $authority .= $parts['host'];
    if (isset($parts['port'])) {
        $authority .= ':' . $parts['port'];
    }

    $basePath = (isset($parts['path']) && $parts['path'] !== '') ? $parts['path'] : '/';

    // Query-only link: same document path, different query string.
    if ($href[0] === '?') {
        return $authority . $basePath . $href;
    }

    // Keep the query string out of dot-segment normalisation.
    $query = '';
    $queryPos = strpos($href, '?');
    if ($queryPos !== false) {
        $query = substr($href, $queryPos);
        $href = substr($href, 0, $queryPos);
    }

    if ($href[0] === '/') {
        $path = $href;
    } else {
        // Relative to the directory of the base document, not to the document itself.
        $path = substr($basePath, 0, strrpos($basePath, '/') + 1) . $href;
    }

    return $authority . crawl_remove_dot_segments($path) . $query;
}

/**
 * Collapses "." and ".." segments in an absolute URL path.
 *
 * @internal Helper for crawl_resolve_url().
 *
 * @param string $path Path starting with "/".
 *
 * @return string Normalised path starting with "/".
 */
function crawl_remove_dot_segments($path)
{
    $segments = explode('/', (string) substr($path, 1));
    $last = count($segments) - 1;
    $resolved = array();

    foreach ($segments as $i => $segment) {
        if ($segment === '.' || $segment === '..') {
            if ($segment === '..') {
                array_pop($resolved);
            }
            // A trailing dot segment still denotes a directory: keep the final slash.
            if ($i === $last) {
                $resolved[] = '';
            }
            continue;
        }
        $resolved[] = $segment;
    }

    return '/' . implode('/', $resolved);
}
// Source article: "php爬虫原型" (PHP crawler prototype)
// Source URL: http://gzruizhi.cn/article/pepeeh.html