PHP 获取并分析网页
更多文章...

记录备忘。
使用 cURL 获取网页,支持 HTTPS(下面的示例关闭了证书校验,仅适合测试环境)。

//-->get html

// Download the page at $link into $html using cURL.
// $link is read here but not assigned in this section; it is expected to be
// set before this point. Afterwards $html holds the response body as a string,
// or false if the transfer failed (a warning with the cURL error is raised).
$ch = curl_init();
curl_setopt($ch, CURLOPT_URL, $link);  // URL to fetch
curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);  // return the body from curl_exec() instead of printing it
curl_setopt($ch, CURLOPT_HEADER, 0);  // keep response headers out of the returned string
//curl_setopt($ch, CURLOPT_NOBODY, TRUE);
// WARNING: certificate and host-name verification are switched off, so HTTPS
// responses are not authenticated and can be tampered with in transit.
// Kept as in the original for compatibility; enable both options (and point
// cURL at a CA bundle if needed) before using this against untrusted networks.
curl_setopt($ch, CURLOPT_SSL_VERIFYHOST, false);
curl_setopt($ch, CURLOPT_SSL_VERIFYPEER, false);
// Browser-like User-Agent; the product token must be written "Mozilla/5.0".
curl_setopt($ch, CURLOPT_USERAGENT, 'Mozilla/5.0 (Windows NT 5.1; rv:31.0) Gecko/20100101 Firefox/31.0');
curl_setopt($ch, CURLOPT_CONNECTTIMEOUT, 10);  // seconds allowed for establishing the connection
curl_setopt($ch, CURLOPT_TIMEOUT, 30);  // seconds allowed for the whole transfer, so a stalled server cannot hang the script
$html = curl_exec($ch);
if ($html === false) {
    // Surface the failure instead of silently continuing; the error text must
    // be read before the handle is closed. $html is left as false.
    trigger_error('cURL request failed: ' . curl_error($ch), E_USER_WARNING);
}
curl_close($ch);

//-->parse html

// Parse the downloaded markup in $html and extract:
//   $title       - text of the first <title> element, or null if there is none
//   $meta_array  - map of lower-cased meta name / http-equiv / scheme => content
//   $keywords    - content of <meta name="keywords">, or null if absent
//   $description - content of <meta name="description">, or null if absent
$doc = new DOMDocument();

// Only parse when there is real input: $html may be false after a failed
// download, and loadHTML() rejects an empty source (on PHP 8 by throwing a
// ValueError, which the @ operator cannot suppress).
if (isset($html) && is_string($html) && $html !== '') {
    // Collect libxml parse errors internally instead of emitting warnings for
    // the malformed markup that real-world pages commonly contain.
    $previous_libxml_setting = libxml_use_internal_errors(true);
    $doc->loadHTML($html);
    libxml_clear_errors();
    libxml_use_internal_errors($previous_libxml_setting);
    // NOTE(review): loadHTML() falls back to ISO-8859-1 when the document does
    // not declare a charset; confirm whether such pages need converting first.
}

$nodes = $doc->getElementsByTagName('title');
$title_node = $nodes->item(0);  // null when the page has no <title>
$title = ($title_node !== null) ? $title_node->nodeValue : null;  // page title

$meta_array = array();
$metas = $doc->getElementsByTagName('meta');
foreach ($metas as $meta) {
    $content = $meta->getAttribute('content');
    // A meta tag may be identified by any of these attributes; each one that is
    // present becomes a (lower-cased) key pointing at the tag's content.
    foreach (array('name', 'http-equiv', 'scheme') as $attribute) {
        $key = $meta->getAttribute($attribute);  // '' when the attribute is absent
        if ($key !== '') {
            $meta_array[strtolower($key)] = $content;
        }
    }
}

$keywords = isset($meta_array['keywords']) ? $meta_array['keywords'] : null;  // page keywords
$description = isset($meta_array['description']) ? $meta_array['description'] : null;  // page description
			



		





© time.org.cn