![]() |
|
Snippets |
|
This crawler returns an array in the following format:
array( 'url_address' => array( 'title' => 'page_title', 'code' => 'status_code', 'type' => 'content_type' ) )
*Requires PEAR::HTTP_Client.
With the returned array, you can create a sitemap page and/or a sitemap XML file for Google Webmaster Central.
<?php
require_once('HTTP/Client.php');

/**
 * HTTP client used by Crawler.
 *
 * Adds a public wrapper around the inherited URL resolver and switches off
 * the inherited meta-redirect lookup so the crawler can collect meta-refresh
 * targets itself.
 */
class HTTP_Crawler extends HTTP_Client
{
    /**
     * Always reports "no meta redirect" so that pages are not redirected
     * automatically.
     *
     * @param mixed $request unused
     * @return null
     */
    public function _getMetaRedirect(&$request)
    {
        return null;
    }

    /**
     * Resolves a possibly relative URL against a Net_URL object by delegating
     * to the inherited _redirectUrl() helper.
     *
     * @param Net_URL $netUrl       URL to resolve against
     * @param string  $ambiguousUrl absolute or relative URL
     * @return string whatever _redirectUrl() returns (presumably an absolute URL)
     */
    public function getAbsoluteUrl($netUrl, $ambiguousUrl)
    {
        return $this->_redirectUrl($netUrl, $ambiguousUrl);
    }
}

/**
 * Same-site crawler that records title, status code and content type of
 * every page reachable from a starting URL.
 */
class Crawler
{
    /** @var Net_URL parsed form of the base URL */
    private $baseNetUrl;

    /** @var string base URL escaped for use inside a '/'-delimited regex */
    private $baseUrlEscaped;

    /** @var HTTP_Crawler */
    private $client;

    /** @var array url => false (not fetched yet) | array('title', 'code', 'type') */
    private $urlList = array();

    /** @var int pause after each request, in microseconds (default 0.5 seconds) */
    private $sleepMicroSeconds = 500000;

    /**
     * constructor
     *
     * @param string $baseUrl           your domain name
     * @param int    $sleepMicroSeconds crawler sleep time on each request
     */
    public function __construct($baseUrl, $sleepMicroSeconds = 500000)
    {
        $this->baseNetUrl = new Net_URL($baseUrl, true);
        // Escaping the delimiter through preg_quote() is equivalent to the
        // former preg_quote() + str_replace('/', '\/') combination.
        $this->baseUrlEscaped = preg_quote($baseUrl, '/');
        $this->sleepMicroSeconds = $sleepMicroSeconds;
        $this->client = new HTTP_Crawler();
        $this->client->setDefaultHeader('User-Agent', 'php crawler');
    }

    /**
     * Crawls the site starting from $url and fills the internal URL list.
     * Read the result with getUrlList().
     *
     * Stored format:
     * array(
     *     'url_address' => array(
     *         'title' => page title,
     *         'code'  => status code,
     *         'type'  => content type
     *     )
     * )
     *
     * Pending pages are processed in a loop instead of by recursion, so the
     * call depth no longer grows with the number of pages found.
     *
     * @param string $url crawling starting URL
     * @return void
     */
    public function clawThis($url)
    {
        $url = $this->resolveAgainstBase($url);
        $this->setUrls($url);
        $this->setUrls($this->retrieveLinks($url));

        // Entries that are still `false` have been discovered but not fetched.
        while (($pending = array_search(false, $this->urlList, true)) !== false) {
            $this->setUrls($this->retrieveLinks($pending));

            if ($this->urlList[$pending] === false) {
                // Safety net: an entry left pending would make this loop spin forever.
                $this->urlList[$pending] = array('title' => '', 'code' => null, 'type' => '');
            }
        }
    }

    /**
     * Tells whether a URL should be crawled, judged by its ending.
     *
     * @param string $each URL to test
     * @return bool false when the URL ends in one of the excluded extensions
     */
    private static function isValidFile($each)
    {
        $doNotCrawlFiles = array(
            'jpg', 'gif', 'png', 'zip', 'lzh', 'xls', 'ppt',
            'doc', 'tif', 'exe', 'avi', 'mpg', 'swf', 'mp3',
            '#[^#]*'
        );

        return !preg_match('/\.(' . join('|', $doNotCrawlFiles) . ')$/i', $each);
    }

    /**
     * Tells whether a URL starts with the base URL (case-insensitive).
     *
     * NOTE(review): this is a plain prefix test, so a base of
     * "http://example.com" also accepts "http://example.com.other.org".
     *
     * @param string $url
     * @return bool
     */
    private function isLocalAddress($url)
    {
        return (bool) preg_match('/^' . $this->baseUrlEscaped . '/i', $url);
    }

    /**
     * Registers URLs as "to be fetched". Non-local URLs, excluded file types
     * and URLs that are already known are ignored.
     *
     * @param string|array $urls one URL or a list of URLs
     * @return void
     */
    private function setUrls($urls = array())
    {
        if (!is_array($urls)) {
            $urls = array($urls);
        }

        foreach ($urls as $each) {
            $each = $this->resolveAgainstBase($each);
            if (
                !$this->isLocalAddress($each)
                || !self::isValidFile($each)
                || isset($this->urlList[$each])
            ) {
                continue;
            }
            $this->urlList[$each] = false;
        }
    }

    /**
     * Resolves a URL against the base URL.
     *
     * A copy of the base Net_URL is handed to the client because the
     * resolver may modify the object it is given; the shared base must stay
     * untouched between calls.
     *
     * @param string $url
     * @return string
     */
    private function resolveAgainstBase($url)
    {
        return $this->client->getAbsoluteUrl(clone $this->baseNetUrl, $url);
    }

    /**
     * Fetches a pending page, stores its title / status code / content type
     * and returns the links found in it.
     *
     * @param string $url a URL previously registered through setUrls()
     * @return array unique link targets; empty when the URL is unknown,
     *               already fetched, not local, or the page has no body
     */
    private function retrieveLinks($url)
    {
        // isset() first: the starting URL is never registered when it is not local.
        if (!isset($this->urlList[$url]) || $this->urlList[$url] !== false) {
            return array();
        }
        if (!$this->isLocalAddress($url)) {
            return array();
        }

        $this->client->get($url);
        $response = $this->client->currentResponse();
        if (!is_array($response)) {
            // NOTE(review): assumes a failed request leaves no response array — confirm.
            $response = array();
        }

        // Assign the whole record at once; writing a key into the `false`
        // placeholder is deprecated since PHP 8.1.
        $this->urlList[$url] = array(
            'title' => '',
            'code'  => isset($response['code']) ? $response['code'] : null,
            'type'  => isset($response['headers']['content-type'])
                ? $response['headers']['content-type']
                : '',
        );

        $links = array();
        $body = isset($response['body']) ? (string) $response['body'] : '';
        if (trim($body) !== '') {
            $document = $this->loadDocument($body);

            $titles = $document->getElementsByTagName('title');
            if ($titles->length > 0) {
                $this->urlList[$url]['title'] = $titles->item(0)->textContent;
            }

            $links = $this->extractLinks($document, $url);
        }

        usleep($this->sleepMicroSeconds);

        return array_unique($links);
    }

    /**
     * Parses an HTML body into a DOMDocument.
     *
     * @param string $body non-empty response body
     * @return DOMDocument
     */
    private function loadDocument($body)
    {
        // mb_detect_encoding() returns false when it cannot decide; convert
        // only when an encoding was actually detected.
        $detected = mb_detect_encoding($body);
        if ($detected !== false) {
            $body = mb_convert_encoding($body, mb_internal_encoding(), $detected);
        }

        $document = new DOMDocument('1.0', mb_internal_encoding());
        $document->preserveWhiteSpace = false;

        // Real-world markup is rarely valid; collect parser complaints
        // internally instead of emitting one warning per problem.
        $previous = libxml_use_internal_errors(true);
        $document->loadHTML($body);
        libxml_clear_errors();
        libxml_use_internal_errors($previous);

        return $document;
    }

    /**
     * Collects targets of <a href> and <meta http-equiv="refresh"> elements,
     * resolved against the page they were found on.
     *
     * @param DOMDocument $document parsed page
     * @param string      $pageUrl  URL of that page
     * @return array link targets (may contain duplicates)
     */
    private function extractLinks($document, $pageUrl)
    {
        $targets = array();

        foreach ($document->getElementsByTagName('a') as $anchor) {
            $targets[] = $anchor->getAttribute('href');
        }

        foreach ($document->getElementsByTagName('meta') as $meta) {
            if (!preg_match('/refresh/i', $meta->getAttribute('http-equiv'))) {
                continue;
            }
            // content looks like "5; url=http://example.com/"
            // explode() replaces split(), which was removed in PHP 7.
            foreach (explode(';', $meta->getAttribute('content')) as $part) {
                if (preg_match('/url=(.*)/i', trim($part), $match)) {
                    // the target may be wrapped in quotes
                    $targets[] = trim($match[1], " \t\r\n\"'");
                    break;
                }
            }
        }

        $links = array();
        foreach ($targets as $target) {
            $target = trim($target);
            // Skip missing hrefs and same-page anchors: they lead to no new page.
            if ($target === '' || $target[0] === '#') {
                continue;
            }
            // Relative links are relative to the current page, not to the
            // site root; a fresh Net_URL is used for every link so one
            // resolution cannot influence the next.
            $links[] = $this->client->getAbsoluteUrl(new Net_URL($pageUrl, true), $target);
        }

        return $links;
    }

    /**
     * Returns the fetched HTML pages, sorted by URL.
     *
     * Only entries whose content type contains "text/html" and whose URL has
     * no "#" fragment are returned.
     *
     * @return array url => array('title' => ..., 'code' => ..., 'type' => ...)
     */
    public function getUrlList()
    {
        $list = array();

        foreach ($this->urlList as $url => $attributes) {
            if (!is_array($attributes)) {
                // registered but never fetched
                continue;
            }
            $type = is_string($attributes['type']) ? $attributes['type'] : '';
            if (preg_match('/text\/html/i', $type) && !preg_match('/#(.*)$/i', $url)) {
                $list[$url] = $attributes;
            }
        }

        ksort($list);

        return $list;
    }
}
You can write a batch script to crawl your site periodically.
<?php
// Batch script: crawls the whole site and stores the serialized page list in
// a cache file, from which the sitemap is rendered later.
ini_set('max_execution_time', 0); // a full crawl can take a long time
error_reporting(E_ALL);

require_once('Crawler.class.php');
require_once('File.php');

$root = 'http://YOUR_OWN_SITE_URL';

$crawler = new Crawler($root);
$crawler->clawThis($root);
$pageList = $crawler->getUrlList();

// Lower-case file name: the sitemap action opens 'sitemap.txt', and on a
// case-sensitive file system 'Sitemap.txt' would be a different file.
$fileName = dirname(__FILE__) . DIRECTORY_SEPARATOR . 'sitemap.txt';

// File::write() stores the payload as is; writeLine() would append a line
// break after the serialized data.
$written = File::write($fileName, serialize($pageList), FILE_MODE_WRITE);
$closed = File::close($fileName, FILE_MODE_WRITE);

foreach (array($written, $closed) as $result) {
    if (PEAR::isError($result)) {
        // Report the failure instead of leaving a stale or partial cache unnoticed.
        fwrite(STDERR, 'Could not write ' . $fileName . ': ' . $result->getMessage() . PHP_EOL);
        exit(1);
    }
}
To output the sitemap XML, write an action like this:
/**
 * Prepares the data for the sitemap XML template.
 *
 * Reads the serialized page list from the cache file in the bin directory
 * and exposes it as $pageList; $today is the cache file's modification date
 * (Y-m-d). When the cache is missing or unreadable the page list is empty
 * and $today falls back to the current date.
 */
public function executeSitemapXml()
{
    error_reporting(E_ALL);
    require_once('File.php');

    $sitemapCacheFile = sfConfig::get('sf_bin_dir') . DIRECTORY_SEPARATOR . 'sitemap.txt';

    // File::readAll() yields a string on success; anything else is treated as a failed read.
    $sitemapCache = File::readAll($sitemapCacheFile);
    $pageList = false;
    if (is_string($sitemapCache)) {
        // The cache is written by our own batch script (trusted input). A
        // trailing line break is stripped before unserializing.
        $pageList = unserialize(rtrim($sitemapCache, "\r\n"));
    }
    // The template iterates over this value, so it must always be an array.
    $this->pageList = is_array($pageList) ? $pageList : array();

    $modified = is_file($sitemapCacheFile) ? filemtime($sitemapCacheFile) : false;
    $this->today = date('Y-m-d', $modified !== false ? $modified : time());
}
Its template looks like this:
<?xml version="1.0" encoding="UTF-8"?>
<?php
// Sitemap template. Expects:
//   $pageList  array whose keys are the page URLs
//   $today     date string written into every <lastmod>
?>
<urlset xmlns="http://www.google.com/schemas/sitemap/0.84"
        xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
        xsi:schemaLocation="http://www.google.com/schemas/sitemap/0.84 http://www.google.com/schemas/sitemap/0.84/sitemap.xsd">
<?php foreach ($pageList as $url => $attributes): ?>
  <url>
    <?php // URLs may contain "&" and other characters that must be escaped in XML ?>
    <loc><?php echo htmlspecialchars($url, ENT_QUOTES, 'UTF-8') ?></loc>
    <lastmod><?php echo $today ?></lastmod>
    <priority>0.8</priority>
    <changefreq>weekly</changefreq>
  </url>
<?php endforeach; ?>
</urlset>
Google needs the sitemap XML to be in the top directory, so you need to write a routing rule like this:
# Maps /sitemapxml to the sitemapXml action of the default module.
sitemap_xml:
  url:   /sitemapxml
  param: { module: default, action: sitemapXml }
This is actually used on our site, sticker20.com.