Snippets

Create an account or login to be able to add, comment and rate snippets.

Navigation

crawler (google sitemap xml)

This crawler returns an array of the crawled pages, keyed by URL. The format is:

array( 'url_address' => array( 'title' => 'page_title', 'code' => status_code, 'type' => 'content_type' ) )

*requires PEAR::HTTP_Client

With the returned array, you can create a sitemap page and/or a sitemap XML file for Google Webmaster Central.

<?php
require_once('HTTP/Client.php');
 
/**
 * HTTP_Client flavour used by the crawler.
 *
 * It changes two things compared to the parent class: meta-refresh targets
 * are never reported, and the parent's URL resolver is made callable from
 * outside the class.
 */
class HTTP_Crawler extends HTTP_Client {

    /**
     * Always answers "no redirect target", whatever the request contains.
     *
     * NOTE(review): presumably this overrides the parent's meta-refresh
     * detection so the client does not follow such redirects on its own --
     * confirm against HTTP_Client.
     *
     * @param object $request the request to inspect (ignored)
     * @return null
     */
    function _getMetaRedirect(&$request){
        return null;
    }

    /**
     * Turns a possibly relative URL into an absolute one by delegating to
     * the inherited _redirectUrl().
     *
     * @param Net_URL $netUrl       URL object the address is resolved against
     * @param string  $ambiguousUrl absolute or relative address
     * @return string whatever _redirectUrl() produces for the pair
     */
    function getAbsoluteUrl($netUrl, $ambiguousUrl){
        $absoluteUrl = $this->_redirectUrl($netUrl, $ambiguousUrl);

        return $absoluteUrl;
    }
}
 
/**
 * Same-site crawler.
 *
 * Starting from one URL it fetches every page that lives below the base URL,
 * follows the <a href> and <meta http-equiv="refresh"> links it finds and
 * remembers title, HTTP status code and content type of each page.
 *
 * Usage:
 *   $crawler = new Crawler('http://example.com');
 *   $crawler->clawThis('http://example.com');
 *   $pages = $crawler->getUrlList();
 */
class Crawler {

    // $baseNetUrl        Net_URL built from the base URL; start URLs are resolved against it
    // $baseUrlEscaped    base URL escaped for use inside a /.../ regular expression
    // $client            HTTP_Crawler used for every request
    // $urlList           map of URL => false (queued, not fetched yet)
    //                    or URL => array('title' => ..., 'code' => ..., 'type' => ...)
    // $sleepMicroSeconds pause after each request
    private 
        $baseNetUrl,
        $baseUrlEscaped,
        $client,
        $urlList = array(),
        $sleepMicroSeconds = 500000;    //0.5seconds
 
 
    /**
     * constructor
     *
     * @param string $baseUrl your domain name; only URLs starting with it are crawled
     * @param int $sleepMicroSeconds crawler sleep time on each request
     */
    public function __construct($baseUrl, $sleepMicroSeconds = 500000){
        $this->baseNetUrl = new Net_URL($baseUrl, true);
        // passing the delimiter lets preg_quote() escape the slashes itself
        $this->baseUrlEscaped = preg_quote($baseUrl, '/');
 
        $this->sleepMicroSeconds = $sleepMicroSeconds;
 
        $this->client = new HTTP_Crawler();
        $this->client->setDefaultHeader('User-Agent', 'php crawler');
    }
 
    /**
     * Crawls the site starting at $url.
     *
     * Nothing is returned; fetch the result with getUrlList() afterwards.
     * The work list is processed in a loop rather than by recursion, so the
     * call depth does not grow with the number of pages.
     * 
     * @param string $url crawling starting URL (absolute, or relative to the base URL)
     * @return void
     */
    public function clawThis($url){
        $this->setUrls($url);
 
        // retrieveLinks() always replaces the false marker of the URL it is
        // given, so every pass removes one pending entry and the loop ends.
        while(($pending = array_search(false, $this->urlList, true)) !== false){
            $this->setUrls($this->retrieveLinks($pending), $pending);
        }
    }
 
    /**
     * Tells whether a URL may be crawled, judged by its file extension.
     *
     * @param string $each absolute URL
     * @return bool false when the URL ends in one of the binary/media extensions
     */
    private static function isValidFile($each){
 
        // Fragments are stripped before URLs get here, so the list holds
        // extensions only.
        $doNotCrawlFiles = array(
            'jpg',
            'gif',
            'png',
            'zip',
            'lzh',
            'xls',
            'ppt',
            'doc',
            'tif',
            'exe',
            'avi',
            'mpg',
            'swf',
            'mp3'
        );
 
        return !preg_match('/\.(' . join('|', $doNotCrawlFiles) . ')$/i', $each);
    }
 
    /**
     * Tells whether a URL belongs to the crawled site, i.e. starts with the
     * base URL (compared case-insensitively).
     *
     * @param string $url absolute URL
     * @return bool
     */
    private function isLocalAddress($url){
        return (bool) preg_match(sprintf('/^%s/i', $this->baseUrlEscaped), $url);
    }
 
    /**
     * Converts a link as written in a page into the absolute URL to fetch.
     *
     * The fragment ("#section") is dropped because it addresses a position
     * inside a page, not a page of its own. Links that cannot be fetched over
     * HTTP (empty, fragment-only, mailto:, javascript:, ...) yield null.
     *
     * @param string      $link       href value, absolute or relative
     * @param string|null $contextUrl URL of the page the link was found on;
     *                                null resolves against the base URL
     * @return string|null absolute URL, or null when there is nothing to crawl
     */
    private function toAbsoluteUrl($link, $contextUrl = null){
        $link = trim((string) $link);
 
        $hashAt = strpos($link, '#');
        if($hashAt !== false){
            $link = substr($link, 0, $hashAt);
        }
        if($link === ''){
            return null;
        }
 
        if(preg_match('/^([a-z][a-z0-9+.\-]*):/i', $link, $scheme) &&
            !preg_match('/^https?$/i', $scheme[1])){
            return null;
        }
 
        // A fresh Net_URL is handed over on every call because the resolver
        // receives the object itself.
        // NOTE(review): HTTP_Client::_redirectUrl() appears to modify the
        // object it is given -- confirm; the copy is harmless either way.
        if($contextUrl === null){
            $base = clone $this->baseNetUrl;
        }else{
            $base = new Net_URL($contextUrl, true);
        }
 
        return $this->client->getAbsoluteUrl($base, $link);
    }
 
    /**
     * Queues URLs for crawling.
     *
     * A URL is queued (stored with the value false) when it is on this site,
     * does not point to a binary/media file and is not known yet.
     *
     * @param string|array $urls       one link or a list of links
     * @param string|null  $contextUrl page the links were found on, used to
     *                                 resolve relative links
     * @return void
     */
    private function setUrls($urls = array(), $contextUrl = null){
        if(!is_array($urls)){
            $urls = array($urls);
        }
 
        foreach($urls as $each){
            $each = $this->toAbsoluteUrl($each, $contextUrl);
 
            if($each === null ||
                !$this->isLocalAddress($each) ||
                !self::isValidFile($each) ||
                isset($this->urlList[$each])){
                continue;
            }
 
            $this->urlList[$each] = false;
        }
    }
 
    /**
     * Fetches a queued URL, stores its title/status/content type and returns
     * the links found in it.
     *
     * @param string $url absolute URL that is currently queued
     * @return array raw href / refresh targets (may be relative); empty when
     *               the URL is not queued or could not be fetched
     */
    private function retrieveLinks($url){
        if(!isset($this->urlList[$url]) || $this->urlList[$url] !== false){
            return array();
        }
 
        // Record the visit before anything can fail, so this URL is never
        // picked up again.
        $this->urlList[$url] = array('title' => '', 'code' => 0, 'type' => '');
 
        if(!$this->isLocalAddress($url)){
            return array();
        }
 
        $result = $this->client->get($url);
 
        if($this->sleepMicroSeconds > 0){
            usleep($this->sleepMicroSeconds);
        }
 
        // A failed request is reported as a PEAR_Error; currentResponse()
        // would then not describe this URL.
        if($result instanceof PEAR_Error){
            return array();
        }
 
        $response = $this->client->currentResponse();
        if(!is_array($response)){
            return array();
        }
 
        if(isset($response['code'])){
            $this->urlList[$url]['code'] = $response['code'];
        }
        if(isset($response['headers']['content-type'])){
            $this->urlList[$url]['type'] = $response['headers']['content-type'];
        }
 
        $body = isset($response['body']) ? (string) $response['body'] : '';
        $sourceEncoding = mb_detect_encoding($body);
        if($sourceEncoding !== false){
            $body = mb_convert_encoding($body, mb_internal_encoding(), $sourceEncoding);
        }
 
        $parsed = self::parseHtml($body);
        $this->urlList[$url]['title'] = $parsed['title'];
 
        return $parsed['links'];
    }
 
    /**
     * Extracts the page title and all outgoing links from an HTML document.
     *
     * @param string $body HTML source
     * @return array array('title' => string, 'links' => array of unique raw link strings)
     */
    private static function parseHtml($body){
        $parsed = array('title' => '', 'links' => array());
 
        // DOMDocument::loadHTML() refuses empty input
        if(trim($body) === ''){
            return $parsed;
        }
 
        // real-world markup is rarely valid; collect parser complaints
        // silently instead of emitting a warning per problem
        $previous = libxml_use_internal_errors(true);
        $document = new DOMDocument('1.0', mb_internal_encoding());
        $document->preserveWhiteSpace = false;
        $loaded = $document->loadHTML($body);
        libxml_clear_errors();
        libxml_use_internal_errors($previous);
 
        if(!$loaded){
            return $parsed;
        }
 
        $titles = $document->getElementsByTagName('title');
        if($titles->length > 0){
            $parsed['title'] = $titles->item(0)->textContent;
        }
 
        foreach($document->getElementsByTagName('a') as $anchor){
            $parsed['links'][] = $anchor->getAttribute('href');
        }
 
        foreach($document->getElementsByTagName('meta') as $meta){
            if(!preg_match('/refresh/i', $meta->getAttribute('http-equiv'))){
                continue;
            }
 
            // content looks like "5; url=http://example.com/next"
            foreach(explode(';', $meta->getAttribute('content')) as $part){
                if(preg_match('/url=(.*)/i', trim($part), $match)){
                    // the target may be wrapped in quotes
                    $parsed['links'][] = trim($match[1], " \t\n\r\0\x0B'\"");
                    break;
                }
            }
        }
 
        $parsed['links'] = array_unique($parsed['links']);
 
        return $parsed;
    }
 
    /**
     * returns an array of found HTML pages, sorted by URL
     *
     * format:
     * array(
     *      'url_address' => array(
     *          'title' => 'page_title',
     *          'code'  => status_code,
     *          'type'  => 'content_type'
     *      )
     * )
     *
     * Only pages whose content type contains "text/html" are listed; URLs
     * that are still queued are skipped. Pages are listed whatever their
     * status code, so filter on 'code' if error pages are unwanted.
     *
     * @return array
     */
    public function getUrlList(){
        $list = array();
        foreach ($this->urlList as $url => $attributes){
            // false means "queued but never fetched"
            if(!is_array($attributes)){
                continue;
            }
 
            $type = isset($attributes['type']) ? $attributes['type'] : '';
            if(!is_string($type) || !preg_match('/text\/html/i', $type)){
                continue;
            }
 
            if(strpos((string) $url, '#') !== false){
                continue;
            }
 
            $list[$url] = $attributes;
        }
 
        ksort($list);
        return $list;
    }
}

you can write a batch script to crawl your site periodically.

<?php
// Batch script: crawls the whole site and stores the resulting page list,
// serialized, next to this script so the sitemap action can read it later.
ini_set('max_execution_time', 0);   // a full crawl can take a long time
error_reporting(E_ALL);
require_once('Crawler.class.php');
require_once('File.php');
 
$root = 'http://YOUR_OWN_SITE_URL';
 
$crawler = new Crawler($root);
$crawler->clawThis($root);
 
$pageList = $crawler->getUrlList();

// Lower-case file name: the sitemap action looks for 'sitemap.txt', and file
// names are case-sensitive on most servers.
$fileName = dirname(__FILE__) . DIRECTORY_SEPARATOR . 'sitemap.txt';

// write() instead of writeLine(): no line ending is appended, so the file
// holds exactly the serialized data.
$written = File::write($fileName, serialize($pageList), FILE_MODE_WRITE);
File::close($fileName, FILE_MODE_WRITE);

if($written instanceof PEAR_Error){
    error_log('sitemap cache could not be written: ' . $written->getMessage());
    exit(1);
}

to output sitemap xml, write an action like this

/**
 * Prepares the data for the sitemap XML template.
 *
 * Reads the serialized page list from <sf_bin_dir>/sitemap.txt and exposes
 * it as $pageList, together with $today, the cache file's modification date
 * (Y-m-d). When the cache is missing, unreadable or corrupt, the page list is
 * an empty array and the date is the current day, so the template still
 * renders a well-formed (empty) sitemap.
 */
public function executeSitemapXml(){
    error_reporting(E_ALL);
 
    $sitemapCacheFile = sfConfig::get('sf_bin_dir') . DIRECTORY_SEPARATOR . 'sitemap.txt';
 
    require_once('File.php');

    $pageList = false;
    $modified = false;

    if(is_readable($sitemapCacheFile)){
        $sitemapCache = File::readAll($sitemapCacheFile);

        // anything but a string means the read failed
        if(is_string($sitemapCache)){
            // a trailing line ending is not part of the serialized data
            $pageList = unserialize(rtrim($sitemapCache));
        }

        $modified = filemtime($sitemapCacheFile);
    }

    // unserialize() yields false on corrupt data; the template needs an array
    $this->pageList = is_array($pageList) ? $pageList : array();
    $this->today = date('Y-m-d', $modified !== false ? $modified : time());
}

its template is like this

<?php
/*
 * Sitemap XML template.
 *
 * Expects $pageList (URL => attributes, as produced by the crawler) and
 * $today (Y-m-d date used as <lastmod> for every entry).
 *
 * The XML declaration is echoed from PHP so that a literal "<?" in the
 * template cannot be mistaken for a PHP short open tag.
 */
echo '<?xml version="1.0" encoding="UTF-8"?>', "\n";
?>
<urlset
  xmlns="http://www.sitemaps.org/schemas/sitemap/0.9"
  xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
  xsi:schemaLocation="http://www.sitemaps.org/schemas/sitemap/0.9
                      http://www.sitemaps.org/schemas/sitemap/0.9/sitemap.xsd">
 
<?php foreach ($pageList as $url => $attributes): ?>
<?php /* pages that answered with an error status do not belong in a sitemap */ ?>
<?php if (isset($attributes['code']) && (int) $attributes['code'] >= 400) continue; ?>
<url>
  <loc><?php echo htmlspecialchars($url, ENT_QUOTES, 'UTF-8') ?></loc>
  <lastmod><?php echo htmlspecialchars($today, ENT_QUOTES, 'UTF-8') ?></lastmod>
  <changefreq>weekly</changefreq>
  <priority>0.8</priority>
</url>
<?php endforeach; ?>
 
</urlset>

Google needs the sitemap XML to be in the top directory, so you need to write a routing rule like this.

# Routing rule for the sitemap: requests to /sitemapxml are dispatched to the
# sitemapXml action of the "default" module.
sitemap_xml:
  url:   /sitemapxml
  param: { module: default, action: sitemapXml }

this is actually used on our site sticker20.com

by sticker 20 on 2006-10-04, tagged crawler  google  sitemap  xml 

Comments on this snippet

gravatar icon
#1 Gordon Franke on 2006-10-05 at 08:46

very great. can you make a plugin for that?

sfGoogleSitemapPlugin

greetings Gordon

gravatar icon
#2 Francois Zaninotto on 2006-10-05 at 10:29

Instead of using PEAR::HTTP_Client, maybe you could make your snippet/plugin completely self-sufficient by using the ping google sitemap script from there:

http://www.gidnetwork.com/b-54.html

gravatar icon
#3 brikou on 2006-10-08 at 11:59

yes this is a great snippet, and could be much better if written in a self-sufficient plugin :)

gravatar icon
#4 excessive demon on 2007-09-10 at 07:07

can someon post instructions on how to use this snippet in a self-sufficient way? i found a plugin: sfSiteMapPlugin but could not get it to work.. i always get an error saying:

[pakeException] Task "sitemap" is not defined.

I already tried researching and posting on the forums but could not get any help or resources.. please help me.. i'm a n00b..

You need to create an account or log in to post a comment or rate this snippet.