Snippets

Create an account or log in to be able to add, comment on, and rate snippets.

Navigation

Refine Tags

Snippets tagged "xml sitemap"

crawler (google sitemap xml)

After crawling, the crawler's getUrlList() method returns an array in the following format.

array( 'url_address' => array( 'title' => page_title, 'code' => status_code, 'type' => content_type ) )

*requires PEAR::HTTP_Client

with the returned array, you can create sitemap page and/or sitemap xml for google webmaster central.

<?php
require_once('HTTP/Client.php');
 
/**
 * HTTP_Client variant used by the crawler.
 *
 * It adds a public way to turn a possibly-relative URL into an absolute one
 * and answers "no target" whenever it is asked for a meta redirect.
 */
class HTTP_Crawler extends HTTP_Client {

    /**
     * Always answers null, whatever the request contains.
     *
     * The intent (taken from the original local variable name) is that meta
     * redirects are not to be followed automatically; presumably this
     * replaces a hook of HTTP_Client -- confirm against the PEAR source.
     *
     * @param mixed $request unused
     * @return null
     */
    function _getMetaRedirect(&$request){
        return null;
    }

    /**
     * Public wrapper around the client's own _redirectUrl() helper.
     *
     * @param object $netUrl       URL object the address is resolved against
     * @param string $ambiguousUrl absolute or relative address
     * @return string whatever _redirectUrl() produces for the two arguments
     */
    function getAbsoluteUrl($netUrl, $ambiguousUrl){
        $absoluteUrl = $this->_redirectUrl($netUrl, $ambiguousUrl);

        return $absoluteUrl;
    }
}
 
/**
 * Same-site crawler.
 *
 * Starting from one URL it follows <a href> and <meta http-equiv="refresh">
 * links that stay below the configured base URL and records, for every page
 * it requested, the title, the HTTP status code and the content type.
 */
class Crawler {

    // $baseNetUrl        Net_URL built from the base URL
    // $baseUrlEscaped    regex fragment matching URLs that belong to the site
    // $client            HTTP_Crawler used for all requests
    // $urlList           url => false (known, not requested yet)
    //                    url => array('title' =>, 'code' =>, 'type' =>) (requested)
    // $sleepMicroSeconds pause after every request
    private
        $baseNetUrl,
        $baseUrlEscaped,
        $client,
        $urlList = array(),
        $sleepMicroSeconds = 500000;    //0.5seconds


    /**
     * constructor
     *
     * @param string $baseUrl your domain name
     * @param int $sleepMicroSeconds crawler sleep time on each request
     */
    function __construct($baseUrl, $sleepMicroSeconds = 500000){
        $this->baseNetUrl = new Net_URL($baseUrl, true);

        // A bare prefix match would also accept "http://site.com.evil.com"
        // for the base "http://site.com", so unless the base already ends in
        // "/" the prefix must be followed by "/", "?", "#" or the end.
        $boundary = (substr($baseUrl, -1) === '/') ? '' : '(?=[\/?#]|$)';
        $this->baseUrlEscaped = preg_quote($baseUrl, '/') . $boundary;

        $this->sleepMicroSeconds = $sleepMicroSeconds;

        $this->client = new HTTP_Crawler();
        $this->client->setDefaultHeader('User-Agent', 'php crawler');
    }

    /**
     * Crawls the site starting at $url. Nothing is returned; read the
     * result with getUrlList().
     *
     * The work is done in a loop instead of by recursion so that the call
     * depth does not grow with the number of pages.
     *
     * (The method name is the historical spelling and is kept for callers.)
     *
     * @param string $url crawling starting URL
     */
    public function clawThis($url){
        $url = $this->resolveUrl($url);
        $this->setUrls($url);
        $this->crawlPage($url);

        while(true){
            // every URL that is known but has not been requested yet
            $pending = array_keys($this->urlList, false, true);
            if(count($pending) === 0){
                break;
            }

            foreach($pending as $eachUrl){
                $this->crawlPage($eachUrl);
            }
        }
    }

    /**
     * Requests one page and queues the links found on it.
     */
    private function crawlPage($url){
        $links = $this->retrieveLinks($url);

        // Never leave an entry pending after it was visited, otherwise the
        // loop in clawThis() could not terminate.
        if(isset($this->urlList[$url]) && $this->urlList[$url] === false){
            $this->urlList[$url] = array('title' => '', 'code' => null, 'type' => '');
        }

        $this->setUrls($links);
    }

    /**
     * Turns $url into an absolute URL.
     *
     * @param string      $url        absolute or relative address
     * @param string|null $contextUrl page the address was found on; the
     *                                site base is used when null
     * @return string
     */
    private function resolveUrl($url, $contextUrl = null){
        // A fresh Net_URL is handed over on every call in case the client
        // modifies the object it is given while resolving.
        if($contextUrl === null){
            $base = clone $this->baseNetUrl;
        }else{
            $base = new Net_URL($contextUrl, true);
        }

        return $this->client->getAbsoluteUrl($base, $url);
    }

    /**
     * Tells whether $each may be crawled, judged by its ending.
     *
     * Rejected are the listed file extensions (case-insensitive) and, via
     * the last alternative, a "." immediately followed by a "#fragment".
     *
     * @param string $each absolute URL
     * @return bool
     */
    private static function isValidFile($each){

        $doNotCrawlFiles = array(
            'jpg',
            'gif',
            'png',
            'zip',
            'lzh',
            'xls',
            'ppt',
            'doc',
            'tif',
            'exe',
            'avi',
            'mpg',
            'swf',
            'mp3',
            '#[^#]*'
        );

        if(preg_match('/\.(' . join('|', $doNotCrawlFiles) . ')$/i', $each)){
            return false;
        }
        return true;
    }

    /**
     * Tells whether $url lies below the base URL given to the constructor.
     *
     * @param string $url absolute URL
     * @return bool
     */
    private function isLocalAddress($url){
        return preg_match(sprintf('/^%s/i', $this->baseUrlEscaped), $url) === 1;
    }

    /**
     * Registers URLs as "known, not requested yet".
     *
     * Empty values, foreign addresses, excluded file types and URLs that are
     * already known are ignored.
     *
     * @param string|array $urls one URL or a list of URLs
     */
    private function setUrls($urls = array()){
        if(!is_array($urls)){
            $urls = array($urls);
        }

        foreach($urls as $each){
            if($each === null || $each === ''){
                continue;
            }

            $each = $this->resolveUrl($each);

            if(isset($this->urlList[$each]) ||
                !$this->isLocalAddress($each) ||
                !self::isValidFile($each)){
                continue;
            }

            $this->urlList[$each] = false;
        }
    }

    /**
     * Requests $url, stores its title / status code / content type and
     * returns the links found in the document.
     *
     * @param string $url absolute URL that is registered and still pending
     * @return array absolute URLs, resolved against $url
     */
    private function retrieveLinks($url){
        // unknown or already requested
        if(!isset($this->urlList[$url]) || $this->urlList[$url] !== false){
            return array();
        }

        if(!$this->isLocalAddress($url)){
            return array();
        }

        $this->urlList[$url] = array('title' => '', 'code' => null, 'type' => '');

        $result = $this->client->get($url);
        usleep($this->sleepMicroSeconds);

        // NOTE(review): get() is expected to hand back a PEAR_Error when the
        // request itself fails; currentResponse() would then not describe
        // this URL, so nothing more is recorded.
        if($result instanceof PEAR_Error){
            return array();
        }

        $response = $this->client->currentResponse();
        if(!is_array($response)){
            return array();
        }

        if(isset($response['code'])){
            $this->urlList[$url]['code'] = $response['code'];
        }
        if(isset($response['headers']['content-type']) &&
            is_string($response['headers']['content-type'])){
            $this->urlList[$url]['type'] = $response['headers']['content-type'];
        }

        $body = isset($response['body']) ? (string)$response['body'] : '';
        if(trim($body) === ''){
            // nothing to parse; the DOM loader does not accept empty input
            return array();
        }

        $encoding = mb_detect_encoding($body);
        if($encoding !== false){
            $body = mb_convert_encoding($body, mb_internal_encoding(), $encoding);
        }

        $document = new DOMDocument('1.0', mb_internal_encoding());
        $document->preserveWhiteSpace = false;

        // Real-world markup is rarely valid; collect parser complaints
        // internally instead of emitting a warning per problem.
        $previous = libxml_use_internal_errors(true);
        $document->loadHTML($body);
        libxml_clear_errors();
        libxml_use_internal_errors($previous);

        $titles = $document->getElementsByTagName('title');
        if($titles->length > 0){
            $this->urlList[$url]['title'] = $titles->item(0)->textContent;
        }

        $found = array();
        foreach($document->getElementsByTagName('a') as $eachElement){
            $found[] = $eachElement->getAttribute('href');
        }
        foreach($document->getElementsByTagName('meta') as $eachElement){
            if(!preg_match('/refresh/i', $eachElement->getAttribute('http-equiv'))){
                continue;
            }

            // content looks like "5; url=http://example.com/"
            $content = $eachElement->getAttribute('content');
            foreach(explode(';', $content) as $each){
                if(preg_match('/url=(.*)/i', trim($each), $match)){
                    $found[] = trim($match[1], " \t\r\n'\"");
                    break;
                }
            }
        }

        $links = array();
        foreach($found as $link){
            $link = trim($link);

            // empty href or a jump inside the same page
            if($link === '' || $link[0] === '#'){
                continue;
            }
            // mailto:, javascript:, tel:, ftp: ... are not crawlable pages
            if(preg_match('/^[a-z][a-z0-9+.\-]*:/i', $link) &&
                !preg_match('/^https?:/i', $link)){
                continue;
            }

            // relative links are relative to the page they appear on
            $links[] = $this->resolveUrl($link, $url);
        }

        return array_unique($links);
    }

    /**
     * returns an array of found pages
     *
     * Only requested pages whose content type contains "text/html" and
     * whose URL has no "#fragment" are included; the list is sorted by URL.
     *
     * format:
     * array(
     *      'url_address' => array(
     *          'title' => page title,
     *          'code'  => HTTP status code,
     *          'type'  => content type
     *      )
     * )
     *
     * @return array
     */
    public function getUrlList(){
        $list = array();
        foreach($this->urlList as $url => $attributes){
            // still pending (false) or recorded without a usable type
            if(!is_array($attributes) ||
                !isset($attributes['type']) ||
                !is_string($attributes['type'])){
                continue;
            }

            if(
                preg_match('/text\/html/i', $attributes['type']) &&
                !preg_match('/#(.*)$/i', $url)
            ){
                $list[$url] = $attributes;
            }
        }

        ksort($list);
        return $list;
    }
}

you can write a batch script to crawl your site periodically.

<?php
/**
 * Batch script: crawls the whole site and stores the serialized page list
 * next to this script as "sitemap.txt".
 *
 * The file name is written in lower case because the sitemap action reads
 * "sitemap.txt"; on a case-sensitive file system "Sitemap.txt" would never
 * be found. NOTE(review): the action looks in symfony's sf_bin_dir, so this
 * script is presumably meant to live in that directory -- confirm.
 */
ini_set('max_execution_time', 0);
error_reporting(E_ALL);
require_once('Crawler.class.php');

$root = 'http://YOUR_OWN_SITE_URL';

$crawler = new Crawler($root);
$crawler->clawThis($root);

$pageList = $crawler->getUrlList();
$fileName = dirname(__FILE__) . DIRECTORY_SEPARATOR . 'sitemap.txt';

// LOCK_EX keeps a reader from picking up a half-written cache file.
if(file_put_contents($fileName, serialize($pageList), LOCK_EX) === false){
    trigger_error('could not write sitemap cache: ' . $fileName, E_USER_WARNING);
    exit(1);
}

to output sitemap xml, write an action like this

/**
 * Prepares the data for the sitemap XML template.
 *
 * Reads the serialized page list from "sitemap.txt" in sf_bin_dir and
 * exposes it as $this->pageList; $this->today is the modification date of
 * that file (Y-m-d). When the cache is missing, unreadable or damaged the
 * template receives an empty list and the current date instead of
 * false / 1970-01-01.
 */
public function executeSitemapXml(){
    error_reporting(E_ALL);

    $sitemapCacheFile = sfConfig::get('sf_bin_dir') . DIRECTORY_SEPARATOR . 'sitemap.txt';

    $pageList = array();
    $modified = false;

    if(is_readable($sitemapCacheFile)){
        $sitemapCache = file_get_contents($sitemapCacheFile);
        $modified = filemtime($sitemapCacheFile);

        if($sitemapCache !== false){
            // The cache is expected to be written by our own batch script;
            // never point unserialize() at data an outsider can modify.
            // rtrim() drops a line ending a writer may have appended.
            $data = unserialize(rtrim($sitemapCache));
            if(is_array($data)){
                $pageList = $data;
            }
        }
    }

    $this->pageList = $pageList;
    $this->today = date('Y-m-d', ($modified !== false) ? $modified : time());
}

its template is like this

<?php
/*
 * Sitemap XML template.
 *
 * Expects $pageList (url => attributes) and $today (Y-m-d).
 *
 * The XML declaration is echoed from PHP: written literally it would be
 * taken for a PHP open tag when short_open_tag is enabled. URLs are escaped
 * because a raw "&" in a query string makes the document invalid XML.
 */
echo '<?xml version="1.0" encoding="UTF-8"?>' . "\n";
?>
<urlset
  xmlns="http://www.google.com/schemas/sitemap/0.84"
  xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
  xsi:schemaLocation="http://www.google.com/schemas/sitemap/0.84
                      http://www.google.com/schemas/sitemap/0.84/sitemap.xsd">

<?php foreach ($pageList as $url => $attributes):?>
<url>
  <loc><?php echo htmlspecialchars($url, ENT_QUOTES, 'UTF-8')?></loc>
  <lastmod><?php echo $today?></lastmod>
  <changefreq>weekly</changefreq>
  <priority>0.8</priority>
</url>
<?php endforeach;?>

</urlset>

Google needs the sitemap XML to be in the top directory, so you need to write a routing rule like this.

# Route "sitemap_xml": serves the URL /sitemapxml with the sitemapXml action
# of the default module.
sitemap_xml:
  url:   /sitemapxml
  param: { module: default, action: sitemapXml }

this is actually used on our site sticker20.com

by sticker 20 on 2006-10-04, tagged crawler  google  sitemap  xml 
(4 comments)