The other week I found myself needing to populate a database with information gathered from various spots around the web. Unfortunately, none of the information was available in a standard format (RSS, XML), so I had to scrape the HTML pages the old-fashioned way. I wrote this little scraper class to help out with the parsing.
Here's a usage example:
-
// Create a scraper and load the page that is to be parsed.
$scraper = new Scraper();
$scraper->getRemoteText("http://flickr.com/explore/");

// Move the cursor just past the "Interestingness" marker, then capture the
// value of the first src="..." attribute that follows it. jumpNextToken()
// returns false when the marker is absent; in that case $imgURL stays empty
// instead of picking up an unrelated src attribute from elsewhere on the page.
$imgURL = "";
if ($scraper->jumpNextToken('"Interestingness"')) {
    $imgURL = $scraper->scrapeNext('src="', '"');
}
$imgURL would then contain the URL of the main image found on the Explore page of Flickr. There are, of course, much better ways to do things like this using XML and RSS when they are available, but sometimes you've just got to take what you're given.
-
/**
 * Scraper class, for scraping data out of bodies of text (html).
 *
 * The scraper holds a body of text (the "haystack") and a cursor (the "head").
 * Every search starts at the head and moves forward, so successive calls walk
 * through the text from beginning to end.
 *
 * Author: Ian Marsh
 * Version 0.1
 */
class Scraper
{
    /** @var string The text we will be scraping. */
    public $haystack;

    /** @var int Current position (byte offset) in the haystack. */
    public $head;

    /**
     * Constructor.
     *
     * @param string $text Initial text to scrape; may be left empty and
     *                     filled in later with getRemoteText().
     */
    public function __construct($text = "")
    {
        $this->haystack = $text;
        $this->head = 0;
    }

    /**
     * Legacy PHP 4 style initialiser, kept so existing code that calls
     * Scraper() explicitly (e.g. parent::Scraper()) keeps working.
     *
     * @deprecated Use the constructor instead.
     *
     * @param string $text Text to scrape.
     */
    public function Scraper($text = "")
    {
        $this->haystack = $text;
        $this->head = 0;
    }

    /**
     * getRemoteText
     *
     * Downloads $url with cURL and makes the response body the new haystack.
     * The head is reset to 0 because a position inside the previous text has
     * no meaning in the new one. On failure the haystack is left as an empty
     * string so later searches simply find nothing.
     *
     * @param string $url     Address of the page to download.
     * @param int    $timeout Connection timeout in seconds.
     *
     * @return bool True when a body was downloaded, false otherwise.
     */
    public function getRemoteText($url, $timeout = 5)
    {
        $this->haystack = "";
        $this->head = 0;

        $ch = curl_init();
        if ($ch === false) {
            return false;
        }

        curl_setopt($ch, CURLOPT_URL, $url);
        // Ask cURL to return the body instead of printing it.
        curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
        curl_setopt($ch, CURLOPT_CONNECTTIMEOUT, $timeout);
        $body = curl_exec($ch);
        curl_close($ch);

        // curl_exec() yields false when the transfer fails.
        if (!is_string($body)) {
            return false;
        }

        $this->haystack = $body;

        return true;
    }

    /**
     * setPosition
     *
     * @param int $newPos New head position (byte offset into the haystack).
     */
    public function setPosition($newPos = 0)
    {
        $this->head = $newPos;
    }

    /**
     * getPosition
     *
     * @return int Current head position.
     */
    public function getPosition()
    {
        return $this->head;
    }

    /**
     * hasMoreTokens
     *
     * Reports whether $token occurs at or after the head. The head itself is
     * not moved.
     *
     * @param string $token Text to look for.
     *
     * @return bool True when another occurrence exists.
     */
    public function hasMoreTokens($token)
    {
        // Strict comparison: a match at offset 0 is a valid hit, not "false".
        return $this->findFromHead($token) !== false;
    }

    /**
     * jumpNextToken
     *
     * Moves the head to the next occurrence of $token. The head is left
     * untouched when the token is not found.
     *
     * @param string $token      Text to jump to.
     * @param bool   $endOfToken True to land just after the token, false to
     *                           land on its first character.
     *
     * @return bool True when the token was found and the head was moved.
     */
    public function jumpNextToken($token = " ", $endOfToken = true)
    {
        $find = $this->findFromHead($token);
        if ($find === false) {
            return false;
        }

        $this->setPosition($endOfToken ? $find + strlen((string) $token) : $find);

        return true;
    }

    /**
     * scrapeNext
     *
     * Returns the text between the next $startToken and the $endToken that
     * follows it. On success the head is left on the first character of
     * $endToken. When $startToken is missing the head does not move; when
     * only $endToken is missing the head stays just after $startToken, so
     * loops guarded by hasMoreTokens() still make progress.
     *
     * @param string $startToken Text that precedes the wanted value.
     * @param string $endToken   Text that follows the wanted value.
     *
     * @return string The captured text, or "" when either token is missing.
     */
    public function scrapeNext($startToken, $endToken)
    {
        // Without a start token there is nothing meaningful to capture.
        if (!$this->jumpNextToken($startToken)) {
            return "";
        }

        $endScrape = $this->findFromHead($endToken);
        if ($endScrape === false) {
            return "";
        }

        $scrape = substr((string) $this->haystack, $this->head, $endScrape - $this->head);
        $this->setPosition($endScrape);

        return $scrape;
    }

    /**
     * Finds the next occurrence of $token at or after the head.
     *
     * Guards the cases in which strpos() would warn or throw: an empty token
     * and a head that lies outside the haystack.
     *
     * @param string $token Text to look for.
     *
     * @return int|false Offset of the match, or false when there is none.
     */
    private function findFromHead($token)
    {
        $text = (string) $this->haystack;
        $token = (string) $token;
        $head = (int) $this->head;

        if ($token === '' || $head < 0 || $head > strlen($text)) {
            return false;
        }

        return strpos($text, $token, $head);
    }
}




