Included are a few simple examples of using the node-car-scraper wrapper.
Install Node.js by following the instructions on the Node website.
git clone https://www.github.com/JTarasovic/node-car-examples MyFolder
cd MyFolder && npm install
node examples/simple.js
This version just passes a site (reddit.com) to node-car-scraper
which fetches the site and returns a cheerio
object.
// Start the crawl at reddit.com. The callbacks run in order: oneb for the
// first page, two for each page oneb returns, three for each link two
// collects, and four once after every link has been processed.
processSite('http://www.reddit.com', oneb, two, three, four);
/**
 * Site callback: ignores the fetched page and hands back a fixed list of
 * two URLs to visit next.
 *
 * @param {*} err - Unused.
 * @param {*} $ - Unused (the parsed page is ignored).
 * @param {Function} cb - Invoked with the array of URLs.
 * @returns {*} Whatever `cb` returns.
 */
function oneb (err, $, cb) {
  const nextPages = [
    'https://developer.mozilla.org',
    'http://nodejs.org',
  ];
  return cb(nextPages);
}
The returned object is ignored and an array with two different sites is passed back to the callback. node-car-scraper
gets both of those pages and calls the link callback, which parses all of the absolute links off of the pages and sends them back to node-car-scraper,
which requests each of those links.
/**
 * Page callback: collects every absolute link on the page, skipping
 * tarball downloads, and passes the list to `cb`.
 *
 * @param {*} err - Unused.
 * @param {Function} $ - cheerio-style selector function for the fetched page.
 * @param {Function} cb - Invoked with the array of collected hrefs.
 * @returns {undefined}
 */
function two (err, $, cb) {
  const links = [];
  $('a').each(function (i, elem) {
    // Declared locally: the original assigned to an undeclared `temp`,
    // which leaks a global (and throws in strict mode / ES modules).
    const href = $(elem).attr('href');
    // Anchors without an href yield undefined; skip them instead of
    // throwing on `.startsWith`.
    if (typeof href !== 'string') {
      return;
    }
    if (href.startsWith('http') && !href.endsWith('tar.gz')) {
      links.push(href);
    }
  });
  cb(links);
}
The results of those requests are, again, ignored for simplicity; however, the callback is called.
/**
 * Link callback: ignores the fetched page and simply signals completion
 * by calling `cb` with no arguments.
 *
 * @param {*} err - Unused.
 * @param {*} $ - Unused.
 * @param {Function} cb - Invoked with no arguments.
 * @returns {*} Whatever `cb` returns.
 */
function three (err, $, cb) {
  const outcome = cb();
  return outcome;
}
After all of the links are processed, the final callback is called.
/**
 * Final callback: logs a fixed message to stdout.
 *
 * @returns {undefined}
 */
function four () {
  const message = 'HOLY SHIT!!';
  console.log(message);
}
node index.js
This version uses a class that contains the necessary methods and properties to query a car dealership, get all of the search results and store them in MongoDB.
node server.js
This is very much a quick and dirty web server that returns the results in a nice table if you navigate to localhost:3000.
I'd like to migrate this to Handlebars or something similar instead of fiddling with the HTML directly, but it works for the short term.
There are a couple of options:
See bmw.js for more details on this method.
This is probably the preferred method as you can create a separate class for each dealership/website that you would like to query.
var processSite = require('node-car-scraper');
var Car = require('MyCarClass');
var car = new Car();
// NOTE(review): the methods are passed as bare function references, so
// `this` inside them may not be `car` unless the class binds them — confirm.
processSite(car.url, car.siteCallback, car.pageCallback, car.linkCallback, car.finalCallback);
var processSite = require('node-car-scraper');
// Inline style: pass the start URL followed by the four callbacks as
// anonymous functions.
processSite('http://www.example.com',
function(err, $, callback){
// ... send back an array of pages to visit based on the first URL visited
callback(arr);
},
function(err, $, callback){
// ... send back an array of links for individual pages to query
callback(arr);
},
function(err, $, callback){
// ... given one page with the necessary details, parse out what is relevant to you.
// ... this is where you'd add an entry into a DB if desired
// ... call the callback with no args
callback();
},
function(){
// ... final callback. Only called once when all of the links have been processed.
console.log('WOOT! Finished!');
});
// Named-function style: function declarations are hoisted, so the callbacks
// may be defined after this call.
processSite(url, getPagesFromSite, getLinksFromPage, getDetailsFromIndividualPage, finished);
/**
 * First callback: runs on the start URL.
 *
 * @param {*} err - Error argument supplied by the caller.
 * @param {Function} $ - Parsed page.
 * @param {Function} callback - Call with an array of pages to visit.
 */
function getPagesFromSite (err, $, callback) {
  // ... build `arr`, the array of pages to visit, from the first URL visited
  callback(arr);
}
/**
 * Second callback: runs on each page returned by the first callback.
 *
 * @param {*} err - Error argument supplied by the caller.
 * @param {Function} $ - Parsed page.
 * @param {Function} callback - Call with an array of links to query.
 */
function getLinksFromPage (err, $, callback) {
  // ... send back an array of links for individual pages to query
  callback(arr);
}
/**
 * Third callback: runs on each individual link collected by the second
 * callback.
 *
 * @param {*} err - Error argument supplied by the caller.
 * @param {Function} $ - Parsed page.
 * @param {Function} callback - Call with no arguments when done.
 */
function getDetailsFromIndividualPage (err, $, callback) {
  // ... given one page with the necessary details, parse out what is relevant to you.
  // ... this is where you'd add an entry into a DB if desired
  // ... call the callback with no args
  callback();
}
/**
 * Final callback: called once, after all of the links have been processed.
 *
 * @returns {undefined}
 */
function finished () {
  const doneMessage = 'WOOT! Finished!';
  console.log(doneMessage);
}
NODE_ENV=debug node fileToRun.js
Using the debug flag causes node-car-scraper
to output quite a lot of information about what it is doing to stdout. As such, it may be helpful to redirect fd 1 (stdout) to a file.
NODE_ENV=debug node fileToRun.js > debug.log