Integrate Scrapoxy to Node.js
Goal
Is it easy to find a good JavaScript developer in Paris? No!
So, it’s time to build a scraper with Node.js, Request and Cheerio to find our perfect profile.
The site Scraping Challenge indexes a lot of profiles (fake, for demo purposes). We want to list them.
However, the site is protected against scraping! We must use Scrapoxy to bypass the protection.
Step 1: Create a Node.js project
Install Node.js
Install the latest Node.js version.
Create a new project
Create a directory for the project:
mkdir nodejs-request
cd nodejs-request
Create the package.json:
npm init --yes
Add dependencies:
npm install lodash bluebird cheerio request winston@2.x --save
What are these dependencies?
lodash is a JavaScript utility library,
bluebird is a promise library,
cheerio is a jQuery-like HTML parser,
request makes HTTP requests,
winston is a logger.
Add a scraper
Add this content to index.js
// Third-party dependencies, one declaration per module
const _ = require('lodash');
const Promise = require('bluebird');
const cheerio = require('cheerio');
const request = require('request');
const winston = require('winston');

// Set the logger level to 'debug'
winston.level = 'debug';
const config = {
    // Site to scrape
    source: 'http://scraping-challenge-2.herokuapp.com',

    // Extra options merged into every HTTP request (none yet)
    opts: {},
};
// Scrape one profile page. A failure is only logged, so the mapped value is
// undefined and the remaining profiles are still processed.
const scrapeProfile = (url) => getProfile(url, config.opts)
    .then((profile) => {
        winston.debug('Found %s', profile.name);
        return profile;
    })
    .catch(() => {
        winston.debug('Cannot retrieve %s', url);
    });

// Scrape every profile, one request at a time, then report the success count.
const scrapeProfiles = (urls) => Promise
    .map(urls, scrapeProfile, {concurrency: 1})
    .then((profiles) => {
        // Drop the undefined entries left by failed requests
        const results = _.compact(profiles);
        winston.info('Extract %d on %d profiles', results.length, urls.length);
    });

// List the profile URLs, wait for the instances to scale, then scrape them all
getProfilesUrls(config.source, config.opts)
    .then((urls) => {
        winston.info('Found %d profiles', urls.length);
        winston.info('Wait 120 seconds to scale instances');
        return urls;
    })
    // 2 minutes, expressed in milliseconds
    .delay(2 * 60 * 1000)
    .then(scrapeProfiles)
    .catch((err) => winston.error('Error: ', err));
////////////
/**
 * Fetch the index page and collect the absolute URL of every profile link.
 *
 * Each `href` found under `.profile a` is resolved against `url` (the page
 * that was actually requested), so relative, root-relative and absolute
 * links are all handled.
 *
 * @param url Main URL (the index page listing the profiles)
 * @param defaultOpts options for http request
 * @returns {promise} resolves with an array of absolute profile URLs;
 *     rejects with the request error, with the response body when the
 *     status code is not 200, or with the error raised while parsing
 */
function getProfilesUrls(url, defaultOpts) {
    return new Promise((resolve, reject) => {
        // Create options for the HTTP request:
        // the default options plus the URL, merged into a fresh object
        const opts = _.merge({}, defaultOpts, {url});

        request(opts, (err, res, body) => {
            if (err) {
                return reject(err);
            }

            if (res.statusCode !== 200) {
                return reject(body);
            }

            let urls;
            try {
                // Load content into a jQuery-like parser
                const $ = cheerio.load(body);

                // Extract all hrefs and resolve them against the requested URL
                urls = $('.profile a')
                    .map((i, el) => $(el).attr('href'))
                    .get()
                    .map((href) => new URL(href, url).href);
            } catch (parseErr) {
                // Reject instead of throwing inside the request callback
                return reject(parseErr);
            }

            resolve(urls);
        });
    });
}
/**
 * Download one profile page and read the profile name from it.
 *
 * @param url URL of the profile
 * @param defaultOpts options for http request
 * @returns {promise} resolves with `{name}`; rejects with the request error,
 *     or with the response body when the status code is not 200
 */
function getProfile(url, defaultOpts) {
    return new Promise((resolve, reject) => {
        // Request options: the defaults plus the profile URL, in a new object
        const requestOptions = _.merge({}, defaultOpts, {url});

        const onResponse = (error, response, html) => {
            if (error) {
                reject(error);
            } else if (response.statusCode !== 200) {
                reject(html);
            } else {
                // Parse the page and pull the text of the name element
                const $ = cheerio.load(html);
                const name = $('.profile-info-name').text();

                resolve({name});
            }
        };

        request(requestOptions, onResponse);
    });
}
Run the script
Let’s try our new scraper!
Run this command:
node index.js
The script scrapes the site and lists the profiles.
However, Scraping Challenge is protected! All requests fail…
We will integrate Scrapoxy to bypass the protection.
Step 2: Integrate Scrapoxy to the script
Install Scrapoxy
See Quick Start to install Scrapoxy.
Start Scrapoxy
Set the maximum number of instances to 6, then start Scrapoxy (see Change scaling with GUI).
Warning
Don’t forget to set the maximum number of instances!
Edit the script
Open index.js
and modify the config value
const config = {
    // Site to scrape
    source: 'http://scraping-challenge-2.herokuapp.com',

    // Extra options merged into every HTTP request
    opts: {
        // Send the requests through Scrapoxy, listening on localhost:8888
        proxy: 'http://localhost:8888',

        // HTTPS over HTTP: do not open a CONNECT tunnel through the proxy
        tunnel: false,
    },
};
Run the script
Run this command:
node index.js
Now, all profiles are listed!