Download website to a local directory (including all css, images, js, etc.)
You can try it in demo app (source)
npm install website-scraper
var scraper = require('website-scraper');
var options = {
urls: ['http://nodejs.org/'],
directory: '/path/to/save/',
};
// with callback
scraper.scrape(options, function (error, result) {
/* some code here */
});
// or with promise
scraper.scrape(options).then(function (result) {
/* some code here */
});Makes requests to urls and saves all files found with sources to directory.
options - object containing next options:
urls: array of urls to load and filenames for them (required, see example below)
directory: path to save loaded files (required)
defaultFilename: filename for index page (optional, default: 'index.html')
sources: array of objects to load; specifies selectors and attribute values to select files for loading (optional, see default value in lib/config/defaults.js)
subdirectories: array of objects; specifies subdirectories for file extensions. If null, all files will be saved to directory (optional, see example below)
request: object, custom options for request (optional, see example below)
recursive: boolean; if true, scraper will follow anchors in html files. Don't forget to set maxDepth to avoid infinite downloading (optional, see example below)
maxDepth: positive number, maximum allowed depth for dependencies (optional, see example below)
callback - callback function (optional), includes following parameters:
error: if error — Error object; if success — null
result: if error — null; if success — array of objects containing:
  url: url of loaded page
  filename: filename where page was saved (relative to directory)
Let's scrape some pages from http://nodejs.org/ with images, css, js files and save them to /path/to/save/.
Imagine we want to load:
- Home page to index.html
- About page to about.html
- Blog to blog.html
and separate files into directories:
img for .jpg, .png, .svg (full path /path/to/save/img)
js for .js (full path /path/to/save/js)
css for .css (full path /path/to/save/css)
const scraper = require('website-scraper');

scraper.scrape({
  // Pages to download; plain strings get the default filename.
  urls: [
    'http://nodejs.org/', // Will be saved with default filename 'index.html'
    {url: 'http://nodejs.org/about', filename: 'about.html'},
    {url: 'http://blog.nodejs.org/', filename: 'blog.html'}
  ],
  directory: '/path/to/save',
  // Route downloaded resources into subdirectories by file extension.
  subdirectories: [
    {directory: 'img', extensions: ['.jpg', '.png', '.svg']},
    {directory: 'js', extensions: ['.js']},
    {directory: 'css', extensions: ['.css']}
  ],
  // Which tags/attributes to scan for resource references.
  sources: [
    {selector: 'img', attr: 'src'},
    {selector: 'link[rel="stylesheet"]', attr: 'href'},
    {selector: 'script', attr: 'src'}
  ],
  // Options forwarded to the underlying HTTP request.
  request: {
    headers: {
      'User-Agent': 'Mozilla/5.0 (Linux; Android 4.2.1; en-us; Nexus 4 Build/JOP40D) AppleWebKit/535.19 (KHTML, like Gecko) Chrome/18.0.1025.166 Mobile Safari/535.19'
    }
  }
}).then(function (result) {
  console.log(result);
}).catch(function (err) {
  console.log(err);
});
// Links from example.com will be followed
// Links from those links will be ignored because their depth = 2 is greater than maxDepth
const scraper = require('website-scraper');

scraper.scrape({
  urls: ['http://example.com/'],
  directory: '/path/to/save',
  recursive: true, // follow anchors found in downloaded html pages
  maxDepth: 1      // stop after the first level of followed links
}).then(console.log).catch(console.log);