Node.js: parsing a remote HTML page

Node.js: parsing a remote HTML page

In Node.js, parsing a remote HTML page is quite simple.

In Node.js, parsing a remote HTML page is quite simple.

We need two packages for our task:


npm install request --save
npm install cheerio --save

Our code is as follows:


'use strict';

const request = require('request');
const cheerio = require('cheerio');
const url = 'http://www.html.it/autore/gabroman/';
const fs = require('fs');

/**
 * Fetches the page at the module-level `url` and passes its body to `cb`.
 *
 * The request is issued with a 3000 ms timeout. When the request fails, the
 * error is reported on stderr and `cb` is NOT invoked.
 *
 * @param {function(string): void} cb - Receives the response body when the
 *     request completes without an error.
 */
const getPage = ( cb ) => {
    request(url, {
        timeout: 3000
    }, (error, response, body) => {
        if (error) {
            // A failed request used to be dropped silently, which made the
            // script exit without producing any output or diagnostic.
            console.error(`Request to ${url} failed:`, error);
            return;
        }
        cb(body);
    });
};

/**
 * Writes the collected data to `articles.js` in this script's directory as a
 * CommonJS module that exports the data under the name `HTMLItArticles`.
 *
 * The file is written synchronously and replaces any existing file.
 *
 * @param {*} data - Value to serialise with `JSON.stringify` (the article list).
 */
const savePage = ( data ) => {
    // Each entry is one line of the generated module; empty strings become
    // the blank separator lines once everything is joined with '\n'.
    const moduleLines = [
        "'use strict';",
        '',
        'const HTMLItArticles = ' + JSON.stringify( data ) + ';',
        '',
        'module.exports = HTMLItArticles;'
    ];

    fs.writeFileSync(__dirname + '/articles.js', moduleLines.join('\n'));
};

/**
 * Extracts article links from an HTML document.
 *
 * Every element matching `.author-article` yields one record built from the
 * `a` element(s) found inside it.
 *
 * @param {string} data - HTML markup to parse.
 * @returns {Array<{title: string, url: (string|undefined)}>} One record per
 *     matched element, in document order.
 */
const parsePage = ( data ) => {
    const $ = cheerio.load(data);

    return $( ".author-article" ).toArray().map( ( node ) => {
        const link = $(node).find( 'a' );
        return {
            title: link.text(),
            url: link.attr( 'href' )
        };
    });
};

getPage() performs the remote request and gets the HTML page as a string. Its output is passed to parsePage(), which retrieves the relevant data using the jQuery-like selectors, methods and functions provided by the cheerio package. This approach closely resembles the operations you can run in PHP using the DOM and SimpleXML extensions together.

savePage() simply writes a JavaScript file (articles.js, in the same directory as the script) that exports an array of objects containing the information gathered. Finally, we run our code:


// Entry point: download the page, extract the article list, write it to disk.
getPage( ( markup ) => {
    savePage( parsePage( markup ) );
});

The result is shown below (reformatted here for readability; the generated file contains the array on a single line).


    'use strict';

    const HTMLItArticles = [{
        "title": "WooCommerce REST API: sincronizzazione con NodeJS e MongoDB",
        "url": "http://www.html.it/guide/woocommerce-rest-api-sincronizzazione-con-nodejs-e-mongodb/?cref="
    }, {
        "title": "Creare Plugin per WordPress, la guida",
        "url": "http://www.html.it/guide/creare-plugin-per-wordpress/?cref=development"
    }, {
        "title": "Connettere WordPress e Excel grazie a Python",
        "url": "http://www.html.it/articoli/wordpress-dialogo-con-python-tramite-api-xml-rpc/?cref=development"
    }, {
        "title": "WordPress e la gestione degli errori",
        "url": "http://www.html.it/articoli/wordpress-e-la-gestione-degli-errori/?cref=development"
    }, {
        "title": "WordPress: utilizzo dei tag condizionali",
        "url": "http://www.html.it/articoli/wordpress-utilizzo-dei-tag-condizionali/?cref=development"
    }, {
        "title": "WordPress: eseguire la migrazione di un sito esistente",
        "url": "http://www.html.it/articoli/wordpress-eseguire-la-migrazione-di-un-sito-esistente/?cref=development"
    }, {
        "title": "WordPress e gli “attachments”",
        "url": "http://www.html.it/articoli/wordpress-gestire-i-loop-degli-allegati/?cref=development"
    }, {
        "title": "WordPress: aggiungere una sezione portfolio a TwentyFourteen",
        "url": "http://www.html.it/articoli/wordpress-aggiungere-una-sezione-portfolio-a-twentyfourteen/?cref=development"
    }, {
        "title": "Pulsanti social per WordPress con jQuery",
        "url": "http://www.html.it/articoli/definire-una-classe-per-gestire-le-condivisioni-social-con-wordpress-2/?cref=development"
    }, {
        "title": "Creare una classe per gestire gli shortcode di WordPress",
        "url": "http://www.html.it/articoli/creare-una-classe-per-gestire-gli-shortcode-di-wordpress/?cref=development"
    }];
    
    module.exports = HTMLItArticles;