Node.js: how to convert RSS to JSON

Node.js: how to convert RSS to JSON

In this tutorial we're going to convert an RSS feed into a JSON object with Node.js.

In this tutorial we're going to convert an RSS feed into a JSON object with Node.js.

Unlike PHP, Node.js needs an external module in order to turn an XML string into a DOM document. In our case, we're going to use the cheerio NPM package.

In an RSS feed, CDATA blocks can contain HTML strings. In such cases we need to use cheerio a second time in order to convert the HTML string into a DOM fragment that we can query.

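Certain elements may be hosted within an XML namespace (for example content:encoded). In this case we have to escape the colon in the selector (written as content\\:encoded inside a JavaScript string literal), because an unescaped : token would be interpreted as the start of a CSS pseudo-class or pseudo-element.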

We can start with the following structure:

'use strict';

const https = require('https');
const cheerio = require('cheerio');
/** Identity callback: hands back whatever it receives. Used as the default formatter. */
const noop = (value) => value;

/**
 * Holds the feed URL and the three formatting callbacks supplied by the client.
 * Any formatter that is not provided falls back to `noop`.
 */
class FeedParser {
    /**
     * @param {Object} options
     * @param {string} options.url - Address of the RSS feed.
     * @param {Function} [options.linkFormatter] - Callback applied to each item link.
     * @param {Function} [options.contentFormatter] - Callback applied to each item content.
     * @param {Function} [options.dateFormatter] - Callback applied to each item date.
     */
    constructor(options) {
        const {
            url,
            linkFormatter = noop,
            contentFormatter = noop,
            dateFormatter = noop
        } = options;

        Object.assign(this, { url, linkFormatter, contentFormatter, dateFormatter });
    }
}

Formatting links, contents and dates is up to the client of our class, so we define three callback options whose default value is noop, an identity function that returns its argument unchanged. With this default, a client that does not supply a formatter still gets the original value back instead of undefined.

The first method to be defined will fetch the remote feed.

getFeed() {
        return new Promise((resolve, reject) => {  
            try {
                const uri = new URL(this.url);
                const { hostname, pathname } = uri;
                const options = {
                    hostname: hostname,
                    port: 443,
                    path: pathname,
                    method: 'GET'
                };
                const req = https.request(options, res => {
                    let body = '';
                    res.on('data', d => {
                        body += d;
                    });
                    res.on('end', () => {
                       resolve(body); 
                    });
                });
        
                req.on('error', error => {
                    reject(error);
                });
                  
                req.end();
                  
            } catch(err) {
                reject(err);
            }
        });
    }

Then we need to remove the CDATA markers and fetch the first image and paragraph of each post.

removeCDATA(str) {
        return str.replace('<![CDATA[', '').replace(']]>', '').trim();
    }

    getImage(content) {
        const $ = cheerio.load(content);
       return $('img').eq(0).attr('src');
    }

    getContent(html) {
        const $ = cheerio.load(html);
        return $('p').eq(0).text();
    }

Later, we can parse the RSS feed and return an array of items.

parseFeed(xml) {
        const $ = cheerio.load(xml, null, false);
        const output = [];
        const self = this;

        $('item').each(function() {
            let $item = $(this);
            let title = self.removeCDATA($item.find('title').text());
            let link = $item.find('guid').text();
            let published = self.dateFormatter(new Date($item.find('pubDate').text()));
            let permalink = self.linkFormatter(link);
            let content = self.contentFormatter(self.getContent(self.removeCDATA($item.find('content\\:encoded').html())));
            let image = self.getImage(self.removeCDATA($item.find('content\\:encoded').html()));

            output.push({
                title,
                permalink,
                published,
                content,
                image
            });
        });
        return output;
    }

Finally, we can create the main method of our class.

async render() {
        try {
            const xml = await this.getFeed();
            const data = this.parseFeed(xml);
            return data;
        } catch(err) {
            return new Error('Error while parsing RSS feed.');
        }
    }

We can use our class as follows:

// Example client: reads a Medium feed, rewrites each link to another domain,
// formats dates as DD/MM/YYYY and prints the resulting items as JSON.
(async () => {
    const feedParser = new FeedParser({
        url: 'https://medium.com/feed/@gabriele-romanato',
        linkFormatter(link) {
            return link.replace('https://medium.com/p/', 'https://gabrieleromanato.it/');
        },
        dateFormatter(pubDate) {
            const year = String(pubDate.getFullYear());
            // getMonth() is zero-based, hence the + 1.
            const month = String(pubDate.getMonth() + 1).padStart(2, '0');
            const day = String(pubDate.getDate()).padStart(2, '0');

            return `${day}/${month}/${year}`;
        }
    });

    // Must use the instance declared above; any other name is a ReferenceError.
    const data = await feedParser.render();

    // render() resolves with an Error value on failure rather than rejecting.
    if (data instanceof Error) {
        console.error(data.message);
        return;
    }

    console.log(JSON.stringify(data));
})();