123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349 |
- "use strict";
- const http = require('http');
- const https = require('https');
- const xml2js = require('xml2js');
- const url = require('url');
- const fields = require('./fields');
- const utils = require('./utils');
/**
 * Headers sent with every feed request. Values supplied through
 * `options.headers` are merged on top of these in `parseURL`.
 * Frozen so the shared defaults cannot be modified by accident; callers
 * always receive a merged copy.
 */
const DEFAULT_HEADERS = Object.freeze({
  'User-Agent': 'rss-parser',
  'Accept': 'application/rss+xml',
});

/** Number of HTTP redirects `parseURL` follows when `options.maxRedirects` is not given. */
const DEFAULT_MAX_REDIRECTS = 5;

/** Request timeout in milliseconds used when `options.timeout` is not given. */
const DEFAULT_TIMEOUT = 60000;
/**
 * Parses RSS (0.9x, 1.0/RDF, 2.0) and Atom documents into plain feed objects
 * of the shape `{ items: [...], ...feedFields }`.
 *
 * XML is decoded with xml2js; field copying and content extraction are
 * delegated to the project's `utils` and `fields` modules.
 */
class Parser {
  /**
   * @param {Object} [options]
   * @param {Object} [options.headers] extra HTTP headers, merged over the defaults
   * @param {Object} [options.xml2js] options handed to `xml2js.Parser`
   * @param {Object} [options.customFields] `{ feed: [], item: [] }` extra fields to copy
   * @param {Object} [options.requestOptions] extra options merged into the HTTP request
   * @param {number} [options.maxRedirects] redirects to follow; `0` disables following
   * @param {number} [options.timeout] request timeout in milliseconds
   * @param {number} [options.defaultRSS] version (0.9, 1 or 2) assumed when an
   *   `<rss>` document does not declare a recognised version
   */
  constructor(options = {}) {
    options.headers = options.headers || {};
    options.xml2js = options.xml2js || {};
    options.customFields = options.customFields || {};
    options.customFields.item = options.customFields.item || [];
    options.customFields.feed = options.customFields.feed || [];
    options.requestOptions = options.requestOptions || {};
    // Only fall back when the option is absent: an explicit 0 must survive so
    // that the `this.options.maxRedirects &&` guard in parseURL can turn
    // redirect following off.
    if (options.maxRedirects === undefined || options.maxRedirects === null) {
      options.maxRedirects = DEFAULT_MAX_REDIRECTS;
    }
    if (!options.timeout) options.timeout = DEFAULT_TIMEOUT;
    this.options = options;
    this.xmlParser = new xml2js.Parser(this.options.xml2js);
  }

  /**
   * Parse an XML string into a feed object.
   *
   * @param {string} xml feed document
   * @param {Function} [callback] optional callback, wired up by `utils.maybePromisify`
   * @returns {Promise<Object>} resolves with the feed; rejects when the XML
   *   cannot be parsed, the format is not recognised, or building the feed fails
   */
  parseString(xml, callback) {
    let prom = new Promise((resolve, reject) => {
      this.xmlParser.parseString(xml, (err, result) => {
        if (err) return reject(err);
        if (!result) {
          return reject(new Error('Unable to parse XML.'));
        }
        let feed = null;
        // The builders dereference nested XML nodes and throw on malformed
        // documents (e.g. <rss> without <channel>). Throwing out of the
        // xml2js callback would escape the promise, so convert to a rejection.
        try {
          if (result.feed) {
            feed = this.buildAtomFeed(result);
          } else if (result.rss && result.rss.$ && result.rss.$.version && result.rss.$.version.match(/^2/)) {
            feed = this.buildRSS2(result);
          } else if (result['rdf:RDF']) {
            feed = this.buildRSS1(result);
          } else if (result.rss && result.rss.$ && result.rss.$.version && result.rss.$.version.match(/0\.9/)) {
            feed = this.buildRSS0_9(result);
          } else if (result.rss && this.options.defaultRSS) {
            switch (this.options.defaultRSS) {
              case 0.9:
                feed = this.buildRSS0_9(result);
                break;
              case 1:
                feed = this.buildRSS1(result);
                break;
              case 2:
                feed = this.buildRSS2(result);
                break;
              default:
                return reject(new Error("default RSS version not recognized."));
            }
          } else {
            return reject(new Error("Feed not recognized as RSS 1 or 2."));
          }
        } catch (buildError) {
          return reject(buildError);
        }
        resolve(feed);
      });
    });
    prom = utils.maybePromisify(callback, prom);
    return prom;
  }

  /**
   * Fetch a feed over HTTP(S) and parse it.
   *
   * Follows up to `options.maxRedirects` redirects and rejects when the
   * request takes longer than `options.timeout` milliseconds.
   *
   * @param {string} feedUrl absolute URL of the feed
   * @param {Function} [callback] optional callback, wired up by `utils.maybePromisify`
   * @param {number} [redirectCount=0] redirects already followed (internal use)
   * @returns {Promise<Object>} resolves with the parsed feed
   */
  parseURL(feedUrl, callback, redirectCount = 0) {
    let xml = '';
    const get = feedUrl.indexOf('https') === 0 ? https.get : http.get;
    const urlParts = url.parse(feedUrl);
    const headers = Object.assign({}, DEFAULT_HEADERS, this.options.headers);
    let timeout = null;
    let prom = new Promise((resolve, reject) => {
      const requestOpts = Object.assign({headers}, urlParts, this.options.requestOptions);
      const req = get(requestOpts, (res) => {
        const status = res.statusCode;
        if (this.options.maxRedirects && status >= 300 && status < 400 && res.headers['location']) {
          // The body of a redirect is never read; drain it so the socket is released.
          res.resume();
          if (redirectCount === this.options.maxRedirects) {
            return reject(new Error("Too many redirects"));
          }
          const newLocation = url.resolve(feedUrl, res.headers['location']);
          return this.parseURL(newLocation, null, redirectCount + 1).then(resolve, reject);
        } else if (status >= 300) {
          res.resume();
          return reject(new Error("Status code " + status));
        }
        const encoding = utils.getEncodingFromContentType(res.headers['content-type']);
        res.setEncoding(encoding);
        res.on('data', (chunk) => {
          xml += chunk;
        });
        // A connection dropped mid-body surfaces here rather than on the request.
        res.on('error', reject);
        res.on('end', () => {
          return this.parseString(xml).then(resolve, reject);
        });
      });
      req.on('error', reject);
      timeout = setTimeout(() => {
        reject(new Error("Request timed out after " + this.options.timeout + "ms"));
        // Reject first, then tear the request down so the socket is not left
        // open; any resulting 'error' event hits the already-settled promise.
        req.destroy();
      }, this.options.timeout);
    }).then(data => {
      clearTimeout(timeout);
      return data;
    }, e => {
      clearTimeout(timeout);
      return Promise.reject(e);
    });
    prom = utils.maybePromisify(callback, prom);
    return prom;
  }

  /**
   * Build a feed object from a parsed Atom document.
   *
   * @param {Object} xmlObj xml2js result whose root is `feed`
   * @returns {Object} feed with `items` and, when present, `link`, `feedUrl`,
   *   `title` and `lastBuildDate`
   */
  buildAtomFeed(xmlObj) {
    const source = xmlObj.feed;
    const feed = {items: []};
    utils.copyFromXML(source, feed, this.options.customFields.feed);
    if (source.link) {
      feed.link = utils.getLink(source.link, 'alternate', 0);
      feed.feedUrl = utils.getLink(source.link, 'self', 1);
    }
    if (source.title) {
      let title = source.title[0] || '';
      // Elements carrying attributes are objects with the text under `_`.
      if (title._) title = title._;
      if (title) feed.title = title;
    }
    if (source.updated) {
      feed.lastBuildDate = source.updated[0];
    }
    feed.items = (source.entry || []).map(entry => this.parseItemAtom(entry));
    return feed;
  }

  /**
   * Convert one Atom `<entry>` into an item object.
   *
   * `pubDate` is taken from `<published>`, falling back to `<updated>`.
   * Values that cannot be parsed as dates are ignored instead of throwing,
   * matching the tolerance of `setISODate`.
   *
   * @param {Object} entry xml2js representation of the entry
   * @returns {Object} item
   */
  parseItemAtom(entry) {
    const item = {};
    utils.copyFromXML(entry, item, this.options.customFields.item);
    if (entry.title) {
      let title = entry.title[0] || '';
      if (title._) title = title._;
      if (title) item.title = title;
    }
    if (entry.link && entry.link.length) {
      item.link = utils.getLink(entry.link, 'alternate', 0);
    }
    // Date#toISOString throws a RangeError on an invalid date; one malformed
    // timestamp must not abort parsing of the whole feed.
    const toISO = (values) => {
      if (!(values && values.length && values[0].length)) return null;
      const parsed = new Date(values[0]);
      return Number.isNaN(parsed.getTime()) ? null : parsed.toISOString();
    };
    const published = toISO(entry.published);
    if (published) item.pubDate = published;
    if (!item.pubDate) {
      const updated = toISO(entry.updated);
      if (updated) item.pubDate = updated;
    }
    if (entry.author && entry.author.length && entry.author[0].name && entry.author[0].name.length) {
      item.author = entry.author[0].name[0];
    }
    if (entry.content && entry.content.length) {
      item.content = utils.getContent(entry.content[0]);
      item.contentSnippet = utils.getSnippet(item.content);
    }
    if (entry.summary && entry.summary.length) {
      item.summary = utils.getContent(entry.summary[0]);
    }
    if (entry.id) {
      item.id = entry.id[0];
    }
    this.setISODate(item);
    return item;
  }

  /**
   * Build a feed from an RSS 0.9x document (items nested inside the channel).
   *
   * @param {Object} xmlObj xml2js result whose root is `rss`
   * @returns {Object} feed
   */
  buildRSS0_9(xmlObj) {
    const channel = xmlObj.rss.channel[0];
    const items = channel.item;
    return this.buildRSS(channel, items);
  }

  /**
   * Build a feed from an RSS 1.0 (RDF) document, where items are siblings of
   * the channel under `rdf:RDF`.
   *
   * @param {Object} xmlObj xml2js result whose root is `rdf:RDF`
   * @returns {Object} feed
   */
  buildRSS1(xmlObj) {
    const rdf = xmlObj['rdf:RDF'];
    const channel = rdf.channel[0];
    const items = rdf.item;
    return this.buildRSS(channel, items);
  }

  /**
   * Build a feed from an RSS 2.0 document, adding iTunes fields when the
   * root element declares the `xmlns:itunes` namespace.
   *
   * @param {Object} xmlObj xml2js result whose root is `rss`
   * @returns {Object} feed
   */
  buildRSS2(xmlObj) {
    const channel = xmlObj.rss.channel[0];
    const items = channel.item;
    const feed = this.buildRSS(channel, items);
    if (xmlObj.rss.$ && xmlObj.rss.$['xmlns:itunes']) {
      this.decorateItunes(feed, channel);
    }
    return feed;
  }

  /**
   * Shared RSS builder: copies channel-level fields, the channel image and
   * pagination links, then converts every item.
   *
   * @param {Object} channel parsed `<channel>` element
   * @param {Array<Object>} [items] parsed `<item>` elements
   * @returns {Object} feed
   */
  buildRSS(channel, items) {
    items = items || [];
    const feed = {items: []};
    const feedFields = fields.feed.concat(this.options.customFields.feed);
    const itemFields = fields.item.concat(this.options.customFields.item);
    if (channel['atom:link'] && channel['atom:link'][0] && channel['atom:link'][0].$) {
      feed.feedUrl = channel['atom:link'][0].$.href;
    }
    if (channel.image && channel.image[0] && channel.image[0].url) {
      const image = channel.image[0];
      feed.image = {};
      if (image.link) feed.image.link = image.link[0];
      if (image.url) feed.image.url = image.url[0];
      if (image.title) feed.image.title = image.title[0];
      if (image.width) feed.image.width = image.width[0];
      if (image.height) feed.image.height = image.height[0];
    }
    const paginationLinks = this.generatePaginationLinks(channel);
    if (Object.keys(paginationLinks).length) {
      feed.paginationLinks = paginationLinks;
    }
    utils.copyFromXML(channel, feed, feedFields);
    feed.items = items.map(xmlItem => this.parseItemRss(xmlItem, itemFields));
    return feed;
  }

  /**
   * Convert one RSS `<item>` into an item object.
   *
   * @param {Object} xmlItem xml2js representation of the item
   * @param {Array} itemFields fields to copy via `utils.copyFromXML`
   * @returns {Object} item
   */
  parseItemRss(xmlItem, itemFields) {
    const item = {};
    utils.copyFromXML(xmlItem, item, itemFields);
    if (xmlItem.enclosure) {
      // Only the attribute map of the first enclosure is kept.
      item.enclosure = xmlItem.enclosure[0].$;
    }
    if (xmlItem.description) {
      item.content = utils.getContent(xmlItem.description[0]);
      item.contentSnippet = utils.getSnippet(item.content);
    }
    if (xmlItem.guid) {
      item.guid = xmlItem.guid[0];
      // A guid with attributes (e.g. isPermaLink) is an object; keep its text.
      if (item.guid._) item.guid = item.guid._;
    }
    if (xmlItem.$ && xmlItem.$['rdf:about']) {
      item['rdf:about'] = xmlItem.$['rdf:about'];
    }
    if (xmlItem.category) item.categories = xmlItem.category;
    this.setISODate(item);
    return item;
  }

  /**
   * Add iTunes specific fields from XML to extracted JSON
   *
   * Populates `feed.itunes` (owner, image, categories, keywords and the
   * fields listed in `fields.podcastFeed`) and an `itunes` object on every
   * entry of `feed.items`.
   *
   * @access public
   * @param {object} feed extracted
   * @param {object} channel parsed XML
   */
  decorateItunes(feed, channel) {
    const items = channel.item || [];
    feed.itunes = {};

    if (channel['itunes:owner']) {
      const ownerNode = channel['itunes:owner'][0];
      const owner = {};
      if (ownerNode['itunes:name']) {
        owner.name = ownerNode['itunes:name'][0];
      }
      if (ownerNode['itunes:email']) {
        owner.email = ownerNode['itunes:email'][0];
      }
      feed.itunes.owner = owner;
    }

    if (channel['itunes:image']) {
      const imageNode = channel['itunes:image'][0];
      const href = imageNode && imageNode.$ && imageNode.$.href;
      if (href) {
        feed.itunes.image = href;
      }
    }

    if (channel['itunes:category']) {
      const categoriesWithSubs = channel['itunes:category'].map((category) => {
        const subcategories = category && category['itunes:category'];
        return {
          name: category && category.$ && category.$.text,
          subs: subcategories
            ? subcategories.map((subcategory) => ({
                name: subcategory && subcategory.$ && subcategory.$.text
              }))
            : null,
        };
      });
      feed.itunes.categories = categoriesWithSubs.map((category) => category.name);
      feed.itunes.categoriesWithSubs = categoriesWithSubs;
    }

    if (channel['itunes:keywords']) {
      if (channel['itunes:keywords'].length > 1) {
        // Several <itunes:keywords> elements: one keyword per `text` attribute.
        feed.itunes.keywords = channel['itunes:keywords'].map(
          keyword => keyword && keyword.$ && keyword.$.text
        );
      } else {
        // A single element: comma-separated list in the text or `text` attribute.
        let keywords = channel['itunes:keywords'][0];
        if (keywords && typeof keywords._ === 'string') {
          keywords = keywords._;
        }
        if (keywords && keywords.$ && keywords.$.text) {
          feed.itunes.keywords = keywords.$.text.split(',');
        } else if (typeof keywords === "string") {
          feed.itunes.keywords = keywords.split(',');
        }
      }
    }

    utils.copyFromXML(channel, feed.itunes, fields.podcastFeed);

    // feed.items was built from the same channel.item array, so indexes line up.
    items.forEach((item, index) => {
      const entry = feed.items[index];
      entry.itunes = {};
      utils.copyFromXML(item, entry.itunes, fields.podcastItem);
      const image = item['itunes:image'];
      if (image && image[0] && image[0].$ && image[0].$.href) {
        entry.itunes.image = image[0].$.href;
      }
    });
  }

  /**
   * Set `item.isoDate` from `item.pubDate` (or `item.date`) when that value
   * parses as a date; otherwise leave the item untouched.
   *
   * @param {Object} item item to update in place
   */
  setISODate(item) {
    const date = item.pubDate || item.date;
    if (date) {
      try {
        item.isoDate = new Date(date.trim()).toISOString();
      } catch (e) {
        // Ignore bad date format
      }
    }
  }

  /**
   * Generates a pagination object where the rel attribute is the key and href attribute is the value
   *  { self: 'self-url', first: 'first-url', ... }
   *
   * @access private
   * @param {Object} channel parsed XML
   * @returns {Object}
   */
  generatePaginationLinks(channel) {
    if (!channel['atom:link']) {
      return {};
    }
    const paginationRelAttributes = ['self', 'first', 'next', 'prev', 'last'];
    return channel['atom:link'].reduce((paginationLinks, link) => {
      if (!link.$ || !paginationRelAttributes.includes(link.$.rel)) {
        return paginationLinks;
      }
      paginationLinks[link.$.rel] = link.$.href;
      return paginationLinks;
    }, {});
  }
}
- module.exports = Parser;
|