parser.js 12 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349
  1. "use strict";
  2. const http = require('http');
  3. const https = require('https');
  4. const xml2js = require('xml2js');
  5. const url = require('url');
  6. const fields = require('./fields');
  7. const utils = require('./utils');
  8. const DEFAULT_HEADERS = {
  9. 'User-Agent': 'rss-parser',
  10. 'Accept': 'application/rss+xml',
  11. }
  12. const DEFAULT_MAX_REDIRECTS = 5;
  13. const DEFAULT_TIMEOUT = 60000;
  14. class Parser {
  15. constructor(options={}) {
  16. options.headers = options.headers || {};
  17. options.xml2js = options.xml2js || {};
  18. options.customFields = options.customFields || {};
  19. options.customFields.item = options.customFields.item || [];
  20. options.customFields.feed = options.customFields.feed || [];
  21. options.requestOptions = options.requestOptions || {};
  22. if (!options.maxRedirects) options.maxRedirects = DEFAULT_MAX_REDIRECTS;
  23. if (!options.timeout) options.timeout = DEFAULT_TIMEOUT;
  24. this.options = options;
  25. this.xmlParser = new xml2js.Parser(this.options.xml2js);
  26. }
  27. parseString(xml, callback) {
  28. let prom = new Promise((resolve, reject) => {
  29. this.xmlParser.parseString(xml, (err, result) => {
  30. if (err) return reject(err);
  31. if (!result) {
  32. return reject(new Error('Unable to parse XML.'));
  33. }
  34. let feed = null;
  35. if (result.feed) {
  36. feed = this.buildAtomFeed(result);
  37. } else if (result.rss && result.rss.$ && result.rss.$.version && result.rss.$.version.match(/^2/)) {
  38. feed = this.buildRSS2(result);
  39. } else if (result['rdf:RDF']) {
  40. feed = this.buildRSS1(result);
  41. } else if (result.rss && result.rss.$ && result.rss.$.version && result.rss.$.version.match(/0\.9/)) {
  42. feed = this.buildRSS0_9(result);
  43. } else if (result.rss && this.options.defaultRSS) {
  44. switch(this.options.defaultRSS) {
  45. case 0.9:
  46. feed = this.buildRSS0_9(result);
  47. break;
  48. case 1:
  49. feed = this.buildRSS1(result);
  50. break;
  51. case 2:
  52. feed = this.buildRSS2(result);
  53. break;
  54. default:
  55. return reject(new Error("default RSS version not recognized."))
  56. }
  57. } else {
  58. return reject(new Error("Feed not recognized as RSS 1 or 2."))
  59. }
  60. resolve(feed);
  61. });
  62. });
  63. prom = utils.maybePromisify(callback, prom);
  64. return prom;
  65. }
  66. parseURL(feedUrl, callback, redirectCount=0) {
  67. let xml = '';
  68. let get = feedUrl.indexOf('https') === 0 ? https.get : http.get;
  69. let urlParts = url.parse(feedUrl);
  70. let headers = Object.assign({}, DEFAULT_HEADERS, this.options.headers);
  71. let timeout = null;
  72. let prom = new Promise((resolve, reject) => {
  73. const requestOpts = Object.assign({headers}, urlParts, this.options.requestOptions);
  74. let req = get(requestOpts, (res) => {
  75. if (this.options.maxRedirects && res.statusCode >= 300 && res.statusCode < 400 && res.headers['location']) {
  76. if (redirectCount === this.options.maxRedirects) {
  77. return reject(new Error("Too many redirects"));
  78. } else {
  79. const newLocation = url.resolve(feedUrl, res.headers['location']);
  80. return this.parseURL(newLocation, null, redirectCount + 1).then(resolve, reject);
  81. }
  82. } else if (res.statusCode >= 300) {
  83. return reject(new Error("Status code " + res.statusCode))
  84. }
  85. let encoding = utils.getEncodingFromContentType(res.headers['content-type']);
  86. res.setEncoding(encoding);
  87. res.on('data', (chunk) => {
  88. xml += chunk;
  89. });
  90. res.on('end', () => {
  91. return this.parseString(xml).then(resolve, reject);
  92. });
  93. })
  94. req.on('error', reject);
  95. timeout = setTimeout(() => {
  96. return reject(new Error("Request timed out after " + this.options.timeout + "ms"));
  97. }, this.options.timeout);
  98. }).then(data => {
  99. clearTimeout(timeout);
  100. return Promise.resolve(data);
  101. }, e => {
  102. clearTimeout(timeout);
  103. return Promise.reject(e);
  104. });
  105. prom = utils.maybePromisify(callback, prom);
  106. return prom;
  107. }
  108. buildAtomFeed(xmlObj) {
  109. let feed = {items: []};
  110. utils.copyFromXML(xmlObj.feed, feed, this.options.customFields.feed);
  111. if (xmlObj.feed.link) {
  112. feed.link = utils.getLink(xmlObj.feed.link, 'alternate', 0);
  113. feed.feedUrl = utils.getLink(xmlObj.feed.link, 'self', 1);
  114. }
  115. if (xmlObj.feed.title) {
  116. let title = xmlObj.feed.title[0] || '';
  117. if (title._) title = title._
  118. if (title) feed.title = title;
  119. }
  120. if (xmlObj.feed.updated) {
  121. feed.lastBuildDate = xmlObj.feed.updated[0];
  122. }
  123. feed.items = (xmlObj.feed.entry || []).map(entry => this.parseItemAtom(entry));
  124. return feed;
  125. }
  126. parseItemAtom(entry) {
  127. let item = {};
  128. utils.copyFromXML(entry, item, this.options.customFields.item);
  129. if (entry.title) {
  130. let title = entry.title[0] || '';
  131. if (title._) title = title._;
  132. if (title) item.title = title;
  133. }
  134. if (entry.link && entry.link.length) {
  135. item.link = utils.getLink(entry.link, 'alternate', 0);
  136. }
  137. if (entry.published && entry.published.length && entry.published[0].length) item.pubDate = new Date(entry.published[0]).toISOString();
  138. if (!item.pubDate && entry.updated && entry.updated.length && entry.updated[0].length) item.pubDate = new Date(entry.updated[0]).toISOString();
  139. if (entry.author && entry.author.length && entry.author[0].name && entry.author[0].name.length) item.author = entry.author[0].name[0];
  140. if (entry.content && entry.content.length) {
  141. item.content = utils.getContent(entry.content[0]);
  142. item.contentSnippet = utils.getSnippet(item.content)
  143. }
  144. if (entry.summary && entry.summary.length) {
  145. item.summary = utils.getContent(entry.summary[0]);
  146. }
  147. if (entry.id) {
  148. item.id = entry.id[0];
  149. }
  150. this.setISODate(item);
  151. return item;
  152. }
  153. buildRSS0_9(xmlObj) {
  154. var channel = xmlObj.rss.channel[0];
  155. var items = channel.item;
  156. return this.buildRSS(channel, items);
  157. }
  158. buildRSS1(xmlObj) {
  159. xmlObj = xmlObj['rdf:RDF'];
  160. let channel = xmlObj.channel[0];
  161. let items = xmlObj.item;
  162. return this.buildRSS(channel, items);
  163. }
  164. buildRSS2(xmlObj) {
  165. let channel = xmlObj.rss.channel[0];
  166. let items = channel.item;
  167. let feed = this.buildRSS(channel, items);
  168. if (xmlObj.rss.$ && xmlObj.rss.$['xmlns:itunes']) {
  169. this.decorateItunes(feed, channel);
  170. }
  171. return feed;
  172. }
  173. buildRSS(channel, items) {
  174. items = items || [];
  175. let feed = {items: []};
  176. let feedFields = fields.feed.concat(this.options.customFields.feed);
  177. let itemFields = fields.item.concat(this.options.customFields.item);
  178. if (channel['atom:link'] && channel['atom:link'][0] && channel['atom:link'][0].$) {
  179. feed.feedUrl = channel['atom:link'][0].$.href;
  180. }
  181. if (channel.image && channel.image[0] && channel.image[0].url) {
  182. feed.image = {};
  183. let image = channel.image[0];
  184. if (image.link) feed.image.link = image.link[0];
  185. if (image.url) feed.image.url = image.url[0];
  186. if (image.title) feed.image.title = image.title[0];
  187. if (image.width) feed.image.width = image.width[0];
  188. if (image.height) feed.image.height = image.height[0];
  189. }
  190. const paginationLinks = this.generatePaginationLinks(channel);
  191. if (Object.keys(paginationLinks).length) {
  192. feed.paginationLinks = paginationLinks;
  193. }
  194. utils.copyFromXML(channel, feed, feedFields);
  195. feed.items = items.map(xmlItem => this.parseItemRss(xmlItem, itemFields));
  196. return feed;
  197. }
  198. parseItemRss(xmlItem, itemFields) {
  199. let item = {};
  200. utils.copyFromXML(xmlItem, item, itemFields);
  201. if (xmlItem.enclosure) {
  202. item.enclosure = xmlItem.enclosure[0].$;
  203. }
  204. if (xmlItem.description) {
  205. item.content = utils.getContent(xmlItem.description[0]);
  206. item.contentSnippet = utils.getSnippet(item.content);
  207. }
  208. if (xmlItem.guid) {
  209. item.guid = xmlItem.guid[0];
  210. if (item.guid._) item.guid = item.guid._;
  211. }
  212. if (xmlItem.$ && xmlItem.$['rdf:about']) {
  213. item['rdf:about'] = xmlItem.$['rdf:about']
  214. }
  215. if (xmlItem.category) item.categories = xmlItem.category;
  216. this.setISODate(item);
  217. return item;
  218. }
  219. /**
  220. * Add iTunes specific fields from XML to extracted JSON
  221. *
  222. * @access public
  223. * @param {object} feed extracted
  224. * @param {object} channel parsed XML
  225. */
  226. decorateItunes(feed, channel) {
  227. let items = channel.item || [];
  228. let categories = [];
  229. feed.itunes = {}
  230. if (channel['itunes:owner']) {
  231. let owner = {};
  232. if(channel['itunes:owner'][0]['itunes:name']) {
  233. owner.name = channel['itunes:owner'][0]['itunes:name'][0];
  234. }
  235. if(channel['itunes:owner'][0]['itunes:email']) {
  236. owner.email = channel['itunes:owner'][0]['itunes:email'][0];
  237. }
  238. feed.itunes.owner = owner;
  239. }
  240. if (channel['itunes:image']) {
  241. let image;
  242. let hasImageHref = (channel['itunes:image'][0] &&
  243. channel['itunes:image'][0].$ &&
  244. channel['itunes:image'][0].$.href);
  245. image = hasImageHref ? channel['itunes:image'][0].$.href : null;
  246. if (image) {
  247. feed.itunes.image = image;
  248. }
  249. }
  250. if (channel['itunes:category']) {
  251. const categoriesWithSubs = channel['itunes:category'].map((category) => {
  252. return {
  253. name: category && category.$ && category.$.text,
  254. subs: category['itunes:category'] ?
  255. category['itunes:category']
  256. .map((subcategory) => ({
  257. name: subcategory && subcategory.$ && subcategory.$.text
  258. })) : null,
  259. };
  260. });
  261. feed.itunes.categories = categoriesWithSubs.map((category) => category.name);
  262. feed.itunes.categoriesWithSubs = categoriesWithSubs;
  263. }
  264. if (channel['itunes:keywords']) {
  265. if (channel['itunes:keywords'].length > 1) {
  266. feed.itunes.keywords = channel['itunes:keywords'].map(
  267. keyword => keyword && keyword.$ && keyword.$.text
  268. );
  269. } else {
  270. let keywords = channel['itunes:keywords'][0];
  271. if (keywords && typeof keywords._ === 'string') {
  272. keywords = keywords._;
  273. }
  274. if (keywords && keywords.$ && keywords.$.text) {
  275. feed.itunes.keywords = keywords.$.text.split(',')
  276. } else if (typeof keywords === "string") {
  277. feed.itunes.keywords = keywords.split(',');
  278. }
  279. }
  280. }
  281. utils.copyFromXML(channel, feed.itunes, fields.podcastFeed);
  282. items.forEach((item, index) => {
  283. let entry = feed.items[index];
  284. entry.itunes = {};
  285. utils.copyFromXML(item, entry.itunes, fields.podcastItem);
  286. let image = item['itunes:image'];
  287. if (image && image[0] && image[0].$ && image[0].$.href) {
  288. entry.itunes.image = image[0].$.href;
  289. }
  290. });
  291. }
  292. setISODate(item) {
  293. let date = item.pubDate || item.date;
  294. if (date) {
  295. try {
  296. item.isoDate = new Date(date.trim()).toISOString();
  297. } catch (e) {
  298. // Ignore bad date format
  299. }
  300. }
  301. }
  302. /**
  303. * Generates a pagination object where the rel attribute is the key and href attribute is the value
  304. * { self: 'self-url', first: 'first-url', ... }
  305. *
  306. * @access private
  307. * @param {Object} channel parsed XML
  308. * @returns {Object}
  309. */
  310. generatePaginationLinks(channel) {
  311. if (!channel['atom:link']) {
  312. return {};
  313. }
  314. const paginationRelAttributes = ['self', 'first', 'next', 'prev', 'last'];
  315. return channel['atom:link'].reduce((paginationLinks, link) => {
  316. if (!link.$ || !paginationRelAttributes.includes(link.$.rel)) {
  317. return paginationLinks;
  318. }
  319. paginationLinks[link.$.rel] = link.$.href;
  320. return paginationLinks;
  321. }, {});
  322. }
  323. }
// The module's sole export is the Parser class itself.
module.exports = Parser;