import * as plugins from '../plugins.js';

/**
 * Parsed feed structure compatible with rss-parser output
 */
export interface IParsedFeed {
  title?: string;
  description?: string;
  link?: string;
  feedUrl?: string;
  image?: {
    link?: string;
    url?: string;
    title?: string;
  };
  items: IParsedItem[];
  [key: string]: any;
}

/**
 * Parsed item structure compatible with rss-parser output
 */
export interface IParsedItem {
  title?: string;
  link?: string;
  pubDate?: string;
  author?: string;
  content?: string;
  contentSnippet?: string;
  id?: string;
  isoDate?: string;
  [key: string]: any;
}

/**
 * Named HTML entities decoded by {@link getSnippet}, keyed by the text
 * between `&` and `;`.
 */
const HTML_ENTITIES: Record<string, string> = {
  amp: '&',
  lt: '<',
  gt: '>',
  quot: '"',
  '#39': "'",
};

/**
 * Normalises a parsed XML child that may be missing, a single node, or an
 * array of nodes (repeated elements) into an array.
 *
 * Falsy values (missing element, empty string) yield an empty array.
 */
function toArray(value: any): any[] {
  if (!value) return [];
  return Array.isArray(value) ? value : [value];
}

/**
 * Gets text content from XML element, handling both direct text and CDATA.
 *
 * @param element - A parsed XML node: a primitive, or an object carrying its
 *   text under `#text` (the `textNodeName` configured in `parseFeedXML`) or `_`.
 * @returns The text as a string. Falsy input and objects without a text node
 *   (for example attribute-only elements) yield `''` rather than
 *   `"[object Object]"`.
 */
function getContent(element: any): string {
  if (!element) return '';
  if (typeof element === 'string') return element;
  if (typeof element === 'object') {
    const text = element['#text'] ?? element._;
    // The parser may hand back numbers/booleans for text values; always
    // return a string so callers can rely on the declared type.
    return text === undefined || text === null ? '' : String(text);
  }
  return String(element);
}

/**
 * Creates a snippet from HTML content (removes tags, truncates).
 *
 * @param html - HTML (or plain text) to summarise.
 * @param maxLength - Maximum number of characters kept before `'...'` is
 *   appended.
 * @returns Tag-free, entity-decoded text, truncated to `maxLength` characters
 *   (plus `'...'` when truncated) and trimmed.
 */
function getSnippet(html: string, maxLength: number = 200): string {
  if (!html) return '';

  // Remove HTML tags
  let text = html.replace(/<[^>]+>/g, '');

  // Decode common HTML entities in a single pass so that an escaped
  // ampersand followed by "lt;" is decoded once, not twice.
  text = text.replace(
    /&(amp|lt|gt|quot|#39);/g,
    (match: string, name: string) => HTML_ENTITIES[name] ?? match,
  );

  // Truncate
  if (text.length > maxLength) {
    text = text.substring(0, maxLength) + '...';
  }

  return text.trim();
}

/**
 * Formats date to ISO string, handling various date formats.
 *
 * @param dateString - Any date text accepted by the `Date` constructor.
 * @returns The ISO-8601 representation, or `undefined` when the input is
 *   empty or cannot be parsed.
 */
function toISODate(dateString: string): string | undefined {
  if (!dateString) return undefined;

  const date = new Date(dateString.trim());
  // An unparseable string produces an invalid Date, on which toISOString()
  // would throw; report it as "no date" instead.
  return Number.isNaN(date.getTime()) ? undefined : date.toISOString();
}

/**
 * Parses RSS 2.0 feed.
 *
 * @param xmlObj - Parser output whose root is expected to contain `rss.channel`.
 * @returns The feed metadata and items.
 * @throws Error when the `channel` element is missing.
 */
function parseRSS2(xmlObj: any): IParsedFeed {
  const channel = xmlObj.rss?.channel;
  if (!channel) {
    throw new Error('Invalid RSS 2.0 feed: missing channel element');
  }

  const feed: IParsedFeed = {
    items: [],
  };

  // Channel metadata
  if (channel.title) feed.title = getContent(channel.title);
  if (channel.description) feed.description = getContent(channel.description);
  if (channel.link) feed.link = getContent(channel.link);
  if (channel.language) feed.language = getContent(channel.language);
  if (channel.copyright) feed.copyright = getContent(channel.copyright);
  if (channel.generator) feed.generator = getContent(channel.generator);
  if (channel.lastBuildDate) feed.lastBuildDate = getContent(channel.lastBuildDate);

  // Feed URL from atom:link (first link with rel="self")
  for (const link of toArray(channel['atom:link'])) {
    if (link['@_rel'] === 'self' && link['@_href']) {
      feed.feedUrl = link['@_href'];
      break;
    }
  }

  // Image
  if (channel.image) {
    feed.image = {};
    if (channel.image.url) feed.image.url = getContent(channel.image.url);
    if (channel.image.title) feed.image.title = getContent(channel.image.title);
    if (channel.image.link) feed.image.link = getContent(channel.image.link);
  }

  // Items
  feed.items = toArray(channel.item).map((xmlItem: any) => {
    const item: IParsedItem = {};

    if (xmlItem.title) item.title = getContent(xmlItem.title);
    if (xmlItem.link) item.link = getContent(xmlItem.link);

    if (xmlItem.description) {
      item.content = getContent(xmlItem.description);
      item.contentSnippet = getSnippet(item.content);
    }

    if (xmlItem.pubDate) {
      item.pubDate = getContent(xmlItem.pubDate);
      item.isoDate = toISODate(item.pubDate);
    }

    // dc:creator, when present, takes precedence over author
    if (xmlItem.author) item.author = getContent(xmlItem.author);
    if (xmlItem['dc:creator']) item.author = getContent(xmlItem['dc:creator']);

    // ID/GUID; a guid carrying attributes is an object with its value
    // under '#text', which getContent already unwraps.
    if (xmlItem.guid) {
      item.id = getContent(xmlItem.guid);
    }
    if (!item.id && xmlItem.link) {
      item.id = getContent(xmlItem.link);
    }

    // Enclosure
    if (xmlItem.enclosure && xmlItem.enclosure['@_url']) {
      item.enclosure = {
        url: xmlItem.enclosure['@_url'],
        type: xmlItem.enclosure['@_type'],
        length: xmlItem.enclosure['@_length'],
      };
    }

    // Categories
    if (xmlItem.category) {
      item.categories = toArray(xmlItem.category).map((cat: any) => getContent(cat));
    }

    return item;
  });

  return feed;
}

/**
 * Parses Atom 1.0 feed.
 *
 * @param xmlObj - Parser output whose root is expected to contain `feed`.
 * @returns The feed metadata and entries mapped to items.
 * @throws Error when the `feed` element is missing.
 */
function parseAtom(xmlObj: any): IParsedFeed {
  const atomFeed = xmlObj.feed;
  if (!atomFeed) {
    throw new Error('Invalid Atom feed: missing feed element');
  }

  const feed: IParsedFeed = {
    items: [],
  };

  // Feed metadata
  if (atomFeed.title) feed.title = getContent(atomFeed.title);
  if (atomFeed.subtitle) feed.description = getContent(atomFeed.subtitle);
  // The feed id is only a fallback; a rel="self" link below overrides it.
  if (atomFeed.id) feed.feedUrl = getContent(atomFeed.id);

  // Links
  for (const link of toArray(atomFeed.link)) {
    const href = link['@_href'];
    if (!href) continue;

    const rel = link['@_rel'];
    if (rel === 'alternate') {
      feed.link = href;
    } else if (rel === undefined && !feed.link) {
      // RFC 4287: a link without rel is interpreted as rel="alternate".
      // Only used when no link has been chosen yet, so an explicit
      // alternate link always wins.
      feed.link = href;
    } else if (rel === 'self') {
      feed.feedUrl = href;
    }
  }

  // Entries
  feed.items = toArray(atomFeed.entry).map((entry: any) => {
    const item: IParsedItem = {};

    if (entry.title) item.title = getContent(entry.title);
    if (entry.id) item.id = getContent(entry.id);

    // Link: prefer rel="alternate", otherwise keep the first link with an href
    for (const link of toArray(entry.link)) {
      if (link['@_rel'] === 'alternate' && link['@_href']) {
        item.link = link['@_href'];
        break;
      }
      if (!item.link && link['@_href']) {
        item.link = link['@_href'];
      }
    }

    // Dates: published wins over updated
    const dateNode = entry.published || entry.updated;
    if (dateNode) {
      item.pubDate = getContent(dateNode);
      item.isoDate = toISODate(item.pubDate);
    }

    // Author; repeated <author> elements arrive as an array, in which case
    // the first one is used.
    const author = toArray(entry.author)[0];
    if (author && author.name) {
      item.author = getContent(author.name);
    }

    // Content: full content wins over summary
    const contentNode = entry.content || entry.summary;
    if (contentNode) {
      item.content = getContent(contentNode);
      item.contentSnippet = getSnippet(item.content);
    }

    return item;
  });

  return feed;
}

/**
 * Parses RSS 1.0 (RDF) feed.
 *
 * @param xmlObj - Parser output whose root is expected to contain `rdf:RDF`.
 * @returns The feed metadata and items.
 * @throws Error when the `rdf:RDF` element is missing.
 */
function parseRSS1(xmlObj: any): IParsedFeed {
  const rdf = xmlObj['rdf:RDF'];
  if (!rdf) {
    throw new Error('Invalid RSS 1.0 feed: missing rdf:RDF element');
  }

  const feed: IParsedFeed = {
    items: [],
  };

  const channel = rdf.channel;
  if (channel) {
    if (channel.title) feed.title = getContent(channel.title);
    if (channel.description) feed.description = getContent(channel.description);
    if (channel.link) feed.link = getContent(channel.link);
  }

  // Items are read from rdf:RDF itself, not from the channel
  feed.items = toArray(rdf.item).map((xmlItem: any) => {
    const item: IParsedItem = {};

    if (xmlItem.title) item.title = getContent(xmlItem.title);
    if (xmlItem.link) item.link = getContent(xmlItem.link);

    if (xmlItem.description) {
      item.content = getContent(xmlItem.description);
      item.contentSnippet = getSnippet(item.content);
    }

    if (xmlItem['dc:date']) {
      item.pubDate = getContent(xmlItem['dc:date']);
      item.isoDate = toISODate(item.pubDate);
    }

    if (xmlItem['dc:creator']) {
      item.author = getContent(xmlItem['dc:creator']);
    }

    if (xmlItem['@_rdf:about']) {
      item.id = xmlItem['@_rdf:about'];
    }

    return item;
  });

  return feed;
}

/**
 * Detects feed type and parses accordingly.
 *
 * @param xmlString - Raw XML text of the feed.
 * @returns The parsed feed in an rss-parser compatible shape.
 * @throws Error when the document root is not `rss` (with a `channel`),
 *   `feed`, or `rdf:RDF`. Errors raised by the XML parser itself are not
 *   caught here.
 */
export function parseFeedXML(xmlString: string): IParsedFeed {
  // NOTE(review): the option names match fast-xml-parser's XMLParser;
  // confirm against what plugins.js actually re-exports.
  const parser = new plugins.XMLParser({
    ignoreAttributes: false,
    attributeNamePrefix: '@_',
    textNodeName: '#text',
    parseAttributeValue: false,
  });

  const xmlObj = parser.parse(xmlString);

  // Detect feed type
  if (xmlObj.rss && xmlObj.rss.channel) {
    // RSS 2.0 or 0.9x
    return parseRSS2(xmlObj);
  } else if (xmlObj.feed) {
    // Atom 1.0
    return parseAtom(xmlObj);
  } else if (xmlObj['rdf:RDF']) {
    // RSS 1.0 (RDF)
    return parseRSS1(xmlObj);
  } else {
    throw new Error('Feed not recognized as RSS or Atom');
  }
}