// Files: smartfeed/ts/lib/feedparser.ts (327 lines, 8.7 KiB, TypeScript)

import * as plugins from '../plugins.js';
/**
 * Feed-level result of parsing, shaped to be compatible with rss-parser output.
 *
 * `items` is always present; every other named field is optional and is only
 * populated when the parser finds a value for it. The index signature allows
 * additional, format-specific properties to be attached.
 */
export interface IParsedFeed {
  /** Parsed entries of the feed. */
  items: IParsedItem[];
  /** Feed title. */
  title?: string;
  /** Feed description (or subtitle). */
  description?: string;
  /** Link to the site the feed belongs to. */
  link?: string;
  /** URL of the feed document itself. */
  feedUrl?: string;
  /** Feed image/logo metadata. */
  image?: {
    url?: string;
    title?: string;
    link?: string;
  };
  /** Any additional properties a parser chooses to attach. */
  [key: string]: any;
}
/**
 * A single feed entry, shaped to be compatible with rss-parser output.
 *
 * All named fields are optional; the index signature allows additional,
 * format-specific properties to be attached.
 */
export interface IParsedItem {
  /** Identifier of the entry. */
  id?: string;
  /** Entry title. */
  title?: string;
  /** Link to the entry. */
  link?: string;
  /** Author of the entry. */
  author?: string;
  /** Publication date as written in the feed. */
  pubDate?: string;
  /** ISO 8601 rendering of the publication date. */
  isoDate?: string;
  /** Entry body as found in the feed. */
  content?: string;
  /** Shortened, tag-free rendering of `content`. */
  contentSnippet?: string;
  /** Any additional properties a parser chooses to attach. */
  [key: string]: any;
}
/**
 * Extracts the text content of a parsed XML node and always returns a string.
 *
 * Supported inputs:
 * - `null` / `undefined`: returns `''`.
 * - a string: returned unchanged.
 * - a node object: the first non-empty value found under `#text`, then `_`,
 *   is converted to text; a node with neither (e.g. attributes only)
 *   yields `''` rather than `"[object Object]"`.
 * - an array (repeated element): each entry is converted and the results are
 *   joined with `,`, which is what plain `String()` coercion produces for
 *   arrays of primitive values.
 * - any other primitive (number, boolean, ...): converted with `String()`, so
 *   numeric text such as `2024` comes back as `'2024'`, never as a number.
 *
 * @param element - Value taken from the parsed XML object tree.
 * @returns The node's text, or `''` when it has none.
 */
function getContent(element: unknown): string {
  if (element === null || element === undefined) return '';
  if (typeof element === 'string') return element;

  if (Array.isArray(element)) {
    return element.map((entry) => getContent(entry)).join(',');
  }

  if (typeof element === 'object') {
    const node = element as Record<string, unknown>;
    for (const key of ['#text', '_']) {
      const text = node[key];
      if (text !== undefined && text !== null && text !== '') {
        // Recurse so that non-string text values are stringified too.
        return getContent(text);
      }
    }
    // No text node at all: there is no content to report.
    return '';
  }

  return String(element);
}
/** Replacement characters for the HTML entities that getSnippet decodes. */
const SNIPPET_ENTITIES: Record<string, string> = {
  '&amp;': '&',
  '&lt;': '<',
  '&gt;': '>',
  '&quot;': '"',
  '&#39;': "'",
};

/**
 * Builds a plain-text snippet from HTML content.
 *
 * Steps: strip tags, decode the common entities (`&amp;`, `&lt;`, `&gt;`,
 * `&quot;`, `&#39;`), trim surrounding whitespace, then truncate.
 *
 * Entities are decoded in a single pass so that already-escaped text is not
 * decoded twice (`&amp;lt;` becomes `&lt;`, not `<`). Whitespace is trimmed
 * before truncation so leading whitespace does not use up the length budget.
 *
 * @param html - HTML (or plain text) to summarise.
 * @param maxLength - Maximum number of characters kept before `...` is
 *   appended. Defaults to 200.
 * @returns The snippet; `''` for empty input.
 */
function getSnippet(html: string, maxLength: number = 200): string {
  if (!html) return '';

  // Defensive coercion: values reaching this function originate from an
  // untyped XML object tree and may not actually be strings.
  const text = String(html)
    .replace(/<[^>]+>/g, '')
    .replace(/&(?:amp|lt|gt|quot|#39);/g, (entity) => SNIPPET_ENTITIES[entity] ?? entity)
    .trim();

  if (text.length <= maxLength) return text;
  return text.substring(0, maxLength) + '...';
}
/**
 * Best-effort conversion of a date string to ISO 8601.
 *
 * @param dateString - Date text in any format the `Date` constructor accepts;
 *   surrounding whitespace is ignored.
 * @returns The ISO 8601 timestamp, or `undefined` when the input is empty or
 *   cannot be converted.
 */
function toISODate(dateString: string): string | undefined {
  if (!dateString) {
    return undefined;
  }

  let iso: string | undefined;
  try {
    // toISOString() throws for an invalid date; that case maps to undefined.
    iso = new Date(dateString.trim()).toISOString();
  } catch {
    iso = undefined;
  }
  return iso;
}
/**
 * Parses an RSS 2.0 document that has already been converted to an object tree.
 *
 * Reads channel metadata (title, description, link, language, copyright,
 * generator, lastBuildDate), the `atom:link rel="self"` feed URL, the channel
 * image and all items. Per item: title, link, description (as `content` plus
 * a plain-text `contentSnippet`), pubDate (plus `isoDate`), author
 * (`dc:creator` takes precedence over `author`), id (guid, falling back to
 * the link), enclosure and categories.
 *
 * @param xmlObj - Parsed XML object; attributes are expected under `@_`-prefixed
 *   keys and text under `#text`.
 * @returns The parsed feed.
 * @throws Error when `rss.channel` is missing.
 */
function parseRSS2(xmlObj: any): IParsedFeed {
  const channel = xmlObj.rss?.channel;
  if (!channel) {
    throw new Error('Invalid RSS 2.0 feed: missing channel element');
  }

  // Normalises "absent | single node | repeated nodes" to an array.
  const asArray = (value: any): any[] => {
    if (!value) return [];
    return Array.isArray(value) ? value : [value];
  };

  const feed: IParsedFeed = {
    items: [],
  };

  // Channel metadata
  if (channel.title) feed.title = getContent(channel.title);
  if (channel.description) feed.description = getContent(channel.description);
  if (channel.link) feed.link = getContent(channel.link);
  if (channel.language) feed.language = getContent(channel.language);
  if (channel.copyright) feed.copyright = getContent(channel.copyright);
  if (channel.generator) feed.generator = getContent(channel.generator);
  if (channel.lastBuildDate) feed.lastBuildDate = getContent(channel.lastBuildDate);

  // Feed URL from the first atom:link with rel="self"
  for (const link of asArray(channel['atom:link'])) {
    if (link['@_rel'] === 'self' && link['@_href']) {
      feed.feedUrl = link['@_href'];
      break;
    }
  }

  // Image
  if (channel.image) {
    feed.image = {};
    if (channel.image.url) feed.image.url = getContent(channel.image.url);
    if (channel.image.title) feed.image.title = getContent(channel.image.title);
    if (channel.image.link) feed.image.link = getContent(channel.image.link);
  }

  // Items
  feed.items = asArray(channel.item).map((xmlItem: any) => {
    const item: IParsedItem = {};

    if (xmlItem.title) item.title = getContent(xmlItem.title);
    if (xmlItem.link) item.link = getContent(xmlItem.link);

    if (xmlItem.description) {
      item.content = getContent(xmlItem.description);
      item.contentSnippet = getSnippet(item.content);
    }

    if (xmlItem.pubDate) {
      item.pubDate = getContent(xmlItem.pubDate);
      item.isoDate = toISODate(item.pubDate);
    }

    // dc:creator, when present, overrides the plain author element.
    if (xmlItem.author) item.author = getContent(xmlItem.author);
    if (xmlItem['dc:creator']) item.author = getContent(xmlItem['dc:creator']);

    // ID/GUID. Coerced to a string because the XML parser may deliver
    // numeric-looking guid text (e.g. "12345") as a number, while
    // IParsedItem.id is declared as a string.
    if (xmlItem.guid) {
      const guidText = String(getContent(xmlItem.guid));
      if (guidText !== '') item.id = guidText;
    }
    if (!item.id && xmlItem.link) {
      item.id = getContent(xmlItem.link);
    }

    // Enclosure: when the element is repeated, use the first one with a url
    // instead of dropping the enclosure altogether.
    const enclosure = asArray(xmlItem.enclosure).find(
      (candidate: any) => candidate && candidate['@_url'],
    );
    if (enclosure) {
      item.enclosure = {
        url: enclosure['@_url'],
        type: enclosure['@_type'],
        length: enclosure['@_length'],
      };
    }

    // Categories
    if (xmlItem.category) {
      item.categories = Array.isArray(xmlItem.category)
        ? xmlItem.category.map((cat: any) => getContent(cat))
        : [getContent(xmlItem.category)];
    }

    return item;
  });

  return feed;
}
/**
 * Parses an Atom 1.0 document that has already been converted to an object tree.
 *
 * Feed level: title, subtitle (as `description`), the site link
 * (`rel="alternate"`) and the feed URL (`rel="self"`, falling back to the
 * feed id). Entry level: title, id, link, published/updated date (plus
 * `isoDate`), author name and content/summary (plus `contentSnippet`).
 *
 * Link handling follows RFC 4287: a link element without a `rel` attribute is
 * interpreted as `rel="alternate"`. An explicit `rel="alternate"` link is
 * still preferred when both kinds are present.
 *
 * @param xmlObj - Parsed XML object; attributes are expected under `@_`-prefixed
 *   keys and text under `#text`.
 * @returns The parsed feed.
 * @throws Error when the `feed` root element is missing.
 */
function parseAtom(xmlObj: any): IParsedFeed {
  const atomFeed = xmlObj.feed;
  if (!atomFeed) {
    throw new Error('Invalid Atom feed: missing feed element');
  }

  const feed: IParsedFeed = {
    items: [],
  };

  // Feed metadata
  if (atomFeed.title) feed.title = getContent(atomFeed.title);
  if (atomFeed.subtitle) feed.description = getContent(atomFeed.subtitle);
  // The id is only a fallback; a rel="self" link below overrides it.
  if (atomFeed.id) feed.feedUrl = getContent(atomFeed.id);

  // Links
  if (atomFeed.link) {
    const links: any[] = Array.isArray(atomFeed.link) ? atomFeed.link : [atomFeed.link];
    let implicitAlternate: string | undefined;
    for (const link of links) {
      const href = link['@_href'];
      if (!href) continue;
      const rel = link['@_rel'];
      if (rel === 'alternate') {
        feed.link = href;
      } else if (rel === 'self') {
        feed.feedUrl = href;
      } else if (!rel && implicitAlternate === undefined) {
        // No rel attribute means "alternate"; remember the first such link.
        implicitAlternate = href;
      }
    }
    if (!feed.link && implicitAlternate !== undefined) {
      feed.link = implicitAlternate;
    }
  }

  // Entries
  const entries: any[] = atomFeed.entry
    ? Array.isArray(atomFeed.entry)
      ? atomFeed.entry
      : [atomFeed.entry]
    : [];

  feed.items = entries.map((entry: any) => {
    const item: IParsedItem = {};

    if (entry.title) item.title = getContent(entry.title);
    if (entry.id) item.id = getContent(entry.id);

    // Link: explicit alternate, then a rel-less (implicit alternate) link,
    // then any link that carries an href.
    if (entry.link) {
      const links: any[] = Array.isArray(entry.link) ? entry.link : [entry.link];
      const hasHref = (link: any): boolean => Boolean(link['@_href']);
      const chosen =
        links.find((link) => link['@_rel'] === 'alternate' && hasHref(link)) ??
        links.find((link) => !link['@_rel'] && hasHref(link)) ??
        links.find(hasHref);
      if (chosen) item.link = chosen['@_href'];
    }

    // Dates: published wins over updated.
    if (entry.published) {
      item.pubDate = getContent(entry.published);
      item.isoDate = toISODate(item.pubDate);
    } else if (entry.updated) {
      item.pubDate = getContent(entry.updated);
      item.isoDate = toISODate(item.pubDate);
    }

    // Author: with several author elements, use the first one that has a name.
    if (entry.author) {
      const authors: any[] = Array.isArray(entry.author) ? entry.author : [entry.author];
      const named = authors.find((author) => author && author.name);
      if (named) item.author = getContent(named.name);
    }

    // Content: full content wins over the summary.
    if (entry.content) {
      item.content = getContent(entry.content);
      item.contentSnippet = getSnippet(item.content);
    } else if (entry.summary) {
      item.content = getContent(entry.summary);
      item.contentSnippet = getSnippet(item.content);
    }

    return item;
  });

  return feed;
}
/**
 * Parses an RSS 1.0 (RDF) document that has already been converted to an
 * object tree.
 *
 * Channel title, description and link are copied when present. Items are read
 * from directly under `rdf:RDF`; each contributes title, link, description
 * (as `content` plus `contentSnippet`), `dc:date` (as `pubDate` plus
 * `isoDate`), `dc:creator` (as `author`) and the `rdf:about` attribute
 * (as `id`).
 *
 * @param xmlObj - Parsed XML object; attributes are expected under
 *   `@_`-prefixed keys.
 * @returns The parsed feed.
 * @throws Error when the `rdf:RDF` root element is missing.
 */
function parseRSS1(xmlObj: any): IParsedFeed {
  const root = xmlObj['rdf:RDF'];
  if (!root) {
    throw new Error('Invalid RSS 1.0 feed: missing rdf:RDF element');
  }

  const result: IParsedFeed = { items: [] };

  // Channel metadata
  const meta = root.channel;
  if (meta) {
    if (meta.title) result.title = getContent(meta.title);
    if (meta.description) result.description = getContent(meta.description);
    if (meta.link) result.link = getContent(meta.link);
  }

  // A single item is delivered as an object, several as an array.
  let rawItems: any[] = [];
  if (root.item) {
    rawItems = Array.isArray(root.item) ? root.item : [root.item];
  }

  for (const raw of rawItems) {
    const parsed: IParsedItem = {};

    if (raw.title) parsed.title = getContent(raw.title);
    if (raw.link) parsed.link = getContent(raw.link);

    if (raw.description) {
      const body = getContent(raw.description);
      parsed.content = body;
      parsed.contentSnippet = getSnippet(body);
    }

    const rawDate = raw['dc:date'];
    if (rawDate) {
      const published = getContent(rawDate);
      parsed.pubDate = published;
      parsed.isoDate = toISODate(published);
    }

    const creator = raw['dc:creator'];
    if (creator) parsed.author = getContent(creator);

    const about = raw['@_rdf:about'];
    if (about) parsed.id = about;

    result.items.push(parsed);
  }

  return result;
}
/**
 * Parses a feed document and dispatches to the parser for its format.
 *
 * The XML is converted to an object tree with attributes kept under
 * `@_`-prefixed keys, element text under `#text`, and attribute values left
 * unparsed. The format is then detected from the root element:
 * `rss` with a `channel` (RSS 2.0 / 0.9x), `feed` (Atom 1.0) or `rdf:RDF`
 * (RSS 1.0).
 *
 * @param xmlString - Raw XML of the feed.
 * @returns The parsed feed.
 * @throws Error when the document matches none of the supported formats.
 */
export function parseFeedXML(xmlString: string): IParsedFeed {
  const xmlParser = new plugins.XMLParser({
    ignoreAttributes: false,
    attributeNamePrefix: '@_',
    textNodeName: '#text',
    parseAttributeValue: false,
  });
  const parsed = xmlParser.parse(xmlString);

  // RSS 2.0 or 0.9x
  if (parsed.rss?.channel) {
    return parseRSS2(parsed);
  }

  // Atom 1.0
  if (parsed.feed) {
    return parseAtom(parsed);
  }

  // RSS 1.0 (RDF)
  if (parsed['rdf:RDF']) {
    return parseRSS1(parsed);
  }

  throw new Error('Feed not recognized as RSS or Atom');
}