327 lines
8.7 KiB
TypeScript
327 lines
8.7 KiB
TypeScript
import * as plugins from '../plugins.js';
|
|
|
|
/**
 * Parsed feed structure compatible with rss-parser output.
 *
 * Only `items` is guaranteed to be present; every other field is filled in
 * when the source document provides it.
 */
export interface IParsedFeed {
  /** Feed title. */
  title?: string;
  /** Feed description (Atom feeds map their subtitle here). */
  description?: string;
  /** Link to the site the feed belongs to. */
  link?: string;
  /** URL of the feed document itself (the `rel="self"` link when present). */
  feedUrl?: string;
  /** Feed image / logo metadata. */
  image?: {
    link?: string;
    url?: string;
    title?: string;
  };
  /** Parsed entries, in document order. */
  items: IParsedItem[];
  /** Additional format-specific fields (e.g. language, copyright, generator, lastBuildDate). */
  [key: string]: any;
}
|
|
|
|
/**
 * Parsed item structure compatible with rss-parser output.
 *
 * All fields are optional; a field is only set when the source entry
 * contains the corresponding element.
 */
export interface IParsedItem {
  /** Entry title. */
  title?: string;
  /** Link to the entry. */
  link?: string;
  /** Publication date exactly as it appeared in the feed. */
  pubDate?: string;
  /** Author name. */
  author?: string;
  /** Entry body (may contain HTML). */
  content?: string;
  /** Tag-stripped, truncated version of `content`. */
  contentSnippet?: string;
  /** Entry identifier (guid / Atom id / rdf:about). */
  id?: string;
  /** `pubDate` normalised to an ISO 8601 string, when it could be parsed. */
  isoDate?: string;
  /** Additional format-specific fields (e.g. enclosure, categories). */
  [key: string]: any;
}
|
|
|
|
/**
 * Gets text content from a parsed XML node, handling both direct text and
 * element objects that carry their text under `#text` (or `_`).
 *
 * @param element - Parsed XML value: a primitive, an element object, or an
 *   array of those (when the element was repeated).
 * @returns The text as a string; `''` when there is no text at all.
 */
function getContent(element: any): string {
  if (element === null || element === undefined) return '';
  if (typeof element === 'string') return element;

  // Repeated elements arrive as arrays; resolve each entry individually so
  // element objects inside the array do not stringify to "[object Object]".
  if (Array.isArray(element)) {
    return element.map((entry: any) => getContent(entry)).join(',');
  }

  if (typeof element === 'object') {
    // Nullish check (not truthiness) so a text node of 0 or '' is honoured.
    const text = element['#text'] ?? element._;
    // An element with attributes/children but no text node has no content.
    return text === null || text === undefined ? '' : String(text);
  }

  // Numbers / booleans produced by the parser's value coercion.
  return String(element);
}
|
|
|
|
/**
 * Creates a plain-text snippet from HTML content: strips tags, decodes the
 * common HTML entities and truncates.
 *
 * @param html - HTML (or plain text) to summarise.
 * @param maxLength - Maximum number of characters kept before `...` is appended.
 * @returns The trimmed snippet, or `''` for empty input.
 */
function getSnippet(html: string, maxLength: number = 200): string {
  if (!html) return '';

  const entityMap: Record<string, string> = {
    amp: '&',
    lt: '<',
    gt: '>',
    quot: '"',
    apos: "'",
    '#39': "'",
  };

  // Remove HTML tags
  let text = html.replace(/<[^>]+>/g, '');

  // Decode common HTML entities in a single pass so that already-escaped
  // text (an escaped ampersand followed by "lt;") is decoded exactly once.
  text = text.replace(
    /&(amp|lt|gt|quot|apos|#39);/g,
    (match: string, name: string) => entityMap[name] ?? match,
  );

  // Truncate
  if (text.length > maxLength) {
    text = text.substring(0, maxLength) + '...';
  }

  return text.trim();
}
|
|
|
|
/**
 * Formats a date string as ISO 8601, accepting whatever formats the
 * runtime's `Date` constructor can parse.
 *
 * @param dateString - Raw date text from the feed.
 * @returns The ISO string, or `undefined` when the input is empty, not a
 *   string, or not a parseable date.
 */
function toISODate(dateString: string): string | undefined {
  // Guard against non-string values reaching this point at runtime.
  if (typeof dateString !== 'string') return undefined;

  const trimmed = dateString.trim();
  if (!trimmed) return undefined;

  const date = new Date(trimmed);
  // An unparseable string yields an "Invalid Date" whose time value is NaN;
  // toISOString() would throw a RangeError on it.
  return Number.isNaN(date.getTime()) ? undefined : date.toISOString();
}
|
|
|
|
/**
 * Parses an RSS 2.0 feed.
 *
 * @param xmlObj - Parsed XML document; attributes are expected under the
 *   `@_` prefix and text nodes under `#text`.
 * @returns The feed metadata and its items.
 * @throws Error when the document has no `rss.channel` element.
 */
function parseRSS2(xmlObj: any): IParsedFeed {
  const channel = xmlObj.rss?.channel;
  if (!channel) {
    throw new Error('Invalid RSS 2.0 feed: missing channel element');
  }

  const feed: IParsedFeed = {
    items: [],
  };

  // Channel metadata
  if (channel.title) feed.title = getContent(channel.title);
  if (channel.description) feed.description = getContent(channel.description);
  if (channel.link) feed.link = getContent(channel.link);
  if (channel.language) feed.language = getContent(channel.language);
  if (channel.copyright) feed.copyright = getContent(channel.copyright);
  if (channel.generator) feed.generator = getContent(channel.generator);
  if (channel.lastBuildDate) feed.lastBuildDate = getContent(channel.lastBuildDate);

  // Feed URL from atom:link (first rel="self" link wins)
  if (channel['atom:link']) {
    const atomLinks = Array.isArray(channel['atom:link']) ? channel['atom:link'] : [channel['atom:link']];
    for (const link of atomLinks) {
      if (link?.['@_rel'] === 'self' && link['@_href']) {
        feed.feedUrl = link['@_href'];
        break;
      }
    }
  }

  // Image
  if (channel.image) {
    feed.image = {};
    if (channel.image.url) feed.image.url = getContent(channel.image.url);
    if (channel.image.title) feed.image.title = getContent(channel.image.title);
    if (channel.image.link) feed.image.link = getContent(channel.image.link);
  }

  // Items (a single <item> is parsed as an object, several as an array)
  const items = channel.item ? (Array.isArray(channel.item) ? channel.item : [channel.item]) : [];
  feed.items = items.map((xmlItem: any) => {
    const item: IParsedItem = {};

    if (xmlItem.title) item.title = getContent(xmlItem.title);
    if (xmlItem.link) item.link = getContent(xmlItem.link);
    if (xmlItem.description) {
      item.content = getContent(xmlItem.description);
      item.contentSnippet = getSnippet(item.content);
    }
    if (xmlItem.pubDate) {
      item.pubDate = getContent(xmlItem.pubDate);
      item.isoDate = toISODate(item.pubDate);
    }
    // dc:creator intentionally takes precedence over <author> when both exist.
    if (xmlItem.author) item.author = getContent(xmlItem.author);
    if (xmlItem['dc:creator']) item.author = getContent(xmlItem['dc:creator']);

    // ID/GUID: getContent already unwraps the text of a guid that carries
    // attributes (e.g. isPermaLink); fall back to the link otherwise.
    if (xmlItem.guid) {
      item.id = getContent(xmlItem.guid);
    }
    if (!item.id && xmlItem.link) {
      item.id = getContent(xmlItem.link);
    }

    // Enclosure: several <enclosure> elements are parsed as an array, so
    // normalise and use the first one that actually has a URL.
    const enclosures = Array.isArray(xmlItem.enclosure) ? xmlItem.enclosure : [xmlItem.enclosure];
    const enclosure = enclosures.find((candidate: any) => candidate?.['@_url']);
    if (enclosure) {
      item.enclosure = {
        url: enclosure['@_url'],
        type: enclosure['@_type'],
        length: enclosure['@_length'],
      };
    }

    // Categories
    if (xmlItem.category) {
      item.categories = Array.isArray(xmlItem.category)
        ? xmlItem.category.map((cat: any) => getContent(cat))
        : [getContent(xmlItem.category)];
    }

    return item;
  });

  return feed;
}
|
|
|
|
/**
 * Parses an Atom 1.0 feed.
 *
 * @param xmlObj - Parsed XML document; attributes are expected under the
 *   `@_` prefix and text nodes under `#text`.
 * @returns The feed metadata and its entries.
 * @throws Error when the document has no `feed` element.
 */
function parseAtom(xmlObj: any): IParsedFeed {
  const atomFeed = xmlObj.feed;
  if (!atomFeed) {
    throw new Error('Invalid Atom feed: missing feed element');
  }

  const feed: IParsedFeed = {
    items: [],
  };

  // Feed metadata
  if (atomFeed.title) feed.title = getContent(atomFeed.title);
  if (atomFeed.subtitle) feed.description = getContent(atomFeed.subtitle);
  // The feed id is only a provisional feedUrl; a rel="self" link below overrides it.
  if (atomFeed.id) feed.feedUrl = getContent(atomFeed.id);

  // Links. A link without a rel attribute is an "alternate" link per the
  // Atom specification, so it must not be skipped.
  if (atomFeed.link) {
    const links = Array.isArray(atomFeed.link) ? atomFeed.link : [atomFeed.link];
    for (const link of links) {
      const href = link?.['@_href'];
      if (!href) continue;
      const rel = link['@_rel'] ?? 'alternate';
      if (rel === 'alternate') {
        feed.link = href;
      }
      if (rel === 'self') {
        feed.feedUrl = href;
      }
    }
  }

  // Entries (a single <entry> is parsed as an object, several as an array)
  const entries = atomFeed.entry ? (Array.isArray(atomFeed.entry) ? atomFeed.entry : [atomFeed.entry]) : [];
  feed.items = entries.map((entry: any) => {
    const item: IParsedItem = {};

    if (entry.title) item.title = getContent(entry.title);
    if (entry.id) item.id = getContent(entry.id);

    // Link: prefer the first alternate link (explicit or implied by a
    // missing rel); otherwise fall back to the first link that has an href.
    if (entry.link) {
      const links = Array.isArray(entry.link) ? entry.link : [entry.link];
      const withHref = links.filter((link: any) => link?.['@_href']);
      const preferred =
        withHref.find((link: any) => (link['@_rel'] ?? 'alternate') === 'alternate') ?? withHref[0];
      if (preferred) {
        item.link = preferred['@_href'];
      }
    }

    // Dates: published wins, updated is the fallback.
    if (entry.published) {
      item.pubDate = getContent(entry.published);
      item.isoDate = toISODate(item.pubDate);
    } else if (entry.updated) {
      item.pubDate = getContent(entry.updated);
      item.isoDate = toISODate(item.pubDate);
    }

    // Author: several <author> elements are parsed as an array; use the first.
    const author = Array.isArray(entry.author) ? entry.author[0] : entry.author;
    if (author && author.name) {
      item.author = getContent(author.name);
    }

    // Content: full content wins, summary is the fallback.
    if (entry.content) {
      item.content = getContent(entry.content);
      item.contentSnippet = getSnippet(item.content);
    } else if (entry.summary) {
      item.content = getContent(entry.summary);
      item.contentSnippet = getSnippet(item.content);
    }

    return item;
  });

  return feed;
}
|
|
|
|
/**
 * Parses an RSS 1.0 (RDF) feed.
 *
 * @param xmlObj - Parsed XML document; attributes are expected under the
 *   `@_` prefix and text nodes under `#text`.
 * @returns The feed metadata and its items.
 * @throws Error when the document has no `rdf:RDF` root element.
 */
function parseRSS1(xmlObj: any): IParsedFeed {
  const rdf = xmlObj['rdf:RDF'];
  if (!rdf) {
    throw new Error('Invalid RSS 1.0 feed: missing rdf:RDF element');
  }

  const feed: IParsedFeed = {
    items: [],
  };

  const channel = rdf.channel;
  if (channel) {
    if (channel.title) feed.title = getContent(channel.title);
    if (channel.description) feed.description = getContent(channel.description);
    if (channel.link) feed.link = getContent(channel.link);
  }

  // Items are read from directly under rdf:RDF, not from the channel.
  const items = rdf.item ? (Array.isArray(rdf.item) ? rdf.item : [rdf.item]) : [];
  feed.items = items.map((xmlItem: any) => {
    const item: IParsedItem = {};

    if (xmlItem.title) item.title = getContent(xmlItem.title);
    if (xmlItem.link) item.link = getContent(xmlItem.link);
    if (xmlItem.description) {
      item.content = getContent(xmlItem.description);
      item.contentSnippet = getSnippet(item.content);
    }
    if (xmlItem['dc:date']) {
      item.pubDate = getContent(xmlItem['dc:date']);
      item.isoDate = toISODate(item.pubDate);
    }
    if (xmlItem['dc:creator']) {
      item.author = getContent(xmlItem['dc:creator']);
    }

    // ID: rdf:about when present, otherwise fall back to the link so the
    // item still has an identifier (same fallback the RSS 2.0 parser uses).
    if (xmlItem['@_rdf:about']) {
      item.id = xmlItem['@_rdf:about'];
    }
    if (!item.id && item.link) {
      item.id = item.link;
    }

    return item;
  });

  return feed;
}
|
|
|
|
/**
 * Detects the feed type (RSS 2.0 / 0.9x, Atom 1.0 or RSS 1.0 RDF) from the
 * document's root element and parses it accordingly.
 *
 * @param xmlString - Raw XML text of the feed.
 * @returns The parsed feed.
 * @throws Error when the root element is not a recognised feed format.
 */
export function parseFeedXML(xmlString: string): IParsedFeed {
  // NOTE(review): the option names match fast-xml-parser's XMLParser —
  // confirm that is what plugins.XMLParser re-exports.
  const parser = new plugins.XMLParser({
    ignoreAttributes: false,
    attributeNamePrefix: '@_',
    textNodeName: '#text',
    parseAttributeValue: false,
    // Keep element text as strings: numeric coercion would turn a title such
    // as "2024" into a number and strip leading zeros from ids like "007".
    parseTagValue: false,
  });

  const xmlObj = parser.parse(xmlString);

  // Detect feed type from the root element
  if (xmlObj?.rss?.channel) {
    // RSS 2.0 or 0.9x
    return parseRSS2(xmlObj);
  }
  if (xmlObj?.feed) {
    // Atom 1.0
    return parseAtom(xmlObj);
  }
  if (xmlObj?.['rdf:RDF']) {
    // RSS 1.0 (RDF)
    return parseRSS1(xmlObj);
  }

  throw new Error('Feed not recognized as RSS or Atom');
}
|