smartfeed/ts/lib/feedparser.ts

import * as plugins from '../plugins.js';

/**
 * Parsed feed structure compatible with rss-parser output.
 *
 * Only `items` is guaranteed to be present; every other field is set only
 * when the source feed carries a truthy value for it.
 */
export interface IParsedFeed {
  /** Feed title (RSS channel title or Atom feed title). */
  title?: string;
  /** Feed description (RSS channel description or Atom subtitle). */
  description?: string;
  /** Link to the site the feed belongs to (RSS channel link or Atom alternate link). */
  link?: string;
  /** URL of the feed itself (atom:link rel="self"; for Atom feeds the feed id is used when no self link exists). */
  feedUrl?: string;
  /** Feed image; populated by the RSS 2.0 parser from the channel's image element. */
  image?: {
    link?: string;
    url?: string;
    title?: string;
  };
  /** Parsed entries, in document order. Empty when the feed has no items. */
  items: IParsedItem[];
  /** Additional format-specific fields (the RSS 2.0 parser sets language, copyright, generator and lastBuildDate). */
  [key: string]: any;
}

/**
 * Parsed item structure compatible with rss-parser output.
 *
 * All fields are optional: each one is set only when the source item carries
 * the corresponding element.
 */
export interface IParsedItem {
  /** Item title. */
  title?: string;
  /** Link to the item. */
  link?: string;
  /** Publication date exactly as written in the feed (pubDate, published/updated, or dc:date). */
  pubDate?: string;
  /** Author name (RSS author / dc:creator, or the Atom author name). */
  author?: string;
  /** Item body (RSS description, or Atom content falling back to summary). */
  content?: string;
  /** Plain-text snippet derived from `content`. */
  contentSnippet?: string;
  /** Unique identifier (RSS guid falling back to the link, Atom id, or rdf:about). */
  id?: string;
  /** `pubDate` converted to an ISO 8601 string; undefined when the date cannot be parsed. */
  isoDate?: string;
  /** Additional format-specific fields (the RSS 2.0 parser sets enclosure and categories). */
  [key: string]: any;
}

/**
 * Extracts the text of a parsed XML node and always returns it as a string.
 *
 * Accepted shapes:
 * - `null` / `undefined`: returns `''`.
 * - string: returned unchanged.
 * - number / boolean (the parser may convert numeric-looking text): stringified.
 * - array (a repeated element): the content of the first element.
 * - object (an element with attributes): its `#text` value, or `_` as a
 *   fallback; `''` when the element carries attributes but no text.
 *
 * @param element - Parsed node value of unknown shape.
 * @returns The node text, or `''` when there is none.
 */
function getContent(element: any): string {
  if (element === null || element === undefined) return '';
  if (typeof element === 'string') return element;

  if (Array.isArray(element)) {
    // A repeated element has no single text value; use the first occurrence
    // instead of a comma-joined stringification of the whole array.
    return element.length > 0 ? getContent(element[0]) : '';
  }

  if (typeof element === 'object') {
    // Explicit emptiness checks (rather than truthiness) keep a numeric 0
    // text value from being treated as missing.
    const text = [element['#text'], element._].find(
      (candidate: any) => candidate !== undefined && candidate !== null && candidate !== '',
    );
    // An attribute-only element has no text: return '' rather than the
    // '[object Object]' that String() would produce.
    return text === undefined ? '' : String(text);
  }

  return String(element);
}

/**
 * Creates a plain-text snippet from HTML content.
 *
 * Tags are stripped, a small set of entities (`&amp;`, `&lt;`, `&gt;`,
 * `&quot;`, `&#39;`) is decoded, surrounding whitespace is trimmed, and the
 * result is cut to `maxLength` UTF-16 code units with `...` appended when it
 * had to be shortened.
 *
 * @param html - HTML (or plain text) to summarise.
 * @param maxLength - Maximum snippet length before the ellipsis is added.
 * @returns The snippet, or `''` for empty input.
 */
function getSnippet(html: string, maxLength: number = 200): string {
  if (!html) return '';

  const entities: Record<string, string> = {
    '&amp;': '&',
    '&lt;': '<',
    '&gt;': '>',
    '&quot;': '"',
    '&#39;': "'",
  };

  const text = html
    // Remove HTML tags
    .replace(/<[^>]+>/g, '')
    // Decode all entities in ONE pass: chained replacements would decode
    // "&amp;lt;" twice and turn it into "<" instead of "&lt;".
    .replace(/&(?:amp|lt|gt|quot|#39);/g, (entity) => entities[entity] ?? entity)
    // Trim before measuring so leading whitespace does not use up the budget.
    .trim();

  if (text.length <= maxLength) return text;

  // Do not cut between the two halves of a surrogate pair.
  let end = maxLength;
  const lastUnit = text.charCodeAt(end - 1);
  if (lastUnit >= 0xd800 && lastUnit <= 0xdbff) end -= 1;

  return text.substring(0, end).trimEnd() + '...';
}

/**
 * Converts a date string in any format the JavaScript `Date` parser accepts
 * into its ISO 8601 representation.
 *
 * @param dateString - Date text as found in the feed; surrounding whitespace is ignored.
 * @returns The ISO string, or `undefined` when the input is empty or not a parseable date.
 */
function toISODate(dateString: string): string | undefined {
  if (!dateString) {
    return undefined;
  }

  try {
    const timestamp = Date.parse(dateString.trim());
    // An unparseable date yields NaN rather than a usable timestamp.
    return Number.isNaN(timestamp) ? undefined : new Date(timestamp).toISOString();
  } catch (err) {
    return undefined;
  }
}

/**
 * Parses an RSS 2.0 (or 0.9x) document that has already been converted to a
 * plain object by the XML parser (attributes prefixed with `@_`, element text
 * under `#text`).
 *
 * @param xmlObj - Parsed XML document; the feed is read from `xmlObj.rss.channel`.
 * @returns Channel metadata plus one parsed entry per `<item>`.
 * @throws Error when `rss.channel` is missing.
 */
function parseRSS2(xmlObj: any): IParsedFeed {
  const channel = xmlObj.rss?.channel;
  if (!channel) {
    throw new Error('Invalid RSS 2.0 feed: missing channel element');
  }

  // A repeated element is parsed into an array, a single one into a bare value.
  const asArray = (value: any): any[] => (Array.isArray(value) ? value : [value]);

  const feed: IParsedFeed = {
    items: [],
  };

  // Channel metadata
  if (channel.title) feed.title = getContent(channel.title);
  if (channel.description) feed.description = getContent(channel.description);
  if (channel.link) feed.link = getContent(channel.link);
  if (channel.language) feed.language = getContent(channel.language);
  if (channel.copyright) feed.copyright = getContent(channel.copyright);
  if (channel.generator) feed.generator = getContent(channel.generator);
  if (channel.lastBuildDate) feed.lastBuildDate = getContent(channel.lastBuildDate);

  // Feed URL: the first atom:link with rel="self"
  if (channel['atom:link']) {
    for (const link of asArray(channel['atom:link'])) {
      if (link?.['@_rel'] === 'self' && link['@_href']) {
        feed.feedUrl = link['@_href'];
        break;
      }
    }
  }

  // Image
  if (channel.image) {
    feed.image = {};
    if (channel.image.url) feed.image.url = getContent(channel.image.url);
    if (channel.image.title) feed.image.title = getContent(channel.image.title);
    if (channel.image.link) feed.image.link = getContent(channel.image.link);
  }

  // Items
  const items = channel.item ? asArray(channel.item) : [];
  feed.items = items.map((xmlItem: any) => {
    const item: IParsedItem = {};

    if (xmlItem.title) item.title = getContent(xmlItem.title);
    if (xmlItem.link) item.link = getContent(xmlItem.link);
    if (xmlItem.description) {
      item.content = getContent(xmlItem.description);
      item.contentSnippet = getSnippet(item.content);
    }
    if (xmlItem.pubDate) {
      item.pubDate = getContent(xmlItem.pubDate);
      item.isoDate = toISODate(item.pubDate);
    }
    if (xmlItem.author) item.author = getContent(xmlItem.author);
    // dc:creator, when present, takes precedence over <author>.
    if (xmlItem['dc:creator']) item.author = getContent(xmlItem['dc:creator']);

    // ID/GUID
    if (xmlItem.guid) {
      const guid = asArray(xmlItem.guid)[0];
      // A guid carrying attributes (e.g. isPermaLink) is an object whose text
      // sits under '#text'; otherwise the guid is the bare value itself.
      const guidText = guid !== null && typeof guid === 'object' ? guid['#text'] ?? guid._ : guid;
      // Always store a string (the parser may hand back a number), and leave
      // the id unset for a text-less guid so the link fallback below applies.
      if (guidText !== undefined && guidText !== null && guidText !== '') {
        item.id = String(guidText);
      }
    }
    if (!item.id && xmlItem.link) {
      item.id = getContent(xmlItem.link);
    }

    // Enclosure: with several <enclosure> elements the value is an array;
    // use the first one that has a url instead of dropping them all.
    if (xmlItem.enclosure) {
      const enclosure = asArray(xmlItem.enclosure).find(
        (candidate: any) => candidate && candidate['@_url'],
      );
      if (enclosure) {
        item.enclosure = {
          url: enclosure['@_url'],
          type: enclosure['@_type'],
          length: enclosure['@_length'],
        };
      }
    }

    // Categories
    if (xmlItem.category) {
      item.categories = asArray(xmlItem.category).map((cat: any) => getContent(cat));
    }

    return item;
  });

  return feed;
}

/**
 * Parses an Atom 1.0 document that has already been converted to a plain
 * object by the XML parser (attributes prefixed with `@_`).
 *
 * @param xmlObj - Parsed XML document; the feed is read from `xmlObj.feed`.
 * @returns Feed metadata plus one parsed entry per `<entry>`.
 * @throws Error when the `feed` root element is missing.
 */
function parseAtom(xmlObj: any): IParsedFeed {
  const atomFeed = xmlObj.feed;
  if (!atomFeed) {
    throw new Error('Invalid Atom feed: missing feed element');
  }

  // A repeated element is parsed into an array, a single one into a bare value.
  const asArray = (value: any): any[] => (Array.isArray(value) ? value : [value]);
  // RFC 4287 section 4.2.7.2: a link without a rel attribute must be
  // interpreted as rel="alternate".
  const relOf = (link: any): string => link?.['@_rel'] || 'alternate';

  const feed: IParsedFeed = {
    items: [],
  };

  // Feed metadata
  if (atomFeed.title) feed.title = getContent(atomFeed.title);
  if (atomFeed.subtitle) feed.description = getContent(atomFeed.subtitle);
  // The feed id is only a provisional feed URL; a rel="self" link overrides it.
  if (atomFeed.id) feed.feedUrl = getContent(atomFeed.id);

  // Links
  if (atomFeed.link) {
    for (const link of asArray(atomFeed.link)) {
      const href = link?.['@_href'];
      if (!href) continue;
      const rel = relOf(link);
      if (rel === 'alternate') feed.link = href;
      if (rel === 'self') feed.feedUrl = href;
    }
  }

  // Entries
  const entries = atomFeed.entry ? asArray(atomFeed.entry) : [];
  feed.items = entries.map((entry: any) => {
    const item: IParsedItem = {};

    if (entry.title) item.title = getContent(entry.title);
    if (entry.id) item.id = getContent(entry.id);

    // Link: the first alternate link wins; if there is none, fall back to the
    // first link of any relation that has an href.
    if (entry.link) {
      for (const link of asArray(entry.link)) {
        const href = link?.['@_href'];
        if (!href) continue;
        if (relOf(link) === 'alternate') {
          item.link = href;
          break;
        }
        if (!item.link) {
          item.link = href;
        }
      }
    }

    // Dates: prefer the publication date, fall back to the last update.
    if (entry.published) {
      item.pubDate = getContent(entry.published);
      item.isoDate = toISODate(item.pubDate);
    } else if (entry.updated) {
      item.pubDate = getContent(entry.updated);
      item.isoDate = toISODate(item.pubDate);
    }

    // Author: an entry may list several authors (parsed as an array); use the
    // first one that has a name.
    if (entry.author) {
      const author = asArray(entry.author).find((candidate: any) => candidate && candidate.name);
      if (author) {
        item.author = getContent(author.name);
      }
    }

    // Content: prefer the full content, fall back to the summary.
    if (entry.content) {
      item.content = getContent(entry.content);
      item.contentSnippet = getSnippet(item.content);
    } else if (entry.summary) {
      item.content = getContent(entry.summary);
      item.contentSnippet = getSnippet(item.content);
    }

    return item;
  });

  return feed;
}

/**
 * Converts a parsed RSS 1.0 (RDF) document into the common feed shape.
 *
 * Channel metadata is read from `rdf:RDF > channel` and items from
 * `rdf:RDF > item`. Throws when the `rdf:RDF` root element is absent.
 */
function parseRSS1(xmlObj: any): IParsedFeed {
  const root = xmlObj['rdf:RDF'];
  if (!root) {
    throw new Error('Invalid RSS 1.0 feed: missing rdf:RDF element');
  }

  const result: IParsedFeed = { items: [] };

  // Channel-level metadata; the channel element itself may be missing.
  const meta = root.channel;
  if (meta) {
    if (meta.title) {
      result.title = getContent(meta.title);
    }
    if (meta.description) {
      result.description = getContent(meta.description);
    }
    if (meta.link) {
      result.link = getContent(meta.link);
    }
  }

  // Normalise the item list: absent -> empty, single item -> one-element array.
  let rawItems: any[];
  if (!root.item) {
    rawItems = [];
  } else if (Array.isArray(root.item)) {
    rawItems = root.item;
  } else {
    rawItems = [root.item];
  }

  const parsedItems: IParsedItem[] = [];
  for (const node of rawItems) {
    const entry: IParsedItem = {};

    if (node.title) {
      entry.title = getContent(node.title);
    }
    if (node.link) {
      entry.link = getContent(node.link);
    }
    if (node.description) {
      const body = getContent(node.description);
      entry.content = body;
      entry.contentSnippet = getSnippet(body);
    }

    const rawDate = node['dc:date'];
    if (rawDate) {
      const dateText = getContent(rawDate);
      entry.pubDate = dateText;
      entry.isoDate = toISODate(dateText);
    }

    const creator = node['dc:creator'];
    if (creator) {
      entry.author = getContent(creator);
    }

    // The rdf:about attribute serves as the item identifier.
    const about = node['@_rdf:about'];
    if (about) {
      entry.id = about;
    }

    parsedItems.push(entry);
  }
  result.items = parsedItems;

  return result;
}

/**
 * Parses a feed document and converts it to the common feed shape.
 *
 * The format is detected from the root element: `rss` with a `channel`
 * (RSS 2.0 / 0.9x), `feed` (Atom 1.0) or `rdf:RDF` (RSS 1.0).
 *
 * @param xmlString - Raw XML of the feed.
 * @returns The parsed feed.
 * @throws Error when the document is none of the supported feed formats, or
 *   when the format-specific parser rejects it.
 */
export function parseFeedXML(xmlString: string): IParsedFeed {
  const parser = new plugins.XMLParser({
    ignoreAttributes: false,
    attributeNamePrefix: '@_',
    textNodeName: '#text',
    parseAttributeValue: false,
    // Keep element text as strings. With tag-value parsing enabled,
    // numeric-looking text is converted to numbers, which corrupts feed data
    // ("007" -> 7, "1.50" -> 1.5, "0x10" -> 16) and hands numbers to code
    // that expects strings.
    // NOTE(review): assumes plugins.XMLParser is fast-xml-parser's XMLParser
    // (the option names above match its API) — confirm.
    parseTagValue: false,
  });

  const xmlObj = parser.parse(xmlString);

  // Detect feed type
  if (xmlObj.rss && xmlObj.rss.channel) {
    // RSS 2.0 or 0.9x
    return parseRSS2(xmlObj);
  } else if (xmlObj.feed) {
    // Atom 1.0
    return parseAtom(xmlObj);
  } else if (xmlObj['rdf:RDF']) {
    // RSS 1.0 (RDF)
    return parseRSS1(xmlObj);
  } else {
    throw new Error('Feed not recognized as RSS or Atom');
  }
}