Files
smartsitemap/ts/smartsitemap.classes.sitemapparser.ts

252 lines
9.3 KiB
TypeScript
Raw Normal View History

import * as plugins from './smartsitemap.plugins.js';
import type * as interfaces from './interfaces/index.js';
import { UrlsetBuilder } from './smartsitemap.classes.urlsetbuilder.js';
/**
 * Parses existing sitemap XML into structured data.
 * Handles both <urlset> sitemaps and <sitemapindex> files.
 *
 * All extraction helpers operate on the "ordered" parser output: arrays of
 * objects where each object carries one element key (its value being the
 * array of child nodes) and, optionally, a `:@` key holding attributes.
 */
export class SitemapParser {
  /**
   * Parse a sitemap XML string into structured data.
   *
   * @param xml Raw sitemap XML.
   * @returns The parsed sitemap; `urls` is filled for a `<urlset>` root and
   *   `sitemaps` for a `<sitemapindex>` root.
   */
  static async parse(xml: string): Promise<interfaces.IParsedSitemap> {
    const smartXml = new plugins.smartxml.SmartXml();
    const parsed = smartXml.parseXmlToObject(xml);
    // The parser returns ordered format (preserveOrder: true).
    // We need to walk the structure to extract urls or sitemap entries.
    return SitemapParser.processOrderedParsed(parsed);
  }

  /**
   * Fetch and parse a sitemap from a URL.
   *
   * NOTE(review): the response status is not inspected here; whatever body
   * comes back is handed to the parser as-is. Confirm whether callers expect
   * a rejection on non-success responses.
   *
   * @param url Location of the sitemap to download.
   */
  static async parseUrl(url: string): Promise<interfaces.IParsedSitemap> {
    const response = await plugins.webrequest.webrequest(url);
    const xml = await response.text();
    return SitemapParser.parse(xml);
  }

  /**
   * Parse a sitemap XML and return a pre-populated UrlsetBuilder.
   *
   * Only the parsed `urls` are passed to the builder; entries of a
   * `<sitemapindex>` document are not forwarded.
   *
   * @param xml Raw sitemap XML.
   * @param options Optional settings handed to the UrlsetBuilder constructor.
   */
  static async toBuilder(xml: string, options?: interfaces.ISitemapOptions): Promise<UrlsetBuilder> {
    const parsed = await SitemapParser.parse(xml);
    const builder = new UrlsetBuilder(options);
    builder.addUrls(parsed.urls);
    return builder;
  }

  /**
   * Detect whether XML is a urlset or sitemapindex without full parsing.
   *
   * This is a plain substring check; `<urlset` is tested first.
   */
  static detectType(xml: string): 'urlset' | 'sitemapindex' | 'unknown' {
    if (xml.includes('<urlset')) return 'urlset';
    if (xml.includes('<sitemapindex')) return 'sitemapindex';
    return 'unknown';
  }

  /**
   * Process the ordered-format output from smartxml's parseXmlToObject.
   * The ordered format uses arrays of objects where each object has a single key.
   *
   * @returns A result whose `type` defaults to `'urlset'` with empty lists
   *   when no known root element is found or the input is not an array.
   */
  private static processOrderedParsed(parsed: any[]): interfaces.IParsedSitemap {
    const result: interfaces.IParsedSitemap = {
      type: 'urlset',
      urls: [],
      sitemaps: [],
    };
    if (!Array.isArray(parsed)) {
      return result;
    }
    for (const node of parsed) {
      if (!SitemapParser.isNode(node)) continue;
      if (node.urlset) {
        result.type = 'urlset';
        result.urls = SitemapParser.extractUrls(node.urlset);
      } else if (node.sitemapindex) {
        result.type = 'sitemapindex';
        result.sitemaps = SitemapParser.extractIndexEntries(node.sitemapindex);
      }
    }
    return result;
  }

  /**
   * Extract URL entries from an ordered-format urlset.
   * Entries without a `<loc>` value are dropped.
   */
  private static extractUrls(urlsetNodes: any[]): interfaces.ISitemapUrl[] {
    const urls: interfaces.ISitemapUrl[] = [];
    if (!Array.isArray(urlsetNodes)) return urls;
    for (const node of urlsetNodes) {
      if (!SitemapParser.isNode(node) || !node.url) continue;
      const urlData = SitemapParser.extractUrlData(node.url);
      if (urlData) urls.push(urlData);
    }
    return urls;
  }

  /**
   * Extract a single URL entry from ordered-format nodes.
   *
   * @returns The entry, or `null` when the input is not an array or no
   *   non-empty `<loc>` was found.
   */
  private static extractUrlData(urlNodes: any[]): interfaces.ISitemapUrl | null {
    if (!Array.isArray(urlNodes)) return null;
    const url: interfaces.ISitemapUrl = { loc: '' };
    for (const node of urlNodes) {
      if (!SitemapParser.isNode(node)) continue;
      if (node.loc) {
        url.loc = SitemapParser.extractText(node.loc);
      } else if (node.lastmod) {
        url.lastmod = SitemapParser.extractText(node.lastmod);
      } else if (node.changefreq) {
        url.changefreq = SitemapParser.extractText(node.changefreq) as interfaces.TChangeFreq;
      } else if (node.priority) {
        // Skip empty or non-numeric values instead of storing NaN.
        const priority = SitemapParser.toFiniteFloat(SitemapParser.extractText(node.priority));
        if (priority !== undefined) url.priority = priority;
      } else if (node['image:image']) {
        if (!url.images) url.images = [];
        url.images.push(SitemapParser.extractImageData(node['image:image']));
      } else if (node['video:video']) {
        if (!url.videos) url.videos = [];
        url.videos.push(SitemapParser.extractVideoData(node['video:video']));
      } else if (node['news:news']) {
        url.news = SitemapParser.extractNewsData(node['news:news']);
      } else if (node['xhtml:link']) {
        // Attributes live next to the element key under ':@'.
        const attrs = node[':@'] || {};
        if (attrs['@_hreflang'] && attrs['@_href']) {
          // Create the list lazily so incomplete links do not leave an empty array behind.
          if (!url.alternates) url.alternates = [];
          url.alternates.push({
            hreflang: attrs['@_hreflang'],
            href: attrs['@_href'],
          });
        }
      }
    }
    return url.loc ? url : null;
  }

  /**
   * Extract image data from ordered-format nodes.
   * Missing fields stay unset; `loc` defaults to an empty string.
   */
  private static extractImageData(nodes: any[]): interfaces.ISitemapImage {
    const img: interfaces.ISitemapImage = { loc: '' };
    if (!Array.isArray(nodes)) return img;
    for (const node of nodes) {
      if (!SitemapParser.isNode(node)) continue;
      if (node['image:loc']) img.loc = SitemapParser.extractText(node['image:loc']);
      else if (node['image:caption']) img.caption = SitemapParser.extractText(node['image:caption']);
      else if (node['image:title']) img.title = SitemapParser.extractText(node['image:title']);
      else if (node['image:geo_location']) img.geoLocation = SitemapParser.extractText(node['image:geo_location']);
      else if (node['image:license']) img.licenseUrl = SitemapParser.extractText(node['image:license']);
    }
    return img;
  }

  /**
   * Extract video data from ordered-format nodes.
   *
   * Numeric fields (`duration`, `rating`, `viewCount`) are only set when the
   * text parses to a finite number; integers are parsed as base 10. Boolean
   * fields are `true` only for the exact text `yes`.
   */
  private static extractVideoData(nodes: any[]): interfaces.ISitemapVideo {
    const vid: interfaces.ISitemapVideo = { thumbnailLoc: '', title: '', description: '' };
    if (!Array.isArray(nodes)) return vid;
    for (const node of nodes) {
      if (!SitemapParser.isNode(node)) continue;
      if (node['video:thumbnail_loc']) {
        vid.thumbnailLoc = SitemapParser.extractText(node['video:thumbnail_loc']);
      } else if (node['video:title']) {
        vid.title = SitemapParser.extractText(node['video:title']);
      } else if (node['video:description']) {
        vid.description = SitemapParser.extractText(node['video:description']);
      } else if (node['video:content_loc']) {
        vid.contentLoc = SitemapParser.extractText(node['video:content_loc']);
      } else if (node['video:player_loc']) {
        vid.playerLoc = SitemapParser.extractText(node['video:player_loc']);
      } else if (node['video:duration']) {
        const duration = SitemapParser.toFiniteInt(SitemapParser.extractText(node['video:duration']));
        if (duration !== undefined) vid.duration = duration;
      } else if (node['video:rating']) {
        const rating = SitemapParser.toFiniteFloat(SitemapParser.extractText(node['video:rating']));
        if (rating !== undefined) vid.rating = rating;
      } else if (node['video:view_count']) {
        const viewCount = SitemapParser.toFiniteInt(SitemapParser.extractText(node['video:view_count']));
        if (viewCount !== undefined) vid.viewCount = viewCount;
      } else if (node['video:publication_date']) {
        vid.publicationDate = SitemapParser.extractText(node['video:publication_date']);
      } else if (node['video:family_friendly']) {
        vid.familyFriendly = SitemapParser.extractText(node['video:family_friendly']) === 'yes';
      } else if (node['video:live']) {
        vid.live = SitemapParser.extractText(node['video:live']) === 'yes';
      } else if (node['video:requires_subscription']) {
        vid.requiresSubscription = SitemapParser.extractText(node['video:requires_subscription']) === 'yes';
      } else if (node['video:tag']) {
        if (!vid.tags) vid.tags = [];
        vid.tags.push(SitemapParser.extractText(node['video:tag']));
      }
    }
    return vid;
  }

  /**
   * Extract news data from ordered-format nodes.
   * Required string fields default to empty strings when absent.
   */
  private static extractNewsData(nodes: any[]): interfaces.ISitemapNews {
    const news: interfaces.ISitemapNews = {
      publication: { name: '', language: '' },
      publicationDate: '',
      title: '',
    };
    if (!Array.isArray(nodes)) return news;
    for (const node of nodes) {
      if (!SitemapParser.isNode(node)) continue;
      if (node['news:publication']) {
        const pubNodes = node['news:publication'];
        if (Array.isArray(pubNodes)) {
          for (const pNode of pubNodes) {
            if (!SitemapParser.isNode(pNode)) continue;
            if (pNode['news:name']) {
              news.publication.name = SitemapParser.extractText(pNode['news:name']);
            } else if (pNode['news:language']) {
              news.publication.language = SitemapParser.extractText(pNode['news:language']);
            }
          }
        }
      } else if (node['news:publication_date']) {
        news.publicationDate = SitemapParser.extractText(node['news:publication_date']);
      } else if (node['news:title']) {
        news.title = SitemapParser.extractText(node['news:title']);
      } else if (node['news:keywords']) {
        news.keywords = SitemapParser.extractText(node['news:keywords']);
      }
    }
    return news;
  }

  /**
   * Extract sitemap index entries from ordered-format nodes.
   * Entries without a `<loc>` value are dropped.
   */
  private static extractIndexEntries(indexNodes: any[]): interfaces.ISitemapIndexEntry[] {
    const entries: interfaces.ISitemapIndexEntry[] = [];
    if (!Array.isArray(indexNodes)) return entries;
    for (const node of indexNodes) {
      if (!SitemapParser.isNode(node) || !node.sitemap) continue;
      const entry: interfaces.ISitemapIndexEntry = { loc: '' };
      if (Array.isArray(node.sitemap)) {
        for (const sNode of node.sitemap) {
          if (!SitemapParser.isNode(sNode)) continue;
          if (sNode.loc) entry.loc = SitemapParser.extractText(sNode.loc);
          else if (sNode.lastmod) entry.lastmod = SitemapParser.extractText(sNode.lastmod);
        }
      }
      if (entry.loc) entries.push(entry);
    }
    return entries;
  }

  /**
   * Extract text content from an ordered-format node.
   * In ordered format, text is stored as [{ '#text': 'value' }].
   *
   * @returns The first `#text` value found (stringified), the input itself
   *   when it is already a string or number, or `''` otherwise.
   */
  private static extractText(nodes: any): string {
    if (typeof nodes === 'string') return nodes;
    if (typeof nodes === 'number') return String(nodes);
    if (Array.isArray(nodes)) {
      for (const n of nodes) {
        // Guard against null/primitive entries before indexing into them.
        if (SitemapParser.isNode(n) && n['#text'] != null) return String(n['#text']);
      }
    }
    return '';
  }

  /** True when `value` is a non-null object and can safely be indexed by key. */
  private static isNode(value: unknown): value is Record<string, any> {
    return typeof value === 'object' && value !== null;
  }

  /** Parse a decimal number; returns `undefined` instead of NaN/Infinity. */
  private static toFiniteFloat(text: string): number | undefined {
    const value = Number.parseFloat(text);
    return Number.isFinite(value) ? value : undefined;
  }

  /** Parse a base-10 integer; returns `undefined` when the text is not numeric. */
  private static toFiniteInt(text: string): number | undefined {
    const value = Number.parseInt(text, 10);
    return Number.isFinite(value) ? value : undefined;
  }
}