252 lines
9.3 KiB
TypeScript
252 lines
9.3 KiB
TypeScript
import * as plugins from './smartsitemap.plugins.js';
|
|
import type * as interfaces from './interfaces/index.js';
|
|
import { UrlsetBuilder } from './smartsitemap.classes.urlsetbuilder.js';
|
|
|
|
/**
 * Parses existing sitemap XML into structured data.
 * Handles both <urlset> sitemaps and <sitemapindex> files.
 *
 * All methods are static. The private helpers walk the "ordered" object
 * format, in which every element is an object with a single tag key whose
 * value is the array of its children (attributes, when present, live under
 * the sibling `:@` key).
 */
export class SitemapParser {
  /**
   * Parse a sitemap XML string into structured data.
   *
   * @param xml - Raw sitemap XML.
   * @returns The parsed sitemap. `urls` is filled when a `urlset` root is
   *   found, `sitemaps` when a `sitemapindex` root is found.
   */
  static async parse(xml: string): Promise<interfaces.IParsedSitemap> {
    const smartXml = new plugins.smartxml.SmartXml();
    const parsed = smartXml.parseXmlToObject(xml);

    // The parser returns ordered format (preserveOrder: true), so the
    // structure has to be walked to extract urls or sitemap entries.
    return SitemapParser.processOrderedParsed(parsed);
  }

  /**
   * Fetch and parse a sitemap from a URL.
   *
   * The response body is handed to {@link SitemapParser.parse} as-is; this
   * method does not inspect the HTTP status itself.
   *
   * @param url - Location of the sitemap to download.
   * @returns The parsed sitemap.
   */
  static async parseUrl(url: string): Promise<interfaces.IParsedSitemap> {
    const response = await plugins.webrequest.webrequest(url);
    const xml = await response.text();
    return SitemapParser.parse(xml);
  }

  /**
   * Parse a sitemap XML and return a pre-populated UrlsetBuilder.
   *
   * Only the parsed `urls` are added to the builder; entries of a sitemap
   * index are not.
   *
   * @param xml - Raw sitemap XML.
   * @param options - Options forwarded to the UrlsetBuilder constructor.
   * @returns A builder containing every URL found in `xml`.
   */
  static async toBuilder(xml: string, options?: interfaces.ISitemapOptions): Promise<UrlsetBuilder> {
    const parsed = await SitemapParser.parse(xml);
    const builder = new UrlsetBuilder(options);
    builder.addUrls(parsed.urls);
    return builder;
  }

  /**
   * Detect whether XML is a urlset or sitemapindex without full parsing.
   *
   * This is a plain substring check; `<urlset` is tested first.
   *
   * @param xml - Raw sitemap XML.
   * @returns `'urlset'`, `'sitemapindex'`, or `'unknown'` when neither tag
   *   opener occurs in the text.
   */
  static detectType(xml: string): 'urlset' | 'sitemapindex' | 'unknown' {
    if (xml.includes('<urlset')) return 'urlset';
    if (xml.includes('<sitemapindex')) return 'sitemapindex';
    return 'unknown';
  }

  /**
   * Process the ordered-format output from smartxml's parseXmlToObject.
   * The ordered format uses arrays of objects where each object has a single key.
   *
   * @param parsed - Top-level node list of the document.
   * @returns The structured sitemap. When `parsed` is not an array, or holds
   *   neither root element, an empty result typed `'urlset'` is returned.
   */
  private static processOrderedParsed(parsed: any[]): interfaces.IParsedSitemap {
    const result: interfaces.IParsedSitemap = {
      type: 'urlset',
      urls: [],
      sitemaps: [],
    };

    if (!Array.isArray(parsed)) {
      return result;
    }

    for (const node of parsed) {
      if (node.urlset) {
        result.type = 'urlset';
        result.urls = SitemapParser.extractUrls(node.urlset);
      } else if (node.sitemapindex) {
        result.type = 'sitemapindex';
        result.sitemaps = SitemapParser.extractIndexEntries(node.sitemapindex);
      }
    }

    return result;
  }

  /**
   * Extract URL entries from an ordered-format urlset.
   *
   * @param urlsetNodes - Children of the `urlset` element.
   * @returns One entry per `url` child that yielded a non-empty `loc`.
   */
  private static extractUrls(urlsetNodes: any[]): interfaces.ISitemapUrl[] {
    const urls: interfaces.ISitemapUrl[] = [];

    if (!Array.isArray(urlsetNodes)) return urls;

    for (const node of urlsetNodes) {
      if (node.url) {
        const urlData = SitemapParser.extractUrlData(node.url);
        if (urlData) urls.push(urlData);
      }
    }

    return urls;
  }

  /**
   * Extract a single URL entry from ordered-format nodes.
   *
   * @param urlNodes - Children of one `url` element.
   * @returns The URL entry, or `null` when `urlNodes` is not an array or no
   *   non-empty `loc` was found.
   */
  private static extractUrlData(urlNodes: any[]): interfaces.ISitemapUrl | null {
    if (!Array.isArray(urlNodes)) return null;

    const url: interfaces.ISitemapUrl = { loc: '' };

    for (const node of urlNodes) {
      if (node.loc) {
        url.loc = SitemapParser.extractText(node.loc);
      } else if (node.lastmod) {
        url.lastmod = SitemapParser.extractText(node.lastmod);
      } else if (node.changefreq) {
        url.changefreq = SitemapParser.extractText(node.changefreq) as interfaces.TChangeFreq;
      } else if (node.priority) {
        // Leave priority unset instead of storing NaN for empty/malformed text.
        const priority = SitemapParser.toFiniteFloat(SitemapParser.extractText(node.priority));
        if (priority !== undefined) url.priority = priority;
      } else if (node['image:image']) {
        if (!url.images) url.images = [];
        url.images.push(SitemapParser.extractImageData(node['image:image']));
      } else if (node['video:video']) {
        if (!url.videos) url.videos = [];
        url.videos.push(SitemapParser.extractVideoData(node['video:video']));
      } else if (node['news:news']) {
        url.news = SitemapParser.extractNewsData(node['news:news']);
      } else if (node['xhtml:link']) {
        if (!url.alternates) url.alternates = [];
        // Attributes of the link element sit next to the tag key under ':@'.
        const attrs = node[':@'] || {};
        if (attrs['@_hreflang'] && attrs['@_href']) {
          url.alternates.push({
            hreflang: attrs['@_hreflang'],
            href: attrs['@_href'],
          });
        }
      }
    }

    // An entry without a location is unusable, so it is dropped.
    return url.loc ? url : null;
  }

  /**
   * Extract image data from ordered-format nodes.
   *
   * @param nodes - Children of one `image:image` element.
   * @returns The image entry; `loc` stays `''` when no `image:loc` is present.
   */
  private static extractImageData(nodes: any[]): interfaces.ISitemapImage {
    const img: interfaces.ISitemapImage = { loc: '' };
    if (!Array.isArray(nodes)) return img;

    for (const node of nodes) {
      if (node['image:loc']) {
        img.loc = SitemapParser.extractText(node['image:loc']);
      } else if (node['image:caption']) {
        img.caption = SitemapParser.extractText(node['image:caption']);
      } else if (node['image:title']) {
        img.title = SitemapParser.extractText(node['image:title']);
      } else if (node['image:geo_location']) {
        img.geoLocation = SitemapParser.extractText(node['image:geo_location']);
      } else if (node['image:license']) {
        img.licenseUrl = SitemapParser.extractText(node['image:license']);
      }
    }

    return img;
  }

  /**
   * Extract video data from ordered-format nodes.
   *
   * Numeric fields (`duration`, `rating`, `viewCount`) are only set when the
   * element text parses to a finite number. Boolean fields are `true` only
   * for the exact text `yes`.
   *
   * @param nodes - Children of one `video:video` element.
   * @returns The video entry; required string fields default to `''`.
   */
  private static extractVideoData(nodes: any[]): interfaces.ISitemapVideo {
    const vid: interfaces.ISitemapVideo = { thumbnailLoc: '', title: '', description: '' };
    if (!Array.isArray(nodes)) return vid;

    for (const node of nodes) {
      if (node['video:thumbnail_loc']) {
        vid.thumbnailLoc = SitemapParser.extractText(node['video:thumbnail_loc']);
      } else if (node['video:title']) {
        vid.title = SitemapParser.extractText(node['video:title']);
      } else if (node['video:description']) {
        vid.description = SitemapParser.extractText(node['video:description']);
      } else if (node['video:content_loc']) {
        vid.contentLoc = SitemapParser.extractText(node['video:content_loc']);
      } else if (node['video:player_loc']) {
        vid.playerLoc = SitemapParser.extractText(node['video:player_loc']);
      } else if (node['video:duration']) {
        const duration = SitemapParser.toFiniteInt(SitemapParser.extractText(node['video:duration']));
        if (duration !== undefined) vid.duration = duration;
      } else if (node['video:rating']) {
        const rating = SitemapParser.toFiniteFloat(SitemapParser.extractText(node['video:rating']));
        if (rating !== undefined) vid.rating = rating;
      } else if (node['video:view_count']) {
        const viewCount = SitemapParser.toFiniteInt(SitemapParser.extractText(node['video:view_count']));
        if (viewCount !== undefined) vid.viewCount = viewCount;
      } else if (node['video:publication_date']) {
        vid.publicationDate = SitemapParser.extractText(node['video:publication_date']);
      } else if (node['video:family_friendly']) {
        vid.familyFriendly = SitemapParser.extractText(node['video:family_friendly']) === 'yes';
      } else if (node['video:live']) {
        vid.live = SitemapParser.extractText(node['video:live']) === 'yes';
      } else if (node['video:requires_subscription']) {
        vid.requiresSubscription = SitemapParser.extractText(node['video:requires_subscription']) === 'yes';
      } else if (node['video:tag']) {
        if (!vid.tags) vid.tags = [];
        vid.tags.push(SitemapParser.extractText(node['video:tag']));
      }
    }

    return vid;
  }

  /**
   * Extract news data from ordered-format nodes.
   *
   * @param nodes - Children of one `news:news` element.
   * @returns The news entry; required string fields default to `''`.
   */
  private static extractNewsData(nodes: any[]): interfaces.ISitemapNews {
    const news: interfaces.ISitemapNews = {
      publication: { name: '', language: '' },
      publicationDate: '',
      title: '',
    };
    if (!Array.isArray(nodes)) return news;

    for (const node of nodes) {
      if (node['news:publication']) {
        const pubNodes = node['news:publication'];
        if (Array.isArray(pubNodes)) {
          for (const pNode of pubNodes) {
            if (pNode['news:name']) {
              news.publication.name = SitemapParser.extractText(pNode['news:name']);
            } else if (pNode['news:language']) {
              news.publication.language = SitemapParser.extractText(pNode['news:language']);
            }
          }
        }
      } else if (node['news:publication_date']) {
        news.publicationDate = SitemapParser.extractText(node['news:publication_date']);
      } else if (node['news:title']) {
        news.title = SitemapParser.extractText(node['news:title']);
      } else if (node['news:keywords']) {
        news.keywords = SitemapParser.extractText(node['news:keywords']);
      }
    }

    return news;
  }

  /**
   * Extract sitemap index entries from ordered-format nodes.
   *
   * @param indexNodes - Children of the `sitemapindex` element.
   * @returns One entry per `sitemap` child that yielded a non-empty `loc`.
   */
  private static extractIndexEntries(indexNodes: any[]): interfaces.ISitemapIndexEntry[] {
    const entries: interfaces.ISitemapIndexEntry[] = [];

    if (!Array.isArray(indexNodes)) return entries;

    for (const node of indexNodes) {
      if (node.sitemap) {
        const entry: interfaces.ISitemapIndexEntry = { loc: '' };
        if (Array.isArray(node.sitemap)) {
          for (const sNode of node.sitemap) {
            if (sNode.loc) {
              entry.loc = SitemapParser.extractText(sNode.loc);
            } else if (sNode.lastmod) {
              entry.lastmod = SitemapParser.extractText(sNode.lastmod);
            }
          }
        }
        if (entry.loc) entries.push(entry);
      }
    }

    return entries;
  }

  /**
   * Extract text content from an ordered-format node.
   * In ordered format, text is stored as [{ '#text': 'value' }].
   *
   * @param nodes - A string, a number, or an ordered-format child array.
   * @returns The string/number input as a string, the first `#text` value of
   *   an array input as a string, or `''` when no text is found.
   */
  private static extractText(nodes: any): string {
    if (typeof nodes === 'string') return nodes;
    if (typeof nodes === 'number') return String(nodes);
    if (Array.isArray(nodes)) {
      for (const n of nodes) {
        // Skip null/undefined entries instead of throwing on property access.
        if (n != null && n['#text'] != null) return String(n['#text']);
      }
    }
    return '';
  }

  /**
   * Parse a decimal number, rejecting anything that is not finite.
   *
   * @param text - Text to parse.
   * @returns The parsed number, or `undefined` for NaN/Infinity.
   */
  private static toFiniteFloat(text: string): number | undefined {
    const value = Number.parseFloat(text);
    return Number.isFinite(value) ? value : undefined;
  }

  /**
   * Parse a base-10 integer, rejecting anything that does not parse.
   *
   * @param text - Text to parse.
   * @returns The parsed integer, or `undefined` when parsing yields NaN.
   */
  private static toFiniteInt(text: string): number | undefined {
    const value = Number.parseInt(text, 10);
    return Number.isFinite(value) ? value : undefined;
  }
}
|