BREAKING CHANGE(api): redesign smartsitemap around builder-based sitemap creation, parsing, validation, and import utilities

This commit is contained in:
2026-03-20 14:03:33 +00:00
parent 61f6bcebd4
commit 4e707347dd
22 changed files with 4843 additions and 2196 deletions

View File

@@ -3,6 +3,6 @@
*/
export const commitinfo = {
name: '@push.rocks/smartsitemap',
version: '2.0.4',
description: 'A module for generating and managing sitemaps, supporting dynamic sitemap generation from feeds.'
version: '4.0.0',
description: 'A comprehensive TypeScript sitemap library with builder API, supporting standard, news, image, video, and hreflang sitemaps with auto-splitting, streaming, validation, and RSS feed integration.'
}

View File

@@ -1,3 +1,44 @@
export * from './smartsitemap.classes.smartsitemap.js';
export * from './smartsitemap.classes.sitemapnews.js';
export * from './smartsitemap.classes.sitemapwebsite.js';
// Main facade
export { SmartSitemap } from './smartsitemap.classes.smartsitemap.js';
// Builders
export { UrlsetBuilder } from './smartsitemap.classes.urlsetbuilder.js';
export { NewsSitemapBuilder } from './smartsitemap.classes.newsbuilder.js';
export { SitemapIndexBuilder } from './smartsitemap.classes.indexbuilder.js';
// Parser
export { SitemapParser } from './smartsitemap.classes.sitemapparser.js';
// Stream
export { SitemapStream } from './smartsitemap.classes.sitemapstream.js';
// Utilities
export { XmlRenderer } from './smartsitemap.classes.xmlrenderer.js';
export { SitemapValidator } from './smartsitemap.classes.validator.js';
// Feed & YAML importers
export { FeedImporter } from './smartsitemap.classes.feedimporter.js';
export { YamlImporter } from './smartsitemap.classes.yamlimporter.js';
// All interfaces and types
export type {
TChangeFreq,
TOutputFormat,
ISitemapUrl,
ISitemapImage,
ISitemapVideo,
ISitemapNews,
ISitemapAlternate,
ISitemapIndexEntry,
ISitemapOptions,
INewsSitemapOptions,
IFeedImportOptions,
IFeedItem,
ISitemapYamlConfig,
IParsedSitemap,
IValidationError,
IValidationWarning,
IValidationResult,
ISitemapStats,
ISitemapSet,
} from './interfaces/index.js';

View File

@@ -1,42 +1,277 @@
export interface ISitemapYaml {
daily: string[];
// ============================================================
// CORE TYPES
// ============================================================
/**
* Change frequency values per the sitemap protocol specification.
* Note: Google ignores changefreq, but other search engines may use it.
* Used by `ISitemapUrl.changefreq`, `ISitemapOptions.defaultChangeFreq`
* and as the group keys of `ISitemapYamlConfig.urls`.
*/
export type TChangeFreq =
| 'always'
| 'hourly'
| 'daily'
| 'weekly'
| 'monthly'
| 'yearly'
| 'never';
/** Identifiers of the supported output formats */
export type TOutputFormat = 'xml' | 'txt' | 'json';
// ============================================================
// URL ENTRY — the core unit of a sitemap
// ============================================================
/**
* A single URL entry in a sitemap, supporting all standard extensions.
* Only `loc` is required; every other field is optional and may be omitted.
*/
export interface ISitemapUrl {
/** Absolute URL of the page (required, max 2048 chars) */
loc: string;
/** Last modification date — accepts Date, ISO string, or Unix timestamp (ms) */
lastmod?: Date | string | number;
/** How frequently the page changes */
changefreq?: TChangeFreq;
/** Priority relative to other URLs on your site, 0.0 to 1.0 */
priority?: number;
/** Image sitemap extension entries */
images?: ISitemapImage[];
/** Video sitemap extension entries */
videos?: ISitemapVideo[];
/** News sitemap extension */
news?: ISitemapNews;
/** Alternate language versions (hreflang) */
alternates?: ISitemapAlternate[];
}
export interface IRssItem {
[key: string]: any;
link?: string;
guid?: string;
// ============================================================
// SITEMAP EXTENSIONS
// ============================================================
export interface ISitemapImage {
/** URL of the image (required) */
loc: string;
/** Caption for the image */
caption?: string;
/** Title of the image */
title?: string;
pubDate?: string;
creator?: string;
content?: string;
isoDate?: string;
categories?: string[];
contentSnippet?: string;
enclosure?: any;
/** Geographic location (e.g. "New York, USA") */
geoLocation?: string;
/** URL to the image license */
licenseUrl?: string;
}
export interface IParsedSiteMap {
urlset: {
url:
| {
loc: string;
lastmod: string;
changefreq: string;
}
| {
loc: string;
lastmod: string;
changefreq: string;
}[]
| {
loc: string;
'news:news': {
'news:publication': [];
'news:keywords': string;
'news:publication_date': string;
'news:title': string;
};
}[];
};
/**
* Video sitemap extension entry attached to a URL via `ISitemapUrl.videos`.
* `thumbnailLoc`, `title` and `description` are required; the rest is optional.
*/
export interface ISitemapVideo {
/** URL to the video thumbnail (required) */
thumbnailLoc: string;
/** Title of the video (required) */
title: string;
/** Description of the video, max 2048 chars (required) */
description: string;
/** URL of the actual video media file */
contentLoc?: string;
/** URL of the embeddable player — at least one of contentLoc or playerLoc required */
playerLoc?: string;
/** Duration in seconds (1–28800) */
duration?: number;
/** Rating 0.0 to 5.0 */
rating?: number;
/** Number of views */
viewCount?: number;
/** Publication date */
publicationDate?: Date | string;
/** Whether the video is family friendly (default true) */
familyFriendly?: boolean;
/** Tags for the video (max 32) */
tags?: string[];
/** Whether this is a live stream */
live?: boolean;
/** Whether a subscription is required to view */
requiresSubscription?: boolean;
}
/**
* News sitemap extension data attached to a URL via `ISitemapUrl.news`.
* Everything except `keywords` is required.
*/
export interface ISitemapNews {
/** Publication information */
publication: {
/** Publication name (e.g. "The New York Times") */
name: string;
/** Language code (ISO 639, e.g. "en", "de", "zh-cn") */
language: string;
};
/** Publication date of the article — Date, date string, or numeric timestamp */
publicationDate: Date | string | number;
/** Article title */
title: string;
/** Keywords (array or comma-separated string) */
keywords?: string[] | string;
}
/** One alternate-language (hreflang) link of a URL, see `ISitemapUrl.alternates`. */
export interface ISitemapAlternate {
/** Language code (ISO 639) or 'x-default' for the default version */
hreflang: string;
/** URL for this language version */
href: string;
}
// ============================================================
// SITEMAP INDEX
// ============================================================
/** A single sitemap reference inside a sitemap index. */
export interface ISitemapIndexEntry {
/** URL to the sitemap file */
loc: string;
/** Last modification date of the referenced sitemap — Date, date string, or numeric timestamp */
lastmod?: Date | string | number;
}
// ============================================================
// CONFIGURATION
// ============================================================
/**
* Options shared by the sitemap builders and renderers.
* Every field is optional; defaults are noted per field where known.
*/
export interface ISitemapOptions {
/** Base URL for the website (used to resolve relative URLs and for auto-split filenames) */
baseUrl?: string;
/** XSL stylesheet URL for browser-viewable sitemaps */
xslUrl?: string;
/** Default changefreq for URLs that don't specify one */
defaultChangeFreq?: TChangeFreq;
/** Default priority for URLs that don't specify one (0.0–1.0) */
defaultPriority?: number;
/** Whether to pretty-print XML output (default: true) */
prettyPrint?: boolean;
/** Maximum URLs per sitemap file before auto-splitting (default: 50000, max: 50000) */
maxUrlsPerSitemap?: number;
/** Enable gzip compression for toGzipBuffer() */
gzip?: boolean;
/** Whether to validate URLs and fields (default: true) */
validate?: boolean;
}
/** Options for news sitemaps: all of `ISitemapOptions` plus the publication identity. */
export interface INewsSitemapOptions extends ISitemapOptions {
/** Publication name — required for news sitemaps */
publicationName: string;
/** Publication language (default: 'en') */
publicationLanguage?: string;
}
/** Options controlling how RSS/Atom feed items are filtered and mapped to sitemap URLs. */
export interface IFeedImportOptions {
/** Publication name for news sitemap mapping */
publicationName?: string;
/** Publication language for news sitemap mapping */
publicationLanguage?: string;
/** Only include items newer than this date (a Date, or a number compared against `Date#getTime()` values) */
newerThan?: Date | number;
/** Maximum number of items to import */
limit?: number;
/** Custom mapping function from feed item to sitemap URL (return null to skip) */
mapItem?: (item: IFeedItem) => ISitemapUrl | null;
}
/**
* Shape of a parsed RSS/Atom feed item.
* All named fields are optional; the index signature admits any additional
* properties a feed parser may attach.
*/
export interface IFeedItem {
title?: string;
link?: string;
pubDate?: string;
author?: string;
content?: string;
contentSnippet?: string;
isoDate?: string;
id?: string;
categories?: string[];
enclosure?: {
url?: string;
type?: string;
length?: string;
};
// Catch-all for parser-specific extras; values are untyped.
[key: string]: any;
}
// ============================================================
// YAML CONFIG
// ============================================================
/**
* Enhanced YAML configuration format for defining sitemaps declaratively.
* Supports per-frequency URL groups, default settings, and feed imports.
* Every top-level key is optional.
*/
export interface ISitemapYamlConfig {
/** Base URL to prepend to relative paths */
baseUrl?: string;
/** Default values for all URLs */
defaults?: {
changefreq?: TChangeFreq;
priority?: number;
};
/** URL groups organized by change frequency (keys are `TChangeFreq` values, each optional) */
urls?: { [K in TChangeFreq]?: string[] };
/** RSS/Atom feeds to import; `type` selects news or standard mapping */
feeds?: Array<{
url: string;
type: 'news' | 'standard';
publicationName?: string;
publicationLanguage?: string;
}>;
}
// ============================================================
// PARSED SITEMAP (bidirectional)
// ============================================================
/**
* Result of parsing a sitemap document.
* Both `urls` and `sitemaps` are always present; `type` tells which one carries the data.
*/
export interface IParsedSitemap {
/** Whether this is a urlset or a sitemap index */
type: 'urlset' | 'sitemapindex';
/** Parsed URL entries (populated when type is 'urlset') */
urls: ISitemapUrl[];
/** Parsed index entries (populated when type is 'sitemapindex') */
sitemaps: ISitemapIndexEntry[];
}
// ============================================================
// VALIDATION
// ============================================================
/** A single error entry as listed in `IValidationResult.errors`. */
export interface IValidationError {
/** Field the error refers to (presumably a property name such as `loc` — confirm against the validator) */
field: string;
/** Description of the problem */
message: string;
/** URL the error relates to, if any */
url?: string;
}
/** A single warning entry as listed in `IValidationResult.warnings`; same shape as `IValidationError`. */
export interface IValidationWarning {
/** Field the warning refers to (presumably a property name — confirm against the validator) */
field: string;
/** Description of the concern */
message: string;
/** URL the warning relates to, if any */
url?: string;
}
/** Outcome of a validation run: verdict, collected errors and warnings, plus statistics. */
export interface IValidationResult {
/** Overall verdict (presumably true when `errors` is empty — confirm against the validator) */
valid: boolean;
/** Collected validation errors */
errors: IValidationError[];
/** Collected validation warnings */
warnings: IValidationWarning[];
/** Statistics reported together with the result */
stats: ISitemapStats;
}
// ============================================================
// STATISTICS
// ============================================================
/** Aggregate counters describing a sitemap. */
export interface ISitemapStats {
/** Number of URL entries */
urlCount: number;
/** Number of image extension entries */
imageCount: number;
/** Number of video extension entries */
videoCount: number;
/** Number of news extension entries */
newsCount: number;
/** Number of alternate (hreflang) links */
alternateCount: number;
/** Estimated output size in bytes (how the estimate is computed is not visible here) */
estimatedSizeBytes: number;
/** Presumably true when the URLs do not fit a single sitemap and an index is required — confirm */
needsIndex: boolean;
}
// ============================================================
// AUTO-SPLIT OUTPUT
// ============================================================
/** Output of auto-splitting: the individual sitemap files plus an optional index. */
export interface ISitemapSet {
/** Whether the URL count exceeded maxUrlsPerSitemap */
needsIndex: boolean;
/** The sitemap index XML (null if all URLs fit in one sitemap) */
indexXml: string | null;
/** Individual sitemap chunks, each with its filename and rendered XML */
sitemaps: Array<{ filename: string; xml: string }>;
}

View File

@@ -0,0 +1,159 @@
import * as plugins from './smartsitemap.plugins.js';
import type * as interfaces from './interfaces/index.js';
/**
 * Imports RSS/Atom feeds and converts them to sitemap URL entries.
 *
 * All entry points are static. A feed is parsed with `plugins.smartfeed.Smartfeed`,
 * its items are filtered (`newerThan`, `limit`) and then mapped either to plain
 * URL entries or to news entries. When `options.mapItem` is supplied it replaces
 * the built-in mapping in both modes.
 */
export class FeedImporter {
  /**
   * Import from a feed URL, returning standard sitemap URL entries.
   *
   * @param feedUrl - URL of the RSS/Atom feed to fetch and parse
   * @param options - optional filtering / mapping options
   * @returns one sitemap URL per accepted feed item
   */
  static async fromUrl(
    feedUrl: string,
    options?: interfaces.IFeedImportOptions,
  ): Promise<interfaces.ISitemapUrl[]> {
    const smartfeed = new plugins.smartfeed.Smartfeed();
    const feed = await smartfeed.parseFeedFromUrl(feedUrl);
    return FeedImporter.mapItems(feed.items, options);
  }

  /**
   * Import from a feed XML string, returning standard sitemap URL entries.
   *
   * @param feedXml - raw RSS/Atom document
   * @param options - optional filtering / mapping options
   * @returns one sitemap URL per accepted feed item
   */
  static async fromString(
    feedXml: string,
    options?: interfaces.IFeedImportOptions,
  ): Promise<interfaces.ISitemapUrl[]> {
    const smartfeed = new plugins.smartfeed.Smartfeed();
    const feed = await smartfeed.parseFeedFromString(feedXml);
    return FeedImporter.mapItems(feed.items, options);
  }

  /**
   * Import from a feed URL, returning news sitemap URL entries.
   *
   * @param feedUrl - URL of the RSS/Atom feed to fetch and parse
   * @param publicationName - publication name written into every news entry
   * @param publicationLanguage - publication language, defaults to 'en'
   * @param options - optional filtering / mapping options
   */
  static async fromUrlAsNews(
    feedUrl: string,
    publicationName: string,
    publicationLanguage?: string,
    options?: interfaces.IFeedImportOptions,
  ): Promise<interfaces.ISitemapUrl[]> {
    const smartfeed = new plugins.smartfeed.Smartfeed();
    const feed = await smartfeed.parseFeedFromUrl(feedUrl);
    return FeedImporter.mapItemsAsNews(feed.items, publicationName, publicationLanguage ?? 'en', options);
  }

  /**
   * Import from a feed string, returning news sitemap URL entries.
   *
   * @param feedXml - raw RSS/Atom document
   * @param publicationName - publication name written into every news entry
   * @param publicationLanguage - publication language, defaults to 'en'
   * @param options - optional filtering / mapping options
   */
  static async fromStringAsNews(
    feedXml: string,
    publicationName: string,
    publicationLanguage?: string,
    options?: interfaces.IFeedImportOptions,
  ): Promise<interfaces.ISitemapUrl[]> {
    const smartfeed = new plugins.smartfeed.Smartfeed();
    const feed = await smartfeed.parseFeedFromString(feedXml);
    return FeedImporter.mapItemsAsNews(feed.items, publicationName, publicationLanguage ?? 'en', options);
  }

  /**
   * Map parsed feed items to standard sitemap URLs.
   * Items without a `link` are skipped; `isoDate` (when present) becomes `lastmod`.
   */
  private static mapItems(
    items: any[],
    options?: interfaces.IFeedImportOptions,
  ): interfaces.ISitemapUrl[] {
    const filtered = FeedImporter.filterItems(items, options);
    const custom = FeedImporter.applyCustomMapper(filtered, options);
    if (custom) return custom;

    return filtered
      .filter((item: any) => item.link)
      .map((item: any) => {
        const url: interfaces.ISitemapUrl = { loc: item.link };
        if (item.isoDate) {
          url.lastmod = item.isoDate;
        }
        return url;
      });
  }

  /**
   * Map parsed feed items to news sitemap URLs.
   * Items without a `link` are skipped. A missing `isoDate` falls back to the
   * current time for `publicationDate`, a missing `title` to an empty string.
   */
  private static mapItemsAsNews(
    items: any[],
    publicationName: string,
    publicationLanguage: string,
    options?: interfaces.IFeedImportOptions,
  ): interfaces.ISitemapUrl[] {
    const filtered = FeedImporter.filterItems(items, options);
    const custom = FeedImporter.applyCustomMapper(filtered, options);
    if (custom) return custom;

    return filtered
      .filter((item: any) => item.link)
      .map((item: any) => {
        const url: interfaces.ISitemapUrl = {
          loc: item.link,
          news: {
            publication: {
              name: publicationName,
              language: publicationLanguage,
            },
            publicationDate: item.isoDate || new Date().toISOString(),
            title: item.title || '',
            keywords: item.categories,
          },
        };
        if (item.isoDate) {
          url.lastmod = item.isoDate;
        }
        return url;
      });
  }

  /**
   * Run the caller-supplied `mapItem` over the items, dropping falsy results.
   * Returns null when no custom mapper is configured so callers can fall back
   * to the built-in mapping.
   */
  private static applyCustomMapper(
    items: any[],
    options?: interfaces.IFeedImportOptions,
  ): interfaces.ISitemapUrl[] | null {
    const mapItem = options?.mapItem;
    if (!mapItem) return null;

    const results: interfaces.ISitemapUrl[] = [];
    for (const item of items) {
      const mapped = mapItem(item as interfaces.IFeedItem);
      if (mapped) results.push(mapped);
    }
    return results;
  }

  /**
   * Apply date and limit filters to feed items.
   *
   * - A non-array `items` value is treated as an empty feed.
   * - `newerThan` keeps items dated at or after the threshold. Items whose date
   *   is missing or cannot be parsed are kept, since their age is unknown.
   *   A threshold that is itself an invalid date disables the date filter.
   * - `limit` is applied only when it is greater than zero.
   */
  private static filterItems(items: any[], options?: interfaces.IFeedImportOptions): any[] {
    let result = Array.isArray(items) ? [...items] : [];

    // Filter by date
    if (options?.newerThan != null) {
      const threshold = options.newerThan instanceof Date
        ? options.newerThan.getTime()
        : options.newerThan;
      if (!Number.isNaN(threshold)) {
        result = result.filter((item: any) => {
          if (!item.isoDate) return true; // keep items without dates
          const timestamp = new Date(item.isoDate).getTime();
          // NaN never compares >=, so unparseable dates must be let through explicitly
          return Number.isNaN(timestamp) || timestamp >= threshold;
        });
      }
    }

    // Apply limit
    if (options?.limit != null && options.limit > 0) {
      result = result.slice(0, options.limit);
    }
    return result;
  }
}

View File

@@ -0,0 +1,82 @@
import type * as interfaces from './interfaces/index.js';
import { XmlRenderer } from './smartsitemap.classes.xmlrenderer.js';
import { UrlsetBuilder } from './smartsitemap.classes.urlsetbuilder.js';
/**
 * Builder for sitemap index files (<sitemapindex>).
 * Used when you have multiple sitemaps that need to be referenced from a single index.
 * Every mutating method returns `this` for fluent chaining.
 */
export class SitemapIndexBuilder {
  private entries: interfaces.ISitemapIndexEntry[] = [];
  private options: interfaces.ISitemapOptions;

  /**
   * @param options - rendering options handed to the XML renderer; defaults to `{}`
   */
  constructor(options?: interfaces.ISitemapOptions) {
    this.options = options ?? {};
  }

  /** Add a sitemap index entry */
  add(entry: interfaces.ISitemapIndexEntry): this {
    this.entries.push(entry);
    return this;
  }

  /** Add a sitemap by URL, optionally with lastmod (omitted from the entry when null/undefined) */
  addSitemap(loc: string, lastmod?: Date | string | number): this {
    const entry: interfaces.ISitemapIndexEntry = { loc };
    if (lastmod != null) {
      entry.lastmod = lastmod;
    }
    this.entries.push(entry);
    return this;
  }

  /** Add multiple sitemap entries */
  addSitemaps(entries: interfaces.ISitemapIndexEntry[]): this {
    this.entries.push(...entries);
    return this;
  }

  /**
   * Build an index and individual sitemaps from a UrlsetBuilder that needs splitting.
   * The builder's URLs are divided into chunks of maxUrlsPerSitemap; chunk N is
   * referenced from the index as `<baseUrl>/sitemap-N.xml` (N starts at 1).
   *
   * @param builder - source of the URLs and of the options reused for every chunk
   * @param baseUrl - public URL prefix of the generated files; one trailing slash is ignored
   * @returns the populated index builder and one UrlsetBuilder per chunk
   */
  static fromBuilder(
    builder: UrlsetBuilder,
    baseUrl: string,
  ): { index: SitemapIndexBuilder; sitemaps: UrlsetBuilder[] } {
    const urls = builder.getUrls();
    const options = builder.getOptions();
    const maxUrls = SitemapIndexBuilder.resolveChunkSize(options.maxUrlsPerSitemap);
    // Loop-invariant: normalise the prefix once instead of per chunk.
    const root = baseUrl.replace(/\/$/, '');

    const index = new SitemapIndexBuilder(options);
    const sitemaps: UrlsetBuilder[] = [];
    for (let i = 0; i < urls.length; i += maxUrls) {
      const chunk = urls.slice(i, i + maxUrls);
      const chunkBuilder = new UrlsetBuilder(options);
      chunkBuilder.addUrls(chunk);
      sitemaps.push(chunkBuilder);
      const filename = `sitemap-${sitemaps.length}.xml`;
      index.addSitemap(`${root}/${filename}`);
    }
    return { index, sitemaps };
  }

  /**
   * Turn the configured maxUrlsPerSitemap into a safe loop step.
   * The result is always a whole number between 1 and 50000: missing, non-finite
   * or sub-1 values fall back to 50000, fractions are rounded down. A step of 0
   * (or a negative one) would otherwise never advance the chunking loop.
   */
  private static resolveChunkSize(requested?: number): number {
    const protocolLimit = 50000;
    if (requested == null || !Number.isFinite(requested)) return protocolLimit;
    const whole = Math.floor(requested);
    return whole >= 1 ? Math.min(whole, protocolLimit) : protocolLimit;
  }

  /** Export as sitemap index XML string */
  toXml(): string {
    return XmlRenderer.renderIndex(this.entries, this.options);
  }

  /** Get all entries (a shallow copy, so callers cannot mutate the internal list) */
  getEntries(): interfaces.ISitemapIndexEntry[] {
    return [...this.entries];
  }

  /** Get the number of sitemaps in this index */
  get count(): number {
    return this.entries.length;
  }
}

View File

@@ -0,0 +1,95 @@
import * as plugins from './smartsitemap.plugins.js';
import type * as interfaces from './interfaces/index.js';
import { UrlsetBuilder } from './smartsitemap.classes.urlsetbuilder.js';
import { FeedImporter } from './smartsitemap.classes.feedimporter.js';
/**
 * Builder dedicated to Google News sitemaps.
 * It is a UrlsetBuilder that remembers a publication name and language and
 * stamps them onto every news entry it creates; all inherited builder
 * methods remain available.
 */
export class NewsSitemapBuilder extends UrlsetBuilder {
  private publicationName: string;
  private publicationLanguage: string;

  /**
   * @param options - sitemap options including the mandatory publication name;
   *   the publication language falls back to 'en'
   */
  constructor(options: interfaces.INewsSitemapOptions) {
    super(options);
    this.publicationName = options.publicationName;
    const language = options.publicationLanguage ?? 'en';
    this.publicationLanguage = language;
  }

  /**
   * Append one news article.
   * Publication name and language are taken from this builder.
   *
   * @param loc - URL of the article
   * @param title - article title
   * @param publicationDate - when the article was published
   * @param keywords - optional keywords, as array or comma-separated string
   */
  addNewsUrl(
    loc: string,
    title: string,
    publicationDate: Date | string | number,
    keywords?: string[] | string,
  ): this {
    const entry: interfaces.ISitemapUrl = {
      loc,
      news: {
        publication: {
          name: this.publicationName,
          language: this.publicationLanguage,
        },
        publicationDate,
        title,
        keywords,
      },
    };
    this.add(entry);
    return this;
  }

  /**
   * Fetch an RSS/Atom feed by URL and append its items as news entries.
   * Publication name/language from `options` win over the builder's own values.
   */
  async importFromFeedUrl(feedUrl: string, options?: interfaces.IFeedImportOptions): Promise<this> {
    const name = options?.publicationName ?? this.publicationName;
    const language = options?.publicationLanguage ?? this.publicationLanguage;
    const entries = await FeedImporter.fromUrlAsNews(feedUrl, name, language, options);
    this.addUrls(entries);
    return this;
  }

  /**
   * Parse an RSS/Atom feed document and append its items as news entries.
   * Publication name/language from `options` win over the builder's own values.
   */
  async importFromFeedString(feedXml: string, options?: interfaces.IFeedImportOptions): Promise<this> {
    const name = options?.publicationName ?? this.publicationName;
    const language = options?.publicationLanguage ?? this.publicationLanguage;
    const entries = await FeedImporter.fromStringAsNews(feedXml, name, language, options);
    this.addUrls(entries);
    return this;
  }

  /**
   * Append @tsclass/tsclass IArticle objects as news entries.
   * An article timestamp (when truthy) is used for both lastmod and the
   * publication date; without one, lastmod is left undefined and the
   * publication date is the current time.
   */
  importFromArticles(articles: plugins.tsclass.content.IArticle[]): this {
    for (const article of articles) {
      const dated = Boolean(article.timestamp);
      this.add({
        loc: article.url,
        lastmod: dated ? new Date(article.timestamp) : undefined,
        news: {
          publication: {
            name: this.publicationName,
            language: this.publicationLanguage,
          },
          publicationDate: dated ? new Date(article.timestamp) : new Date(),
          title: article.title || '',
          keywords: article.tags,
        },
      });
    }
    return this;
  }
}

View File

@@ -1,79 +0,0 @@
import * as plugins from './smartsitemap.plugins.js';
import * as interfaces from './interfaces/index.js';
/**
 * Collects RSS-style items (from feeds or article objects) and renders them
 * as a Google News sitemap XML string.
 */
export class SitemapNews {
  /** Items gathered so far; every `readAnd…` call appends to this list. */
  public rssItems: interfaces.IRssItem[] = [];

  /** @param optionsArg - currently unused */
  constructor(optionsArg: {}) {}

  /** Parse an RSS/Atom feed document and append its items. */
  public async readAndAddFromRssFeedString(feedStringArg: string) {
    const smartfeedInstance = new plugins.smartfeed.Smartfeed();
    const parsedFeed =
      await smartfeedInstance.parseFeedFromString(feedStringArg);
    this.rssItems = this.rssItems.concat(parsedFeed.items);
  }

  /** Fetch an RSS/Atom feed by URL and append its items. */
  public async readAndAddFromRssFeedUrl(urlArg: string) {
    const smartfeedInstance = new plugins.smartfeed.Smartfeed();
    const parsedFeed = await smartfeedInstance.parseFeedFromUrl(urlArg);
    this.rssItems = this.rssItems.concat(parsedFeed.items);
  }

  /**
   * Convert article objects into RSS-style items and append them.
   * Note: the item date is the time of the call, not the article's own timestamp.
   */
  public async readAndParseArticles(
    articleArrayArg: plugins.tsclass.content.IArticle[],
  ) {
    const rssItemArray = articleArrayArg.map(
      (articleArg): interfaces.IRssItem => {
        return {
          title: articleArg.title,
          content: articleArg.content,
          isoDate:
            new Date(/* TODO: put article timestamp here */).toISOString(),
          link: articleArg.url,
        };
      },
    );
    this.rssItems = this.rssItems.concat(rssItemArray);
  }

  /**
   * Render all collected items as a news sitemap.
   * Publication language and name are fixed placeholders ('en' / 'some name')
   * and keywords are always empty.
   *
   * @returns the XML produced by `SmartXml.createXmlFromObject`
   */
  public exportSitemapXml() {
    const urls: {
      loc: string;
      'news:news': {
        'news:publication': {
          'news:name': string;
          'news:language': string;
        };
        'news:publication_date': string;
        'news:keywords': string;
        'news:title': string;
      };
    }[] = [];
    // No per-item logging here: a library export must not write to stdout.
    for (const itemArg of this.rssItems) {
      urls.push({
        loc: itemArg.link,
        'news:news': {
          'news:publication': {
            'news:language': 'en',
            'news:name': 'some name',
          },
          'news:keywords': '',
          'news:publication_date': itemArg.isoDate,
          'news:title': itemArg.title,
        },
      });
    }
    const sitemapObject: any = {
      urlset: {
        '@_xmlns': 'http://www.sitemaps.org/schemas/sitemap/0.9',
        '@_xmlns:news': 'http://www.google.com/schemas/sitemap-news/0.9',
        url: urls,
      },
    };
    const smartxmlInstance = new plugins.smartxml.SmartXml();
    const sitemapString = smartxmlInstance.createXmlFromObject(sitemapObject);
    return sitemapString;
  }
}

View File

@@ -0,0 +1,251 @@
import * as plugins from './smartsitemap.plugins.js';
import type * as interfaces from './interfaces/index.js';
import { UrlsetBuilder } from './smartsitemap.classes.urlsetbuilder.js';
/**
 * Parses existing sitemap XML into structured data.
 * Handles both <urlset> sitemaps and <sitemapindex> files.
 *
 * The private extractors walk the "ordered" parser output: arrays of objects
 * where each object has a single element key (plus an optional ':@' key with
 * attributes) and text lives in `{ '#text': value }` children.
 * Numeric fields that are empty or not parseable are left unset rather than
 * stored as NaN.
 */
export class SitemapParser {
  /**
   * Parse a sitemap XML string into structured data.
   */
  static async parse(xml: string): Promise<interfaces.IParsedSitemap> {
    const smartXml = new plugins.smartxml.SmartXml();
    const parsed = smartXml.parseXmlToObject(xml);
    // The parser returns ordered format (preserveOrder: true)
    // We need to walk the structure to extract urls or sitemap entries
    return SitemapParser.processOrderedParsed(parsed);
  }

  /**
   * Fetch and parse a sitemap from a URL.
   */
  static async parseUrl(url: string): Promise<interfaces.IParsedSitemap> {
    const response = await plugins.webrequest.webrequest(url);
    const xml = await response.text();
    return SitemapParser.parse(xml);
  }

  /**
   * Parse a sitemap XML and return a pre-populated UrlsetBuilder.
   * Only URL entries are transferred; index entries are not.
   */
  static async toBuilder(xml: string, options?: interfaces.ISitemapOptions): Promise<UrlsetBuilder> {
    const parsed = await SitemapParser.parse(xml);
    const builder = new UrlsetBuilder(options);
    builder.addUrls(parsed.urls);
    return builder;
  }

  /**
   * Detect whether XML is a urlset or sitemapindex without full parsing.
   * This is a plain substring check; '<urlset' is tested first.
   */
  static detectType(xml: string): 'urlset' | 'sitemapindex' | 'unknown' {
    if (xml.includes('<urlset')) return 'urlset';
    if (xml.includes('<sitemapindex')) return 'sitemapindex';
    return 'unknown';
  }

  /**
   * Process the ordered-format output from smartxml's parseXmlToObject.
   * The ordered format uses arrays of objects where each object has a single key.
   * Non-array input yields an empty 'urlset' result.
   */
  private static processOrderedParsed(parsed: any[]): interfaces.IParsedSitemap {
    const result: interfaces.IParsedSitemap = {
      type: 'urlset',
      urls: [],
      sitemaps: [],
    };
    if (!Array.isArray(parsed)) {
      return result;
    }
    for (const node of parsed) {
      if (node.urlset) {
        result.type = 'urlset';
        result.urls = SitemapParser.extractUrls(node.urlset);
      } else if (node.sitemapindex) {
        result.type = 'sitemapindex';
        result.sitemaps = SitemapParser.extractIndexEntries(node.sitemapindex);
      }
    }
    return result;
  }

  /**
   * Extract URL entries from an ordered-format urlset.
   * Entries without a <loc> are dropped.
   */
  private static extractUrls(urlsetNodes: any[]): interfaces.ISitemapUrl[] {
    const urls: interfaces.ISitemapUrl[] = [];
    if (!Array.isArray(urlsetNodes)) return urls;
    for (const node of urlsetNodes) {
      if (node.url) {
        const urlData = SitemapParser.extractUrlData(node.url);
        if (urlData) urls.push(urlData);
      }
    }
    return urls;
  }

  /**
   * Extract a single URL entry from ordered-format nodes.
   * Returns null when the input is not an array or no non-empty <loc> was found.
   */
  private static extractUrlData(urlNodes: any[]): interfaces.ISitemapUrl | null {
    if (!Array.isArray(urlNodes)) return null;
    const url: interfaces.ISitemapUrl = { loc: '' };
    for (const node of urlNodes) {
      if (node.loc) {
        url.loc = SitemapParser.extractText(node.loc);
      } else if (node.lastmod) {
        url.lastmod = SitemapParser.extractText(node.lastmod);
      } else if (node.changefreq) {
        url.changefreq = SitemapParser.extractText(node.changefreq) as interfaces.TChangeFreq;
      } else if (node.priority) {
        const priority = SitemapParser.finiteOrUndefined(
          parseFloat(SitemapParser.extractText(node.priority)),
        );
        if (priority !== undefined) url.priority = priority;
      } else if (node['image:image']) {
        if (!url.images) url.images = [];
        url.images.push(SitemapParser.extractImageData(node['image:image']));
      } else if (node['video:video']) {
        if (!url.videos) url.videos = [];
        url.videos.push(SitemapParser.extractVideoData(node['video:video']));
      } else if (node['news:news']) {
        url.news = SitemapParser.extractNewsData(node['news:news']);
      } else if (node['xhtml:link']) {
        if (!url.alternates) url.alternates = [];
        // Attributes of an element sit next to it under the ':@' key.
        const attrs = node[':@'] || {};
        if (attrs['@_hreflang'] && attrs['@_href']) {
          url.alternates.push({
            hreflang: attrs['@_hreflang'],
            href: attrs['@_href'],
          });
        }
      }
    }
    return url.loc ? url : null;
  }

  /**
   * Extract image data from ordered-format nodes.
   */
  private static extractImageData(nodes: any[]): interfaces.ISitemapImage {
    const img: interfaces.ISitemapImage = { loc: '' };
    if (!Array.isArray(nodes)) return img;
    for (const node of nodes) {
      if (node['image:loc']) {
        img.loc = SitemapParser.extractText(node['image:loc']);
      } else if (node['image:caption']) {
        img.caption = SitemapParser.extractText(node['image:caption']);
      } else if (node['image:title']) {
        img.title = SitemapParser.extractText(node['image:title']);
      } else if (node['image:geo_location']) {
        img.geoLocation = SitemapParser.extractText(node['image:geo_location']);
      } else if (node['image:license']) {
        img.licenseUrl = SitemapParser.extractText(node['image:license']);
      }
    }
    return img;
  }

  /**
   * Extract video data from ordered-format nodes.
   * Boolean fields are true only for the literal text 'yes'.
   */
  private static extractVideoData(nodes: any[]): interfaces.ISitemapVideo {
    const vid: interfaces.ISitemapVideo = { thumbnailLoc: '', title: '', description: '' };
    if (!Array.isArray(nodes)) return vid;
    for (const node of nodes) {
      if (node['video:thumbnail_loc']) {
        vid.thumbnailLoc = SitemapParser.extractText(node['video:thumbnail_loc']);
      } else if (node['video:title']) {
        vid.title = SitemapParser.extractText(node['video:title']);
      } else if (node['video:description']) {
        vid.description = SitemapParser.extractText(node['video:description']);
      } else if (node['video:content_loc']) {
        vid.contentLoc = SitemapParser.extractText(node['video:content_loc']);
      } else if (node['video:player_loc']) {
        vid.playerLoc = SitemapParser.extractText(node['video:player_loc']);
      } else if (node['video:duration']) {
        const duration = SitemapParser.finiteOrUndefined(
          parseInt(SitemapParser.extractText(node['video:duration']), 10),
        );
        if (duration !== undefined) vid.duration = duration;
      } else if (node['video:rating']) {
        const rating = SitemapParser.finiteOrUndefined(
          parseFloat(SitemapParser.extractText(node['video:rating'])),
        );
        if (rating !== undefined) vid.rating = rating;
      } else if (node['video:view_count']) {
        const viewCount = SitemapParser.finiteOrUndefined(
          parseInt(SitemapParser.extractText(node['video:view_count']), 10),
        );
        if (viewCount !== undefined) vid.viewCount = viewCount;
      } else if (node['video:publication_date']) {
        vid.publicationDate = SitemapParser.extractText(node['video:publication_date']);
      } else if (node['video:family_friendly']) {
        vid.familyFriendly = SitemapParser.extractText(node['video:family_friendly']) === 'yes';
      } else if (node['video:live']) {
        vid.live = SitemapParser.extractText(node['video:live']) === 'yes';
      } else if (node['video:requires_subscription']) {
        vid.requiresSubscription = SitemapParser.extractText(node['video:requires_subscription']) === 'yes';
      } else if (node['video:tag']) {
        if (!vid.tags) vid.tags = [];
        vid.tags.push(SitemapParser.extractText(node['video:tag']));
      }
    }
    return vid;
  }

  /**
   * Extract news data from ordered-format nodes.
   * Keywords are returned as the raw text of <news:keywords>, not split.
   */
  private static extractNewsData(nodes: any[]): interfaces.ISitemapNews {
    const news: interfaces.ISitemapNews = {
      publication: { name: '', language: '' },
      publicationDate: '',
      title: '',
    };
    if (!Array.isArray(nodes)) return news;
    for (const node of nodes) {
      if (node['news:publication']) {
        const pubNodes = node['news:publication'];
        if (Array.isArray(pubNodes)) {
          for (const pNode of pubNodes) {
            if (pNode['news:name']) {
              news.publication.name = SitemapParser.extractText(pNode['news:name']);
            } else if (pNode['news:language']) {
              news.publication.language = SitemapParser.extractText(pNode['news:language']);
            }
          }
        }
      } else if (node['news:publication_date']) {
        news.publicationDate = SitemapParser.extractText(node['news:publication_date']);
      } else if (node['news:title']) {
        news.title = SitemapParser.extractText(node['news:title']);
      } else if (node['news:keywords']) {
        news.keywords = SitemapParser.extractText(node['news:keywords']);
      }
    }
    return news;
  }

  /**
   * Extract sitemap index entries from ordered-format nodes.
   * Entries without a <loc> are dropped.
   */
  private static extractIndexEntries(indexNodes: any[]): interfaces.ISitemapIndexEntry[] {
    const entries: interfaces.ISitemapIndexEntry[] = [];
    if (!Array.isArray(indexNodes)) return entries;
    for (const node of indexNodes) {
      if (node.sitemap) {
        const entry: interfaces.ISitemapIndexEntry = { loc: '' };
        if (Array.isArray(node.sitemap)) {
          for (const sNode of node.sitemap) {
            if (sNode.loc) {
              entry.loc = SitemapParser.extractText(sNode.loc);
            } else if (sNode.lastmod) {
              entry.lastmod = SitemapParser.extractText(sNode.lastmod);
            }
          }
        }
        if (entry.loc) entries.push(entry);
      }
    }
    return entries;
  }

  /**
   * Extract text content from an ordered-format node.
   * In ordered format, text is stored as [{ '#text': 'value' }].
   * Strings and numbers are returned as text directly; anything else, or an
   * array without a '#text' child, yields ''. Null children are skipped.
   */
  private static extractText(nodes: any): string {
    if (typeof nodes === 'string') return nodes;
    if (typeof nodes === 'number') return String(nodes);
    if (Array.isArray(nodes)) {
      for (const n of nodes) {
        if (n != null && n['#text'] != null) return String(n['#text']);
      }
    }
    return '';
  }

  /** Pass finite numbers through; map NaN / ±Infinity to undefined. */
  private static finiteOrUndefined(value: number): number | undefined {
    return Number.isFinite(value) ? value : undefined;
  }
}

View File

@@ -0,0 +1,168 @@
import * as plugins from './smartsitemap.plugins.js';
import type * as interfaces from './interfaces/index.js';
import { XmlRenderer } from './smartsitemap.classes.xmlrenderer.js';
/**
 * A Node.js Readable stream that generates sitemap XML incrementally.
 * Suitable for very large sitemaps (millions of URLs) that cannot be held in memory.
 *
 * The XML header is emitted together with the first pushed URL, so only the
 * extension namespaces required by that first URL can be declared on the
 * `<urlset>` root. Namespaces that are first needed by a later URL are
 * declared on that URL's own `<url>` element instead, which keeps the
 * document namespace-well-formed without buffering.
 *
 * Usage:
 *   const stream = new SitemapStream();
 *   stream.pipe(createWriteStream('sitemap.xml'));
 *   stream.pushUrl({ loc: 'https://example.com/' });
 *   stream.pushUrl({ loc: 'https://example.com/about' });
 *   stream.finish();
 */
export class SitemapStream extends plugins.Readable {
  /** Extension namespace URIs keyed by prefix, in the order they are declared. */
  private static readonly EXTENSION_NAMESPACES: Record<string, string> = {
    image: 'http://www.google.com/schemas/sitemap-image/1.1',
    video: 'http://www.google.com/schemas/sitemap-video/1.1',
    news: 'http://www.google.com/schemas/sitemap-news/0.9',
    xhtml: 'http://www.w3.org/1999/xhtml',
  };

  private options: interfaces.ISitemapOptions;
  private urlCount = 0;
  private headerWritten = false;
  private finished = false;
  /** Prefixes declared on the `<urlset>` root (frozen once the header is written). */
  private namespaces: Set<string> = new Set();

  /**
   * @param options Rendering options; `prettyPrint` defaults to `true`.
   */
  constructor(options?: interfaces.ISitemapOptions) {
    super({ encoding: 'utf-8' });
    this.options = {
      prettyPrint: true,
      ...options,
    };
  }

  /**
   * Push a URL entry into the stream.
   * The URL is immediately rendered to XML and pushed to the readable buffer.
   *
   * @param url The URL entry to render.
   * @returns The result of `Readable.push()` for the rendered chunk.
   * @throws Error if called after `finish()`.
   */
  pushUrl(url: interfaces.ISitemapUrl): boolean {
    if (this.finished) {
      throw new Error('Cannot push URLs after calling finish()');
    }

    // Work out which extension namespaces this entry needs.
    const needed = SitemapStream.requiredNamespaces(url);
    let lateNamespaces: string[] = [];
    if (!this.headerWritten) {
      // First URL: its namespaces can still go on the root element.
      for (const prefix of needed) this.namespaces.add(prefix);
      this.writeHeader();
    } else {
      // Header already sent: declare anything missing locally on <url>.
      lateNamespaces = needed.filter((prefix) => !this.namespaces.has(prefix));
    }

    const pretty = this.options.prettyPrint !== false;
    const indent = pretty ? ' ' : '';
    const nl = pretty ? '\n' : '';
    const esc = XmlRenderer.escapeXml;
    const yesNo = (flag: boolean): string => (flag ? 'yes' : 'no');

    let urlXml = '';
    const emit = (depth: number, content: string): void => {
      urlXml += `${indent.repeat(depth)}${content}${nl}`;
    };

    const localDeclarations = lateNamespaces
      .map((prefix) => ` xmlns:${prefix}="${SitemapStream.EXTENSION_NAMESPACES[prefix]}"`)
      .join('');
    emit(1, `<url${localDeclarations}>`);
    emit(2, `<loc>${esc(url.loc)}</loc>`);

    if (url.lastmod != null) {
      emit(2, `<lastmod>${XmlRenderer.formatDate(url.lastmod)}</lastmod>`);
    }
    const changefreq = url.changefreq ?? this.options.defaultChangeFreq;
    if (changefreq) {
      emit(2, `<changefreq>${esc(String(changefreq))}</changefreq>`);
    }
    const priority = url.priority ?? this.options.defaultPriority;
    if (priority != null) {
      emit(2, `<priority>${priority.toFixed(1)}</priority>`);
    }

    // Image extension
    if (url.images) {
      for (const img of url.images) {
        emit(2, '<image:image>');
        emit(3, `<image:loc>${esc(img.loc)}</image:loc>`);
        if (img.caption) emit(3, `<image:caption>${esc(img.caption)}</image:caption>`);
        if (img.title) emit(3, `<image:title>${esc(img.title)}</image:title>`);
        if (img.geoLocation) emit(3, `<image:geo_location>${esc(img.geoLocation)}</image:geo_location>`);
        if (img.licenseUrl) emit(3, `<image:license>${esc(img.licenseUrl)}</image:license>`);
        emit(2, '</image:image>');
      }
    }

    // Video extension (same fields and order as XmlRenderer's video element)
    if (url.videos) {
      for (const vid of url.videos) {
        emit(2, '<video:video>');
        emit(3, `<video:thumbnail_loc>${esc(vid.thumbnailLoc)}</video:thumbnail_loc>`);
        emit(3, `<video:title>${esc(vid.title)}</video:title>`);
        emit(3, `<video:description>${esc(vid.description)}</video:description>`);
        if (vid.contentLoc) emit(3, `<video:content_loc>${esc(vid.contentLoc)}</video:content_loc>`);
        if (vid.playerLoc) emit(3, `<video:player_loc>${esc(vid.playerLoc)}</video:player_loc>`);
        if (vid.duration != null) emit(3, `<video:duration>${vid.duration}</video:duration>`);
        if (vid.rating != null) emit(3, `<video:rating>${vid.rating}</video:rating>`);
        if (vid.viewCount != null) emit(3, `<video:view_count>${vid.viewCount}</video:view_count>`);
        if (vid.publicationDate != null) {
          emit(3, `<video:publication_date>${XmlRenderer.formatDate(vid.publicationDate)}</video:publication_date>`);
        }
        if (vid.familyFriendly != null) {
          emit(3, `<video:family_friendly>${yesNo(vid.familyFriendly)}</video:family_friendly>`);
        }
        if (vid.tags) {
          for (const tag of vid.tags) emit(3, `<video:tag>${esc(tag)}</video:tag>`);
        }
        if (vid.live != null) emit(3, `<video:live>${yesNo(vid.live)}</video:live>`);
        if (vid.requiresSubscription != null) {
          emit(3, `<video:requires_subscription>${yesNo(vid.requiresSubscription)}</video:requires_subscription>`);
        }
        emit(2, '</video:video>');
      }
    }

    // News extension
    if (url.news) {
      emit(2, '<news:news>');
      emit(3, '<news:publication>');
      emit(4, `<news:name>${esc(url.news.publication.name)}</news:name>`);
      emit(4, `<news:language>${esc(url.news.publication.language)}</news:language>`);
      emit(3, '</news:publication>');
      emit(3, `<news:publication_date>${XmlRenderer.formatDate(url.news.publicationDate)}</news:publication_date>`);
      emit(3, `<news:title>${esc(url.news.title)}</news:title>`);
      if (url.news.keywords) {
        const kw = Array.isArray(url.news.keywords) ? url.news.keywords.join(', ') : url.news.keywords;
        emit(3, `<news:keywords>${esc(kw)}</news:keywords>`);
      }
      emit(2, '</news:news>');
    }

    // hreflang alternates (attribute values are escaped, including hreflang)
    if (url.alternates) {
      for (const alt of url.alternates) {
        emit(2, `<xhtml:link rel="alternate" hreflang="${esc(alt.hreflang)}" href="${esc(alt.href)}"/>`);
      }
    }

    emit(1, '</url>');
    this.urlCount++;
    return this.push(urlXml);
  }

  /**
   * Signal that no more URLs will be added.
   * Writes the closing tag and ends the stream. Calling it again is a no-op.
   */
  finish(): void {
    if (this.finished) return;
    this.finished = true;
    if (!this.headerWritten) {
      // Empty sitemap: still emit a header so the output is a complete document.
      this.writeHeader();
    }
    this.push('</urlset>\n');
    this.push(null); // signal end of stream
  }

  /** Get the number of URLs written so far */
  get count(): number {
    return this.urlCount;
  }

  // Required by Readable
  _read(): void {
    // Data is pushed via pushUrl(), not pulled
  }

  /** List the extension namespace prefixes a URL entry requires, in declaration order. */
  private static requiredNamespaces(url: interfaces.ISitemapUrl): string[] {
    const needed: string[] = [];
    if (url.images?.length) needed.push('image');
    if (url.videos?.length) needed.push('video');
    if (url.news) needed.push('news');
    if (url.alternates?.length) needed.push('xhtml');
    return needed;
  }

  /**
   * Write the XML header and opening urlset tag.
   * Namespace declarations are based on what's been detected so far.
   */
  private writeHeader(): void {
    this.headerWritten = true;
    const nl = this.options.prettyPrint !== false ? '\n' : '';
    let header = `<?xml version="1.0" encoding="UTF-8"?>${nl}`;
    if (this.options.xslUrl) {
      header += `<?xml-stylesheet type="text/xsl" href="${XmlRenderer.escapeXml(this.options.xslUrl)}"?>${nl}`;
    }
    header += '<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9"';
    for (const [prefix, uri] of Object.entries(SitemapStream.EXTENSION_NAMESPACES)) {
      if (this.namespaces.has(prefix)) {
        header += `${nl} xmlns:${prefix}="${uri}"`;
      }
    }
    header += `>${nl}`;
    this.push(header);
  }
}

View File

@@ -1,47 +0,0 @@
import * as plugins from './smartsitemap.plugins.js';
/** Change-frequency values accepted for a SitemapWebsite URL entry. */
export type TUpdateFrequency =
| 'never'
| 'daily'
| 'weekly'
| 'monthly'
| 'yearly';
/** A single URL entry held by SitemapWebsite. */
export interface IUrlInfo {
/** Value written to the entry's `loc` field. */
url: string;
/** Passed to `new Date(...)` and serialized as the ISO `lastmod` value. */
timestamp: number;
/** Optional change frequency; exportSitemapXml() uses 'weekly' when it is not set. */
frequency?: TUpdateFrequency;
}
/**
 * Collects URL entries and serializes them into a `<urlset>` sitemap document.
 */
export class SitemapWebsite {
  urlInfos: IUrlInfo[] = [];

  constructor() {}

  /** Append one URL entry to the collection. */
  public addUrl(urlInfoArg: IUrlInfo) {
    this.urlInfos.push(urlInfoArg);
  }

  /**
   * Serialize every collected entry to sitemap XML via SmartXml.
   * Entries without a frequency are emitted with changefreq 'weekly'.
   */
  public exportSitemapXml() {
    const entries: {
      loc: string;
      lastmod: string;
      changefreq: TUpdateFrequency;
    }[] = this.urlInfos.map((info) => ({
      loc: info.url,
      lastmod: new Date(info.timestamp).toISOString(),
      changefreq: info.frequency || 'weekly',
    }));

    const documentObject: any = {
      urlset: {
        '@_xmlns': 'http://www.sitemaps.org/schemas/sitemap/0.9',
        url: entries,
      },
    };

    const xmlTool = new plugins.smartxml.SmartXml();
    return xmlTool.createXmlFromObject(documentObject);
  }
}

View File

@@ -1,92 +1,112 @@
import { SitemapNews } from './smartsitemap.classes.sitemapnews.js';
import {
type IUrlInfo,
SitemapWebsite,
} from './smartsitemap.classes.sitemapwebsite.js';
import * as plugins from './smartsitemap.plugins.js';
import * as interfaces from './interfaces/index.js';
import type * as interfaces from './interfaces/index.js';
import { UrlsetBuilder } from './smartsitemap.classes.urlsetbuilder.js';
import { NewsSitemapBuilder } from './smartsitemap.classes.newsbuilder.js';
import { SitemapIndexBuilder } from './smartsitemap.classes.indexbuilder.js';
import { SitemapParser } from './smartsitemap.classes.sitemapparser.js';
import { FeedImporter } from './smartsitemap.classes.feedimporter.js';
import { YamlImporter } from './smartsitemap.classes.yamlimporter.js';
import { SitemapValidator } from './smartsitemap.classes.validator.js';
/**
* Main entry point for @push.rocks/smartsitemap.
* Provides static factory methods for creating, parsing, and validating sitemaps.
*
* @example Simple sitemap
* ```typescript
* const xml = SmartSitemap.create()
* .addUrl('https://example.com/')
* .addUrl('https://example.com/about')
* .toXml();
* ```
*
* @example News sitemap from RSS feed
* ```typescript
* const builder = SmartSitemap.createNews({ publicationName: 'My Pub' });
* await builder.importFromFeedUrl('https://example.com/rss/');
* const xml = builder.toXml();
* ```
*/
export class SmartSitemap {
constructor() {}
// ──────────────────────────────────────────────
// Static Factory Methods
// ──────────────────────────────────────────────
/**
* creates a sitemap for news from feedurl
*/
public async createSitemapNewsFromFeedUrl(
feedUrlArg: string,
): Promise<string> {
const sitemapNewsInstance = new SitemapNews({});
await sitemapNewsInstance.readAndAddFromRssFeedUrl(feedUrlArg);
return sitemapNewsInstance.exportSitemapXml();
/**
 * Create a standard sitemap builder.
 * @param options Optional sitemap options handed to the UrlsetBuilder constructor.
 * @returns A new UrlsetBuilder.
 */
static create(options?: interfaces.ISitemapOptions): UrlsetBuilder {
  const builder = new UrlsetBuilder(options);
  return builder;
}
/**
* creates a sitemap for news from feedxmlstring
*/
public async createSitemapNewsFromAFeedStringArg(
feedStringArg: string,
): Promise<string> {
const sitemapNewsInstance = new SitemapNews({});
await sitemapNewsInstance.readAndAddFromRssFeedString(feedStringArg);
return sitemapNewsInstance.exportSitemapXml();
/**
 * Create a news sitemap builder.
 * @param options News sitemap options handed to the NewsSitemapBuilder constructor.
 * @returns A new NewsSitemapBuilder.
 */
static createNews(options: interfaces.INewsSitemapOptions): NewsSitemapBuilder {
  const newsBuilder = new NewsSitemapBuilder(options);
  return newsBuilder;
}
/**
* creates a sitemap for news from an array of articles
*/
public async createSitemapNewsFromArticleArray(
articleArrayArg: plugins.tsclass.content.IArticle[],
): Promise<string> {
const sitemapNewsInstance = new SitemapNews({});
await sitemapNewsInstance.readAndParseArticles(articleArrayArg);
return sitemapNewsInstance.exportSitemapXml();
/**
 * Create a sitemap index builder.
 * @param options Optional sitemap options handed to the SitemapIndexBuilder constructor.
 * @returns A new SitemapIndexBuilder.
 */
static createIndex(options?: interfaces.ISitemapOptions): SitemapIndexBuilder {
  const indexBuilder = new SitemapIndexBuilder(options);
  return indexBuilder;
}
/**
* creates a normal sitemap from a list of urls
*/
public async createSitemapFromYmlString(yamlString: string): Promise<string> {
const yamlObject: interfaces.ISitemapYaml =
await plugins.smartyaml.yamlStringToObject(yamlString);
const sitemapWebsite = new SitemapWebsite();
for (const urlArg of yamlObject.daily) {
sitemapWebsite.addUrl({
url: urlArg,
timestamp: Date.now() - 10000,
frequency: 'daily',
});
}
return sitemapWebsite.exportSitemapXml();
/**
 * Parse a sitemap XML string into structured data.
 * Delegates to SitemapParser.parse().
 * @param xml The sitemap XML source.
 * @returns The parsed sitemap.
 */
static async parse(xml: string): Promise<interfaces.IParsedSitemap> {
  const parsed = await SitemapParser.parse(xml);
  return parsed;
}
/**
* creates a normal sitemap from a list of urls
*/
public async createSitemapFromUrlInfoArray(urlInfosArg: IUrlInfo[]) {
const sitemapWebsite = new SitemapWebsite();
for (const urlInfo of urlInfosArg) {
sitemapWebsite.addUrl(urlInfo);
}
return sitemapWebsite.exportSitemapXml();
/**
 * Fetch and parse a sitemap from a URL.
 * Delegates to SitemapParser.parseUrl().
 * @param url Location of the sitemap to load.
 * @returns The parsed sitemap.
 */
static async parseUrl(url: string): Promise<interfaces.IParsedSitemap> {
  const parsed = await SitemapParser.parseUrl(url);
  return parsed;
}
/**
* parses a sitemap url
*/
public async parseSitemapUrl(urlArg: string) {
const response = await plugins.webrequest.webrequest(urlArg);
const sitemapXml = await response.text();
const parsedSitemap = await this.parseSitemap(sitemapXml);
return parsedSitemap;
/**
 * Create a UrlsetBuilder populated from an RSS/Atom feed URL.
 * The feed is loaded through FeedImporter.fromUrl() and its entries are
 * added to a builder created with default options.
 * @param feedUrl Location of the feed.
 * @param options Optional feed import options forwarded to the importer.
 * @returns A builder containing the imported URLs.
 */
static async fromFeedUrl(
  feedUrl: string,
  options?: interfaces.IFeedImportOptions,
): Promise<UrlsetBuilder> {
  const importedUrls = await FeedImporter.fromUrl(feedUrl, options);
  return new UrlsetBuilder().addUrls(importedUrls);
}
/**
* parses a sitemap
*/
public async parseSitemap(
sitemapXmlArg: string,
): Promise<interfaces.IParsedSiteMap> {
return new plugins.smartxml.SmartXml().parseXmlToObject(sitemapXmlArg);
/**
 * Create a UrlsetBuilder populated from an RSS/Atom feed string.
 * The feed is read through FeedImporter.fromString() and its entries are
 * added to a builder created with default options.
 * @param feedXml The feed document as a string.
 * @param options Optional feed import options forwarded to the importer.
 * @returns A builder containing the imported URLs.
 */
static async fromFeedString(
  feedXml: string,
  options?: interfaces.IFeedImportOptions,
): Promise<UrlsetBuilder> {
  const importedUrls = await FeedImporter.fromString(feedXml, options);
  return new UrlsetBuilder().addUrls(importedUrls);
}
/**
 * Create a UrlsetBuilder populated from a YAML config string.
 * The config is read through YamlImporter.parseConfig() and the resulting
 * entries are added to a builder created with default options.
 * @param yamlString The YAML configuration source.
 * @returns A builder containing the configured URLs.
 */
static async fromYaml(yamlString: string): Promise<UrlsetBuilder> {
  const configuredUrls = await YamlImporter.parseConfig(yamlString);
  return new UrlsetBuilder().addUrls(configuredUrls);
}
/**
 * Create a NewsSitemapBuilder populated from @tsclass/tsclass IArticle array.
 * @param articles Articles handed to the builder's importFromArticles().
 * @param options News sitemap options for the new builder.
 * @returns The populated NewsSitemapBuilder.
 */
static fromArticles(
  articles: plugins.tsclass.content.IArticle[],
  options: interfaces.INewsSitemapOptions,
): NewsSitemapBuilder {
  const newsBuilder = new NewsSitemapBuilder(options);
  newsBuilder.importFromArticles(articles);
  return newsBuilder;
}
/**
 * Create a UrlsetBuilder from a simple URL string array.
 * @param urls Plain location strings, each added as its own entry.
 * @param options Optional sitemap options for the new builder.
 * @returns A builder containing one entry per string.
 */
static fromUrls(urls: string[], options?: interfaces.ISitemapOptions): UrlsetBuilder {
  return new UrlsetBuilder(options).addFromArray(urls);
}
/**
 * Validate a sitemap XML string.
 * The XML is parsed with SitemapParser.parse() and the parsed `urls` are
 * checked with SitemapValidator.validateUrlset().
 * @param xml The sitemap XML source.
 * @returns The validation result for the parsed URLs.
 */
static async validate(xml: string): Promise<interfaces.IValidationResult> {
  const { urls } = await SitemapParser.parse(xml);
  return SitemapValidator.validateUrlset(urls);
}
}

View File

@@ -0,0 +1,274 @@
import * as plugins from './smartsitemap.plugins.js';
import type * as interfaces from './interfaces/index.js';
import { XmlRenderer } from './smartsitemap.classes.xmlrenderer.js';
import { SitemapValidator } from './smartsitemap.classes.validator.js';
import { FeedImporter } from './smartsitemap.classes.feedimporter.js';
import { YamlImporter } from './smartsitemap.classes.yamlimporter.js';
import { SitemapStream } from './smartsitemap.classes.sitemapstream.js';
/**
 * Chainable builder for creating standard XML sitemaps (<urlset>).
 * Every mutating method returns `this` for fluent chaining.
 *
 * Supports all sitemap extensions (images, videos, news, hreflang),
 * auto-splitting at 50K URLs, multiple output formats, and validation.
 */
export class UrlsetBuilder {
  protected urls: interfaces.ISitemapUrl[] = [];
  protected options: interfaces.ISitemapOptions;

  /**
   * @param options Builder options. Defaults: `prettyPrint: true`,
   *   `maxUrlsPerSitemap: 50000`, `validate: true`; caller values override them.
   */
  constructor(options?: interfaces.ISitemapOptions) {
    this.options = {
      prettyPrint: true,
      maxUrlsPerSitemap: 50000,
      validate: true,
      ...options,
    };
  }

  // ──────────────────────────────────────────────
  // Adding URLs
  // ──────────────────────────────────────────────

  /** Add a single URL with full options */
  add(url: interfaces.ISitemapUrl): this {
    this.urls.push(url);
    return this;
  }

  /** Add a URL by loc string, optionally with lastmod */
  addUrl(loc: string, lastmod?: Date | string | number): this {
    const url: interfaces.ISitemapUrl = { loc };
    if (lastmod != null) {
      url.lastmod = lastmod;
    }
    this.urls.push(url);
    return this;
  }

  /** Add multiple URL objects */
  addUrls(urls: interfaces.ISitemapUrl[]): this {
    this.urls.push(...urls);
    return this;
  }

  /** Add URLs from a plain string array */
  addFromArray(locs: string[]): this {
    for (const loc of locs) {
      this.urls.push({ loc });
    }
    return this;
  }

  // ──────────────────────────────────────────────
  // Bulk operations
  // ──────────────────────────────────────────────

  /** Merge all URLs from another UrlsetBuilder */
  merge(other: UrlsetBuilder): this {
    this.urls.push(...other.getUrls());
    return this;
  }

  /** Filter URLs by predicate (in-place) */
  filter(predicate: (url: interfaces.ISitemapUrl) => boolean): this {
    this.urls = this.urls.filter(predicate);
    return this;
  }

  /** Transform URLs (in-place) */
  map(transform: (url: interfaces.ISitemapUrl) => interfaces.ISitemapUrl): this {
    this.urls = this.urls.map(transform);
    return this;
  }

  /** Sort URLs (in-place); without a comparator, entries are ordered by `loc`. */
  sort(compareFn?: (a: interfaces.ISitemapUrl, b: interfaces.ISitemapUrl) => number): this {
    this.urls.sort(compareFn ?? ((a, b) => a.loc.localeCompare(b.loc)));
    return this;
  }

  /** Remove duplicate URLs by loc, keeping the first occurrence */
  dedupe(): this {
    const seen = new Set<string>();
    this.urls = this.urls.filter((url) => {
      if (seen.has(url.loc)) return false;
      seen.add(url.loc);
      return true;
    });
    return this;
  }

  // ──────────────────────────────────────────────
  // Defaults
  // ──────────────────────────────────────────────

  /** Set default changefreq for URLs that don't specify one */
  setDefaultChangeFreq(freq: interfaces.TChangeFreq): this {
    this.options.defaultChangeFreq = freq;
    return this;
  }

  /** Set default priority for URLs that don't specify one */
  setDefaultPriority(priority: number): this {
    this.options.defaultPriority = priority;
    return this;
  }

  /** Set XSL stylesheet URL for browser rendering */
  setXslUrl(url: string): this {
    this.options.xslUrl = url;
    return this;
  }

  // ──────────────────────────────────────────────
  // Import sources (async, return Promise<this>)
  // ──────────────────────────────────────────────

  /** Import URLs from an RSS/Atom feed URL */
  async importFromFeedUrl(feedUrl: string, options?: interfaces.IFeedImportOptions): Promise<this> {
    const imported = await FeedImporter.fromUrl(feedUrl, options);
    this.urls.push(...imported);
    return this;
  }

  /** Import URLs from an RSS/Atom feed XML string */
  async importFromFeedString(feedXml: string, options?: interfaces.IFeedImportOptions): Promise<this> {
    const imported = await FeedImporter.fromString(feedXml, options);
    this.urls.push(...imported);
    return this;
  }

  /** Import URLs from a YAML config string */
  async importFromYaml(yamlString: string): Promise<this> {
    const imported = await YamlImporter.parseConfig(yamlString);
    this.urls.push(...imported);
    return this;
  }

  /**
   * Import from @tsclass/tsclass IArticle array.
   * Each article contributes its `url` as loc and, when a timestamp is set,
   * a Date built from it as lastmod.
   */
  importFromArticles(articles: plugins.tsclass.content.IArticle[]): this {
    for (const article of articles) {
      const url: interfaces.ISitemapUrl = {
        loc: article.url,
        lastmod: article.timestamp ? new Date(article.timestamp) : undefined,
      };
      this.urls.push(url);
    }
    return this;
  }

  // ──────────────────────────────────────────────
  // Output
  // ──────────────────────────────────────────────

  /** Export as sitemap XML string */
  toXml(): string {
    return XmlRenderer.renderUrlset(this.urls, this.options);
  }

  /** Export as plain text (one URL per line) */
  toTxt(): string {
    return XmlRenderer.renderTxt(this.urls);
  }

  /** Export as JSON string */
  toJson(): string {
    return XmlRenderer.renderJson(this.urls);
  }

  /** Export as gzipped XML buffer */
  async toGzipBuffer(): Promise<Buffer> {
    const xml = this.toXml();
    const gzip = plugins.promisify(plugins.zlib.gzip);
    return gzip(Buffer.from(xml, 'utf-8')) as Promise<Buffer>;
  }

  /**
   * Export with automatic index splitting.
   * If URL count exceeds maxUrlsPerSitemap, returns a sitemap index
   * plus individual sitemap chunks.
   *
   * The effective chunk size is `maxUrlsPerSitemap` floored and capped at
   * 50000. A missing, non-finite or < 1 value falls back to 50000 so the
   * chunking loop always advances and no URLs are dropped.
   */
  toSitemapSet(): interfaces.ISitemapSet {
    const requested = this.options.maxUrlsPerSitemap;
    const maxUrls =
      typeof requested === 'number' && Number.isFinite(requested) && requested >= 1
        ? Math.min(Math.floor(requested), 50000)
        : 50000;

    if (this.urls.length <= maxUrls) {
      return {
        needsIndex: false,
        indexXml: null,
        sitemaps: [{ filename: 'sitemap.xml', xml: this.toXml() }],
      };
    }

    // Split into chunks
    const chunks: interfaces.ISitemapUrl[][] = [];
    for (let i = 0; i < this.urls.length; i += maxUrls) {
      chunks.push(this.urls.slice(i, i + maxUrls));
    }

    const baseUrl = this.options.baseUrl || '';
    const sitemaps: Array<{ filename: string; xml: string }> = [];
    const indexEntries: Array<{ loc: string; lastmod?: string }> = [];
    for (let i = 0; i < chunks.length; i++) {
      const filename = `sitemap-${i + 1}.xml`;
      const xml = XmlRenderer.renderUrlset(chunks[i], this.options);
      sitemaps.push({ filename, xml });
      indexEntries.push({
        // Strip one trailing slash from baseUrl so the join never doubles it.
        loc: baseUrl ? `${baseUrl.replace(/\/$/, '')}/${filename}` : filename,
      });
    }

    const indexXml = XmlRenderer.renderIndex(indexEntries, this.options);
    return {
      needsIndex: true,
      indexXml,
      sitemaps,
    };
  }

  /**
   * Create a Node.js Readable stream for large sitemaps.
   *
   * The URLs are pushed on the next tick so the caller can attach
   * listeners or pipe the stream first. A failure while rendering a URL
   * is reported through the stream (`destroy(error)`) instead of escaping
   * as an uncaught exception.
   */
  toStream(): SitemapStream {
    // Regular ESM import: `require` is not defined in an ES module, and the
    // stream module does not import this file, so there is no import cycle.
    const stream = new SitemapStream(this.options);
    process.nextTick(() => {
      try {
        for (const url of this.urls) {
          stream.pushUrl(url);
        }
        stream.finish();
      } catch (error: unknown) {
        stream.destroy(error instanceof Error ? error : new Error(String(error)));
      }
    });
    return stream;
  }

  // ──────────────────────────────────────────────
  // Inspection
  // ──────────────────────────────────────────────

  /** Get the raw URL array (a shallow copy) */
  getUrls(): interfaces.ISitemapUrl[] {
    return [...this.urls];
  }

  /** Get the number of URLs */
  get count(): number {
    return this.urls.length;
  }

  /** Validate this sitemap against the protocol specification */
  validate(): interfaces.IValidationResult {
    return SitemapValidator.validateUrlset(this.urls, this.options);
  }

  /** Get statistics about this sitemap */
  stats(): interfaces.ISitemapStats {
    return SitemapValidator.computeStats(this.urls, this.options);
  }

  /** Get the options for this builder (a shallow copy) */
  getOptions(): interfaces.ISitemapOptions {
    return { ...this.options };
  }
}

View File

@@ -0,0 +1,289 @@
import type * as interfaces from './interfaces/index.js';
const VALID_CHANGEFREQS: interfaces.TChangeFreq[] = [
  'always', 'hourly', 'daily', 'weekly', 'monthly', 'yearly', 'never',
];
const MAX_URL_LENGTH = 2048;
const MAX_URLS_PER_SITEMAP = 50000;
const MAX_SITEMAP_SIZE_BYTES = 52_428_800; // 50 MB
const MAX_IMAGES_PER_URL = 1000;
const MAX_VIDEO_TAGS = 32;
const MAX_VIDEO_DURATION = 28800;
const MAX_VIDEO_DESCRIPTION_LENGTH = 2048;

/**
 * Validates sitemap URLs and fields against the sitemap protocol specification.
 */
export class SitemapValidator {
  /**
   * Validate a single URL entry.
   *
   * @param url The entry to check.
   * @returns One error object per violated rule; empty when the entry is valid.
   */
  static validateUrl(url: interfaces.ISitemapUrl): interfaces.IValidationError[] {
    const errors: interfaces.IValidationError[] = [];

    // loc is required
    if (!url.loc) {
      errors.push({ field: 'loc', message: 'URL loc is required', url: url.loc });
    } else {
      errors.push(...SitemapValidator.validateUrlString(url.loc));
    }

    // priority range (NaN fails both comparisons, so it is checked explicitly)
    if (url.priority != null && (Number.isNaN(url.priority) || url.priority < 0 || url.priority > 1)) {
      errors.push({
        field: 'priority',
        message: 'Priority must be between 0.0 and 1.0',
        url: url.loc,
      });
    }

    // changefreq
    if (url.changefreq && !VALID_CHANGEFREQS.includes(url.changefreq)) {
      errors.push({
        field: 'changefreq',
        message: `Invalid changefreq "${url.changefreq}". Must be one of: ${VALID_CHANGEFREQS.join(', ')}`,
        url: url.loc,
      });
    }

    // lastmod date validation
    if (url.lastmod != null) {
      const date = url.lastmod instanceof Date ? url.lastmod : new Date(url.lastmod as any);
      if (isNaN(date.getTime())) {
        errors.push({
          field: 'lastmod',
          message: `Invalid lastmod date: "${url.lastmod}"`,
          url: url.loc,
        });
      }
    }

    // Images
    if (url.images) {
      if (url.images.length > MAX_IMAGES_PER_URL) {
        errors.push({
          field: 'images',
          message: `Maximum ${MAX_IMAGES_PER_URL} images per URL, got ${url.images.length}`,
          url: url.loc,
        });
      }
      for (const img of url.images) {
        if (!img.loc) {
          errors.push({ field: 'image:loc', message: 'Image loc is required', url: url.loc });
        }
      }
    }

    // Videos
    if (url.videos) {
      for (const vid of url.videos) {
        if (!vid.thumbnailLoc) {
          errors.push({ field: 'video:thumbnail_loc', message: 'Video thumbnail_loc is required', url: url.loc });
        }
        if (!vid.title) {
          errors.push({ field: 'video:title', message: 'Video title is required', url: url.loc });
        }
        if (!vid.description) {
          errors.push({ field: 'video:description', message: 'Video description is required', url: url.loc });
        }
        if (vid.description && vid.description.length > MAX_VIDEO_DESCRIPTION_LENGTH) {
          errors.push({
            field: 'video:description',
            message: `Video description exceeds ${MAX_VIDEO_DESCRIPTION_LENGTH} chars`,
            url: url.loc,
          });
        }
        if (!vid.contentLoc && !vid.playerLoc) {
          errors.push({
            field: 'video:content_loc',
            message: 'Video must have at least one of contentLoc or playerLoc',
            url: url.loc,
          });
        }
        if (vid.duration != null && (vid.duration < 1 || vid.duration > MAX_VIDEO_DURATION)) {
          errors.push({
            field: 'video:duration',
            message: `Video duration must be between 1 and ${MAX_VIDEO_DURATION} seconds`,
            url: url.loc,
          });
        }
        if (vid.rating != null && (vid.rating < 0 || vid.rating > 5)) {
          errors.push({
            field: 'video:rating',
            message: 'Video rating must be between 0.0 and 5.0',
            url: url.loc,
          });
        }
        if (vid.tags && vid.tags.length > MAX_VIDEO_TAGS) {
          errors.push({
            field: 'video:tag',
            message: `Maximum ${MAX_VIDEO_TAGS} video tags, got ${vid.tags.length}`,
            url: url.loc,
          });
        }
      }
    }

    // News
    if (url.news) {
      if (!url.news.publication?.name) {
        errors.push({ field: 'news:publication:name', message: 'News publication name is required', url: url.loc });
      }
      if (!url.news.publication?.language) {
        errors.push({ field: 'news:publication:language', message: 'News publication language is required', url: url.loc });
      }
      if (!url.news.title) {
        errors.push({ field: 'news:title', message: 'News title is required', url: url.loc });
      }
      if (url.news.publicationDate == null) {
        errors.push({ field: 'news:publication_date', message: 'News publication date is required', url: url.loc });
      }
    }

    // Alternates
    if (url.alternates) {
      for (const alt of url.alternates) {
        if (!alt.hreflang) {
          errors.push({ field: 'xhtml:link:hreflang', message: 'Alternate hreflang is required', url: url.loc });
        }
        if (!alt.href) {
          errors.push({ field: 'xhtml:link:href', message: 'Alternate href is required', url: url.loc });
        }
      }
    }

    return errors;
  }

  /**
   * Validate an entire URL array.
   *
   * Per-entry violations are reported as errors; duplicate locs and an
   * over-limit URL count are reported as warnings and do not affect `valid`.
   *
   * @param urls Entries to validate.
   * @param options Optional sitemap options; only `maxUrlsPerSitemap` is read.
   * @returns Errors, warnings, computed stats and the overall `valid` flag.
   */
  static validateUrlset(urls: interfaces.ISitemapUrl[], options?: interfaces.ISitemapOptions): interfaces.IValidationResult {
    const errors: interfaces.IValidationError[] = [];
    const warnings: interfaces.IValidationWarning[] = [];

    for (const url of urls) {
      errors.push(...SitemapValidator.validateUrl(url));
    }

    // Check for duplicates
    const locs = new Set<string>();
    for (const url of urls) {
      if (locs.has(url.loc)) {
        warnings.push({
          field: 'loc',
          message: `Duplicate URL: "${url.loc}"`,
          url: url.loc,
        });
      }
      locs.add(url.loc);
    }

    const maxUrls = SitemapValidator.resolveMaxUrls(options);

    // Size limit warnings
    if (urls.length > maxUrls) {
      warnings.push({
        field: 'urlset',
        message: `URL count (${urls.length}) exceeds maximum of ${maxUrls} per sitemap. Use toSitemapSet() for auto-splitting.`,
      });
    }

    const stats = SitemapValidator.computeStats(urls, options);
    return {
      valid: errors.length === 0,
      errors,
      warnings,
      stats,
    };
  }

  /**
   * Validate a URL string for proper format.
   *
   * @param url The location string to check.
   * @returns Errors for exceeding the length limit and/or failing `new URL()` parsing.
   */
  static validateUrlString(url: string): interfaces.IValidationError[] {
    const errors: interfaces.IValidationError[] = [];
    if (url.length > MAX_URL_LENGTH) {
      errors.push({
        field: 'loc',
        message: `URL exceeds maximum length of ${MAX_URL_LENGTH} characters`,
        url,
      });
    }
    try {
      new URL(url);
    } catch {
      errors.push({
        field: 'loc',
        message: `Invalid URL: "${url}"`,
        url,
      });
    }
    return errors;
  }

  /**
   * Compute statistics for a set of URLs.
   *
   * @param urls Entries to measure.
   * @param options Optional sitemap options; only `maxUrlsPerSitemap` is read.
   * @returns Counts per extension, a rough size estimate and whether an index is needed.
   */
  static computeStats(urls: interfaces.ISitemapUrl[], options?: interfaces.ISitemapOptions): interfaces.ISitemapStats {
    let imageCount = 0;
    let videoCount = 0;
    let newsCount = 0;
    let alternateCount = 0;
    for (const url of urls) {
      if (url.images) imageCount += url.images.length;
      if (url.videos) videoCount += url.videos.length;
      if (url.news) newsCount++;
      if (url.alternates) alternateCount += url.alternates.length;
    }

    // Rough estimate: ~200 bytes per basic URL entry, more for extensions
    const estimatedSizeBytes =
      200 + // XML header + urlset tags
      urls.length * 200 + // base URL entries
      imageCount * 150 +
      videoCount * 400 +
      newsCount * 300 +
      alternateCount * 100;

    const maxUrls = SitemapValidator.resolveMaxUrls(options);
    return {
      urlCount: urls.length,
      imageCount,
      videoCount,
      newsCount,
      alternateCount,
      estimatedSizeBytes,
      needsIndex: urls.length > maxUrls,
    };
  }

  /**
   * Check size limits for a URL set.
   *
   * @param urls Entries to measure.
   * @param options Optional sitemap options; only `maxUrlsPerSitemap` is read.
   * @returns The effective limits alongside the measured count and estimated size.
   */
  static checkSizeLimits(urls: interfaces.ISitemapUrl[], options?: interfaces.ISitemapOptions): {
    withinLimits: boolean;
    urlCount: number;
    maxUrls: number;
    estimatedSizeBytes: number;
    maxSizeBytes: number;
  } {
    const maxUrls = SitemapValidator.resolveMaxUrls(options);
    const stats = SitemapValidator.computeStats(urls, options);
    return {
      withinLimits: urls.length <= maxUrls && stats.estimatedSizeBytes <= MAX_SITEMAP_SIZE_BYTES,
      urlCount: urls.length,
      maxUrls,
      estimatedSizeBytes: stats.estimatedSizeBytes,
      maxSizeBytes: MAX_SITEMAP_SIZE_BYTES,
    };
  }

  /**
   * Resolve the effective per-sitemap URL limit.
   * The configured value is floored and capped at MAX_URLS_PER_SITEMAP;
   * a missing, non-finite or < 1 value falls back to MAX_URLS_PER_SITEMAP.
   * Shared by every limit check so they all agree on the same number.
   */
  private static resolveMaxUrls(options?: interfaces.ISitemapOptions): number {
    const requested = options?.maxUrlsPerSitemap;
    if (typeof requested !== 'number' || !Number.isFinite(requested) || requested < 1) {
      return MAX_URLS_PER_SITEMAP;
    }
    return Math.min(Math.floor(requested), MAX_URLS_PER_SITEMAP);
  }
}

View File

@@ -0,0 +1,294 @@
import * as plugins from './smartsitemap.plugins.js';
import type * as interfaces from './interfaces/index.js';
// Sitemap XML namespace constants
const NS_SITEMAP = 'http://www.sitemaps.org/schemas/sitemap/0.9';
const NS_IMAGE = 'http://www.google.com/schemas/sitemap-image/1.1';
const NS_VIDEO = 'http://www.google.com/schemas/sitemap-video/1.1';
const NS_NEWS = 'http://www.google.com/schemas/sitemap-news/0.9';
const NS_XHTML = 'http://www.w3.org/1999/xhtml';
/**
* Handles all XML generation for sitemaps.
* Supports proper escaping, namespace detection, date formatting,
* XSL stylesheet references, and pretty printing.
*/
export class XmlRenderer {
/**
* Escape a string for use in XML content.
* Handles the 5 XML special characters.
*/
static escapeXml(str: string): string {
return str
.replace(/&/g, '&amp;')
.replace(/</g, '&lt;')
.replace(/>/g, '&gt;')
.replace(/"/g, '&quot;')
.replace(/'/g, '&apos;');
}
/**
* Format a date value (Date, ISO string, or Unix timestamp in ms)
* to W3C Datetime format suitable for sitemaps.
*/
static formatDate(date: Date | string | number): string {
if (date instanceof Date) {
return date.toISOString();
}
if (typeof date === 'number') {
return new Date(date).toISOString();
}
// Already a string — validate it parses
const parsed = new Date(date);
if (isNaN(parsed.getTime())) {
return date; // Return as-is if unparseable
}
return parsed.toISOString();
}
/**
* Detect which XML namespaces are needed based on URL entries.
*/
static detectNamespaces(urls: interfaces.ISitemapUrl[]): Record<string, string> {
const ns: Record<string, string> = {
'@_xmlns': NS_SITEMAP,
};
for (const url of urls) {
if (url.images && url.images.length > 0) {
ns['@_xmlns:image'] = NS_IMAGE;
}
if (url.videos && url.videos.length > 0) {
ns['@_xmlns:video'] = NS_VIDEO;
}
if (url.news) {
ns['@_xmlns:news'] = NS_NEWS;
}
if (url.alternates && url.alternates.length > 0) {
ns['@_xmlns:xhtml'] = NS_XHTML;
}
}
return ns;
}
/**
* Render a URL array to sitemap XML string.
*/
static renderUrlset(urls: interfaces.ISitemapUrl[], options?: interfaces.ISitemapOptions): string {
const namespaces = XmlRenderer.detectNamespaces(urls);
const urlElements = urls.map((url) => XmlRenderer.buildUrlElement(url, options));
const xmlObj: any = {
urlset: {
...namespaces,
url: urlElements,
},
};
const smartXml = new plugins.smartxml.SmartXml();
let xml = smartXml.createXmlFromObject(xmlObj);
// Insert XSL stylesheet processing instruction if specified
if (options?.xslUrl) {
xml = XmlRenderer.insertXslInstruction(xml, options.xslUrl);
}
return xml;
}
/**
* Render a sitemap index XML string.
*/
static renderIndex(entries: interfaces.ISitemapIndexEntry[], options?: interfaces.ISitemapOptions): string {
const sitemapElements = entries.map((entry) => {
const el: any = {
loc: XmlRenderer.escapeXml(entry.loc),
};
if (entry.lastmod != null) {
el.lastmod = XmlRenderer.formatDate(entry.lastmod);
}
return el;
});
const xmlObj: any = {
sitemapindex: {
'@_xmlns': NS_SITEMAP,
sitemap: sitemapElements,
},
};
const smartXml = new plugins.smartxml.SmartXml();
let xml = smartXml.createXmlFromObject(xmlObj);
if (options?.xslUrl) {
xml = XmlRenderer.insertXslInstruction(xml, options.xslUrl);
}
return xml;
}
/**
* Render URLs as plain text (one URL per line).
*/
static renderTxt(urls: interfaces.ISitemapUrl[]): string {
return urls.map((u) => u.loc).join('\n');
}
/**
* Render URLs as JSON.
*/
static renderJson(urls: interfaces.ISitemapUrl[]): string {
return JSON.stringify(urls, null, 2);
}
/**
* Build a single <url> element object for use with smartxml.
*/
private static buildUrlElement(url: interfaces.ISitemapUrl, options?: interfaces.ISitemapOptions): any {
const el: any = {
loc: XmlRenderer.escapeXml(url.loc),
};
// lastmod
if (url.lastmod != null) {
el.lastmod = XmlRenderer.formatDate(url.lastmod);
}
// changefreq (use default if not specified)
const changefreq = url.changefreq ?? options?.defaultChangeFreq;
if (changefreq) {
el.changefreq = changefreq;
}
// priority (use default if not specified)
const priority = url.priority ?? options?.defaultPriority;
if (priority != null) {
el.priority = priority.toFixed(1);
}
// Image extension
if (url.images && url.images.length > 0) {
el['image:image'] = url.images.map((img) => XmlRenderer.buildImageElement(img));
}
// Video extension
if (url.videos && url.videos.length > 0) {
el['video:video'] = url.videos.map((vid) => XmlRenderer.buildVideoElement(vid));
}
// News extension
if (url.news) {
el['news:news'] = XmlRenderer.buildNewsElement(url.news);
}
// hreflang alternates
if (url.alternates && url.alternates.length > 0) {
el['xhtml:link'] = url.alternates.map((alt) => ({
'@_rel': 'alternate',
'@_hreflang': alt.hreflang,
'@_href': XmlRenderer.escapeXml(alt.href),
}));
}
return el;
}
/**
* Build an <image:image> element object.
*/
private static buildImageElement(img: interfaces.ISitemapImage): any {
const el: any = {
'image:loc': XmlRenderer.escapeXml(img.loc),
};
if (img.caption) {
el['image:caption'] = XmlRenderer.escapeXml(img.caption);
}
if (img.title) {
el['image:title'] = XmlRenderer.escapeXml(img.title);
}
if (img.geoLocation) {
el['image:geo_location'] = XmlRenderer.escapeXml(img.geoLocation);
}
if (img.licenseUrl) {
el['image:license'] = XmlRenderer.escapeXml(img.licenseUrl);
}
return el;
}
/**
* Build a <video:video> element object.
*/
private static buildVideoElement(vid: interfaces.ISitemapVideo): any {
const el: any = {
'video:thumbnail_loc': XmlRenderer.escapeXml(vid.thumbnailLoc),
'video:title': XmlRenderer.escapeXml(vid.title),
'video:description': XmlRenderer.escapeXml(vid.description),
};
if (vid.contentLoc) {
el['video:content_loc'] = XmlRenderer.escapeXml(vid.contentLoc);
}
if (vid.playerLoc) {
el['video:player_loc'] = XmlRenderer.escapeXml(vid.playerLoc);
}
if (vid.duration != null) {
el['video:duration'] = vid.duration;
}
if (vid.rating != null) {
el['video:rating'] = vid.rating;
}
if (vid.viewCount != null) {
el['video:view_count'] = vid.viewCount;
}
if (vid.publicationDate != null) {
el['video:publication_date'] = XmlRenderer.formatDate(vid.publicationDate);
}
if (vid.familyFriendly != null) {
el['video:family_friendly'] = vid.familyFriendly ? 'yes' : 'no';
}
if (vid.tags && vid.tags.length > 0) {
el['video:tag'] = vid.tags;
}
if (vid.live != null) {
el['video:live'] = vid.live ? 'yes' : 'no';
}
if (vid.requiresSubscription != null) {
el['video:requires_subscription'] = vid.requiresSubscription ? 'yes' : 'no';
}
return el;
}
/**
 * Build a <news:news> element object.
 *
 * Always emits the publication (name + language), the publication date
 * (via XmlRenderer.formatDate) and the escaped title.
 *
 * `news.keywords` may be a single string or an array of strings; an array
 * is joined with ', '. The `news:keywords` entry is only added when the
 * resulting text is non-empty, so an empty array no longer produces an
 * empty keywords element.
 */
private static buildNewsElement(news: interfaces.ISitemapNews): any {
  const el: any = {
    'news:publication': {
      'news:name': XmlRenderer.escapeXml(news.publication.name),
      'news:language': news.publication.language,
    },
    'news:publication_date': XmlRenderer.formatDate(news.publicationDate),
    'news:title': XmlRenderer.escapeXml(news.title),
  };
  // Normalize first, then test the normalized text: `[]` is truthy but
  // joins to '', which must not be emitted.
  const keywordText = Array.isArray(news.keywords) ? news.keywords.join(', ') : news.keywords;
  if (keywordText) {
    el['news:keywords'] = XmlRenderer.escapeXml(keywordText);
  }
  return el;
}
/**
 * Insert an XSL stylesheet processing instruction after the XML declaration.
 *
 * The stylesheet URL is escaped with XmlRenderer.escapeXml before being
 * placed in the `href` attribute. Only the first occurrence of the exact
 * declaration `<?xml version="1.0" encoding="UTF-8"?>` is extended; when
 * that declaration is not present, `xml` is returned unchanged.
 *
 * @param xml    Serialized XML document.
 * @param xslUrl URL of the XSL stylesheet to reference.
 * @returns The document with the processing instruction added.
 */
private static insertXslInstruction(xml: string, xslUrl: string): string {
  const xmlDeclaration = '<?xml version="1.0" encoding="UTF-8"?>';
  const pi = `<?xml-stylesheet type="text/xsl" href="${XmlRenderer.escapeXml(xslUrl)}"?>`;
  // Use a replacer function: with a string replacement, sequences such as
  // `$&`, `$$` or "$`" inside the URL would be interpreted as special
  // replacement patterns and corrupt the href.
  return xml.replace(xmlDeclaration, () => `${xmlDeclaration}\n${pi}`);
}
}

View File

@@ -0,0 +1,61 @@
import * as plugins from './smartsitemap.plugins.js';
import type * as interfaces from './interfaces/index.js';
/**
 * Imports sitemap configuration from YAML format.
 * Supports the enhanced YAML schema with per-frequency URL groups,
 * default settings, and feed imports.
 */
export class YamlImporter {
  /**
   * Parse a YAML config string and return ISitemapUrl entries.
   *
   * Processing order:
   * 1. `urls.<changefreq>` lists are walked in the fixed order
   *    always → hourly → daily → weekly → monthly → yearly → never.
   *    Entries that already start with an `http://` or `https://` scheme are
   *    used verbatim; all others are joined onto `baseUrl` (one trailing
   *    slash removed) with exactly one `/` in between. Each entry receives
   *    the group's change frequency and `defaults.priority`.
   * 2. `feeds` are fetched one after another through FeedImporter and their
   *    URLs appended; feeds of type 'news' are imported as news entries,
   *    falling back to publication name 'Unknown' and language 'en'.
   *
   * An empty document (or one that does not parse to an object) yields an
   * empty list instead of throwing.
   *
   * @param yamlString Raw YAML configuration text.
   * @returns The collected sitemap URL entries.
   */
  static async parseConfig(yamlString: string): Promise<interfaces.ISitemapUrl[]> {
    const config = (await plugins.smartyaml.yamlStringToObject(yamlString)) as interfaces.ISitemapYamlConfig;
    const urls: interfaces.ISitemapUrl[] = [];

    // An empty or scalar YAML document does not produce a config object;
    // there is nothing to import in that case.
    if (!config || typeof config !== 'object') {
      return urls;
    }

    const baseUrl = config.baseUrl?.replace(/\/$/, '') ?? '';
    // Require a real scheme so relative paths like `http-status/` are not
    // mistaken for absolute URLs.
    const absoluteUrlPattern = /^https?:\/\//i;

    // Process URL groups by frequency
    if (config.urls) {
      const frequencies: interfaces.TChangeFreq[] = [
        'always', 'hourly', 'daily', 'weekly', 'monthly', 'yearly', 'never',
      ];
      for (const freq of frequencies) {
        const urlList = config.urls[freq];
        if (urlList && Array.isArray(urlList)) {
          for (const path of urlList) {
            const loc = absoluteUrlPattern.test(path)
              ? path
              : `${baseUrl}${path.startsWith('/') ? '' : '/'}${path}`;
            urls.push({
              loc,
              changefreq: freq,
              priority: config.defaults?.priority,
            });
          }
        }
      }
    }

    // Process feed imports
    if (config.feeds && Array.isArray(config.feeds)) {
      // Dynamic import to avoid circular deps at module load time
      const { FeedImporter } = await import('./smartsitemap.classes.feedimporter.js');
      for (const feedConfig of config.feeds) {
        if (feedConfig.type === 'news') {
          const newsUrls = await FeedImporter.fromUrlAsNews(
            feedConfig.url,
            feedConfig.publicationName ?? 'Unknown',
            feedConfig.publicationLanguage ?? 'en',
          );
          urls.push(...newsUrls);
        } else {
          const standardUrls = await FeedImporter.fromUrl(feedConfig.url);
          urls.push(...standardUrls);
        }
      }
    }
    return urls;
  }
}

View File

@@ -1,11 +1,17 @@
// node built-ins
import * as zlib from 'zlib';
import { promisify } from 'util';
import { Readable } from 'stream';
export { zlib, promisify, Readable };
// pushrocks scope
import * as smartcache from '@push.rocks/smartcache';
import * as smartfeed from '@push.rocks/smartfeed';
import * as smartxml from '@push.rocks/smartxml';
import * as smartyaml from '@push.rocks/smartyaml';
import * as webrequest from '@push.rocks/webrequest';
export { smartcache, smartfeed, smartxml, smartyaml, webrequest };
export { smartfeed, smartxml, smartyaml, webrequest };
// tsclass
import * as tsclass from '@tsclass/tsclass';