Files
smartsitemap/ts/smartsitemap.classes.sitemapstream.ts

169 lines
6.0 KiB
TypeScript

import * as plugins from './smartsitemap.plugins.js';
import type * as interfaces from './interfaces/index.js';
import { XmlRenderer } from './smartsitemap.classes.xmlrenderer.js';
/**
 * A Node.js Readable stream that generates sitemap XML incrementally.
 * Suitable for very large sitemaps (millions of URLs) that cannot be held in memory.
 *
 * The `<urlset>` opening tag is emitted together with the first URL, so only the
 * extension namespaces needed by that first URL are declared on the root element.
 * Entries pushed later that need a namespace not declared there carry the
 * declaration on their own `<url>` element, which keeps the document well-formed.
 *
 * Note: video entries are not rendered by this stream; only the `video`
 * namespace is declared when the first URL carries videos.
 *
 * Usage:
 * const stream = new SitemapStream();
 * stream.pipe(createWriteStream('sitemap.xml'));
 * stream.pushUrl({ loc: 'https://example.com/' });
 * stream.pushUrl({ loc: 'https://example.com/about' });
 * stream.finish();
 */
export class SitemapStream extends plugins.Readable {
  /** Namespace URI per extension prefix, listed in the order used on `<urlset>`. */
  private static readonly namespaceUris = {
    image: 'http://www.google.com/schemas/sitemap-image/1.1',
    video: 'http://www.google.com/schemas/sitemap-video/1.1',
    news: 'http://www.google.com/schemas/sitemap-news/0.9',
    xhtml: 'http://www.w3.org/1999/xhtml',
  } as const;

  private options: interfaces.ISitemapOptions;
  private urlCount = 0;
  private headerWritten = false;
  private finished = false;
  /** Extension prefixes declared on the root `<urlset>` element. */
  private namespaces: Set<string> = new Set();

  /**
   * @param options Rendering options; `prettyPrint` defaults to `true`.
   */
  constructor(options?: interfaces.ISitemapOptions) {
    super({ encoding: 'utf-8' });
    this.options = {
      prettyPrint: true,
      ...options,
    };
  }

  /**
   * Push a URL entry into the stream.
   * The URL is immediately rendered to XML and pushed to the readable buffer.
   *
   * @param url The sitemap entry to render.
   * @returns The result of `Readable.push()`; `false` indicates the internal
   *   buffer is full and the consumer has not caught up yet.
   * @throws Error if called after `finish()`.
   */
  pushUrl(url: interfaces.ISitemapUrl): boolean {
    if (this.finished) {
      throw new Error('Cannot push URLs after calling finish()');
    }

    // Prefixes that will actually appear in this entry's rendered XML.
    const usedPrefixes: Array<keyof typeof SitemapStream.namespaceUris> = [];
    if (url.images?.length) usedPrefixes.push('image');
    if (url.news) usedPrefixes.push('news');
    if (url.alternates?.length) usedPrefixes.push('xhtml');

    let inlineNamespaces = '';
    if (!this.headerWritten) {
      // First URL: its namespaces go onto the root element.
      for (const prefix of usedPrefixes) this.namespaces.add(prefix);
      // No video elements are rendered below, but the namespace is still
      // declared on the root when the first URL carries videos.
      if (url.videos?.length) this.namespaces.add('video');
      this.writeHeader();
    } else {
      // The root element has already been emitted and cannot be amended, so
      // declare any prefix it is missing on this <url> element instead.
      for (const prefix of usedPrefixes) {
        if (!this.namespaces.has(prefix)) {
          inlineNamespaces += ` xmlns:${prefix}="${SitemapStream.namespaceUris[prefix]}"`;
        }
      }
    }

    const pretty = this.options.prettyPrint !== false;
    const indent = pretty ? ' ' : '';
    const nl = pretty ? '\n' : '';
    const indent2 = indent + indent;
    const indent3 = indent2 + indent;
    const indent4 = indent3 + indent;

    let urlXml = `${indent}<url${inlineNamespaces}>${nl}`;
    urlXml += `${indent2}<loc>${XmlRenderer.escapeXml(url.loc)}</loc>${nl}`;
    if (url.lastmod != null) {
      urlXml += `${indent2}<lastmod>${XmlRenderer.formatDate(url.lastmod)}</lastmod>${nl}`;
    }
    const changefreq = url.changefreq ?? this.options.defaultChangeFreq;
    if (changefreq) {
      urlXml += `${indent2}<changefreq>${changefreq}</changefreq>${nl}`;
    }
    const priority = url.priority ?? this.options.defaultPriority;
    if (priority != null) {
      urlXml += `${indent2}<priority>${priority.toFixed(1)}</priority>${nl}`;
    }

    // Extensions (simplified inline rendering for streaming)
    if (url.images) {
      for (const img of url.images) {
        urlXml += `${indent2}<image:image>${nl}`;
        urlXml += `${indent3}<image:loc>${XmlRenderer.escapeXml(img.loc)}</image:loc>${nl}`;
        if (img.caption) {
          urlXml += `${indent3}<image:caption>${XmlRenderer.escapeXml(img.caption)}</image:caption>${nl}`;
        }
        if (img.title) {
          urlXml += `${indent3}<image:title>${XmlRenderer.escapeXml(img.title)}</image:title>${nl}`;
        }
        urlXml += `${indent2}</image:image>${nl}`;
      }
    }
    if (url.news) {
      urlXml += `${indent2}<news:news>${nl}`;
      urlXml += `${indent3}<news:publication>${nl}`;
      urlXml += `${indent4}<news:name>${XmlRenderer.escapeXml(url.news.publication.name)}</news:name>${nl}`;
      // Escaped like every other free-form string so stray markup cannot break the document.
      urlXml += `${indent4}<news:language>${XmlRenderer.escapeXml(url.news.publication.language)}</news:language>${nl}`;
      urlXml += `${indent3}</news:publication>${nl}`;
      urlXml += `${indent3}<news:publication_date>${XmlRenderer.formatDate(url.news.publicationDate)}</news:publication_date>${nl}`;
      urlXml += `${indent3}<news:title>${XmlRenderer.escapeXml(url.news.title)}</news:title>${nl}`;
      if (url.news.keywords) {
        const kw = Array.isArray(url.news.keywords) ? url.news.keywords.join(', ') : url.news.keywords;
        urlXml += `${indent3}<news:keywords>${XmlRenderer.escapeXml(kw)}</news:keywords>${nl}`;
      }
      urlXml += `${indent2}</news:news>${nl}`;
    }
    if (url.alternates) {
      for (const alt of url.alternates) {
        // hreflang sits inside an attribute value, so it must be escaped as well.
        urlXml += `${indent2}<xhtml:link rel="alternate" hreflang="${XmlRenderer.escapeXml(alt.hreflang)}" href="${XmlRenderer.escapeXml(alt.href)}"/>${nl}`;
      }
    }
    urlXml += `${indent}</url>${nl}`;

    this.urlCount++;
    return this.push(urlXml);
  }

  /**
   * Signal that no more URLs will be added.
   * Writes the closing tag and ends the stream. Calling it again is a no-op.
   * If no URL was pushed, an empty `<urlset>` is emitted.
   */
  finish(): void {
    if (this.finished) return;
    this.finished = true;
    if (!this.headerWritten) {
      // Empty sitemap
      this.writeHeader();
    }
    this.push('</urlset>\n');
    this.push(null); // signal end of stream
  }

  /** Get the number of URLs written so far */
  get count(): number {
    return this.urlCount;
  }

  // Required by Readable
  _read(): void {
    // Data is pushed via pushUrl(), not pulled
  }

  /**
   * Write the XML declaration, the optional stylesheet instruction and the
   * opening urlset tag. Only the namespaces recorded in `this.namespaces` at
   * the time of the call are declared on the root element.
   */
  private writeHeader(): void {
    this.headerWritten = true;
    const nl = this.options.prettyPrint !== false ? '\n' : '';
    let header = `<?xml version="1.0" encoding="UTF-8"?>${nl}`;
    if (this.options.xslUrl) {
      header += `<?xml-stylesheet type="text/xsl" href="${XmlRenderer.escapeXml(this.options.xslUrl)}"?>${nl}`;
    }
    header += '<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9"';
    for (const [prefix, uri] of Object.entries(SitemapStream.namespaceUris)) {
      if (this.namespaces.has(prefix)) {
        header += `${nl} xmlns:${prefix}="${uri}"`;
      }
    }
    header += `>${nl}`;
    this.push(header);
  }
}