BREAKING CHANGE(api): redesign smartsitemap around builder-based sitemap creation, parsing, validation, and import utilities

This commit is contained in:
2026-03-20 14:03:33 +00:00
parent 61f6bcebd4
commit 4e707347dd
22 changed files with 4843 additions and 2196 deletions

View File

@@ -0,0 +1,289 @@
import type * as interfaces from './interfaces/index.js';
/** Every `changefreq` value the validator accepts. */
const VALID_CHANGEFREQS: interfaces.TChangeFreq[] = [
  'always',
  'hourly',
  'daily',
  'weekly',
  'monthly',
  'yearly',
  'never',
];

/** Longest `loc` string (in characters) accepted before a length error is reported. */
const MAX_URL_LENGTH = 2048;

/** Default (and hard upper) number of URL entries allowed in a single sitemap. */
const MAX_URLS_PER_SITEMAP = 50_000;

/** Largest allowed sitemap size in bytes (50 MB). */
const MAX_SITEMAP_SIZE_BYTES = 50 * 1024 * 1024;

/** Most image entries a single URL may carry. */
const MAX_IMAGES_PER_URL = 1000;

/** Most tags a single video entry may carry. */
const MAX_VIDEO_TAGS = 32;

/** Longest allowed video duration, in seconds. */
const MAX_VIDEO_DURATION = 28_800;

/** Longest allowed video description, in characters. */
const MAX_VIDEO_DESCRIPTION_LENGTH = 2048;
/**
 * Validates sitemap URLs and fields against the sitemap protocol specification.
 *
 * Every method is static. Each one builds and returns fresh result objects and
 * does not modify the URL entries it is given.
 */
export class SitemapValidator {
  /**
   * Validate a single URL entry.
   *
   * Checks `loc` (required, well-formed), `priority` (0.0–1.0), `changefreq`
   * (one of the allowed values), `lastmod` (parseable date) and the optional
   * image, video, news and alternate-link extensions.
   *
   * @param url - The sitemap entry to check.
   * @returns One error per violated rule; an empty array when nothing is wrong.
   */
  static validateUrl(url: interfaces.ISitemapUrl): interfaces.IValidationError[] {
    const errors: interfaces.IValidationError[] = [];

    // loc is required
    if (!url.loc) {
      errors.push({ field: 'loc', message: 'URL loc is required', url: url.loc });
    } else {
      errors.push(...SitemapValidator.validateUrlString(url.loc));
    }

    // priority range (NaN is rejected as well)
    if (url.priority != null && SitemapValidator.isOutsideRange(url.priority, 0, 1)) {
      errors.push({
        field: 'priority',
        message: 'Priority must be between 0.0 and 1.0',
        url: url.loc,
      });
    }

    // changefreq
    if (url.changefreq && !VALID_CHANGEFREQS.includes(url.changefreq)) {
      errors.push({
        field: 'changefreq',
        message: `Invalid changefreq "${url.changefreq}". Must be one of: ${VALID_CHANGEFREQS.join(', ')}`,
        url: url.loc,
      });
    }

    // lastmod date validation: non-Date values are run through the Date
    // constructor, and anything that yields an invalid date is rejected.
    if (url.lastmod != null) {
      const date = url.lastmod instanceof Date ? url.lastmod : new Date(url.lastmod as any);
      if (isNaN(date.getTime())) {
        errors.push({
          field: 'lastmod',
          message: `Invalid lastmod date: "${url.lastmod}"`,
          url: url.loc,
        });
      }
    }

    // Images
    if (url.images) {
      if (url.images.length > MAX_IMAGES_PER_URL) {
        errors.push({
          field: 'images',
          message: `Maximum ${MAX_IMAGES_PER_URL} images per URL, got ${url.images.length}`,
          url: url.loc,
        });
      }
      for (const img of url.images) {
        if (!img.loc) {
          errors.push({ field: 'image:loc', message: 'Image loc is required', url: url.loc });
        }
      }
    }

    // Videos
    if (url.videos) {
      for (const vid of url.videos) {
        if (!vid.thumbnailLoc) {
          errors.push({ field: 'video:thumbnail_loc', message: 'Video thumbnail_loc is required', url: url.loc });
        }
        if (!vid.title) {
          errors.push({ field: 'video:title', message: 'Video title is required', url: url.loc });
        }
        if (!vid.description) {
          errors.push({ field: 'video:description', message: 'Video description is required', url: url.loc });
        }
        if (vid.description && vid.description.length > MAX_VIDEO_DESCRIPTION_LENGTH) {
          errors.push({
            field: 'video:description',
            message: `Video description exceeds ${MAX_VIDEO_DESCRIPTION_LENGTH} chars`,
            url: url.loc,
          });
        }
        if (!vid.contentLoc && !vid.playerLoc) {
          errors.push({
            field: 'video:content_loc',
            message: 'Video must have at least one of contentLoc or playerLoc',
            url: url.loc,
          });
        }
        if (vid.duration != null && SitemapValidator.isOutsideRange(vid.duration, 1, MAX_VIDEO_DURATION)) {
          errors.push({
            field: 'video:duration',
            message: `Video duration must be 1–${MAX_VIDEO_DURATION} seconds`,
            url: url.loc,
          });
        }
        if (vid.rating != null && SitemapValidator.isOutsideRange(vid.rating, 0, 5)) {
          errors.push({
            field: 'video:rating',
            message: 'Video rating must be 0.0–5.0',
            url: url.loc,
          });
        }
        if (vid.tags && vid.tags.length > MAX_VIDEO_TAGS) {
          errors.push({
            field: 'video:tag',
            message: `Maximum ${MAX_VIDEO_TAGS} video tags, got ${vid.tags.length}`,
            url: url.loc,
          });
        }
      }
    }

    // News
    if (url.news) {
      if (!url.news.publication?.name) {
        errors.push({ field: 'news:publication:name', message: 'News publication name is required', url: url.loc });
      }
      if (!url.news.publication?.language) {
        errors.push({ field: 'news:publication:language', message: 'News publication language is required', url: url.loc });
      }
      if (!url.news.title) {
        errors.push({ field: 'news:title', message: 'News title is required', url: url.loc });
      }
      if (url.news.publicationDate == null) {
        errors.push({ field: 'news:publication_date', message: 'News publication date is required', url: url.loc });
      }
    }

    // Alternates
    if (url.alternates) {
      for (const alt of url.alternates) {
        if (!alt.hreflang) {
          errors.push({ field: 'xhtml:link:hreflang', message: 'Alternate hreflang is required', url: url.loc });
        }
        if (!alt.href) {
          errors.push({ field: 'xhtml:link:href', message: 'Alternate href is required', url: url.loc });
        }
      }
    }

    return errors;
  }

  /**
   * Validate an entire URL array.
   *
   * Runs {@link SitemapValidator.validateUrl} on every entry, then adds
   * warnings for repeated `loc` values and for exceeding the per-sitemap URL
   * limit. Warnings never affect `valid`; only errors do.
   *
   * @param urls - All entries destined for the sitemap.
   * @param options - Optional settings; only `maxUrlsPerSitemap` is read here.
   * @returns Aggregated errors, warnings and statistics for the set.
   */
  static validateUrlset(urls: interfaces.ISitemapUrl[], options?: interfaces.ISitemapOptions): interfaces.IValidationResult {
    const errors: interfaces.IValidationError[] = [];
    const warnings: interfaces.IValidationWarning[] = [];

    for (const url of urls) {
      errors.push(...SitemapValidator.validateUrl(url));
    }

    // Check for duplicates. Entries without a loc are skipped: they already
    // produce a "loc is required" error, and comparing them to each other
    // would only yield misleading `Duplicate URL: "undefined"` warnings.
    const seenLocs = new Set<string>();
    for (const url of urls) {
      if (!url.loc) {
        continue;
      }
      if (seenLocs.has(url.loc)) {
        warnings.push({
          field: 'loc',
          message: `Duplicate URL: "${url.loc}"`,
          url: url.loc,
        });
      } else {
        seenLocs.add(url.loc);
      }
    }

    const maxUrls = options?.maxUrlsPerSitemap ?? MAX_URLS_PER_SITEMAP;

    // Size limit warnings
    if (urls.length > maxUrls) {
      warnings.push({
        field: 'urlset',
        message: `URL count (${urls.length}) exceeds maximum of ${maxUrls} per sitemap. Use toSitemapSet() for auto-splitting.`,
      });
    }

    const stats = SitemapValidator.computeStats(urls, options);

    return {
      valid: errors.length === 0,
      errors,
      warnings,
      stats,
    };
  }

  /**
   * Validate a URL string for proper format.
   *
   * Two independent checks are made, so a single string can yield two errors:
   * its length against the maximum, and whether the `URL` constructor can
   * parse it. Any absolute URL the constructor accepts passes the second
   * check, whatever its scheme.
   *
   * @param url - The candidate `loc` value.
   * @returns Errors attributed to the `loc` field; empty when the string passes.
   */
  static validateUrlString(url: string): interfaces.IValidationError[] {
    const errors: interfaces.IValidationError[] = [];

    if (url.length > MAX_URL_LENGTH) {
      errors.push({
        field: 'loc',
        message: `URL exceeds maximum length of ${MAX_URL_LENGTH} characters`,
        url,
      });
    }

    try {
      // Constructed only for its parse check; the instance itself is discarded.
      new URL(url);
    } catch {
      errors.push({
        field: 'loc',
        message: `Invalid URL: "${url}"`,
        url,
      });
    }

    return errors;
  }

  /**
   * Compute statistics for a set of URLs.
   *
   * The byte size is a heuristic based on fixed per-item costs, not a
   * measurement of serialized XML.
   *
   * @param urls - Entries to summarise.
   * @param options - Optional settings; only `maxUrlsPerSitemap` is read here
   *   and it is used as given (not capped) when deciding `needsIndex`.
   * @returns Counts per extension type, the size estimate and whether the URL
   *   count exceeds the per-sitemap limit.
   */
  static computeStats(urls: interfaces.ISitemapUrl[], options?: interfaces.ISitemapOptions): interfaces.ISitemapStats {
    let imageCount = 0;
    let videoCount = 0;
    let newsCount = 0;
    let alternateCount = 0;

    for (const url of urls) {
      if (url.images) imageCount += url.images.length;
      if (url.videos) videoCount += url.videos.length;
      if (url.news) newsCount++;
      if (url.alternates) alternateCount += url.alternates.length;
    }

    // Rough estimate: ~200 bytes per basic URL entry, more for extensions
    const estimatedSizeBytes =
      200 + // XML header + urlset tags
      urls.length * 200 + // base URL entries
      imageCount * 150 +
      videoCount * 400 +
      newsCount * 300 +
      alternateCount * 100;

    const maxUrls = options?.maxUrlsPerSitemap ?? MAX_URLS_PER_SITEMAP;

    return {
      urlCount: urls.length,
      imageCount,
      videoCount,
      newsCount,
      alternateCount,
      estimatedSizeBytes,
      needsIndex: urls.length > maxUrls,
    };
  }

  /**
   * Check size limits for a URL set.
   *
   * The effective URL limit is the smaller of the configured
   * `maxUrlsPerSitemap` and the built-in maximum, so a larger configured value
   * cannot raise the cap. The byte check uses the estimate from
   * {@link SitemapValidator.computeStats}.
   *
   * @param urls - Entries to measure.
   * @param options - Optional settings; only `maxUrlsPerSitemap` is read here.
   * @returns The measured values, the limits applied and whether both hold.
   */
  static checkSizeLimits(urls: interfaces.ISitemapUrl[], options?: interfaces.ISitemapOptions): {
    withinLimits: boolean;
    urlCount: number;
    maxUrls: number;
    estimatedSizeBytes: number;
    maxSizeBytes: number;
  } {
    const maxUrls = Math.min(options?.maxUrlsPerSitemap ?? MAX_URLS_PER_SITEMAP, MAX_URLS_PER_SITEMAP);
    const stats = SitemapValidator.computeStats(urls, options);

    return {
      withinLimits: urls.length <= maxUrls && stats.estimatedSizeBytes <= MAX_SITEMAP_SIZE_BYTES,
      urlCount: urls.length,
      maxUrls,
      estimatedSizeBytes: stats.estimatedSizeBytes,
      maxSizeBytes: MAX_SITEMAP_SIZE_BYTES,
    };
  }

  /**
   * Report whether `value` lies outside the inclusive range [`min`, `max`].
   * Written as a negated "inside" test so that `NaN` counts as out of range.
   */
  private static isOutsideRange(value: number, min: number, max: number): boolean {
    return !(value >= min && value <= max);
  }
}