import type * as interfaces from './interfaces/index.js';

/** Values accepted for the `changefreq` field. */
const VALID_CHANGEFREQS: interfaces.TChangeFreq[] = [
  'always',
  'hourly',
  'daily',
  'weekly',
  'monthly',
  'yearly',
  'never',
];

const MAX_URL_LENGTH = 2048;
const MAX_URLS_PER_SITEMAP = 50000;
const MAX_SITEMAP_SIZE_BYTES = 52_428_800; // 50 MB
const MAX_IMAGES_PER_URL = 1000;
const MAX_VIDEO_TAGS = 32;
const MAX_VIDEO_DURATION = 28800; // seconds
const MAX_VIDEO_DESCRIPTION_LENGTH = 2048;

/**
 * Returns true when `value` is not within the inclusive range [min, max].
 *
 * Written as a negated "inside" test so that `NaN` is reported as out of
 * range; the naive `value < min || value > max` form lets `NaN` through
 * because every comparison with `NaN` is false.
 */
function isOutsideRange(value: number, min: number, max: number): boolean {
  return !(value >= min && value <= max);
}

/**
 * Validates sitemap URLs and fields against the sitemap protocol specification.
 */
export class SitemapValidator {
  /**
   * Validate a single URL entry.
   *
   * Checks, in order: `loc`, `priority`, `changefreq`, `lastmod`, images,
   * videos, news and alternates. Validation never throws; every problem is
   * reported as an entry in the returned array.
   *
   * @param url - The sitemap URL entry to validate.
   * @returns All validation errors found (empty when the entry is valid).
   */
  static validateUrl(url: interfaces.ISitemapUrl): interfaces.IValidationError[] {
    const errors: interfaces.IValidationError[] = [];

    // loc is required
    if (!url.loc) {
      errors.push({ field: 'loc', message: 'URL loc is required', url: url.loc });
    } else {
      errors.push(...SitemapValidator.validateUrlString(url.loc));
    }

    // priority range (NaN is rejected as well)
    if (url.priority != null && isOutsideRange(url.priority, 0, 1)) {
      errors.push({
        field: 'priority',
        message: 'Priority must be between 0.0 and 1.0',
        url: url.loc,
      });
    }

    // changefreq
    if (url.changefreq && !VALID_CHANGEFREQS.includes(url.changefreq)) {
      errors.push({
        field: 'changefreq',
        message: `Invalid changefreq "${url.changefreq}". Must be one of: ${VALID_CHANGEFREQS.join(', ')}`,
        url: url.loc,
      });
    }

    // lastmod date validation
    if (url.lastmod != null) {
      // The cast bypasses the declared lastmod type so that any value the
      // Date constructor can parse is accepted.
      const date = url.lastmod instanceof Date ? url.lastmod : new Date(url.lastmod as any);
      if (isNaN(date.getTime())) {
        errors.push({
          field: 'lastmod',
          message: `Invalid lastmod date: "${url.lastmod}"`,
          url: url.loc,
        });
      }
    }

    SitemapValidator.collectImageErrors(url, errors);
    SitemapValidator.collectVideoErrors(url, errors);
    SitemapValidator.collectNewsErrors(url, errors);
    SitemapValidator.collectAlternateErrors(url, errors);

    return errors;
  }

  /**
   * Validate an entire URL array.
   *
   * Per-entry problems are reported as errors; duplicate `loc` values and an
   * over-long URL list are reported as warnings.
   *
   * @param urls - The URL entries that would make up the sitemap.
   * @param options - Optional settings; only `maxUrlsPerSitemap` is read here.
   * @returns Errors, warnings, computed stats, and `valid` (true when there are no errors).
   */
  static validateUrlset(
    urls: interfaces.ISitemapUrl[],
    options?: interfaces.ISitemapOptions,
  ): interfaces.IValidationResult {
    const errors: interfaces.IValidationError[] = [];
    const warnings: interfaces.IValidationWarning[] = [];

    for (const url of urls) {
      errors.push(...SitemapValidator.validateUrl(url));
    }

    // Check for duplicates. Entries without a loc are skipped: they are
    // already reported as errors above and are not duplicates of each other.
    const seenLocs = new Set<string>();
    for (const url of urls) {
      if (!url.loc) {
        continue;
      }
      if (seenLocs.has(url.loc)) {
        warnings.push({
          field: 'loc',
          message: `Duplicate URL: "${url.loc}"`,
          url: url.loc,
        });
      }
      seenLocs.add(url.loc);
    }

    const maxUrls = SitemapValidator.resolveMaxUrls(options);

    // Size limit warnings
    if (urls.length > maxUrls) {
      warnings.push({
        field: 'urlset',
        message: `URL count (${urls.length}) exceeds maximum of ${maxUrls} per sitemap. Use toSitemapSet() for auto-splitting.`,
      });
    }

    const stats = SitemapValidator.computeStats(urls, options);

    return {
      valid: errors.length === 0,
      errors,
      warnings,
      stats,
    };
  }

  /**
   * Validate a URL string for proper format.
   *
   * @param url - The URL string to check.
   * @returns An error if the string is longer than the maximum URL length,
   *   and an error if the `URL` constructor cannot parse it.
   */
  static validateUrlString(url: string): interfaces.IValidationError[] {
    const errors: interfaces.IValidationError[] = [];

    if (url.length > MAX_URL_LENGTH) {
      errors.push({
        field: 'loc',
        message: `URL exceeds maximum length of ${MAX_URL_LENGTH} characters`,
        url,
      });
    }

    try {
      // Constructed only for its parse-or-throw side effect.
      new URL(url);
    } catch {
      errors.push({
        field: 'loc',
        message: `Invalid URL: "${url}"`,
        url,
      });
    }

    return errors;
  }

  /**
   * Compute statistics for a set of URLs.
   *
   * @param urls - The URL entries to summarize.
   * @param options - Optional settings; only `maxUrlsPerSitemap` is read here.
   * @returns Counts per entry kind, a rough size estimate in bytes, and
   *   whether the URL count exceeds the effective per-sitemap limit.
   */
  static computeStats(
    urls: interfaces.ISitemapUrl[],
    options?: interfaces.ISitemapOptions,
  ): interfaces.ISitemapStats {
    let imageCount = 0;
    let videoCount = 0;
    let newsCount = 0;
    let alternateCount = 0;

    for (const url of urls) {
      if (url.images) imageCount += url.images.length;
      if (url.videos) videoCount += url.videos.length;
      if (url.news) newsCount++;
      if (url.alternates) alternateCount += url.alternates.length;
    }

    // Rough estimate: ~200 bytes per basic URL entry, more for extensions
    const estimatedSizeBytes =
      200 + // XML header + urlset tags
      urls.length * 200 + // base URL entries
      imageCount * 150 +
      videoCount * 400 +
      newsCount * 300 +
      alternateCount * 100;

    const maxUrls = SitemapValidator.resolveMaxUrls(options);

    return {
      urlCount: urls.length,
      imageCount,
      videoCount,
      newsCount,
      alternateCount,
      estimatedSizeBytes,
      needsIndex: urls.length > maxUrls,
    };
  }

  /**
   * Check size limits for a URL set.
   *
   * @param urls - The URL entries to check.
   * @param options - Optional settings; only `maxUrlsPerSitemap` is read here.
   * @returns The URL count and estimated size alongside their limits, and
   *   `withinLimits`, which is true only when both limits are respected.
   */
  static checkSizeLimits(
    urls: interfaces.ISitemapUrl[],
    options?: interfaces.ISitemapOptions,
  ): {
    withinLimits: boolean;
    urlCount: number;
    maxUrls: number;
    estimatedSizeBytes: number;
    maxSizeBytes: number;
  } {
    const maxUrls = SitemapValidator.resolveMaxUrls(options);
    const stats = SitemapValidator.computeStats(urls, options);

    return {
      withinLimits: urls.length <= maxUrls && stats.estimatedSizeBytes <= MAX_SITEMAP_SIZE_BYTES,
      urlCount: urls.length,
      maxUrls,
      estimatedSizeBytes: stats.estimatedSizeBytes,
      maxSizeBytes: MAX_SITEMAP_SIZE_BYTES,
    };
  }

  /**
   * Effective per-sitemap URL limit: the configured `maxUrlsPerSitemap`,
   * never higher than MAX_URLS_PER_SITEMAP. Shared by every method that
   * compares against the limit so they cannot disagree.
   */
  private static resolveMaxUrls(options?: interfaces.ISitemapOptions): number {
    return Math.min(options?.maxUrlsPerSitemap ?? MAX_URLS_PER_SITEMAP, MAX_URLS_PER_SITEMAP);
  }

  /** Appends errors for the image list: too many images, or an image without `loc`. */
  private static collectImageErrors(
    url: interfaces.ISitemapUrl,
    errors: interfaces.IValidationError[],
  ): void {
    if (!url.images) {
      return;
    }

    if (url.images.length > MAX_IMAGES_PER_URL) {
      errors.push({
        field: 'images',
        message: `Maximum ${MAX_IMAGES_PER_URL} images per URL, got ${url.images.length}`,
        url: url.loc,
      });
    }

    for (const img of url.images) {
      if (!img.loc) {
        errors.push({ field: 'image:loc', message: 'Image loc is required', url: url.loc });
      }
    }
  }

  /** Appends errors for each video: required fields, description length, duration, rating, tag count. */
  private static collectVideoErrors(
    url: interfaces.ISitemapUrl,
    errors: interfaces.IValidationError[],
  ): void {
    if (!url.videos) {
      return;
    }

    for (const vid of url.videos) {
      if (!vid.thumbnailLoc) {
        errors.push({
          field: 'video:thumbnail_loc',
          message: 'Video thumbnail_loc is required',
          url: url.loc,
        });
      }
      if (!vid.title) {
        errors.push({ field: 'video:title', message: 'Video title is required', url: url.loc });
      }
      if (!vid.description) {
        errors.push({
          field: 'video:description',
          message: 'Video description is required',
          url: url.loc,
        });
      }
      if (vid.description && vid.description.length > MAX_VIDEO_DESCRIPTION_LENGTH) {
        errors.push({
          field: 'video:description',
          message: `Video description exceeds ${MAX_VIDEO_DESCRIPTION_LENGTH} chars`,
          url: url.loc,
        });
      }
      if (!vid.contentLoc && !vid.playerLoc) {
        errors.push({
          field: 'video:content_loc',
          message: 'Video must have at least one of contentLoc or playerLoc',
          url: url.loc,
        });
      }
      if (vid.duration != null && isOutsideRange(vid.duration, 1, MAX_VIDEO_DURATION)) {
        errors.push({
          field: 'video:duration',
          message: `Video duration must be 1–${MAX_VIDEO_DURATION} seconds`,
          url: url.loc,
        });
      }
      if (vid.rating != null && isOutsideRange(vid.rating, 0, 5)) {
        errors.push({
          field: 'video:rating',
          message: 'Video rating must be 0.0–5.0',
          url: url.loc,
        });
      }
      if (vid.tags && vid.tags.length > MAX_VIDEO_TAGS) {
        errors.push({
          field: 'video:tag',
          message: `Maximum ${MAX_VIDEO_TAGS} video tags, got ${vid.tags.length}`,
          url: url.loc,
        });
      }
    }
  }

  /** Appends errors for missing required news fields: publication name/language, title, publication date. */
  private static collectNewsErrors(
    url: interfaces.ISitemapUrl,
    errors: interfaces.IValidationError[],
  ): void {
    if (!url.news) {
      return;
    }

    if (!url.news.publication?.name) {
      errors.push({
        field: 'news:publication:name',
        message: 'News publication name is required',
        url: url.loc,
      });
    }
    if (!url.news.publication?.language) {
      errors.push({
        field: 'news:publication:language',
        message: 'News publication language is required',
        url: url.loc,
      });
    }
    if (!url.news.title) {
      errors.push({ field: 'news:title', message: 'News title is required', url: url.loc });
    }
    if (url.news.publicationDate == null) {
      errors.push({
        field: 'news:publication_date',
        message: 'News publication date is required',
        url: url.loc,
      });
    }
  }

  /** Appends errors for alternates that lack `hreflang` or `href`. */
  private static collectAlternateErrors(
    url: interfaces.ISitemapUrl,
    errors: interfaces.IValidationError[],
  ): void {
    if (!url.alternates) {
      return;
    }

    for (const alt of url.alternates) {
      if (!alt.hreflang) {
        errors.push({
          field: 'xhtml:link:hreflang',
          message: 'Alternate hreflang is required',
          url: url.loc,
        });
      }
      if (!alt.href) {
        errors.push({
          field: 'xhtml:link:href',
          message: 'Alternate href is required',
          url: url.loc,
        });
      }
    }
  }
}