Files
smartsitemap/ts/smartsitemap.classes.validator.ts

290 lines
8.6 KiB
TypeScript
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
import type * as interfaces from './interfaces/index.js';
/** Values accepted for a URL entry's `changefreq` field. */
const VALID_CHANGEFREQS: interfaces.TChangeFreq[] = [
  'always',
  'hourly',
  'daily',
  'weekly',
  'monthly',
  'yearly',
  'never',
];

/** Longest `loc` string accepted, in characters. */
const MAX_URL_LENGTH = 2_048;

/** Default (and hard upper) number of URL entries in one sitemap. */
const MAX_URLS_PER_SITEMAP = 50_000;

/** Largest estimated sitemap size accepted, in bytes (50 MB). */
const MAX_SITEMAP_SIZE_BYTES = 50 * 1024 * 1024;

/** Most images a single URL entry may carry. */
const MAX_IMAGES_PER_URL = 1_000;

/** Most tags a single video may carry. */
const MAX_VIDEO_TAGS = 32;

/** Longest video duration accepted, in seconds (8 hours). */
const MAX_VIDEO_DURATION = 8 * 60 * 60;

/** Longest video description accepted, in characters. */
const MAX_VIDEO_DESCRIPTION_LENGTH = 2_048;
/**
 * Validates sitemap URL entries and whole URL sets, collecting problems as
 * error and warning records.
 *
 * All members are static; the class is used purely as a namespace.
 */
export class SitemapValidator {
  /**
   * Validate a single URL entry.
   *
   * Checks, in this order: `loc` (required, then format), `priority` range,
   * `changefreq` membership, `lastmod` parseability, images, videos, news
   * and alternates. Every violated rule yields its own error, so one entry
   * can produce several.
   *
   * @param url - The entry to check.
   * @returns The errors found; empty when no rule is violated.
   */
  static validateUrl(url: interfaces.ISitemapUrl): interfaces.IValidationError[] {
    const errors: interfaces.IValidationError[] = [];

    // loc is required; its format is only checked when it is present.
    if (!url.loc) {
      errors.push({ field: 'loc', message: 'URL loc is required', url: url.loc });
    } else {
      errors.push(...SitemapValidator.validateUrlString(url.loc));
    }

    // priority range. Written as a negated in-range test so that NaN, which
    // fails every comparison, is rejected instead of slipping through.
    if (url.priority != null && !(url.priority >= 0 && url.priority <= 1)) {
      errors.push({
        field: 'priority',
        message: 'Priority must be between 0.0 and 1.0',
        url: url.loc,
      });
    }

    // changefreq
    if (url.changefreq && !VALID_CHANGEFREQS.includes(url.changefreq)) {
      errors.push({
        field: 'changefreq',
        message: `Invalid changefreq "${url.changefreq}". Must be one of: ${VALID_CHANGEFREQS.join(', ')}`,
        url: url.loc,
      });
    }

    // lastmod: anything that is not already a Date must be parseable by the
    // Date constructor.
    if (url.lastmod != null) {
      const date = url.lastmod instanceof Date ? url.lastmod : new Date(url.lastmod as any);
      if (isNaN(date.getTime())) {
        errors.push({
          field: 'lastmod',
          message: `Invalid lastmod date: "${url.lastmod}"`,
          url: url.loc,
        });
      }
    }

    errors.push(
      ...SitemapValidator.validateImages(url),
      ...SitemapValidator.validateVideos(url),
      ...SitemapValidator.validateNews(url),
      ...SitemapValidator.validateAlternates(url),
    );

    return errors;
  }

  /** Check the image count limit and that every image has a `loc`. */
  private static validateImages(url: interfaces.ISitemapUrl): interfaces.IValidationError[] {
    const errors: interfaces.IValidationError[] = [];
    if (!url.images) {
      return errors;
    }

    if (url.images.length > MAX_IMAGES_PER_URL) {
      errors.push({
        field: 'images',
        message: `Maximum ${MAX_IMAGES_PER_URL} images per URL, got ${url.images.length}`,
        url: url.loc,
      });
    }
    for (const img of url.images) {
      if (!img.loc) {
        errors.push({ field: 'image:loc', message: 'Image loc is required', url: url.loc });
      }
    }
    return errors;
  }

  /** Check required fields and numeric/length limits of every video. */
  private static validateVideos(url: interfaces.ISitemapUrl): interfaces.IValidationError[] {
    const errors: interfaces.IValidationError[] = [];
    if (!url.videos) {
      return errors;
    }

    for (const vid of url.videos) {
      if (!vid.thumbnailLoc) {
        errors.push({ field: 'video:thumbnail_loc', message: 'Video thumbnail_loc is required', url: url.loc });
      }
      if (!vid.title) {
        errors.push({ field: 'video:title', message: 'Video title is required', url: url.loc });
      }
      if (!vid.description) {
        errors.push({ field: 'video:description', message: 'Video description is required', url: url.loc });
      } else if (vid.description.length > MAX_VIDEO_DESCRIPTION_LENGTH) {
        errors.push({
          field: 'video:description',
          message: `Video description exceeds ${MAX_VIDEO_DESCRIPTION_LENGTH} chars`,
          url: url.loc,
        });
      }
      if (!vid.contentLoc && !vid.playerLoc) {
        errors.push({
          field: 'video:content_loc',
          message: 'Video must have at least one of contentLoc or playerLoc',
          url: url.loc,
        });
      }
      // Negated in-range tests so that NaN is rejected as well.
      if (vid.duration != null && !(vid.duration >= 1 && vid.duration <= MAX_VIDEO_DURATION)) {
        errors.push({
          field: 'video:duration',
          message: `Video duration must be between 1 and ${MAX_VIDEO_DURATION} seconds`,
          url: url.loc,
        });
      }
      if (vid.rating != null && !(vid.rating >= 0 && vid.rating <= 5)) {
        errors.push({
          field: 'video:rating',
          message: 'Video rating must be between 0.0 and 5.0',
          url: url.loc,
        });
      }
      if (vid.tags && vid.tags.length > MAX_VIDEO_TAGS) {
        errors.push({
          field: 'video:tag',
          message: `Maximum ${MAX_VIDEO_TAGS} video tags, got ${vid.tags.length}`,
          url: url.loc,
        });
      }
    }
    return errors;
  }

  /** Check that a news block carries publication name/language, title and date. */
  private static validateNews(url: interfaces.ISitemapUrl): interfaces.IValidationError[] {
    const errors: interfaces.IValidationError[] = [];
    if (!url.news) {
      return errors;
    }

    if (!url.news.publication?.name) {
      errors.push({ field: 'news:publication:name', message: 'News publication name is required', url: url.loc });
    }
    if (!url.news.publication?.language) {
      errors.push({ field: 'news:publication:language', message: 'News publication language is required', url: url.loc });
    }
    if (!url.news.title) {
      errors.push({ field: 'news:title', message: 'News title is required', url: url.loc });
    }
    if (url.news.publicationDate == null) {
      errors.push({ field: 'news:publication_date', message: 'News publication date is required', url: url.loc });
    }
    return errors;
  }

  /** Check that every alternate link has both `hreflang` and `href`. */
  private static validateAlternates(url: interfaces.ISitemapUrl): interfaces.IValidationError[] {
    const errors: interfaces.IValidationError[] = [];
    if (!url.alternates) {
      return errors;
    }

    for (const alt of url.alternates) {
      if (!alt.hreflang) {
        errors.push({ field: 'xhtml:link:hreflang', message: 'Alternate hreflang is required', url: url.loc });
      }
      if (!alt.href) {
        errors.push({ field: 'xhtml:link:href', message: 'Alternate href is required', url: url.loc });
      }
    }
    return errors;
  }

  /**
   * Validate an entire URL array.
   *
   * Runs {@link SitemapValidator.validateUrl} on every entry, warns about
   * repeated `loc` values, warns when the entry count exceeds
   * `options.maxUrlsPerSitemap` (default `MAX_URLS_PER_SITEMAP`; the option
   * is used as given here, not capped), and attaches the computed stats.
   *
   * @param urls - The entries to check.
   * @param options - Optional settings; only `maxUrlsPerSitemap` is read here.
   * @returns The result; `valid` is true exactly when there are no errors
   *   (warnings do not affect it).
   */
  static validateUrlset(urls: interfaces.ISitemapUrl[], options?: interfaces.ISitemapOptions): interfaces.IValidationResult {
    const errors: interfaces.IValidationError[] = [];
    const warnings: interfaces.IValidationWarning[] = [];
    const seenLocs = new Set<string>();

    for (const url of urls) {
      errors.push(...SitemapValidator.validateUrl(url));

      // An entry without a loc is already reported as an error above;
      // flagging it as a duplicate of "undefined" would only add noise.
      if (!url.loc) {
        continue;
      }
      if (seenLocs.has(url.loc)) {
        warnings.push({
          field: 'loc',
          message: `Duplicate URL: "${url.loc}"`,
          url: url.loc,
        });
      } else {
        seenLocs.add(url.loc);
      }
    }

    // URL count limit warning
    const maxUrls = options?.maxUrlsPerSitemap ?? MAX_URLS_PER_SITEMAP;
    if (urls.length > maxUrls) {
      warnings.push({
        field: 'urlset',
        message: `URL count (${urls.length}) exceeds maximum of ${maxUrls} per sitemap. Use toSitemapSet() for auto-splitting.`,
      });
    }

    const stats = SitemapValidator.computeStats(urls, options);
    return {
      valid: errors.length === 0,
      errors,
      warnings,
      stats,
    };
  }

  /**
   * Validate a URL string for proper format.
   *
   * Two independent checks: the string must not be longer than
   * `MAX_URL_LENGTH` characters, and it must be parseable by the `URL`
   * constructor. Both errors can be reported for the same string.
   *
   * @param url - The string to check.
   * @returns The errors found, all with field `loc`; empty when both checks pass.
   */
  static validateUrlString(url: string): interfaces.IValidationError[] {
    const errors: interfaces.IValidationError[] = [];

    if (url.length > MAX_URL_LENGTH) {
      errors.push({
        field: 'loc',
        message: `URL exceeds maximum length of ${MAX_URL_LENGTH} characters`,
        url,
      });
    }

    try {
      // Constructed only for its parse check; the URL constructor throws on
      // strings it cannot parse.
      new URL(url);
    } catch {
      errors.push({
        field: 'loc',
        message: `Invalid URL: "${url}"`,
        url,
      });
    }

    return errors;
  }

  /**
   * Compute statistics for a set of URLs.
   *
   * Counts entries and their image/video/news/alternate extensions and
   * derives a rough size estimate from fixed per-item byte weights.
   *
   * @param urls - The entries to summarise.
   * @param options - Optional settings; only `maxUrlsPerSitemap` is read, to
   *   decide `needsIndex` (default `MAX_URLS_PER_SITEMAP`, not capped).
   * @returns The counts, the size estimate and the `needsIndex` flag.
   */
  static computeStats(urls: interfaces.ISitemapUrl[], options?: interfaces.ISitemapOptions): interfaces.ISitemapStats {
    let imageCount = 0;
    let videoCount = 0;
    let newsCount = 0;
    let alternateCount = 0;

    for (const url of urls) {
      if (url.images) imageCount += url.images.length;
      if (url.videos) videoCount += url.videos.length;
      if (url.news) newsCount++;
      if (url.alternates) alternateCount += url.alternates.length;
    }

    // Rough estimate: ~200 bytes per basic URL entry, more for extensions
    const estimatedSizeBytes =
      200 + // XML header + urlset tags
      urls.length * 200 + // base URL entries
      imageCount * 150 +
      videoCount * 400 +
      newsCount * 300 +
      alternateCount * 100;

    const maxUrls = options?.maxUrlsPerSitemap ?? MAX_URLS_PER_SITEMAP;

    return {
      urlCount: urls.length,
      imageCount,
      videoCount,
      newsCount,
      alternateCount,
      estimatedSizeBytes,
      needsIndex: urls.length > maxUrls,
    };
  }

  /**
   * Check size limits for a URL set.
   *
   * Unlike {@link SitemapValidator.validateUrlset}, the per-sitemap URL
   * limit used here is capped: `options.maxUrlsPerSitemap` can lower it but
   * never raise it above `MAX_URLS_PER_SITEMAP`.
   *
   * @param urls - The entries to check.
   * @param options - Optional settings; `maxUrlsPerSitemap` is read.
   * @returns The limits that were applied, the measured values, and
   *   `withinLimits`, which is true only when both the URL count and the
   *   estimated size are within their limits.
   */
  static checkSizeLimits(urls: interfaces.ISitemapUrl[], options?: interfaces.ISitemapOptions): {
    withinLimits: boolean;
    urlCount: number;
    maxUrls: number;
    estimatedSizeBytes: number;
    maxSizeBytes: number;
  } {
    const maxUrls = Math.min(options?.maxUrlsPerSitemap ?? MAX_URLS_PER_SITEMAP, MAX_URLS_PER_SITEMAP);
    const stats = SitemapValidator.computeStats(urls, options);

    return {
      withinLimits: urls.length <= maxUrls && stats.estimatedSizeBytes <= MAX_SITEMAP_SIZE_BYTES,
      urlCount: urls.length,
      maxUrls,
      estimatedSizeBytes: stats.estimatedSizeBytes,
      maxSizeBytes: MAX_SITEMAP_SIZE_BYTES,
    };
  }
}