// Source listing metadata: 290 lines, 8.6 KiB, TypeScript.
import type * as interfaces from './interfaces/index.js';
/**
 * The `changefreq` values accepted by the validator. Any other value is
 * reported as an error. Declared `readonly` so the shared list cannot be
 * mutated by accident.
 */
const VALID_CHANGEFREQS: readonly interfaces.TChangeFreq[] = [
  'always',
  'hourly',
  'daily',
  'weekly',
  'monthly',
  'yearly',
  'never',
];

/** Maximum accepted length of a URL `loc`, in characters. */
const MAX_URL_LENGTH = 2048;

/** Limit on URL entries per sitemap, used when `options.maxUrlsPerSitemap` is not supplied. */
const MAX_URLS_PER_SITEMAP = 50000;

/** Maximum (estimated) size of a single sitemap, in bytes. */
const MAX_SITEMAP_SIZE_BYTES = 52_428_800; // 50 MB

/** Maximum number of image entries attached to one URL. */
const MAX_IMAGES_PER_URL = 1000;

/** Maximum number of tags on one video entry. */
const MAX_VIDEO_TAGS = 32;

/** Maximum video duration, in seconds (28800 s = 8 hours). */
const MAX_VIDEO_DURATION = 28800;

/** Maximum length of a video description, in characters. */
const MAX_VIDEO_DESCRIPTION_LENGTH = 2048;

/**
 * Validates sitemap URL entries and URL sets against the limits and required
 * fields of the sitemap protocol (including the image, video, news and
 * alternate-link extensions), and computes size statistics.
 *
 * All methods are static and side-effect free: problems are returned as
 * error/warning objects, never thrown.
 */
export class SitemapValidator {
  /**
   * Validate a single URL entry.
   *
   * Checks, in order: `loc` (required and well-formed), `priority` range,
   * `changefreq` value, `lastmod` date, then the image, video, news and
   * alternate-link extensions.
   *
   * @param url The URL entry to validate.
   * @returns Every problem found, in check order; empty when the entry is valid.
   */
  static validateUrl(url: interfaces.ISitemapUrl): interfaces.IValidationError[] {
    const errors: interfaces.IValidationError[] = [];

    // loc is required
    if (!url.loc) {
      errors.push({ field: 'loc', message: 'URL loc is required', url: url.loc });
    } else {
      errors.push(...SitemapValidator.validateUrlString(url.loc));
    }

    // priority range (NaN is rejected as well)
    if (url.priority != null && SitemapValidator.isOutOfRange(url.priority, 0, 1)) {
      errors.push({
        field: 'priority',
        message: 'Priority must be between 0.0 and 1.0',
        url: url.loc,
      });
    }

    // changefreq
    if (url.changefreq && !VALID_CHANGEFREQS.includes(url.changefreq)) {
      errors.push({
        field: 'changefreq',
        message: `Invalid changefreq "${url.changefreq}". Must be one of: ${VALID_CHANGEFREQS.join(', ')}`,
        url: url.loc,
      });
    }

    // lastmod must be a Date, or something Date can parse
    if (url.lastmod != null) {
      const date =
        url.lastmod instanceof Date ? url.lastmod : new Date(url.lastmod as string | number);
      if (Number.isNaN(date.getTime())) {
        errors.push({
          field: 'lastmod',
          message: `Invalid lastmod date: "${url.lastmod}"`,
          url: url.loc,
        });
      }
    }

    SitemapValidator.collectImageErrors(url, errors);
    SitemapValidator.collectVideoErrors(url, errors);
    SitemapValidator.collectNewsErrors(url, errors);
    SitemapValidator.collectAlternateErrors(url, errors);

    return errors;
  }

  /**
   * Validate an entire URL array.
   *
   * Per-entry problems are reported as errors. Duplicate `loc` values and an
   * over-long URL list are reported as warnings and do not affect `valid`.
   *
   * @param urls The URL entries to validate.
   * @param options Optional settings; only `maxUrlsPerSitemap` is read here.
   * @returns Validity flag, errors, warnings and computed statistics.
   */
  static validateUrlset(urls: interfaces.ISitemapUrl[], options?: interfaces.ISitemapOptions): interfaces.IValidationResult {
    const errors: interfaces.IValidationError[] = [];
    const warnings: interfaces.IValidationWarning[] = [];
    const seenLocs = new Set<string>();

    for (const url of urls) {
      errors.push(...SitemapValidator.validateUrl(url));

      // A missing loc is already an error above; do not also report it as a
      // duplicate of every other entry that lacks a loc.
      if (!url.loc) {
        continue;
      }
      if (seenLocs.has(url.loc)) {
        warnings.push({
          field: 'loc',
          message: `Duplicate URL: "${url.loc}"`,
          url: url.loc,
        });
      } else {
        seenLocs.add(url.loc);
      }
    }

    const maxUrls = SitemapValidator.resolveMaxUrls(options);

    // Size limit warnings
    if (urls.length > maxUrls) {
      warnings.push({
        field: 'urlset',
        message: `URL count (${urls.length}) exceeds maximum of ${maxUrls} per sitemap. Use toSitemapSet() for auto-splitting.`,
      });
    }

    const stats = SitemapValidator.computeStats(urls, options);

    return {
      valid: errors.length === 0,
      errors,
      warnings,
      stats,
    };
  }

  /**
   * Validate a URL string for proper format.
   *
   * @param url The URL string to check.
   * @returns A length error when the string exceeds `MAX_URL_LENGTH`, and/or
   *   an "Invalid URL" error when the `URL` constructor rejects it.
   */
  static validateUrlString(url: string): interfaces.IValidationError[] {
    const errors: interfaces.IValidationError[] = [];

    if (url.length > MAX_URL_LENGTH) {
      errors.push({
        field: 'loc',
        message: `URL exceeds maximum length of ${MAX_URL_LENGTH} characters`,
        url,
      });
    }

    try {
      // Constructed only for its parse check; the result is not needed.
      new URL(url);
    } catch {
      errors.push({
        field: 'loc',
        message: `Invalid URL: "${url}"`,
        url,
      });
    }

    return errors;
  }

  /**
   * Compute statistics for a set of URLs.
   *
   * `estimatedSizeBytes` is a rough heuristic (fixed per-entry byte costs),
   * not a measurement of serialized output.
   *
   * @param urls The URL entries to summarize.
   * @param options Optional settings; only `maxUrlsPerSitemap` is read here.
   * @returns Entry counts, the size estimate, and whether the set exceeds the
   *   effective per-sitemap URL limit (`needsIndex`).
   */
  static computeStats(urls: interfaces.ISitemapUrl[], options?: interfaces.ISitemapOptions): interfaces.ISitemapStats {
    let imageCount = 0;
    let videoCount = 0;
    let newsCount = 0;
    let alternateCount = 0;

    for (const url of urls) {
      if (url.images) imageCount += url.images.length;
      if (url.videos) videoCount += url.videos.length;
      if (url.news) newsCount++;
      if (url.alternates) alternateCount += url.alternates.length;
    }

    // Rough estimate: ~200 bytes per basic URL entry, more for extensions
    const estimatedSizeBytes =
      200 + // XML header + urlset tags
      urls.length * 200 + // base URL entries
      imageCount * 150 +
      videoCount * 400 +
      newsCount * 300 +
      alternateCount * 100;

    return {
      urlCount: urls.length,
      imageCount,
      videoCount,
      newsCount,
      alternateCount,
      estimatedSizeBytes,
      needsIndex: urls.length > SitemapValidator.resolveMaxUrls(options),
    };
  }

  /**
   * Check size limits for a URL set.
   *
   * @param urls The URL entries to check.
   * @param options Optional settings; only `maxUrlsPerSitemap` is read here.
   * @returns The observed and maximum URL count and (estimated) byte size,
   *   plus `withinLimits`, which is true only when both limits are respected.
   */
  static checkSizeLimits(urls: interfaces.ISitemapUrl[], options?: interfaces.ISitemapOptions): {
    withinLimits: boolean;
    urlCount: number;
    maxUrls: number;
    estimatedSizeBytes: number;
    maxSizeBytes: number;
  } {
    const maxUrls = SitemapValidator.resolveMaxUrls(options);
    const { estimatedSizeBytes } = SitemapValidator.computeStats(urls, options);

    return {
      withinLimits: urls.length <= maxUrls && estimatedSizeBytes <= MAX_SITEMAP_SIZE_BYTES,
      urlCount: urls.length,
      maxUrls,
      estimatedSizeBytes,
      maxSizeBytes: MAX_SITEMAP_SIZE_BYTES,
    };
  }

  /**
   * Effective per-sitemap URL limit: the caller's `maxUrlsPerSitemap` when
   * given, but never above `MAX_URLS_PER_SITEMAP`. Shared by every method
   * that reports on the limit so they cannot disagree.
   */
  private static resolveMaxUrls(options?: interfaces.ISitemapOptions): number {
    return Math.min(options?.maxUrlsPerSitemap ?? MAX_URLS_PER_SITEMAP, MAX_URLS_PER_SITEMAP);
  }

  /**
   * True when `value` is not inside the inclusive range [min, max]. Written
   * as a negated "inside" test so that NaN counts as out of range.
   */
  private static isOutOfRange(value: number, min: number, max: number): boolean {
    return !(value >= min && value <= max);
  }

  /** Append image-extension errors: image count limit and required `loc`. */
  private static collectImageErrors(url: interfaces.ISitemapUrl, errors: interfaces.IValidationError[]): void {
    if (!url.images) {
      return;
    }
    if (url.images.length > MAX_IMAGES_PER_URL) {
      errors.push({
        field: 'images',
        message: `Maximum ${MAX_IMAGES_PER_URL} images per URL, got ${url.images.length}`,
        url: url.loc,
      });
    }
    for (const img of url.images) {
      if (!img.loc) {
        errors.push({ field: 'image:loc', message: 'Image loc is required', url: url.loc });
      }
    }
  }

  /** Append video-extension errors: required fields and numeric/length limits. */
  private static collectVideoErrors(url: interfaces.ISitemapUrl, errors: interfaces.IValidationError[]): void {
    if (!url.videos) {
      return;
    }
    for (const vid of url.videos) {
      if (!vid.thumbnailLoc) {
        errors.push({ field: 'video:thumbnail_loc', message: 'Video thumbnail_loc is required', url: url.loc });
      }
      if (!vid.title) {
        errors.push({ field: 'video:title', message: 'Video title is required', url: url.loc });
      }
      if (!vid.description) {
        errors.push({ field: 'video:description', message: 'Video description is required', url: url.loc });
      } else if (vid.description.length > MAX_VIDEO_DESCRIPTION_LENGTH) {
        errors.push({
          field: 'video:description',
          message: `Video description exceeds ${MAX_VIDEO_DESCRIPTION_LENGTH} chars`,
          url: url.loc,
        });
      }
      if (!vid.contentLoc && !vid.playerLoc) {
        errors.push({
          field: 'video:content_loc',
          message: 'Video must have at least one of contentLoc or playerLoc',
          url: url.loc,
        });
      }
      if (vid.duration != null && SitemapValidator.isOutOfRange(vid.duration, 1, MAX_VIDEO_DURATION)) {
        errors.push({
          field: 'video:duration',
          message: `Video duration must be 1–${MAX_VIDEO_DURATION} seconds`,
          url: url.loc,
        });
      }
      if (vid.rating != null && SitemapValidator.isOutOfRange(vid.rating, 0, 5)) {
        errors.push({
          field: 'video:rating',
          message: 'Video rating must be 0.0–5.0',
          url: url.loc,
        });
      }
      if (vid.tags && vid.tags.length > MAX_VIDEO_TAGS) {
        errors.push({
          field: 'video:tag',
          message: `Maximum ${MAX_VIDEO_TAGS} video tags, got ${vid.tags.length}`,
          url: url.loc,
        });
      }
    }
  }

  /** Append news-extension errors: all four required fields must be present. */
  private static collectNewsErrors(url: interfaces.ISitemapUrl, errors: interfaces.IValidationError[]): void {
    if (!url.news) {
      return;
    }
    if (!url.news.publication?.name) {
      errors.push({ field: 'news:publication:name', message: 'News publication name is required', url: url.loc });
    }
    if (!url.news.publication?.language) {
      errors.push({ field: 'news:publication:language', message: 'News publication language is required', url: url.loc });
    }
    if (!url.news.title) {
      errors.push({ field: 'news:title', message: 'News title is required', url: url.loc });
    }
    if (url.news.publicationDate == null) {
      errors.push({ field: 'news:publication_date', message: 'News publication date is required', url: url.loc });
    }
  }

  /** Append alternate-link errors: each alternate needs `hreflang` and `href`. */
  private static collectAlternateErrors(url: interfaces.ISitemapUrl, errors: interfaces.IValidationError[]): void {
    if (!url.alternates) {
      return;
    }
    for (const alt of url.alternates) {
      if (!alt.hreflang) {
        errors.push({ field: 'xhtml:link:hreflang', message: 'Alternate hreflang is required', url: url.loc });
      }
      if (!alt.href) {
        errors.push({ field: 'xhtml:link:href', message: 'Alternate href is required', url: url.loc });
      }
    }
  }
}