einvoice/test/helpers/corpus.loader.ts
Philipp Kunz 56fd12a6b2 test(suite): comprehensive test suite improvements and new validators
- Update test-utils import path and refactor to helpers/utils.ts
- Migrate all CorpusLoader usage from getFiles() to loadCategory() API
- Add new EN16931 UBL validator with comprehensive validation rules
- Add new XRechnung validator extending EN16931 with German requirements
- Update validator factory to support new validators
- Fix format detector for better XRechnung and EN16931 detection
- Update all test files to use proper import paths
- Improve error handling in security tests
- Fix validation tests to use realistic thresholds
- Add proper namespace handling in corpus validation tests
- Update format detection tests for improved accuracy
- Fix test imports from classes.xinvoice.ts to index.js

All test suites now properly aligned with the updated APIs and realistic performance expectations.
2025-05-30 18:18:42 +00:00

270 lines
8.3 KiB
TypeScript

import * as path from 'path';
import { promises as fs } from 'fs';
import * as plugins from '../../ts/plugins.js';
import { fileURLToPath } from 'url';
/**
* Metadata describing one file discovered in the test corpus.
*
* Instances are produced by `CorpusLoader.loadCategory()`; the file contents
* themselves are not part of this record and are read separately via
* `CorpusLoader.loadFile(path)`.
*/
export interface CorpusFile {
// Path relative to the corpus base directory (category path + path inside the category)
path: string;
// Format label derived from the file name, e.g. 'CII', 'UBL', 'PDF' or 'Unknown'
format: string;
// Key of `CorpusLoader.CATEGORIES` the file was found under
category: string;
// File size in bytes as reported by the file system
size: number;
// False when the category path contains 'fail', true otherwise
valid: boolean;
}
/**
 * Static helper that discovers, loads and filters the invoice files of the
 * test corpus.
 *
 * Unless stated otherwise, every path accepted or returned by this class is
 * relative to the corpus base directory (`../assets/corpus` next to this file).
 */
export class CorpusLoader {
  // Use import.meta.url to get the absolute path relative to this file
  private static basePath = path.join(
    path.dirname(fileURLToPath(import.meta.url)),
    '..',
    'assets',
    'corpus'
  );

  /** Contents of files that were already read, keyed by absolute path. */
  private static cache = new Map<string, Buffer>();

  /** Files of this size (10 MiB) or larger are read but never cached. */
  private static readonly MAX_CACHED_FILE_SIZE = 10 * 1024 * 1024;

  /** Lower-case file extensions that count as corpus files. */
  private static readonly INVOICE_EXTENSIONS: readonly string[] = ['.xml', '.pdf', '.txt'];

  /**
   * Corpus categories with their paths.
   *
   * Paths are joined onto the corpus base directory, so entries starting with
   * `../` resolve to a sibling of that directory.
   */
  static readonly CATEGORIES = {
    CII_XMLRECHNUNG: 'XML-Rechnung/CII',
    UBL_XMLRECHNUNG: 'XML-Rechnung/UBL',
    ZUGFERD_V1_CORRECT: 'ZUGFeRDv1/correct',
    ZUGFERD_V1_FAIL: 'ZUGFeRDv1/fail',
    ZUGFERD_V2_CORRECT: 'ZUGFeRDv2/correct',
    ZUGFERD_V2_FAIL: 'ZUGFeRDv2/fail',
    PEPPOL: 'PEPPOL/Valid/Qvalia',
    FATTURAPA_OFFICIAL: 'fatturaPA/official',
    FATTURAPA_EIGOR: 'fatturaPA/eigor',
    EN16931_CII: '../eInvoicing-EN16931/cii/examples',
    EN16931_UBL_EXAMPLES: '../eInvoicing-EN16931/ubl/examples',
    EN16931_UBL_INVOICE: '../eInvoicing-EN16931/test/Invoice-unit-UBL',
    EN16931_UBL_CREDITNOTE: '../eInvoicing-EN16931/test/CreditNote-unit-UBL',
    EDIFACT_EXAMPLES: '../eInvoicing-EN16931/edifact/examples',
    OTHER: 'other',
    INCOMING: 'incoming',
    UNSTRUCTURED: 'unstructured'
  } as const;

  /**
   * Load a single corpus file.
   *
   * @param filePath Path relative to the corpus base directory.
   * @returns The file contents. For cached files the same Buffer instance is
   *          returned on every call, so callers should not mutate it.
   * @throws Error if the file cannot be read.
   */
  static async loadFile(filePath: string): Promise<Buffer> {
    const fullPath = path.join(this.basePath, filePath);

    // Check cache first
    const cached = this.cache.get(fullPath);
    if (cached !== undefined) {
      return cached;
    }

    let buffer: Buffer;
    try {
      buffer = await fs.readFile(fullPath);
    } catch (error) {
      throw new Error(`Failed to load corpus file ${filePath}: ${this.describeError(error)}`);
    }

    // Only small files are kept in memory
    if (buffer.length < this.MAX_CACHED_FILE_SIZE) {
      this.cache.set(fullPath, buffer);
    }
    return buffer;
  }

  /**
   * Load all files from a category (recursively).
   *
   * This is deliberately best-effort: if the category directory is missing or
   * unreadable a warning is logged and an empty list is returned.
   *
   * @param category Key of {@link CorpusLoader.CATEGORIES}.
   * @returns Metadata for every file below the category directory whose
   *          extension is one of `.xml`, `.pdf` or `.txt`.
   */
  static async loadCategory(category: keyof typeof CorpusLoader.CATEGORIES): Promise<CorpusFile[]> {
    const categoryPath = this.CATEGORIES[category];
    const fullPath = path.join(this.basePath, categoryPath);

    try {
      const files: CorpusFile[] = [];

      // Recursive function to scan directories
      const scanDirectory = async (dirPath: string, relativePath: string = ''): Promise<void> => {
        const entries = await fs.readdir(dirPath, { withFileTypes: true });

        for (const entry of entries) {
          const entryPath = path.join(dirPath, entry.name);
          const relativeFilePath = path.join(relativePath, entry.name);

          if (entry.isDirectory()) {
            // Recursively scan subdirectories
            await scanDirectory(entryPath, relativeFilePath);
          } else if (entry.isFile() && this.isInvoiceFile(entry.name)) {
            const stat = await fs.stat(entryPath);
            const fullRelativePath = path.join(categoryPath, relativeFilePath);

            files.push({
              path: fullRelativePath,
              format: this.detectFormatFromPath(fullRelativePath),
              category: category,
              size: stat.size,
              // Validity is decided by the category path alone ('.../fail')
              valid: !categoryPath.includes('fail')
            });
          }
        }
      };

      await scanDirectory(fullPath);
      return files;
    } catch (error) {
      console.warn(`Failed to load category ${category}: ${this.describeError(error)}`);
      return [];
    }
  }

  /**
   * Load files matching a pattern.
   *
   * Supported wildcards: `*` matches any run of characters except `/`, and
   * `**` matches any run of characters including `/`. Dots and slashes are
   * matched literally. The pattern is not anchored, so it may match any
   * portion of a file's relative path. Other regular-expression
   * metacharacters are passed through unchanged.
   *
   * @param pattern  Glob-like pattern tested against `CorpusFile.path`.
   * @param category Restrict the search to one category; all categories are
   *                 searched when omitted.
   */
  static async loadPattern(pattern: string, category?: keyof typeof CorpusLoader.CATEGORIES): Promise<CorpusFile[]> {
    const files: CorpusFile[] = [];
    const categoriesToSearch = category
      ? [category]
      : Object.keys(this.CATEGORIES) as Array<keyof typeof CorpusLoader.CATEGORIES>;

    // Compile once; the matcher does not depend on the file being tested
    const matches = this.compilePattern(pattern);

    for (const cat of categoriesToSearch) {
      const categoryFiles = await this.loadCategory(cat);
      files.push(...categoryFiles.filter(file => matches(file.path)));
    }
    return files;
  }

  /**
   * Get corpus statistics aggregated over every category.
   *
   * @returns File count and total size in bytes, counts per format label and
   *          per category key, and the number of valid / invalid files.
   */
  static async getStatistics(): Promise<{
    totalFiles: number;
    totalSize: number;
    byFormat: Record<string, number>;
    byCategory: Record<string, number>;
    validFiles: number;
    invalidFiles: number;
  }> {
    const stats = {
      totalFiles: 0,
      totalSize: 0,
      byFormat: {} as Record<string, number>,
      byCategory: {} as Record<string, number>,
      validFiles: 0,
      invalidFiles: 0
    };

    for (const category of Object.keys(this.CATEGORIES) as Array<keyof typeof CorpusLoader.CATEGORIES>) {
      const files = await this.loadCategory(category);
      stats.totalFiles += files.length;
      stats.byCategory[category] = files.length;

      for (const file of files) {
        stats.totalSize += file.size;
        stats.byFormat[file.format] = (stats.byFormat[file.format] ?? 0) + 1;
        if (file.valid) {
          stats.validFiles++;
        } else {
          stats.invalidFiles++;
        }
      }
    }
    return stats;
  }

  /**
   * Clear the file cache
   */
  static clearCache(): void {
    this.cache.clear();
  }

  /**
   * Check if a file is an invoice file (by extension, case-insensitive).
   */
  private static isInvoiceFile(filename: string): boolean {
    const lowerName = filename.toLowerCase();
    return this.INVOICE_EXTENSIONS.some(ext => lowerName.endsWith(ext));
  }

  /**
   * Detect format from file path.
   *
   * Only the lower-cased base name is inspected; the first matching marker
   * wins, so the order of the checks below is significant.
   */
  private static detectFormatFromPath(filePath: string): string {
    const filename = path.basename(filePath).toLowerCase();
    if (filename.includes('.cii.')) return 'CII';
    if (filename.includes('.ubl.')) return 'UBL';
    if (filename.includes('zugferd')) return 'ZUGFeRD';
    if (filename.includes('factur')) return 'Factur-X';
    if (filename.includes('xrechnung')) return 'XRechnung';
    if (filename.includes('fattura')) return 'FatturaPA';
    if (filename.includes('peppol')) return 'PEPPOL';
    if (filename.endsWith('.pdf')) return 'PDF';
    return 'Unknown';
  }

  /**
   * Get the absolute paths of all files in a category.
   *
   * Unlike {@link CorpusLoader.loadCategory}, which returns metadata records
   * with corpus-relative paths, this returns plain path strings already joined
   * onto the corpus base directory.
   */
  static async getFiles(category: keyof typeof CorpusLoader.CATEGORIES): Promise<string[]> {
    const files = await this.loadCategory(category);
    return files.map(f => path.join(this.basePath, f.path));
  }

  /**
   * Create a test dataset from corpus files.
   *
   * @param options.formats    Keep only files whose format label is listed.
   * @param options.categories Categories to load; all categories when omitted.
   * @param options.maxFiles   Upper bound on the result size. When the bound is
   *                           exceeded a random subset is returned. A falsy
   *                           value (including 0) means "no limit".
   * @param options.validOnly  Drop files flagged as invalid.
   */
  static async createTestDataset(options: {
    formats?: string[];
    categories?: Array<keyof typeof CorpusLoader.CATEGORIES>;
    maxFiles?: number;
    validOnly?: boolean;
  } = {}): Promise<CorpusFile[]> {
    let files: CorpusFile[] = [];
    const categoriesToLoad = options.categories
      || Object.keys(this.CATEGORIES) as Array<keyof typeof CorpusLoader.CATEGORIES>;

    for (const category of categoriesToLoad) {
      const categoryFiles = await this.loadCategory(category);
      files.push(...categoryFiles);
    }

    // Filter by format if specified
    const formats = options.formats;
    if (formats && formats.length > 0) {
      files = files.filter(f => formats.includes(f.format));
    }

    // Filter by validity if specified
    if (options.validOnly) {
      files = files.filter(f => f.valid);
    }

    // Limit number of files if specified
    if (options.maxFiles && files.length > options.maxFiles) {
      // Shuffle and take first N files for variety
      files = this.shuffled(files).slice(0, options.maxFiles);
    }
    return files;
  }

  /**
   * Turn a glob-like pattern into a predicate over relative file paths.
   *
   * Dots and slashes are escaped BEFORE the wildcards are expanded; doing it
   * the other way round would also escape the dot of the `.*` that `**`
   * expands to, reducing `**` to "zero or more literal dots".
   */
  private static compilePattern(pattern: string): (filePath: string) => boolean {
    const regexSource = pattern
      .replace(/\./g, '\\.') // Escape dots
      .replace(/\//g, '\\/') // Escape forward slashes
      .replace(/\*\*|\*/g, stars => (stars === '**' ? '.*' : '[^/]*'));

    try {
      const regex = new RegExp(regexSource);
      return filePath => regex.test(filePath);
    } catch {
      // If the pattern is not a valid regex, fall back to a simple includes match
      const needle = pattern.replace(/\*/g, '');
      return filePath => filePath.includes(needle);
    }
  }

  /** Return a uniformly shuffled copy of `items` (Fisher–Yates); the input is not modified. */
  private static shuffled<T>(items: readonly T[]): T[] {
    const result = [...items];
    for (let i = result.length - 1; i > 0; i--) {
      const j = Math.floor(Math.random() * (i + 1));
      const held = result[i] as T;
      result[i] = result[j] as T;
      result[j] = held;
    }
    return result;
  }

  /** Extract a printable message from a caught value of unknown type. */
  private static describeError(error: unknown): string {
    return error instanceof Error ? error.message : String(error);
  }
}