- Update test-utils import path and refactor to helpers/utils.ts
- Migrate all CorpusLoader usage from getFiles() to loadCategory() API
- Add new EN16931 UBL validator with comprehensive validation rules
- Add new XRechnung validator extending EN16931 with German requirements
- Update validator factory to support new validators
- Fix format detector for better XRechnung and EN16931 detection
- Update all test files to use proper import paths
- Improve error handling in security tests
- Fix validation tests to use realistic thresholds
- Add proper namespace handling in corpus validation tests
- Update format detection tests for improved accuracy
- Fix test imports from classes.xinvoice.ts to index.js

All test suites are now properly aligned with the updated APIs and realistic performance expectations.
270 lines
8.3 KiB
TypeScript
270 lines
8.3 KiB
TypeScript
import * as path from 'path';
|
|
import { promises as fs } from 'fs';
|
|
import * as plugins from '../../ts/plugins.js';
|
|
import { fileURLToPath } from 'url';
|
|
|
|
/**
 * Corpus loader for managing test invoice files
 */

/**
 * Metadata describing one file discovered in the test corpus, as produced by
 * `CorpusLoader.loadCategory`.
 */
export interface CorpusFile {
  /** Path of the file relative to the corpus base directory (category path + path inside the category). */
  path: string;
  /** Format label guessed from the file name (e.g. 'CII', 'UBL', 'PDF', 'Unknown'). */
  format: string;
  /** Key of the corpus category the file was found in. */
  category: string;
  /** File size in bytes, as reported by `fs.stat`. */
  size: number;
  /** `false` when the category's directory path contains 'fail', `true` otherwise. */
  valid: boolean;
}
|
|
|
|
/**
 * Static helper for locating and reading the invoice files of the test corpus.
 *
 * All paths handed to or returned by this class are relative to the corpus
 * base directory unless stated otherwise.
 */
export class CorpusLoader {
  /**
   * Absolute path of the corpus base directory: `../assets/corpus` relative to
   * the directory of this module (resolved through `import.meta.url`).
   */
  private static basePath = path.join(
    path.dirname(fileURLToPath(import.meta.url)),
    '..',
    'assets',
    'corpus'
  );

  /** Contents of already-read files, keyed by absolute path. */
  private static cache = new Map<string, Buffer>();

  /** Files of this many bytes or more are never put into the cache (10 MiB). */
  private static readonly MAX_CACHED_FILE_SIZE = 10 * 1024 * 1024;

  /** Lower-case file extensions that are treated as invoice files. */
  private static readonly INVOICE_EXTENSIONS: readonly string[] = ['.xml', '.pdf', '.txt'];

  /**
   * File-name fragments (lower-case) and the format label they indicate.
   * Checked in order; the first fragment contained in the file name wins.
   */
  private static readonly FORMAT_MARKERS: ReadonlyArray<readonly [string, string]> = [
    ['.cii.', 'CII'],
    ['.ubl.', 'UBL'],
    ['zugferd', 'ZUGFeRD'],
    ['factur', 'Factur-X'],
    ['xrechnung', 'XRechnung'],
    ['fattura', 'FatturaPA'],
    ['peppol', 'PEPPOL'],
  ];

  /**
   * Corpus categories with their paths (relative to the corpus base
   * directory). Entries starting with `../` resolve to sibling directories of
   * the corpus base directory.
   */
  static readonly CATEGORIES = {
    CII_XMLRECHNUNG: 'XML-Rechnung/CII',
    UBL_XMLRECHNUNG: 'XML-Rechnung/UBL',
    ZUGFERD_V1_CORRECT: 'ZUGFeRDv1/correct',
    ZUGFERD_V1_FAIL: 'ZUGFeRDv1/fail',
    ZUGFERD_V2_CORRECT: 'ZUGFeRDv2/correct',
    ZUGFERD_V2_FAIL: 'ZUGFeRDv2/fail',
    PEPPOL: 'PEPPOL/Valid/Qvalia',
    FATTURAPA_OFFICIAL: 'fatturaPA/official',
    FATTURAPA_EIGOR: 'fatturaPA/eigor',
    EN16931_CII: '../eInvoicing-EN16931/cii/examples',
    EN16931_UBL_EXAMPLES: '../eInvoicing-EN16931/ubl/examples',
    EN16931_UBL_INVOICE: '../eInvoicing-EN16931/test/Invoice-unit-UBL',
    EN16931_UBL_CREDITNOTE: '../eInvoicing-EN16931/test/CreditNote-unit-UBL',
    EDIFACT_EXAMPLES: '../eInvoicing-EN16931/edifact/examples',
    OTHER: 'other',
    INCOMING: 'incoming',
    UNSTRUCTURED: 'unstructured'
  } as const;

  /**
   * Load a single corpus file.
   *
   * Results are served from an in-memory cache when available; freshly read
   * files smaller than 10 MiB are added to it. The cached `Buffer` instance
   * itself is returned, so callers share it.
   *
   * @param filePath Path of the file relative to the corpus base directory.
   * @returns The file contents.
   * @throws Error with the message `Failed to load corpus file <filePath>: <reason>`
   *   when the file cannot be read.
   */
  static async loadFile(filePath: string): Promise<Buffer> {
    const fullPath = path.join(this.basePath, filePath);

    const cached = this.cache.get(fullPath);
    if (cached !== undefined) {
      return cached;
    }

    let buffer: Buffer;
    try {
      buffer = await fs.readFile(fullPath);
    } catch (error) {
      throw new Error(`Failed to load corpus file ${filePath}: ${this.describeError(error)}`);
    }

    if (buffer.length < this.MAX_CACHED_FILE_SIZE) {
      this.cache.set(fullPath, buffer);
    }
    return buffer;
  }

  /**
   * Load all files from a category (recursively).
   *
   * Only regular files with an invoice extension (.xml, .pdf, .txt) are
   * reported. File contents are not read; only metadata is collected.
   *
   * @param category Key of {@link CorpusLoader.CATEGORIES}.
   * @returns Metadata for every invoice file below the category directory. If
   *   any directory or file cannot be inspected, a warning is logged and an
   *   empty array is returned (partial results are discarded).
   */
  static async loadCategory(category: keyof typeof CorpusLoader.CATEGORIES): Promise<CorpusFile[]> {
    const categoryPath = this.CATEGORIES[category];
    const fullPath = path.join(this.basePath, categoryPath);
    // Validity is a property of the category directory, not of individual files.
    const valid = !categoryPath.includes('fail');
    const files: CorpusFile[] = [];

    // Depth-first walk; `relativePath` is the position below the category directory.
    const scanDirectory = async (dirPath: string, relativePath = ''): Promise<void> => {
      const entries = await fs.readdir(dirPath, { withFileTypes: true });

      for (const entry of entries) {
        const entryPath = path.join(dirPath, entry.name);
        const relativeFilePath = path.join(relativePath, entry.name);

        if (entry.isDirectory()) {
          await scanDirectory(entryPath, relativeFilePath);
          continue;
        }
        if (!entry.isFile() || !this.isInvoiceFile(entry.name)) {
          continue;
        }

        const stat = await fs.stat(entryPath);
        const fullRelativePath = path.join(categoryPath, relativeFilePath);
        files.push({
          path: fullRelativePath,
          format: this.detectFormatFromPath(fullRelativePath),
          category: category,
          size: stat.size,
          valid
        });
      }
    };

    try {
      await scanDirectory(fullPath);
      return files;
    } catch (error) {
      console.warn(`Failed to load category ${category}: ${this.describeError(error)}`);
      return [];
    }
  }

  /**
   * Load files matching a pattern.
   *
   * The pattern is a simple glob: `**` matches any characters (including `/`)
   * and `*` matches any characters except `/`. Dots and slashes are literal.
   * The match is not anchored, so the pattern may match any part of the
   * corpus-relative path. Path separators are normalised to `/` before
   * matching.
   *
   * @param pattern Glob pattern to test against each file's relative path.
   * @param category Restrict the search to one category; all categories are
   *   searched when omitted.
   * @returns Metadata of all matching files, grouped in category order.
   */
  static async loadPattern(pattern: string, category?: keyof typeof CorpusLoader.CATEGORIES): Promise<CorpusFile[]> {
    const categoriesToSearch = category
      ? [category]
      : (Object.keys(this.CATEGORIES) as Array<keyof typeof CorpusLoader.CATEGORIES>);
    // Compile the pattern once instead of once per candidate file.
    const matches = this.createPathMatcher(pattern);

    const files: CorpusFile[] = [];
    for (const cat of categoriesToSearch) {
      const categoryFiles = await this.loadCategory(cat);
      files.push(
        ...categoryFiles.filter(file => matches(file.path.split(path.sep).join('/')))
      );
    }
    return files;
  }

  /**
   * Get corpus statistics.
   *
   * @returns Totals over all categories: file count, accumulated size in
   *   bytes, counts per detected format and per category key, and the number
   *   of files flagged valid / invalid.
   */
  static async getStatistics(): Promise<{
    totalFiles: number;
    totalSize: number;
    byFormat: Record<string, number>;
    byCategory: Record<string, number>;
    validFiles: number;
    invalidFiles: number;
  }> {
    const stats = {
      totalFiles: 0,
      totalSize: 0,
      byFormat: {} as Record<string, number>,
      byCategory: {} as Record<string, number>,
      validFiles: 0,
      invalidFiles: 0
    };

    for (const category of Object.keys(this.CATEGORIES) as Array<keyof typeof CorpusLoader.CATEGORIES>) {
      const files = await this.loadCategory(category);

      stats.totalFiles += files.length;
      stats.byCategory[category] = files.length;

      for (const file of files) {
        stats.totalSize += file.size;
        stats.byFormat[file.format] = (stats.byFormat[file.format] ?? 0) + 1;

        if (file.valid) {
          stats.validFiles++;
        } else {
          stats.invalidFiles++;
        }
      }
    }

    return stats;
  }

  /**
   * Clear the file cache
   */
  static clearCache(): void {
    this.cache.clear();
  }

  /**
   * Check if a file is an invoice file, i.e. its name ends (case-insensitively)
   * with .xml, .pdf or .txt.
   */
  private static isInvoiceFile(filename: string): boolean {
    const lowered = filename.toLowerCase();
    return this.INVOICE_EXTENSIONS.some(ext => lowered.endsWith(ext));
  }

  /**
   * Detect format from file path.
   *
   * Only the lower-cased base name is inspected. Known name fragments are
   * checked first, then a `.pdf` extension; anything else is 'Unknown'.
   */
  private static detectFormatFromPath(filePath: string): string {
    const filename = path.basename(filePath).toLowerCase();

    for (const [marker, format] of this.FORMAT_MARKERS) {
      if (filename.includes(marker)) {
        return format;
      }
    }
    return filename.endsWith('.pdf') ? 'PDF' : 'Unknown';
  }

  /**
   * Get the absolute paths of all invoice files in a category.
   *
   * Unlike {@link CorpusLoader.loadCategory}, which returns metadata with
   * corpus-relative paths, this returns plain absolute path strings.
   */
  static async getFiles(category: keyof typeof CorpusLoader.CATEGORIES): Promise<string[]> {
    const files = await this.loadCategory(category);
    return files.map(f => path.join(this.basePath, f.path));
  }

  /**
   * Create a test dataset from corpus files.
   *
   * @param options.formats Keep only files whose detected format is listed
   *   (ignored when empty or omitted).
   * @param options.categories Categories to load; all categories when omitted.
   * @param options.maxFiles Upper bound on the number of returned files. When
   *   more files are available, a random sample of this size is returned.
   *   `0` or omitted means no limit.
   * @param options.validOnly Keep only files flagged as valid.
   * @returns The selected file metadata.
   */
  static async createTestDataset(options: {
    formats?: string[];
    categories?: Array<keyof typeof CorpusLoader.CATEGORIES>;
    maxFiles?: number;
    validOnly?: boolean;
  } = {}): Promise<CorpusFile[]> {
    const { formats, maxFiles, validOnly } = options;
    const categoriesToLoad =
      options.categories ?? (Object.keys(this.CATEGORIES) as Array<keyof typeof CorpusLoader.CATEGORIES>);

    let files: CorpusFile[] = [];
    for (const category of categoriesToLoad) {
      files.push(...(await this.loadCategory(category)));
    }

    if (formats && formats.length > 0) {
      files = files.filter(f => formats.includes(f.format));
    }

    if (validOnly) {
      files = files.filter(f => f.valid);
    }

    if (maxFiles && files.length > maxFiles) {
      // Shuffle uniformly and take the first N files for variety.
      files = this.shuffled(files).slice(0, maxFiles);
    }

    return files;
  }

  /**
   * Build a predicate that tests '/'-separated paths against a glob pattern.
   *
   * Dots and slashes are escaped BEFORE the wildcards are expanded, so the
   * `.*` generated for `**` is not itself escaped. Other regular-expression
   * metacharacters are passed through unchanged (kept for backward
   * compatibility); if the result is not a valid regular expression, the
   * predicate falls back to a substring test using the pattern with all `*`
   * removed.
   */
  private static createPathMatcher(pattern: string): (filePath: string) => boolean {
    const source = pattern
      .replace(/[./]/g, '\\$&')
      .replace(/\*\*|\*/g, wildcard => (wildcard === '**' ? '.*' : '[^/]*'));

    try {
      const regex = new RegExp(source);
      return filePath => regex.test(filePath);
    } catch {
      const needle = pattern.replace(/\*/g, '');
      return filePath => filePath.includes(needle);
    }
  }

  /** Return a uniformly shuffled copy of `items` (Fisher–Yates); the input is not modified. */
  private static shuffled<T>(items: readonly T[]): T[] {
    const result = [...items];
    for (let i = result.length - 1; i > 0; i--) {
      const j = Math.floor(Math.random() * (i + 1));
      const swap = result[i] as T;
      result[i] = result[j] as T;
      result[j] = swap;
    }
    return result;
  }

  /** Extract a printable message from a caught value of unknown type. */
  private static describeError(error: unknown): string {
    return error instanceof Error ? error.message : String(error);
  }
}