import * as path from 'path';
import { promises as fs } from 'fs';
import * as plugins from '../../ts/plugins.js';

/**
 * Corpus loader for managing test invoice files
 */

/** Metadata describing one invoice file found in the test corpus. */
export interface CorpusFile {
  /** File path relative to the corpus base directory. */
  path: string;
  /** Format label derived from the file name (see `detectFormatFromPath`). */
  format: string;
  /** Key of the `CorpusLoader.CATEGORIES` entry the file was listed under. */
  category: string;
  /** File size in bytes as reported by `fs.stat`. */
  size: number;
  /** `false` when the category directory path contains `fail`, `true` otherwise. */
  valid: boolean;
}

/** Key of one of the known corpus categories. */
export type CorpusCategory = keyof typeof CorpusLoader.CATEGORIES;

/**
 * Static helper for locating, listing and reading invoice files of the test
 * corpus, which lives under `test/assets/corpus` relative to the working
 * directory the process was started in.
 */
export class CorpusLoader {
  private static basePath = path.join(process.cwd(), 'test/assets/corpus');

  /** File contents keyed by absolute path; only files below 10MB are kept. */
  private static cache = new Map<string, Buffer>();

  /**
   * Corpus categories with their paths
   */
  static readonly CATEGORIES = {
    CII_XMLRECHNUNG: 'XML-Rechnung/CII',
    UBL_XMLRECHNUNG: 'XML-Rechnung/UBL',
    ZUGFERD_V1_CORRECT: 'ZUGFeRDv1/correct',
    ZUGFERD_V1_FAIL: 'ZUGFeRDv1/fail',
    ZUGFERD_V2_CORRECT: 'ZUGFeRDv2/correct',
    ZUGFERD_V2_FAIL: 'ZUGFeRDv2/fail',
    PEPPOL: 'PEPPOL/Valid/Qvalia',
    FATTURAPA_OFFICIAL: 'fatturaPA/official',
    FATTURAPA_EIGOR: 'fatturaPA/eigor',
    EN16931_CII: 'eInvoicing-EN16931/cii/examples',
    EN16931_UBL_EXAMPLES: 'eInvoicing-EN16931/ubl/examples',
    EN16931_UBL_INVOICE: 'eInvoicing-EN16931/test/Invoice-unit-UBL',
    EN16931_UBL_CREDITNOTE: 'eInvoicing-EN16931/test/CreditNote-unit-UBL',
    EDIFACT_EXAMPLES: 'eInvoicing-EN16931/edifact/examples',
    OTHER: 'other',
    INCOMING: 'incoming',
    UNSTRUCTURED: 'unstructured'
  } as const;

  /**
   * Load a single corpus file
   *
   * @param filePath Path relative to the corpus base directory.
   * @returns The file contents. Files below 10MB are served from an in-memory
   *   cache on repeated calls, so the same `Buffer` instance is returned.
   * @throws Error with the message `Failed to load corpus file <filePath>: <reason>`
   *   when the file cannot be read.
   */
  static async loadFile(filePath: string): Promise<Buffer> {
    const fullPath = path.join(this.basePath, filePath);

    // Check cache first
    const cached = this.cache.get(fullPath);
    if (cached !== undefined) {
      return cached;
    }

    try {
      const buffer = await fs.readFile(fullPath);

      // Cache files under 10MB
      if (buffer.length < 10 * 1024 * 1024) {
        this.cache.set(fullPath, buffer);
      }

      return buffer;
    } catch (error: unknown) {
      throw new Error(`Failed to load corpus file ${filePath}: ${this.describeError(error)}`);
    }
  }

  /**
   * Load all files from a category
   *
   * Only direct children of the category directory that are regular files
   * with an invoice extension are listed; sub-directories are not traversed.
   *
   * @param category Key of the category to list.
   * @returns Metadata for every invoice file, in directory-listing order.
   *   When the directory cannot be read (or a file cannot be stat'ed) a
   *   warning is logged and an empty array is returned instead of throwing.
   */
  static async loadCategory(category: CorpusCategory): Promise<CorpusFile[]> {
    const categoryPath = this.CATEGORIES[category];
    const fullPath = path.join(this.basePath, categoryPath);

    try {
      const entries = await fs.readdir(fullPath, { withFileTypes: true });
      const invoiceEntries = entries.filter(
        (entry) => entry.isFile() && this.isInvoiceFile(entry.name)
      );

      // The stat calls are independent of each other, so run them
      // concurrently; Promise.all keeps the directory-listing order.
      // `return await` keeps a rejection inside this try/catch.
      return await Promise.all(
        invoiceEntries.map(async (entry): Promise<CorpusFile> => {
          const filePath = path.join(categoryPath, entry.name);
          const stat = await fs.stat(path.join(this.basePath, filePath));

          return {
            path: filePath,
            format: this.detectFormatFromPath(filePath),
            category: category,
            size: stat.size,
            valid: !categoryPath.includes('fail')
          };
        })
      );
    } catch (error: unknown) {
      console.warn(`Failed to load category ${category}: ${this.describeError(error)}`);
      return [];
    }
  }

  /**
   * Load files matching a pattern
   *
   * The pattern is a simple glob: every `*` (or run of `*`) matches any
   * sequence of characters and all other characters match literally. It is
   * tested against the file's base name only and is not anchored, so
   * `invoice` matches any file whose name contains `invoice`. Because only
   * the base name is tested, a pattern containing a directory separator
   * never matches.
   *
   * @param pattern Glob-style pattern, e.g. `*.xml`.
   * @param category Restrict the search to one category; all categories are
   *   searched when omitted.
   * @returns Metadata of the matching files, grouped by category in
   *   `CATEGORIES` declaration order.
   */
  static async loadPattern(pattern: string, category?: CorpusCategory): Promise<CorpusFile[]> {
    const files: CorpusFile[] = [];
    const categoriesToSearch = category ? [category] : this.allCategories();

    // Compile once; the expression is the same for every file.
    const matcher = this.globToRegExp(pattern);

    for (const cat of categoriesToSearch) {
      const categoryFiles = await this.loadCategory(cat);
      files.push(...categoryFiles.filter((file) => matcher.test(path.basename(file.path))));
    }

    return files;
  }

  /**
   * Get corpus statistics
   *
   * @returns Totals over all categories: number of files, summed size in
   *   bytes, file counts per detected format and per category key, and the
   *   number of files flagged valid / invalid.
   */
  static async getStatistics(): Promise<{
    totalFiles: number;
    totalSize: number;
    byFormat: Record<string, number>;
    byCategory: Record<string, number>;
    validFiles: number;
    invalidFiles: number;
  }> {
    const stats = {
      totalFiles: 0,
      totalSize: 0,
      byFormat: {} as Record<string, number>,
      byCategory: {} as Record<string, number>,
      validFiles: 0,
      invalidFiles: 0
    };

    for (const category of this.allCategories()) {
      const files = await this.loadCategory(category);

      stats.totalFiles += files.length;
      stats.byCategory[category] = files.length;

      for (const file of files) {
        stats.totalSize += file.size;
        stats.byFormat[file.format] = (stats.byFormat[file.format] ?? 0) + 1;

        if (file.valid) {
          stats.validFiles++;
        } else {
          stats.invalidFiles++;
        }
      }
    }

    return stats;
  }

  /**
   * Clear the file cache
   */
  static clearCache(): void {
    this.cache.clear();
  }

  /**
   * Check if a file is an invoice file
   *
   * @returns `true` when the name ends (case-insensitively) in `.xml`,
   *   `.pdf` or `.txt`.
   */
  private static isInvoiceFile(filename: string): boolean {
    const lowerName = filename.toLowerCase();
    const extensions = ['.xml', '.pdf', '.txt'];
    return extensions.some((ext) => lowerName.endsWith(ext));
  }

  /**
   * Detect format from file path
   *
   * Only the lower-cased base name is inspected. The checks run in order and
   * the first hit wins, so e.g. a `zugferd*.pdf` file is reported as
   * `ZUGFeRD`, not `PDF`.
   *
   * @returns One of `CII`, `UBL`, `ZUGFeRD`, `Factur-X`, `XRechnung`,
   *   `FatturaPA`, `PEPPOL`, `PDF` or `Unknown`.
   */
  private static detectFormatFromPath(filePath: string): string {
    const filename = path.basename(filePath).toLowerCase();

    if (filename.includes('.cii.')) return 'CII';
    if (filename.includes('.ubl.')) return 'UBL';
    if (filename.includes('zugferd')) return 'ZUGFeRD';
    if (filename.includes('factur')) return 'Factur-X';
    if (filename.includes('xrechnung')) return 'XRechnung';
    if (filename.includes('fattura')) return 'FatturaPA';
    if (filename.includes('peppol')) return 'PEPPOL';
    if (filename.endsWith('.pdf')) return 'PDF';

    return 'Unknown';
  }

  /**
   * Get the absolute paths of all invoice files in a category.
   *
   * Unlike `loadCategory`, which returns metadata with corpus-relative
   * paths, this returns plain path strings joined onto the corpus base
   * directory.
   *
   * @param category Key of the category to list.
   * @returns Absolute file paths; empty when the category cannot be read.
   */
  static async getFiles(category: CorpusCategory): Promise<string[]> {
    const files = await this.loadCategory(category);
    return files.map((f) => path.join(this.basePath, f.path));
  }

  /**
   * Create a test dataset from corpus files
   *
   * @param options.formats Keep only files whose detected format is listed;
   *   ignored when missing or empty.
   * @param options.categories Categories to load; defaults to all.
   * @param options.maxFiles Upper bound on the number of returned files.
   *   When more files are available, a uniformly random subset of this size
   *   is returned. Missing or non-positive values mean "no limit".
   * @param options.validOnly Keep only files flagged as valid.
   * @returns The selected files. The order is random when the `maxFiles`
   *   limit was applied, category order otherwise.
   */
  static async createTestDataset(options: {
    formats?: string[];
    categories?: Array<CorpusCategory>;
    maxFiles?: number;
    validOnly?: boolean;
  } = {}): Promise<CorpusFile[]> {
    const { formats, maxFiles, validOnly } = options;
    let files: CorpusFile[] = [];

    const categoriesToLoad = options.categories || this.allCategories();

    for (const category of categoriesToLoad) {
      const categoryFiles = await this.loadCategory(category);
      files.push(...categoryFiles);
    }

    // Filter by format if specified
    if (formats && formats.length > 0) {
      files = files.filter((f) => formats.includes(f.format));
    }

    // Filter by validity if specified
    if (validOnly) {
      files = files.filter((f) => f.valid);
    }

    // Limit number of files if specified
    if (maxFiles !== undefined && maxFiles > 0 && files.length > maxFiles) {
      // Pick a random subset for variety
      files = this.sample(files, maxFiles);
    }

    return files;
  }

  /** All category keys in declaration order. */
  private static allCategories(): CorpusCategory[] {
    return Object.keys(this.CATEGORIES) as CorpusCategory[];
  }

  /** Human-readable text for a caught value, which need not be an `Error`. */
  private static describeError(error: unknown): string {
    return error instanceof Error ? error.message : String(error);
  }

  /**
   * Turn a simple glob into an unanchored regular expression: each run of
   * `*` becomes `.*`, every other character is matched literally.
   */
  private static globToRegExp(pattern: string): RegExp {
    const source = pattern
      .split(/\*+/)
      .map((literal) => literal.replace(/[.+?^${}()|[\]\\]/g, '\\$&'))
      .join('.*');
    return new RegExp(source);
  }

  /**
   * Draw up to `count` items uniformly at random without replacement.
   * The input array is left untouched.
   */
  private static sample<T>(items: readonly T[], count: number): T[] {
    const pool = [...items];
    const picked: T[] = [];

    while (picked.length < count && pool.length > 0) {
      const index = Math.floor(Math.random() * pool.length);
      picked.push(...pool.splice(index, 1));
    }

    return picked;
  }
}