import * as path from 'path';
import { promises as fs } from 'fs';
import * as plugins from '../../ts/plugins.js';
import { fileURLToPath } from 'url';

/**
 * Metadata describing a single invoice file found in the test corpus.
 */
export interface CorpusFile {
  /** Path relative to the corpus base directory (category path + file path). */
  path: string;
  /** Format label guessed from the file name (see `detectFormatFromPath`). */
  format: string;
  /** Key of the `CorpusLoader.CATEGORIES` entry the file was found under. */
  category: string;
  /** File size in bytes as reported by `fs.stat`. */
  size: number;
  /** `false` when the category path contains the substring `fail`. */
  valid: boolean;
}

/** Union of the category keys declared in `CorpusLoader.CATEGORIES`. */
export type CorpusCategory = keyof typeof CorpusLoader.CATEGORIES;

/**
 * Corpus loader for managing test invoice files.
 *
 * All members are static; file contents are memoised in a process-wide
 * in-memory cache keyed by absolute path.
 */
export class CorpusLoader {
  // Use import.meta.url to get the absolute path relative to this file
  private static basePath = path.join(
    path.dirname(fileURLToPath(import.meta.url)),
    '..',
    'assets',
    'corpus'
  );

  /** Cache of file contents keyed by absolute file path. */
  private static cache = new Map<string, Buffer>();

  /** Files of this size (in bytes) or larger are never cached. */
  private static readonly MAX_CACHED_FILE_BYTES = 10 * 1024 * 1024;

  /** File extensions (lower case) that count as invoice files. */
  private static readonly INVOICE_EXTENSIONS: readonly string[] = ['.xml', '.pdf', '.txt'];

  /**
   * Ordered file-name markers used for format detection; the first marker
   * contained in the lower-cased file name wins.
   */
  private static readonly FORMAT_MARKERS: ReadonlyArray<readonly [string, string]> = [
    ['.cii.', 'CII'],
    ['.ubl.', 'UBL'],
    ['zugferd', 'ZUGFeRD'],
    ['factur', 'Factur-X'],
    ['xrechnung', 'XRechnung'],
    ['fattura', 'FatturaPA'],
    ['peppol', 'PEPPOL']
  ];

  /**
   * Corpus categories with their paths.
   *
   * Paths are joined onto the corpus base directory; entries starting with
   * `../` therefore resolve to sibling directories of the corpus folder.
   */
  static readonly CATEGORIES = {
    CII_XMLRECHNUNG: 'XML-Rechnung/CII',
    UBL_XMLRECHNUNG: 'XML-Rechnung/UBL',
    ZUGFERD_V1_CORRECT: 'ZUGFeRDv1/correct',
    ZUGFERD_V1_FAIL: 'ZUGFeRDv1/fail',
    ZUGFERD_V2_CORRECT: 'ZUGFeRDv2/correct',
    ZUGFERD_V2_FAIL: 'ZUGFeRDv2/fail',
    PEPPOL: 'PEPPOL/Valid/Qvalia',
    FATTURAPA_OFFICIAL: 'fatturaPA/official',
    FATTURAPA_EIGOR: 'fatturaPA/eigor',
    EN16931_CII: '../eInvoicing-EN16931/cii/examples',
    EN16931_UBL_EXAMPLES: '../eInvoicing-EN16931/ubl/examples',
    EN16931_UBL_INVOICE: '../eInvoicing-EN16931/test/Invoice-unit-UBL',
    EN16931_UBL_CREDITNOTE: '../eInvoicing-EN16931/test/CreditNote-unit-UBL',
    EDIFACT_EXAMPLES: '../eInvoicing-EN16931/edifact/examples',
    OTHER: 'other',
    INCOMING: 'incoming',
    UNSTRUCTURED: 'unstructured'
  } as const;

  /**
   * Load a single corpus file.
   *
   * @param filePath - Path relative to the corpus base directory.
   * @returns The file contents; files under 10MB are served from / stored in the cache.
   * @throws Error when the file cannot be read.
   */
  static async loadFile(filePath: string): Promise<Buffer> {
    const fullPath = path.join(this.basePath, filePath);

    // Check cache first
    const cached = this.cache.get(fullPath);
    if (cached !== undefined) {
      return cached;
    }

    try {
      const buffer = await fs.readFile(fullPath);

      // Cache files under 10MB
      if (buffer.length < this.MAX_CACHED_FILE_BYTES) {
        this.cache.set(fullPath, buffer);
      }

      return buffer;
    } catch (error: unknown) {
      throw new Error(`Failed to load corpus file ${filePath}: ${this.describeError(error)}`);
    }
  }

  /**
   * Load all files from a category (recursively).
   *
   * Best-effort: if the category directory cannot be scanned, a warning is
   * logged and an empty list is returned instead of throwing.
   *
   * @param category - Key of the category to scan.
   * @returns Metadata for every invoice file found below the category directory.
   */
  static async loadCategory(category: CorpusCategory): Promise<CorpusFile[]> {
    const categoryPath = this.CATEGORIES[category];
    const fullPath = path.join(this.basePath, categoryPath);

    try {
      const files: CorpusFile[] = [];

      // Recursive function to scan directories
      const scanDirectory = async (dirPath: string, relativePath = ''): Promise<void> => {
        const entries = await fs.readdir(dirPath, { withFileTypes: true });

        for (const entry of entries) {
          const entryPath = path.join(dirPath, entry.name);
          const relativeFilePath = path.join(relativePath, entry.name);

          if (entry.isDirectory()) {
            // Recursively scan subdirectories
            await scanDirectory(entryPath, relativeFilePath);
          } else if (entry.isFile() && this.isInvoiceFile(entry.name)) {
            const stat = await fs.stat(entryPath);
            const fullRelativePath = path.join(categoryPath, relativeFilePath);

            files.push({
              path: fullRelativePath,
              format: this.detectFormatFromPath(fullRelativePath),
              category,
              size: stat.size,
              // Categories whose path contains "fail" hold intentionally invalid files.
              valid: !categoryPath.includes('fail')
            });
          }
        }
      };

      await scanDirectory(fullPath);
      return files;
    } catch (error: unknown) {
      console.warn(`Failed to load category ${category}: ${this.describeError(error)}`);
      return [];
    }
  }

  /**
   * Load files matching a glob-like pattern.
   *
   * Supported wildcards: `*` matches any run of characters except `/`,
   * `**` matches any run of characters including `/`. Every other character
   * is matched literally. The pattern is not anchored, so it may match
   * anywhere inside the corpus-relative file path.
   *
   * @param pattern - Glob-like pattern tested against each file's relative path.
   * @param category - Restrict the search to one category; all categories when omitted.
   * @returns Metadata of all matching files.
   */
  static async loadPattern(pattern: string, category?: CorpusCategory): Promise<CorpusFile[]> {
    const categoriesToSearch: CorpusCategory[] = category
      ? [category]
      : (Object.keys(this.CATEGORIES) as CorpusCategory[]);

    // Compile once; the pattern is identical for every file.
    const matcher = this.globToRegExp(pattern);
    const files: CorpusFile[] = [];

    for (const cat of categoriesToSearch) {
      const categoryFiles = await this.loadCategory(cat);

      for (const file of categoryFiles) {
        // Patterns use "/" separators; normalise platform separators before matching.
        const comparablePath = file.path.split(path.sep).join('/');
        if (matcher.test(comparablePath)) {
          files.push(file);
        }
      }
    }

    return files;
  }

  /**
   * Get corpus statistics.
   *
   * @returns File counts and total size, broken down by detected format,
   *          by category key and by validity.
   */
  static async getStatistics(): Promise<{
    totalFiles: number;
    totalSize: number;
    byFormat: Record<string, number>;
    byCategory: Record<string, number>;
    validFiles: number;
    invalidFiles: number;
  }> {
    const stats = {
      totalFiles: 0,
      totalSize: 0,
      byFormat: {} as Record<string, number>,
      byCategory: {} as Record<string, number>,
      validFiles: 0,
      invalidFiles: 0
    };

    for (const category of Object.keys(this.CATEGORIES) as CorpusCategory[]) {
      const files = await this.loadCategory(category);

      stats.totalFiles += files.length;
      stats.byCategory[category] = files.length;

      for (const file of files) {
        stats.totalSize += file.size;
        stats.byFormat[file.format] = (stats.byFormat[file.format] ?? 0) + 1;

        if (file.valid) {
          stats.validFiles++;
        } else {
          stats.invalidFiles++;
        }
      }
    }

    return stats;
  }

  /**
   * Clear the file cache.
   */
  static clearCache(): void {
    this.cache.clear();
  }

  /**
   * Check if a file is an invoice file (by extension, case-insensitive).
   */
  private static isInvoiceFile(filename: string): boolean {
    const lowered = filename.toLowerCase();
    return this.INVOICE_EXTENSIONS.some(ext => lowered.endsWith(ext));
  }

  /**
   * Detect format from file path.
   *
   * Only the base name is inspected. Markers are checked in declaration
   * order; a `.pdf` extension is the last resort before `Unknown`.
   */
  private static detectFormatFromPath(filePath: string): string {
    const filename = path.basename(filePath).toLowerCase();

    for (const [marker, format] of this.FORMAT_MARKERS) {
      if (filename.includes(marker)) {
        return format;
      }
    }

    return filename.endsWith('.pdf') ? 'PDF' : 'Unknown';
  }

  /**
   * Convert a glob-like pattern into an unanchored regular expression.
   *
   * Literal text is escaped first and wildcards are substituted afterwards,
   * so the regex fragments produced for `*` and `**` are never re-escaped.
   */
  private static globToRegExp(pattern: string): RegExp {
    const escapeLiteral = (text: string): string => text.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');

    const source = pattern
      .split('**')
      .map(segment => segment.split('*').map(escapeLiteral).join('[^/]*'))
      .join('.*');

    return new RegExp(source);
  }

  /**
   * Return a uniformly shuffled copy of `items` (Fisher-Yates); the input is not modified.
   */
  private static shuffle<T>(items: readonly T[]): T[] {
    const shuffled = [...items];

    for (let i = shuffled.length - 1; i > 0; i--) {
      const j = Math.floor(Math.random() * (i + 1));
      const picked = shuffled[j] as T;
      shuffled[j] = shuffled[i] as T;
      shuffled[i] = picked;
    }

    return shuffled;
  }

  /**
   * Extract a printable message from a caught value of unknown type.
   */
  private static describeError(error: unknown): string {
    return error instanceof Error ? error.message : String(error);
  }

  /**
   * Get the absolute paths of all invoice files in a category.
   *
   * Uses `loadCategory` for discovery, then joins each relative path onto
   * the corpus base directory.
   */
  static async getFiles(category: CorpusCategory): Promise<string[]> {
    const files = await this.loadCategory(category);
    return files.map(f => path.join(this.basePath, f.path));
  }

  /**
   * Create a test dataset from corpus files.
   *
   * @param options.formats - Keep only files whose detected format is listed (ignored when empty).
   * @param options.categories - Categories to load; all categories when omitted.
   * @param options.maxFiles - When set to a non-zero value smaller than the number of
   *        remaining files, a random sample of that size is returned.
   * @param options.validOnly - Keep only files flagged as valid.
   * @returns The selected file metadata.
   */
  static async createTestDataset(
    options: {
      formats?: string[];
      categories?: Array<CorpusCategory>;
      maxFiles?: number;
      validOnly?: boolean;
    } = {}
  ): Promise<CorpusFile[]> {
    const { formats, maxFiles, validOnly } = options;
    const categoriesToLoad =
      options.categories ?? (Object.keys(this.CATEGORIES) as CorpusCategory[]);

    let files: CorpusFile[] = [];

    for (const category of categoriesToLoad) {
      const categoryFiles = await this.loadCategory(category);
      for (const file of categoryFiles) {
        files.push(file);
      }
    }

    // Filter by format if specified
    if (formats && formats.length > 0) {
      files = files.filter(f => formats.includes(f.format));
    }

    // Filter by validity if specified
    if (validOnly) {
      files = files.filter(f => f.valid);
    }

    // Limit number of files if specified
    if (maxFiles && files.length > maxFiles) {
      // Shuffle and take first N files for variety
      files = this.shuffle(files).slice(0, maxFiles);
    }

    return files;
  }
}