2025-05-25 19:45:37 +00:00
|
|
|
import * as path from 'path';
|
|
|
|
import { promises as fs } from 'fs';
|
|
|
|
import * as plugins from '../../ts/plugins.js';
|
2025-05-30 18:18:42 +00:00
|
|
|
import { fileURLToPath } from 'url';
|
2025-05-25 19:45:37 +00:00
|
|
|
|
|
|
|
/**
 * Corpus loader for managing test invoice files.
 */
|
|
|
|
|
|
|
|
/**
 * Metadata describing a single file discovered in the corpus.
 */
export interface CorpusFile {
  /** File path relative to the corpus base directory (category path + path inside the category). */
  path: string;
  /** Format label derived from the file name, e.g. 'CII', 'UBL', 'PDF' or 'Unknown'. */
  format: string;
  /** Key of the corpus category the file was found in. */
  category: string;
  /** File size in bytes as reported by the file system. */
  size: number;
  /** False when the category's directory path contains 'fail', true otherwise. */
  valid: boolean;
}
|
|
|
|
|
|
|
|
/**
 * Static helper for discovering, filtering and reading invoice files stored
 * in the test corpus directory.
 */
export class CorpusLoader {
  // Use import.meta.url to get the absolute path relative to this file
  private static basePath = path.join(
    path.dirname(fileURLToPath(import.meta.url)),
    '..',
    'assets',
    'corpus'
  );

  /** In-memory cache of file contents, keyed by absolute file path. */
  private static cache = new Map<string, Buffer>();

  /** Files at or above this size (10MB) are returned but never cached. */
  private static readonly MAX_CACHED_FILE_SIZE = 10 * 1024 * 1024;

  /**
   * Corpus categories with their paths (relative to the corpus base directory)
   */
  static readonly CATEGORIES = {
    CII_XMLRECHNUNG: 'XML-Rechnung/CII',
    UBL_XMLRECHNUNG: 'XML-Rechnung/UBL',
    ZUGFERD_V1_CORRECT: 'ZUGFeRDv1/correct',
    ZUGFERD_V1_FAIL: 'ZUGFeRDv1/fail',
    ZUGFERD_V2_CORRECT: 'ZUGFeRDv2/correct',
    ZUGFERD_V2_FAIL: 'ZUGFeRDv2/fail',
    PEPPOL: 'PEPPOL/Valid/Qvalia',
    FATTURAPA_OFFICIAL: 'fatturaPA/official',
    FATTURAPA_EIGOR: 'fatturaPA/eigor',
    EN16931_CII: '../eInvoicing-EN16931/cii/examples',
    EN16931_UBL_EXAMPLES: '../eInvoicing-EN16931/ubl/examples',
    EN16931_UBL_INVOICE: '../eInvoicing-EN16931/test/Invoice-unit-UBL',
    EN16931_UBL_CREDITNOTE: '../eInvoicing-EN16931/test/CreditNote-unit-UBL',
    EDIFACT_EXAMPLES: '../eInvoicing-EN16931/edifact/examples',
    OTHER: 'other',
    INCOMING: 'incoming',
    UNSTRUCTURED: 'unstructured'
  } as const;

  /**
   * Load a single corpus file
   *
   * @param filePath - Path of the file relative to the corpus base directory.
   * @returns The file contents. Files smaller than 10MB are cached, so repeated
   *          calls for the same path return the same Buffer instance.
   * @throws Error when the file cannot be read; the message contains the
   *         requested path and the underlying failure reason.
   */
  static async loadFile(filePath: string): Promise<Buffer> {
    const fullPath = path.join(this.basePath, filePath);

    // Check cache first
    const cached = this.cache.get(fullPath);
    if (cached !== undefined) {
      return cached;
    }

    try {
      const buffer = await fs.readFile(fullPath);

      // Cache files under 10MB
      if (buffer.length < CorpusLoader.MAX_CACHED_FILE_SIZE) {
        this.cache.set(fullPath, buffer);
      }

      return buffer;
    } catch (error) {
      throw new Error(
        `Failed to load corpus file ${filePath}: ${CorpusLoader.describeError(error)}`
      );
    }
  }

  /**
   * Load all files from a category (recursively)
   *
   * Only files accepted by `isInvoiceFile` are reported. This is best-effort:
   * if any directory or file in the category cannot be read, a warning is
   * logged and an empty array is returned.
   *
   * @param category - Key of `CorpusLoader.CATEGORIES` to scan.
   * @returns Metadata for every invoice file found below the category directory.
   */
  static async loadCategory(category: keyof typeof CorpusLoader.CATEGORIES): Promise<CorpusFile[]> {
    const categoryPath = this.CATEGORIES[category];
    const fullPath = path.join(this.basePath, categoryPath);

    try {
      const files: CorpusFile[] = [];

      // Recursive function to scan directories
      const scanDirectory = async (dirPath: string, relativePath = ''): Promise<void> => {
        const entries = await fs.readdir(dirPath, { withFileTypes: true });

        for (const entry of entries) {
          const entryPath = path.join(dirPath, entry.name);
          const relativeFilePath = path.join(relativePath, entry.name);

          if (entry.isDirectory()) {
            // Recursively scan subdirectories
            await scanDirectory(entryPath, relativeFilePath);
          } else if (entry.isFile() && this.isInvoiceFile(entry.name)) {
            const stat = await fs.stat(entryPath);
            const fullRelativePath = path.join(categoryPath, relativeFilePath);

            files.push({
              path: fullRelativePath,
              format: this.detectFormatFromPath(fullRelativePath),
              category,
              size: stat.size,
              // Validity is inferred purely from the category's directory name.
              valid: !categoryPath.includes('fail')
            });
          }
        }
      };

      await scanDirectory(fullPath);
      return files;
    } catch (error) {
      console.warn(`Failed to load category ${category}: ${CorpusLoader.describeError(error)}`);
      return [];
    }
  }

  /**
   * Load files matching a pattern
   *
   * The pattern is a simple glob matched against each file's corpus-relative
   * path: `*` matches any run of characters except `/`, `**` matches any run
   * of characters including `/`, and `.` is literal. The match is not
   * anchored, so the pattern may match any part of the path.
   *
   * @param pattern - Glob-like pattern to match against `CorpusFile.path`.
   * @param category - Optional single category to search; all categories are
   *                   searched when omitted.
   * @returns Matching files, in category order.
   */
  static async loadPattern(pattern: string, category?: keyof typeof CorpusLoader.CATEGORIES): Promise<CorpusFile[]> {
    const files: CorpusFile[] = [];
    const allCategories = Object.keys(this.CATEGORIES) as Array<keyof typeof CorpusLoader.CATEGORIES>;
    const categoriesToSearch = category ? [category] : allCategories;

    // The pattern is identical for every file, so compile the matcher once.
    const matches = this.createPatternMatcher(pattern);

    for (const cat of categoriesToSearch) {
      const categoryFiles = await this.loadCategory(cat);
      for (const file of categoryFiles) {
        if (matches(file.path)) {
          files.push(file);
        }
      }
    }

    return files;
  }

  /**
   * Get corpus statistics
   *
   * @returns File counts and total size across all categories, broken down by
   *          detected format, by category key, and by validity flag.
   */
  static async getStatistics(): Promise<{
    totalFiles: number;
    totalSize: number;
    byFormat: Record<string, number>;
    byCategory: Record<string, number>;
    validFiles: number;
    invalidFiles: number;
  }> {
    const stats = {
      totalFiles: 0,
      totalSize: 0,
      byFormat: {} as Record<string, number>,
      byCategory: {} as Record<string, number>,
      validFiles: 0,
      invalidFiles: 0
    };

    for (const category of Object.keys(this.CATEGORIES) as Array<keyof typeof CorpusLoader.CATEGORIES>) {
      const files = await this.loadCategory(category);

      stats.totalFiles += files.length;
      stats.byCategory[category] = files.length;

      for (const file of files) {
        stats.totalSize += file.size;
        stats.byFormat[file.format] = (stats.byFormat[file.format] ?? 0) + 1;

        if (file.valid) {
          stats.validFiles++;
        } else {
          stats.invalidFiles++;
        }
      }
    }

    return stats;
  }

  /**
   * Clear the file cache
   */
  static clearCache(): void {
    this.cache.clear();
  }

  /**
   * Check if a file is an invoice file
   *
   * @param filename - File name to test (case-insensitive).
   * @returns True when the name ends with `.xml`, `.pdf` or `.txt`.
   */
  private static isInvoiceFile(filename: string): boolean {
    const lowerCased = filename.toLowerCase();
    const extensions = ['.xml', '.pdf', '.txt'];
    return extensions.some(ext => lowerCased.endsWith(ext));
  }

  /**
   * Detect format from file path
   *
   * Only the file name (not the directories) is inspected, case-insensitively.
   * The checks run in order and the first match wins, so e.g. a name
   * containing `.cii.` is reported as 'CII' even if it is a PDF.
   *
   * @param filePath - Path whose base name is inspected.
   * @returns A format label, or 'Unknown' when no rule matches.
   */
  private static detectFormatFromPath(filePath: string): string {
    const filename = path.basename(filePath).toLowerCase();

    if (filename.includes('.cii.')) return 'CII';
    if (filename.includes('.ubl.')) return 'UBL';
    if (filename.includes('zugferd')) return 'ZUGFeRD';
    if (filename.includes('factur')) return 'Factur-X';
    if (filename.includes('xrechnung')) return 'XRechnung';
    if (filename.includes('fattura')) return 'FatturaPA';
    if (filename.includes('peppol')) return 'PEPPOL';
    if (filename.endsWith('.pdf')) return 'PDF';

    return 'Unknown';
  }

  /**
   * Get the absolute paths of all invoice files in a category.
   *
   * Uses `loadCategory` for discovery, so it shares its best-effort behavior
   * (an unreadable category yields an empty array).
   *
   * @param category - Key of `CorpusLoader.CATEGORIES` to scan.
   * @returns Absolute file system paths, one per discovered file.
   */
  static async getFiles(category: keyof typeof CorpusLoader.CATEGORIES): Promise<string[]> {
    const files = await this.loadCategory(category);
    return files.map(f => path.join(this.basePath, f.path));
  }

  /**
   * Create a test dataset from corpus files
   *
   * @param options.formats - Keep only files whose detected format is listed.
   * @param options.categories - Categories to load; defaults to all categories.
   * @param options.maxFiles - Upper bound on the number of returned files. When
   *        more files are available, a random sample is returned. A value of
   *        0 (or omitting the option) means no limit.
   * @param options.validOnly - When true, drop files flagged as invalid.
   * @returns The selected files.
   */
  static async createTestDataset(options: {
    formats?: string[];
    categories?: Array<keyof typeof CorpusLoader.CATEGORIES>;
    maxFiles?: number;
    validOnly?: boolean;
  } = {}): Promise<CorpusFile[]> {
    const { formats, maxFiles, validOnly } = options;
    let files: CorpusFile[] = [];

    const categoriesToLoad =
      options.categories || (Object.keys(this.CATEGORIES) as Array<keyof typeof CorpusLoader.CATEGORIES>);

    for (const category of categoriesToLoad) {
      const categoryFiles = await this.loadCategory(category);
      for (const file of categoryFiles) {
        files.push(file);
      }
    }

    // Filter by format if specified
    if (formats && formats.length > 0) {
      files = files.filter(f => formats.includes(f.format));
    }

    // Filter by validity if specified
    if (validOnly) {
      files = files.filter(f => f.valid);
    }

    // Limit number of files if specified
    if (maxFiles && files.length > maxFiles) {
      // Shuffle and take first N files for variety
      files = CorpusLoader.shuffle(files).slice(0, maxFiles);
    }

    return files;
  }

  /**
   * Build a predicate that tests corpus-relative paths against a glob pattern.
   *
   * Dots are escaped BEFORE the stars are expanded; doing it the other way
   * round would turn the `.*` generated for `**` into `\.*` (literal dots).
   * Other regular-expression metacharacters in the pattern are passed through
   * unchanged; if that makes the expression invalid, the predicate falls back
   * to a plain substring test on the pattern with all `*` removed.
   */
  private static createPatternMatcher(pattern: string): (filePath: string) => boolean {
    const regexSource = pattern
      .replace(/\./g, '\\.') // Dots are literal in globs
      .replace(/\*\*|\*/g, stars => (stars === '**' ? '.*' : '[^/]*')); // ** crosses '/', * does not

    try {
      const regex = new RegExp(regexSource);
      return filePath => regex.test(filePath);
    } catch {
      // If regex fails, try simple includes match
      const needle = pattern.replace(/\*/g, '');
      return filePath => filePath.includes(needle);
    }
  }

  /** Return a shuffled copy of `items` (Fisher–Yates); the input is not modified. */
  private static shuffle<T>(items: readonly T[]): T[] {
    const shuffled = [...items];
    for (let i = shuffled.length - 1; i > 0; i--) {
      const j = Math.floor(Math.random() * (i + 1));
      const picked = shuffled[j] as T;
      shuffled[j] = shuffled[i] as T;
      shuffled[i] = picked;
    }
    return shuffled;
  }

  /** Extract a printable message from a caught value of unknown type. */
  private static describeError(error: unknown): string {
    return error instanceof Error ? error.message : String(error);
  }
}
|