update
This commit is contained in:
238
test/helpers/corpus.loader.ts
Normal file
238
test/helpers/corpus.loader.ts
Normal file
@ -0,0 +1,238 @@
|
||||
import * as path from 'path';
|
||||
import { promises as fs } from 'fs';
|
||||
import * as plugins from '../../ts/plugins.js';
|
||||
|
||||
/**
|
||||
* Corpus loader for managing test invoice files
|
||||
*/
|
||||
|
||||
/**
 * Metadata describing a single file of the test corpus.
 */
export interface CorpusFile {
  /** Path of the file, relative to the corpus base directory. */
  path: string;
  /** Format label derived from the file name (e.g. 'CII', 'UBL', 'PDF', 'Unknown'). */
  format: string;
  /** Key of the corpus category the file was loaded from. */
  category: string;
  /** File size in bytes. */
  size: number;
  /** False when the file comes from a category whose path contains 'fail'. */
  valid: boolean;
}
|
||||
|
||||
/**
 * Static helper that locates, loads and summarizes the invoice files of the
 * test corpus stored under `test/assets/corpus` (resolved against the current
 * working directory).
 */
export class CorpusLoader {
  private static basePath = path.join(process.cwd(), 'test/assets/corpus');
  private static cache = new Map<string, Buffer>();

  /** Files of this size (in bytes) or larger are never kept in the cache. */
  private static readonly MAX_CACHED_FILE_SIZE = 10 * 1024 * 1024;

  /** File extensions (lower case) that count as invoice files. */
  private static readonly INVOICE_EXTENSIONS: readonly string[] = ['.xml', '.pdf', '.txt'];

  /**
   * File-name markers and the format label they map to. Order matters: the
   * first marker contained in the lower-cased file name wins.
   */
  private static readonly FORMAT_MARKERS: ReadonlyArray<readonly [string, string]> = [
    ['.cii.', 'CII'],
    ['.ubl.', 'UBL'],
    ['zugferd', 'ZUGFeRD'],
    ['factur', 'Factur-X'],
    ['xrechnung', 'XRechnung'],
    ['fattura', 'FatturaPA'],
    ['peppol', 'PEPPOL'],
  ];

  /**
   * Corpus categories with their paths (relative to the corpus base directory)
   */
  static readonly CATEGORIES = {
    CII_XMLRECHNUNG: 'XML-Rechnung/CII',
    UBL_XMLRECHNUNG: 'XML-Rechnung/UBL',
    ZUGFERD_V1_CORRECT: 'ZUGFeRDv1/correct',
    ZUGFERD_V1_FAIL: 'ZUGFeRDv1/fail',
    ZUGFERD_V2_CORRECT: 'ZUGFeRDv2/correct',
    ZUGFERD_V2_FAIL: 'ZUGFeRDv2/fail',
    PEPPOL: 'PEPPOL/Valid/Qvalia',
    FATTURAPA_OFFICIAL: 'fatturaPA/official',
    FATTURAPA_EIGOR: 'fatturaPA/eigor',
    EN16931_CII: 'eInvoicing-EN16931/cii/examples',
    EN16931_UBL_EXAMPLES: 'eInvoicing-EN16931/ubl/examples',
    EN16931_UBL_INVOICE: 'eInvoicing-EN16931/test/Invoice-unit-UBL',
    EN16931_UBL_CREDITNOTE: 'eInvoicing-EN16931/test/CreditNote-unit-UBL',
    EDIFACT_EXAMPLES: 'eInvoicing-EN16931/edifact/examples',
    OTHER: 'other',
    INCOMING: 'incoming',
    UNSTRUCTURED: 'unstructured'
  } as const;

  /**
   * Load a single corpus file.
   *
   * Results smaller than 10MB are cached by absolute path, so repeated calls
   * return the same Buffer instance until `clearCache()` is called.
   *
   * @param filePath Path relative to the corpus base directory.
   * @returns The file content.
   * @throws Error with the message prefix "Failed to load corpus file" when
   *   the file cannot be read.
   */
  static async loadFile(filePath: string): Promise<Buffer> {
    const fullPath = path.join(this.basePath, filePath);

    // Single lookup instead of has() + get()! keeps this free of non-null assertions.
    const cached = this.cache.get(fullPath);
    if (cached !== undefined) {
      return cached;
    }

    try {
      const buffer = await fs.readFile(fullPath);

      // Cache files under 10MB
      if (buffer.length < this.MAX_CACHED_FILE_SIZE) {
        this.cache.set(fullPath, buffer);
      }

      return buffer;
    } catch (error: unknown) {
      throw new Error(`Failed to load corpus file ${filePath}: ${this.describeError(error)}`);
    }
  }

  /**
   * Load metadata for all invoice files located directly inside a category
   * directory (sub-directories are not traversed).
   *
   * @param category Key of `CATEGORIES`.
   * @returns One entry per invoice file, in directory-listing order. When the
   *   directory cannot be read (or a file cannot be stat'ed) a warning is
   *   logged and an empty array is returned.
   */
  static async loadCategory(category: keyof typeof CorpusLoader.CATEGORIES): Promise<CorpusFile[]> {
    const categoryPath = this.CATEGORIES[category];
    const fullPath = path.join(this.basePath, categoryPath);

    try {
      const entries = await fs.readdir(fullPath, { withFileTypes: true });
      const invoiceEntries = entries.filter(
        (entry) => entry.isFile() && this.isInvoiceFile(entry.name)
      );
      // Validity is a property of the category, not of the individual file.
      const valid = !categoryPath.includes('fail');

      // The stat calls are independent; Promise.all keeps the listing order.
      return await Promise.all(
        invoiceEntries.map(async (entry): Promise<CorpusFile> => {
          const filePath = path.join(categoryPath, entry.name);
          const stat = await fs.stat(path.join(this.basePath, filePath));
          return {
            path: filePath,
            format: this.detectFormatFromPath(filePath),
            category: category,
            size: stat.size,
            valid
          };
        })
      );
    } catch (error: unknown) {
      console.warn(`Failed to load category ${category}: ${this.describeError(error)}`);
      return [];
    }
  }

  /**
   * Load files whose file name matches a pattern.
   *
   * The pattern is matched against the file name only (not the directory) and
   * may occur anywhere in the name. `*` matches any run of characters; every
   * other character is matched literally.
   *
   * @param pattern Wildcard pattern, e.g. `*.xml` or `invoice*2023`.
   * @param category Restrict the search to one category; all categories when omitted.
   * @returns Matching files, grouped in category order.
   */
  static async loadPattern(pattern: string, category?: keyof typeof CorpusLoader.CATEGORIES): Promise<CorpusFile[]> {
    const categoriesToSearch = category ? [category] : this.categoryKeys();
    // Compile once; the pattern does not change between files.
    const matcher = this.patternToRegExp(pattern);
    const files: CorpusFile[] = [];

    for (const cat of categoriesToSearch) {
      const categoryFiles = await this.loadCategory(cat);
      for (const file of categoryFiles) {
        if (matcher.test(path.basename(file.path))) {
          files.push(file);
        }
      }
    }

    return files;
  }

  /**
   * Get corpus statistics aggregated over every category.
   *
   * @returns File count, cumulative size in bytes, counts per format and per
   *   category key, and the number of valid / invalid files.
   */
  static async getStatistics(): Promise<{
    totalFiles: number;
    totalSize: number;
    byFormat: Record<string, number>;
    byCategory: Record<string, number>;
    validFiles: number;
    invalidFiles: number;
  }> {
    const stats = {
      totalFiles: 0,
      totalSize: 0,
      byFormat: {} as Record<string, number>,
      byCategory: {} as Record<string, number>,
      validFiles: 0,
      invalidFiles: 0
    };

    for (const category of this.categoryKeys()) {
      const files = await this.loadCategory(category);

      stats.totalFiles += files.length;
      stats.byCategory[category] = files.length;

      for (const file of files) {
        stats.totalSize += file.size;
        stats.byFormat[file.format] = (stats.byFormat[file.format] ?? 0) + 1;

        if (file.valid) {
          stats.validFiles++;
        } else {
          stats.invalidFiles++;
        }
      }
    }

    return stats;
  }

  /**
   * Clear the file cache
   */
  static clearCache(): void {
    this.cache.clear();
  }

  /**
   * Check if a file is an invoice file, judged by its extension
   * (.xml, .pdf or .txt, case-insensitive).
   */
  private static isInvoiceFile(filename: string): boolean {
    const lowered = filename.toLowerCase();
    return this.INVOICE_EXTENSIONS.some((ext) => lowered.endsWith(ext));
  }

  /**
   * Detect format from file path. Only the lower-cased file name is inspected;
   * returns 'PDF' for otherwise unrecognized .pdf files and 'Unknown' when
   * nothing matches.
   */
  private static detectFormatFromPath(filePath: string): string {
    const filename = path.basename(filePath).toLowerCase();

    for (const [marker, format] of this.FORMAT_MARKERS) {
      if (filename.includes(marker)) {
        return format;
      }
    }

    return filename.endsWith('.pdf') ? 'PDF' : 'Unknown';
  }

  /**
   * Get the absolute paths of all invoice files in a category.
   *
   * Unlike `loadCategory`, this returns plain path strings (corpus base
   * directory joined with each file's relative path) rather than metadata.
   */
  static async getFiles(category: keyof typeof CorpusLoader.CATEGORIES): Promise<string[]> {
    const files = await this.loadCategory(category);
    return files.map((f) => path.join(this.basePath, f.path));
  }

  /**
   * Create a test dataset from corpus files.
   *
   * @param options.categories Categories to load; all categories when omitted.
   * @param options.formats Keep only files whose `format` is in this list
   *   (ignored when empty or omitted).
   * @param options.validOnly Keep only files flagged as valid.
   * @param options.maxFiles When set to a non-zero value smaller than the
   *   number of remaining files, a random sample of that size is returned.
   * @returns The selected files; order is random only when sampling happened.
   */
  static async createTestDataset(options: {
    formats?: string[];
    categories?: Array<keyof typeof CorpusLoader.CATEGORIES>;
    maxFiles?: number;
    validOnly?: boolean;
  } = {}): Promise<CorpusFile[]> {
    const { formats, validOnly, maxFiles } = options;
    const categoriesToLoad = options.categories ?? this.categoryKeys();

    let files: CorpusFile[] = [];
    for (const category of categoriesToLoad) {
      files = files.concat(await this.loadCategory(category));
    }

    // Filter by format if specified
    if (formats && formats.length > 0) {
      files = files.filter((f) => formats.includes(f.format));
    }

    // Filter by validity if specified
    if (validOnly) {
      files = files.filter((f) => f.valid);
    }

    // Limit number of files if specified
    if (maxFiles && files.length > maxFiles) {
      // Shuffle and take first N files for variety
      files = this.shuffled(files).slice(0, maxFiles);
    }

    return files;
  }

  /** All keys of `CATEGORIES`, typed as category keys. */
  private static categoryKeys(): Array<keyof typeof CorpusLoader.CATEGORIES> {
    return Object.keys(this.CATEGORIES) as Array<keyof typeof CorpusLoader.CATEGORIES>;
  }

  /**
   * Translate a `*` wildcard pattern into an unanchored regular expression:
   * literal parts are escaped and every run of `*` becomes `.*`.
   */
  private static patternToRegExp(pattern: string): RegExp {
    const source = pattern
      .split(/\*+/)
      .map((literal) => literal.replace(/[.+?^${}()|[\]\\]/g, '\\$&'))
      .join('.*');
    return new RegExp(source);
  }

  /** Return a uniformly shuffled copy of `items` (Fisher–Yates); the input is not modified. */
  private static shuffled<T>(items: readonly T[]): T[] {
    const result = [...items];
    for (let i = result.length - 1; i > 0; i--) {
      const j = Math.floor(Math.random() * (i + 1));
      const tmp = result[i] as T;
      result[i] = result[j] as T;
      result[j] = tmp;
    }
    return result;
  }

  /** Human-readable text for a caught value of unknown type. */
  private static describeError(error: unknown): string {
    return error instanceof Error ? error.message : String(error);
  }
}
|
Reference in New Issue
Block a user