// einvoice/test/suite/einvoice_corpus-validation/test.corp-09.statistics.ts
//
// 371 lines
// 13 KiB
// TypeScript

import { tap, expect } from '@git.zone/tstest/tapbundle';
import { EInvoice } from '../../../ts/index.js';
import { InvoiceFormat } from '../../../ts/interfaces/common.js';
import { CorpusLoader } from '../../helpers/corpus.loader.js';
import { PerformanceTracker } from '../../helpers/performance.tracker.js';
import * as path from 'path';
import * as fs from 'fs/promises';
/**
* Test ID: CORP-09
* Test Description: Corpus Statistics Generation
* Priority: Low
*
* This test generates comprehensive statistics about the test corpus
* to help understand coverage, patterns, and potential gaps.
*/
tap.test('CORP-09: Corpus Statistics Generation - should analyze corpus characteristics', async () => {
  // The full analysis reads and parses every corpus file, which is slow enough to
  // time out in CI. It therefore stays skipped by default and only runs when the
  // RUN_CORPUS_STATISTICS environment variable is set to a non-empty value.
  if (!process.env.RUN_CORPUS_STATISTICS) {
    console.log('⚠ Statistics generation test skipped in CI/CD environment');
    console.log(' This test analyzes large corpus files and may timeout');
    console.log(' ✓ Test completed (skipped for performance)');
    console.log(' Set RUN_CORPUS_STATISTICS=1 to run the full analysis');
    return;
  }

  const startTime = Date.now();

  /** Adds one to the tally stored under `key`, starting from zero for unseen keys. */
  const tally = <K>(counts: Map<K, number>, key: K): void => {
    counts.set(key, (counts.get(key) ?? 0) + 1);
  };

  /** Formats `part / total` as a one-decimal percentage; an empty total gives '0.0', not 'NaN'. */
  const percentOf = (part: number, total: number): string =>
    (total > 0 ? (part / total) * 100 : 0).toFixed(1);

  // Initialize statistics collectors
  const stats = {
    totalFiles: 0,
    totalSize: 0,
    formats: new Map<string, number>(),
    categories: new Map<string, number>(),
    fileSizes: {
      tiny: 0, // < 10KB
      small: 0, // 10-50KB
      medium: 0, // 50-200KB
      large: 0, // 200KB-1MB
      veryLarge: 0 // > 1MB
    },
    invoiceData: {
      currencies: new Map<string, number>(),
      countries: new Map<string, number>(),
      taxRates: new Map<number, number>(),
      itemCounts: new Map<string, number>(),
      documentTypes: new Map<string, number>()
    },
    xmlCharacteristics: {
      namespaces: new Map<string, number>(),
      rootElements: new Map<string, number>(),
      encodings: new Map<string, number>(),
      versions: new Map<string, number>()
    },
    validationResults: {
      parseSuccess: 0,
      parseFailed: 0,
      validationSuccess: 0,
      validationFailed: 0
    }
  };

  // Corpus categories to analyze
  const allCategories = [
    'XML_RECHNUNG_CII',
    'XML_RECHNUNG_UBL',
    'ZUGFERD_V1_CORRECT',
    'ZUGFERD_V2_CORRECT',
    'PEPPOL',
    'FATTURAPA',
    'EN16931_TEST_CASES'
  ];

  console.log('Analyzing test corpus...\n');

  // Process each category; a category that fails to load is reported and skipped.
  for (const category of allCategories) {
    try {
      const files = await CorpusLoader.loadCategory(category);
      stats.categories.set(category, files.length);
      console.log(`Processing ${category}: ${files.length} files`);

      for (const file of files) {
        stats.totalFiles++;
        stats.totalSize += file.size;

        // Categorize by size
        if (file.size < 10 * 1024) stats.fileSizes.tiny++;
        else if (file.size < 50 * 1024) stats.fileSizes.small++;
        else if (file.size < 200 * 1024) stats.fileSizes.medium++;
        else if (file.size < 1024 * 1024) stats.fileSizes.large++;
        else stats.fileSizes.veryLarge++;

        // Detect format from filename, falling back to the category name
        tally(stats.formats, detectFormatFromFile(file.path, category));

        // Analyze XML content
        try {
          const xmlBuffer = await CorpusLoader.loadFile(file.path);
          const xmlString = xmlBuffer.toString('utf-8');

          // Extract XML characteristics
          const xmlInfo = analyzeXMLCharacteristics(xmlString);
          if (xmlInfo.encoding) {
            tally(stats.xmlCharacteristics.encodings, xmlInfo.encoding);
          }
          if (xmlInfo.rootElement) {
            tally(stats.xmlCharacteristics.rootElements, xmlInfo.rootElement);
          }
          // The report labels these tallies as "files", so count each namespace at
          // most once per document even if it is declared several times.
          for (const ns of new Set(xmlInfo.namespaces)) {
            tally(stats.xmlCharacteristics.namespaces, ns);
          }

          // Try to parse and extract invoice data. `parsed` records whether the
          // parse itself succeeded so a later error is not booked as a parse failure.
          let parsed = false;
          try {
            const invoice = new EInvoice();
            await invoice.fromXmlString(xmlString);
            parsed = true;
            stats.validationResults.parseSuccess++;

            // Extract invoice statistics
            if (invoice.currency) {
              tally(stats.invoiceData.currencies, invoice.currency);
            }
            const sellerCountry = invoice.from?.address?.country;
            if (sellerCountry) {
              tally(stats.invoiceData.countries, sellerCountry);
            }
            if (invoice.items?.length) {
              tally(stats.invoiceData.itemCounts, getItemCountBucket(invoice.items.length));
              // Collect tax rates (a rate of 0 is a valid value and is counted)
              for (const item of invoice.items) {
                if (item.taxPercent !== undefined) {
                  tally(stats.invoiceData.taxRates, item.taxPercent);
                }
              }
            }

            // Document type
            const docType = invoice.invoiceType || 'invoice';
            tally(stats.invoiceData.documentTypes, docType);

            // Try validation; a validator that throws counts as a validation failure.
            try {
              const validationResult = await invoice.validate();
              if (validationResult.valid) {
                stats.validationResults.validationSuccess++;
              } else {
                stats.validationResults.validationFailed++;
              }
            } catch {
              stats.validationResults.validationFailed++;
            }
          } catch (analysisError) {
            if (parsed) {
              console.error(` Error analyzing ${file.path}: ${analysisError}`);
            } else {
              stats.validationResults.parseFailed++;
            }
          }
        } catch (readError) {
          console.error(` Error reading ${file.path}: ${readError}`);
        }
      }
    } catch (categoryError) {
      console.log(` Category ${category} not found or error: ${categoryError}`);
    }
  }

  const totalTime = Date.now() - startTime;

  // Generate comprehensive report
  console.log('\n=== CORPUS STATISTICS REPORT ===\n');
  console.log('GENERAL STATISTICS:');
  console.log(`Total files: ${stats.totalFiles}`);
  console.log(`Total size: ${(stats.totalSize / 1024 / 1024).toFixed(2)} MB`);
  const averageSizeKB = stats.totalFiles > 0 ? stats.totalSize / stats.totalFiles / 1024 : 0;
  console.log(`Average file size: ${averageSizeKB.toFixed(2)} KB`);
  console.log(`Analysis time: ${(totalTime / 1000).toFixed(2)} seconds`);

  console.log('\nCATEGORY DISTRIBUTION:');
  for (const [category, count] of stats.categories) {
    console.log(` ${category}: ${count} files (${percentOf(count, stats.totalFiles)}%)`);
  }

  console.log('\nFORMAT DISTRIBUTION:');
  const sortedFormats = Array.from(stats.formats.entries()).sort((a, b) => b[1] - a[1]);
  for (const [format, count] of sortedFormats) {
    console.log(` ${format}: ${count} files (${percentOf(count, stats.totalFiles)}%)`);
  }

  console.log('\nFILE SIZE DISTRIBUTION:');
  console.log(` Tiny (<10KB): ${stats.fileSizes.tiny} files`);
  console.log(` Small (10-50KB): ${stats.fileSizes.small} files`);
  console.log(` Medium (50-200KB): ${stats.fileSizes.medium} files`);
  console.log(` Large (200KB-1MB): ${stats.fileSizes.large} files`);
  console.log(` Very Large (>1MB): ${stats.fileSizes.veryLarge} files`);

  console.log('\nXML CHARACTERISTICS:');
  console.log(' Encodings:');
  for (const [encoding, count] of stats.xmlCharacteristics.encodings) {
    console.log(` ${encoding}: ${count} files`);
  }
  console.log(' Root Elements:');
  const topRootElements = Array.from(stats.xmlCharacteristics.rootElements.entries())
    .sort((a, b) => b[1] - a[1])
    .slice(0, 5);
  for (const [element, count] of topRootElements) {
    console.log(` ${element}: ${count} files`);
  }
  console.log(' Top Namespaces:');
  const topNamespaces = Array.from(stats.xmlCharacteristics.namespaces.entries())
    .sort((a, b) => b[1] - a[1])
    .slice(0, 5);
  for (const [ns, count] of topNamespaces) {
    // Only mark the URI as shortened when it actually was.
    const label = ns.length > 60 ? `${ns.substring(0, 60)}...` : ns;
    console.log(` ${label}: ${count} files`);
  }

  console.log('\nINVOICE DATA STATISTICS:');
  console.log(' Currencies:');
  const sortedCurrencies = Array.from(stats.invoiceData.currencies.entries())
    .sort((a, b) => b[1] - a[1]);
  for (const [currency, count] of sortedCurrencies) {
    console.log(` ${currency}: ${count} invoices`);
  }
  console.log(' Countries:');
  const sortedCountries = Array.from(stats.invoiceData.countries.entries())
    .sort((a, b) => b[1] - a[1])
    .slice(0, 10);
  for (const [country, count] of sortedCountries) {
    console.log(` ${country}: ${count} invoices`);
  }
  console.log(' Tax Rates:');
  const sortedTaxRates = Array.from(stats.invoiceData.taxRates.entries())
    .sort((a, b) => a[0] - b[0]);
  for (const [rate, count] of sortedTaxRates) {
    console.log(` ${rate}%: ${count} occurrences`);
  }
  console.log(' Line Item Counts:');
  // Bucket labels start with their lower bound ('2-5', '100+'), which parseInt reads
  // up to the first non-digit, so the labels sort numerically.
  const sortedItemCounts = Array.from(stats.invoiceData.itemCounts.entries())
    .sort((a, b) => parseInt(a[0], 10) - parseInt(b[0], 10));
  for (const [bucket, count] of sortedItemCounts) {
    console.log(` ${bucket}: ${count} invoices`);
  }
  console.log(' Document Types:');
  for (const [type, count] of stats.invoiceData.documentTypes) {
    console.log(` ${type}: ${count} documents`);
  }

  console.log('\nVALIDATION STATISTICS:');
  const { parseSuccess, parseFailed, validationSuccess, validationFailed } = stats.validationResults;
  const parseRate = percentOf(parseSuccess, parseSuccess + parseFailed);
  const validationRate = percentOf(validationSuccess, validationSuccess + validationFailed);
  console.log(` Parse success rate: ${parseRate}%`);
  console.log(` Validation success rate: ${validationRate}%`);
  console.log(` Successfully parsed: ${parseSuccess}`);
  console.log(` Parse failures: ${parseFailed}`);
  console.log(` Successfully validated: ${validationSuccess}`);
  console.log(` Validation failures: ${validationFailed}`);

  // Save statistics to file
  const statsReport = {
    generatedAt: new Date().toISOString(),
    analysisTime: totalTime,
    summary: {
      totalFiles: stats.totalFiles,
      totalSizeMB: stats.totalSize / 1024 / 1024,
      parseSuccessRate: parseRate,
      validationSuccessRate: validationRate
    },
    details: stats
  };
  try {
    const reportPath = path.join(process.cwd(), '.nogit', 'corpus-statistics.json');
    await fs.mkdir(path.dirname(reportPath), { recursive: true });
    // JSON.stringify renders a Map as `{}`; convert each Map to a plain object so the
    // collected distributions actually end up in the report.
    const serializedReport = JSON.stringify(
      statsReport,
      (_key: string, value: unknown) => (value instanceof Map ? Object.fromEntries(value) : value),
      2
    );
    await fs.writeFile(reportPath, serializedReport);
    console.log(`\nDetailed statistics saved to: ${reportPath}`);
  } catch (e) {
    // Persisting the report is best-effort; the assertions below still run.
    console.log('\nCould not save statistics file:', e);
  }

  // Assertions
  expect(stats.totalFiles).toBeGreaterThan(100);
  expect(stats.formats.size).toBeGreaterThan(3);
  expect(parseFloat(parseRate)).toBeGreaterThan(70);
});
/**
 * Guesses an invoice format label for a corpus file.
 *
 * The lower-cased base name of `filePath` is checked against a list of filename
 * markers first; if none matches, the (case-sensitive) `category` name is checked
 * against a list of category markers. In both lists the first match wins.
 *
 * @param filePath - Path of the corpus file; only its base name is inspected.
 * @param category - Name of the corpus category the file was loaded from.
 * @returns The matching format label, or `'Unknown'` when nothing matches.
 */
function detectFormatFromFile(filePath: string, category: string): string {
  const baseName = path.basename(filePath).toLowerCase();

  // Ordered [marker, label] pairs matched against the lower-cased file name.
  const filenameMarkers: ReadonlyArray<readonly [string, string]> = [
    ['.ubl.', 'UBL'],
    ['.cii.', 'CII'],
    ['zugferd', 'ZUGFeRD'],
    ['factur', 'Factur-X'],
    ['fattura', 'FatturaPA'],
    ['peppol', 'PEPPOL'],
    ['xrechnung', 'XRechnung']
  ];
  const byFilename = filenameMarkers.find(([marker]) => baseName.includes(marker));
  if (byFilename !== undefined) {
    return byFilename[1];
  }

  // Fallback: ordered [marker, label] pairs matched against the category name as given.
  const categoryMarkers: ReadonlyArray<readonly [string, string]> = [
    ['UBL', 'UBL'],
    ['CII', 'CII'],
    ['ZUGFERD', 'ZUGFeRD'],
    ['PEPPOL', 'PEPPOL'],
    ['FATTURA', 'FatturaPA']
  ];
  const byCategory = categoryMarkers.find(([marker]) => category.includes(marker));
  return byCategory !== undefined ? byCategory[1] : 'Unknown';
}
/**
 * Pulls a few coarse structural facts out of a raw XML string using regular
 * expressions (no XML parser is involved, so this is a heuristic scan).
 *
 * @param xml - Raw document text.
 * @returns An object with:
 *   - `encoding`: the encoding named in the `<?xml ... ?>` declaration, if present;
 *   - `rootElement`: the local name (prefix stripped) of the first element tag,
 *     ignoring the XML declaration, processing instructions and comments;
 *   - `namespaces`: the distinct non-empty URIs declared through `xmlns` or
 *     `xmlns:prefix` attributes, in first-seen order.
 *   `encoding` and `rootElement` are omitted when they cannot be determined.
 */
function analyzeXMLCharacteristics(xml: string): {
  encoding?: string;
  rootElement?: string;
  namespaces: string[];
} {
  const result: { encoding?: string; rootElement?: string; namespaces: string[] } = {
    namespaces: []
  };

  // Only the XML declaration defines the document encoding, and it may use either
  // quote style. Matching inside `<?xml ...` avoids picking up unrelated
  // `encoding="..."` attributes elsewhere in the document.
  const encodingMatch = xml.match(/<\?xml\s[^>]*?\bencoding\s*=\s*(["'])([^"']+)\1/);
  const declaredEncoding = encodingMatch?.[2];
  if (declaredEncoding) {
    result.encoding = declaredEncoding;
  }

  // Walk the markup lazily: comments and processing instructions (including the XML
  // declaration) are consumed without capturing, so the first capture is the first
  // real element tag. `<!...>` and `</...>` can never start a capture.
  const markupPattern = /<!--[\s\S]*?-->|<\?[\s\S]*?\?>|<([^\s>\/!?][^\s>\/]*)/g;
  for (const token of xml.matchAll(markupPattern)) {
    const qualifiedName = token[1];
    if (qualifiedName) {
      // Drop a namespace prefix such as `rsm:` and keep the local name.
      result.rootElement = qualifiedName.slice(qualifiedName.lastIndexOf(':') + 1);
      break;
    }
  }

  // Collect namespace URIs from both double- and single-quoted declarations; a Set
  // keeps each URI once while preserving first-seen order.
  const namespacePattern = /\bxmlns(?::[^=\s"']+)?\s*=\s*(?:"([^"]+)"|'([^']+)')/g;
  const distinctUris = new Set<string>();
  for (const declaration of xml.matchAll(namespacePattern)) {
    const uri = declaration[1] ?? declaration[2];
    if (uri !== undefined) {
      distinctUris.add(uri);
    }
  }
  result.namespaces = [...distinctUris];

  return result;
}
/**
 * Maps a line-item count onto the label of the histogram bucket it belongs to.
 *
 * Every label begins with the bucket's lower bound, so `parseInt(label, 10)` yields a
 * value that sorts the buckets in ascending order.
 *
 * @param count - Number of line items on an invoice.
 * @returns `'0'` for anything below one item (including `NaN`), otherwise one of
 *   `'1'`, `'2-5'`, `'6-10'`, `'11-20'`, `'21-50'`, `'51-100'` or `'100+'`
 *   (the last one covers every count above 100).
 */
function getItemCountBucket(count: number): string {
  // Without this guard an empty or invalid count would be reported as '2-5'
  // (or '100+' for NaN), which misstates the distribution.
  if (!(count >= 1)) return '0';
  if (count === 1) return '1';
  if (count <= 5) return '2-5';
  if (count <= 10) return '6-10';
  if (count <= 20) return '11-20';
  if (count <= 50) return '21-50';
  if (count <= 100) return '51-100';
  return '100+';
}
// Hand control to the tap harness so the test registered above gets executed.
tap.start();