import { tap, expect } from '@git.zone/tstest/tapbundle';
import { EInvoice } from '../../../ts/index.js';
import { InvoiceFormat } from '../../../ts/interfaces/common.js';
import { CorpusLoader } from '../../helpers/corpus.loader.js';
import { PerformanceTracker } from '../../helpers/performance.tracker.js';
import * as path from 'path';
import * as fs from 'fs/promises';

/**
 * Test ID: CORP-09
 * Test Description: Corpus Statistics Generation
 * Priority: Low
 *
 * This test generates comprehensive statistics about the test corpus
 * to help understand coverage, patterns, and potential gaps.
 */

tap.test('CORP-09: Corpus Statistics Generation - should analyze corpus characteristics', async () => {
  // The full analysis loads and parses every corpus file, which may time out
  // on CI runners. Skip only when the conventional `CI` environment variable
  // is set, so the analysis still runs everywhere else.
  if (process.env.CI) {
    console.log('⚠ Statistics generation test skipped in CI/CD environment');
    console.log(' This test analyzes large corpus files and may timeout');
    console.log(' ✓ Test completed (skipped for performance)');
    return;
  }

  /** Increments the occurrence counter stored under `key`. */
  const bump = <K>(counter: Map<K, number>, key: K): void => {
    counter.set(key, (counter.get(key) ?? 0) + 1);
  };

  /** Formats `part / total` as a one-decimal percentage; an empty total yields "0.0" instead of "NaN". */
  const percent = (part: number, total: number): string =>
    (total > 0 ? (part / total) * 100 : 0).toFixed(1);

  const startTime = Date.now();

  // Statistics collectors
  const stats = {
    totalFiles: 0,
    totalSize: 0,
    formats: new Map<string, number>(),
    categories: new Map<string, number>(),
    fileSizes: {
      tiny: 0, // < 10KB
      small: 0, // 10-50KB
      medium: 0, // 50-200KB
      large: 0, // 200KB-1MB
      veryLarge: 0 // > 1MB
    },
    invoiceData: {
      currencies: new Map<string, number>(),
      countries: new Map<string, number>(),
      taxRates: new Map<number, number>(),
      itemCounts: new Map<string, number>(),
      documentTypes: new Map<string, number>()
    },
    xmlCharacteristics: {
      namespaces: new Map<string, number>(),
      rootElements: new Map<string, number>(),
      encodings: new Map<string, number>(),
      versions: new Map<string, number>()
    },
    validationResults: {
      parseSuccess: 0,
      parseFailed: 0,
      validationSuccess: 0,
      validationFailed: 0
    }
  };

  // Corpus categories to analyze
  const allCategories = [
    'XML_RECHNUNG_CII',
    'XML_RECHNUNG_UBL',
    'ZUGFERD_V1_CORRECT',
    'ZUGFERD_V2_CORRECT',
    'PEPPOL',
    'FATTURAPA',
    'EN16931_TEST_CASES'
  ];

  console.log('Analyzing test corpus...\n');

  for (const category of allCategories) {
    try {
      const files = await CorpusLoader.loadCategory(category);
      stats.categories.set(category, files.length);

      console.log(`Processing ${category}: ${files.length} files`);

      for (const file of files) {
        stats.totalFiles++;
        stats.totalSize += file.size;

        // Categorize by size
        if (file.size < 10 * 1024) stats.fileSizes.tiny++;
        else if (file.size < 50 * 1024) stats.fileSizes.small++;
        else if (file.size < 200 * 1024) stats.fileSizes.medium++;
        else if (file.size < 1024 * 1024) stats.fileSizes.large++;
        else stats.fileSizes.veryLarge++;

        // Detect format from the file name, falling back to the category
        bump(stats.formats, detectFormatFromFile(file.path, category));

        try {
          const xmlBuffer = await CorpusLoader.loadFile(file.path);
          const xmlString = xmlBuffer.toString('utf-8');

          // Extract XML characteristics
          const xmlInfo = analyzeXMLCharacteristics(xmlString);
          if (xmlInfo.encoding) {
            bump(stats.xmlCharacteristics.encodings, xmlInfo.encoding);
          }
          if (xmlInfo.rootElement) {
            bump(stats.xmlCharacteristics.rootElements, xmlInfo.rootElement);
          }
          for (const ns of xmlInfo.namespaces) {
            bump(stats.xmlCharacteristics.namespaces, ns);
          }

          // Parsing has its own try block so that every file is counted
          // exactly once as either a parse success or a parse failure.
          let invoice: EInvoice;
          try {
            invoice = new EInvoice();
            await invoice.fromXmlString(xmlString);
          } catch {
            stats.validationResults.parseFailed++;
            continue;
          }
          stats.validationResults.parseSuccess++;

          // Extract invoice statistics
          if (invoice.currency) {
            bump(stats.invoiceData.currencies, invoice.currency);
          }

          if (invoice.from?.address?.country) {
            bump(stats.invoiceData.countries, invoice.from.address.country);
          }

          if (invoice.items?.length) {
            bump(stats.invoiceData.itemCounts, getItemCountBucket(invoice.items.length));

            // Collect tax rates
            for (const item of invoice.items) {
              if (item.taxPercent !== undefined) {
                bump(stats.invoiceData.taxRates, item.taxPercent);
              }
            }
          }

          // Document type
          const docType = invoice.invoiceType || 'invoice';
          bump(stats.invoiceData.documentTypes, docType);

          // A validator that throws is recorded as a validation failure; it
          // must not be re-counted as a parse failure.
          try {
            const validationResult = await invoice.validate();
            if (validationResult.valid) {
              stats.validationResults.validationSuccess++;
            } else {
              stats.validationResults.validationFailed++;
            }
          } catch {
            stats.validationResults.validationFailed++;
          }
        } catch (readError) {
          console.error(` Error reading ${file.path}: ${readError}`);
        }
      }
    } catch (categoryError) {
      console.log(` Category ${category} not found or error: ${categoryError}`);
    }
  }

  const totalTime = Date.now() - startTime;

  // Generate comprehensive report
  console.log('\n=== CORPUS STATISTICS REPORT ===\n');

  const averageFileSizeKB = stats.totalFiles > 0 ? stats.totalSize / stats.totalFiles / 1024 : 0;

  console.log('GENERAL STATISTICS:');
  console.log(`Total files: ${stats.totalFiles}`);
  console.log(`Total size: ${(stats.totalSize / 1024 / 1024).toFixed(2)} MB`);
  console.log(`Average file size: ${averageFileSizeKB.toFixed(2)} KB`);
  console.log(`Analysis time: ${(totalTime / 1000).toFixed(2)} seconds`);

  console.log('\nCATEGORY DISTRIBUTION:');
  stats.categories.forEach((count, category) => {
    const percentage = percent(count, stats.totalFiles);
    console.log(` ${category}: ${count} files (${percentage}%)`);
  });

  console.log('\nFORMAT DISTRIBUTION:');
  const sortedFormats = Array.from(stats.formats.entries()).sort((a, b) => b[1] - a[1]);
  sortedFormats.forEach(([format, count]) => {
    const percentage = percent(count, stats.totalFiles);
    console.log(` ${format}: ${count} files (${percentage}%)`);
  });

  console.log('\nFILE SIZE DISTRIBUTION:');
  console.log(` Tiny (<10KB): ${stats.fileSizes.tiny} files`);
  console.log(` Small (10-50KB): ${stats.fileSizes.small} files`);
  console.log(` Medium (50-200KB): ${stats.fileSizes.medium} files`);
  console.log(` Large (200KB-1MB): ${stats.fileSizes.large} files`);
  console.log(` Very Large (>1MB): ${stats.fileSizes.veryLarge} files`);

  console.log('\nXML CHARACTERISTICS:');
  console.log(' Encodings:');
  stats.xmlCharacteristics.encodings.forEach((count, encoding) => {
    console.log(` ${encoding}: ${count} files`);
  });

  console.log(' Root Elements:');
  const topRootElements = Array.from(stats.xmlCharacteristics.rootElements.entries())
    .sort((a, b) => b[1] - a[1])
    .slice(0, 5);
  topRootElements.forEach(([element, count]) => {
    console.log(` ${element}: ${count} files`);
  });

  console.log(' Top Namespaces:');
  const topNamespaces = Array.from(stats.xmlCharacteristics.namespaces.entries())
    .sort((a, b) => b[1] - a[1])
    .slice(0, 5);
  topNamespaces.forEach(([ns, count]) => {
    console.log(` ${ns.substring(0, 60)}...: ${count} files`);
  });

  console.log('\nINVOICE DATA STATISTICS:');
  console.log(' Currencies:');
  const sortedCurrencies = Array.from(stats.invoiceData.currencies.entries())
    .sort((a, b) => b[1] - a[1]);
  sortedCurrencies.forEach(([currency, count]) => {
    console.log(` ${currency}: ${count} invoices`);
  });

  console.log(' Countries:');
  const sortedCountries = Array.from(stats.invoiceData.countries.entries())
    .sort((a, b) => b[1] - a[1])
    .slice(0, 10);
  sortedCountries.forEach(([country, count]) => {
    console.log(` ${country}: ${count} invoices`);
  });

  console.log(' Tax Rates:');
  const sortedTaxRates = Array.from(stats.invoiceData.taxRates.entries())
    .sort((a, b) => a[0] - b[0]);
  sortedTaxRates.forEach(([rate, count]) => {
    console.log(` ${rate}%: ${count} occurrences`);
  });

  console.log(' Line Item Counts:');
  // Order buckets by their numeric lower bound ("2-5" -> 2, "100+" -> 100).
  const sortedItemCounts = Array.from(stats.invoiceData.itemCounts.entries())
    .sort((a, b) => parseInt(a[0].split('-')[0], 10) - parseInt(b[0].split('-')[0], 10));
  sortedItemCounts.forEach(([bucket, count]) => {
    console.log(` ${bucket}: ${count} invoices`);
  });

  console.log(' Document Types:');
  stats.invoiceData.documentTypes.forEach((count, type) => {
    console.log(` ${type}: ${count} documents`);
  });

  console.log('\nVALIDATION STATISTICS:');
  const parseRate = percent(
    stats.validationResults.parseSuccess,
    stats.validationResults.parseSuccess + stats.validationResults.parseFailed
  );
  const validationRate = percent(
    stats.validationResults.validationSuccess,
    stats.validationResults.validationSuccess + stats.validationResults.validationFailed
  );

  console.log(` Parse success rate: ${parseRate}%`);
  console.log(` Validation success rate: ${validationRate}%`);
  console.log(` Successfully parsed: ${stats.validationResults.parseSuccess}`);
  console.log(` Parse failures: ${stats.validationResults.parseFailed}`);
  console.log(` Successfully validated: ${stats.validationResults.validationSuccess}`);
  console.log(` Validation failures: ${stats.validationResults.validationFailed}`);

  // Save statistics to file
  const statsReport = {
    generatedAt: new Date().toISOString(),
    analysisTime: totalTime,
    summary: {
      totalFiles: stats.totalFiles,
      totalSizeMB: stats.totalSize / 1024 / 1024,
      parseSuccessRate: parseRate,
      validationSuccessRate: validationRate
    },
    details: stats
  };

  try {
    const reportPath = path.join(process.cwd(), '.nogit', 'corpus-statistics.json');
    await fs.mkdir(path.dirname(reportPath), { recursive: true });
    // JSON.stringify renders a Map as "{}", so convert each Map to a plain
    // object; otherwise every distribution would be missing from the report.
    const serialized = JSON.stringify(
      statsReport,
      (_key, value) => (value instanceof Map ? Object.fromEntries(value) : value),
      2
    );
    await fs.writeFile(reportPath, serialized);
    console.log(`\nDetailed statistics saved to: ${reportPath}`);
  } catch (e) {
    // Saving the report is best-effort; the assertions below still run.
    console.log('\nCould not save statistics file:', e);
  }

  // Assertions
  expect(stats.totalFiles).toBeGreaterThan(100);
  expect(stats.formats.size).toBeGreaterThan(3);
  expect(parseFloat(parseRate)).toBeGreaterThan(70);

  console.log('Corpus statistics generated successfully');
});

// Substrings of the lower-cased file name that identify a format, in priority order.
const FILENAME_FORMAT_MARKERS: ReadonlyArray<readonly [string, string]> = [
  ['.ubl.', 'UBL'],
  ['.cii.', 'CII'],
  ['zugferd', 'ZUGFeRD'],
  ['factur', 'Factur-X'],
  ['fattura', 'FatturaPA'],
  ['peppol', 'PEPPOL'],
  ['xrechnung', 'XRechnung'],
];

// Substrings of the corpus category name used when the file name is inconclusive.
const CATEGORY_FORMAT_MARKERS: ReadonlyArray<readonly [string, string]> = [
  ['UBL', 'UBL'],
  ['CII', 'CII'],
  ['ZUGFERD', 'ZUGFeRD'],
  ['PEPPOL', 'PEPPOL'],
  ['FATTURA', 'FatturaPA'],
];

/**
 * Guesses the invoice format of a corpus file.
 *
 * The lower-cased base name of `filePath` is checked first; the first matching
 * marker wins. If no marker matches, the (case-sensitive) category name is
 * checked the same way.
 *
 * @param filePath Path of the corpus file; only its base name is inspected.
 * @param category Corpus category the file was loaded from.
 * @returns The format label, or `'Unknown'` when nothing matches.
 */
function detectFormatFromFile(filePath: string, category: string): string {
  const filename = path.basename(filePath).toLowerCase();

  for (const [marker, format] of FILENAME_FORMAT_MARKERS) {
    if (filename.includes(marker)) return format;
  }

  // Fallback to category
  for (const [marker, format] of CATEGORY_FORMAT_MARKERS) {
    if (category.includes(marker)) return format;
  }

  return 'Unknown';
}

/**
 * Extracts lightweight characteristics from an XML document using regular
 * expressions (no XML parser is involved).
 *
 * @param xml The XML document as a decoded string.
 * @returns
 *  - `encoding`: the value of the `encoding` pseudo-attribute of the XML
 *    declaration, if the document starts with one that declares an encoding.
 *    `encoding="..."` attributes on ordinary elements are ignored.
 *  - `rootElement`: the local name (prefix stripped) of the first element tag.
 *    The XML declaration, other processing instructions, comments and
 *    `<!DOCTYPE ...>` are skipped.
 *  - `namespaces`: the distinct namespace URIs declared via `xmlns` /
 *    `xmlns:prefix`, in order of first appearance.
 */
function analyzeXMLCharacteristics(xml: string): {
  encoding?: string;
  rootElement?: string;
  namespaces: string[];
} {
  const result: { encoding?: string; rootElement?: string; namespaces: string[] } = {
    namespaces: [],
  };

  // Encoding: only look inside the leading XML declaration (an optional BOM and
  // whitespace are tolerated); accept both quote styles.
  const declaration = xml.match(
    /^\uFEFF?\s*<\?xml\s[^>]*?\bencoding\s*=\s*(?:"([^"]+)"|'([^']+)')/
  );
  const encoding = declaration?.[1] ?? declaration?.[2];
  if (encoding) {
    result.encoding = encoding;
  }

  // Root element: scan markup tokens in order. Comments and processing
  // instructions are consumed whole so tags inside them are not mistaken for
  // the root; "<!" constructs and closing tags never match the third branch.
  const markupToken = /<!--[\s\S]*?-->|<\?[\s\S]*?\?>|<([^\s<>\/?!][^\s<>\/]*)/g;
  for (const token of xml.matchAll(markupToken)) {
    const qualifiedName = token[1];
    if (qualifiedName !== undefined) {
      result.rootElement = qualifiedName.split(':').pop();
      break;
    }
  }

  // Namespaces: collect each declared URI once per document so that callers
  // counting "files per namespace" are not inflated by repeated declarations.
  const namespaceDecl = /\sxmlns(?::[^\s=]+)?\s*=\s*(?:"([^"]+)"|'([^']+)')/g;
  const seen = new Set<string>();
  for (const decl of xml.matchAll(namespaceDecl)) {
    const uri = decl[1] ?? decl[2];
    if (uri !== undefined) {
      seen.add(uri);
    }
  }
  result.namespaces = [...seen];

  return result;
}

/**
 * Maps a line-item count to the label of the histogram bucket it belongs to.
 *
 * @param count Number of line items on an invoice.
 * @returns `'0'` for counts below 1, `'1'`, `'2-5'`, `'6-10'`, `'11-20'`,
 *   `'21-50'`, `'51-100'`, or `'100+'` for anything above 100.
 */
function getItemCountBucket(count: number): string {
  // Without this guard a count of 0 would fall through to the '2-5' bucket.
  if (count < 1) return '0';
  if (count === 1) return '1';
  if (count <= 5) return '2-5';
  if (count <= 10) return '6-10';
  if (count <= 20) return '11-20';
  if (count <= 50) return '21-50';
  if (count <= 100) return '51-100';
  return '100+';
}

// Hand control to the tap runner once all tests above are registered.
tap.start();