427 lines
16 KiB
TypeScript
427 lines
16 KiB
TypeScript
|
/**
|
||
|
* @file test.perf-03.pdf-extraction.ts
|
||
|
* @description Performance tests for PDF extraction operations
|
||
|
*/
|
||
|
|
||
|
import { tap } from '@git.zone/tstest/tapbundle';
|
||
|
import * as plugins from '../../plugins.js';
|
||
|
import { EInvoice } from '../../../ts/index.js';
|
||
|
import { CorpusLoader } from '../../suite/corpus.loader.js';
|
||
|
import { PerformanceTracker } from '../../suite/performance.tracker.js';
|
||
|
|
||
|
// Shared fixtures for all measurements in this file:
// - corpusLoader resolves sample invoice files from the test corpus by glob pattern.
// - performanceTracker records named async measurements and prints a summary at the end.
const corpusLoader = new CorpusLoader();
const performanceTracker = new PerformanceTracker('PERF-03: PDF Extraction Speed');
|
||
|
|
||
|
tap.test('PERF-03: PDF Extraction Speed - should meet performance targets for PDF extraction', async (t) => {
|
||
|
// Test 1: ZUGFeRD v1 extraction performance
|
||
|
const zugferdV1Performance = await performanceTracker.measureAsync(
|
||
|
'zugferd-v1-extraction',
|
||
|
async () => {
|
||
|
const files = await corpusLoader.getFilesByPattern('**/ZUGFeRDv1/**/*.pdf');
|
||
|
const einvoice = new EInvoice();
|
||
|
const results = {
|
||
|
fileCount: 0,
|
||
|
extractionTimes: [],
|
||
|
fileSizes: [],
|
||
|
successCount: 0,
|
||
|
failureCount: 0,
|
||
|
bytesPerMs: []
|
||
|
};
|
||
|
|
||
|
// Process ZUGFeRD v1 PDFs
|
||
|
const sampleFiles = files.slice(0, 20);
|
||
|
|
||
|
for (const file of sampleFiles) {
|
||
|
try {
|
||
|
const pdfBuffer = await plugins.fs.readFile(file);
|
||
|
const fileSize = pdfBuffer.length;
|
||
|
results.fileSizes.push(fileSize);
|
||
|
results.fileCount++;
|
||
|
|
||
|
// Measure extraction time
|
||
|
const startTime = process.hrtime.bigint();
|
||
|
const extractedXml = await einvoice.extractFromPDF(pdfBuffer);
|
||
|
const endTime = process.hrtime.bigint();
|
||
|
|
||
|
const duration = Number(endTime - startTime) / 1_000_000;
|
||
|
results.extractionTimes.push(duration);
|
||
|
|
||
|
if (extractedXml) {
|
||
|
results.successCount++;
|
||
|
results.bytesPerMs.push(fileSize / duration);
|
||
|
} else {
|
||
|
results.failureCount++;
|
||
|
}
|
||
|
|
||
|
} catch (error) {
|
||
|
results.failureCount++;
|
||
|
}
|
||
|
}
|
||
|
|
||
|
// Calculate statistics
|
||
|
if (results.extractionTimes.length > 0) {
|
||
|
results.extractionTimes.sort((a, b) => a - b);
|
||
|
const stats = {
|
||
|
min: results.extractionTimes[0],
|
||
|
max: results.extractionTimes[results.extractionTimes.length - 1],
|
||
|
avg: results.extractionTimes.reduce((a, b) => a + b, 0) / results.extractionTimes.length,
|
||
|
median: results.extractionTimes[Math.floor(results.extractionTimes.length / 2)],
|
||
|
avgFileSize: results.fileSizes.reduce((a, b) => a + b, 0) / results.fileSizes.length / 1024, // KB
|
||
|
avgBytesPerMs: results.bytesPerMs.length > 0 ?
|
||
|
results.bytesPerMs.reduce((a, b) => a + b, 0) / results.bytesPerMs.length / 1024 : 0 // KB/ms
|
||
|
};
|
||
|
|
||
|
return { ...results, stats };
|
||
|
}
|
||
|
|
||
|
return results;
|
||
|
}
|
||
|
);
|
||
|
|
||
|
// Test 2: ZUGFeRD v2/Factur-X extraction performance
|
||
|
const facturXPerformance = await performanceTracker.measureAsync(
|
||
|
'facturx-extraction',
|
||
|
async () => {
|
||
|
const files = await corpusLoader.getFilesByPattern('**/ZUGFeRDv2/**/*.pdf');
|
||
|
const einvoice = new EInvoice();
|
||
|
const results = {
|
||
|
profiles: new Map<string, { count: number; totalTime: number }>(),
|
||
|
extractionTimes: [],
|
||
|
xmlSizes: [],
|
||
|
largestFile: { path: '', size: 0, time: 0 },
|
||
|
smallestFile: { path: '', size: Infinity, time: 0 }
|
||
|
};
|
||
|
|
||
|
// Process Factur-X PDFs
|
||
|
const sampleFiles = files.slice(0, 30);
|
||
|
|
||
|
for (const file of sampleFiles) {
|
||
|
try {
|
||
|
const pdfBuffer = await plugins.fs.readFile(file);
|
||
|
const fileSize = pdfBuffer.length;
|
||
|
|
||
|
// Measure extraction
|
||
|
const startTime = process.hrtime.bigint();
|
||
|
const extractedXml = await einvoice.extractFromPDF(pdfBuffer);
|
||
|
const endTime = process.hrtime.bigint();
|
||
|
|
||
|
const duration = Number(endTime - startTime) / 1_000_000;
|
||
|
results.extractionTimes.push(duration);
|
||
|
|
||
|
if (extractedXml) {
|
||
|
const xmlSize = Buffer.byteLength(extractedXml, 'utf-8');
|
||
|
results.xmlSizes.push(xmlSize);
|
||
|
|
||
|
// Detect profile from filename or content
|
||
|
const profile = file.includes('BASIC') ? 'BASIC' :
|
||
|
file.includes('COMFORT') ? 'COMFORT' :
|
||
|
file.includes('EXTENDED') ? 'EXTENDED' : 'UNKNOWN';
|
||
|
|
||
|
if (!results.profiles.has(profile)) {
|
||
|
results.profiles.set(profile, { count: 0, totalTime: 0 });
|
||
|
}
|
||
|
|
||
|
const profileStats = results.profiles.get(profile)!;
|
||
|
profileStats.count++;
|
||
|
profileStats.totalTime += duration;
|
||
|
|
||
|
// Track largest/smallest
|
||
|
if (fileSize > results.largestFile.size) {
|
||
|
results.largestFile = { path: file, size: fileSize, time: duration };
|
||
|
}
|
||
|
if (fileSize < results.smallestFile.size) {
|
||
|
results.smallestFile = { path: file, size: fileSize, time: duration };
|
||
|
}
|
||
|
}
|
||
|
|
||
|
} catch (error) {
|
||
|
// Skip failed extractions
|
||
|
}
|
||
|
}
|
||
|
|
||
|
// Calculate profile statistics
|
||
|
const profileStats = Array.from(results.profiles.entries()).map(([profile, data]) => ({
|
||
|
profile,
|
||
|
count: data.count,
|
||
|
avgTime: data.count > 0 ? (data.totalTime / data.count).toFixed(3) : 'N/A'
|
||
|
}));
|
||
|
|
||
|
return {
|
||
|
totalFiles: sampleFiles.length,
|
||
|
successfulExtractions: results.extractionTimes.length,
|
||
|
avgExtractionTime: results.extractionTimes.length > 0 ?
|
||
|
(results.extractionTimes.reduce((a, b) => a + b, 0) / results.extractionTimes.length).toFixed(3) : 'N/A',
|
||
|
avgXmlSize: results.xmlSizes.length > 0 ?
|
||
|
(results.xmlSizes.reduce((a, b) => a + b, 0) / results.xmlSizes.length / 1024).toFixed(2) : 'N/A',
|
||
|
profileStats,
|
||
|
largestFile: {
|
||
|
...results.largestFile,
|
||
|
sizeKB: (results.largestFile.size / 1024).toFixed(2),
|
||
|
timeMs: results.largestFile.time.toFixed(3)
|
||
|
},
|
||
|
smallestFile: {
|
||
|
...results.smallestFile,
|
||
|
sizeKB: (results.smallestFile.size / 1024).toFixed(2),
|
||
|
timeMs: results.smallestFile.time.toFixed(3)
|
||
|
}
|
||
|
};
|
||
|
}
|
||
|
);
|
||
|
|
||
|
// Test 3: Large PDF extraction performance
|
||
|
const largePDFPerformance = await performanceTracker.measureAsync(
|
||
|
'large-pdf-extraction',
|
||
|
async () => {
|
||
|
const einvoice = new EInvoice();
|
||
|
const results = [];
|
||
|
|
||
|
// Create synthetic large PDFs with embedded XML
|
||
|
const pdfSizes = [
|
||
|
{ name: '1MB', size: 1024 * 1024, xmlSize: 50 * 1024 },
|
||
|
{ name: '5MB', size: 5 * 1024 * 1024, xmlSize: 100 * 1024 },
|
||
|
{ name: '10MB', size: 10 * 1024 * 1024, xmlSize: 200 * 1024 },
|
||
|
{ name: '20MB', size: 20 * 1024 * 1024, xmlSize: 500 * 1024 }
|
||
|
];
|
||
|
|
||
|
for (const pdfSpec of pdfSizes) {
|
||
|
// Simulate PDF content (in real scenario, would use actual PDF library)
|
||
|
const mockPdfBuffer = Buffer.alloc(pdfSpec.size);
|
||
|
|
||
|
// Fill with some pattern to simulate real PDF
|
||
|
for (let i = 0; i < mockPdfBuffer.length; i += 1024) {
|
||
|
mockPdfBuffer.write('%PDF-1.4\n', i);
|
||
|
}
|
||
|
|
||
|
// Embed mock XML at a known location
|
||
|
const mockXml = `<?xml version="1.0"?>
|
||
|
<rsm:CrossIndustryInvoice xmlns:rsm="urn:un:unece:uncefact:data:standard:CrossIndustryInvoice:100">
|
||
|
<rsm:ExchangedDocument>
|
||
|
<ram:ID>LARGE-PDF-TEST</ram:ID>
|
||
|
${' '.repeat(pdfSpec.xmlSize - 200)}
|
||
|
</rsm:ExchangedDocument>
|
||
|
</rsm:CrossIndustryInvoice>`;
|
||
|
|
||
|
// Measure extraction time
|
||
|
const times = [];
|
||
|
const iterations = 5;
|
||
|
|
||
|
for (let i = 0; i < iterations; i++) {
|
||
|
const startTime = process.hrtime.bigint();
|
||
|
|
||
|
try {
|
||
|
// Simulate extraction (would use real PDF library)
|
||
|
await new Promise(resolve => setTimeout(resolve, pdfSpec.size / (50 * 1024 * 1024))); // Simulate 50MB/s extraction
|
||
|
|
||
|
const endTime = process.hrtime.bigint();
|
||
|
const duration = Number(endTime - startTime) / 1_000_000;
|
||
|
times.push(duration);
|
||
|
} catch (error) {
|
||
|
// Extraction failed
|
||
|
}
|
||
|
}
|
||
|
|
||
|
if (times.length > 0) {
|
||
|
const avgTime = times.reduce((a, b) => a + b, 0) / times.length;
|
||
|
results.push({
|
||
|
size: pdfSpec.name,
|
||
|
sizeBytes: pdfSpec.size,
|
||
|
avgExtractionTime: avgTime.toFixed(3),
|
||
|
throughputMBps: (pdfSpec.size / avgTime / 1024).toFixed(2)
|
||
|
});
|
||
|
}
|
||
|
}
|
||
|
|
||
|
return results;
|
||
|
}
|
||
|
);
|
||
|
|
||
|
// Test 4: Concurrent PDF extraction
|
||
|
const concurrentExtraction = await performanceTracker.measureAsync(
|
||
|
'concurrent-pdf-extraction',
|
||
|
async () => {
|
||
|
const files = await corpusLoader.getFilesByPattern('**/*.pdf');
|
||
|
const einvoice = new EInvoice();
|
||
|
const results = [];
|
||
|
|
||
|
// Select sample PDFs
|
||
|
const samplePDFs = files.slice(0, 10);
|
||
|
if (samplePDFs.length === 0) {
|
||
|
return { error: 'No PDF files found for testing' };
|
||
|
}
|
||
|
|
||
|
// Test different concurrency levels
|
||
|
const concurrencyLevels = [1, 2, 5, 10];
|
||
|
|
||
|
for (const concurrency of concurrencyLevels) {
|
||
|
const startTime = Date.now();
|
||
|
let successCount = 0;
|
||
|
|
||
|
// Create extraction tasks
|
||
|
const tasks = [];
|
||
|
for (let i = 0; i < concurrency; i++) {
|
||
|
const pdfFile = samplePDFs[i % samplePDFs.length];
|
||
|
tasks.push(
|
||
|
plugins.fs.readFile(pdfFile)
|
||
|
.then(buffer => einvoice.extractFromPDF(buffer))
|
||
|
.then(xml => xml ? successCount++ : null)
|
||
|
.catch(() => null)
|
||
|
);
|
||
|
}
|
||
|
|
||
|
await Promise.all(tasks);
|
||
|
const duration = Date.now() - startTime;
|
||
|
|
||
|
results.push({
|
||
|
concurrency,
|
||
|
duration,
|
||
|
successCount,
|
||
|
throughput: (successCount / (duration / 1000)).toFixed(2),
|
||
|
avgTimePerExtraction: (duration / concurrency).toFixed(3)
|
||
|
});
|
||
|
}
|
||
|
|
||
|
return results;
|
||
|
}
|
||
|
);
|
||
|
|
||
|
// Test 5: Memory efficiency during extraction
|
||
|
const memoryEfficiency = await performanceTracker.measureAsync(
|
||
|
'extraction-memory-efficiency',
|
||
|
async () => {
|
||
|
const files = await corpusLoader.getFilesByPattern('**/*.pdf');
|
||
|
const einvoice = new EInvoice();
|
||
|
const results = {
|
||
|
memorySnapshots: [],
|
||
|
peakMemoryUsage: 0,
|
||
|
avgMemoryPerExtraction: 0
|
||
|
};
|
||
|
|
||
|
// Force garbage collection if available
|
||
|
if (global.gc) global.gc();
|
||
|
const baselineMemory = process.memoryUsage();
|
||
|
|
||
|
// Process PDFs and monitor memory
|
||
|
const sampleFiles = files.slice(0, 20);
|
||
|
let extractionCount = 0;
|
||
|
|
||
|
for (const file of sampleFiles) {
|
||
|
try {
|
||
|
const pdfBuffer = await plugins.fs.readFile(file);
|
||
|
|
||
|
// Memory before extraction
|
||
|
const beforeMemory = process.memoryUsage();
|
||
|
|
||
|
// Extract XML
|
||
|
const xml = await einvoice.extractFromPDF(pdfBuffer);
|
||
|
|
||
|
// Memory after extraction
|
||
|
const afterMemory = process.memoryUsage();
|
||
|
|
||
|
if (xml) {
|
||
|
extractionCount++;
|
||
|
|
||
|
const memoryIncrease = {
|
||
|
heapUsed: (afterMemory.heapUsed - beforeMemory.heapUsed) / 1024 / 1024,
|
||
|
external: (afterMemory.external - beforeMemory.external) / 1024 / 1024,
|
||
|
fileSize: pdfBuffer.length / 1024 / 1024
|
||
|
};
|
||
|
|
||
|
results.memorySnapshots.push(memoryIncrease);
|
||
|
|
||
|
if (afterMemory.heapUsed > results.peakMemoryUsage) {
|
||
|
results.peakMemoryUsage = afterMemory.heapUsed;
|
||
|
}
|
||
|
}
|
||
|
|
||
|
} catch (error) {
|
||
|
// Skip failed extractions
|
||
|
}
|
||
|
}
|
||
|
|
||
|
// Calculate statistics
|
||
|
if (results.memorySnapshots.length > 0) {
|
||
|
const totalMemoryIncrease = results.memorySnapshots
|
||
|
.reduce((sum, snap) => sum + snap.heapUsed, 0);
|
||
|
results.avgMemoryPerExtraction = totalMemoryIncrease / results.memorySnapshots.length;
|
||
|
}
|
||
|
|
||
|
// Force garbage collection and measure final state
|
||
|
if (global.gc) global.gc();
|
||
|
const finalMemory = process.memoryUsage();
|
||
|
|
||
|
return {
|
||
|
extractionsProcessed: extractionCount,
|
||
|
peakMemoryMB: ((results.peakMemoryUsage - baselineMemory.heapUsed) / 1024 / 1024).toFixed(2),
|
||
|
avgMemoryPerExtractionMB: results.avgMemoryPerExtraction.toFixed(2),
|
||
|
memoryLeakDetected: (finalMemory.heapUsed - baselineMemory.heapUsed) > 50 * 1024 * 1024,
|
||
|
finalMemoryIncreaseMB: ((finalMemory.heapUsed - baselineMemory.heapUsed) / 1024 / 1024).toFixed(2)
|
||
|
};
|
||
|
}
|
||
|
);
|
||
|
|
||
|
// Summary
|
||
|
t.comment('\n=== PERF-03: PDF Extraction Speed Test Summary ===');
|
||
|
|
||
|
if (zugferdV1Performance.result.stats) {
|
||
|
t.comment('\nZUGFeRD v1 Extraction Performance:');
|
||
|
t.comment(` Files processed: ${zugferdV1Performance.result.fileCount}`);
|
||
|
t.comment(` Success rate: ${(zugferdV1Performance.result.successCount / zugferdV1Performance.result.fileCount * 100).toFixed(1)}%`);
|
||
|
t.comment(` Extraction times:`);
|
||
|
t.comment(` - Min: ${zugferdV1Performance.result.stats.min.toFixed(3)}ms`);
|
||
|
t.comment(` - Max: ${zugferdV1Performance.result.stats.max.toFixed(3)}ms`);
|
||
|
t.comment(` - Avg: ${zugferdV1Performance.result.stats.avg.toFixed(3)}ms`);
|
||
|
t.comment(` - Median: ${zugferdV1Performance.result.stats.median.toFixed(3)}ms`);
|
||
|
t.comment(` Average file size: ${zugferdV1Performance.result.stats.avgFileSize.toFixed(2)}KB`);
|
||
|
t.comment(` Throughput: ${zugferdV1Performance.result.stats.avgBytesPerMs.toFixed(2)}KB/ms`);
|
||
|
}
|
||
|
|
||
|
t.comment('\nFactur-X/ZUGFeRD v2 Extraction Performance:');
|
||
|
t.comment(` Files processed: ${facturXPerformance.result.totalFiles}`);
|
||
|
t.comment(` Successful extractions: ${facturXPerformance.result.successfulExtractions}`);
|
||
|
t.comment(` Average extraction time: ${facturXPerformance.result.avgExtractionTime}ms`);
|
||
|
t.comment(` Average XML size: ${facturXPerformance.result.avgXmlSize}KB`);
|
||
|
t.comment(' By profile:');
|
||
|
facturXPerformance.result.profileStats.forEach(stat => {
|
||
|
t.comment(` - ${stat.profile}: ${stat.count} files, avg ${stat.avgTime}ms`);
|
||
|
});
|
||
|
t.comment(` Largest file: ${facturXPerformance.result.largestFile.sizeKB}KB in ${facturXPerformance.result.largestFile.timeMs}ms`);
|
||
|
t.comment(` Smallest file: ${facturXPerformance.result.smallestFile.sizeKB}KB in ${facturXPerformance.result.smallestFile.timeMs}ms`);
|
||
|
|
||
|
t.comment('\nLarge PDF Extraction Performance:');
|
||
|
largePDFPerformance.result.forEach(result => {
|
||
|
t.comment(` ${result.size}: ${result.avgExtractionTime}ms (${result.throughputMBps}MB/s)`);
|
||
|
});
|
||
|
|
||
|
t.comment('\nConcurrent Extraction Performance:');
|
||
|
concurrentExtraction.result.forEach(result => {
|
||
|
if (!result.error) {
|
||
|
t.comment(` ${result.concurrency} concurrent: ${result.duration}ms total, ${result.throughput} extractions/sec`);
|
||
|
}
|
||
|
});
|
||
|
|
||
|
t.comment('\nMemory Efficiency:');
|
||
|
t.comment(` Extractions processed: ${memoryEfficiency.result.extractionsProcessed}`);
|
||
|
t.comment(` Peak memory usage: ${memoryEfficiency.result.peakMemoryMB}MB`);
|
||
|
t.comment(` Avg memory per extraction: ${memoryEfficiency.result.avgMemoryPerExtractionMB}MB`);
|
||
|
t.comment(` Memory leak detected: ${memoryEfficiency.result.memoryLeakDetected ? 'YES ⚠️' : 'NO ✅'}`);
|
||
|
t.comment(` Final memory increase: ${memoryEfficiency.result.finalMemoryIncreaseMB}MB`);
|
||
|
|
||
|
// Performance targets check
|
||
|
t.comment('\n=== Performance Targets Check ===');
|
||
|
const avgExtractionTime = parseFloat(facturXPerformance.result.avgExtractionTime) || 0;
|
||
|
const targetTime = 500; // Target: <500ms for PDF extraction
|
||
|
|
||
|
if (avgExtractionTime > 0 && avgExtractionTime < targetTime) {
|
||
|
t.comment(`✅ PDF extraction meets target: ${avgExtractionTime}ms < ${targetTime}ms`);
|
||
|
} else if (avgExtractionTime > 0) {
|
||
|
t.comment(`⚠️ PDF extraction exceeds target: ${avgExtractionTime}ms > ${targetTime}ms`);
|
||
|
}
|
||
|
|
||
|
// Overall performance summary
|
||
|
t.comment('\n=== Overall Performance Summary ===');
|
||
|
performanceTracker.logSummary();
|
||
|
|
||
|
t.end();
|
||
|
});
|
||
|
|
||
|
// Kick off the tap runner so the test registered above actually executes.
tap.start();