320 lines
10 KiB
TypeScript
320 lines
10 KiB
TypeScript
import { expect, tap } from '@git.zone/tstest/tapbundle';
|
|
import { promises as fs } from 'fs';
|
|
import * as path from 'path';
|
|
import { CorpusLoader } from '../../helpers/corpus.loader.js';
|
|
import { PerformanceTracker } from '../../helpers/performance.tracker.js';
|
|
|
|
/**
 * PDF-01 / ZUGFeRD v1: runs `EInvoice.fromPdf` over the first five PDFs of the
 * `ZUGFERD_V1_CORRECT` corpus category and checks that each yields a non-trivial XML payload.
 *
 * Per-file failures (including failed assertions inside the loop body) are caught, counted and
 * logged instead of aborting the run; the test as a whole only requires at least one success.
 */
tap.test('PDF-01: XML Extraction from ZUGFeRD PDFs - should extract XML from ZUGFeRD v1 PDFs', async () => {
  // Get ZUGFeRD v1 PDF files from corpus
  const zugferdV1Files = await CorpusLoader.getFiles('ZUGFERD_V1_CORRECT');
  const pdfFiles = zugferdV1Files.filter(f => f.endsWith('.pdf'));

  console.log(`Testing XML extraction from ${pdfFiles.length} ZUGFeRD v1 PDFs`);

  let successCount = 0;
  let failCount = 0;
  const results: { file: string; success: boolean; format?: string; size?: number; error?: string }[] = [];

  // Import required classes
  const { EInvoice } = await import('../../../ts/index.js');

  for (const filePath of pdfFiles.slice(0, 5)) { // Test first 5 for performance
    const fileName = path.basename(filePath);

    try {
      // Read PDF file
      const pdfBuffer = await fs.readFile(filePath);

      // Track performance of PDF extraction
      const { result: einvoice, metric } = await PerformanceTracker.track(
        'pdf-extraction-v1',
        async () => {
          return await EInvoice.fromPdf(pdfBuffer);
        },
        {
          file: fileName,
          size: pdfBuffer.length
        }
      );

      // Verify extraction succeeded
      expect(einvoice).toBeTruthy();
      const xml = einvoice.getXml ? einvoice.getXml() : '';
      expect(xml).toBeTruthy();
      expect(xml.length).toBeGreaterThan(100);

      // Check format detection
      const format = einvoice.getFormat ? einvoice.getFormat() : 'unknown';

      successCount++;
      results.push({
        file: fileName,
        success: true,
        format: format.toString(),
        size: xml.length
      });

      console.log(`✓ ${fileName}: Extracted ${xml.length} bytes, format: ${format} (${metric.duration.toFixed(2)}ms)`);

      // Verify basic invoice data (if available). Both checks are guarded by the same
      // truthiness test they assert, so they only run when the data is present.
      if (einvoice.id) {
        expect(einvoice.id).toBeTruthy();
      }
      if (einvoice.from && einvoice.from.name) {
        expect(einvoice.from.name).toBeTruthy();
      }
    } catch (error) {
      // The catch variable is `unknown` under strict settings and may not be an Error at all;
      // narrow before reading `.message` so non-Error throws are still reported usefully.
      const message = error instanceof Error ? error.message : String(error);

      failCount++;
      results.push({
        file: fileName,
        success: false,
        error: message
      });

      console.log(`✗ ${fileName}: ${message}`);
    }
  }

  console.log(`\nZUGFeRD v1 Extraction Summary: ${successCount} succeeded, ${failCount} failed`);

  // Show results summary
  const formatCounts: Record<string, number> = {};
  for (const entry of results) {
    // Narrowing on `entry.format` here removes the need for non-null assertions.
    if (entry.success && entry.format) {
      formatCounts[entry.format] = (formatCounts[entry.format] ?? 0) + 1;
    }
  }

  if (Object.keys(formatCounts).length > 0) {
    console.log('Format distribution:', formatCounts);
  }

  // Performance summary
  const perfSummary = await PerformanceTracker.getSummary('pdf-extraction-v1');
  if (perfSummary) {
    console.log(`\nExtraction Performance:`);
    console.log(`  Average: ${perfSummary.average.toFixed(2)}ms`);
    console.log(`  Min: ${perfSummary.min.toFixed(2)}ms`);
    console.log(`  Max: ${perfSummary.max.toFixed(2)}ms`);
    console.log(`  P95: ${perfSummary.p95.toFixed(2)}ms`);
  }

  // Expect at least some success (ZUGFeRD PDFs should extract)
  expect(successCount).toBeGreaterThan(0);
});
|
|
|
|
/**
 * PDF-01 / ZUGFeRD v2 & Factur-X: runs `EInvoice.fromPdf` over the first eight PDFs of the
 * `ZUGFERD_V2_CORRECT` corpus category and checks that the extracted XML mentions
 * `CrossIndustryInvoice`.
 *
 * The profile is taken from the file name only (not from the XML). Per-file failures are logged
 * and skipped; the test requires at least one successful extraction overall.
 */
tap.test('PDF-01: XML Extraction from ZUGFeRD v2/Factur-X PDFs - should extract XML from v2 PDFs', async () => {
  // Get ZUGFeRD v2 PDF files from corpus
  const zugferdV2Files = await CorpusLoader.getFiles('ZUGFERD_V2_CORRECT');
  const pdfFiles = zugferdV2Files.filter(f => f.endsWith('.pdf'));

  console.log(`Testing XML extraction from ${pdfFiles.length} ZUGFeRD v2/Factur-X PDFs`);

  const profileStats: Record<string, number> = {};
  let successCount = 0;

  const { EInvoice } = await import('../../../ts/index.js');

  for (const filePath of pdfFiles.slice(0, 8)) { // Test first 8
    const fileName = path.basename(filePath);

    try {
      // Read PDF file
      const pdfBuffer = await fs.readFile(filePath);

      const { result: einvoice, metric } = await PerformanceTracker.track(
        'pdf-extraction-v2',
        async () => {
          return await EInvoice.fromPdf(pdfBuffer);
        },
        {
          file: fileName,
          size: pdfBuffer.length
        }
      );

      // Extract profile from filename if present
      const profileMatch = fileName.match(/(BASIC|COMFORT|EXTENDED|MINIMUM|EN16931)/i);
      const profile = profileMatch ? profileMatch[1].toUpperCase() : 'UNKNOWN';
      // Counted as soon as extraction returns, i.e. before the XML assertions below run.
      profileStats[profile] = (profileStats[profile] ?? 0) + 1;

      const format = einvoice.getFormat ? einvoice.getFormat() : 'unknown';
      console.log(`✓ ${fileName}: Profile ${profile}, Format ${format} (${metric.duration.toFixed(2)}ms)`);

      // Test that we can access the XML
      const xml = einvoice.getXml ? einvoice.getXml() : '';
      expect(xml).toBeTruthy();
      expect(xml).toContain('CrossIndustryInvoice'); // Should be CII format

      successCount++;
    } catch (error) {
      // Narrow the `unknown` catch variable before reading `.message`.
      const message = error instanceof Error ? error.message : String(error);
      console.log(`✗ ${fileName}: ${message}`);
    }
  }

  console.log(`\nZUGFeRD v2/Factur-X Extraction Summary: ${successCount} succeeded`);
  console.log('Profile distribution:', profileStats);

  // Performance summary
  const perfSummary = await PerformanceTracker.getSummary('pdf-extraction-v2');
  if (perfSummary) {
    console.log(`\nV2 Extraction Performance:`);
    console.log(`  Average: ${perfSummary.average.toFixed(2)}ms`);
    console.log(`  Min: ${perfSummary.min.toFixed(2)}ms`);
    console.log(`  Max: ${perfSummary.max.toFixed(2)}ms`);
    console.log(`  P95: ${perfSummary.p95.toFixed(2)}ms`);
  }

  expect(successCount).toBeGreaterThan(0);
});
|
|
|
|
/**
 * PDF-01 / error handling: `EInvoice.fromPdf` must reject an empty buffer, plain text, and a
 * buffer that only carries a PDF header. A syntactically minimal PDF without embedded XML may
 * either be processed or rejected; both outcomes are accepted.
 */
tap.test('PDF-01: PDF Extraction Error Handling - should handle invalid PDFs gracefully', async () => {
  const { EInvoice } = await import('../../../ts/index.js');

  /**
   * Asserts that `EInvoice.fromPdf(input)` throws an error with a non-empty message.
   *
   * The "did not throw" failure is raised outside the try/catch on purpose: raising it inside
   * the `try` would let the adjacent `catch` swallow it and report the case as handled.
   */
  const expectExtractionToThrow = async (
    input: Uint8Array,
    successLog: string,
    notThrownMessage: string
  ): Promise<void> => {
    let threw = false;
    let caught: unknown;

    try {
      await EInvoice.fromPdf(input);
    } catch (error) {
      threw = true;
      caught = error;
    }

    if (!threw) {
      throw new Error(notThrownMessage);
    }

    console.log(successLog);
    expect(caught instanceof Error ? caught.message : String(caught)).toBeTruthy();
  };

  // Test with empty buffer
  await expectExtractionToThrow(
    new Uint8Array(0),
    '✓ Empty PDF error handled correctly',
    'Should have thrown an error for empty PDF'
  );

  // Test with non-PDF data
  await expectExtractionToThrow(
    Buffer.from('This is not a PDF file'),
    '✓ Non-PDF data error handled correctly',
    'Should have thrown an error for non-PDF data'
  );

  // Test with corrupted PDF header
  await expectExtractionToThrow(
    Buffer.from('%PDF-1.4\nCorrupted content'),
    '✓ Corrupted PDF error handled correctly',
    'Should have thrown an error for corrupted PDF'
  );

  // Test with valid PDF but no embedded XML
  const minimalPdf = createMinimalTestPDF();
  try {
    await EInvoice.fromPdf(minimalPdf);
    console.log('○ Minimal PDF processed (may or may not have XML)');
  } catch (error) {
    console.log('✓ PDF without XML handled correctly');
    expect(error instanceof Error ? error.message : String(error)).toBeTruthy();
  }
});
|
|
|
|
/**
 * PDF-01 / expected failures: feeds every PDF of the `ZUGFERD_V1_FAIL` corpus category to
 * `EInvoice.fromPdf` and tallies how many throw.
 *
 * Unexpected successes are only logged. When the category contains at least one PDF, the test
 * requires at least one of them to fail.
 */
tap.test('PDF-01: Failed PDF Extraction - should handle PDFs without XML gracefully', async () => {
  // Get files expected to fail
  const failPdfs = await CorpusLoader.getFiles('ZUGFERD_V1_FAIL');
  const pdfFailFiles = failPdfs.filter(f => f.endsWith('.pdf'));

  console.log(`Testing ${pdfFailFiles.length} PDFs expected to fail`);

  const { EInvoice } = await import('../../../ts/index.js');
  let expectedFailures = 0;
  let unexpectedSuccesses = 0;

  for (const filePath of pdfFailFiles) {
    const fileName = path.basename(filePath);

    try {
      const pdfBuffer = await fs.readFile(filePath);

      // Only whether this call throws matters here; its result is intentionally discarded.
      await PerformanceTracker.track(
        'pdf-extraction-fail',
        async () => {
          return await EInvoice.fromPdf(pdfBuffer);
        }
      );

      unexpectedSuccesses++;
      console.log(`○ ${fileName}: Unexpectedly succeeded (might have XML)`);
    } catch (error) {
      // Narrow the `unknown` catch variable before reading `.message`.
      const message = error instanceof Error ? error.message : String(error);

      expectedFailures++;
      console.log(`✓ ${fileName}: Correctly failed - ${message}`);
    }
  }

  console.log(`\nFail Test Summary: ${expectedFailures} expected failures, ${unexpectedSuccesses} unexpected successes`);

  // Most files in fail directory should fail
  if (pdfFailFiles.length > 0) {
    expect(expectedFailures).toBeGreaterThan(0);
  }
});
|
|
|
|
/**
 * PDF-01 / large input: hands `EInvoice.fromPdf` a 1MB zero-filled buffer that starts with a
 * PDF header and checks that the call finishes within five seconds. If the tracker reports a
 * positive memory figure, it must stay below twice the input size.
 */
tap.test('PDF-01: Large PDF Performance - should handle large PDFs efficiently', async () => {
  const { EInvoice } = await import('../../../ts/index.js');

  const toMegabytes = (bytes: number): number => bytes / 1024 / 1024;

  // Create a larger test PDF (1MB): zero-filled, with a simple PDF header at the start
  const largePdfSize = 1024 * 1024; // 1MB
  const largePdfBuffer = Buffer.alloc(largePdfSize);
  Buffer.from('%PDF-1.4\n').copy(largePdfBuffer);

  console.log(`Testing with ${toMegabytes(largePdfSize).toFixed(1)}MB PDF`);

  const processLargePdf = async () => {
    try {
      await EInvoice.fromPdf(largePdfBuffer);
    } catch {
      // Expected to fail since it's not a real PDF with XML
      return 'failed';
    }
    return 'success';
  };

  const tracked = await PerformanceTracker.track('large-pdf-processing', processLargePdf);
  const metric = tracked.metric;

  console.log(`✓ Large PDF processed in ${metric.duration.toFixed(2)}ms`);
  expect(metric.duration).toBeLessThan(5000); // Should fail fast, not hang

  // Test memory usage (reported in MB; 0 when the tracker supplies no memory data)
  let memoryUsed = 0;
  if (metric.memory) {
    memoryUsed = toMegabytes(metric.memory.used);
  }
  console.log(`Memory usage: ${memoryUsed.toFixed(2)}MB`);

  if (memoryUsed > 0) {
    // Should not use more than 2x file size
    expect(memoryUsed).toBeLessThan(toMegabytes(largePdfSize) * 2);
  }
});
|
|
|
|
// Helper function to create a minimal test PDF
|
|
/**
 * Builds a minimal single-page PDF (catalog, page tree, one empty page) with no embedded files.
 *
 * The cross-reference offsets and the `startxref` value are computed from the generated content
 * rather than hard-coded, so they always point at the real object and `xref` positions. Each
 * cross-reference entry is written as exactly 20 bytes (`offset generation kind` + space + LF).
 *
 * @returns The PDF bytes as a fresh `Uint8Array`.
 */
function createMinimalTestPDF(): Uint8Array {
  // Object N (1-based) is the dictionary at index N - 1.
  const objectDictionaries = [
    '<< /Type /Catalog /Pages 2 0 R >>',
    '<< /Type /Pages /Kids [3 0 R] /Count 1 >>',
    '<< /Type /Page /Parent 2 0 R /MediaBox [0 0 612 792] /Resources << >> >>',
  ];

  let pdfContent = '%PDF-1.4\n';

  // Record the byte offset at which each indirect object starts.
  const objectOffsets: number[] = [];
  for (const [index, dictionary] of objectDictionaries.entries()) {
    objectOffsets.push(Buffer.byteLength(pdfContent));
    pdfContent += `${index + 1} 0 obj\n${dictionary}\nendobj\n`;
  }

  const xrefOffset = Buffer.byteLength(pdfContent);
  const entryCount = objectDictionaries.length + 1; // +1 for the free-list head (object 0)
  const formatOffset = (offset: number): string => String(offset).padStart(10, '0');

  pdfContent += `xref\n0 ${entryCount}\n`;
  pdfContent += `${formatOffset(0)} 65535 f \n`;
  for (const offset of objectOffsets) {
    pdfContent += `${formatOffset(offset)} 00000 n \n`;
  }
  pdfContent += `trailer\n<< /Size ${entryCount} /Root 1 0 R >>\nstartxref\n${xrefOffset}\n%%EOF`;

  return new Uint8Array(Buffer.from(pdfContent));
}
|
|
|
|
// Kick off execution of the tests registered above.
tap.start();