import { tap, expect } from '@git.zone/tstest/tapbundle';
import { EInvoice, EInvoicePDFError } from '../ts/index.js';
import { InvoiceFormat } from '../ts/interfaces/common.js';
import { TestFileHelpers, TestFileCategories, PerformanceUtils, TestInvoiceFactory } from './helpers/utils.js';
import * as path from 'path';
import { promises as fs } from 'fs';

/**
 * Comprehensive PDF operations test suite.
 *
 * Covers XML extraction from corpus PDFs, embedding XML into a PDF, error
 * handling for invalid input, and a few coarse performance checks.
 */

/** Input type accepted by `EInvoice.fromPdf`, derived so helpers stay in sync with it. */
type PdfInput = Parameters<typeof EInvoice.fromPdf>[0];

/**
 * Returns a printable message for a caught value.
 *
 * Catch variables are `unknown` under strict compiler settings, so `.message`
 * must not be read before narrowing.
 */
function errorMessage(error: unknown): string {
  return error instanceof Error ? error.message : String(error);
}

/**
 * Runs `EInvoice.fromPdf` on the given data and returns whatever it throws.
 *
 * @returns the thrown value, or `undefined` when extraction did not throw
 */
async function captureExtractionError(pdfData: PdfInput): Promise<unknown> {
  try {
    await EInvoice.fromPdf(pdfData);
  } catch (error) {
    return error;
  }
  return undefined;
}

// Test PDF extraction from ZUGFeRD v1 files
tap.test('PDF Operations - Extract XML from ZUGFeRD v1 PDFs', async () => {
  // Use CorpusLoader for recursive loading
  const { CorpusLoader } = await import('./helpers/corpus.loader.js');
  const corpusFiles = await CorpusLoader.loadCategory('ZUGFERD_V1_CORRECT');
  const pdfFiles = corpusFiles.filter(file => file.path.endsWith('.pdf'));

  console.log(`Testing XML extraction from ${pdfFiles.length} ZUGFeRD v1 PDFs`);

  // Skip test if no PDF files are available
  if (pdfFiles.length === 0) {
    console.log('No ZUGFeRD v1 PDF files found in corpus - skipping test');
    return;
  }

  let successCount = 0;
  let failCount = 0;
  const extractionTimes: number[] = [];

  // Test first 5 for speed
  for (const corpusFile of pdfFiles.slice(0, 5)) {
    const fileName = path.basename(corpusFile.path);

    try {
      const pdfBuffer = await CorpusLoader.loadFile(corpusFile.path);

      const { result: einvoice, duration } = await PerformanceUtils.measure(
        'pdf-extraction-v1',
        async () => EInvoice.fromPdf(pdfBuffer)
      );

      extractionTimes.push(duration);

      // Verify extraction succeeded
      expect(einvoice).toBeTruthy();
      expect(einvoice.getXml()).toBeTruthy();
      expect(einvoice.getXml().length).toBeGreaterThan(100);

      // Check format detection
      const format = einvoice.getFormat();
      expect([InvoiceFormat.ZUGFERD, InvoiceFormat.FACTURX]).toContain(format);

      successCount++;
      console.log(`✓ ${fileName}: Extracted ${einvoice.getXml().length} bytes, format: ${format} (${duration.toFixed(2)}ms)`);

      // Verify basic invoice data
      expect(einvoice.id).toBeTruthy();
      expect(einvoice.from.name).toBeTruthy();
      expect(einvoice.to.name).toBeTruthy();
    } catch (error) {
      // Per-file failures (including failed expectations above) are counted
      // and logged; the test only requires at least one success overall.
      failCount++;

      if (error instanceof EInvoicePDFError) {
        console.log(`✗ ${fileName}: ${error.message}`);
        console.log(` Recovery suggestions: ${error.getRecoverySuggestions().join(', ')}`);
      } else {
        console.log(`✗ ${fileName}: ${errorMessage(error)}`);
      }
    }
  }

  console.log(`\nExtraction Summary: ${successCount} succeeded, ${failCount} failed`);

  if (extractionTimes.length > 0) {
    const avgTime = extractionTimes.reduce((a, b) => a + b) / extractionTimes.length;
    console.log(`Average extraction time: ${avgTime.toFixed(2)}ms`);
  }

  // The empty-corpus case returned early above, so files were tested here.
  expect(successCount).toBeGreaterThan(0);
});

// Test PDF extraction from ZUGFeRD v2/Factur-X files
tap.test('PDF Operations - Extract XML from ZUGFeRD v2/Factur-X PDFs', async () => {
  // Use CorpusLoader for recursive loading
  const { CorpusLoader } = await import('./helpers/corpus.loader.js');
  const corpusFiles = await CorpusLoader.loadCategory('ZUGFERD_V2_CORRECT');
  const pdfFiles = corpusFiles.filter(file => file.path.endsWith('.pdf'));

  console.log(`Testing XML extraction from ${pdfFiles.length} ZUGFeRD v2/Factur-X PDFs`);

  // Skip test if no PDF files are available
  if (pdfFiles.length === 0) {
    console.log('No ZUGFeRD v2/Factur-X PDF files found in corpus - skipping test');
    return;
  }

  // Number of processed files per profile name found in the file name
  const profileStats: Record<string, number> = {};

  // Test first 10
  for (const corpusFile of pdfFiles.slice(0, 10)) {
    const fileName = path.basename(corpusFile.path);

    try {
      const pdfBuffer = await CorpusLoader.loadFile(corpusFile.path);
      const einvoice = await EInvoice.fromPdf(pdfBuffer);

      // Extract profile from filename if present
      const profileMatch = fileName.match(/(BASIC|COMFORT|EXTENDED|MINIMUM|EN16931)/i);
      const profile = profileMatch ? profileMatch[1].toUpperCase() : 'UNKNOWN';
      profileStats[profile] = (profileStats[profile] || 0) + 1;

      console.log(`✓ ${fileName}: Profile ${profile}, Format ${einvoice.getFormat()}`);

      // Test that we can re-export the invoice
      const xml = await einvoice.exportXml('facturx');
      expect(xml).toBeTruthy();
      expect(xml).toInclude('CrossIndustryInvoice');
    } catch (error) {
      console.log(`✗ ${fileName}: ${errorMessage(error)}`);
    }
  }

  console.log('\nProfile distribution:', profileStats);
});

// Test PDF embedding (creating PDFs with XML)
tap.test('PDF Operations - Embed XML into PDF', async () => {
  // Create a test invoice
  const invoice = new EInvoice();
  Object.assign(invoice, TestInvoiceFactory.createComplexInvoice());

  // Generate XML
  const xml = await invoice.exportXml('facturx');
  expect(xml).toBeTruthy();
  console.log(`Generated XML: ${xml.length} bytes`);

  // Create a minimal PDF for testing
  const pdfBuffer = await createMinimalTestPDF();
  invoice.pdf = {
    name: 'test-invoice.pdf',
    id: 'test-pdf-001',
    metadata: {
      textExtraction: ''
    },
    buffer: pdfBuffer
  };

  // Test embedding
  try {
    const { result: resultPdf, duration } = await PerformanceUtils.measure(
      'pdf-embedding',
      async () => ({
        buffer: await invoice.embedInPdf(Buffer.from(pdfBuffer), 'facturx')
      })
    );

    expect(resultPdf).toBeTruthy();
    expect(resultPdf.buffer).toBeTruthy();
    expect(resultPdf.buffer.length).toBeGreaterThan(pdfBuffer.length);

    console.log(`✓ Successfully embedded XML into PDF (${duration.toFixed(2)}ms)`);
    console.log(` Original PDF: ${pdfBuffer.length} bytes`);
    console.log(` Result PDF: ${resultPdf.buffer.length} bytes`);
    console.log(` Size increase: ${resultPdf.buffer.length - pdfBuffer.length} bytes`);

    // Verify the embedded XML can be extracted
    const verification = await EInvoice.fromPdf(resultPdf.buffer);
    expect(verification.getXml()).toBeTruthy();
    expect(verification.getFormat()).toEqual(InvoiceFormat.FACTURX);
    console.log('✓ Verified: Embedded XML can be extracted successfully');
  } catch (error) {
    // Log extra diagnostics for PDF errors, then fail the test either way.
    if (error instanceof EInvoicePDFError) {
      console.log(`✗ Embedding failed: ${error.message}`);
      console.log(` Operation: ${error.operation}`);
      console.log(` Suggestions: ${error.getRecoverySuggestions().join(', ')}`);
    }
    throw error;
  }
});

// Test PDF extraction error handling
tap.test('PDF Operations - Error handling for invalid PDFs', async () => {
  // Test with empty buffer
  const emptyError = await captureExtractionError(Buffer.from(new Uint8Array(0)));
  expect(emptyError).toBeInstanceOf(EInvoicePDFError);
  if (emptyError instanceof EInvoicePDFError) {
    expect(emptyError.operation).toEqual('extract');
    console.log('✓ Empty PDF error handled correctly');
  }

  // Test with non-PDF data
  const textError = await captureExtractionError(Buffer.from('This is not a PDF file'));
  expect(textError).toBeInstanceOf(EInvoicePDFError);
  console.log('✓ Non-PDF data error handled correctly');

  // Test with corrupted PDF header
  const corruptError = await captureExtractionError(Buffer.from('%PDF-1.4\nCorrupted content'));
  expect(corruptError).toBeInstanceOf(EInvoicePDFError);
  console.log('✓ Corrupted PDF error handled correctly');
});

// Test failed PDF extractions from corpus
tap.test('PDF Operations - Handle PDFs without XML gracefully', async () => {
  // Use CorpusLoader for recursive loading
  const { CorpusLoader } = await import('./helpers/corpus.loader.js');
  const corpusFiles = await CorpusLoader.loadCategory('ZUGFERD_V1_FAIL');
  const failPdfs = corpusFiles.filter(file => file.path.endsWith('.pdf'));

  console.log(`Testing ${failPdfs.length} PDFs expected to fail`);

  // Skip test if no PDF files are available
  if (failPdfs.length === 0) {
    console.log('No failed ZUGFeRD v1 PDF files found in corpus - skipping test');
    return;
  }

  for (const corpusFile of failPdfs) {
    const fileName = path.basename(corpusFile.path);

    try {
      const pdfBuffer = await CorpusLoader.loadFile(corpusFile.path);
      await EInvoice.fromPdf(pdfBuffer);
      console.log(`○ ${fileName}: Unexpectedly succeeded (might have XML)`);
    } catch (error) {
      if (error instanceof EInvoicePDFError) {
        expect(error.operation).toEqual('extract');
        console.log(`✓ ${fileName}: Correctly failed - ${error.message}`);
      } else {
        console.log(`✗ ${fileName}: Wrong error type - ${errorMessage(error)}`);
      }
    }
  }
});

// Test PDF metadata preservation
tap.test('PDF Operations - Metadata preservation during embedding', async () => {
  // Use CorpusLoader for recursive loading
  const { CorpusLoader } = await import('./helpers/corpus.loader.js');
  const corpusFiles = await CorpusLoader.loadCategory('ZUGFERD_V2_CORRECT');
  const pdfFiles = corpusFiles.filter(file => file.path.endsWith('.pdf'));

  if (pdfFiles.length === 0) {
    console.log('No ZUGFeRD v2 PDF files found for metadata preservation test - skipping');
    return;
  }

  const originalPdfBuffer = await CorpusLoader.loadFile(pdfFiles[0].path);

  try {
    // Extract from original
    const originalInvoice = await EInvoice.fromPdf(originalPdfBuffer);

    // Re-embed with different format
    const reembeddedBuffer = await originalInvoice.embedInPdf(originalPdfBuffer, 'xrechnung');

    // Extract again
    const reextracted = await EInvoice.fromPdf(reembeddedBuffer);

    // Compare key fields
    expect(reextracted.from.name).toEqual(originalInvoice.from.name);
    expect(reextracted.to.name).toEqual(originalInvoice.to.name);
    expect(reextracted.items.length).toEqual(originalInvoice.items.length);

    console.log('✓ Metadata preserved through re-embedding cycle');
  } catch (error) {
    // Best-effort check: any failure in the cycle is reported as a skip.
    console.log(`○ Metadata preservation test skipped: ${errorMessage(error)}`);
  }
});

// Test PDF size constraints
tap.test('PDF Operations - Performance with large PDFs', async () => {
  const largePdfSize = 10 * 1024 * 1024; // 10MB
  const largePdfBuffer = Buffer.alloc(largePdfSize);

  // Create a simple PDF header
  const pdfHeader = Buffer.from('%PDF-1.4\n');
  pdfHeader.copy(largePdfBuffer);

  console.log(`Testing with ${(largePdfSize / 1024 / 1024).toFixed(1)}MB PDF`);

  const startTime = performance.now();

  try {
    await EInvoice.fromPdf(largePdfBuffer);
  } catch {
    // Expected to fail, we're testing performance
  }

  // Measured on both paths so the test always asserts something.
  const duration = performance.now() - startTime;
  console.log(`✓ Large PDF processed in ${duration.toFixed(2)}ms`);
  expect(duration).toBeLessThan(5000); // Should fail fast, not hang
});

// Test concurrent PDF operations
tap.test('PDF Operations - Concurrent processing', async () => {
  // Use CorpusLoader for recursive loading
  const { CorpusLoader } = await import('./helpers/corpus.loader.js');
  const corpusFiles = await CorpusLoader.loadCategory('ZUGFERD_V2_CORRECT');
  const pdfFiles = corpusFiles.filter(file => file.path.endsWith('.pdf'));
  const testFiles = pdfFiles.slice(0, 5);

  if (testFiles.length === 0) {
    console.log('No ZUGFeRD v2 PDF files found for concurrent processing test - skipping');
    return;
  }

  console.log(`Testing concurrent processing of ${testFiles.length} PDFs`);

  const startTime = performance.now();

  // Process all PDFs concurrently; each task reports its own outcome so one
  // failing file does not reject the whole batch.
  const promises = testFiles.map(async (corpusFile) => {
    try {
      const pdfBuffer = await CorpusLoader.loadFile(corpusFile.path);
      const einvoice = await EInvoice.fromPdf(pdfBuffer);
      return { success: true, format: einvoice.getFormat() };
    } catch (error) {
      return { success: false, error: errorMessage(error) };
    }
  });

  const results = await Promise.all(promises);
  const duration = performance.now() - startTime;

  const successCount = results.filter(r => r.success).length;
  console.log(`✓ Processed ${successCount}/${testFiles.length} PDFs concurrently in ${duration.toFixed(2)}ms`);
  console.log(` Average time per PDF: ${(duration / testFiles.length).toFixed(2)}ms`);
});

// Performance summary
tap.test('PDF Operations - Performance Summary', async () => {
  const stats = {
    extraction: PerformanceUtils.getStats('pdf-extraction-v1'),
    embedding: PerformanceUtils.getStats('pdf-embedding')
  };

  console.log('\nPDF Operations Performance Summary:');

  if (stats.extraction) {
    console.log('PDF Extraction (ZUGFeRD v1):');
    console.log(` Average: ${stats.extraction.avg.toFixed(2)}ms`);
    console.log(` Min/Max: ${stats.extraction.min.toFixed(2)}ms / ${stats.extraction.max.toFixed(2)}ms`);
  }

  if (stats.embedding) {
    console.log('PDF Embedding:');
    console.log(` Average: ${stats.embedding.avg.toFixed(2)}ms`);
  }

  // Performance assertions
  if (stats.extraction && stats.extraction.count > 3) {
    expect(stats.extraction.avg).toBeLessThan(1000); // Should extract in under 1 second on average
  }
});

/**
 * Helper function to create a minimal test PDF.
 *
 * Builds a one-page PDF 1.4 document (catalog, page tree, one empty page).
 * The parts are joined with explicit newlines because `%` starts a comment
 * that runs to the end of the line, so the header must sit on its own line.
 * Cross-reference offsets and the `startxref` value are computed from the
 * assembled text instead of being hard-coded.
 *
 * @returns the PDF bytes
 */
async function createMinimalTestPDF(): Promise<Uint8Array> {
  const header = '%PDF-1.4\n';

  const objects = [
    ['1 0 obj', '<<', '/Type /Catalog', '/Pages 2 0 R', '>>', 'endobj'],
    ['2 0 obj', '<<', '/Type /Pages', '/Kids [3 0 R]', '/Count 1', '>>', 'endobj'],
    ['3 0 obj', '<<', '/Type /Page', '/Parent 2 0 R', '/MediaBox [0 0 612 792]', '/Resources << >>', '>>', 'endobj']
  ].map(lines => `${lines.join('\n')}\n`);

  // All content is ASCII, so string length equals byte offset.
  let body = header;
  const objectOffsets: number[] = [];
  for (const object of objects) {
    objectOffsets.push(body.length);
    body += object;
  }
  const xrefOffset = body.length;

  // Each cross-reference entry is 20 bytes: 10-digit offset, 5-digit
  // generation, type flag, and a two-character end-of-line (space + LF).
  const entryCount = objects.length + 1;
  const xrefEntries = objectOffsets
    .map(offset => `${String(offset).padStart(10, '0')} 00000 n \n`)
    .join('');
  const xref = `xref\n0 ${entryCount}\n0000000000 65535 f \n${xrefEntries}`;

  const trailer = [
    'trailer',
    '<<',
    `/Size ${entryCount}`,
    '/Root 1 0 R',
    '>>',
    'startxref',
    String(xrefOffset),
    '%%EOF'
  ].join('\n');

  return new Uint8Array(Buffer.from(body + xref + trailer));
}

tap.start();