import { tap, expect } from '@git.zone/tstest/tapbundle';
import * as plugins from '../../../ts/plugins.js';
import { EInvoice } from '../../../ts/index.js';
import { CorpusLoader } from '../../helpers/corpus.loader.js';

// PDF-09: Corrupted PDF Recovery
// Tests recovery mechanisms for corrupted, malformed, or partially damaged PDF files
// including graceful error handling and data recovery strategies

/**
 * Returns a printable message for anything that may have been thrown.
 *
 * @param thrown - value caught by a `catch` clause
 * @returns `thrown.message` for Error instances, otherwise `String(thrown)`
 */
function errorMessage(thrown: unknown): string {
  return thrown instanceof Error ? thrown.message : String(thrown);
}

/**
 * Builds the path of a damaged fixture inside `<cwd>/.nogit`.
 *
 * @param prefix - test family, used as the file name prefix
 * @param label - human readable scenario name; lower-cased, whitespace runs become `-`
 * @returns absolute path ending in `<prefix>-<slug>.pdf`
 */
function scratchPdfPath(prefix: string, label: string): string {
  const slug = label.toLowerCase().replace(/\s+/g, '-');
  return plugins.path.join(process.cwd(), '.nogit', `${prefix}-${slug}.pdf`);
}

/**
 * Writes a damaged fixture, creating its directory when needed.
 *
 * Strings are written with the 'binary' encoding so that every character maps
 * to exactly one byte; for Buffer content the encoding argument is ignored.
 *
 * @param filePath - destination, normally produced by scratchPdfPath()
 * @param content - raw bytes or a binary string
 */
async function writeScratchPdf(filePath: string, content: string | Buffer): Promise<void> {
  await plugins.fs.mkdir(plugins.path.dirname(filePath), { recursive: true });
  await plugins.fs.writeFile(filePath, content, 'binary');
}

/**
 * Best-effort removal of a fixture. Never throws, so it is safe in `finally`.
 *
 * @param filePath - file to delete
 */
async function removeScratchFile(filePath: string): Promise<void> {
  try {
    await plugins.fs.unlink(filePath);
  } catch (cleanupError) {
    // A fixture that was never written is fine; anything else deserves a log line.
    if (cleanupError?.code !== 'ENOENT') {
      console.log(`⚠ Could not remove ${filePath}: ${errorMessage(cleanupError)}`);
    }
  }
}

// Extraction from prefixes of a valid corpus PDF: either outcome is accepted,
// a thrown error only has to carry a message.
tap.test('PDF-09: Corrupted PDF Recovery - Truncated PDF Files', async (tools) => {
  const startTime = Date.now();

  try {
    // Get a working PDF from corpus to create corrupted versions
    const validPdfs = await CorpusLoader.getFiles('ZUGFERD_V1_CORRECT');

    if (validPdfs.length === 0) {
      console.log('⚠ No valid PDF files found for corruption testing');
      return;
    }

    const basePdf = validPdfs[0];
    const basePdfName = plugins.path.basename(basePdf);

    console.log(`Creating corrupted versions of: ${basePdfName}`);

    // Read the original PDF
    const originalPdfBuffer = await plugins.fs.readFile(basePdf);
    const originalSize = originalPdfBuffer.length;

    console.log(`Original PDF size: ${(originalSize / 1024).toFixed(1)}KB`);

    // Test different levels of truncation.
    // `percentage` is the fraction of leading bytes that is KEPT (see subarray below).
    const truncationTests = [
      { name: '90% Truncated', percentage: 0.9 },
      { name: '75% Truncated', percentage: 0.75 },
      { name: '50% Truncated', percentage: 0.5 },
      { name: '25% Truncated', percentage: 0.25 },
      { name: '10% Truncated', percentage: 0.1 }
    ];

    for (const truncationTest of truncationTests) {
      const truncatedSize = Math.floor(originalSize * truncationTest.percentage);
      const truncatedBuffer = originalPdfBuffer.subarray(0, truncatedSize);
      const truncatedPath = scratchPdfPath('truncated', truncationTest.name);

      try {
        await writeScratchPdf(truncatedPath, truncatedBuffer);

        console.log(`Testing ${truncationTest.name} (${(truncatedSize / 1024).toFixed(1)}KB)...`);

        try {
          const invoice = new EInvoice();
          const extractionResult = await invoice.fromFile(truncatedPath);

          if (extractionResult) {
            console.log(` ✓ Unexpected success - managed to extract from ${truncationTest.name}`);

            // Verify extracted content
            const xmlContent = await invoice.toXmlString('ubl');
            if (xmlContent && xmlContent.length > 50) {
              console.log(` Extracted XML length: ${xmlContent.length} chars`);
            }
          } else {
            console.log(` ✓ Expected failure - no extraction from ${truncationTest.name}`);
          }
        } catch (extractionError) {
          // Expected for corrupted files
          const message = errorMessage(extractionError);
          console.log(` ✓ Expected error for ${truncationTest.name}: ${message.substring(0, 100)}...`);
          expect(message).toBeTruthy();
        }
      } finally {
        // Clean up
        await removeScratchFile(truncatedPath);
      }
    }
  } catch (error) {
    console.log(`Truncated PDF test failed: ${errorMessage(error)}`);
    throw error;
  }

  const duration = Date.now() - startTime;
  console.log(`Test completed in ${duration}ms`);
});

// Extraction from tiny hand-written files whose header is wrong or missing.
tap.test('PDF-09: Corrupted PDF Recovery - Header Corruption', async (tools) => {
  const startTime = Date.now();

  // Test various PDF header corruption scenarios
  const headerCorruptionTests = [
    {
      name: 'Invalid PDF Header',
      content: '%NOT-A-PDF-1.4\n%âãÏÓ\n',
      expectedError: true
    },
    {
      name: 'Missing PDF Version',
      content: '%PDF-\n%âãÏÓ\n',
      expectedError: true
    },
    {
      name: 'Corrupted Binary Marker',
      content: '%PDF-1.4\n%CORRUPTED\n',
      expectedError: true
    },
    {
      name: 'Empty PDF File',
      content: '',
      expectedError: true
    },
    {
      name: 'Only Header Line',
      content: '%PDF-1.4\n',
      expectedError: true
    },
    {
      name: 'Wrong File Extension Content',
      content: 'This is actually a text file, not a PDF',
      expectedError: true
    }
  ];

  for (const headerTest of headerCorruptionTests) {
    console.log(`Testing ${headerTest.name}...`);

    const corruptedPath = scratchPdfPath('header', headerTest.name);

    try {
      // Create corrupted file
      await writeScratchPdf(corruptedPath, headerTest.content);

      const invoice = new EInvoice();
      const extractionResult = await invoice.fromFile(corruptedPath);

      if (headerTest.expectedError) {
        if (extractionResult) {
          console.log(` ⚠ Expected error for ${headerTest.name} but extraction succeeded`);
        } else {
          console.log(` ✓ Expected failure - no extraction from ${headerTest.name}`);
        }
      } else {
        console.log(` ✓ ${headerTest.name}: Extraction succeeded as expected`);
      }
    } catch (extractionError) {
      const message = errorMessage(extractionError);

      if (headerTest.expectedError) {
        console.log(` ✓ Expected error for ${headerTest.name}: ${message.substring(0, 80)}...`);
        expect(message).toBeTruthy();
      } else {
        console.log(` ✗ Unexpected error for ${headerTest.name}: ${message}`);
        throw extractionError;
      }
    } finally {
      // Clean up
      await removeScratchFile(corruptedPath);
    }
  }

  const duration = Date.now() - startTime;
  console.log(`Test completed in ${duration}ms`);
});

// Extraction from copies of a valid corpus PDF with randomly overwritten bytes.
// Uses Math.random(), so the damaged positions differ between runs.
tap.test('PDF-09: Corrupted PDF Recovery - Random Byte Corruption', async (tools) => {
  const startTime = Date.now();

  try {
    const validPdfs = await CorpusLoader.getFiles('ZUGFERD_V1_CORRECT');

    if (validPdfs.length === 0) {
      console.log('⚠ No valid PDF files found for random corruption testing');
      return;
    }

    const basePdf = validPdfs[0];
    const originalBuffer = await plugins.fs.readFile(basePdf);

    console.log(`Testing random byte corruption with: ${plugins.path.basename(basePdf)}`);

    // Test different levels of random corruption
    const corruptionLevels = [
      { name: 'Light Corruption (0.1%)', percentage: 0.001 },
      { name: 'Medium Corruption (1%)', percentage: 0.01 },
      { name: 'Heavy Corruption (5%)', percentage: 0.05 },
      { name: 'Severe Corruption (10%)', percentage: 0.1 }
    ];

    for (const corruptionLevel of corruptionLevels) {
      console.log(`Testing ${corruptionLevel.name}...`);

      // Create corrupted version (Buffer.from copies, the corpus bytes stay intact)
      const corruptedBuffer = Buffer.from(originalBuffer);
      const bytesToCorrupt = Math.floor(corruptedBuffer.length * corruptionLevel.percentage);

      for (let i = 0; i < bytesToCorrupt; i++) {
        const randomIndex = Math.floor(Math.random() * corruptedBuffer.length);
        const randomByte = Math.floor(Math.random() * 256);
        corruptedBuffer[randomIndex] = randomByte;
      }

      const corruptedPath = scratchPdfPath('random', corruptionLevel.name);

      try {
        await writeScratchPdf(corruptedPath, corruptedBuffer);

        try {
          const invoice = new EInvoice();
          const extractionResult = await invoice.fromFile(corruptedPath);

          if (extractionResult) {
            console.log(` ✓ Resilient recovery from ${corruptionLevel.name}`);

            // Verify extracted content quality
            const xmlContent = await invoice.toXmlString('ubl');
            if (xmlContent && xmlContent.length > 100) {
              console.log(` Extracted ${xmlContent.length} chars of XML`);

              // NOTE(review): from here to the end of this test the text had been
              // lost to markup stripping and was reconstructed from the pattern of
              // the sibling tests - confirm the checks and log wording against
              // version control.
              // Simple XML validation
              if (xmlContent.includes('<?xml') && xmlContent.includes('</')) {
                console.log(` ✓ Extracted XML appears well-formed`);
              } else {
                console.log(` ⚠ Extracted XML may be malformed`);
              }
            }
          } else {
            console.log(` ⚠ No extraction possible from ${corruptionLevel.name}`);
          }
        } catch (extractionError) {
          const message = errorMessage(extractionError);
          console.log(` ⚠ Extraction failed for ${corruptionLevel.name}: ${message.substring(0, 80)}...`);
          expect(message).toBeTruthy();
        }
      } finally {
        // Clean up
        await removeScratchFile(corruptedPath);
      }
    }
  } catch (error) {
    console.log(`Random corruption test failed: ${errorMessage(error)}`);
    throw error;
  }

  const duration = Date.now() - startTime;
  console.log(`Test completed in ${duration}ms`);
});

// Extraction from copies of a valid corpus PDF with one structural keyword
// family replaced by junk text.
// NOTE(review): the title of this test was lost to markup stripping and is
// reconstructed from its log messages - confirm against version control.
tap.test('PDF-09: Corrupted PDF Recovery - Structural Damage', async (tools) => {
  const startTime = Date.now();

  try {
    const validPdfs = await CorpusLoader.getFiles('ZUGFERD_V1_CORRECT');

    if (validPdfs.length === 0) {
      console.log('⚠ No valid PDF files found for structural damage testing');
      return;
    }

    const basePdf = validPdfs[0];
    // Read as a binary string so the regex replacements below can run on it
    const originalContent = await plugins.fs.readFile(basePdf, 'binary');

    console.log(`Testing structural damage with: ${plugins.path.basename(basePdf)}`);

    // Test different types of structural damage
    const structuralDamageTests = [
      {
        name: 'Missing xref table',
        damage: (content: string) => content.replace(/xref\s*\n[\s\S]*?trailer/g, 'damaged-xref')
      },
      {
        name: 'Corrupted trailer',
        damage: (content: string) => content.replace(/trailer\s*<<[\s\S]*?>>/g, 'damaged-trailer')
      },
      {
        name: 'Missing startxref',
        damage: (content: string) => content.replace(/startxref\s*\d+/g, 'damaged-startxref')
      },
      {
        name: 'Corrupted PDF objects',
        damage: (content: string) => content.replace(/\d+\s+\d+\s+obj/g, 'XX XX damaged')
      },
      {
        name: 'Missing EOF marker',
        damage: (content: string) => content.replace(/%%EOF\s*$/, 'CORRUPTED')
      }
    ];

    for (const damageTest of structuralDamageTests) {
      console.log(`Testing ${damageTest.name}...`);

      const damagedPath = scratchPdfPath('structural', damageTest.name);

      try {
        const damagedContent = damageTest.damage(originalContent);
        await writeScratchPdf(damagedPath, damagedContent);

        const invoice = new EInvoice();
        const extractionResult = await invoice.fromFile(damagedPath);

        if (extractionResult) {
          console.log(` ✓ Recovered from ${damageTest.name}`);

          // Test extracted content
          const xmlContent = await invoice.toXmlString('ubl');
          if (xmlContent && xmlContent.length > 50) {
            console.log(` Recovered XML content: ${xmlContent.length} chars`);
          }
        } else {
          console.log(` ⚠ No recovery possible from ${damageTest.name}`);
        }
      } catch (extractionError) {
        const message = errorMessage(extractionError);
        console.log(` ⚠ ${damageTest.name} extraction failed: ${message.substring(0, 80)}...`);
        expect(message).toBeTruthy();
      } finally {
        // Clean up in `finally`: a throwing extraction is the common case here
        // and must not leave the damaged fixture behind.
        await removeScratchFile(damagedPath);
      }
    }
  } catch (error) {
    console.log(`Structural damage test failed: ${errorMessage(error)}`);
    throw error;
  }

  const duration = Date.now() - startTime;
  console.log(`Test completed in ${duration}ms`);
});

// Integrity checks on the XML extracted from a corpus PDF.
// NOTE(review): every scenario below calls fromFile() on the undamaged base
// PDF - the scenario name and description are only logged, no attachment is
// actually corrupted. Confirm whether real corruption was intended.
tap.test('PDF-09: Corrupted PDF Recovery - Attachment Corruption', async (tools) => {
  const startTime = Date.now();

  // Test scenarios where the XML attachment itself is corrupted
  try {
    const validPdfs = await CorpusLoader.getFiles('ZUGFERD_V1_CORRECT');

    if (validPdfs.length === 0) {
      console.log('⚠ No valid PDF files found for attachment corruption testing');
      return;
    }

    const basePdf = validPdfs[0];

    console.log(`Testing attachment corruption scenarios with: ${plugins.path.basename(basePdf)}`);

    // First, try to extract XML from the original file to understand the structure
    let originalXml = null;
    try {
      const originalInvoice = new EInvoice();
      const originalResult = await originalInvoice.fromFile(basePdf);

      if (originalResult) {
        originalXml = await originalInvoice.toXmlString('ubl');
        console.log(`Original XML length: ${originalXml.length} chars`);
      }
    } catch (originalError) {
      console.log(`Could not extract original XML: ${errorMessage(originalError)}`);
    }

    // Test various attachment corruption scenarios
    const attachmentTests = [
      {
        name: 'Partial XML Loss',
        description: 'Simulate partial loss of XML attachment data'
      },
      {
        name: 'Encoding Corruption',
        description: 'Simulate character encoding corruption'
      },
      {
        name: 'Compression Corruption',
        description: 'Simulate corruption in compressed attachment streams'
      },
      {
        name: 'Multiple Attachments',
        description: 'Test handling when PDF contains multiple/conflicting XML attachments'
      }
    ];

    for (const attachmentTest of attachmentTests) {
      console.log(`Testing ${attachmentTest.name}: ${attachmentTest.description}`);

      try {
        const invoice = new EInvoice();

        // Attempt extraction with error handling
        const extractionResult = await invoice.fromFile(basePdf);

        if (extractionResult) {
          // If we got any result, test the robustness of the extraction
          const extractedXml = await invoice.toXmlString('ubl');

          if (extractedXml) {
            // Test XML integrity. All four values are booleans so that the
            // every(check => check === true) verdict below can succeed.
            const integrityChecks = {
              hasXmlDeclaration: extractedXml.startsWith('<?xml'),
              hasRootElement: extractedXml.includes('<') && extractedXml.includes('>'),
              hasClosingTags: extractedXml.includes('</'),
              isBalanced: (extractedXml.match(/</g) || []).length === (extractedXml.match(/>/g) || []).length
            };

            console.log(` XML Integrity Checks:`);
            console.log(` Has XML Declaration: ${integrityChecks.hasXmlDeclaration}`);
            console.log(` Has Root Element: ${integrityChecks.hasRootElement}`);
            console.log(` Has Closing Tags: ${integrityChecks.hasClosingTags}`);
            console.log(` Tags Balanced: ${integrityChecks.isBalanced}`);

            if (Object.values(integrityChecks).every(check => check === true)) {
              console.log(` ✓ ${attachmentTest.name}: XML integrity maintained`);
            } else {
              console.log(` ⚠ ${attachmentTest.name}: XML integrity issues detected`);
            }
          }
        } else {
          console.log(` ⚠ ${attachmentTest.name}: No XML extracted`);
        }
      } catch (extractionError) {
        const message = errorMessage(extractionError);
        console.log(` ⚠ ${attachmentTest.name} extraction failed: ${message.substring(0, 80)}...`);

        // Verify error contains useful information
        expect(message).toBeTruthy();

        // Check if error suggests recovery options
        const lowerCased = message.toLowerCase();
        if (lowerCased.includes('corrupt') ||
            lowerCased.includes('malformed') ||
            lowerCased.includes('damaged')) {
          console.log(` ✓ Error message indicates corruption: helpful for debugging`);
        }
      }
    }
  } catch (error) {
    console.log(`Attachment corruption test failed: ${errorMessage(error)}`);
    throw error;
  }

  const duration = Date.now() - startTime;
  console.log(`Test completed in ${duration}ms`);
});

// Inspects the message of the error thrown for files that are not PDFs.
// Only "message longer than 20 characters" is asserted; the rest is logged.
tap.test('PDF-09: Corrupted PDF Recovery - Error Reporting Quality', async (tools) => {
  const startTime = Date.now();

  // Test quality of error reporting for corrupted PDFs
  const errorReportingTests = [
    {
      name: 'Completely Invalid File',
      content: 'This is definitely not a PDF file at all',
      expectedErrorTypes: ['format', 'invalid', 'not-pdf']
    },
    {
      name: 'Binary Garbage',
      content: Buffer.from([0x00, 0xFF, 0xAB, 0xCD, 0xEF, 0x12, 0x34, 0x56]),
      expectedErrorTypes: ['binary', 'corrupt', 'invalid']
    },
    {
      name: 'Partial PDF Header',
      content: '%PDF-1.4\n%âãÏÓ\n1 0 obj\n<< >>\nendobj\n',
      expectedErrorTypes: ['incomplete', 'truncated', 'structure']
    }
  ];

  for (const errorTest of errorReportingTests) {
    console.log(`Testing error reporting for: ${errorTest.name}`);

    const corruptedPath = scratchPdfPath('error', errorTest.name);

    try {
      // Create corrupted file (string or Buffer, see writeScratchPdf)
      await writeScratchPdf(corruptedPath, errorTest.content);

      const invoice = new EInvoice();

      try {
        await invoice.fromFile(corruptedPath);
        console.log(` ⚠ Expected error for ${errorTest.name} but operation succeeded`);
      } catch (extractionError) {
        const message = errorMessage(extractionError);

        console.log(` ✓ Error caught for ${errorTest.name}`);
        console.log(` Error message: ${message}`);

        // Analyze error message quality
        const lowerCased = message.toLowerCase();
        const messageQuality = {
          isDescriptive: message.length > 20,
          containsFileInfo: lowerCased.includes('pdf') || lowerCased.includes('file'),
          containsErrorType: errorTest.expectedErrorTypes.some(type => lowerCased.includes(type)),
          isActionable: lowerCased.includes('check') ||
                        lowerCased.includes('verify') ||
                        lowerCased.includes('ensure') ||
                        lowerCased.includes('corrupt')
        };

        console.log(` Message Quality Analysis:`);
        console.log(` Descriptive (>20 chars): ${messageQuality.isDescriptive}`);
        console.log(` Contains file info: ${messageQuality.containsFileInfo}`);
        console.log(` Contains error type: ${messageQuality.containsErrorType}`);
        console.log(` Is actionable: ${messageQuality.isActionable}`);

        // Error message should be helpful
        expect(messageQuality.isDescriptive).toBeTrue();

        if (messageQuality.containsFileInfo && messageQuality.isActionable) {
          console.log(` ✓ High quality error message`);
        } else {
          console.log(` ⚠ Error message could be more helpful`);
        }

        // Check error object properties
        if (extractionError?.code) {
          console.log(` Error code: ${extractionError.code}`);
        }

        if (extractionError?.path) {
          console.log(` Error path: ${extractionError.path}`);
        }
      }
    } finally {
      // Clean up
      await removeScratchFile(corruptedPath);
    }
  }

  const duration = Date.now() - startTime;
  console.log(`Test completed in ${duration}ms`);
});

// Prints a closing note; performs no checks.
tap.test('PDF-09: Test Summary', async (tools) => {
  console.log(`\n=== Corrupted PDF Recovery Test Summary ===`);
  console.log(`\nCorrupted PDF recovery testing completed.`);
  console.log(`Note: Most corruption tests expect failures - this is normal and indicates proper error handling.`);
});

tap.start();