einvoice/test/suite/einvoice_pdf-operations/test.pdf-09.corrupted-pdf.ts
2025-05-28 10:15:48 +00:00

556 lines
20 KiB
TypeScript

import { tap, expect } from '@git.zone/tstest/tapbundle';
import * as plugins from '../../../ts/plugins.js';
import { EInvoice } from '../../../ts/index.js';
import { CorpusLoader } from '../../helpers/corpus.loader.js';
// PDF-09: Corrupted PDF Recovery
// Tests recovery mechanisms for corrupted, malformed, or partially damaged PDF files
// including graceful error handling and data recovery strategies
/**
 * PDF-09 (truncation): takes the first PDF of the ZUGFERD_V1_CORRECT corpus
 * category, writes progressively shorter prefixes of it to `.nogit/`, and
 * feeds each prefix to `EInvoice.fromFile`.
 *
 * Both outcomes are tolerated: a truthy result is logged as an unexpected
 * success, a falsy result or a thrown error is logged as the expected
 * failure. The only hard assertion is that a thrown error carries a message.
 * The test returns early (without failing) when the corpus has no files.
 */
tap.test('PDF-09: Corrupted PDF Recovery - Truncated PDF Files', async (tools) => {
  const startTime = Date.now();
  try {
    // Get a working PDF from corpus to create corrupted versions
    const validPdfs = await CorpusLoader.getFiles('ZUGFERD_V1_CORRECT');
    if (validPdfs.length === 0) {
      console.log('⚠ No valid PDF files found for corruption testing');
      return;
    }
    const basePdf = validPdfs[0];
    const basePdfName = plugins.path.basename(basePdf);
    console.log(`Creating corrupted versions of: ${basePdfName}`);

    // Read the original PDF
    const originalPdfBuffer = await plugins.fs.readFile(basePdf);
    const originalSize = originalPdfBuffer.length;
    console.log(`Original PDF size: ${(originalSize / 1024).toFixed(1)}KB`);

    // Test different levels of truncation.
    // NOTE: `percentage` is the fraction of the original bytes that is KEPT,
    // so '90% Truncated' is the mildest case and '10% Truncated' the harshest.
    const truncationTests = [
      { name: '90% Truncated', percentage: 0.9 },
      { name: '75% Truncated', percentage: 0.75 },
      { name: '50% Truncated', percentage: 0.5 },
      { name: '25% Truncated', percentage: 0.25 },
      { name: '10% Truncated', percentage: 0.1 }
    ];

    // The scratch directory is the same for every case, so create it once.
    const scratchDir = plugins.path.join(process.cwd(), '.nogit');
    await plugins.fs.mkdir(scratchDir, { recursive: true });

    for (const truncationTest of truncationTests) {
      const truncatedSize = Math.floor(originalSize * truncationTest.percentage);
      const truncatedBuffer = originalPdfBuffer.subarray(0, truncatedSize);
      const truncatedPath = plugins.path.join(
        scratchDir,
        `truncated-${truncationTest.name.toLowerCase().replace(/\s+/g, '-')}.pdf`
      );
      await plugins.fs.writeFile(truncatedPath, truncatedBuffer);
      console.log(`Testing ${truncationTest.name} (${(truncatedSize / 1024).toFixed(1)}KB)...`);
      try {
        const invoice = new EInvoice();
        const extractionResult = await invoice.fromFile(truncatedPath);
        if (extractionResult) {
          console.log(` ✓ Unexpected success - managed to extract from ${truncationTest.name}`);
          // Verify extracted content
          const xmlContent = await invoice.toXmlString('ubl');
          if (xmlContent && xmlContent.length > 50) {
            console.log(` Extracted XML length: ${xmlContent.length} chars`);
          }
        } else {
          console.log(` ✓ Expected failure - no extraction from ${truncationTest.name}`);
        }
      } catch (extractionError) {
        // Expected for corrupted files
        console.log(` ✓ Expected error for ${truncationTest.name}: ${extractionError.message.substring(0, 100)}...`);
        expect(extractionError.message).toBeTruthy();
      } finally {
        // Clean up in `finally` so the fixture is removed even when the
        // assertion in the catch branch above fails.
        await plugins.fs.unlink(truncatedPath);
      }
    }
  } catch (error) {
    console.log(`Truncated PDF test failed: ${error.message}`);
    throw error;
  }
  const duration = Date.now() - startTime;
  console.log(`Test completed in ${duration}ms`);
});
/**
 * PDF-09 (header corruption): writes small hand-crafted files with broken or
 * missing PDF headers into `.nogit/` and loads each through
 * `EInvoice.fromFile`.
 *
 * Every scenario is flagged `expectedError: true`. A thrown error must carry a
 * message; a non-throwing load is only logged (as a warning when the result is
 * truthy). The fixture is removed after each scenario on a best-effort basis.
 */
tap.test('PDF-09: Corrupted PDF Recovery - Header Corruption', async (tools) => {
  const testStartedAt = Date.now();

  // Header corruption scenarios: file content plus whether loading should fail.
  const scenarios = [
    { name: 'Invalid PDF Header', content: '%NOT-A-PDF-1.4\n%âãÏÓ\n', expectedError: true },
    { name: 'Missing PDF Version', content: '%PDF-\n%âãÏÓ\n', expectedError: true },
    { name: 'Corrupted Binary Marker', content: '%PDF-1.4\n%CORRUPTED\n', expectedError: true },
    { name: 'Empty PDF File', content: '', expectedError: true },
    { name: 'Only Header Line', content: '%PDF-1.4\n', expectedError: true },
    { name: 'Wrong File Extension Content', content: 'This is actually a text file, not a PDF', expectedError: true }
  ];

  for (const { name, content, expectedError } of scenarios) {
    console.log(`Testing ${name}...`);

    const fileSlug = name.toLowerCase().replace(/\s+/g, '-');
    const fixturePath = plugins.path.join(process.cwd(), '.nogit', `header-${fileSlug}.pdf`);
    await plugins.fs.mkdir(plugins.path.dirname(fixturePath), { recursive: true });

    try {
      // Materialise the corrupted file, then try to load it.
      await plugins.fs.writeFile(fixturePath, content, 'binary');
      const invoice = new EInvoice();
      const loaded = await invoice.fromFile(fixturePath);

      if (!expectedError) {
        console.log(`${name}: Extraction succeeded as expected`);
      } else if (loaded) {
        console.log(` ⚠ Expected error for ${name} but extraction succeeded`);
      } else {
        console.log(` ✓ Expected failure - no extraction from ${name}`);
      }
    } catch (failure) {
      if (!expectedError) {
        console.log(` ✗ Unexpected error for ${name}: ${failure.message}`);
        throw failure;
      }
      console.log(` ✓ Expected error for ${name}: ${failure.message.substring(0, 80)}...`);
      expect(failure.message).toBeTruthy();
    } finally {
      // Best-effort removal of the fixture.
      try {
        await plugins.fs.unlink(fixturePath);
      } catch (cleanupFailure) {
        // Cleanup problems are deliberately ignored.
      }
    }
  }

  const elapsedMs = Date.now() - testStartedAt;
  console.log(`Test completed in ${elapsedMs}ms`);
});
/**
 * PDF-09 (random byte corruption): overwrites a growing share of the bytes of
 * the first ZUGFERD_V1_CORRECT corpus PDF with pseudo-random values and loads
 * each damaged copy through `EInvoice.fromFile`.
 *
 * Corruption is driven by a fixed-seed PRNG so that every run damages exactly
 * the same byte positions; a failure seen once can therefore be reproduced.
 * A successful load is only logged. A thrown error must carry a message longer
 * than 10 characters. The test returns early (without failing) when the corpus
 * has no files.
 */
tap.test('PDF-09: Corrupted PDF Recovery - Random Byte Corruption', async (tools) => {
  const startTime = Date.now();

  // Fixed seed: change it to explore other corruption patterns.
  const corruptionSeed = 0x5eed09;
  let prngState = corruptionSeed | 0;
  // Small 32-bit generator (mulberry32) returning a float in [0, 1),
  // used as a reproducible stand-in for Math.random().
  const nextRandom = () => {
    prngState = (prngState + 0x6d2b79f5) | 0;
    let mixed = Math.imul(prngState ^ (prngState >>> 15), prngState | 1);
    mixed ^= mixed + Math.imul(mixed ^ (mixed >>> 7), mixed | 61);
    return ((mixed ^ (mixed >>> 14)) >>> 0) / 4294967296;
  };

  try {
    const validPdfs = await CorpusLoader.getFiles('ZUGFERD_V1_CORRECT');
    if (validPdfs.length === 0) {
      console.log('⚠ No valid PDF files found for random corruption testing');
      return;
    }
    const basePdf = validPdfs[0];
    const originalBuffer = await plugins.fs.readFile(basePdf);
    console.log(`Testing random byte corruption with: ${plugins.path.basename(basePdf)}`);
    console.log(`Using deterministic corruption seed: ${corruptionSeed}`);

    // Test different levels of random corruption
    const corruptionLevels = [
      { name: 'Light Corruption (0.1%)', percentage: 0.001 },
      { name: 'Medium Corruption (1%)', percentage: 0.01 },
      { name: 'Heavy Corruption (5%)', percentage: 0.05 },
      { name: 'Severe Corruption (10%)', percentage: 0.1 }
    ];

    // The scratch directory is the same for every level, so create it once.
    const scratchDir = plugins.path.join(process.cwd(), '.nogit');
    await plugins.fs.mkdir(scratchDir, { recursive: true });

    for (const corruptionLevel of corruptionLevels) {
      console.log(`Testing ${corruptionLevel.name}...`);

      // Create corrupted version (a copy, so the pristine buffer is reused).
      const corruptedBuffer = Buffer.from(originalBuffer);
      const bytesToCorrupt = Math.floor(corruptedBuffer.length * corruptionLevel.percentage);
      for (let i = 0; i < bytesToCorrupt; i++) {
        const randomIndex = Math.floor(nextRandom() * corruptedBuffer.length);
        const randomByte = Math.floor(nextRandom() * 256);
        corruptedBuffer[randomIndex] = randomByte;
      }

      const corruptedPath = plugins.path.join(
        scratchDir,
        `random-${corruptionLevel.name.toLowerCase().replace(/\s+/g, '-')}.pdf`
      );
      await plugins.fs.writeFile(corruptedPath, corruptedBuffer);

      try {
        const invoice = new EInvoice();
        const extractionResult = await invoice.fromFile(corruptedPath);
        if (extractionResult) {
          console.log(` ✓ Resilient recovery from ${corruptionLevel.name}`);
          // Verify extracted content quality
          const xmlContent = await invoice.toXmlString('ubl');
          if (xmlContent && xmlContent.length > 100) {
            console.log(` Extracted ${xmlContent.length} chars of XML`);
            // Simple heuristic only: looks for an XML declaration and at
            // least one closing tag; this is not a real well-formedness check.
            if (xmlContent.includes('<?xml') && xmlContent.includes('</')) {
              console.log(` ✓ Extracted XML appears well-formed`);
            }
          }
        } else {
          console.log(` ⚠ No extraction possible from ${corruptionLevel.name}`);
        }
      } catch (extractionError) {
        console.log(` ⚠ Extraction failed for ${corruptionLevel.name}: ${extractionError.message.substring(0, 80)}...`);
        // Check if error message is helpful
        expect(extractionError.message).toBeTruthy();
        expect(extractionError.message.length).toBeGreaterThan(10);
      } finally {
        // Clean up in `finally` so the fixture is removed even when one of
        // the assertions in the catch branch above fails.
        await plugins.fs.unlink(corruptedPath);
      }
    }
  } catch (error) {
    console.log(`Random corruption test failed: ${error.message}`);
    throw error;
  }
  const duration = Date.now() - startTime;
  console.log(`Test completed in ${duration}ms`);
});
/**
 * PDF-09 (structural damage): reads the first ZUGFERD_V1_CORRECT corpus PDF as
 * a 'binary'-encoded string, removes or mangles individual structural
 * elements (xref table, trailer, startxref, object headers, EOF marker) with
 * regex replacements, and loads each damaged copy through
 * `EInvoice.fromFile`.
 *
 * A successful load is only logged. A thrown error must carry a message. The
 * damaged fixture is always removed afterwards. The test returns early
 * (without failing) when the corpus has no files.
 */
tap.test('PDF-09: Corrupted PDF Recovery - Structural Damage', async (tools) => {
  const startTime = Date.now();
  try {
    const validPdfs = await CorpusLoader.getFiles('ZUGFERD_V1_CORRECT');
    if (validPdfs.length === 0) {
      console.log('⚠ No valid PDF files found for structural damage testing');
      return;
    }
    const basePdf = validPdfs[0];
    const originalContent = await plugins.fs.readFile(basePdf, 'binary');
    console.log(`Testing structural damage with: ${plugins.path.basename(basePdf)}`);

    // Test different types of structural damage
    const structuralDamageTests = [
      {
        name: 'Missing xref table',
        damage: (content: string) => content.replace(/xref\s*\n[\s\S]*?trailer/g, 'damaged-xref')
      },
      {
        name: 'Corrupted trailer',
        damage: (content: string) => content.replace(/trailer\s*<<[\s\S]*?>>/g, 'damaged-trailer')
      },
      {
        name: 'Missing startxref',
        damage: (content: string) => content.replace(/startxref\s*\d+/g, 'damaged-startxref')
      },
      {
        name: 'Corrupted PDF objects',
        damage: (content: string) => content.replace(/\d+\s+\d+\s+obj/g, 'XX XX damaged')
      },
      {
        name: 'Missing EOF marker',
        damage: (content: string) => content.replace(/%%EOF\s*$/, 'CORRUPTED')
      }
    ];

    // The scratch directory is the same for every case, so create it once.
    const scratchDir = plugins.path.join(process.cwd(), '.nogit');
    await plugins.fs.mkdir(scratchDir, { recursive: true });

    for (const damageTest of structuralDamageTests) {
      console.log(`Testing ${damageTest.name}...`);

      // Fixture creation happens outside the try block so that a setup
      // failure fails the test instead of being logged as "extraction failed".
      const damagedContent = damageTest.damage(originalContent);
      const damagedPath = plugins.path.join(
        scratchDir,
        `structural-${damageTest.name.toLowerCase().replace(/\s+/g, '-')}.pdf`
      );
      await plugins.fs.writeFile(damagedPath, damagedContent, 'binary');

      try {
        const invoice = new EInvoice();
        const extractionResult = await invoice.fromFile(damagedPath);
        if (extractionResult) {
          console.log(` ✓ Recovered from ${damageTest.name}`);
          // Test extracted content
          const xmlContent = await invoice.toXmlString('ubl');
          if (xmlContent && xmlContent.length > 50) {
            console.log(` Recovered XML content: ${xmlContent.length} chars`);
          }
        } else {
          console.log(` ⚠ No recovery possible from ${damageTest.name}`);
        }
      } catch (extractionError) {
        console.log(`${damageTest.name} extraction failed: ${extractionError.message.substring(0, 80)}...`);
        expect(extractionError.message).toBeTruthy();
      } finally {
        // Clean up in `finally`: a thrown extraction error is the expected
        // outcome here, and the fixture must be removed in that case too.
        await plugins.fs.unlink(damagedPath);
      }
    }
  } catch (error) {
    console.log(`Structural damage test failed: ${error.message}`);
    throw error;
  }
  const duration = Date.now() - startTime;
  console.log(`Test completed in ${duration}ms`);
});
/**
 * PDF-09 (attachment corruption): loads the first ZUGFERD_V1_CORRECT corpus
 * PDF once per named scenario and runs lightweight string-level integrity
 * checks on the UBL XML produced by `toXmlString('ubl')`.
 *
 * NOTE(review): the scenarios are descriptive labels only. No corrupted
 * attachment is generated here; every iteration loads the unmodified base PDF.
 * A thrown error must carry a message. The test returns early (without
 * failing) when the corpus has no files.
 */
tap.test('PDF-09: Corrupted PDF Recovery - Attachment Corruption', async (tools) => {
  const testStartedAt = Date.now();

  try {
    const corpusPdfs = await CorpusLoader.getFiles('ZUGFERD_V1_CORRECT');
    if (corpusPdfs.length === 0) {
      console.log('⚠ No valid PDF files found for attachment corruption testing');
      return;
    }

    const basePdf = corpusPdfs[0];
    console.log(`Testing attachment corruption scenarios with: ${plugins.path.basename(basePdf)}`);

    // Baseline: extract the XML from the pristine file and report its size.
    let baselineXml = null;
    try {
      const baselineInvoice = new EInvoice();
      if (await baselineInvoice.fromFile(basePdf)) {
        baselineXml = await baselineInvoice.toXmlString('ubl');
        console.log(`Original XML length: ${baselineXml.length} chars`);
      }
    } catch (baselineFailure) {
      console.log(`Could not extract original XML: ${baselineFailure.message}`);
    }

    // Named attachment corruption scenarios (label + human-readable description).
    const scenarios = [
      { name: 'Partial XML Loss', description: 'Simulate partial loss of XML attachment data' },
      { name: 'Encoding Corruption', description: 'Simulate character encoding corruption' },
      { name: 'Compression Corruption', description: 'Simulate corruption in compressed attachment streams' },
      { name: 'Multiple Attachments', description: 'Test handling when PDF contains multiple/conflicting XML attachments' }
    ];

    // Keywords that mark an error message as pointing at corruption.
    const corruptionKeywords = ['corrupt', 'malformed', 'damaged'];

    for (const { name, description } of scenarios) {
      console.log(`Testing ${name}: ${description}`);

      try {
        const invoice = new EInvoice();
        const loaded = await invoice.fromFile(basePdf);
        if (!loaded) {
          console.log(`${name}: No XML extracted`);
          continue;
        }

        const xml = await invoice.toXmlString('ubl');
        if (!xml) {
          continue;
        }

        // String-level integrity heuristics; "balanced" only compares the
        // number of '<' characters with the number of '>' characters.
        const openingBrackets = (xml.match(/</g) || []).length;
        const closingBrackets = (xml.match(/>/g) || []).length;
        const integrity = {
          hasXmlDeclaration: xml.startsWith('<?xml'),
          hasRootElement: xml.includes('<') && xml.includes('>'),
          hasClosingTags: xml.includes('</'),
          isBalanced: openingBrackets === closingBrackets
        };

        console.log(` XML Integrity Checks:`);
        console.log(` Has XML Declaration: ${integrity.hasXmlDeclaration}`);
        console.log(` Has Root Element: ${integrity.hasRootElement}`);
        console.log(` Has Closing Tags: ${integrity.hasClosingTags}`);
        console.log(` Tags Balanced: ${integrity.isBalanced}`);

        const allChecksPassed = !Object.values(integrity).includes(false);
        console.log(
          allChecksPassed
            ? `${name}: XML integrity maintained`
            : `${name}: XML integrity issues detected`
        );
      } catch (failure) {
        console.log(`${name} extraction failed: ${failure.message.substring(0, 80)}...`);
        // The error has to say something.
        expect(failure.message).toBeTruthy();

        // Report when the message itself points at corruption.
        const loweredMessage = failure.message.toLowerCase();
        if (corruptionKeywords.some((keyword) => loweredMessage.includes(keyword))) {
          console.log(` ✓ Error message indicates corruption: helpful for debugging`);
        }
      }
    }
  } catch (error) {
    console.log(`Attachment corruption test failed: ${error.message}`);
    throw error;
  }

  const elapsedMs = Date.now() - testStartedAt;
  console.log(`Test completed in ${elapsedMs}ms`);
});
/**
 * PDF-09 (error reporting): writes clearly invalid files into `.nogit/`, loads
 * each through `EInvoice.fromFile`, and grades the message of the error that
 * is thrown.
 *
 * The only hard assertion is that the message is longer than 20 characters;
 * the remaining quality criteria and the optional `code` / `path` error
 * properties are logged for information. A load that does not throw is logged
 * as a warning. The fixture is removed after each case on a best-effort basis.
 */
tap.test('PDF-09: Corrupted PDF Recovery - Error Reporting Quality', async (tools) => {
  const testStartedAt = Date.now();

  // Invalid inputs together with keywords a good error message might contain.
  const cases = [
    {
      name: 'Completely Invalid File',
      content: 'This is definitely not a PDF file at all',
      expectedErrorTypes: ['format', 'invalid', 'not-pdf']
    },
    {
      name: 'Binary Garbage',
      content: Buffer.from([0x00, 0xFF, 0xAB, 0xCD, 0xEF, 0x12, 0x34, 0x56]),
      expectedErrorTypes: ['binary', 'corrupt', 'invalid']
    },
    {
      name: 'Partial PDF Header',
      content: '%PDF-1.4\n%âãÏÓ\n1 0 obj\n<< >>\nendobj\n',
      expectedErrorTypes: ['incomplete', 'truncated', 'structure']
    }
  ];

  // Words that make an error message count as actionable.
  const actionableKeywords = ['check', 'verify', 'ensure', 'corrupt'];

  for (const { name, content, expectedErrorTypes } of cases) {
    console.log(`Testing error reporting for: ${name}`);

    const fileSlug = name.toLowerCase().replace(/\s+/g, '-');
    const fixturePath = plugins.path.join(process.cwd(), '.nogit', `error-${fileSlug}.pdf`);
    await plugins.fs.mkdir(plugins.path.dirname(fixturePath), { recursive: true });

    try {
      // Buffers are written as-is; strings are written with 'binary' encoding.
      if (!Buffer.isBuffer(content)) {
        await plugins.fs.writeFile(fixturePath, content, 'binary');
      } else {
        await plugins.fs.writeFile(fixturePath, content);
      }

      const invoice = new EInvoice();
      try {
        await invoice.fromFile(fixturePath);
        console.log(` ⚠ Expected error for ${name} but operation succeeded`);
      } catch (failure) {
        console.log(` ✓ Error caught for ${name}`);
        console.log(` Error message: ${failure.message}`);

        // Grade the message against a handful of simple textual criteria.
        const loweredMessage = failure.message.toLowerCase();
        const mentions = (keyword) => loweredMessage.includes(keyword);
        const quality = {
          isDescriptive: failure.message.length > 20,
          containsFileInfo: mentions('pdf') || mentions('file'),
          containsErrorType: expectedErrorTypes.some(mentions),
          isActionable: actionableKeywords.some(mentions)
        };

        console.log(` Message Quality Analysis:`);
        console.log(` Descriptive (>20 chars): ${quality.isDescriptive}`);
        console.log(` Contains file info: ${quality.containsFileInfo}`);
        console.log(` Contains error type: ${quality.containsErrorType}`);
        console.log(` Is actionable: ${quality.isActionable}`);

        // A descriptive message is the one hard requirement.
        expect(quality.isDescriptive).toBeTrue();

        const isHighQuality = quality.containsFileInfo && quality.isActionable;
        console.log(
          isHighQuality
            ? ` ✓ High quality error message`
            : ` ⚠ Error message could be more helpful`
        );

        // Surface optional metadata attached to the error object.
        if (failure.code) {
          console.log(` Error code: ${failure.code}`);
        }
        if (failure.path) {
          console.log(` Error path: ${failure.path}`);
        }
      }
    } finally {
      // Best-effort removal of the fixture.
      try {
        await plugins.fs.unlink(fixturePath);
      } catch (cleanupFailure) {
        // Cleanup problems are deliberately ignored.
      }
    }
  }

  const elapsedMs = Date.now() - testStartedAt;
  console.log(`Test completed in ${elapsedMs}ms`);
});
/**
 * PDF-09 (summary): prints a closing banner for the suite. It performs no
 * assertions.
 */
tap.test('PDF-09: Test Summary', async (tools) => {
  const summaryLines = [
    `\n=== Corrupted PDF Recovery Test Summary ===`,
    `\nCorrupted PDF recovery testing completed.`,
    `Note: Most corruption tests expect failures - this is normal and indicates proper error handling.`
  ];
  for (const line of summaryLines) {
    console.log(line);
  }
});

// Presumably hands the tests registered above to the tap runner - confirm
// against the tapbundle documentation.
tap.start();