556 lines
20 KiB
TypeScript
556 lines
20 KiB
TypeScript
import { tap, expect } from '@git.zone/tstest/tapbundle';
|
|
import * as plugins from '../../../ts/plugins.js';
|
|
import { EInvoice } from '../../../ts/index.js';
|
|
import { CorpusLoader } from '../../helpers/corpus.loader.js';
|
|
|
|
// PDF-09: Corrupted PDF Recovery
// Tests recovery mechanisms for corrupted, malformed, or partially damaged PDF files,
// including graceful error handling and data recovery strategies.
|
|
|
|
// Writes progressively shorter prefixes of a known-good corpus PDF to disk and
// feeds each one to EInvoice. Either outcome (an extraction result or a thrown
// error carrying a message) is accepted; the test fails only on setup problems
// or when a thrown error has no message.
tap.test('PDF-09: Corrupted PDF Recovery - Truncated PDF Files', async (tools) => {
  const startTime = Date.now();

  try {
    // Get a working PDF from corpus to create corrupted versions
    const validPdfs = await CorpusLoader.getFiles('ZUGFERD_V1_CORRECT');

    if (validPdfs.length === 0) {
      console.log('⚠ No valid PDF files found for corruption testing');
      return;
    }

    const basePdf = validPdfs[0];
    const basePdfName = plugins.path.basename(basePdf);

    console.log(`Creating corrupted versions of: ${basePdfName}`);

    // Read the original PDF
    const originalPdfBuffer = await plugins.fs.readFile(basePdf);
    const originalSize = originalPdfBuffer.length;

    console.log(`Original PDF size: ${(originalSize / 1024).toFixed(1)}KB`);

    // `keepFraction` is the share of the original bytes that is KEPT, counted
    // from the start of the file (0.9 keeps the first 90% and drops the tail).
    const truncationTests = [
      { name: '90% Truncated', keepFraction: 0.9 },
      { name: '75% Truncated', keepFraction: 0.75 },
      { name: '50% Truncated', keepFraction: 0.5 },
      { name: '25% Truncated', keepFraction: 0.25 },
      { name: '10% Truncated', keepFraction: 0.1 }
    ];

    // The scratch directory is the same for every case, so create it once.
    const scratchDir = plugins.path.join(process.cwd(), '.nogit');
    await plugins.fs.mkdir(scratchDir, { recursive: true });

    for (const truncationTest of truncationTests) {
      const truncatedSize = Math.floor(originalSize * truncationTest.keepFraction);
      const truncatedBuffer = originalPdfBuffer.subarray(0, truncatedSize);

      const fileSlug = truncationTest.name.toLowerCase().replace(/\s+/g, '-');
      const truncatedPath = plugins.path.join(scratchDir, `truncated-${fileSlug}.pdf`);

      try {
        await plugins.fs.writeFile(truncatedPath, truncatedBuffer);

        console.log(`Testing ${truncationTest.name} (${(truncatedSize / 1024).toFixed(1)}KB)...`);

        try {
          const invoice = new EInvoice();
          const extractionResult = await invoice.fromFile(truncatedPath);

          if (extractionResult) {
            console.log(` ✓ Unexpected success - managed to extract from ${truncationTest.name}`);

            // Verify extracted content
            const xmlContent = await invoice.toXmlString('ubl');
            if (xmlContent && xmlContent.length > 50) {
              console.log(` Extracted XML length: ${xmlContent.length} chars`);
            }
          } else {
            console.log(` ✓ Expected failure - no extraction from ${truncationTest.name}`);
          }
        } catch (extractionError) {
          // Expected for corrupted files
          console.log(` ✓ Expected error for ${truncationTest.name}: ${extractionError.message.substring(0, 100)}...`);
          expect(extractionError.message).toBeTruthy();
        }
      } finally {
        // Clean up in `finally` so a failed assertion above cannot leave the
        // truncated file behind in .nogit.
        try {
          await plugins.fs.unlink(truncatedPath);
        } catch (cleanupError) {
          // Best-effort cleanup: a missing temp file must not mask the real test outcome.
        }
      }
    }
  } catch (error) {
    console.log(`Truncated PDF test failed: ${error.message}`);
    throw error;
  }

  const duration = Date.now() - startTime;
  console.log(`Test completed in ${duration}ms`);
});
|
|
|
|
// Feeds EInvoice a series of tiny files whose opening bytes are not a valid PDF
// header. Every scenario is flagged as expecting an error; a thrown error must
// carry a message, while a non-throwing result is only reported via the log.
tap.test('PDF-09: Corrupted PDF Recovery - Header Corruption', async (tools) => {
  const startedAt = Date.now();

  // Test various PDF header corruption scenarios
  const scenarios = [
    { name: 'Invalid PDF Header', content: '%NOT-A-PDF-1.4\n%âãÏÓ\n', expectedError: true },
    { name: 'Missing PDF Version', content: '%PDF-\n%âãÏÓ\n', expectedError: true },
    { name: 'Corrupted Binary Marker', content: '%PDF-1.4\n%CORRUPTED\n', expectedError: true },
    { name: 'Empty PDF File', content: '', expectedError: true },
    { name: 'Only Header Line', content: '%PDF-1.4\n', expectedError: true },
    { name: 'Wrong File Extension Content', content: 'This is actually a text file, not a PDF', expectedError: true }
  ];

  for (const { name, content, expectedError } of scenarios) {
    console.log(`Testing ${name}...`);

    const slug = name.toLowerCase().replace(/\s+/g, '-');
    const corruptedPath = plugins.path.join(process.cwd(), '.nogit', `header-${slug}.pdf`);
    await plugins.fs.mkdir(plugins.path.dirname(corruptedPath), { recursive: true });

    try {
      // Materialise the broken file, then try to load it as an invoice.
      await plugins.fs.writeFile(corruptedPath, content, 'binary');

      const invoice = new EInvoice();
      const extractionResult = await invoice.fromFile(corruptedPath);

      if (!expectedError) {
        console.log(` ✓ ${name}: Extraction succeeded as expected`);
      } else if (extractionResult) {
        console.log(` ⚠ Expected error for ${name} but extraction succeeded`);
      } else {
        console.log(` ✓ Expected failure - no extraction from ${name}`);
      }
    } catch (extractionError) {
      if (!expectedError) {
        console.log(` ✗ Unexpected error for ${name}: ${extractionError.message}`);
        throw extractionError;
      }

      console.log(` ✓ Expected error for ${name}: ${extractionError.message.substring(0, 80)}...`);
      expect(extractionError.message).toBeTruthy();
    } finally {
      // Remove the scratch file regardless of outcome.
      try {
        await plugins.fs.unlink(corruptedPath);
      } catch (cleanupError) {
        // Ignore cleanup errors
      }
    }
  }

  console.log(`Test completed in ${Date.now() - startedAt}ms`);
});
|
|
|
|
// Overwrites a growing share of randomly chosen bytes in a known-good corpus
// PDF and checks that EInvoice either still extracts something or throws an
// error whose message is non-empty and longer than 10 characters.
// NOTE(review): byte positions come from Math.random(), so which bytes get
// damaged differs from run to run.
tap.test('PDF-09: Corrupted PDF Recovery - Random Byte Corruption', async (tools) => {
  const startTime = Date.now();

  try {
    const validPdfs = await CorpusLoader.getFiles('ZUGFERD_V1_CORRECT');

    if (validPdfs.length === 0) {
      console.log('⚠ No valid PDF files found for random corruption testing');
      return;
    }

    const basePdf = validPdfs[0];
    const originalBuffer = await plugins.fs.readFile(basePdf);

    console.log(`Testing random byte corruption with: ${plugins.path.basename(basePdf)}`);

    // Test different levels of random corruption; `percentage` is the fraction
    // of the file length used as the number of byte overwrites.
    const corruptionLevels = [
      { name: 'Light Corruption (0.1%)', percentage: 0.001 },
      { name: 'Medium Corruption (1%)', percentage: 0.01 },
      { name: 'Heavy Corruption (5%)', percentage: 0.05 },
      { name: 'Severe Corruption (10%)', percentage: 0.1 }
    ];

    // The scratch directory is the same for every level, so create it once.
    const scratchDir = plugins.path.join(process.cwd(), '.nogit');
    await plugins.fs.mkdir(scratchDir, { recursive: true });

    for (const corruptionLevel of corruptionLevels) {
      console.log(`Testing ${corruptionLevel.name}...`);

      // Create corrupted version from a copy so the original buffer stays intact
      const corruptedBuffer = Buffer.from(originalBuffer);
      const bytesToCorrupt = Math.floor(corruptedBuffer.length * corruptionLevel.percentage);

      // Indices may repeat, so the number of distinct damaged bytes can be
      // lower than bytesToCorrupt.
      for (let i = 0; i < bytesToCorrupt; i++) {
        const randomIndex = Math.floor(Math.random() * corruptedBuffer.length);
        const randomByte = Math.floor(Math.random() * 256);
        corruptedBuffer[randomIndex] = randomByte;
      }

      const fileSlug = corruptionLevel.name.toLowerCase().replace(/\s+/g, '-');
      const corruptedPath = plugins.path.join(scratchDir, `random-${fileSlug}.pdf`);

      try {
        await plugins.fs.writeFile(corruptedPath, corruptedBuffer);

        try {
          const invoice = new EInvoice();
          const extractionResult = await invoice.fromFile(corruptedPath);

          if (extractionResult) {
            console.log(` ✓ Resilient recovery from ${corruptionLevel.name}`);

            // Verify extracted content quality
            const xmlContent = await invoice.toXmlString('ubl');
            if (xmlContent && xmlContent.length > 100) {
              console.log(` Extracted ${xmlContent.length} chars of XML`);

              // Heuristic only: looks for an XML declaration and at least one
              // closing tag; this does not parse the document.
              if (xmlContent.includes('<?xml') && xmlContent.includes('</')) {
                console.log(` ✓ Extracted XML appears well-formed`);
              }
            }
          } else {
            console.log(` ⚠ No extraction possible from ${corruptionLevel.name}`);
          }
        } catch (extractionError) {
          console.log(` ⚠ Extraction failed for ${corruptionLevel.name}: ${extractionError.message.substring(0, 80)}...`);

          // Check if error message is helpful
          expect(extractionError.message).toBeTruthy();
          expect(extractionError.message.length).toBeGreaterThan(10);
        }
      } finally {
        // Clean up in `finally` so a failed assertion above cannot leave the
        // corrupted file behind in .nogit.
        try {
          await plugins.fs.unlink(corruptedPath);
        } catch (cleanupError) {
          // Best-effort cleanup: a missing temp file must not mask the real test outcome.
        }
      }
    }
  } catch (error) {
    console.log(`Random corruption test failed: ${error.message}`);
    throw error;
  }

  const duration = Date.now() - startTime;
  console.log(`Test completed in ${duration}ms`);
});
|
|
|
|
// Damages individual structural markers of a known-good corpus PDF (xref
// table, trailer, startxref, object headers, EOF marker) via regex replacement
// and checks that EInvoice either recovers or throws an error with a message.
tap.test('PDF-09: Corrupted PDF Recovery - Structural Damage', async (tools) => {
  const startTime = Date.now();

  try {
    const validPdfs = await CorpusLoader.getFiles('ZUGFERD_V1_CORRECT');

    if (validPdfs.length === 0) {
      console.log('⚠ No valid PDF files found for structural damage testing');
      return;
    }

    const basePdf = validPdfs[0];
    // Read with the 'binary' encoding so the bytes can be edited as a string
    // and written back with the same encoding below.
    const originalContent = await plugins.fs.readFile(basePdf, 'binary');

    console.log(`Testing structural damage with: ${plugins.path.basename(basePdf)}`);

    // Test different types of structural damage
    const structuralDamageTests = [
      {
        name: 'Missing xref table',
        damage: (content: string) => content.replace(/xref\s*\n[\s\S]*?trailer/g, 'damaged-xref')
      },
      {
        name: 'Corrupted trailer',
        damage: (content: string) => content.replace(/trailer\s*<<[\s\S]*?>>/g, 'damaged-trailer')
      },
      {
        name: 'Missing startxref',
        damage: (content: string) => content.replace(/startxref\s*\d+/g, 'damaged-startxref')
      },
      {
        name: 'Corrupted PDF objects',
        damage: (content: string) => content.replace(/\d+\s+\d+\s+obj/g, 'XX XX damaged')
      },
      {
        name: 'Missing EOF marker',
        damage: (content: string) => content.replace(/%%EOF\s*$/, 'CORRUPTED')
      }
    ];

    // The scratch directory is the same for every case, so create it once.
    const scratchDir = plugins.path.join(process.cwd(), '.nogit');
    await plugins.fs.mkdir(scratchDir, { recursive: true });

    for (const damageTest of structuralDamageTests) {
      console.log(`Testing ${damageTest.name}...`);

      const damagedContent = damageTest.damage(originalContent);
      const fileSlug = damageTest.name.toLowerCase().replace(/\s+/g, '-');
      const damagedPath = plugins.path.join(scratchDir, `structural-${fileSlug}.pdf`);

      try {
        // A failure to write the fixture is a setup problem, not an extraction
        // failure, so it is deliberately left outside the inner try/catch.
        await plugins.fs.writeFile(damagedPath, damagedContent, 'binary');

        try {
          const invoice = new EInvoice();
          const extractionResult = await invoice.fromFile(damagedPath);

          if (extractionResult) {
            console.log(` ✓ Recovered from ${damageTest.name}`);

            // Test extracted content
            const xmlContent = await invoice.toXmlString('ubl');
            if (xmlContent && xmlContent.length > 50) {
              console.log(` Recovered XML content: ${xmlContent.length} chars`);
            }
          } else {
            console.log(` ⚠ No recovery possible from ${damageTest.name}`);
          }
        } catch (extractionError) {
          console.log(` ⚠ ${damageTest.name} extraction failed: ${extractionError.message.substring(0, 80)}...`);
          expect(extractionError.message).toBeTruthy();
        }
      } finally {
        // Clean up in `finally`: extraction is expected to throw for most of
        // these cases, and the damaged file must still be removed when it does.
        try {
          await plugins.fs.unlink(damagedPath);
        } catch (cleanupError) {
          // Best-effort cleanup: a missing temp file must not mask the real test outcome.
        }
      }
    }
  } catch (error) {
    console.log(`Structural damage test failed: ${error.message}`);
    throw error;
  }

  const duration = Date.now() - startTime;
  console.log(`Test completed in ${duration}ms`);
});
|
|
|
|
// Runs a set of named "attachment corruption" scenarios and reports simple
// integrity heuristics on the XML that EInvoice produces.
// NOTE(review): none of the scenarios modifies the input; every iteration
// extracts from the same unmodified corpus PDF, so the scenario names and
// descriptions only label the log output.
tap.test('PDF-09: Corrupted PDF Recovery - Attachment Corruption', async (tools) => {
  const startedAt = Date.now();

  // Test scenarios where the XML attachment itself is corrupted
  try {
    const corpusPdfs = await CorpusLoader.getFiles('ZUGFERD_V1_CORRECT');

    if (corpusPdfs.length === 0) {
      console.log('⚠ No valid PDF files found for attachment corruption testing');
      return;
    }

    const basePdf = corpusPdfs[0];

    console.log(`Testing attachment corruption scenarios with: ${plugins.path.basename(basePdf)}`);

    // Baseline: extract once from the untouched file and report the XML size.
    try {
      const baselineInvoice = new EInvoice();
      const baselineResult = await baselineInvoice.fromFile(basePdf);

      if (baselineResult) {
        const baselineXml = await baselineInvoice.toXmlString('ubl');
        console.log(`Original XML length: ${baselineXml.length} chars`);
      }
    } catch (originalError) {
      console.log(`Could not extract original XML: ${originalError.message}`);
    }

    // Test various attachment corruption scenarios
    const scenarios = [
      { name: 'Partial XML Loss', description: 'Simulate partial loss of XML attachment data' },
      { name: 'Encoding Corruption', description: 'Simulate character encoding corruption' },
      { name: 'Compression Corruption', description: 'Simulate corruption in compressed attachment streams' },
      { name: 'Multiple Attachments', description: 'Test handling when PDF contains multiple/conflicting XML attachments' }
    ];

    // Substrings that mark an error message as pointing at corruption.
    const corruptionHints = ['corrupt', 'malformed', 'damaged'];

    for (const { name, description } of scenarios) {
      console.log(`Testing ${name}: ${description}`);

      try {
        const invoice = new EInvoice();

        // Attempt extraction with error handling
        const extractionResult = await invoice.fromFile(basePdf);

        if (!extractionResult) {
          console.log(` ⚠ ${name}: No XML extracted`);
          continue;
        }

        // If we got any result, test the robustness of the extraction
        const extractedXml = await invoice.toXmlString('ubl');
        if (!extractedXml) {
          continue;
        }

        // Test XML integrity using cheap string heuristics (no XML parsing).
        const occurrences = (pattern: RegExp) => (extractedXml.match(pattern) || []).length;
        const hasXmlDeclaration = extractedXml.startsWith('<?xml');
        const hasRootElement = extractedXml.includes('<') && extractedXml.includes('>');
        const hasClosingTags = extractedXml.includes('</');
        const isBalanced = occurrences(/</g) === occurrences(/>/g);

        console.log(` XML Integrity Checks:`);
        console.log(` Has XML Declaration: ${hasXmlDeclaration}`);
        console.log(` Has Root Element: ${hasRootElement}`);
        console.log(` Has Closing Tags: ${hasClosingTags}`);
        console.log(` Tags Balanced: ${isBalanced}`);

        const allChecksPass = hasXmlDeclaration && hasRootElement && hasClosingTags && isBalanced;
        console.log(
          allChecksPass
            ? ` ✓ ${name}: XML integrity maintained`
            : ` ⚠ ${name}: XML integrity issues detected`
        );
      } catch (extractionError) {
        console.log(` ⚠ ${name} extraction failed: ${extractionError.message.substring(0, 80)}...`);

        // Verify error contains useful information
        expect(extractionError.message).toBeTruthy();

        // Check if error suggests recovery options
        const loweredMessage = extractionError.message.toLowerCase();
        if (corruptionHints.some((hint) => loweredMessage.includes(hint))) {
          console.log(` ✓ Error message indicates corruption: helpful for debugging`);
        }
      }
    }
  } catch (error) {
    console.log(`Attachment corruption test failed: ${error.message}`);
    throw error;
  }

  console.log(`Test completed in ${Date.now() - startedAt}ms`);
});
|
|
|
|
// Loads three obviously invalid files and grades the error message EInvoice
// throws. Only "message longer than 20 characters" is asserted; the remaining
// quality criteria are logged for information.
tap.test('PDF-09: Corrupted PDF Recovery - Error Reporting Quality', async (tools) => {
  const startedAt = Date.now();

  // Test quality of error reporting for corrupted PDFs
  const scenarios = [
    {
      name: 'Completely Invalid File',
      content: 'This is definitely not a PDF file at all',
      expectedErrorTypes: ['format', 'invalid', 'not-pdf']
    },
    {
      name: 'Binary Garbage',
      content: Buffer.from([0x00, 0xFF, 0xAB, 0xCD, 0xEF, 0x12, 0x34, 0x56]),
      expectedErrorTypes: ['binary', 'corrupt', 'invalid']
    },
    {
      name: 'Partial PDF Header',
      content: '%PDF-1.4\n%âãÏÓ\n1 0 obj\n<< >>\nendobj\n',
      expectedErrorTypes: ['incomplete', 'truncated', 'structure']
    }
  ];

  // Words that make an error message count as "actionable".
  const actionableHints = ['check', 'verify', 'ensure', 'corrupt'];

  for (const { name, content, expectedErrorTypes } of scenarios) {
    console.log(`Testing error reporting for: ${name}`);

    const slug = name.toLowerCase().replace(/\s+/g, '-');
    const corruptedPath = plugins.path.join(process.cwd(), '.nogit', `error-${slug}.pdf`);
    await plugins.fs.mkdir(plugins.path.dirname(corruptedPath), { recursive: true });

    try {
      // Create corrupted file; string fixtures are written with the 'binary' encoding.
      if (Buffer.isBuffer(content)) {
        await plugins.fs.writeFile(corruptedPath, content);
      } else {
        await plugins.fs.writeFile(corruptedPath, content, 'binary');
      }

      const invoice = new EInvoice();

      try {
        await invoice.fromFile(corruptedPath);
        console.log(` ⚠ Expected error for ${name} but operation succeeded`);
      } catch (extractionError) {
        console.log(` ✓ Error caught for ${name}`);
        console.log(` Error message: ${extractionError.message}`);

        // Analyze error message quality
        const loweredMessage = extractionError.message.toLowerCase();
        const isDescriptive = extractionError.message.length > 20;
        const containsFileInfo = loweredMessage.includes('pdf') || loweredMessage.includes('file');
        const containsErrorType = expectedErrorTypes.some((type) => loweredMessage.includes(type));
        const isActionable = actionableHints.some((hint) => loweredMessage.includes(hint));

        console.log(` Message Quality Analysis:`);
        console.log(` Descriptive (>20 chars): ${isDescriptive}`);
        console.log(` Contains file info: ${containsFileInfo}`);
        console.log(` Contains error type: ${containsErrorType}`);
        console.log(` Is actionable: ${isActionable}`);

        // Error message should be helpful
        expect(isDescriptive).toBeTrue();

        console.log(
          containsFileInfo && isActionable
            ? ` ✓ High quality error message`
            : ` ⚠ Error message could be more helpful`
        );

        // Report optional error object properties when they are set.
        if (extractionError.code) {
          console.log(` Error code: ${extractionError.code}`);
        }

        if (extractionError.path) {
          console.log(` Error path: ${extractionError.path}`);
        }
      }
    } finally {
      // Remove the scratch file regardless of outcome.
      try {
        await plugins.fs.unlink(corruptedPath);
      } catch (cleanupError) {
        // Ignore cleanup errors
      }
    }
  }

  console.log(`Test completed in ${Date.now() - startedAt}ms`);
});
|
|
|
|
// Prints a closing banner for the PDF-09 suite; makes no assertions.
tap.test('PDF-09: Test Summary', async (tools) => {
  const summaryLines = [
    `\n=== Corrupted PDF Recovery Test Summary ===`,
    `\nCorrupted PDF recovery testing completed.`,
    `Note: Most corruption tests expect failures - this is normal and indicates proper error handling.`
  ];

  for (const line of summaryLines) {
    console.log(line);
  }
});
|
|
|
|
// Start the tap runner now that all PDF-09 tests above have been declared.
tap.start();
|