2025-05-28 12:52:08 +00:00
|
|
|
import { tap, expect } from '@git.zone/tstest/tapbundle';
|
2025-05-26 04:04:51 +00:00
|
|
|
import { EInvoice } from '../../../ts/index.js';
|
2025-05-28 12:52:08 +00:00
|
|
|
import { PDFExtractor } from '../../../ts/formats/pdf/pdf.extractor.js';
|
|
|
|
|
|
|
|
/**
 * EDGE-09: feeds deliberately broken "PDF" buffers to PDFExtractor.extractXml()
 * and EInvoice.fromPdf() and checks how each failure surfaces.
 *
 * Five scenarios run in order, each wrapped in a local async helper that
 * collects plain result records, followed by a console summary and one
 * assertion:
 *   1. buffers whose leading bytes are not a PDF signature
 *   2. buffers that start like a PDF but have no usable structure
 *   3. XML / JSON / HTML payloads passed off as PDFs
 *   4. tiny and zero-padded buffers of fixed sizes
 *   5. a truncated PDF that carries an XML stream but no xref/trailer
 */
tap.test('EDGE-09: Corrupted ZIP Containers - should handle corrupted ZIP/container files gracefully', async () => {
  console.log('Testing corrupted ZIP container handling...\n');

  /**
   * Returns a printable message for whatever landed in a `catch` binding.
   * JavaScript lets code throw any value (strings, null, plain objects), so
   * `thrown.message` cannot be dereferenced blindly: doing so inside a catch
   * handler would replace the real failure with an unrelated TypeError.
   *
   * @param {unknown} thrown - the caught value
   * @returns {string} `thrown.message` when it is a string, otherwise String(thrown)
   */
  const messageOf = (thrown) => {
    if (typeof thrown?.message === 'string') {
      return thrown.message;
    }
    return String(thrown);
  };

  /**
   * Shortens text for log output, adding an ellipsis only when something was
   * actually cut off.
   *
   * @param {unknown} value - text to display (coerced to string)
   * @param {number} [max=50] - maximum number of characters kept
   * @returns {string} the possibly shortened text
   */
  const preview = (value, max = 50) => {
    const text = String(value);
    return text.length > max ? `${text.substring(0, max)}...` : text;
  };

  // Test 1: Invalid PDF headers
  /**
   * Runs extractXml() on buffers that do not begin with a PDF signature.
   *
   * @returns {Promise<Array<{name: string, handled: boolean, success?: *, error?: string}>>}
   *   one record per fixture; `handled` is false when extractXml() threw
   *   instead of returning a result object.
   */
  const testInvalidPdfHeaders = async () => {
    const corruptHeaders = [
      {
        name: 'wrong-magic-bytes',
        data: Buffer.from('NOTAPDF\x00\x00\x00\x00'),
        description: 'Invalid PDF signature',
      },
      {
        name: 'truncated-header',
        // First three bytes of a ZIP local-file-header signature ("PK\x03\x04").
        data: Buffer.from('PK\x03'),
        description: 'ZIP-like header (not PDF)',
      },
      {
        name: 'empty-file',
        data: Buffer.from(''),
        description: 'Empty file',
      },
    ];

    const results = [];
    for (const corrupt of corruptHeaders) {
      try {
        const extractor = new PDFExtractor();
        const result = await extractor.extractXml(corrupt.data);
        results.push({
          name: corrupt.name,
          handled: true,
          success: result.success,
          error: result.error?.message,
        });
      } catch (error) {
        results.push({
          name: corrupt.name,
          handled: false,
          error: messageOf(error),
        });
      }
    }
    return results;
  };

  const invalidHeaderResults = await testInvalidPdfHeaders();
  console.log('Test 1 - Invalid PDF headers:');
  for (const result of invalidHeaderResults) {
    console.log(` ${result.name}: ${result.handled ? 'Handled gracefully' : 'Threw exception'}`);
    if (result.error) {
      console.log(` Error: ${preview(result.error)}`);
    }
  }
  // All should be handled gracefully (no exceptions)
  expect(invalidHeaderResults.every(r => r.handled)).toEqual(true);

  // Test 2: Corrupted PDF structure
  /**
   * Calls EInvoice.fromPdf() on buffers that start with "%PDF-1.4" but are
   * otherwise incomplete or filled with garbage.
   *
   * @returns {Promise<Array<{name: string, loaded: boolean, hasFormat?: boolean, errorType?: string, graceful?: boolean}>>}
   *   `graceful` is only set on failures. It is true when the message mentions
   *   "PDF" or "XML" and contains neither "Cannot read" nor "undefined", i.e.
   *   it does not look like an accidental TypeError.
   */
  const testCorruptedPdfStructure = async () => {
    const corruptedPdfs = [
      {
        name: 'pdf-header-only',
        data: Buffer.from('%PDF-1.4\n'),
        description: 'PDF header without content',
      },
      {
        name: 'incomplete-pdf',
        data: Buffer.from('%PDF-1.4\n1 0 obj\n<< /Type /Catalog >>\nendobj\n'),
        description: 'PDF without xref table',
      },
      {
        name: 'mixed-binary',
        data: Buffer.concat([
          Buffer.from('%PDF-1.4\n'),
          Buffer.from([0xFF, 0xFE, 0xFD, 0xFC]),
          Buffer.from('\nendobj\n'),
        ]),
        description: 'PDF with binary garbage',
      },
    ];

    const results = [];
    for (const pdf of corruptedPdfs) {
      try {
        const einvoice = await EInvoice.fromPdf(pdf.data);
        results.push({
          name: pdf.name,
          loaded: true,
          hasFormat: einvoice.getFormat() !== 'unknown',
        });
      } catch (error) {
        const message = messageOf(error);
        results.push({
          name: pdf.name,
          loaded: false,
          // Fall back to the primitive type name when a non-object was thrown.
          errorType: error?.constructor?.name ?? typeof error,
          graceful: !message.includes('Cannot read') &&
            !message.includes('undefined') &&
            (message.includes('PDF') || message.includes('XML')),
        });
      }
    }
    return results;
  };

  const corruptedPdfResults = await testCorruptedPdfStructure();
  console.log('\nTest 2 - Corrupted PDF structure:');
  for (const result of corruptedPdfResults) {
    console.log(` ${result.name}: ${result.loaded ? 'Loaded' : 'Failed'} ${result.graceful ? '[Graceful]' : ''}`);
  }
  // All should fail gracefully
  expect(corruptedPdfResults.every(r => !r.loaded && r.graceful)).toEqual(true);

  // Test 3: Non-PDF files masquerading as PDFs
  /**
   * Calls EInvoice.fromPdf() with XML, JSON and HTML payloads.
   *
   * @returns {Promise<Array<{name: string, processed: boolean, format?: *, errorClear?: boolean}>>}
   *   `errorClear` is only set on failures and is true when the message
   *   contains "PDF", "No XML found" or "Invalid".
   */
  const testNonPdfFiles = async () => {
    const nonPdfFiles = [
      {
        name: 'xml-file',
        data: Buffer.from('<?xml version="1.0"?><Invoice xmlns="test"><ID>TEST-001</ID></Invoice>'),
        description: 'Plain XML file',
      },
      {
        name: 'json-file',
        data: Buffer.from('{"invoice": {"id": "TEST-001", "amount": 100}}'),
        description: 'JSON file',
      },
      {
        name: 'html-file',
        data: Buffer.from('<!DOCTYPE html><html><body><h1>Invoice</h1></body></html>'),
        description: 'HTML file',
      },
    ];

    const results = [];
    for (const file of nonPdfFiles) {
      try {
        const einvoice = await EInvoice.fromPdf(file.data);
        results.push({
          name: file.name,
          processed: true,
          format: einvoice.getFormat(),
        });
      } catch (error) {
        const message = messageOf(error);
        results.push({
          name: file.name,
          processed: false,
          errorClear: message.includes('PDF') ||
            message.includes('No XML found') ||
            message.includes('Invalid'),
        });
      }
    }
    return results;
  };

  const nonPdfResults = await testNonPdfFiles();
  console.log('\nTest 3 - Non-PDF files:');
  for (const result of nonPdfResults) {
    console.log(` ${result.name}: ${result.processed ? `Processed (${result.format})` : 'Rejected'} ${result.errorClear ? '[Clear error]' : ''}`);
  }
  // All should be rejected with clear errors
  expect(nonPdfResults.every(r => !r.processed && r.errorClear)).toEqual(true);

  // Test 4: Edge case sizes
  /**
   * Runs extractXml() on zero-filled buffers of 0, 1, 10 and 1024 bytes, each
   * prefixed with as much of "%PDF-1.4" as fits.
   *
   * @returns {Promise<Array<{size: string, handled: boolean, hasError?: boolean, error?: string}>>}
   *   `size` holds the fixture label (not the byte count); `handled` is false
   *   when extractXml() threw.
   */
  const testEdgeCaseSizes = async () => {
    const sizes = [
      { size: 0, name: 'empty' },
      { size: 1, name: '1-byte' },
      { size: 10, name: '10-bytes' },
      { size: 1024, name: '1KB' },
    ];

    const results = [];
    for (const { size, name } of sizes) {
      const data = Buffer.alloc(size);
      if (size > 0) {
        // Add partial PDF header if there's space
        const header = '%PDF-1.4';
        data.write(header.substring(0, Math.min(size, header.length)), 0);
      }

      try {
        const extractor = new PDFExtractor();
        const result = await extractor.extractXml(data);
        results.push({
          size: name,
          handled: true,
          hasError: !!result.error,
        });
      } catch (error) {
        results.push({
          size: name,
          handled: false,
          error: messageOf(error),
        });
      }
    }
    return results;
  };

  const sizeResults = await testEdgeCaseSizes();
  console.log('\nTest 4 - Edge case sizes:');
  for (const result of sizeResults) {
    console.log(` ${result.size}: ${result.handled ? 'Handled' : 'Exception'} ${result.hasError ? '[Expected error]' : ''}`);
  }
  // All should be handled without throwing
  expect(sizeResults.every(r => r.handled)).toEqual(true);

  // Test 5: Partial PDF with embedded XML (recovery test)
  /**
   * Runs extractXml() on a hand-built PDF fragment that contains one
   * EmbeddedFile stream with a CrossIndustryInvoice document but no xref
   * table or trailer.
   *
   * @returns {Promise<{extracted: *, hasXml?: boolean, xmlValid?: boolean, errorType?: *, exception?: boolean, error?: string}>}
   *   `exception` is only present (true) when extractXml() threw.
   */
  const testPartialPdfRecovery = async () => {
    // Create a partial PDF that might contain XML
    // NOTE(review): the fixture uses the `ram:` prefix without declaring it
    // (only xmlns:rsm is present) - confirm whether that is intentional.
    const partialPdfWithXml = Buffer.concat([
      Buffer.from('%PDF-1.4\n'),
      Buffer.from('1 0 obj\n<<\n/Type /EmbeddedFile\n/Subtype /text#2Fxml\n>>\nstream\n'),
      Buffer.from('<?xml version="1.0"?>\n<rsm:CrossIndustryInvoice xmlns:rsm="urn:un:unece:uncefact:data:standard:CrossIndustryInvoice:100">\n'),
      Buffer.from(' <rsm:ExchangedDocument>\n <ram:ID>PARTIAL-001</ram:ID>\n </rsm:ExchangedDocument>\n'),
      Buffer.from('</rsm:CrossIndustryInvoice>\n'),
      Buffer.from('endstream\nendobj\n'),
      // Intentionally incomplete - missing xref and trailer
    ]);

    try {
      const extractor = new PDFExtractor();
      const result = await extractor.extractXml(partialPdfWithXml);

      return {
        extracted: result.success,
        hasXml: !!result.xml,
        xmlValid: result.xml ? result.xml.includes('PARTIAL-001') : false,
        errorType: result.error?.type,
      };
    } catch (error) {
      return {
        extracted: false,
        exception: true,
        error: messageOf(error),
      };
    }
  };

  const recoveryResult = await testPartialPdfRecovery();
  console.log('\nTest 5 - Partial PDF recovery:');
  console.log(` Extraction: ${recoveryResult.extracted ? 'Success' : 'Failed'}`);
  console.log(` Has XML: ${recoveryResult.hasXml || false}`);
  console.log(` Exception: ${recoveryResult.exception || false}`);

  // Should handle gracefully even if extraction fails
  expect(!recoveryResult.exception).toEqual(true);

  console.log('\n✓ All corrupted ZIP/PDF edge cases handled appropriately');
});
|
2025-05-26 04:04:51 +00:00
|
|
|
|
|
|
|
// Start the tap runner.
// NOTE(review): presumably tests registered via tap.test() only execute once
// this is called - confirm against the @git.zone/tstest tapbundle docs.
tap.start();
|