// einvoice/test/suite/einvoice_edge-cases/test.edge-09.corrupted-zip.ts

import { tap, expect } from '@git.zone/tstest/tapbundle';
import { EInvoice } from '../../../ts/index.js';
import { PDFExtractor } from '../../../ts/formats/pdf/pdf.extractor.js';

tap.test('EDGE-09: Corrupted ZIP Containers - should handle corrupted ZIP/container files gracefully', async () => {
  console.log('Testing corrupted ZIP container handling...\n');

  // Test 1: Invalid PDF headers
  /**
   * Runs `PDFExtractor.extractXml` against buffers that do not begin with a
   * usable PDF header and records how each attempt ended.
   *
   * `handled: true` means the call resolved with a result object;
   * `handled: false` means it threw, in which case `error` holds the text of
   * whatever was thrown.
   *
   * @returns One outcome per fixture, in fixture order.
   */
  const testInvalidPdfHeaders = async () => {
    /** Outcome of a single extraction attempt. */
    interface HeaderProbeResult {
      name: string;
      handled: boolean;
      success?: boolean;
      error?: string;
    }

    // A `throw` may carry any value, not just an Error; reduce it to text
    // without dereferencing a property that might not exist.
    const toMessage = (thrown: unknown): string => {
      if (thrown instanceof Error) {
        return thrown.message;
      }
      if (typeof thrown === 'object' && thrown !== null && 'message' in thrown) {
        return String((thrown as { message: unknown }).message);
      }
      return String(thrown);
    };

    const corruptHeaders = [
      {
        name: 'wrong-magic-bytes',
        data: Buffer.from('NOTAPDF\x00\x00\x00\x00'),
        description: 'Invalid PDF signature'
      },
      {
        name: 'truncated-header',
        data: Buffer.from('PK\x03'),
        description: 'ZIP-like header (not PDF)'
      },
      {
        name: 'empty-file',
        data: Buffer.from(''),
        description: 'Empty file'
      }
    ];

    const results: HeaderProbeResult[] = [];
    for (const corrupt of corruptHeaders) {
      try {
        // A fresh extractor per fixture, so no state carries over between attempts.
        const extractor = new PDFExtractor();
        const result = await extractor.extractXml(corrupt.data);
        results.push({
          name: corrupt.name,
          handled: true,
          success: result.success,
          error: result.error?.message
        });
      } catch (error: unknown) {
        results.push({
          name: corrupt.name,
          handled: false,
          error: toMessage(error)
        });
      }
    }

    return results;
  };

  // Run Test 1 and print one line per fixture, plus the error text when there is one.
  const invalidHeaderResults = await testInvalidPdfHeaders();
  console.log('Test 1 - Invalid PDF headers:');
  for (const result of invalidHeaderResults) {
    console.log(`  ${result.name}: ${result.handled ? 'Handled gracefully' : 'Threw exception'}`);
    if (result.error) {
      // Clip long messages to keep the log compact; only add the ellipsis
      // when text was actually cut, so short messages are shown verbatim.
      const maxErrorLength = 50;
      const shown = result.error.length > maxErrorLength
        ? `${result.error.substring(0, maxErrorLength)}...`
        : result.error;
      console.log(`    Error: ${shown}`);
    }
  }
  // All should be handled gracefully (no exceptions)
  expect(invalidHeaderResults.every(r => r.handled)).toEqual(true);

  // Test 2: Corrupted PDF structure
  /**
   * Passes buffers that start with a PDF header but are otherwise broken to
   * `EInvoice.fromPdf` and records how each load ended.
   *
   * A load that resolves is recorded with `loaded: true` and whether the
   * reported format differs from `'unknown'`. A load that throws is recorded
   * with the type name of the thrown value and a `graceful` flag. `graceful`
   * is true only when the message mentions `PDF` or `XML` and contains neither
   * `Cannot read` nor `undefined`.
   *
   * @returns One outcome per fixture, in fixture order.
   */
  const testCorruptedPdfStructure = async () => {
    /** Outcome of one `EInvoice.fromPdf` attempt. */
    interface StructureProbeResult {
      name: string;
      loaded: boolean;
      hasFormat?: boolean;
      errorType?: string;
      graceful?: boolean;
    }

    // A `throw` may carry any value, not just an Error; reduce it to text
    // without dereferencing a property that might not exist.
    const toMessage = (thrown: unknown): string => {
      if (thrown instanceof Error) {
        return thrown.message;
      }
      if (typeof thrown === 'object' && thrown !== null && 'message' in thrown) {
        return String((thrown as { message: unknown }).message);
      }
      return String(thrown);
    };

    const corruptedPdfs = [
      {
        name: 'pdf-header-only',
        data: Buffer.from('%PDF-1.4\n'),
        description: 'PDF header without content'
      },
      {
        name: 'incomplete-pdf',
        data: Buffer.from('%PDF-1.4\n1 0 obj\n<< /Type /Catalog >>\nendobj\n'),
        description: 'PDF without xref table'
      },
      {
        name: 'mixed-binary',
        data: Buffer.concat([
          Buffer.from('%PDF-1.4\n'),
          Buffer.from([0xFF, 0xFE, 0xFD, 0xFC]),
          Buffer.from('\nendobj\n')
        ]),
        description: 'PDF with binary garbage'
      }
    ];

    const results: StructureProbeResult[] = [];
    for (const pdf of corruptedPdfs) {
      try {
        const einvoice = await EInvoice.fromPdf(pdf.data);
        results.push({
          name: pdf.name,
          loaded: true,
          hasFormat: einvoice.getFormat() !== 'unknown'
        });
      } catch (error: unknown) {
        // Classify on the normalised text so a non-Error throw cannot raise a
        // second exception from inside this handler.
        const message = toMessage(error);
        const looksAccidental = message.includes('Cannot read') || message.includes('undefined');
        const namesTheInput = message.includes('PDF') || message.includes('XML');
        results.push({
          name: pdf.name,
          loaded: false,
          errorType: error instanceof Error ? error.constructor.name : typeof error,
          graceful: !looksAccidental && namesTheInput
        });
      }
    }

    return results;
  };

  // Run Test 2 and report one line per fixture.
  const corruptedPdfResults = await testCorruptedPdfStructure();
  console.log('\nTest 2 - Corrupted PDF structure:');
  for (const outcome of corruptedPdfResults) {
    const status = outcome.loaded ? 'Loaded' : 'Failed';
    const tag = outcome.graceful ? '[Graceful]' : '';
    console.log(`  ${outcome.name}: ${status} ${tag}`);
  }
  // Every fixture has to be rejected, and the rejection has to be graceful.
  const allFailedGracefully = corruptedPdfResults.every(outcome => !outcome.loaded && outcome.graceful);
  expect(allFailedGracefully).toEqual(true);

  // Test 3: Non-PDF files masquerading as PDFs
  /**
   * Hands XML, JSON and HTML payloads to `EInvoice.fromPdf` as if they were
   * PDFs and records whether each was processed or rejected.
   *
   * A payload that loads is recorded with `processed: true` and the format the
   * invoice reports. A payload that throws is recorded with an `errorClear`
   * flag, which is true when the message contains `PDF`, `No XML found` or
   * `Invalid`.
   *
   * @returns One outcome per payload, in fixture order.
   */
  const testNonPdfFiles = async () => {
    /** Outcome of one `EInvoice.fromPdf` attempt on a non-PDF payload. */
    interface MasqueradeProbeResult {
      name: string;
      processed: boolean;
      // Derived from the API so this follows whatever getFormat() returns.
      format?: ReturnType<Awaited<ReturnType<typeof EInvoice.fromPdf>>['getFormat']>;
      errorClear?: boolean;
    }

    // A `throw` may carry any value, not just an Error; reduce it to text
    // without dereferencing a property that might not exist.
    const toMessage = (thrown: unknown): string => {
      if (thrown instanceof Error) {
        return thrown.message;
      }
      if (typeof thrown === 'object' && thrown !== null && 'message' in thrown) {
        return String((thrown as { message: unknown }).message);
      }
      return String(thrown);
    };

    const nonPdfFiles = [
      {
        name: 'xml-file',
        data: Buffer.from('<?xml version="1.0"?><Invoice xmlns="test"><ID>TEST-001</ID></Invoice>'),
        description: 'Plain XML file'
      },
      {
        name: 'json-file',
        data: Buffer.from('{"invoice": {"id": "TEST-001", "amount": 100}}'),
        description: 'JSON file'
      },
      {
        name: 'html-file',
        data: Buffer.from('<!DOCTYPE html><html><body><h1>Invoice</h1></body></html>'),
        description: 'HTML file'
      }
    ];

    const results: MasqueradeProbeResult[] = [];
    for (const file of nonPdfFiles) {
      try {
        const einvoice = await EInvoice.fromPdf(file.data);
        results.push({
          name: file.name,
          processed: true,
          format: einvoice.getFormat()
        });
      } catch (error: unknown) {
        // Inspect the normalised text so a non-Error throw cannot raise a
        // second exception from inside this handler.
        const message = toMessage(error);
        results.push({
          name: file.name,
          processed: false,
          errorClear: message.includes('PDF') ||
                      message.includes('No XML found') ||
                      message.includes('Invalid')
        });
      }
    }

    return results;
  };

  // Run Test 3 and print a one-line verdict for each disguised file.
  const nonPdfResults = await testNonPdfFiles();
  console.log('\nTest 3 - Non-PDF files:');
  for (const outcome of nonPdfResults) {
    const verdict = outcome.processed ? `Processed (${outcome.format})` : 'Rejected';
    const note = outcome.errorClear ? '[Clear error]' : '';
    console.log(`  ${outcome.name}: ${verdict} ${note}`);
  }
  // Passes only when no file was processed and no rejection was unclear.
  const wronglyHandled = nonPdfResults.filter(outcome => outcome.processed || !outcome.errorClear);
  expect(wronglyHandled.length === 0).toEqual(true);

  // Test 4: Edge case sizes
  /**
   * Calls `PDFExtractor.extractXml` with zero-filled buffers of 0, 1, 10 and
   * 1024 bytes, each starting with as much of the `%PDF-1.4` header as fits.
   *
   * `handled: true` means the call resolved, and `hasError` tells whether the
   * result carried an error. `handled: false` means it threw, and `error`
   * holds the text of whatever was thrown.
   *
   * @returns One outcome per size, in the order listed.
   */
  const testEdgeCaseSizes = async () => {
    /** Outcome of one extraction attempt; `size` is the fixture label. */
    interface SizeProbeResult {
      size: string;
      handled: boolean;
      hasError?: boolean;
      error?: string;
    }

    // A `throw` may carry any value, not just an Error; reduce it to text
    // without dereferencing a property that might not exist.
    const toMessage = (thrown: unknown): string => {
      if (thrown instanceof Error) {
        return thrown.message;
      }
      if (typeof thrown === 'object' && thrown !== null && 'message' in thrown) {
        return String((thrown as { message: unknown }).message);
      }
      return String(thrown);
    };

    const sizes = [
      { size: 0, name: 'empty' },
      { size: 1, name: '1-byte' },
      { size: 10, name: '10-bytes' },
      { size: 1024, name: '1KB' }
    ];

    const results: SizeProbeResult[] = [];
    for (const { size, name } of sizes) {
      const data = Buffer.alloc(size);
      if (size > 0) {
        // Add partial PDF header if there's space
        const header = '%PDF-1.4';
        data.write(header.substring(0, Math.min(size, header.length)), 0);
      }

      try {
        const extractor = new PDFExtractor();
        const result = await extractor.extractXml(data);
        results.push({
          size: name,
          handled: true,
          hasError: !!result.error
        });
      } catch (error: unknown) {
        results.push({
          size: name,
          handled: false,
          error: toMessage(error)
        });
      }
    }

    return results;
  };

  // Run Test 4 and print one line per buffer size.
  const sizeResults = await testEdgeCaseSizes();
  console.log('\nTest 4 - Edge case sizes:');
  for (const entry of sizeResults) {
    const outcome = entry.handled ? 'Handled' : 'Exception';
    const marker = entry.hasError ? '[Expected error]' : '';
    console.log(`  ${entry.size}: ${outcome} ${marker}`);
  }
  // Passes only when no size caused extractXml to throw.
  expect(!sizeResults.some(entry => !entry.handled)).toEqual(true);

  // Test 5: Partial PDF with embedded XML (recovery test)
  /**
   * Builds a PDF fragment holding one embedded-file object whose stream is a
   * small CrossIndustryInvoice document, with no xref table or trailer, and
   * passes it to `PDFExtractor.extractXml`.
   *
   * @returns When the call resolves: `extracted` (the result's `success`),
   *   `hasXml`, `xmlValid` (the XML contains `PARTIAL-001`) and `errorType`.
   *   When it throws: `extracted: false`, `exception: true` and `error` with
   *   the text of whatever was thrown.
   */
  const testPartialPdfRecovery = async () => {
    // A `throw` may carry any value, not just an Error; reduce it to text
    // without dereferencing a property that might not exist.
    const toMessage = (thrown: unknown): string => {
      if (thrown instanceof Error) {
        return thrown.message;
      }
      if (typeof thrown === 'object' && thrown !== null && 'message' in thrown) {
        return String((thrown as { message: unknown }).message);
      }
      return String(thrown);
    };

    // Create a partial PDF that might contain XML
    // NOTE(review): the payload uses the `ram:` prefix but only declares
    // `xmlns:rsm` — confirm whether that is intended for this fixture.
    const partialPdfWithXml = Buffer.concat([
      Buffer.from('%PDF-1.4\n'),
      Buffer.from('1 0 obj\n<<\n/Type /EmbeddedFile\n/Subtype /text#2Fxml\n>>\nstream\n'),
      Buffer.from('<?xml version="1.0"?>\n<rsm:CrossIndustryInvoice xmlns:rsm="urn:un:unece:uncefact:data:standard:CrossIndustryInvoice:100">\n'),
      Buffer.from('  <rsm:ExchangedDocument>\n    <ram:ID>PARTIAL-001</ram:ID>\n  </rsm:ExchangedDocument>\n'),
      Buffer.from('</rsm:CrossIndustryInvoice>\n'),
      Buffer.from('endstream\nendobj\n')
      // Intentionally incomplete - missing xref and trailer
    ]);

    try {
      const extractor = new PDFExtractor();
      const result = await extractor.extractXml(partialPdfWithXml);

      return {
        extracted: result.success,
        hasXml: !!result.xml,
        xmlValid: result.xml ? result.xml.includes('PARTIAL-001') : false,
        errorType: result.error?.type
      };
    } catch (error: unknown) {
      return {
        extracted: false,
        exception: true,
        error: toMessage(error)
      };
    }
  };

  // Run Test 5 and summarise the outcome of the recovery attempt.
  const recoveryResult = await testPartialPdfRecovery();
  const extractionLabel = recoveryResult.extracted ? 'Success' : 'Failed';
  const xmlFound = recoveryResult.hasXml || false;
  const threw = recoveryResult.exception || false;
  console.log('\nTest 5 - Partial PDF recovery:');
  console.log(`  Extraction: ${extractionLabel}`);
  console.log(`  Has XML: ${xmlFound}`);
  console.log(`  Exception: ${threw}`);

  // A failed extraction is acceptable here; a thrown exception is not.
  expect(!recoveryResult.exception).toEqual(true);

  console.log('\n✓ All corrupted ZIP/PDF edge cases handled appropriately');
});

// Call the tap harness entry point after the test above has been registered
// (presumably this is what runs it — confirm against the tapbundle docs).
tap.start();