einvoice/test/suite/einvoice_pdf-operations/test.pdf-09.corrupted-pdf.ts

import { tap, expect } from '@git.zone/tstest/tapbundle';
import * as plugins from '../../../ts/plugins.js';
import { EInvoice } from '../../../ts/index.js';
import { CorpusLoader } from '../../helpers/corpus.loader.js';

// PDF-09: Corrupted PDF Recovery
// Tests recovery mechanisms for corrupted, malformed, or partially damaged PDF files
// including graceful error handling and data recovery strategies

// Truncation scenario.
//
// Takes the first file returned by CorpusLoader.getFiles('ZUGFERD_V1_CORRECT'),
// writes progressively shorter prefixes of it into `<cwd>/.nogit/` and feeds each
// prefix to `EInvoice.fromFile`. Both outcomes are accepted: a truthy result is
// logged as an unexpected success, a thrown error only has to carry a non-empty
// message. If the corpus yields no file, the test logs a warning and returns.
// Any failure outside the per-file extraction is logged and rethrown.
tap.test('PDF-09: Corrupted PDF Recovery - Truncated PDF Files', async (tools) => {
  const startTime = Date.now();

  // Turns whatever was thrown into a printable message, so that a non-Error
  // throw cannot crash the handler with a TypeError and hide the real failure.
  const describeError = (err: unknown): string =>
    err instanceof Error ? err.message : String(err);

  try {
    // Get a working PDF from corpus to create corrupted versions
    const validPdfs = await CorpusLoader.getFiles('ZUGFERD_V1_CORRECT');

    if (validPdfs.length === 0) {
      console.log('⚠ No valid PDF files found for corruption testing');
      return;
    }

    const basePdf = validPdfs[0];
    const basePdfName = plugins.path.basename(basePdf);

    console.log(`Creating corrupted versions of: ${basePdfName}`);

    // Read the original PDF
    const originalPdfBuffer = await plugins.fs.readFile(basePdf);
    const originalSize = originalPdfBuffer.length;

    console.log(`Original PDF size: ${(originalSize / 1024).toFixed(1)}KB`);

    // `percentage` is the fraction of the original bytes that is KEPT, so the
    // "90% Truncated" case still contains the first 90% of the file.
    const truncationTests = [
      { name: '90% Truncated', percentage: 0.9 },
      { name: '75% Truncated', percentage: 0.75 },
      { name: '50% Truncated', percentage: 0.5 },
      { name: '25% Truncated', percentage: 0.25 },
      { name: '10% Truncated', percentage: 0.1 }
    ];

    // The scratch directory is the same for every case; create it once.
    const scratchDir = plugins.path.join(process.cwd(), '.nogit');
    await plugins.fs.mkdir(scratchDir, { recursive: true });

    for (const truncationTest of truncationTests) {
      const truncatedSize = Math.floor(originalSize * truncationTest.percentage);
      // subarray() is a view on the original buffer; it is only read from here.
      const truncatedBuffer = originalPdfBuffer.subarray(0, truncatedSize);

      const truncatedPath = plugins.path.join(
        scratchDir,
        `truncated-${truncationTest.name.toLowerCase().replace(/\s+/g, '-')}.pdf`
      );
      await plugins.fs.writeFile(truncatedPath, truncatedBuffer);

      console.log(`Testing ${truncationTest.name} (${(truncatedSize / 1024).toFixed(1)}KB)...`);

      try {
        const invoice = new EInvoice();
        const extractionResult = await invoice.fromFile(truncatedPath);

        if (extractionResult) {
          console.log(`  ✓ Unexpected success - managed to extract from ${truncationTest.name}`);

          // Verify extracted content
          const xmlContent = await invoice.toXmlString('ubl');
          if (xmlContent && xmlContent.length > 50) {
            console.log(`    Extracted XML length: ${xmlContent.length} chars`);
          }
        } else {
          console.log(`  ✓ Expected failure - no extraction from ${truncationTest.name}`);
        }

      } catch (extractionError) {
        // Expected for corrupted files
        const message = describeError(extractionError);
        console.log(`  ✓ Expected error for ${truncationTest.name}: ${message.substring(0, 100)}...`);
        expect(message).toBeTruthy();
      } finally {
        // Remove the scratch file even when the assertion above throws.
        // Best-effort on purpose: a cleanup error must not replace the real failure.
        try {
          await plugins.fs.unlink(truncatedPath);
        } catch (cleanupError) {
          // Ignore cleanup errors
        }
      }
    }

  } catch (error) {
    console.log(`Truncated PDF test failed: ${describeError(error)}`);
    throw error;
  }

  const duration = Date.now() - startTime;
  console.log(`Test completed in ${duration}ms`);
});

// Header corruption scenario.
//
// Each case writes a tiny hand-made file with a broken or missing PDF header to
// `<cwd>/.nogit/` and passes it to `EInvoice.fromFile`. For cases flagged
// `expectedError`, a thrown error must carry a non-empty message; a truthy or
// falsy return value is only logged. For cases not flagged, a thrown error is
// rethrown. The scratch file is removed after every case.
tap.test('PDF-09: Corrupted PDF Recovery - Header Corruption', async (tools) => {
  const startedAt = Date.now();

  // Table of malformed header payloads
  const scenarios = [
    { name: 'Invalid PDF Header', content: '%NOT-A-PDF-1.4\n%âãÏÓ\n', expectedError: true },
    { name: 'Missing PDF Version', content: '%PDF-\n%âãÏÓ\n', expectedError: true },
    { name: 'Corrupted Binary Marker', content: '%PDF-1.4\n%CORRUPTED\n', expectedError: true },
    { name: 'Empty PDF File', content: '', expectedError: true },
    { name: 'Only Header Line', content: '%PDF-1.4\n', expectedError: true },
    {
      name: 'Wrong File Extension Content',
      content: 'This is actually a text file, not a PDF',
      expectedError: true
    }
  ];

  for (const { name, content, expectedError } of scenarios) {
    console.log(`Testing ${name}...`);

    const slug = name.toLowerCase().replace(/\s+/g, '-');
    const targetFile = plugins.path.join(process.cwd(), '.nogit', `header-${slug}.pdf`);
    await plugins.fs.mkdir(plugins.path.dirname(targetFile), { recursive: true });

    try {
      // Writing happens inside the try, so a write failure is handled like an
      // extraction failure by the catch below.
      await plugins.fs.writeFile(targetFile, content, 'binary');

      const invoice = new EInvoice();
      const extracted = await invoice.fromFile(targetFile);

      if (!expectedError) {
        console.log(`  ✓ ${name}: Extraction succeeded as expected`);
      } else if (extracted) {
        console.log(`  ⚠ Expected error for ${name} but extraction succeeded`);
      } else {
        console.log(`  ✓ Expected failure - no extraction from ${name}`);
      }

    } catch (failure) {
      if (!expectedError) {
        console.log(`  ✗ Unexpected error for ${name}: ${failure.message}`);
        throw failure;
      }

      console.log(`  ✓ Expected error for ${name}: ${failure.message.substring(0, 80)}...`);
      expect(failure.message).toBeTruthy();
    } finally {
      // Best-effort removal of the scratch file
      try {
        await plugins.fs.unlink(targetFile);
      } catch (ignored) {
        // Ignore cleanup errors
      }
    }
  }

  const elapsedMs = Date.now() - startedAt;
  console.log(`Test completed in ${elapsedMs}ms`);
});

// Random byte corruption scenario.
//
// Copies the first file returned by CorpusLoader.getFiles('ZUGFERD_V1_CORRECT'),
// overwrites a percentage of its bytes at random positions with random values,
// writes the result to `<cwd>/.nogit/` and passes it to `EInvoice.fromFile`.
// Successful extraction and thrown errors are both accepted; a thrown error must
// have a message longer than 10 characters. If the corpus yields no file, the
// test logs a warning and returns.
//
// NOTE: corruption uses Math.random(), so the damaged bytes differ between runs.
tap.test('PDF-09: Corrupted PDF Recovery - Random Byte Corruption', async (tools) => {
  const startTime = Date.now();

  // Turns whatever was thrown into a printable message, so that a non-Error
  // throw cannot crash the handler with a TypeError and hide the real failure.
  const describeError = (err: unknown): string =>
    err instanceof Error ? err.message : String(err);

  try {
    const validPdfs = await CorpusLoader.getFiles('ZUGFERD_V1_CORRECT');

    if (validPdfs.length === 0) {
      console.log('⚠ No valid PDF files found for random corruption testing');
      return;
    }

    const basePdf = validPdfs[0];
    const originalBuffer = await plugins.fs.readFile(basePdf);

    console.log(`Testing random byte corruption with: ${plugins.path.basename(basePdf)}`);

    // Test different levels of random corruption
    const corruptionLevels = [
      { name: 'Light Corruption (0.1%)', percentage: 0.001 },
      { name: 'Medium Corruption (1%)', percentage: 0.01 },
      { name: 'Heavy Corruption (5%)', percentage: 0.05 },
      { name: 'Severe Corruption (10%)', percentage: 0.1 }
    ];

    // The scratch directory is the same for every level; create it once.
    const scratchDir = plugins.path.join(process.cwd(), '.nogit');
    await plugins.fs.mkdir(scratchDir, { recursive: true });

    for (const corruptionLevel of corruptionLevels) {
      console.log(`Testing ${corruptionLevel.name}...`);

      // Work on a copy so every level starts from the pristine bytes.
      const corruptedBuffer = Buffer.from(originalBuffer);
      const bytesToCorrupt = Math.floor(corruptedBuffer.length * corruptionLevel.percentage);

      // Positions are drawn independently and may repeat, and a random byte may
      // equal the original one, so the effective damage can be slightly below
      // the nominal percentage.
      for (let i = 0; i < bytesToCorrupt; i++) {
        const randomIndex = Math.floor(Math.random() * corruptedBuffer.length);
        const randomByte = Math.floor(Math.random() * 256);
        corruptedBuffer[randomIndex] = randomByte;
      }

      const corruptedPath = plugins.path.join(
        scratchDir,
        `random-${corruptionLevel.name.toLowerCase().replace(/\s+/g, '-')}.pdf`
      );
      await plugins.fs.writeFile(corruptedPath, corruptedBuffer);

      try {
        const invoice = new EInvoice();
        const extractionResult = await invoice.fromFile(corruptedPath);

        if (extractionResult) {
          console.log(`  ✓ Resilient recovery from ${corruptionLevel.name}`);

          // Verify extracted content quality
          const xmlContent = await invoice.toXmlString('ubl');
          if (xmlContent && xmlContent.length > 100) {
            console.log(`    Extracted ${xmlContent.length} chars of XML`);

            // Cheap textual heuristic, not a real XML parse. The warning branch
            // used to live in a catch that could never fire; it is now reached
            // whenever the heuristic fails.
            if (xmlContent.includes('<?xml') && xmlContent.includes('</')) {
              console.log(`    ✓ Extracted XML appears well-formed`);
            } else {
              console.log(`    ⚠ Extracted XML may be malformed`);
            }
          }
        } else {
          console.log(`  ⚠ No extraction possible from ${corruptionLevel.name}`);
        }

      } catch (extractionError) {
        const message = describeError(extractionError);
        console.log(`  ⚠ Extraction failed for ${corruptionLevel.name}: ${message.substring(0, 80)}...`);

        // Check if error message is helpful
        expect(message).toBeTruthy();
        expect(message.length).toBeGreaterThan(10);
      } finally {
        // Remove the scratch file even when an assertion above throws.
        // Best-effort on purpose: a cleanup error must not replace the real failure.
        try {
          await plugins.fs.unlink(corruptedPath);
        } catch (cleanupError) {
          // Ignore cleanup errors
        }
      }
    }

  } catch (error) {
    console.log(`Random corruption test failed: ${describeError(error)}`);
    throw error;
  }

  const duration = Date.now() - startTime;
  console.log(`Test completed in ${duration}ms`);
});

// Structural damage scenario.
//
// Reads the first file returned by CorpusLoader.getFiles('ZUGFERD_V1_CORRECT')
// as a 'binary' string, applies one regex-based mutation per case (xref table,
// trailer, startxref, object headers, EOF marker), writes the result to
// `<cwd>/.nogit/` and passes it to `EInvoice.fromFile`. Recovery and thrown
// errors are both accepted; a thrown error must carry a non-empty message.
// If the corpus yields no file, the test logs a warning and returns.
//
// Failures while preparing the damaged file (mkdir / writeFile) are environment
// problems, not extraction results, so they propagate and fail the test.
tap.test('PDF-09: Corrupted PDF Recovery - Structural Damage', async (tools) => {
  const startTime = Date.now();

  // Turns whatever was thrown into a printable message, so that a non-Error
  // throw cannot crash the handler with a TypeError and hide the real failure.
  const describeError = (err: unknown): string =>
    err instanceof Error ? err.message : String(err);

  try {
    const validPdfs = await CorpusLoader.getFiles('ZUGFERD_V1_CORRECT');

    if (validPdfs.length === 0) {
      console.log('⚠ No valid PDF files found for structural damage testing');
      return;
    }

    const basePdf = validPdfs[0];
    // Read and written with the same 'binary' encoding so the regexes below can
    // operate on a string representation of the file.
    const originalContent = await plugins.fs.readFile(basePdf, 'binary');

    console.log(`Testing structural damage with: ${plugins.path.basename(basePdf)}`);

    // Test different types of structural damage
    const structuralDamageTests = [
      {
        name: 'Missing xref table',
        damage: (content: string) => content.replace(/xref\s*\n[\s\S]*?trailer/g, 'damaged-xref')
      },
      {
        name: 'Corrupted trailer',
        damage: (content: string) => content.replace(/trailer\s*<<[\s\S]*?>>/g, 'damaged-trailer')
      },
      {
        name: 'Missing startxref',
        damage: (content: string) => content.replace(/startxref\s*\d+/g, 'damaged-startxref')
      },
      {
        name: 'Corrupted PDF objects',
        damage: (content: string) => content.replace(/\d+\s+\d+\s+obj/g, 'XX XX damaged')
      },
      {
        name: 'Missing EOF marker',
        damage: (content: string) => content.replace(/%%EOF\s*$/, 'CORRUPTED')
      }
    ];

    // The scratch directory is the same for every case; create it once.
    const scratchDir = plugins.path.join(process.cwd(), '.nogit');
    await plugins.fs.mkdir(scratchDir, { recursive: true });

    for (const damageTest of structuralDamageTests) {
      console.log(`Testing ${damageTest.name}...`);

      const damagedPath = plugins.path.join(
        scratchDir,
        `structural-${damageTest.name.toLowerCase().replace(/\s+/g, '-')}.pdf`
      );
      const damagedContent = damageTest.damage(originalContent);
      await plugins.fs.writeFile(damagedPath, damagedContent, 'binary');

      try {
        const invoice = new EInvoice();
        const extractionResult = await invoice.fromFile(damagedPath);

        if (extractionResult) {
          console.log(`  ✓ Recovered from ${damageTest.name}`);

          // Test extracted content
          const xmlContent = await invoice.toXmlString('ubl');
          if (xmlContent && xmlContent.length > 50) {
            console.log(`    Recovered XML content: ${xmlContent.length} chars`);
          }
        } else {
          console.log(`  ⚠ No recovery possible from ${damageTest.name}`);
        }

      } catch (extractionError) {
        const message = describeError(extractionError);
        console.log(`  ⚠ ${damageTest.name} extraction failed: ${message.substring(0, 80)}...`);
        expect(message).toBeTruthy();
      } finally {
        // Cleanup used to sit after the extraction inside the try, so it was
        // skipped whenever fromFile threw (the common case for damaged input)
        // and the file stayed on disk. Best-effort on purpose: a cleanup error
        // must not replace the real failure.
        try {
          await plugins.fs.unlink(damagedPath);
        } catch (cleanupError) {
          // Ignore cleanup errors
        }
      }
    }

  } catch (error) {
    console.log(`Structural damage test failed: ${describeError(error)}`);
    throw error;
  }

  const duration = Date.now() - startTime;
  console.log(`Test completed in ${duration}ms`);
});

// Attachment corruption scenario.
//
// Uses the first file returned by CorpusLoader.getFiles('ZUGFERD_V1_CORRECT').
// After a baseline extraction (logged only), it loops over a list of named
// scenarios and, for each one, extracts from the SAME unmodified PDF and runs
// textual sanity checks on the resulting UBL XML string.
// NOTE(review): the scenario entries are labels only; nothing in this test
// alters the PDF or its attachment, so every iteration sees identical input.
// Thrown errors are accepted as long as they carry a non-empty message.
tap.test('PDF-09: Corrupted PDF Recovery - Attachment Corruption', async (tools) => {
  const startedAt = Date.now();

  try {
    const corpusPdfs = await CorpusLoader.getFiles('ZUGFERD_V1_CORRECT');

    if (corpusPdfs.length === 0) {
      console.log('⚠ No valid PDF files found for attachment corruption testing');
      return;
    }

    const samplePdf = corpusPdfs[0];

    console.log(`Testing attachment corruption scenarios with: ${plugins.path.basename(samplePdf)}`);

    // Baseline: extract once from the untouched file and report the XML size.
    try {
      const baselineInvoice = new EInvoice();

      if (await baselineInvoice.fromFile(samplePdf)) {
        const baselineXml = await baselineInvoice.toXmlString('ubl');
        console.log(`Original XML length: ${baselineXml.length} chars`);
      }
    } catch (baselineError) {
      console.log(`Could not extract original XML: ${baselineError.message}`);
    }

    // Named scenarios (descriptive labels, see note at the top of this test)
    const scenarios = [
      { name: 'Partial XML Loss', description: 'Simulate partial loss of XML attachment data' },
      { name: 'Encoding Corruption', description: 'Simulate character encoding corruption' },
      {
        name: 'Compression Corruption',
        description: 'Simulate corruption in compressed attachment streams'
      },
      {
        name: 'Multiple Attachments',
        description: 'Test handling when PDF contains multiple/conflicting XML attachments'
      }
    ];

    // Substrings that mark an error message as pointing at corruption
    const corruptionHints = ['corrupt', 'malformed', 'damaged'];

    for (const { name, description } of scenarios) {
      console.log(`Testing ${name}: ${description}`);

      try {
        const invoice = new EInvoice();
        const extracted = await invoice.fromFile(samplePdf);

        if (!extracted) {
          console.log(`  ⚠ ${name}: No XML extracted`);
          continue;
        }

        const xml = await invoice.toXmlString('ubl');
        if (!xml) {
          continue;
        }

        // Counts occurrences of a literal substring
        const countOf = (needle: string) => xml.split(needle).length - 1;

        const checks = {
          hasXmlDeclaration: xml.startsWith('<?xml'),
          hasRootElement: xml.includes('<') && xml.includes('>'),
          hasClosingTags: xml.includes('</'),
          isBalanced: countOf('<') === countOf('>')
        };

        console.log(`  XML Integrity Checks:`);
        console.log(`    Has XML Declaration: ${checks.hasXmlDeclaration}`);
        console.log(`    Has Root Element: ${checks.hasRootElement}`);
        console.log(`    Has Closing Tags: ${checks.hasClosingTags}`);
        console.log(`    Tags Balanced: ${checks.isBalanced}`);

        const allPassed = Object.values(checks).every((passed) => passed === true);
        console.log(
          allPassed
            ? `  ✓ ${name}: XML integrity maintained`
            : `  ⚠ ${name}: XML integrity issues detected`
        );

      } catch (failure) {
        console.log(`  ⚠ ${name} extraction failed: ${failure.message.substring(0, 80)}...`);

        // The error must at least say something
        expect(failure.message).toBeTruthy();

        // Informational only: does the wording point at corruption?
        const lowered = failure.message.toLowerCase();
        if (corruptionHints.some((hint) => lowered.includes(hint))) {
          console.log(`    ✓ Error message indicates corruption: helpful for debugging`);
        }
      }
    }

  } catch (error) {
    console.log(`Attachment corruption test failed: ${error.message}`);
    throw error;
  }

  const elapsedMs = Date.now() - startedAt;
  console.log(`Test completed in ${elapsedMs}ms`);
});

// Error reporting quality scenario.
//
// Writes three kinds of non-PDF / incomplete-PDF payloads to `<cwd>/.nogit/`,
// passes each to `EInvoice.fromFile`, and inspects the thrown error's message.
// The only hard requirement is that the message is longer than 20 characters;
// the remaining heuristics (mentions file/pdf, mentions an expected keyword,
// sounds actionable) and the optional `code` / `path` properties are logged.
// A call that does not throw is logged as a warning, not a failure.
tap.test('PDF-09: Corrupted PDF Recovery - Error Reporting Quality', async (tools) => {
  const startedAt = Date.now();

  // Payloads plus the keywords an informative error might mention
  const scenarios = [
    {
      name: 'Completely Invalid File',
      content: 'This is definitely not a PDF file at all',
      expectedErrorTypes: ['format', 'invalid', 'not-pdf']
    },
    {
      name: 'Binary Garbage',
      content: Buffer.from([0x00, 0xFF, 0xAB, 0xCD, 0xEF, 0x12, 0x34, 0x56]),
      expectedErrorTypes: ['binary', 'corrupt', 'invalid']
    },
    {
      name: 'Partial PDF Header',
      content: '%PDF-1.4\n%âãÏÓ\n1 0 obj\n<< >>\nendobj\n',
      expectedErrorTypes: ['incomplete', 'truncated', 'structure']
    }
  ];

  // Keyword lists for the message heuristics
  const fileWords = ['pdf', 'file'];
  const actionWords = ['check', 'verify', 'ensure', 'corrupt'];

  for (const { name, content, expectedErrorTypes } of scenarios) {
    console.log(`Testing error reporting for: ${name}`);

    const slug = name.toLowerCase().replace(/\s+/g, '-');
    const targetFile = plugins.path.join(process.cwd(), '.nogit', `error-${slug}.pdf`);
    await plugins.fs.mkdir(plugins.path.dirname(targetFile), { recursive: true });

    try {
      // Buffers are written as-is; strings are written with 'binary' encoding
      await (Buffer.isBuffer(content)
        ? plugins.fs.writeFile(targetFile, content)
        : plugins.fs.writeFile(targetFile, content, 'binary'));

      const invoice = new EInvoice();

      try {
        await invoice.fromFile(targetFile);
        console.log(`  ⚠ Expected error for ${name} but operation succeeded`);
      } catch (failure) {
        console.log(`  ✓ Error caught for ${name}`);
        console.log(`    Error message: ${failure.message}`);

        // Score the message with a few substring heuristics
        const lowered = failure.message.toLowerCase();
        const mentions = (word: string) => lowered.includes(word);

        const isDescriptive = failure.message.length > 20;
        const containsFileInfo = fileWords.some(mentions);
        const containsErrorType = expectedErrorTypes.some(type => lowered.includes(type));
        const isActionable = actionWords.some(mentions);

        console.log(`    Message Quality Analysis:`);
        console.log(`      Descriptive (>20 chars): ${isDescriptive}`);
        console.log(`      Contains file info: ${containsFileInfo}`);
        console.log(`      Contains error type: ${containsErrorType}`);
        console.log(`      Is actionable: ${isActionable}`);

        // Only descriptiveness is enforced
        expect(isDescriptive).toBeTrue();

        console.log(
          containsFileInfo && isActionable
            ? `    ✓ High quality error message`
            : `    ⚠ Error message could be more helpful`
        );

        // Optional structured details, printed when present
        if (failure.code) {
          console.log(`    Error code: ${failure.code}`);
        }

        if (failure.path) {
          console.log(`    Error path: ${failure.path}`);
        }
      }

    } finally {
      // Best-effort removal of the scratch file
      try {
        await plugins.fs.unlink(targetFile);
      } catch (ignored) {
        // Ignore cleanup errors
      }
    }
  }

  const elapsedMs = Date.now() - startedAt;
  console.log(`Test completed in ${elapsedMs}ms`);
});

// Closing pseudo-test: prints a short summary banner and asserts nothing.
tap.test('PDF-09: Test Summary', async (tools) => {
  const summaryLines = [
    `\n=== Corrupted PDF Recovery Test Summary ===`,
    `\nCorrupted PDF recovery testing completed.`,
    `Note: Most corruption tests expect failures - this is normal and indicates proper error handling.`
  ];

  // One console.log call per line, in order
  for (const line of summaryLines) {
    console.log(line);
  }
});

// Start the tap run. Presumably this executes the tap.test cases registered
// above — confirm against the @git.zone/tstest tapbundle documentation.
tap.start();