einvoice/test/test.pdf-operations.ts

import { tap, expect } from '@git.zone/tstest/tapbundle';
import { EInvoice, EInvoicePDFError } from '../ts/index.js';
import { InvoiceFormat } from '../ts/interfaces/common.js';
import { TestFileHelpers, TestFileCategories, PerformanceUtils, TestInvoiceFactory } from './helpers/utils.js';
import * as path from 'path';
import { promises as fs } from 'fs';

/**
 * Comprehensive PDF operations test suite
 */

// Test PDF extraction from ZUGFeRD v1 files.
// Runs EInvoice.fromPdf over (at most) the first five PDFs of the
// ZUGFERD_V1_CORRECT corpus category, checks the extracted XML, the detected
// format and the basic invoice fields, and requires at least one file to pass
// every check. Per-file failures are logged and counted, not rethrown.
tap.test('PDF Operations - Extract XML from ZUGFeRD v1 PDFs', async () => {
  // Use CorpusLoader for recursive loading
  const { CorpusLoader } = await import('./helpers/corpus.loader.js');
  const corpusFiles = await CorpusLoader.loadCategory('ZUGFERD_V1_CORRECT');
  const pdfFiles = corpusFiles.filter(file => file.path.endsWith('.pdf'));

  console.log(`Testing XML extraction from ${pdfFiles.length} ZUGFeRD v1 PDFs`);

  // Skip test if no PDF files are available
  if (pdfFiles.length === 0) {
    console.log('No ZUGFeRD v1 PDF files found in corpus - skipping test');
    return;
  }

  let successCount = 0;
  let failCount = 0;
  const extractionTimes: number[] = [];

  for (const corpusFile of pdfFiles.slice(0, 5)) { // Test first 5 for speed
    const fileName = path.basename(corpusFile.path);

    try {
      const pdfBuffer = await CorpusLoader.loadFile(corpusFile.path);

      const { result: einvoice, duration } = await PerformanceUtils.measure(
        'pdf-extraction-v1',
        async () => EInvoice.fromPdf(pdfBuffer)
      );

      extractionTimes.push(duration);

      // Verify extraction succeeded
      expect(einvoice).toBeTruthy();
      expect(einvoice.getXml()).toBeTruthy();
      expect(einvoice.getXml().length).toBeGreaterThan(100);

      // Check format detection
      const format = einvoice.getFormat();
      expect([InvoiceFormat.ZUGFERD, InvoiceFormat.FACTURX]).toContain(format);

      // Verify basic invoice data
      expect(einvoice.id).toBeTruthy();
      expect(einvoice.from.name).toBeTruthy();
      expect(einvoice.to.name).toBeTruthy();

      // Count the file as a success only after ALL checks have passed, so a
      // file can never be tallied as both succeeded and failed.
      successCount++;
      console.log(`✓ ${fileName}: Extracted ${einvoice.getXml().length} bytes, format: ${format} (${duration.toFixed(2)}ms)`);

    } catch (error) {
      failCount++;
      if (error instanceof EInvoicePDFError) {
        console.log(`✗ ${fileName}: ${error.message}`);
        console.log(`  Recovery suggestions: ${error.getRecoverySuggestions().join(', ')}`);
      } else {
        // The caught value is not guaranteed to be an Error; narrow before reading `.message`.
        const message = error instanceof Error ? error.message : String(error);
        console.log(`✗ ${fileName}: ${message}`);
      }
    }
  }

  console.log(`\nExtraction Summary: ${successCount} succeeded, ${failCount} failed`);
  if (extractionTimes.length > 0) {
    const totalTime = extractionTimes.reduce((sum, time) => sum + time, 0);
    const avgTime = totalTime / extractionTimes.length;
    console.log(`Average extraction time: ${avgTime.toFixed(2)}ms`);
  }

  // The empty-corpus case returned early above, so at least one file was tried.
  expect(successCount).toBeGreaterThan(0);
});

// Test PDF extraction from ZUGFeRD v2/Factur-X files.
// For (at most) the first ten PDFs of the ZUGFERD_V2_CORRECT corpus category:
// extract the invoice, tally the profile named in the file name, and check
// that the invoice can be re-exported as Factur-X XML.
// NOTE(review): the per-file catch also swallows failed expect() calls, so this
// test only reports problems in the log and never fails because of them.
tap.test('PDF Operations - Extract XML from ZUGFeRD v2/Factur-X PDFs', async () => {
  // Use CorpusLoader for recursive loading
  const { CorpusLoader } = await import('./helpers/corpus.loader.js');
  const corpusFiles = await CorpusLoader.loadCategory('ZUGFERD_V2_CORRECT');
  const pdfFiles = corpusFiles.filter(file => file.path.endsWith('.pdf'));

  console.log(`Testing XML extraction from ${pdfFiles.length} ZUGFeRD v2/Factur-X PDFs`);

  // Skip test if no PDF files are available
  if (pdfFiles.length === 0) {
    console.log('No ZUGFeRD v2/Factur-X PDF files found in corpus - skipping test');
    return;
  }

  // Number of successfully extracted files per profile keyword.
  const profileStats: Record<string, number> = {};

  for (const corpusFile of pdfFiles.slice(0, 10)) { // Test first 10
    const fileName = path.basename(corpusFile.path);

    try {
      const pdfBuffer = await CorpusLoader.loadFile(corpusFile.path);
      const einvoice = await EInvoice.fromPdf(pdfBuffer);

      // Extract profile from filename if present
      const profileMatch = fileName.match(/(BASIC|COMFORT|EXTENDED|MINIMUM|EN16931)/i);
      const profile = profileMatch ? profileMatch[1].toUpperCase() : 'UNKNOWN';
      profileStats[profile] = (profileStats[profile] ?? 0) + 1;

      console.log(`✓ ${fileName}: Profile ${profile}, Format ${einvoice.getFormat()}`);

      // Test that we can re-export the invoice
      const xml = await einvoice.exportXml('facturx');
      expect(xml).toBeTruthy();
      expect(xml).toInclude('CrossIndustryInvoice');

    } catch (error) {
      // The caught value is not guaranteed to be an Error; narrow before reading `.message`.
      const message = error instanceof Error ? error.message : String(error);
      console.log(`✗ ${fileName}: ${message}`);
    }
  }

  console.log('\nProfile distribution:', profileStats);
});

// Test PDF embedding (creating PDFs with XML).
// Round trip: export a factory-built invoice as Factur-X XML, embed it into a
// minimal PDF, then extract it again from the resulting PDF and check the format.
tap.test('PDF Operations - Embed XML into PDF', async () => {
  // Populate a fresh invoice from the complex fixture
  const invoice = new EInvoice();
  Object.assign(invoice, TestInvoiceFactory.createComplexInvoice());

  // The invoice must be exportable before embedding is attempted
  const facturxXml = await invoice.exportXml('facturx');
  expect(facturxXml).toBeTruthy();
  console.log(`Generated XML: ${facturxXml.length} bytes`);

  // Attach a minimal carrier PDF to the invoice
  const sourcePdf = await createMinimalTestPDF();
  invoice.pdf = {
    name: 'test-invoice.pdf',
    id: 'test-pdf-001',
    metadata: { textExtraction: '' },
    buffer: sourcePdf
  };

  // Measured operation: embed, and wrap the output so it is read as `.buffer` below
  const embedIntoSource = async () => {
    const embedded = await invoice.embedInPdf(Buffer.from(sourcePdf), 'facturx');
    return { buffer: embedded };
  };

  try {
    const measurement = await PerformanceUtils.measure('pdf-embedding', embedIntoSource);
    const embeddedPdf = measurement.result;
    const elapsedMs = measurement.duration;

    // The output must exist and be larger than the input PDF
    expect(embeddedPdf).toBeTruthy();
    expect(embeddedPdf.buffer).toBeTruthy();
    expect(embeddedPdf.buffer.length).toBeGreaterThan(sourcePdf.length);

    console.log(`✓ Successfully embedded XML into PDF (${elapsedMs.toFixed(2)}ms)`);
    console.log(`  Original PDF: ${sourcePdf.length} bytes`);
    console.log(`  Result PDF: ${embeddedPdf.buffer.length} bytes`);
    console.log(`  Size increase: ${embeddedPdf.buffer.length - sourcePdf.length} bytes`);

    // Extract from the produced PDF to prove the embedded XML is readable
    const roundTripped = await EInvoice.fromPdf(embeddedPdf.buffer);
    expect(roundTripped.getXml()).toBeTruthy();
    expect(roundTripped.getFormat()).toEqual(InvoiceFormat.FACTURX);
    console.log('✓ Verified: Embedded XML can be extracted successfully');

  } catch (error) {
    // Anything that is not a PDF error is rethrown without extra diagnostics
    if (!(error instanceof EInvoicePDFError)) {
      throw error;
    }
    console.log(`✗ Embedding failed: ${error.message}`);
    console.log(`  Operation: ${error.operation}`);
    console.log(`  Suggestions: ${error.getRecoverySuggestions().join(', ')}`);
    throw error;
  }
});

// Test PDF extraction error handling.
// Feeds three kinds of unusable input to EInvoice.fromPdf and requires each
// call to end in an EInvoicePDFError.
tap.test('PDF Operations - Error handling for invalid PDFs', async () => {
  // Runs fromPdf on the produced input and hands back whatever was thrown,
  // after asserting that it is an EInvoicePDFError. If fromPdf resolves, the
  // sentinel Error thrown here is itself caught and fails that assertion.
  const captureFailure = async (makeInput: () => Buffer, sentinelMessage: string): Promise<unknown> => {
    try {
      await EInvoice.fromPdf(makeInput());
      throw new Error(sentinelMessage);
    } catch (caught) {
      expect(caught).toBeInstanceOf(EInvoicePDFError);
      return caught;
    }
  };

  // Empty buffer: additionally check which operation the error reports
  const emptyFailure = await captureFailure(
    () => Buffer.from(new Uint8Array(0)),
    'Should have thrown an error for empty PDF'
  );
  if (emptyFailure instanceof EInvoicePDFError) {
    expect(emptyFailure.operation).toEqual('extract');
    console.log('✓ Empty PDF error handled correctly');
  }

  // Plain text that is not a PDF at all
  await captureFailure(
    () => Buffer.from('This is not a PDF file'),
    'Should have thrown an error for non-PDF data'
  );
  console.log('✓ Non-PDF data error handled correctly');

  // A PDF header followed by content that is not a PDF body
  await captureFailure(
    () => Buffer.from('%PDF-1.4\nCorrupted content'),
    'Should have thrown an error for corrupted PDF'
  );
  console.log('✓ Corrupted PDF error handled correctly');
});

// Test failed PDF extractions from corpus.
// Every PDF in the ZUGFERD_V1_FAIL category is run through EInvoice.fromPdf.
// A rejection with EInvoicePDFError must report the 'extract' operation;
// an unexpected success or a different error type is only logged.
tap.test('PDF Operations - Handle PDFs without XML gracefully', async () => {
  // Use CorpusLoader for recursive loading
  const { CorpusLoader } = await import('./helpers/corpus.loader.js');
  const corpusFiles = await CorpusLoader.loadCategory('ZUGFERD_V1_FAIL');
  const failPdfs = corpusFiles.filter(file => file.path.endsWith('.pdf'));

  console.log(`Testing ${failPdfs.length} PDFs expected to fail`);

  // Skip test if no PDF files are available
  if (failPdfs.length === 0) {
    console.log('No failed ZUGFeRD v1 PDF files found in corpus - skipping test');
    return;
  }

  for (const corpusFile of failPdfs) {
    const fileName = path.basename(corpusFile.path);

    try {
      const pdfBuffer = await CorpusLoader.loadFile(corpusFile.path);
      await EInvoice.fromPdf(pdfBuffer);
      console.log(`○ ${fileName}: Unexpectedly succeeded (might have XML)`);
    } catch (error) {
      if (error instanceof EInvoicePDFError) {
        // This expect() sits inside catch, so a mismatch propagates and fails the test.
        expect(error.operation).toEqual('extract');
        console.log(`✓ ${fileName}: Correctly failed - ${error.message}`);
      } else {
        // The caught value is not guaranteed to be an Error; narrow before reading `.message`.
        const message = error instanceof Error ? error.message : String(error);
        console.log(`✗ ${fileName}: Wrong error type - ${message}`);
      }
    }
  }
});

// Test PDF metadata preservation.
// Takes the first ZUGFERD_V2_CORRECT PDF, extracts its invoice, re-embeds it
// into the same PDF as 'xrechnung', extracts again and compares sender name,
// recipient name and item count between the two extractions.
// NOTE(review): the try/catch wraps the comparisons as well, so a failed
// expect() is reported as "skipped" rather than failing the test. Kept as a
// deliberate best-effort; split the assertions out if that is not intended.
tap.test('PDF Operations - Metadata preservation during embedding', async () => {
  // Use CorpusLoader for recursive loading
  const { CorpusLoader } = await import('./helpers/corpus.loader.js');
  const corpusFiles = await CorpusLoader.loadCategory('ZUGFERD_V2_CORRECT');
  const pdfFiles = corpusFiles.filter(file => file.path.endsWith('.pdf'));

  // Nothing to test without at least one corpus PDF
  if (pdfFiles.length === 0) {
    console.log('No ZUGFeRD v2 PDF files found for metadata preservation test - skipping');
    return;
  }

  const originalPdfBuffer = await CorpusLoader.loadFile(pdfFiles[0].path);

  try {
    // Extract from original
    const originalInvoice = await EInvoice.fromPdf(originalPdfBuffer);

    // Re-embed with different format
    const reembeddedBuffer = await originalInvoice.embedInPdf(originalPdfBuffer, 'xrechnung');

    // Extract again
    const reextracted = await EInvoice.fromPdf(reembeddedBuffer);

    // Compare key fields
    expect(reextracted.from.name).toEqual(originalInvoice.from.name);
    expect(reextracted.to.name).toEqual(originalInvoice.to.name);
    expect(reextracted.items.length).toEqual(originalInvoice.items.length);

    console.log('✓ Metadata preserved through re-embedding cycle');

  } catch (error) {
    // The caught value is not guaranteed to be an Error; narrow before reading `.message`.
    const message = error instanceof Error ? error.message : String(error);
    console.log(`○ Metadata preservation test skipped: ${message}`);
  }
});

// Test PDF size constraints.
// Hands EInvoice.fromPdf a 10MB zero-filled buffer that carries only a PDF
// header and requires the call to settle (normally by rejecting) in under
// five seconds.
tap.test('PDF Operations - Performance with large PDFs', async () => {
  const largePdfSize = 10 * 1024 * 1024; // 10MB
  const largePdfBuffer = Buffer.alloc(largePdfSize);

  // Create a simple PDF header
  const pdfHeader = Buffer.from('%PDF-1.4\n');
  pdfHeader.copy(largePdfBuffer);

  console.log(`Testing with ${(largePdfSize / 1024 / 1024).toFixed(1)}MB PDF`);

  const startTime = performance.now();
  let rejected = false;
  try {
    await EInvoice.fromPdf(largePdfBuffer);
  } catch {
    // Expected to fail, we're testing performance
    rejected = true;
  }

  // Measure and assert on every path: previously the timing check lived in the
  // catch block, so a resolved call skipped the assertion entirely.
  const duration = performance.now() - startTime;
  if (!rejected) {
    console.log('○ Large PDF was unexpectedly accepted');
  }
  console.log(`✓ Large PDF processed in ${duration.toFixed(2)}ms`);
  expect(duration).toBeLessThan(5000); // Should fail fast, not hang
});

// Test concurrent PDF operations.
// Starts extraction of up to five ZUGFERD_V2_CORRECT PDFs at once, waits for
// all of them and logs how many succeeded and how long the batch took.
// Individual failures are converted into result objects and never rethrown.
tap.test('PDF Operations - Concurrent processing', async () => {
  // Use CorpusLoader for recursive loading
  const { CorpusLoader } = await import('./helpers/corpus.loader.js');
  const corpusFiles = await CorpusLoader.loadCategory('ZUGFERD_V2_CORRECT');
  const pdfFiles = corpusFiles.filter(file => file.path.endsWith('.pdf'));
  const testFiles = pdfFiles.slice(0, 5);

  // Nothing to test without at least one corpus PDF
  if (testFiles.length === 0) {
    console.log('No ZUGFeRD v2 PDF files found for concurrent processing test - skipping');
    return;
  }

  console.log(`Testing concurrent processing of ${testFiles.length} PDFs`);

  const startTime = performance.now();

  // Process all PDFs concurrently
  const promises = testFiles.map(async (corpusFile) => {
    try {
      const pdfBuffer = await CorpusLoader.loadFile(corpusFile.path);
      const einvoice = await EInvoice.fromPdf(pdfBuffer);
      return { success: true, format: einvoice.getFormat() };
    } catch (error) {
      // The caught value is not guaranteed to be an Error; narrow before reading `.message`.
      const message = error instanceof Error ? error.message : String(error);
      return { success: false, error: message };
    }
  });

  // Each task catches its own errors, so Promise.all cannot reject because of one file.
  const results = await Promise.all(promises);
  const duration = performance.now() - startTime;

  const successCount = results.filter(r => r.success).length;
  console.log(`✓ Processed ${successCount}/${testFiles.length} PDFs concurrently in ${duration.toFixed(2)}ms`);
  console.log(`  Average time per PDF: ${(duration / testFiles.length).toFixed(2)}ms`);
});

// Performance summary.
// Prints the stats recorded under the 'pdf-extraction-v1' and 'pdf-embedding'
// measurement keys and, when more than three extraction samples exist,
// requires the average extraction time to stay below one second.
tap.test('PDF Operations - Performance Summary', async () => {
  const extractionStats = PerformanceUtils.getStats('pdf-extraction-v1');
  const embeddingStats = PerformanceUtils.getStats('pdf-embedding');

  console.log('\nPDF Operations Performance Summary:');

  if (extractionStats) {
    const averageMs = extractionStats.avg.toFixed(2);
    console.log('PDF Extraction (ZUGFeRD v1):');
    console.log(`  Average: ${averageMs}ms`);
    console.log(`  Min/Max: ${extractionStats.min.toFixed(2)}ms / ${extractionStats.max.toFixed(2)}ms`);
  }

  if (embeddingStats) {
    console.log('PDF Embedding:');
    console.log(`  Average: ${embeddingStats.avg.toFixed(2)}ms`);
  }

  // Only enforce the budget when there are enough samples (more than three)
  const hasEnoughSamples = Boolean(extractionStats) && extractionStats.count > 3;
  if (hasEnoughSamples) {
    expect(extractionStats.avg).toBeLessThan(1000); // Should extract in under 1 second on average
  }
});

/**
 * Helper function to create a minimal test PDF.
 *
 * The document consists of a catalog, a page tree and one empty 612x792 page.
 * Cross-reference offsets and the `startxref` value are computed from the
 * generated content instead of being hard-coded, so they always point at the
 * objects / the `xref` keyword they describe.
 *
 * @returns The PDF bytes as a fresh Uint8Array.
 */
async function createMinimalTestPDF(): Promise<Uint8Array> {
  // Object bodies in object-number order (element i is object i + 1).
  const objectBodies = [
    '<< /Type /Catalog /Pages 2 0 R >>',
    '<< /Type /Pages /Kids [3 0 R] /Count 1 >>',
    '<< /Type /Page /Parent 2 0 R /MediaBox [0 0 612 792] /Resources << >> >>',
  ];

  let pdfContent = '%PDF-1.4\n';

  // Record the byte offset of every object right before appending it.
  const objectOffsets: number[] = [];
  objectBodies.forEach((body, index) => {
    objectOffsets.push(Buffer.byteLength(pdfContent));
    pdfContent += `${index + 1} 0 obj\n${body}\nendobj\n`;
  });

  // The cross-reference table starts exactly where the objects end.
  const xrefOffset = Buffer.byteLength(pdfContent);

  // Each entry is exactly 20 bytes: 10-digit offset, 5-digit generation,
  // type flag and a two-byte end-of-line (space + LF).
  const xrefEntries = [
    '0000000000 65535 f \n',
    ...objectOffsets.map(offset => `${String(offset).padStart(10, '0')} 00000 n \n`),
  ];

  pdfContent +=
    `xref\n0 ${xrefEntries.length}\n` +
    xrefEntries.join('') +
    `trailer\n<< /Size ${xrefEntries.length} /Root 1 0 R >>\n` +
    `startxref\n${xrefOffset}\n%%EOF`;

  return new Uint8Array(Buffer.from(pdfContent));
}

// Called once, after all tap.test(...) registrations in this file.
// NOTE(review): presumably this is what kicks off execution of the registered
// tests — confirm against the tapbundle documentation.
tap.start();