// einvoice/test/suite/einvoice_format-detection/test.fd-10.mixed-formats.ts

import { expect, tap } from '@git.zone/tstest/tapbundle';
import { promises as fs } from 'fs';
import * as path from 'path';
import { CorpusLoader } from '../../helpers/corpus.loader.js';
import { PerformanceTracker } from '../../helpers/performance.tracker.js';

tap.test('FD-10: Mixed Format Detection - should correctly identify formats across different categories', async () => {
  // Get samples from multiple format categories
  const formatCategories = [
    { name: 'CII XML-Rechnung', category: 'CII_XMLRECHNUNG' as const, expectedFormats: ['cii', 'xrechnung', 'facturx'] },
    { name: 'UBL XML-Rechnung', category: 'UBL_XMLRECHNUNG' as const, expectedFormats: ['ubl', 'xrechnung'] },
    { name: 'EN16931 CII', category: 'EN16931_CII' as const, expectedFormats: ['cii', 'facturx', 'zugferd'] }, // ZUGFeRD v1 files are valid here
    { name: 'EN16931 UBL', category: 'EN16931_UBL_EXAMPLES' as const, expectedFormats: ['ubl', 'xrechnung', 'fatturapa'] } // Some examples might be FatturaPA
  ];

  console.log('Testing mixed format detection across multiple categories');

  const { FormatDetector } = await import('../../../ts/formats/utils/format.detector.js');

  const results: { category: string; correct: number; total: number; formats: Record<string, number> }[] = [];

  for (const category of formatCategories) {
    try {
      const files = await CorpusLoader.getFiles(category.category);
      const xmlFiles = files.filter(f => f.endsWith('.xml')).slice(0, 3); // Test 3 per category

      if (xmlFiles.length === 0) {
        console.log(`No XML files found in ${category.name}, skipping`);
        continue;
      }

      const categoryResult = {
        category: category.name,
        correct: 0,
        total: xmlFiles.length,
        formats: {} as Record<string, number>
      };

      console.log(`\nTesting ${category.name} (${xmlFiles.length} files)`);

      for (const filePath of xmlFiles) {
        const fileName = path.basename(filePath);

        try {
          const xmlContent = await fs.readFile(filePath, 'utf-8');

          const { result: format } = await PerformanceTracker.track(
            'mixed-format-detection',
            async () => FormatDetector.detectFormat(xmlContent),
            { category: category.name, file: fileName }
          );

          const formatStr = format.toString().toLowerCase();
          categoryResult.formats[formatStr] = (categoryResult.formats[formatStr] || 0) + 1;

          // Check if detected format matches expected formats for this category
          const isCorrect = category.expectedFormats.some(expected =>
            formatStr.includes(expected.toLowerCase())
          );

          if (isCorrect) {
            categoryResult.correct++;
            console.log(`  ✓ ${fileName}: ${format} (expected for ${category.name})`);
          } else {
            console.log(`  ○ ${fileName}: ${format} (unexpected for ${category.name})`);
          }

        } catch (error) {
          console.log(`  ✗ ${fileName}: Error - ${error.message}`);
        }
      }

      const accuracy = (categoryResult.correct / categoryResult.total * 100).toFixed(1);
      console.log(`  Accuracy: ${categoryResult.correct}/${categoryResult.total} (${accuracy}%)`);
      console.log(`  Detected formats:`, categoryResult.formats);

      results.push(categoryResult);

    } catch (error) {
      console.log(`Error testing ${category.name}: ${error.message}`);
    }
  }

  // Overall summary
  console.log('\nMixed Format Detection Summary:');
  let totalCorrect = 0;
  let totalFiles = 0;

  results.forEach(result => {
    totalCorrect += result.correct;
    totalFiles += result.total;
    console.log(`  ${result.category}: ${result.correct}/${result.total} (${(result.correct/result.total*100).toFixed(1)}%)`);
  });

  if (totalFiles > 0) {
    const overallAccuracy = (totalCorrect / totalFiles * 100).toFixed(1);
    console.log(`  Overall: ${totalCorrect}/${totalFiles} (${overallAccuracy}%)`);

    // Expect reasonable accuracy across mixed formats
    expect(totalCorrect / totalFiles).toBeGreaterThan(0.7);
  }

  // Performance summary
  const perfSummary = await PerformanceTracker.getSummary('mixed-format-detection');
  if (perfSummary) {
    console.log(`\nMixed Format Detection Performance:`);
    console.log(`  Average: ${perfSummary.average.toFixed(2)}ms`);
    console.log(`  P95: ${perfSummary.p95.toFixed(2)}ms`);
  }
});

tap.test('FD-10: Format Ambiguity Resolution - should handle ambiguous cases correctly', async () => {
  const { FormatDetector } = await import('../../../ts/formats/utils/format.detector.js');

  const ambiguousTests = [
    {
      name: 'UBL with XRechnung CustomizationID',
      xml: `<?xml version="1.0"?>
<Invoice xmlns="urn:oasis:names:specification:ubl:schema:xsd:Invoice-2"
         xmlns:cbc="urn:oasis:names:specification:ubl:schema:xsd:CommonBasicComponents-2">
  <cbc:CustomizationID>urn:cen.eu:en16931:2017#compliant#urn:xoev-de:kosit:standard:xrechnung_3.0</cbc:CustomizationID>
  <cbc:ID>AMBIG-001</cbc:ID>
</Invoice>`,
      expectedPriority: ['xrechnung', 'ubl'], // XRechnung should take priority over generic UBL
      description: 'Should prioritize XRechnung over UBL when CustomizationID is present'
    },
    {
      name: 'CII with Factur-X profile',
      xml: `<?xml version="1.0"?>
<rsm:CrossIndustryInvoice xmlns:rsm="urn:un:unece:uncefact:data:standard:CrossIndustryInvoice:100"
                          xmlns:ram="urn:un:unece:uncefact:data:standard:ReusableAggregateBusinessInformationEntity:100">
  <rsm:ExchangedDocumentContext>
    <ram:GuidelineSpecifiedDocumentContextParameter>
      <ram:ID>urn:cen.eu:en16931:2017#compliant#urn:factur-x.eu:1p0:basic</ram:ID>
    </ram:GuidelineSpecifiedDocumentContextParameter>
  </rsm:ExchangedDocumentContext>
</rsm:CrossIndustryInvoice>`,
      expectedPriority: ['facturx', 'cii'], // Factur-X should take priority over generic CII
      description: 'Should prioritize Factur-X over CII when profile is present'
    },
    {
      name: 'Generic UBL without customization',
      xml: `<?xml version="1.0"?>
<Invoice xmlns="urn:oasis:names:specification:ubl:schema:xsd:Invoice-2">
  <ID>GENERIC-001</ID>
</Invoice>`,
      expectedPriority: ['ubl'],
      description: 'Should detect as generic UBL without specific customization'
    }
  ];

  for (const test of ambiguousTests) {
    const { result: format } = await PerformanceTracker.track(
      'ambiguity-resolution-test',
      async () => FormatDetector.detectFormat(test.xml)
    );

    console.log(`\n${test.name}:`);
    console.log(`  Description: ${test.description}`);
    console.log(`  Detected: ${format}`);

    const formatStr = format.toString().toLowerCase();
    const matchesPriority = test.expectedPriority.some(expected =>
      formatStr.includes(expected)
    );

    if (matchesPriority) {
      const primaryMatch = test.expectedPriority.find(expected =>
        formatStr.includes(expected)
      );
      console.log(`  ✓ Correctly prioritized ${primaryMatch}`);
    } else {
      console.log(`  ○ Expected one of: ${test.expectedPriority.join(', ')}`);
    }
  }
});

tap.test('FD-10: Format Detection Consistency - should produce consistent results', async () => {
  const { FormatDetector } = await import('../../../ts/formats/utils/format.detector.js');

  // Test the same XML multiple times to ensure consistency
  const testXml = `<?xml version="1.0"?>
<Invoice xmlns="urn:oasis:names:specification:ubl:schema:xsd:Invoice-2"
         xmlns:cbc="urn:oasis:names:specification:ubl:schema:xsd:CommonBasicComponents-2">
  <cbc:ID>CONSISTENCY-TEST</cbc:ID>
  <cbc:IssueDate>2024-01-01</cbc:IssueDate>
</Invoice>`;

  console.log('Testing format detection consistency (10 iterations)');

  const detectedFormats: string[] = [];
  const times: number[] = [];

  for (let i = 0; i < 10; i++) {
    const { result: format, metric } = await PerformanceTracker.track(
      'consistency-test',
      async () => FormatDetector.detectFormat(testXml)
    );

    detectedFormats.push(format.toString());
    times.push(metric.duration);
  }

  // Check consistency
  const uniqueFormats = [...new Set(detectedFormats)];
  console.log(`Detected formats: ${uniqueFormats.join(', ')}`);
  console.log(`Consistency: ${uniqueFormats.length === 1 ? 'CONSISTENT' : 'INCONSISTENT'}`);

  expect(uniqueFormats.length).toEqual(1); // Should always detect the same format

  // Check performance consistency
  const avgTime = times.reduce((a, b) => a + b, 0) / times.length;
  const maxTime = Math.max(...times);
  const minTime = Math.min(...times);
  const variance = maxTime - minTime;

  console.log(`Performance: avg ${avgTime.toFixed(2)}ms, range ${minTime.toFixed(2)}-${maxTime.toFixed(2)}ms`);
  console.log(`Variance: ${variance.toFixed(2)}ms`);

  // Performance should be relatively stable
  // Allow for some variation in timing due to system load
  expect(variance).toBeLessThan(Math.max(avgTime * 3, 0.5)); // Variance shouldn't exceed 3x average or 0.5ms
});

tap.test('FD-10: Complex Document Structure - should handle complex nested structures', async () => {
  const { FormatDetector } = await import('../../../ts/formats/utils/format.detector.js');

  const complexXml = `<?xml version="1.0" encoding="UTF-8"?>
<Invoice xmlns="urn:oasis:names:specification:ubl:schema:xsd:Invoice-2"
         xmlns:cac="urn:oasis:names:specification:ubl:schema:xsd:CommonAggregateComponents-2"
         xmlns:cbc="urn:oasis:names:specification:ubl:schema:xsd:CommonBasicComponents-2">
  <cbc:CustomizationID>urn:cen.eu:en16931:2017#compliant#urn:xoev-de:kosit:standard:xrechnung_3.0</cbc:CustomizationID>
  <cbc:ProfileID>urn:fdc:peppol.eu:2017:poacc:billing:01:1.0</cbc:ProfileID>
  <cbc:ID>COMPLEX-001</cbc:ID>
  <cbc:IssueDate>2024-01-01</cbc:IssueDate>
  <cbc:DocumentCurrencyCode>EUR</cbc:DocumentCurrencyCode>

  <cac:AccountingSupplierParty>
    <cac:Party>
      <cac:PartyName>
        <cbc:Name>Complex Seller GmbH</cbc:Name>
      </cac:PartyName>
      <cac:PostalAddress>
        <cbc:StreetName>Musterstraße</cbc:StreetName>
        <cbc:CityName>Berlin</cbc:CityName>
        <cbc:PostalZone>10115</cbc:PostalZone>
        <cac:Country>
          <cbc:IdentificationCode>DE</cbc:IdentificationCode>
        </cac:Country>
      </cac:PostalAddress>
      <cac:PartyTaxScheme>
        <cbc:CompanyID>DE123456789</cbc:CompanyID>
        <cac:TaxScheme>
          <cbc:ID>VAT</cbc:ID>
        </cac:TaxScheme>
      </cac:PartyTaxScheme>
    </cac:Party>
  </cac:AccountingSupplierParty>

  <cac:InvoiceLine>
    <cbc:ID>1</cbc:ID>
    <cbc:InvoicedQuantity unitCode="EA">10</cbc:InvoicedQuantity>
    <cbc:LineExtensionAmount currencyID="EUR">1000.00</cbc:LineExtensionAmount>
    <cac:Item>
      <cbc:Name>Complex Product</cbc:Name>
      <cac:ClassifiedTaxCategory>
        <cbc:ID>S</cbc:ID>
        <cbc:Percent>19</cbc:Percent>
        <cac:TaxScheme>
          <cbc:ID>VAT</cbc:ID>
        </cac:TaxScheme>
      </cac:ClassifiedTaxCategory>
    </cac:Item>
  </cac:InvoiceLine>
</Invoice>`;

  console.log('Testing complex document structure detection');

  const { result: format, metric } = await PerformanceTracker.track(
    'complex-structure-detection',
    async () => FormatDetector.detectFormat(complexXml),
    { complexity: 'high', elements: complexXml.split('<').length }
  );

  console.log(`Complex document detected as: ${format}`);
  console.log(`Detection time: ${metric.duration.toFixed(2)}ms`);
  console.log(`Document size: ${complexXml.length} bytes`);

  // Should still detect correctly despite complexity
  const formatStr = format.toString().toLowerCase();
  const isValidFormat = formatStr.includes('xrechnung') || formatStr.includes('ubl');
  expect(isValidFormat).toEqual(true);

  // Should still be fast despite complexity
  expect(metric.duration).toBeLessThan(20); // Should be under 20ms even for complex docs
});

// Start the tap runner; presumably this executes the tests registered above
// via tap.test() — confirm against the tapbundle documentation.
tap.start();