// einvoice/test/suite/einvoice_parsing/test.parse-03.encoding-detection.ts

import { expect, tap } from '@git.zone/tstest/tapbundle';
import * as einvoice from '../../../ts/index.js';
import * as plugins from '../../plugins.js';
import { CorpusLoader } from '../../helpers/corpus.loader.js';
import { PerformanceTracker } from '../../helpers/performance.tracker.js';

tap.test('PARSE-03: Character Encoding Detection - Detect and handle various character encodings', async (t) => {
  const performanceTracker = new PerformanceTracker('PARSE-03');
  
  // Sub-test: pull the `encoding="..."` label out of each fixture's XML text and
  // verify it equals the label the fixture expects (UTF-8 when nothing is declared).
  await t.test('Encoding declaration detection', async () => {
    performanceTracker.startOperation('declaration-detection');
    
    // `expectedEncoding` is the label the declaration sniffing must report.
    // `actualEncoding` records how the fixture says the bytes are really encoded;
    // it is only used to print a warning when it disagrees with the declaration.
    const encodingTests = [
      {
        name: 'UTF-8 declaration',
        xml: '<?xml version="1.0" encoding="UTF-8"?>\n<invoice><id>TEST-001</id></invoice>',
        expectedEncoding: 'UTF-8',
        actualEncoding: 'UTF-8'
      },
      {
        name: 'UTF-16 declaration',
        xml: '<?xml version="1.0" encoding="UTF-16"?>\n<invoice><id>TEST-002</id></invoice>',
        expectedEncoding: 'UTF-16',
        actualEncoding: 'UTF-8' // Mismatch test
      },
      {
        name: 'ISO-8859-1 declaration',
        xml: '<?xml version="1.0" encoding="ISO-8859-1"?>\n<invoice><supplier>Müller</supplier></invoice>',
        expectedEncoding: 'ISO-8859-1',
        actualEncoding: 'ISO-8859-1'
      },
      {
        name: 'Windows-1252 declaration',
        xml: '<?xml version="1.0" encoding="Windows-1252"?>\n<invoice><note>Special – chars</note></invoice>',
        expectedEncoding: 'Windows-1252',
        actualEncoding: 'Windows-1252'
      },
      {
        name: 'Case variations',
        xml: '<?xml version="1.0" encoding="utf-8"?>\n<invoice><id>TEST-003</id></invoice>',
        expectedEncoding: 'UTF-8',
        actualEncoding: 'UTF-8'
      },
      {
        name: 'No encoding declaration',
        xml: '<?xml version="1.0"?>\n<invoice><id>TEST-004</id></invoice>',
        expectedEncoding: 'UTF-8', // Default
        actualEncoding: 'UTF-8'
      }
    ];
    
    // Encoding labels are compared without regard to case or hyphens,
    // so "utf-8", "UTF-8" and "UTF8" all count as the same label.
    const canonicalLabel = (label: string): string => label.replace(/-/g, '').toUpperCase();
    
    for (const test of encodingTests) {
      const startTime = performance.now();
      
      // Extract declared encoding; a missing declaration means the UTF-8 default
      const encodingMatch = test.xml.match(/encoding=["']([^"']+)["']/i);
      const declaredEncoding = encodingMatch ? encodingMatch[1].toUpperCase() : 'UTF-8';
      
      console.log(`${test.name}:`);
      console.log(`  Declared: ${declaredEncoding}`);
      console.log(`  Expected: ${test.expectedEncoding}`);
      
      const declaredLabel = canonicalLabel(declaredEncoding);
      const expectedLabel = canonicalLabel(test.expectedEncoding);
      
      if (declaredLabel === expectedLabel) {
        console.log('  ✓ Declaration matches expected encoding');
      } else {
        console.log('  ✗ Declaration mismatch');
      }
      
      // Surface fixtures whose declaration deliberately disagrees with the real bytes
      if (canonicalLabel(test.actualEncoding) !== declaredLabel) {
        console.log(`  ⚠️  Declared encoding differs from actual encoding (${test.actualEncoding})`);
      }
      
      performanceTracker.recordMetric('encoding-detection', performance.now() - startTime);
      
      // Without this the sub-test could never fail: the result was only logged.
      expect(declaredLabel).toEqual(expectedLabel);
    }
    
    performanceTracker.endOperation('declaration-detection');
  });
  
  // Sub-test: prefix each fixture's XML with a byte order mark and verify that
  // sniffing the leading bytes reports the encoding the fixture expects.
  await t.test('BOM (Byte Order Mark) detection', async () => {
    performanceTracker.startOperation('bom-detection');
    
    const bomTests = [
      {
        name: 'UTF-8 with BOM',
        bom: Buffer.from([0xEF, 0xBB, 0xBF]),
        encoding: 'UTF-8',
        xml: '<?xml version="1.0" encoding="UTF-8"?><invoice><id>TEST-005</id></invoice>'
      },
      {
        name: 'UTF-16 LE BOM',
        bom: Buffer.from([0xFF, 0xFE]),
        encoding: 'UTF-16LE',
        xml: '<?xml version="1.0" encoding="UTF-16"?><invoice><id>TEST-006</id></invoice>'
      },
      {
        name: 'UTF-16 BE BOM',
        bom: Buffer.from([0xFE, 0xFF]),
        encoding: 'UTF-16BE',
        xml: '<?xml version="1.0" encoding="UTF-16"?><invoice><id>TEST-007</id></invoice>'
      },
      {
        name: 'UTF-32 LE BOM',
        bom: Buffer.from([0xFF, 0xFE, 0x00, 0x00]),
        encoding: 'UTF-32LE',
        xml: '<?xml version="1.0" encoding="UTF-32"?><invoice><id>TEST-008</id></invoice>'
      },
      {
        name: 'UTF-32 BE BOM',
        bom: Buffer.from([0x00, 0x00, 0xFE, 0xFF]),
        encoding: 'UTF-32BE',
        xml: '<?xml version="1.0" encoding="UTF-32"?><invoice><id>TEST-009</id></invoice>'
      },
      {
        name: 'No BOM',
        bom: Buffer.from([]),
        encoding: 'UTF-8',
        xml: '<?xml version="1.0" encoding="UTF-8"?><invoice><id>TEST-010</id></invoice>'
      }
    ];
    
    /**
     * Map a buffer's leading byte order mark to an encoding label.
     * Returns 'UTF-8' when no BOM is recognised. Each signature is length-checked
     * on its own, so buffers shorter than four bytes are still sniffed.
     */
    const detectBomEncoding = (buffer: Buffer): string => {
      const startsWith = (...signature: number[]): boolean =>
        buffer.length >= signature.length &&
        signature.every((byte, index) => buffer[index] === byte);
      
      // UTF-32LE must be tested before UTF-16LE: its BOM begins with FF FE as well.
      if (startsWith(0xFF, 0xFE, 0x00, 0x00)) return 'UTF-32LE';
      if (startsWith(0x00, 0x00, 0xFE, 0xFF)) return 'UTF-32BE';
      if (startsWith(0xEF, 0xBB, 0xBF)) return 'UTF-8';
      if (startsWith(0xFF, 0xFE)) return 'UTF-16LE';
      if (startsWith(0xFE, 0xFF)) return 'UTF-16BE';
      return 'UTF-8'; // Default
    };
    
    for (const test of bomTests) {
      const startTime = performance.now();
      
      // Create buffer with BOM. The payload is always UTF-8 bytes; only the BOM
      // prefix varies, and the prefix is all that the sniffing below looks at.
      const xmlBuffer = Buffer.from(test.xml, 'utf8');
      const fullBuffer = Buffer.concat([test.bom, xmlBuffer]);
      
      const detectedEncoding = detectBomEncoding(fullBuffer);
      
      // Two hex digits per byte so that 0x00 is not printed as "0x0"
      const bomBytes = test.bom.length > 0
        ? Array.from(test.bom).map(b => '0x' + b.toString(16).toUpperCase().padStart(2, '0')).join(' ')
        : 'None';
      
      console.log(`${test.name}:`);
      console.log(`  BOM bytes: ${bomBytes}`);
      console.log(`  Expected: ${test.encoding}`);
      console.log(`  Detected: ${detectedEncoding}`);
      
      if (detectedEncoding === test.encoding) {
        console.log('  ✓ BOM detection correct');
      } else {
        console.log('  ✗ BOM detection failed');
      }
      
      performanceTracker.recordMetric('bom-detection', performance.now() - startTime);
      
      // Fail the sub-test on a wrong detection instead of only logging it.
      expect(detectedEncoding).toEqual(test.encoding);
    }
    
    performanceTracker.endOperation('bom-detection');
  });
  
  await t.test('Heuristic encoding detection', async () => {
    performanceTracker.startOperation('heuristic-detection');
    
    /**
     * Guesses the character encoding of an XML byte buffer.
     *
     * Three strategies are tried in order of decreasing confidence:
     *   1. byte order mark            -> confidence 100, method 'BOM'
     *   2. XML declaration label      -> confidence 90,  method 'XML Declaration'
     *   3. byte-statistics heuristics -> confidence 50-85, method 'Heuristic'
     */
    class EncodingDetector {
      /**
       * Detect the encoding of `buffer`.
       *
       * @param buffer raw document bytes (may be empty)
       * @returns the encoding label, a confidence percentage and the name of the
       *          strategy that produced the answer
       */
      detectEncoding(buffer: Buffer): { encoding: string; confidence: number; method: string } {
        // Check for BOM first
        const bomResult = this.checkBOM(buffer);
        if (bomResult) {
          return { ...bomResult, confidence: 100, method: 'BOM' };
        }
        
        // Check XML declaration
        const declResult = this.checkXmlDeclaration(buffer);
        if (declResult) {
          return { ...declResult, confidence: 90, method: 'XML Declaration' };
        }
        
        // Heuristic checks
        const heuristicResult = this.heuristicCheck(buffer);
        return { ...heuristicResult, method: 'Heuristic' };
      }
      
      /**
       * Recognise UTF-8, UTF-16 and UTF-32 byte order marks.
       * Returns null when the buffer does not start with a known BOM.
       */
      private checkBOM(buffer: Buffer): { encoding: string } | null {
        // Every signature is length-checked, so short buffers are never read out of range.
        const startsWith = (...signature: number[]): boolean =>
          buffer.length >= signature.length &&
          signature.every((byte, index) => buffer[index] === byte);
        
        // The UTF-32LE BOM (FF FE 00 00) starts with the UTF-16LE BOM (FF FE),
        // so the four-byte signatures have to be tested first.
        if (startsWith(0xFF, 0xFE, 0x00, 0x00)) return { encoding: 'UTF-32LE' };
        if (startsWith(0x00, 0x00, 0xFE, 0xFF)) return { encoding: 'UTF-32BE' };
        if (startsWith(0xEF, 0xBB, 0xBF)) return { encoding: 'UTF-8' };
        if (startsWith(0xFF, 0xFE)) return { encoding: 'UTF-16LE' };
        if (startsWith(0xFE, 0xFF)) return { encoding: 'UTF-16BE' };
        
        return null;
      }
      
      /**
       * Read the `encoding` pseudo-attribute of a leading `<?xml ... ?>` declaration.
       * Returns the label upper-cased, or null when there is no such declaration.
       */
      private checkXmlDeclaration(buffer: Buffer): { encoding: string } | null {
        // Look for encoding in first 100 bytes. 'latin1' maps every byte to exactly
        // one character; 'ascii' would clear the high bit and could turn e.g. 0xBC
        // into '<', fabricating a match.
        const sample = buffer.toString('latin1', 0, Math.min(100, buffer.length));
        
        // Anchored to the declaration so that an ordinary attribute such as
        // <data encoding="base64"> is not mistaken for a charset label.
        const match = sample.match(/^\s*<\?xml\b[^>]*?\bencoding\s*=\s*["']([^"']+)["']/i);
        
        if (match) {
          return { encoding: match[1].toUpperCase() };
        }
        
        return null;
      }
      
      /**
       * Classify the first (up to) 1000 bytes by their statistics:
       *   - more than 30% NUL bytes           -> 'UTF-16'     (70)
       *   - no byte above 0x7F                -> 'UTF-8'      (50, plain ASCII default)
       *   - high bytes forming valid UTF-8    -> 'UTF-8'      (85)
       *   - high bytes that are not UTF-8     -> 'ISO-8859-1' (60)
       */
      private heuristicCheck(buffer: Buffer): { encoding: string; confidence: number } {
        const sampleSize = Math.min(1000, buffer.length);
        const sample = buffer.subarray(0, sampleSize);
        
        let nullBytes = 0;
        let highBytes = 0;
        for (const byte of sample) {
          if (byte === 0) nullBytes++;
          else if (byte > 127) highBytes++;
        }
        
        // Many NUL bytes indicate 16-bit code units holding mostly ASCII text.
        if (nullBytes > sampleSize * 0.3) {
          return { encoding: 'UTF-16', confidence: 70 };
        }
        
        // Pure 7-bit content is valid in every candidate encoding.
        if (highBytes === 0) {
          return { encoding: 'UTF-8', confidence: 50 }; // Default
        }
        
        if (this.isValidUtf8(sample, sampleSize < buffer.length)) {
          return { encoding: 'UTF-8', confidence: 85 };
        }
        
        // High bytes that do not form valid UTF-8 cannot be UTF-8 at all, however
        // few they are; assume the most common single-byte legacy encoding.
        return { encoding: 'ISO-8859-1', confidence: 60 };
      }
      
      /**
       * Strict UTF-8 validation using the platform decoder, which also rejects
       * stray continuation bytes, invalid lead bytes and malformed 4-byte sequences.
       *
       * @param sample    bytes to validate
       * @param truncated true when `sample` is only a prefix of the document; a
       *                  multi-byte character cut off at the end is then tolerated
       */
      private isValidUtf8(sample: Uint8Array, truncated: boolean): boolean {
        try {
          // In streaming mode the decoder buffers an incomplete trailing sequence
          // instead of treating it as an error.
          new TextDecoder('utf-8', { fatal: true }).decode(sample, { stream: truncated });
          return true;
        } catch {
          return false;
        }
      }
    }
    
    // Feed a handful of hand-built byte samples to the detector and print what
    // it reports for each one. Nothing is asserted in this part of the sub-test.
    const encodingDetector = new EncodingDetector();
    
    // "<invoice><name>ÄÖÜ</name></invoice>" with the umlauts stored as single
    // ISO-8859-1 bytes (0xC4, 0xD6, 0xDC).
    const latin1InvoiceBytes = Buffer.from([
      0x3C, 0x69, 0x6E, 0x76, 0x6F, 0x69, 0x63, 0x65, 0x3E, // <invoice>
      0x3C, 0x6E, 0x61, 0x6D, 0x65, 0x3E, // <name>
      0xC4, 0xD6, 0xDC, // ÄÖÜ in ISO-8859-1
      0x3C, 0x2F, 0x6E, 0x61, 0x6D, 0x65, 0x3E, // </name>
      0x3C, 0x2F, 0x69, 0x6E, 0x76, 0x6F, 0x69, 0x63, 0x65, 0x3E // </invoice>
    ]);
    
    // [label, bytes] pairs, reported in this order.
    const samples: Array<[string, Buffer]> = [
      ['Pure ASCII', Buffer.from('<?xml version="1.0"?><invoice><id>TEST-011</id></invoice>')],
      ['UTF-8 with special chars', Buffer.from('<?xml version="1.0"?><invoice><name>Café €100</name></invoice>')],
      ['ISO-8859-1 content', latin1InvoiceBytes],
      ['UTF-16 with nulls', Buffer.from('invoice', 'utf16le')]
    ];
    
    samples.forEach(([label, bytes]) => {
      const { encoding, confidence, method } = encodingDetector.detectEncoding(bytes);
      
      console.log(`${label}:`);
      console.log(`  Detected: ${encoding}`);
      console.log(`  Confidence: ${confidence}%`);
      console.log(`  Method: ${method}`);
    });
    
    performanceTracker.endOperation('heuristic-detection');
  });
  
  // Sub-test: hand documents with awkward encoding situations to EInvoice and
  // log whether parsing succeeds. Parse failures are reported, not asserted.
  await t.test('Multi-encoding document handling', async () => {
    performanceTracker.startOperation('multi-encoding');
    
    // `declared` / `actual` are only present on fixtures whose bytes deliberately
    // disagree with their XML declaration.
    const multiEncodingTests: Array<{
      name: string;
      declared?: string;
      actual?: string;
      content: Buffer | string;
    }> = [
      {
        name: 'Declaration vs actual mismatch',
        declared: 'UTF-8',
        actual: 'ISO-8859-1',
        content: Buffer.from([
          // <?xml version="1.0" encoding="UTF-8"?>
          0x3C, 0x3F, 0x78, 0x6D, 0x6C, 0x20, 0x76, 0x65, 0x72, 0x73, 0x69, 0x6F, 0x6E, 0x3D,
          0x22, 0x31, 0x2E, 0x30, 0x22, 0x20, 0x65, 0x6E, 0x63, 0x6F, 0x64, 0x69, 0x6E, 0x67,
          0x3D, 0x22, 0x55, 0x54, 0x46, 0x2D, 0x38, 0x22, 0x3F, 0x3E,
          // <invoice><name>
          0x3C, 0x69, 0x6E, 0x76, 0x6F, 0x69, 0x63, 0x65, 0x3E, 0x3C, 0x6E, 0x61, 0x6D, 0x65, 0x3E,
          // Müller in ISO-8859-1
          0x4D, 0xFC, 0x6C, 0x6C, 0x65, 0x72,
          // </name></invoice>
          0x3C, 0x2F, 0x6E, 0x61, 0x6D, 0x65, 0x3E, 0x3C, 0x2F, 0x69, 0x6E, 0x76, 0x6F, 0x69, 0x63, 0x65, 0x3E
        ])
      },
      {
        // The ampersand is escaped: a bare '&' makes the document malformed for a
        // reason that has nothing to do with character encoding.
        name: 'Mixed encoding in attributes',
        content: `<?xml version="1.0" encoding="UTF-8"?>
<invoice currency="€" supplier="Müller &amp; Co.">
  <amount>100.00</amount>
</invoice>`
      },
      {
        name: 'Entity-encoded special chars',
        content: `<?xml version="1.0" encoding="ASCII"?>
<invoice>
  <supplier>M&#252;ller</supplier>
  <amount>&#8364;100</amount>
</invoice>`
      }
    ];
    
    for (const test of multiEncodingTests) {
      const startTime = performance.now();
      
      console.log(`${test.name}:`);
      
      if (test.declared && test.actual) {
        console.log(`  Declared: ${test.declared}`);
        console.log(`  Actual: ${test.actual}`);
        console.log(`  ⚠️  Encoding mismatch detected`);
      }
      
      try {
        const invoice = new einvoice.EInvoice();
        const content = test.content;
        
        // Strings go through the string entry point, byte buffers through the
        // buffer entry point; either method is only used when the class has it.
        if (invoice.fromXmlString && typeof content === 'string') {
          await invoice.fromXmlString(content);
          console.log('  ✓ Parsed successfully');
        } else if (invoice.fromBuffer && content instanceof Buffer) {
          await invoice.fromBuffer(content);
          console.log('  ✓ Parsed from buffer');
        } else {
          // Previously this case produced no output at all.
          console.log('  ⚠️  Skipped: no suitable parse method available');
        }
      } catch (error) {
        // The caught value is `unknown`; narrow it before reading `.message`.
        const message = error instanceof Error ? error.message : String(error);
        console.log(`  ✗ Parse error: ${message}`);
      }
      
      performanceTracker.recordMetric('multi-encoding', performance.now() - startTime);
    }
    
    performanceTracker.endOperation('multi-encoding');
  });
  
  // Sub-test: survey up to 100 corpus XML files and print statistics about
  // their byte order marks and declared encodings. Purely informational.
  await t.test('Corpus encoding analysis', async () => {
    performanceTracker.startOperation('corpus-encoding');
    
    const corpusLoader = new CorpusLoader();
    const xmlFiles = await corpusLoader.getFiles(/\.xml$/);
    
    console.log(`\nAnalyzing encodings in ${xmlFiles.length} corpus files...`);
    
    const encodingStats = {
      total: 0,
      byDeclaration: new Map<string, number>(),
      byBOM: { withBOM: 0, withoutBOM: 0 },
      conflicts: 0, // files whose BOM contradicts their declared encoding
      errors: 0
    };
    
    /** Recognise a UTF-8 or UTF-16 byte order mark; null when there is none. */
    const detectBom = (
      buffer: Buffer
    ): { encoding: 'UTF-8' | 'UTF-16LE' | 'UTF-16BE'; length: number } | null => {
      if (buffer.length >= 3 && buffer[0] === 0xEF && buffer[1] === 0xBB && buffer[2] === 0xBF) {
        return { encoding: 'UTF-8', length: 3 };
      }
      if (buffer.length >= 2 && buffer[0] === 0xFF && buffer[1] === 0xFE) {
        return { encoding: 'UTF-16LE', length: 2 };
      }
      if (buffer.length >= 2 && buffer[0] === 0xFE && buffer[1] === 0xFF) {
        return { encoding: 'UTF-16BE', length: 2 };
      }
      return null;
    };
    
    /**
     * Decode the start of the document (after any BOM) into text so that the XML
     * declaration can be inspected. UTF-16 files are decoded as UTF-16; reading
     * them as UTF-8 would interleave NUL characters and hide the declaration.
     */
    const readProlog = (buffer: Buffer, bom: ReturnType<typeof detectBom>): string => {
      const start = bom ? bom.length : 0;
      if (!bom || bom.encoding === 'UTF-8') {
        return buffer.toString('utf8', start, Math.min(start + 200, buffer.length));
      }
      
      // 200 UTF-16 code units; keep the byte count even so swap16() cannot throw.
      const end = Math.min(start + 400, buffer.length);
      const evenEnd = end - ((end - start) % 2);
      // Copy first: swap16() works in place and must not modify the file's bytes.
      const units = Buffer.from(buffer.subarray(start, evenEnd));
      if (bom.encoding === 'UTF-16BE') {
        units.swap16();
      }
      return units.toString('utf16le');
    };
    
    const sampleSize = Math.min(100, xmlFiles.length);
    const sampledFiles = xmlFiles.slice(0, sampleSize);
    
    for (const file of sampledFiles) {
      encodingStats.total++;
      
      try {
        const buffer = await plugins.fs.readFile(file.path);
        
        // Check for BOM
        const bom = detectBom(buffer);
        if (bom) {
          encodingStats.byBOM.withBOM++;
        } else {
          encodingStats.byBOM.withoutBOM++;
        }
        
        // Check declaration. The pattern is anchored to the XML declaration so an
        // `encoding` attribute on the root element is not counted as a charset.
        const prolog = readProlog(buffer, bom);
        const match = prolog.match(/^\s*<\?xml\b[^>]*?\bencoding\s*=\s*["']([^"']+)["']/i);
        
        const declared = match ? match[1].toUpperCase() : 'NONE';
        encodingStats.byDeclaration.set(
          declared,
          (encodingStats.byDeclaration.get(declared) || 0) + 1
        );
        
        // A BOM and a declaration that name different encoding families conflict,
        // e.g. a UTF-8 BOM in front of encoding="ISO-8859-1".
        if (bom && match) {
          const bomFamily = bom.encoding === 'UTF-8' ? 'UTF8' : 'UTF16';
          if (!declared.replace(/[-_]/g, '').startsWith(bomFamily)) {
            encodingStats.conflicts++;
          }
        }
      } catch (error) {
        encodingStats.errors++;
        const message = error instanceof Error ? error.message : String(error);
        console.log(`  ✗ Could not analyze ${file.path}: ${message}`);
      }
    }
    
    console.log('\nEncoding Statistics:');
    console.log(`Total files analyzed: ${encodingStats.total}`);
    console.log(`Files with BOM: ${encodingStats.byBOM.withBOM}`);
    console.log(`Files without BOM: ${encodingStats.byBOM.withoutBOM}`);
    console.log(`BOM/declaration conflicts: ${encodingStats.conflicts}`);
    console.log('\nDeclared encodings:');
    
    // Most frequent declaration first
    const sortedEncodings = Array.from(encodingStats.byDeclaration.entries())
      .sort((a, b) => b[1] - a[1]);
    
    for (const [encoding, count] of sortedEncodings) {
      const percentage = (count / encodingStats.total * 100).toFixed(1);
      console.log(`  ${encoding}: ${count} (${percentage}%)`);
    }
    
    console.log(`\nRead errors: ${encodingStats.errors}`);
    
    performanceTracker.endOperation('corpus-encoding');
  });
  
  await t.test('Encoding conversion and normalization', async () => {
    performanceTracker.startOperation('encoding-conversion');
    
    /**
     * Converts an XML byte buffer to BOM-less UTF-8 and rewrites the encoding
     * label of its XML declaration to match.
     */
    class EncodingNormalizer {
      /**
       * Normalize `buffer` to UTF-8.
       *
       * @param buffer         raw document bytes
       * @param sourceEncoding encoding label of `buffer`; detected from the BOM or
       *                       the XML declaration when omitted or empty
       * @returns UTF-8 bytes without a BOM. Input that is already UTF-8 is
       *          returned as a view of the same memory (minus the BOM), not a copy.
       * @throws Error "Encoding conversion failed: ..." when the platform has no
       *         decoder for the source encoding (UTF-32, for example)
       */
      async normalizeToUTF8(buffer: Buffer, sourceEncoding?: string): Promise<Buffer> {
        // Detect encoding if not provided. Labels are case-insensitive, so compare
        // in upper case ("utf-8" must take the same path as "UTF-8").
        const encoding = (sourceEncoding || this.detectSourceEncoding(buffer)).toUpperCase();
        
        // Skip if already UTF-8
        if (encoding === 'UTF-8' || encoding === 'UTF8') {
          // Just remove BOM if present
          if (buffer.length >= 3 && 
              buffer[0] === 0xEF && buffer[1] === 0xBB && buffer[2] === 0xBF) {
            return buffer.subarray(3);
          }
          return buffer;
        }
        
        // Convert to UTF-8. TextDecoder drops a leading BOM of the source encoding.
        let text: string;
        try {
          const decoder = new TextDecoder(encoding.toLowerCase());
          text = decoder.decode(buffer);
        } catch (error) {
          const reason = error instanceof Error ? error.message : String(error);
          throw new Error(`Encoding conversion failed: ${reason}`);
        }
        
        // Update encoding declaration. Only the label inside the leading
        // <?xml ... ?> declaration is rewritten; an `encoding` attribute that
        // belongs to an element further down must be left alone.
        const updatedText = text.replace(
          /^(\s*<\?xml\b[^>]*?\bencoding\s*=\s*)(["'])[^"']+\2/i,
          '$1"UTF-8"'
        );
        
        return Buffer.from(updatedText, 'utf8');
      }
      
      /**
       * Determine the source encoding from a BOM, from the byte pattern of a
       * BOM-less UTF-16 "<?", or from the XML declaration. Defaults to 'UTF-8'.
       */
      private detectSourceEncoding(buffer: Buffer): string {
        const startsWith = (...signature: number[]): boolean =>
          buffer.length >= signature.length &&
          signature.every((byte, index) => buffer[index] === byte);
        
        // UTF-32 marks come first because FF FE 00 00 also begins with the UTF-16LE
        // mark. TextDecoder has no UTF-32 decoder, so these labels make the
        // conversion fail loudly instead of silently producing garbage.
        if (startsWith(0xFF, 0xFE, 0x00, 0x00)) return 'UTF-32LE';
        if (startsWith(0x00, 0x00, 0xFE, 0xFF)) return 'UTF-32BE';
        if (startsWith(0xEF, 0xBB, 0xBF)) return 'UTF-8';
        if (startsWith(0xFF, 0xFE)) return 'UTF-16LE';
        if (startsWith(0xFE, 0xFF)) return 'UTF-16BE';
        
        // "<?" stored as 16-bit code units without a BOM
        if (startsWith(0x3C, 0x00, 0x3F, 0x00)) return 'UTF-16LE';
        if (startsWith(0x00, 0x3C, 0x00, 0x3F)) return 'UTF-16BE';
        
        // 'latin1' maps bytes 1:1 to characters; 'ascii' would clear the high bit
        // of every byte and could fabricate markup characters.
        const sample = buffer.toString('latin1', 0, Math.min(100, buffer.length));
        const match = sample.match(/^\s*<\?xml\b[^>]*?\bencoding\s*=\s*["']([^"']+)["']/i);
        
        return match ? match[1].toUpperCase() : 'UTF-8';
      }
    }
    
    const normalizer = new EncodingNormalizer();
    
    // `expected` is the exact text the normalized output must decode to as UTF-8.
    const conversionTests = [
      {
        name: 'UTF-8 with BOM to UTF-8 without BOM',
        input: Buffer.concat([
          Buffer.from([0xEF, 0xBB, 0xBF]),
          Buffer.from('<?xml version="1.0" encoding="UTF-8"?><invoice><id>TEST</id></invoice>')
        ]),
        expected: '<?xml version="1.0" encoding="UTF-8"?><invoice><id>TEST</id></invoice>'
      },
      {
        // Contains a real ISO-8859-1 byte (0xFC = ü); a pure-ASCII payload would
        // pass even if no conversion took place.
        name: 'ISO-8859-1 to UTF-8',
        input: Buffer.from(
          '<?xml version="1.0" encoding="ISO-8859-1"?><invoice><name>M\u00FCller</name></invoice>',
          'latin1'
        ),
        expected: '<?xml version="1.0" encoding="UTF-8"?><invoice><name>M\u00FCller</name></invoice>'
      }
    ];
    
    const startsWithUtf8Bom = (bytes: Buffer): boolean =>
      bytes.length >= 3 && bytes[0] === 0xEF && bytes[1] === 0xBB && bytes[2] === 0xBF;
    
    for (const test of conversionTests) {
      const startTime = performance.now();
      
      let normalized: Buffer;
      try {
        normalized = await normalizer.normalizeToUTF8(test.input);
      } catch (error) {
        const message = error instanceof Error ? error.message : String(error);
        console.log(`${test.name}: ✗ Conversion failed - ${message}`);
        // A conversion test whose conversion fails must fail, not merely log.
        throw error;
      } finally {
        performanceTracker.recordMetric('encoding-conversion', performance.now() - startTime);
      }
      
      console.log(`${test.name}:`);
      console.log(`  Input size: ${test.input.length} bytes`);
      console.log(`  Output size: ${normalized.length} bytes`);
      console.log(`  ✓ Conversion successful`);
      
      // Verify no BOM in output
      const outputHasBom = startsWithUtf8Bom(normalized);
      if (outputHasBom) {
        console.log('  ✗ BOM still present in output');
      } else if (startsWithUtf8Bom(test.input)) {
        console.log('  ✓ BOM removed');
      } else {
        console.log('  ✓ No BOM in output');
      }
      
      expect(outputHasBom).toEqual(false);
      expect(normalized.toString('utf8')).toEqual(test.expected);
    }
    
    performanceTracker.endOperation('encoding-conversion');
  });
  
  // Print the tracker's summary of everything recorded by the sub-tests above.
  console.log('\n' + performanceTracker.getSummary());
  
  // Closing checklist, printed one line per console.log call in this order.
  const bestPracticeLines = [
    '\nCharacter Encoding Detection Best Practices:',
    '1. Always check for BOM before parsing',
    '2. Verify declared encoding matches actual encoding',
    '3. Use heuristics when declaration is missing',
    '4. Handle encoding mismatches gracefully',
    '5. Normalize to UTF-8 for consistent processing',
    '6. Preserve original encoding information for round-trip',
    '7. Support common legacy encodings (ISO-8859-1, Windows-1252)',
    '8. Test with real-world data that includes various encodings'
  ];
  
  for (const line of bestPracticeLines) {
    console.log(line);
  }
});

// Start the tap runner now that the PARSE-03 suite has been registered above.
// NOTE(review): exact start-up semantics come from @git.zone/tstest tapbundle — confirm there.
tap.start();