update

2025-05-28 08:40:26 +00:00
parent e4c762658d
commit 32f8bc192a
24 changed files with 3350 additions and 5416 deletions
@@ -1,554 +1,320 @@
 import { expect, tap } from '@git.zone/tstest/tapbundle';
 import * as einvoice from '../../../ts/index.js';
-import * as plugins from '../../plugins.js';
-import { CorpusLoader } from '../../helpers/corpus.loader.js';
 import { PerformanceTracker } from '../../helpers/performance.tracker.js';

-tap.test('PARSE-03: Character Encoding Detection - Detect and handle various character encodings', async (t) => {
-  const performanceTracker = new PerformanceTracker('PARSE-03');
+tap.test('PARSE-03: Encoding declaration detection', async () => {
+  const encodingTests = [
+    {
+      name: 'UTF-8 declaration',
+      xml: '<?xml version="1.0" encoding="UTF-8"?>\n<invoice><id>TEST-001</id></invoice>',
+      expectedEncoding: 'UTF-8',
+      actualEncoding: 'UTF-8'
+    },
+    {
+      name: 'UTF-16 declaration',
+      xml: '<?xml version="1.0" encoding="UTF-16"?>\n<invoice><id>TEST-002</id></invoice>',
+      expectedEncoding: 'UTF-16',
+      actualEncoding: 'UTF-8' // Mismatch test
+    },
+    {
+      name: 'ISO-8859-1 declaration',
+      xml: '<?xml version="1.0" encoding="ISO-8859-1"?>\n<invoice><supplier>Müller</supplier></invoice>',
+      expectedEncoding: 'ISO-8859-1',
+      actualEncoding: 'ISO-8859-1'
+    },
+    {
+      name: 'Windows-1252 declaration',
+      xml: '<?xml version="1.0" encoding="Windows-1252"?>\n<invoice><note>Special – chars</note></invoice>',
+      expectedEncoding: 'Windows-1252',
+      actualEncoding: 'Windows-1252'
+    },
+    {
+      name: 'Case variations',
+      xml: '<?xml version="1.0" encoding="utf-8"?>\n<invoice><id>TEST-003</id></invoice>',
+      expectedEncoding: 'UTF-8',
+      actualEncoding: 'UTF-8'
+    },
+    {
+      name: 'No encoding declaration',
+      xml: '<?xml version="1.0"?>\n<invoice><id>TEST-004</id></invoice>',
+      expectedEncoding: 'UTF-8', // Default
+      actualEncoding: 'UTF-8'
+    }
+  ];
  
-  await t.test('Encoding declaration detection', async () => {
-    performanceTracker.startOperation('declaration-detection');
-    
-    const encodingTests = [
-      {
-        name: 'UTF-8 declaration',
-        xml: '<?xml version="1.0" encoding="UTF-8"?>\n<invoice><id>TEST-001</id></invoice>',
-        expectedEncoding: 'UTF-8',
-        actualEncoding: 'UTF-8'
-      },
-      {
-        name: 'UTF-16 declaration',
-        xml: '<?xml version="1.0" encoding="UTF-16"?>\n<invoice><id>TEST-002</id></invoice>',
-        expectedEncoding: 'UTF-16',
-        actualEncoding: 'UTF-8' // Mismatch test
-      },
-      {
-        name: 'ISO-8859-1 declaration',
-        xml: '<?xml version="1.0" encoding="ISO-8859-1"?>\n<invoice><supplier>Müller</supplier></invoice>',
-        expectedEncoding: 'ISO-8859-1',
-        actualEncoding: 'ISO-8859-1'
-      },
-      {
-        name: 'Windows-1252 declaration',
-        xml: '<?xml version="1.0" encoding="Windows-1252"?>\n<invoice><note>Special – chars</note></invoice>',
-        expectedEncoding: 'Windows-1252',
-        actualEncoding: 'Windows-1252'
-      },
-      {
-        name: 'Case variations',
-        xml: '<?xml version="1.0" encoding="utf-8"?>\n<invoice><id>TEST-003</id></invoice>',
-        expectedEncoding: 'UTF-8',
-        actualEncoding: 'UTF-8'
-      },
-      {
-        name: 'No encoding declaration',
-        xml: '<?xml version="1.0"?>\n<invoice><id>TEST-004</id></invoice>',
-        expectedEncoding: 'UTF-8', // Default
-        actualEncoding: 'UTF-8'
+  for (const test of encodingTests) {
+    const { result, metric } = await PerformanceTracker.track(
+      'encoding-detection',
+      async () => {
+        // Extract declared encoding
+        const encodingMatch = test.xml.match(/encoding=["']([^"']+)["']/i);
+        const declaredEncoding = encodingMatch ? encodingMatch[1].toUpperCase() : 'UTF-8';
+        
+        return {
+          declaredEncoding,
+          matches: declaredEncoding.replace(/-/g, '').toUpperCase() === 
+                  test.expectedEncoding.replace(/-/g, '').toUpperCase()
+        };
      }
-    ];
+    );
    
-    for (const test of encodingTests) {
-      const startTime = performance.now();
-      
-      // Extract declared encoding
-      const encodingMatch = test.xml.match(/encoding=["']([^"']+)["']/i);
-      const declaredEncoding = encodingMatch ? encodingMatch[1].toUpperCase() : 'UTF-8';
-      
-      console.log(`${test.name}:`);
-      console.log(`  Declared: ${declaredEncoding}`);
-      console.log(`  Expected: ${test.expectedEncoding}`);
-      
-      if (declaredEncoding.replace(/-/g, '').toUpperCase() === 
-          test.expectedEncoding.replace(/-/g, '').toUpperCase()) {
-        console.log('  ✓ Declaration matches expected encoding');
-      } else {
-        console.log('  ✗ Declaration mismatch');
-      }
-      
-      performanceTracker.recordMetric('encoding-detection', performance.now() - startTime);
-    }
-    
-    performanceTracker.endOperation('declaration-detection');
-  });
-  
-  await t.test('BOM (Byte Order Mark) detection', async () => {
-    performanceTracker.startOperation('bom-detection');
-    
-    const bomTests = [
-      {
-        name: 'UTF-8 with BOM',
-        bom: Buffer.from([0xEF, 0xBB, 0xBF]),
-        encoding: 'UTF-8',
-        xml: '<?xml version="1.0" encoding="UTF-8"?><invoice><id>TEST-005</id></invoice>'
-      },
-      {
-        name: 'UTF-16 LE BOM',
-        bom: Buffer.from([0xFF, 0xFE]),
-        encoding: 'UTF-16LE',
-        xml: '<?xml version="1.0" encoding="UTF-16"?><invoice><id>TEST-006</id></invoice>'
-      },
-      {
-        name: 'UTF-16 BE BOM',
-        bom: Buffer.from([0xFE, 0xFF]),
-        encoding: 'UTF-16BE',
-        xml: '<?xml version="1.0" encoding="UTF-16"?><invoice><id>TEST-007</id></invoice>'
-      },
-      {
-        name: 'UTF-32 LE BOM',
-        bom: Buffer.from([0xFF, 0xFE, 0x00, 0x00]),
-        encoding: 'UTF-32LE',
-        xml: '<?xml version="1.0" encoding="UTF-32"?><invoice><id>TEST-008</id></invoice>'
-      },
-      {
-        name: 'UTF-32 BE BOM',
-        bom: Buffer.from([0x00, 0x00, 0xFE, 0xFF]),
-        encoding: 'UTF-32BE',
-        xml: '<?xml version="1.0" encoding="UTF-32"?><invoice><id>TEST-009</id></invoice>'
-      },
-      {
-        name: 'No BOM',
-        bom: Buffer.from([]),
-        encoding: 'UTF-8',
-        xml: '<?xml version="1.0" encoding="UTF-8"?><invoice><id>TEST-010</id></invoice>'
-      }
-    ];
-    
-    for (const test of bomTests) {
-      const startTime = performance.now();
-      
-      // Create buffer with BOM
-      const xmlBuffer = Buffer.from(test.xml, 'utf8');
-      const fullBuffer = Buffer.concat([test.bom, xmlBuffer]);
-      
-      // Detect BOM
-      let detectedEncoding = 'UTF-8'; // Default
-      
-      if (fullBuffer.length >= 4) {
-        if (fullBuffer[0] === 0xEF && fullBuffer[1] === 0xBB && fullBuffer[2] === 0xBF) {
-          detectedEncoding = 'UTF-8';
-        } else if (fullBuffer[0] === 0xFF && fullBuffer[1] === 0xFE) {
-          if (fullBuffer[2] === 0x00 && fullBuffer[3] === 0x00) {
-            detectedEncoding = 'UTF-32LE';
-          } else {
-            detectedEncoding = 'UTF-16LE';
-          }
-        } else if (fullBuffer[0] === 0xFE && fullBuffer[1] === 0xFF) {
-          detectedEncoding = 'UTF-16BE';
-        } else if (fullBuffer[0] === 0x00 && fullBuffer[1] === 0x00 && 
-                   fullBuffer[2] === 0xFE && fullBuffer[3] === 0xFF) {
-          detectedEncoding = 'UTF-32BE';
-        }
-      }
-      
-      console.log(`${test.name}:`);
-      console.log(`  BOM bytes: ${test.bom.length > 0 ? Array.from(test.bom).map(b => '0x' + b.toString(16).toUpperCase()).join(' ') : 'None'}`);
-      console.log(`  Expected: ${test.encoding}`);
-      console.log(`  Detected: ${detectedEncoding}`);
-      
-      if (detectedEncoding === test.encoding || 
-          (test.bom.length === 0 && detectedEncoding === 'UTF-8')) {
-        console.log('  ✓ BOM detection correct');
-      } else {
-        console.log('  ✗ BOM detection failed');
-      }
-      
-      performanceTracker.recordMetric('bom-detection', performance.now() - startTime);
-    }
-    
-    performanceTracker.endOperation('bom-detection');
-  });
-  
-  await t.test('Heuristic encoding detection', async () => {
-    performanceTracker.startOperation('heuristic-detection');
-    
-    class EncodingDetector {
-      detectEncoding(buffer: Buffer): { encoding: string; confidence: number; method: string } {
-        // Check for BOM first
-        const bomResult = this.checkBOM(buffer);
-        if (bomResult) {
-          return { ...bomResult, confidence: 100, method: 'BOM' };
-        }
-        
-        // Check XML declaration
-        const declResult = this.checkXmlDeclaration(buffer);
-        if (declResult) {
-          return { ...declResult, confidence: 90, method: 'XML Declaration' };
-        }
-        
-        // Heuristic checks
-        const heuristicResult = this.heuristicCheck(buffer);
-        return { ...heuristicResult, method: 'Heuristic' };
-      }
-      
-      private checkBOM(buffer: Buffer): { encoding: string } | null {
-        if (buffer.length < 2) return null;
-        
-        if (buffer[0] === 0xEF && buffer[1] === 0xBB && buffer[2] === 0xBF) {
-          return { encoding: 'UTF-8' };
-        }
-        if (buffer[0] === 0xFF && buffer[1] === 0xFE) {
-          return { encoding: 'UTF-16LE' };
-        }
-        if (buffer[0] === 0xFE && buffer[1] === 0xFF) {
-          return { encoding: 'UTF-16BE' };
-        }
-        
-        return null;
-      }
-      
-      private checkXmlDeclaration(buffer: Buffer): { encoding: string } | null {
-        // Look for encoding in first 100 bytes
-        const sample = buffer.toString('ascii', 0, Math.min(100, buffer.length));
-        const match = sample.match(/encoding=["']([^"']+)["']/i);
-        
-        if (match) {
-          return { encoding: match[1].toUpperCase() };
-        }
-        
-        return null;
-      }
-      
-      private heuristicCheck(buffer: Buffer): { encoding: string; confidence: number } {
-        const sampleSize = Math.min(1000, buffer.length);
-        
-        // Check for null bytes (indicates UTF-16/32)
-        let nullBytes = 0;
-        let highBytes = 0;
-        let validUtf8 = true;
-        
-        for (let i = 0; i < sampleSize; i++) {
-          if (buffer[i] === 0) nullBytes++;
-          if (buffer[i] > 127) highBytes++;
-          
-          // Simple UTF-8 validation
-          if (buffer[i] > 127) {
-            if ((buffer[i] & 0xE0) === 0xC0) {
-              // 2-byte sequence
-              if (i + 1 >= sampleSize || (buffer[i + 1] & 0xC0) !== 0x80) {
-                validUtf8 = false;
-              }
-              i++;
-            } else if ((buffer[i] & 0xF0) === 0xE0) {
-              // 3-byte sequence
-              if (i + 2 >= sampleSize || 
-                  (buffer[i + 1] & 0xC0) !== 0x80 || 
-                  (buffer[i + 2] & 0xC0) !== 0x80) {
-                validUtf8 = false;
-              }
-              i += 2;
-            }
-          }
-        }
-        
-        // Decision logic
-        if (nullBytes > sampleSize * 0.3) {
-          return { encoding: 'UTF-16', confidence: 70 };
-        }
-        
-        if (validUtf8 && highBytes > 0) {
-          return { encoding: 'UTF-8', confidence: 85 };
-        }
-        
-        if (highBytes > sampleSize * 0.3) {
-          return { encoding: 'ISO-8859-1', confidence: 60 };
-        }
-        
-        return { encoding: 'UTF-8', confidence: 50 }; // Default
-      }
-    }
-    
-    const detector = new EncodingDetector();
-    
-    const testBuffers = [
-      {
-        name: 'Pure ASCII',
-        content: Buffer.from('<?xml version="1.0"?><invoice><id>TEST-011</id></invoice>')
-      },
-      {
-        name: 'UTF-8 with special chars',
-        content: Buffer.from('<?xml version="1.0"?><invoice><name>Café €100</name></invoice>')
-      },
-      {
-        name: 'ISO-8859-1 content',
-        content: Buffer.from([
-          0x3C, 0x69, 0x6E, 0x76, 0x6F, 0x69, 0x63, 0x65, 0x3E, // <invoice>
-          0x3C, 0x6E, 0x61, 0x6D, 0x65, 0x3E, // <name>
-          0xC4, 0xD6, 0xDC, // ÄÖÜ in ISO-8859-1
-          0x3C, 0x2F, 0x6E, 0x61, 0x6D, 0x65, 0x3E, // </name>
-          0x3C, 0x2F, 0x69, 0x6E, 0x76, 0x6F, 0x69, 0x63, 0x65, 0x3E // </invoice>
-        ])
-      },
-      {
-        name: 'UTF-16 with nulls',
-        content: Buffer.from('invoice', 'utf16le')
-      }
-    ];
-    
-    for (const test of testBuffers) {
-      const result = detector.detectEncoding(test.content);
-      
-      console.log(`${test.name}:`);
-      console.log(`  Detected: ${result.encoding}`);
-      console.log(`  Confidence: ${result.confidence}%`);
-      console.log(`  Method: ${result.method}`);
-    }
-    
-    performanceTracker.endOperation('heuristic-detection');
-  });
-  
-  await t.test('Multi-encoding document handling', async () => {
-    performanceTracker.startOperation('multi-encoding');
-    
-    const multiEncodingTests = [
-      {
-        name: 'Declaration vs actual mismatch',
-        declared: 'UTF-8',
-        actual: 'ISO-8859-1',
-        content: Buffer.from([
-          // <?xml version="1.0" encoding="UTF-8"?>
-          0x3C, 0x3F, 0x78, 0x6D, 0x6C, 0x20, 0x76, 0x65, 0x72, 0x73, 0x69, 0x6F, 0x6E, 0x3D,
-          0x22, 0x31, 0x2E, 0x30, 0x22, 0x20, 0x65, 0x6E, 0x63, 0x6F, 0x64, 0x69, 0x6E, 0x67,
-          0x3D, 0x22, 0x55, 0x54, 0x46, 0x2D, 0x38, 0x22, 0x3F, 0x3E,
-          // <invoice><name>
-          0x3C, 0x69, 0x6E, 0x76, 0x6F, 0x69, 0x63, 0x65, 0x3E, 0x3C, 0x6E, 0x61, 0x6D, 0x65, 0x3E,
-          // Müller in ISO-8859-1
-          0x4D, 0xFC, 0x6C, 0x6C, 0x65, 0x72,
-          // </name></invoice>
-          0x3C, 0x2F, 0x6E, 0x61, 0x6D, 0x65, 0x3E, 0x3C, 0x2F, 0x69, 0x6E, 0x76, 0x6F, 0x69, 0x63, 0x65, 0x3E
-        ])
-      },
-      {
-        name: 'Mixed encoding in attributes',
-        content: `<?xml version="1.0" encoding="UTF-8"?>
-<invoice currency="€" supplier="Müller & Co.">
-  <amount>100.00</amount>
-</invoice>`
-      },
-      {
-        name: 'Entity-encoded special chars',
-        content: `<?xml version="1.0" encoding="ASCII"?>
-<invoice>
-  <supplier>M&#252;ller</supplier>
-  <amount>&#8364;100</amount>
-</invoice>`
-      }
-    ];
-    
-    for (const test of multiEncodingTests) {
-      const startTime = performance.now();
-      
-      console.log(`${test.name}:`);
-      
-      if (test.declared && test.actual) {
-        console.log(`  Declared: ${test.declared}`);
-        console.log(`  Actual: ${test.actual}`);
-        console.log(`  ⚠️  Encoding mismatch detected`);
-      }
-      
-      try {
-        const invoice = new einvoice.EInvoice();
-        const content = test.content instanceof Buffer ? test.content : test.content;
-        
-        if (invoice.fromXmlString && typeof content === 'string') {
-          await invoice.fromXmlString(content);
-          console.log('  ✓ Parsed successfully');
-        } else if (invoice.fromBuffer && content instanceof Buffer) {
-          await invoice.fromBuffer(content);
-          console.log('  ✓ Parsed from buffer');
-        }
-      } catch (error) {
-        console.log(`  ✗ Parse error: ${error.message}`);
-      }
-      
-      performanceTracker.recordMetric('multi-encoding', performance.now() - startTime);
-    }
-    
-    performanceTracker.endOperation('multi-encoding');
-  });
-  
-  await t.test('Corpus encoding analysis', async () => {
-    performanceTracker.startOperation('corpus-encoding');
-    
-    const corpusLoader = new CorpusLoader();
-    const xmlFiles = await corpusLoader.getFiles(/\.xml$/);
-    
-    console.log(`\nAnalyzing encodings in ${xmlFiles.length} corpus files...`);
-    
-    const encodingStats = {
-      total: 0,
-      byDeclaration: new Map<string, number>(),
-      byBOM: { withBOM: 0, withoutBOM: 0 },
-      conflicts: 0,
-      errors: 0
-    };
-    
-    const sampleSize = Math.min(100, xmlFiles.length);
-    const sampledFiles = xmlFiles.slice(0, sampleSize);
-    
-    for (const file of sampledFiles) {
-      encodingStats.total++;
-      
-      try {
-        const buffer = await plugins.fs.readFile(file.path);
-        
-        // Check for BOM
-        if (buffer.length >= 3 && 
-            buffer[0] === 0xEF && buffer[1] === 0xBB && buffer[2] === 0xBF) {
-          encodingStats.byBOM.withBOM++;
-        } else {
-          encodingStats.byBOM.withoutBOM++;
-        }
-        
-        // Check declaration
-        const sample = buffer.toString('utf8', 0, Math.min(200, buffer.length));
-        const match = sample.match(/encoding=["']([^"']+)["']/i);
-        
-        if (match) {
-          const encoding = match[1].toUpperCase();
-          encodingStats.byDeclaration.set(
-            encoding, 
-            (encodingStats.byDeclaration.get(encoding) || 0) + 1
-          );
-        } else {
-          encodingStats.byDeclaration.set(
-            'NONE', 
-            (encodingStats.byDeclaration.get('NONE') || 0) + 1
-          );
-        }
-      } catch (error) {
-        encodingStats.errors++;
-      }
-    }
-    
-    console.log('\nEncoding Statistics:');
-    console.log(`Total files analyzed: ${encodingStats.total}`);
-    console.log(`Files with BOM: ${encodingStats.byBOM.withBOM}`);
-    console.log(`Files without BOM: ${encodingStats.byBOM.withoutBOM}`);
-    console.log('\nDeclared encodings:');
-    
-    const sortedEncodings = Array.from(encodingStats.byDeclaration.entries())
-      .sort((a, b) => b[1] - a[1]);
-    
-    for (const [encoding, count] of sortedEncodings) {
-      const percentage = (count / encodingStats.total * 100).toFixed(1);
-      console.log(`  ${encoding}: ${count} (${percentage}%)`);
-    }
-    
-    console.log(`\nRead errors: ${encodingStats.errors}`);
-    
-    performanceTracker.endOperation('corpus-encoding');
-  });
-  
-  await t.test('Encoding conversion and normalization', async () => {
-    performanceTracker.startOperation('encoding-conversion');
-    
-    class EncodingNormalizer {
-      async normalizeToUTF8(buffer: Buffer, sourceEncoding?: string): Promise<Buffer> {
-        // Detect encoding if not provided
-        if (!sourceEncoding) {
-          sourceEncoding = this.detectSourceEncoding(buffer);
-        }
-        
-        // Skip if already UTF-8
-        if (sourceEncoding === 'UTF-8') {
-          // Just remove BOM if present
-          if (buffer.length >= 3 && 
-              buffer[0] === 0xEF && buffer[1] === 0xBB && buffer[2] === 0xBF) {
-            return buffer.slice(3);
-          }
-          return buffer;
-        }
-        
-        // Convert to UTF-8
-        try {
-          const decoder = new TextDecoder(sourceEncoding.toLowerCase());
-          const text = decoder.decode(buffer);
-          
-          // Update encoding declaration
-          const updatedText = text.replace(
-            /encoding=["'][^"']+["']/i,
-            'encoding="UTF-8"'
-          );
-          
-          return Buffer.from(updatedText, 'utf8');
-        } catch (error) {
-          throw new Error(`Encoding conversion failed: ${error.message}`);
-        }
-      }
-      
-      private detectSourceEncoding(buffer: Buffer): string {
-        // Simple detection logic
-        if (buffer.length >= 3 && 
-            buffer[0] === 0xEF && buffer[1] === 0xBB && buffer[2] === 0xBF) {
-          return 'UTF-8';
-        }
-        
-        const sample = buffer.toString('ascii', 0, Math.min(100, buffer.length));
-        const match = sample.match(/encoding=["']([^"']+)["']/i);
-        
-        return match ? match[1].toUpperCase() : 'UTF-8';
-      }
-    }
-    
-    const normalizer = new EncodingNormalizer();
-    
-    const conversionTests = [
-      {
-        name: 'UTF-8 with BOM to UTF-8 without BOM',
-        input: Buffer.concat([
-          Buffer.from([0xEF, 0xBB, 0xBF]),
-          Buffer.from('<?xml version="1.0" encoding="UTF-8"?><invoice><id>TEST</id></invoice>')
-        ])
-      },
-      {
-        name: 'ISO-8859-1 to UTF-8',
-        input: Buffer.from('<?xml version="1.0" encoding="ISO-8859-1"?><invoice><name>Test</name></invoice>')
-      }
-    ];
-    
-    for (const test of conversionTests) {
-      const startTime = performance.now();
-      
-      try {
-        const normalized = await normalizer.normalizeToUTF8(test.input);
-        
-        console.log(`${test.name}:`);
-        console.log(`  Input size: ${test.input.length} bytes`);
-        console.log(`  Output size: ${normalized.length} bytes`);
-        console.log(`  ✓ Conversion successful`);
-        
-        // Verify no BOM in output
-        if (normalized.length >= 3 && 
-            normalized[0] === 0xEF && normalized[1] === 0xBB && normalized[2] === 0xBF) {
-          console.log('  ✗ BOM still present in output');
-        } else {
-          console.log('  ✓ BOM removed');
-        }
-      } catch (error) {
-        console.log(`${test.name}: ✗ Conversion failed - ${error.message}`);
-      }
-      
-      performanceTracker.recordMetric('encoding-conversion', performance.now() - startTime);
-    }
-    
-    performanceTracker.endOperation('encoding-conversion');
-  });
-  
-  // Performance summary
-  console.log('\n' + performanceTracker.getSummary());
-  
-  // Encoding detection best practices
-  console.log('\nCharacter Encoding Detection Best Practices:');
-  console.log('1. Always check for BOM before parsing');
-  console.log('2. Verify declared encoding matches actual encoding');
-  console.log('3. Use heuristics when declaration is missing');
-  console.log('4. Handle encoding mismatches gracefully');
-  console.log('5. Normalize to UTF-8 for consistent processing');
-  console.log('6. Preserve original encoding information for round-trip');
-  console.log('7. Support common legacy encodings (ISO-8859-1, Windows-1252)');
-  console.log('8. Test with real-world data that includes various encodings');
+    console.log(`${test.name}:`);
+    console.log(`  Declared: ${result.declaredEncoding}`);
+    console.log(`  Expected: ${test.expectedEncoding}`);
+    console.log(`  ${result.matches ? '✓' : '✗'} Declaration ${result.matches ? 'matches' : 'mismatch'}`);
+  }
 });

+tap.test('PARSE-03: BOM (Byte Order Mark) detection', async () => {
+  // Each case pairs a BOM with the encoding it announces. The XML body is
+  // serialised in that same encoding so the byte stream is a consistent
+  // document rather than a UTF-16 BOM glued onto UTF-8 bytes.
+  const bomTests = [
+    {
+      name: 'UTF-8 with BOM',
+      bom: Buffer.from([0xEF, 0xBB, 0xBF]),
+      encoding: 'UTF-8',
+      xml: '<?xml version="1.0" encoding="UTF-8"?><invoice><id>TEST-005</id></invoice>'
+    },
+    {
+      name: 'UTF-16 LE BOM',
+      bom: Buffer.from([0xFF, 0xFE]),
+      encoding: 'UTF-16LE',
+      xml: '<?xml version="1.0" encoding="UTF-16"?><invoice><id>TEST-006</id></invoice>'
+    },
+    {
+      name: 'UTF-16 BE BOM',
+      bom: Buffer.from([0xFE, 0xFF]),
+      encoding: 'UTF-16BE',
+      xml: '<?xml version="1.0" encoding="UTF-16"?><invoice><id>TEST-007</id></invoice>'
+    }
+  ];
+
+  for (const test of bomTests) {
+    const isUtf16 = test.encoding.startsWith('UTF-16');
+    const isBigEndian = test.encoding === 'UTF-16BE';
+
+    // Buffer has no big-endian UTF-16 encoding, so encode little-endian and
+    // swap every byte pair when the case asks for UTF-16BE.
+    const body = Buffer.from(test.xml, isUtf16 ? 'utf16le' : 'utf8');
+    if (isBigEndian) {
+      body.swap16();
+    }
+    const xmlWithBom = Buffer.concat([test.bom, body]);
+
+    // Decode with the encoding the BOM declares. Decoding UTF-16 bytes as
+    // UTF-8 would turn the BOM into U+FFFD replacement characters, and the
+    // parser would never see a real byte order mark (U+FEFF).
+    // swap16() mutates in place, hence the copy for the big-endian case.
+    const decodable = isBigEndian ? Buffer.from(xmlWithBom).swap16() : xmlWithBom;
+    const xmlString = decodable.toString(isUtf16 ? 'utf16le' : 'utf8');
+
+    const { result } = await PerformanceTracker.track(
+      'bom-detection',
+      async () => {
+        const invoice = new einvoice.EInvoice();
+
+        try {
+          // Try parsing with the leading BOM character still in place
+          await invoice.fromXmlString(xmlString);
+          return { success: true, parsed: true };
+        } catch (error) {
+          // Thrown values are not guaranteed to be Error instances
+          const message = error instanceof Error ? error.message : String(error);
+          const lowered = message.toLowerCase();
+          return {
+            success: false,
+            error: message,
+            // Check if it's an encoding issue
+            encodingError: lowered.includes('encoding') || lowered.includes('utf')
+          };
+        }
+      }
+    );
+
+    console.log(`${test.name}: ${result.parsed ? '✓' : '✗'}`);
+    if (!result.parsed) {
+      console.log(`  Error: ${result.error}`);
+      if (result.encodingError) {
+        console.log(`  Likely encoding issue detected`);
+      }
+    }
+  }
+});
+
+tap.test('PARSE-03: Special character handling', async () => {
+  // Every case places non-ASCII text inside a UBL <cbc:Note> element and
+  // records in `expectedChars` the exact note text compared after parsing.
+  const characterCases = [
+    {
+      name: 'German umlauts',
+      xml: `<?xml version="1.0" encoding="UTF-8"?>
+<ubl:Invoice xmlns:ubl="urn:oasis:names:specification:ubl:schema:xsd:Invoice-2" xmlns:cbc="urn:oasis:names:specification:ubl:schema:xsd:CommonBasicComponents-2">
+  <cbc:ID>UMLAUT-TEST</cbc:ID>
+  <cbc:Note>Müller, Schäfer, Köln, Größe</cbc:Note>
+</ubl:Invoice>`,
+      chars: 'üäöß',
+      expectedChars: 'Müller, Schäfer, Köln, Größe'
+    },
+    {
+      name: 'French accents',
+      xml: `<?xml version="1.0" encoding="UTF-8"?>
+<ubl:Invoice xmlns:ubl="urn:oasis:names:specification:ubl:schema:xsd:Invoice-2" xmlns:cbc="urn:oasis:names:specification:ubl:schema:xsd:CommonBasicComponents-2">
+  <cbc:ID>ACCENT-TEST</cbc:ID>
+  <cbc:Note>Café, naïve, façade, à côté</cbc:Note>
+</ubl:Invoice>`,
+      chars: 'éèêëàçï',
+      expectedChars: 'Café, naïve, façade, à côté'
+    },
+    {
+      name: 'Currency symbols',
+      xml: `<?xml version="1.0" encoding="UTF-8"?>
+<ubl:Invoice xmlns:ubl="urn:oasis:names:specification:ubl:schema:xsd:Invoice-2" xmlns:cbc="urn:oasis:names:specification:ubl:schema:xsd:CommonBasicComponents-2">
+  <cbc:ID>CURRENCY-TEST</cbc:ID>
+  <cbc:Note>€ 100, £ 50, ¥ 1000, $ 75</cbc:Note>
+</ubl:Invoice>`,
+      chars: '€£¥$',
+      expectedChars: '€ 100, £ 50, ¥ 1000, $ 75'
+    },
+    {
+      name: 'Emoji and Unicode',
+      xml: `<?xml version="1.0" encoding="UTF-8"?>
+<ubl:Invoice xmlns:ubl="urn:oasis:names:specification:ubl:schema:xsd:Invoice-2" xmlns:cbc="urn:oasis:names:specification:ubl:schema:xsd:CommonBasicComponents-2">
+  <cbc:ID>UNICODE-TEST</cbc:ID>
+  <cbc:Note>Invoice 📄 Payment 💰 Delivered 📦</cbc:Note>
+</ubl:Invoice>`,
+      chars: '📄💰📦',
+      expectedChars: 'Invoice 📄 Payment 💰 Delivered 📦'
+    }
+  ];
+
+  // Parses one case and reports whether the first note equals the expected text.
+  for (const characterCase of characterCases) {
+    const tracked = await PerformanceTracker.track('special-chars', async () => {
+      const invoice = new einvoice.EInvoice();
+
+      try {
+        await invoice.fromXmlString(characterCase.xml);
+        const parsedNotes = invoice.notes;
+        return {
+          success: true,
+          notes: parsedNotes,
+          preserved: parsedNotes && parsedNotes[0] === characterCase.expectedChars
+        };
+      } catch (error) {
+        return { success: false, error: error.message };
+      }
+    });
+    const outcome = tracked.result;
+
+    const statusMark = outcome.success ? '✓' : '✗';
+    console.log(`${characterCase.name}: ${statusMark}`);
+
+    // Nothing more to report when parsing failed or produced no notes.
+    if (!outcome.success || !outcome.notes) {
+      continue;
+    }
+
+    const verdict = outcome.preserved ? 'preserved' : 'not preserved';
+    console.log(`  Characters ${verdict}`);
+    if (outcome.notes[0]) {
+      console.log(`  Content: ${outcome.notes[0]}`);
+    }
+  }
+});
+
+tap.test('PARSE-03: XML entities and escaping', async () => {
+  // Each case holds an escaped or CDATA-wrapped note together with the
+  // unescaped text it is compared against after parsing.
+  const escapingCases = [
+    {
+      name: 'Basic XML entities',
+      xml: `<?xml version="1.0" encoding="UTF-8"?>
+<ubl:Invoice xmlns:ubl="urn:oasis:names:specification:ubl:schema:xsd:Invoice-2" xmlns:cbc="urn:oasis:names:specification:ubl:schema:xsd:CommonBasicComponents-2">
+  <cbc:ID>ENTITY-TEST-1</cbc:ID>
+  <cbc:Note>Less than &lt; Greater than &gt; Ampersand &amp; Quote &quot; Apostrophe &apos;</cbc:Note>
+</ubl:Invoice>`,
+      expected: 'Less than < Greater than > Ampersand & Quote " Apostrophe \''
+    },
+    {
+      name: 'Numeric entities',
+      xml: `<?xml version="1.0" encoding="UTF-8"?>
+<ubl:Invoice xmlns:ubl="urn:oasis:names:specification:ubl:schema:xsd:Invoice-2" xmlns:cbc="urn:oasis:names:specification:ubl:schema:xsd:CommonBasicComponents-2">
+  <cbc:ID>ENTITY-TEST-2</cbc:ID>
+  <cbc:Note>Euro &#8364; Copyright &#169; Registered &#174;</cbc:Note>
+</ubl:Invoice>`,
+      expected: 'Euro € Copyright © Registered ®'
+    },
+    {
+      name: 'CDATA sections',
+      xml: `<?xml version="1.0" encoding="UTF-8"?>
+<ubl:Invoice xmlns:ubl="urn:oasis:names:specification:ubl:schema:xsd:Invoice-2" xmlns:cbc="urn:oasis:names:specification:ubl:schema:xsd:CommonBasicComponents-2">
+  <cbc:ID>CDATA-TEST</cbc:ID>
+  <cbc:Note><![CDATA[HTML content: <p>Price > 100 & quantity < 50</p>]]></cbc:Note>
+</ubl:Invoice>`,
+      expected: 'HTML content: <p>Price > 100 & quantity < 50</p>'
+    }
+  ];
+
+  for (const escapingCase of escapingCases) {
+    const tracked = await PerformanceTracker.track('entity-handling', async () => {
+      const invoice = new einvoice.EInvoice();
+
+      try {
+        await invoice.fromXmlString(escapingCase.xml);
+        const parsedNotes = invoice.notes;
+        return {
+          success: true,
+          notes: parsedNotes,
+          correct: parsedNotes && parsedNotes[0] === escapingCase.expected
+        };
+      } catch (error) {
+        return { success: false, error: error.message };
+      }
+    });
+    const outcome = tracked.result;
+
+    // A tick requires both a successful parse and an exact text match.
+    const passed = outcome.success && outcome.correct;
+    console.log(`${escapingCase.name}: ${passed ? '✓' : '✗'}`);
+
+    if (!outcome.success || !outcome.notes) {
+      continue;
+    }
+
+    console.log(`  Expected: ${escapingCase.expected}`);
+    console.log(`  Got: ${outcome.notes[0] || '(empty)'}`);
+  }
+});
+
+tap.test('PARSE-03: Mixed encoding scenarios', async () => {
+  // Scenarios modelled on real-world encoding problems. The inputs are
+  // JavaScript strings, i.e. already decoded text, so they show what such
+  // documents look like once decoded rather than reproducing raw bytes.
+  const scenarios = [
+    {
+      name: 'Mislabeled encoding',
+      // Declares UTF-8 and carries a character that legacy ISO-8859-1
+      // documents commonly contain
+      xml: '<?xml version="1.0" encoding="UTF-8"?><invoice><supplier>Müller GmbH</supplier></invoice>',
+      issue: 'Declared UTF-8 but might have ISO-8859-1 content'
+    },
+    {
+      name: 'Double-encoded UTF-8',
+      // Text as it appears after UTF-8 bytes were re-encoded as UTF-8
+      xml: '<?xml version="1.0" encoding="UTF-8"?><invoice><note>MÃ¼ller</note></invoice>',
+      issue: 'Possible double UTF-8 encoding'
+    },
+    {
+      name: 'Mixed line endings with special chars',
+      xml: '<?xml version="1.0" encoding="UTF-8"?>\r\n<invoice>\n<note>Special–chars</note>\r</invoice>',
+      issue: 'Mixed CRLF/LF with special characters'
+    }
+  ];
+
+  for (const scenario of scenarios) {
+    const { result } = await PerformanceTracker.track(
+      'mixed-encoding',
+      async () => {
+        const invoice = new einvoice.EInvoice();
+
+        try {
+          await invoice.fromXmlString(scenario.xml);
+          return { success: true, handled: true };
+        } catch (error) {
+          // Thrown values are not guaranteed to be Error instances
+          const message = error instanceof Error ? error.message : String(error);
+          // Compare case-insensitively, as the BOM test does, so messages
+          // such as "Invalid Character" or "Encoding mismatch" are still
+          // classified as encoding errors
+          const lowered = message.toLowerCase();
+          return {
+            success: false,
+            error: message,
+            isEncodingError: lowered.includes('encoding') ||
+                            lowered.includes('character')
+          };
+        }
+      }
+    );
+
+    // A scenario counts as handled unless the parser failed with an
+    // encoding-related error; other failures are reported separately below
+    console.log(`${scenario.name}: ${result.handled || !result.isEncodingError ? '✓' : '✗'}`);
+    console.log(`  Issue: ${scenario.issue}`);
+    if (!result.success) {
+      console.log(`  Result: ${result.isEncodingError ? 'Encoding error' : 'Other error'}`);
+    }
+  }
+});
+
+tap.test('PARSE-03: Encoding performance', async () => {
+  // Reads the metrics stored under the 'encoding-detection' key
+  const stats = PerformanceTracker.getStats('encoding-detection');
+
+  if (!stats) {
+    // Make the skipped check visible instead of passing silently
+    console.log('\nEncoding Detection Performance: no metrics recorded, threshold check skipped');
+    return;
+  }
+
+  console.log('\nEncoding Detection Performance:');
+  console.log(`  Total operations: ${stats.count}`);
+  console.log(`  Average time: ${stats.avg.toFixed(2)}ms`);
+  console.log(`  Max time: ${stats.max.toFixed(2)}ms`);
+
+  // Encoding detection should be fast
+  expect(stats.avg).toBeLessThan(5); // Should detect encoding in < 5ms on average
+});
+
+// Run the tests
 tap.start();