import { expect, tap } from '@git.zone/tstest/tapbundle';
import * as einvoice from '../../../ts/index.js';
import * as plugins from '../../plugins.js';
import { CorpusLoader } from '../../helpers/corpus.loader.js';
import { PerformanceTracker } from '../../helpers/performance.tracker.js';

/** Matches the `encoding="..."` pseudo-attribute of an XML declaration. */
const ENCODING_DECLARATION = /encoding=["']([^"']+)["']/i;

/**
 * Known byte-order marks. The 4-byte UTF-32 signatures are listed before the
 * 2-byte UTF-16 ones because the UTF-32LE mark starts with the UTF-16LE mark.
 */
const BOM_SIGNATURES: ReadonlyArray<{ encoding: string; bytes: readonly number[] }> = [
  { encoding: 'UTF-8', bytes: [0xEF, 0xBB, 0xBF] },
  { encoding: 'UTF-32LE', bytes: [0xFF, 0xFE, 0x00, 0x00] },
  { encoding: 'UTF-32BE', bytes: [0x00, 0x00, 0xFE, 0xFF] },
  { encoding: 'UTF-16LE', bytes: [0xFF, 0xFE] },
  { encoding: 'UTF-16BE', bytes: [0xFE, 0xFF] }
];

/**
 * Identifies a byte-order mark at the start of `buffer`.
 *
 * @param buffer - Raw document bytes; may be empty or shorter than any BOM.
 * @returns The encoding implied by the BOM and the BOM's byte length, or
 *   `null` when the buffer does not start with a known BOM.
 */
function detectBom(buffer: Uint8Array): { encoding: string; length: number } | null {
  for (const { encoding, bytes } of BOM_SIGNATURES) {
    if (buffer.length >= bytes.length && bytes.every((byte, index) => buffer[index] === byte)) {
      return { encoding, length: bytes.length };
    }
  }
  return null;
}

/**
 * Extracts the encoding named in an XML declaration.
 *
 * @param text - Leading portion of a document, already decoded to a string.
 * @returns The declared encoding in upper case, or `null` if none is declared.
 */
function declaredEncodingOf(text: string): string | null {
  const match = ENCODING_DECLARATION.exec(text);
  return match?.[1]?.toUpperCase() ?? null;
}

/** Renders a caught value as a message without assuming it is an `Error`. */
function errorMessage(error: unknown): string {
  return error instanceof Error ? error.message : String(error);
}

/** Shape of a fixture in the multi-encoding sub-test. */
interface MultiEncodingCase {
  name: string;
  content: Buffer | string;
  declared?: string;
  actual?: string;
}

tap.test('PARSE-03: Character Encoding Detection - Detect and handle various character encodings', async (t) => {
  const performanceTracker = new PerformanceTracker('PARSE-03');

  await t.test('Encoding declaration detection', async () => {
    performanceTracker.startOperation('declaration-detection');

    // Each fixture carries the XML declaration its name describes, so the
    // declaration regex has something to match.
    const encodingTests = [
      {
        name: 'UTF-8 declaration',
        xml: '<?xml version="1.0" encoding="UTF-8"?>\n<invoice><id>TEST-001</id></invoice>',
        expectedEncoding: 'UTF-8',
        actualEncoding: 'UTF-8'
      },
      {
        name: 'UTF-16 declaration',
        xml: '<?xml version="1.0" encoding="UTF-16"?>\n<invoice><id>TEST-002</id></invoice>',
        expectedEncoding: 'UTF-16',
        actualEncoding: 'UTF-8' // Mismatch test
      },
      {
        name: 'ISO-8859-1 declaration',
        xml: '<?xml version="1.0" encoding="ISO-8859-1"?>\n<invoice><name>Müller</name></invoice>',
        expectedEncoding: 'ISO-8859-1',
        actualEncoding: 'ISO-8859-1'
      },
      {
        name: 'Windows-1252 declaration',
        xml: '<?xml version="1.0" encoding="Windows-1252"?>\n<invoice><note>Special – chars</note></invoice>',
        expectedEncoding: 'Windows-1252',
        actualEncoding: 'Windows-1252'
      },
      {
        // Lower-case label: detection must be case-insensitive.
        name: 'Case variations',
        xml: '<?xml version="1.0" encoding="utf-8"?>\n<invoice><id>TEST-003</id></invoice>',
        expectedEncoding: 'UTF-8',
        actualEncoding: 'UTF-8'
      },
      {
        name: 'No encoding declaration',
        xml: '<?xml version="1.0"?>\n<invoice><id>TEST-004</id></invoice>',
        expectedEncoding: 'UTF-8', // Default
        actualEncoding: 'UTF-8'
      }
    ];

    // Compare labels ignoring hyphens and case ("utf8" vs "UTF-8").
    const normalizeLabel = (label: string): string => label.replace(/-/g, '').toUpperCase();

    for (const test of encodingTests) {
      const startTime = performance.now();

      // Extract declared encoding; XML defaults to UTF-8 when none is declared.
      const declaredEncoding = declaredEncodingOf(test.xml) ?? 'UTF-8';

      console.log(`${test.name}:`);
      console.log(` Declared: ${declaredEncoding}`);
      console.log(` Expected: ${test.expectedEncoding}`);

      if (normalizeLabel(declaredEncoding) === normalizeLabel(test.expectedEncoding)) {
        console.log(' ✓ Declaration matches expected encoding');
      } else {
        console.log(' ✗ Declaration mismatch');
      }

      performanceTracker.recordMetric('encoding-detection', performance.now() - startTime);

      // Pure string logic, so a mismatch is a real failure rather than a log line.
      expect(normalizeLabel(declaredEncoding)).toEqual(normalizeLabel(test.expectedEncoding));
    }

    performanceTracker.endOperation('declaration-detection');
  });

  await t.test('BOM (Byte Order Mark) detection', async () => {
    performanceTracker.startOperation('bom-detection');

    const bomTests = [
      { name: 'UTF-8 with BOM', bom: Buffer.from([0xEF, 0xBB, 0xBF]), encoding: 'UTF-8', xml: 'TEST-005' },
      { name: 'UTF-16 LE BOM', bom: Buffer.from([0xFF, 0xFE]), encoding: 'UTF-16LE', xml: 'TEST-006' },
      { name: 'UTF-16 BE BOM', bom: Buffer.from([0xFE, 0xFF]), encoding: 'UTF-16BE', xml: 'TEST-007' },
      { name: 'UTF-32 LE BOM', bom: Buffer.from([0xFF, 0xFE, 0x00, 0x00]), encoding: 'UTF-32LE', xml: 'TEST-008' },
      { name: 'UTF-32 BE BOM', bom: Buffer.from([0x00, 0x00, 0xFE, 0xFF]), encoding: 'UTF-32BE', xml: 'TEST-009' },
      { name: 'No BOM', bom: Buffer.from([]), encoding: 'UTF-8', xml: 'TEST-010' }
    ];

    for (const test of bomTests) {
      const startTime = performance.now();

      // Create buffer with BOM
      const xmlBuffer = Buffer.from(test.xml, 'utf8');
      const fullBuffer = Buffer.concat([test.bom, xmlBuffer]);

      // Detect BOM; without one the document is treated as UTF-8.
      const detectedEncoding = detectBom(fullBuffer)?.encoding ?? 'UTF-8';

      const bomBytes = test.bom.length > 0
        ? Array.from(test.bom).map(b => '0x' + b.toString(16).toUpperCase()).join(' ')
        : 'None';

      console.log(`${test.name}:`);
      console.log(` BOM bytes: ${bomBytes}`);
      console.log(` Expected: ${test.encoding}`);
      console.log(` Detected: ${detectedEncoding}`);

      if (detectedEncoding === test.encoding || (test.bom.length === 0 && detectedEncoding === 'UTF-8')) {
        console.log(' ✓ BOM detection correct');
      } else {
        console.log(' ✗ BOM detection failed');
      }

      performanceTracker.recordMetric('bom-detection', performance.now() - startTime);

      expect(detectedEncoding).toEqual(test.encoding);
    }

    performanceTracker.endOperation('bom-detection');
  });

  await t.test('Heuristic encoding detection', async () => {
    performanceTracker.startOperation('heuristic-detection');

    /**
     * Guesses a document's encoding by trying, in order: the BOM, the XML
     * declaration, and a byte-statistics heuristic.
     */
    class EncodingDetector {
      /**
       * @param buffer - Raw document bytes.
       * @returns The detected encoding, a confidence percentage, and the
       *   method that produced the result.
       */
      detectEncoding(buffer: Buffer): { encoding: string; confidence: number; method: string } {
        // Check for BOM first
        const bomResult = this.checkBOM(buffer);
        if (bomResult) {
          return { ...bomResult, confidence: 100, method: 'BOM' };
        }

        // Check XML declaration
        const declResult = this.checkXmlDeclaration(buffer);
        if (declResult) {
          return { ...declResult, confidence: 90, method: 'XML Declaration' };
        }

        // Heuristic checks
        const heuristicResult = this.heuristicCheck(buffer);
        return { ...heuristicResult, method: 'Heuristic' };
      }

      /** Returns the encoding implied by a leading BOM, or `null` without one. */
      private checkBOM(buffer: Buffer): { encoding: string } | null {
        const bom = detectBom(buffer);
        return bom ? { encoding: bom.encoding } : null;
      }

      /** Returns the encoding declared in the first 100 bytes, or `null`. */
      private checkXmlDeclaration(buffer: Buffer): { encoding: string } | null {
        // Look for encoding in first 100 bytes
        const sample = buffer.toString('ascii', 0, Math.min(100, buffer.length));
        const encoding = declaredEncodingOf(sample);
        return encoding ? { encoding } : null;
      }

      /**
       * Classifies up to the first 1000 bytes by their null-byte share and
       * by whether they form valid UTF-8.
       */
      private heuristicCheck(buffer: Buffer): { encoding: string; confidence: number } {
        const sampleSize = Math.min(1000, buffer.length);
        const sample = buffer.subarray(0, sampleSize);

        let nullBytes = 0;
        let highBytes = 0;
        for (const byte of sample) {
          if (byte === 0) nullBytes++;
          else if (byte > 127) highBytes++;
        }

        // Many null bytes indicate UTF-16/32 text.
        if (nullBytes > sampleSize * 0.3) {
          return { encoding: 'UTF-16', confidence: 70 };
        }

        // Pure ASCII (or empty) input: nothing distinguishes it, default to UTF-8.
        if (highBytes === 0) {
          return { encoding: 'UTF-8', confidence: 50 }; // Default
        }

        if (this.isValidUtf8(sample, sampleSize < buffer.length)) {
          return { encoding: 'UTF-8', confidence: 85 };
        }

        // High bytes that do not form valid UTF-8: assume a single-byte legacy encoding.
        return { encoding: 'ISO-8859-1', confidence: 60 };
      }

      /**
       * Strictly validates `sample` as UTF-8.
       *
       * @param sample - Bytes to validate.
       * @param truncated - True when `sample` is a prefix of a longer buffer;
       *   a multi-byte sequence cut off at the end is then not an error.
       */
      private isValidUtf8(sample: Uint8Array, truncated: boolean): boolean {
        try {
          new TextDecoder('utf-8', { fatal: true }).decode(sample, { stream: truncated });
          return true;
        } catch {
          return false;
        }
      }
    }

    const detector = new EncodingDetector();

    const testBuffers = [
      { name: 'Pure ASCII', content: Buffer.from('TEST-011') },
      { name: 'UTF-8 with special chars', content: Buffer.from('Café €100') },
      {
        name: 'ISO-8859-1 content',
        content: Buffer.from([
          0x3C, 0x69, 0x6E, 0x76, 0x6F, 0x69, 0x63, 0x65, 0x3E, // <invoice>
          0x3C, 0x6E, 0x61, 0x6D, 0x65, 0x3E, // <name>
          0xC4, 0xD6, 0xDC, // ÄÖÜ in ISO-8859-1
          0x3C, 0x2F, 0x6E, 0x61, 0x6D, 0x65, 0x3E, // </name>
          0x3C, 0x2F, 0x69, 0x6E, 0x76, 0x6F, 0x69, 0x63, 0x65, 0x3E // </invoice>
        ])
      },
      { name: 'UTF-16 with nulls', content: Buffer.from('invoice', 'utf16le') }
    ];

    for (const test of testBuffers) {
      const result = detector.detectEncoding(test.content);

      console.log(`${test.name}:`);
      console.log(` Detected: ${result.encoding}`);
      console.log(` Confidence: ${result.confidence}%`);
      console.log(` Method: ${result.method}`);
    }

    performanceTracker.endOperation('heuristic-detection');
  });

  await t.test('Multi-encoding document handling', async () => {
    performanceTracker.startOperation('multi-encoding');

    // NOTE(review): the two string fixtures below contain no markup although
    // they are handed to the XML parser; presumably attribute / entity-encoded
    // XML was intended — confirm and restore the intended documents.
    const multiEncodingTests: MultiEncodingCase[] = [
      {
        name: 'Declaration vs actual mismatch',
        declared: 'UTF-8',
        actual: 'ISO-8859-1',
        content: Buffer.from([
          // <?xml version="1.0" encoding="UTF-8"?>
          0x3C, 0x3F, 0x78, 0x6D, 0x6C, 0x20, 0x76, 0x65, 0x72, 0x73, 0x69, 0x6F, 0x6E, 0x3D,
          0x22, 0x31, 0x2E, 0x30, 0x22, 0x20, 0x65, 0x6E, 0x63, 0x6F, 0x64, 0x69, 0x6E, 0x67,
          0x3D, 0x22, 0x55, 0x54, 0x46, 0x2D, 0x38, 0x22, 0x3F, 0x3E,
          // <invoice><name>
          0x3C, 0x69, 0x6E, 0x76, 0x6F, 0x69, 0x63, 0x65, 0x3E, 0x3C, 0x6E, 0x61, 0x6D, 0x65, 0x3E,
          // Müller in ISO-8859-1
          0x4D, 0xFC, 0x6C, 0x6C, 0x65, 0x72,
          // </name></invoice>
          0x3C, 0x2F, 0x6E, 0x61, 0x6D, 0x65, 0x3E, 0x3C, 0x2F, 0x69, 0x6E, 0x76, 0x6F, 0x69, 0x63, 0x65, 0x3E
        ])
      },
      {
        name: 'Mixed encoding in attributes',
        content: ` 100.00 `
      },
      {
        name: 'Entity-encoded special chars',
        content: ` Müller €100 `
      }
    ];

    for (const test of multiEncodingTests) {
      const startTime = performance.now();

      console.log(`${test.name}:`);

      if (test.declared && test.actual) {
        console.log(` Declared: ${test.declared}`);
        console.log(` Actual: ${test.actual}`);
        console.log(` ⚠️ Encoding mismatch detected`);
      }

      // Best-effort: a parse failure is reported, not treated as a test failure.
      try {
        const invoice = new einvoice.EInvoice();
        const content = test.content;

        if (invoice.fromXmlString && typeof content === 'string') {
          await invoice.fromXmlString(content);
          console.log(' ✓ Parsed successfully');
        } else if (invoice.fromBuffer && content instanceof Buffer) {
          await invoice.fromBuffer(content);
          console.log(' ✓ Parsed from buffer');
        }
      } catch (error: unknown) {
        console.log(` ✗ Parse error: ${errorMessage(error)}`);
      }

      performanceTracker.recordMetric('multi-encoding', performance.now() - startTime);
    }

    performanceTracker.endOperation('multi-encoding');
  });

  await t.test('Corpus encoding analysis', async () => {
    performanceTracker.startOperation('corpus-encoding');

    const corpusLoader = new CorpusLoader();
    const xmlFiles = await corpusLoader.getFiles(/\.xml$/);

    console.log(`\nAnalyzing encodings in ${xmlFiles.length} corpus files...`);

    const encodingStats = {
      total: 0,
      byDeclaration: new Map<string, number>(),
      byBOM: { withBOM: 0, withoutBOM: 0 },
      conflicts: 0,
      errors: 0
    };

    // Only the first 100 files are inspected.
    const sampleSize = Math.min(100, xmlFiles.length);
    const sampledFiles = xmlFiles.slice(0, sampleSize);

    for (const file of sampledFiles) {
      encodingStats.total++;

      try {
        const buffer = await plugins.fs.readFile(file.path);

        // Check for BOM (any of the known Unicode signatures)
        if (detectBom(buffer)) {
          encodingStats.byBOM.withBOM++;
        } else {
          encodingStats.byBOM.withoutBOM++;
        }

        // Check declaration in the first 200 bytes
        const sample = buffer.toString('utf8', 0, Math.min(200, buffer.length));
        const encoding = declaredEncodingOf(sample) ?? 'NONE';
        encodingStats.byDeclaration.set(
          encoding,
          (encodingStats.byDeclaration.get(encoding) ?? 0) + 1
        );
      } catch {
        // Unreadable files are counted and reported in the summary below.
        encodingStats.errors++;
      }
    }

    console.log('\nEncoding Statistics:');
    console.log(`Total files analyzed: ${encodingStats.total}`);
    console.log(`Files with BOM: ${encodingStats.byBOM.withBOM}`);
    console.log(`Files without BOM: ${encodingStats.byBOM.withoutBOM}`);

    console.log('\nDeclared encodings:');
    const sortedEncodings = Array.from(encodingStats.byDeclaration.entries())
      .sort((a, b) => b[1] - a[1]);

    for (const [encoding, count] of sortedEncodings) {
      const percentage = (count / encodingStats.total * 100).toFixed(1);
      console.log(` ${encoding}: ${count} (${percentage}%)`);
    }

    console.log(`\nRead errors: ${encodingStats.errors}`);

    performanceTracker.endOperation('corpus-encoding');
  });

  await t.test('Encoding conversion and normalization', async () => {
    performanceTracker.startOperation('encoding-conversion');

    /** Re-encodes documents as BOM-less UTF-8. */
    class EncodingNormalizer {
      /**
       * Converts `buffer` to UTF-8 and rewrites its encoding declaration.
       *
       * @param buffer - Raw document bytes.
       * @param sourceEncoding - Encoding of `buffer`; detected from the BOM
       *   or XML declaration when omitted or empty.
       * @returns UTF-8 bytes. UTF-8 input is returned as-is apart from a
       *   stripped BOM; the result may share memory with `buffer`.
       * @throws Error when the source encoding cannot be decoded.
       */
      async normalizeToUTF8(buffer: Buffer, sourceEncoding?: string): Promise<Buffer> {
        // `||` rather than `??`: an empty string also means "detect it".
        const encoding = (sourceEncoding || this.detectSourceEncoding(buffer)).toUpperCase();

        // Skip if already UTF-8
        if (encoding === 'UTF-8') {
          // Just remove BOM if present
          const bom = detectBom(buffer);
          return bom?.encoding === 'UTF-8' ? buffer.subarray(bom.length) : buffer;
        }

        // Convert to UTF-8
        try {
          const decoder = new TextDecoder(encoding.toLowerCase());
          const text = decoder.decode(buffer);

          // Update encoding declaration
          const updatedText = text.replace(
            /encoding=["'][^"']+["']/i,
            'encoding="UTF-8"'
          );

          return Buffer.from(updatedText, 'utf8');
        } catch (error: unknown) {
          throw new Error(`Encoding conversion failed: ${errorMessage(error)}`);
        }
      }

      /** Picks the BOM encoding, else the declared one, else UTF-8. */
      private detectSourceEncoding(buffer: Buffer): string {
        const bom = detectBom(buffer);
        if (bom) {
          return bom.encoding;
        }

        const sample = buffer.toString('ascii', 0, Math.min(100, buffer.length));
        return declaredEncodingOf(sample) ?? 'UTF-8';
      }
    }

    const normalizer = new EncodingNormalizer();

    const conversionTests = [
      {
        name: 'UTF-8 with BOM to UTF-8 without BOM',
        input: Buffer.concat([
          Buffer.from([0xEF, 0xBB, 0xBF]),
          Buffer.from('TEST')
        ])
      },
      {
        // Declares its encoding so the normalizer takes the conversion path.
        name: 'ISO-8859-1 to UTF-8',
        input: Buffer.from(
          '<?xml version="1.0" encoding="ISO-8859-1"?>\n<invoice><name>Test</name></invoice>',
          'latin1'
        )
      }
    ];

    for (const test of conversionTests) {
      const startTime = performance.now();

      try {
        const normalized = await normalizer.normalizeToUTF8(test.input);

        console.log(`${test.name}:`);
        console.log(` Input size: ${test.input.length} bytes`);
        console.log(` Output size: ${normalized.length} bytes`);
        console.log(` ✓ Conversion successful`);

        // Verify no BOM in output
        if (detectBom(normalized)?.encoding === 'UTF-8') {
          console.log(' ✗ BOM still present in output');
        } else {
          console.log(' ✓ BOM removed');
        }
      } catch (error: unknown) {
        console.log(`${test.name}: ✗ Conversion failed - ${errorMessage(error)}`);
      }

      performanceTracker.recordMetric('encoding-conversion', performance.now() - startTime);
    }

    performanceTracker.endOperation('encoding-conversion');
  });

  // Performance summary
  console.log('\n' + performanceTracker.getSummary());

  // Encoding detection best practices
  console.log('\nCharacter Encoding Detection Best Practices:');
  console.log('1. Always check for BOM before parsing');
  console.log('2. Verify declared encoding matches actual encoding');
  console.log('3. Use heuristics when declaration is missing');
  console.log('4. Handle encoding mismatches gracefully');
  console.log('5. Normalize to UTF-8 for consistent processing');
  console.log('6. Preserve original encoding information for round-trip');
  console.log('7. Support common legacy encodings (ISO-8859-1, Windows-1252)');
  console.log('8. Test with real-world data that includes various encodings');
});

tap.start();