import { expect, tap } from '@git.zone/tstest/tapbundle';
import * as einvoice from '../../../ts/index.js';
import * as plugins from '../../plugins.js';
import { PerformanceTracker } from '../../helpers/performance.tracker.js';
import { CorpusLoader } from '../../helpers/corpus.loader.js';

/**
 * Turns whatever was thrown into a printable message.
 * Catch variables are `unknown` under `strict`, so `.message` must not be read blindly.
 */
const describeError = (error: unknown): string =>
  error instanceof Error ? error.message : String(error);

/**
 * ERR-07: exploratory checks around character-encoding problems.
 *
 * Every sub-test only logs its outcome (✓ / ✗ / ⚠️) and records timings on the
 * PerformanceTracker; none of them asserts, so unexpected parser behaviour is
 * reported on the console rather than failing the run.
 *
 * NOTE(review): several string fixtures below look like XML documents whose markup
 * was stripped at some point (e.g. an empty `test: ''`). They are kept exactly as
 * found because their intended content cannot be recovered from this file.
 */
tap.test('ERR-07: Character Encoding Errors - Handle encoding issues and charset problems', async (t) => {
  const performanceTracker = new PerformanceTracker('ERR-07');

  // --- Sub-test 1: feed differently encoded inputs to EInvoice and log the outcome ---
  await t.test('Common encoding issues', async () => {
    performanceTracker.startOperation('encoding-issues');

    const encodingTests = [
      {
        name: 'UTF-8 with BOM',
        content: '\uFEFFTEST-001',
        expectedHandling: 'BOM removal',
        shouldParse: true
      },
      {
        name: 'Windows-1252 declared as UTF-8',
        content: Buffer.from([
          0x3C, 0x3F, 0x78, 0x6D, 0x6C, 0x20, // "<?xml "
          0x3C, 0x69, 0x6E, 0x76, 0x6F, 0x69, 0x63, 0x65, 0x3E, // "<invoice>"
          0x3C, 0x6E, 0x61, 0x6D, 0x65, 0x3E, // "<name>"
          0x4D, 0xFC, 0x6C, 0x6C, 0x65, 0x72, // Müller with Windows-1252 ü (0xFC)
          0x3C, 0x2F, 0x6E, 0x61, 0x6D, 0x65, 0x3E, // "</name>"
          0x3C, 0x2F, 0x69, 0x6E, 0x76, 0x6F, 0x69, 0x63, 0x65, 0x3E // "</invoice>"
        ]),
        expectedHandling: 'Encoding mismatch detection',
        shouldParse: false
      },
      {
        name: 'UTF-16 without BOM',
        content: Buffer.from('TEST', 'utf16le'),
        expectedHandling: 'UTF-16 detection',
        shouldParse: true
      },
      {
        name: 'Mixed encoding in same document',
        content: 'CaféMüller',
        expectedHandling: 'Mixed encoding handling',
        shouldParse: true
      },
      {
        name: 'Invalid UTF-8 sequences',
        content: Buffer.from([
          0x3C, 0x69, 0x6E, 0x76, 0x6F, 0x69, 0x63, 0x65, 0x3E, // "<invoice>"
          0xC3, 0x28, // Invalid UTF-8 sequence
          0x3C, 0x2F, 0x69, 0x6E, 0x76, 0x6F, 0x69, 0x63, 0x65, 0x3E // "</invoice>"
        ]),
        expectedHandling: 'Invalid UTF-8 sequence detection',
        shouldParse: false
      }
    ];

    for (const test of encodingTests) {
      const startTime = performance.now();

      try {
        const invoice = new einvoice.EInvoice();
        const content = test.content;

        // Dispatch on the fixture type; skip (without timing) if no loader fits.
        if (invoice.fromXmlString && typeof content === 'string') {
          await invoice.fromXmlString(content);
        } else if (invoice.fromBuffer && content instanceof Buffer) {
          await invoice.fromBuffer(content);
        } else {
          console.log(`⚠️ No suitable method for ${test.name}`);
          continue;
        }

        if (test.shouldParse) {
          console.log(`✓ ${test.name}: Successfully handled - ${test.expectedHandling}`);
        } else {
          console.log(`✗ ${test.name}: Parsed when it should have failed`);
        }
      } catch (error) {
        if (!test.shouldParse) {
          console.log(`✓ ${test.name}: Correctly rejected - ${describeError(error)}`);
        } else {
          console.log(`✗ ${test.name}: Failed to parse - ${describeError(error)}`);
        }
      }

      performanceTracker.recordMetric('encoding-test', performance.now() - startTime);
    }

    performanceTracker.endOperation('encoding-issues');
  });

  // --- Sub-test 2: a small BOM / declaration / heuristic charset sniffer ---
  await t.test('Character set detection', async () => {
    performanceTracker.startOperation('charset-detection');

    class CharsetDetector {
      /**
       * Guesses the encoding of `buffer`.
       * Order of checks: BOM, XML declaration (first 100 bytes), clean UTF-8 decode,
       * count of 0x80-0x9F bytes in the first 1000 bytes, then a UTF-8 default.
       * @returns the encoding label (upper-case) and a confidence from 50 to 100.
       */
      detectEncoding(buffer: Buffer): { encoding: string; confidence: number } {
        // Check for BOM
        if (buffer[0] === 0xEF && buffer[1] === 0xBB && buffer[2] === 0xBF) {
          return { encoding: 'UTF-8', confidence: 100 };
        }
        if (buffer[0] === 0xFF && buffer[1] === 0xFE) {
          return { encoding: 'UTF-16LE', confidence: 100 };
        }
        if (buffer[0] === 0xFE && buffer[1] === 0xFF) {
          return { encoding: 'UTF-16BE', confidence: 100 };
        }

        // Check XML declaration
        const xmlDeclMatch = buffer.toString('ascii', 0, 100).match(/encoding=["']([^"']+)["']/i);
        if (xmlDeclMatch) {
          return { encoding: xmlDeclMatch[1].toUpperCase(), confidence: 90 };
        }

        // Heuristic detection: Buffer#toString never throws on malformed input, it
        // substitutes U+FFFD, so the absence of that character means valid UTF-8.
        if (!buffer.toString('utf8').includes('\uFFFD')) {
          return { encoding: 'UTF-8', confidence: 80 };
        }

        // Check for common Windows-1252 characters (bytes in the 0x80-0x9F range)
        const scanLimit = Math.min(buffer.length, 1000);
        let windows1252Count = 0;
        for (let i = 0; i < scanLimit; i++) {
          if (buffer[i] >= 0x80 && buffer[i] <= 0x9F) {
            windows1252Count++;
          }
        }

        if (windows1252Count > 5) {
          return { encoding: 'WINDOWS-1252', confidence: 70 };
        }

        // Default
        return { encoding: 'UTF-8', confidence: 50 };
      }
    }

    const detector = new CharsetDetector();

    const testBuffers = [
      { name: 'UTF-8 with BOM', buffer: Buffer.from('\uFEFFHello') },
      { name: 'UTF-16LE', buffer: Buffer.from('\xFF\xFEHello', 'binary') },
      { name: 'Plain ASCII', buffer: Buffer.from('Hello') },
      // "Café €" as real Windows-1252 bytes. Building this from a JS string with the
      // 'binary' encoding would truncate U+20AC to 0xAC instead of producing 0x80.
      { name: 'Windows-1252', buffer: Buffer.from([0x43, 0x61, 0x66, 0xE9, 0x20, 0x80]) }
    ];

    for (const test of testBuffers) {
      const result = detector.detectEncoding(test.buffer);
      console.log(`${test.name}: Detected ${result.encoding} (confidence: ${result.confidence}%)`);
    }

    performanceTracker.endOperation('charset-detection');
  });

  // --- Sub-test 3: converting legacy encodings to UTF-8 and sanitizing XML text ---
  await t.test('Encoding conversion strategies', async () => {
    performanceTracker.startOperation('encoding-conversion');

    class EncodingConverter {
      /**
       * Re-encodes `buffer` from `sourceEncoding` to UTF-8.
       * WINDOWS-1252 is handled by hand; any other label is passed to TextDecoder.
       * @throws Error wrapping the underlying failure (e.g. an unsupported label).
       */
      async convertToUTF8(buffer: Buffer, sourceEncoding: string): Promise<Buffer> {
        try {
          // Try iconv-lite simulation
          if (sourceEncoding === 'WINDOWS-1252') {
            const result: number[] = [];
            for (const byte of buffer) {
              if (byte < 0x80) {
                result.push(byte);
              } else if (byte === 0x80) {
                // €
                result.push(0xE2, 0x82, 0xAC);
              } else if (byte >= 0xA0) {
                // 0xA0-0xFF are the same code points as U+00A0-U+00FF, which all
                // encode as two UTF-8 bytes (covers ü, ä, ö, é, ß, ...).
                result.push(0xC0 | (byte >> 6), 0x80 | (byte & 0x3F));
              } else {
                // Remaining 0x81-0x9F bytes: replace with question mark
                result.push(0x3F);
              }
            }
            return Buffer.from(result);
          }

          // For other encodings, attempt Node.js built-in conversion
          const decoder = new TextDecoder(sourceEncoding.toLowerCase());
          const text = decoder.decode(buffer);
          return Buffer.from(text, 'utf8');
        } catch (error) {
          throw new Error(`Failed to convert from ${sourceEncoding} to UTF-8: ${describeError(error)}`);
        }
      }

      /** Strips characters that must not appear in an XML document. */
      sanitizeXML(xmlString: string): string {
        // Remove invalid XML characters
        return xmlString
          .replace(/[\x00-\x08\x0B-\x0C\x0E-\x1F\x7F]/g, '') // Control characters
          .replace(/\uFEFF/g, '') // BOM
          .replace(/[\uD800-\uDBFF](?![\uDC00-\uDFFF])/g, '') // Unpaired surrogates
          // NOTE(review): this pattern was truncated after "(?" in the file; it is
          // reconstructed as the mirror case (low surrogate without a preceding high one).
          .replace(/(?<![\uD800-\uDBFF])[\uDC00-\uDFFF]/g, '');
      }
    }

    // NOTE(review): the code that originally exercised EncodingConverter was lost
    // together with the truncated pattern above. What follows is a minimal smoke
    // check so the class is used and the operation is closed - replace it with the
    // original cases if they can be recovered.
    const converter = new EncodingConverter();

    const conversionSamples = [
      {
        name: 'Windows-1252 umlaut',
        bytes: Buffer.from([0x4D, 0xFC, 0x6C, 0x6C, 0x65, 0x72]),
        encoding: 'WINDOWS-1252',
        expected: 'Müller'
      },
      {
        name: 'Windows-1252 euro sign',
        bytes: Buffer.from([0x31, 0x30, 0x20, 0x80]),
        encoding: 'WINDOWS-1252',
        expected: '10 €'
      },
      {
        name: 'UTF-16LE',
        bytes: Buffer.from('TEST', 'utf16le'),
        encoding: 'UTF-16LE',
        expected: 'TEST'
      }
    ];

    for (const sample of conversionSamples) {
      const startTime = performance.now();

      try {
        const converted = await converter.convertToUTF8(sample.bytes, sample.encoding);
        if (converted.toString('utf8') === sample.expected) {
          console.log(`✓ ${sample.name}: Converted to UTF-8`);
        } else {
          console.log(`✗ ${sample.name}: Unexpected conversion result`);
        }
      } catch (error) {
        console.log(`✗ ${sample.name}: Conversion failed - ${describeError(error)}`);
      }

      performanceTracker.recordMetric('conversion-test', performance.now() - startTime);
    }

    const sanitized = converter.sanitizeXML('\uFEFFTEST\x00001');
    console.log(`Sanitized sample: ${JSON.stringify(sanitized)}`);

    performanceTracker.endOperation('encoding-conversion');
  });

  // --- Sub-test 4: unusual but mostly legal characters inside invoice text ---
  // NOTE(review): the title of this sub-test was lost in the file; reconstructed
  // from the 'special-characters' operation name.
  await t.test('Special character handling', async () => {
    performanceTracker.startOperation('special-characters');

    const specialCharTests = [
      {
        name: 'Emoji in invoice',
        xml: 'Payment received 👍',
        shouldWork: true
      },
      {
        name: 'Zero-width characters',
        xml: 'TEST\u200B001',
        shouldWork: true
      },
      {
        name: 'Right-to-left text',
        xml: 'شركة الفواتير',
        shouldWork: true
      },
      {
        name: 'Control characters',
        xml: 'Line1\x00Line2',
        shouldWork: false
      },
      {
        name: 'Combining characters',
        // é as e + combining acute; written as an escape so the decomposed form
        // survives editors and tools that normalise to the precomposed character.
        xml: 'Jose\u0301',
        shouldWork: true
      }
    ];

    for (const test of specialCharTests) {
      const startTime = performance.now();

      try {
        const invoice = new einvoice.EInvoice();
        if (invoice.fromXmlString) {
          await invoice.fromXmlString(test.xml);

          if (test.shouldWork) {
            console.log(`✓ ${test.name}: Handled correctly`);
          } else {
            console.log(`✗ ${test.name}: Should have failed but didn't`);
          }
        } else {
          console.log(`⚠️ fromXmlString not implemented`);
        }
      } catch (error) {
        if (!test.shouldWork) {
          console.log(`✓ ${test.name}: Correctly rejected - ${describeError(error)}`);
        } else {
          console.log(`✗ ${test.name}: Failed unexpectedly - ${describeError(error)}`);
        }
      }

      performanceTracker.recordMetric('special-char-test', performance.now() - startTime);
    }

    performanceTracker.endOperation('special-characters');
  });

  // --- Sub-test 5: tally declared encodings across (up to 100) corpus XML files ---
  await t.test('Corpus encoding analysis', async () => {
    performanceTracker.startOperation('corpus-encoding');

    const corpusLoader = new CorpusLoader();
    const xmlFiles = await corpusLoader.getFiles(/\.xml$/);

    console.log(`\nAnalyzing encodings in ${xmlFiles.length} XML files...`);

    const encodingStats = {
      total: 0,
      utf8: 0,
      utf8WithBom: 0,
      utf16: 0,
      windows1252: 0,
      iso88591: 0,
      other: 0,
      noDeclaration: 0,
      errors: 0
    };

    const sampleSize = Math.min(100, xmlFiles.length);
    const sampledFiles = xmlFiles.slice(0, sampleSize);

    for (const file of sampledFiles) {
      encodingStats.total++;

      try {
        const buffer = await plugins.fs.readFile(file.path);
        // Only the head of the file is needed to find the XML declaration.
        const content = buffer.toString('utf8', 0, Math.min(200, buffer.length));

        // Check for BOM (counted in addition to whatever the declaration says)
        if (buffer[0] === 0xEF && buffer[1] === 0xBB && buffer[2] === 0xBF) {
          encodingStats.utf8WithBom++;
        }

        // Check XML declaration
        const encodingMatch = content.match(/encoding=["']([^"']+)["']/i);
        if (encodingMatch) {
          const encoding = encodingMatch[1].toUpperCase();

          switch (encoding) {
            case 'UTF-8':
              encodingStats.utf8++;
              break;
            case 'UTF-16':
            case 'UTF-16LE':
            case 'UTF-16BE':
              encodingStats.utf16++;
              break;
            case 'WINDOWS-1252':
            case 'CP1252':
              encodingStats.windows1252++;
              break;
            case 'ISO-8859-1':
            case 'LATIN1':
              encodingStats.iso88591++;
              break;
            default:
              encodingStats.other++;
              console.log(` Found unusual encoding: ${encoding} in ${file.name}`);
          }
        } else {
          encodingStats.noDeclaration++;
        }
      } catch (error) {
        // Unreadable files are counted rather than aborting the scan.
        encodingStats.errors++;
      }
    }

    console.log('\nEncoding Statistics:');
    console.log(`Total files analyzed: ${encodingStats.total}`);
    console.log(`UTF-8: ${encodingStats.utf8}`);
    console.log(`UTF-8 with BOM: ${encodingStats.utf8WithBom}`);
    console.log(`UTF-16: ${encodingStats.utf16}`);
    console.log(`Windows-1252: ${encodingStats.windows1252}`);
    console.log(`ISO-8859-1: ${encodingStats.iso88591}`);
    console.log(`Other encodings: ${encodingStats.other}`);
    console.log(`No encoding declaration: ${encodingStats.noDeclaration}`);
    console.log(`Read errors: ${encodingStats.errors}`);

    performanceTracker.endOperation('corpus-encoding');
  });

  // --- Sub-test 6: apply a text-level repair, then try to parse the result ---
  await t.test('Encoding error recovery', async () => {
    performanceTracker.startOperation('encoding-recovery');

    const recoveryStrategies = [
      {
        name: 'Remove BOM',
        apply: (content: string) => content.replace(/^\uFEFF/, ''),
        test: '\uFEFF'
      },
      {
        name: 'Fix encoding declaration',
        apply: (content: string) => {
          return content.replace(
            /encoding=["'][^"']*["']/i,
            'encoding="UTF-8"'
          );
        },
        test: ''
      },
      {
        name: 'Remove invalid characters',
        apply: (content: string) => {
          return content.replace(/[\x00-\x08\x0B-\x0C\x0E-\x1F\x7F]/g, '');
        },
        test: 'TEST\x00001'
      },
      {
        name: 'Normalize line endings',
        apply: (content: string) => {
          return content.replace(/\r\n/g, '\n').replace(/\r/g, '\n');
        },
        test: '\r\n\rTEST\r\n'
      },
      {
        name: 'HTML entity decode',
        apply: (content: string) => {
          // The patterns in the file replaced each character with itself (the entity
          // text had been decoded away). "&amp;" is handled last so that input such
          // as "&amp;lt;" becomes "&lt;" rather than being decoded twice into "<".
          return content
            .replace(/&lt;/g, '<')
            .replace(/&gt;/g, '>')
            .replace(/&quot;/g, '"')
            .replace(/&(?:apos|#39);/g, "'")
            .replace(/&amp;/g, '&');
        },
        test: 'Müller & Co.'
      }
    ];

    for (const strategy of recoveryStrategies) {
      const startTime = performance.now();

      try {
        const recovered = strategy.apply(strategy.test);
        const invoice = new einvoice.EInvoice();

        if (invoice.fromXmlString) {
          await invoice.fromXmlString(recovered);
          console.log(`✓ ${strategy.name}: Recovery successful`);
        } else {
          console.log(`⚠️ ${strategy.name}: Cannot test without fromXmlString`);
        }
      } catch (error) {
        console.log(`✗ ${strategy.name}: Recovery failed - ${describeError(error)}`);
      }

      performanceTracker.recordMetric('recovery-strategy', performance.now() - startTime);
    }

    performanceTracker.endOperation('encoding-recovery');
  });

  // Performance summary
  console.log('\n' + performanceTracker.getSummary());

  // Encoding error handling best practices
  console.log('\nCharacter Encoding Error Handling Best Practices:');
  console.log('1. Always detect encoding before parsing');
  console.log('2. Handle BOM (Byte Order Mark) correctly');
  console.log('3. Validate encoding declaration matches actual encoding');
  console.log('4. Sanitize invalid XML characters');
  console.log('5. Support common legacy encodings (Windows-1252, ISO-8859-1)');
  console.log('6. Provide clear error messages for encoding issues');
  console.log('7. Implement fallback strategies for recovery');
  console.log('8. Normalize text to prevent encoding-related security issues');
});

tap.start();