/**
 * @file test.conv-11.encoding-edge-cases.ts
 * @description Tests for character encoding edge cases and special scenarios during conversion
 */

import { tap } from '@git.zone/tstest/tapbundle';
import * as plugins from '../../plugins.js';
import { EInvoice } from '../../../ts/index.js';
import { CorpusLoader } from '../../suite/corpus.loader.js';
import { PerformanceTracker } from '../../suite/performance.tracker.js';

const corpusLoader = new CorpusLoader();
const performanceTracker = new PerformanceTracker('CONV-11: Character Encoding Edge Cases');

/** Outcome of one Unicode-normalization round trip (UBL -> CII -> UBL). */
interface NormalizationResult {
  testCase: string;
  preserved: boolean;
  original?: string;
  converted?: string;
  error?: string;
}

/** Outcome of one control/special-character round trip; `error: true` marks a failed conversion. */
type ControlCharResult =
  | {
      error?: false;
      originalLength: number;
      convertedLength: number;
      preserved: boolean;
      cleaned: boolean;
    }
  | { error: true; message: string };

/** Outcome of one step of the multi-language conversion chain. */
interface MultiLanguageResult {
  conversion: string;
  sellerNamePreserved?: boolean;
  buyerNamePreserved?: boolean;
  itemsPreserved?: boolean;
  allPreserved?: boolean;
  error?: string;
}

/**
 * Extracts a printable message from a caught value.
 * Catch variables are `unknown` under strict mode, so `.message` must not be read blindly.
 */
const describeError = (error: unknown): string =>
  error instanceof Error ? error.message : String(error);

/**
 * Formats `count / total` as a percentage with two decimals.
 * Returns '0.00%' for an empty total instead of 'NaN%'.
 */
const toPercentage = (count: number, total: number): string =>
  (total === 0 ? 0 : (count / total) * 100).toFixed(2) + '%';

/** Canonical (NFC) comparison of two strings. */
const canonicallyEqual = (a: string, b: string): boolean =>
  a.normalize('NFC') === b.normalize('NFC');

tap.test('CONV-11: Character Encoding - should handle encoding edge cases during conversion', async (t) => {
  // Test 1: Mixed encoding declarations
  const mixedEncodingDeclarations = await performanceTracker.measureAsync(
    'mixed-encoding-declarations',
    async () => {
      const einvoice = new EInvoice();
      const results = {
        utf8ToUtf16: false,
        utf16ToIso: false,
        isoToUtf8: false,
        bomHandling: false
      };

      // UTF-8 to UTF-16 conversion
      const utf8Invoice = {
        format: 'ubl' as const,
        encoding: 'UTF-8',
        data: {
          documentType: 'INVOICE',
          invoiceNumber: 'ENC-UTF8-2024-001',
          issueDate: '2024-01-28',
          seller: {
            name: 'UTF-8 Société Française €',
            address: 'Rue de la Paix № 42',
            country: 'FR',
            taxId: 'FR12345678901'
          },
          buyer: {
            name: 'Käufer GmbH & Co. KG',
            address: 'Hauptstraße 123½',
            country: 'DE',
            taxId: 'DE123456789'
          },
          items: [{
            description: 'Spécialité française – Délicieux',
            quantity: 1,
            unitPrice: 99.99,
            vatRate: 20,
            lineTotal: 99.99
          }],
          totals: {
            netAmount: 99.99,
            vatAmount: 20.00,
            grossAmount: 119.99
          }
        }
      };

      try {
        // Convert and force UTF-16 encoding
        const converted = await einvoice.convertFormat(utf8Invoice, 'cii');
        converted.encoding = 'UTF-16';

        // Check if special characters are preserved
        results.utf8ToUtf16 =
          converted.data.seller.name.includes('€') &&
          converted.data.seller.address.includes('№') &&
          converted.data.items[0].description.includes('–');
      } catch (error) {
        // Encoding conversion may not be supported; the flag stays false.
      }

      // UTF-16 to ISO-8859-1: previously this flag was reported in the summary
      // but never exercised. All characters used here exist in ISO-8859-1.
      const utf16Invoice = {
        format: 'cii' as const,
        encoding: 'UTF-16',
        data: {
          documentType: 'INVOICE',
          invoiceNumber: 'ENC-UTF16-2024-001',
          issueDate: '2024-01-28',
          seller: {
            name: 'Señor Müller & Fils',
            address: 'Straße 5½',
            country: 'DE',
            taxId: 'DE987654321'
          },
          buyer: {
            name: 'Café Ångström',
            address: 'Plaza Mayor 1',
            country: 'ES',
            taxId: 'ES12345678A'
          },
          items: [{
            description: 'Crème brûlée',
            quantity: 1,
            unitPrice: 10.00,
            vatRate: 10,
            lineTotal: 10.00
          }],
          totals: {
            netAmount: 10.00,
            vatAmount: 1.00,
            grossAmount: 11.00
          }
        }
      };

      try {
        const converted = await einvoice.convertFormat(utf16Invoice, 'ubl');
        converted.encoding = 'ISO-8859-1';

        results.utf16ToIso =
          converted.data.seller.name.includes('ñ') &&
          converted.data.seller.address.includes('ß') &&
          converted.data.items[0].description.includes('û');
      } catch (error) {
        // Encoding conversion may not be supported; the flag stays false.
      }

      // ISO-8859-1 limitations test
      const isoInvoice = {
        format: 'cii' as const,
        encoding: 'ISO-8859-1',
        data: {
          documentType: 'INVOICE',
          invoiceNumber: 'ENC-ISO-2024-001',
          issueDate: '2024-01-28',
          seller: {
            name: 'Latin-1 Company',
            address: 'Simple Street 1',
            country: 'ES',
            taxId: 'ES12345678A'
          },
          buyer: {
            name: 'Buyer Limited',
            address: 'Plain Avenue 2',
            country: 'ES',
            taxId: 'ES87654321B'
          },
          items: [{
            description: 'Product with emoji 😀 and Chinese 中文',
            quantity: 1,
            unitPrice: 50.00,
            vatRate: 21,
            lineTotal: 50.00
          }],
          totals: {
            netAmount: 50.00,
            vatAmount: 10.50,
            grossAmount: 60.50
          }
        }
      };

      try {
        const converted = await einvoice.convertFormat(isoInvoice, 'ubl');
        // Characters outside ISO-8859-1 should be handled (replaced or encoded)
        results.isoToUtf8 =
          converted.data.items[0].description !== isoInvoice.data.items[0].description;
      } catch (error) {
        // Expected behavior for unsupported characters
        results.isoToUtf8 = true;
      }

      // BOM handling test
      const bomInvoice = {
        format: 'ubl' as const,
        encoding: 'UTF-8-BOM',
        data: {
          documentType: 'INVOICE',
          invoiceNumber: 'ENC-BOM-2024-001',
          issueDate: '2024-01-28',
          seller: {
            name: 'BOM Test Company',
            address: 'BOM Street 1',
            country: 'US',
            taxId: 'US12-3456789'
          },
          buyer: {
            name: 'BOM Buyer Inc',
            address: 'BOM Avenue 2',
            country: 'US',
            taxId: 'US98-7654321'
          },
          items: [{
            description: 'BOM-aware product',
            quantity: 1,
            unitPrice: 100.00,
            vatRate: 8,
            lineTotal: 100.00
          }],
          totals: {
            netAmount: 100.00,
            vatAmount: 8.00,
            grossAmount: 108.00
          }
        }
      };

      try {
        const converted = await einvoice.convertFormat(bomInvoice, 'cii');
        results.bomHandling = converted.data.invoiceNumber === bomInvoice.data.invoiceNumber;
      } catch (error) {
        // BOM handling error; the flag stays false.
      }

      return results;
    }
  );

  // Test 2: Unicode normalization during conversion
  const unicodeNormalization = await performanceTracker.measureAsync(
    'unicode-normalization',
    async () => {
      const einvoice = new EInvoice();

      // The normalization form is forced explicitly with String.prototype.normalize
      // so that an editor or tool re-normalizing this source file cannot silently
      // turn both inputs into the same code-point sequence.
      const testCases = [
        {
          name: 'NFC vs NFD',
          text1: 'café'.normalize('NFC'), // é as a single code point
          text2: 'café'.normalize('NFD'), // e + combining acute accent
          shouldMatch: true
        },
        {
          name: 'Precomposed vs Decomposed',
          text1: 'Å'.normalize('NFC'), // Precomposed
          text2: 'Å'.normalize('NFD'), // A + combining ring above
          shouldMatch: true
        },
        {
          name: 'Complex diacritics',
          text1: 'Việt Nam'.normalize('NFC'),
          text2: 'Việt Nam'.normalize('NFD'), // Fully decomposed
          shouldMatch: true
        }
      ];

      const results: NormalizationResult[] = [];

      for (const testCase of testCases) {
        const invoice = {
          format: 'ubl' as const,
          data: {
            documentType: 'INVOICE',
            invoiceNumber: `NORM-${testCase.name.replace(/\s+/g, '-')}`,
            issueDate: '2024-01-28',
            seller: {
              name: testCase.text1,
              address: 'Normalization Test 1',
              country: 'VN',
              taxId: 'VN1234567890'
            },
            buyer: {
              name: testCase.text2,
              address: 'Normalization Test 2',
              country: 'VN',
              taxId: 'VN0987654321'
            },
            items: [{
              description: `Product from ${testCase.text1}`,
              quantity: 1,
              unitPrice: 100.00,
              vatRate: 10,
              lineTotal: 100.00
            }],
            totals: {
              netAmount: 100.00,
              vatAmount: 10.00,
              grossAmount: 110.00
            }
          }
        };

        try {
          const converted = await einvoice.convertFormat(invoice, 'cii');
          const backToUBL = await einvoice.convertFormat(converted, 'ubl');

          // A name counts as preserved when it survives the round trip up to
          // canonical equivalence. Both the composed (seller) and the decomposed
          // (buyer) input are checked; previously only the seller was examined.
          const sellerMatch = canonicallyEqual(backToUBL.data.seller.name, invoice.data.seller.name);
          const buyerMatch = canonicallyEqual(backToUBL.data.buyer.name, invoice.data.buyer.name);

          results.push({
            testCase: testCase.name,
            preserved: sellerMatch && buyerMatch,
            original: testCase.text1,
            converted: backToUBL.data.seller.name
          });
        } catch (error) {
          results.push({
            testCase: testCase.name,
            preserved: false,
            error: describeError(error)
          });
        }
      }

      return results;
    }
  );

  // Test 3: Zero-width and control characters
  const controlCharacters = await performanceTracker.measureAsync(
    'control-characters-handling',
    async () => {
      const einvoice = new EInvoice();

      // Test various control and special characters
      const specialChars = {
        zeroWidth: '\u200B\u200C\u200D\uFEFF', // Zero-width characters
        control: '\u0001\u0002\u001F', // Control characters
        directional: '\u202A\u202B\u202C\u202D\u202E', // Directional marks
        combining: 'a\u0300\u0301\u0302\u0303', // Combining diacriticals
        surrogates: '𝕳𝖊𝖑𝖑𝖔', // Mathematical alphanumeric symbols
        emoji: '🧾💰📊' // Emoji characters
      };

      const results: Record<string, ControlCharResult> = {};

      for (const [charType, chars] of Object.entries(specialChars)) {
        const invoice = {
          format: 'ubl' as const,
          data: {
            documentType: 'INVOICE',
            invoiceNumber: `CTRL-${charType.toUpperCase()}-001`,
            issueDate: '2024-01-28',
            seller: {
              name: `Seller${chars}Company`,
              address: `Address ${chars} Line`,
              country: 'US',
              taxId: 'US12-3456789'
            },
            buyer: {
              name: `Buyer ${chars} Ltd`,
              address: 'Normal Address',
              country: 'US',
              taxId: 'US98-7654321'
            },
            items: [{
              description: `Product ${chars} Description`,
              quantity: 1,
              unitPrice: 100.00,
              vatRate: 10,
              lineTotal: 100.00
            }],
            totals: {
              netAmount: 100.00,
              vatAmount: 10.00,
              grossAmount: 110.00
            },
            notes: `Notes with ${chars} special characters`
          }
        };

        try {
          const converted = await einvoice.convertFormat(invoice, 'cii');
          const sanitized = await einvoice.convertFormat(converted, 'ubl');

          // Check how special characters are handled
          results[charType] = {
            originalLength: invoice.data.seller.name.length,
            convertedLength: sanitized.data.seller.name.length,
            preserved: invoice.data.seller.name === sanitized.data.seller.name,
            cleaned:
              sanitized.data.seller.name.replace(/[\u0000-\u001F\u200B-\u200D\uFEFF]/g, '').length <
              invoice.data.seller.name.length
          };
        } catch (error) {
          results[charType] = {
            error: true,
            message: describeError(error)
          };
        }
      }

      return results;
    }
  );

  // Test 4: Encoding conflicts in multi-language invoices
  const multiLanguageEncoding = await performanceTracker.measureAsync(
    'multi-language-encoding',
    async () => {
      const einvoice = new EInvoice();

      // Create invoice with multiple scripts/languages
      const multiLangInvoice = {
        format: 'ubl' as const,
        data: {
          documentType: 'INVOICE',
          invoiceNumber: 'MULTI-LANG-2024-001',
          issueDate: '2024-01-28',
          seller: {
            name: 'Global Trading Company 全球贸易公司',
            address: 'International Plaza 国际广场 Διεθνής Πλατεία',
            country: 'SG',
            taxId: 'SG12345678X'
          },
          buyer: {
            name: 'المشتري العربي | Arabic Buyer | खरीदार',
            address: 'شارع العرب | Arab Street | अरब स्ट्रीट',
            country: 'AE',
            taxId: 'AE123456789012345'
          },
          items: [
            {
              description: 'Product 产品 Προϊόν منتج उत्पाद',
              quantity: 1,
              unitPrice: 100.00,
              vatRate: 5,
              lineTotal: 100.00
            },
            {
              description: 'Service 服务 Υπηρεσία خدمة सेवा',
              quantity: 2,
              unitPrice: 200.00,
              vatRate: 5,
              lineTotal: 400.00
            }
          ],
          totals: {
            netAmount: 500.00,
            vatAmount: 25.00,
            grossAmount: 525.00
          },
          notes: 'Thank you 谢谢 Ευχαριστώ شكرا धन्यवाद'
        }
      };

      // Test conversion through different formats
      const conversionTests = [
        { from: 'ubl', to: 'cii' },
        { from: 'cii', to: 'zugferd' },
        { from: 'zugferd', to: 'xrechnung' }
      ];

      const results: MultiLanguageResult[] = [];
      // Each step converts the output of the previous successful step.
      let currentInvoice = multiLangInvoice;

      for (const test of conversionTests) {
        try {
          const converted = await einvoice.convertFormat(currentInvoice, test.to);

          // Check preservation of multi-language content
          const sellerNamePreserved = converted.data.seller.name.includes('全球贸易公司');
          const buyerNamePreserved =
            converted.data.buyer.name.includes('العربي') &&
            converted.data.buyer.name.includes('खरीदार');
          const itemsPreserved =
            converted.data.items[0].description.includes('产品') &&
            converted.data.items[0].description.includes('منتج');

          results.push({
            conversion: `${test.from} -> ${test.to}`,
            sellerNamePreserved,
            buyerNamePreserved,
            itemsPreserved,
            allPreserved: sellerNamePreserved && buyerNamePreserved && itemsPreserved
          });

          currentInvoice = converted;
        } catch (error) {
          results.push({
            conversion: `${test.from} -> ${test.to}`,
            error: describeError(error)
          });
        }
      }

      return results;
    }
  );

  // Test 5: Corpus encoding analysis
  const corpusEncodingAnalysis = await performanceTracker.measureAsync(
    'corpus-encoding-edge-cases',
    async () => {
      const files = await corpusLoader.getFilesByPattern('**/*.xml');
      const einvoice = new EInvoice();
      const encodingStats = {
        totalFiles: 0,
        encodingIssues: 0,
        specialCharFiles: 0,
        conversionFailures: 0,
        characterTypes: new Set<string>(),
        problematicFiles: [] as string[]
      };

      // Sample files for analysis
      const sampleFiles = files.slice(0, 30);

      for (const file of sampleFiles) {
        try {
          const content = await plugins.fs.readFile(file, 'utf-8');
          encodingStats.totalFiles++;

          // Check for special characters
          const hasSpecialChars = /[^\x00-\x7F]/.test(content);
          // TAB, LF and CR are ordinary XML whitespace and are deliberately excluded;
          // matching them would flag practically every multi-line file as "control".
          const hasControlChars = /[\x00-\x08\x0B\x0C\x0E-\x1F\x7F]/.test(content);
          const hasRTL = /[\u0590-\u08FF\uFB1D-\uFDFF\uFE70-\uFEFF]/.test(content);
          const hasCJK = /[\u4E00-\u9FFF\u3040-\u30FF\uAC00-\uD7AF]/.test(content);

          if (hasSpecialChars || hasControlChars || hasRTL || hasCJK) {
            encodingStats.specialCharFiles++;
            if (hasControlChars) encodingStats.characterTypes.add('control');
            if (hasRTL) encodingStats.characterTypes.add('RTL');
            if (hasCJK) encodingStats.characterTypes.add('CJK');
          }

          // Try format detection and conversion
          const format = await einvoice.detectFormat(content);
          if (format && format !== 'unknown') {
            try {
              const parsed = await einvoice.parseInvoice(content, format);
              const targetFormat = format === 'ubl' ? 'cii' : 'ubl';

              // Test conversion with special characters
              await einvoice.convertFormat(parsed, targetFormat);
            } catch (convError) {
              encodingStats.conversionFailures++;
              if (hasSpecialChars) {
                encodingStats.problematicFiles.push(file);
              }
            }
          }
        } catch (error) {
          // Reading the file or detecting its format failed.
          encodingStats.encodingIssues++;
        }
      }

      return {
        ...encodingStats,
        characterTypes: Array.from(encodingStats.characterTypes),
        specialCharPercentage: toPercentage(encodingStats.specialCharFiles, encodingStats.totalFiles),
        conversionFailureRate: toPercentage(encodingStats.conversionFailures, encodingStats.totalFiles)
      };
    }
  );

  // Summary
  t.comment('\n=== CONV-11: Character Encoding Edge Cases Test Summary ===');

  t.comment('\nMixed Encoding Declarations:');
  t.comment(` - UTF-8 to UTF-16: ${mixedEncodingDeclarations.result.utf8ToUtf16 ? 'SUPPORTED' : 'NOT SUPPORTED'}`);
  t.comment(` - UTF-16 to ISO-8859-1: ${mixedEncodingDeclarations.result.utf16ToIso ? 'HANDLED' : 'NOT HANDLED'}`);
  t.comment(` - ISO-8859-1 to UTF-8: ${mixedEncodingDeclarations.result.isoToUtf8 ? 'HANDLED' : 'NOT HANDLED'}`);
  t.comment(` - BOM handling: ${mixedEncodingDeclarations.result.bomHandling ? 'SUPPORTED' : 'NOT SUPPORTED'}`);

  t.comment('\nUnicode Normalization:');
  unicodeNormalization.result.forEach(test => {
    t.comment(` - ${test.testCase}: ${test.preserved ? 'PRESERVED' : 'MODIFIED'}`);
  });

  t.comment('\nControl Characters Handling:');
  Object.entries(controlCharacters.result).forEach(([type, result]) => {
    if (result.error) {
      t.comment(` - ${type}: ERROR - ${result.message}`);
    } else {
      t.comment(` - ${type}: ${result.preserved ? 'PRESERVED' : 'SANITIZED'} (${result.originalLength} -> ${result.convertedLength} chars)`);
    }
  });

  t.comment('\nMulti-Language Encoding:');
  multiLanguageEncoding.result.forEach(test => {
    if (test.error) {
      t.comment(` - ${test.conversion}: ERROR - ${test.error}`);
    } else {
      t.comment(` - ${test.conversion}: ${test.allPreserved ? 'ALL PRESERVED' : 'PARTIAL LOSS'}`);
    }
  });

  t.comment('\nCorpus Encoding Analysis:');
  t.comment(` - Files analyzed: ${corpusEncodingAnalysis.result.totalFiles}`);
  t.comment(` - Files with special characters: ${corpusEncodingAnalysis.result.specialCharFiles} (${corpusEncodingAnalysis.result.specialCharPercentage})`);
  t.comment(` - Character types found: ${corpusEncodingAnalysis.result.characterTypes.join(', ')}`);
  t.comment(` - Encoding issues: ${corpusEncodingAnalysis.result.encodingIssues}`);
  t.comment(` - Conversion failures: ${corpusEncodingAnalysis.result.conversionFailures} (${corpusEncodingAnalysis.result.conversionFailureRate})`);

  // Performance summary
  t.comment('\n=== Performance Summary ===');
  performanceTracker.logSummary();

  t.end();
});

tap.start();