import { expect, tap } from '@git.zone/tstest/tapbundle';
import * as plugins from '../plugins.js';
import { EInvoice } from '../../../ts/index.js';
import { CorpusLoader } from '../corpus.loader.js';
import { PerformanceTracker } from '../performance.tracker.js';

/**
 * CONV-07: Verify character encoding is maintained across format conversions.
 *
 * Every subtest follows the same pattern: load a fixture into an `EInvoice`,
 * serialize it back with `getXmlString()`, and look for expected substrings in
 * the result. Each subtest records its elapsed time in a shared
 * `PerformanceTracker`; the average is asserted at the end.
 *
 * NOTE(review): the fixture template literals below contain no XML element
 * markup, only text content. Presumably the tags were stripped somewhere
 * upstream - confirm against version control before relying on these tests.
 * The fixture text (including embedded line breaks) is kept exactly as found.
 */
tap.test('CONV-07: Character Encoding - should preserve character encoding during conversion', async (t) => {
  const performanceTracker = new PerformanceTracker('CONV-07: Character Encoding');
  const corpusLoader = new CorpusLoader();

  // Special characters and international text must survive a load/serialize round trip.
  t.test('UTF-8 encoding preservation in conversion', async () => {
    const startTime = performance.now();

    // UBL invoice with various UTF-8 characters.
    // The line break inside the literal is part of the fixture text as found.
    const ublInvoice = ` UTF8-CONV-001 2025-01-25 380 Special characters: € £ ¥ © ® ™ § ¶ • ° ± × ÷ Diacritics: àáâãäåæçèéêëìíîïðñòóôõöøùúûüýþÿ Greek: ΑΒΓΔΕΖΗΘΙΚΛΜΝΞΟΠΡΣΤΥΦΧΨΩ αβγδεζηθικλμνξοπρστυφχψω Cyrillic: АБВГДЕЖЗИЙКЛМНОПРСТУФХЦЧШЩЪЫЬЭЮЯ CJK: 中文 日本語 한국어 Arabic: العربية مرحبا Hebrew: עברית שלום Emoji: 😀 🎉 💰 📧 🌍 EUR Société Générale Müller & Associés Rue de la Légion d'Honneur Zürich 8001 CH François Lefèvre françois@société-générale.ch 北京科技有限公司 (Beijing Tech Co.) 
北京市朝阳区建国路88号 北京 CN 1 Spëcïål cháracters in line: ñ ç ø å æ þ ð 10 1000.00 Bücher über Köln – München Prix: 25,50 € (TVA incluse) • Größe: 21×29,7 cm² 100.00 `;

    const einvoice = new EInvoice();
    await einvoice.loadFromString(ublInvoice);

    // Convert to another format (simulated by getting XML back)
    const convertedXml = einvoice.getXmlString();

    // Each entry is a substring that must appear verbatim in the output.
    const encodingChecks = [
      // Currency symbols
      { char: '€', name: 'Euro' },
      { char: '£', name: 'Pound' },
      { char: '¥', name: 'Yen' },
      // Special symbols
      { char: '©', name: 'Copyright' },
      { char: '®', name: 'Registered' },
      { char: '™', name: 'Trademark' },
      { char: '×', name: 'Multiplication' },
      { char: '÷', name: 'Division' },
      // Diacritics
      { char: 'àáâãäå', name: 'Latin a variations' },
      { char: 'çñøæþð', name: 'Special Latin' },
      // Greek
      { char: 'ΑΒΓΔ', name: 'Greek uppercase' },
      { char: 'αβγδ', name: 'Greek lowercase' },
      // Cyrillic
      { char: 'АБВГ', name: 'Cyrillic' },
      // CJK
      { char: '中文', name: 'Chinese' },
      { char: '日本語', name: 'Japanese' },
      { char: '한국어', name: 'Korean' },
      // RTL
      { char: 'العربية', name: 'Arabic' },
      { char: 'עברית', name: 'Hebrew' },
      // Emoji
      { char: '😀', name: 'Emoji' },
      // Names with diacritics
      { char: 'François Lefèvre', name: 'French name' },
      { char: 'Zürich', name: 'Swiss city' },
      { char: 'Müller', name: 'German name' },
      // Special punctuation
      { char: '–', name: 'En dash' },
      { char: '•', name: 'Bullet' },
      { char: '²', name: 'Superscript' }
    ];

    let preservedCount = 0;
    const missingChars: string[] = [];
    for (const check of encodingChecks) {
      if (convertedXml.includes(check.char)) {
        preservedCount++;
      } else {
        missingChars.push(`${check.name} (${check.char})`);
      }
    }

    console.log(`UTF-8 preservation: ${preservedCount}/${encodingChecks.length} character sets preserved`);
    if (missingChars.length > 0) {
      console.log('Missing characters:', missingChars);
    }

    // Allow 10% loss
    expect(preservedCount).toBeGreaterThan(encodingChecks.length * 0.9);

    const elapsed = performance.now() - startTime;
    performanceTracker.addMeasurement('utf8-preservation', elapsed);
  });

  // Reserved XML characters may come back escaped or literal; either form counts.
  // This subtest only logs its results and asserts nothing.
  t.test('Entity encoding in conversion', async () => {
    const startTime = performance.now();

    // CII invoice with XML entities
    const ciiInvoice = ` ENTITY-CONV-001 XML entities: <invoice> & "quotes" with 'apostrophes' Numeric entities: € £ ¥ ™ Hex entities: € £ ¥ Product & Service <Premium> Price comparison: USD < EUR > GBP Smith & Jones "Trading" Ltd. Registered in England & Wales `;

    const einvoice = new EInvoice();
    await einvoice.loadFromString(ciiInvoice);

    const convertedXml = einvoice.getXmlString();

    // Each check accepts the XML-escaped form OR the literal form. The escaped
    // operands are written out explicitly: with both operands identical (or
    // one a substring of the other) the check cannot distinguish the two
    // cases, and the unescaped apostrophe variant did not even parse.
    const entityChecks = {
      'Ampersand entity': convertedXml.includes('&amp;') || convertedXml.includes(' & '),
      'Less than entity': convertedXml.includes('&lt;') || convertedXml.includes(' < '),
      'Greater than entity': convertedXml.includes('&gt;') || convertedXml.includes(' > '),
      'Quote preservation': convertedXml.includes('&quot;quotes&quot;') || convertedXml.includes('"quotes"'),
      'Apostrophe preservation': convertedXml.includes("'apostrophes'") || convertedXml.includes('&apos;apostrophes&apos;'),
      // U+20AC (euro) as a decimal character reference
      'Numeric entities': convertedXml.includes('&#8364;') || convertedXml.includes('€'),
      // U+00A3 (pound) as a hexadecimal character reference
      'Hex entities': convertedXml.includes('&#xA3;') || convertedXml.includes('£')
    };

    for (const [check, passed] of Object.entries(entityChecks)) {
      console.log(`${passed ? '✓' : '✗'} ${check}`);
    }

    const elapsed = performance.now() - startTime;
    performanceTracker.addMeasurement('entity-encoding', elapsed);
  });

  // CDATA, entities and raw Unicode mixed in a single document.
  t.test('Mixed encoding scenarios', async () => {
    const startTime = performance.now();

    // Invoice with mixed encoding challenges.
    // The line break inside the literal is part of the fixture text as found.
    const mixedInvoice = ` MIXED-ENC-001 2025-01-25 380 EUR & special chars € £ ¥]]> Mixed: Normal text with €100 and <escaped> content Müller & Associés S.à r.l. 
Hauptstraße 42 (Gebäude "A") Köln DE Payment terms: 2/10 net 30 (2% if paid <= 10 days) 1 Temperature range: -40°C ≤ T ≤ +85°C 10 1000.00 Product™ with ® symbol © 2025 Size: 10cm × 20cm × 5cm • Weight: ≈1kg Special chars α β γ δ ε ≠ ∞ ∑ √ ∫ `;

    const einvoice = new EInvoice();
    await einvoice.loadFromString(mixedInvoice);

    const convertedXml = einvoice.getXmlString();

    // Check mixed encoding preservation
    const mixedChecks = {
      // `includes('')` is true for every string and would make this check
      // vacuous, so the fallback matches the CDATA opening delimiter instead.
      'CDATA content': convertedXml.includes('CDATA content') || convertedXml.includes('<![CDATA['),
      // Euro sign either as a decimal character reference or as the literal character
      'Mixed entities and Unicode': convertedXml.includes('&#8364;100') || convertedXml.includes('€100'),
      'German umlauts': convertedXml.includes('Müller') && convertedXml.includes('Köln'),
      'French accents': convertedXml.includes('Associés') && convertedXml.includes('Société'),
      'Mathematical symbols': convertedXml.includes('≤') && convertedXml.includes('≈'),
      'Trademark symbols': convertedXml.includes('™') && convertedXml.includes('®'),
      'Greek letters': convertedXml.includes('α') || convertedXml.includes('beta'),
      'Temperature notation': convertedXml.includes('°C'),
      'Multiplication sign': convertedXml.includes('×'),
      'CDATA in address': convertedXml.includes('Floor 3') || convertedXml.includes('& 4')
    };

    const mixedResults = Object.values(mixedChecks);
    const passedChecks = mixedResults.filter(Boolean).length;
    console.log(`Mixed encoding: ${passedChecks}/${mixedResults.length} checks passed`);
    expect(passedChecks).toBeGreaterThan(mixedResults.length * 0.8);

    const elapsed = performance.now() - startTime;
    performanceTracker.addMeasurement('mixed-encoding', elapsed);
  });

  // One small fixture per format flavour. Load errors are logged, not
  // rethrown, and the subtest asserts nothing.
  t.test('Encoding in different invoice formats', async () => {
    const startTime = performance.now();

    // Test encoding across different format characteristics.
    // The line break inside the last literal is part of the fixture text as found.
    const formats = [
      {
        name: 'UBL with namespaces',
        content: ` NS-€-001 Namespace test: €£¥ `
      },
      {
        name: 'CII with complex structure',
        content: ` CII-Ü-001 Übersicht über Änderungen `
      },
      {
        name: 'Factur-X with French',
        content: ` FX-FR-001 Facture détaillée 
avec références spéciales `
      }
    ];

    for (const format of formats) {
      try {
        const einvoice = new EInvoice();
        await einvoice.loadFromString(format.content);
        const converted = einvoice.getXmlString();

        // Check key characters are preserved
        let preserved = true;
        if (format.name.includes('UBL') && !converted.includes('€£¥')) preserved = false;
        if (format.name.includes('CII') && !converted.includes('Ü')) preserved = false;
        if (format.name.includes('French') && !converted.includes('détaillée')) preserved = false;

        console.log(`${format.name}: ${preserved ? '✓' : '✗'} Encoding preserved`);
      } catch (error) {
        // The catch variable may be `unknown`; narrow before reading `.message`.
        const message = error instanceof Error ? error.message : String(error);
        console.log(`${format.name}: Error - ${message}`);
      }
    }

    const elapsed = performance.now() - startTime;
    performanceTracker.addMeasurement('format-encoding', elapsed);
  });

  // Arabic and Hebrew text; this subtest only logs the pass count.
  t.test('Bidirectional text preservation', async () => {
    const startTime = performance.now();

    // Test RTL (Right-to-Left) text preservation
    const rtlInvoice = ` RTL-TEST-001 2025-01-25 380 EUR شركة التقنية المحدودة شارع الملك فهد 123 الرياض SA חברת הטכנולוגיה בע"מ רחוב דיזנגוף 456 תל אביב IL 1 Mixed text: العربية (Arabic) and עברית (Hebrew) with English 10 1000.00 منتج تقني متقدم / מוצר טכנולוגי מתקדם `;

    const einvoice = new EInvoice();
    await einvoice.loadFromString(rtlInvoice);

    const convertedXml = einvoice.getXmlString();

    // Check RTL text preservation
    const rtlChecks = {
      'Arabic company': convertedXml.includes('شركة التقنية المحدودة'),
      'Arabic street': convertedXml.includes('شارع الملك فهد'),
      'Arabic city': convertedXml.includes('الرياض'),
      'Hebrew company': convertedXml.includes('חברת הטכנולוגיה'),
      'Hebrew street': convertedXml.includes('רחוב דיזנגוף'),
      'Hebrew city': convertedXml.includes('תל אביב'),
      'Mixed RTL/LTR': convertedXml.includes('Arabic') && convertedXml.includes('Hebrew'),
      'Arabic product': convertedXml.includes('منتج تقني متقدم'),
      'Hebrew product': convertedXml.includes('מוצר טכנולוגי מתקדם')
    };

    const rtlResults = Object.values(rtlChecks);
    const rtlPreserved = rtlResults.filter(Boolean).length;
    console.log(`RTL text preservation: ${rtlPreserved}/${rtlResults.length}`);

    const elapsed = performance.now() - startTime;
    performanceTracker.addMeasurement('rtl-preservation', elapsed);
  });

  // Round-trips up to 50 corpus XML files and reports which character classes
  // they contain, plus files whose non-ASCII count drops by more than 20%.
  t.test('Corpus encoding preservation analysis', async () => {
    const startTime = performance.now();

    let processedCount = 0;
    let encodingIssues = 0;
    const characterCategories = {
      'ASCII only': 0,
      'Latin extended': 0,
      'Greek': 0,
      'Cyrillic': 0,
      'CJK': 0,
      'Arabic/Hebrew': 0,
      'Special symbols': 0,
      'Emoji': 0
    };

    const files = await corpusLoader.getAllFiles();
    const xmlFiles = files.filter(f => f.endsWith('.xml') && !f.includes('.pdf'));

    // Sample corpus for encoding analysis
    const sampleSize = Math.min(50, xmlFiles.length);
    const sample = xmlFiles.slice(0, sampleSize);

    for (const file of sample) {
      try {
        const content = await corpusLoader.readFile(file);
        const einvoice = new EInvoice();

        // Non-string content is decoded as UTF-8 for the comparison and
        // handed to the buffer loader for parsing.
        let originalString: string;
        if (typeof content === 'string') {
          originalString = content;
          await einvoice.loadFromString(content);
        } else {
          originalString = content.toString('utf8');
          await einvoice.loadFromBuffer(content);
        }

        const convertedXml = einvoice.getXmlString();

        // Categorize content; a file may count towards several non-ASCII categories.
        if (!/[^\x00-\x7F]/.test(originalString)) {
          characterCategories['ASCII only']++;
        } else {
          if (/[À-ÿĀ-ſ]/.test(originalString)) characterCategories['Latin extended']++;
          if (/[Α-Ωα-ω]/.test(originalString)) characterCategories['Greek']++;
          if (/[А-Яа-я]/.test(originalString)) characterCategories['Cyrillic']++;
          if (/[\u4E00-\u9FFF\u3040-\u309F\u30A0-\u30FF\uAC00-\uD7AF]/.test(originalString)) characterCategories['CJK']++;
          if (/[\u0590-\u05FF\u0600-\u06FF]/.test(originalString)) characterCategories['Arabic/Hebrew']++;
          if (/[©®™€£¥§¶•°±×÷≤≥≠≈∞]/.test(originalString)) characterCategories['Special symbols']++;
          if (/[\u{1F300}-\u{1F9FF}]/u.test(originalString)) characterCategories['Emoji']++;
        }

        // Simple check for major encoding loss
        const originalNonAscii = (originalString.match(/[^\x00-\x7F]/g) || []).length;
        const convertedNonAscii = (convertedXml.match(/[^\x00-\x7F]/g) || []).length;

        if (originalNonAscii > 0 && convertedNonAscii < originalNonAscii * 0.8) {
          encodingIssues++;
          console.log(`Potential encoding loss in ${file}: ${originalNonAscii} -> ${convertedNonAscii} non-ASCII chars`);
        }

        processedCount++;
      } catch (error) {
        // The catch variable may be `unknown`; narrow before reading `.message`.
        const message = error instanceof Error ? error.message : String(error);
        console.log(`Encoding analysis error in ${file}:`, message);
      }
    }

    console.log(`Corpus encoding analysis (${processedCount} files):`);
    console.log('Character categories found:');
    Object.entries(characterCategories)
      .filter(([_, count]) => count > 0)
      .sort((a, b) => b[1] - a[1])
      .forEach(([category, count]) => {
        console.log(` ${category}: ${count} files`);
      });
    console.log(`Files with potential encoding issues: ${encodingIssues}`);

    const elapsed = performance.now() - startTime;
    performanceTracker.addMeasurement('corpus-encoding', elapsed);
  });

  // Print performance summary
  performanceTracker.printSummary();

  // Performance assertions
  const avgTime = performanceTracker.getAverageTime();
  expect(avgTime).toBeLessThan(400); // Encoding operations may take longer
});

tap.start();