import { tap } from '@git.zone/tstest/tapbundle';
import * as plugins from '../plugins.js';
import { EInvoice } from '../../../ts/index.js';
import { PerformanceTracker } from '../performance.tracker.js';

// EDGE-04: feeds unusual character data (exotic scripts, emoji, legacy
// encodings, malformed UTF-8, ...) through the EInvoice parsing and
// conversion entry points and records, per case, whether the call returned
// or threw and whether the text came back unchanged.
//
// NOTE(review): in this copy of the file the template literals that build the
// test documents contain only the interpolated values and whitespace - no
// element markup - and the XML prolog/epilog buffers in Test 5 are empty.
// That looks like markup lost in transit; they are kept exactly as found
// because the intended element names cannot be determined from here.
// Confirm against the upstream fixtures.

const performanceTracker = new PerformanceTracker('EDGE-04: Unusual Character Sets');

/**
 * Unicode code point -> byte for the 0x80-0x9F range, which is where
 * Windows-1252 assigns printable characters and therefore differs from
 * ISO-8859-1.
 */
const WINDOWS_1252_SPECIALS = new Map([
  [0x20ac, 0x80], [0x201a, 0x82], [0x0192, 0x83], [0x201e, 0x84],
  [0x2026, 0x85], [0x2020, 0x86], [0x2021, 0x87], [0x02c6, 0x88],
  [0x2030, 0x89], [0x0160, 0x8a], [0x2039, 0x8b], [0x0152, 0x8c],
  [0x017d, 0x8e], [0x2018, 0x91], [0x2019, 0x92], [0x201c, 0x93],
  [0x201d, 0x94], [0x2022, 0x95], [0x2013, 0x96], [0x2014, 0x97],
  [0x02dc, 0x98], [0x2122, 0x99], [0x0161, 0x9a], [0x203a, 0x9b],
  [0x0153, 0x9c], [0x017e, 0x9e], [0x0178, 0x9f],
]);

/**
 * Encodes a string as Windows-1252 bytes.
 *
 * @param {string} text - Text to encode.
 * @returns {Buffer} One byte per code point.
 * @throws {RangeError} If a code point has no Windows-1252 representation.
 */
function encodeWindows1252(text) {
  const bytes = [];
  for (const ch of text) {
    const codePoint = ch.codePointAt(0);
    const special = WINDOWS_1252_SPECIALS.get(codePoint);
    if (special !== undefined) {
      bytes.push(special);
    } else if (codePoint <= 0x7f || (codePoint >= 0xa0 && codePoint <= 0xff)) {
      // ASCII and the upper Latin-1 half map to the identical byte value.
      bytes.push(codePoint);
    } else {
      const hex = codePoint.toString(16).toUpperCase();
      throw new RangeError(`U+${hex} cannot be encoded as Windows-1252`);
    }
  }
  return Buffer.from(bytes);
}

/**
 * Encodes text in one of the encodings exercised by Test 2.
 *
 * Buffer.from() only accepts Node's own encoding names; passing
 * 'utf-16be', 'iso8859-1' or 'windows1252' throws "Unknown encoding", so
 * each encoding is mapped explicitly here.
 *
 * @param {string} text - Text to encode.
 * @param {string} encoding - Encoding label as used in the test table.
 * @returns {Buffer} Encoded bytes, without a BOM.
 * @throws {Error} If the label is not one of the supported encodings.
 */
function encodeText(text, encoding) {
  switch (encoding) {
    case 'UTF-8':
      return Buffer.from(text, 'utf8');
    case 'UTF-16LE':
      return Buffer.from(text, 'utf16le');
    case 'UTF-16BE':
      // Node has no big-endian UTF-16 encoder; swap each little-endian pair.
      return Buffer.from(text, 'utf16le').swap16();
    case 'ISO-8859-1':
      return Buffer.from(text, 'latin1');
    case 'Windows-1252':
      return encodeWindows1252(text);
    default:
      throw new Error(`Unsupported test encoding: ${encoding}`);
  }
}

tap.test('EDGE-04: Unusual Character Sets - should handle unusual and exotic character encodings', async (t) => {
  const einvoice = new EInvoice();

  // Test 1: Unicode edge cases
  const unicodeEdgeCases = await performanceTracker.measureAsync(
    'unicode-edge-cases',
    async () => {
      const testCases = [
        {
          name: 'zero-width-characters',
          text: 'Invoice\u200B\u200C\u200D\uFEFFNumber',
          description: 'Zero-width spaces and joiners',
        },
        {
          name: 'right-to-left',
          text: 'مرحبا INV-001 שלום',
          description: 'RTL Arabic and Hebrew mixed with LTR',
        },
        {
          name: 'surrogate-pairs',
          text: '𝐇𝐞𝐥𝐥𝐨 😀 🎉 Invoice',
          description: 'Mathematical bold text and emojis',
        },
        {
          name: 'combining-characters',
          text: 'Ińvȯíçë̃ Nüm̈bër̊',
          description: 'Combining diacritical marks',
        },
        {
          name: 'control-characters',
          text: 'Invoice\x00\x01\x02\x1F\x7FTest',
          description: 'Control characters',
        },
        {
          name: 'bidi-override',
          text: '\u202Eتسا Invoice 123\u202C',
          description: 'Bidirectional override characters',
        },
      ];

      const results = [];
      for (const testCase of testCases) {
        const xml = ` ${testCase.text} ${testCase.description} `;
        try {
          const parsed = await einvoice.parseXML(xml);
          const idValue = parsed?.ID || '';
          results.push({
            name: testCase.name,
            success: true,
            preserved: idValue === testCase.text,
            normalized: idValue !== testCase.text,
            parsedValue: idValue,
            originalLength: testCase.text.length,
            parsedLength: idValue.length,
          });
        } catch (error) {
          results.push({ name: testCase.name, success: false, error: error.message });
        }
      }
      return results;
    }
  );

  unicodeEdgeCases.forEach(result => {
    t.ok(result.success, `Unicode edge case ${result.name} should be handled`);
  });

  // Test 2: Various character encodings
  const characterEncodings = await performanceTracker.measureAsync(
    'various-character-encodings',
    async () => {
      const encodings = [
        { encoding: 'UTF-8', bom: Buffer.from([0xEF, 0xBB, 0xBF]), text: 'Übung macht den Meister' },
        { encoding: 'UTF-16BE', bom: Buffer.from([0xFE, 0xFF]), text: 'Invoice \u4E2D\u6587' },
        { encoding: 'UTF-16LE', bom: Buffer.from([0xFF, 0xFE]), text: 'Facture française' },
        { encoding: 'ISO-8859-1', bom: null, text: 'Ñoño español' },
        { encoding: 'Windows-1252', bom: null, text: 'Smart "quotes" and —dashes' },
      ];

      const results = [];
      for (const enc of encodings) {
        const xmlContent = ` ENC-001 ${enc.text} `;
        try {
          // Create buffer with proper encoding, prefixed by the BOM when the
          // table entry defines one.
          const encoded = encodeText(xmlContent, enc.encoding);
          const buffer = enc.bom ? Buffer.concat([enc.bom, encoded]) : encoded;

          const parsed = await einvoice.parseDocument(buffer);
          results.push({
            encoding: enc.encoding,
            success: true,
            hasBOM: !!enc.bom,
            textPreserved: parsed?.CustomerName === enc.text,
          });
        } catch (error) {
          results.push({ encoding: enc.encoding, success: false, error: error.message });
        }
      }
      return results;
    }
  );

  // A recorded error also counts here: the assertion only requires that each
  // encoding produced an outcome.
  characterEncodings.forEach(result => {
    t.ok(result.success || result.error, `Encoding ${result.encoding} was processed`);
  });

  // Test 3: Emoji and pictographic characters
  const emojiAndPictographs = await performanceTracker.measureAsync(
    'emoji-and-pictographs',
    async () => {
      const emojiTests = [
        { name: 'basic-emoji', content: 'Invoice 📧 sent ✅' },
        { name: 'flag-emoji', content: 'Country: 🇺🇸 🇬🇧 🇩🇪 🇫🇷' },
        { name: 'skin-tone-emoji', content: 'Approved by 👍🏻👍🏼👍🏽👍🏾👍🏿' },
        { name: 'zwj-sequences', content: 'Family: 👨‍👩‍👧‍👦' },
        { name: 'mixed-emoji-text', content: '💰 Total: €1,234.56 💶' },
      ];

      // One segmenter is enough; it is stateless across segment() calls.
      const segmenter = new Intl.Segmenter();
      const countGraphemes = (value) => [...segmenter.segment(value)].length;

      const results = [];
      for (const test of emojiTests) {
        const xml = ` EMOJI-001 ${test.content} `;
        try {
          const parsed = await einvoice.parseXML(xml);
          const noteValue = parsed?.Note || '';

          // Count grapheme clusters (visual characters)
          const graphemeCount = countGraphemes(test.content);
          const preservedGraphemes = countGraphemes(noteValue);

          results.push({
            name: test.name,
            success: true,
            preserved: noteValue === test.content,
            originalGraphemes: graphemeCount,
            preservedGraphemes,
            codePointCount: Array.from(test.content).length,
            byteLength: Buffer.from(test.content, 'utf8').length,
          });
        } catch (error) {
          results.push({ name: test.name, success: false, error: error.message });
        }
      }
      return results;
    }
  );

  emojiAndPictographs.forEach(result => {
    t.ok(result.success, `Emoji test ${result.name} should succeed`);
    if (result.success) {
      t.ok(result.preserved, `Emoji content should be preserved`);
    }
  });

  // Test 4: Legacy and exotic scripts
  const exoticScripts = await performanceTracker.measureAsync(
    'exotic-scripts',
    async () => {
      const scripts = [
        { name: 'chinese-traditional', text: '發票編號:貳零貳肆' },
        { name: 'japanese-mixed', text: '請求書番号:2024年' },
        { name: 'korean', text: '송장 번호: 2024' },
        { name: 'thai', text: 'ใบแจ้งหนี้: ๒๐๒๔' },
        { name: 'devanagari', text: 'चालान संख्या: २०२४' },
        { name: 'cyrillic', text: 'Счёт-фактура № 2024' },
        { name: 'greek', text: 'Τιμολόγιο: ΜΜΚΔ' },
        { name: 'ethiopic', text: 'ቁጥር: ፪፻፳፬' },
        { name: 'bengali', text: 'চালান নং: ২০২৪' },
        { name: 'tamil', text: 'விலைப்பட்டியல்: ௨௦௨௪' },
      ];

      const results = [];
      for (const script of scripts) {
        const xml = ` SCRIPT-${script.name} ${script.text} `;
        try {
          const parsed = await einvoice.parseXML(xml);
          const description = parsed?.Description || '';
          results.push({
            script: script.name,
            success: true,
            preserved: description === script.text,
            charCount: script.text.length,
            byteCount: Buffer.from(script.text, 'utf8').length,
          });
        } catch (error) {
          results.push({ script: script.name, success: false, error: error.message });
        }
      }
      return results;
    }
  );

  exoticScripts.forEach(result => {
    t.ok(result.success, `Script ${result.script} should be handled`);
    if (result.success) {
      t.ok(result.preserved, `Script ${result.script} content should be preserved`);
    }
  });

  // Test 5: Invalid UTF-8 sequences
  const invalidUTF8 = await performanceTracker.measureAsync(
    'invalid-utf8-sequences',
    async () => {
      const invalidSequences = [
        { name: 'orphan-continuation', bytes: Buffer.from([0x80, 0x81, 0x82]) },
        { name: 'incomplete-sequence', bytes: Buffer.from([0xC2]) },
        { name: 'overlong-encoding', bytes: Buffer.from([0xC0, 0x80]) },
        { name: 'invalid-start', bytes: Buffer.from([0xF8, 0x80, 0x80, 0x80]) },
        {
          name: 'mixed-valid-invalid',
          bytes: Buffer.concat([
            Buffer.from('Valid '),
            Buffer.from([0xFF, 0xFE]),
            Buffer.from(' Text'),
          ]),
        },
      ];

      const results = [];
      for (const seq of invalidSequences) {
        const xmlStart = Buffer.from('');
        const xmlEnd = Buffer.from('');
        const fullBuffer = Buffer.concat([xmlStart, seq.bytes, xmlEnd]);

        // Both outcomes are recorded with handled: true - a returned value
        // and a thrown error are each treated as an acceptable response.
        try {
          const parsed = await einvoice.parseDocument(fullBuffer);
          results.push({
            name: seq.name,
            handled: true,
            recovered: !!parsed,
            replacedWithPlaceholder: true,
          });
        } catch (error) {
          results.push({ name: seq.name, handled: true, rejected: true, error: error.message });
        }
      }
      return results;
    }
  );

  invalidUTF8.forEach(result => {
    t.ok(result.handled, `Invalid UTF-8 ${result.name} was handled`);
  });

  // Test 6: Normalization forms
  const normalizationForms = await performanceTracker.measureAsync(
    'unicode-normalization-forms',
    async () => {
      const testText = 'Café'; // Can be represented differently
      const forms = [
        { name: 'NFC', text: testText.normalize('NFC') },
        { name: 'NFD', text: testText.normalize('NFD') },
        { name: 'NFKC', text: testText.normalize('NFKC') },
        { name: 'NFKD', text: testText.normalize('NFKD') },
      ];

      const results = [];
      for (const form of forms) {
        const xml = ` ${form.text} `;
        try {
          const parsed = await einvoice.parseXML(xml);
          const name = parsed?.CustomerName || '';
          results.push({
            form: form.name,
            success: true,
            preserved: name === form.text,
            normalized: name.normalize('NFC') === testText.normalize('NFC'),
            codePoints: Array.from(form.text).length,
            bytes: Buffer.from(form.text, 'utf8').length,
          });
        } catch (error) {
          results.push({ form: form.name, success: false, error: error.message });
        }
      }
      return results;
    }
  );

  normalizationForms.forEach(result => {
    t.ok(result.success, `Normalization form ${result.form} should be handled`);
    if (result.success) {
      t.ok(result.normalized, `Content should be comparable after normalization`);
    }
  });

  // Test 7: Homoglyphs and confusables
  const homoglyphsAndConfusables = await performanceTracker.measureAsync(
    'homoglyphs-and-confusables',
    async () => {
      const confusables = [
        {
          name: 'latin-cyrillic-mix',
          text: 'Invоicе Numbеr', // Contains Cyrillic о and е
          description: 'Mixed Latin and Cyrillic lookalikes',
        },
        {
          name: 'greek-latin-mix',
          text: 'Ιnvoice Νumber', // Greek Ι and Ν
          description: 'Greek letters that look like Latin',
        },
        {
          name: 'fullwidth-chars',
          // Written as escapes so the fullwidth forms (U+FF21.. / U+FF41..)
          // survive editors and tooling that fold them to ASCII; with plain
          // ASCII text this case could never be reported as detectable.
          text: '\uFF29\uFF4E\uFF56\uFF4F\uFF49\uFF43\uFF45 \uFF2E\uFF55\uFF4D\uFF42\uFF45\uFF52',
          description: 'Fullwidth characters',
        },
        {
          name: 'mathematical-alphanumeric',
          text: '𝐈𝐧𝐯𝐨𝐢𝐜𝐞 𝐍𝐮𝐦𝐛𝐞𝐫',
          description: 'Mathematical bold characters',
        },
      ];

      const results = [];
      for (const test of confusables) {
        const xml = ` ${test.text} ${test.description} `;
        try {
          const parsed = await einvoice.parseXML(xml);
          const id = parsed?.ID || '';

          // Check if system detects potential homoglyphs
          const hasNonASCII = /[^\x00-\x7F]/.test(id);
          const normalized = id.normalize('NFKC');

          results.push({
            name: test.name,
            success: true,
            preserved: id === test.text,
            hasNonASCII,
            normalized: normalized !== test.text,
            detectable: hasNonASCII || normalized !== test.text,
          });
        } catch (error) {
          results.push({ name: test.name, success: false, error: error.message });
        }
      }
      return results;
    }
  );

  homoglyphsAndConfusables.forEach(result => {
    t.ok(result.success, `Homoglyph test ${result.name} should be handled`);
    if (result.success) {
      t.ok(result.detectable, `Potential confusables should be detectable`);
    }
  });

  // Test 8: XML special characters in unusual encodings
  const xmlSpecialInEncodings = await performanceTracker.measureAsync(
    'xml-special-characters-in-encodings',
    async () => {
      // The five characters with predefined XML entities. `entity` holds the
      // escaped form so the 'entity' variant differs from the raw character.
      const specialChars = [
        { char: '<', entity: '&lt;', desc: 'less than' },
        { char: '>', entity: '&gt;', desc: 'greater than' },
        { char: '&', entity: '&amp;', desc: 'ampersand' },
        { char: '"', entity: '&quot;', desc: 'quote' },
        { char: "'", entity: '&apos;', desc: 'apostrophe' },
      ];

      const results = [];
      for (const special of specialChars) {
        // Three escaped spellings of the same character.
        // NOTE(review): the cdata value was an empty template literal in the
        // copy under review; wrapping the raw character in a CDATA section
        // is the presumed intent - confirm against upstream.
        const tests = [
          { type: 'entity', value: special.entity },
          { type: 'cdata', value: `<![CDATA[${special.char}]]>` },
          { type: 'numeric', value: `&#${special.char.charCodeAt(0)};` },
        ];

        for (const test of tests) {
          const xml = ` Price ${test.value} 100 `;
          try {
            const parsed = await einvoice.parseXML(xml);
            const desc = parsed?.Description || '';
            results.push({
              char: special.desc,
              method: test.type,
              success: true,
              containsChar: desc.includes(special.char),
              preserved: true,
            });
          } catch (error) {
            results.push({
              char: special.desc,
              method: test.type,
              success: false,
              error: error.message,
            });
          }
        }
      }
      return results;
    }
  );

  xmlSpecialInEncodings.forEach(result => {
    t.ok(result.success, `XML special ${result.char} as ${result.method} should be handled`);
  });

  // Test 9: Private use area characters
  const privateUseArea = await performanceTracker.measureAsync(
    'private-use-area-characters',
    async () => {
      const puaRanges = [
        { name: 'BMP-PUA', start: 0xE000, end: 0xF8FF },
        { name: 'Plane15-PUA', start: 0xF0000, end: 0xFFFFD },
        { name: 'Plane16-PUA', start: 0x100000, end: 0x10FFFD },
      ];

      const results = [];
      for (const range of puaRanges) {
        // Test a few characters from each range: first, midpoint and last.
        const testChars = [];
        testChars.push(String.fromCodePoint(range.start));
        testChars.push(String.fromCodePoint(Math.floor((range.start + range.end) / 2)));
        // Guard against String.fromCodePoint's upper limit.
        if (range.end <= 0x10FFFF) {
          testChars.push(String.fromCodePoint(range.end));
        }

        const testString = testChars.join('');
        const xml = ` ${testString} `;
        try {
          const parsed = await einvoice.parseXML(xml);
          const field = parsed?.CustomField || '';
          results.push({
            range: range.name,
            success: true,
            preserved: field === testString,
            charCount: testString.length,
            handled: true,
          });
        } catch (error) {
          results.push({ range: range.name, success: false, error: error.message });
        }
      }
      return results;
    }
  );

  privateUseArea.forEach(result => {
    t.ok(result.success || result.error, `PUA range ${result.range} was processed`);
  });

  // Test 10: Character set conversion in format transformation
  const formatTransformCharsets = await performanceTracker.measureAsync(
    'format-transform-charsets',
    async () => {
      const testContents = [
        { name: 'multilingual', text: 'Hello مرحبا 你好 Здравствуйте' },
        { name: 'symbols', text: '€ £ ¥ $ ₹ ₽ ¢ ₩' },
        { name: 'accented', text: 'àáäâ èéëê ìíïî òóöô ùúüû ñç' },
        { name: 'mixed-emoji', text: 'Invoice 📄 Total: 💰 Status: ✅' },
      ];

      const results = [];
      for (const content of testContents) {
        const ublInvoice = ` CHARSET-001 ${content.text} `;
        try {
          // Convert to CII
          const ciiResult = await einvoice.convertFormat(ublInvoice, 'cii');

          // Parse the converted result
          const parsed = await einvoice.parseDocument(ciiResult);

          // Check if content was preserved anywhere in the parsed structure
          const preserved = JSON.stringify(parsed).includes(content.text);

          results.push({
            content: content.name,
            success: true,
            preserved,
            formatConversionOk: true,
          });
        } catch (error) {
          results.push({ content: content.name, success: false, error: error.message });
        }
      }
      return results;
    }
  );

  formatTransformCharsets.forEach(result => {
    t.ok(result.success, `Format transform with ${result.content} should succeed`);
    if (result.success) {
      t.ok(result.preserved, `Character content should be preserved in transformation`);
    }
  });

  // Print performance summary
  performanceTracker.printSummary();
});

// Run the test
tap.start();