// einvoice/test/suite/einvoice_edge-cases/test.edge-04.unusual-charsets.ts

import { tap } from '@git.zone/tstest/tapbundle';
import * as plugins from '../plugins.js';
import { EInvoice } from '../../../ts/index.js';
import { PerformanceTracker } from '../performance.tracker.js';

// Shared tracker for this file: every scenario in the test below runs inside
// `measureAsync`, and `printSummary()` is called once at the end of the test.
const performanceTracker = new PerformanceTracker('EDGE-04: Unusual Character Sets');

tap.test('EDGE-04: Unusual Character Sets - should handle unusual and exotic character encodings', async (t) => {
  const einvoice = new EInvoice();

  // Test 1: Unicode edge cases
  // Each sample is embedded verbatim in the <ID> of a minimal invoice; the scenario records
  // whether parsing succeeds and whether the ID text comes back unchanged.
  const unicodeEdgeCases = await performanceTracker.measureAsync(
    'unicode-edge-cases',
    async () => {
      const samples = [
        { name: 'zero-width-characters', text: 'Invoice\u200B\u200C\u200D\uFEFFNumber', description: 'Zero-width spaces and joiners' },
        { name: 'right-to-left', text: 'مرحبا INV-001 שלום', description: 'RTL Arabic and Hebrew mixed with LTR' },
        { name: 'surrogate-pairs', text: '𝐇𝐞𝐥𝐥𝐨 😀 🎉 Invoice', description: 'Mathematical bold text and emojis' },
        { name: 'combining-characters', text: 'Ińvȯíçë̃ Nüm̈bër̊', description: 'Combining diacritical marks' },
        { name: 'control-characters', text: 'Invoice\x00\x01\x02\x1F\x7FTest', description: 'Control characters' },
        { name: 'bidi-override', text: '\u202Eتسا Invoice 123\u202C', description: 'Bidirectional override characters' }
      ];

      // Parses one sample and summarises the outcome; never rejects for parser errors.
      const inspect = async (sample: (typeof samples)[number]) => {
        const document = [
          '<?xml version="1.0" encoding="UTF-8"?>',
          '<Invoice>',
          `  <ID>${sample.text}</ID>`,
          `  <Description>${sample.description}</Description>`,
          '</Invoice>'
        ].join('\n');

        try {
          const invoice = await einvoice.parseXML(document);
          const roundTripped = invoice?.ID || '';
          const unchanged = roundTripped === sample.text;

          return {
            name: sample.name,
            success: true,
            preserved: unchanged,
            normalized: !unchanged,
            parsedValue: roundTripped,
            originalLength: sample.text.length,
            parsedLength: roundTripped.length
          };
        } catch (error) {
          return {
            name: sample.name,
            success: false,
            error: error.message
          };
        }
      };

      // Samples are processed one at a time, in declaration order.
      const outcomes = [];
      for (const sample of samples) {
        outcomes.push(await inspect(sample));
      }
      return outcomes;
    }
  );

  for (const outcome of unicodeEdgeCases) {
    t.ok(outcome.success, `Unicode edge case ${outcome.name} should be handled`);
  }

  // Test 2: Various character encodings
  // Serialises a small invoice in several encodings (with a BOM where one is listed) and
  // passes the raw bytes to parseDocument.
  const characterEncodings = await performanceTracker.measureAsync(
    'various-character-encodings',
    async () => {
      const encodings = [
        {
          encoding: 'UTF-8',
          bom: Buffer.from([0xEF, 0xBB, 0xBF]),
          text: 'Übung macht den Meister'
        },
        {
          encoding: 'UTF-16BE',
          bom: Buffer.from([0xFE, 0xFF]),
          text: 'Invoice \u4E2D\u6587'
        },
        {
          encoding: 'UTF-16LE',
          bom: Buffer.from([0xFF, 0xFE]),
          text: 'Facture française'
        },
        {
          encoding: 'ISO-8859-1',
          bom: null,
          text: 'Ñoño español'
        },
        {
          encoding: 'Windows-1252',
          bom: null,
          text: 'Smart "quotes" and —dashes'
        }
      ];

      // Windows-1252 differs from ISO-8859-1 only in 0x80–0x9F. Node's Buffer has no
      // Windows-1252 encoder, so the printable characters of that range are mapped by hand
      // (Unicode code point -> byte).
      const cp1252HighBytes = new Map<number, number>([
        [0x20AC, 0x80], [0x201A, 0x82], [0x0192, 0x83], [0x201E, 0x84], [0x2026, 0x85],
        [0x2020, 0x86], [0x2021, 0x87], [0x02C6, 0x88], [0x2030, 0x89], [0x0160, 0x8A],
        [0x2039, 0x8B], [0x0152, 0x8C], [0x017D, 0x8E], [0x2018, 0x91], [0x2019, 0x92],
        [0x201C, 0x93], [0x201D, 0x94], [0x2022, 0x95], [0x2013, 0x96], [0x2014, 0x97],
        [0x02DC, 0x98], [0x2122, 0x99], [0x0161, 0x9A], [0x203A, 0x9B], [0x0153, 0x9C],
        [0x017E, 0x9E], [0x0178, 0x9F]
      ]);

      // Encodes text into a single-byte charset, throwing on characters it cannot
      // represent instead of silently truncating them (as Buffer's 'latin1' would).
      const encodeSingleByte = (text: string, charset: 'ISO-8859-1' | 'Windows-1252'): Buffer => {
        const isCp1252 = charset === 'Windows-1252';
        const bytes: number[] = [];
        for (const char of text) {
          const codePoint = char.codePointAt(0) ?? 0;
          const remapped = isCp1252 ? cp1252HighBytes.get(codePoint) : undefined;
          const inC1Range = codePoint >= 0x80 && codePoint <= 0x9F;
          if (remapped !== undefined) {
            bytes.push(remapped);
          } else if (codePoint <= 0xFF && !(isCp1252 && inC1Range)) {
            bytes.push(codePoint);
          } else {
            const hex = codePoint.toString(16).toUpperCase().padStart(4, '0');
            throw new Error(`Cannot encode U+${hex} as ${charset}`);
          }
        }
        return Buffer.from(bytes);
      };

      // Maps the XML encoding label to actual bytes. The labels cannot be passed to
      // Buffer.from directly: 'utf-16be', 'iso8859-1' and 'windows1252' are not Buffer
      // encodings and make Buffer.from throw before the parser is ever reached.
      const encodeText = (text: string, encoding: string): Buffer => {
        switch (encoding) {
          case 'UTF-8':
            return Buffer.from(text, 'utf8');
          case 'UTF-16LE':
            return Buffer.from(text, 'utf16le');
          case 'UTF-16BE':
            // Buffer only produces little-endian UTF-16; swapping each byte pair gives BE.
            return Buffer.from(text, 'utf16le').swap16();
          case 'ISO-8859-1':
          case 'Windows-1252':
            return encodeSingleByte(text, encoding);
          default:
            throw new Error(`Unsupported test encoding: ${encoding}`);
        }
      };

      const results = [];

      for (const enc of encodings) {
        const xmlContent = `<?xml version="1.0" encoding="${enc.encoding}"?>
<Invoice>
  <ID>ENC-001</ID>
  <CustomerName>${enc.text}</CustomerName>
</Invoice>`;

        try {
          // Encoding stays inside the try so an unencodable sample is recorded as a
          // failure for that encoding rather than aborting the whole scenario.
          const payload = encodeText(xmlContent, enc.encoding);
          const buffer = enc.bom ? Buffer.concat([enc.bom, payload]) : payload;

          const parsed = await einvoice.parseDocument(buffer);

          results.push({
            encoding: enc.encoding,
            success: true,
            hasBOM: !!enc.bom,
            textPreserved: parsed?.CustomerName === enc.text
          });
        } catch (error) {
          results.push({
            encoding: enc.encoding,
            success: false,
            error: error instanceof Error ? error.message : String(error)
          });
        }
      }

      return results;
    }
  );

  // Deliberately lenient: passes whether the document was parsed or rejected with an error.
  // `textPreserved` is recorded but not asserted.
  characterEncodings.forEach(result => {
    t.ok(result.success || result.error, `Encoding ${result.encoding} was processed`);
  });

  // Test 3: Emoji and pictographic characters
  // Places emoji-heavy text in a <Note> element and checks that the parsed note equals
  // the input; grapheme, code point and byte counts are recorded alongside.
  const emojiAndPictographs = await performanceTracker.measureAsync(
    'emoji-and-pictographs',
    async () => {
      const samples = [
        { name: 'basic-emoji', content: 'Invoice 📧 sent ✅' },
        { name: 'flag-emoji', content: 'Country: 🇺🇸 🇬🇧 🇩🇪 🇫🇷' },
        { name: 'skin-tone-emoji', content: 'Approved by 👍🏻👍🏼👍🏽👍🏾👍🏿' },
        { name: 'zwj-sequences', content: 'Family: 👨‍👩‍👧‍👦' },
        { name: 'mixed-emoji-text', content: '💰 Total: €1,234.56 💶' }
      ];

      // Number of grapheme clusters (user-perceived characters) in a string.
      const countGraphemes = (value: string) => [...new Intl.Segmenter().segment(value)].length;

      const check = async (sample: (typeof samples)[number]) => {
        const document = [
          '<?xml version="1.0" encoding="UTF-8"?>',
          '<Invoice>',
          '  <ID>EMOJI-001</ID>',
          `  <Note>${sample.content}</Note>`,
          '</Invoice>'
        ].join('\n');

        try {
          const invoice = await einvoice.parseXML(document);
          const note = invoice?.Note || '';

          const originalGraphemes = countGraphemes(sample.content);
          const preservedGraphemes = countGraphemes(note);

          return {
            name: sample.name,
            success: true,
            preserved: note === sample.content,
            originalGraphemes,
            preservedGraphemes,
            codePointCount: Array.from(sample.content).length,
            byteLength: Buffer.from(sample.content, 'utf8').length
          };
        } catch (error) {
          return {
            name: sample.name,
            success: false,
            error: error.message
          };
        }
      };

      const outcomes = [];
      for (const sample of samples) {
        outcomes.push(await check(sample));
      }
      return outcomes;
    }
  );

  for (const outcome of emojiAndPictographs) {
    t.ok(outcome.success, `Emoji test ${outcome.name} should succeed`);
    if (!outcome.success) {
      continue;
    }
    t.ok(outcome.preserved, `Emoji content should be preserved`);
  }

  // Test 4: Legacy and exotic scripts
  // Puts text from a range of writing systems into <Description> and checks that the
  // parsed description equals the input.
  const exoticScripts = await performanceTracker.measureAsync(
    'exotic-scripts',
    async () => {
      const samples = [
        { name: 'chinese-traditional', text: '發票編號：貳零貳肆' },
        { name: 'japanese-mixed', text: '請求書番号：２０２４年' },
        { name: 'korean', text: '송장 번호: 2024' },
        { name: 'thai', text: 'ใบแจ้งหนี้: ๒๐๒๔' },
        { name: 'devanagari', text: 'चालान संख्या: २०२४' },
        { name: 'cyrillic', text: 'Счёт-фактура № 2024' },
        { name: 'greek', text: 'Τιμολόγιο: ΜΜΚΔ' },
        { name: 'ethiopic', text: 'ቁጥር: ፪፻፳፬' },
        { name: 'bengali', text: 'চালান নং: ২০২৪' },
        { name: 'tamil', text: 'விலைப்பட்டியல்: ௨௦௨௪' }
      ];

      const check = async ({ name, text }: (typeof samples)[number]) => {
        const document = [
          '<?xml version="1.0" encoding="UTF-8"?>',
          '<Invoice>',
          `  <ID>SCRIPT-${name}</ID>`,
          `  <Description>${text}</Description>`,
          '</Invoice>'
        ].join('\n');

        try {
          const invoice = await einvoice.parseXML(document);
          const parsedText = invoice?.Description || '';

          return {
            script: name,
            success: true,
            preserved: parsedText === text,
            charCount: text.length,
            byteCount: Buffer.from(text, 'utf8').length
          };
        } catch (error) {
          return {
            script: name,
            success: false,
            error: error.message
          };
        }
      };

      const outcomes = [];
      for (const sample of samples) {
        outcomes.push(await check(sample));
      }
      return outcomes;
    }
  );

  for (const outcome of exoticScripts) {
    t.ok(outcome.success, `Script ${outcome.script} should be handled`);
    if (!outcome.success) {
      continue;
    }
    t.ok(outcome.preserved, `Script ${outcome.script} content should be preserved`);
  }

  // Test 5: Invalid UTF-8 sequences
  // Splices malformed byte sequences into the <ID> of a document declared as UTF-8 and
  // records whether parseDocument returns a result or throws. Either outcome counts as
  // "handled".
  const invalidUTF8 = await performanceTracker.measureAsync(
    'invalid-utf8-sequences',
    async () => {
      const invalidSequences = [
        {
          name: 'orphan-continuation',
          bytes: Buffer.from([0x80, 0x81, 0x82])
        },
        {
          name: 'incomplete-sequence',
          bytes: Buffer.from([0xC2])
        },
        {
          name: 'overlong-encoding',
          bytes: Buffer.from([0xC0, 0x80])
        },
        {
          name: 'invalid-start',
          bytes: Buffer.from([0xF8, 0x80, 0x80, 0x80])
        },
        {
          name: 'mixed-valid-invalid',
          bytes: Buffer.concat([
            Buffer.from('Valid '),
            Buffer.from([0xFF, 0xFE]),
            Buffer.from(' Text')
          ])
        }
      ];

      // The XML wrapper is the same for every case, so build it once.
      const xmlStart = Buffer.from('<?xml version="1.0" encoding="UTF-8"?><Invoice><ID>');
      const xmlEnd = Buffer.from('</ID></Invoice>');

      const results = [];

      for (const seq of invalidSequences) {
        const fullBuffer = Buffer.concat([xmlStart, seq.bytes, xmlEnd]);

        try {
          const parsed = await einvoice.parseDocument(fullBuffer);
          // Assumes the parsed document exposes the <ID> text as `ID`, as the parseXML
          // scenarios in this file do — TODO confirm for parseDocument.
          const idValue = parsed?.ID;

          results.push({
            name: seq.name,
            handled: true,
            recovered: !!parsed,
            // Only true when U+FFFD (the Unicode replacement character) is actually
            // present in the parsed ID, instead of being assumed.
            replacedWithPlaceholder: typeof idValue === 'string' && idValue.includes('\uFFFD')
          });
        } catch (error) {
          results.push({
            name: seq.name,
            handled: true,
            rejected: true,
            error: error instanceof Error ? error.message : String(error)
          });
        }
      }

      return results;
    }
  );

  // `handled` is set on both the success and the rejection path, so this asserts only that
  // every sequence produced a result entry.
  invalidUTF8.forEach(result => {
    t.ok(result.handled, `Invalid UTF-8 ${result.name} was handled`);
  });

  // Test 6: Normalization forms
  // Feeds the same word in each Unicode normalization form through the parser and checks
  // that the parsed value, once NFC-normalized, matches the NFC form of the reference.
  const normalizationForms = await performanceTracker.measureAsync(
    'unicode-normalization-forms',
    async () => {
      const testText = 'Café'; // Can be represented differently
      const referenceNfc = testText.normalize('NFC');
      const variants = (['NFC', 'NFD', 'NFKC', 'NFKD'] as const).map(formName => ({
        name: formName,
        text: testText.normalize(formName)
      }));

      const check = async (variant: (typeof variants)[number]) => {
        const document = [
          '<?xml version="1.0" encoding="UTF-8"?>',
          '<Invoice>',
          `  <CustomerName>${variant.text}</CustomerName>`,
          '</Invoice>'
        ].join('\n');

        try {
          const invoice = await einvoice.parseXML(document);
          const customerName = invoice?.CustomerName || '';

          return {
            form: variant.name,
            success: true,
            preserved: customerName === variant.text,
            normalized: customerName.normalize('NFC') === referenceNfc,
            codePoints: Array.from(variant.text).length,
            bytes: Buffer.from(variant.text, 'utf8').length
          };
        } catch (error) {
          return {
            form: variant.name,
            success: false,
            error: error.message
          };
        }
      };

      const outcomes = [];
      for (const variant of variants) {
        outcomes.push(await check(variant));
      }
      return outcomes;
    }
  );

  for (const outcome of normalizationForms) {
    t.ok(outcome.success, `Normalization form ${outcome.form} should be handled`);
    if (!outcome.success) {
      continue;
    }
    t.ok(outcome.normalized, `Content should be comparable after normalization`);
  }

  // Test 7: Homoglyphs and confusables
  // Uses look-alike characters in the <ID> and records whether the parsed value can be
  // told apart from plain ASCII (non-ASCII content, or NFKC output differing from the input).
  const homoglyphsAndConfusables = await performanceTracker.measureAsync(
    'homoglyphs-and-confusables',
    async () => {
      const samples = [
        // Contains Cyrillic о and е
        { name: 'latin-cyrillic-mix', text: 'Invоicе Numbеr', description: 'Mixed Latin and Cyrillic lookalikes' },
        // Greek Ι and Ν
        { name: 'greek-latin-mix', text: 'Ιnvoice Νumber', description: 'Greek letters that look like Latin' },
        { name: 'fullwidth-chars', text: 'Ｉｎｖｏｉｃｅ　Ｎｕｍｂｅｒ', description: 'Fullwidth characters' },
        { name: 'mathematical-alphanumeric', text: '𝐈𝐧𝐯𝐨𝐢𝐜𝐞 𝐍𝐮𝐦𝐛𝐞𝐫', description: 'Mathematical bold characters' }
      ];

      // Matches any character outside the 7-bit ASCII range.
      const nonAsciiPattern = /[^\x00-\x7F]/;

      const check = async (sample: (typeof samples)[number]) => {
        const document = [
          '<?xml version="1.0" encoding="UTF-8"?>',
          '<Invoice>',
          `  <ID>${sample.text}</ID>`,
          `  <Note>${sample.description}</Note>`,
          '</Invoice>'
        ].join('\n');

        try {
          const invoice = await einvoice.parseXML(document);
          const parsedId = invoice?.ID || '';

          const hasNonASCII = nonAsciiPattern.test(parsedId);
          const differsAfterNfkc = parsedId.normalize('NFKC') !== sample.text;

          return {
            name: sample.name,
            success: true,
            preserved: parsedId === sample.text,
            hasNonASCII,
            normalized: differsAfterNfkc,
            detectable: hasNonASCII || differsAfterNfkc
          };
        } catch (error) {
          return {
            name: sample.name,
            success: false,
            error: error.message
          };
        }
      };

      const outcomes = [];
      for (const sample of samples) {
        outcomes.push(await check(sample));
      }
      return outcomes;
    }
  );

  for (const outcome of homoglyphsAndConfusables) {
    t.ok(outcome.success, `Homoglyph test ${outcome.name} should be handled`);
    if (!outcome.success) {
      continue;
    }
    t.ok(outcome.detectable, `Potential confusables should be detectable`);
  }

  // Test 8: XML special characters in unusual encodings
  // Embeds each XML-special character in <Description> using three escaping mechanisms and
  // records whether the parsed text contains the literal character again.
  // All documents in this scenario are declared as UTF-8.
  const xmlSpecialInEncodings = await performanceTracker.measureAsync(
    'xml-special-characters-in-encodings',
    async () => {
      const specialChars = [
        { char: '<', entity: '&lt;', desc: 'less than' },
        { char: '>', entity: '&gt;', desc: 'greater than' },
        { char: '&', entity: '&amp;', desc: 'ampersand' },
        { char: '"', entity: '&quot;', desc: 'quote' },
        { char: "'", entity: '&apos;', desc: 'apostrophe' }
      ];

      const results = [];

      for (const special of specialChars) {
        // Named entity, CDATA section and decimal character reference. The raw character
        // is not tested here.
        const tests = [
          { type: 'entity', value: special.entity },
          { type: 'cdata', value: `<![CDATA[${special.char}]]>` },
          { type: 'numeric', value: `&#${special.char.charCodeAt(0)};` }
        ];

        // Text the description should decode to, whichever escaping was used.
        const expectedText = `Price ${special.char} 100`;

        for (const test of tests) {
          const xml = `<?xml version="1.0" encoding="UTF-8"?>
<Invoice>
  <Description>Price ${test.value} 100</Description>
</Invoice>`;

          try {
            const parsed = await einvoice.parseXML(xml);
            const desc = parsed?.Description || '';

            results.push({
              char: special.desc,
              method: test.type,
              success: true,
              containsChar: desc.includes(special.char),
              // Computed from the parsed value rather than assumed.
              preserved: desc === expectedText
            });
          } catch (error) {
            results.push({
              char: special.desc,
              method: test.type,
              success: false,
              error: error instanceof Error ? error.message : String(error)
            });
          }
        }
      }

      return results;
    }
  );

  // Only parse success is asserted; `containsChar` and `preserved` are recorded for inspection.
  xmlSpecialInEncodings.forEach(result => {
    t.ok(result.success, `XML special ${result.char} as ${result.method} should be handled`);
  });

  // Test 9: Private use area characters
  // Samples the first, middle and last code point of each Unicode private-use range and
  // records whether they come back unchanged from a <CustomField> element.
  const privateUseArea = await performanceTracker.measureAsync(
    'private-use-area-characters',
    async () => {
      const puaRanges = [
        { name: 'BMP-PUA', start: 0xE000, end: 0xF8FF },
        { name: 'Plane15-PUA', start: 0xF0000, end: 0xFFFFD },
        { name: 'Plane16-PUA', start: 0x100000, end: 0x10FFFD }
      ];

      const results = [];

      for (const range of puaRanges) {
        // Every range above ends at or below U+10FFFD, so all three sample points are
        // valid arguments for String.fromCodePoint.
        const midpoint = Math.floor((range.start + range.end) / 2);
        const testString = String.fromCodePoint(range.start, midpoint, range.end);

        const xml = `<?xml version="1.0" encoding="UTF-8"?>
<Invoice>
  <CustomField>${testString}</CustomField>
</Invoice>`;

        try {
          const parsed = await einvoice.parseXML(xml);
          const field = parsed?.CustomField || '';

          results.push({
            range: range.name,
            success: true,
            preserved: field === testString,
            // Count code points, not UTF-16 code units: plane 15/16 characters occupy two
            // code units each, so `.length` would report 6 for these three characters.
            charCount: Array.from(testString).length,
            handled: true
          });
        } catch (error) {
          results.push({
            range: range.name,
            success: false,
            error: error instanceof Error ? error.message : String(error)
          });
        }
      }

      return results;
    }
  );

  // Lenient on purpose: passes whether the characters were parsed or rejected with an error.
  privateUseArea.forEach(result => {
    t.ok(result.success || result.error, `PUA range ${result.range} was processed`);
  });

  // Test 10: Character set conversion in format transformation
  // Converts a UBL invoice carrying non-ASCII text in <Note> to CII, parses the converted
  // output, and checks that the original text appears in the JSON form of the parse result.
  const formatTransformCharsets = await performanceTracker.measureAsync(
    'format-transform-charsets',
    async () => {
      const samples = [
        { name: 'multilingual', text: 'Hello مرحبا 你好 Здравствуйте' },
        { name: 'symbols', text: '€ £ ¥ $ ₹ ₽ ¢ ₩' },
        { name: 'accented', text: 'àáäâ èéëê ìíïî òóöô ùúüû ñç' },
        { name: 'mixed-emoji', text: 'Invoice 📄 Total: 💰 Status: ✅' }
      ];

      const roundTrip = async (sample: (typeof samples)[number]) => {
        const ublInvoice = [
          '<?xml version="1.0" encoding="UTF-8"?>',
          '<Invoice xmlns="urn:oasis:names:specification:ubl:schema:xsd:Invoice-2">',
          '  <ID>CHARSET-001</ID>',
          `  <Note>${sample.text}</Note>`,
          '</Invoice>'
        ].join('\n');

        try {
          // UBL -> CII, then parse the converted document again.
          const asCii = await einvoice.convertFormat(ublInvoice, 'cii');
          const reparsed = await einvoice.parseDocument(asCii);

          // The text counts as preserved if it occurs anywhere in the serialized result.
          const serialized = JSON.stringify(reparsed);

          return {
            content: sample.name,
            success: true,
            preserved: serialized.includes(sample.text),
            formatConversionOk: true
          };
        } catch (error) {
          return {
            content: sample.name,
            success: false,
            error: error.message
          };
        }
      };

      const outcomes = [];
      for (const sample of samples) {
        outcomes.push(await roundTrip(sample));
      }
      return outcomes;
    }
  );

  for (const outcome of formatTransformCharsets) {
    t.ok(outcome.success, `Format transform with ${outcome.content} should succeed`);
    if (!outcome.success) {
      continue;
    }
    t.ok(outcome.preserved, `Character content should be preserved in transformation`);
  }

  // Print performance summary
  performanceTracker.printSummary();
});

// Start the tap runner so the test registered above is executed.
// NOTE(review): the returned value is not awaited or handled here — confirm that is intended.
tap.start();