einvoice/test/suite/einvoice_edge-cases/test.edge-04.unusual-charsets.ts
2025-05-26 04:04:51 +00:00

656 lines
19 KiB
TypeScript
Raw Blame History

This file contains invisible Unicode characters

This file contains invisible Unicode characters that are indistinguishable to humans but may be processed differently by a computer. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

import { tap } from '@git.zone/tstest/tapbundle';
import * as plugins from '../plugins.js';
import { EInvoice } from '../../../ts/index.js';
import { PerformanceTracker } from '../performance.tracker.js';
// Shared tracker for this suite: every sub-test below wraps its work in
// performanceTracker.measureAsync(...), and printSummary() is called at the end.
const performanceTracker = new PerformanceTracker('EDGE-04: Unusual Character Sets');
tap.test('EDGE-04: Unusual Character Sets - should handle unusual and exotic character encodings', async (t) => {
const einvoice = new EInvoice();
// Test 1: Unicode edge cases
// Each payload is interpolated into <ID>; per case we record whether parsing
// succeeded and whether the parsed ID is identical to the original text.
// The awaited value is used below as the array returned by the callback.
const unicodeEdgeCases = await performanceTracker.measureAsync(
  'unicode-edge-cases',
  async () => {
    const testCases = [
      {
        name: 'zero-width-characters',
        text: 'Invoice\u200B\u200C\u200D\uFEFFNumber',
        description: 'Zero-width spaces and joiners'
      },
      {
        name: 'right-to-left',
        text: 'مرحبا INV-001 שלום',
        description: 'RTL Arabic and Hebrew mixed with LTR'
      },
      {
        name: 'surrogate-pairs',
        text: '𝐇𝐞𝐥𝐥𝐨 😀 🎉 Invoice',
        description: 'Mathematical bold text and emojis'
      },
      {
        name: 'combining-characters',
        text: 'Ińvȯíçë̃ Nüm̈bër̊',
        description: 'Combining diacritical marks'
      },
      {
        // NOTE(review): NUL and most C0 controls are not legal XML 1.0
        // characters, so a strict parser may reject this document while the
        // assertion below requires success — confirm the intended behaviour.
        name: 'control-characters',
        text: 'Invoice\x00\x01\x02\x1F\x7FTest',
        description: 'Control characters'
      },
      {
        name: 'bidi-override',
        text: '\u202Eتسا Invoice 123\u202C',
        description: 'Bidirectional override characters'
      }
    ];
    const results = [];
    for (const testCase of testCases) {
      const xml = `<?xml version="1.0" encoding="UTF-8"?>
<Invoice>
<ID>${testCase.text}</ID>
<Description>${testCase.description}</Description>
</Invoice>`;
      try {
        const parsed = await einvoice.parseXML(xml);
        // Fall back to '' so the length/equality bookkeeping never sees undefined.
        const idValue = parsed?.ID || '';
        const unchanged = idValue === testCase.text;
        results.push({
          name: testCase.name,
          success: true,
          preserved: unchanged,
          normalized: !unchanged,
          parsedValue: idValue,
          originalLength: testCase.text.length,
          parsedLength: idValue.length
        });
      } catch (error: unknown) {
        results.push({
          name: testCase.name,
          success: false,
          // Thrown values are not guaranteed to be Error instances; narrow
          // before reading .message so a thrown string/null cannot crash here.
          error: error instanceof Error ? error.message : String(error)
        });
      }
    }
    return results;
  }
);
unicodeEdgeCases.forEach(result => {
  t.ok(result.success, `Unicode edge case ${result.name} should be handled`);
});
// Test 2: Various character encodings
// Serialises the same small invoice in several encodings (with a BOM where one
// is listed) and feeds the raw bytes to einvoice.parseDocument().
//
// Node's Buffer only understands a handful of encoding names (utf8, utf16le,
// latin1, ascii, ...). The previous code passed 'utf-16be', 'iso8859-1' and
// 'windows1252' to Buffer.from(), which throws ERR_UNKNOWN_ENCODING, so those
// cases never reached the parser. The bytes are now produced explicitly.
const characterEncodings = await performanceTracker.measureAsync(
  'various-character-encodings',
  async () => {
    // Windows-1252 differs from ISO-8859-1 only in 0x80-0x9F; this maps the
    // Unicode code points that live in that byte range (code point -> byte).
    const cp1252Extras: ReadonlyMap<number, number> = new Map<number, number>([
      [0x20AC, 0x80], [0x201A, 0x82], [0x0192, 0x83], [0x201E, 0x84],
      [0x2026, 0x85], [0x2020, 0x86], [0x2021, 0x87], [0x02C6, 0x88],
      [0x2030, 0x89], [0x0160, 0x8A], [0x2039, 0x8B], [0x0152, 0x8C],
      [0x017D, 0x8E], [0x2018, 0x91], [0x2019, 0x92], [0x201C, 0x93],
      [0x201D, 0x94], [0x2022, 0x95], [0x2013, 0x96], [0x2014, 0x97],
      [0x02DC, 0x98], [0x2122, 0x99], [0x0161, 0x9A], [0x203A, 0x9B],
      [0x0153, 0x9C], [0x017E, 0x9E], [0x0178, 0x9F]
    ]);

    // Encodes text into a single-byte charset. `extras` is the table for the
    // 0x80-0x9F range (null means plain ISO-8859-1, where bytes == code points).
    const toSingleByte = (
      text: string,
      label: string,
      extras: ReadonlyMap<number, number> | null
    ): Buffer => {
      const bytes = Array.from(text, (ch) => {
        const codePoint = ch.codePointAt(0) ?? 0;
        const special = extras?.get(codePoint);
        if (special !== undefined) {
          return special;
        }
        const isC1 = codePoint >= 0x80 && codePoint <= 0x9F;
        if (codePoint <= 0xFF && !(extras && isC1)) {
          return codePoint;
        }
        const hex = codePoint.toString(16).toUpperCase().padStart(4, '0');
        throw new Error(`Character U+${hex} cannot be encoded as ${label}`);
      });
      return Buffer.from(bytes);
    };

    // Turns the XML text into bytes for the declared encoding label.
    const encodeXml = (text: string, encoding: string): Buffer => {
      switch (encoding) {
        case 'UTF-8':
          return Buffer.from(text, 'utf8');
        case 'UTF-16LE':
          return Buffer.from(text, 'utf16le');
        case 'UTF-16BE':
          // Node has no big-endian encoder: encode LE, then swap byte pairs.
          return Buffer.from(text, 'utf16le').swap16();
        case 'ISO-8859-1':
          return toSingleByte(text, encoding, null);
        case 'Windows-1252':
          return toSingleByte(text, encoding, cp1252Extras);
        default:
          throw new Error(`Unsupported test encoding: ${encoding}`);
      }
    };

    const encodings = [
      {
        encoding: 'UTF-8',
        bom: Buffer.from([0xEF, 0xBB, 0xBF]),
        text: 'Übung macht den Meister'
      },
      {
        encoding: 'UTF-16BE',
        bom: Buffer.from([0xFE, 0xFF]),
        text: 'Invoice \u4E2D\u6587'
      },
      {
        encoding: 'UTF-16LE',
        bom: Buffer.from([0xFF, 0xFE]),
        text: 'Facture française'
      },
      {
        encoding: 'ISO-8859-1',
        bom: null,
        text: 'Ñoño español'
      },
      {
        encoding: 'Windows-1252',
        bom: null,
        text: 'Smart "quotes" and —dashes'
      }
    ];
    const results = [];
    for (const enc of encodings) {
      const xmlContent = `<?xml version="1.0" encoding="${enc.encoding}"?>
<Invoice>
<ID>ENC-001</ID>
<CustomerName>${enc.text}</CustomerName>
</Invoice>`;
      try {
        // Encode the document, prefixing the BOM when the case defines one.
        const encoded = encodeXml(xmlContent, enc.encoding);
        const buffer = enc.bom ? Buffer.concat([enc.bom, encoded]) : encoded;
        const parsed = await einvoice.parseDocument(buffer);
        results.push({
          encoding: enc.encoding,
          success: true,
          hasBOM: !!enc.bom,
          textPreserved: parsed?.CustomerName === enc.text
        });
      } catch (error: unknown) {
        results.push({
          encoding: enc.encoding,
          success: false,
          // Narrow before reading .message; thrown values may not be Errors.
          error: error instanceof Error ? error.message : String(error)
        });
      }
    }
    return results;
  }
);
characterEncodings.forEach(result => {
  // Either outcome is accepted here: parsed successfully, or rejected with a message.
  t.ok(result.success || result.error, `Encoding ${result.encoding} was processed`);
});
// Test 3: Emoji and pictographic characters
// Places emoji-heavy text in <Note> and checks that parsing succeeds and the
// note round-trips unchanged. Grapheme, code point and byte counts are
// recorded for diagnostics only; they are not asserted.
const emojiAndPictographs = await performanceTracker.measureAsync(
  'emoji-and-pictographs',
  async () => {
    const emojiTests = [
      {
        name: 'basic-emoji',
        content: 'Invoice 📧 sent ✅'
      },
      {
        name: 'flag-emoji',
        content: 'Country: 🇺🇸 🇬🇧 🇩🇪 🇫🇷'
      },
      {
        name: 'skin-tone-emoji',
        content: 'Approved by 👍🏻👍🏼👍🏽👍🏾👍🏿'
      },
      {
        name: 'zwj-sequences',
        content: 'Family: 👨‍👩‍👧‍👦'
      },
      {
        name: 'mixed-emoji-text',
        content: '💰 Total: €1,234.56 💶'
      }
    ];
    const results = [];
    for (const test of emojiTests) {
      const xml = `<?xml version="1.0" encoding="UTF-8"?>
<Invoice>
<ID>EMOJI-001</ID>
<Note>${test.content}</Note>
</Invoice>`;
      try {
        const parsed = await einvoice.parseXML(xml);
        const noteValue = parsed?.Note || '';
        // Count grapheme clusters (visual characters). The segmenter is built
        // inside the try so a runtime without Intl.Segmenter is reported as a
        // failed case rather than aborting the whole test; one instance is
        // reused for both strings.
        const segmenter = new Intl.Segmenter();
        const graphemeCount = [...segmenter.segment(test.content)].length;
        const preservedGraphemes = [...segmenter.segment(noteValue)].length;
        results.push({
          name: test.name,
          success: true,
          preserved: noteValue === test.content,
          originalGraphemes: graphemeCount,
          preservedGraphemes,
          codePointCount: Array.from(test.content).length,
          byteLength: Buffer.from(test.content, 'utf8').length
        });
      } catch (error: unknown) {
        results.push({
          name: test.name,
          success: false,
          // Narrow before reading .message; thrown values may not be Errors.
          error: error instanceof Error ? error.message : String(error)
        });
      }
    }
    return results;
  }
);
emojiAndPictographs.forEach(result => {
  t.ok(result.success, `Emoji test ${result.name} should succeed`);
  if (result.success) {
    t.ok(result.preserved, `Emoji content should be preserved`);
  }
});
// Test 4: Legacy and exotic scripts
// Puts text from a range of writing systems into <Description> and requires
// both a successful parse and an unchanged description.
const exoticScripts = await performanceTracker.measureAsync(
  'exotic-scripts',
  async () => {
    const scripts = [
      { name: 'chinese-traditional', text: '發票編號:貳零貳肆' },
      { name: 'japanese-mixed', text: '請求書番号:2024年' },
      { name: 'korean', text: '송장 번호: 2024' },
      { name: 'thai', text: 'ใบแจ้งหนี้: ๒๐๒๔' },
      { name: 'devanagari', text: 'चालान संख्या: २०२४' },
      { name: 'cyrillic', text: 'Счёт-фактура № 2024' },
      { name: 'greek', text: 'Τιμολόγιο: ΜΜΚΔ' },
      { name: 'ethiopic', text: 'ቁጥር: ፪፻፳፬' },
      { name: 'bengali', text: 'চালান নং: ২০২৪' },
      { name: 'tamil', text: 'விலைப்பட்டியல்: ௨௦௨௪' }
    ];
    const results = [];
    for (const script of scripts) {
      const xml = `<?xml version="1.0" encoding="UTF-8"?>
<Invoice>
<ID>SCRIPT-${script.name}</ID>
<Description>${script.text}</Description>
</Invoice>`;
      try {
        const parsed = await einvoice.parseXML(xml);
        const description = parsed?.Description || '';
        results.push({
          script: script.name,
          success: true,
          preserved: description === script.text,
          // UTF-16 code units vs. UTF-8 bytes, kept for diagnostics only.
          charCount: script.text.length,
          byteCount: Buffer.from(script.text, 'utf8').length
        });
      } catch (error: unknown) {
        results.push({
          script: script.name,
          success: false,
          // Narrow before reading .message; thrown values may not be Errors.
          error: error instanceof Error ? error.message : String(error)
        });
      }
    }
    return results;
  }
);
exoticScripts.forEach(result => {
  t.ok(result.success, `Script ${result.script} should be handled`);
  if (result.success) {
    t.ok(result.preserved, `Script ${result.script} content should be preserved`);
  }
});
// Test 5: Invalid UTF-8 sequences
// Splices malformed byte sequences into the <ID> element of a document that
// declares UTF-8 and passes the raw bytes to einvoice.parseDocument().
// Both outcomes (a parse result or a thrown error) are recorded as handled.
const invalidUTF8 = await performanceTracker.measureAsync(
  'invalid-utf8-sequences',
  async () => {
    const invalidSequences = [
      {
        name: 'orphan-continuation',
        bytes: Buffer.from([0x80, 0x81, 0x82])
      },
      {
        name: 'incomplete-sequence',
        bytes: Buffer.from([0xC2])
      },
      {
        name: 'overlong-encoding',
        bytes: Buffer.from([0xC0, 0x80])
      },
      {
        name: 'invalid-start',
        bytes: Buffer.from([0xF8, 0x80, 0x80, 0x80])
      },
      {
        name: 'mixed-valid-invalid',
        bytes: Buffer.concat([
          Buffer.from('Valid '),
          Buffer.from([0xFF, 0xFE]),
          Buffer.from(' Text')
        ])
      }
    ];
    // The wrapper bytes are identical for every case, so build them once.
    const xmlStart = Buffer.from('<?xml version="1.0" encoding="UTF-8"?><Invoice><ID>');
    const xmlEnd = Buffer.from('</ID></Invoice>');
    const results = [];
    for (const seq of invalidSequences) {
      const fullBuffer = Buffer.concat([xmlStart, seq.bytes, xmlEnd]);
      try {
        const parsed = await einvoice.parseDocument(fullBuffer);
        results.push({
          name: seq.name,
          handled: true,
          recovered: !!parsed,
          // NOTE(review): hard-coded; the parsed output is not inspected for a
          // replacement character, so this flag is informational only.
          replacedWithPlaceholder: true
        });
      } catch (error: unknown) {
        // A rejection is an acceptable way to deal with malformed input.
        results.push({
          name: seq.name,
          handled: true,
          rejected: true,
          // Narrow before reading .message; thrown values may not be Errors.
          error: error instanceof Error ? error.message : String(error)
        });
      }
    }
    return results;
  }
);
invalidUTF8.forEach(result => {
  t.ok(result.handled, `Invalid UTF-8 ${result.name} was handled`);
});
// Test 6: Normalization forms
// Feeds the same word in NFC, NFD, NFKC and NFKD and checks that whatever the
// parser returns is equal to the reference once both sides are NFC-normalised.
const normalizationForms = await performanceTracker.measureAsync(
  'unicode-normalization-forms',
  async () => {
    const testText = 'Café'; // Can be represented differently
    // Reference value for the comparison; identical for every form.
    const referenceNfc = testText.normalize('NFC');
    const forms = [
      { name: 'NFC', text: testText.normalize('NFC') },
      { name: 'NFD', text: testText.normalize('NFD') },
      { name: 'NFKC', text: testText.normalize('NFKC') },
      { name: 'NFKD', text: testText.normalize('NFKD') }
    ];
    const results = [];
    for (const form of forms) {
      const xml = `<?xml version="1.0" encoding="UTF-8"?>
<Invoice>
<CustomerName>${form.text}</CustomerName>
</Invoice>`;
      try {
        const parsed = await einvoice.parseXML(xml);
        const name = parsed?.CustomerName || '';
        results.push({
          form: form.name,
          success: true,
          // Exact round-trip of the submitted form (recorded, not asserted).
          preserved: name === form.text,
          // Equivalence after normalising both sides to NFC (asserted below).
          normalized: name.normalize('NFC') === referenceNfc,
          codePoints: Array.from(form.text).length,
          bytes: Buffer.from(form.text, 'utf8').length
        });
      } catch (error: unknown) {
        results.push({
          form: form.name,
          success: false,
          // Narrow before reading .message; thrown values may not be Errors.
          error: error instanceof Error ? error.message : String(error)
        });
      }
    }
    return results;
  }
);
normalizationForms.forEach(result => {
  t.ok(result.success, `Normalization form ${result.form} should be handled`);
  if (result.success) {
    t.ok(result.normalized, `Content should be comparable after normalization`);
  }
});
// Test 7: Homoglyphs and confusables
// Uses IDs built from look-alike characters and checks that the parsed value
// is still distinguishable from plain ASCII (non-ASCII present, or changed by
// NFKC normalisation).
//
// The look-alike characters are written as \u escapes so they cannot be
// silently replaced by their Latin twins in editors or copy/paste. The
// fullwidth case previously contained only whitespace and therefore did not
// exercise fullwidth letters at all; it now spells "Invoice Number" in
// fullwidth forms (ASCII + 0xFEE0, with U+3000 as the space).
const homoglyphsAndConfusables = await performanceTracker.measureAsync(
  'homoglyphs-and-confusables',
  async () => {
    const confusables = [
      {
        name: 'latin-cyrillic-mix',
        // "Invoice Number" with Cyrillic о (U+043E) and е (U+0435)
        text: 'Inv\u043Eic\u0435 Numb\u0435r',
        description: 'Mixed Latin and Cyrillic lookalikes'
      },
      {
        name: 'greek-latin-mix',
        // "Invoice Number" with Greek Ι (U+0399) and Ν (U+039D)
        text: '\u0399nvoice \u039Dumber',
        description: 'Greek letters that look like Latin'
      },
      {
        name: 'fullwidth-chars',
        // Fullwidth "Invoice Number" separated by an ideographic space
        text: '\uFF29\uFF4E\uFF56\uFF4F\uFF49\uFF43\uFF45\u3000\uFF2E\uFF55\uFF4D\uFF42\uFF45\uFF52',
        description: 'Fullwidth characters'
      },
      {
        name: 'mathematical-alphanumeric',
        text: '𝐈𝐧𝐯𝐨𝐢𝐜𝐞 𝐍𝐮𝐦𝐛𝐞𝐫',
        description: 'Mathematical bold characters'
      }
    ];
    const results = [];
    for (const test of confusables) {
      const xml = `<?xml version="1.0" encoding="UTF-8"?>
<Invoice>
<ID>${test.text}</ID>
<Note>${test.description}</Note>
</Invoice>`;
      try {
        const parsed = await einvoice.parseXML(xml);
        const id = parsed?.ID || '';
        // Check if system detects potential homoglyphs
        const hasNonASCII = /[^\x00-\x7F]/.test(id);
        // NFKC folds compatibility forms (fullwidth, mathematical bold) to
        // their plain counterparts, so a difference signals such characters.
        const changedByNfkc = id.normalize('NFKC') !== test.text;
        results.push({
          name: test.name,
          success: true,
          preserved: id === test.text,
          hasNonASCII,
          normalized: changedByNfkc,
          detectable: hasNonASCII || changedByNfkc
        });
      } catch (error: unknown) {
        results.push({
          name: test.name,
          success: false,
          // Narrow before reading .message; thrown values may not be Errors.
          error: error instanceof Error ? error.message : String(error)
        });
      }
    }
    return results;
  }
);
homoglyphsAndConfusables.forEach(result => {
  t.ok(result.success, `Homoglyph test ${result.name} should be handled`);
  if (result.success) {
    t.ok(result.detectable, `Potential confusables should be detectable`);
  }
});
// Test 8: XML special characters in unusual encodings
// For each XML-reserved character, embeds it in <Description> in three escaped
// forms and checks that the document parses. Whether the decoded character
// shows up in the parsed text is recorded (containsChar) but not asserted.
const xmlSpecialInEncodings = await performanceTracker.measureAsync(
  'xml-special-characters-in-encodings',
  async () => {
    const specialChars = [
      { char: '<', entity: '&lt;', desc: 'less than' },
      { char: '>', entity: '&gt;', desc: 'greater than' },
      { char: '&', entity: '&amp;', desc: 'ampersand' },
      { char: '"', entity: '&quot;', desc: 'quote' },
      { char: "'", entity: '&apos;', desc: 'apostrophe' }
    ];
    const results = [];
    for (const special of specialChars) {
      // Three escaped representations: named entity, CDATA section and
      // decimal numeric character reference (the raw character is not tested).
      const tests = [
        { type: 'entity', value: special.entity },
        { type: 'cdata', value: `<![CDATA[${special.char}]]>` },
        { type: 'numeric', value: `&#${special.char.charCodeAt(0)};` }
      ];
      for (const test of tests) {
        const xml = `<?xml version="1.0" encoding="UTF-8"?>
<Invoice>
<Description>Price ${test.value} 100</Description>
</Invoice>`;
        try {
          const parsed = await einvoice.parseXML(xml);
          const desc = parsed?.Description || '';
          results.push({
            char: special.desc,
            method: test.type,
            success: true,
            containsChar: desc.includes(special.char),
            // NOTE(review): hard-coded; not derived from the parsed value.
            preserved: true
          });
        } catch (error: unknown) {
          results.push({
            char: special.desc,
            method: test.type,
            success: false,
            // Narrow before reading .message; thrown values may not be Errors.
            error: error instanceof Error ? error.message : String(error)
          });
        }
      }
    }
    return results;
  }
);
xmlSpecialInEncodings.forEach(result => {
  t.ok(result.success, `XML special ${result.char} as ${result.method} should be handled`);
});
// Test 9: Private use area characters
// Samples the first, middle and last code point of each Unicode private use
// range, places them in <CustomField> and records whether they round-trip.
const privateUseArea = await performanceTracker.measureAsync(
  'private-use-area-characters',
  async () => {
    const puaRanges = [
      { name: 'BMP-PUA', start: 0xE000, end: 0xF8FF },
      { name: 'Plane15-PUA', start: 0xF0000, end: 0xFFFFD },
      { name: 'Plane16-PUA', start: 0x100000, end: 0x10FFFD }
    ];
    const results = [];
    for (const range of puaRanges) {
      // Test a few characters from each range
      const midpoint = Math.floor((range.start + range.end) / 2);
      const testChars = [
        String.fromCodePoint(range.start),
        String.fromCodePoint(midpoint)
      ];
      // String.fromCodePoint throws above U+10FFFF, hence the guard.
      if (range.end <= 0x10FFFF) {
        testChars.push(String.fromCodePoint(range.end));
      }
      const testString = testChars.join('');
      const xml = `<?xml version="1.0" encoding="UTF-8"?>
<Invoice>
<CustomField>${testString}</CustomField>
</Invoice>`;
      try {
        const parsed = await einvoice.parseXML(xml);
        const field = parsed?.CustomField || '';
        results.push({
          range: range.name,
          success: true,
          preserved: field === testString,
          // Length in UTF-16 code units (supplementary-plane chars count as 2).
          charCount: testString.length,
          handled: true
        });
      } catch (error: unknown) {
        results.push({
          range: range.name,
          success: false,
          // Narrow before reading .message; thrown values may not be Errors.
          error: error instanceof Error ? error.message : String(error)
        });
      }
    }
    return results;
  }
);
privateUseArea.forEach(result => {
  // Either outcome is accepted here: parsed successfully, or rejected with a message.
  t.ok(result.success || result.error, `PUA range ${result.range} was processed`);
});
// Test 10: Character set conversion in format transformation
// Converts a minimal UBL invoice carrying non-ASCII note text to 'cii', parses
// the converted output and checks that the original text still appears in it.
const formatTransformCharsets = await performanceTracker.measureAsync(
  'format-transform-charsets',
  async () => {
    const testContents = [
      { name: 'multilingual', text: 'Hello مرحبا 你好 Здравствуйте' },
      { name: 'symbols', text: '€ £ ¥ $ ₹ ₽ ¢ ₩' },
      { name: 'accented', text: 'àáäâ èéëê ìíïî òóöô ùúüû ñç' },
      { name: 'mixed-emoji', text: 'Invoice 📄 Total: 💰 Status: ✅' }
    ];
    const results = [];
    for (const content of testContents) {
      const ublInvoice = `<?xml version="1.0" encoding="UTF-8"?>
<Invoice xmlns="urn:oasis:names:specification:ubl:schema:xsd:Invoice-2">
<ID>CHARSET-001</ID>
<Note>${content.text}</Note>
</Invoice>`;
      try {
        // Convert to CII
        const ciiResult = await einvoice.convertFormat(ublInvoice, 'cii');
        // Parse the converted result
        const parsed = await einvoice.parseDocument(ciiResult);
        // Check if content was preserved: search the serialised parse result
        // for the original text, wherever in the structure it ended up.
        const preserved = JSON.stringify(parsed).includes(content.text);
        results.push({
          content: content.name,
          success: true,
          preserved,
          formatConversionOk: true
        });
      } catch (error: unknown) {
        results.push({
          content: content.name,
          success: false,
          // Narrow before reading .message; thrown values may not be Errors.
          error: error instanceof Error ? error.message : String(error)
        });
      }
    }
    return results;
  }
);
formatTransformCharsets.forEach(result => {
  t.ok(result.success, `Format transform with ${result.content} should succeed`);
  if (result.success) {
    t.ok(result.preserved, `Character content should be preserved in transformation`);
  }
});
// Print performance summary
performanceTracker.printSummary();
});
// Run the test registered above with tap.test().
tap.start();