// einvoice/test/suite/einvoice_edge-cases/test.edge-04.unusual-charsets.ts
//
// 656 lines
// 19 KiB
// TypeScript
// Raw Normal View History
//
// 2025-05-26 04:04:51 +00:00
import { tap } from '@git.zone/tstest/tapbundle';
import * as plugins from '../plugins.js';
import { EInvoice } from '../../../ts/index.js';
import { PerformanceTracker } from '../performance.tracker.js';
// Tracker shared by the whole suite: every scenario below is wrapped in
// performanceTracker.measureAsync(label, fn) and printSummary() is called at the end.
const performanceTracker = new PerformanceTracker('EDGE-04: Unusual Character Sets');
// One tap test that runs the ten character-set scenarios (Test 1 .. Test 10) in sequence.
tap.test('EDGE-04: Unusual Character Sets - should handle unusual and exotic character encodings', async (t) => {
// A single EInvoice instance is reused by all scenarios.
const einvoice = new EInvoice();
// Test 1: Unicode edge cases
// Each sample becomes the <ID> text of a minimal UTF-8 invoice that is passed to
// einvoice.parseXML. The result records whether the parsed ID still equals the input;
// only `success` (parsing did not throw) is asserted.
const unicodeEdgeCases = await performanceTracker.measureAsync(
  'unicode-edge-cases',
  async () => {
    const testCases = [
      {
        name: 'zero-width-characters',
        text: 'Invoice\u200B\u200C\u200D\uFEFFNumber',
        description: 'Zero-width spaces and joiners'
      },
      {
        name: 'right-to-left',
        text: 'مرحبا INV-001 שלום',
        description: 'RTL Arabic and Hebrew mixed with LTR'
      },
      {
        name: 'surrogate-pairs',
        text: '𝐇𝐞𝐥𝐥𝐨 😀 🎉 Invoice',
        description: 'Mathematical bold text and emojis'
      },
      {
        name: 'combining-characters',
        text: 'Ińvȯíçë̃ Nüm̈bër̊',
        description: 'Combining diacritical marks'
      },
      {
        name: 'control-characters',
        text: 'Invoice\x00\x01\x02\x1F\x7FTest',
        description: 'Control characters'
      },
      {
        name: 'bidi-override',
        text: '\u202Eتسا Invoice 123\u202C',
        description: 'Bidirectional override characters'
      }
    ];
    const results = [];
    for (const { name, text, description } of testCases) {
      // Joined with '\n' so the document bytes match a multi-line template literal.
      const xml = [
        '<?xml version="1.0" encoding="UTF-8"?>',
        '<Invoice>',
        `<ID>${text}</ID>`,
        `<Description>${description}</Description>`,
        '</Invoice>'
      ].join('\n');
      try {
        const parsed = await einvoice.parseXML(xml);
        const idValue = parsed?.ID || '';
        const preserved = idValue === text;
        results.push({
          name,
          success: true,
          preserved,
          normalized: !preserved,
          parsedValue: idValue,
          originalLength: text.length,
          parsedLength: idValue.length
        });
      } catch (error) {
        // Catch variables are `unknown` under strict settings; narrow before reading
        // `.message` so a non-Error throw still yields a readable string.
        results.push({
          name,
          success: false,
          error: error instanceof Error ? error.message : String(error)
        });
      }
    }
    return results;
  }
);
unicodeEdgeCases.forEach(result => {
  t.ok(result.success, `Unicode edge case ${result.name} should be handled`);
});
// Test 2: Various character encodings
// Code points that Windows-1252 stores in the 0x80-0x9F byte range (everything else
// below 0x100 maps to the byte with the same value).
const cp1252Specials = new Map<number, number>([
  [0x20AC, 0x80], [0x201A, 0x82], [0x0192, 0x83], [0x201E, 0x84], [0x2026, 0x85],
  [0x2020, 0x86], [0x2021, 0x87], [0x02C6, 0x88], [0x2030, 0x89], [0x0160, 0x8A],
  [0x2039, 0x8B], [0x0152, 0x8C], [0x017D, 0x8E], [0x2018, 0x91], [0x2019, 0x92],
  [0x201C, 0x93], [0x201D, 0x94], [0x2022, 0x95], [0x2013, 0x96], [0x2014, 0x97],
  [0x02DC, 0x98], [0x2122, 0x99], [0x0161, 0x9A], [0x203A, 0x9B], [0x0153, 0x9C],
  [0x017E, 0x9E], [0x0178, 0x9F]
]);
/**
 * Encodes `content` into the bytes of the XML encoding label `encoding`.
 *
 * Node's Buffer has no 'utf-16be', 'iso8859-1' or 'windows1252' codec, so passing the
 * lower-cased XML label straight to Buffer.from throws before the parser is ever
 * exercised. The labels used by this scenario are therefore mapped explicitly.
 *
 * @throws Error for an unknown label or a character Windows-1252 cannot represent.
 */
const encodeXmlAs = (content: string, encoding: string): Buffer => {
  switch (encoding) {
    case 'UTF-8':
      return Buffer.from(content, 'utf8');
    case 'UTF-16LE':
      return Buffer.from(content, 'utf16le');
    case 'UTF-16BE':
      // No big-endian codec in Node: encode little-endian, then swap each byte pair.
      return Buffer.from(content, 'utf16le').swap16();
    case 'ISO-8859-1':
      return Buffer.from(content, 'latin1');
    case 'Windows-1252':
      return Buffer.from(
        Array.from(content, (ch) => {
          const codePoint = ch.codePointAt(0) ?? 0;
          const special = cp1252Specials.get(codePoint);
          if (special !== undefined) {
            return special;
          }
          if (codePoint < 0x80 || (codePoint >= 0xA0 && codePoint <= 0xFF)) {
            return codePoint;
          }
          throw new Error(
            `Character U+${codePoint.toString(16).toUpperCase()} is not encodable as Windows-1252`
          );
        })
      );
    default:
      throw new Error(`Unsupported test encoding: ${encoding}`);
  }
};
// Each document declares its encoding in the XML prolog, is encoded to matching bytes
// (with a BOM where one is listed) and handed to einvoice.parseDocument.
const characterEncodings = await performanceTracker.measureAsync(
  'various-character-encodings',
  async () => {
    const encodings = [
      {
        encoding: 'UTF-8',
        bom: Buffer.from([0xEF, 0xBB, 0xBF]),
        text: 'Übung macht den Meister'
      },
      {
        encoding: 'UTF-16BE',
        bom: Buffer.from([0xFE, 0xFF]),
        text: 'Invoice \u4E2D\u6587'
      },
      {
        encoding: 'UTF-16LE',
        bom: Buffer.from([0xFF, 0xFE]),
        text: 'Facture française'
      },
      {
        encoding: 'ISO-8859-1',
        bom: null,
        text: 'Ñoño español'
      },
      {
        encoding: 'Windows-1252',
        bom: null,
        text: 'Smart "quotes" and —dashes'
      }
    ];
    const results = [];
    for (const enc of encodings) {
      const xmlContent = [
        `<?xml version="1.0" encoding="${enc.encoding}"?>`,
        '<Invoice>',
        '<ID>ENC-001</ID>',
        `<CustomerName>${enc.text}</CustomerName>`,
        '</Invoice>'
      ].join('\n');
      try {
        const encoded = encodeXmlAs(xmlContent, enc.encoding);
        const buffer = enc.bom ? Buffer.concat([enc.bom, encoded]) : encoded;
        const parsed = await einvoice.parseDocument(buffer);
        results.push({
          encoding: enc.encoding,
          success: true,
          hasBOM: !!enc.bom,
          textPreserved: parsed?.CustomerName === enc.text
        });
      } catch (error) {
        // Catch variables are `unknown` under strict settings; narrow before use.
        results.push({
          encoding: enc.encoding,
          success: false,
          error: error instanceof Error ? error.message : String(error)
        });
      }
    }
    return results;
  }
);
// Deliberately lenient: a clean rejection (an error message) also counts as "processed".
characterEncodings.forEach(result => {
  t.ok(result.success || result.error, `Encoding ${result.encoding} was processed`);
});
// Test 3: Emoji and pictographic characters
// Each sample is placed in <Note> of a minimal UTF-8 invoice and parsed with
// einvoice.parseXML; the parsed note must equal the input exactly.
const emojiAndPictographs = await performanceTracker.measureAsync(
  'emoji-and-pictographs',
  async () => {
    const emojiTests = [
      {
        name: 'basic-emoji',
        content: 'Invoice 📧 sent ✅'
      },
      {
        name: 'flag-emoji',
        content: 'Country: 🇺🇸 🇬🇧 🇩🇪 🇫🇷'
      },
      {
        name: 'skin-tone-emoji',
        content: 'Approved by 👍🏻👍🏼👍🏽👍🏾👍🏿'
      },
      {
        name: 'zwj-sequences',
        // Family emoji (man, woman, girl, boy) written with escapes so the invisible
        // zero-width joiners (U+200D) cannot be lost by an editor or copy/paste.
        content: 'Family: \u{1F468}\u200D\u{1F469}\u200D\u{1F467}\u200D\u{1F466}'
      },
      {
        name: 'mixed-emoji-text',
        content: '💰 Total: €1,234.56 💶'
      }
    ];
    // One segmenter for the whole run; the default granularity is grapheme clusters.
    const segmenter = new Intl.Segmenter();
    const countGraphemes = (value: string): number => [...segmenter.segment(value)].length;
    const results = [];
    for (const test of emojiTests) {
      const xml = [
        '<?xml version="1.0" encoding="UTF-8"?>',
        '<Invoice>',
        '<ID>EMOJI-001</ID>',
        `<Note>${test.content}</Note>`,
        '</Invoice>'
      ].join('\n');
      try {
        const parsed = await einvoice.parseXML(xml);
        const noteValue = parsed?.Note || '';
        results.push({
          name: test.name,
          success: true,
          preserved: noteValue === test.content,
          // Grapheme clusters are the user-perceived ("visual") characters.
          originalGraphemes: countGraphemes(test.content),
          preservedGraphemes: countGraphemes(noteValue),
          codePointCount: Array.from(test.content).length,
          byteLength: Buffer.from(test.content, 'utf8').length
        });
      } catch (error) {
        // Catch variables are `unknown` under strict settings; narrow before use.
        results.push({
          name: test.name,
          success: false,
          error: error instanceof Error ? error.message : String(error)
        });
      }
    }
    return results;
  }
);
emojiAndPictographs.forEach(result => {
  t.ok(result.success, `Emoji test ${result.name} should succeed`);
  if (result.success) {
    t.ok(result.preserved, `Emoji content should be preserved`);
  }
});
// Test 4: Legacy and exotic scripts
// One sample per script is placed in <Description> of a minimal UTF-8 invoice; the
// parsed description must come back identical to the input.
const exoticScripts = await performanceTracker.measureAsync(
  'exotic-scripts',
  async () => {
    const scripts = [
      { name: 'chinese-traditional', text: '發票編號:貳零貳肆' },
      { name: 'japanese-mixed', text: '請求書番号:2024年' },
      { name: 'korean', text: '송장 번호: 2024' },
      { name: 'thai', text: 'ใบแจ้งหนี้: ๒๐๒๔' },
      { name: 'devanagari', text: 'चालान संख्या: २०२४' },
      { name: 'cyrillic', text: 'Счёт-фактура № 2024' },
      { name: 'greek', text: 'Τιμολόγιο: ΜΜΚΔ' },
      { name: 'ethiopic', text: 'ቁጥር: ፪፻፳፬' },
      { name: 'bengali', text: 'চালান নং: ২০২৪' },
      { name: 'tamil', text: 'விலைப்பட்டியல்: ௨௦௨௪' }
    ];
    const results = [];
    for (const { name, text } of scripts) {
      const xml = [
        '<?xml version="1.0" encoding="UTF-8"?>',
        '<Invoice>',
        `<ID>SCRIPT-${name}</ID>`,
        `<Description>${text}</Description>`,
        '</Invoice>'
      ].join('\n');
      try {
        const parsed = await einvoice.parseXML(xml);
        const description = parsed?.Description || '';
        results.push({
          script: name,
          success: true,
          preserved: description === text,
          // UTF-16 code units vs. UTF-8 bytes of the sample, for reporting only.
          charCount: text.length,
          byteCount: Buffer.from(text, 'utf8').length
        });
      } catch (error) {
        // Catch variables are `unknown` under strict settings; narrow before use.
        results.push({
          script: name,
          success: false,
          error: error instanceof Error ? error.message : String(error)
        });
      }
    }
    return results;
  }
);
exoticScripts.forEach(result => {
  t.ok(result.success, `Script ${result.script} should be handled`);
  if (result.success) {
    t.ok(result.preserved, `Script ${result.script} content should be preserved`);
  }
});
// Test 5: Invalid UTF-8 sequences
// Malformed byte sequences are spliced into the <ID> of a document that declares UTF-8
// and handed to einvoice.parseDocument. Both outcomes, a parsed result or a thrown
// error, count as "handled"; the scenario only guards against an unhandled crash.
const invalidUTF8 = await performanceTracker.measureAsync(
  'invalid-utf8-sequences',
  async () => {
    const invalidSequences = [
      {
        name: 'orphan-continuation',
        bytes: Buffer.from([0x80, 0x81, 0x82])
      },
      {
        name: 'incomplete-sequence',
        bytes: Buffer.from([0xC2])
      },
      {
        name: 'overlong-encoding',
        bytes: Buffer.from([0xC0, 0x80])
      },
      {
        name: 'invalid-start',
        bytes: Buffer.from([0xF8, 0x80, 0x80, 0x80])
      },
      {
        name: 'mixed-valid-invalid',
        bytes: Buffer.concat([
          Buffer.from('Valid '),
          Buffer.from([0xFF, 0xFE]),
          Buffer.from(' Text')
        ])
      }
    ];
    // The wrapper bytes are identical for every case, so build them once.
    const xmlStart = Buffer.from('<?xml version="1.0" encoding="UTF-8"?><Invoice><ID>');
    const xmlEnd = Buffer.from('</ID></Invoice>');
    const results = [];
    for (const seq of invalidSequences) {
      const fullBuffer = Buffer.concat([xmlStart, seq.bytes, xmlEnd]);
      try {
        const parsed = await einvoice.parseDocument(fullBuffer);
        // Report whether U+FFFD actually appears instead of assuming it does.
        // NOTE(review): assumes the ID is exposed as `parsed.ID`, as the other
        // scenarios in this file read it; confirm against the parser's result shape.
        const parsedId = parsed?.ID;
        results.push({
          name: seq.name,
          handled: true,
          recovered: !!parsed,
          replacedWithPlaceholder: typeof parsedId === 'string' && parsedId.includes('\uFFFD')
        });
      } catch (error) {
        // Catch variables are `unknown` under strict settings; narrow before use.
        results.push({
          name: seq.name,
          handled: true,
          rejected: true,
          error: error instanceof Error ? error.message : String(error)
        });
      }
    }
    return results;
  }
);
invalidUTF8.forEach(result => {
  t.ok(result.handled, `Invalid UTF-8 ${result.name} was handled`);
});
// Test 6: Normalization forms
// The same word is sent in each Unicode normalization form. The parser may or may not
// keep the exact form (`preserved`), but the parsed value must compare equal to the
// original once both sides are normalized to NFC (`normalized`).
const normalizationForms = await performanceTracker.measureAsync(
  'unicode-normalization-forms',
  async () => {
    const testText = 'Café'; // Can be represented differently
    // Loop-invariant comparison target.
    const referenceNfc = testText.normalize('NFC');
    const formNames = ['NFC', 'NFD', 'NFKC', 'NFKD'] as const;
    const forms = formNames.map((name) => ({ name, text: testText.normalize(name) }));
    const results = [];
    for (const form of forms) {
      const xml = [
        '<?xml version="1.0" encoding="UTF-8"?>',
        '<Invoice>',
        `<CustomerName>${form.text}</CustomerName>`,
        '</Invoice>'
      ].join('\n');
      try {
        const parsed = await einvoice.parseXML(xml);
        const name = parsed?.CustomerName || '';
        results.push({
          form: form.name,
          success: true,
          preserved: name === form.text,
          normalized: name.normalize('NFC') === referenceNfc,
          codePoints: Array.from(form.text).length,
          bytes: Buffer.from(form.text, 'utf8').length
        });
      } catch (error) {
        // Catch variables are `unknown` under strict settings; narrow before use.
        results.push({
          form: form.name,
          success: false,
          error: error instanceof Error ? error.message : String(error)
        });
      }
    }
    return results;
  }
);
normalizationForms.forEach(result => {
  t.ok(result.success, `Normalization form ${result.form} should be handled`);
  if (result.success) {
    t.ok(result.normalized, `Content should be comparable after normalization`);
  }
});
// Test 7: Homoglyphs and confusables
// Look-alike characters are used as the invoice ID. The scenario does not require the
// parser to reject them, only that they stay detectable afterwards: the parsed ID
// either contains non-ASCII characters or changes under NFKC normalization.
const homoglyphsAndConfusables = await performanceTracker.measureAsync(
  'homoglyphs-and-confusables',
  async () => {
    const confusables = [
      {
        name: 'latin-cyrillic-mix',
        text: 'Invоicе Numbеr', // Contains Cyrillic о and е
        description: 'Mixed Latin and Cyrillic lookalikes'
      },
      {
        name: 'greek-latin-mix',
        text: 'Ιnvoice Νumber', // Greek Ι and Ν
        description: 'Greek letters that look like Latin'
      },
      {
        name: 'fullwidth-chars',
        // "Invoice Number" in fullwidth forms (U+FF21.. / U+FF41..) around an ideographic
        // space (U+3000). Escaped because the literal characters had been reduced to a
        // single blank, which left this case without any fullwidth content.
        text: '\uFF29\uFF4E\uFF56\uFF4F\uFF49\uFF43\uFF45\u3000\uFF2E\uFF55\uFF4D\uFF42\uFF45\uFF52',
        description: 'Fullwidth characters'
      },
      {
        name: 'mathematical-alphanumeric',
        text: '𝐈𝐧𝐯𝐨𝐢𝐜𝐞 𝐍𝐮𝐦𝐛𝐞𝐫',
        description: 'Mathematical bold characters'
      }
    ];
    const results = [];
    for (const test of confusables) {
      const xml = [
        '<?xml version="1.0" encoding="UTF-8"?>',
        '<Invoice>',
        `<ID>${test.text}</ID>`,
        `<Note>${test.description}</Note>`,
        '</Invoice>'
      ].join('\n');
      try {
        const parsed = await einvoice.parseXML(xml);
        const id = parsed?.ID || '';
        // Check if system detects potential homoglyphs
        const hasNonASCII = /[^\x00-\x7F]/.test(id);
        const changedByNfkc = id.normalize('NFKC') !== test.text;
        results.push({
          name: test.name,
          success: true,
          preserved: id === test.text,
          hasNonASCII,
          normalized: changedByNfkc,
          detectable: hasNonASCII || changedByNfkc
        });
      } catch (error) {
        // Catch variables are `unknown` under strict settings; narrow before use.
        results.push({
          name: test.name,
          success: false,
          error: error instanceof Error ? error.message : String(error)
        });
      }
    }
    return results;
  }
);
homoglyphsAndConfusables.forEach(result => {
  t.ok(result.success, `Homoglyph test ${result.name} should be handled`);
  if (result.success) {
    t.ok(result.detectable, `Potential confusables should be detectable`);
  }
});
// Test 8: XML special characters in unusual encodings
// Each XML-reserved character is embedded in <Description> in three escaped forms:
// named entity, CDATA section and numeric character reference. Only successful
// parsing is asserted; `containsChar` and `preserved` are recorded for reporting.
const xmlSpecialInEncodings = await performanceTracker.measureAsync(
  'xml-special-characters-in-encodings',
  async () => {
    const specialChars = [
      { char: '<', entity: '&lt;', desc: 'less than' },
      { char: '>', entity: '&gt;', desc: 'greater than' },
      { char: '&', entity: '&amp;', desc: 'ampersand' },
      { char: '"', entity: '&quot;', desc: 'quote' },
      { char: "'", entity: '&apos;', desc: 'apostrophe' }
    ];
    const results = [];
    for (const special of specialChars) {
      // Test the entity, CDATA and numeric-reference forms (a raw character would not
      // be well-formed XML for '<' and '&').
      const tests = [
        { type: 'entity', value: special.entity },
        { type: 'cdata', value: `<![CDATA[${special.char}]]>` },
        { type: 'numeric', value: `&#${special.char.charCodeAt(0)};` }
      ];
      // All three forms decode to the same text.
      const expectedText = `Price ${special.char} 100`;
      for (const test of tests) {
        const xml = [
          '<?xml version="1.0" encoding="UTF-8"?>',
          '<Invoice>',
          `<Description>Price ${test.value} 100</Description>`,
          '</Invoice>'
        ].join('\n');
        try {
          const parsed = await einvoice.parseXML(xml);
          const desc = parsed?.Description || '';
          results.push({
            char: special.desc,
            method: test.type,
            success: true,
            containsChar: desc.includes(special.char),
            // Compare to the decoded text instead of always reporting `true`.
            preserved: desc === expectedText
          });
        } catch (error) {
          // Catch variables are `unknown` under strict settings; narrow before use.
          results.push({
            char: special.desc,
            method: test.type,
            success: false,
            error: error instanceof Error ? error.message : String(error)
          });
        }
      }
    }
    return results;
  }
);
xmlSpecialInEncodings.forEach(result => {
  t.ok(result.success, `XML special ${result.char} as ${result.method} should be handled`);
});
// Test 9: Private use area characters
// For each private-use range, the first, middle and last code point are placed in
// <CustomField> and parsed with einvoice.parseXML. A clean rejection is acceptable:
// the assertion passes on success or on a reported error message.
const privateUseArea = await performanceTracker.measureAsync(
  'private-use-area-characters',
  async () => {
    const puaRanges = [
      { name: 'BMP-PUA', start: 0xE000, end: 0xF8FF },
      { name: 'Plane15-PUA', start: 0xF0000, end: 0xFFFFD },
      { name: 'Plane16-PUA', start: 0x100000, end: 0x10FFFD }
    ];
    const results = [];
    for (const range of puaRanges) {
      // Test a few characters from each range
      const sampleCodePoints = [range.start, Math.floor((range.start + range.end) / 2)];
      // String.fromCodePoint throws a RangeError above U+10FFFF, so guard the end point.
      if (range.end <= 0x10FFFF) {
        sampleCodePoints.push(range.end);
      }
      const testString = String.fromCodePoint(...sampleCodePoints);
      const xml = [
        '<?xml version="1.0" encoding="UTF-8"?>',
        '<Invoice>',
        `<CustomField>${testString}</CustomField>`,
        '</Invoice>'
      ].join('\n');
      try {
        const parsed = await einvoice.parseXML(xml);
        const field = parsed?.CustomField || '';
        results.push({
          range: range.name,
          success: true,
          preserved: field === testString,
          // Count code points, not UTF-16 units: `.length` would report 6 for the
          // three supplementary-plane characters of planes 15 and 16.
          charCount: Array.from(testString).length,
          handled: true
        });
      } catch (error) {
        // Catch variables are `unknown` under strict settings; narrow before use.
        results.push({
          range: range.name,
          success: false,
          error: error instanceof Error ? error.message : String(error)
        });
      }
    }
    return results;
  }
);
privateUseArea.forEach(result => {
  t.ok(result.success || result.error, `PUA range ${result.range} was processed`);
});
// Test 10: Character set conversion in format transformation
// A UBL invoice carrying the sample text in <Note> is converted with
// einvoice.convertFormat(..., 'cii'), and the converted document is parsed again.
// The sample text must still appear somewhere in the serialized parse result.
const formatTransformCharsets = await performanceTracker.measureAsync(
  'format-transform-charsets',
  async () => {
    const testContents = [
      { name: 'multilingual', text: 'Hello مرحبا 你好 Здравствуйте' },
      { name: 'symbols', text: '€ £ ¥ $ ₹ ₽ ¢ ₩' },
      { name: 'accented', text: 'àáäâ èéëê ìíïî òóöô ùúüû ñç' },
      { name: 'mixed-emoji', text: 'Invoice 📄 Total: 💰 Status: ✅' }
    ];
    const results = [];
    for (const content of testContents) {
      const ublInvoice = [
        '<?xml version="1.0" encoding="UTF-8"?>',
        '<Invoice xmlns="urn:oasis:names:specification:ubl:schema:xsd:Invoice-2">',
        '<ID>CHARSET-001</ID>',
        `<Note>${content.text}</Note>`,
        '</Invoice>'
      ].join('\n');
      try {
        // Convert to CII
        const ciiResult = await einvoice.convertFormat(ublInvoice, 'cii');
        // Parse the converted result
        const parsed = await einvoice.parseDocument(ciiResult);
        // JSON.stringify returns undefined for an undefined input; fall back to an
        // empty string so a missing parse result is reported as "not preserved"
        // rather than as a TypeError from `.includes`.
        const serialized = JSON.stringify(parsed) ?? '';
        results.push({
          content: content.name,
          success: true,
          preserved: serialized.includes(content.text),
          formatConversionOk: true
        });
      } catch (error) {
        // Catch variables are `unknown` under strict settings; narrow before use.
        results.push({
          content: content.name,
          success: false,
          error: error instanceof Error ? error.message : String(error)
        });
      }
    }
    return results;
  }
);
formatTransformCharsets.forEach(result => {
  t.ok(result.success, `Format transform with ${result.content} should succeed`);
  if (result.success) {
    t.ok(result.preserved, `Character content should be preserved in transformation`);
  }
});
// Print the performance summary for the scenarios measured above.
performanceTracker.printSummary();
});
// Run the test registered above.
tap.start();