656 lines
19 KiB
TypeScript
656 lines
19 KiB
TypeScript
import { tap } from '@git.zone/tstest/tapbundle';
|
||
import * as plugins from '../plugins.js';
|
||
import { EInvoice } from '../../../ts/index.js';
|
||
import { PerformanceTracker } from '../performance.tracker.js';
|
||
|
||
// Tracker shared by every scenario in this file; each scenario wraps its work in
// performanceTracker.measureAsync(<label>, fn) and a summary is printed at the end.
const performanceTracker = new PerformanceTracker('EDGE-04: Unusual Character Sets');
|
||
|
||
tap.test('EDGE-04: Unusual Character Sets - should handle unusual and exotic character encodings', async (t) => {
|
||
// One EInvoice instance is reused by all scenarios in this test.
const einvoice = new EInvoice();
|
||
|
||
// Test 1: Unicode edge cases
// Each fixture string is embedded in an <ID> element and parsed. The result records
// whether parsing succeeded and whether the ID came back identical to the input.
const unicodeEdgeCases = await performanceTracker.measureAsync(
  'unicode-edge-cases',
  async () => {
    const fixtures = [
      {
        name: 'zero-width-characters',
        text: 'Invoice\u200B\u200C\u200D\uFEFFNumber',
        description: 'Zero-width spaces and joiners',
      },
      {
        name: 'right-to-left',
        text: 'مرحبا INV-001 שלום',
        description: 'RTL Arabic and Hebrew mixed with LTR',
      },
      {
        name: 'surrogate-pairs',
        text: '𝐇𝐞𝐥𝐥𝐨 😀 🎉 Invoice',
        description: 'Mathematical bold text and emojis',
      },
      {
        name: 'combining-characters',
        text: 'Ińvȯíçë̃ Nüm̈bër̊',
        description: 'Combining diacritical marks',
      },
      {
        name: 'control-characters',
        text: 'Invoice\x00\x01\x02\x1F\x7FTest',
        description: 'Control characters',
      },
      {
        name: 'bidi-override',
        text: '\u202Eتسا Invoice 123\u202C',
        description: 'Bidirectional override characters',
      },
    ];

    // Parses one fixture; a thrown error is captured in the result instead of propagating.
    const probe = async ({ name, text, description }) => {
      const xml = [
        '<?xml version="1.0" encoding="UTF-8"?>',
        '<Invoice>',
        `<ID>${text}</ID>`,
        `<Description>${description}</Description>`,
        '</Invoice>',
      ].join('\n');

      try {
        const parsed = await einvoice.parseXML(xml);
        const idValue = parsed?.ID || '';
        const unchanged = idValue === text;

        return {
          name,
          success: true,
          preserved: unchanged,
          // "normalized" is simply the complement of "preserved" here.
          normalized: !unchanged,
          parsedValue: idValue,
          originalLength: text.length,
          parsedLength: idValue.length,
        };
      } catch (error) {
        return { name, success: false, error: error.message };
      }
    };

    // Fixtures are parsed one after another, in declaration order.
    const outcomes = [];
    for (const fixture of fixtures) {
      outcomes.push(await probe(fixture));
    }
    return outcomes;
  }
);

unicodeEdgeCases.forEach(({ success, name }) => {
  t.ok(success, `Unicode edge case ${name} should be handled`);
});
|
||
|
||
// Test 2: Various character encodings
// Serialises a small invoice into the bytes of each declared encoding (prefixed with
// the listed byte-order mark, if any) and passes the raw buffer to parseDocument.
const characterEncodings = await performanceTracker.measureAsync(
  'various-character-encodings',
  async () => {
    const encodings = [
      {
        encoding: 'UTF-8',
        bom: Buffer.from([0xEF, 0xBB, 0xBF]),
        text: 'Übung macht den Meister'
      },
      {
        encoding: 'UTF-16BE',
        bom: Buffer.from([0xFE, 0xFF]),
        text: 'Invoice \u4E2D\u6587'
      },
      {
        encoding: 'UTF-16LE',
        bom: Buffer.from([0xFF, 0xFE]),
        text: 'Facture française'
      },
      {
        encoding: 'ISO-8859-1',
        bom: null,
        text: 'Ñoño español'
      },
      {
        encoding: 'Windows-1252',
        bom: null,
        text: 'Smart "quotes" and —dashes'
      }
    ];

    // Windows-1252 assigns printable characters to bytes 0x80-0x9F; everything else
    // in 0x00-0xFF coincides with ISO-8859-1. Keys are code points, values are bytes.
    const cp1252Extras = new Map([
      [0x20AC, 0x80], [0x201A, 0x82], [0x0192, 0x83], [0x201E, 0x84],
      [0x2026, 0x85], [0x2020, 0x86], [0x2021, 0x87], [0x02C6, 0x88],
      [0x2030, 0x89], [0x0160, 0x8A], [0x2039, 0x8B], [0x0152, 0x8C],
      [0x017D, 0x8E], [0x2018, 0x91], [0x2019, 0x92], [0x201C, 0x93],
      [0x201D, 0x94], [0x2022, 0x95], [0x2013, 0x96], [0x2014, 0x97],
      [0x02DC, 0x98], [0x2122, 0x99], [0x0161, 0x9A], [0x203A, 0x9B],
      [0x0153, 0x9C], [0x017E, 0x9E], [0x0178, 0x9F],
    ]);

    // Encodes text one byte per character. `extras` remaps code points outside
    // Latin-1; when it is non-empty the raw 0x80-0x9F range is not passed through.
    // Throws for any character the target encoding cannot represent, instead of
    // silently truncating it the way Buffer's 'latin1' encoder would.
    const toSingleByte = (text, encodingName, extras) => {
      const bytes = Array.from(text, (char) => {
        const codePoint = char.codePointAt(0);
        if (extras.has(codePoint)) {
          return extras.get(codePoint);
        }
        const inC1Range = codePoint >= 0x80 && codePoint <= 0x9F;
        if (codePoint <= 0xFF && !(inC1Range && extras.size > 0)) {
          return codePoint;
        }
        const hex = codePoint.toString(16).toUpperCase().padStart(4, '0');
        throw new Error(`Character U+${hex} is not representable in ${encodingName}`);
      });
      return Buffer.from(bytes);
    };

    // Node's Buffer only knows 'utf8', 'utf16le' and 'latin1' among the encodings
    // used here, so the remaining ones are produced explicitly.
    const encodeText = (text, encodingName) => {
      switch (encodingName) {
        case 'UTF-8':
          return Buffer.from(text, 'utf8');
        case 'UTF-16LE':
          return Buffer.from(text, 'utf16le');
        case 'UTF-16BE':
          // Big-endian is the little-endian encoding with each byte pair swapped.
          return Buffer.from(text, 'utf16le').swap16();
        case 'ISO-8859-1':
          return toSingleByte(text, encodingName, new Map());
        case 'Windows-1252':
          return toSingleByte(text, encodingName, cp1252Extras);
        default:
          throw new Error(`Unsupported test encoding: ${encodingName}`);
      }
    };

    const results = [];

    for (const enc of encodings) {
      const xmlContent = [
        `<?xml version="1.0" encoding="${enc.encoding}"?>`,
        '<Invoice>',
        '<ID>ENC-001</ID>',
        `<CustomerName>${enc.text}</CustomerName>`,
        '</Invoice>',
      ].join('\n');

      try {
        // Encoding failures are caught below and reported like parser failures.
        const textBuffer = encodeText(xmlContent, enc.encoding);
        const buffer = enc.bom ? Buffer.concat([enc.bom, textBuffer]) : textBuffer;

        const parsed = await einvoice.parseDocument(buffer);

        results.push({
          encoding: enc.encoding,
          success: true,
          hasBOM: !!enc.bom,
          textPreserved: parsed?.CustomerName === enc.text
        });
      } catch (error) {
        results.push({
          encoding: enc.encoding,
          success: false,
          error: error.message
        });
      }
    }

    return results;
  }
);

// Either outcome is accepted: the document parsed, or it was rejected with an error.
characterEncodings.forEach(result => {
  t.ok(result.success || result.error, `Encoding ${result.encoding} was processed`);
});
|
||
|
||
// Test 3: Emoji and pictographic characters
// Places emoji-bearing text in a <Note> element, parses it, and records grapheme,
// code point and UTF-8 byte counts alongside an exact-match check.
const emojiAndPictographs = await performanceTracker.measureAsync(
  'emoji-and-pictographs',
  async () => {
    const samples = [
      { name: 'basic-emoji', content: 'Invoice 📧 sent ✅' },
      { name: 'flag-emoji', content: 'Country: 🇺🇸 🇬🇧 🇩🇪 🇫🇷' },
      { name: 'skin-tone-emoji', content: 'Approved by 👍🏻👍🏼👍🏽👍🏾👍🏿' },
      { name: 'zwj-sequences', content: 'Family: 👨👩👧👦' },
      { name: 'mixed-emoji-text', content: '💰 Total: €1,234.56 💶' },
    ];

    // Number of grapheme clusters (user-perceived characters) in a value.
    const countGraphemes = (value) => [...new Intl.Segmenter().segment(value)].length;

    const outcomes = [];

    for (const { name, content } of samples) {
      const xml = [
        '<?xml version="1.0" encoding="UTF-8"?>',
        '<Invoice>',
        '<ID>EMOJI-001</ID>',
        `<Note>${content}</Note>`,
        '</Invoice>',
      ].join('\n');

      try {
        const parsed = await einvoice.parseXML(xml);
        const noteValue = parsed?.Note || '';

        const originalGraphemes = countGraphemes(content);
        const preservedGraphemes = countGraphemes(noteValue);

        outcomes.push({
          name,
          success: true,
          preserved: noteValue === content,
          originalGraphemes,
          preservedGraphemes,
          codePointCount: Array.from(content).length,
          byteLength: Buffer.from(content, 'utf8').length,
        });
      } catch (error) {
        outcomes.push({ name, success: false, error: error.message });
      }
    }

    return outcomes;
  }
);

emojiAndPictographs.forEach((outcome) => {
  t.ok(outcome.success, `Emoji test ${outcome.name} should succeed`);
  if (!outcome.success) {
    return;
  }
  // Only check content fidelity when parsing itself succeeded.
  t.ok(outcome.preserved, `Emoji content should be preserved`);
});
|
||
|
||
// Test 4: Legacy and exotic scripts
// Parses one invoice per script sample and checks that the <Description> text is
// returned exactly as written.
const exoticScripts = await performanceTracker.measureAsync(
  'exotic-scripts',
  async () => {
    const samples = [
      { name: 'chinese-traditional', text: '發票編號:貳零貳肆' },
      { name: 'japanese-mixed', text: '請求書番号:2024年' },
      { name: 'korean', text: '송장 번호: 2024' },
      { name: 'thai', text: 'ใบแจ้งหนี้: ๒๐๒๔' },
      { name: 'devanagari', text: 'चालान संख्या: २०२४' },
      { name: 'cyrillic', text: 'Счёт-фактура № 2024' },
      { name: 'greek', text: 'Τιμολόγιο: ΜΜΚΔ' },
      { name: 'ethiopic', text: 'ቁጥር: ፪፻፳፬' },
      { name: 'bengali', text: 'চালান নং: ২০২৪' },
      { name: 'tamil', text: 'விலைப்பட்டியல்: ௨௦௨௪' },
    ];

    // Parses a single sample; a parser exception becomes a failed result entry.
    const checkSample = async ({ name, text }) => {
      const xml = [
        '<?xml version="1.0" encoding="UTF-8"?>',
        '<Invoice>',
        `<ID>SCRIPT-${name}</ID>`,
        `<Description>${text}</Description>`,
        '</Invoice>',
      ].join('\n');

      try {
        const parsed = await einvoice.parseXML(xml);
        const description = parsed?.Description || '';

        return {
          script: name,
          success: true,
          preserved: description === text,
          // charCount is in UTF-16 code units; byteCount is the UTF-8 size.
          charCount: text.length,
          byteCount: Buffer.from(text, 'utf8').length,
        };
      } catch (error) {
        return { script: name, success: false, error: error.message };
      }
    };

    const outcomes = [];
    for (const sample of samples) {
      outcomes.push(await checkSample(sample));
    }
    return outcomes;
  }
);

exoticScripts.forEach((outcome) => {
  t.ok(outcome.success, `Script ${outcome.script} should be handled`);
  if (!outcome.success) {
    return;
  }
  t.ok(outcome.preserved, `Script ${outcome.script} content should be preserved`);
});
|
||
|
||
// Test 5: Invalid UTF-8 sequences
// Splices malformed byte sequences into the <ID> element of an otherwise valid
// UTF-8 document. Both acceptance and rejection by parseDocument count as handled.
const invalidUTF8 = await performanceTracker.measureAsync(
  'invalid-utf8-sequences',
  async () => {
    const malformed = [
      { name: 'orphan-continuation', bytes: Buffer.from([0x80, 0x81, 0x82]) },
      { name: 'incomplete-sequence', bytes: Buffer.from([0xC2]) },
      { name: 'overlong-encoding', bytes: Buffer.from([0xC0, 0x80]) },
      { name: 'invalid-start', bytes: Buffer.from([0xF8, 0x80, 0x80, 0x80]) },
      {
        name: 'mixed-valid-invalid',
        bytes: Buffer.concat([
          Buffer.from('Valid '),
          Buffer.from([0xFF, 0xFE]),
          Buffer.from(' Text'),
        ]),
      },
    ];

    // Fixed document prefix and suffix surrounding the injected bytes.
    const prefix = Buffer.from('<?xml version="1.0" encoding="UTF-8"?><Invoice><ID>');
    const suffix = Buffer.from('</ID></Invoice>');

    const outcomes = [];

    for (const { name, bytes } of malformed) {
      const document = Buffer.concat([prefix, bytes, suffix]);

      try {
        const parsed = await einvoice.parseDocument(document);

        outcomes.push({
          name,
          handled: true,
          recovered: !!parsed,
          // NOTE(review): hardcoded; not derived from the parsed output.
          replacedWithPlaceholder: true,
        });
      } catch (error) {
        outcomes.push({
          name,
          handled: true,
          rejected: true,
          error: error.message,
        });
      }
    }

    return outcomes;
  }
);

invalidUTF8.forEach(({ handled, name }) => {
  t.ok(handled, `Invalid UTF-8 ${name} was handled`);
});
|
||
|
||
// Test 6: Normalization forms
// Feeds the same word in each Unicode normalization form through the parser and
// checks that the result is equal to the reference after NFC normalization.
const normalizationForms = await performanceTracker.measureAsync(
  'unicode-normalization-forms',
  async () => {
    const testText = 'Café'; // Can be represented differently
    const variants = [
      { name: 'NFC', text: testText.normalize('NFC') },
      { name: 'NFD', text: testText.normalize('NFD') },
      { name: 'NFKC', text: testText.normalize('NFKC') },
      { name: 'NFKD', text: testText.normalize('NFKD') },
    ];

    // Parses one variant; exceptions are turned into a failed result entry.
    const checkVariant = async ({ name: formName, text }) => {
      const xml = [
        '<?xml version="1.0" encoding="UTF-8"?>',
        '<Invoice>',
        `<CustomerName>${text}</CustomerName>`,
        '</Invoice>',
      ].join('\n');

      try {
        const parsed = await einvoice.parseXML(xml);
        const customerName = parsed?.CustomerName || '';

        return {
          form: formName,
          success: true,
          preserved: customerName === text,
          // Compared after NFC so that composed and decomposed spellings match.
          normalized: customerName.normalize('NFC') === testText.normalize('NFC'),
          codePoints: Array.from(text).length,
          bytes: Buffer.from(text, 'utf8').length,
        };
      } catch (error) {
        return { form: formName, success: false, error: error.message };
      }
    };

    const outcomes = [];
    for (const variant of variants) {
      outcomes.push(await checkVariant(variant));
    }
    return outcomes;
  }
);

normalizationForms.forEach((outcome) => {
  t.ok(outcome.success, `Normalization form ${outcome.form} should be handled`);
  if (!outcome.success) {
    return;
  }
  t.ok(outcome.normalized, `Content should be comparable after normalization`);
});
|
||
|
||
// Test 7: Homoglyphs and confusables
// Parses IDs that look like "Invoice Number" but use look-alike characters, and
// records whether the parsed ID can be told apart from plain ASCII (it contains
// non-ASCII characters or changes under NFKC normalization).
const homoglyphsAndConfusables = await performanceTracker.measureAsync(
  'homoglyphs-and-confusables',
  async () => {
    // Printable ASCII (0x21-0x7E) maps onto the fullwidth forms U+FF01-U+FF5E at a
    // fixed offset of 0xFEE0. Generating the fixture keeps it genuinely fullwidth
    // even if the source file is passed through a tool that folds such characters
    // back to ASCII, which would make the "detectable" assertion fail.
    const toFullwidth = (ascii) =>
      ascii.replace(/[!-~]/g, (char) => String.fromCharCode(char.charCodeAt(0) + 0xFEE0));

    const confusables = [
      {
        name: 'latin-cyrillic-mix',
        text: 'Invоicе Numbеr', // Contains Cyrillic о and е
        description: 'Mixed Latin and Cyrillic lookalikes'
      },
      {
        name: 'greek-latin-mix',
        text: 'Ιnvoice Νumber', // Greek Ι and Ν
        description: 'Greek letters that look like Latin'
      },
      {
        name: 'fullwidth-chars',
        text: toFullwidth('Invoice Number'),
        description: 'Fullwidth characters'
      },
      {
        name: 'mathematical-alphanumeric',
        text: '𝐈𝐧𝐯𝐨𝐢𝐜𝐞 𝐍𝐮𝐦𝐛𝐞𝐫',
        description: 'Mathematical bold characters'
      }
    ];

    const results = [];

    for (const test of confusables) {
      const xml = [
        '<?xml version="1.0" encoding="UTF-8"?>',
        '<Invoice>',
        `<ID>${test.text}</ID>`,
        `<Note>${test.description}</Note>`,
        '</Invoice>',
      ].join('\n');

      try {
        const parsed = await einvoice.parseXML(xml);
        const id = parsed?.ID || '';

        // Two independent signals that the ID is not plain ASCII text.
        const hasNonASCII = /[^\x00-\x7F]/.test(id);
        const normalized = id.normalize('NFKC');

        results.push({
          name: test.name,
          success: true,
          preserved: id === test.text,
          hasNonASCII,
          normalized: normalized !== test.text,
          detectable: hasNonASCII || normalized !== test.text
        });
      } catch (error) {
        results.push({
          name: test.name,
          success: false,
          error: error.message
        });
      }
    }

    return results;
  }
);

homoglyphsAndConfusables.forEach(result => {
  t.ok(result.success, `Homoglyph test ${result.name} should be handled`);
  if (result.success) {
    t.ok(result.detectable, `Potential confusables should be detectable`);
  }
});
|
||
|
||
// Test 8: XML special characters in unusual encodings
// For each of the five XML-reserved characters, embeds it in a <Description> as a
// predefined entity, inside a CDATA section, and as a numeric character reference,
// then records whether the document parses and whether the literal character
// appears in the parsed description.
const xmlSpecialInEncodings = await performanceTracker.measureAsync(
  'xml-special-characters-in-encodings',
  async () => {
    // `entity` must be the escaped form. Writing the raw character here would put
    // unescaped markup into the document (and a bare ''' does not even compile).
    const specialChars = [
      { char: '<', entity: '&lt;', desc: 'less than' },
      { char: '>', entity: '&gt;', desc: 'greater than' },
      { char: '&', entity: '&amp;', desc: 'ampersand' },
      { char: '"', entity: '&quot;', desc: 'quote' },
      { char: "'", entity: '&apos;', desc: 'apostrophe' }
    ];

    const results = [];

    for (const special of specialChars) {
      // Three well-formed ways of writing the same character in element content
      const tests = [
        { type: 'entity', value: special.entity },
        { type: 'cdata', value: `<![CDATA[${special.char}]]>` },
        { type: 'numeric', value: `&#${special.char.charCodeAt(0)};` }
      ];

      for (const test of tests) {
        const xml = [
          '<?xml version="1.0" encoding="UTF-8"?>',
          '<Invoice>',
          `<Description>Price ${test.value} 100</Description>`,
          '</Invoice>',
        ].join('\n');

        try {
          const parsed = await einvoice.parseXML(xml);
          const desc = parsed?.Description || '';
          const containsChar = desc.includes(special.char);

          results.push({
            char: special.desc,
            method: test.type,
            success: true,
            containsChar,
            // Derived from the parse result rather than assumed.
            preserved: containsChar
          });
        } catch (error) {
          results.push({
            char: special.desc,
            method: test.type,
            success: false,
            error: error.message
          });
        }
      }
    }

    return results;
  }
);

xmlSpecialInEncodings.forEach(result => {
  t.ok(result.success, `XML special ${result.char} as ${result.method} should be handled`);
});
|
||
|
||
// Test 9: Private use area characters
// Builds a short string from the first, middle and last code point of each range
// and checks how the parser treats it inside a <CustomField> element.
const privateUseArea = await performanceTracker.measureAsync(
  'private-use-area-characters',
  async () => {
    const puaRanges = [
      { name: 'BMP-PUA', start: 0xE000, end: 0xF8FF },
      { name: 'Plane15-PUA', start: 0xF0000, end: 0xFFFFD },
      { name: 'Plane16-PUA', start: 0x100000, end: 0x10FFFD },
    ];

    const outcomes = [];

    for (const { name, start, end } of puaRanges) {
      // Sample the range boundaries and its midpoint.
      const codePoints = [start, Math.floor((start + end) / 2)];
      if (end <= 0x10FFFF) {
        // Only code points up to U+10FFFF are valid.
        codePoints.push(end);
      }
      const testString = String.fromCodePoint(...codePoints);

      const xml = [
        '<?xml version="1.0" encoding="UTF-8"?>',
        '<Invoice>',
        `<CustomField>${testString}</CustomField>`,
        '</Invoice>',
      ].join('\n');

      try {
        const parsed = await einvoice.parseXML(xml);
        const field = parsed?.CustomField || '';

        outcomes.push({
          range: name,
          success: true,
          preserved: field === testString,
          charCount: testString.length,
          handled: true,
        });
      } catch (error) {
        outcomes.push({ range: name, success: false, error: error.message });
      }
    }

    return outcomes;
  }
);

// A rejection with an error message is accepted as well as a successful parse.
privateUseArea.forEach((outcome) => {
  t.ok(outcome.success || outcome.error, `PUA range ${outcome.range} was processed`);
});
|
||
|
||
// Test 10: Character set conversion in format transformation
// Converts a UBL invoice carrying non-ASCII note text to CII, parses the converted
// document, and looks for the original text in the serialised parse result.
const formatTransformCharsets = await performanceTracker.measureAsync(
  'format-transform-charsets',
  async () => {
    const samples = [
      { name: 'multilingual', text: 'Hello مرحبا 你好 Здравствуйте' },
      { name: 'symbols', text: '€ £ ¥ $ ₹ ₽ ¢ ₩' },
      { name: 'accented', text: 'àáäâ èéëê ìíïî òóöô ùúüû ñç' },
      { name: 'mixed-emoji', text: 'Invoice 📄 Total: 💰 Status: ✅' },
    ];

    // Runs the convert-then-parse round trip for one sample.
    const roundTrip = async ({ name, text }) => {
      const ublInvoice = [
        '<?xml version="1.0" encoding="UTF-8"?>',
        '<Invoice xmlns="urn:oasis:names:specification:ubl:schema:xsd:Invoice-2">',
        '<ID>CHARSET-001</ID>',
        `<Note>${text}</Note>`,
        '</Invoice>',
      ].join('\n');

      try {
        const ciiResult = await einvoice.convertFormat(ublInvoice, 'cii');
        const parsed = await einvoice.parseDocument(ciiResult);

        // The text counts as preserved if it appears anywhere in the parsed
        // structure once serialised to JSON.
        const preserved = JSON.stringify(parsed).includes(text);

        return {
          content: name,
          success: true,
          preserved,
          formatConversionOk: true,
        };
      } catch (error) {
        return { content: name, success: false, error: error.message };
      }
    };

    const outcomes = [];
    for (const sample of samples) {
      outcomes.push(await roundTrip(sample));
    }
    return outcomes;
  }
);

formatTransformCharsets.forEach((outcome) => {
  t.ok(outcome.success, `Format transform with ${outcome.content} should succeed`);
  if (!outcome.success) {
    return;
  }
  t.ok(outcome.preserved, `Character content should be preserved in transformation`);
});
|
||
|
||
// Print performance summary
|
||
// Presumably reports the timings recorded by the measureAsync calls in this test;
// confirm against the PerformanceTracker implementation.
performanceTracker.printSummary();
|
||
});
|
||
|
||
// Run the test: start the tap runner now that the test above has been registered.
tap.start();