/**
 * @file test.conv-11.encoding-edge-cases.ts
 * @description Tests for character encoding edge cases and special scenarios during conversion
 */

import { tap } from '@git.zone/tstest/tapbundle';
|
||
import * as plugins from '../../plugins.js';
|
||
import { EInvoice } from '../../../ts/index.js';
|
||
import { CorpusLoader } from '../../suite/corpus.loader.js';
|
||
import { PerformanceTracker } from '../../suite/performance.tracker.js';
|
||
|
||
// Shared fixtures for this test file.
// `corpusLoader` is used by the corpus analysis step to locate sample XML files.
const corpusLoader = new CorpusLoader();
// `performanceTracker` wraps each test step via `measureAsync` and prints a summary at the end.
const performanceTracker = new PerformanceTracker('CONV-11: Character Encoding Edge Cases');

tap.test('CONV-11: Character Encoding - should handle encoding edge cases during conversion', async (t) => {
|
||
// Test 1: Mixed encoding declarations
//
// Converts three invoices that declare different encodings (UTF-8, ISO-8859-1,
// UTF-8 with BOM) and records, per scenario, whether the text survived.
// Conversion failures are treated as best-effort (they never fail the test),
// but their messages are kept in `errors` instead of being discarded.
//
// NOTE(review): `utf16ToIso` is part of the result shape and is printed by the
// summary, but no scenario here ever sets it — it always reports `false`.
const mixedEncodingDeclarations = await performanceTracker.measureAsync(
  'mixed-encoding-declarations',
  async () => {
    const einvoice = new EInvoice();

    const results: {
      utf8ToUtf16: boolean;
      utf16ToIso: boolean;
      isoToUtf8: boolean;
      bomHandling: boolean;
      errors: Record<string, string>;
    } = {
      utf8ToUtf16: false,
      utf16ToIso: false,
      isoToUtf8: false,
      bomHandling: false,
      errors: {}
    };

    // Catch variables are `unknown` under strict mode; narrow before reading `.message`.
    const describeError = (error: unknown): string =>
      error instanceof Error ? error.message : String(error);

    // UTF-8 to UTF-16 conversion
    const utf8Invoice = {
      format: 'ubl' as const,
      encoding: 'UTF-8',
      data: {
        documentType: 'INVOICE',
        invoiceNumber: 'ENC-UTF8-2024-001',
        issueDate: '2024-01-28',
        seller: {
          name: 'UTF-8 Société Française €',
          address: 'Rue de la Paix № 42',
          country: 'FR',
          taxId: 'FR12345678901'
        },
        buyer: {
          name: 'Käufer GmbH & Co. KG',
          address: 'Hauptstraße 123½',
          country: 'DE',
          taxId: 'DE123456789'
        },
        items: [{
          description: 'Spécialité française – Délicieux',
          quantity: 1,
          unitPrice: 99.99,
          vatRate: 20,
          lineTotal: 99.99
        }],
        totals: {
          netAmount: 99.99,
          vatAmount: 20.00,
          grossAmount: 119.99
        }
      }
    };

    try {
      // Convert, then relabel the declared encoding on the returned object.
      const converted = await einvoice.convertFormat(utf8Invoice, 'cii');
      converted.encoding = 'UTF-16';

      // Check if special characters are preserved
      results.utf8ToUtf16 =
        converted.data.seller.name.includes('€') &&
        converted.data.seller.address.includes('№') &&
        converted.data.items[0].description.includes('–');
    } catch (error) {
      // Encoding conversion may not be supported; keep the flag false but record why.
      results.errors.utf8ToUtf16 = describeError(error);
    }

    // ISO-8859-1 limitations test
    const isoInvoice = {
      format: 'cii' as const,
      encoding: 'ISO-8859-1',
      data: {
        documentType: 'INVOICE',
        invoiceNumber: 'ENC-ISO-2024-001',
        issueDate: '2024-01-28',
        seller: {
          name: 'Latin-1 Company',
          address: 'Simple Street 1',
          country: 'ES',
          taxId: 'ES12345678A'
        },
        buyer: {
          name: 'Buyer Limited',
          address: 'Plain Avenue 2',
          country: 'ES',
          taxId: 'ES87654321B'
        },
        items: [{
          description: 'Product with emoji 😀 and Chinese 中文',
          quantity: 1,
          unitPrice: 50.00,
          vatRate: 21,
          lineTotal: 50.00
        }],
        totals: {
          netAmount: 50.00,
          vatAmount: 10.50,
          grossAmount: 60.50
        }
      }
    };

    try {
      const converted = await einvoice.convertFormat(isoInvoice, 'ubl');
      // Characters outside ISO-8859-1 should be handled (replaced or encoded)
      results.isoToUtf8 =
        converted.data.items[0].description !== isoInvoice.data.items[0].description;
    } catch (error) {
      // Rejecting unsupported characters counts as "handled" for this scenario.
      results.isoToUtf8 = true;
      results.errors.isoToUtf8 = describeError(error);
    }

    // BOM handling test
    const bomInvoice = {
      format: 'ubl' as const,
      encoding: 'UTF-8-BOM',
      data: {
        documentType: 'INVOICE',
        invoiceNumber: 'ENC-BOM-2024-001',
        issueDate: '2024-01-28',
        seller: {
          name: 'BOM Test Company',
          address: 'BOM Street 1',
          country: 'US',
          taxId: 'US12-3456789'
        },
        buyer: {
          name: 'BOM Buyer Inc',
          address: 'BOM Avenue 2',
          country: 'US',
          taxId: 'US98-7654321'
        },
        items: [{
          description: 'BOM-aware product',
          quantity: 1,
          unitPrice: 100.00,
          vatRate: 8,
          lineTotal: 100.00
        }],
        totals: {
          netAmount: 100.00,
          vatAmount: 8.00,
          grossAmount: 108.00
        }
      }
    };

    try {
      const converted = await einvoice.convertFormat(bomInvoice, 'cii');
      results.bomHandling = converted.data.invoiceNumber === bomInvoice.data.invoiceNumber;
    } catch (error) {
      // BOM handling error; keep the flag false but record why.
      results.errors.bomHandling = describeError(error);
    }

    return results;
  }
);

// Test 2: Unicode normalization during conversion
//
// Round-trips an invoice UBL -> CII -> UBL whose seller name is in NFC form and
// whose buyer name is the same text in NFD form, then checks both names are
// canonically equivalent to what was sent.
const unicodeNormalization = await performanceTracker.measureAsync(
  'unicode-normalization',
  async () => {
    const einvoice = new EInvoice();

    // The two forms render identically in an editor, so a copy/paste or a
    // formatter can silently collapse them into the same bytes. Building them
    // with normalize() guarantees text1 is composed and text2 is decomposed.
    const testCases = [
      {
        name: 'NFC vs NFD',
        text1: 'café'.normalize('NFC'), // NFC: é as single character
        text2: 'café'.normalize('NFD'), // NFD: e + combining acute accent
        shouldMatch: true
      },
      {
        name: 'Precomposed vs Decomposed',
        text1: 'Å'.normalize('NFC'), // Precomposed
        text2: 'Å'.normalize('NFD'), // A + ring above
        shouldMatch: true
      },
      {
        name: 'Complex diacritics',
        text1: 'Việt Nam'.normalize('NFC'),
        text2: 'Việt Nam'.normalize('NFD'), // Different composition
        shouldMatch: true
      }
    ];

    type NormalizationResult = {
      testCase: string;
      preserved: boolean;
      buyerPreserved?: boolean;
      original?: string;
      converted?: string;
      error?: string;
    };

    // Canonical equivalence: plain `===` equality is already implied by this check.
    const sameText = (a: string, b: string): boolean =>
      a.normalize('NFC') === b.normalize('NFC');

    const results: NormalizationResult[] = [];

    for (const testCase of testCases) {
      const invoice = {
        format: 'ubl' as const,
        data: {
          documentType: 'INVOICE',
          invoiceNumber: `NORM-${testCase.name.replace(/\s+/g, '-')}`,
          issueDate: '2024-01-28',
          seller: {
            name: testCase.text1,
            address: 'Normalization Test 1',
            country: 'VN',
            taxId: 'VN1234567890'
          },
          buyer: {
            name: testCase.text2,
            address: 'Normalization Test 2',
            country: 'VN',
            taxId: 'VN0987654321'
          },
          items: [{
            description: `Product from ${testCase.text1}`,
            quantity: 1,
            unitPrice: 100.00,
            vatRate: 10,
            lineTotal: 100.00
          }],
          totals: {
            netAmount: 100.00,
            vatAmount: 10.00,
            grossAmount: 110.00
          }
        }
      };

      try {
        const converted = await einvoice.convertFormat(invoice, 'cii');
        const backToUBL = await einvoice.convertFormat(converted, 'ubl');

        results.push({
          testCase: testCase.name,
          // `preserved` keeps its original meaning (seller / NFC input).
          preserved: sameText(backToUBL.data.seller.name, invoice.data.seller.name),
          // The decomposed input was previously sent but never verified.
          buyerPreserved: sameText(backToUBL.data.buyer.name, invoice.data.buyer.name),
          original: testCase.text1,
          converted: backToUBL.data.seller.name
        });
      } catch (error) {
        results.push({
          testCase: testCase.name,
          preserved: false,
          error: error instanceof Error ? error.message : String(error)
        });
      }
    }

    return results;
  }
);

// Test 3: Zero-width and control characters
//
// For each family of unusual characters, embeds the characters in the invoice
// text fields, round-trips UBL -> CII -> UBL, and reports whether the seller
// name came back unchanged (`preserved`) or with fewer invisible/control
// characters than it went in with (`cleaned`).
const controlCharacters = await performanceTracker.measureAsync(
  'control-characters-handling',
  async () => {
    const einvoice = new EInvoice();

    // Test various control and special characters
    const specialChars = {
      zeroWidth: '\u200B\u200C\u200D\uFEFF', // Zero-width characters
      control: '\u0001\u0002\u001F', // Control characters
      directional: '\u202A\u202B\u202C\u202D\u202E', // Directional marks
      combining: 'a\u0300\u0301\u0302\u0303', // Combining diacriticals
      surrogates: '𝕳𝖊𝖑𝖑𝖔', // Mathematical alphanumeric symbols
      emoji: '🧾💰📊' // Emoji characters
    };

    type CharOutcome =
      | { originalLength: number; convertedLength: number; preserved: boolean; cleaned: boolean }
      | { error: true; message: string };

    // C0 controls plus zero-width space/joiners and U+FEFF.
    const INVISIBLE_CHARS = /[\u0000-\u001F\u200B-\u200D\uFEFF]/g;
    const countInvisible = (text: string): number =>
      (text.match(INVISIBLE_CHARS) || []).length;

    const results: Record<string, CharOutcome> = {};

    for (const [charType, chars] of Object.entries(specialChars)) {
      const invoice = {
        format: 'ubl' as const,
        data: {
          documentType: 'INVOICE',
          invoiceNumber: `CTRL-${charType.toUpperCase()}-001`,
          issueDate: '2024-01-28',
          seller: {
            name: `Seller${chars}Company`,
            address: `Address ${chars} Line`,
            country: 'US',
            taxId: 'US12-3456789'
          },
          buyer: {
            name: `Buyer ${chars} Ltd`,
            address: 'Normal Address',
            country: 'US',
            taxId: 'US98-7654321'
          },
          items: [{
            description: `Product ${chars} Description`,
            quantity: 1,
            unitPrice: 100.00,
            vatRate: 10,
            lineTotal: 100.00
          }],
          totals: {
            netAmount: 100.00,
            vatAmount: 10.00,
            grossAmount: 110.00
          },
          notes: `Notes with ${chars} special characters`
        }
      };

      try {
        const converted = await einvoice.convertFormat(invoice, 'cii');
        const sanitized = await einvoice.convertFormat(converted, 'ubl');

        const originalName: string = invoice.data.seller.name;
        const convertedName: string = sanitized.data.seller.name;

        // Check how special characters are handled
        results[charType] = {
          originalLength: originalName.length,
          convertedLength: convertedName.length,
          preserved: originalName === convertedName,
          // Previously this stripped the invisible characters from the output
          // and compared against the untouched input length, which is true for
          // any input containing such characters even when nothing was removed.
          // Compare how many of them survive instead.
          cleaned: countInvisible(convertedName) < countInvisible(originalName)
        };
      } catch (error) {
        results[charType] = {
          error: true,
          message: error instanceof Error ? error.message : String(error)
        };
      }
    }

    return results;
  }
);

// Test 4: Encoding conflicts in multi-language invoices
//
// Pushes one invoice containing Chinese, Greek, Arabic and Devanagari text
// through a chain of conversions (ubl -> cii -> zugferd -> xrechnung) and
// checks after every step that sample substrings from each script are still
// present in the seller name, buyer name and first line item.
const multiLanguageEncoding = await performanceTracker.measureAsync(
  'multi-language-encoding',
  async () => {
    const einvoice = new EInvoice();

    // Create invoice with multiple scripts/languages
    const multiLangInvoice = {
      format: 'ubl' as const,
      data: {
        documentType: 'INVOICE',
        invoiceNumber: 'MULTI-LANG-2024-001',
        issueDate: '2024-01-28',
        seller: {
          name: 'Global Trading Company 全球贸易公司',
          address: 'International Plaza 国际广场 Διεθνής Πλατεία',
          country: 'SG',
          taxId: 'SG12345678X'
        },
        buyer: {
          name: 'المشتري العربي | Arabic Buyer | खरीदार',
          address: 'شارع العرب | Arab Street | अरब स्ट्रीट',
          country: 'AE',
          taxId: 'AE123456789012345'
        },
        items: [
          {
            description: 'Product 产品 Προϊόν منتج उत्पाद',
            quantity: 1,
            unitPrice: 100.00,
            vatRate: 5,
            lineTotal: 100.00
          },
          {
            description: 'Service 服务 Υπηρεσία خدمة सेवा',
            quantity: 2,
            unitPrice: 200.00,
            vatRate: 5,
            lineTotal: 400.00
          }
        ],
        totals: {
          netAmount: 500.00,
          vatAmount: 25.00,
          grossAmount: 525.00
        },
        notes: 'Thank you 谢谢 Ευχαριστώ شكرا धन्यवाद'
      }
    };

    // Target formats, applied in order; each step starts from the last
    // successfully converted invoice.
    const targetFormats = ['cii', 'zugferd', 'xrechnung'] as const;

    const results = [];
    let currentInvoice = multiLangInvoice;
    // Track the format the current invoice is really in. When a step fails the
    // invoice is not advanced, so the next label must not claim the failed
    // target as its source.
    let sourceFormat: string = multiLangInvoice.format;

    for (const targetFormat of targetFormats) {
      const label = `${sourceFormat} -> ${targetFormat}`;
      try {
        const converted = await einvoice.convertFormat(currentInvoice, targetFormat);

        // Check preservation of multi-language content
        const sellerNamePreserved = converted.data.seller.name.includes('全球贸易公司');
        const buyerNamePreserved =
          converted.data.buyer.name.includes('العربي') &&
          converted.data.buyer.name.includes('खरीदार');
        const itemsPreserved =
          converted.data.items[0].description.includes('产品') &&
          converted.data.items[0].description.includes('منتج');

        results.push({
          conversion: label,
          sellerNamePreserved,
          buyerNamePreserved,
          itemsPreserved,
          allPreserved: sellerNamePreserved && buyerNamePreserved && itemsPreserved
        });

        currentInvoice = converted;
        sourceFormat = targetFormat;
      } catch (error) {
        results.push({
          conversion: label,
          error: error instanceof Error ? error.message : String(error)
        });
      }
    }

    return results;
  }
);

// Test 5: Corpus encoding analysis
//
// Reads up to 30 XML files from the corpus, classifies the kinds of characters
// each one contains, then tries detect -> parse -> convert on it. Returns the
// counters plus two derived percentages for the summary.
const corpusEncodingAnalysis = await performanceTracker.measureAsync(
  'corpus-encoding-edge-cases',
  async () => {
    const files = await corpusLoader.getFilesByPattern('**/*.xml');
    const einvoice = new EInvoice();
    const encodingStats = {
      totalFiles: 0,
      encodingIssues: 0,
      specialCharFiles: 0,
      conversionFailures: 0,
      characterTypes: new Set<string>(),
      problematicFiles: [] as string[]
    };

    // Character classes, hoisted so they are not rebuilt for every file.
    const NON_ASCII = /[^\x00-\x7F]/;
    // TAB, LF and CR are ordinary XML whitespace and appear in practically
    // every file, so they are excluded; only the remaining C0 controls and DEL
    // are reported as control characters.
    const CONTROL_CHARS = /[\x00-\x08\x0B\x0C\x0E-\x1F\x7F]/;
    // Stops at U+FEFC so that U+FEFF (byte order mark) is not reported as
    // right-to-left text.
    const RTL_CHARS = /[\u0590-\u08FF\uFB1D-\uFDFF\uFE70-\uFEFC]/;
    const CJK_CHARS = /[\u4E00-\u9FFF\u3040-\u30FF\uAC00-\uD7AF]/;

    // Sample files for analysis
    const sampleFiles = files.slice(0, 30);

    for (const file of sampleFiles) {
      try {
        const content = await plugins.fs.readFile(file, 'utf-8');
        encodingStats.totalFiles++;

        // Check for special characters
        const hasSpecialChars = NON_ASCII.test(content);
        const hasControlChars = CONTROL_CHARS.test(content);
        const hasRTL = RTL_CHARS.test(content);
        const hasCJK = CJK_CHARS.test(content);

        if (hasSpecialChars || hasControlChars || hasRTL || hasCJK) {
          encodingStats.specialCharFiles++;
          if (hasControlChars) encodingStats.characterTypes.add('control');
          if (hasRTL) encodingStats.characterTypes.add('RTL');
          if (hasCJK) encodingStats.characterTypes.add('CJK');
        }

        // Try format detection and conversion
        const format = await einvoice.detectFormat(content);
        if (format && format !== 'unknown') {
          try {
            const parsed = await einvoice.parseInvoice(content, format);
            const targetFormat = format === 'ubl' ? 'cii' : 'ubl';

            // Test conversion with special characters
            await einvoice.convertFormat(parsed, targetFormat);
          } catch (convError) {
            encodingStats.conversionFailures++;
            // Only files with non-ASCII content are listed as problematic.
            if (hasSpecialChars) {
              encodingStats.problematicFiles.push(file);
            }
          }
        }
      } catch (error) {
        // Read or format-detection failure.
        encodingStats.encodingIssues++;
      }
    }

    // Guard against 0/0 when no file could be read (would otherwise print "NaN%").
    const percentOfFiles = (count: number): string =>
      (encodingStats.totalFiles === 0 ? 0 : (count / encodingStats.totalFiles) * 100).toFixed(2) + '%';

    return {
      ...encodingStats,
      characterTypes: Array.from(encodingStats.characterTypes),
      specialCharPercentage: percentOfFiles(encodingStats.specialCharFiles),
      conversionFailureRate: percentOfFiles(encodingStats.conversionFailures)
    };
  }
);

// Summary
//
// Prints the outcome of each measured step through the test's comment channel.
// Nothing here asserts; the report is informational.
t.comment('\n=== CONV-11: Character Encoding Edge Cases Test Summary ===');

t.comment('\nMixed Encoding Declarations:');
const encodingFlags = mixedEncodingDeclarations.result;
t.comment(` - UTF-8 to UTF-16: ${encodingFlags.utf8ToUtf16 ? 'SUPPORTED' : 'NOT SUPPORTED'}`);
t.comment(` - UTF-16 to ISO-8859-1: ${encodingFlags.utf16ToIso ? 'HANDLED' : 'NOT HANDLED'}`);
t.comment(` - ISO-8859-1 to UTF-8: ${encodingFlags.isoToUtf8 ? 'HANDLED' : 'NOT HANDLED'}`);
t.comment(` - BOM handling: ${encodingFlags.bomHandling ? 'SUPPORTED' : 'NOT SUPPORTED'}`);

t.comment('\nUnicode Normalization:');
unicodeNormalization.result.forEach(test => {
  t.comment(` - ${test.testCase}: ${test.preserved ? 'PRESERVED' : 'MODIFIED'}`);
});

t.comment('\nControl Characters Handling:');
Object.entries(controlCharacters.result).forEach(([type, result]: [string, any]) => {
  if (result.error) {
    t.comment(` - ${type}: ERROR - ${result.message}`);
  } else {
    t.comment(` - ${type}: ${result.preserved ? 'PRESERVED' : 'SANITIZED'} (${result.originalLength} -> ${result.convertedLength} chars)`);
  }
});

t.comment('\nMulti-Language Encoding:');
multiLanguageEncoding.result.forEach(test => {
  if (test.error) {
    t.comment(` - ${test.conversion}: ERROR - ${test.error}`);
  } else {
    t.comment(` - ${test.conversion}: ${test.allPreserved ? 'ALL PRESERVED' : 'PARTIAL LOSS'}`);
  }
});

t.comment('\nCorpus Encoding Analysis:');
const corpusStats = corpusEncodingAnalysis.result;
t.comment(` - Files analyzed: ${corpusStats.totalFiles}`);
t.comment(` - Files with special characters: ${corpusStats.specialCharFiles} (${corpusStats.specialCharPercentage})`);
// An empty list would otherwise print a blank value after the colon.
t.comment(` - Character types found: ${corpusStats.characterTypes.length > 0 ? corpusStats.characterTypes.join(', ') : 'none'}`);
t.comment(` - Encoding issues: ${corpusStats.encodingIssues}`);
t.comment(` - Conversion failures: ${corpusStats.conversionFailures} (${corpusStats.conversionFailureRate})`);

// Performance summary
t.comment('\n=== Performance Summary ===');
performanceTracker.logSummary();

t.end();
});
|
||
|
||
// Start the registered tap tests.
tap.start();