einvoice/test/suite/einvoice_conversion/test.conv-11.encoding-edge-cases.ts
2025-05-25 19:45:37 +00:00

537 lines
18 KiB
TypeScript
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

/**
* @file test.conv-11.encoding-edge-cases.ts
* @description Tests for character encoding edge cases and special scenarios during conversion
*/
import { tap } from '@git.zone/tstest/tapbundle';
import * as plugins from '../../plugins.js';
import { EInvoice } from '../../../ts/index.js';
import { CorpusLoader } from '../../suite/corpus.loader.js';
import { PerformanceTracker } from '../../suite/performance.tracker.js';
// Loader used by the corpus analysis step of the test below to look up sample files by glob pattern.
const corpusLoader = new CorpusLoader();
// Tracker whose measureAsync() wraps each scenario in the test below; its summary is logged at the end of the test.
const performanceTracker = new PerformanceTracker('CONV-11: Character Encoding Edge Cases');
tap.test('CONV-11: Character Encoding - should handle encoding edge cases during conversion', async (t) => {
// Test 1: Mixed encoding declarations
//
// Converts three hand-built invoices whose `encoding` field declares UTF-8,
// ISO-8859-1 and UTF-8-BOM respectively, and records per scenario whether the
// converted data still looks intact. Conversion errors are swallowed on purpose:
// the flags are only reported in the summary, never asserted.
// NOTE(review): `utf16ToIso` is never assigned below, so it always stays false.
const mixedEncodingDeclarations = await performanceTracker.measureAsync(
  'mixed-encoding-declarations',
  async () => {
    const einvoice = new EInvoice();
    const results = {
      utf8ToUtf16: false,
      utf16ToIso: false,
      isoToUtf8: false,
      bomHandling: false
    };

    // UTF-8 to UTF-16 conversion
    const utf8Invoice = {
      format: 'ubl' as const,
      encoding: 'UTF-8',
      data: {
        documentType: 'INVOICE',
        invoiceNumber: 'ENC-UTF8-2024-001',
        issueDate: '2024-01-28',
        seller: {
          name: 'UTF-8 Société Française €',
          address: 'Rue de la Paix № 42',
          country: 'FR',
          taxId: 'FR12345678901'
        },
        buyer: {
          name: 'Käufer GmbH & Co. KG',
          address: 'Hauptstraße 123½',
          country: 'DE',
          taxId: 'DE123456789'
        },
        items: [{
          description: 'Spécialité française Délicieux',
          quantity: 1,
          unitPrice: 99.99,
          vatRate: 20,
          lineTotal: 99.99
        }],
        totals: {
          netAmount: 99.99,
          vatAmount: 20.00,
          grossAmount: 119.99
        }
      }
    };
    try {
      // Convert and force UTF-16 encoding
      const converted = await einvoice.convertFormat(utf8Invoice, 'cii');
      converted.encoding = 'UTF-16';
      // Check if special characters are preserved. The description check used to
      // be `.includes('')`, which is true for every string; it now looks for the
      // accented text that is actually present in the source description.
      results.utf8ToUtf16 = converted.data.seller.name.includes('€') &&
        converted.data.seller.address.includes('№') &&
        converted.data.items[0].description.includes('Spécialité française');
    } catch {
      // Encoding conversion may not be supported; the flag stays false.
    }

    // ISO-8859-1 limitations test
    const isoInvoice = {
      format: 'cii' as const,
      encoding: 'ISO-8859-1',
      data: {
        documentType: 'INVOICE',
        invoiceNumber: 'ENC-ISO-2024-001',
        issueDate: '2024-01-28',
        seller: {
          name: 'Latin-1 Company',
          address: 'Simple Street 1',
          country: 'ES',
          taxId: 'ES12345678A'
        },
        buyer: {
          name: 'Buyer Limited',
          address: 'Plain Avenue 2',
          country: 'ES',
          taxId: 'ES87654321B'
        },
        items: [{
          description: 'Product with emoji 😀 and Chinese 中文',
          quantity: 1,
          unitPrice: 50.00,
          vatRate: 21,
          lineTotal: 50.00
        }],
        totals: {
          netAmount: 50.00,
          vatAmount: 10.50,
          grossAmount: 60.50
        }
      }
    };
    try {
      const converted = await einvoice.convertFormat(isoInvoice, 'ubl');
      // Characters outside ISO-8859-1 should be handled (replaced or encoded),
      // so "handled" here means the description came back changed.
      results.isoToUtf8 = converted.data.items[0].description !== isoInvoice.data.items[0].description;
    } catch {
      // Rejecting unsupported characters also counts as handled.
      results.isoToUtf8 = true;
    }

    // BOM handling test
    const bomInvoice = {
      format: 'ubl' as const,
      encoding: 'UTF-8-BOM',
      data: {
        documentType: 'INVOICE',
        invoiceNumber: 'ENC-BOM-2024-001',
        issueDate: '2024-01-28',
        seller: {
          name: 'BOM Test Company',
          address: 'BOM Street 1',
          country: 'US',
          taxId: 'US12-3456789'
        },
        buyer: {
          name: 'BOM Buyer Inc',
          address: 'BOM Avenue 2',
          country: 'US',
          taxId: 'US98-7654321'
        },
        items: [{
          description: 'BOM-aware product',
          quantity: 1,
          unitPrice: 100.00,
          vatRate: 8,
          lineTotal: 100.00
        }],
        totals: {
          netAmount: 100.00,
          vatAmount: 8.00,
          grossAmount: 108.00
        }
      }
    };
    try {
      const converted = await einvoice.convertFormat(bomInvoice, 'cii');
      // The invoice number must survive the conversion unchanged.
      results.bomHandling = converted.data.invoiceNumber === bomInvoice.data.invoiceNumber;
    } catch {
      // BOM handling error; the flag stays false.
    }

    return results;
  }
);
// Test 2: Unicode normalization during conversion
//
// Round-trips (UBL -> CII -> UBL) invoices whose seller name is the composed
// spelling (text1) and whose buyer name is the canonically equivalent decomposed
// spelling (text2), then compares the round-tripped names with the originals
// after NFC normalization.
const unicodeNormalization = await performanceTracker.measureAsync(
  'unicode-normalization',
  async () => {
    const einvoice = new EInvoice();

    // Two strings are treated as the same text when their NFC forms are identical.
    const canonicallyEqual = (left: string, right: string): boolean =>
      left.normalize('NFC') === right.normalize('NFC');
    const describeError = (thrown: unknown): string =>
      thrown instanceof Error && thrown.message ? thrown.message : String(thrown);

    // The fixtures are spelled with escapes so that the composed and decomposed
    // forms stay distinct even if an editor or transport normalizes the file.
    // `shouldMatch` documents the expectation; it is not consulted below.
    const testCases = [
      {
        name: 'NFC vs NFD',
        text1: 'caf\u00E9', // NFC: é as single character
        text2: 'cafe\u0301', // NFD: e + combining acute accent
        shouldMatch: true
      },
      {
        name: 'Precomposed vs Decomposed',
        text1: '\u00C5', // Precomposed
        text2: 'A\u030A', // A + ring above
        shouldMatch: true
      },
      {
        name: 'Complex diacritics',
        text1: 'Vi\u1EC7t Nam', // ệ as single character
        text2: 'Vie\u0323\u0302t Nam', // e + dot below + circumflex
        shouldMatch: true
      }
    ];

    const results: Array<{
      testCase: string;
      preserved: boolean;
      buyerPreserved?: boolean;
      original?: string;
      converted?: string;
      error?: string;
    }> = [];

    for (const testCase of testCases) {
      const invoice = {
        format: 'ubl' as const,
        data: {
          documentType: 'INVOICE',
          invoiceNumber: `NORM-${testCase.name.replace(/\s+/g, '-')}`,
          issueDate: '2024-01-28',
          seller: {
            name: testCase.text1,
            address: 'Normalization Test 1',
            country: 'VN',
            taxId: 'VN1234567890'
          },
          buyer: {
            name: testCase.text2,
            address: 'Normalization Test 2',
            country: 'VN',
            taxId: 'VN0987654321'
          },
          items: [{
            description: `Product from ${testCase.text1}`,
            quantity: 1,
            unitPrice: 100.00,
            vatRate: 10,
            lineTotal: 100.00
          }],
          totals: {
            netAmount: 100.00,
            vatAmount: 10.00,
            grossAmount: 110.00
          }
        }
      };
      try {
        const converted = await einvoice.convertFormat(invoice, 'cii');
        const backToUBL = await einvoice.convertFormat(converted, 'ubl');
        // An exact match implies an NFC match, so one normalized comparison is enough.
        const sellerMatch = canonicallyEqual(backToUBL.data.seller.name, invoice.data.seller.name);
        // The decomposed spelling travels in the buyer name; report it separately
        // so `preserved` keeps its original seller-only meaning.
        const roundTrippedBuyer = backToUBL.data.buyer?.name;
        const buyerMatch = typeof roundTrippedBuyer === 'string' &&
          canonicallyEqual(roundTrippedBuyer, invoice.data.buyer.name);
        results.push({
          testCase: testCase.name,
          preserved: sellerMatch,
          buyerPreserved: buyerMatch,
          original: testCase.text1,
          converted: backToUBL.data.seller.name
        });
      } catch (error) {
        results.push({
          testCase: testCase.name,
          preserved: false,
          error: describeError(error)
        });
      }
    }
    return results;
  }
);
// Test 3: Zero-width and control characters
//
// Embeds one family of special characters at a time into the invoice text
// fields, round-trips the invoice (UBL -> CII -> UBL) and records, keyed by
// family, how the seller name came back. A failed conversion is recorded as
// `{ error: true, message }` instead of aborting the remaining families.
const controlCharacters = await performanceTracker.measureAsync(
  'control-characters-handling',
  async () => {
    const einvoice = new EInvoice();
    // Test various control and special characters
    const specialChars = {
      zeroWidth: '\u200B\u200C\u200D\uFEFF', // Zero-width characters
      control: '\u0001\u0002\u001F', // Control characters
      directional: '\u202A\u202B\u202C\u202D\u202E', // Directional marks
      combining: 'a\u0300\u0301\u0302\u0303', // Combining diacriticals
      surrogates: '𝕳𝖊𝖑𝖑𝖔', // Mathematical alphanumeric symbols
      emoji: '🧾💰📊' // Emoji characters
    };

    // C0 control and zero-width characters. Deliberately not global, so that
    // repeated `.test()` calls do not carry `lastIndex` state between strings.
    const invisibleChars = /[\u0000-\u001F\u200B-\u200D\uFEFF]/;
    const describeError = (thrown: unknown): string =>
      thrown instanceof Error && thrown.message ? thrown.message : String(thrown);

    const results: Record<
      string,
      | { originalLength: number; convertedLength: number; preserved: boolean; cleaned: boolean }
      | { error: true; message: string }
    > = {};

    for (const [charType, chars] of Object.entries(specialChars)) {
      const invoice = {
        format: 'ubl' as const,
        data: {
          documentType: 'INVOICE',
          invoiceNumber: `CTRL-${charType.toUpperCase()}-001`,
          issueDate: '2024-01-28',
          seller: {
            name: `Seller${chars}Company`,
            address: `Address ${chars} Line`,
            country: 'US',
            taxId: 'US12-3456789'
          },
          buyer: {
            name: `Buyer ${chars} Ltd`,
            address: 'Normal Address',
            country: 'US',
            taxId: 'US98-7654321'
          },
          items: [{
            description: `Product ${chars} Description`,
            quantity: 1,
            unitPrice: 100.00,
            vatRate: 10,
            lineTotal: 100.00
          }],
          totals: {
            netAmount: 100.00,
            vatAmount: 10.00,
            grossAmount: 110.00
          },
          notes: `Notes with ${chars} special characters`
        }
      };
      try {
        const converted = await einvoice.convertFormat(invoice, 'cii');
        const sanitized = await einvoice.convertFormat(converted, 'ubl');
        const originalName: string = invoice.data.seller.name;
        const convertedName: string = sanitized.data.seller.name;
        // Check how special characters are handled
        results[charType] = {
          originalLength: originalName.length,
          convertedLength: convertedName.length,
          preserved: originalName === convertedName,
          // "Cleaned" means the conversion itself removed the invisible characters.
          // The old check stripped them from the converted name before comparing
          // lengths, so it also reported true when they had been fully preserved.
          cleaned: invisibleChars.test(originalName) && !invisibleChars.test(convertedName)
        };
      } catch (error) {
        results[charType] = {
          error: true,
          message: describeError(error)
        };
      }
    }
    return results;
  }
);
// Test 4: Encoding conflicts in multi-language invoices
//
// Pushes one invoice mixing Latin, Chinese, Greek, Arabic and Devanagari text
// through a chain of conversions (ubl -> cii -> zugferd -> xrechnung), feeding
// each successful result into the next step, and records per step whether the
// probed non-Latin fragments are still present.
const multiLanguageEncoding = await performanceTracker.measureAsync(
  'multi-language-encoding',
  async () => {
    const einvoice = new EInvoice();
    // Create invoice with multiple scripts/languages
    const multiLangInvoice = {
      format: 'ubl' as const,
      data: {
        documentType: 'INVOICE',
        invoiceNumber: 'MULTI-LANG-2024-001',
        issueDate: '2024-01-28',
        seller: {
          name: 'Global Trading Company 全球贸易公司',
          address: 'International Plaza 国际广场 Διεθνής Πλατεία',
          country: 'SG',
          taxId: 'SG12345678X'
        },
        buyer: {
          name: 'المشتري العربي | Arabic Buyer | खरीदार',
          address: 'شارع العرب | Arab Street | अरब स्ट्रीट',
          country: 'AE',
          taxId: 'AE123456789012345'
        },
        items: [
          {
            description: 'Product 产品 Προϊόν منتج उत्पाद',
            quantity: 1,
            unitPrice: 100.00,
            vatRate: 5,
            lineTotal: 100.00
          },
          {
            description: 'Service 服务 Υπηρεσία خدمة सेवा',
            quantity: 2,
            unitPrice: 200.00,
            vatRate: 5,
            lineTotal: 400.00
          }
        ],
        totals: {
          netAmount: 500.00,
          vatAmount: 25.00,
          grossAmount: 525.00
        },
        notes: 'Thank you 谢谢 Ευχαριστώ شكرا धन्यवाद'
      }
    };

    // A thrown value without a usable message must still yield a non-empty text,
    // because the summary decides "error vs. result" by the truthiness of `error`.
    const describeError = (thrown: unknown): string =>
      thrown instanceof Error && thrown.message ? thrown.message : String(thrown);

    // Target formats of the conversion chain, in order.
    const targetFormats = ['cii', 'zugferd', 'xrechnung'];
    const results: Array<{
      conversion: string;
      sellerNamePreserved?: boolean;
      buyerNamePreserved?: boolean;
      itemsPreserved?: boolean;
      allPreserved?: boolean;
      error?: string;
    }> = [];

    let currentInvoice = multiLangInvoice;
    // Format `currentInvoice` is actually in. It only advances after a successful
    // step, so the label of a step that follows a failure names the real source
    // format instead of the one the chain had planned.
    let currentFormat: string = multiLangInvoice.format;

    for (const targetFormat of targetFormats) {
      const conversion = `${currentFormat} -> ${targetFormat}`;
      try {
        const converted = await einvoice.convertFormat(currentInvoice, targetFormat);
        // Check preservation of multi-language content
        const sellerNamePreserved = converted.data.seller.name.includes('全球贸易公司');
        const buyerNamePreserved = converted.data.buyer.name.includes('العربي') &&
          converted.data.buyer.name.includes('खरीदार');
        const itemsPreserved = converted.data.items[0].description.includes('产品') &&
          converted.data.items[0].description.includes('منتج');
        results.push({
          conversion,
          sellerNamePreserved,
          buyerNamePreserved,
          itemsPreserved,
          allPreserved: sellerNamePreserved && buyerNamePreserved && itemsPreserved
        });
        currentInvoice = converted;
        currentFormat = targetFormat;
      } catch (error) {
        results.push({
          conversion,
          error: describeError(error)
        });
      }
    }
    return results;
  }
);
// Test 5: Corpus encoding analysis
//
// Reads up to 30 corpus XML files, classifies the characters found in each one,
// then tries detect -> parse -> convert and counts the failures. Returns the raw
// counters plus the character types as an array and two percentage strings.
const corpusEncodingAnalysis = await performanceTracker.measureAsync(
  'corpus-encoding-edge-cases',
  async () => {
    const files = await corpusLoader.getFilesByPattern('**/*.xml');
    const einvoice = new EInvoice();
    const encodingStats = {
      totalFiles: 0,
      encodingIssues: 0,
      specialCharFiles: 0,
      conversionFailures: 0,
      characterTypes: new Set<string>(),
      problematicFiles: [] as string[]
    };

    // Patterns are built once instead of on every loop iteration.
    const nonAsciiPattern = /[^\x00-\x7F]/;
    // TAB, LF and CR are ordinary XML whitespace; counting them as control
    // characters would flag practically every file, so they are excluded.
    const controlCharPattern = /[\x00-\x08\x0B\x0C\x0E-\x1F\x7F]/;
    // The last range stops at U+FEFC so that a byte order mark (U+FEFF) is not
    // mistaken for right-to-left text.
    const rtlPattern = /[\u0590-\u08FF\uFB1D-\uFDFF\uFE70-\uFEFC]/;
    const cjkPattern = /[\u4E00-\u9FFF\u3040-\u30FF\uAC00-\uD7AF]/;

    // Sample files for analysis
    const sampleFiles = files.slice(0, 30);
    for (const file of sampleFiles) {
      try {
        const content = await plugins.fs.readFile(file, 'utf-8');
        encodingStats.totalFiles++;
        // Check for special characters
        const hasSpecialChars = nonAsciiPattern.test(content);
        const hasControlChars = controlCharPattern.test(content);
        const hasRTL = rtlPattern.test(content);
        const hasCJK = cjkPattern.test(content);
        if (hasSpecialChars || hasControlChars || hasRTL || hasCJK) {
          encodingStats.specialCharFiles++;
          if (hasControlChars) encodingStats.characterTypes.add('control');
          if (hasRTL) encodingStats.characterTypes.add('RTL');
          if (hasCJK) encodingStats.characterTypes.add('CJK');
        }
        // Try format detection and conversion
        const format = await einvoice.detectFormat(content);
        if (format && format !== 'unknown') {
          try {
            const parsed = await einvoice.parseInvoice(content, format);
            const targetFormat = format === 'ubl' ? 'cii' : 'ubl';
            // Test conversion with special characters
            await einvoice.convertFormat(parsed, targetFormat);
          } catch {
            encodingStats.conversionFailures++;
            // Only non-ASCII files are listed: those are the failures that may be encoding-related.
            if (hasSpecialChars) {
              encodingStats.problematicFiles.push(file);
            }
          }
        }
      } catch {
        // A failing read or format detection is counted as an encoding issue.
        encodingStats.encodingIssues++;
      }
    }

    // Share of successfully read files; avoids "NaN%" when nothing could be read.
    const asPercentage = (count: number): string =>
      encodingStats.totalFiles === 0
        ? '0.00%'
        : (count / encodingStats.totalFiles * 100).toFixed(2) + '%';

    return {
      ...encodingStats,
      characterTypes: Array.from(encodingStats.characterTypes),
      specialCharPercentage: asPercentage(encodingStats.specialCharFiles),
      conversionFailureRate: asPercentage(encodingStats.conversionFailures)
    };
  }
);
// Summary: emit every scenario's outcome as test comments, then the timing
// summary, and finally end the test.
t.comment('\n=== CONV-11: Character Encoding Edge Cases Test Summary ===');

t.comment('\nMixed Encoding Declarations:');
const encodingOutcome = mixedEncodingDeclarations.result;
t.comment(` - UTF-8 to UTF-16: ${encodingOutcome.utf8ToUtf16 ? 'SUPPORTED' : 'NOT SUPPORTED'}`);
t.comment(` - UTF-16 to ISO-8859-1: ${encodingOutcome.utf16ToIso ? 'HANDLED' : 'NOT HANDLED'}`);
t.comment(` - ISO-8859-1 to UTF-8: ${encodingOutcome.isoToUtf8 ? 'HANDLED' : 'NOT HANDLED'}`);
t.comment(` - BOM handling: ${encodingOutcome.bomHandling ? 'SUPPORTED' : 'NOT SUPPORTED'}`);

t.comment('\nUnicode Normalization:');
for (const normalizationCase of unicodeNormalization.result) {
  const verdict = normalizationCase.preserved ? 'PRESERVED' : 'MODIFIED';
  t.comment(` - ${normalizationCase.testCase}: ${verdict}`);
}

t.comment('\nControl Characters Handling:');
for (const [charFamily, outcome] of Object.entries(controlCharacters.result) as Array<[string, any]>) {
  // One line per character family: either the recorded error or the length change.
  const line = outcome.error
    ? ` - ${charFamily}: ERROR - ${outcome.message}`
    : ` - ${charFamily}: ${outcome.preserved ? 'PRESERVED' : 'SANITIZED'} (${outcome.originalLength} -> ${outcome.convertedLength} chars)`;
  t.comment(line);
}

t.comment('\nMulti-Language Encoding:');
for (const step of multiLanguageEncoding.result) {
  const line = step.error
    ? ` - ${step.conversion}: ERROR - ${step.error}`
    : ` - ${step.conversion}: ${step.allPreserved ? 'ALL PRESERVED' : 'PARTIAL LOSS'}`;
  t.comment(line);
}

t.comment('\nCorpus Encoding Analysis:');
const corpusOutcome = corpusEncodingAnalysis.result;
t.comment(` - Files analyzed: ${corpusOutcome.totalFiles}`);
t.comment(` - Files with special characters: ${corpusOutcome.specialCharFiles} (${corpusOutcome.specialCharPercentage})`);
t.comment(` - Character types found: ${corpusOutcome.characterTypes.join(', ')}`);
t.comment(` - Encoding issues: ${corpusOutcome.encodingIssues}`);
t.comment(` - Conversion failures: ${corpusOutcome.conversionFailures} (${corpusOutcome.conversionFailureRate})`);

// Performance summary
t.comment('\n=== Performance Summary ===');
performanceTracker.logSummary();
t.end();
});
// Call tap.start() once the test above has been registered.
tap.start();