import { expect, tap } from '@git.zone/tstest/tapbundle';
import * as plugins from '../plugins.js';
import { EInvoice } from '../../../ts/index.js';
import { CorpusLoader } from '../corpus.loader.js';
import { PerformanceTracker } from '../performance.tracker.js';
tap.test('CONV-07: Character Encoding - should preserve character encoding during conversion', async (t) => {
// CONV-07: Verify character encoding is maintained across format conversions
// This test ensures special characters and international text are preserved
// Shared timing collector: every sub-test below records its elapsed time here,
// and the summary/average are read once at the end of this test.
const performanceTracker = new PerformanceTracker('CONV-07: Character Encoding');
// Corpus access used by the corpus encoding analysis sub-test (file listing and reading).
const corpusLoader = new CorpusLoader();
// Sub-test: load a fixture packed with multi-script text, serialize it again and
// verify that (almost) every sample string in `encodingChecks` is still present.
t.test('UTF-8 encoding preservation in conversion', async () => {
  const startTime = performance.now();

  // UBL invoice with various UTF-8 characters.
  // NOTE(review): as written here the fixture is bare text values with no XML
  // element markup — confirm loadFromString accepts it, or restore the markup.
  const ublInvoice = `
UTF8-CONV-001
2025-01-25
380
Special characters: € £ ¥ © ® ™ § ¶ • ° ± × ÷
Diacritics: àáâãäåæçèéêëìíîïðñòóôõöøùúûüýþÿ
Greek: ΑΒΓΔΕΖΗΘΙΚΛΜΝΞΟΠΡΣΤΥΦΧΨΩ αβγδεζηθικλμνξοπρστυφχψω
Cyrillic: АБВГДЕЖЗИЙКЛМНОПРСТУФХЦЧШЩЪЫЬЭЮЯ
CJK: 中文 日本語 한국어
Arabic: العربية مرحبا
Hebrew: עברית שלום
Emoji: 😀 🎉 💰 📧 🌍
EUR
Société Générale Müller & Associés
Rue de la Légion d'Honneur
Zürich
8001
CH
François Lefèvre
françois@société-générale.ch
北京科技有限公司 (Beijing Tech Co.)
北京市朝阳区建国路88号
北京
CN
1
Spëcïål cháracters in line: ñ ç ø å æ þ ð
10
1000.00
Bücher über Köln – München
Prix: 25,50 € (TVA incluse) • Größe: 21×29,7 cm²
100.00
`;

  const einvoice = new EInvoice();
  await einvoice.loadFromString(ublInvoice);

  // Convert to another format (simulated by getting XML back)
  const convertedXml = einvoice.getXmlString();

  // Each `char` value is searched for as ONE contiguous substring of the output,
  // so every entry must appear verbatim somewhere in the fixture above.
  const encodingChecks = [
    // Currency symbols
    { char: '€', name: 'Euro' },
    { char: '£', name: 'Pound' },
    { char: '¥', name: 'Yen' },
    // Special symbols
    { char: '©', name: 'Copyright' },
    { char: '®', name: 'Registered' },
    { char: '™', name: 'Trademark' },
    { char: '×', name: 'Multiplication' },
    { char: '÷', name: 'Division' },
    // Diacritics
    { char: 'àáâãäå', name: 'Latin a variations' },
    // Fix: the previous sample 'çñøæþð' never occurs contiguously in the fixture,
    // so this check could not pass; use the run from the line-item note instead.
    { char: 'ñ ç ø å æ þ ð', name: 'Special Latin' },
    // Greek
    { char: 'ΑΒΓΔ', name: 'Greek uppercase' },
    { char: 'αβγδ', name: 'Greek lowercase' },
    // Cyrillic
    { char: 'АБВГ', name: 'Cyrillic' },
    // CJK
    { char: '中文', name: 'Chinese' },
    { char: '日本語', name: 'Japanese' },
    { char: '한국어', name: 'Korean' },
    // RTL
    { char: 'العربية', name: 'Arabic' },
    { char: 'עברית', name: 'Hebrew' },
    // Emoji
    { char: '😀', name: 'Emoji' },
    // Names with diacritics
    { char: 'François Lefèvre', name: 'French name' },
    { char: 'Zürich', name: 'Swiss city' },
    { char: 'Müller', name: 'German name' },
    // Special punctuation
    { char: '–', name: 'En dash' },
    { char: '•', name: 'Bullet' },
    { char: '²', name: 'Superscript' }
  ];

  // Tally hits and remember the misses so a failure is diagnosable from the log.
  let preservedCount = 0;
  const missingChars: string[] = [];
  for (const { char, name } of encodingChecks) {
    if (convertedXml.includes(char)) {
      preservedCount++;
    } else {
      missingChars.push(`${name} (${char})`);
    }
  }

  console.log(`UTF-8 preservation: ${preservedCount}/${encodingChecks.length} character sets preserved`);
  if (missingChars.length > 0) {
    console.log('Missing characters:', missingChars);
  }

  expect(preservedCount).toBeGreaterThan(encodingChecks.length * 0.9); // Allow 10% loss

  const elapsed = performance.now() - startTime;
  performanceTracker.addMeasurement('utf8-preservation', elapsed);
});
// Sub-test: load a fixture containing XML-special characters, serialize it and log,
// per category, whether the output holds either the literal character or its
// escaped entity form. This sub-test only logs; it makes no assertions.
t.test('Entity encoding in conversion', async () => {
  const startTime = performance.now();

  // CII invoice with XML entities
  // NOTE(review): as written here the fixture holds literal characters (no entity
  // references) and no XML element markup — confirm against the intended fixture.
  const ciiInvoice = `
ENTITY-CONV-001
XML entities: <invoice> & "quotes" with 'apostrophes'
Numeric entities: € £ ¥ ™
Hex entities: € £ ¥
Product & Service <Premium>
Price comparison: USD < EUR > GBP
Smith & Jones "Trading" Ltd.
Registered in England & Wales
`;

  const einvoice = new EInvoice();
  await einvoice.loadFromString(ciiInvoice);
  const convertedXml = einvoice.getXmlString();

  // Check entity preservation: each check accepts the escaped entity OR the literal.
  // Fix: the second operands previously repeated the first (or were subsumed by it),
  // and the apostrophe check was not even valid syntax (''apostrophes'').
  const entityChecks = {
    'Ampersand entity': convertedXml.includes('&amp;') || convertedXml.includes(' & '),
    'Less than entity': convertedXml.includes('&lt;') || convertedXml.includes(' < '),
    'Greater than entity': convertedXml.includes('&gt;') || convertedXml.includes(' > '),
    'Quote preservation': convertedXml.includes('"quotes"') || convertedXml.includes('&quot;quotes&quot;'),
    'Apostrophe preservation': convertedXml.includes("'apostrophes'") || convertedXml.includes('&apos;apostrophes&apos;'),
    'Numeric entities': convertedXml.includes('€') || convertedXml.includes('&#8364;'),
    'Hex entities': convertedXml.includes('£') || convertedXml.includes('&#xA3;')
  };

  for (const [check, passed] of Object.entries(entityChecks)) {
    console.log(`${passed ? '✓' : '✗'} ${check}`);
  }

  const elapsed = performance.now() - startTime;
  performanceTracker.addMeasurement('entity-encoding', elapsed);
});
// Sub-test: a fixture mixing CDATA, escaped content and raw Unicode is loaded and
// serialized; more than 80% of the named checks must find their text in the output.
t.test('Mixed encoding scenarios', async () => {
  const startTime = performance.now();

  // Invoice with mixed encoding challenges
  // NOTE(review): as written here the fixture has no XML element markup, a dangling
  // "]]>" without a CDATA opener, and no 'Floor 3' text — so both CDATA checks below
  // cannot match until the fixture is restored. Confirm against the intended fixture.
  const mixedInvoice = `
MIXED-ENC-001
2025-01-25
380
EUR
& special chars € £ ¥]]>
Mixed: Normal text with €100 and <escaped> content
Müller & Associés S.à r.l.
Hauptstraße 42 (Gebäude "A")
Köln
DE
Payment terms: 2/10 net 30 (2% if paid <= 10 days)
1
Temperature range: -40°C ≤ T ≤ +85°C
10
1000.00
Product™ with ® symbol © 2025
Size: 10cm × 20cm × 5cm • Weight: ≈1kg
Special chars
α β γ δ ε ≠ ∞ ∑ √ ∫
`;

  const einvoice = new EInvoice();
  await einvoice.loadFromString(mixedInvoice);
  const convertedXml = einvoice.getXmlString();
  const has = (needle: string): boolean => convertedXml.includes(needle);

  // Check mixed encoding preservation
  const mixedChecks = {
    // Fix: the fallback used to be includes(''), which is true for every string and
    // made this check vacuous; look for an actual CDATA section opener instead.
    'CDATA content': has('CDATA content') || has('<![CDATA['),
    // Fix: both operands were the same literal; accept the numeric entity form too.
    'Mixed entities and Unicode': has('€100') || has('&#8364;100'),
    'German umlauts': has('Müller') && has('Köln'),
    // Fix: 'Société' does not occur in this fixture, so the check could never pass;
    // test the accented French text the fixture actually contains.
    'French accents': has('Associés') && has('S.à r.l.'),
    'Mathematical symbols': has('≤') && has('≈'),
    'Trademark symbols': has('™') && has('®'),
    'Greek letters': has('α') || has('beta'),
    'Temperature notation': has('°C'),
    'Multiplication sign': has('×'),
    'CDATA in address': has('Floor 3') || has('& 4')
  };

  const totalChecks = Object.keys(mixedChecks).length;
  const passedChecks = Object.values(mixedChecks).filter(Boolean).length;
  console.log(`Mixed encoding: ${passedChecks}/${totalChecks} checks passed`);

  // Strictly more than 80% of the checks must succeed.
  expect(passedChecks).toBeGreaterThan(totalChecks * 0.8);

  const elapsed = performance.now() - startTime;
  performanceTracker.addMeasurement('mixed-encoding', elapsed);
});
// Sub-test: for each sample format, load the fixture, serialize it and log whether
// the format's key non-ASCII text survived. Failures are logged, never asserted.
t.test('Encoding in different invoice formats', async () => {
  const startTime = performance.now();

  // Test encoding across different format characteristics.
  // Each entry carries the text that must survive (`expectedText`) instead of
  // deriving it from substrings of the display name.
  // NOTE(review): as written here the fixtures contain no XML element markup —
  // confirm loadFromString accepts them, or restore the markup.
  const formats = [
    {
      name: 'UBL with namespaces',
      expectedText: '€£¥',
      content: `
NS-€-001
Namespace test: €£¥
`
    },
    {
      name: 'CII with complex structure',
      expectedText: 'Ü',
      content: `
CII-Ü-001
Übersicht über Änderungen
`
    },
    {
      name: 'Factur-X with French',
      expectedText: 'détaillée',
      content: `
FX-FR-001
Facture détaillée avec références spéciales
`
    }
  ];

  for (const format of formats) {
    try {
      const einvoice = new EInvoice();
      await einvoice.loadFromString(format.content);
      const converted = einvoice.getXmlString();

      // Check key characters are preserved
      const preserved = converted.includes(format.expectedText);
      console.log(`${format.name}: ${preserved ? '✓' : '✗'} Encoding preserved`);
    } catch (error) {
      // The caught value is not guaranteed to be an Error; narrow before reading
      // `.message` so non-Error throws are still reported meaningfully.
      const message = error instanceof Error ? error.message : String(error);
      console.log(`${format.name}: Error - ${message}`);
    }
  }

  const elapsed = performance.now() - startTime;
  performanceTracker.addMeasurement('format-encoding', elapsed);
});
// Sub-test: Arabic and Hebrew (right-to-left) party, address and item text is
// loaded and serialized; the number of phrases found in the output is logged.
// No assertion is made on the count.
t.test('Bidirectional text preservation', async () => {
  const begunAt = performance.now();

  // Test RTL (Right-to-Left) text preservation
  // NOTE(review): as written here the fixture contains no XML element markup —
  // confirm loadFromString accepts it.
  const rtlInvoice = `
RTL-TEST-001
2025-01-25
380
EUR
شركة التقنية المحدودة
شارع الملك فهد 123
الرياض
SA
חברת הטכנולוגיה בע"מ
רחוב דיזנגוף 456
תל אביב
IL
1
Mixed text: العربية (Arabic) and עברית (Hebrew) with English
10
1000.00
منتج تقني متقدم / מוצר טכנולוגי מתקדם
`;

  const invoice = new EInvoice();
  await invoice.loadFromString(rtlInvoice);
  const roundTripped = invoice.getXmlString();
  const found = (phrase: string): boolean => roundTripped.includes(phrase);

  // One boolean per phrase that should survive the round trip.
  const rtlChecks = {
    'Arabic company': found('شركة التقنية المحدودة'),
    'Arabic street': found('شارع الملك فهد'),
    'Arabic city': found('الرياض'),
    'Hebrew company': found('חברת הטכנולוגיה'),
    'Hebrew street': found('רחוב דיזנגוף'),
    'Hebrew city': found('תל אביב'),
    'Mixed RTL/LTR': found('Arabic') && found('Hebrew'),
    'Arabic product': found('منتج تقني متقدم'),
    'Hebrew product': found('מוצר טכנולוגי מתקדם')
  };

  const outcomes = Object.values(rtlChecks);
  const rtlPreserved = outcomes.filter(Boolean).length;
  console.log(`RTL text preservation: ${rtlPreserved}/${outcomes.length}`);

  performanceTracker.addMeasurement('rtl-preservation', performance.now() - begunAt);
});
// Sub-test: sample up to 50 corpus XML files, load and re-serialize each one, then
// (a) classify which character families the original contains and
// (b) flag files whose non-ASCII character count drops by more than 20%.
// Results are logged only; per-file errors are logged and the file is skipped.
t.test('Corpus encoding preservation analysis', async () => {
  const startTime = performance.now();

  let processedCount = 0;
  let encodingIssues = 0;

  // Number of files in which each character family was seen.
  const characterCategories = {
    'ASCII only': 0,
    'Latin extended': 0,
    'Greek': 0,
    'Cyrillic': 0,
    'CJK': 0,
    'Arabic/Hebrew': 0,
    'Special symbols': 0,
    'Emoji': 0
  };

  // Detection patterns, built once instead of on every loop iteration.
  // None uses the `g` flag, so `.test()` keeps no state between files.
  const nonAsciiProbe = /[^\x00-\x7F]/;
  const categoryPatterns: Array<[keyof typeof characterCategories, RegExp]> = [
    ['Latin extended', /[À-ÿĀ-ſ]/],
    ['Greek', /[Α-Ωα-ω]/],
    ['Cyrillic', /[А-Яа-я]/],
    ['CJK', /[\u4E00-\u9FFF\u3040-\u309F\u30A0-\u30FF\uAC00-\uD7AF]/],
    ['Arabic/Hebrew', /[\u0590-\u05FF\u0600-\u06FF]/],
    ['Special symbols', /[©®™€£¥§¶•°±×÷≤≥≠≈∞]/],
    ['Emoji', /[\u{1F300}-\u{1F9FF}]/u]
  ];
  // Counts matches of the non-ASCII class; a fresh regex per call avoids shared lastIndex state.
  const countNonAscii = (text: string): number => (text.match(/[^\x00-\x7F]/g) || []).length;

  const files = await corpusLoader.getAllFiles();
  const xmlFiles = files.filter(f => f.endsWith('.xml') && !f.includes('.pdf'));

  // Sample corpus for encoding analysis (slice clamps to the array length)
  const sample = xmlFiles.slice(0, 50);

  for (const file of sample) {
    try {
      const content = await corpusLoader.readFile(file);
      const einvoice = new EInvoice();

      // readFile may hand back text or a buffer-like value; load accordingly and
      // keep a string copy of the original for the comparisons below.
      let originalString: string;
      if (typeof content === 'string') {
        originalString = content;
        await einvoice.loadFromString(content);
      } else {
        originalString = content.toString('utf8');
        await einvoice.loadFromBuffer(content);
      }

      const convertedXml = einvoice.getXmlString();

      // Categorize content
      if (!nonAsciiProbe.test(originalString)) {
        characterCategories['ASCII only']++;
      } else {
        for (const [category, pattern] of categoryPatterns) {
          if (pattern.test(originalString)) characterCategories[category]++;
        }
      }

      // Simple check for major encoding loss
      const originalNonAscii = countNonAscii(originalString);
      const convertedNonAscii = countNonAscii(convertedXml);
      if (originalNonAscii > 0 && convertedNonAscii < originalNonAscii * 0.8) {
        encodingIssues++;
        console.log(`Potential encoding loss in ${file}: ${originalNonAscii} -> ${convertedNonAscii} non-ASCII chars`);
      }

      processedCount++;
    } catch (error) {
      // The caught value is not guaranteed to be an Error; narrow before reading
      // `.message` so non-Error throws are still reported meaningfully.
      const message = error instanceof Error ? error.message : String(error);
      console.log(`Encoding analysis error in ${file}:`, message);
    }
  }

  console.log(`Corpus encoding analysis (${processedCount} files):`);
  console.log('Character categories found:');
  // Only categories that occurred, most frequent first.
  Object.entries(characterCategories)
    .filter(([_, count]) => count > 0)
    .sort((a, b) => b[1] - a[1])
    .forEach(([category, count]) => {
      console.log(` ${category}: ${count} files`);
    });
  console.log(`Files with potential encoding issues: ${encodingIssues}`);

  const elapsed = performance.now() - startTime;
  performanceTracker.addMeasurement('corpus-encoding', elapsed);
});
// Report the timings recorded by the sub-tests.
// NOTE(review): the t.test(...) calls above are not awaited here; if they run
// asynchronously, the summary and average may be read before any measurement
// has been added — confirm against the test runner's behaviour.
performanceTracker.printSummary();

// Performance budget: the tracker's average must stay below 400
// (encoding operations may take longer, hence the generous ceiling).
expect(performanceTracker.getAverageTime()).toBeLessThan(400);
});
// Hand control to the tap runner now that the CONV-07 test has been registered above.
tap.start();