// File: einvoice/test/suite/einvoice_conversion/test.conv-07.character-encoding.ts
// Repository viewer metadata: 523 lines, 21 KiB, TypeScript; revision dated 2025-05-25 19:45:37 +00:00.
import { expect, tap } from '@git.zone/tstest/tapbundle';
import * as plugins from '../plugins.js';
import { EInvoice } from '../../../ts/index.js';
import { CorpusLoader } from '../corpus.loader.js';
import { PerformanceTracker } from '../performance.tracker.js';
tap.test('CONV-07: Character Encoding - should preserve character encoding during conversion', async (t) => {
// CONV-07: Verify character encoding is maintained across format conversions.
// Each sub-test below loads an XML invoice into an EInvoice instance, reads the
// XML back with getXmlString(), and inspects the returned string for the
// characters that were present in the input.
// Shared by all sub-tests: collects one elapsed-time measurement per sub-test.
const performanceTracker = new PerformanceTracker('CONV-07: Character Encoding');
// Used only by the corpus analysis sub-test to enumerate and read sample files.
const corpusLoader = new CorpusLoader();
t.test('UTF-8 encoding preservation in conversion', async () => {
  // Round-trips a UBL invoice containing many non-ASCII characters through
  // EInvoice (loadFromString -> getXmlString) and verifies that the character
  // sequences present in the input are still present in the output.
  const startTime = performance.now();

  // UBL invoice with various UTF-8 characters.
  // The template literal content is kept flush-left on purpose: leading
  // whitespace would become part of the XML document.
  // The supplier name uses &amp; because a bare '&' in character data is not
  // well-formed XML.
  const ublInvoice = `<?xml version="1.0" encoding="UTF-8"?>
<Invoice xmlns="urn:oasis:names:specification:ubl:schema:xsd:Invoice-2"
xmlns:cac="urn:oasis:names:specification:ubl:schema:xsd:CommonAggregateComponents-2"
xmlns:cbc="urn:oasis:names:specification:ubl:schema:xsd:CommonBasicComponents-2">
<cbc:ID>UTF8-CONV-001</cbc:ID>
<cbc:IssueDate>2025-01-25</cbc:IssueDate>
<cbc:InvoiceTypeCode>380</cbc:InvoiceTypeCode>
<cbc:Note>Special characters: £ ¥ © ® § ° ± × ÷</cbc:Note>
<cbc:Note>Diacritics: àáâãäåæçèéêëìíîïðñòóôõöøùúûüýþÿ</cbc:Note>
<cbc:Note>Greek: ΑΒΓΔΕΖΗΘΙΚΛΜΝΞΟΠΡΣΤΥΦΧΨΩ αβγδεζηθικλμνξοπρστυφχψω</cbc:Note>
<cbc:Note>Cyrillic: АБВГДЕЖЗИЙКЛМНОПРСТУФХЦЧШЩЪЫЬЭЮЯ</cbc:Note>
<cbc:Note>CJK: 中文 </cbc:Note>
<cbc:Note>Arabic: العربية مرحبا</cbc:Note>
<cbc:Note>Hebrew: עברית שלום</cbc:Note>
<cbc:Note>Emoji: 😀 🎉 💰 📧 🌍</cbc:Note>
<cbc:DocumentCurrencyCode>EUR</cbc:DocumentCurrencyCode>
<cac:AccountingSupplierParty>
<cac:Party>
<cac:PartyName>
<cbc:Name>Société Générale Müller &amp; Associés</cbc:Name>
</cac:PartyName>
<cac:PostalAddress>
<cbc:StreetName>Rue de la Légion d'Honneur</cbc:StreetName>
<cbc:CityName>Zürich</cbc:CityName>
<cbc:PostalZone>8001</cbc:PostalZone>
<cac:Country>
<cbc:IdentificationCode>CH</cbc:IdentificationCode>
</cac:Country>
</cac:PostalAddress>
<cac:Contact>
<cbc:Name>François Lefèvre</cbc:Name>
<cbc:ElectronicMail>françois@société-générale.ch</cbc:ElectronicMail>
</cac:Contact>
</cac:Party>
</cac:AccountingSupplierParty>
<cac:AccountingCustomerParty>
<cac:Party>
<cac:PartyName>
<cbc:Name> (Beijing Tech Co.)</cbc:Name>
</cac:PartyName>
<cac:PostalAddress>
<cbc:StreetName>88</cbc:StreetName>
<cbc:CityName></cbc:CityName>
<cac:Country>
<cbc:IdentificationCode>CN</cbc:IdentificationCode>
</cac:Country>
</cac:PostalAddress>
</cac:Party>
</cac:AccountingCustomerParty>
<cac:InvoiceLine>
<cbc:ID>1</cbc:ID>
<cbc:Note>Spëcïål cháracters in line: ñ ç ø å æ þ ð</cbc:Note>
<cbc:InvoicedQuantity unitCode="C62">10</cbc:InvoicedQuantity>
<cbc:LineExtensionAmount currencyID="EUR">1000.00</cbc:LineExtensionAmount>
<cac:Item>
<cbc:Name>Bücher über Köln München</cbc:Name>
<cbc:Description>Prix: 25,50 (TVA incluse) Größe: 21×29,7 cm²</cbc:Description>
</cac:Item>
<cac:Price>
<cbc:PriceAmount currencyID="EUR">100.00</cbc:PriceAmount>
</cac:Price>
</cac:InvoiceLine>
</Invoice>`;

  const einvoice = new EInvoice();
  await einvoice.loadFromString(ublInvoice);
  // Convert to another format (simulated by getting XML back)
  const convertedXml = einvoice.getXmlString();

  // Candidate character sequences to look for after the round trip.
  const encodingChecks = [
    // Currency symbols
    { char: '€', name: 'Euro' },
    { char: '£', name: 'Pound' },
    { char: '¥', name: 'Yen' },
    // Special symbols
    { char: '©', name: 'Copyright' },
    { char: '®', name: 'Registered' },
    { char: '™', name: 'Trademark' },
    { char: '×', name: 'Multiplication' },
    { char: '÷', name: 'Division' },
    // Diacritics
    { char: 'àáâãäå', name: 'Latin a variations' },
    { char: 'çñøæþð', name: 'Special Latin' },
    // Greek
    { char: 'ΑΒΓΔ', name: 'Greek uppercase' },
    { char: 'αβγδ', name: 'Greek lowercase' },
    // Cyrillic
    { char: 'АБВГ', name: 'Cyrillic' },
    // CJK
    { char: '中文', name: 'Chinese' },
    { char: '日本語', name: 'Japanese' },
    { char: '한국어', name: 'Korean' },
    // RTL
    { char: 'العربية', name: 'Arabic' },
    { char: 'עברית', name: 'Hebrew' },
    // Emoji
    { char: '😀', name: 'Emoji' },
    // Names with diacritics
    { char: 'François Lefèvre', name: 'French name' },
    { char: 'Zürich', name: 'Swiss city' },
    { char: 'Müller', name: 'German name' },
    // Special punctuation
    // U+2013 written as an escape: an empty needle would make includes()
    // return true unconditionally and the check would be meaningless.
    { char: '\u2013', name: 'En dash' },
    { char: '•', name: 'Bullet' },
    { char: '²', name: 'Superscript' }
  ];

  // A preservation check only says something when the sequence actually
  // occurs in the source document; everything else is reported as skipped
  // instead of being counted as preserved or lost.
  const applicableChecks = encodingChecks.filter(
    (check) => check.char.length > 0 && ublInvoice.includes(check.char)
  );
  const skippedChecks = encodingChecks.filter((check) => !applicableChecks.includes(check));
  const missingChars = applicableChecks
    .filter((check) => !convertedXml.includes(check.char))
    .map((check) => `${check.name} (${check.char})`);
  const preservedCount = applicableChecks.length - missingChars.length;

  console.log(`UTF-8 preservation: ${preservedCount}/${applicableChecks.length} character sets preserved`);
  if (skippedChecks.length > 0) {
    console.log('Skipped (not present in source invoice):', skippedChecks.map((check) => check.name));
  }
  if (missingChars.length > 0) {
    console.log('Missing characters:', missingChars);
  }

  // Guard against a fixture that no longer contains any of the needles.
  expect(applicableChecks.length).toBeGreaterThan(0);
  expect(preservedCount).toBeGreaterThan(applicableChecks.length * 0.9); // Allow 10% loss

  const elapsed = performance.now() - startTime;
  performanceTracker.addMeasurement('utf8-preservation', elapsed);
});
t.test('Entity encoding in conversion', async () => {
  // Round-trips a CII invoice whose text uses predefined, decimal and
  // hexadecimal XML character references and reports, per category, whether
  // either the escaped or the decoded form is found in the output.
  // This sub-test is informational: it logs results and makes no assertions.
  const startTime = performance.now();

  // CII invoice with XML entities (content kept flush-left: it is the XML payload).
  const ciiInvoice = `<?xml version="1.0" encoding="UTF-8"?>
<rsm:CrossIndustryInvoice xmlns:rsm="urn:un:unece:uncefact:data:standard:CrossIndustryInvoice:100"
xmlns:ram="urn:un:unece:uncefact:data:standard:ReusableAggregateBusinessInformationEntity:100">
<rsm:ExchangedDocument>
<ram:ID>ENTITY-CONV-001</ram:ID>
<ram:IncludedNote>
<ram:Content>XML entities: &lt;invoice&gt; &amp; "quotes" with 'apostrophes'</ram:Content>
</ram:IncludedNote>
<ram:IncludedNote>
<ram:Content>Numeric entities: &#8364; &#163; &#165; &#8482;</ram:Content>
</ram:IncludedNote>
<ram:IncludedNote>
<ram:Content>Hex entities: &#x20AC; &#x00A3; &#x00A5;</ram:Content>
</ram:IncludedNote>
</rsm:ExchangedDocument>
<rsm:SupplyChainTradeTransaction>
<ram:IncludedSupplyChainTradeLineItem>
<ram:SpecifiedTradeProduct>
<ram:Name>Product &amp; Service &lt;Premium&gt;</ram:Name>
<ram:Description>Price comparison: USD &lt; EUR &gt; GBP</ram:Description>
</ram:SpecifiedTradeProduct>
</ram:IncludedSupplyChainTradeLineItem>
<ram:ApplicableHeaderTradeAgreement>
<ram:SellerTradeParty>
<ram:Name>Smith &amp; Jones "Trading" Ltd.</ram:Name>
<ram:Description>Registered in England &amp; Wales</ram:Description>
</ram:SellerTradeParty>
</ram:ApplicableHeaderTradeAgreement>
</rsm:SupplyChainTradeTransaction>
</rsm:CrossIndustryInvoice>`;

  const einvoice = new EInvoice();
  await einvoice.loadFromString(ciiInvoice);
  const convertedXml = einvoice.getXmlString();

  // Check entity preservation: each entry accepts either the escaped form or
  // the decoded character(s) in the serialized output.
  const entityChecks = {
    'Ampersand entity': convertedXml.includes('&amp;') || convertedXml.includes(' & '),
    'Less than entity': convertedXml.includes('&lt;') || convertedXml.includes(' < '),
    'Greater than entity': convertedXml.includes('&gt;') || convertedXml.includes(' > '),
    'Quote preservation': convertedXml.includes('"quotes"') || convertedXml.includes('&quot;quotes&quot;'),
    'Apostrophe preservation': convertedXml.includes("'apostrophes'") || convertedXml.includes('&apos;apostrophes&apos;'),
    'Numeric entities': convertedXml.includes('€') || convertedXml.includes('&#8364;'),
    'Hex entities': convertedXml.includes('£') || convertedXml.includes('&#x00A3;')
  };

  // Mark each line with its outcome; without a marker a passed and a failed
  // check print exactly the same text.
  for (const [check, passed] of Object.entries(entityChecks)) {
    console.log(`${check}: ${passed ? '✓' : '✗'}`);
  }

  const elapsed = performance.now() - startTime;
  performanceTracker.addMeasurement('entity-encoding', elapsed);
});
t.test('Mixed encoding scenarios', async () => {
  // Round-trips a UBL invoice that mixes CDATA sections, character references
  // and literal non-ASCII text, then requires that more than 80% of the
  // checks that apply to the fixture still hold for the output.
  const startTime = performance.now();

  // Invoice with mixed encoding challenges (content kept flush-left: it is the XML payload).
  const mixedInvoice = `<?xml version="1.0" encoding="UTF-8"?>
<Invoice xmlns="urn:oasis:names:specification:ubl:schema:xsd:Invoice-2"
xmlns:cac="urn:oasis:names:specification:ubl:schema:xsd:CommonAggregateComponents-2"
xmlns:cbc="urn:oasis:names:specification:ubl:schema:xsd:CommonBasicComponents-2">
<cbc:ID>MIXED-ENC-001</cbc:ID>
<cbc:IssueDate>2025-01-25</cbc:IssueDate>
<cbc:InvoiceTypeCode>380</cbc:InvoiceTypeCode>
<cbc:DocumentCurrencyCode>EUR</cbc:DocumentCurrencyCode>
<cbc:Note><![CDATA[CDATA content: <tag> & special chars £ ¥]]></cbc:Note>
<cbc:Note>Mixed: Normal text with &#8364;100 and &lt;escaped&gt; content</cbc:Note>
<cac:AccountingSupplierParty>
<cac:Party>
<cac:PartyName>
<cbc:Name>Müller &amp; Associés S.à r.l.</cbc:Name>
</cac:PartyName>
<cac:PostalAddress>
<cbc:StreetName>Hauptstraße 42 (Gebäude "A")</cbc:StreetName>
<cbc:AdditionalStreetName><![CDATA[Floor 3 & 4]]></cbc:AdditionalStreetName>
<cbc:CityName>Köln</cbc:CityName>
<cac:Country>
<cbc:IdentificationCode>DE</cbc:IdentificationCode>
</cac:Country>
</cac:PostalAddress>
</cac:Party>
</cac:AccountingSupplierParty>
<cac:PaymentTerms>
<cbc:Note>Payment terms: 2/10 net 30 (2% if paid &lt;= 10 days)</cbc:Note>
<cbc:Note><![CDATA[Bank: Société Générale
IBAN: FR14 2004 1010 0505 0001 3M02 606
BIC: SOGEFRPP]]></cbc:Note>
</cac:PaymentTerms>
<cac:InvoiceLine>
<cbc:ID>1</cbc:ID>
<cbc:Note>Temperature range: -40°C T +85°C</cbc:Note>
<cbc:InvoicedQuantity unitCode="C62">10</cbc:InvoicedQuantity>
<cbc:LineExtensionAmount currencyID="EUR">1000.00</cbc:LineExtensionAmount>
<cac:Item>
<cbc:Name>Product with ® symbol © 2025</cbc:Name>
<cbc:Description>Size: 10cm × 20cm × 5cm Weight: 1kg</cbc:Description>
<cac:AdditionalItemProperty>
<cbc:Name>Special chars</cbc:Name>
<cbc:Value>α β γ δ ε </cbc:Value>
</cac:AdditionalItemProperty>
</cac:Item>
</cac:InvoiceLine>
</Invoice>`;

  const einvoice = new EInvoice();
  await einvoice.loadFromString(mixedInvoice);
  const convertedXml = einvoice.getXmlString();

  const inSource = (needle: string): boolean => mixedInvoice.includes(needle);
  const inOutput = (needle: string): boolean => convertedXml.includes(needle);

  // Check mixed encoding preservation.
  // `applicable` records whether the fixture contains what the check looks
  // for; a check for text that was never in the input cannot pass and must
  // not count against the pass ratio.
  const mixedChecks = [
    {
      name: 'CDATA content',
      applicable: inSource('CDATA content'),
      passed: inOutput('CDATA content') || inOutput('<tag>')
    },
    {
      name: 'Mixed entities and Unicode',
      applicable: inSource('€100') || inSource('&#8364;100'),
      passed: inOutput('€100') || inOutput('&#8364;100')
    },
    {
      name: 'German umlauts',
      applicable: inSource('Müller') && inSource('Köln'),
      passed: inOutput('Müller') && inOutput('Köln')
    },
    {
      name: 'French accents',
      applicable: inSource('Associés') && inSource('Société'),
      passed: inOutput('Associés') && inOutput('Société')
    },
    {
      name: 'Mathematical symbols',
      applicable: inSource('≤') && inSource('≈'),
      passed: inOutput('≤') && inOutput('≈')
    },
    {
      name: 'Trademark symbols',
      applicable: inSource('™') && inSource('®'),
      passed: inOutput('™') && inOutput('®')
    },
    {
      name: 'Greek letters',
      applicable: inSource('α'),
      passed: inOutput('α') || inOutput('beta')
    },
    {
      name: 'Temperature notation',
      applicable: inSource('°C'),
      passed: inOutput('°C')
    },
    {
      name: 'Multiplication sign',
      applicable: inSource('×'),
      passed: inOutput('×')
    },
    {
      name: 'CDATA in address',
      applicable: inSource('Floor 3'),
      passed: inOutput('Floor 3') || inOutput('&amp; 4')
    }
  ];

  const applicableChecks = mixedChecks.filter((check) => check.applicable);
  const skippedNames = mixedChecks.filter((check) => !check.applicable).map((check) => check.name);
  const passedChecks = applicableChecks.filter((check) => check.passed).length;

  console.log(`Mixed encoding: ${passedChecks}/${applicableChecks.length} checks passed`);
  if (skippedNames.length > 0) {
    console.log('Skipped (not present in source invoice):', skippedNames);
  }

  // Guard against a fixture that no longer exercises any check.
  expect(applicableChecks.length).toBeGreaterThan(0);
  expect(passedChecks).toBeGreaterThan(applicableChecks.length * 0.8);

  const elapsed = performance.now() - startTime;
  performanceTracker.addMeasurement('mixed-encoding', elapsed);
});
t.test('Encoding in different invoice formats', async () => {
  // Loads three small documents with different structural characteristics
  // and logs, for each, whether a marker string taken from its content is
  // still present after the round trip. Load/serialize errors are logged,
  // not rethrown; this sub-test makes no assertions.
  const startTime = performance.now();

  // Test encoding across different format characteristics.
  // `marker` is the text that must survive; it is stored next to the content
  // it belongs to so the two cannot drift apart.
  const formats = [
    {
      name: 'UBL with namespaces',
      marker: '€£¥',
      // The note carries all three currency symbols the marker looks for.
      content: `<?xml version="1.0" encoding="UTF-8"?>
<ubl:Invoice xmlns:ubl="urn:oasis:names:specification:ubl:schema:xsd:Invoice-2">
<cbc:ID xmlns:cbc="urn:oasis:names:specification:ubl:schema:xsd:CommonBasicComponents-2">NS--001</cbc:ID>
<cbc:Note xmlns:cbc="urn:oasis:names:specification:ubl:schema:xsd:CommonBasicComponents-2">Namespace test: €£¥</cbc:Note>
</ubl:Invoice>`
    },
    {
      name: 'CII with complex structure',
      marker: 'Ü',
      content: `<?xml version="1.0" encoding="UTF-8"?>
<CrossIndustryInvoice xmlns="urn:un:unece:uncefact:data:standard:CrossIndustryInvoice:100">
<ExchangedDocument>
<ID>CII-Ü-001</ID>
<Name>Übersicht über Änderungen</Name>
</ExchangedDocument>
</CrossIndustryInvoice>`
    },
    {
      name: 'Factur-X with French',
      marker: 'détaillée',
      content: `<?xml version="1.0" encoding="UTF-8"?>
<CrossIndustryInvoice>
<ExchangedDocument>
<ID>FX-FR-001</ID>
<IncludedNote>
<Content>Facture détaillée avec références spéciales</Content>
</IncludedNote>
</ExchangedDocument>
</CrossIndustryInvoice>`
    }
  ];

  for (const format of formats) {
    try {
      const einvoice = new EInvoice();
      await einvoice.loadFromString(format.content);
      const converted = einvoice.getXmlString();
      // Check key characters are preserved
      const preserved = converted.includes(format.marker);
      console.log(`${format.name}: ${preserved ? '✓' : '✗'} Encoding preserved`);
    } catch (error: unknown) {
      // The thrown value is not guaranteed to be an Error instance.
      const message = error instanceof Error ? error.message : String(error);
      console.log(`${format.name}: Error - ${message}`);
    }
  }

  const elapsed = performance.now() - startTime;
  performanceTracker.addMeasurement('format-encoding', elapsed);
});
t.test('Bidirectional text preservation', async () => {
  // Round-trips a UBL invoice with Arabic and Hebrew party, address and item
  // text and logs how many of the expected strings appear in the output.
  // Informational only: nothing is asserted here.
  const startTime = performance.now();

  // Right-to-left fixture (content kept flush-left: it is the XML payload).
  const rtlInvoice = `<?xml version="1.0" encoding="UTF-8"?>
<Invoice xmlns="urn:oasis:names:specification:ubl:schema:xsd:Invoice-2"
xmlns:cac="urn:oasis:names:specification:ubl:schema:xsd:CommonAggregateComponents-2"
xmlns:cbc="urn:oasis:names:specification:ubl:schema:xsd:CommonBasicComponents-2">
<cbc:ID>RTL-TEST-001</cbc:ID>
<cbc:IssueDate>2025-01-25</cbc:IssueDate>
<cbc:InvoiceTypeCode>380</cbc:InvoiceTypeCode>
<cbc:DocumentCurrencyCode>EUR</cbc:DocumentCurrencyCode>
<cac:AccountingSupplierParty>
<cac:Party>
<cac:PartyName>
<cbc:Name>شركة التقنية المحدودة</cbc:Name>
</cac:PartyName>
<cac:PostalAddress>
<cbc:StreetName>شارع الملك فهد 123</cbc:StreetName>
<cbc:CityName>الرياض</cbc:CityName>
<cac:Country>
<cbc:IdentificationCode>SA</cbc:IdentificationCode>
</cac:Country>
</cac:PostalAddress>
</cac:Party>
</cac:AccountingSupplierParty>
<cac:AccountingCustomerParty>
<cac:Party>
<cac:PartyName>
<cbc:Name>חברת הטכנולוגיה בע"מ</cbc:Name>
</cac:PartyName>
<cac:PostalAddress>
<cbc:StreetName>רחוב דיזנגוף 456</cbc:StreetName>
<cbc:CityName>תל אביב</cbc:CityName>
<cac:Country>
<cbc:IdentificationCode>IL</cbc:IdentificationCode>
</cac:Country>
</cac:PostalAddress>
</cac:Party>
</cac:AccountingCustomerParty>
<cac:InvoiceLine>
<cbc:ID>1</cbc:ID>
<cbc:Note>Mixed text: العربية (Arabic) and עברית (Hebrew) with English</cbc:Note>
<cbc:InvoicedQuantity unitCode="C62">10</cbc:InvoicedQuantity>
<cbc:LineExtensionAmount currencyID="EUR">1000.00</cbc:LineExtensionAmount>
<cac:Item>
<cbc:Name>منتج تقني متقدم / מוצר טכנולוגי מתקדם</cbc:Name>
</cac:Item>
</cac:InvoiceLine>
</Invoice>`;

  const invoice = new EInvoice();
  await invoice.loadFromString(rtlInvoice);
  const roundTripped = invoice.getXmlString();

  // Shorthand for "the serialized output contains this text".
  const has = (needle: string): boolean => roundTripped.includes(needle);

  // One [label, outcome] pair per expected piece of RTL text.
  const rtlChecks: Array<[string, boolean]> = [
    ['Arabic company', has('شركة التقنية المحدودة')],
    ['Arabic street', has('شارع الملك فهد')],
    ['Arabic city', has('الرياض')],
    ['Hebrew company', has('חברת הטכנולוגיה')],
    ['Hebrew street', has('רחוב דיזנגוף')],
    ['Hebrew city', has('תל אביב')],
    ['Mixed RTL/LTR', has('Arabic') && has('Hebrew')],
    ['Arabic product', has('منتج تقني متقدم')],
    ['Hebrew product', has('מוצר טכנולוגי מתקדם')]
  ];

  let rtlPreserved = 0;
  for (const [, ok] of rtlChecks) {
    if (ok) {
      rtlPreserved += 1;
    }
  }
  console.log(`RTL text preservation: ${rtlPreserved}/${rtlChecks.length}`);

  performanceTracker.addMeasurement('rtl-preservation', performance.now() - startTime);
});
t.test('Corpus encoding preservation analysis', async () => {
  // Samples up to 50 XML files from the corpus, round-trips each through
  // EInvoice, tallies which character categories occur in the originals and
  // flags files whose non-ASCII character count drops by more than 20%.
  // Informational only: results are logged, per-file errors are logged and
  // skipped, and nothing is asserted.
  const startTime = performance.now();
  let processedCount = 0;
  let encodingIssues = 0;

  const characterCategories = {
    'ASCII only': 0,
    'Latin extended': 0,
    'Greek': 0,
    'Cyrillic': 0,
    'CJK': 0,
    'Arabic/Hebrew': 0,
    'Special symbols': 0,
    'Emoji': 0
  };
  type Category = keyof typeof characterCategories;

  // Patterns are built once instead of on every loop iteration.
  // `nonAsciiAll` is global for counting via String.prototype.match, which
  // does not carry lastIndex state between calls.
  const nonAscii = /[^\x00-\x7F]/;
  const nonAsciiAll = /[^\x00-\x7F]/g;
  const categoryPatterns: ReadonlyArray<[Category, RegExp]> = [
    ['Latin extended', /[À-ÿĀ-ſ]/],
    ['Greek', /[Α-Ωα-ω]/],
    ['Cyrillic', /[А-Яа-я]/],
    ['CJK', /[\u4E00-\u9FFF\u3040-\u309F\u30A0-\u30FF\uAC00-\uD7AF]/],
    ['Arabic/Hebrew', /[\u0590-\u05FF\u0600-\u06FF]/],
    ['Special symbols', /[©®™€£¥§¶•°±×÷≤≥≠≈∞]/],
    ['Emoji', /[\u{1F300}-\u{1F9FF}]/u]
  ];
  const countNonAscii = (text: string): number => (text.match(nonAsciiAll) ?? []).length;

  const files = await corpusLoader.getAllFiles();
  const xmlFiles = files.filter(f => f.endsWith('.xml') && !f.includes('.pdf'));

  // Sample corpus for encoding analysis
  const sampleSize = Math.min(50, xmlFiles.length);
  const sample = xmlFiles.slice(0, sampleSize);

  for (const file of sample) {
    try {
      const content = await corpusLoader.readFile(file);
      const einvoice = new EInvoice();
      let originalString: string;
      // readFile may hand back text or a buffer; load through the matching API.
      if (typeof content === 'string') {
        originalString = content;
        await einvoice.loadFromString(content);
      } else {
        originalString = content.toString('utf8');
        await einvoice.loadFromBuffer(content);
      }
      const convertedXml = einvoice.getXmlString();

      // Categorize content: a file is either pure ASCII or counted once in
      // every non-ASCII category it matches.
      if (!nonAscii.test(originalString)) {
        characterCategories['ASCII only']++;
      } else {
        for (const [category, pattern] of categoryPatterns) {
          if (pattern.test(originalString)) {
            characterCategories[category]++;
          }
        }
      }

      // Simple check for major encoding loss
      const originalNonAscii = countNonAscii(originalString);
      const convertedNonAscii = countNonAscii(convertedXml);
      if (originalNonAscii > 0 && convertedNonAscii < originalNonAscii * 0.8) {
        encodingIssues++;
        console.log(`Potential encoding loss in ${file}: ${originalNonAscii} -> ${convertedNonAscii} non-ASCII chars`);
      }
      processedCount++;
    } catch (error: unknown) {
      // The thrown value is not guaranteed to be an Error instance.
      const message = error instanceof Error ? error.message : String(error);
      console.log(`Encoding analysis error in ${file}:`, message);
    }
  }

  console.log(`Corpus encoding analysis (${processedCount} files):`);
  console.log('Character categories found:');
  // Most frequent categories first; categories with no hits are omitted.
  Object.entries(characterCategories)
    .filter(([, count]) => count > 0)
    .sort((a, b) => b[1] - a[1])
    .forEach(([category, count]) => {
      console.log(` ${category}: ${count} files`);
    });
  console.log(`Files with potential encoding issues: ${encodingIssues}`);

  const elapsed = performance.now() - startTime;
  performanceTracker.addMeasurement('corpus-encoding', elapsed);
});
// Print performance summary
// NOTE(review): the t.test(...) callbacks registered above are async and their
// return values are not awaited in this function. Whether any measurement has
// been recorded by the time the summary is printed depends on how t.test
// schedules those callbacks — TODO confirm.
performanceTracker.printSummary();
// Performance assertions: the tracker's average must stay below 400
// (presumably milliseconds, since every measurement added above is a
// performance.now() delta — verify against PerformanceTracker).
const avgTime = performanceTracker.getAverageTime();
expect(avgTime).toBeLessThan(400); // Encoding operations may take longer
});
// Start the tap runner once all tests in this file are registered.
tap.start();