einvoice/test/suite/einvoice_conversion/test.conv-07.character-encoding.ts
2025-05-25 19:45:37 +00:00

523 lines
21 KiB
TypeScript
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

import { expect, tap } from '@git.zone/tstest/tapbundle';
import * as plugins from '../plugins.js';
import { EInvoice } from '../../../ts/index.js';
import { CorpusLoader } from '../corpus.loader.js';
import { PerformanceTracker } from '../performance.tracker.js';
tap.test('CONV-07: Character Encoding - should preserve character encoding during conversion', async (t) => {
// CONV-07: Verify character encoding is maintained across format conversions
// This test ensures special characters and international text are preserved
// Shared by all sub-tests below: each one records its elapsed milliseconds via
// addMeasurement(), and the summary/average are read at the end of this test.
const performanceTracker = new PerformanceTracker('CONV-07: Character Encoding');
// Used by the corpus analysis sub-test to list and read sample invoice files.
const corpusLoader = new CorpusLoader();
t.test('UTF-8 encoding preservation in conversion', async () => {
  // Loads a UBL invoice containing text from many scripts, serializes it back
  // with getXmlString(), and counts how many probe strings are still present.
  // More than 90% of the probes must survive for the sub-test to pass.
  const startTime = performance.now();
  // UBL invoice with various UTF-8 characters.
  // The ampersand in the supplier name is escaped: a bare '&' in character
  // data is not well-formed XML and a strict parser may reject the document.
  const ublInvoice = `<?xml version="1.0" encoding="UTF-8"?>
<Invoice xmlns="urn:oasis:names:specification:ubl:schema:xsd:Invoice-2"
xmlns:cac="urn:oasis:names:specification:ubl:schema:xsd:CommonAggregateComponents-2"
xmlns:cbc="urn:oasis:names:specification:ubl:schema:xsd:CommonBasicComponents-2">
<cbc:ID>UTF8-CONV-001</cbc:ID>
<cbc:IssueDate>2025-01-25</cbc:IssueDate>
<cbc:InvoiceTypeCode>380</cbc:InvoiceTypeCode>
<cbc:Note>Special characters: € £ ¥ © ® ™ § ¶ • ° ± × ÷ –</cbc:Note>
<cbc:Note>Diacritics: àáâãäåæçèéêëìíîïðñòóôõöøùúûüýþÿ</cbc:Note>
<cbc:Note>Greek: ΑΒΓΔΕΖΗΘΙΚΛΜΝΞΟΠΡΣΤΥΦΧΨΩ αβγδεζηθικλμνξοπρστυφχψω</cbc:Note>
<cbc:Note>Cyrillic: АБВГДЕЖЗИЙКЛМНОПРСТУФХЦЧШЩЪЫЬЭЮЯ</cbc:Note>
<cbc:Note>CJK: 中文 日本語 한국어</cbc:Note>
<cbc:Note>Arabic: العربية مرحبا</cbc:Note>
<cbc:Note>Hebrew: עברית שלום</cbc:Note>
<cbc:Note>Emoji: 😀 🎉 💰 📧 🌍</cbc:Note>
<cbc:DocumentCurrencyCode>EUR</cbc:DocumentCurrencyCode>
<cac:AccountingSupplierParty>
<cac:Party>
<cac:PartyName>
<cbc:Name>Société Générale Müller &amp; Associés</cbc:Name>
</cac:PartyName>
<cac:PostalAddress>
<cbc:StreetName>Rue de la Légion d'Honneur</cbc:StreetName>
<cbc:CityName>Zürich</cbc:CityName>
<cbc:PostalZone>8001</cbc:PostalZone>
<cac:Country>
<cbc:IdentificationCode>CH</cbc:IdentificationCode>
</cac:Country>
</cac:PostalAddress>
<cac:Contact>
<cbc:Name>François Lefèvre</cbc:Name>
<cbc:ElectronicMail>françois@société-générale.ch</cbc:ElectronicMail>
</cac:Contact>
</cac:Party>
</cac:AccountingSupplierParty>
<cac:AccountingCustomerParty>
<cac:Party>
<cac:PartyName>
<cbc:Name>北京科技有限公司 (Beijing Tech Co.)</cbc:Name>
</cac:PartyName>
<cac:PostalAddress>
<cbc:StreetName>北京市朝阳区建国路88号</cbc:StreetName>
<cbc:CityName>北京</cbc:CityName>
<cac:Country>
<cbc:IdentificationCode>CN</cbc:IdentificationCode>
</cac:Country>
</cac:PostalAddress>
</cac:Party>
</cac:AccountingCustomerParty>
<cac:InvoiceLine>
<cbc:ID>1</cbc:ID>
<cbc:Note>Spëcïål cháracters in line: ñ ç ø å æ þ ð</cbc:Note>
<cbc:InvoicedQuantity unitCode="C62">10</cbc:InvoicedQuantity>
<cbc:LineExtensionAmount currencyID="EUR">1000.00</cbc:LineExtensionAmount>
<cac:Item>
<cbc:Name>Bücher über Köln München</cbc:Name>
<cbc:Description>Prix: 25,50 € (TVA incluse) • Größe: 21×29,7 cm²</cbc:Description>
</cac:Item>
<cac:Price>
<cbc:PriceAmount currencyID="EUR">100.00</cbc:PriceAmount>
</cac:Price>
</cac:InvoiceLine>
</Invoice>`;
  const einvoice = new EInvoice();
  await einvoice.loadFromString(ublInvoice);
  // Convert to another format (simulated by getting XML back)
  const convertedXml = einvoice.getXmlString();
  // Probe strings that must appear verbatim in the serialized output.
  // Every probe is a substring of the invoice above; a probe that is empty or
  // absent from the input would make its check meaningless.
  const encodingChecks = [
    // Currency symbols
    { char: '€', name: 'Euro' },
    { char: '£', name: 'Pound' },
    { char: '¥', name: 'Yen' },
    // Special symbols
    { char: '©', name: 'Copyright' },
    { char: '®', name: 'Registered' },
    { char: '™', name: 'Trademark' },
    { char: '×', name: 'Multiplication' },
    { char: '÷', name: 'Division' },
    // Diacritics
    { char: 'àáâãäå', name: 'Latin a variations' },
    // Matches the invoice-line note; the letters are space-separated there.
    { char: 'ñ ç ø å æ þ ð', name: 'Special Latin' },
    // Greek
    { char: 'ΑΒΓΔ', name: 'Greek uppercase' },
    { char: 'αβγδ', name: 'Greek lowercase' },
    // Cyrillic
    { char: 'АБВГ', name: 'Cyrillic' },
    // CJK
    { char: '中文', name: 'Chinese' },
    { char: '日本語', name: 'Japanese' },
    { char: '한국어', name: 'Korean' },
    // RTL
    { char: 'العربية', name: 'Arabic' },
    { char: 'עברית', name: 'Hebrew' },
    // Emoji
    { char: '😀', name: 'Emoji' },
    // Names with diacritics
    { char: 'François Lefèvre', name: 'French name' },
    { char: 'Zürich', name: 'Swiss city' },
    { char: 'Müller', name: 'German name' },
    // Special punctuation
    // Written as an escape so the probe cannot silently become an empty
    // string, which includes() would match unconditionally.
    { char: '\u2013', name: 'En dash' },
    { char: '•', name: 'Bullet' },
    { char: '²', name: 'Superscript' }
  ];
  let preservedCount = 0;
  const missingChars: string[] = [];
  for (const check of encodingChecks) {
    if (convertedXml.includes(check.char)) {
      preservedCount++;
    } else {
      missingChars.push(`${check.name} (${check.char})`);
    }
  }
  console.log(`UTF-8 preservation: ${preservedCount}/${encodingChecks.length} character sets preserved`);
  if (missingChars.length > 0) {
    console.log('Missing characters:', missingChars);
  }
  expect(preservedCount).toBeGreaterThan(encodingChecks.length * 0.9); // Allow 10% loss
  const elapsed = performance.now() - startTime;
  performanceTracker.addMeasurement('utf8-preservation', elapsed);
});
t.test('Entity encoding in conversion', async () => {
  // Loads a CII invoice whose text uses predefined, decimal and hexadecimal
  // character references, serializes it back, and reports for each kind
  // whether either the escaped or the decoded form is present in the output.
  // Report-only: outcomes are logged, nothing is asserted.
  const startTime = performance.now();
  // CII invoice with XML entities
  const ciiInvoice = `<?xml version="1.0" encoding="UTF-8"?>
<rsm:CrossIndustryInvoice xmlns:rsm="urn:un:unece:uncefact:data:standard:CrossIndustryInvoice:100"
xmlns:ram="urn:un:unece:uncefact:data:standard:ReusableAggregateBusinessInformationEntity:100">
<rsm:ExchangedDocument>
<ram:ID>ENTITY-CONV-001</ram:ID>
<ram:IncludedNote>
<ram:Content>XML entities: &lt;invoice&gt; &amp; "quotes" with 'apostrophes'</ram:Content>
</ram:IncludedNote>
<ram:IncludedNote>
<ram:Content>Numeric entities: &#8364; &#163; &#165; &#8482;</ram:Content>
</ram:IncludedNote>
<ram:IncludedNote>
<ram:Content>Hex entities: &#x20AC; &#x00A3; &#x00A5;</ram:Content>
</ram:IncludedNote>
</rsm:ExchangedDocument>
<rsm:SupplyChainTradeTransaction>
<ram:IncludedSupplyChainTradeLineItem>
<ram:SpecifiedTradeProduct>
<ram:Name>Product &amp; Service &lt;Premium&gt;</ram:Name>
<ram:Description>Price comparison: USD &lt; EUR &gt; GBP</ram:Description>
</ram:SpecifiedTradeProduct>
</ram:IncludedSupplyChainTradeLineItem>
<ram:ApplicableHeaderTradeAgreement>
<ram:SellerTradeParty>
<ram:Name>Smith &amp; Jones "Trading" Ltd.</ram:Name>
<ram:Description>Registered in England &amp; Wales</ram:Description>
</ram:SellerTradeParty>
</ram:ApplicableHeaderTradeAgreement>
</rsm:SupplyChainTradeTransaction>
</rsm:CrossIndustryInvoice>`;
  const einvoice = new EInvoice();
  await einvoice.loadFromString(ciiInvoice);
  const convertedXml = einvoice.getXmlString();
  // Check entity preservation: each check accepts either representation,
  // because a serializer may legitimately emit the reference or the character.
  const entityChecks = {
    'Ampersand entity': convertedXml.includes('&amp;') || convertedXml.includes(' & '),
    'Less than entity': convertedXml.includes('&lt;') || convertedXml.includes(' < '),
    'Greater than entity': convertedXml.includes('&gt;') || convertedXml.includes(' > '),
    'Quote preservation': convertedXml.includes('"quotes"') || convertedXml.includes('&quot;quotes&quot;'),
    'Apostrophe preservation': convertedXml.includes("'apostrophes'") || convertedXml.includes('&apos;apostrophes&apos;'),
    'Numeric entities': convertedXml.includes('€') || convertedXml.includes('&#8364;'),
    'Hex entities': convertedXml.includes('£') || convertedXml.includes('&#x00A3;')
  };
  // Prefix every line with a pass/fail marker so the two outcomes can be told
  // apart in the log.
  for (const [check, passed] of Object.entries(entityChecks)) {
    console.log(`${passed ? '✓' : '✗'} ${check}`);
  }
  const elapsed = performance.now() - startTime;
  performanceTracker.addMeasurement('entity-encoding', elapsed);
});
t.test('Mixed encoding scenarios', async () => {
  // Round-trips a UBL invoice that combines CDATA sections, character
  // references and literal non-ASCII text, then requires that more than 80%
  // of the substring checks below still hold on the serialized XML.
  const startTime = performance.now();
  // Invoice with mixed encoding challenges
  const mixedInvoice = `<?xml version="1.0" encoding="UTF-8"?>
<Invoice xmlns="urn:oasis:names:specification:ubl:schema:xsd:Invoice-2"
xmlns:cac="urn:oasis:names:specification:ubl:schema:xsd:CommonAggregateComponents-2"
xmlns:cbc="urn:oasis:names:specification:ubl:schema:xsd:CommonBasicComponents-2">
<cbc:ID>MIXED-ENC-001</cbc:ID>
<cbc:IssueDate>2025-01-25</cbc:IssueDate>
<cbc:InvoiceTypeCode>380</cbc:InvoiceTypeCode>
<cbc:DocumentCurrencyCode>EUR</cbc:DocumentCurrencyCode>
<cbc:Note><![CDATA[CDATA content: <tag> & special chars € £ ¥]]></cbc:Note>
<cbc:Note>Mixed: Normal text with &#8364;100 and &lt;escaped&gt; content</cbc:Note>
<cac:AccountingSupplierParty>
<cac:Party>
<cac:PartyName>
<cbc:Name>Müller &amp; Associés S.à r.l.</cbc:Name>
</cac:PartyName>
<cac:PostalAddress>
<cbc:StreetName>Hauptstraße 42 (Gebäude "A")</cbc:StreetName>
<cbc:AdditionalStreetName><![CDATA[Floor 3 & 4]]></cbc:AdditionalStreetName>
<cbc:CityName>Köln</cbc:CityName>
<cac:Country>
<cbc:IdentificationCode>DE</cbc:IdentificationCode>
</cac:Country>
</cac:PostalAddress>
</cac:Party>
</cac:AccountingSupplierParty>
<cac:PaymentTerms>
<cbc:Note>Payment terms: 2/10 net 30 (2% if paid &lt;= 10 days)</cbc:Note>
<cbc:Note><![CDATA[Bank: Société Générale
IBAN: FR14 2004 1010 0505 0001 3M02 606
BIC: SOGEFRPP]]></cbc:Note>
</cac:PaymentTerms>
<cac:InvoiceLine>
<cbc:ID>1</cbc:ID>
<cbc:Note>Temperature range: -40°C ≤ T ≤ +85°C</cbc:Note>
<cbc:InvoicedQuantity unitCode="C62">10</cbc:InvoicedQuantity>
<cbc:LineExtensionAmount currencyID="EUR">1000.00</cbc:LineExtensionAmount>
<cac:Item>
<cbc:Name>Product™ with ® symbol © 2025</cbc:Name>
<cbc:Description>Size: 10cm × 20cm × 5cm • Weight: ≈1kg</cbc:Description>
<cac:AdditionalItemProperty>
<cbc:Name>Special chars</cbc:Name>
<cbc:Value>α β γ δ ε ≠ ∞ ∑ √ ∫</cbc:Value>
</cac:AdditionalItemProperty>
</cac:Item>
</cac:InvoiceLine>
</Invoice>`;
  const invoice = new EInvoice();
  await invoice.loadFromString(mixedInvoice);
  const roundTripped = invoice.getXmlString();
  // Shorthand for "the serialized XML contains this text".
  const has = (needle: string): boolean => roundTripped.includes(needle);
  // Check mixed encoding preservation
  const mixedChecks = {
    'CDATA content': has('CDATA content') || has('<tag>'),
    'Mixed entities and Unicode': has('€100') || has('&#8364;100'),
    'German umlauts': has('Müller') && has('Köln'),
    'French accents': has('Associés') && has('Société'),
    'Mathematical symbols': has('≤') && has('≈'),
    'Trademark symbols': has('™') && has('®'),
    'Greek letters': has('α') || has('beta'),
    'Temperature notation': has('°C'),
    'Multiplication sign': has('×'),
    'CDATA in address': has('Floor 3') || has('&amp; 4')
  };
  const outcomes = Object.values(mixedChecks);
  const totalChecks = outcomes.length;
  const passedChecks = outcomes.filter(Boolean).length;
  console.log(`Mixed encoding: ${passedChecks}/${totalChecks} checks passed`);
  expect(passedChecks).toBeGreaterThan(totalChecks * 0.8);
  performanceTracker.addMeasurement('mixed-encoding', performance.now() - startTime);
});
t.test('Encoding in different invoice formats', async () => {
  // Loads three small documents of different shapes, serializes each back and
  // logs whether its characteristic non-ASCII text is still present.
  // Report-only: a load/serialize failure is logged, not rethrown.
  const startTime = performance.now();
  // Test encoding across different format characteristics.
  // `expected` is the text that must reappear in the serialized output.
  const formats = [
    {
      name: 'UBL with namespaces',
      expected: '€£¥',
      content: `<?xml version="1.0" encoding="UTF-8"?>
<ubl:Invoice xmlns:ubl="urn:oasis:names:specification:ubl:schema:xsd:Invoice-2">
<cbc:ID xmlns:cbc="urn:oasis:names:specification:ubl:schema:xsd:CommonBasicComponents-2">NS-€-001</cbc:ID>
<cbc:Note xmlns:cbc="urn:oasis:names:specification:ubl:schema:xsd:CommonBasicComponents-2">Namespace test: €£¥</cbc:Note>
</ubl:Invoice>`
    },
    {
      name: 'CII with complex structure',
      expected: 'Ü',
      content: `<?xml version="1.0" encoding="UTF-8"?>
<CrossIndustryInvoice xmlns="urn:un:unece:uncefact:data:standard:CrossIndustryInvoice:100">
<ExchangedDocument>
<ID>CII-Ü-001</ID>
<Name>Übersicht über Änderungen</Name>
</ExchangedDocument>
</CrossIndustryInvoice>`
    },
    {
      name: 'Factur-X with French',
      expected: 'détaillée',
      content: `<?xml version="1.0" encoding="UTF-8"?>
<CrossIndustryInvoice>
<ExchangedDocument>
<ID>FX-FR-001</ID>
<IncludedNote>
<Content>Facture détaillée avec références spéciales</Content>
</IncludedNote>
</ExchangedDocument>
</CrossIndustryInvoice>`
    }
  ];
  for (const format of formats) {
    try {
      const einvoice = new EInvoice();
      await einvoice.loadFromString(format.content);
      const converted = einvoice.getXmlString();
      // Check key characters are preserved
      const preserved = converted.includes(format.expected);
      console.log(`${format.name}: ${preserved ? '✓' : '✗'} Encoding preserved`);
    } catch (error: unknown) {
      // The caught value is not guaranteed to be an Error; narrow before
      // reading `.message`.
      const message = error instanceof Error ? error.message : String(error);
      console.log(`${format.name}: Error - ${message}`);
    }
  }
  const elapsed = performance.now() - startTime;
  performanceTracker.addMeasurement('format-encoding', elapsed);
});
t.test('Bidirectional text preservation', async () => {
  // Round-trips a UBL invoice with Arabic and Hebrew party, address and item
  // text and logs how many of those strings are found in the serialized XML.
  // Report-only: the count is printed, nothing is asserted.
  const startTime = performance.now();
  // Test RTL (Right-to-Left) text preservation
  const rtlInvoice = `<?xml version="1.0" encoding="UTF-8"?>
<Invoice xmlns="urn:oasis:names:specification:ubl:schema:xsd:Invoice-2"
xmlns:cac="urn:oasis:names:specification:ubl:schema:xsd:CommonAggregateComponents-2"
xmlns:cbc="urn:oasis:names:specification:ubl:schema:xsd:CommonBasicComponents-2">
<cbc:ID>RTL-TEST-001</cbc:ID>
<cbc:IssueDate>2025-01-25</cbc:IssueDate>
<cbc:InvoiceTypeCode>380</cbc:InvoiceTypeCode>
<cbc:DocumentCurrencyCode>EUR</cbc:DocumentCurrencyCode>
<cac:AccountingSupplierParty>
<cac:Party>
<cac:PartyName>
<cbc:Name>شركة التقنية المحدودة</cbc:Name>
</cac:PartyName>
<cac:PostalAddress>
<cbc:StreetName>شارع الملك فهد 123</cbc:StreetName>
<cbc:CityName>الرياض</cbc:CityName>
<cac:Country>
<cbc:IdentificationCode>SA</cbc:IdentificationCode>
</cac:Country>
</cac:PostalAddress>
</cac:Party>
</cac:AccountingSupplierParty>
<cac:AccountingCustomerParty>
<cac:Party>
<cac:PartyName>
<cbc:Name>חברת הטכנולוגיה בע"מ</cbc:Name>
</cac:PartyName>
<cac:PostalAddress>
<cbc:StreetName>רחוב דיזנגוף 456</cbc:StreetName>
<cbc:CityName>תל אביב</cbc:CityName>
<cac:Country>
<cbc:IdentificationCode>IL</cbc:IdentificationCode>
</cac:Country>
</cac:PostalAddress>
</cac:Party>
</cac:AccountingCustomerParty>
<cac:InvoiceLine>
<cbc:ID>1</cbc:ID>
<cbc:Note>Mixed text: العربية (Arabic) and עברית (Hebrew) with English</cbc:Note>
<cbc:InvoicedQuantity unitCode="C62">10</cbc:InvoicedQuantity>
<cbc:LineExtensionAmount currencyID="EUR">1000.00</cbc:LineExtensionAmount>
<cac:Item>
<cbc:Name>منتج تقني متقدم / מוצר טכנולוגי מתקדם</cbc:Name>
</cac:Item>
</cac:InvoiceLine>
</Invoice>`;
  const invoice = new EInvoice();
  await invoice.loadFromString(rtlInvoice);
  const roundTripped = invoice.getXmlString();
  // Shorthand for "the serialized XML contains this text".
  const has = (needle: string): boolean => roundTripped.includes(needle);
  // Check RTL text preservation
  const rtlChecks = {
    'Arabic company': has('شركة التقنية المحدودة'),
    'Arabic street': has('شارع الملك فهد'),
    'Arabic city': has('الرياض'),
    'Hebrew company': has('חברת הטכנולוגיה'),
    'Hebrew street': has('רחוב דיזנגוף'),
    'Hebrew city': has('תל אביב'),
    'Mixed RTL/LTR': has('Arabic') && has('Hebrew'),
    'Arabic product': has('منتج تقني متقدم'),
    'Hebrew product': has('מוצר טכנולוגי מתקדם')
  };
  const outcomes = Object.values(rtlChecks);
  const rtlPreserved = outcomes.filter(Boolean).length;
  console.log(`RTL text preservation: ${rtlPreserved}/${outcomes.length}`);
  performanceTracker.addMeasurement('rtl-preservation', performance.now() - startTime);
});
t.test('Corpus encoding preservation analysis', async () => {
  // Round-trips up to 50 corpus XML files, tallies which character classes
  // occur in the originals, and flags files whose serialized form has fewer
  // than 80% of the original's non-ASCII characters.
  // Report-only: results are logged, and per-file failures are logged and
  // skipped rather than rethrown.
  const startTime = performance.now();
  let processedCount = 0;
  let encodingIssues = 0;
  const characterCategories = {
    'ASCII only': 0,
    'Latin extended': 0,
    'Greek': 0,
    'Cyrillic': 0,
    'CJK': 0,
    'Arabic/Hebrew': 0,
    'Special symbols': 0,
    'Emoji': 0
  };
  // Detection patterns, built once instead of once per file and written as
  // code-point escapes so look-alike characters cannot creep into the ranges.
  const categoryPatterns: Array<[keyof typeof characterCategories, RegExp]> = [
    // U+00D7 (×) and U+00F7 (÷) sit inside the Latin-1 letter range but are
    // symbols, so they are excluded here and left to 'Special symbols'.
    ['Latin extended', /[\u00C0-\u00D6\u00D8-\u00F6\u00F8-\u00FF\u0100-\u017F]/],
    ['Greek', /[\u0391-\u03A9\u03B1-\u03C9]/],
    ['Cyrillic', /[\u0410-\u042F\u0430-\u044F]/],
    ['CJK', /[\u4E00-\u9FFF\u3040-\u309F\u30A0-\u30FF\uAC00-\uD7AF]/],
    ['Arabic/Hebrew', /[\u0590-\u05FF\u0600-\u06FF]/],
    ['Special symbols', /[©®™€£¥§¶•°±×÷≤≥≠≈∞]/],
    ['Emoji', /[\u{1F300}-\u{1F9FF}]/u]
  ];
  // Number of UTF-16 code units outside the ASCII range.
  const countNonAscii = (text: string): number => (text.match(/[^\x00-\x7F]/g) || []).length;
  const files = await corpusLoader.getAllFiles();
  const xmlFiles = files.filter(f => f.endsWith('.xml') && !f.includes('.pdf'));
  // Sample corpus for encoding analysis
  const sampleSize = Math.min(50, xmlFiles.length);
  const sample = xmlFiles.slice(0, sampleSize);
  for (const file of sample) {
    try {
      const content = await corpusLoader.readFile(file);
      const einvoice = new EInvoice();
      let originalString: string;
      // readFile() may hand back text or a binary buffer; load through the
      // matching entry point and keep a string copy for the analysis.
      if (typeof content === 'string') {
        originalString = content;
        await einvoice.loadFromString(content);
      } else {
        originalString = content.toString('utf8');
        await einvoice.loadFromBuffer(content);
      }
      const convertedXml = einvoice.getXmlString();
      const originalNonAscii = countNonAscii(originalString);
      // Categorize content
      if (originalNonAscii === 0) {
        characterCategories['ASCII only']++;
      } else {
        for (const [category, pattern] of categoryPatterns) {
          if (pattern.test(originalString)) characterCategories[category]++;
        }
      }
      // Simple check for major encoding loss
      const convertedNonAscii = countNonAscii(convertedXml);
      if (originalNonAscii > 0 && convertedNonAscii < originalNonAscii * 0.8) {
        encodingIssues++;
        console.log(`Potential encoding loss in ${file}: ${originalNonAscii} -> ${convertedNonAscii} non-ASCII chars`);
      }
      processedCount++;
    } catch (error: unknown) {
      // The caught value is not guaranteed to be an Error; narrow before
      // reading `.message`.
      const message = error instanceof Error ? error.message : String(error);
      console.log(`Encoding analysis error in ${file}:`, message);
    }
  }
  console.log(`Corpus encoding analysis (${processedCount} files):`);
  console.log('Character categories found:');
  Object.entries(characterCategories)
    .filter(([_, count]) => count > 0)
    .sort((a, b) => b[1] - a[1])
    .forEach(([category, count]) => {
      console.log(` ${category}: ${count} files`);
    });
  console.log(`Files with potential encoding issues: ${encodingIssues}`);
  const elapsed = performance.now() - startTime;
  performanceTracker.addMeasurement('corpus-encoding', elapsed);
});
// Emit the tracker's summary of the measurements recorded by the sub-tests.
// NOTE(review): this runs right after the t.test() calls above return; whether
// the sub-tests have finished by then depends on tapbundle — confirm.
performanceTracker.printSummary();
// Performance assertion: the tracker's average must stay under 400
// (presumably milliseconds, matching the performance.now() deltas recorded above).
expect(performanceTracker.getAverageTime()).toBeLessThan(400); // Encoding operations may take longer
});
// Hand control to the tap runner; presumably this executes the test registered above.
tap.start();