einvoice/test/suite/einvoice_encoding/test.enc-01.utf8-encoding.ts
2025-05-25 19:45:37 +00:00

280 lines
9.9 KiB
TypeScript
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

import { expect, tap } from '@git.zone/tstest/tapbundle';
import * as plugins from '../plugins.js';
import { EInvoice } from '../../../ts/index.js';
import { CorpusLoader } from '../corpus.loader.js';
import { PerformanceTracker } from '../performance.tracker.js';
tap.test('ENC-01: UTF-8 Encoding - should handle UTF-8 encoded documents correctly', async (t) => {
// ENC-01: Verify correct handling of UTF-8 encoded XML documents
// This test ensures that the library can properly read, process, and write UTF-8 encoded invoices
const performanceTracker = new PerformanceTracker('ENC-01: UTF-8 Encoding');
const corpusLoader = new CorpusLoader();
t.test('Basic UTF-8 encoding support', async () => {
const startTime = performance.now();
// Test with UTF-8 encoded content containing various characters
const utf8Content = `<?xml version="1.0" encoding="UTF-8"?>
<Invoice xmlns="urn:oasis:names:specification:ubl:schema:xsd:Invoice-2">
<UBLVersionID>2.1</UBLVersionID>
<CustomizationID>urn:cen.eu:en16931:2017#compliant#urn:fdc:peppol.eu:2017:poacc:billing:3.0</CustomizationID>
<ProfileID>urn:fdc:peppol.eu:2017:poacc:billing:01:1.0</ProfileID>
<ID>UTF8-TEST-001</ID>
<IssueDate>2025-01-25</IssueDate>
<InvoiceTypeCode>380</InvoiceTypeCode>
<Note>UTF-8 Test: €£¥ñüäöß 中文 العربية русский 日本語 한국어 🌍📧</Note>
<DocumentCurrencyCode>EUR</DocumentCurrencyCode>
<AccountingSupplierParty>
<Party>
<PartyName>
<Name>UTF-8 Supplier GmbH</Name>
</PartyName>
</Party>
</AccountingSupplierParty>
<AccountingCustomerParty>
<Party>
<PartyName>
<Name>Büßer & Müller GmbH</Name>
</PartyName>
</Party>
</AccountingCustomerParty>
<LegalMonetaryTotal>
<TaxExclusiveAmount currencyID="EUR">100.00</TaxExclusiveAmount>
<TaxInclusiveAmount currencyID="EUR">119.00</TaxInclusiveAmount>
<PayableAmount currencyID="EUR">119.00</PayableAmount>
</LegalMonetaryTotal>
</Invoice>`;
const einvoice = new EInvoice();
await einvoice.loadFromString(utf8Content);
// Verify encoding is preserved
const xmlString = einvoice.getXmlString();
expect(xmlString).toContain('encoding="UTF-8"');
expect(xmlString).toContain('€£¥ñüäöß');
expect(xmlString).toContain('中文');
expect(xmlString).toContain('العربية');
expect(xmlString).toContain('русский');
expect(xmlString).toContain('日本語');
expect(xmlString).toContain('한국어');
expect(xmlString).toContain('🌍📧');
expect(xmlString).toContain('Büßer & Müller GmbH');
const elapsed = performance.now() - startTime;
performanceTracker.addMeasurement('basic-utf8', elapsed);
});
t.test('UTF-8 BOM handling', async () => {
const startTime = performance.now();
// Test with UTF-8 BOM (Byte Order Mark)
const utf8BOM = Buffer.from([0xEF, 0xBB, 0xBF]);
const xmlContent = `<?xml version="1.0" encoding="UTF-8"?>
<Invoice xmlns="urn:oasis:names:specification:ubl:schema:xsd:Invoice-2">
<UBLVersionID>2.1</UBLVersionID>
<ID>UTF8-BOM-TEST</ID>
<IssueDate>2025-01-25</IssueDate>
<Note>UTF-8 with BOM: Spëcïål Chäracters</Note>
</Invoice>`;
const contentWithBOM = Buffer.concat([utf8BOM, Buffer.from(xmlContent, 'utf8')]);
const einvoice = new EInvoice();
try {
await einvoice.loadFromBuffer(contentWithBOM);
// Verify BOM is handled correctly
const parsedData = einvoice.getInvoiceData();
expect(parsedData).toBeTruthy();
const xmlString = einvoice.getXmlString();
expect(xmlString).toContain('UTF8-BOM-TEST');
expect(xmlString).toContain('Spëcïål Chäracters');
// BOM should not appear in the output
expect(xmlString.charCodeAt(0)).not.toBe(0xFEFF);
} catch (error) {
// Some implementations might not support BOM
console.log('UTF-8 BOM handling not supported:', error.message);
}
const elapsed = performance.now() - startTime;
performanceTracker.addMeasurement('utf8-bom', elapsed);
});
t.test('UTF-8 without explicit declaration', async () => {
const startTime = performance.now();
// Test UTF-8 content without encoding declaration (should default to UTF-8)
const implicitUtf8 = `<?xml version="1.0"?>
<Invoice xmlns="urn:oasis:names:specification:ubl:schema:xsd:Invoice-2">
<UBLVersionID>2.1</UBLVersionID>
<ID>IMPLICIT-UTF8</ID>
<Note>Köln München København</Note>
</Invoice>`;
const einvoice = new EInvoice();
await einvoice.loadFromString(implicitUtf8);
// Verify UTF-8 is used by default
const xmlString = einvoice.getXmlString();
expect(xmlString).toContain('Köln München København');
const elapsed = performance.now() - startTime;
performanceTracker.addMeasurement('implicit-utf8', elapsed);
});
t.test('Multi-byte UTF-8 sequences', async () => {
const startTime = performance.now();
// Test various UTF-8 multi-byte sequences
const multiByteContent = `<?xml version="1.0" encoding="UTF-8"?>
<Invoice xmlns="urn:oasis:names:specification:ubl:schema:xsd:Invoice-2">
<UBLVersionID>2.1</UBLVersionID>
<ID>MULTIBYTE-UTF8</ID>
<Note>
2-byte: £¥€ñüäöß
3-byte: ₹₽₨ 中文漢字
4-byte: 𝕳𝖊𝖑𝖑𝖔 🎉🌍🚀
Mixed: Prix: 42,50€ (včetně DPH)
</Note>
</Invoice>`;
const einvoice = new EInvoice();
await einvoice.loadFromString(multiByteContent);
const xmlString = einvoice.getXmlString();
// Verify all multi-byte sequences are preserved
expect(xmlString).toContain('£¥€ñüäöß');
expect(xmlString).toContain('₹₽₨');
expect(xmlString).toContain('中文漢字');
expect(xmlString).toContain('𝕳𝖊𝖑𝖑𝖔');
expect(xmlString).toContain('🎉🌍🚀');
expect(xmlString).toContain('42,50€');
expect(xmlString).toContain('včetně DPH');
const elapsed = performance.now() - startTime;
performanceTracker.addMeasurement('multibyte-utf8', elapsed);
});
t.test('UTF-8 encoding in attributes', async () => {
const startTime = performance.now();
const attributeContent = `<?xml version="1.0" encoding="UTF-8"?>
<Invoice xmlns="urn:oasis:names:specification:ubl:schema:xsd:Invoice-2">
<UBLVersionID>2.1</UBLVersionID>
<ID>UTF8-ATTR-TEST</ID>
<PaymentMeans>
<PaymentMeansCode name="Überweisung">30</PaymentMeansCode>
<PayeeFinancialAccount>
<Name>Büro für Städtebau</Name>
<FinancialInstitutionBranch>
<Name>Sparkasse Köln/Bonn</Name>
</FinancialInstitutionBranch>
</PayeeFinancialAccount>
</PaymentMeans>
<TaxTotal>
<TaxAmount currencyID="EUR" symbol="€">19.00</TaxAmount>
</TaxTotal>
</Invoice>`;
const einvoice = new EInvoice();
await einvoice.loadFromString(attributeContent);
const xmlString = einvoice.getXmlString();
expect(xmlString).toContain('name="Überweisung"');
expect(xmlString).toContain('Büro für Städtebau');
expect(xmlString).toContain('Sparkasse Köln/Bonn');
expect(xmlString).toContain('symbol="€"');
const elapsed = performance.now() - startTime;
performanceTracker.addMeasurement('utf8-attributes', elapsed);
});
t.test('UTF-8 corpus validation', async () => {
const startTime = performance.now();
let processedCount = 0;
let utf8Count = 0;
const files = await corpusLoader.getAllFiles();
const xmlFiles = files.filter(f => f.endsWith('.xml'));
// Test a sample of XML files for UTF-8 handling
const sampleSize = Math.min(50, xmlFiles.length);
const sample = xmlFiles.slice(0, sampleSize);
for (const file of sample) {
try {
const content = await corpusLoader.readFile(file);
const einvoice = new EInvoice();
if (typeof content === 'string') {
await einvoice.loadFromString(content);
} else {
await einvoice.loadFromBuffer(content);
}
const xmlString = einvoice.getXmlString();
// Check if encoding is preserved or defaulted to UTF-8
if (xmlString.includes('encoding="UTF-8"') || xmlString.includes("encoding='UTF-8'")) {
utf8Count++;
}
// Verify content is properly encoded
expect(xmlString).toBeTruthy();
expect(xmlString.length).toBeGreaterThan(0);
processedCount++;
} catch (error) {
// Some files might have different encodings
console.log(`Non-UTF-8 or invalid file: ${file}`);
}
}
console.log(`UTF-8 corpus test: ${utf8Count}/${processedCount} files explicitly use UTF-8`);
expect(processedCount).toBeGreaterThan(0);
const elapsed = performance.now() - startTime;
performanceTracker.addMeasurement('corpus-utf8', elapsed);
});
t.test('UTF-8 normalization', async () => {
const startTime = performance.now();
// Test Unicode normalization forms (NFC, NFD)
const unnormalizedContent = `<?xml version="1.0" encoding="UTF-8"?>
<Invoice xmlns="urn:oasis:names:specification:ubl:schema:xsd:Invoice-2">
<UBLVersionID>2.1</UBLVersionID>
<ID>NORMALIZATION-TEST</ID>
<Note>Café (NFC) vs Café (NFD)</Note>
<AccountingSupplierParty>
<Party>
<PartyName>
<Name>André's Büro</Name>
</PartyName>
</Party>
</AccountingSupplierParty>
</Invoice>`;
const einvoice = new EInvoice();
await einvoice.loadFromString(unnormalizedContent);
const xmlString = einvoice.getXmlString();
// Both forms should be preserved
expect(xmlString).toContain('Café');
expect(xmlString).toContain("André's Büro");
const elapsed = performance.now() - startTime;
performanceTracker.addMeasurement('utf8-normalization', elapsed);
});
// Print performance summary
performanceTracker.printSummary();
// Performance assertions
const avgTime = performanceTracker.getAverageTime();
expect(avgTime).toBeLessThan(100); // UTF-8 operations should be fast
});
tap.start();