/*
 * einvoice/test/suite/einvoice_encoding/test.enc-02.utf16-encoding.ts
 * 2025-05-25 19:45:37 +00:00
 *
 * 307 lines
 * 11 KiB
 * TypeScript
 * Raw Blame History
 *
 * This file contains ambiguous Unicode characters
 *
 * This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
 */

import { expect, tap } from '@git.zone/tstest/tapbundle';
import * as plugins from '../plugins.js';
import { EInvoice } from '../../../ts/index.js';
import { CorpusLoader } from '../corpus.loader.js';
import { PerformanceTracker } from '../performance.tracker.js';
tap.test('ENC-02: UTF-16 Encoding - should handle UTF-16 encoded documents correctly', async (t) => {
// ENC-02: Verify correct handling of UTF-16 encoded XML documents (both BE and LE)
// This test ensures proper support for UTF-16 encoding variants
// Shared by every sub-test below: each one reports its elapsed time through
// performanceTracker.addMeasurement(...), and the totals are printed and
// asserted on after the sub-tests are declared.
const performanceTracker = new PerformanceTracker('ENC-02: UTF-16 Encoding');
// Used by the corpus sub-test to list and read the sample files.
const corpusLoader = new CorpusLoader();
t.test('UTF-16 BE (Big Endian) encoding', async () => {
  // Builds a UBL invoice, encodes it as UTF-16 big-endian with a BOM, feeds the
  // bytes to EInvoice.loadFromBuffer and checks that the non-ASCII text is still
  // present in the string returned by getXmlString(). If that path throws (or an
  // assertion fails), the bytes are decoded here and loaded as a string instead.
  const startTime = performance.now();
  // Create UTF-16 BE content
  const xmlContent = `<?xml version="1.0" encoding="UTF-16BE"?>
<Invoice xmlns="urn:oasis:names:specification:ubl:schema:xsd:Invoice-2">
<UBLVersionID>2.1</UBLVersionID>
<ID>UTF16BE-TEST</ID>
<IssueDate>2025-01-25</IssueDate>
<Note>UTF-16 BE Test: €100 für Bücher</Note>
<DocumentCurrencyCode>EUR</DocumentCurrencyCode>
<AccountingSupplierParty>
<Party>
<PartyName>
<Name>Großhändler GmbH</Name>
</PartyName>
</Party>
</AccountingSupplierParty>
<LegalMonetaryTotal>
<PayableAmount currencyID="EUR">100.00</PayableAmount>
</LegalMonetaryTotal>
</Invoice>`;
  // Node's Buffer has no big-endian UTF-16 encoding, so encode as little-endian
  // and swap every byte pair in place to obtain big-endian code units.
  const utf16BeBom = Buffer.from([0xFE, 0xFF]); // UTF-16 BE BOM
  const utf16BeContent = Buffer.from(xmlContent, 'utf16le').swap16(); // Convert to BE
  const contentWithBom = Buffer.concat([utf16BeBom, utf16BeContent]);
  const einvoice = new EInvoice();
  try {
    await einvoice.loadFromBuffer(contentWithBom);
    const parsedData = einvoice.getInvoiceData();
    expect(parsedData).toBeTruthy();
    const xmlString = einvoice.getXmlString();
    expect(xmlString).toContain('UTF16BE-TEST');
    expect(xmlString).toContain('€100 für Bücher');
    expect(xmlString).toContain('Großhändler GmbH');
  } catch (error) {
    // Anything thrown above (loader error or failed expectation) lands here.
    const reason = error instanceof Error ? error.message : String(error);
    console.log('UTF-16 BE not fully supported:', reason);
    // Fallback: decode the bytes ourselves and load the resulting string.
    // The payload is big-endian, so it must be swapped back to little-endian
    // before 'utf16le' decoding; decoding it as-is would produce byte-swapped
    // garbage whose first character is U+FFFE, which the BOM regex cannot strip.
    // swap16() mutates its buffer, hence the copy via Buffer.from(...).
    const decoded = Buffer.from(contentWithBom)
      .swap16()
      .toString('utf16le')
      .replace(/^\ufeff/, '');
    await einvoice.loadFromString(decoded);
    expect(einvoice.getXmlString()).toContain('UTF16BE-TEST');
  }
  const elapsed = performance.now() - startTime;
  performanceTracker.addMeasurement('utf16-be', elapsed);
});
t.test('UTF-16 LE (Little Endian) encoding', async () => {
  // Builds a UBL invoice, encodes it as UTF-16 little-endian with a BOM, feeds
  // the bytes to EInvoice.loadFromBuffer and checks that arrows, card suits and
  // accented names are still present in the string returned by getXmlString().
  // If that path throws (or an assertion fails), the bytes are decoded here and
  // loaded as a string instead.
  const startTime = performance.now();
  // Create UTF-16 LE content.
  // The ampersand in the party name is written as &amp;: a bare '&' is not
  // allowed in XML character data and would make the fixture itself malformed,
  // so the test could never exercise the encoding path it is named after.
  const xmlContent = `<?xml version="1.0" encoding="UTF-16LE"?>
<Invoice xmlns="urn:oasis:names:specification:ubl:schema:xsd:Invoice-2">
<UBLVersionID>2.1</UBLVersionID>
<ID>UTF16LE-TEST</ID>
<IssueDate>2025-01-25</IssueDate>
<Note>UTF-16 LE: Special chars → ← ↑ ↓ ♠ ♣ ♥ ♦</Note>
<AccountingCustomerParty>
<Party>
<PartyName>
<Name>François &amp; Søren Ltd.</Name>
</PartyName>
</Party>
</AccountingCustomerParty>
</Invoice>`;
  // Convert to UTF-16 LE with BOM
  const utf16LeBom = Buffer.from([0xFF, 0xFE]); // UTF-16 LE BOM
  const utf16LeContent = Buffer.from(xmlContent, 'utf16le');
  const contentWithBom = Buffer.concat([utf16LeBom, utf16LeContent]);
  const einvoice = new EInvoice();
  try {
    await einvoice.loadFromBuffer(contentWithBom);
    const xmlString = einvoice.getXmlString();
    expect(xmlString).toContain('UTF16LE-TEST');
    expect(xmlString).toContain('→ ← ↑ ↓');
    expect(xmlString).toContain('♠ ♣ ♥ ♦');
    // NOTE(review): assumes getXmlString() returns XML markup in which the
    // ampersand stays escaped as &amp; — confirm against the EInvoice API.
    expect(xmlString).toContain('François &amp; Søren Ltd.');
  } catch (error) {
    // Anything thrown above (loader error or failed expectation) lands here.
    const reason = error instanceof Error ? error.message : String(error);
    console.log('UTF-16 LE not fully supported:', reason);
    // Fallback: decode the little-endian bytes, drop the BOM character and
    // load the resulting string directly.
    const decoded = contentWithBom.toString('utf16le').replace(/^\ufeff/, '');
    await einvoice.loadFromString(decoded);
    expect(einvoice.getXmlString()).toContain('UTF16LE-TEST');
  }
  const elapsed = performance.now() - startTime;
  performanceTracker.addMeasurement('utf16-le', elapsed);
});
t.test('UTF-16 without BOM', async () => {
  // Feeds little-endian UTF-16 bytes with no BOM prefix to the loader, so the
  // only hint available is the encoding="UTF-16" declaration in the prolog.
  // This sub-test is best-effort: any failure is logged rather than rethrown.
  const begin = performance.now();
  const rawXml = `<?xml version="1.0" encoding="UTF-16"?>
<Invoice xmlns="urn:oasis:names:specification:ubl:schema:xsd:Invoice-2">
<UBLVersionID>2.1</UBLVersionID>
<ID>UTF16-NO-BOM</ID>
<Note>Ψ Ω α β γ δ ε ζ η θ</Note>
</Invoice>`;
  // Encoded explicitly as 'utf16le'; nothing is prepended to the buffer.
  const bomlessBytes = Buffer.from(rawXml, 'utf16le');
  const invoice = new EInvoice();
  try {
    await invoice.loadFromBuffer(bomlessBytes);
    const serialized = invoice.getXmlString();
    // Checked in order; the first missing fragment aborts the remaining checks.
    for (const fragment of ['UTF16-NO-BOM', 'Ψ Ω α β γ δ ε ζ η θ']) {
      expect(serialized).toContain(fragment);
    }
  } catch (error) {
    console.log('UTF-16 without BOM requires explicit handling:', error.message);
  }
  performanceTracker.addMeasurement('utf16-no-bom', performance.now() - begin);
});
t.test('UTF-16 surrogate pairs', async () => {
  // Exercises characters outside the Basic Multilingual Plane, each of which
  // takes two UTF-16 code units. The payload is little-endian with a BOM.
  // If loading or any expectation throws, the bytes are decoded locally and
  // loaded as a string; that fallback performs no further assertions.
  const begin = performance.now();
  const rawXml = `<?xml version="1.0" encoding="UTF-16"?>
<Invoice xmlns="urn:oasis:names:specification:ubl:schema:xsd:Invoice-2">
<UBLVersionID>2.1</UBLVersionID>
<ID>UTF16-SURROGATE</ID>
<Note>Emojis: 😀😃😄😁 Math: 𝕳𝖊𝖑𝖑𝖔 CJK Ext: 𠀀𠀁</Note>
<InvoiceLine>
<Note>Ancient scripts: 𐌀𐌁𐌂 𓀀𓀁𓀂</Note>
</InvoiceLine>
</Invoice>`;
  const payload = Buffer.concat([
    Buffer.from([0xFF, 0xFE]), // UTF-16 LE BOM
    Buffer.from(rawXml, 'utf16le'),
  ]);
  const invoice = new EInvoice();
  try {
    await invoice.loadFromBuffer(payload);
    const serialized = invoice.getXmlString();
    // Checked in order; the first missing fragment aborts the remaining checks.
    const astralFragments = ['😀😃😄😁', '𝕳𝖊𝖑𝖑𝖔', '𠀀𠀁', '𐌀𐌁𐌂', '𓀀𓀁𓀂'];
    for (const fragment of astralFragments) {
      expect(serialized).toContain(fragment);
    }
  } catch (error) {
    console.log('Surrogate pair handling:', error.message);
    // String route: decode the bytes here and strip the leading BOM character.
    const asText = payload.toString('utf16le').replace(/^\ufeff/, '');
    await invoice.loadFromString(asText);
  }
  performanceTracker.addMeasurement('utf16-surrogates', performance.now() - begin);
});
t.test('UTF-16 to UTF-8 conversion', async () => {
  // Loads a little-endian UTF-16 document (with BOM) and checks that the string
  // returned by getXmlString() keeps the Latin, CJK and Cyrillic text and
  // survives an encode/decode round trip through UTF-8 unchanged.
  // This sub-test is best-effort: any failure is logged rather than rethrown.
  const begin = performance.now();
  const rawXml = `<?xml version="1.0" encoding="UTF-16"?>
<Invoice xmlns="urn:oasis:names:specification:ubl:schema:xsd:Invoice-2">
<UBLVersionID>2.1</UBLVersionID>
<ID>UTF16-TO-UTF8</ID>
<Note>Müller, François, 北京, Москва</Note>
</Invoice>`;
  const payload = Buffer.concat([
    Buffer.from([0xFF, 0xFE]),
    Buffer.from(rawXml, 'utf16le'),
  ]);
  const invoice = new EInvoice();
  try {
    await invoice.loadFromBuffer(payload);
    const serialized = invoice.getXmlString();
    // Checked in order; the first missing fragment aborts the remaining checks.
    for (const fragment of ['Müller', 'François', '北京', 'Москва']) {
      expect(serialized).toContain(fragment);
    }
    // Round trip: string -> UTF-8 bytes -> string must reproduce the input.
    const roundTripped = Buffer.from(serialized, 'utf8').toString('utf8');
    expect(roundTripped).toBe(serialized);
  } catch (error) {
    console.log('UTF-16 to UTF-8 conversion not supported:', error.message);
  }
  performanceTracker.addMeasurement('utf16-to-utf8', performance.now() - begin);
});
t.test('Mixed content with UTF-16', async () => {
  // Loads a little-endian UTF-16 document (with BOM) whose text mixes ASCII,
  // multi-line content, bullets, arrows, a multiplication sign and the euro
  // sign, then checks those symbols appear in the getXmlString() output.
  // This sub-test is best-effort: any failure is logged rather than rethrown.
  const begin = performance.now();
  const rawXml = `<?xml version="1.0" encoding="UTF-16"?>
<Invoice xmlns="urn:oasis:names:specification:ubl:schema:xsd:Invoice-2">
<UBLVersionID>2.1</UBLVersionID>
<ID>UTF16-MIXED</ID>
<PaymentTerms>
<Note>Payment terms: 30 days net
• Early payment: 2% discount
• Late payment: 1.5% interest
→ Bank: Sparkasse München
← Account: DE89 3704 0044 0532 0130 00</Note>
</PaymentTerms>
<InvoiceLine>
<Item>
<Description>Bücher (10× @ €15)</Description>
</Item>
</InvoiceLine>
</Invoice>`;
  const payload = Buffer.concat([
    Buffer.from([0xFF, 0xFE]),
    Buffer.from(rawXml, 'utf16le'),
  ]);
  const invoice = new EInvoice();
  try {
    await invoice.loadFromBuffer(payload);
    const serialized = invoice.getXmlString();
    // Checked in order; the first missing fragment aborts the remaining checks.
    const expectedFragments = ['•', '→', '←', '×', '€', 'Sparkasse München'];
    for (const fragment of expectedFragments) {
      expect(serialized).toContain(fragment);
    }
  } catch (error) {
    console.log('UTF-16 mixed content:', error.message);
  }
  performanceTracker.addMeasurement('utf16-mixed', performance.now() - begin);
});
t.test('Corpus UTF-16 detection', async () => {
  // Scans up to 30 '.xml' corpus files and reports how many start with a
  // UTF-16 byte-order mark. Purely informational: nothing is asserted, and
  // unreadable files are reported and skipped instead of failing the run.
  const startTime = performance.now();
  let utf16Count = 0;
  let checkedCount = 0;
  let skippedCount = 0;
  const files = await corpusLoader.getAllFiles();
  const xmlFiles = files.filter((f) => f.endsWith('.xml'));
  // Check a sample for UTF-16 encoded files; slice() already clamps to the
  // array length, so no separate Math.min is needed.
  const sample = xmlFiles.slice(0, 30);
  for (const file of sample) {
    try {
      const content = await corpusLoader.readFile(file);
      // Only raw bytes can carry a BOM, and two bytes are needed to hold one.
      if (Buffer.isBuffer(content) && content.length >= 2) {
        const isBigEndianBom = content[0] === 0xFE && content[1] === 0xFF;
        const isLittleEndianBom = content[0] === 0xFF && content[1] === 0xFE;
        if (isBigEndianBom || isLittleEndianBom) {
          utf16Count++;
          console.log(`Found UTF-16 file: ${file}`);
        }
      }
      checkedCount++;
    } catch (error) {
      // Skip files that can't be read, but leave a trace so a broken corpus
      // does not silently look like "no UTF-16 files found".
      skippedCount++;
      const reason = error instanceof Error ? error.message : String(error);
      console.log(`Skipping unreadable corpus file ${file}: ${reason}`);
    }
  }
  console.log(`UTF-16 corpus scan: ${utf16Count}/${checkedCount} files use UTF-16`);
  if (skippedCount > 0) {
    console.log(`UTF-16 corpus scan: ${skippedCount} file(s) skipped due to read errors`);
  }
  const elapsed = performance.now() - startTime;
  performanceTracker.addMeasurement('corpus-utf16', elapsed);
});
// Report the timings recorded through performanceTracker.addMeasurement(...).
// NOTE(review): the t.test(...) calls in this callback are not awaited here —
// confirm the runner has finished them before this summary is produced.
performanceTracker.printSummary();
// Performance budget for the average reported by the tracker; presumably in
// milliseconds, since the sub-tests record performance.now() deltas.
const averageBudget = 150; // UTF-16 operations may be slightly slower than UTF-8
expect(performanceTracker.getAverageTime()).toBeLessThan(averageBudget);
});
// Hand control to the tap runner; presumably this executes the test
// registered via tap.test(...) above — confirm against the tapbundle docs.
tap.start();