307 lines
11 KiB
TypeScript
307 lines
11 KiB
TypeScript
|
import { expect, tap } from '@git.zone/tstest/tapbundle';
|
|||
|
import * as plugins from '../plugins.js';
|
|||
|
import { EInvoice } from '../../../ts/index.js';
|
|||
|
import { CorpusLoader } from '../corpus.loader.js';
|
|||
|
import { PerformanceTracker } from '../performance.tracker.js';
|
|||
|
|
|||
|
tap.test('ENC-02: UTF-16 Encoding - should handle UTF-16 encoded documents correctly', async (t) => {
|
|||
|
// ENC-02: Verify correct handling of UTF-16 encoded XML documents (both BE and LE)
|
|||
|
// This test ensures proper support for UTF-16 encoding variants
|
|||
|
|
|||
|
// Shared by every sub-test below: each one records its elapsed time here via addMeasurement().
const performanceTracker = new PerformanceTracker('ENC-02: UTF-16 Encoding');
|
|||
|
// Used by the corpus scan sub-test to list and read the corpus files.
const corpusLoader = new CorpusLoader();
|
|||
|
|
|||
|
t.test('UTF-16 BE (Big Endian) encoding', async () => {
  // Builds a UTF-16 big-endian document (with BOM), loads it through
  // EInvoice.loadFromBuffer and checks that the non-ASCII text survives.
  // If the buffer path throws, the bytes are decoded manually and loaded
  // as a string instead. The elapsed time is recorded as 'utf16-be'.
  const startTime = performance.now();

  // Create UTF-16 BE content
  const xmlContent = [
    '<?xml version="1.0" encoding="UTF-16BE"?>',
    '<Invoice xmlns="urn:oasis:names:specification:ubl:schema:xsd:Invoice-2">',
    '<UBLVersionID>2.1</UBLVersionID>',
    '<ID>UTF16BE-TEST</ID>',
    '<IssueDate>2025-01-25</IssueDate>',
    '<Note>UTF-16 BE Test: €100 für Bücher</Note>',
    '<DocumentCurrencyCode>EUR</DocumentCurrencyCode>',
    '<AccountingSupplierParty>',
    '<Party>',
    '<PartyName>',
    '<Name>Großhändler GmbH</Name>',
    '</PartyName>',
    '</Party>',
    '</AccountingSupplierParty>',
    '<LegalMonetaryTotal>',
    '<PayableAmount currencyID="EUR">100.00</PayableAmount>',
    '</LegalMonetaryTotal>',
    '</Invoice>',
  ].join('\n');

  // Convert to UTF-16 BE with BOM: encode as little-endian, then swap each
  // byte pair to obtain big-endian code units.
  const utf16BeBom = Buffer.from([0xFE, 0xFF]); // UTF-16 BE BOM
  const utf16BeContent = Buffer.from(xmlContent, 'utf16le').swap16(); // Convert to BE
  const contentWithBom = Buffer.concat([utf16BeBom, utf16BeContent]);

  const einvoice = new EInvoice();
  try {
    await einvoice.loadFromBuffer(contentWithBom);

    const parsedData = einvoice.getInvoiceData();
    expect(parsedData).toBeTruthy();

    const xmlString = einvoice.getXmlString();
    expect(xmlString).toContain('UTF16BE-TEST');
    expect(xmlString).toContain('€100 für Bücher');
    expect(xmlString).toContain('Großhändler GmbH');
  } catch (error) {
    console.log('UTF-16 BE not fully supported:', error.message);
    // Fallback: decode the bytes ourselves. The payload is big-endian, so it
    // has to be byte-swapped back to little-endian before 'utf16le' decoding;
    // decoding it directly would yield garbage and the BOM would read as
    // U+FFFE instead of U+FEFF. swap16() mutates in place, hence the copy.
    const decoded = Buffer.from(contentWithBom)
      .swap16()
      .toString('utf16le')
      .replace(/^\ufeff/, '');
    await einvoice.loadFromString(decoded);
    expect(einvoice.getXmlString()).toContain('UTF16BE-TEST');
  }

  const elapsed = performance.now() - startTime;
  performanceTracker.addMeasurement('utf16-be', elapsed);
});
|
|||
|
|
|||
|
t.test('UTF-16 LE (Little Endian) encoding', async () => {
  // Builds a UTF-16 little-endian document (with BOM), loads it through
  // EInvoice.loadFromBuffer and checks that arrows, card suits and accented
  // names survive. If the buffer path throws, the bytes are decoded manually
  // and loaded as a string instead. The elapsed time is recorded as 'utf16-le'.
  const startTime = performance.now();

  // Create UTF-16 LE content. The ampersand in the party name is written as
  // &amp; because a raw '&' in character data is not well-formed XML and
  // would make the document fail for a reason unrelated to encoding.
  const xmlContent = [
    '<?xml version="1.0" encoding="UTF-16LE"?>',
    '<Invoice xmlns="urn:oasis:names:specification:ubl:schema:xsd:Invoice-2">',
    '<UBLVersionID>2.1</UBLVersionID>',
    '<ID>UTF16LE-TEST</ID>',
    '<IssueDate>2025-01-25</IssueDate>',
    '<Note>UTF-16 LE: Special chars → ← ↑ ↓ ♠ ♣ ♥ ♦</Note>',
    '<AccountingCustomerParty>',
    '<Party>',
    '<PartyName>',
    '<Name>François &amp; Søren Ltd.</Name>',
    '</PartyName>',
    '</Party>',
    '</AccountingCustomerParty>',
    '</Invoice>',
  ].join('\n');

  // Convert to UTF-16 LE with BOM
  const utf16LeBom = Buffer.from([0xFF, 0xFE]); // UTF-16 LE BOM
  const utf16LeContent = Buffer.from(xmlContent, 'utf16le');
  const contentWithBom = Buffer.concat([utf16LeBom, utf16LeContent]);

  const einvoice = new EInvoice();
  try {
    await einvoice.loadFromBuffer(contentWithBom);

    const xmlString = einvoice.getXmlString();
    expect(xmlString).toContain('UTF16LE-TEST');
    expect(xmlString).toContain('→ ← ↑ ↓');
    expect(xmlString).toContain('♠ ♣ ♥ ♦');
    // The XML text keeps the ampersand in its escaped form.
    expect(xmlString).toContain('François &amp; Søren Ltd.');
  } catch (error) {
    console.log('UTF-16 LE not fully supported:', error.message);
    // Try fallback: decode the little-endian bytes and drop the leading BOM.
    const decoded = contentWithBom.toString('utf16le').replace(/^\ufeff/, '');
    await einvoice.loadFromString(decoded);
    expect(einvoice.getXmlString()).toContain('UTF16LE-TEST');
  }

  const elapsed = performance.now() - startTime;
  performanceTracker.addMeasurement('utf16-le', elapsed);
});
|
|||
|
|
|||
|
t.test('UTF-16 without BOM', async () => {
  // Feeds EInvoice.loadFromBuffer a UTF-16 payload that carries no byte-order
  // mark, so only the encoding declaration hints at the encoding.
  // Best-effort: anything thrown inside the try block is logged, not rethrown.
  // The elapsed time is recorded as 'utf16-no-bom'.
  const began = performance.now();

  const documentLines = [
    '<?xml version="1.0" encoding="UTF-16"?>',
    '<Invoice xmlns="urn:oasis:names:specification:ubl:schema:xsd:Invoice-2">',
    '<UBLVersionID>2.1</UBLVersionID>',
    '<ID>UTF16-NO-BOM</ID>',
    '<Note>Ψ Ω α β γ δ ε ζ η θ</Note>',
    '</Invoice>',
  ];

  // Encoded as little-endian code units; no BOM is prepended.
  const bomlessBytes = Buffer.from(documentLines.join('\n'), 'utf16le');

  const invoice = new EInvoice();
  try {
    await invoice.loadFromBuffer(bomlessBytes);

    const serialized = invoice.getXmlString();
    for (const fragment of ['UTF16-NO-BOM', 'Ψ Ω α β γ δ ε ζ η θ']) {
      expect(serialized).toContain(fragment);
    }
  } catch (err) {
    console.log('UTF-16 without BOM requires explicit handling:', err.message);
  }

  performanceTracker.addMeasurement('utf16-no-bom', performance.now() - began);
});
|
|||
|
|
|||
|
t.test('UTF-16 surrogate pairs', async () => {
  // Checks that characters outside the Basic Multilingual Plane (encoded as
  // surrogate pairs in UTF-16) survive a load through EInvoice.loadFromBuffer.
  // If the buffer path throws, the bytes are decoded manually and loaded as a
  // string instead. The elapsed time is recorded as 'utf16-surrogates'.
  const startTime = performance.now();

  // Test UTF-16 surrogate pairs (for characters outside BMP)
  const xmlContent = [
    '<?xml version="1.0" encoding="UTF-16"?>',
    '<Invoice xmlns="urn:oasis:names:specification:ubl:schema:xsd:Invoice-2">',
    '<UBLVersionID>2.1</UBLVersionID>',
    '<ID>UTF16-SURROGATE</ID>',
    '<Note>Emojis: 😀😃😄😁 Math: 𝕳𝖊𝖑𝖑𝖔 CJK Ext: 𠀀𠀁</Note>',
    '<InvoiceLine>',
    '<Note>Ancient scripts: 𐌀𐌁𐌂 𓀀𓀁𓀂</Note>',
    '</InvoiceLine>',
    '</Invoice>',
  ].join('\n');

  const utf16Bom = Buffer.from([0xFF, 0xFE]); // UTF-16 LE BOM
  const utf16Content = Buffer.from(xmlContent, 'utf16le');
  const contentWithBom = Buffer.concat([utf16Bom, utf16Content]);

  const einvoice = new EInvoice();
  try {
    await einvoice.loadFromBuffer(contentWithBom);

    const xmlString = einvoice.getXmlString();
    expect(xmlString).toContain('😀😃😄😁');
    expect(xmlString).toContain('𝕳𝖊𝖑𝖑𝖔');
    expect(xmlString).toContain('𠀀𠀁');
    expect(xmlString).toContain('𐌀𐌁𐌂');
    expect(xmlString).toContain('𓀀𓀁𓀂');
  } catch (error) {
    console.log('Surrogate pair handling:', error.message);
    // Try string approach: decode the little-endian bytes and drop the BOM.
    const decoded = contentWithBom.toString('utf16le').replace(/^\ufeff/, '');
    await einvoice.loadFromString(decoded);
    // Verify the fallback actually loaded the document, as the BE and LE
    // fallbacks do; previously this path asserted nothing.
    expect(einvoice.getXmlString()).toContain('UTF16-SURROGATE');
  }

  const elapsed = performance.now() - startTime;
  performanceTracker.addMeasurement('utf16-surrogates', elapsed);
});
|
|||
|
|
|||
|
t.test('UTF-16 to UTF-8 conversion', async () => {
  // Loads a BOM-prefixed UTF-16 LE document and checks that the XML string
  // returned by getXmlString() keeps the Latin, CJK and Cyrillic text and
  // round-trips unchanged through a UTF-8 buffer.
  // Best-effort: anything thrown inside the try block is logged, not rethrown.
  // The elapsed time is recorded as 'utf16-to-utf8'.
  const began = performance.now();

  const sourceXml = [
    '<?xml version="1.0" encoding="UTF-16"?>',
    '<Invoice xmlns="urn:oasis:names:specification:ubl:schema:xsd:Invoice-2">',
    '<UBLVersionID>2.1</UBLVersionID>',
    '<ID>UTF16-TO-UTF8</ID>',
    '<Note>Müller, François, 北京, Москва</Note>',
    '</Invoice>',
  ].join('\n');

  // FF FE is the little-endian byte-order mark, followed by the LE payload.
  const payload = Buffer.concat([
    Buffer.from([0xFF, 0xFE]),
    Buffer.from(sourceXml, 'utf16le'),
  ]);

  const invoice = new EInvoice();
  try {
    await invoice.loadFromBuffer(payload);

    const serialized = invoice.getXmlString();
    for (const fragment of ['Müller', 'François', '北京', 'Москва']) {
      expect(serialized).toContain(fragment);
    }

    // Encoding to UTF-8 and decoding again must give back the same string.
    const roundTripped = Buffer.from(serialized, 'utf8').toString('utf8');
    expect(roundTripped).toBe(serialized);
  } catch (err) {
    console.log('UTF-16 to UTF-8 conversion not supported:', err.message);
  }

  performanceTracker.addMeasurement('utf16-to-utf8', performance.now() - began);
});
|
|||
|
|
|||
|
t.test('Mixed content with UTF-16', async () => {
  // Loads a BOM-prefixed UTF-16 LE document whose notes mix ASCII with
  // bullets, arrows, a multiplication sign, the euro sign and umlauts, then
  // checks that each of those shows up in the XML string.
  // Best-effort: anything thrown inside the try block is logged, not rethrown.
  // The elapsed time is recorded as 'utf16-mixed'.
  const began = performance.now();

  const sourceXml = [
    '<?xml version="1.0" encoding="UTF-16"?>',
    '<Invoice xmlns="urn:oasis:names:specification:ubl:schema:xsd:Invoice-2">',
    '<UBLVersionID>2.1</UBLVersionID>',
    '<ID>UTF16-MIXED</ID>',
    '<PaymentTerms>',
    '<Note>Payment terms: 30 days net',
    '• Early payment: 2% discount',
    '• Late payment: 1.5% interest',
    '→ Bank: Sparkasse München',
    '← Account: DE89 3704 0044 0532 0130 00</Note>',
    '</PaymentTerms>',
    '<InvoiceLine>',
    '<Item>',
    '<Description>Bücher (10× @ €15)</Description>',
    '</Item>',
    '</InvoiceLine>',
    '</Invoice>',
  ].join('\n');

  // FF FE is the little-endian byte-order mark, followed by the LE payload.
  const payload = Buffer.concat([
    Buffer.from([0xFF, 0xFE]),
    Buffer.from(sourceXml, 'utf16le'),
  ]);

  const invoice = new EInvoice();
  try {
    await invoice.loadFromBuffer(payload);

    const serialized = invoice.getXmlString();
    for (const fragment of ['•', '→', '←', '×', '€', 'Sparkasse München']) {
      expect(serialized).toContain(fragment);
    }
  } catch (err) {
    console.log('UTF-16 mixed content:', err.message);
  }

  performanceTracker.addMeasurement('utf16-mixed', performance.now() - began);
});
|
|||
|
|
|||
|
t.test('Corpus UTF-16 detection', async () => {
  // Scans up to 30 corpus files ending in '.xml' and counts how many start
  // with a UTF-16 byte-order mark. Purely informational: the counts are
  // logged and nothing is asserted. The elapsed time is recorded as
  // 'corpus-utf16'.
  const began = performance.now();
  const SAMPLE_LIMIT = 30;
  let utf16Count = 0;
  let checkedCount = 0;

  const allFiles = await corpusLoader.getAllFiles();
  const sample = allFiles
    .filter((name) => name.endsWith('.xml'))
    .slice(0, SAMPLE_LIMIT);

  for (const file of sample) {
    try {
      const content = await corpusLoader.readFile(file);

      if (Buffer.isBuffer(content)) {
        // FE FF marks big-endian, FF FE marks little-endian.
        const hasBigEndianBom = content[0] === 0xFE && content[1] === 0xFF;
        const hasLittleEndianBom = content[0] === 0xFF && content[1] === 0xFE;
        if (hasBigEndianBom || hasLittleEndianBom) {
          utf16Count++;
          console.log(`Found UTF-16 file: ${file}`);
        }
      }

      // Counted even when the content is not a Buffer.
      checkedCount++;
    } catch (unreadable) {
      // Files that cannot be read are skipped on purpose.
    }
  }

  console.log(`UTF-16 corpus scan: ${utf16Count}/${checkedCount} files use UTF-16`);

  performanceTracker.addMeasurement('corpus-utf16', performance.now() - began);
});
|
|||
|
|
|||
|
// Emit the collected timings.
performanceTracker.printSummary();

// Guard against regressions: the tracker's average time must stay below 150
// (UTF-16 handling is allowed to be somewhat slower than UTF-8).
expect(performanceTracker.getAverageTime()).toBeLessThan(150);
|
|||
|
});
|
|||
|
|
|||
|
// Start the tap runner once the test above has been registered.
tap.start();
|