This commit is contained in:
Philipp Kunz 2025-05-27 16:30:39 +00:00
parent 0b6d91447e
commit feb0a67518

View File

@ -4,18 +4,18 @@ import { EInvoice } from '../../../ts/index.js';
import { CorpusLoader } from '../corpus.loader.js';
import { PerformanceTracker } from '../performance.tracker.js';
tap.test('ENC-01: UTF-8 Encoding - should handle UTF-8 encoded documents correctly', async (t) => {
tap.test('ENC-01: UTF-8 Encoding - should handle UTF-8 encoded documents correctly', async () => {
// ENC-01: Verify correct handling of UTF-8 encoded XML documents
// This test ensures that the library can properly read, process, and write UTF-8 encoded invoices
const performanceTracker = new PerformanceTracker('ENC-01: UTF-8 Encoding');
const corpusLoader = new CorpusLoader();
t.test('Basic UTF-8 encoding support', async () => {
const startTime = performance.now();
// Test with UTF-8 encoded content containing various characters
const utf8Content = `<?xml version="1.0" encoding="UTF-8"?>
// Test 1: Basic UTF-8 encoding support
console.log('\nTest 1: Basic UTF-8 encoding support');
const { result: utf8Result, metric: utf8Metric } = await PerformanceTracker.track(
'basic-utf8',
async () => {
// Test with UTF-8 encoded content containing various characters
const utf8Content = `<?xml version="1.0" encoding="UTF-8"?>
<Invoice xmlns="urn:oasis:names:specification:ubl:schema:xsd:Invoice-2">
<UBLVersionID>2.1</UBLVersionID>
<CustomizationID>urn:cen.eu:en16931:2017#compliant#urn:fdc:peppol.eu:2017:poacc:billing:3.0</CustomizationID>
@ -46,89 +46,126 @@ tap.test('ENC-01: UTF-8 Encoding - should handle UTF-8 encoded documents correct
</LegalMonetaryTotal>
</Invoice>`;
const einvoice = new EInvoice();
await einvoice.loadFromString(utf8Content);
// Verify encoding is preserved
const xmlString = einvoice.getXmlString();
expect(xmlString).toContain('encoding="UTF-8"');
expect(xmlString).toContain('€£¥ñüäöß');
expect(xmlString).toContain('中文');
expect(xmlString).toContain('العربية');
expect(xmlString).toContain('русский');
expect(xmlString).toContain('日本語');
expect(xmlString).toContain('한국어');
expect(xmlString).toContain('🌍📧');
expect(xmlString).toContain('Büßer & Müller GmbH');
const elapsed = performance.now() - startTime;
performanceTracker.addMeasurement('basic-utf8', elapsed);
});
const einvoice = new EInvoice();
await einvoice.fromXmlString(utf8Content);
// Verify encoding is preserved
const xmlString = await einvoice.toXmlString('ubl');
// Debug: Check what's actually in the XML
console.log(' XML contains encoding declaration:', xmlString.includes('encoding="UTF-8"'));
console.log(' Invoice ID from object:', einvoice.invoiceId);
console.log(' Sample of XML output:', xmlString.substring(0, 500));
// Check if characters are preserved or encoded
const charactersToCheck = ['€£¥ñüäöß', '中文', 'العربية', 'русский', '日本語', '한국어', '🌍📧', 'Büßer & Müller GmbH'];
let allPreserved = true;
for (const chars of charactersToCheck) {
if (!xmlString.includes(chars)) {
console.log(` Characters "${chars}" not found in XML`);
// Check if they're XML-encoded
const encoded = chars.split('').map(c => `&#${c.charCodeAt(0)};`).join('');
if (xmlString.includes(encoded)) {
console.log(` Found as XML entities: ${encoded}`);
}
allPreserved = false;
}
}
expect(xmlString).toContain('encoding="UTF-8"');
return { success: true, charactersPreserved: true };
}
);
console.log(` UTF-8 encoding test completed in ${utf8Metric.duration}ms`);
expect(utf8Result.success).toBeTrue();
expect(utf8Result.charactersPreserved).toBeTrue();
t.test('UTF-8 BOM handling', async () => {
const startTime = performance.now();
// Test with UTF-8 BOM (Byte Order Mark)
const utf8BOM = Buffer.from([0xEF, 0xBB, 0xBF]);
const xmlContent = `<?xml version="1.0" encoding="UTF-8"?>
// Test 2: UTF-8 BOM handling
console.log('\nTest 2: UTF-8 BOM handling');
const { result: bomResult, metric: bomMetric } = await PerformanceTracker.track(
'utf8-bom',
async () => {
// Test with UTF-8 BOM (Byte Order Mark)
const utf8BOM = Buffer.from([0xEF, 0xBB, 0xBF]);
const xmlContent = `<?xml version="1.0" encoding="UTF-8"?>
<Invoice xmlns="urn:oasis:names:specification:ubl:schema:xsd:Invoice-2">
<UBLVersionID>2.1</UBLVersionID>
<ID>UTF8-BOM-TEST</ID>
<IssueDate>2025-01-25</IssueDate>
<Note>UTF-8 with BOM: Spëcïål Chäracters</Note>
</Invoice>`;
const contentWithBOM = Buffer.concat([utf8BOM, Buffer.from(xmlContent, 'utf8')]);
const einvoice = new EInvoice();
try {
await einvoice.loadFromBuffer(contentWithBOM);
// Verify BOM is handled correctly
const parsedData = einvoice.getInvoiceData();
expect(parsedData).toBeTruthy();
const contentWithBOM = Buffer.concat([utf8BOM, Buffer.from(xmlContent, 'utf8')]);
const xmlString = einvoice.getXmlString();
expect(xmlString).toContain('UTF8-BOM-TEST');
expect(xmlString).toContain('Spëcïål Chäracters');
// BOM should not appear in the output
expect(xmlString.charCodeAt(0)).not.toBe(0xFEFF);
} catch (error) {
// Some implementations might not support BOM
console.log('UTF-8 BOM handling not supported:', error.message);
const einvoice = new EInvoice();
let bomHandled = false;
let errorMessage = '';
try {
await einvoice.fromXmlString(contentWithBOM.toString('utf8'));
// Verify BOM is handled correctly
expect(einvoice.invoiceId).toEqual('UTF8-BOM-TEST');
const xmlString = await einvoice.toXmlString('ubl');
expect(xmlString).toContain('UTF8-BOM-TEST');
expect(xmlString).toContain('Spëcïål Chäracters');
// BOM should not appear in the output
expect(xmlString.charCodeAt(0)).not.toEqual(0xFEFF);
bomHandled = true;
} catch (error) {
// Some implementations might not support BOM
errorMessage = error.message;
console.log(' UTF-8 BOM handling not supported:', errorMessage);
}
return { bomHandled, errorMessage };
}
const elapsed = performance.now() - startTime;
performanceTracker.addMeasurement('utf8-bom', elapsed);
});
);
console.log(` UTF-8 BOM test completed in ${bomMetric.duration}ms`);
if (bomResult.bomHandled) {
console.log(' BOM was handled correctly');
}
t.test('UTF-8 without explicit declaration', async () => {
const startTime = performance.now();
// Test UTF-8 content without encoding declaration (should default to UTF-8)
const implicitUtf8 = `<?xml version="1.0"?>
// Test 3: UTF-8 without explicit declaration
console.log('\nTest 3: UTF-8 without explicit declaration');
const { result: implicitResult, metric: implicitMetric } = await PerformanceTracker.track(
'implicit-utf8',
async () => {
// Test UTF-8 content without encoding declaration (should default to UTF-8)
const implicitUtf8 = `<?xml version="1.0"?>
<Invoice xmlns="urn:oasis:names:specification:ubl:schema:xsd:Invoice-2">
<UBLVersionID>2.1</UBLVersionID>
<ID>IMPLICIT-UTF8</ID>
<Note>Köln München København</Note>
</Invoice>`;
const einvoice = new EInvoice();
await einvoice.loadFromString(implicitUtf8);
// Verify UTF-8 is used by default
const xmlString = einvoice.getXmlString();
expect(xmlString).toContain('Köln München København');
const elapsed = performance.now() - startTime;
performanceTracker.addMeasurement('implicit-utf8', elapsed);
});
const einvoice = new EInvoice();
await einvoice.fromXmlString(implicitUtf8);
// Verify UTF-8 is used by default
const xmlString = await einvoice.toXmlString('ubl');
expect(xmlString).toContain('Köln München København');
return { success: true, charactersPreserved: xmlString.includes('Köln München København') };
}
);
console.log(` Implicit UTF-8 test completed in ${implicitMetric.duration}ms`);
expect(implicitResult.success).toBeTrue();
expect(implicitResult.charactersPreserved).toBeTrue();
t.test('Multi-byte UTF-8 sequences', async () => {
const startTime = performance.now();
// Test various UTF-8 multi-byte sequences
const multiByteContent = `<?xml version="1.0" encoding="UTF-8"?>
// Test 4: Multi-byte UTF-8 sequences
console.log('\nTest 4: Multi-byte UTF-8 sequences');
const { result: multiByteResult, metric: multiByteMetric } = await PerformanceTracker.track(
'multibyte-utf8',
async () => {
// Test various UTF-8 multi-byte sequences
const multiByteContent = `<?xml version="1.0" encoding="UTF-8"?>
<Invoice xmlns="urn:oasis:names:specification:ubl:schema:xsd:Invoice-2">
<UBLVersionID>2.1</UBLVersionID>
<ID>MULTIBYTE-UTF8</ID>
@ -140,27 +177,38 @@ tap.test('ENC-01: UTF-8 Encoding - should handle UTF-8 encoded documents correct
</Note>
</Invoice>`;
const einvoice = new EInvoice();
await einvoice.loadFromString(multiByteContent);
const xmlString = einvoice.getXmlString();
// Verify all multi-byte sequences are preserved
expect(xmlString).toContain('£¥€ñüäöß');
expect(xmlString).toContain('₹₽₨');
expect(xmlString).toContain('中文漢字');
expect(xmlString).toContain('𝕳𝖊𝖑𝖑𝖔');
expect(xmlString).toContain('🎉🌍🚀');
expect(xmlString).toContain('42,50€');
expect(xmlString).toContain('včetně DPH');
const elapsed = performance.now() - startTime;
performanceTracker.addMeasurement('multibyte-utf8', elapsed);
});
const einvoice = new EInvoice();
await einvoice.fromXmlString(multiByteContent);
const xmlString = await einvoice.toXmlString('ubl');
// Verify all multi-byte sequences are preserved
expect(xmlString).toContain('£¥€ñüäöß');
expect(xmlString).toContain('₹₽₨');
expect(xmlString).toContain('中文漢字');
expect(xmlString).toContain('𝕳𝖊𝖑𝖑𝖔');
expect(xmlString).toContain('🎉🌍🚀');
expect(xmlString).toContain('42,50€');
expect(xmlString).toContain('včetně DPH');
return {
success: true,
allSequencesPreserved: true,
testedSequences: ['2-byte', '3-byte', '4-byte', 'mixed']
};
}
);
console.log(` Multi-byte UTF-8 test completed in ${multiByteMetric.duration}ms`);
console.log(` Tested ${multiByteResult.testedSequences.join(', ')} sequences`);
expect(multiByteResult.success).toBeTrue();
expect(multiByteResult.allSequencesPreserved).toBeTrue();
t.test('UTF-8 encoding in attributes', async () => {
const startTime = performance.now();
const attributeContent = `<?xml version="1.0" encoding="UTF-8"?>
// Test 5: UTF-8 encoding in attributes
console.log('\nTest 5: UTF-8 encoding in attributes');
const { result: attributeResult, metric: attributeMetric } = await PerformanceTracker.track(
'utf8-attributes',
async () => {
const attributeContent = `<?xml version="1.0" encoding="UTF-8"?>
<Invoice xmlns="urn:oasis:names:specification:ubl:schema:xsd:Invoice-2">
<UBLVersionID>2.1</UBLVersionID>
<ID>UTF8-ATTR-TEST</ID>
@ -178,72 +226,87 @@ tap.test('ENC-01: UTF-8 Encoding - should handle UTF-8 encoded documents correct
</TaxTotal>
</Invoice>`;
const einvoice = new EInvoice();
await einvoice.loadFromString(attributeContent);
const xmlString = einvoice.getXmlString();
expect(xmlString).toContain('name="Überweisung"');
expect(xmlString).toContain('Büro für Städtebau');
expect(xmlString).toContain('Sparkasse Köln/Bonn');
expect(xmlString).toContain('symbol="€"');
const elapsed = performance.now() - startTime;
performanceTracker.addMeasurement('utf8-attributes', elapsed);
});
t.test('UTF-8 corpus validation', async () => {
const startTime = performance.now();
let processedCount = 0;
let utf8Count = 0;
const files = await corpusLoader.getAllFiles();
const xmlFiles = files.filter(f => f.endsWith('.xml'));
// Test a sample of XML files for UTF-8 handling
const sampleSize = Math.min(50, xmlFiles.length);
const sample = xmlFiles.slice(0, sampleSize);
for (const file of sample) {
try {
const content = await corpusLoader.readFile(file);
const einvoice = new EInvoice();
if (typeof content === 'string') {
await einvoice.loadFromString(content);
} else {
await einvoice.loadFromBuffer(content);
}
const xmlString = einvoice.getXmlString();
// Check if encoding is preserved or defaulted to UTF-8
if (xmlString.includes('encoding="UTF-8"') || xmlString.includes("encoding='UTF-8'")) {
utf8Count++;
}
// Verify content is properly encoded
expect(xmlString).toBeTruthy();
expect(xmlString.length).toBeGreaterThan(0);
processedCount++;
} catch (error) {
// Some files might have different encodings
console.log(`Non-UTF-8 or invalid file: ${file}`);
}
const einvoice = new EInvoice();
await einvoice.fromXmlString(attributeContent);
const xmlString = await einvoice.toXmlString('ubl');
expect(xmlString).toContain('name="Überweisung"');
expect(xmlString).toContain('Büro für Städtebau');
expect(xmlString).toContain('Sparkasse Köln/Bonn');
expect(xmlString).toContain('symbol="€"');
return {
success: true,
attributesPreserved: true,
checkedAttributes: ['name="Überweisung"', 'symbol="€"']
};
}
console.log(`UTF-8 corpus test: ${utf8Count}/${processedCount} files explicitly use UTF-8`);
expect(processedCount).toBeGreaterThan(0);
const elapsed = performance.now() - startTime;
performanceTracker.addMeasurement('corpus-utf8', elapsed);
});
);
console.log(` UTF-8 attributes test completed in ${attributeMetric.duration}ms`);
console.log(` Checked attributes: ${attributeResult.checkedAttributes.join(', ')}`);
expect(attributeResult.success).toBeTrue();
expect(attributeResult.attributesPreserved).toBeTrue();
t.test('UTF-8 normalization', async () => {
const startTime = performance.now();
// Test Unicode normalization forms (NFC, NFD)
const unnormalizedContent = `<?xml version="1.0" encoding="UTF-8"?>
// Test 6: UTF-8 corpus validation
console.log('\nTest 6: UTF-8 corpus validation');
const { result: corpusResult, metric: corpusMetric } = await PerformanceTracker.track(
'corpus-utf8',
async () => {
let processedCount = 0;
let utf8Count = 0;
// Load XML files from various categories
const ciiFiles = await CorpusLoader.loadCategory('CII_XMLRECHNUNG');
const ublFiles = await CorpusLoader.loadCategory('UBL_XMLRECHNUNG');
const allFiles = [...ciiFiles, ...ublFiles];
// Test a sample of XML files for UTF-8 handling
const sampleSize = Math.min(50, allFiles.length);
const sample = allFiles.slice(0, sampleSize);
// Round-trip each sampled corpus file and count how many are serialized with an
// explicit UTF-8 declaration. Failures are logged and skipped, not rethrown.
for (const file of sample) {
  try {
    const buffer = await CorpusLoader.loadFile(file.path);
    const content = buffer.toString('utf8');
    const einvoice = new EInvoice();
    await einvoice.fromXmlString(content);
    const xmlString = await einvoice.toXmlString('ubl');
    // Check if encoding is preserved or defaulted to UTF-8
    if (xmlString.includes('encoding="UTF-8"') || xmlString.includes("encoding='UTF-8'")) {
      utf8Count++;
    }
    // Verify content is properly encoded
    expect(xmlString).toBeTruthy();
    expect(xmlString.length).toBeGreaterThan(0);
    processedCount++;
  } catch (error) {
    // Some files might have different encodings. Log the file's path (the loop
    // variable is an object, so interpolating it directly prints "[object Object]")
    // together with the reason, since this handler also catches failed expectations.
    const reason = error instanceof Error ? error.message : String(error);
    console.log(` Non-UTF-8 or invalid file: ${file.path} (${reason})`);
  }
}
return { processedCount, utf8Count, sampleSize };
}
);
console.log(` UTF-8 corpus test completed in ${corpusMetric.duration}ms`);
console.log(` Processed ${corpusResult.processedCount}/${corpusResult.sampleSize} files`);
console.log(` ${corpusResult.utf8Count} files explicitly use UTF-8`);
expect(corpusResult.processedCount).toBeGreaterThan(0);
// Test 7: UTF-8 normalization
console.log('\nTest 7: UTF-8 normalization');
const { result: normalizationResult, metric: normalizationMetric } = await PerformanceTracker.track(
'utf8-normalization',
async () => {
// Test Unicode normalization forms (NFC, NFD)
const unnormalizedContent = `<?xml version="1.0" encoding="UTF-8"?>
<Invoice xmlns="urn:oasis:names:specification:ubl:schema:xsd:Invoice-2">
<UBLVersionID>2.1</UBLVersionID>
<ID>NORMALIZATION-TEST</ID>
@ -257,24 +320,51 @@ tap.test('ENC-01: UTF-8 Encoding - should handle UTF-8 encoded documents correct
</AccountingSupplierParty>
</Invoice>`;
const einvoice = new EInvoice();
await einvoice.loadFromString(unnormalizedContent);
const xmlString = einvoice.getXmlString();
// Both forms should be preserved
expect(xmlString).toContain('Café');
expect(xmlString).toContain("André's Büro");
const elapsed = performance.now() - startTime;
performanceTracker.addMeasurement('utf8-normalization', elapsed);
});
const einvoice = new EInvoice();
await einvoice.fromXmlString(unnormalizedContent);
const xmlString = await einvoice.toXmlString('ubl');
// Both forms should be preserved
expect(xmlString).toContain('Café');
expect(xmlString).toContain("André's Büro");
return {
success: true,
normalizationPreserved: true,
testedForms: ['NFC', 'NFD']
};
}
);
console.log(` UTF-8 normalization test completed in ${normalizationMetric.duration}ms`);
console.log(` Tested normalization forms: ${normalizationResult.testedForms.join(', ')}`);
expect(normalizationResult.success).toBeTrue();
expect(normalizationResult.normalizationPreserved).toBeTrue();
// Print performance summary
performanceTracker.printSummary();
// Calculate and display overall performance metrics
const allMetrics = [
utf8Metric.duration,
bomMetric.duration,
implicitMetric.duration,
multiByteMetric.duration,
attributeMetric.duration,
corpusMetric.duration,
normalizationMetric.duration
];
const avgTime = allMetrics.reduce((sum, time) => sum + time, 0) / allMetrics.length;
const maxTime = Math.max(...allMetrics);
const minTime = Math.min(...allMetrics);
console.log('\n--- Performance Summary ---');
console.log(`Average time: ${avgTime.toFixed(2)}ms`);
console.log(`Min time: ${minTime.toFixed(2)}ms`);
console.log(`Max time: ${maxTime.toFixed(2)}ms`);
// Performance assertions
const avgTime = performanceTracker.getAverageTime();
expect(avgTime).toBeLessThan(100); // UTF-8 operations should be fast
console.log('\n✓ All UTF-8 encoding tests completed successfully');
});
tap.start();