einvoice/test/suite/einvoice_parsing/test.parse-03.encoding-detection.ts
2025-05-28 08:40:26 +00:00

320 lines
11 KiB
TypeScript
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

import { expect, tap } from '@git.zone/tstest/tapbundle';
import * as einvoice from '../../../ts/index.js';
import { PerformanceTracker } from '../../helpers/performance.tracker.js';
/**
 * PARSE-03: reads the `encoding` pseudo-attribute from each sample's XML
 * declaration and verifies that it equals the expected label, ignoring case
 * and hyphens. A missing declaration falls back to UTF-8.
 *
 * Every sample is timed under the 'encoding-detection' label via
 * PerformanceTracker.track.
 */
tap.test('PARSE-03: Encoding declaration detection', async () => {
  // `actualEncoding` documents the scenario each sample stands for; this test
  // does not read it (the samples are JS strings, not encoded bytes).
  const encodingTests = [
    {
      name: 'UTF-8 declaration',
      xml: '<?xml version="1.0" encoding="UTF-8"?>\n<invoice><id>TEST-001</id></invoice>',
      expectedEncoding: 'UTF-8',
      actualEncoding: 'UTF-8'
    },
    {
      name: 'UTF-16 declaration',
      xml: '<?xml version="1.0" encoding="UTF-16"?>\n<invoice><id>TEST-002</id></invoice>',
      expectedEncoding: 'UTF-16',
      actualEncoding: 'UTF-8' // Mismatch test
    },
    {
      name: 'ISO-8859-1 declaration',
      xml: '<?xml version="1.0" encoding="ISO-8859-1"?>\n<invoice><supplier>Müller</supplier></invoice>',
      expectedEncoding: 'ISO-8859-1',
      actualEncoding: 'ISO-8859-1'
    },
    {
      name: 'Windows-1252 declaration',
      xml: '<?xml version="1.0" encoding="Windows-1252"?>\n<invoice><note>Special chars</note></invoice>',
      expectedEncoding: 'Windows-1252',
      actualEncoding: 'Windows-1252'
    },
    {
      name: 'Case variations',
      xml: '<?xml version="1.0" encoding="utf-8"?>\n<invoice><id>TEST-003</id></invoice>',
      expectedEncoding: 'UTF-8',
      actualEncoding: 'UTF-8'
    },
    {
      name: 'No encoding declaration',
      xml: '<?xml version="1.0"?>\n<invoice><id>TEST-004</id></invoice>',
      expectedEncoding: 'UTF-8', // Default
      actualEncoding: 'UTF-8'
    }
  ];

  // Anchored to the XML declaration at the start of the document (an optional
  // BOM may precede it) so that an `encoding="..."` attribute on some later
  // element cannot be mistaken for the declared encoding. `[^>]*?` keeps the
  // search inside the declaration; `\1` requires the closing quote to match
  // the opening one.
  const declaredEncodingPattern = /^\uFEFF?<\?xml\b[^>]*?\sencoding\s*=\s*(["'])([^"']+)\1/i;

  for (const test of encodingTests) {
    const { result } = await PerformanceTracker.track(
      'encoding-detection',
      async () => {
        // Extract declared encoding; no declaration means the UTF-8 default
        const encodingMatch = declaredEncodingPattern.exec(test.xml);
        const declaredEncoding = encodingMatch ? encodingMatch[2].toUpperCase() : 'UTF-8';
        return {
          declaredEncoding,
          // declaredEncoding is already upper-cased; only hyphens remain to normalise
          matches: declaredEncoding.replace(/-/g, '') ===
            test.expectedEncoding.replace(/-/g, '').toUpperCase()
        };
      }
    );

    console.log(`${test.name}:`);
    console.log(` Declared: ${result.declaredEncoding}`);
    console.log(` Expected: ${test.expectedEncoding}`);
    console.log(` ${result.matches ? '✓' : '✗'} Declaration ${result.matches ? 'matches' : 'mismatch'}`);

    // Without an assertion a detection regression would only change the log output.
    expect(result.matches).toBeTrue();
  }
});
/**
 * PARSE-03: builds BOM-prefixed byte sequences for UTF-8, UTF-16LE and
 * UTF-16BE, decodes each with the encoding its BOM announces, and feeds the
 * resulting string (BOM character included) to EInvoice.fromXmlString.
 *
 * The outcome is only logged: a parse failure is reported together with a
 * hint when the error message mentions an encoding.
 */
tap.test('PARSE-03: BOM (Byte Order Mark) detection', async () => {
  const bomTests = [
    {
      name: 'UTF-8 with BOM',
      bom: Buffer.from([0xEF, 0xBB, 0xBF]),
      encoding: 'UTF-8',
      xml: '<?xml version="1.0" encoding="UTF-8"?><invoice><id>TEST-005</id></invoice>'
    },
    {
      name: 'UTF-16 LE BOM',
      bom: Buffer.from([0xFF, 0xFE]),
      encoding: 'UTF-16LE',
      xml: '<?xml version="1.0" encoding="UTF-16"?><invoice><id>TEST-006</id></invoice>'
    },
    {
      name: 'UTF-16 BE BOM',
      bom: Buffer.from([0xFE, 0xFF]),
      encoding: 'UTF-16BE',
      xml: '<?xml version="1.0" encoding="UTF-16"?><invoice><id>TEST-007</id></invoice>'
    }
  ];

  for (const test of bomTests) {
    // The body has to be encoded the way the BOM says it is. Previously the
    // body was always UTF-8 and the whole buffer was decoded as UTF-8, which
    // turned the UTF-16 BOM bytes into U+FFFD replacement characters and never
    // exercised UTF-16 input at all.
    const isBigEndian = test.encoding === 'UTF-16BE';
    const codec = test.encoding === 'UTF-8' ? 'utf8' : 'utf16le';

    const body = Buffer.from(test.xml, codec);
    if (isBigEndian) {
      // Buffer has no UTF-16BE codec: encode as LE, then swap each byte pair.
      body.swap16();
    }
    const xmlWithBom = Buffer.concat([test.bom, body]);

    // Decode with the matching encoding. Buffer#toString does not strip the
    // BOM, so the parser receives a leading U+FEFF in every case. For BE a
    // copy is swapped back to LE first so xmlWithBom itself stays untouched.
    const decodable = isBigEndian ? Buffer.from(xmlWithBom).swap16() : xmlWithBom;
    const xmlString = decodable.toString(codec);

    const { result } = await PerformanceTracker.track(
      'bom-detection',
      async () => {
        const invoice = new einvoice.EInvoice();
        try {
          // Try parsing with BOM
          await invoice.fromXmlString(xmlString);
          return { success: true, parsed: true };
        } catch (error) {
          // Anything can be thrown; do not assume an Error instance.
          const message = error instanceof Error ? error.message : String(error);
          const lowered = message.toLowerCase();
          return {
            success: false,
            error: message,
            // Check if it's an encoding issue
            encodingError: lowered.includes('encoding') || lowered.includes('utf')
          };
        }
      }
    );

    console.log(`${test.name}: ${result.parsed ? '✓' : '✗'}`);
    if (!result.parsed) {
      console.log(` Error: ${result.error}`);
      if (result.encodingError) {
        console.log(` Likely encoding issue detected`);
      }
    }
  }
});
/**
 * PARSE-03: parses UBL invoices whose note contains non-ASCII text (umlauts,
 * accents, currency symbols, emoji) and logs whether the first parsed note is
 * identical to the text that was put in. Results are logged, not asserted.
 */
tap.test('PARSE-03: Special character handling', async () => {
  // `chars` lists the characters of interest per sample; only `xml` and
  // `expectedChars` are read below.
  const samples = [
    {
      name: 'German umlauts',
      xml: `<?xml version="1.0" encoding="UTF-8"?>
<ubl:Invoice xmlns:ubl="urn:oasis:names:specification:ubl:schema:xsd:Invoice-2" xmlns:cbc="urn:oasis:names:specification:ubl:schema:xsd:CommonBasicComponents-2">
<cbc:ID>UMLAUT-TEST</cbc:ID>
<cbc:Note>Müller, Schäfer, Köln, Größe</cbc:Note>
</ubl:Invoice>`,
      chars: 'üäöß',
      expectedChars: 'Müller, Schäfer, Köln, Größe'
    },
    {
      name: 'French accents',
      xml: `<?xml version="1.0" encoding="UTF-8"?>
<ubl:Invoice xmlns:ubl="urn:oasis:names:specification:ubl:schema:xsd:Invoice-2" xmlns:cbc="urn:oasis:names:specification:ubl:schema:xsd:CommonBasicComponents-2">
<cbc:ID>ACCENT-TEST</cbc:ID>
<cbc:Note>Café, naïve, façade, à côté</cbc:Note>
</ubl:Invoice>`,
      chars: 'éèêëàçï',
      expectedChars: 'Café, naïve, façade, à côté'
    },
    {
      name: 'Currency symbols',
      xml: `<?xml version="1.0" encoding="UTF-8"?>
<ubl:Invoice xmlns:ubl="urn:oasis:names:specification:ubl:schema:xsd:Invoice-2" xmlns:cbc="urn:oasis:names:specification:ubl:schema:xsd:CommonBasicComponents-2">
<cbc:ID>CURRENCY-TEST</cbc:ID>
<cbc:Note>€ 100, £ 50, ¥ 1000, $ 75</cbc:Note>
</ubl:Invoice>`,
      chars: '€£¥$',
      expectedChars: '€ 100, £ 50, ¥ 1000, $ 75'
    },
    {
      name: 'Emoji and Unicode',
      xml: `<?xml version="1.0" encoding="UTF-8"?>
<ubl:Invoice xmlns:ubl="urn:oasis:names:specification:ubl:schema:xsd:Invoice-2" xmlns:cbc="urn:oasis:names:specification:ubl:schema:xsd:CommonBasicComponents-2">
<cbc:ID>UNICODE-TEST</cbc:ID>
<cbc:Note>Invoice 📄 Payment 💰 Delivered 📦</cbc:Note>
</ubl:Invoice>`,
      chars: '📄💰📦',
      expectedChars: 'Invoice 📄 Payment 💰 Delivered 📦'
    }
  ];

  for (const sample of samples) {
    // Parse one sample and report what came back instead of throwing.
    const parseSample = async () => {
      const doc = new einvoice.EInvoice();
      try {
        await doc.fromXmlString(sample.xml);
        const parsedNotes = doc.notes;
        return {
          success: true,
          notes: parsedNotes,
          preserved: parsedNotes && parsedNotes[0] === sample.expectedChars
        };
      } catch (error) {
        return { success: false, error: error.message };
      }
    };

    const { result: outcome } = await PerformanceTracker.track('special-chars', parseSample);

    const mark = outcome.success ? '✓' : '✗';
    console.log(`${sample.name}: ${mark}`);

    // Details are only available when parsing succeeded and produced notes.
    if (!outcome.success || !outcome.notes) {
      continue;
    }

    const verdict = outcome.preserved ? 'preserved' : 'not preserved';
    console.log(` Characters ${verdict}`);

    const firstNote = outcome.notes[0];
    if (firstNote) {
      console.log(` Content: ${firstNote}`);
    }
  }
});
/**
 * PARSE-03: parses UBL invoices whose note uses predefined entities, numeric
 * character references or a CDATA section, and logs whether the first parsed
 * note equals the unescaped text. Results are logged, not asserted.
 */
tap.test('PARSE-03: XML entities and escaping', async () => {
  const samples = [
    {
      name: 'Basic XML entities',
      xml: `<?xml version="1.0" encoding="UTF-8"?>
<ubl:Invoice xmlns:ubl="urn:oasis:names:specification:ubl:schema:xsd:Invoice-2" xmlns:cbc="urn:oasis:names:specification:ubl:schema:xsd:CommonBasicComponents-2">
<cbc:ID>ENTITY-TEST-1</cbc:ID>
<cbc:Note>Less than &lt; Greater than &gt; Ampersand &amp; Quote &quot; Apostrophe &apos;</cbc:Note>
</ubl:Invoice>`,
      expected: 'Less than < Greater than > Ampersand & Quote " Apostrophe \''
    },
    {
      name: 'Numeric entities',
      xml: `<?xml version="1.0" encoding="UTF-8"?>
<ubl:Invoice xmlns:ubl="urn:oasis:names:specification:ubl:schema:xsd:Invoice-2" xmlns:cbc="urn:oasis:names:specification:ubl:schema:xsd:CommonBasicComponents-2">
<cbc:ID>ENTITY-TEST-2</cbc:ID>
<cbc:Note>Euro &#8364; Copyright &#169; Registered &#174;</cbc:Note>
</ubl:Invoice>`,
      expected: 'Euro € Copyright © Registered ®'
    },
    {
      name: 'CDATA sections',
      xml: `<?xml version="1.0" encoding="UTF-8"?>
<ubl:Invoice xmlns:ubl="urn:oasis:names:specification:ubl:schema:xsd:Invoice-2" xmlns:cbc="urn:oasis:names:specification:ubl:schema:xsd:CommonBasicComponents-2">
<cbc:ID>CDATA-TEST</cbc:ID>
<cbc:Note><![CDATA[HTML content: <p>Price > 100 & quantity < 50</p>]]></cbc:Note>
</ubl:Invoice>`,
      expected: 'HTML content: <p>Price > 100 & quantity < 50</p>'
    }
  ];

  for (const sample of samples) {
    // Parse one sample and report what came back instead of throwing.
    const parseSample = async () => {
      const doc = new einvoice.EInvoice();
      try {
        await doc.fromXmlString(sample.xml);
        const parsedNotes = doc.notes;
        return {
          success: true,
          notes: parsedNotes,
          correct: parsedNotes && parsedNotes[0] === sample.expected
        };
      } catch (error) {
        return { success: false, error: error.message };
      }
    };

    const { result: outcome } = await PerformanceTracker.track('entity-handling', parseSample);

    // A tick needs both a successful parse and an exact text match.
    const passed = outcome.success && outcome.correct;
    console.log(`${sample.name}: ${passed ? '✓' : '✗'}`);

    if (!outcome.success || !outcome.notes) {
      continue;
    }

    console.log(` Expected: ${sample.expected}`);
    console.log(` Got: ${outcome.notes[0] || '(empty)'}`);
  }
});
/**
 * PARSE-03: feeds documents with potentially troublesome encoding traits to
 * EInvoice.fromXmlString and logs how each one was handled. A scenario gets a
 * tick when it parsed, or when it failed for a reason whose message does not
 * mention 'encoding' or 'character'. Results are logged, not asserted.
 */
tap.test('PARSE-03: Mixed encoding scenarios', async () => {
  // Inputs modelled on real-world encoding problems
  const cases = [
    {
      name: 'Mislabeled encoding',
      // Declares UTF-8 although the content stands for ISO-8859-1 text
      xml: '<?xml version="1.0" encoding="UTF-8"?><invoice><supplier>Müller GmbH</supplier></invoice>',
      issue: 'Declared UTF-8 but might have ISO-8859-1 content'
    },
    {
      name: 'Double-encoded UTF-8',
      // Text that went through UTF-8 encoding twice
      xml: '<?xml version="1.0" encoding="UTF-8"?><invoice><note>Müller</note></invoice>',
      issue: 'Possible double UTF-8 encoding'
    },
    {
      name: 'Mixed line endings with special chars',
      xml: '<?xml version="1.0" encoding="UTF-8"?>\r\n<invoice>\n<note>Specialchars</note>\r</invoice>',
      issue: 'Mixed CRLF/LF with special characters'
    }
  ];

  for (const scenario of cases) {
    // Parse one scenario and classify a failure by its error message.
    const attemptParse = async () => {
      const doc = new einvoice.EInvoice();
      try {
        await doc.fromXmlString(scenario.xml);
        return { success: true, handled: true };
      } catch (error) {
        const looksLikeEncodingProblem =
          error.message.includes('encoding') || error.message.includes('character');
        return {
          success: false,
          error: error.message,
          isEncodingError: looksLikeEncodingProblem
        };
      }
    };

    const { result: outcome } = await PerformanceTracker.track('mixed-encoding', attemptParse);

    const acceptable = outcome.handled || !outcome.isEncodingError;
    console.log(`${scenario.name}: ${acceptable ? '✓' : '✗'}`);
    console.log(` Issue: ${scenario.issue}`);

    if (outcome.success) {
      continue;
    }
    const failureKind = outcome.isEncodingError ? 'Encoding error' : 'Other error';
    console.log(` Result: ${failureKind}`);
  }
});
/**
 * PARSE-03: prints the statistics PerformanceTracker holds for the
 * 'encoding-detection' label and requires the average to stay below 5 ms.
 * Does nothing when no statistics are available for that label.
 */
tap.test('PARSE-03: Encoding performance', async () => {
  const stats = PerformanceTracker.getStats('encoding-detection');
  if (!stats) {
    return;
  }

  const { count, avg, max } = stats;
  console.log('\nEncoding Detection Performance:');
  console.log(` Total operations: ${count}`);
  console.log(` Average time: ${avg.toFixed(2)}ms`);
  console.log(` Max time: ${max.toFixed(2)}ms`);

  // Reading the declared encoding is expected to be cheap: under 5ms on average
  expect(avg).toBeLessThan(5);
});
// Start the tap runner now that every tap.test(...) call above has been made.
tap.start();