This commit is contained in:
2025-05-28 08:40:26 +00:00
parent e4c762658d
commit 32f8bc192a
24 changed files with 3350 additions and 5416 deletions

View File

@@ -1,532 +1,435 @@
import { expect, tap } from '@git.zone/tstest/tapbundle';
import { tap } from '@git.zone/tstest/tapbundle';
import * as einvoice from '../../../ts/index.js';
import * as plugins from '../../plugins.js';
import { CorpusLoader } from '../../helpers/corpus.loader.js';
import { PerformanceTracker } from '../../helpers/performance.tracker.js';
tap.test('PARSE-04: BOM Handling - Process Byte Order Marks correctly across encodings', async (t) => {
const performanceTracker = new PerformanceTracker('PARSE-04');
await t.test('Standard BOM detection and removal', async () => {
performanceTracker.startOperation('standard-bom');
const bomTypes = [
{
name: 'UTF-8 BOM',
bom: Buffer.from([0xEF, 0xBB, 0xBF]),
encoding: 'UTF-8',
description: 'Most common BOM in XML files'
},
{
name: 'UTF-16 LE BOM',
bom: Buffer.from([0xFF, 0xFE]),
encoding: 'UTF-16LE',
description: 'Little-endian UTF-16'
},
{
name: 'UTF-16 BE BOM',
bom: Buffer.from([0xFE, 0xFF]),
encoding: 'UTF-16BE',
description: 'Big-endian UTF-16'
},
{
name: 'UTF-32 LE BOM',
bom: Buffer.from([0xFF, 0xFE, 0x00, 0x00]),
encoding: 'UTF-32LE',
description: 'Little-endian UTF-32'
},
{
name: 'UTF-32 BE BOM',
bom: Buffer.from([0x00, 0x00, 0xFE, 0xFF]),
encoding: 'UTF-32BE',
description: 'Big-endian UTF-32'
}
];
for (const bomType of bomTypes) {
const startTime = performance.now();
// Create XML with BOM
let xmlContent: Buffer;
if (bomType.encoding.startsWith('UTF-16')) {
xmlContent = Buffer.from(
'<?xml version="1.0" encoding="UTF-16"?><invoice><id>TEST-BOM</id></invoice>',
bomType.encoding.toLowerCase() as BufferEncoding
);
} else if (bomType.encoding.startsWith('UTF-32')) {
// UTF-32 not directly supported by Node.js, simulate
xmlContent = Buffer.from('<?xml version="1.0" encoding="UTF-32"?><invoice><id>TEST-BOM</id></invoice>');
} else {
xmlContent = Buffer.from('<?xml version="1.0" encoding="UTF-8"?><invoice><id>TEST-BOM</id></invoice>');
}
const fullContent = Buffer.concat([bomType.bom, xmlContent]);
console.log(`${bomType.name}:`);
console.log(` BOM: ${Array.from(bomType.bom).map(b => '0x' + b.toString(16).toUpperCase().padStart(2, '0')).join(' ')}`);
console.log(` Encoding: ${bomType.encoding}`);
console.log(` Description: ${bomType.description}`);
console.log(` Total size: ${fullContent.length} bytes`);
// Test BOM removal
const withoutBom = removeBOM(fullContent);
if (withoutBom.length === fullContent.length - bomType.bom.length) {
console.log(' ✓ BOM removed successfully');
} else {
console.log(' ✗ BOM removal failed');
}
performanceTracker.recordMetric('bom-processing', performance.now() - startTime);
/**
 * Strips a leading Unicode byte order mark (BOM) from a buffer.
 *
 * Recognised marks: UTF-32 LE/BE (4 bytes), UTF-8 (3 bytes) and
 * UTF-16 LE/BE (2 bytes). The UTF-32 signatures are tested first because
 * the UTF-32 LE mark (FF FE 00 00) starts with the UTF-16 LE mark (FF FE);
 * testing UTF-16 first would make the UTF-32 LE branch unreachable and
 * strip only two of its four bytes.
 *
 * Note: a UTF-16 LE document whose first character is U+0000 is
 * indistinguishable from a UTF-32 LE BOM and is treated as UTF-32 LE here.
 *
 * @param buffer - Raw bytes that may begin with a BOM.
 * @returns A view on `buffer` starting right after the BOM, or `buffer`
 *          itself when no known BOM is present.
 */
const removeBOM = (buffer: Buffer): Buffer => {
  if (buffer.length >= 4) {
    // UTF-32 LE BOM
    if (buffer[0] === 0xFF && buffer[1] === 0xFE && buffer[2] === 0x00 && buffer[3] === 0x00) {
      return buffer.subarray(4);
    }
    // UTF-32 BE BOM
    if (buffer[0] === 0x00 && buffer[1] === 0x00 && buffer[2] === 0xFE && buffer[3] === 0xFF) {
      return buffer.subarray(4);
    }
  }
  // UTF-8 BOM
  if (buffer.length >= 3 && buffer[0] === 0xEF && buffer[1] === 0xBB && buffer[2] === 0xBF) {
    return buffer.subarray(3);
  }
  if (buffer.length >= 2) {
    // UTF-16 LE BOM
    if (buffer[0] === 0xFF && buffer[1] === 0xFE) {
      return buffer.subarray(2);
    }
    // UTF-16 BE BOM
    if (buffer[0] === 0xFE && buffer[1] === 0xFF) {
      return buffer.subarray(2);
    }
  }
  return buffer;
};
tap.test('PARSE-04: Standard BOM detection and removal', async () => {
const bomTypes = [
{
name: 'UTF-8 BOM',
bom: Buffer.from([0xEF, 0xBB, 0xBF]),
encoding: 'UTF-8',
description: 'Most common BOM in XML files'
},
{
name: 'UTF-16 LE BOM',
bom: Buffer.from([0xFF, 0xFE]),
encoding: 'UTF-16LE',
description: 'Little-endian UTF-16'
},
{
name: 'UTF-16 BE BOM',
bom: Buffer.from([0xFE, 0xFF]),
encoding: 'UTF-16BE',
description: 'Big-endian UTF-16'
},
{
name: 'UTF-32 LE BOM',
bom: Buffer.from([0xFF, 0xFE, 0x00, 0x00]),
encoding: 'UTF-32LE',
description: 'Little-endian UTF-32'
},
{
name: 'UTF-32 BE BOM',
bom: Buffer.from([0x00, 0x00, 0xFE, 0xFF]),
encoding: 'UTF-32BE',
description: 'Big-endian UTF-32'
}
performanceTracker.endOperation('standard-bom');
});
];
// Sub-test: builds buffers in which the UTF-8 BOM bytes (EF BB BF) sit at the
// start, after the XML declaration, mid-document, duplicated, or inside
// element data. For each one it logs where the bytes occur, the expected
// validity flag, and whether handing the buffer to the parser threw.
await t.test('BOM in different positions', async () => {
  performanceTracker.startOperation('bom-positions');

  const utf8Bom = Buffer.from([0xEF, 0xBB, 0xBF]);
  const positionTests = [
    {
      name: 'BOM at start (correct)',
      content: Buffer.concat([
        utf8Bom,
        Buffer.from('<?xml version="1.0"?><invoice><id>TEST-001</id></invoice>')
      ]),
      valid: true
    },
    {
      name: 'BOM after XML declaration',
      content: Buffer.concat([
        Buffer.from('<?xml version="1.0"?>'),
        utf8Bom,
        Buffer.from('<invoice><id>TEST-002</id></invoice>')
      ]),
      valid: false
    },
    {
      name: 'BOM in middle of document',
      content: Buffer.concat([
        Buffer.from('<?xml version="1.0"?><invoice>'),
        utf8Bom,
        Buffer.from('<id>TEST-003</id></invoice>')
      ]),
      valid: false
    },
    {
      name: 'Multiple BOMs',
      content: Buffer.concat([
        utf8Bom,
        utf8Bom,
        Buffer.from('<?xml version="1.0"?><invoice><id>TEST-004</id></invoice>')
      ]),
      valid: false
    },
    {
      name: 'BOM-like bytes in content',
      content: Buffer.concat([
        Buffer.from('<?xml version="1.0"?><invoice><data>'),
        utf8Bom, // These are actual data, not BOM
        Buffer.from('</data></invoice>')
      ]),
      valid: true // Valid XML, but BOM-like bytes are data
    }
  ];

  for (const scenario of positionTests) {
    const startedAt = performance.now();
    console.log(`${scenario.name}:`);

    // Report every offset at which the UTF-8 BOM byte sequence occurs.
    const bomOccurrences = findBOMOccurrences(scenario.content);
    console.log(` BOM occurrences: ${bomOccurrences.length} at positions: ${bomOccurrences.join(', ')}`);
    console.log(scenario.valid ? ' ✓ Valid BOM usage' : ' ✗ Invalid BOM usage');

    // Try parsing; fromBuffer is only called when the instance exposes it.
    try {
      const invoice = new einvoice.EInvoice();
      if (invoice.fromBuffer) {
        await invoice.fromBuffer(scenario.content);
        console.log(' Parse result: Success');
      }
    } catch (error) {
      // A thrown value is not guaranteed to be an Error, so narrow before reading .message.
      const reason = error instanceof Error ? error.message : String(error);
      console.log(` Parse result: Failed - ${reason}`);
    }

    performanceTracker.recordMetric('bom-position', performance.now() - startedAt);
  }

  performanceTracker.endOperation('bom-positions');
});
// Sub-test: simulates a read/write round trip on three inputs and logs whether
// the UTF-8 BOM is kept, stripped, or newly added according to each case's flags.
await t.test('BOM preservation in round-trip operations', async () => {
  performanceTracker.startOperation('bom-roundtrip');

  const utf8Bom = Buffer.from([0xEF, 0xBB, 0xBF]);
  // True when the first three bytes are the UTF-8 BOM.
  const startsWithUtf8Bom = (bytes: Buffer): boolean =>
    bytes.length >= 3 && bytes[0] === 0xEF && bytes[1] === 0xBB && bytes[2] === 0xBF;

  const roundTripTests = [
    {
      name: 'Preserve UTF-8 BOM',
      input: Buffer.concat([
        utf8Bom,
        Buffer.from('<?xml version="1.0" encoding="UTF-8"?><invoice><id>RT-001</id></invoice>')
      ]),
      preserveBOM: true
    },
    {
      name: 'Remove UTF-8 BOM',
      input: Buffer.concat([
        utf8Bom,
        Buffer.from('<?xml version="1.0" encoding="UTF-8"?><invoice><id>RT-002</id></invoice>')
      ]),
      preserveBOM: false
    },
    {
      name: 'Add BOM to BOM-less file',
      input: Buffer.from('<?xml version="1.0" encoding="UTF-8"?><invoice><id>RT-003</id></invoice>'),
      preserveBOM: true,
      addBOM: true
    }
  ];

  for (const scenario of roundTripTests) {
    const startedAt = performance.now();
    console.log(`${scenario.name}:`);

    const inputHasBOM = startsWithUtf8Bom(scenario.input);
    console.log(` Input has BOM: ${inputHasBOM}`);
    console.log(` Preserve BOM: ${scenario.preserveBOM}`);

    // Simulate round-trip: strip, prepend, or pass the bytes through untouched.
    let processed = scenario.input;
    let action = ' Action: No change';
    if (inputHasBOM && !scenario.preserveBOM) {
      processed = processed.subarray(3);
      action = ' Action: Removed BOM';
    } else if (!inputHasBOM && scenario.addBOM) {
      processed = Buffer.concat([Buffer.from([0xEF, 0xBB, 0xBF]), processed]);
      action = ' Action: Added BOM';
    }
    console.log(action);

    console.log(` Output has BOM: ${startsWithUtf8Bom(processed)}`);
    performanceTracker.recordMetric('bom-roundtrip', performance.now() - startedAt);
  }

  performanceTracker.endOperation('bom-roundtrip');
});
// Sub-test: pairs a BOM with an XML encoding declaration and logs, per case,
// the detected BOM type, the declared encoding, and the expected conflict flag.
await t.test('BOM conflicts with encoding declarations', async () => {
  performanceTracker.startOperation('bom-conflicts');

  const conflictTests = [
    { name: 'UTF-8 BOM with UTF-8 declaration', bom: Buffer.from([0xEF, 0xBB, 0xBF]), declaration: 'UTF-8', conflict: false },
    { name: 'UTF-8 BOM with UTF-16 declaration', bom: Buffer.from([0xEF, 0xBB, 0xBF]), declaration: 'UTF-16', conflict: true },
    { name: 'UTF-16 LE BOM with UTF-8 declaration', bom: Buffer.from([0xFF, 0xFE]), declaration: 'UTF-8', conflict: true },
    { name: 'UTF-16 BE BOM with UTF-16 declaration', bom: Buffer.from([0xFE, 0xFF]), declaration: 'UTF-16', conflict: false },
    { name: 'No BOM with any declaration', bom: Buffer.from([]), declaration: 'UTF-8', conflict: false }
  ];

  for (const scenario of conflictTests) {
    const startedAt = performance.now();

    // The combined document is assembled but only the metadata is reported.
    const xml = `<?xml version="1.0" encoding="${scenario.declaration}"?><invoice><id>CONFLICT-TEST</id></invoice>`;
    const fullContent = Buffer.concat([scenario.bom, Buffer.from(xml)]);

    const bomLabel = scenario.bom.length > 0 ? detectBOMType(scenario.bom) : 'None';
    const conflictLabel = scenario.conflict ? '✗ Yes' : '✓ No';

    console.log(`${scenario.name}:`);
    console.log(` BOM type: ${bomLabel}`);
    console.log(` Declaration: ${scenario.declaration}`);
    console.log(` Conflict: ${conflictLabel}`);
    if (scenario.conflict) {
      console.log(' Resolution: BOM takes precedence over declaration');
    }

    performanceTracker.recordMetric('bom-conflict', performance.now() - startedAt);
  }

  performanceTracker.endOperation('bom-conflicts');
});
await t.test('BOM handling in corpus files', async () => {
performanceTracker.startOperation('corpus-bom');
const corpusLoader = new CorpusLoader();
const files = await corpusLoader.getFiles(/\.(xml|cii|ubl)$/);
console.log(`\nAnalyzing BOM usage in ${files.length} corpus files...`);
const bomStats = {
total: 0,
withBOM: 0,
utf8BOM: 0,
utf16BOM: 0,
otherBOM: 0,
multipleBOM: 0,
invalidPosition: 0
};
const sampleSize = Math.min(100, files.length);
const sampledFiles = files.slice(0, sampleSize);
for (const file of sampledFiles) {
bomStats.total++;
try {
const content = await plugins.fs.readFile(file.path);
for (const bomType of bomTypes) {
const { result, metric } = await PerformanceTracker.track(
'bom-processing',
async () => {
// Create XML with BOM
let xmlContent: Buffer;
let encodingSupported = true;
// Check for BOM
if (content.length >= 3) {
if (content[0] === 0xEF && content[1] === 0xBB && content[2] === 0xBF) {
bomStats.withBOM++;
bomStats.utf8BOM++;
} else if (content.length >= 2) {
if ((content[0] === 0xFF && content[1] === 0xFE) ||
(content[0] === 0xFE && content[1] === 0xFF)) {
bomStats.withBOM++;
bomStats.utf16BOM++;
try {
if (bomType.encoding.startsWith('UTF-16')) {
// Node.js doesn't support UTF-16 BE directly
if (bomType.encoding === 'UTF-16BE') {
// Create UTF-8 content instead for testing
xmlContent = Buffer.from('<?xml version="1.0" encoding="UTF-16"?><invoice><id>TEST-BOM</id></invoice>');
encodingSupported = false;
} else {
const nodeEncoding = bomType.encoding.replace('-', '').toLowerCase();
xmlContent = Buffer.from(
'<?xml version="1.0" encoding="UTF-16"?><invoice><id>TEST-BOM</id></invoice>',
nodeEncoding as BufferEncoding
);
}
} else if (bomType.encoding.startsWith('UTF-32')) {
// UTF-32 not directly supported by Node.js, simulate
xmlContent = Buffer.from('<?xml version="1.0" encoding="UTF-32"?><invoice><id>TEST-BOM</id></invoice>');
encodingSupported = false;
} else {
xmlContent = Buffer.from('<?xml version="1.0" encoding="UTF-8"?><invoice><id>TEST-BOM</id></invoice>');
}
} catch (e) {
// Fallback to UTF-8 if encoding not supported
xmlContent = Buffer.from('<?xml version="1.0" encoding="UTF-8"?><invoice><id>TEST-BOM</id></invoice>');
encodingSupported = false;
}
// Check for multiple BOMs or BOMs in wrong position
const bomOccurrences = findBOMOccurrences(content);
if (bomOccurrences.length > 1) {
bomStats.multipleBOM++;
}
if (bomOccurrences.length > 0 && bomOccurrences[0] !== 0) {
bomStats.invalidPosition++;
}
} catch (error) {
// Skip files that can't be read
const fullContent = Buffer.concat([bomType.bom, xmlContent]);
// Test BOM removal
const withoutBom = removeBOM(fullContent);
const bomRemoved = withoutBom.length === fullContent.length - bomType.bom.length;
return {
bomBytes: Array.from(bomType.bom).map(b => '0x' + b.toString(16).toUpperCase().padStart(2, '0')).join(' '),
totalSize: fullContent.length,
bomRemoved,
encodingSupported
};
}
}
);
console.log('\nBOM Statistics:');
console.log(`Total files analyzed: ${bomStats.total}`);
console.log(`Files with BOM: ${bomStats.withBOM} (${(bomStats.withBOM/bomStats.total*100).toFixed(1)}%)`);
console.log(` UTF-8 BOM: ${bomStats.utf8BOM}`);
console.log(` UTF-16 BOM: ${bomStats.utf16BOM}`);
console.log(` Other BOM: ${bomStats.otherBOM}`);
console.log(`Multiple BOMs: ${bomStats.multipleBOM}`);
console.log(`Invalid BOM position: ${bomStats.invalidPosition}`);
performanceTracker.endOperation('corpus-bom');
});
// Sub-test: scans hand-built documents for suspicious BOM usage (several
// UTF-8 BOM sequences, or U+FEFF characters in the decoded text) and logs a
// warning when either is present.
await t.test('BOM security implications', async () => {
  performanceTracker.startOperation('bom-security');

  const securityTests = [
    {
      name: 'BOM hiding malicious content',
      content: Buffer.concat([
        Buffer.from([0xEF, 0xBB, 0xBF]),
        Buffer.from('<?xml version="1.0"?><!-- '),
        Buffer.from([0xEF, 0xBB, 0xBF]), // Hidden BOM in comment
        Buffer.from(' --><invoice><script>alert("XSS")</script></invoice>')
      ]),
      risk: 'BOM bytes could be used to bypass filters'
    },
    {
      name: 'Zero-width BOM characters',
      content: Buffer.from('<?xml version="1.0"?><invoice>\uFEFF<id>TEST</id></invoice>'),
      risk: 'Invisible characters could hide malicious content'
    },
    {
      name: 'BOM-based encoding confusion',
      content: Buffer.concat([
        Buffer.from([0xFF, 0xFE]), // UTF-16 LE BOM
        Buffer.from('<?xml version="1.0" encoding="UTF-8"?><invoice><id>TEST</id></invoice>')
      ]),
      risk: 'Encoding mismatch could lead to parsing errors'
    }
  ];

  for (const scenario of securityTests) {
    const startedAt = performance.now();
    console.log(`${scenario.name}:`);
    console.log(` Risk: ${scenario.risk}`);

    // Scan for suspicious patterns
    const bomCount = findBOMOccurrences(scenario.content).length;
    const hasMultipleBOMs = bomCount > 1;
    // Search the decoded text for U+FEFF. Buffer#includes(0xFEFF) must not be
    // used here: a numeric argument is coerced to a single byte (0xFF), which
    // would flag any buffer that merely contains the byte 0xFF.
    const hasInvisibleChars = scenario.content.toString().includes('\uFEFF');

    console.log(` BOM count: ${bomCount}`);
    console.log(` Multiple BOMs: ${hasMultipleBOMs ? '✗ Yes' : '✓ No'}`);
    console.log(` Invisible chars: ${hasInvisibleChars ? '✗ Yes' : '✓ No'}`);
    if (hasMultipleBOMs || hasInvisibleChars) {
      console.log(' ⚠️ Security risk detected');
    }

    performanceTracker.recordMetric('bom-security', performance.now() - startedAt);
  }

  performanceTracker.endOperation('bom-security');
});
// Sub-test: times 1000 UTF-8 BOM detections and 1000 removeBOM calls on
// documents of three payload sizes and records the combined duration.
await t.test('BOM handling performance', async () => {
  performanceTracker.startOperation('bom-performance');

  const iterations = 1000;
  const payloadSizes = [1000, 10000, 100000]; // 1KB, 10KB, 100KB

  for (const size of payloadSizes) {
    // Generate content with BOM
    const withBOM = Buffer.concat([
      Buffer.from([0xEF, 0xBB, 0xBF]),
      Buffer.from(`<?xml version="1.0"?><invoice><data>${'x'.repeat(size)}</data></invoice>`)
    ]);

    // Measure BOM detection time (result intentionally discarded)
    const detectStart = performance.now();
    for (let round = 0; round < iterations; round++) {
      const hasBOM = withBOM.length >= 3 &&
        withBOM[0] === 0xEF &&
        withBOM[1] === 0xBB &&
        withBOM[2] === 0xBF;
    }
    const detectTime = performance.now() - detectStart;

    // Measure BOM removal time (result intentionally discarded)
    const removeStart = performance.now();
    for (let round = 0; round < iterations; round++) {
      const cleaned = removeBOM(withBOM);
    }
    const removeTime = performance.now() - removeStart;

    console.log(`File size ${size} bytes:`);
    console.log(` BOM detection: ${(detectTime / 1000).toFixed(3)}ms per operation`);
    console.log(` BOM removal: ${(removeTime / 1000).toFixed(3)}ms per operation`);

    performanceTracker.recordMetric(`bom-perf-${size}`, detectTime + removeTime);
  }

  performanceTracker.endOperation('bom-performance');
});
// Helper functions
function removeBOM(buffer: Buffer): Buffer {
if (buffer.length >= 3 &&
buffer[0] === 0xEF && buffer[1] === 0xBB && buffer[2] === 0xBF) {
return buffer.slice(3);
}
if (buffer.length >= 2) {
if ((buffer[0] === 0xFF && buffer[1] === 0xFE) ||
(buffer[0] === 0xFE && buffer[1] === 0xFF)) {
return buffer.slice(2);
}
}
if (buffer.length >= 4) {
if ((buffer[0] === 0xFF && buffer[1] === 0xFE &&
buffer[2] === 0x00 && buffer[3] === 0x00) ||
(buffer[0] === 0x00 && buffer[1] === 0x00 &&
buffer[2] === 0xFE && buffer[3] === 0xFF)) {
return buffer.slice(4);
}
}
return buffer;
console.log(`${bomType.name}:`);
console.log(` BOM: ${result.bomBytes}`);
console.log(` Encoding: ${bomType.encoding}`);
console.log(` Description: ${bomType.description}`);
console.log(` Total size: ${result.totalSize} bytes`);
console.log(` ${result.bomRemoved ? '✓' : '✗'} BOM ${result.bomRemoved ? 'removed successfully' : 'removal failed'}`);
console.log(` Processing time: ${metric.duration.toFixed(2)}ms`);
}
/**
 * Lists the byte offsets at which the UTF-8 BOM sequence (EF BB BF) occurs.
 *
 * The scan moves left to right; after a match it resumes directly behind the
 * three matched bytes.
 *
 * @param buffer - Bytes to scan.
 * @returns Offsets of each match in ascending order; empty when none is found.
 */
function findBOMOccurrences(buffer: Buffer): number[] {
  const offsets: number[] = [];
  let cursor = 0;
  while (cursor + 2 < buffer.length) {
    const isBom =
      buffer[cursor] === 0xEF &&
      buffer[cursor + 1] === 0xBB &&
      buffer[cursor + 2] === 0xBF;
    if (isBom) {
      offsets.push(cursor);
      cursor += 3; // continue right after this BOM
    } else {
      cursor += 1;
    }
  }
  return offsets;
}
/**
 * Names the encoding indicated by the byte order mark at the start of `bom`.
 *
 * @param bom - Bytes beginning with the candidate BOM.
 * @returns 'UTF-8', 'UTF-16LE', 'UTF-16BE', 'UTF-32LE', 'UTF-32BE', or
 *          'Unknown' when the leading bytes match none of them.
 */
function detectBOMType(bom: Buffer): string {
  // True when `bom` begins with exactly the given byte sequence.
  const startsWith = (...bytes: number[]): boolean =>
    bom.length >= bytes.length && bytes.every((value, idx) => bom[idx] === value);

  if (startsWith(0xEF, 0xBB, 0xBF)) {
    return 'UTF-8';
  }
  if (startsWith(0xFF, 0xFE)) {
    // FF FE is UTF-16 LE unless two zero bytes follow, which makes it UTF-32 LE.
    return startsWith(0xFF, 0xFE, 0x00, 0x00) ? 'UTF-32LE' : 'UTF-16LE';
  }
  if (startsWith(0xFE, 0xFF)) {
    return 'UTF-16BE';
  }
  if (startsWith(0x00, 0x00, 0xFE, 0xFF)) {
    return 'UTF-32BE';
  }
  return 'Unknown';
}
// Performance summary
console.log('\n' + performanceTracker.getSummary());
// BOM handling best practices
console.log('\nBOM Handling Best Practices:');
console.log('1. Always check for BOM before parsing XML');
console.log('2. Remove BOM after detection to avoid parsing issues');
console.log('3. Preserve BOM information for round-trip operations if needed');
console.log('4. Handle conflicts between BOM and encoding declarations');
console.log('5. Be aware of security implications of multiple/hidden BOMs');
console.log('6. Test with files both with and without BOM');
console.log('7. Consider BOM handling in performance-critical paths');
console.log('8. Support all common BOM types (UTF-8, UTF-16, UTF-32)');
});
// Parses four documents (BOM at start, BOM after the XML declaration, no BOM,
// doubled BOM) through EInvoice.fromXmlString and logs the outcome next to
// the expected validity. The outcome is reported only; nothing is asserted.
tap.test('PARSE-04: BOM in different positions', async () => {
  const utf8Bom = Buffer.from([0xEF, 0xBB, 0xBF]);
  const positionTests = [
    {
      name: 'BOM at start (correct)',
      content: Buffer.concat([
        utf8Bom,
        Buffer.from('<?xml version="1.0"?><invoice><id>TEST-001</id></invoice>')
      ]),
      valid: true
    },
    {
      name: 'BOM after XML declaration',
      content: Buffer.concat([
        Buffer.from('<?xml version="1.0"?>'),
        utf8Bom,
        Buffer.from('<invoice><id>TEST-002</id></invoice>')
      ]),
      valid: false
    },
    {
      name: 'No BOM',
      content: Buffer.from('<?xml version="1.0"?><invoice><id>TEST-003</id></invoice>'),
      valid: true
    },
    {
      name: 'Multiple BOMs',
      content: Buffer.concat([
        utf8Bom,
        utf8Bom,
        Buffer.from('<?xml version="1.0"?><invoice><id>TEST-004</id></invoice>')
      ]),
      valid: false
    }
  ];

  for (const scenario of positionTests) {
    const { result } = await PerformanceTracker.track(
      'bom-position',
      async () => {
        const invoice = new einvoice.EInvoice();
        try {
          await invoice.fromXmlString(scenario.content.toString('utf8'));
          return { parsed: true, error: null };
        } catch (error) {
          // A thrown value is not guaranteed to be an Error, so narrow before reading .message.
          return { parsed: false, error: error instanceof Error ? error.message : String(error) };
        }
      }
    );

    console.log(`${scenario.name}: ${result.parsed ? '✓' : '✗'}`);
    console.log(` Expected ${scenario.valid ? 'valid' : 'invalid'}, got ${result.parsed ? 'parsed' : 'error'}`);
    if (!result.parsed) {
      console.log(` Error: ${result.error}`);
    }
  }
});
// Parses a UBL and a ZUGFeRD/CII sample, each prefixed with a UTF-8 BOM,
// through EInvoice.fromXmlString and logs the resulting id and format, or the
// error message when parsing throws.
tap.test('PARSE-04: Real invoice files with BOM', async () => {
  // Test with actual invoice formats that might have BOM
  const realWorldTests = [
    {
      name: 'UBL with UTF-8 BOM',
      xml: Buffer.concat([
        Buffer.from([0xEF, 0xBB, 0xBF]), // UTF-8 BOM
        Buffer.from(`<?xml version="1.0" encoding="UTF-8"?>
<ubl:Invoice xmlns:ubl="urn:oasis:names:specification:ubl:schema:xsd:Invoice-2"
xmlns:cbc="urn:oasis:names:specification:ubl:schema:xsd:CommonBasicComponents-2"
xmlns:cac="urn:oasis:names:specification:ubl:schema:xsd:CommonAggregateComponents-2">
<cbc:ID>BOM-UBL-001</cbc:ID>
<cbc:IssueDate>2024-01-01</cbc:IssueDate>
<cac:AccountingSupplierParty>
<cac:Party>
<cac:PartyName>
<cbc:Name>Test Supplier</cbc:Name>
</cac:PartyName>
<cac:PostalAddress>
<cbc:CityName>Berlin</cbc:CityName>
<cbc:PostalZone>10115</cbc:PostalZone>
<cac:Country>
<cbc:IdentificationCode>DE</cbc:IdentificationCode>
</cac:Country>
</cac:PostalAddress>
</cac:Party>
</cac:AccountingSupplierParty>
<cac:AccountingCustomerParty>
<cac:Party>
<cac:PartyName>
<cbc:Name>Test Customer</cbc:Name>
</cac:PartyName>
<cac:PostalAddress>
<cbc:CityName>Munich</cbc:CityName>
<cbc:PostalZone>80331</cbc:PostalZone>
<cac:Country>
<cbc:IdentificationCode>DE</cbc:IdentificationCode>
</cac:Country>
</cac:PostalAddress>
</cac:Party>
</cac:AccountingCustomerParty>
<cac:InvoiceLine>
<cbc:ID>1</cbc:ID>
<cbc:InvoicedQuantity unitCode="EA">1</cbc:InvoicedQuantity>
<cbc:LineExtensionAmount currencyID="EUR">100.00</cbc:LineExtensionAmount>
<cac:Item>
<cbc:Name>Test Product</cbc:Name>
</cac:Item>
<cac:Price>
<cbc:PriceAmount currencyID="EUR">100.00</cbc:PriceAmount>
</cac:Price>
</cac:InvoiceLine>
</ubl:Invoice>`)
      ])
    },
    {
      name: 'ZUGFeRD with UTF-8 BOM',
      xml: Buffer.concat([
        Buffer.from([0xEF, 0xBB, 0xBF]), // UTF-8 BOM
        Buffer.from(`<?xml version="1.0" encoding="UTF-8"?>
<rsm:CrossIndustryInvoice xmlns:rsm="urn:un:unece:uncefact:data:standard:CrossIndustryInvoice:100"
xmlns:ram="urn:un:unece:uncefact:data:standard:ReusableAggregateBusinessInformationEntity:100">
<rsm:ExchangedDocument>
<ram:ID>BOM-ZUGFERD-001</ram:ID>
</rsm:ExchangedDocument>
</rsm:CrossIndustryInvoice>`)
      ])
    }
  ];

  for (const sample of realWorldTests) {
    const { result } = await PerformanceTracker.track(
      'real-world-bom',
      async () => {
        const invoice = new einvoice.EInvoice();
        try {
          await invoice.fromXmlString(sample.xml.toString('utf8'));
          return {
            success: true,
            id: invoice.id,
            format: invoice.getFormat()
          };
        } catch (error) {
          // A thrown value is not guaranteed to be an Error, so narrow before reading .message.
          return {
            success: false,
            error: error instanceof Error ? error.message : String(error)
          };
        }
      }
    );

    console.log(`${sample.name}: ${result.success ? '✓' : '✗'}`);
    if (result.success) {
      console.log(` Invoice ID: ${result.id}`);
      console.log(` Format: ${einvoice.InvoiceFormat[result.format]}`);
    } else {
      console.log(` Error: ${result.error}`);
    }
  }
});
// Feeds the parser documents whose BOM disagrees with the declared encoding
// and logs whether parsing was rejected and, if so, whether the error message
// mentions "encoding" or "bom". The outcome is reported only; nothing is asserted.
tap.test('PARSE-04: BOM encoding conflicts', async () => {
  const conflictTests = [
    {
      name: 'UTF-16 BOM with UTF-8 declaration',
      bom: Buffer.from([0xFF, 0xFE]), // UTF-16 LE BOM
      xml: '<?xml version="1.0" encoding="UTF-8"?><invoice><id>CONFLICT-001</id></invoice>',
      issue: 'BOM indicates UTF-16 but declaration says UTF-8'
    },
    {
      name: 'UTF-8 BOM with ISO-8859-1 declaration',
      bom: Buffer.from([0xEF, 0xBB, 0xBF]), // UTF-8 BOM
      xml: '<?xml version="1.0" encoding="ISO-8859-1"?><invoice><id>CONFLICT-002</id></invoice>',
      issue: 'BOM indicates UTF-8 but declaration says ISO-8859-1'
    }
  ];

  for (const scenario of conflictTests) {
    const content = Buffer.concat([scenario.bom, Buffer.from(scenario.xml)]);

    const { result } = await PerformanceTracker.track(
      'bom-conflict',
      async () => {
        const invoice = new einvoice.EInvoice();
        try {
          await invoice.fromXmlString(content.toString('utf8'));
          return { parsed: true };
        } catch (error) {
          // A thrown value is not guaranteed to be an Error, so narrow before reading .message.
          const message = error instanceof Error ? error.message : String(error);
          const lowered = message.toLowerCase();
          return {
            parsed: false,
            error: message,
            isEncodingError: lowered.includes('encoding') || lowered.includes('bom')
          };
        }
      }
    );

    console.log(`${scenario.name}: ${!result.parsed ? '✓ (correctly rejected)' : '✗ (should have failed)'}`);
    console.log(` Issue: ${scenario.issue}`);
    if (!result.parsed) {
      console.log(` ${result.isEncodingError ? 'Encoding error detected' : 'Other error'}`);
    }
  }
});
// Builds BOM-prefixed UBL invoices with 1, 10 and 100 line items, parses each
// through EInvoice.fromXmlString under PerformanceTracker.track, and logs the
// item count, parse time and throughput.
tap.test('PARSE-04: Performance with BOM', async () => {
  const sizes = [1, 10, 100];

  for (const size of sizes) {
    // Generate invoice with many line items (numbered from 1)
    const lines: string[] = Array.from({ length: size }, (_unused, idx) => {
      const i = idx + 1;
      return `
<cac:InvoiceLine>
<cbc:ID>${i}</cbc:ID>
<cbc:InvoicedQuantity unitCode="EA">1</cbc:InvoicedQuantity>
<cbc:LineExtensionAmount currencyID="EUR">${i * 10}.00</cbc:LineExtensionAmount>
<cac:Item>
<cbc:Name>Product ${i}</cbc:Name>
</cac:Item>
</cac:InvoiceLine>`;
    });

    const xmlWithBom = Buffer.concat([
      Buffer.from([0xEF, 0xBB, 0xBF]), // UTF-8 BOM
      Buffer.from(`<?xml version="1.0" encoding="UTF-8"?>
<ubl:Invoice xmlns:ubl="urn:oasis:names:specification:ubl:schema:xsd:Invoice-2"
xmlns:cbc="urn:oasis:names:specification:ubl:schema:xsd:CommonBasicComponents-2"
xmlns:cac="urn:oasis:names:specification:ubl:schema:xsd:CommonAggregateComponents-2">
<cbc:ID>PERF-BOM-${size}</cbc:ID>
<cbc:IssueDate>2024-01-01</cbc:IssueDate>
<cac:AccountingSupplierParty>
<cac:Party>
<cac:PartyName>
<cbc:Name>Performance Test Supplier</cbc:Name>
</cac:PartyName>
<cac:PostalAddress>
<cbc:CityName>Berlin</cbc:CityName>
<cbc:PostalZone>10115</cbc:PostalZone>
<cac:Country>
<cbc:IdentificationCode>DE</cbc:IdentificationCode>
</cac:Country>
</cac:PostalAddress>
</cac:Party>
</cac:AccountingSupplierParty>
<cac:AccountingCustomerParty>
<cac:Party>
<cac:PartyName>
<cbc:Name>Performance Test Customer</cbc:Name>
</cac:PartyName>
<cac:PostalAddress>
<cbc:CityName>Munich</cbc:CityName>
<cbc:PostalZone>80331</cbc:PostalZone>
<cac:Country>
<cbc:IdentificationCode>DE</cbc:IdentificationCode>
</cac:Country>
</cac:PostalAddress>
</cac:Party>
</cac:AccountingCustomerParty>
${lines.join('')}
</ubl:Invoice>`)
    ]);

    const { result, metric } = await PerformanceTracker.track(
      `bom-performance-${size}`,
      async () => {
        const invoice = new einvoice.EInvoice();
        try {
          await invoice.fromXmlString(xmlWithBom.toString('utf8'));
          return {
            success: true,
            itemCount: invoice.items?.length ?? 0
          };
        } catch (error) {
          // A thrown value is not guaranteed to be an Error, so narrow before reading .message.
          return {
            success: false,
            error: error instanceof Error ? error.message : String(error)
          };
        }
      }
    );

    const xmlSize = xmlWithBom.length / 1024; // KB
    console.log(`Parse ${size} items with BOM (${xmlSize.toFixed(1)}KB): ${result.success ? '✓' : '✗'}`);
    if (result.success) {
      console.log(` Items parsed: ${result.itemCount}`);
      console.log(` Parse time: ${metric.duration.toFixed(2)}ms`);
      // duration is formatted as milliseconds above, hence the *1000 for KB per second
      console.log(` Speed: ${(xmlSize / metric.duration * 1000).toFixed(2)}KB/s`);
    }
  }
});
// Prints the BOM best-practice checklist and, when the tracker has stats for
// 'bom-processing', the average and maximum recorded durations.
tap.test('PARSE-04: BOM handling summary', async () => {
  const checklist = [
    '\nBOM Handling Best Practices:',
    '1. Always check for BOM at the beginning of XML files',
    '2. Remove BOM before parsing if present',
    '3. Handle conflicts between BOM and encoding declaration',
    '4. Support UTF-8, UTF-16, and UTF-32 BOMs',
    '5. Validate that BOM matches the actual encoding'
  ];
  for (const entry of checklist) {
    console.log(entry);
  }

  const stats = PerformanceTracker.getStats('bom-processing');
  if (!stats) {
    return;
  }
  console.log(`\nBOM Processing Performance:`);
  console.log(` Average: ${stats.avg.toFixed(2)}ms`);
  console.log(` Max: ${stats.max.toFixed(2)}ms`);
});
// Run the tests: start the tap runner once all PARSE-04 tests above are declared.
tap.start();