update
This commit is contained in:
@@ -1,532 +1,435 @@
|
||||
import { expect, tap } from '@git.zone/tstest/tapbundle';
|
||||
import { tap } from '@git.zone/tstest/tapbundle';
|
||||
import * as einvoice from '../../../ts/index.js';
|
||||
import * as plugins from '../../plugins.js';
|
||||
import { CorpusLoader } from '../../helpers/corpus.loader.js';
|
||||
import { PerformanceTracker } from '../../helpers/performance.tracker.js';
|
||||
|
||||
tap.test('PARSE-04: BOM Handling - Process Byte Order Marks correctly across encodings', async (t) => {
|
||||
const performanceTracker = new PerformanceTracker('PARSE-04');
|
||||
|
||||
await t.test('Standard BOM detection and removal', async () => {
|
||||
performanceTracker.startOperation('standard-bom');
|
||||
|
||||
const bomTypes = [
|
||||
{
|
||||
name: 'UTF-8 BOM',
|
||||
bom: Buffer.from([0xEF, 0xBB, 0xBF]),
|
||||
encoding: 'UTF-8',
|
||||
description: 'Most common BOM in XML files'
|
||||
},
|
||||
{
|
||||
name: 'UTF-16 LE BOM',
|
||||
bom: Buffer.from([0xFF, 0xFE]),
|
||||
encoding: 'UTF-16LE',
|
||||
description: 'Little-endian UTF-16'
|
||||
},
|
||||
{
|
||||
name: 'UTF-16 BE BOM',
|
||||
bom: Buffer.from([0xFE, 0xFF]),
|
||||
encoding: 'UTF-16BE',
|
||||
description: 'Big-endian UTF-16'
|
||||
},
|
||||
{
|
||||
name: 'UTF-32 LE BOM',
|
||||
bom: Buffer.from([0xFF, 0xFE, 0x00, 0x00]),
|
||||
encoding: 'UTF-32LE',
|
||||
description: 'Little-endian UTF-32'
|
||||
},
|
||||
{
|
||||
name: 'UTF-32 BE BOM',
|
||||
bom: Buffer.from([0x00, 0x00, 0xFE, 0xFF]),
|
||||
encoding: 'UTF-32BE',
|
||||
description: 'Big-endian UTF-32'
|
||||
}
|
||||
];
|
||||
|
||||
for (const bomType of bomTypes) {
|
||||
const startTime = performance.now();
|
||||
|
||||
// Create XML with BOM
|
||||
let xmlContent: Buffer;
|
||||
if (bomType.encoding.startsWith('UTF-16')) {
|
||||
xmlContent = Buffer.from(
|
||||
'<?xml version="1.0" encoding="UTF-16"?><invoice><id>TEST-BOM</id></invoice>',
|
||||
bomType.encoding.toLowerCase() as BufferEncoding
|
||||
);
|
||||
} else if (bomType.encoding.startsWith('UTF-32')) {
|
||||
// UTF-32 not directly supported by Node.js, simulate
|
||||
xmlContent = Buffer.from('<?xml version="1.0" encoding="UTF-32"?><invoice><id>TEST-BOM</id></invoice>');
|
||||
} else {
|
||||
xmlContent = Buffer.from('<?xml version="1.0" encoding="UTF-8"?><invoice><id>TEST-BOM</id></invoice>');
|
||||
}
|
||||
|
||||
const fullContent = Buffer.concat([bomType.bom, xmlContent]);
|
||||
|
||||
console.log(`${bomType.name}:`);
|
||||
console.log(` BOM: ${Array.from(bomType.bom).map(b => '0x' + b.toString(16).toUpperCase().padStart(2, '0')).join(' ')}`);
|
||||
console.log(` Encoding: ${bomType.encoding}`);
|
||||
console.log(` Description: ${bomType.description}`);
|
||||
console.log(` Total size: ${fullContent.length} bytes`);
|
||||
|
||||
// Test BOM removal
|
||||
const withoutBom = removeBOM(fullContent);
|
||||
if (withoutBom.length === fullContent.length - bomType.bom.length) {
|
||||
console.log(' ✓ BOM removed successfully');
|
||||
} else {
|
||||
console.log(' ✗ BOM removal failed');
|
||||
}
|
||||
|
||||
performanceTracker.recordMetric('bom-processing', performance.now() - startTime);
|
||||
/**
 * Strips a leading Unicode byte order mark (BOM) from `buffer`, if present.
 *
 * Recognised marks: UTF-8 (EF BB BF), UTF-32 LE (FF FE 00 00),
 * UTF-32 BE (00 00 FE FF), UTF-16 LE (FF FE) and UTF-16 BE (FE FF).
 *
 * @param buffer - Raw bytes that may start with a BOM.
 * @returns The bytes after the BOM (produced with `subarray`), or `buffer`
 *          itself when no known BOM is found.
 */
const removeBOM = (buffer: Buffer): Buffer => {
  // UTF-8 BOM
  if (buffer.length >= 3 && buffer[0] === 0xEF && buffer[1] === 0xBB && buffer[2] === 0xBF) {
    return buffer.subarray(3);
  }
  // UTF-32 LE BOM. Must be tested before UTF-16 LE: both start with FF FE,
  // so checking the 2-byte mark first would strip only half of this one.
  if (buffer.length >= 4 && buffer[0] === 0xFF && buffer[1] === 0xFE && buffer[2] === 0x00 && buffer[3] === 0x00) {
    return buffer.subarray(4);
  }
  // UTF-32 BE BOM
  if (buffer.length >= 4 && buffer[0] === 0x00 && buffer[1] === 0x00 && buffer[2] === 0xFE && buffer[3] === 0xFF) {
    return buffer.subarray(4);
  }
  // UTF-16 LE BOM
  if (buffer.length >= 2 && buffer[0] === 0xFF && buffer[1] === 0xFE) {
    return buffer.subarray(2);
  }
  // UTF-16 BE BOM
  if (buffer.length >= 2 && buffer[0] === 0xFE && buffer[1] === 0xFF) {
    return buffer.subarray(2);
  }
  return buffer;
};
|
||||
|
||||
tap.test('PARSE-04: Standard BOM detection and removal', async () => {
|
||||
const bomTypes = [
|
||||
{
|
||||
name: 'UTF-8 BOM',
|
||||
bom: Buffer.from([0xEF, 0xBB, 0xBF]),
|
||||
encoding: 'UTF-8',
|
||||
description: 'Most common BOM in XML files'
|
||||
},
|
||||
{
|
||||
name: 'UTF-16 LE BOM',
|
||||
bom: Buffer.from([0xFF, 0xFE]),
|
||||
encoding: 'UTF-16LE',
|
||||
description: 'Little-endian UTF-16'
|
||||
},
|
||||
{
|
||||
name: 'UTF-16 BE BOM',
|
||||
bom: Buffer.from([0xFE, 0xFF]),
|
||||
encoding: 'UTF-16BE',
|
||||
description: 'Big-endian UTF-16'
|
||||
},
|
||||
{
|
||||
name: 'UTF-32 LE BOM',
|
||||
bom: Buffer.from([0xFF, 0xFE, 0x00, 0x00]),
|
||||
encoding: 'UTF-32LE',
|
||||
description: 'Little-endian UTF-32'
|
||||
},
|
||||
{
|
||||
name: 'UTF-32 BE BOM',
|
||||
bom: Buffer.from([0x00, 0x00, 0xFE, 0xFF]),
|
||||
encoding: 'UTF-32BE',
|
||||
description: 'Big-endian UTF-32'
|
||||
}
|
||||
|
||||
performanceTracker.endOperation('standard-bom');
|
||||
});
|
||||
];
|
||||
|
||||
await t.test('BOM in different positions', async () => {
|
||||
performanceTracker.startOperation('bom-positions');
|
||||
|
||||
const positionTests = [
|
||||
{
|
||||
name: 'BOM at start (correct)',
|
||||
content: Buffer.concat([
|
||||
Buffer.from([0xEF, 0xBB, 0xBF]),
|
||||
Buffer.from('<?xml version="1.0"?><invoice><id>TEST-001</id></invoice>')
|
||||
]),
|
||||
valid: true
|
||||
},
|
||||
{
|
||||
name: 'BOM after XML declaration',
|
||||
content: Buffer.concat([
|
||||
Buffer.from('<?xml version="1.0"?>'),
|
||||
Buffer.from([0xEF, 0xBB, 0xBF]),
|
||||
Buffer.from('<invoice><id>TEST-002</id></invoice>')
|
||||
]),
|
||||
valid: false
|
||||
},
|
||||
{
|
||||
name: 'BOM in middle of document',
|
||||
content: Buffer.concat([
|
||||
Buffer.from('<?xml version="1.0"?><invoice>'),
|
||||
Buffer.from([0xEF, 0xBB, 0xBF]),
|
||||
Buffer.from('<id>TEST-003</id></invoice>')
|
||||
]),
|
||||
valid: false
|
||||
},
|
||||
{
|
||||
name: 'Multiple BOMs',
|
||||
content: Buffer.concat([
|
||||
Buffer.from([0xEF, 0xBB, 0xBF]),
|
||||
Buffer.from([0xEF, 0xBB, 0xBF]),
|
||||
Buffer.from('<?xml version="1.0"?><invoice><id>TEST-004</id></invoice>')
|
||||
]),
|
||||
valid: false
|
||||
},
|
||||
{
|
||||
name: 'BOM-like bytes in content',
|
||||
content: Buffer.concat([
|
||||
Buffer.from('<?xml version="1.0"?><invoice><data>'),
|
||||
Buffer.from([0xEF, 0xBB, 0xBF]), // These are actual data, not BOM
|
||||
Buffer.from('</data></invoice>')
|
||||
]),
|
||||
valid: true // Valid XML, but BOM-like bytes are data
|
||||
}
|
||||
];
|
||||
|
||||
for (const test of positionTests) {
|
||||
const startTime = performance.now();
|
||||
|
||||
console.log(`${test.name}:`);
|
||||
|
||||
// Check for BOM at start
|
||||
const hasValidBOM = test.content.length >= 3 &&
|
||||
test.content[0] === 0xEF &&
|
||||
test.content[1] === 0xBB &&
|
||||
test.content[2] === 0xBF &&
|
||||
test.content.indexOf('<?xml') === 3;
|
||||
|
||||
// Find all BOM occurrences
|
||||
const bomOccurrences = findBOMOccurrences(test.content);
|
||||
console.log(` BOM occurrences: ${bomOccurrences.length} at positions: ${bomOccurrences.join(', ')}`);
|
||||
|
||||
if (test.valid) {
|
||||
console.log(' ✓ Valid BOM usage');
|
||||
} else {
|
||||
console.log(' ✗ Invalid BOM usage');
|
||||
}
|
||||
|
||||
// Try parsing
|
||||
try {
|
||||
const invoice = new einvoice.EInvoice();
|
||||
if (invoice.fromBuffer) {
|
||||
await invoice.fromBuffer(test.content);
|
||||
console.log(' Parse result: Success');
|
||||
}
|
||||
} catch (error) {
|
||||
console.log(` Parse result: Failed - ${error.message}`);
|
||||
}
|
||||
|
||||
performanceTracker.recordMetric('bom-position', performance.now() - startTime);
|
||||
}
|
||||
|
||||
performanceTracker.endOperation('bom-positions');
|
||||
});
|
||||
|
||||
await t.test('BOM preservation in round-trip operations', async () => {
|
||||
performanceTracker.startOperation('bom-roundtrip');
|
||||
|
||||
const roundTripTests = [
|
||||
{
|
||||
name: 'Preserve UTF-8 BOM',
|
||||
input: Buffer.concat([
|
||||
Buffer.from([0xEF, 0xBB, 0xBF]),
|
||||
Buffer.from('<?xml version="1.0" encoding="UTF-8"?><invoice><id>RT-001</id></invoice>')
|
||||
]),
|
||||
preserveBOM: true
|
||||
},
|
||||
{
|
||||
name: 'Remove UTF-8 BOM',
|
||||
input: Buffer.concat([
|
||||
Buffer.from([0xEF, 0xBB, 0xBF]),
|
||||
Buffer.from('<?xml version="1.0" encoding="UTF-8"?><invoice><id>RT-002</id></invoice>')
|
||||
]),
|
||||
preserveBOM: false
|
||||
},
|
||||
{
|
||||
name: 'Add BOM to BOM-less file',
|
||||
input: Buffer.from('<?xml version="1.0" encoding="UTF-8"?><invoice><id>RT-003</id></invoice>'),
|
||||
preserveBOM: true,
|
||||
addBOM: true
|
||||
}
|
||||
];
|
||||
|
||||
for (const test of roundTripTests) {
|
||||
const startTime = performance.now();
|
||||
|
||||
console.log(`${test.name}:`);
|
||||
|
||||
const inputHasBOM = test.input.length >= 3 &&
|
||||
test.input[0] === 0xEF &&
|
||||
test.input[1] === 0xBB &&
|
||||
test.input[2] === 0xBF;
|
||||
|
||||
console.log(` Input has BOM: ${inputHasBOM}`);
|
||||
console.log(` Preserve BOM: ${test.preserveBOM}`);
|
||||
|
||||
// Simulate round-trip
|
||||
let processed = test.input;
|
||||
|
||||
if (!test.preserveBOM && inputHasBOM) {
|
||||
// Remove BOM
|
||||
processed = processed.slice(3);
|
||||
console.log(' Action: Removed BOM');
|
||||
} else if (test.addBOM && !inputHasBOM) {
|
||||
// Add BOM
|
||||
processed = Buffer.concat([Buffer.from([0xEF, 0xBB, 0xBF]), processed]);
|
||||
console.log(' Action: Added BOM');
|
||||
} else {
|
||||
console.log(' Action: No change');
|
||||
}
|
||||
|
||||
const outputHasBOM = processed.length >= 3 &&
|
||||
processed[0] === 0xEF &&
|
||||
processed[1] === 0xBB &&
|
||||
processed[2] === 0xBF;
|
||||
|
||||
console.log(` Output has BOM: ${outputHasBOM}`);
|
||||
|
||||
performanceTracker.recordMetric('bom-roundtrip', performance.now() - startTime);
|
||||
}
|
||||
|
||||
performanceTracker.endOperation('bom-roundtrip');
|
||||
});
|
||||
|
||||
await t.test('BOM conflicts with encoding declarations', async () => {
|
||||
performanceTracker.startOperation('bom-conflicts');
|
||||
|
||||
const conflictTests = [
|
||||
{
|
||||
name: 'UTF-8 BOM with UTF-8 declaration',
|
||||
bom: Buffer.from([0xEF, 0xBB, 0xBF]),
|
||||
declaration: 'UTF-8',
|
||||
conflict: false
|
||||
},
|
||||
{
|
||||
name: 'UTF-8 BOM with UTF-16 declaration',
|
||||
bom: Buffer.from([0xEF, 0xBB, 0xBF]),
|
||||
declaration: 'UTF-16',
|
||||
conflict: true
|
||||
},
|
||||
{
|
||||
name: 'UTF-16 LE BOM with UTF-8 declaration',
|
||||
bom: Buffer.from([0xFF, 0xFE]),
|
||||
declaration: 'UTF-8',
|
||||
conflict: true
|
||||
},
|
||||
{
|
||||
name: 'UTF-16 BE BOM with UTF-16 declaration',
|
||||
bom: Buffer.from([0xFE, 0xFF]),
|
||||
declaration: 'UTF-16',
|
||||
conflict: false
|
||||
},
|
||||
{
|
||||
name: 'No BOM with any declaration',
|
||||
bom: Buffer.from([]),
|
||||
declaration: 'UTF-8',
|
||||
conflict: false
|
||||
}
|
||||
];
|
||||
|
||||
for (const test of conflictTests) {
|
||||
const startTime = performance.now();
|
||||
|
||||
const xml = `<?xml version="1.0" encoding="${test.declaration}"?><invoice><id>CONFLICT-TEST</id></invoice>`;
|
||||
const fullContent = Buffer.concat([test.bom, Buffer.from(xml)]);
|
||||
|
||||
console.log(`${test.name}:`);
|
||||
console.log(` BOM type: ${test.bom.length > 0 ? detectBOMType(test.bom) : 'None'}`);
|
||||
console.log(` Declaration: ${test.declaration}`);
|
||||
console.log(` Conflict: ${test.conflict ? '✗ Yes' : '✓ No'}`);
|
||||
|
||||
if (test.conflict) {
|
||||
console.log(' Resolution: BOM takes precedence over declaration');
|
||||
}
|
||||
|
||||
performanceTracker.recordMetric('bom-conflict', performance.now() - startTime);
|
||||
}
|
||||
|
||||
performanceTracker.endOperation('bom-conflicts');
|
||||
});
|
||||
|
||||
await t.test('BOM handling in corpus files', async () => {
|
||||
performanceTracker.startOperation('corpus-bom');
|
||||
|
||||
const corpusLoader = new CorpusLoader();
|
||||
const files = await corpusLoader.getFiles(/\.(xml|cii|ubl)$/);
|
||||
|
||||
console.log(`\nAnalyzing BOM usage in ${files.length} corpus files...`);
|
||||
|
||||
const bomStats = {
|
||||
total: 0,
|
||||
withBOM: 0,
|
||||
utf8BOM: 0,
|
||||
utf16BOM: 0,
|
||||
otherBOM: 0,
|
||||
multipleBOM: 0,
|
||||
invalidPosition: 0
|
||||
};
|
||||
|
||||
const sampleSize = Math.min(100, files.length);
|
||||
const sampledFiles = files.slice(0, sampleSize);
|
||||
|
||||
for (const file of sampledFiles) {
|
||||
bomStats.total++;
|
||||
|
||||
try {
|
||||
const content = await plugins.fs.readFile(file.path);
|
||||
for (const bomType of bomTypes) {
|
||||
const { result, metric } = await PerformanceTracker.track(
|
||||
'bom-processing',
|
||||
async () => {
|
||||
// Create XML with BOM
|
||||
let xmlContent: Buffer;
|
||||
let encodingSupported = true;
|
||||
|
||||
// Check for BOM
|
||||
if (content.length >= 3) {
|
||||
if (content[0] === 0xEF && content[1] === 0xBB && content[2] === 0xBF) {
|
||||
bomStats.withBOM++;
|
||||
bomStats.utf8BOM++;
|
||||
} else if (content.length >= 2) {
|
||||
if ((content[0] === 0xFF && content[1] === 0xFE) ||
|
||||
(content[0] === 0xFE && content[1] === 0xFF)) {
|
||||
bomStats.withBOM++;
|
||||
bomStats.utf16BOM++;
|
||||
try {
|
||||
if (bomType.encoding.startsWith('UTF-16')) {
|
||||
// Node.js doesn't support UTF-16 BE directly
|
||||
if (bomType.encoding === 'UTF-16BE') {
|
||||
// Create UTF-8 content instead for testing
|
||||
xmlContent = Buffer.from('<?xml version="1.0" encoding="UTF-16"?><invoice><id>TEST-BOM</id></invoice>');
|
||||
encodingSupported = false;
|
||||
} else {
|
||||
const nodeEncoding = bomType.encoding.replace('-', '').toLowerCase();
|
||||
xmlContent = Buffer.from(
|
||||
'<?xml version="1.0" encoding="UTF-16"?><invoice><id>TEST-BOM</id></invoice>',
|
||||
nodeEncoding as BufferEncoding
|
||||
);
|
||||
}
|
||||
} else if (bomType.encoding.startsWith('UTF-32')) {
|
||||
// UTF-32 not directly supported by Node.js, simulate
|
||||
xmlContent = Buffer.from('<?xml version="1.0" encoding="UTF-32"?><invoice><id>TEST-BOM</id></invoice>');
|
||||
encodingSupported = false;
|
||||
} else {
|
||||
xmlContent = Buffer.from('<?xml version="1.0" encoding="UTF-8"?><invoice><id>TEST-BOM</id></invoice>');
|
||||
}
|
||||
} catch (e) {
|
||||
// Fallback to UTF-8 if encoding not supported
|
||||
xmlContent = Buffer.from('<?xml version="1.0" encoding="UTF-8"?><invoice><id>TEST-BOM</id></invoice>');
|
||||
encodingSupported = false;
|
||||
}
|
||||
|
||||
// Check for multiple BOMs or BOMs in wrong position
|
||||
const bomOccurrences = findBOMOccurrences(content);
|
||||
if (bomOccurrences.length > 1) {
|
||||
bomStats.multipleBOM++;
|
||||
}
|
||||
if (bomOccurrences.length > 0 && bomOccurrences[0] !== 0) {
|
||||
bomStats.invalidPosition++;
|
||||
}
|
||||
} catch (error) {
|
||||
// Skip files that can't be read
|
||||
const fullContent = Buffer.concat([bomType.bom, xmlContent]);
|
||||
|
||||
// Test BOM removal
|
||||
const withoutBom = removeBOM(fullContent);
|
||||
const bomRemoved = withoutBom.length === fullContent.length - bomType.bom.length;
|
||||
|
||||
return {
|
||||
bomBytes: Array.from(bomType.bom).map(b => '0x' + b.toString(16).toUpperCase().padStart(2, '0')).join(' '),
|
||||
totalSize: fullContent.length,
|
||||
bomRemoved,
|
||||
encodingSupported
|
||||
};
|
||||
}
|
||||
}
|
||||
);
|
||||
|
||||
console.log('\nBOM Statistics:');
|
||||
console.log(`Total files analyzed: ${bomStats.total}`);
|
||||
console.log(`Files with BOM: ${bomStats.withBOM} (${(bomStats.withBOM/bomStats.total*100).toFixed(1)}%)`);
|
||||
console.log(` UTF-8 BOM: ${bomStats.utf8BOM}`);
|
||||
console.log(` UTF-16 BOM: ${bomStats.utf16BOM}`);
|
||||
console.log(` Other BOM: ${bomStats.otherBOM}`);
|
||||
console.log(`Multiple BOMs: ${bomStats.multipleBOM}`);
|
||||
console.log(`Invalid BOM position: ${bomStats.invalidPosition}`);
|
||||
|
||||
performanceTracker.endOperation('corpus-bom');
|
||||
});
|
||||
|
||||
await t.test('BOM security implications', async () => {
|
||||
performanceTracker.startOperation('bom-security');
|
||||
|
||||
const securityTests = [
|
||||
{
|
||||
name: 'BOM hiding malicious content',
|
||||
content: Buffer.concat([
|
||||
Buffer.from([0xEF, 0xBB, 0xBF]),
|
||||
Buffer.from('<?xml version="1.0"?><!-- '),
|
||||
Buffer.from([0xEF, 0xBB, 0xBF]), // Hidden BOM in comment
|
||||
Buffer.from(' --><invoice><script>alert("XSS")</script></invoice>')
|
||||
]),
|
||||
risk: 'BOM bytes could be used to bypass filters'
|
||||
},
|
||||
{
|
||||
name: 'Zero-width BOM characters',
|
||||
content: Buffer.from('<?xml version="1.0"?><invoice>\uFEFF<id>TEST</id></invoice>'),
|
||||
risk: 'Invisible characters could hide malicious content'
|
||||
},
|
||||
{
|
||||
name: 'BOM-based encoding confusion',
|
||||
content: Buffer.concat([
|
||||
Buffer.from([0xFF, 0xFE]), // UTF-16 LE BOM
|
||||
Buffer.from('<?xml version="1.0" encoding="UTF-8"?><invoice><id>TEST</id></invoice>')
|
||||
]),
|
||||
risk: 'Encoding mismatch could lead to parsing errors'
|
||||
}
|
||||
];
|
||||
|
||||
for (const test of securityTests) {
|
||||
const startTime = performance.now();
|
||||
|
||||
console.log(`${test.name}:`);
|
||||
console.log(` Risk: ${test.risk}`);
|
||||
|
||||
// Scan for suspicious patterns
|
||||
const bomCount = findBOMOccurrences(test.content).length;
|
||||
const hasMultipleBOMs = bomCount > 1;
|
||||
const hasInvisibleChars = test.content.includes(0xFEFF) ||
|
||||
test.content.toString().includes('\uFEFF');
|
||||
|
||||
console.log(` BOM count: ${bomCount}`);
|
||||
console.log(` Multiple BOMs: ${hasMultipleBOMs ? '✗ Yes' : '✓ No'}`);
|
||||
console.log(` Invisible chars: ${hasInvisibleChars ? '✗ Yes' : '✓ No'}`);
|
||||
|
||||
if (hasMultipleBOMs || hasInvisibleChars) {
|
||||
console.log(' ⚠️ Security risk detected');
|
||||
}
|
||||
|
||||
performanceTracker.recordMetric('bom-security', performance.now() - startTime);
|
||||
}
|
||||
|
||||
performanceTracker.endOperation('bom-security');
|
||||
});
|
||||
|
||||
await t.test('BOM handling performance', async () => {
|
||||
performanceTracker.startOperation('bom-performance');
|
||||
|
||||
const sizes = [1000, 10000, 100000]; // 1KB, 10KB, 100KB
|
||||
|
||||
for (const size of sizes) {
|
||||
// Generate content with BOM
|
||||
const bom = Buffer.from([0xEF, 0xBB, 0xBF]);
|
||||
const xmlContent = Buffer.from(`<?xml version="1.0"?><invoice><data>${'x'.repeat(size)}</data></invoice>`);
|
||||
const withBOM = Buffer.concat([bom, xmlContent]);
|
||||
|
||||
// Measure BOM detection time
|
||||
const detectStart = performance.now();
|
||||
for (let i = 0; i < 1000; i++) {
|
||||
const hasBOM = withBOM.length >= 3 &&
|
||||
withBOM[0] === 0xEF &&
|
||||
withBOM[1] === 0xBB &&
|
||||
withBOM[2] === 0xBF;
|
||||
}
|
||||
const detectTime = performance.now() - detectStart;
|
||||
|
||||
// Measure BOM removal time
|
||||
const removeStart = performance.now();
|
||||
for (let i = 0; i < 1000; i++) {
|
||||
const cleaned = removeBOM(withBOM);
|
||||
}
|
||||
const removeTime = performance.now() - removeStart;
|
||||
|
||||
console.log(`File size ${size} bytes:`);
|
||||
console.log(` BOM detection: ${(detectTime/1000).toFixed(3)}ms per operation`);
|
||||
console.log(` BOM removal: ${(removeTime/1000).toFixed(3)}ms per operation`);
|
||||
|
||||
performanceTracker.recordMetric(`bom-perf-${size}`, detectTime + removeTime);
|
||||
}
|
||||
|
||||
performanceTracker.endOperation('bom-performance');
|
||||
});
|
||||
|
||||
// Helper functions
|
||||
/**
 * Returns `buffer` without its leading byte order mark.
 *
 * Handles the UTF-8 mark (EF BB BF), the two UTF-32 marks (FF FE 00 00 and
 * 00 00 FE FF) and the two UTF-16 marks (FF FE and FE FF).
 *
 * @param buffer - Bytes that may begin with a BOM.
 * @returns The bytes following the BOM, or `buffer` unchanged if none matches.
 */
function removeBOM(buffer: Buffer): Buffer {
  // True when the buffer begins with exactly the given byte sequence.
  const startsWith = (...signature: number[]): boolean =>
    buffer.length >= signature.length &&
    signature.every((byte, index) => buffer[index] === byte);

  if (startsWith(0xEF, 0xBB, 0xBF)) {
    return buffer.subarray(3);
  }
  // The 4-byte UTF-32 marks are tested before the 2-byte UTF-16 marks:
  // the UTF-32 LE mark begins with FF FE, which is also the UTF-16 LE mark.
  if (startsWith(0xFF, 0xFE, 0x00, 0x00) || startsWith(0x00, 0x00, 0xFE, 0xFF)) {
    return buffer.subarray(4);
  }
  if (startsWith(0xFF, 0xFE) || startsWith(0xFE, 0xFF)) {
    return buffer.subarray(2);
  }
  return buffer;
}
|
||||
|
||||
/**
 * Locates every non-overlapping UTF-8 BOM byte sequence (EF BB BF) in a buffer.
 *
 * @param buffer - Bytes to scan.
 * @returns Byte offsets of each match, in ascending order; empty when none.
 */
function findBOMOccurrences(buffer: Buffer): number[] {
  const marker = Buffer.from([0xEF, 0xBB, 0xBF]);
  const offsets: number[] = [];

  let hit = buffer.indexOf(marker);
  while (hit !== -1) {
    offsets.push(hit);
    // Resume right after the matched bytes so matches never overlap.
    hit = buffer.indexOf(marker, hit + marker.length);
  }

  return offsets;
}
|
||||
|
||||
/**
 * Names the encoding whose byte order mark `bom` starts with.
 *
 * @param bom - Bytes to classify (only the leading 2–4 bytes are inspected).
 * @returns 'UTF-8', 'UTF-32LE', 'UTF-16LE', 'UTF-16BE' or 'UTF-32BE';
 *          'Unknown' when no signature matches.
 */
function detectBOMType(bom: Buffer): string {
  // Order matters: the UTF-32LE signature must precede UTF-16LE because
  // the former begins with the latter's two bytes.
  const signatures: ReadonlyArray<readonly [string, readonly number[]]> = [
    ['UTF-8', [0xEF, 0xBB, 0xBF]],
    ['UTF-32LE', [0xFF, 0xFE, 0x00, 0x00]],
    ['UTF-16LE', [0xFF, 0xFE]],
    ['UTF-16BE', [0xFE, 0xFF]],
    ['UTF-32BE', [0x00, 0x00, 0xFE, 0xFF]],
  ];

  for (const [label, bytes] of signatures) {
    const matches =
      bom.length >= bytes.length && bytes.every((byte, index) => bom[index] === byte);
    if (matches) {
      return label;
    }
  }
  return 'Unknown';
}
|
||||
|
||||
// Performance summary
|
||||
console.log('\n' + performanceTracker.getSummary());
|
||||
|
||||
// BOM handling best practices
|
||||
console.log('\nBOM Handling Best Practices:');
|
||||
console.log('1. Always check for BOM before parsing XML');
|
||||
console.log('2. Remove BOM after detection to avoid parsing issues');
|
||||
console.log('3. Preserve BOM information for round-trip operations if needed');
|
||||
console.log('4. Handle conflicts between BOM and encoding declarations');
|
||||
console.log('5. Be aware of security implications of multiple/hidden BOMs');
|
||||
console.log('6. Test with files both with and without BOM');
|
||||
console.log('7. Consider BOM handling in performance-critical paths');
|
||||
console.log('8. Support all common BOM types (UTF-8, UTF-16, UTF-32)');
|
||||
});
|
||||
|
||||
// Parses documents that carry a UTF-8 BOM at the start, after the XML
// declaration, not at all, or twice in a row, and logs whether each parsed.
// NOTE: the `valid` flag is only printed next to the outcome; nothing here
// asserts that the parser's behaviour matches it.
tap.test('PARSE-04: BOM in different positions', async () => {
  const positionTests = [
    {
      name: 'BOM at start (correct)',
      content: Buffer.concat([
        Buffer.from([0xEF, 0xBB, 0xBF]),
        Buffer.from('<?xml version="1.0"?><invoice><id>TEST-001</id></invoice>')
      ]),
      valid: true
    },
    {
      name: 'BOM after XML declaration',
      content: Buffer.concat([
        Buffer.from('<?xml version="1.0"?>'),
        Buffer.from([0xEF, 0xBB, 0xBF]),
        Buffer.from('<invoice><id>TEST-002</id></invoice>')
      ]),
      valid: false
    },
    {
      name: 'No BOM',
      content: Buffer.from('<?xml version="1.0"?><invoice><id>TEST-003</id></invoice>'),
      valid: true
    },
    {
      name: 'Multiple BOMs',
      content: Buffer.concat([
        Buffer.from([0xEF, 0xBB, 0xBF]),
        Buffer.from([0xEF, 0xBB, 0xBF]),
        Buffer.from('<?xml version="1.0"?><invoice><id>TEST-004</id></invoice>')
      ]),
      valid: false
    }
  ];

  for (const test of positionTests) {
    const { result } = await PerformanceTracker.track(
      'bom-position',
      async () => {
        const invoice = new einvoice.EInvoice();

        try {
          // The bytes are decoded as UTF-8 first; the parser receives a string.
          await invoice.fromXmlString(test.content.toString('utf8'));
          return { parsed: true, error: null };
        } catch (error: unknown) {
          // A thrown value is not guaranteed to be an Error instance.
          const message = error instanceof Error ? error.message : String(error);
          return { parsed: false, error: message };
        }
      }
    );

    console.log(`${test.name}: ${result.parsed ? '✓' : '✗'}`);
    console.log(`  Expected ${test.valid ? 'valid' : 'invalid'}, got ${result.parsed ? 'parsed' : 'error'}`);
    if (!result.parsed) {
      console.log(`  Error: ${result.error}`);
    }
  }
});
|
||||
|
||||
// Parses a UBL invoice and a CrossIndustryInvoice document, each prefixed
// with a UTF-8 BOM, and logs the resulting invoice id and detected format
// (or the error message when parsing throws).
tap.test('PARSE-04: Real invoice files with BOM', async () => {
  // Test with actual invoice formats that might have BOM
  const realWorldTests = [
    {
      name: 'UBL with UTF-8 BOM',
      xml: Buffer.concat([
        Buffer.from([0xEF, 0xBB, 0xBF]), // UTF-8 BOM
        Buffer.from(`<?xml version="1.0" encoding="UTF-8"?>
<ubl:Invoice xmlns:ubl="urn:oasis:names:specification:ubl:schema:xsd:Invoice-2"
  xmlns:cbc="urn:oasis:names:specification:ubl:schema:xsd:CommonBasicComponents-2"
  xmlns:cac="urn:oasis:names:specification:ubl:schema:xsd:CommonAggregateComponents-2">
  <cbc:ID>BOM-UBL-001</cbc:ID>
  <cbc:IssueDate>2024-01-01</cbc:IssueDate>
  <cac:AccountingSupplierParty>
    <cac:Party>
      <cac:PartyName>
        <cbc:Name>Test Supplier</cbc:Name>
      </cac:PartyName>
      <cac:PostalAddress>
        <cbc:CityName>Berlin</cbc:CityName>
        <cbc:PostalZone>10115</cbc:PostalZone>
        <cac:Country>
          <cbc:IdentificationCode>DE</cbc:IdentificationCode>
        </cac:Country>
      </cac:PostalAddress>
    </cac:Party>
  </cac:AccountingSupplierParty>
  <cac:AccountingCustomerParty>
    <cac:Party>
      <cac:PartyName>
        <cbc:Name>Test Customer</cbc:Name>
      </cac:PartyName>
      <cac:PostalAddress>
        <cbc:CityName>Munich</cbc:CityName>
        <cbc:PostalZone>80331</cbc:PostalZone>
        <cac:Country>
          <cbc:IdentificationCode>DE</cbc:IdentificationCode>
        </cac:Country>
      </cac:PostalAddress>
    </cac:Party>
  </cac:AccountingCustomerParty>
  <cac:InvoiceLine>
    <cbc:ID>1</cbc:ID>
    <cbc:InvoicedQuantity unitCode="EA">1</cbc:InvoicedQuantity>
    <cbc:LineExtensionAmount currencyID="EUR">100.00</cbc:LineExtensionAmount>
    <cac:Item>
      <cbc:Name>Test Product</cbc:Name>
    </cac:Item>
    <cac:Price>
      <cbc:PriceAmount currencyID="EUR">100.00</cbc:PriceAmount>
    </cac:Price>
  </cac:InvoiceLine>
</ubl:Invoice>`)
      ])
    },
    {
      name: 'ZUGFeRD with UTF-8 BOM',
      xml: Buffer.concat([
        Buffer.from([0xEF, 0xBB, 0xBF]), // UTF-8 BOM
        Buffer.from(`<?xml version="1.0" encoding="UTF-8"?>
<rsm:CrossIndustryInvoice xmlns:rsm="urn:un:unece:uncefact:data:standard:CrossIndustryInvoice:100"
  xmlns:ram="urn:un:unece:uncefact:data:standard:ReusableAggregateBusinessInformationEntity:100">
  <rsm:ExchangedDocument>
    <ram:ID>BOM-ZUGFERD-001</ram:ID>
  </rsm:ExchangedDocument>
</rsm:CrossIndustryInvoice>`)
      ])
    }
  ];

  for (const test of realWorldTests) {
    const { result } = await PerformanceTracker.track(
      'real-world-bom',
      async () => {
        const invoice = new einvoice.EInvoice();

        try {
          // The BOM-prefixed bytes are decoded as UTF-8 before parsing.
          await invoice.fromXmlString(test.xml.toString('utf8'));
          return {
            success: true,
            id: invoice.id,
            format: invoice.getFormat()
          };
        } catch (error: unknown) {
          // A thrown value is not guaranteed to be an Error instance.
          return {
            success: false,
            error: error instanceof Error ? error.message : String(error)
          };
        }
      }
    );

    console.log(`${test.name}: ${result.success ? '✓' : '✗'}`);
    if (result.success) {
      console.log(`  Invoice ID: ${result.id}`);
      console.log(`  Format: ${einvoice.InvoiceFormat[result.format]}`);
    } else {
      console.log(`  Error: ${result.error}`);
    }
  }
});
|
||||
|
||||
// Builds documents whose BOM disagrees with the encoding named in the XML
// declaration, parses them, and logs whether the parser rejected them.
// NOTE: the outcome is only logged; a successful parse does not fail the test.
tap.test('PARSE-04: BOM encoding conflicts', async () => {
  const conflictTests = [
    {
      name: 'UTF-16 BOM with UTF-8 declaration',
      bom: Buffer.from([0xFF, 0xFE]), // UTF-16 LE BOM
      xml: '<?xml version="1.0" encoding="UTF-8"?><invoice><id>CONFLICT-001</id></invoice>',
      issue: 'BOM indicates UTF-16 but declaration says UTF-8'
    },
    {
      name: 'UTF-8 BOM with ISO-8859-1 declaration',
      bom: Buffer.from([0xEF, 0xBB, 0xBF]), // UTF-8 BOM
      xml: '<?xml version="1.0" encoding="ISO-8859-1"?><invoice><id>CONFLICT-002</id></invoice>',
      issue: 'BOM indicates UTF-8 but declaration says ISO-8859-1'
    }
  ];

  for (const test of conflictTests) {
    const content = Buffer.concat([test.bom, Buffer.from(test.xml)]);

    const { result } = await PerformanceTracker.track(
      'bom-conflict',
      async () => {
        const invoice = new einvoice.EInvoice();

        try {
          // The combined bytes are always decoded as UTF-8 here, whatever
          // the BOM says, so the parser sees a string rather than raw bytes.
          await invoice.fromXmlString(content.toString('utf8'));
          return { parsed: true };
        } catch (error: unknown) {
          // A thrown value is not guaranteed to be an Error instance.
          const message = error instanceof Error ? error.message : String(error);
          const lowered = message.toLowerCase();
          return {
            parsed: false,
            error: message,
            isEncodingError: lowered.includes('encoding') || lowered.includes('bom')
          };
        }
      }
    );

    console.log(`${test.name}: ${!result.parsed ? '✓ (correctly rejected)' : '✗ (should have failed)'}`);
    console.log(`  Issue: ${test.issue}`);
    if (!result.parsed) {
      console.log(`  ${result.isEncodingError ? 'Encoding error detected' : 'Other error'}`);
    }
  }
});
|
||||
|
||||
// Generates UBL invoices with 1, 10 and 100 line items, prefixes each with a
// UTF-8 BOM, parses them, and logs item count, parse time and throughput.
tap.test('PARSE-04: Performance with BOM', async () => {
  const sizes = [1, 10, 100];

  for (const size of sizes) {
    // Generate invoice with many line items
    const lines: string[] = [];
    for (let i = 1; i <= size; i++) {
      lines.push(`
  <cac:InvoiceLine>
    <cbc:ID>${i}</cbc:ID>
    <cbc:InvoicedQuantity unitCode="EA">1</cbc:InvoicedQuantity>
    <cbc:LineExtensionAmount currencyID="EUR">${i * 10}.00</cbc:LineExtensionAmount>
    <cac:Item>
      <cbc:Name>Product ${i}</cbc:Name>
    </cac:Item>
  </cac:InvoiceLine>`);
    }

    const xmlWithBom = Buffer.concat([
      Buffer.from([0xEF, 0xBB, 0xBF]), // UTF-8 BOM
      Buffer.from(`<?xml version="1.0" encoding="UTF-8"?>
<ubl:Invoice xmlns:ubl="urn:oasis:names:specification:ubl:schema:xsd:Invoice-2"
  xmlns:cbc="urn:oasis:names:specification:ubl:schema:xsd:CommonBasicComponents-2"
  xmlns:cac="urn:oasis:names:specification:ubl:schema:xsd:CommonAggregateComponents-2">
  <cbc:ID>PERF-BOM-${size}</cbc:ID>
  <cbc:IssueDate>2024-01-01</cbc:IssueDate>
  <cac:AccountingSupplierParty>
    <cac:Party>
      <cac:PartyName>
        <cbc:Name>Performance Test Supplier</cbc:Name>
      </cac:PartyName>
      <cac:PostalAddress>
        <cbc:CityName>Berlin</cbc:CityName>
        <cbc:PostalZone>10115</cbc:PostalZone>
        <cac:Country>
          <cbc:IdentificationCode>DE</cbc:IdentificationCode>
        </cac:Country>
      </cac:PostalAddress>
    </cac:Party>
  </cac:AccountingSupplierParty>
  <cac:AccountingCustomerParty>
    <cac:Party>
      <cac:PartyName>
        <cbc:Name>Performance Test Customer</cbc:Name>
      </cac:PartyName>
      <cac:PostalAddress>
        <cbc:CityName>Munich</cbc:CityName>
        <cbc:PostalZone>80331</cbc:PostalZone>
        <cac:Country>
          <cbc:IdentificationCode>DE</cbc:IdentificationCode>
        </cac:Country>
      </cac:PostalAddress>
    </cac:Party>
  </cac:AccountingCustomerParty>
  ${lines.join('')}
</ubl:Invoice>`)
    ]);

    const { result, metric } = await PerformanceTracker.track(
      `bom-performance-${size}`,
      async () => {
        const invoice = new einvoice.EInvoice();

        try {
          await invoice.fromXmlString(xmlWithBom.toString('utf8'));
          return {
            success: true,
            itemCount: invoice.items?.length ?? 0
          };
        } catch (error: unknown) {
          // A thrown value is not guaranteed to be an Error instance.
          return {
            success: false,
            error: error instanceof Error ? error.message : String(error)
          };
        }
      }
    );

    const xmlSize = xmlWithBom.length / 1024; // KB
    console.log(`Parse ${size} items with BOM (${xmlSize.toFixed(1)}KB): ${result.success ? '✓' : '✗'}`);
    if (result.success) {
      // A measured duration of 0 ms would make the throughput division
      // produce Infinity, so report it as not available instead.
      const speed = metric.duration > 0
        ? (xmlSize / metric.duration * 1000).toFixed(2)
        : 'n/a ';
      console.log(`  Items parsed: ${result.itemCount}`);
      console.log(`  Parse time: ${metric.duration.toFixed(2)}ms`);
      console.log(`  Speed: ${speed}KB/s`);
    }
  }
});
|
||||
|
||||
// Prints the BOM best-practice checklist, followed by the average and
// maximum timings recorded under 'bom-processing' when any are available.
tap.test('PARSE-04: BOM handling summary', async () => {
  const practices = [
    '1. Always check for BOM at the beginning of XML files',
    '2. Remove BOM before parsing if present',
    '3. Handle conflicts between BOM and encoding declaration',
    '4. Support UTF-8, UTF-16, and UTF-32 BOMs',
    '5. Validate that BOM matches the actual encoding'
  ];

  console.log('\nBOM Handling Best Practices:');
  for (const practice of practices) {
    console.log(practice);
  }

  const stats = PerformanceTracker.getStats('bom-processing');
  if (!stats) {
    // No timings were recorded under this key; nothing more to print.
    return;
  }

  console.log(`\nBOM Processing Performance:`);
  console.log(`  Average: ${stats.avg.toFixed(2)}ms`);
  console.log(`  Max: ${stats.max.toFixed(2)}ms`);
});
|
||||
|
||||
// Run the tests
|
||||
tap.start();
|
Reference in New Issue
Block a user