import { expect, tap } from '@git.zone/tstest/tapbundle';
import * as einvoice from '../../../ts/index.js';
import * as plugins from '../../plugins.js';
import { CorpusLoader } from '../../helpers/corpus.loader.js';
import { PerformanceTracker } from '../../helpers/performance.tracker.js';

/**
 * PARSE-04: exercises Byte Order Mark (BOM) detection, removal, positioning,
 * round-tripping, declaration conflicts, corpus statistics, security checks
 * and timing. Results are reported through console output and the
 * PerformanceTracker; no sub-test asserts.
 *
 * NOTE(review): several fixtures below are bare ids or empty strings although
 * the case names describe XML documents — the XML markup appears to have been
 * lost from these literals. They are reproduced as found; confirm against the
 * original file.
 */
tap.test('PARSE-04: BOM Handling - Process Byte Order Marks correctly across encodings', async (t) => {
  const performanceTracker = new PerformanceTracker('PARSE-04');

  await t.test('Standard BOM detection and removal', async () => {
    performanceTracker.startOperation('standard-bom');

    const bomTypes = [
      {
        name: 'UTF-8 BOM',
        bom: Buffer.from([0xEF, 0xBB, 0xBF]),
        encoding: 'UTF-8',
        description: 'Most common BOM in XML files'
      },
      {
        name: 'UTF-16 LE BOM',
        bom: Buffer.from([0xFF, 0xFE]),
        encoding: 'UTF-16LE',
        description: 'Little-endian UTF-16'
      },
      {
        name: 'UTF-16 BE BOM',
        bom: Buffer.from([0xFE, 0xFF]),
        encoding: 'UTF-16BE',
        description: 'Big-endian UTF-16'
      },
      {
        name: 'UTF-32 LE BOM',
        bom: Buffer.from([0xFF, 0xFE, 0x00, 0x00]),
        encoding: 'UTF-32LE',
        description: 'Little-endian UTF-32'
      },
      {
        name: 'UTF-32 BE BOM',
        bom: Buffer.from([0x00, 0x00, 0xFE, 0xFF]),
        encoding: 'UTF-32BE',
        description: 'Big-endian UTF-32'
      }
    ];

    for (const bomType of bomTypes) {
      const startTime = performance.now();

      // Create XML with BOM
      let xmlContent: Buffer;
      if (bomType.encoding.startsWith('UTF-16')) {
        // Node.js has no 'utf-16be' BufferEncoding (Buffer.from would throw),
        // so always encode little-endian and byte-swap for the BE case.
        xmlContent = Buffer.from('TEST-BOM', 'utf16le');
        if (bomType.encoding === 'UTF-16BE') {
          xmlContent.swap16();
        }
      } else {
        // UTF-8, and UTF-32 as well: UTF-32 is not directly supported by
        // Node.js, so its payload is only simulated with default-encoded bytes.
        xmlContent = Buffer.from('TEST-BOM');
      }

      const fullContent = Buffer.concat([bomType.bom, xmlContent]);

      console.log(`${bomType.name}:`);
      console.log(` BOM: ${Array.from(bomType.bom).map(b => '0x' + b.toString(16).toUpperCase().padStart(2, '0')).join(' ')}`);
      console.log(` Encoding: ${bomType.encoding}`);
      console.log(` Description: ${bomType.description}`);
      console.log(` Total size: ${fullContent.length} bytes`);

      // Test BOM removal: exactly the BOM's own byte count must disappear.
      const withoutBom = removeBOM(fullContent);
      if (withoutBom.length === fullContent.length - bomType.bom.length) {
        console.log(' ✓ BOM removed successfully');
      } else {
        console.log(' ✗ BOM removal failed');
      }

      performanceTracker.recordMetric('bom-processing', performance.now() - startTime);
    }

    performanceTracker.endOperation('standard-bom');
  });

  await t.test('BOM in different positions', async () => {
    performanceTracker.startOperation('bom-positions');

    const positionTests = [
      {
        name: 'BOM at start (correct)',
        content: Buffer.concat([
          Buffer.from([0xEF, 0xBB, 0xBF]),
          Buffer.from('TEST-001')
        ]),
        valid: true
      },
      {
        name: 'BOM after XML declaration',
        content: Buffer.concat([
          Buffer.from(''),
          Buffer.from([0xEF, 0xBB, 0xBF]),
          Buffer.from('TEST-002')
        ]),
        valid: false
      },
      {
        name: 'BOM in middle of document',
        content: Buffer.concat([
          Buffer.from(''),
          Buffer.from([0xEF, 0xBB, 0xBF]),
          Buffer.from('TEST-003')
        ]),
        valid: false
      },
      {
        name: 'Multiple BOMs',
        content: Buffer.concat([
          Buffer.from([0xEF, 0xBB, 0xBF]),
          Buffer.from([0xEF, 0xBB, 0xBF]),
          Buffer.from('TEST-004')
        ]),
        valid: false
      },
      {
        name: 'BOM-like bytes in content',
        content: Buffer.concat([
          Buffer.from(''),
          Buffer.from([0xEF, 0xBB, 0xBF]), // These are actual data, not BOM
          Buffer.from('')
        ]),
        valid: true // Valid XML, but BOM-like bytes are data
      }
    ];

    for (const test of positionTests) {
      const startTime = performance.now();

      console.log(`${test.name}:`);

      // Check for BOM at start
      // NOTE(review): SOURCE is truncated from the indexOf(...) argument up to
      // the opening brace of the next sub-test. The '<?xml' needle, the
      // "=== 3" comparison, the two log lines, the 'bom-position' metric key
      // and the title of the following sub-test are reconstructions, not
      // recovered text — confirm against the original file.
      const hasValidBOM = test.content.length >= 3 &&
        test.content[0] === 0xEF &&
        test.content[1] === 0xBB &&
        test.content[2] === 0xBF &&
        test.content.indexOf('<?xml') === 3;

      console.log(` Valid BOM at start: ${hasValidBOM}`);
      console.log(` Expected valid: ${test.valid}`);

      performanceTracker.recordMetric('bom-position', performance.now() - startTime);
    }

    performanceTracker.endOperation('bom-positions');
  });

  await t.test('BOM preservation in round-trip operations', async () => {
    performanceTracker.startOperation('bom-roundtrip');

    const roundTripTests = [
      {
        name: 'Preserve UTF-8 BOM',
        input: Buffer.concat([
          Buffer.from([0xEF, 0xBB, 0xBF]),
          Buffer.from('RT-001')
        ]),
        preserveBOM: true
      },
      {
        name: 'Remove UTF-8 BOM',
        input: Buffer.concat([
          Buffer.from([0xEF, 0xBB, 0xBF]),
          Buffer.from('RT-002')
        ]),
        preserveBOM: false
      },
      {
        name: 'Add BOM to BOM-less file',
        input: Buffer.from('RT-003'),
        preserveBOM: true,
        addBOM: true
      }
    ];

    for (const test of roundTripTests) {
      const startTime = performance.now();

      console.log(`${test.name}:`);

      const inputHasBOM = test.input.length >= 3 &&
        test.input[0] === 0xEF &&
        test.input[1] === 0xBB &&
        test.input[2] === 0xBF;

      console.log(` Input has BOM: ${inputHasBOM}`);
      console.log(` Preserve BOM: ${test.preserveBOM}`);

      // Simulate round-trip
      let processed = test.input;

      if (!test.preserveBOM && inputHasBOM) {
        // Remove BOM (subarray returns a view, exactly like the deprecated Buffer.slice)
        processed = processed.subarray(3);
        console.log(' Action: Removed BOM');
      } else if (test.addBOM && !inputHasBOM) {
        // Add BOM
        processed = Buffer.concat([Buffer.from([0xEF, 0xBB, 0xBF]), processed]);
        console.log(' Action: Added BOM');
      } else {
        console.log(' Action: No change');
      }

      const outputHasBOM = processed.length >= 3 &&
        processed[0] === 0xEF &&
        processed[1] === 0xBB &&
        processed[2] === 0xBF;

      console.log(` Output has BOM: ${outputHasBOM}`);

      performanceTracker.recordMetric('bom-roundtrip', performance.now() - startTime);
    }

    performanceTracker.endOperation('bom-roundtrip');
  });

  await t.test('BOM conflicts with encoding declarations', async () => {
    performanceTracker.startOperation('bom-conflicts');

    const conflictTests = [
      {
        name: 'UTF-8 BOM with UTF-8 declaration',
        bom: Buffer.from([0xEF, 0xBB, 0xBF]),
        declaration: 'UTF-8',
        conflict: false
      },
      {
        name: 'UTF-8 BOM with UTF-16 declaration',
        bom: Buffer.from([0xEF, 0xBB, 0xBF]),
        declaration: 'UTF-16',
        conflict: true
      },
      {
        name: 'UTF-16 LE BOM with UTF-8 declaration',
        bom: Buffer.from([0xFF, 0xFE]),
        declaration: 'UTF-8',
        conflict: true
      },
      {
        name: 'UTF-16 BE BOM with UTF-16 declaration',
        bom: Buffer.from([0xFE, 0xFF]),
        declaration: 'UTF-16',
        conflict: false
      },
      {
        name: 'No BOM with any declaration',
        bom: Buffer.from([]),
        declaration: 'UTF-8',
        conflict: false
      }
    ];

    for (const test of conflictTests) {
      const startTime = performance.now();

      // The document is assembled but only the BOM bytes and the table's
      // declared values are reported below; nothing here parses fullContent.
      const xml = `CONFLICT-TEST`;
      const fullContent = Buffer.concat([test.bom, Buffer.from(xml)]);

      console.log(`${test.name}:`);
      console.log(` BOM type: ${test.bom.length > 0 ? detectBOMType(test.bom) : 'None'}`);
      console.log(` Declaration: ${test.declaration}`);
      console.log(` Conflict: ${test.conflict ? '✗ Yes' : '✓ No'}`);

      if (test.conflict) {
        console.log(' Resolution: BOM takes precedence over declaration');
      }

      performanceTracker.recordMetric('bom-conflict', performance.now() - startTime);
    }

    performanceTracker.endOperation('bom-conflicts');
  });

  await t.test('BOM handling in corpus files', async () => {
    performanceTracker.startOperation('corpus-bom');

    const corpusLoader = new CorpusLoader();
    const files = await corpusLoader.getFiles(/\.(xml|cii|ubl)$/);

    console.log(`\nAnalyzing BOM usage in ${files.length} corpus files...`);

    const bomStats = {
      total: 0,
      withBOM: 0,
      utf8BOM: 0,
      utf16BOM: 0,
      otherBOM: 0,
      multipleBOM: 0,
      invalidPosition: 0
    };

    // Only the first 100 files (at most) are inspected.
    const sampleSize = Math.min(100, files.length);
    const sampledFiles = files.slice(0, sampleSize);

    for (const file of sampledFiles) {
      bomStats.total++;

      try {
        const content = await plugins.fs.readFile(file.path);

        // Check for BOM. Classification goes through detectBOMType so that
        // UTF-32 BOMs land in otherBOM instead of being counted as UTF-16
        // (FF FE 00 00) or missed entirely (00 00 FE FF), and so that
        // two-byte files are still checked for a UTF-16 BOM.
        const bomKind = detectBOMType(content);
        if (bomKind !== 'Unknown') {
          bomStats.withBOM++;
          if (bomKind === 'UTF-8') {
            bomStats.utf8BOM++;
          } else if (bomKind.startsWith('UTF-16')) {
            bomStats.utf16BOM++;
          } else {
            bomStats.otherBOM++;
          }
        }

        // Check for multiple BOMs or BOMs in wrong position
        // (findBOMOccurrences reports UTF-8 BOM byte sequences only).
        const bomOccurrences = findBOMOccurrences(content);
        if (bomOccurrences.length > 1) {
          bomStats.multipleBOM++;
        }
        if (bomOccurrences.length > 0 && bomOccurrences[0] !== 0) {
          bomStats.invalidPosition++;
        }
      } catch (error) {
        // Skip files that can't be read
      }
    }

    // Guard the percentage so an empty corpus prints 0.0% rather than NaN%.
    const withBomPercent = bomStats.total > 0
      ? (bomStats.withBOM / bomStats.total * 100)
      : 0;

    console.log('\nBOM Statistics:');
    console.log(`Total files analyzed: ${bomStats.total}`);
    console.log(`Files with BOM: ${bomStats.withBOM} (${withBomPercent.toFixed(1)}%)`);
    console.log(` UTF-8 BOM: ${bomStats.utf8BOM}`);
    console.log(` UTF-16 BOM: ${bomStats.utf16BOM}`);
    console.log(` Other BOM: ${bomStats.otherBOM}`);
    console.log(`Multiple BOMs: ${bomStats.multipleBOM}`);
    console.log(`Invalid BOM position: ${bomStats.invalidPosition}`);

    performanceTracker.endOperation('corpus-bom');
  });

  await t.test('BOM security implications', async () => {
    performanceTracker.startOperation('bom-security');

    const securityTests = [
      {
        name: 'BOM hiding malicious content',
        content: Buffer.concat([
          Buffer.from([0xEF, 0xBB, 0xBF]),
          Buffer.from('')
        ]),
        risk: 'BOM bytes could be used to bypass filters'
      },
      {
        name: 'Zero-width BOM characters',
        content: Buffer.from('\uFEFFTEST'),
        risk: 'Invisible characters could hide malicious content'
      },
      {
        name: 'BOM-based encoding confusion',
        content: Buffer.concat([
          Buffer.from([0xFF, 0xFE]), // UTF-16 LE BOM
          Buffer.from('TEST')
        ]),
        risk: 'Encoding mismatch could lead to parsing errors'
      }
    ];

    for (const test of securityTests) {
      const startTime = performance.now();

      console.log(`${test.name}:`);
      console.log(` Risk: ${test.risk}`);

      // Scan for suspicious patterns
      const bomCount = findBOMOccurrences(test.content).length;
      const hasMultipleBOMs = bomCount > 1;
      // NOTE(review): Buffer#includes coerces a numeric needle to a single
      // byte, so includes(0xFEFF) effectively looks for the byte 0xFF rather
      // than the code point U+FEFF. Left unchanged because correcting it
      // would alter the reported results — confirm the intent.
      const hasInvisibleChars = test.content.includes(0xFEFF) ||
        test.content.toString().includes('\uFEFF');

      console.log(` BOM count: ${bomCount}`);
      console.log(` Multiple BOMs: ${hasMultipleBOMs ? '✗ Yes' : '✓ No'}`);
      console.log(` Invisible chars: ${hasInvisibleChars ? '✗ Yes' : '✓ No'}`);

      if (hasMultipleBOMs || hasInvisibleChars) {
        console.log(' ⚠️ Security risk detected');
      }

      performanceTracker.recordMetric('bom-security', performance.now() - startTime);
    }

    performanceTracker.endOperation('bom-security');
  });

  await t.test('BOM handling performance', async () => {
    performanceTracker.startOperation('bom-performance');

    const sizes = [1000, 10000, 100000]; // 1KB, 10KB, 100KB

    for (const size of sizes) {
      // Generate content with BOM
      const bom = Buffer.from([0xEF, 0xBB, 0xBF]);
      const xmlContent = Buffer.from(`${'x'.repeat(size)}`);
      const withBOM = Buffer.concat([bom, xmlContent]);

      // Measure BOM detection time (1000 iterations; the result is discarded)
      const detectStart = performance.now();
      for (let i = 0; i < 1000; i++) {
        const hasBOM = withBOM.length >= 3 &&
          withBOM[0] === 0xEF &&
          withBOM[1] === 0xBB &&
          withBOM[2] === 0xBF;
      }
      const detectTime = performance.now() - detectStart;

      // Measure BOM removal time (1000 iterations; the result is discarded)
      const removeStart = performance.now();
      for (let i = 0; i < 1000; i++) {
        const cleaned = removeBOM(withBOM);
      }
      const removeTime = performance.now() - removeStart;

      console.log(`File size ${size} bytes:`);
      console.log(` BOM detection: ${(detectTime/1000).toFixed(3)}ms per operation`);
      console.log(` BOM removal: ${(removeTime/1000).toFixed(3)}ms per operation`);

      performanceTracker.recordMetric(`bom-perf-${size}`, detectTime + removeTime);
    }

    performanceTracker.endOperation('bom-performance');
  });

  // Helper functions

  /**
   * Returns `buffer` without a leading UTF-8, UTF-32 or UTF-16 BOM.
   *
   * The result is a view onto the same memory (no copy); when no BOM is
   * present the input buffer itself is returned.
   *
   * @param buffer - Raw bytes that may start with a BOM.
   * @returns The bytes following the BOM, or `buffer` unchanged.
   */
  function removeBOM(buffer: Buffer): Buffer {
    // UTF-8: EF BB BF
    if (buffer.length >= 3 &&
        buffer[0] === 0xEF && buffer[1] === 0xBB && buffer[2] === 0xBF) {
      return buffer.subarray(3);
    }

    // UTF-32 has to be tested before UTF-16: the UTF-32 LE BOM (FF FE 00 00)
    // begins with the UTF-16 LE BOM (FF FE), so the shorter check would
    // otherwise win and strip only two of the four bytes.
    if (buffer.length >= 4) {
      const isUtf32LE = buffer[0] === 0xFF && buffer[1] === 0xFE &&
        buffer[2] === 0x00 && buffer[3] === 0x00;
      const isUtf32BE = buffer[0] === 0x00 && buffer[1] === 0x00 &&
        buffer[2] === 0xFE && buffer[3] === 0xFF;
      if (isUtf32LE || isUtf32BE) {
        return buffer.subarray(4);
      }
    }

    // UTF-16 LE (FF FE) / UTF-16 BE (FE FF)
    if (buffer.length >= 2) {
      if ((buffer[0] === 0xFF && buffer[1] === 0xFE) ||
          (buffer[0] === 0xFE && buffer[1] === 0xFF)) {
        return buffer.subarray(2);
      }
    }

    return buffer;
  }
/**
 * Lists the byte offsets at which the UTF-8 BOM sequence (EF BB BF) occurs.
 * After a match the scan resumes behind the three matched bytes.
 *
 * @param buffer - Bytes to scan.
 * @returns Offsets of every match, in ascending order.
 */
function findBOMOccurrences(buffer: Buffer): number[] {
  const hits: number[] = [];
  const lastStart = buffer.length - 2; // exclusive bound: three bytes must fit
  let offset = 0;

  while (offset < lastStart) {
    const matches =
      buffer[offset] === 0xEF &&
      buffer[offset + 1] === 0xBB &&
      buffer[offset + 2] === 0xBF;

    if (matches) {
      hits.push(offset);
      offset += 3; // step over the whole BOM
    } else {
      offset += 1;
    }
  }

  return hits;
}

/**
 * Names the BOM found at the start of `bom`.
 *
 * @param bom - Bytes whose leading bytes are inspected.
 * @returns 'UTF-8', 'UTF-16LE', 'UTF-16BE', 'UTF-32LE', 'UTF-32BE', or
 *          'Unknown' when no known BOM prefix is present.
 */
function detectBOMType(bom: Buffer): string {
  // Bytes beyond the end of the buffer destructure to undefined and therefore
  // never compare equal to a byte value, which stands in for length checks.
  const [b0, b1, b2, b3] = bom;

  if (b0 === 0xEF && b1 === 0xBB && b2 === 0xBF) {
    return 'UTF-8';
  }

  if (b0 === 0xFF && b1 === 0xFE) {
    // FF FE followed by 00 00 is the four-byte UTF-32 LE mark.
    const isUtf32 = b2 === 0x00 && b3 === 0x00;
    return isUtf32 ? 'UTF-32LE' : 'UTF-16LE';
  }

  if (b0 === 0xFE && b1 === 0xFF) {
    return 'UTF-16BE';
  }

  if (b0 === 0x00 && b1 === 0x00 && b2 === 0xFE && b3 === 0xFF) {
    return 'UTF-32BE';
  }

  return 'Unknown';
}

  // Performance summary
  console.log('\n' + performanceTracker.getSummary());

  // BOM handling best practices
  const bestPracticeLines = [
    '\nBOM Handling Best Practices:',
    '1. Always check for BOM before parsing XML',
    '2. Remove BOM after detection to avoid parsing issues',
    '3. Preserve BOM information for round-trip operations if needed',
    '4. Handle conflicts between BOM and encoding declarations',
    '5. Be aware of security implications of multiple/hidden BOMs',
    '6. Test with files both with and without BOM',
    '7. Consider BOM handling in performance-critical paths',
    '8. Support all common BOM types (UTF-8, UTF-16, UTF-32)'
  ];
  for (const line of bestPracticeLines) {
    console.log(line);
  }
});

tap.start();