import { expect, tap } from '@git.zone/tstest/tapbundle';
import * as einvoice from '../../../ts/index.js';
import * as plugins from '../../plugins.js';
import { CorpusLoader } from '../../helpers/corpus.loader.js';
import { PerformanceTracker } from '../../helpers/performance.tracker.js';
tap.test('PARSE-06: Large XML Streaming - Handle large files with streaming parsers', async (t) => {
const performanceTracker = new PerformanceTracker('PARSE-06');
await t.test('Memory-efficient parsing strategies', async () => {
performanceTracker.startOperation('memory-strategies');
// Generate different sized test documents
const generateLargeInvoice = (lineItems: number): string => {
let xml = `<?xml version="1.0" encoding="UTF-8"?>
<Invoice>
<ID>LARGE-${lineItems}</ID>
<IssueDate>2024-01-01</IssueDate>
<Lines>`;
for (let i = 1; i <= lineItems; i++) {
xml += `
<LineItem>
<ID>${i}</ID>
<Description>Product Item ${i} with a reasonably long description to increase document size</Description>
<Quantity>1</Quantity>
<Amount>${(Math.random() * 1000).toFixed(2)}</Amount>
<ChargeIndicator>false</ChargeIndicator>
<TaxAmount>${(Math.random() * 10).toFixed(2)}</TaxAmount>
</LineItem>`;
}
xml += `
</Lines>
</Invoice>`;
return xml;
};
const testSizes = [
{ items: 100, expectedSize: '~50KB' },
{ items: 1000, expectedSize: '~500KB' },
{ items: 5000, expectedSize: '~2.5MB' },
{ items: 10000, expectedSize: '~5MB' }
];
for (const test of testSizes) {
const startTime = performance.now();
const startMemory = process.memoryUsage();
const largeXml = generateLargeInvoice(test.items);
const xmlSize = Buffer.byteLength(largeXml, 'utf8');
console.log(`\nTesting ${test.items} line items (${test.expectedSize}, actual: ${(xmlSize/1024).toFixed(1)}KB):`);
try {
const invoice = new einvoice.EInvoice();
if (invoice.fromXmlString) {
await invoice.fromXmlString(largeXml);
const endMemory = process.memoryUsage();
const memoryDelta = {
heapUsed: (endMemory.heapUsed - startMemory.heapUsed) / 1024 / 1024,
external: (endMemory.external - startMemory.external) / 1024 / 1024
};
const parseTime = performance.now() - startTime;
console.log(` Parse time: ${parseTime.toFixed(2)}ms`);
console.log(` Memory delta: ${memoryDelta.heapUsed.toFixed(2)}MB heap, ${memoryDelta.external.toFixed(2)}MB external`);
console.log(` Parse rate: ${(xmlSize / parseTime * 1000 / 1024 / 1024).toFixed(2)}MB/s`);
// Check if memory usage is reasonable
const memoryRatio = memoryDelta.heapUsed / (xmlSize / 1024 / 1024);
console.log(` Memory ratio: ${memoryRatio.toFixed(2)}x document size`);
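// Heuristic threshold: DOM-style parsers commonly hold several times the
// serialized size in memory (node objects, strings, indexes), so >5x is flagged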
if (memoryRatio > 5) {
console.log(' ⚠️ High memory usage detected');
} else {
console.log(' ✓ Memory usage acceptable');
}
} else {
console.log(' ⚠️ fromXmlString not implemented');
}
} catch (error) {
console.log(` ✗ Parse error: ${error.message}`);
}
performanceTracker.recordMetric(`parse-${test.items}-items`, performance.now() - startTime);
// Force garbage collection if available
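// (global.gc is only defined when Node is started with --expose-gc)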
if (global.gc) {
global.gc();
}
}
performanceTracker.endOperation('memory-strategies');
});
await t.test('Streaming parser simulation', async () => {
performanceTracker.startOperation('streaming-simulation');
class StreamingXmlParser {
private buffer = '';
private tagStack: string[] = [];
private currentElement: any = null;
private parsedElements = 0;
private eventHandlers: Map<string, (element: any) => void> = new Map();
onElement(tagName: string, handler: (element: any) => void): void {
this.eventHandlers.set(tagName, handler);
}
async parseChunk(chunk: string): Promise<void> {
this.buffer += chunk;
// Simple streaming parser simulation
let tagMatch;
const tagRegex = /<([^>]+)>([^<]*)/g;
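// Note: this regex tokenizer is deliberately naive - it ignores CDATA,
// comments, processing instructions, and attributes containing '>'.
// It exists only to exercise chunked feeding, not to be a real XML parser.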
while ((tagMatch = tagRegex.exec(this.buffer)) !== null) {
const [fullMatch, tag, content] = tagMatch;
if (tag.startsWith('/')) {
// Closing tag
const tagName = tag.substring(1);
if (this.tagStack[this.tagStack.length - 1] === tagName) {
this.tagStack.pop();
// Emit element event; container elements (e.g. LineItem) close after
// their children, so fall back to an empty payload for them
if (this.eventHandlers.has(tagName)) {
this.eventHandlers.get(tagName)!(this.currentElement ?? { tag: tagName, content: '' });
this.parsedElements++;
}
this.currentElement = null;
}
} else if (!tag.endsWith('/')) {
// Opening tag
const tagName = tag.split(' ')[0];
this.tagStack.push(tagName);
this.currentElement = { tag: tagName, content: content.trim() };
}
}
// Keep unparsed content in buffer
const lastTagEnd = this.buffer.lastIndexOf('>');
if (lastTagEnd !== -1) {
this.buffer = this.buffer.substring(lastTagEnd + 1);
}
}
getStats() {
return {
parsedElements: this.parsedElements,
bufferSize: this.buffer.length,
stackDepth: this.tagStack.length
};
}
}
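// In a real pipeline the same parser could sit behind a Node Writable so a
// file stream drives parseChunk with backpressure. A minimal sketch, assuming
// this StreamingXmlParser and node:fs/node:stream (not executed by the suite):
//
//   import { Writable } from 'node:stream';
//   import { createReadStream } from 'node:fs';
//
//   const toWritable = (p: StreamingXmlParser) => new Writable({
//     write(chunk, _enc, done) {
//       p.parseChunk(chunk.toString('utf8')).then(() => done(), done);
//     }
//   });
//   createReadStream('large-invoice.xml').pipe(toWritable(parser));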
// Test streaming parser
const parser = new StreamingXmlParser();
let lineItemCount = 0;
let totalAmount = 0;
// Register handlers for specific elements
parser.onElement('LineItem', (element) => {
lineItemCount++;
});
parser.onElement('Amount', (element) => {
const amount = parseFloat(element.content);
if (!isNaN(amount)) {
totalAmount += amount;
}
});
// Generate and parse in chunks
const chunkSize = 1024; // 1KB chunks
const totalItems = 1000;
console.log(`\nStreaming parse simulation (${totalItems} items in ${chunkSize} byte chunks):`);
const startTime = performance.now();
// Generate header
await parser.parseChunk(`<Invoice>
<ID>STREAM-TEST</ID>
<Lines>`);
// Generate items in chunks
let currentChunk = '';
for (let i = 1; i <= totalItems; i++) {
const item = `
<LineItem>
<ID>${i}</ID>
<Description>Item ${i}</Description>
<Amount>10.00</Amount>
</LineItem>`;
currentChunk += item;
if (currentChunk.length >= chunkSize) {
await parser.parseChunk(currentChunk);
currentChunk = '';
// Log progress every 100 items
if (i % 100 === 0) {
const stats = parser.getStats();
console.log(` Progress: ${i}/${totalItems} items, buffer: ${stats.bufferSize} bytes`);
}
}
}
// Parse remaining chunk and footer
await parser.parseChunk(currentChunk + `
</Lines>
</Invoice>`);
const parseTime = performance.now() - startTime;
const finalStats = parser.getStats();
console.log(`\nStreaming results:`);
console.log(` Parse time: ${parseTime.toFixed(2)}ms`);
console.log(` Line items found: ${lineItemCount}`);
console.log(` Total amount sum: ${totalAmount.toFixed(2)}`);
console.log(` Elements parsed: ${finalStats.parsedElements}`);
console.log(` Parse rate: ${(totalItems / parseTime * 1000).toFixed(0)} items/second`);
performanceTracker.endOperation('streaming-simulation');
});
await t.test('Chunked processing patterns', async () => {
performanceTracker.startOperation('chunked-processing');
const chunkPatterns = [
{
name: 'Fixed size chunks',
chunkSize: 4096,
description: 'Process in fixed byte chunks'
},
{
name: 'Line-based chunks',
chunkSize: 100, // lines
description: 'Process by number of lines'
},
{
name: 'Element-based chunks',
chunkSize: 50, // elements
description: 'Process by complete elements'
},
{
name: 'Memory-based chunks',
chunkSize: 1024 * 1024, // 1MB
description: 'Process based on memory limits'
}
];
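// The loop below only simulates work per chunk. For illustration, real
// element-based chunking could buffer until complete element boundaries;
// a hypothetical helper (not used by this test) might look like:
//
//   function* elementChunks(xml: string, tag: string, perChunk: number) {
//     const closer = `</${tag}>`;
//     let pos = 0, count = 0, chunkStart = 0;
//     while ((pos = xml.indexOf(closer, pos)) !== -1) {
//       pos += closer.length;
//       if (++count % perChunk === 0) {
//         yield xml.slice(chunkStart, pos);
//         chunkStart = pos;
//       }
//     }
//     if (chunkStart < xml.length) yield xml.slice(chunkStart); // trailing rest
//   }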
for (const pattern of chunkPatterns) {
console.log(`\n${pattern.name}:`);
console.log(` ${pattern.description}`);
console.log(` Chunk size: ${pattern.chunkSize}`);
// Simulate processing
const startTime = performance.now();
let chunksProcessed = 0;
let totalBytes = 0;
// Process 10 chunks
for (let i = 0; i < 10; i++) {
// Simulate chunk processing
await new Promise(resolve => setTimeout(resolve, 1));
chunksProcessed++;
totalBytes += pattern.chunkSize;
}
const processTime = performance.now() - startTime;
console.log(` Chunks processed: ${chunksProcessed}`);
console.log(` Processing rate: ${(totalBytes / processTime * 1000 / 1024).toFixed(2)}KB/s`);
performanceTracker.recordMetric(`chunk-${pattern.name}`, processTime);
}
performanceTracker.endOperation('chunked-processing');
});
await t.test('Large corpus file handling', async () => {
performanceTracker.startOperation('corpus-large-files');
const corpusLoader = new CorpusLoader();
const allFiles = await corpusLoader.getFiles(/\.(xml|ubl|cii)$/);
// Find large files
const fileSizes = await Promise.all(
allFiles.map(async (file) => {
const stats = await plugins.fs.stat(file.path);
return { file, size: stats.size };
})
);
// Sort by size and get top 10
const largeFiles = fileSizes
.sort((a, b) => b.size - a.size)
.slice(0, 10);
console.log(`\nLargest files in corpus:`);
for (const { file, size } of largeFiles) {
console.log(` ${file.name}: ${(size / 1024).toFixed(1)}KB`);
if (size > 100 * 1024) { // Files larger than 100KB
const startTime = performance.now();
const startMemory = process.memoryUsage();
try {
const content = await plugins.fs.readFile(file.path, 'utf8');
const invoice = new einvoice.EInvoice();
if (invoice.fromXmlString) {
await invoice.fromXmlString(content);
const parseTime = performance.now() - startTime;
const endMemory = process.memoryUsage();
const memoryUsed = (endMemory.heapUsed - startMemory.heapUsed) / 1024 / 1024;
console.log(` Parse time: ${parseTime.toFixed(2)}ms`);
console.log(` Memory used: ${memoryUsed.toFixed(2)}MB`);
console.log(` Parse rate: ${(size / parseTime * 1000 / 1024).toFixed(2)}KB/s`);
}
} catch (error) {
console.log(` Error: ${error.message}`);
}
performanceTracker.recordMetric(`large-file-${file.name}`, performance.now() - startTime);
}
}
performanceTracker.endOperation('corpus-large-files');
});
await t.test('Progressive parsing with callbacks', async () => {
performanceTracker.startOperation('progressive-parsing');
class ProgressiveParser {
private invoiceData: any = {};
private lineItems: any[] = [];
private currentPath: string[] = [];
constructor(
private onProgress?: (progress: number) => void,
private onLineItem?: (item: any) => void
) {}
async parse(xml: string): Promise<{ invoice: any; lineItems: any[] }> {
const totalSize = xml.length;
let processed = 0;
const chunkSize = 10000;
// Parse in chunks
for (let i = 0; i < totalSize; i += chunkSize) {
const chunk = xml.substring(i, Math.min(i + chunkSize, totalSize));
await this.processChunk(chunk);
processed += chunk.length;
if (this.onProgress) {
this.onProgress(processed / totalSize * 100);
}
// Yield to the event loop so progress callbacks and other work can run
await new Promise(resolve => setImmediate(resolve));
}
return {
invoice: this.invoiceData,
lineItems: this.lineItems
};
}
private async processChunk(chunk: string): Promise<void> {
// Simplified parsing - in reality would maintain state across chunks
const lineItemMatches = chunk.matchAll(/<LineItem>[\s\S]*?<\/LineItem>/g);
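// Known simplification: a LineItem straddling a chunk boundary is seen by
// neither chunk, so the reported item count can fall slightly short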
for (const match of lineItemMatches) {
const item = this.parseLineItem(match[0]);
if (item) {
this.lineItems.push(item);
if (this.onLineItem) {
this.onLineItem(item);
}
}
}
}
private parseLineItem(xml: string): any {
const item: any = {};
const idMatch = xml.match(/<ID>([^<]+)<\/ID>/);
if (idMatch) item.id = idMatch[1];
const descMatch = xml.match(/<Description>([^<]+)<\/Description>/);
if (descMatch) item.description = descMatch[1];
const amountMatch = xml.match(/<Amount[^>]*>([^<]+)<\/Amount>/);
if (amountMatch) item.amount = parseFloat(amountMatch[1]);
return Object.keys(item).length > 0 ? item : null;
}
}
// Test progressive parser
console.log('\nProgressive parsing test:');
const largeXml = generateLargeInvoice(500);
let progressUpdates = 0;
let itemsFound = 0;
const parser = new ProgressiveParser(
(progress) => {
progressUpdates++;
if (progress % 20 < 5) { // Log at ~20% intervals
console.log(` Progress: ${progress.toFixed(0)}%`);
}
},
(item) => {
itemsFound++;
if (itemsFound % 100 === 0) {
console.log(` Found ${itemsFound} items...`);
}
}
);
const startTime = performance.now();
const result = await parser.parse(largeXml);
const parseTime = performance.now() - startTime;
console.log(`\nProgressive parsing results:`);
console.log(` Parse time: ${parseTime.toFixed(2)}ms`);
console.log(` Progress updates: ${progressUpdates}`);
console.log(` Line items found: ${result.lineItems.length}`);
console.log(` Items/second: ${(result.lineItems.length / parseTime * 1000).toFixed(0)}`);
performanceTracker.endOperation('progressive-parsing');
// Helper function (a function declaration, so it is hoisted and usable above)
function generateLargeInvoice(lineItems: number): string {
let xml = `<?xml version="1.0" encoding="UTF-8"?>
<Invoice>
<ID>LARGE-${lineItems}</ID>
<IssueDate>2024-01-01</IssueDate>`;
for (let i = 1; i <= lineItems; i++) {
xml += `
<LineItem>
<ID>${i}</ID>
<Description>Product Item ${i} with extended description for testing</Description>
<Quantity>1</Quantity>
<Amount>${(Math.random() * 1000).toFixed(2)}</Amount>
</LineItem>`;
}
xml += '\n</Invoice>';
return xml;
}
});
await t.test('Stream processing optimization techniques', async () => {
performanceTracker.startOperation('stream-optimization');
const optimizations = [
{
name: 'Buffer pooling',
description: 'Reuse buffers to reduce allocation',
implementation: () => {
const bufferPool: Buffer[] = [];
const poolSize = 10;
const bufferSize = 4096;
// Pre-allocate buffers
for (let i = 0; i < poolSize; i++) {
bufferPool.push(Buffer.allocUnsafe(bufferSize));
}
return {
acquire: () => bufferPool.pop() || Buffer.allocUnsafe(bufferSize),
release: (buffer: Buffer) => {
if (bufferPool.length < poolSize) {
bufferPool.push(buffer);
}
}
};
}
},
{
name: 'Lazy evaluation',
description: 'Defer processing until needed',
implementation: () => {
const pendingOperations: (() => any)[] = [];
return {
defer: (op: () => any) => pendingOperations.push(op),
evaluate: () => {
const results = pendingOperations.map(op => op());
pendingOperations.length = 0;
return results;
}
};
}
},
{
name: 'Element skipping',
description: 'Skip unneeded elements during parsing',
implementation: () => {
const skipPaths = new Set(['Signature', 'Extension', 'AdditionalInfo']);
return {
shouldSkip: (elementPath: string) => {
return skipPaths.has(elementPath.split('/').pop() || '');
}
};
}
}
];
for (const opt of optimizations) {
console.log(`\n${opt.name}:`);
console.log(` ${opt.description}`);
const impl = opt.implementation();
// Simulate usage
const startTime = performance.now();
if ('acquire' in impl) {
// Buffer pooling test
for (let i = 0; i < 1000; i++) {
const buffer = impl.acquire();
// Use buffer...
impl.release(buffer);
}
console.log(' ✓ Buffer pool working');
} else if ('defer' in impl) {
// Lazy evaluation test
for (let i = 0; i < 100; i++) {
impl.defer(() => Math.random() * 1000);
}
const results = impl.evaluate();
console.log(` ✓ Deferred ${results.length} operations`);
} else if ('shouldSkip' in impl) {
// Element skipping test
const testPaths = [
'Invoice/Signature',
'Invoice/LineItem/Price',
'Invoice/Extension'
];
const skipped = testPaths.filter(p => impl.shouldSkip(p));
console.log(` ✓ Skipping ${skipped.length} of ${testPaths.length} paths`);
}
performanceTracker.recordMetric(`optimization-${opt.name}`, performance.now() - startTime);
}
performanceTracker.endOperation('stream-optimization');
});
// Performance summary
console.log('\n' + performanceTracker.getSummary());
// Streaming best practices
console.log('\nLarge XML Streaming Best Practices:');
console.log('1. Use streaming parsers for files > 10MB');
console.log('2. Process data in chunks to control memory usage');
console.log('3. Implement progress callbacks for user feedback');
console.log('4. Use buffer pools to reduce allocation overhead');
console.log('5. Skip unnecessary elements during parsing');
console.log('6. Monitor memory usage and implement limits');
console.log('7. Support both streaming and DOM parsing modes');
console.log('8. Optimize chunk sizes based on document structure');
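// As a sketch of practice #1, a SAX-style streaming parser keeps memory flat
// regardless of file size. Assuming the `sax` npm package (not a dependency
// of this suite):
//
//   import * as sax from 'sax';
//   import { createReadStream } from 'node:fs';
//
//   const saxStream = sax.createStream(true); // strict mode
//   let lineItems = 0;
//   saxStream.on('opentag', (node) => {
//     if (node.name === 'LineItem') lineItems++;
//   });
//   createReadStream('large-invoice.xml').pipe(saxStream);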
});
tap.start();