update

test/suite/einvoice_parsing/test.parse-06.streaming-parse.ts (new file, 588 lines)
@@ -0,0 +1,588 @@
import { expect, tap } from '@git.zone/tstest/tapbundle';
import * as einvoice from '../../../ts/index.js';
import * as plugins from '../../plugins.js';
import { CorpusLoader } from '../../helpers/corpus.loader.js';
import { PerformanceTracker } from '../../helpers/performance.tracker.js';

tap.test('PARSE-06: Large XML Streaming - Handle large files with streaming parsers', async (t) => {
  const performanceTracker = new PerformanceTracker('PARSE-06');

  await t.test('Memory-efficient parsing strategies', async () => {
    performanceTracker.startOperation('memory-strategies');

    // Generate different sized test documents
    const generateLargeInvoice = (lineItems: number): string => {
      let xml = `<?xml version="1.0" encoding="UTF-8"?>
<Invoice xmlns="urn:oasis:names:specification:ubl:schema:xsd:Invoice-2">
  <ID>LARGE-${lineItems}</ID>
  <IssueDate>2024-01-01</IssueDate>
  <InvoiceLine>`;

      for (let i = 1; i <= lineItems; i++) {
        xml += `
    <LineItem>
      <ID>${i}</ID>
      <Description>Product Item ${i} with a reasonably long description to increase document size</Description>
      <Quantity>1</Quantity>
      <Price>
        <Amount currencyID="EUR">${(Math.random() * 1000).toFixed(2)}</Amount>
      </Price>
      <AllowanceCharge>
        <ChargeIndicator>false</ChargeIndicator>
        <Amount currencyID="EUR">${(Math.random() * 10).toFixed(2)}</Amount>
      </AllowanceCharge>
    </LineItem>`;
      }

      xml += `
  </InvoiceLine>
</Invoice>`;
      return xml;
    };

    const testSizes = [
      { items: 100, expectedSize: '~50KB' },
      { items: 1000, expectedSize: '~500KB' },
      { items: 5000, expectedSize: '~2.5MB' },
      { items: 10000, expectedSize: '~5MB' }
    ];

    for (const test of testSizes) {
      const startTime = performance.now();
      const startMemory = process.memoryUsage();

      const largeXml = generateLargeInvoice(test.items);
      const xmlSize = Buffer.byteLength(largeXml, 'utf8');

      console.log(`\nTesting ${test.items} line items (${test.expectedSize}, actual: ${(xmlSize / 1024).toFixed(1)}KB):`);

      try {
        const invoice = new einvoice.EInvoice();

        if (invoice.fromXmlString) {
          await invoice.fromXmlString(largeXml);

          const endMemory = process.memoryUsage();
          const memoryDelta = {
            heapUsed: (endMemory.heapUsed - startMemory.heapUsed) / 1024 / 1024,
            external: (endMemory.external - startMemory.external) / 1024 / 1024
          };

          const parseTime = performance.now() - startTime;

          console.log(`  Parse time: ${parseTime.toFixed(2)}ms`);
          console.log(`  Memory delta: ${memoryDelta.heapUsed.toFixed(2)}MB heap, ${memoryDelta.external.toFixed(2)}MB external`);
          console.log(`  Parse rate: ${(xmlSize / parseTime * 1000 / 1024 / 1024).toFixed(2)}MB/s`);

          // Check if memory usage is reasonable relative to document size
          const memoryRatio = memoryDelta.heapUsed / (xmlSize / 1024 / 1024);
          console.log(`  Memory ratio: ${memoryRatio.toFixed(2)}x document size`);

          if (memoryRatio > 5) {
            console.log('  ⚠️ High memory usage detected');
          } else {
            console.log('  ✓ Memory usage acceptable');
          }
        } else {
          console.log('  ⚠️ fromXmlString not implemented');
        }
      } catch (error) {
        console.log(`  ✗ Parse error: ${error.message}`);
      }

      performanceTracker.recordMetric(`parse-${test.items}-items`, performance.now() - startTime);

      // Force garbage collection if available (requires running node with --expose-gc)
      if (global.gc) {
        global.gc();
      }
    }

    performanceTracker.endOperation('memory-strategies');
  });
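
  // A minimal sketch of truly streaming input: the test above builds each document
  // in memory before parsing, whereas Node's core fs.createReadStream pulls chunks
  // straight from disk, so peak memory stays near the chunk size. The temp-file
  // name and the 16KB highWaterMark are illustrative assumptions; only core Node
  // APIs are used here.
  await t.test('Sketch: chunked reads from disk via fs.createReadStream', async () => {
    const { createReadStream } = await import('node:fs');
    const { writeFile, unlink } = await import('node:fs/promises');
    const { tmpdir } = await import('node:os');
    const { join } = await import('node:path');

    // Write a throwaway document so the sketch is self-contained
    const tmpPath = join(tmpdir(), 'parse-06-stream-sketch.xml');
    const xml = '<Invoice>' + '<LineItem><ID>1</ID></LineItem>'.repeat(5000) + '</Invoice>';
    await writeFile(tmpPath, xml, 'utf8');

    let chunkCount = 0;
    let byteCount = 0;
    // highWaterMark caps how much of the file is buffered per read
    const stream = createReadStream(tmpPath, { encoding: 'utf8', highWaterMark: 16 * 1024 });
    for await (const chunk of stream) {
      chunkCount++;
      byteCount += (chunk as string).length;
      // a real implementation would hand `chunk` to a stateful streaming parser here
    }

    console.log(`  Sketch: read ${byteCount} bytes in ${chunkCount} chunks of <=16KB`);
    await unlink(tmpPath);
  });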

  await t.test('Streaming parser simulation', async () => {
    performanceTracker.startOperation('streaming-simulation');

    class StreamingXmlParser {
      private buffer = '';
      private tagStack: string[] = [];
      private currentElement: any = null;
      private parsedElements = 0;
      private eventHandlers: Map<string, (element: any) => void> = new Map();

      onElement(tagName: string, handler: (element: any) => void): void {
        this.eventHandlers.set(tagName, handler);
      }

      async parseChunk(chunk: string): Promise<void> {
        this.buffer += chunk;

        // Simple streaming parser simulation
        let tagMatch;
        const tagRegex = /<([^>]+)>([^<]*)/g;

        while ((tagMatch = tagRegex.exec(this.buffer)) !== null) {
          const [fullMatch, tag, content] = tagMatch;

          if (tag.startsWith('/')) {
            // Closing tag
            const tagName = tag.substring(1);
            if (this.tagStack[this.tagStack.length - 1] === tagName) {
              this.tagStack.pop();

              // Emit element event
              if (this.currentElement && this.eventHandlers.has(tagName)) {
                this.eventHandlers.get(tagName)!(this.currentElement);
                this.parsedElements++;
              }

              this.currentElement = null;
            }
          } else if (!tag.endsWith('/')) {
            // Opening tag
            const tagName = tag.split(' ')[0];
            this.tagStack.push(tagName);
            this.currentElement = { tag: tagName, content: content.trim() };
          }
        }

        // Keep unparsed content in buffer
        const lastTagEnd = this.buffer.lastIndexOf('>');
        if (lastTagEnd !== -1) {
          this.buffer = this.buffer.substring(lastTagEnd + 1);
        }
      }

      getStats() {
        return {
          parsedElements: this.parsedElements,
          bufferSize: this.buffer.length,
          stackDepth: this.tagStack.length
        };
      }
    }

    // Test streaming parser
    const parser = new StreamingXmlParser();
    let lineItemCount = 0;
    let totalAmount = 0;

    // Register handlers for specific elements
    parser.onElement('LineItem', (element) => {
      lineItemCount++;
    });

    parser.onElement('Amount', (element) => {
      const amount = parseFloat(element.content);
      if (!isNaN(amount)) {
        totalAmount += amount;
      }
    });

    // Generate and parse in chunks
    const chunkSize = 1024; // 1KB chunks
    const totalItems = 1000;

    console.log(`\nStreaming parse simulation (${totalItems} items in ${chunkSize} byte chunks):`);

    const startTime = performance.now();

    // Generate header
    await parser.parseChunk(`<?xml version="1.0"?>
<Invoice>
  <ID>STREAM-TEST</ID>
  <InvoiceLine>`);

    // Generate items in chunks
    let currentChunk = '';
    for (let i = 1; i <= totalItems; i++) {
      const item = `
    <LineItem>
      <ID>${i}</ID>
      <Description>Item ${i}</Description>
      <Amount>10.00</Amount>
    </LineItem>`;

      currentChunk += item;

      if (currentChunk.length >= chunkSize) {
        await parser.parseChunk(currentChunk);
        currentChunk = '';

        // Log progress every 100 items
        if (i % 100 === 0) {
          const stats = parser.getStats();
          console.log(`  Progress: ${i}/${totalItems} items, buffer: ${stats.bufferSize} bytes`);
        }
      }
    }

    // Parse remaining chunk and footer
    await parser.parseChunk(currentChunk + `
  </InvoiceLine>
</Invoice>`);

    const parseTime = performance.now() - startTime;
    const finalStats = parser.getStats();

    console.log(`\nStreaming results:`);
    console.log(`  Parse time: ${parseTime.toFixed(2)}ms`);
    console.log(`  Line items found: ${lineItemCount}`);
    console.log(`  Total amount sum: ${totalAmount.toFixed(2)}`);
    console.log(`  Elements parsed: ${finalStats.parsedElements}`);
    console.log(`  Parse rate: ${(totalItems / parseTime * 1000).toFixed(0)} items/second`);

    performanceTracker.endOperation('streaming-simulation');
  });
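
  // A hedged sketch of the same idea with a real event-based parser instead of the
  // regex simulation above. Assumption: the 'sax' package is not a declared
  // dependency of this project, so it is imported dynamically and the sketch skips
  // if it is absent. The calls follow sax's documented API
  // (parser(strict), onclosetag, write, close).
  await t.test('Sketch: event-based parsing with the sax package (if installed)', async () => {
    let sax: any;
    try {
      const saxModule: any = await import('sax');
      sax = saxModule.default ?? saxModule;
    } catch {
      console.log('  ⚠️ sax package not installed - skipping sketch');
      return;
    }

    const saxParser = sax.parser(true); // strict mode
    let lineItems = 0;
    saxParser.onclosetag = (tagName: string) => {
      if (tagName === 'LineItem') lineItems++;
    };

    // Feed the document in small slices; sax keeps its own state across writes,
    // so elements split across chunk boundaries are still handled correctly
    const xml = '<Invoice>' + '<LineItem><ID>x</ID></LineItem>'.repeat(100) + '</Invoice>';
    for (let i = 0; i < xml.length; i += 256) {
      saxParser.write(xml.substring(i, i + 256));
    }
    saxParser.close();

    console.log(`  Sketch: sax counted ${lineItems} LineItem elements`);
  });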

  await t.test('Chunked processing patterns', async () => {
    performanceTracker.startOperation('chunked-processing');

    const chunkPatterns = [
      {
        name: 'Fixed size chunks',
        chunkSize: 4096,
        description: 'Process in fixed byte chunks'
      },
      {
        name: 'Line-based chunks',
        chunkSize: 100, // lines
        description: 'Process by number of lines'
      },
      {
        name: 'Element-based chunks',
        chunkSize: 50, // elements
        description: 'Process by complete elements'
      },
      {
        name: 'Memory-based chunks',
        chunkSize: 1024 * 1024, // 1MB
        description: 'Process based on memory limits'
      }
    ];

    for (const pattern of chunkPatterns) {
      console.log(`\n${pattern.name}:`);
      console.log(`  ${pattern.description}`);
      console.log(`  Chunk size: ${pattern.chunkSize}`);

      // Simulate processing
      const startTime = performance.now();
      let chunksProcessed = 0;
      let totalBytes = 0;

      // Process 10 chunks
      for (let i = 0; i < 10; i++) {
        // Simulate chunk processing
        await new Promise(resolve => setTimeout(resolve, 1));
        chunksProcessed++;
        totalBytes += pattern.chunkSize;
      }

      const processTime = performance.now() - startTime;

      console.log(`  Chunks processed: ${chunksProcessed}`);
      console.log(`  Processing rate: ${(totalBytes / processTime * 1000 / 1024).toFixed(2)}KB/s`);

      performanceTracker.recordMetric(`chunk-${pattern.name}`, processTime);
    }

    performanceTracker.endOperation('chunked-processing');
  });
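
  // The "element-based chunks" pattern above only simulates timing. The core
  // mechanic worth spelling out is cutting a buffer at the last complete closing
  // tag so no element is split across handoffs. A self-contained sketch in plain
  // string logic (the helper and tag names are illustrative):
  await t.test('Sketch: cutting chunks at element boundaries', async () => {
    const splitAtElementBoundary = (buffer: string, closingTag: string) => {
      const cut = buffer.lastIndexOf(closingTag);
      if (cut === -1) return { complete: '', rest: buffer }; // no full element yet
      const end = cut + closingTag.length;
      return { complete: buffer.substring(0, end), rest: buffer.substring(end) };
    };

    let pending = '';
    let handoffs = 0;
    const incoming = '<LineItem><ID>1</ID></LineItem><LineItem><ID>2</ID></Li';
    // Deliver the data in two arbitrary slices, as a network or file stream might
    for (const piece of [incoming.substring(0, 20), incoming.substring(20)]) {
      pending += piece;
      const { complete, rest } = splitAtElementBoundary(pending, '</LineItem>');
      if (complete) handoffs++; // only complete elements are handed to the parser
      pending = rest;
    }
    console.log(`  Sketch: ${handoffs} complete handoff(s), ${pending.length} bytes held back`);
  });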

  await t.test('Large corpus file handling', async () => {
    performanceTracker.startOperation('corpus-large-files');

    const corpusLoader = new CorpusLoader();
    const allFiles = await corpusLoader.getFiles(/\.(xml|ubl|cii)$/);

    // Find large files
    const fileSizes = await Promise.all(
      allFiles.map(async (file) => {
        const stats = await plugins.fs.stat(file.path);
        return { file, size: stats.size };
      })
    );

    // Sort by size and get top 10
    const largeFiles = fileSizes
      .sort((a, b) => b.size - a.size)
      .slice(0, 10);

    console.log(`\nLargest files in corpus:`);

    for (const { file, size } of largeFiles) {
      console.log(`  ${file.name}: ${(size / 1024).toFixed(1)}KB`);

      if (size > 100 * 1024) { // Files larger than 100KB
        const startTime = performance.now();
        const startMemory = process.memoryUsage();

        try {
          const content = await plugins.fs.readFile(file.path, 'utf8');
          const invoice = new einvoice.EInvoice();

          if (invoice.fromXmlString) {
            await invoice.fromXmlString(content);

            const parseTime = performance.now() - startTime;
            const endMemory = process.memoryUsage();
            const memoryUsed = (endMemory.heapUsed - startMemory.heapUsed) / 1024 / 1024;

            console.log(`    Parse time: ${parseTime.toFixed(2)}ms`);
            console.log(`    Memory used: ${memoryUsed.toFixed(2)}MB`);
            console.log(`    Parse rate: ${(size / parseTime * 1000 / 1024).toFixed(2)}KB/s`);
          }
        } catch (error) {
          console.log(`    Error: ${error.message}`);
        }

        performanceTracker.recordMetric(`large-file-${file.name}`, performance.now() - startTime);
      }
    }

    performanceTracker.endOperation('corpus-large-files');
  });
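
  // Best practice 1 in the summary below suggests streaming parsers for files
  // over 10MB. A minimal sketch of that routing decision; the threshold constant
  // and strategy names are illustrative (a real gate would take its byte count
  // from plugins.fs.stat, as the corpus test above does):
  await t.test('Sketch: size-gated choice between DOM and streaming parse', async () => {
    const STREAMING_THRESHOLD = 10 * 1024 * 1024; // 10MB, per the best-practice list
    const chooseStrategy = (bytes: number): 'dom' | 'streaming' =>
      bytes > STREAMING_THRESHOLD ? 'streaming' : 'dom';

    for (const size of [50 * 1024, 5 * 1024 * 1024, 50 * 1024 * 1024]) {
      console.log(`  ${(size / 1024 / 1024).toFixed(1)}MB -> ${chooseStrategy(size)} parser`);
    }
  });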

  await t.test('Progressive parsing with callbacks', async () => {
    performanceTracker.startOperation('progressive-parsing');

    class ProgressiveParser {
      private invoiceData: any = {};
      private lineItems: any[] = [];
      private currentPath: string[] = [];

      constructor(
        private onProgress?: (progress: number) => void,
        private onLineItem?: (item: any) => void
      ) {}

      async parse(xml: string): Promise<any> {
        const totalSize = xml.length;
        let processed = 0;
        const chunkSize = 10000;

        // Parse in chunks
        for (let i = 0; i < totalSize; i += chunkSize) {
          const chunk = xml.substring(i, Math.min(i + chunkSize, totalSize));
          await this.processChunk(chunk);

          processed += chunk.length;

          if (this.onProgress) {
            this.onProgress(processed / totalSize * 100);
          }

          // Simulate async processing
          await new Promise(resolve => setImmediate(resolve));
        }

        return {
          invoice: this.invoiceData,
          lineItems: this.lineItems
        };
      }

      private async processChunk(chunk: string): Promise<void> {
        // Simplified parsing - a real implementation would maintain state across
        // chunks so that elements split over a chunk boundary are not lost
        const lineItemMatches = chunk.matchAll(/<LineItem>[\s\S]*?<\/LineItem>/g);

        for (const match of lineItemMatches) {
          const item = this.parseLineItem(match[0]);
          if (item) {
            this.lineItems.push(item);
            if (this.onLineItem) {
              this.onLineItem(item);
            }
          }
        }
      }

      private parseLineItem(xml: string): any {
        const item: any = {};

        const idMatch = xml.match(/<ID>([^<]+)<\/ID>/);
        if (idMatch) item.id = idMatch[1];

        const descMatch = xml.match(/<Description>([^<]+)<\/Description>/);
        if (descMatch) item.description = descMatch[1];

        const amountMatch = xml.match(/<Amount[^>]*>([^<]+)<\/Amount>/);
        if (amountMatch) item.amount = parseFloat(amountMatch[1]);

        return Object.keys(item).length > 0 ? item : null;
      }
    }

    // Test progressive parser
    console.log('\nProgressive parsing test:');

    const largeXml = generateLargeInvoice(500);
    let progressUpdates = 0;
    let itemsFound = 0;

    const parser = new ProgressiveParser(
      (progress) => {
        progressUpdates++;
        if (progress % 20 < 5) { // log near every ~20% step (chunk granularity makes exact hits unlikely)
          console.log(`  Progress: ${progress.toFixed(0)}%`);
        }
      },
      (item) => {
        itemsFound++;
        if (itemsFound % 100 === 0) {
          console.log(`  Found ${itemsFound} items...`);
        }
      }
    );

    const startTime = performance.now();
    const result = await parser.parse(largeXml);
    const parseTime = performance.now() - startTime;

    console.log(`\nProgressive parsing results:`);
    console.log(`  Parse time: ${parseTime.toFixed(2)}ms`);
    console.log(`  Progress updates: ${progressUpdates}`);
    console.log(`  Line items found: ${result.lineItems.length}`);
    console.log(`  Items/second: ${(result.lineItems.length / parseTime * 1000).toFixed(0)}`);

    performanceTracker.endOperation('progressive-parsing');

    // Helper function (a function declaration, so it is hoisted and may be
    // called above its definition)
    function generateLargeInvoice(lineItems: number): string {
      let xml = `<?xml version="1.0" encoding="UTF-8"?>
<Invoice xmlns="urn:oasis:names:specification:ubl:schema:xsd:Invoice-2">
  <ID>LARGE-${lineItems}</ID>
  <IssueDate>2024-01-01</IssueDate>`;

      for (let i = 1; i <= lineItems; i++) {
        xml += `
  <LineItem>
    <ID>${i}</ID>
    <Description>Product Item ${i} with extended description for testing</Description>
    <Quantity>1</Quantity>
    <Amount currencyID="EUR">${(Math.random() * 1000).toFixed(2)}</Amount>
  </LineItem>`;
      }

      xml += '\n</Invoice>';
      return xml;
    }
  });

  await t.test('Stream processing optimization techniques', async () => {
    performanceTracker.startOperation('stream-optimization');

    const optimizations = [
      {
        name: 'Buffer pooling',
        description: 'Reuse buffers to reduce allocation',
        implementation: () => {
          const bufferPool: Buffer[] = [];
          const poolSize = 10;
          const bufferSize = 4096;

          // Pre-allocate buffers
          for (let i = 0; i < poolSize; i++) {
            bufferPool.push(Buffer.allocUnsafe(bufferSize));
          }

          return {
            acquire: () => bufferPool.pop() || Buffer.allocUnsafe(bufferSize),
            release: (buffer: Buffer) => {
              if (bufferPool.length < poolSize) {
                bufferPool.push(buffer);
              }
            }
          };
        }
      },
      {
        name: 'Lazy evaluation',
        description: 'Defer processing until needed',
        implementation: () => {
          const pendingOperations: (() => any)[] = [];

          return {
            defer: (op: () => any) => pendingOperations.push(op),
            evaluate: () => {
              const results = pendingOperations.map(op => op());
              pendingOperations.length = 0;
              return results;
            }
          };
        }
      },
      {
        name: 'Element skipping',
        description: 'Skip unneeded elements during parsing',
        implementation: () => {
          const skipPaths = new Set(['Signature', 'Extension', 'AdditionalInfo']);

          return {
            shouldSkip: (elementPath: string) => {
              return skipPaths.has(elementPath.split('/').pop() || '');
            }
          };
        }
      }
    ];

    for (const opt of optimizations) {
      console.log(`\n${opt.name}:`);
      console.log(`  ${opt.description}`);

      const impl = opt.implementation();

      // Simulate usage
      const startTime = performance.now();

      if ('acquire' in impl) {
        // Buffer pooling test
        for (let i = 0; i < 1000; i++) {
          const buffer = impl.acquire();
          // Use buffer...
          impl.release(buffer);
        }
        console.log('  ✓ Buffer pool working');
      } else if ('defer' in impl) {
        // Lazy evaluation test
        for (let i = 0; i < 100; i++) {
          impl.defer(() => Math.random() * 1000);
        }
        const results = impl.evaluate();
        console.log(`  ✓ Deferred ${results.length} operations`);
      } else if ('shouldSkip' in impl) {
        // Element skipping test
        const testPaths = [
          'Invoice/Signature',
          'Invoice/LineItem/Price',
          'Invoice/Extension'
        ];
        const skipped = testPaths.filter(p => impl.shouldSkip(p));
        console.log(`  ✓ Skipping ${skipped.length} of ${testPaths.length} paths`);
      }

      performanceTracker.recordMetric(`optimization-${opt.name}`, performance.now() - startTime);
    }

    performanceTracker.endOperation('stream-optimization');
  });
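
  // A hedged sketch of one more optimization: Node's core stream pipeline gives
  // backpressure automatically, so a slow stage pauses the producer without any
  // manual buffer management. Only 'node:stream' APIs are used; the fragment
  // contents are illustrative.
  await t.test('Sketch: built-in backpressure via stream.pipeline', async () => {
    const { Readable } = await import('node:stream');
    const { pipeline } = await import('node:stream/promises');

    // Produce 100 small XML fragments lazily
    function* fragments() {
      for (let i = 1; i <= 100; i++) yield `<LineItem><ID>${i}</ID></LineItem>`;
    }

    let consumed = 0;
    await pipeline(
      Readable.from(fragments()),
      // Async-generator stage: pipeline pulls from it only as fast as the sink drains
      async function* (source: AsyncIterable<string>) {
        for await (const fragment of source) {
          consumed++;
          yield fragment; // a real stage would parse the fragment here
        }
      },
      // Sink: drain everything
      async (source: AsyncIterable<string>) => {
        for await (const _ of source) { /* discard */ }
      }
    );
    console.log(`  Sketch: pipeline consumed ${consumed} fragments with backpressure`);
  });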

  // Performance summary
  console.log('\n' + performanceTracker.getSummary());

  // Streaming best practices
  console.log('\nLarge XML Streaming Best Practices:');
  console.log('1. Use streaming parsers for files > 10MB');
  console.log('2. Process data in chunks to control memory usage');
  console.log('3. Implement progress callbacks for user feedback');
  console.log('4. Use buffer pools to reduce allocation overhead');
  console.log('5. Skip unnecessary elements during parsing');
  console.log('6. Monitor memory usage and implement limits');
  console.log('7. Support both streaming and DOM parsing modes');
  console.log('8. Optimize chunk sizes based on document structure');
});

tap.start();