import { expect, tap } from '@git.zone/tstest/tapbundle';
import * as einvoice from '../../../ts/index.js';
import * as plugins from '../../plugins.js';
import { CorpusLoader } from '../../helpers/corpus.loader.js';
import { PerformanceTracker } from '../../helpers/performance.tracker.js';

tap.test('PARSE-06: Large XML Streaming - Handle large files with streaming parsers', async (t) => {
  const performanceTracker = new PerformanceTracker('PARSE-06');

  await t.test('Memory-efficient parsing strategies', async () => {
    performanceTracker.startOperation('memory-strategies');

    // Generate different sized test documents
    const generateLargeInvoice = (lineItems: number): string => {
      let xml = `<?xml version="1.0" encoding="UTF-8"?>
<Invoice xmlns="urn:oasis:names:specification:ubl:schema:xsd:Invoice-2">
  <ID>LARGE-${lineItems}</ID>
  <IssueDate>2024-01-01</IssueDate>
  <InvoiceLine>`;

      for (let i = 1; i <= lineItems; i++) {
        xml += `
    <LineItem>
      <ID>${i}</ID>
      <Description>Product Item ${i} with a reasonably long description to increase document size</Description>
      <Quantity>1</Quantity>
      <Price>
        <Amount currencyID="EUR">${(Math.random() * 1000).toFixed(2)}</Amount>
      </Price>
      <AllowanceCharge>
        <ChargeIndicator>false</ChargeIndicator>
        <Amount currencyID="EUR">${(Math.random() * 10).toFixed(2)}</Amount>
      </AllowanceCharge>
    </LineItem>`;
      }

      xml += `
  </InvoiceLine>
</Invoice>`;
      return xml;
    };

    const testSizes = [
      { items: 100, expectedSize: '~50KB' },
      { items: 1000, expectedSize: '~500KB' },
      { items: 5000, expectedSize: '~2.5MB' },
      { items: 10000, expectedSize: '~5MB' }
    ];

    for (const test of testSizes) {
      const startTime = performance.now();
      const startMemory = process.memoryUsage();

      const largeXml = generateLargeInvoice(test.items);
      const xmlSize = Buffer.byteLength(largeXml, 'utf8');

      console.log(`\nTesting ${test.items} line items (${test.expectedSize}, actual: ${(xmlSize / 1024).toFixed(1)}KB):`);

      try {
        const invoice = new einvoice.EInvoice();

        if (invoice.fromXmlString) {
          await invoice.fromXmlString(largeXml);

          const endMemory = process.memoryUsage();
          const memoryDelta = {
            heapUsed: (endMemory.heapUsed - startMemory.heapUsed) / 1024 / 1024,
            external: (endMemory.external - startMemory.external) / 1024 / 1024
          };

          const parseTime = performance.now() - startTime;

          console.log(`  Parse time: ${parseTime.toFixed(2)}ms`);
          console.log(`  Memory delta: ${memoryDelta.heapUsed.toFixed(2)}MB heap, ${memoryDelta.external.toFixed(2)}MB external`);
          console.log(`  Parse rate: ${(xmlSize / parseTime * 1000 / 1024 / 1024).toFixed(2)}MB/s`);

          // Check if memory usage is reasonable
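          // (DOM-style parsing commonly needs a few multiples of the raw document
          // size in memory; the 5x threshold below is a heuristic, not a hard limit.)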
          const memoryRatio = memoryDelta.heapUsed / (xmlSize / 1024 / 1024);
          console.log(`  Memory ratio: ${memoryRatio.toFixed(2)}x document size`);

          if (memoryRatio > 5) {
            console.log('  ⚠️ High memory usage detected');
          } else {
            console.log('  ✓ Memory usage acceptable');
          }
        } else {
          console.log('  ⚠️ fromXmlString not implemented');
        }
      } catch (error) {
        console.log(`  ✗ Parse error: ${error.message}`);
      }

      performanceTracker.recordMetric(`parse-${test.items}-items`, performance.now() - startTime);

      // Force garbage collection if available
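      // (global.gc is only defined when Node is started with --expose-gc)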
      if (global.gc) {
        global.gc();
      }
    }

    performanceTracker.endOperation('memory-strategies');
  });

  await t.test('Streaming parser simulation', async () => {
    performanceTracker.startOperation('streaming-simulation');

    class StreamingXmlParser {
      private buffer = '';
      private tagStack: string[] = [];
      private currentElement: any = null;
      private parsedElements = 0;
      private eventHandlers: Map<string, (element: any) => void> = new Map();

      onElement(tagName: string, handler: (element: any) => void): void {
        this.eventHandlers.set(tagName, handler);
      }

      async parseChunk(chunk: string): Promise<void> {
        this.buffer += chunk;

        // Simple streaming parser simulation
        let tagMatch;
        const tagRegex = /<([^>]+)>([^<]*)/g;

        while ((tagMatch = tagRegex.exec(this.buffer)) !== null) {
          const [fullMatch, tag, content] = tagMatch;

          if (tag.startsWith('/')) {
            // Closing tag
            const tagName = tag.substring(1);
            if (this.tagStack[this.tagStack.length - 1] === tagName) {
              this.tagStack.pop();

              // Emit element event
              if (this.currentElement && this.eventHandlers.has(tagName)) {
                this.eventHandlers.get(tagName)!(this.currentElement);
                this.parsedElements++;
              }

              this.currentElement = null;
            }
          } else if (!tag.endsWith('/') && !tag.startsWith('?') && !tag.startsWith('!')) {
            // Opening tag (XML declarations like <?xml ...?>, comments, and
            // self-closing tags are skipped so they never land on the tag stack)
            const tagName = tag.split(' ')[0];
            this.tagStack.push(tagName);
            this.currentElement = { tag: tagName, content: content.trim() };
          }
        }

        // Keep unparsed content in buffer
        const lastTagEnd = this.buffer.lastIndexOf('>');
        if (lastTagEnd !== -1) {
          this.buffer = this.buffer.substring(lastTagEnd + 1);
        }
      }

      getStats() {
        return {
          parsedElements: this.parsedElements,
          bufferSize: this.buffer.length,
          stackDepth: this.tagStack.length
        };
      }
    }
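
    // Note: this regex-based parser is only a simulation; a production streaming
    // parser would use a real SAX-style library (e.g. the 'sax' package on npm).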
    // Test streaming parser
    const parser = new StreamingXmlParser();
    let lineItemCount = 0;
    let totalAmount = 0;

    // Register handlers for specific elements
    parser.onElement('LineItem', (element) => {
      lineItemCount++;
    });

    parser.onElement('Amount', (element) => {
      const amount = parseFloat(element.content);
      if (!isNaN(amount)) {
        totalAmount += amount;
      }
    });

    // Generate and parse in chunks
    const chunkSize = 1024; // 1KB chunks
    const totalItems = 1000;

    console.log(`\nStreaming parse simulation (${totalItems} items in ${chunkSize} byte chunks):`);

    const startTime = performance.now();

    // Generate header
    await parser.parseChunk(`<?xml version="1.0"?>
<Invoice>
  <ID>STREAM-TEST</ID>
  <InvoiceLine>`);

    // Generate items in chunks
    let currentChunk = '';
    for (let i = 1; i <= totalItems; i++) {
      const item = `
    <LineItem>
      <ID>${i}</ID>
      <Description>Item ${i}</Description>
      <Amount>10.00</Amount>
    </LineItem>`;

      currentChunk += item;

      if (currentChunk.length >= chunkSize) {
        await parser.parseChunk(currentChunk);
        currentChunk = '';

        // Log progress every 100 items
        if (i % 100 === 0) {
          const stats = parser.getStats();
          console.log(`  Progress: ${i}/${totalItems} items, buffer: ${stats.bufferSize} bytes`);
        }
      }
    }

    // Parse remaining chunk and footer
    await parser.parseChunk(currentChunk + `
  </InvoiceLine>
</Invoice>`);

    const parseTime = performance.now() - startTime;
    const finalStats = parser.getStats();

    console.log(`\nStreaming results:`);
    console.log(`  Parse time: ${parseTime.toFixed(2)}ms`);
    console.log(`  Line items found: ${lineItemCount}`);
    console.log(`  Total amount sum: ${totalAmount.toFixed(2)}`);
    console.log(`  Elements parsed: ${finalStats.parsedElements}`);
    console.log(`  Parse rate: ${(totalItems / parseTime * 1000).toFixed(0)} items/second`);

    performanceTracker.endOperation('streaming-simulation');
  });

  await t.test('Chunked processing patterns', async () => {
    performanceTracker.startOperation('chunked-processing');

    const chunkPatterns = [
      {
        name: 'Fixed size chunks',
        chunkSize: 4096,
        description: 'Process in fixed byte chunks'
      },
      {
        name: 'Line-based chunks',
        chunkSize: 100, // lines
        description: 'Process by number of lines'
      },
      {
        name: 'Element-based chunks',
        chunkSize: 50, // elements
        description: 'Process by complete elements'
      },
      {
        name: 'Memory-based chunks',
        chunkSize: 1024 * 1024, // 1MB
        description: 'Process based on memory limits'
      }
    ];
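
    // (A concrete sketch of the fixed-size pattern over a real byte stream
    // follows this test block.)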
    for (const pattern of chunkPatterns) {
      console.log(`\n${pattern.name}:`);
      console.log(`  ${pattern.description}`);
      console.log(`  Chunk size: ${pattern.chunkSize}`);

      // Simulate processing
      const startTime = performance.now();
      let chunksProcessed = 0;
      let totalBytes = 0;

      // Process 10 chunks
      for (let i = 0; i < 10; i++) {
        // Simulate chunk processing
        await new Promise(resolve => setTimeout(resolve, 1));
        chunksProcessed++;
        totalBytes += pattern.chunkSize;
      }

      const processTime = performance.now() - startTime;

      console.log(`  Chunks processed: ${chunksProcessed}`);
      console.log(`  Processing rate: ${(totalBytes / processTime * 1000 / 1024).toFixed(2)}KB/s`);

      performanceTracker.recordMetric(`chunk-${pattern.name}`, processTime);
    }

    performanceTracker.endOperation('chunked-processing');
  });
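
  // Illustrative sketch (not exercised by the tests above): the 'Fixed size chunks'
  // pattern over a real byte stream. Any AsyncIterable<Buffer> works as a source,
  // e.g. fs.createReadStream(path, { highWaterMark: 4096 }). The function name and
  // callback shape are assumptions for illustration, not part of the einvoice API.
  async function processInFixedChunks(
    source: AsyncIterable<Buffer>,
    onChunk: (chunk: string) => Promise<void>
  ): Promise<number> {
    let totalBytes = 0;
    for await (const chunk of source) {
      totalBytes += chunk.length;
      // Hand each fixed-size chunk to a streaming parser (e.g. parseChunk above).
      // A production version would decode via string_decoder to avoid splitting
      // multi-byte UTF-8 characters at chunk boundaries.
      await onChunk(chunk.toString('utf8'));
    }
    return totalBytes;
  }
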
  await t.test('Large corpus file handling', async () => {
    performanceTracker.startOperation('corpus-large-files');

    const corpusLoader = new CorpusLoader();
    const allFiles = await corpusLoader.getFiles(/\.(xml|ubl|cii)$/);

    // Find large files
    const fileSizes = await Promise.all(
      allFiles.map(async (file) => {
        const stats = await plugins.fs.stat(file.path);
        return { file, size: stats.size };
      })
    );

    // Sort by size and get top 10
    const largeFiles = fileSizes
      .sort((a, b) => b.size - a.size)
      .slice(0, 10);

    console.log(`\nLargest files in corpus:`);

    for (const { file, size } of largeFiles) {
      console.log(`  ${file.name}: ${(size / 1024).toFixed(1)}KB`);

      if (size > 100 * 1024) { // Files larger than 100KB
        const startTime = performance.now();
        const startMemory = process.memoryUsage();

        try {
          const content = await plugins.fs.readFile(file.path, 'utf8');
          const invoice = new einvoice.EInvoice();

          if (invoice.fromXmlString) {
            await invoice.fromXmlString(content);

            const parseTime = performance.now() - startTime;
            const endMemory = process.memoryUsage();
            const memoryUsed = (endMemory.heapUsed - startMemory.heapUsed) / 1024 / 1024;

            console.log(`    Parse time: ${parseTime.toFixed(2)}ms`);
            console.log(`    Memory used: ${memoryUsed.toFixed(2)}MB`);
            console.log(`    Parse rate: ${(size / parseTime * 1000 / 1024).toFixed(2)}KB/s`);
          }
        } catch (error) {
          console.log(`    Error: ${error.message}`);
        }

        performanceTracker.recordMetric(`large-file-${file.name}`, performance.now() - startTime);
      }
    }

    performanceTracker.endOperation('corpus-large-files');
  });

  await t.test('Progressive parsing with callbacks', async () => {
    performanceTracker.startOperation('progressive-parsing');

    class ProgressiveParser {
      private invoiceData: any = {};
      private lineItems: any[] = [];
      private currentPath: string[] = [];

      constructor(
        private onProgress?: (progress: number) => void,
        private onLineItem?: (item: any) => void
      ) {}

      async parse(xml: string): Promise<any> {
        const totalSize = xml.length;
        let processed = 0;
        const chunkSize = 10000;

        // Parse in chunks
        for (let i = 0; i < totalSize; i += chunkSize) {
          const chunk = xml.substring(i, Math.min(i + chunkSize, totalSize));
          await this.processChunk(chunk);

          processed += chunk.length;

          if (this.onProgress) {
            this.onProgress(processed / totalSize * 100);
          }

          // Simulate async processing
          await new Promise(resolve => setImmediate(resolve));
        }

        return {
          invoice: this.invoiceData,
          lineItems: this.lineItems
        };
      }

      private async processChunk(chunk: string): Promise<void> {
        // Simplified parsing - in reality would maintain state across chunks
        const lineItemMatches = chunk.matchAll(/<LineItem>[\s\S]*?<\/LineItem>/g);

        for (const match of lineItemMatches) {
          const item = this.parseLineItem(match[0]);
          if (item) {
            this.lineItems.push(item);
            if (this.onLineItem) {
              this.onLineItem(item);
            }
          }
        }
      }

      private parseLineItem(xml: string): any {
        const item: any = {};

        const idMatch = xml.match(/<ID>([^<]+)<\/ID>/);
        if (idMatch) item.id = idMatch[1];

        const descMatch = xml.match(/<Description>([^<]+)<\/Description>/);
        if (descMatch) item.description = descMatch[1];

        const amountMatch = xml.match(/<Amount[^>]*>([^<]+)<\/Amount>/);
        if (amountMatch) item.amount = parseFloat(amountMatch[1]);

        return Object.keys(item).length > 0 ? item : null;
      }
    }
    // Test progressive parser
    console.log('\nProgressive parsing test:');

    const largeXml = generateLargeInvoice(500);
    let progressUpdates = 0;
    let itemsFound = 0;

    const parser = new ProgressiveParser(
      (progress) => {
        progressUpdates++;
        if (progress % 20 < 5) { // Log at ~20% intervals
          console.log(`  Progress: ${progress.toFixed(0)}%`);
        }
      },
      (item) => {
        itemsFound++;
        if (itemsFound % 100 === 0) {
          console.log(`  Found ${itemsFound} items...`);
        }
      }
    );

    const startTime = performance.now();
    const result = await parser.parse(largeXml);
    const parseTime = performance.now() - startTime;

    console.log(`\nProgressive parsing results:`);
    console.log(`  Parse time: ${parseTime.toFixed(2)}ms`);
    console.log(`  Progress updates: ${progressUpdates}`);
    console.log(`  Line items found: ${result.lineItems.length}`);
    console.log(`  Items/second: ${(result.lineItems.length / parseTime * 1000).toFixed(0)}`);

    performanceTracker.endOperation('progressive-parsing');

    // Helper function (a hoisted declaration, so it is already available to the
    // generateLargeInvoice(500) call above)
    function generateLargeInvoice(lineItems: number): string {
      let xml = `<?xml version="1.0" encoding="UTF-8"?>
<Invoice xmlns="urn:oasis:names:specification:ubl:schema:xsd:Invoice-2">
  <ID>LARGE-${lineItems}</ID>
  <IssueDate>2024-01-01</IssueDate>`;

      for (let i = 1; i <= lineItems; i++) {
        xml += `
  <LineItem>
    <ID>${i}</ID>
    <Description>Product Item ${i} with extended description for testing</Description>
    <Quantity>1</Quantity>
    <Amount currencyID="EUR">${(Math.random() * 1000).toFixed(2)}</Amount>
  </LineItem>`;
      }

      xml += '\n</Invoice>';
      return xml;
    }
  });
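
  // Illustrative sketch of the cross-chunk state that ProgressiveParser above
  // deliberately skips: keep the tail after the last complete </LineItem> and
  // prepend it to the next chunk, so items split across chunk boundaries are
  // not lost. The function name and shape are assumptions for illustration only.
  function extractCompleteLineItems(carry: string, chunk: string): { items: string[]; rest: string } {
    const text = carry + chunk;
    const items = [...text.matchAll(/<LineItem>[\s\S]*?<\/LineItem>/g)].map(m => m[0]);
    const lastEnd = text.lastIndexOf('</LineItem>');
    return {
      items,
      // Everything after the last complete item is carried into the next call
      rest: lastEnd === -1 ? text : text.substring(lastEnd + '</LineItem>'.length)
    };
  }
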
  await t.test('Stream processing optimization techniques', async () => {
    performanceTracker.startOperation('stream-optimization');

    const optimizations = [
      {
        name: 'Buffer pooling',
        description: 'Reuse buffers to reduce allocation',
        implementation: () => {
          const bufferPool: Buffer[] = [];
          const poolSize = 10;
          const bufferSize = 4096;

          // Pre-allocate buffers
          for (let i = 0; i < poolSize; i++) {
            bufferPool.push(Buffer.allocUnsafe(bufferSize));
          }

          return {
            acquire: () => bufferPool.pop() || Buffer.allocUnsafe(bufferSize),
            release: (buffer: Buffer) => {
              if (bufferPool.length < poolSize) {
                bufferPool.push(buffer);
              }
            }
          };
        }
      },
      {
        name: 'Lazy evaluation',
        description: 'Defer processing until needed',
        implementation: () => {
          const pendingOperations: (() => any)[] = [];

          return {
            defer: (op: () => any) => pendingOperations.push(op),
            evaluate: () => {
              const results = pendingOperations.map(op => op());
              pendingOperations.length = 0;
              return results;
            }
          };
        }
      },
      {
        name: 'Element skipping',
        description: 'Skip unneeded elements during parsing',
        implementation: () => {
          const skipPaths = new Set(['Signature', 'Extension', 'AdditionalInfo']);

          return {
            shouldSkip: (elementPath: string) => {
              return skipPaths.has(elementPath.split('/').pop() || '');
            }
          };
        }
      }
    ];

    for (const opt of optimizations) {
      console.log(`\n${opt.name}:`);
      console.log(`  ${opt.description}`);

      const impl = opt.implementation();

      // Simulate usage
      const startTime = performance.now();

      if ('acquire' in impl) {
        // Buffer pooling test
        for (let i = 0; i < 1000; i++) {
          const buffer = impl.acquire();
          // Use buffer...
          impl.release(buffer);
        }
        console.log('  ✓ Buffer pool working');
      } else if ('defer' in impl) {
        // Lazy evaluation test
        for (let i = 0; i < 100; i++) {
          impl.defer(() => Math.random() * 1000);
        }
        const results = impl.evaluate();
        console.log(`  ✓ Deferred ${results.length} operations`);
      } else if ('shouldSkip' in impl) {
        // Element skipping test
        const testPaths = [
          'Invoice/Signature',
          'Invoice/LineItem/Price',
          'Invoice/Extension'
        ];
        const skipped = testPaths.filter(p => impl.shouldSkip(p));
        console.log(`  ✓ Skipping ${skipped.length} of ${testPaths.length} paths`);
      }

      performanceTracker.recordMetric(`optimization-${opt.name}`, performance.now() - startTime);
    }

    performanceTracker.endOperation('stream-optimization');
  });

  // Performance summary
  console.log('\n' + performanceTracker.getSummary());

  // Streaming best practices
  console.log('\nLarge XML Streaming Best Practices:');
  console.log('1. Use streaming parsers for files > 10MB');
  console.log('2. Process data in chunks to control memory usage');
  console.log('3. Implement progress callbacks for user feedback');
  console.log('4. Use buffer pools to reduce allocation overhead');
  console.log('5. Skip unnecessary elements during parsing');
  console.log('6. Monitor memory usage and implement limits');
  console.log('7. Support both streaming and DOM parsing modes');
  console.log('8. Optimize chunk sizes based on document structure');
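
  // Minimal sketch for practice 6 above (monitor memory usage and enforce a
  // limit); the budget value and error handling are assumptions, not einvoice
  // behavior.
  function assertMemoryBudget(startHeapBytes: number, budgetMb = 256): void {
    const grownMb = (process.memoryUsage().heapUsed - startHeapBytes) / 1024 / 1024;
    if (grownMb > budgetMb) {
      // A real parser would abort the stream here rather than throw blindly
      throw new Error(`Memory budget exceeded: ${grownMb.toFixed(1)}MB > ${budgetMb}MB`);
    }
  }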
});

tap.start();