import { expect, tap } from '@git.zone/tstest/tapbundle';
import * as einvoice from '../../../ts/index.js';
import * as plugins from '../../plugins.js';
import { CorpusLoader } from '../../helpers/corpus.loader.js';
import { PerformanceTracker } from '../../helpers/performance.tracker.js';
tap.test('PARSE-06: Large XML Streaming - Handle large files with streaming parsers', async (t) => {
const performanceTracker = new PerformanceTracker('PARSE-06');
await t.test('Memory-efficient parsing strategies', async () => {
performanceTracker.startOperation('memory-strategies');
// Generate different sized test documents
const generateLargeInvoice = (lineItems: number): string => {
let xml = `<?xml version="1.0" encoding="UTF-8"?>
<Invoice>
<ID>LARGE-${lineItems}</ID>
<IssueDate>2024-01-01</IssueDate>
<Lines>`;
for (let i = 1; i <= lineItems; i++) {
xml += `
<LineItem>
<ID>${i}</ID>
<Description>Product Item ${i} with a reasonably long description to increase document size</Description>
<Quantity>1</Quantity>
<Amount>${(Math.random() * 1000).toFixed(2)}</Amount>
<ChargeIndicator>false</ChargeIndicator>
<TaxAmount>${(Math.random() * 10).toFixed(2)}</TaxAmount>
</LineItem>`;
}
xml += `
</Lines>
</Invoice>`;
return xml;
};
const testSizes = [
{ items: 100, expectedSize: '~50KB' },
{ items: 1000, expectedSize: '~500KB' },
{ items: 5000, expectedSize: '~2.5MB' },
{ items: 10000, expectedSize: '~5MB' }
];
for (const test of testSizes) {
const startTime = performance.now();
const startMemory = process.memoryUsage();
const largeXml = generateLargeInvoice(test.items);
const xmlSize = Buffer.byteLength(largeXml, 'utf8');
console.log(`\nTesting ${test.items} line items (${test.expectedSize}, actual: ${(xmlSize/1024).toFixed(1)}KB):`);
try {
const invoice = new einvoice.EInvoice();
if (invoice.fromXmlString) {
await invoice.fromXmlString(largeXml);
const endMemory = process.memoryUsage();
const memoryDelta = {
heapUsed: (endMemory.heapUsed - startMemory.heapUsed) / 1024 / 1024,
external: (endMemory.external - startMemory.external) / 1024 / 1024
};
const parseTime = performance.now() - startTime;
console.log(` Parse time: ${parseTime.toFixed(2)}ms`);
console.log(` Memory delta: ${memoryDelta.heapUsed.toFixed(2)}MB heap, ${memoryDelta.external.toFixed(2)}MB external`);
console.log(` Parse rate: ${(xmlSize / parseTime * 1000 / 1024 / 1024).toFixed(2)}MB/s`);
// Check if memory usage is reasonable
const memoryRatio = memoryDelta.heapUsed / (xmlSize / 1024 / 1024);
console.log(` Memory ratio: ${memoryRatio.toFixed(2)}x document size`);
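// Heuristic threshold: DOM-style parsers commonly hold several times the
// serialized size in memory (node objects, strings, indexes), so >5x is flagged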
if (memoryRatio > 5) {
console.log(' ⚠️ High memory usage detected');
} else {
console.log(' ✓ Memory usage acceptable');
}
} else {
console.log(' ⚠️ fromXmlString not implemented');
}
} catch (error) {
console.log(` ✗ Parse error: ${error.message}`);
}
performanceTracker.recordMetric(`parse-${test.items}-items`, performance.now() - startTime);
// Force garbage collection if available
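// (global.gc is only defined when Node is started with --expose-gc)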
if (global.gc) {
global.gc();
}
}
performanceTracker.endOperation('memory-strategies');
});
await t.test('Streaming parser simulation', async () => {
performanceTracker.startOperation('streaming-simulation');
class StreamingXmlParser {
private buffer = '';
private tagStack: string[] = [];
private currentElement: any = null;
private parsedElements = 0;
private eventHandlers: Map<string, (element: any) => void> = new Map();
onElement(tagName: string, handler: (element: any) => void): void {
this.eventHandlers.set(tagName, handler);
}
async parseChunk(chunk: string): Promise<void> {
this.buffer += chunk;
// Simple streaming parser simulation
let tagMatch;
const tagRegex = /<([^>]+)>([^<]*)/g;
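// Note: this regex tokenizer is deliberately naive - it ignores CDATA,
// comments, processing instructions, and attributes containing '>'.
// It exists only to exercise chunked feeding, not to be a real XML parser.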
while ((tagMatch = tagRegex.exec(this.buffer)) !== null) {
const [fullMatch, tag, content] = tagMatch;
if (tag.startsWith('/')) {
// Closing tag
const tagName = tag.substring(1);
if (this.tagStack[this.tagStack.length - 1] === tagName) {
this.tagStack.pop();
// Emit element event; container elements (e.g. LineItem) close after
// their children, so fall back to an empty payload for them
if (this.eventHandlers.has(tagName)) {
this.eventHandlers.get(tagName)!(this.currentElement ?? { tag: tagName, content: '' });
this.parsedElements++;
}
this.currentElement = null;
}
} else if (!tag.endsWith('/')) {
// Opening tag
const tagName = tag.split(' ')[0];
this.tagStack.push(tagName);
this.currentElement = { tag: tagName, content: content.trim() };
}
}
// Keep unparsed content in buffer
const lastTagEnd = this.buffer.lastIndexOf('>');
if (lastTagEnd !== -1) {
this.buffer = this.buffer.substring(lastTagEnd + 1);
}
}
getStats() {
return {
parsedElements: this.parsedElements,
bufferSize: this.buffer.length,
stackDepth: this.tagStack.length
};
}
}
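// In a real pipeline the same parser could sit behind a Node Writable so a
// file stream drives parseChunk with backpressure. A minimal sketch, assuming
// this StreamingXmlParser and node:fs/node:stream (not executed by the suite):
//
//   import { Writable } from 'node:stream';
//   import { createReadStream } from 'node:fs';
//
//   const toWritable = (p: StreamingXmlParser) => new Writable({
//     write(chunk, _enc, done) {
//       p.parseChunk(chunk.toString('utf8')).then(() => done(), done);
//     }
//   });
//   createReadStream('large-invoice.xml').pipe(toWritable(parser));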
// Test streaming parser
const parser = new StreamingXmlParser();
let lineItemCount = 0;
let totalAmount = 0;
// Register handlers for specific elements
parser.onElement('LineItem', (element) => {
lineItemCount++;
});
parser.onElement('Amount', (element) => {
const amount = parseFloat(element.content);
if (!isNaN(amount)) {
totalAmount += amount;
}
});
// Generate and parse in chunks
const chunkSize = 1024; // 1KB chunks
const totalItems = 1000;
console.log(`\nStreaming parse simulation (${totalItems} items in ${chunkSize} byte chunks):`);
const startTime = performance.now();
// Generate header
await parser.parseChunk(`<Invoice>
<ID>STREAM-TEST</ID>
<Lines>`);
// Generate items in chunks
let currentChunk = '';
for (let i = 1; i <= totalItems; i++) {
const item = `
<LineItem>
<ID>${i}</ID>
<Description>Item ${i}</Description>
<Amount>10.00</Amount>
</LineItem>`;
currentChunk += item;
if (currentChunk.length >= chunkSize) {
await parser.parseChunk(currentChunk);
currentChunk = '';
// Log progress every 100 items
if (i % 100 === 0) {
const stats = parser.getStats();
console.log(` Progress: ${i}/${totalItems} items, buffer: ${stats.bufferSize} bytes`);
}
}
}
// Parse remaining chunk and footer
await parser.parseChunk(currentChunk + `
</Lines>
</Invoice>`);
const parseTime = performance.now() - startTime;
const finalStats = parser.getStats();
console.log(`\nStreaming results:`);
console.log(` Parse time: ${parseTime.toFixed(2)}ms`);
console.log(` Line items found: ${lineItemCount}`);
console.log(` Total amount sum: ${totalAmount.toFixed(2)}`);
console.log(` Elements parsed: ${finalStats.parsedElements}`);
console.log(` Parse rate: ${(totalItems / parseTime * 1000).toFixed(0)} items/second`);
performanceTracker.endOperation('streaming-simulation');
});
await t.test('Chunked processing patterns', async () => {
performanceTracker.startOperation('chunked-processing');
const chunkPatterns = [
{
name: 'Fixed size chunks',
chunkSize: 4096,
description: 'Process in fixed byte chunks'
},
{
name: 'Line-based chunks',
chunkSize: 100, // lines
description: 'Process by number of lines'
},
{
name: 'Element-based chunks',
chunkSize: 50, // elements
description: 'Process by complete elements'
},
{
name: 'Memory-based chunks',
chunkSize: 1024 * 1024, // 1MB
description: 'Process based on memory limits'
}
];
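// The loop below only simulates work per chunk. For illustration, real
// element-based chunking could buffer until complete element boundaries;
// a hypothetical helper (not used by this test) might look like:
//
//   function* elementChunks(xml: string, tag: string, perChunk: number) {
//     const closer = `</${tag}>`;
//     let pos = 0, count = 0, chunkStart = 0;
//     while ((pos = xml.indexOf(closer, pos)) !== -1) {
//       pos += closer.length;
//       if (++count % perChunk === 0) {
//         yield xml.slice(chunkStart, pos);
//         chunkStart = pos;
//       }
//     }
//     if (chunkStart < xml.length) yield xml.slice(chunkStart); // trailing rest
//   }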
for (const pattern of chunkPatterns) {
console.log(`\n${pattern.name}:`);
console.log(` ${pattern.description}`);
console.log(` Chunk size: ${pattern.chunkSize}`);
// Simulate processing
const startTime = performance.now();
let chunksProcessed = 0;
let totalBytes = 0;
// Process 10 chunks
for (let i = 0; i < 10; i++) {
// Simulate chunk processing
await new Promise(resolve => setTimeout(resolve, 1));
chunksProcessed++;
totalBytes += pattern.chunkSize;
}
const processTime = performance.now() - startTime;
console.log(` Chunks processed: ${chunksProcessed}`);
console.log(` Processing rate: ${(totalBytes / processTime * 1000 / 1024).toFixed(2)}KB/s`);
performanceTracker.recordMetric(`chunk-${pattern.name}`, processTime);
}
performanceTracker.endOperation('chunked-processing');
});
await t.test('Large corpus file handling', async () => {
performanceTracker.startOperation('corpus-large-files');
const corpusLoader = new CorpusLoader();
const allFiles = await corpusLoader.getFiles(/\.(xml|ubl|cii)$/);
// Find large files
const fileSizes = await Promise.all(
allFiles.map(async (file) => {
const stats = await plugins.fs.stat(file.path);
return { file, size: stats.size };
})
);
// Sort by size and get top 10
const largeFiles = fileSizes
.sort((a, b) => b.size - a.size)
.slice(0, 10);
console.log(`\nLargest files in corpus:`);
for (const { file, size } of largeFiles) {
console.log(` ${file.name}: ${(size / 1024).toFixed(1)}KB`);
if (size > 100 * 1024) { // Files larger than 100KB
const startTime = performance.now();
const startMemory = process.memoryUsage();
try {
const content = await plugins.fs.readFile(file.path, 'utf8');
const invoice = new einvoice.EInvoice();
if (invoice.fromXmlString) {
await invoice.fromXmlString(content);
const parseTime = performance.now() - startTime;
const endMemory = process.memoryUsage();
const memoryUsed = (endMemory.heapUsed - startMemory.heapUsed) / 1024 / 1024;
console.log(` Parse time: ${parseTime.toFixed(2)}ms`);
console.log(` Memory used: ${memoryUsed.toFixed(2)}MB`);
console.log(` Parse rate: ${(size / parseTime * 1000 / 1024).toFixed(2)}KB/s`);
}
} catch (error) {
console.log(` Error: ${error.message}`);
}
performanceTracker.recordMetric(`large-file-${file.name}`, performance.now() - startTime);
}
}
performanceTracker.endOperation('corpus-large-files');
});
await t.test('Progressive parsing with callbacks', async () => {
performanceTracker.startOperation('progressive-parsing');
class ProgressiveParser {
private invoiceData: any = {};
private lineItems: any[] = [];
private currentPath: string[] = [];
constructor(
private onProgress?: (progress: number) => void,
private onLineItem?: (item: any) => void
) {}
async parse(xml: string): Promise<{ invoice: any; lineItems: any[] }> {
const totalSize = xml.length;
let processed = 0;
const chunkSize = 10000;
// Parse in chunks
for (let i = 0; i < totalSize; i += chunkSize) {
const chunk = xml.substring(i, Math.min(i + chunkSize, totalSize));
await this.processChunk(chunk);
processed += chunk.length;
if (this.onProgress) {
this.onProgress(processed / totalSize * 100);
}
// Yield to the event loop so progress callbacks and other work can run
await new Promise(resolve => setImmediate(resolve));
}
return {
invoice: this.invoiceData,
lineItems: this.lineItems
};
}
private async processChunk(chunk: string): Promise<void> {
// Simplified parsing - in reality would maintain state across chunks
const lineItemMatches = chunk.matchAll(/<LineItem>[\s\S]*?<\/LineItem>/g);
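// Known simplification: a LineItem straddling a chunk boundary is seen by
// neither chunk, so the reported item count can fall slightly short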
for (const match of lineItemMatches) {
const item = this.parseLineItem(match[0]);
if (item) {
this.lineItems.push(item);
if (this.onLineItem) {
this.onLineItem(item);
}
}
}
}
private parseLineItem(xml: string): any {
const item: any = {};
const idMatch = xml.match(/<ID>([^<]+)<\/ID>/);
if (idMatch) item.id = idMatch[1];
const descMatch = xml.match(/<Description>([^<]+)<\/Description>/);
if (descMatch) item.description = descMatch[1];
const amountMatch = xml.match(/<Amount[^>]*>([^<]+)<\/Amount>/);
if (amountMatch) item.amount = parseFloat(amountMatch[1]);
return Object.keys(item).length > 0 ? item : null;
}
}
// Test progressive parser
console.log('\nProgressive parsing test:');
const largeXml = generateLargeInvoice(500);
let progressUpdates = 0;
let itemsFound = 0;
const parser = new ProgressiveParser(
(progress) => {
progressUpdates++;
if (progress % 20 < 5) { // Log at ~20% intervals
console.log(` Progress: ${progress.toFixed(0)}%`);
}
},
(item) => {
itemsFound++;
if (itemsFound % 100 === 0) {
console.log(` Found ${itemsFound} items...`);
}
}
);
const startTime = performance.now();
const result = await parser.parse(largeXml);
const parseTime = performance.now() - startTime;
console.log(`\nProgressive parsing results:`);
console.log(` Parse time: ${parseTime.toFixed(2)}ms`);
console.log(` Progress updates: ${progressUpdates}`);
console.log(` Line items found: ${result.lineItems.length}`);
console.log(` Items/second: ${(result.lineItems.length / parseTime * 1000).toFixed(0)}`);
performanceTracker.endOperation('progressive-parsing');
// Helper function (a function declaration, so it is hoisted and usable above)
function generateLargeInvoice(lineItems: number): string {
let xml = `<?xml version="1.0" encoding="UTF-8"?>
<Invoice>
<ID>LARGE-${lineItems}</ID>
<IssueDate>2024-01-01</IssueDate>`;
for (let i = 1; i <= lineItems; i++) {
xml += `
<LineItem>
<ID>${i}</ID>
<Description>Product Item ${i} with extended description for testing</Description>
<Quantity>1</Quantity>
<Amount>${(Math.random() * 1000).toFixed(2)}</Amount>
</LineItem>`;
}
xml += '\n</Invoice>';
return xml;
}
});
await t.test('Stream processing optimization techniques', async () => {
performanceTracker.startOperation('stream-optimization');
const optimizations = [
{
name: 'Buffer pooling',
description: 'Reuse buffers to reduce allocation',
implementation: () => {
const bufferPool: Buffer[] = [];
const poolSize = 10;
const bufferSize = 4096;
// Pre-allocate buffers
for (let i = 0; i < poolSize; i++) {
bufferPool.push(Buffer.allocUnsafe(bufferSize));
}
return {
acquire: () => bufferPool.pop() || Buffer.allocUnsafe(bufferSize),
release: (buffer: Buffer) => {
if (bufferPool.length < poolSize) {
bufferPool.push(buffer);
}
}
};
}
},
{
name: 'Lazy evaluation',
description: 'Defer processing until needed',
implementation: () => {
const pendingOperations: (() => any)[] = [];
return {
defer: (op: () => any) => pendingOperations.push(op),
evaluate: () => {
const results = pendingOperations.map(op => op());
pendingOperations.length = 0;
return results;
}
};
}
},
{
name: 'Element skipping',
description: 'Skip unneeded elements during parsing',
implementation: () => {
const skipPaths = new Set(['Signature', 'Extension', 'AdditionalInfo']);
return {
shouldSkip: (elementPath: string) => {
return skipPaths.has(elementPath.split('/').pop() || '');
}
};
}
}
];
for (const opt of optimizations) {
console.log(`\n${opt.name}:`);
console.log(` ${opt.description}`);
const impl = opt.implementation();
// Simulate usage
const startTime = performance.now();
if ('acquire' in impl) {
// Buffer pooling test
for (let i = 0; i < 1000; i++) {
const buffer = impl.acquire();
// Use buffer...
impl.release(buffer);
}
console.log(' ✓ Buffer pool working');
} else if ('defer' in impl) {
// Lazy evaluation test
for (let i = 0; i < 100; i++) {
impl.defer(() => Math.random() * 1000);
}
const results = impl.evaluate();
console.log(` ✓ Deferred ${results.length} operations`);
} else if ('shouldSkip' in impl) {
// Element skipping test
const testPaths = [
'Invoice/Signature',
'Invoice/LineItem/Price',
'Invoice/Extension'
];
const skipped = testPaths.filter(p => impl.shouldSkip(p));
console.log(` ✓ Skipping ${skipped.length} of ${testPaths.length} paths`);
}
performanceTracker.recordMetric(`optimization-${opt.name}`, performance.now() - startTime);
}
performanceTracker.endOperation('stream-optimization');
});
// Performance summary
console.log('\n' + performanceTracker.getSummary());
// Streaming best practices
console.log('\nLarge XML Streaming Best Practices:');
console.log('1. Use streaming parsers for files > 10MB');
console.log('2. Process data in chunks to control memory usage');
console.log('3. Implement progress callbacks for user feedback');
console.log('4. Use buffer pools to reduce allocation overhead');
console.log('5. Skip unnecessary elements during parsing');
console.log('6. Monitor memory usage and implement limits');
console.log('7. Support both streaming and DOM parsing modes');
console.log('8. Optimize chunk sizes based on document structure');
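// As a sketch of practice #1, a SAX-style streaming parser keeps memory flat
// regardless of file size. Assuming the `sax` npm package (not a dependency
// of this suite):
//
//   import * as sax from 'sax';
//   import { createReadStream } from 'node:fs';
//
//   const saxStream = sax.createStream(true); // strict mode
//   let lineItems = 0;
//   saxStream.on('opentag', (node) => {
//     if (node.name === 'LineItem') lineItems++;
//   });
//   createReadStream('large-invoice.xml').pipe(saxStream);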
});
tap.start();