import { expect, tap } from '@git.zone/tstest/tapbundle';
import * as einvoice from '../../../ts/index.js';
import * as plugins from '../../plugins.js';
import { CorpusLoader } from '../../helpers/corpus.loader.js';
import { PerformanceTracker } from '../../helpers/performance.tracker.js';

tap.test('PARSE-06: Large XML Streaming - Handle large files with streaming parsers', async (t) => {
  const performanceTracker = new PerformanceTracker('PARSE-06');

  await t.test('Memory-efficient parsing strategies', async () => {
    performanceTracker.startOperation('memory-strategies');

    // Generate different sized test documents
    const generateLargeInvoice = (lineItems: number): string => {
      let xml = `<?xml version="1.0" encoding="UTF-8"?>
<Invoice xmlns="urn:oasis:names:specification:ubl:schema:xsd:Invoice-2">
  <ID>LARGE-${lineItems}</ID>
  <IssueDate>2024-01-01</IssueDate>
  <InvoiceLine>`;

      for (let i = 1; i <= lineItems; i++) {
        xml += `
    <LineItem>
      <ID>${i}</ID>
      <Description>Product Item ${i} with a reasonably long description to increase document size</Description>
      <Quantity>1</Quantity>
      <Price>
        <Amount currencyID="EUR">${(Math.random() * 1000).toFixed(2)}</Amount>
      </Price>
      <AllowanceCharge>
        <ChargeIndicator>false</ChargeIndicator>
        <Amount currencyID="EUR">${(Math.random() * 10).toFixed(2)}</Amount>
      </AllowanceCharge>
    </LineItem>`;
      }

      xml += `
  </InvoiceLine>
</Invoice>`;
      return xml;
    };

    const testSizes = [
      { items: 100, expectedSize: '~50KB' },
      { items: 1000, expectedSize: '~500KB' },
      { items: 5000, expectedSize: '~2.5MB' },
      { items: 10000, expectedSize: '~5MB' }
    ];

    for (const test of testSizes) {
      const startTime = performance.now();
      const startMemory = process.memoryUsage();

      const largeXml = generateLargeInvoice(test.items);
      const xmlSize = Buffer.byteLength(largeXml, 'utf8');

      console.log(`\nTesting ${test.items} line items (${test.expectedSize}, actual: ${(xmlSize / 1024).toFixed(1)}KB):`);

      try {
        const invoice = new einvoice.EInvoice();

        if (invoice.fromXmlString) {
          await invoice.fromXmlString(largeXml);

          const endMemory = process.memoryUsage();
          const memoryDelta = {
            heapUsed: (endMemory.heapUsed - startMemory.heapUsed) / 1024 / 1024,
            external: (endMemory.external - startMemory.external) / 1024 / 1024
          };

          const parseTime = performance.now() - startTime;

          console.log(`  Parse time: ${parseTime.toFixed(2)}ms`);
          console.log(`  Memory delta: ${memoryDelta.heapUsed.toFixed(2)}MB heap, ${memoryDelta.external.toFixed(2)}MB external`);
          console.log(`  Parse rate: ${(xmlSize / parseTime * 1000 / 1024 / 1024).toFixed(2)}MB/s`);

          // Check if memory usage is reasonable
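          // (DOM-style parsing commonly needs a few multiples of the raw document
          // size in memory; the 5x threshold below is a heuristic, not a hard limit.)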
          const memoryRatio = memoryDelta.heapUsed / (xmlSize / 1024 / 1024);
          console.log(`  Memory ratio: ${memoryRatio.toFixed(2)}x document size`);

          if (memoryRatio > 5) {
            console.log('  ⚠️ High memory usage detected');
          } else {
            console.log('  ✓ Memory usage acceptable');
          }
        } else {
          console.log('  ⚠️ fromXmlString not implemented');
        }
      } catch (error) {
        console.log(`  ✗ Parse error: ${error.message}`);
      }

      performanceTracker.recordMetric(`parse-${test.items}-items`, performance.now() - startTime);

      // Force garbage collection if available
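      // (global.gc is only defined when Node is started with --expose-gc)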
      if (global.gc) {
        global.gc();
      }
    }

    performanceTracker.endOperation('memory-strategies');
  });

  await t.test('Streaming parser simulation', async () => {
    performanceTracker.startOperation('streaming-simulation');

    class StreamingXmlParser {
      private buffer = '';
      private tagStack: string[] = [];
      private currentElement: any = null;
      private parsedElements = 0;
      private eventHandlers: Map<string, (element: any) => void> = new Map();

      onElement(tagName: string, handler: (element: any) => void): void {
        this.eventHandlers.set(tagName, handler);
      }

      async parseChunk(chunk: string): Promise<void> {
        this.buffer += chunk;

        // Simple streaming parser simulation
        let tagMatch;
        const tagRegex = /<([^>]+)>([^<]*)/g;

        while ((tagMatch = tagRegex.exec(this.buffer)) !== null) {
          const [fullMatch, tag, content] = tagMatch;

          if (tag.startsWith('/')) {
            // Closing tag
            const tagName = tag.substring(1);
            if (this.tagStack[this.tagStack.length - 1] === tagName) {
              this.tagStack.pop();

              // Emit element event
              if (this.currentElement && this.eventHandlers.has(tagName)) {
                this.eventHandlers.get(tagName)!(this.currentElement);
                this.parsedElements++;
              }

              this.currentElement = null;
            }
          } else if (!tag.endsWith('/') && !tag.startsWith('?') && !tag.startsWith('!')) {
            // Opening tag (XML declarations like <?xml ...?>, comments, and
            // self-closing tags are skipped so they never land on the tag stack)
            const tagName = tag.split(' ')[0];
            this.tagStack.push(tagName);
            this.currentElement = { tag: tagName, content: content.trim() };
          }
        }

        // Keep unparsed content in buffer
        const lastTagEnd = this.buffer.lastIndexOf('>');
        if (lastTagEnd !== -1) {
          this.buffer = this.buffer.substring(lastTagEnd + 1);
        }
      }

      getStats() {
        return {
          parsedElements: this.parsedElements,
          bufferSize: this.buffer.length,
          stackDepth: this.tagStack.length
        };
      }
    }
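
    // Note: this regex-based parser is only a simulation; a production streaming
    // parser would use a real SAX-style library (e.g. the 'sax' package on npm).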
    // Test streaming parser
    const parser = new StreamingXmlParser();
    let lineItemCount = 0;
    let totalAmount = 0;

    // Register handlers for specific elements
    parser.onElement('LineItem', (element) => {
      lineItemCount++;
    });

    parser.onElement('Amount', (element) => {
      const amount = parseFloat(element.content);
      if (!isNaN(amount)) {
        totalAmount += amount;
      }
    });

    // Generate and parse in chunks
    const chunkSize = 1024; // 1KB chunks
    const totalItems = 1000;

    console.log(`\nStreaming parse simulation (${totalItems} items in ${chunkSize} byte chunks):`);

    const startTime = performance.now();

    // Generate header
    await parser.parseChunk(`<?xml version="1.0"?>
<Invoice>
  <ID>STREAM-TEST</ID>
  <InvoiceLine>`);

    // Generate items in chunks
    let currentChunk = '';
    for (let i = 1; i <= totalItems; i++) {
      const item = `
    <LineItem>
      <ID>${i}</ID>
      <Description>Item ${i}</Description>
      <Amount>10.00</Amount>
    </LineItem>`;

      currentChunk += item;

      if (currentChunk.length >= chunkSize) {
        await parser.parseChunk(currentChunk);
        currentChunk = '';

        // Log progress every 100 items
        if (i % 100 === 0) {
          const stats = parser.getStats();
          console.log(`  Progress: ${i}/${totalItems} items, buffer: ${stats.bufferSize} bytes`);
        }
      }
    }

    // Parse remaining chunk and footer
    await parser.parseChunk(currentChunk + `
  </InvoiceLine>
</Invoice>`);

    const parseTime = performance.now() - startTime;
    const finalStats = parser.getStats();

    console.log(`\nStreaming results:`);
    console.log(`  Parse time: ${parseTime.toFixed(2)}ms`);
    console.log(`  Line items found: ${lineItemCount}`);
    console.log(`  Total amount sum: ${totalAmount.toFixed(2)}`);
    console.log(`  Elements parsed: ${finalStats.parsedElements}`);
    console.log(`  Parse rate: ${(totalItems / parseTime * 1000).toFixed(0)} items/second`);

    performanceTracker.endOperation('streaming-simulation');
  });

  await t.test('Chunked processing patterns', async () => {
    performanceTracker.startOperation('chunked-processing');

    const chunkPatterns = [
      {
        name: 'Fixed size chunks',
        chunkSize: 4096,
        description: 'Process in fixed byte chunks'
      },
      {
        name: 'Line-based chunks',
        chunkSize: 100, // lines
        description: 'Process by number of lines'
      },
      {
        name: 'Element-based chunks',
        chunkSize: 50, // elements
        description: 'Process by complete elements'
      },
      {
        name: 'Memory-based chunks',
        chunkSize: 1024 * 1024, // 1MB
        description: 'Process based on memory limits'
      }
    ];
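
    // (A concrete sketch of the fixed-size pattern over a real byte stream
    // follows this test block.)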
    for (const pattern of chunkPatterns) {
      console.log(`\n${pattern.name}:`);
      console.log(`  ${pattern.description}`);
      console.log(`  Chunk size: ${pattern.chunkSize}`);

      // Simulate processing
      const startTime = performance.now();
      let chunksProcessed = 0;
      let totalBytes = 0;

      // Process 10 chunks
      for (let i = 0; i < 10; i++) {
        // Simulate chunk processing
        await new Promise(resolve => setTimeout(resolve, 1));
        chunksProcessed++;
        totalBytes += pattern.chunkSize;
      }

      const processTime = performance.now() - startTime;

      console.log(`  Chunks processed: ${chunksProcessed}`);
      console.log(`  Processing rate: ${(totalBytes / processTime * 1000 / 1024).toFixed(2)}KB/s`);

      performanceTracker.recordMetric(`chunk-${pattern.name}`, processTime);
    }

    performanceTracker.endOperation('chunked-processing');
  });
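
  // Illustrative sketch (not exercised by the tests above): the 'Fixed size chunks'
  // pattern over a real byte stream. Any AsyncIterable<Buffer> works as a source,
  // e.g. fs.createReadStream(path, { highWaterMark: 4096 }). The function name and
  // callback shape are assumptions for illustration, not part of the einvoice API.
  async function processInFixedChunks(
    source: AsyncIterable<Buffer>,
    onChunk: (chunk: string) => Promise<void>
  ): Promise<number> {
    let totalBytes = 0;
    for await (const chunk of source) {
      totalBytes += chunk.length;
      // Hand each fixed-size chunk to a streaming parser (e.g. parseChunk above).
      // A production version would decode via string_decoder to avoid splitting
      // multi-byte UTF-8 characters at chunk boundaries.
      await onChunk(chunk.toString('utf8'));
    }
    return totalBytes;
  }
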
  await t.test('Large corpus file handling', async () => {
    performanceTracker.startOperation('corpus-large-files');

    const corpusLoader = new CorpusLoader();
    const allFiles = await corpusLoader.getFiles(/\.(xml|ubl|cii)$/);

    // Find large files
    const fileSizes = await Promise.all(
      allFiles.map(async (file) => {
        const stats = await plugins.fs.stat(file.path);
        return { file, size: stats.size };
      })
    );

    // Sort by size and get top 10
    const largeFiles = fileSizes
      .sort((a, b) => b.size - a.size)
      .slice(0, 10);

    console.log(`\nLargest files in corpus:`);

    for (const { file, size } of largeFiles) {
      console.log(`  ${file.name}: ${(size / 1024).toFixed(1)}KB`);

      if (size > 100 * 1024) { // Files larger than 100KB
        const startTime = performance.now();
        const startMemory = process.memoryUsage();

        try {
          const content = await plugins.fs.readFile(file.path, 'utf8');
          const invoice = new einvoice.EInvoice();

          if (invoice.fromXmlString) {
            await invoice.fromXmlString(content);

            const parseTime = performance.now() - startTime;
            const endMemory = process.memoryUsage();
            const memoryUsed = (endMemory.heapUsed - startMemory.heapUsed) / 1024 / 1024;

            console.log(`    Parse time: ${parseTime.toFixed(2)}ms`);
            console.log(`    Memory used: ${memoryUsed.toFixed(2)}MB`);
            console.log(`    Parse rate: ${(size / parseTime * 1000 / 1024).toFixed(2)}KB/s`);
          }
        } catch (error) {
          console.log(`    Error: ${error.message}`);
        }

        performanceTracker.recordMetric(`large-file-${file.name}`, performance.now() - startTime);
      }
    }

    performanceTracker.endOperation('corpus-large-files');
  });

  await t.test('Progressive parsing with callbacks', async () => {
    performanceTracker.startOperation('progressive-parsing');

    class ProgressiveParser {
      private invoiceData: any = {};
      private lineItems: any[] = [];
      private currentPath: string[] = [];

      constructor(
        private onProgress?: (progress: number) => void,
        private onLineItem?: (item: any) => void
      ) {}

      async parse(xml: string): Promise<any> {
        const totalSize = xml.length;
        let processed = 0;
        const chunkSize = 10000;

        // Parse in chunks
        for (let i = 0; i < totalSize; i += chunkSize) {
          const chunk = xml.substring(i, Math.min(i + chunkSize, totalSize));
          await this.processChunk(chunk);

          processed += chunk.length;

          if (this.onProgress) {
            this.onProgress(processed / totalSize * 100);
          }

          // Simulate async processing
          await new Promise(resolve => setImmediate(resolve));
        }

        return {
          invoice: this.invoiceData,
          lineItems: this.lineItems
        };
      }

      private async processChunk(chunk: string): Promise<void> {
        // Simplified parsing - in reality would maintain state across chunks
        const lineItemMatches = chunk.matchAll(/<LineItem>[\s\S]*?<\/LineItem>/g);

        for (const match of lineItemMatches) {
          const item = this.parseLineItem(match[0]);
          if (item) {
            this.lineItems.push(item);
            if (this.onLineItem) {
              this.onLineItem(item);
            }
          }
        }
      }

      private parseLineItem(xml: string): any {
        const item: any = {};

        const idMatch = xml.match(/<ID>([^<]+)<\/ID>/);
        if (idMatch) item.id = idMatch[1];

        const descMatch = xml.match(/<Description>([^<]+)<\/Description>/);
        if (descMatch) item.description = descMatch[1];

        const amountMatch = xml.match(/<Amount[^>]*>([^<]+)<\/Amount>/);
        if (amountMatch) item.amount = parseFloat(amountMatch[1]);

        return Object.keys(item).length > 0 ? item : null;
      }
    }
    // Test progressive parser
    console.log('\nProgressive parsing test:');

    const largeXml = generateLargeInvoice(500);
    let progressUpdates = 0;
    let itemsFound = 0;

    const parser = new ProgressiveParser(
      (progress) => {
        progressUpdates++;
        if (progress % 20 < 5) { // Log at ~20% intervals
          console.log(`  Progress: ${progress.toFixed(0)}%`);
        }
      },
      (item) => {
        itemsFound++;
        if (itemsFound % 100 === 0) {
          console.log(`  Found ${itemsFound} items...`);
        }
      }
    );

    const startTime = performance.now();
    const result = await parser.parse(largeXml);
    const parseTime = performance.now() - startTime;

    console.log(`\nProgressive parsing results:`);
    console.log(`  Parse time: ${parseTime.toFixed(2)}ms`);
    console.log(`  Progress updates: ${progressUpdates}`);
    console.log(`  Line items found: ${result.lineItems.length}`);
    console.log(`  Items/second: ${(result.lineItems.length / parseTime * 1000).toFixed(0)}`);

    performanceTracker.endOperation('progressive-parsing');

    // Helper function (a hoisted declaration, so it is already available to the
    // generateLargeInvoice(500) call above)
    function generateLargeInvoice(lineItems: number): string {
      let xml = `<?xml version="1.0" encoding="UTF-8"?>
<Invoice xmlns="urn:oasis:names:specification:ubl:schema:xsd:Invoice-2">
  <ID>LARGE-${lineItems}</ID>
  <IssueDate>2024-01-01</IssueDate>`;

      for (let i = 1; i <= lineItems; i++) {
        xml += `
  <LineItem>
    <ID>${i}</ID>
    <Description>Product Item ${i} with extended description for testing</Description>
    <Quantity>1</Quantity>
    <Amount currencyID="EUR">${(Math.random() * 1000).toFixed(2)}</Amount>
  </LineItem>`;
      }

      xml += '\n</Invoice>';
      return xml;
    }
  });
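
  // Illustrative sketch of the cross-chunk state that ProgressiveParser above
  // deliberately skips: keep the tail after the last complete </LineItem> and
  // prepend it to the next chunk, so items split across chunk boundaries are
  // not lost. The function name and shape are assumptions for illustration only.
  function extractCompleteLineItems(carry: string, chunk: string): { items: string[]; rest: string } {
    const text = carry + chunk;
    const items = [...text.matchAll(/<LineItem>[\s\S]*?<\/LineItem>/g)].map(m => m[0]);
    const lastEnd = text.lastIndexOf('</LineItem>');
    return {
      items,
      // Everything after the last complete item is carried into the next call
      rest: lastEnd === -1 ? text : text.substring(lastEnd + '</LineItem>'.length)
    };
  }
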
  await t.test('Stream processing optimization techniques', async () => {
    performanceTracker.startOperation('stream-optimization');

    const optimizations = [
      {
        name: 'Buffer pooling',
        description: 'Reuse buffers to reduce allocation',
        implementation: () => {
          const bufferPool: Buffer[] = [];
          const poolSize = 10;
          const bufferSize = 4096;

          // Pre-allocate buffers
          for (let i = 0; i < poolSize; i++) {
            bufferPool.push(Buffer.allocUnsafe(bufferSize));
          }

          return {
            acquire: () => bufferPool.pop() || Buffer.allocUnsafe(bufferSize),
            release: (buffer: Buffer) => {
              if (bufferPool.length < poolSize) {
                bufferPool.push(buffer);
              }
            }
          };
        }
      },
      {
        name: 'Lazy evaluation',
        description: 'Defer processing until needed',
        implementation: () => {
          const pendingOperations: (() => any)[] = [];

          return {
            defer: (op: () => any) => pendingOperations.push(op),
            evaluate: () => {
              const results = pendingOperations.map(op => op());
              pendingOperations.length = 0;
              return results;
            }
          };
        }
      },
      {
        name: 'Element skipping',
        description: 'Skip unneeded elements during parsing',
        implementation: () => {
          const skipPaths = new Set(['Signature', 'Extension', 'AdditionalInfo']);

          return {
            shouldSkip: (elementPath: string) => {
              return skipPaths.has(elementPath.split('/').pop() || '');
            }
          };
        }
      }
    ];

    for (const opt of optimizations) {
      console.log(`\n${opt.name}:`);
      console.log(`  ${opt.description}`);

      const impl = opt.implementation();

      // Simulate usage
      const startTime = performance.now();

      if ('acquire' in impl) {
        // Buffer pooling test
        for (let i = 0; i < 1000; i++) {
          const buffer = impl.acquire();
          // Use buffer...
          impl.release(buffer);
        }
        console.log('  ✓ Buffer pool working');
      } else if ('defer' in impl) {
        // Lazy evaluation test
        for (let i = 0; i < 100; i++) {
          impl.defer(() => Math.random() * 1000);
        }
        const results = impl.evaluate();
        console.log(`  ✓ Deferred ${results.length} operations`);
      } else if ('shouldSkip' in impl) {
        // Element skipping test
        const testPaths = [
          'Invoice/Signature',
          'Invoice/LineItem/Price',
          'Invoice/Extension'
        ];
        const skipped = testPaths.filter(p => impl.shouldSkip(p));
        console.log(`  ✓ Skipping ${skipped.length} of ${testPaths.length} paths`);
      }

      performanceTracker.recordMetric(`optimization-${opt.name}`, performance.now() - startTime);
    }

    performanceTracker.endOperation('stream-optimization');
  });

  // Performance summary
  console.log('\n' + performanceTracker.getSummary());

  // Streaming best practices
  console.log('\nLarge XML Streaming Best Practices:');
  console.log('1. Use streaming parsers for files > 10MB');
  console.log('2. Process data in chunks to control memory usage');
  console.log('3. Implement progress callbacks for user feedback');
  console.log('4. Use buffer pools to reduce allocation overhead');
  console.log('5. Skip unnecessary elements during parsing');
  console.log('6. Monitor memory usage and implement limits');
  console.log('7. Support both streaming and DOM parsing modes');
  console.log('8. Optimize chunk sizes based on document structure');
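
  // Minimal sketch for practice 6 above (monitor memory usage and enforce a
  // limit); the budget value and error handling are assumptions, not einvoice
  // behavior.
  function assertMemoryBudget(startHeapBytes: number, budgetMb = 256): void {
    const grownMb = (process.memoryUsage().heapUsed - startHeapBytes) / 1024 / 1024;
    if (grownMb > budgetMb) {
      // A real parser would abort the stream here rather than throw blindly
      throw new Error(`Memory budget exceeded: ${grownMb.toFixed(1)}MB > ${budgetMb}MB`);
    }
  }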
});

tap.start();