// einvoice/test/suite/einvoice_parsing/test.parse-12.memory-efficiency.ts
//
// NOTE(review): the lines below were web-viewer metadata scraped into the
// file and are preserved here as a comment so the file parses as TypeScript:
//   609 lines / 20 KiB / TypeScript / Raw Normal View History
//   2025-05-25 19:45:37 +00:00
import { expect, tap } from '@git.zone/tstest/tapbundle';
import * as einvoice from '../../../ts/index.js';
import * as plugins from '../../plugins.js';
import { CorpusLoader } from '../../helpers/corpus.loader.js';
import { PerformanceTracker } from '../../helpers/performance.tracker.js';
tap.test('PARSE-12: Memory-Efficient Parsing - Optimize memory usage during parsing', async (t) => {
const performanceTracker = new PerformanceTracker('PARSE-12');
// Parses synthetic invoices of three sizes (1KB/100KB/1MB) and reports the
// heap/external memory delta and the delta-to-document-size ratio for each.
await t.test('Memory usage patterns', async () => {
performanceTracker.startOperation('memory-patterns');
// Helper to format memory in MB
const formatMemory = (bytes: number): string => {
return (bytes / 1024 / 1024).toFixed(2) + 'MB';
};
// Helper to get current memory usage
const getMemoryUsage = () => {
const usage = process.memoryUsage();
return {
rss: usage.rss,
heapTotal: usage.heapTotal,
heapUsed: usage.heapUsed,
external: usage.external,
// '|| 0' guards environments where arrayBuffers is undefined
arrayBuffers: usage.arrayBuffers || 0
};
};
// Test different parsing scenarios
const scenarios = [
{
name: 'Small document (1KB)',
generateXml: () => {
return `<?xml version="1.0"?>
<invoice>
<id>SMALL-001</id>
<date>2024-01-01</date>
<amount>100.00</amount>
</invoice>`;
}
},
{
name: 'Medium document (100KB)',
generateXml: () => {
let xml = '<?xml version="1.0"?>\n<invoice>\n';
for (let i = 0; i < 100; i++) {
xml += ` <line number="${i}">
<description>Product description for line ${i} with some additional text to increase size</description>
<quantity>10</quantity>
<price>99.99</price>
</line>\n`;
}
xml += '</invoice>';
return xml;
}
},
{
name: 'Large document (1MB)',
generateXml: () => {
let xml = '<?xml version="1.0"?>\n<invoice>\n';
for (let i = 0; i < 1000; i++) {
xml += ` <line number="${i}">
<description>${'X'.repeat(900)}</description>
<quantity>10</quantity>
<price>99.99</price>
</line>\n`;
}
xml += '</invoice>';
return xml;
}
}
];
for (const scenario of scenarios) {
console.log(`\n${scenario.name}:`);
// Force garbage collection if available
// (global.gc only exists when node runs with --expose-gc; without it the
// before/after deltas include whatever garbage earlier tests left behind)
if (global.gc) {
global.gc();
}
const beforeMem = getMemoryUsage();
const xml = scenario.generateXml();
const xmlSize = Buffer.byteLength(xml, 'utf8');
console.log(` Document size: ${formatMemory(xmlSize)}`);
const startTime = performance.now();
try {
const invoice = new einvoice.EInvoice();
// Feature-detect fromXmlString so the test degrades gracefully on builds
// that do not expose it — TODO confirm which EInvoice versions lack it
if (invoice.fromXmlString) {
await invoice.fromXmlString(xml);
}
const afterMem = getMemoryUsage();
const parseTime = performance.now() - startTime;
// Heap + external delta is a rough proxy for the parser's footprint
const memDelta = {
heapUsed: afterMem.heapUsed - beforeMem.heapUsed,
external: afterMem.external - beforeMem.external,
total: (afterMem.heapUsed + afterMem.external) - (beforeMem.heapUsed + beforeMem.external)
};
console.log(` Parse time: ${parseTime.toFixed(2)}ms`);
console.log(` Memory delta:`);
console.log(` Heap: +${formatMemory(memDelta.heapUsed)}`);
console.log(` External: +${formatMemory(memDelta.external)}`);
console.log(` Total: +${formatMemory(memDelta.total)}`);
console.log(` Memory ratio: ${(memDelta.total / xmlSize).toFixed(2)}x document size`);
performanceTracker.recordMetric(`memory-${scenario.name}`, memDelta.total);
} catch (error) {
console.log(` Error: ${error.message}`);
}
}
performanceTracker.endOperation('memory-patterns');
});
// Compares the simulated memory footprint of a DOM-style parser (retains
// the whole document) against a streaming parser (bounded working buffer)
// for documents of 10/100/1000 elements, and logs the footprint ratio.
await t.test('DOM vs streaming memory comparison', async () => {
  performanceTracker.startOperation('dom-vs-streaming');

  // Simulated DOM-style parser: keeps the raw XML plus a sketch of the
  // element tree, modelling the worst-case memory profile.
  // Renamed from "DOMParser" so it no longer shadows the global DOMParser.
  class SimulatedDomParser {
    private document: any = {};

    parse(xml: string): void {
      // Simulate building full DOM tree
      this.document = {
        xml: xml, // Keep full XML (worst case)
        elements: [],
        attributes: new Map(),
        textNodes: []
      };
      // Extract all elements (simplified)
      const elementMatches = xml.matchAll(/<(\w+)([^>]*)>/g);
      for (const match of elementMatches) {
        this.document.elements.push({
          name: match[1],
          attributes: match[2],
          content: '' // Would normally store content
        });
      }
    }

    // Rough estimate of memory usage: raw XML bytes + fixed per-element overhead.
    getMemoryFootprint(): number {
      return Buffer.byteLength(this.document.xml, 'utf8') +
        this.document.elements.length * 100; // Overhead per element
    }
  }

  // Simulated streaming parser: consumes chunks, discards each complete
  // element, and keeps only a size-capped working buffer.
  class StreamingParser {
    private buffer = '';
    private processedElements = 0;
    private maxBufferSize = 1024 * 10; // 10KB buffer

    parseChunk(chunk: string): void {
      this.buffer += chunk;
      // Process complete elements and discard
      let elementEnd;
      while ((elementEnd = this.buffer.indexOf('>')) !== -1) {
        const element = this.buffer.substring(0, elementEnd + 1);
        this.processElement(element);
        this.buffer = this.buffer.substring(elementEnd + 1);
      }
      // Bug fix: the size cap used to sit inside the loop above, which only
      // runs while a '>' exists — so a tail with no '>' could grow the
      // buffer without bound. Enforce the cap after draining instead.
      if (this.buffer.length > this.maxBufferSize) {
        this.buffer = this.buffer.substring(this.buffer.length - this.maxBufferSize);
      }
    }

    private processElement(element: string): void {
      this.processedElements++;
      // Process and discard element
    }

    // Current buffer size plus a small fixed overhead.
    getMemoryFootprint(): number {
      return this.buffer.length + 1024; // Buffer + overhead
    }
  }

  // Test with increasingly large documents
  const testSizes = [10, 100, 1000]; // Number of elements
  console.log('\nDOM vs Streaming Memory Usage:');
  console.log('Elements | DOM Memory | Streaming Memory | Ratio');
  console.log('---------|------------|------------------|-------');
  for (const size of testSizes) {
    // Generate test XML
    let xml = '<?xml version="1.0"?>\n<invoice>\n';
    for (let i = 0; i < size; i++) {
      xml += ` <item id="${i}">
<description>Item description with some text content to simulate real data</description>
<amount>100.00</amount>
</item>\n`;
    }
    xml += '</invoice>';
    // Test DOM parser
    const domParser = new SimulatedDomParser();
    domParser.parse(xml);
    const domMemory = domParser.getMemoryFootprint();
    // Test streaming parser: feed the document in 1KB chunks
    const streamParser = new StreamingParser();
    const chunkSize = 1024;
    for (let i = 0; i < xml.length; i += chunkSize) {
      streamParser.parseChunk(xml.substring(i, i + chunkSize));
    }
    const streamMemory = streamParser.getMemoryFootprint();
    const ratio = (domMemory / streamMemory).toFixed(1);
    console.log(`${size.toString().padEnd(8)} | ${(domMemory/1024).toFixed(1).padEnd(10)}KB | ${(streamMemory/1024).toFixed(1).padEnd(16)}KB | ${ratio}x`);
    performanceTracker.recordMetric(`comparison-${size}`, domMemory - streamMemory);
  }
  performanceTracker.endOperation('dom-vs-streaming');
});
// Demonstrates four memory-optimization techniques (string interning, lazy
// parsing, selective loading, object pooling) and runs the interning demo.
await t.test('Memory optimization techniques', async () => {
  performanceTracker.startOperation('optimization-techniques');
  console.log('\nMemory Optimization Techniques:');

  // Factory for the string-interning pool. Hoisted out of the techniques
  // array so the demo below can call it directly; the original reached back
  // into the array via techniques[0].implementation(), which breaks if the
  // array is reordered and does not type-check (the union of the four
  // implementation() return types has no .intern member).
  const createStringInterner = () => {
    const stringPool = new Map<string, string>();
    return {
      // Return the pooled instance of str, adding it on first sight.
      intern: (str: string): string => {
        if (!stringPool.has(str)) {
          stringPool.set(str, str);
        }
        return stringPool.get(str)!;
      },
      getPoolSize: () => stringPool.size
    };
  };

  // Explicit element type so the optional `test` is legal on every entry.
  type Technique = {
    name: string;
    description: string;
    implementation: () => unknown;
    test?: () => void;
  };

  const techniques: Technique[] = [
    {
      name: 'String interning',
      description: 'Reuse common strings',
      implementation: createStringInterner,
      test: () => {
        const interner = createStringInterner();
        const tags = ['invoice', 'line', 'amount', 'description'];
        const iterations = 1000;
        // Without interning
        const withoutInterning = [];
        for (let i = 0; i < iterations; i++) {
          for (const tag of tags) {
            withoutInterning.push(tag); // New string each time
          }
        }
        // With interning
        const withInterning = [];
        for (let i = 0; i < iterations; i++) {
          for (const tag of tags) {
            withInterning.push(interner.intern(tag)); // Reused string
          }
        }
        console.log(` Unique strings: ${interner.getPoolSize()}`);
        console.log(` Memory saved: ~${((iterations - 1) * tags.length * 10)}B`);
      }
    },
    {
      name: 'Lazy parsing',
      description: 'Parse elements only when accessed',
      implementation: () => {
        // Element wrapper that defers parsing until .value is first read.
        class LazyElement {
          constructor(private xmlContent: string) {}
          private _parsed: any = null;
          get value(): any {
            if (!this._parsed) {
              // Parse only when accessed
              this._parsed = this.parseContent();
            }
            return this._parsed;
          }
          private parseContent(): any {
            // Simulate parsing
            return { parsed: true };
          }
        }
        return LazyElement;
      }
    },
    {
      name: 'Selective loading',
      description: 'Load only required elements',
      implementation: () => {
        return {
          // Extract only the text content of elements matching selector.
          parseSelective: (xml: string, selector: string) => {
            // Only parse elements matching selector
            const regex = new RegExp(`<${selector}[^>]*>([^<]*)</${selector}>`, 'g');
            const matches = [];
            let match;
            while ((match = regex.exec(xml)) !== null) {
              matches.push(match[1]);
            }
            return matches;
          }
        };
      }
    },
    {
      name: 'Memory pooling',
      description: 'Reuse parser objects',
      implementation: () => {
        // Bounded pool of reusable parser objects.
        class ParserPool {
          private pool: any[] = [];
          private maxSize = 10;
          acquire(): any {
            return this.pool.pop() || { parse: (xml: string) => ({ parsed: true }) };
          }
          release(parser: any): void {
            if (this.pool.length < this.maxSize) {
              // Reset parser state
              parser.reset?.();
              this.pool.push(parser);
            }
          }
        }
        return new ParserPool();
      }
    }
  ];

  for (const technique of techniques) {
    console.log(`\n${technique.name}:`);
    console.log(` ${technique.description}`);
    if (technique.test) {
      technique.test();
    } else {
      console.log(' ✓ Technique implemented');
    }
    performanceTracker.recordMetric(`technique-${technique.name}`, 1);
  }
  performanceTracker.endOperation('optimization-techniques');
});
// Stress test: generates ~100KB–~5MB UBL-ish invoices and measures parse
// time, memory consumed, memory/size ratio, and throughput for each.
await t.test('Large invoice memory stress test', async () => {
performanceTracker.startOperation('stress-test');
console.log('\nMemory stress test with large invoices:');
// Generate a very large invoice
// Builds `lines` invoice lines, each padded with a descriptionSize-char filler.
const generateLargeInvoice = (lines: number, descriptionSize: number): string => {
let xml = `<?xml version="1.0"?>
<Invoice xmlns="urn:oasis:names:specification:ubl:schema:xsd:Invoice-2">
<ID>LARGE-${lines}</ID>
<IssueDate>2024-01-01</IssueDate>`;
for (let i = 0; i < lines; i++) {
xml += `
<InvoiceLine>
<ID>${i}</ID>
<Description>${'Product ' + i + ' - ' + 'X'.repeat(descriptionSize)}</Description>
<Quantity>10</Quantity>
<Price>99.99</Price>
<AdditionalInfo>${'Additional information for line ' + i}</AdditionalInfo>
</InvoiceLine>`;
}
xml += '\n</Invoice>';
return xml;
};
// 'expected' is a human-readable size label used only in the log output
const testConfigs = [
{ lines: 100, descSize: 100, expected: '~100KB' },
{ lines: 1000, descSize: 100, expected: '~1MB' },
{ lines: 5000, descSize: 200, expected: '~5MB' }
];
for (const config of testConfigs) {
console.log(`\n${config.lines} lines (${config.expected}):`);
// Force GC before test
// (global.gc requires node --expose-gc; skipped silently otherwise)
if (global.gc) {
global.gc();
}
const beforeMem = process.memoryUsage();
const startTime = performance.now();
try {
const xml = generateLargeInvoice(config.lines, config.descSize);
const xmlSize = Buffer.byteLength(xml, 'utf8');
const invoice = new einvoice.EInvoice();
// Feature-detect fromXmlString; builds without it only measure generation
if (invoice.fromXmlString) {
await invoice.fromXmlString(xml);
}
const afterMem = process.memoryUsage();
const parseTime = performance.now() - startTime;
// NOTE(review): startTime is taken before XML generation, so parseTime
// and the MB/s rate below include generation cost, not parsing alone
const memUsed = (afterMem.heapUsed - beforeMem.heapUsed) +
(afterMem.external - beforeMem.external);
console.log(` Document size: ${(xmlSize / 1024 / 1024).toFixed(2)}MB`);
console.log(` Parse time: ${parseTime.toFixed(0)}ms`);
console.log(` Memory used: ${(memUsed / 1024 / 1024).toFixed(2)}MB`);
console.log(` Memory efficiency: ${(memUsed / xmlSize).toFixed(2)}x`);
console.log(` Parse rate: ${(xmlSize / parseTime * 1000 / 1024 / 1024).toFixed(2)}MB/s`);
performanceTracker.recordMetric(`stress-${config.lines}`, memUsed);
} catch (error) {
console.log(` Error: ${error.message}`);
}
// Clean up
if (global.gc) {
global.gc();
}
}
performanceTracker.endOperation('stress-test');
});
// Leak check: parses the same ~100-item document 10 times with GC forced
// around each run; flags a leak if total heap growth exceeds 100KB/iteration.
await t.test('Memory leak detection', async () => {
performanceTracker.startOperation('leak-detection');
console.log('\nMemory leak detection test:');
const iterations = 10;
const memorySnapshots = [];
// Force initial GC
// (requires node --expose-gc; without it growth numbers are unreliable)
if (global.gc) {
global.gc();
}
const testXml = `<?xml version="1.0"?>
<invoice>
<id>LEAK-TEST</id>
<items>
${Array(100).fill('<item><desc>Test item</desc><price>10.00</price></item>').join('\n ')}
</items>
</invoice>`;
console.log('Running multiple parse iterations...');
for (let i = 0; i < iterations; i++) {
// Force GC before measurement
if (global.gc) {
global.gc();
}
const beforeMem = process.memoryUsage();
// Parse same document multiple times
const invoice = new einvoice.EInvoice();
if (invoice.fromXmlString) {
await invoice.fromXmlString(testXml);
}
// Force GC after parsing
if (global.gc) {
global.gc();
}
const afterMem = process.memoryUsage();
// Record heap level plus this iteration's before/after delta
memorySnapshots.push({
iteration: i + 1,
heapUsed: afterMem.heapUsed,
delta: afterMem.heapUsed - beforeMem.heapUsed
});
// Small delay between iterations
await new Promise(resolve => setTimeout(resolve, 100));
}
// Analyze memory trend
// Growth = heap at last snapshot minus heap at first; a steadily rising
// trend across iterations is the leak signal checked below.
const firstSnapshot = memorySnapshots[0];
const lastSnapshot = memorySnapshots[memorySnapshots.length - 1];
const memoryGrowth = lastSnapshot.heapUsed - firstSnapshot.heapUsed;
const averageDelta = memorySnapshots.reduce((sum, s) => sum + s.delta, 0) / iterations;
console.log('\nMemory analysis:');
console.log(` Initial heap: ${(firstSnapshot.heapUsed / 1024 / 1024).toFixed(2)}MB`);
console.log(` Final heap: ${(lastSnapshot.heapUsed / 1024 / 1024).toFixed(2)}MB`);
console.log(` Total growth: ${(memoryGrowth / 1024 / 1024).toFixed(2)}MB`);
console.log(` Average delta: ${(averageDelta / 1024).toFixed(2)}KB`);
if (memoryGrowth > iterations * 100 * 1024) { // 100KB per iteration threshold
console.log(' ⚠️ Potential memory leak detected!');
} else {
console.log(' ✓ No significant memory leak detected');
}
performanceTracker.endOperation('leak-detection');
});
// Parses up to the 20 largest corpus files and reports the memory-used to
// file-size ratio per file plus aggregate best/worst/average ratios.
await t.test('Corpus memory efficiency analysis', async () => {
performanceTracker.startOperation('corpus-efficiency');
const corpusLoader = new CorpusLoader();
const xmlFiles = await corpusLoader.getFiles(/\.(xml|ubl|cii)$/);
console.log(`\nAnalyzing memory efficiency for corpus files...`);
// Test a sample of files
const sampleSize = Math.min(20, xmlFiles.length);
const sampledFiles = xmlFiles
.sort((a, b) => b.size - a.size) // Sort by size, largest first
.slice(0, sampleSize);
// NOTE(review): .sort() mutates xmlFiles in place; harmless here because
// the array is not reused afterwards, but worth knowing if copied elsewhere.
const efficiencyStats = {
totalFiles: 0,
totalSize: 0,
totalMemory: 0,
bestRatio: Infinity,
worstRatio: 0,
averageRatio: 0
};
console.log('\nFile | Size | Memory Used | Ratio');
console.log('-----|------|-------------|------');
for (const file of sampledFiles) {
// NOTE(review): incremented before the try block, so files that fail to
// parse still count toward "Files analyzed" — confirm this is intended
efficiencyStats.totalFiles++;
try {
// Force GC
if (global.gc) {
global.gc();
}
const beforeMem = process.memoryUsage();
const content = await plugins.fs.readFile(file.path, 'utf8');
const fileSize = Buffer.byteLength(content, 'utf8');
const invoice = new einvoice.EInvoice();
if (invoice.fromXmlString) {
await invoice.fromXmlString(content);
}
const afterMem = process.memoryUsage();
// Heap + external delta approximates memory attributable to this parse
const memUsed = (afterMem.heapUsed - beforeMem.heapUsed) +
(afterMem.external - beforeMem.external);
const ratio = memUsed / fileSize;
efficiencyStats.totalSize += fileSize;
efficiencyStats.totalMemory += memUsed;
efficiencyStats.bestRatio = Math.min(efficiencyStats.bestRatio, ratio);
efficiencyStats.worstRatio = Math.max(efficiencyStats.worstRatio, ratio);
console.log(`${file.name.substring(0, 20).padEnd(20)} | ${(fileSize/1024).toFixed(1).padEnd(4)}KB | ${(memUsed/1024).toFixed(1).padEnd(11)}KB | ${ratio.toFixed(2)}x`);
} catch (error) {
console.log(`${file.name.substring(0, 20).padEnd(20)} | Error: ${error.message}`);
}
}
// NOTE(review): if every file fails to parse, totalSize stays 0 (average
// becomes NaN) and bestRatio remains Infinity in the summary below.
efficiencyStats.averageRatio = efficiencyStats.totalMemory / efficiencyStats.totalSize;
console.log('\nSummary:');
console.log(` Files analyzed: ${efficiencyStats.totalFiles}`);
console.log(` Total size: ${(efficiencyStats.totalSize / 1024 / 1024).toFixed(2)}MB`);
console.log(` Total memory: ${(efficiencyStats.totalMemory / 1024 / 1024).toFixed(2)}MB`);
console.log(` Best ratio: ${efficiencyStats.bestRatio.toFixed(2)}x`);
console.log(` Worst ratio: ${efficiencyStats.worstRatio.toFixed(2)}x`);
console.log(` Average ratio: ${efficiencyStats.averageRatio.toFixed(2)}x`);
performanceTracker.endOperation('corpus-efficiency');
});
// Performance summary
console.log('\n' + performanceTracker.getSummary());
// Memory efficiency best practices
console.log('\nMemory-Efficient Parsing Best Practices:');
console.log('1. Use streaming parsers for large documents');
console.log('2. Implement string interning for repeated values');
console.log('3. Release references to parsed data early');
console.log('4. Use object pools to reduce allocations');
console.log('5. Implement lazy parsing for optional elements');
console.log('6. Monitor memory usage during development');
console.log('7. Set memory limits for production systems');
console.log('8. Consider memory/speed tradeoffs carefully');
});
tap.start();