import { expect, tap } from '@git.zone/tstest/tapbundle';
import * as einvoice from '../../../ts/index.js';
import * as plugins from '../../plugins.js';
import { CorpusLoader } from '../../helpers/corpus.loader.js';
import { PerformanceTracker } from '../../helpers/performance.tracker.js';

/**
 * PARSE-02: feeds deliberately malformed XML to `EInvoice` and logs whether the
 * parser rejects it and whether a simple textual repair makes it parseable.
 *
 * Each subtest times every case with `performance.now()` and reports it to the
 * shared `PerformanceTracker`.
 *
 * NOTE(review): the XML fixtures in this file contain no markup in this copy
 * (e.g. the 'Missing attribute quotes' fixture has no attributes at all). They
 * look like they were stripped by tooling; they are reproduced unchanged here
 * and should be restored from version control before these cases are trusted.
 */
tap.test('PARSE-02: Malformed XML Recovery - Recover from common XML parsing errors', async (t) => {
  const performanceTracker = new PerformanceTracker('PARSE-02');

  /**
   * Extracts a printable message from a caught value without assuming it is an `Error`.
   *
   * @param err - whatever was thrown
   * @returns the `message` when one is present, otherwise the stringified value
   */
  function describeError(err: unknown): string {
    if (err instanceof Error) {
      return err.message;
    }
    if (typeof err === 'object' && err !== null && 'message' in err) {
      return String((err as { message: unknown }).message);
    }
    return String(err);
  }

  await t.test('Unclosed tag recovery', async () => {
    performanceTracker.startOperation('unclosed-tags');

    const malformedCases = [
      {
        name: 'Missing closing tag',
        xml: ` TEST-001 100.00 `,
        expectedError: /unclosed.*tag|missing.*closing|unexpected.*eof/i,
        recoverable: true,
        recoveryStrategy: 'Close unclosed tags'
      },
      {
        name: 'Mismatched tags',
        xml: ` TEST-002 100.00 `,
        expectedError: /mismatch|closing tag.*does not match|invalid.*structure/i,
        recoverable: true,
        recoveryStrategy: 'Fix tag mismatch'
      },
      {
        name: 'Extra closing tag',
        xml: ` TEST-003 100.00 `,
        expectedError: /unexpected.*closing|no matching.*opening/i,
        recoverable: true,
        recoveryStrategy: 'Remove orphan closing tag'
      },
      {
        name: 'Nested unclosed tags',
        xml: `
TEST-004 2024-01-01
`,
        expectedError: /unclosed|invalid.*nesting/i,
        recoverable: true,
        recoveryStrategy: 'Close nested tags in order'
      }
    ];

    for (const testCase of malformedCases) {
      const startTime = performance.now();

      try {
        const invoice = new einvoice.EInvoice();

        // First try: should fail with malformed XML
        if (invoice.fromXmlString) {
          await invoice.fromXmlString(testCase.xml);
          console.log(`✗ ${testCase.name}: Should have detected malformed XML`);
        }
      } catch (error: unknown) {
        const message = describeError(error);
        // A message that does not match fails the subtest: the assertion
        // throws out of this catch block.
        expect(message.toLowerCase()).toMatch(testCase.expectedError);
        console.log(`✓ ${testCase.name}: Correctly detected - ${message}`);

        // Try recovery
        if (testCase.recoverable) {
          try {
            const recovered = attemptRecovery(testCase.xml, testCase.name);
            console.log(` Recovery strategy: ${testCase.recoveryStrategy}`);

            if (recovered) {
              const invoice = new einvoice.EInvoice();
              if (invoice.fromXmlString) {
                await invoice.fromXmlString(recovered);
                console.log(` ✓ Recovery successful`);
              }
            }
          } catch (recoveryError: unknown) {
            // A failed recovery is reported but does not fail the subtest.
            console.log(` ✗ Recovery failed: ${describeError(recoveryError)}`);
          }
        }
      }

      performanceTracker.recordMetric('tag-recovery', performance.now() - startTime);
    }

    performanceTracker.endOperation('unclosed-tags');
  });

  await t.test('Invalid character recovery', async () => {
    performanceTracker.startOperation('invalid-chars');

    const invalidCharCases = [
      {
        name: 'Control characters',
        xml: ` TEST\x00005 Contains\x01control\x02characters `,
        expectedError: /invalid.*character|control.*character/i,
        fixStrategy: 'Remove control characters'
      },
      {
        name: 'Unescaped special characters',
        xml: ` Smith & Jones Amount < 1000 & Status > Active `,
        expectedError: /unescaped|invalid.*entity|ampersand/i,
        fixStrategy: 'Escape special characters'
      },
      {
        name: 'Invalid UTF-8 sequences',
        xml: Buffer.concat([
          Buffer.from('\n\n '),
          Buffer.from([0xFF, 0xFE]), // Invalid UTF-8
          Buffer.from('TEST-006\n')
        ]),
        expectedError: /invalid.*utf|encoding.*error|character.*encoding/i,
        fixStrategy: 'Replace invalid sequences'
      },
      {
        name: 'Mixed quotes',
        xml: ` 100.00 `,
        expectedError: /quote|attribute.*value|unterminated/i,
        fixStrategy: 'Fix quote mismatches'
      }
    ];

    for (const testCase of invalidCharCases) {
      const startTime = performance.now();

      try {
        const invoice = new einvoice.EInvoice();
        const xmlContent = testCase.xml;

        // Strings and buffers go through different entry points.
        if (invoice.fromXmlString && typeof xmlContent === 'string') {
          await invoice.fromXmlString(xmlContent);
          console.log(`✗ ${testCase.name}: Should have detected invalid characters`);
        } else if (invoice.fromBuffer && xmlContent instanceof Buffer) {
          await invoice.fromBuffer(xmlContent);
          console.log(`✗ ${testCase.name}: Should have detected invalid characters`);
        }
      } catch (error: unknown) {
        // The message is only logged here; `expectedError` is not asserted.
        console.log(`✓ ${testCase.name}: Detected - ${describeError(error)}`);
        console.log(` Fix strategy: ${testCase.fixStrategy}`);

        // Attempt fix
        const fixed = fixInvalidCharacters(testCase.xml);
        if (fixed) {
          console.log(` ✓ Characters fixed`);
        }
      }

      performanceTracker.recordMetric('char-recovery', performance.now() - startTime);
    }

    performanceTracker.endOperation('invalid-chars');
  });

  await t.test('Attribute error recovery', async () => {
    performanceTracker.startOperation('attribute-errors');

    const attributeErrors = [
      {
        name: 'Missing attribute quotes',
        xml: ` 100.00 `,
        expectedError: /attribute.*quote|unquoted.*attribute/i
      },
      {
        name: 'Duplicate attributes',
        xml: ` 100.00 `,
        expectedError: /duplicate.*attribute|attribute.*already defined/i
      },
      {
        name: 'Invalid attribute names',
        xml: ` 100.00 `,
        expectedError: /invalid.*attribute.*name|attribute.*start/i
      },
      {
        name: 'Equals sign issues',
        xml: ` 100.00 `,
        expectedError: /equals.*sign|attribute.*syntax/i
      }
    ];

    for (const testCase of attributeErrors) {
      const startTime = performance.now();

      try {
        const invoice = new einvoice.EInvoice();
        if (invoice.fromXmlString) {
          await invoice.fromXmlString(testCase.xml);
          console.log(`✗ ${testCase.name}: Should have detected attribute error`);
        }
      } catch (error: unknown) {
        console.log(`✓ ${testCase.name}: Detected - ${describeError(error)}`);
      }

      performanceTracker.recordMetric('attribute-recovery', performance.now() - startTime);
    }

    performanceTracker.endOperation('attribute-errors');
  });

  await t.test('Structural error recovery', async () => {
    performanceTracker.startOperation('structural-errors');

    const structuralErrors = [
      {
        name: 'Multiple root elements',
        xml: ` TEST-011 TEST-012 `,
        expectedError: /multiple.*root|document.*end|junk.*after/i,
        recoveryHint: 'Wrap in container element'
      },
      {
        name: 'Missing XML declaration',
        xml: ` TEST-013 100.00 `,
        expectedError: null, // Often parseable
        recoveryHint: 'Add XML declaration'
      },
      {
        name: 'Content before declaration',
        xml: `Some text before TEST-014 `,
        expectedError: /before.*declaration|content.*before.*prolog/i,
        recoveryHint: 'Remove content before declaration'
      },
      {
        name: 'Invalid nesting',
        xml: `
TEST-015
100.00
`,
        expectedError: /invalid.*nesting|unexpected.*closing/i,
        recoveryHint: 'Fix element nesting'
      }
    ];

    for (const testCase of structuralErrors) {
      const startTime = performance.now();

      try {
        const invoice = new einvoice.EInvoice();
        if (invoice.fromXmlString) {
          await invoice.fromXmlString(testCase.xml);

          // A null `expectedError` marks a case that is allowed to parse.
          if (testCase.expectedError) {
            console.log(`✗ ${testCase.name}: Should have detected structural error`);
          } else {
            console.log(`✓ ${testCase.name}: Parsed (may need improvement)`);
          }
        }
      } catch (error: unknown) {
        const message = describeError(error);
        if (testCase.expectedError) {
          expect(message.toLowerCase()).toMatch(testCase.expectedError);
          console.log(`✓ ${testCase.name}: Detected - ${message}`);
        } else {
          console.log(`✗ ${testCase.name}: Unexpected error - ${message}`);
        }
        console.log(` Recovery hint: ${testCase.recoveryHint}`);
      }

      performanceTracker.recordMetric('structural-recovery', performance.now() - startTime);
    }

    performanceTracker.endOperation('structural-errors');
  });

  await t.test('Real-world malformed XML patterns', async () => {
    performanceTracker.startOperation('real-world-patterns');

    // NOTE(review): in this copy of the file the 'Truncated file' fixture below
    // runs straight into the body of the progressive-parsing subtest. The text
    // in between (the end of this array, the loop over it and the next subtest
    // header) appears to be missing and must be restored for the file to parse.
    const realWorldPatterns = [
      {
        name: 'BOM in middle of file',
        xml: ` TEST-016\uFEFF 100.00 `,
        issue: 'Byte Order Mark not at start'
      },
      {
        name: 'Windows line endings mixed',
        xml: '\r\n\n TEST-017\r\n\n',
        issue: 'Inconsistent line endings'
      },
      {
        name: 'HTML entities in XML',
        xml: ` Müller & Co.   `,
        issue: 'HTML entities instead of XML'
      },
      {
        name: 'Truncated file',
        xml: `
TEST-018 2024-01-01
{
    // NOTE(review): the header of this subtest (and the end of the preceding
    // one) is not present in this copy of the file; only the body survives.
    performanceTracker.startOperation('progressive-parsing');

    /** One problem found while scanning: 1-based line, 0-based column, description. */
    type ParseIssue = { line: number; column: number; message: string };

    /**
     * Toy tag balancer used to demonstrate error collection and auto-repair.
     *
     * It scans the text line by line with a regular expression, so tags that
     * span several lines, and markup inside comments or CDATA, are not handled.
     */
    class ProgressiveParser {
      /**
       * Matches start, end and self-closing tags. Group 1 is the leading `/` of
       * an end tag, group 2 the element name, group 3 the trailing `/` of a
       * self-closing tag. Names may not begin with `?` or `!`, so declarations,
       * processing instructions and comments are skipped.
       */
      private static readonly tagPattern = /<(\/?)([^\s<>\/!?][^\s<>\/]*)[^<>]*?(\/?)>/g;

      /**
       * Checks that start and end tags are balanced and tries to repair the input.
       *
       * @param xml - text to scan
       * @returns `success` is true when no issue was found; `errors` lists every
       *   issue; `recovered` is the input with end tags appended for any elements
       *   still open (or the input unchanged when nothing was wrong)
       */
      async parseWithRecovery(
        xml: string
      ): Promise<{ success: boolean; errors: ParseIssue[]; recovered?: string }> {
        const errors: ParseIssue[] = [];
        const openElements: string[] = [];
        const lines = xml.split('\n');

        lines.forEach((line, index) => {
          // Walk the tags in document order: a line such as `<a>1</a><b>2</b>`
          // must pair each end tag with the start tag that precedes it.
          for (const match of line.matchAll(ProgressiveParser.tagPattern)) {
            const isClosing = match[1] === '/';
            const tagName = match[2];
            const isSelfClosing = match[3] === '/';
            if (!tagName) {
              continue;
            }

            if (isClosing) {
              const expected = openElements.pop();
              if (expected !== tagName) {
                errors.push({
                  line: index + 1,
                  column: match.index ?? 0,
                  message:
                    expected === undefined
                      ? `Unexpected closing tag </${tagName}>`
                      : `Expected </${expected}> but found </${tagName}>`
                });
              }
            } else if (!isSelfClosing) {
              // Self-closing elements never need a matching end tag.
              openElements.push(tagName);
            }
          }
        });

        // Check unclosed tags
        if (openElements.length > 0) {
          errors.push({
            line: lines.length,
            column: 0,
            message: `Unclosed tags: ${openElements.join(', ')}`
          });
        }

        return {
          success: errors.length === 0,
          errors,
          recovered: errors.length > 0 ? this.attemptAutoFix(xml, openElements) : xml
        };
      }

      /**
       * Appends end tags for the elements that were still open at end of input,
       * innermost first.
       *
       * @param xml - original text
       * @param unclosed - names of open elements, outermost first
       * @returns the text with the missing end tags appended
       */
      private attemptAutoFix(xml: string, unclosed: readonly string[]): string {
        const closers = [...unclosed].reverse().map((tag) => `</${tag}>`);
        return xml + closers.join('');
      }
    }

    const parser = new ProgressiveParser();
    const testXml = `
TEST-019 2024-01-01
100.00
`;

    const result = await parser.parseWithRecovery(testXml);

    console.log(`Progressive parsing result:`);
    console.log(` Success: ${result.success}`);
    console.log(` Errors found: ${result.errors.length}`);

    for (const error of result.errors) {
      console.log(` Line ${error.line}, Column ${error.column}: ${error.message}`);
    }

    // Only report a recovery when the parser actually changed the text.
    if (result.recovered && result.recovered !== testXml) {
      console.log(` ✓ Auto-recovery attempted`);
    }

    performanceTracker.endOperation('progressive-parsing');
  });

  // Helper functions

  /**
   * Applies a hard-coded textual repair for one of the named malformed-XML cases.
   *
   * @param xml - the malformed document
   * @param errorType - the case name, used to pick the repair
   * @returns the repaired text, or `null` when no repair is defined for the case
   */
  function attemptRecovery(xml: string, errorType: string): string | null {
    switch (errorType) {
      case 'Missing closing tag':
        // Simple strategy: add closing tag for unclosed elements.
        // NOTE(review): in this copy the pattern and replacement had lost their
        // markup (`/100\.00$/` -> '100.00', a no-op). Reconstructed on the
        // assumption that the unclosed element is <amount>, as in the
        // 'Extra closing tag' case below — confirm against the fixture.
        return xml.replace(/<amount>100\.00$/m, '<amount>100.00</amount>');

      case 'Mismatched tags':
        // Fix obvious mismatches.
        // NOTE(review): both arguments are empty in this copy, so this returns
        // the input unchanged; the original tag names could not be recovered.
        return xml.replace('', '');

      case 'Extra closing tag':
        // Remove orphan closing tags
        return xml.replace(/^\s*<\/amount>\s*$/m, '');

      default:
        return null;
    }
  }

  /**
   * Produces a cleaned-up copy of XML text that contains illegal characters.
   *
   * @param input - text, or a buffer that is decoded as UTF-8
   * @returns the text with control characters removed and stray `&` / `<` escaped
   */
  function fixInvalidCharacters(input: string | Buffer): string {
    let content = typeof input === 'string' ? input : input.toString('utf8');

    // Remove control characters (tab, LF and CR are kept)
    content = content.replace(/[\x00-\x08\x0B-\x0C\x0E-\x1F\x7F]/g, '');

    // Escape ampersands that do not already start a predefined entity
    content = content.replace(/&(?!(?:amp|lt|gt|quot|apos);)/g, '&amp;');

    // Fix common entity issues.
    // NOTE(review): this statement was unparseable in this copy
    // (`content.replace(//g, '>')`). It now escapes only a `<` that cannot
    // begin a tag, declaration or comment, so real markup is left intact;
    // confirm this matches the intended behaviour.
    content = content.replace(/<(?![\p{L}_:\/?!])/gu, '&lt;');

    return content;
  }

  // Performance summary
  console.log('\n' + performanceTracker.getSummary());

  // Recovery best practices
  console.log('\nMalformed XML Recovery Best Practices:');
  console.log('1. Identify the specific type of malformation');
  console.log('2. Apply targeted recovery strategies');
  console.log('3. Log all recovery attempts for debugging');
  console.log('4. Validate recovered XML before processing');
  console.log('5. Maintain original for audit purposes');
  console.log('6. Consider security implications of auto-recovery');
  console.log('7. Set limits on recovery attempts to prevent infinite loops');
});

tap.start();