// Source: einvoice/test/suite/einvoice_parsing/test.parse-02.malformed-recovery.ts
// (TypeScript, 541 lines, 17 KiB; revision dated 2025-05-25 19:45:37 +00:00)
import { expect, tap } from '@git.zone/tstest/tapbundle';
import * as einvoice from '../../../ts/index.js';
import * as plugins from '../../plugins.js';
import { CorpusLoader } from '../../helpers/corpus.loader.js';
import { PerformanceTracker } from '../../helpers/performance.tracker.js';
// PARSE-02: registers a single tap test whose sub-tests hand deliberately malformed
// XML to EInvoice and log how each case is handled.
tap.test('PARSE-02: Malformed XML Recovery - Recover from common XML parsing errors', async (t) => {
// Tracker labelled 'PARSE-02'; the sub-tests call startOperation / recordMetric / endOperation on it.
const performanceTracker = new PerformanceTracker('PARSE-02');
// Sub-test: hands XML with broken tag structure to EInvoice.fromXmlString, checks the
// thrown message against the case's expected pattern, then retries with a repaired copy
// produced by attemptRecovery().
await t.test('Unclosed tag recovery', async () => {
  performanceTracker.startOperation('unclosed-tags');

  // Continuation lines of every XML template literal stay at column 0 on purpose:
  // indentation added here would become part of the document under test.
  const malformedCases = [
    {
      name: 'Missing closing tag',
      xml: `<?xml version="1.0" encoding="UTF-8"?>
<invoice>
<id>TEST-001</id>
<amount>100.00
</invoice>`,
      expectedError: /unclosed.*tag|missing.*closing|unexpected.*eof/i,
      recoverable: true,
      recoveryStrategy: 'Close unclosed tags'
    },
    {
      name: 'Mismatched tags',
      xml: `<?xml version="1.0" encoding="UTF-8"?>
<invoice>
<id>TEST-002</id>
<amount>100.00</price>
</invoice>`,
      expectedError: /mismatch|closing tag.*does not match|invalid.*structure/i,
      recoverable: true,
      recoveryStrategy: 'Fix tag mismatch'
    },
    {
      name: 'Extra closing tag',
      xml: `<?xml version="1.0" encoding="UTF-8"?>
<invoice>
<id>TEST-003</id>
</amount>
<amount>100.00</amount>
</invoice>`,
      expectedError: /unexpected.*closing|no matching.*opening/i,
      recoverable: true,
      recoveryStrategy: 'Remove orphan closing tag'
    },
    {
      name: 'Nested unclosed tags',
      xml: `<?xml version="1.0" encoding="UTF-8"?>
<invoice>
<header>
<id>TEST-004
<date>2024-01-01</date>
</header>
</invoice>`,
      expectedError: /unclosed|invalid.*nesting/i,
      recoverable: true,
      recoveryStrategy: 'Close nested tags in order'
    }
  ];

  for (const testCase of malformedCases) {
    const startTime = performance.now();
    try {
      const invoice = new einvoice.EInvoice();
      // First try: should fail with malformed XML
      if (invoice.fromXmlString) {
        await invoice.fromXmlString(testCase.xml);
        // Deliberately lenient: a parser that accepts the input is only reported, not failed.
        console.log(`${testCase.name}: Should have detected malformed XML`);
      }
    } catch (error) {
      // A thrown value is not guaranteed to be an Error; normalise before reading a message
      // so a string/object throw cannot crash this handler with a TypeError.
      const message = error instanceof Error ? error.message : String(error);
      expect(message.toLowerCase()).toMatch(testCase.expectedError);
      console.log(`${testCase.name}: Correctly detected - ${message}`);
      // Try recovery
      if (testCase.recoverable) {
        try {
          const recovered = attemptRecovery(testCase.xml, testCase.name);
          console.log(` Recovery strategy: ${testCase.recoveryStrategy}`);
          if (recovered) {
            const retryInvoice = new einvoice.EInvoice();
            if (retryInvoice.fromXmlString) {
              await retryInvoice.fromXmlString(recovered);
              console.log(` ✓ Recovery successful`);
            }
          } else {
            // attemptRecovery() returned nothing: make the gap visible instead of staying silent.
            console.log(` ✗ No automatic recovery available`);
          }
        } catch (recoveryError) {
          const recoveryMessage =
            recoveryError instanceof Error ? recoveryError.message : String(recoveryError);
          console.log(` ✗ Recovery failed: ${recoveryMessage}`);
        }
      }
    }
    performanceTracker.recordMetric('tag-recovery', performance.now() - startTime);
  }

  performanceTracker.endOperation('unclosed-tags');
});
// Sub-test: feeds XML containing control characters, unescaped specials, invalid UTF-8
// bytes and broken attribute quoting to EInvoice, logs what was detected and runs
// fixInvalidCharacters() over the failing input.
await t.test('Invalid character recovery', async () => {
  performanceTracker.startOperation('invalid-chars');

  // Continuation lines of every XML template literal stay at column 0 on purpose:
  // indentation added here would become part of the document under test.
  const invalidCharCases = [
    {
      name: 'Control characters',
      xml: `<?xml version="1.0" encoding="UTF-8"?>
<invoice>
<id>TEST\x00005</id>
<note>Contains\x01control\x02characters</note>
</invoice>`,
      expectedError: /invalid.*character|control.*character/i,
      fixStrategy: 'Remove control characters'
    },
    {
      name: 'Unescaped special characters',
      xml: `<?xml version="1.0" encoding="UTF-8"?>
<invoice>
<supplier>Smith & Jones</supplier>
<condition>Amount < 1000 & Status > Active</condition>
</invoice>`,
      expectedError: /unescaped|invalid.*entity|ampersand/i,
      fixStrategy: 'Escape special characters'
    },
    {
      name: 'Invalid UTF-8 sequences',
      xml: Buffer.concat([
        Buffer.from('<?xml version="1.0" encoding="UTF-8"?>\n<invoice>\n <id>'),
        Buffer.from([0xFF, 0xFE]), // Invalid UTF-8
        Buffer.from('TEST-006</id>\n</invoice>')
      ]),
      expectedError: /invalid.*utf|encoding.*error|character.*encoding/i,
      fixStrategy: 'Replace invalid sequences'
    },
    {
      name: 'Mixed quotes',
      xml: `<?xml version="1.0" encoding="UTF-8"?>
<invoice id="test' currency='EUR">
<amount>100.00</amount>
</invoice>`,
      expectedError: /quote|attribute.*value|unterminated/i,
      fixStrategy: 'Fix quote mismatches'
    }
  ];

  for (const testCase of invalidCharCases) {
    const startTime = performance.now();
    try {
      const invoice = new einvoice.EInvoice();
      // String cases go through fromXmlString, Buffer cases through fromBuffer.
      const xmlContent = testCase.xml;
      if (invoice.fromXmlString && typeof xmlContent === 'string') {
        await invoice.fromXmlString(xmlContent);
        console.log(`${testCase.name}: Should have detected invalid characters`);
      } else if (invoice.fromBuffer && xmlContent instanceof Buffer) {
        await invoice.fromBuffer(xmlContent);
        console.log(`${testCase.name}: Should have detected invalid characters`);
      } else {
        // Neither entry point applies; say so rather than silently skipping the case.
        console.log(`${testCase.name}: Skipped - no parse method available for this input type`);
      }
    } catch (error) {
      // A thrown value is not guaranteed to be an Error; normalise before reading a message.
      const message = error instanceof Error ? error.message : String(error);
      console.log(`${testCase.name}: Detected - ${message}`);
      // Informational only: the suite stays lenient, but a mismatch is made visible.
      if (!testCase.expectedError.test(message)) {
        console.log(` Note: message does not match expected pattern ${testCase.expectedError}`);
      }
      console.log(` Fix strategy: ${testCase.fixStrategy}`);
      // Attempt fix
      const fixed = fixInvalidCharacters(testCase.xml);
      if (fixed) {
        console.log(` ✓ Characters fixed`);
      }
    }
    performanceTracker.recordMetric('char-recovery', performance.now() - startTime);
  }

  performanceTracker.endOperation('invalid-chars');
});
// Sub-test: feeds XML with malformed attributes (unquoted, duplicated, badly named,
// broken '=' syntax) to EInvoice.fromXmlString and logs whether an error was raised.
await t.test('Attribute error recovery', async () => {
  performanceTracker.startOperation('attribute-errors');

  // Continuation lines of every XML template literal stay at column 0 on purpose:
  // indentation added here would become part of the document under test.
  const attributeErrors = [
    {
      name: 'Missing attribute quotes',
      xml: `<?xml version="1.0" encoding="UTF-8"?>
<invoice id=TEST-007 date=2024-01-01>
<amount>100.00</amount>
</invoice>`,
      expectedError: /attribute.*quote|unquoted.*attribute/i
    },
    {
      name: 'Duplicate attributes',
      xml: `<?xml version="1.0" encoding="UTF-8"?>
<invoice id="TEST-008" id="DUPLICATE">
<amount currency="EUR" currency="USD">100.00</amount>
</invoice>`,
      expectedError: /duplicate.*attribute|attribute.*already defined/i
    },
    {
      name: 'Invalid attribute names',
      xml: `<?xml version="1.0" encoding="UTF-8"?>
<invoice 123id="TEST-009" data-*field="value">
<amount>100.00</amount>
</invoice>`,
      expectedError: /invalid.*attribute.*name|attribute.*start/i
    },
    {
      name: 'Equals sign issues',
      xml: `<?xml version="1.0" encoding="UTF-8"?>
<invoice id="TEST-010" status"active">
<amount currency = = "EUR">100.00</amount>
</invoice>`,
      expectedError: /equals.*sign|attribute.*syntax/i
    }
  ];

  for (const testCase of attributeErrors) {
    const startTime = performance.now();
    try {
      const invoice = new einvoice.EInvoice();
      if (invoice.fromXmlString) {
        await invoice.fromXmlString(testCase.xml);
        // Deliberately lenient: a parser that accepts the input is only reported, not failed.
        console.log(`${testCase.name}: Should have detected attribute error`);
      }
    } catch (error) {
      // A thrown value is not guaranteed to be an Error; normalise before reading a message.
      const message = error instanceof Error ? error.message : String(error);
      console.log(`${testCase.name}: Detected - ${message}`);
      // Informational only: the suite stays lenient, but a mismatch is made visible.
      if (!testCase.expectedError.test(message)) {
        console.log(` Note: message does not match expected pattern ${testCase.expectedError}`);
      }
    }
    performanceTracker.recordMetric('attribute-recovery', performance.now() - startTime);
  }

  performanceTracker.endOperation('attribute-errors');
});
// Sub-test: feeds structurally invalid documents (several roots, no declaration, text
// before the declaration, crossed nesting) to EInvoice.fromXmlString. Cases that carry
// an expectedError pattern assert the thrown message against it; the null case only logs.
await t.test('Structural error recovery', async () => {
  performanceTracker.startOperation('structural-errors');

  // Continuation lines of every XML template literal stay at column 0 on purpose:
  // indentation added here would become part of the document under test.
  const structuralErrors = [
    {
      name: 'Multiple root elements',
      xml: `<?xml version="1.0" encoding="UTF-8"?>
<invoice>
<id>TEST-011</id>
</invoice>
<invoice>
<id>TEST-012</id>
</invoice>`,
      expectedError: /multiple.*root|document.*end|junk.*after/i,
      recoveryHint: 'Wrap in container element'
    },
    {
      name: 'Missing XML declaration',
      xml: `<invoice>
<id>TEST-013</id>
<amount>100.00</amount>
</invoice>`,
      expectedError: null, // Often parseable
      recoveryHint: 'Add XML declaration'
    },
    {
      name: 'Content before declaration',
      xml: `Some text before
<?xml version="1.0" encoding="UTF-8"?>
<invoice>
<id>TEST-014</id>
</invoice>`,
      expectedError: /before.*declaration|content.*before.*prolog/i,
      recoveryHint: 'Remove content before declaration'
    },
    {
      name: 'Invalid nesting',
      xml: `<?xml version="1.0" encoding="UTF-8"?>
<invoice>
<header>
<id>TEST-015</id>
</header>
<line>
</header>
<amount>100.00</amount>
</line>
</invoice>`,
      expectedError: /invalid.*nesting|unexpected.*closing/i,
      recoveryHint: 'Fix element nesting'
    }
  ];

  for (const testCase of structuralErrors) {
    const startTime = performance.now();
    try {
      const invoice = new einvoice.EInvoice();
      if (invoice.fromXmlString) {
        await invoice.fromXmlString(testCase.xml);
        if (testCase.expectedError) {
          console.log(`${testCase.name}: Should have detected structural error`);
        } else {
          console.log(`${testCase.name}: Parsed (may need improvement)`);
        }
      }
    } catch (error) {
      // A thrown value is not guaranteed to be an Error; normalise before reading a message.
      const message = error instanceof Error ? error.message : String(error);
      if (testCase.expectedError) {
        expect(message.toLowerCase()).toMatch(testCase.expectedError);
        console.log(`${testCase.name}: Detected - ${message}`);
      } else {
        console.log(`${testCase.name}: Unexpected error - ${message}`);
      }
      console.log(` Recovery hint: ${testCase.recoveryHint}`);
    }
    performanceTracker.recordMetric('structural-recovery', performance.now() - startTime);
  }

  performanceTracker.endOperation('structural-errors');
});
// Sub-test: runs a handful of malformed inputs (stray BOM, mixed line endings, HTML
// entities, truncated file) through EInvoice.fromXmlString and logs whether each one
// parsed or raised. Nothing is asserted here.
await t.test('Real-world malformed XML patterns', async () => {
  performanceTracker.startOperation('real-world-patterns');

  // Continuation lines of every XML template literal stay at column 0 on purpose:
  // indentation added here would become part of the document under test.
  const realWorldPatterns = [
    {
      name: 'BOM in middle of file',
      xml: `<?xml version="1.0" encoding="UTF-8"?>
<invoice>
<id>TEST-016</id>\uFEFF
<amount>100.00</amount>
</invoice>`,
      issue: 'Byte Order Mark not at start'
    },
    {
      name: 'Windows line endings mixed',
      xml: '<?xml version="1.0" encoding="UTF-8"?>\r\n<invoice>\n <id>TEST-017</id>\r\n</invoice>\n',
      issue: 'Inconsistent line endings'
    },
    {
      name: 'HTML entities in XML',
      xml: `<?xml version="1.0" encoding="UTF-8"?>
<invoice>
<supplier>M&uuml;ller &amp; Co.</supplier>
<space>&nbsp;</space>
</invoice>`,
      issue: 'HTML entities instead of XML'
    },
    {
      name: 'Truncated file',
      xml: `<?xml version="1.0" encoding="UTF-8"?>
<invoice>
<header>
<id>TEST-018</id>
<date>2024-01-01</date>
</header>
<body>
<lines>
<line>
<desc`,
      issue: 'File truncated mid-tag'
    }
  ];

  for (const pattern of realWorldPatterns) {
    const startTime = performance.now();
    try {
      const invoice = new einvoice.EInvoice();
      if (invoice.fromXmlString) {
        await invoice.fromXmlString(pattern.xml);
        console.log(`⚠️ ${pattern.name}: Parsed despite issue - ${pattern.issue}`);
      }
    } catch (error) {
      // A thrown value is not guaranteed to be an Error; normalise before reading a message.
      const message = error instanceof Error ? error.message : String(error);
      console.log(`${pattern.name}: Detected issue - ${pattern.issue}`);
      console.log(` Error: ${message}`);
    }
    performanceTracker.recordMetric('real-world-recovery', performance.now() - startTime);
  }

  performanceTracker.endOperation('real-world-patterns');
});
// Sub-test: exercises a small tag-balance checker defined locally in this callback
// rather than the EInvoice parser itself.
await t.test('Progressive parsing with error recovery', async () => {
// Opens the 'progressive-parsing' operation on the tracker; it is closed at the end of this sub-test.
performanceTracker.startOperation('progressive-parsing');
/**
 * Line-oriented tag-balance checker used to demonstrate error collection plus a
 * best-effort repair. It is a heuristic, not an XML parser:
 *  - only `<name ...>`, `<name .../>` and `</name>` are recognised; declarations,
 *    processing instructions, comments, DOCTYPE and CDATA openers (`<?`, `<!`) are skipped;
 *  - tag-like text inside comments/CDATA and `>` inside attribute values are not handled;
 *  - a tag split across several lines is not seen.
 */
class ProgressiveParser {
  /**
   * Scans `xml` for unbalanced elements.
   *
   * @param xml Document text; lines are split on '\n'.
   * @returns `success` is true when no problem was found. `errors` lists each problem
   *   with a 1-based `line` and a 0-based `column` (offset of the offending tag in its
   *   line; 0 for the end-of-document "Unclosed tags" entry). `recovered` is the input
   *   with missing closing tags inserted and orphan closing tags dropped; it equals the
   *   input when there were no errors.
   */
  async parseWithRecovery(xml: string): Promise<{
    success: boolean;
    errors: Array<{ line: number; column: number; message: string }>;
    recovered?: string
  }> {
    const errors: Array<{ line: number; column: number; message: string }> = [];
    const tagStack: string[] = [];
    // Group 1: '/' for a closing tag. Group 2: element name (may not start with '!' or '?',
    // which excludes declarations, PIs and comments). Group 3: remainder up to '>'.
    const tagPattern = /<(\/?)([^\s<>\/!?][^\s<>\/]*)([^<>]*)>/g;
    const lines = xml.split('\n');

    // Tags are handled in document order (not "all opens, then all closes" per line) and
    // each line is rebuilt while scanning so repairs land where the problem was detected.
    const rebuiltLines = lines.map((line, lineIndex) => {
      let rebuilt = '';
      let cursor = 0;

      for (const match of line.matchAll(tagPattern)) {
        const [tagText, slash, tagName, rest] = match;
        const column = match.index ?? 0;
        rebuilt += line.slice(cursor, column);
        cursor = column + tagText.length;

        if (slash === '') {
          // Self-closing elements (`<x/>`) never need a matching close tag.
          if (!/\/\s*$/.test(rest)) {
            tagStack.push(tagName);
          }
          rebuilt += tagText;
          continue;
        }

        const openIndex = tagStack.lastIndexOf(tagName);
        if (openIndex === -1) {
          // Nothing open with this name: report it and leave it out of the repaired text.
          errors.push({
            line: lineIndex + 1,
            column,
            message: `Unexpected closing tag </${tagName}>`
          });
          continue;
        }

        // Everything opened after the matching element was left unclosed; close those
        // (innermost first) right before this tag so later tags do not cascade.
        const skipped = tagStack.splice(openIndex).slice(1).reverse();
        if (skipped.length > 0) {
          errors.push({
            line: lineIndex + 1,
            column,
            message: `Expected </${skipped[0]}> but found </${tagName}>`
          });
          rebuilt += skipped.map((name) => `</${name}>`).join('');
        }
        rebuilt += tagText;
      }

      return rebuilt + line.slice(cursor);
    });

    let recovered = rebuiltLines.join('\n');

    // Check unclosed tags
    if (tagStack.length > 0) {
      errors.push({
        line: lines.length,
        column: 0,
        message: `Unclosed tags: ${tagStack.join(', ')}`
      });
      // Close what is still open, innermost first, without mutating the stack.
      recovered += [...tagStack].reverse().map((name) => `</${name}>`).join('');
    }

    return {
      success: errors.length === 0,
      errors,
      recovered
    };
  }
}
// Feed a document with an unclosed <date> and an unclosed <body> to the local
// ProgressiveParser and print its report.
const sampleXml = `<?xml version="1.0" encoding="UTF-8"?>
<invoice>
<header>
<id>TEST-019</id>
<date>2024-01-01
</header>
<body>
<amount>100.00</amount>
</invoice>`;
const progressive = new ProgressiveParser();
const outcome = await progressive.parseWithRecovery(sampleXml);

console.log(`Progressive parsing result:`);
console.log(` Success: ${outcome.success}`);
console.log(` Errors found: ${outcome.errors.length}`);
outcome.errors.forEach((issue) => {
  console.log(` Line ${issue.line}, Column ${issue.column}: ${issue.message}`);
});

// Only announce a recovery when the parser handed back text that differs from the input.
const recoveryDiffers = Boolean(outcome.recovered) && outcome.recovered !== sampleXml;
if (recoveryDiffers) {
  console.log(` ✓ Auto-recovery attempted`);
}

performanceTracker.endOperation('progressive-parsing');
});
// Helper functions
/**
 * Applies a simple, case-specific textual repair to malformed XML.
 *
 * @param xml Malformed document text.
 * @param errorType Name of the malformation; selects the repair strategy.
 * @returns The repaired text, or null when no strategy exists for `errorType`.
 *   The returned text is not validated and may equal the input when nothing matched.
 */
function attemptRecovery(xml: string, errorType: string): string | null {
  switch (errorType) {
    case 'Missing closing tag':
    case 'Nested unclosed tags':
      // Close any element whose text runs to the end of its line without a closing tag.
      // The 'm' flag is essential: without it '$' only matches at the very end of the
      // document, so a dangling element in the middle was never repaired.
      // The text must start with a non-space character so bare container openers such as
      // "<invoice>" on their own line are left alone; '(?<!\/)' skips self-closing tags.
      return xml.replace(
        /<([A-Za-z_][\w:.-]*)((?:\s[^<>]*)?)(?<!\/)>([^<>\s][^<>\n]*)$/gm,
        '<$1$2>$3</$1>'
      );
    case 'Mismatched tags':
      // Fix obvious mismatches
      return xml.replace('</price>', '</amount>');
    case 'Extra closing tag':
      // Remove the first closing </amount> that sits alone on its line
      return xml.replace(/^\s*<\/amount>\s*$/m, '');
    default:
      return null;
  }
}
/**
 * Best-effort clean-up of character-level problems in XML text.
 *
 * @param input XML as a string, or as a Buffer that is decoded as UTF-8 first.
 * @returns The text with control characters removed, bare ampersands escaped and
 *   stray '<' characters escaped. Markup itself is left intact.
 */
function fixInvalidCharacters(input: string | Buffer): string {
  let content = input instanceof Buffer ? input.toString('utf8', 0, input.length) : input;
  // Remove control characters (tab, LF and CR are kept)
  content = content.replace(/[\x00-\x08\x0B-\x0C\x0E-\x1F\x7F]/g, '');
  // Escape ampersands that do not start a predefined entity or a numeric character
  // reference (&#123; / &#x1F;), so valid references are not double-escaped.
  content = content.replace(/&(?!(?:amp|lt|gt|quot|apos|#[0-9]+|#x[0-9a-fA-F]+);)/g, '&amp;');
  // Escape only a '<' that cannot start markup (not followed by a name start, '/', '?'
  // or '!'). Escaping every '<' and '>' would turn the tags themselves into text.
  // A literal '>' is legal in character data, so it is left untouched.
  content = content.replace(/<(?![A-Za-z_:\/?!])/g, '&lt;');
  return content;
}
// Print the tracker's summary, preceded by a blank line.
const summaryText = performanceTracker.getSummary();
console.log('\n' + summaryText);

// Recovery best practices: a heading followed by seven numbered items, one console line each.
const bestPracticeLines = [
  '\nMalformed XML Recovery Best Practices:',
  '1. Identify the specific type of malformation',
  '2. Apply targeted recovery strategies',
  '3. Log all recovery attempts for debugging',
  '4. Validate recovered XML before processing',
  '5. Maintain original for audit purposes',
  '6. Consider security implications of auto-recovery',
  '7. Set limits on recovery attempts to prevent infinite loops'
];
for (const practiceLine of bestPracticeLines) {
  console.log(practiceLine);
}
});
// Called once the test above has been registered; presumably this starts the tap
// runner — confirm against the tapbundle documentation.
tap.start();