import { expect, tap } from '@git.zone/tstest/tapbundle';
import { CorpusLoader } from '../../helpers/corpus.loader.js';
import { PerformanceTracker } from '../../helpers/performance.tracker.js';

// NOTE(review): the `xml` fixtures in this file contain only text values and no
// element markup as captured here — confirm the tags were not stripped before
// relying on the detection results logged by these tests.

/**
 * Runs format detection on inputs labelled as clear UBL / CII / XRechnung cases
 * and logs whether the detected format name contains the expected one.
 * No assertion is made on the outcome; confidence values are not checked yet.
 */
tap.test('FD-11: Confidence Scoring - should provide confidence scores for format detection', async () => {
  const { FormatDetector } = await import('../../../ts/formats/utils/format.detector.js');

  // Test confidence scoring for clear format indicators
  const highConfidenceTests = [
    {
      name: 'Clear UBL Invoice',
      xml: ` UBL-HIGH-CONF 2024-01-01 `,
      expectedFormat: 'ubl',
      expectedConfidence: 'high'
    },
    {
      name: 'Clear CII Invoice',
      xml: ` CII-HIGH-CONF `,
      expectedFormat: 'cii',
      expectedConfidence: 'high'
    },
    {
      name: 'Clear XRechnung',
      xml: ` urn:cen.eu:en16931:2017#compliant#urn:xoev-de:kosit:standard:xrechnung_3.0 XRECH-HIGH-CONF `,
      expectedFormat: 'xrechnung',
      expectedConfidence: 'high'
    }
  ];

  for (const test of highConfidenceTests) {
    const { result: format } = await PerformanceTracker.track(
      'confidence-scoring-high',
      async () => FormatDetector.detectFormat(test.xml)
    );

    console.log(`${test.name}: ${format}`);

    // For now, just test that detection works
    // In the future, this could test actual confidence scoring
    const formatStr = format.toString().toLowerCase();
    const hasExpectedFormat = formatStr.includes(test.expectedFormat);

    if (hasExpectedFormat) {
      console.log(` ✓ High confidence detection successful`);
    } else {
      console.log(` ○ Expected ${test.expectedFormat}, got ${format}`);
    }

    // Note: Actual confidence scoring would be tested here when implemented
    // expect(result.confidence).toBeGreaterThan(0.9);
  }
});

/**
 * Runs format detection on inputs labelled as ambiguous and logs whether the
 * result is "unknown" or some concrete format. Nothing is asserted.
 */
tap.test('FD-11: Low Confidence Cases - should handle ambiguous formats with lower confidence', async () => {
  const { FormatDetector } = await import('../../../ts/formats/utils/format.detector.js');

  const lowConfidenceTests = [
    {
      name: 'Minimal XML without clear indicators',
      xml: ` AMBIGUOUS-001 2024-01-01 `,
      expectedConfidence: 'low'
    },
    {
      name: 'Mixed namespace elements',
      xml: ` MIXED-001 Value1 Value2 `,
      expectedConfidence: 'low'
    },
    {
      name: 'Partial UBL structure',
      xml: ` PARTIAL-UBL `,
      expectedConfidence: 'medium'
    }
  ];

  for (const test of lowConfidenceTests) {
    const { result: format } = await PerformanceTracker.track(
      'confidence-scoring-low',
      async () => FormatDetector.detectFormat(test.xml)
    );

    console.log(`${test.name}: ${format}`);

    // Should detect something, but with appropriate confidence
    const formatStr = format.toString().toLowerCase();
    if (formatStr === 'unknown') {
      console.log(` ✓ Correctly identified as unknown for ambiguous input`);
    } else {
      console.log(` ○ Detected as ${format} (confidence scoring would help here)`);
    }

    // Note: Actual confidence scoring would be tested here when implemented
    // expect(result.confidence).toBeLessThan(0.7);
  }
});

/**
 * Placeholder test: prints the list of factors a future confidence score
 * should weigh and asserts only that the list has five entries.
 */
tap.test('FD-11: Confidence Scoring Algorithm - should test confidence calculation factors', async () => {
  console.log('Testing confidence scoring factors (placeholder for future implementation)');

  // This test documents what confidence scoring should consider
  const confidenceFactors = [
    {
      factor: 'Namespace presence and correctness',
      description: 'Strong namespace match should increase confidence',
      weight: 'high'
    },
    {
      factor: 'Root element name match',
      description: 'Correct root element increases confidence',
      weight: 'high'
    },
    {
      factor: 'Required child elements present',
      description: 'Expected structure elements boost confidence',
      weight: 'medium'
    },
    {
      factor: 'Profile/customization IDs',
      description: 'Specific profile markers provide high confidence',
      weight: 'high'
    },
    {
      factor: 'Document completeness',
      description: 'More complete documents have higher confidence',
      weight: 'low'
    }
  ];

  console.log('\nConfidence Scoring Factors (for future implementation):');
  confidenceFactors.forEach((factor, index) => {
    console.log(` ${index + 1}. ${factor.factor} (${factor.weight} weight)`);
    console.log(` ${factor.description}`);
  });

  // Placeholder test that passes
  expect(confidenceFactors.length).toEqual(5);
});

/**
 * Runs detection on a single borderline input and asserts only that a truthy
 * value comes back (i.e. the detector did not crash or return nothing).
 */
tap.test('FD-11: Format Detection with Confidence Thresholds - should respect confidence thresholds', async () => {
  const { FormatDetector } = await import('../../../ts/formats/utils/format.detector.js');

  // Test case where confidence might affect the result
  const thresholdTest = {
    name: 'Borderline UBL case',
    xml: ` `
  };

  const { result: format } = await PerformanceTracker.track(
    'confidence-threshold-test',
    async () => FormatDetector.detectFormat(thresholdTest.xml)
  );

  console.log(`${thresholdTest.name}: ${format}`);

  // For now, just test that it doesn't crash
  expect(format).toBeTruthy();

  // Future implementation could test:
  // - High threshold: might return UNKNOWN for low confidence
  // - Low threshold: would return detected format even with low confidence
  // - Medium threshold: balanced approach

  console.log('Note: Confidence threshold testing requires confidence scoring implementation');
});

/**
 * Detects the format of up to two files from each of the CII_XMLRECHNUNG and
 * UBL_XMLRECHNUNG corpus categories, logs per-file results with size and
 * tracked duration, then prints how the detected formats are distributed.
 * Returns early (passing) when the corpus yields no files; otherwise asserts
 * that at least one file was processed without error.
 */
tap.test('FD-11: Real File Confidence Distribution - should show confidence patterns in real files', async () => {
  // Test confidence distribution across real corpus files
  const ciiFiles = await CorpusLoader.getFiles('CII_XMLRECHNUNG');
  const ublFiles = await CorpusLoader.getFiles('UBL_XMLRECHNUNG');

  const testFiles = [
    ...ciiFiles.slice(0, 2),
    ...ublFiles.slice(0, 2)
  ];

  if (testFiles.length === 0) {
    console.log('No test files available for confidence distribution test');
    return;
  }

  console.log(`Analyzing confidence patterns in ${testFiles.length} real files`);

  const { FormatDetector } = await import('../../../ts/formats/utils/format.detector.js');
  const { promises: fs } = await import('fs');
  const path = await import('path');

  const results: { file: string; format: string; size: number }[] = [];

  for (const filePath of testFiles) {
    try {
      const xmlContent = await fs.readFile(filePath, 'utf-8');
      const fileName = path.basename(filePath);

      const { result: format, metric } = await PerformanceTracker.track(
        'real-file-confidence',
        async () => FormatDetector.detectFormat(xmlContent)
      );

      results.push({
        file: fileName,
        format: format.toString(),
        size: xmlContent.length
      });

      console.log(` ${fileName}: ${format} (${Math.round(xmlContent.length/1024)}KB, ${metric.duration.toFixed(1)}ms)`);
    } catch (error: unknown) {
      // A failing file is reported and skipped so the remaining files still run.
      // The catch variable is `unknown` under strict mode, so narrow before
      // reading `.message`; non-Error throwables are stringified instead.
      const message = error instanceof Error ? error.message : String(error);
      console.log(` ${path.basename(filePath)}: Error - ${message}`);
    }
  }

  // Analyze format distribution (lower-cased format name -> number of files)
  const formatCounts: Record<string, number> = {};
  for (const r of results) {
    const format = r.format.toLowerCase();
    formatCounts[format] = (formatCounts[format] ?? 0) + 1;
  }

  console.log('\nFormat Distribution:');
  for (const [format, count] of Object.entries(formatCounts)) {
    const percentage = (count / results.length * 100).toFixed(1);
    console.log(` ${format}: ${count} files (${percentage}%)`);
  }

  expect(results.length).toBeGreaterThan(0);
});

tap.start();