update
This commit is contained in:
@ -0,0 +1,260 @@
|
||||
import { expect, tap } from '@git.zone/tstest/tapbundle';
|
||||
import { CorpusLoader } from '../../helpers/corpus.loader.js';
|
||||
import { PerformanceTracker } from '../../helpers/performance.tracker.js';
|
||||
|
||||
// FD-11 (clear inputs): feeds documents with unmistakable format markers to
// FormatDetector.detectFormat and logs whether the detected format's string
// form contains the expected identifier. Nothing is asserted yet; a mismatch
// is only reported on the console.
tap.test('FD-11: Confidence Scoring - should provide confidence scores for format detection', async () => {
  const { FormatDetector } = await import('../../../ts/formats/utils/format.detector.js');

  // Fixtures with clear format indicators. `expectedConfidence` is not read
  // by this test; it records the intent for when confidence scoring exists.
  const clearCases = [
    {
      name: 'Clear UBL Invoice',
      xml: [
        '<?xml version="1.0"?>',
        '<Invoice xmlns="urn:oasis:names:specification:ubl:schema:xsd:Invoice-2"',
        'xmlns:cbc="urn:oasis:names:specification:ubl:schema:xsd:CommonBasicComponents-2">',
        '<cbc:ID>UBL-HIGH-CONF</cbc:ID>',
        '<cbc:IssueDate>2024-01-01</cbc:IssueDate>',
        '</Invoice>'
      ].join('\n'),
      expectedFormat: 'ubl',
      expectedConfidence: 'high'
    },
    {
      name: 'Clear CII Invoice',
      xml: [
        '<?xml version="1.0"?>',
        '<rsm:CrossIndustryInvoice xmlns:rsm="urn:un:unece:uncefact:data:standard:CrossIndustryInvoice:100">',
        '<rsm:ExchangedDocument>',
        '<ram:ID xmlns:ram="urn:un:unece:uncefact:data:standard:ReusableAggregateBusinessInformationEntity:100">CII-HIGH-CONF</ram:ID>',
        '</rsm:ExchangedDocument>',
        '</rsm:CrossIndustryInvoice>'
      ].join('\n'),
      expectedFormat: 'cii',
      expectedConfidence: 'high'
    },
    {
      name: 'Clear XRechnung',
      xml: [
        '<?xml version="1.0"?>',
        '<Invoice xmlns="urn:oasis:names:specification:ubl:schema:xsd:Invoice-2"',
        'xmlns:cbc="urn:oasis:names:specification:ubl:schema:xsd:CommonBasicComponents-2">',
        '<cbc:CustomizationID>urn:cen.eu:en16931:2017#compliant#urn:xoev-de:kosit:standard:xrechnung_3.0</cbc:CustomizationID>',
        '<cbc:ID>XRECH-HIGH-CONF</cbc:ID>',
        '</Invoice>'
      ].join('\n'),
      expectedFormat: 'xrechnung',
      expectedConfidence: 'high'
    }
  ];

  for (const sample of clearCases) {
    // track() yields an object whose `result` is the callback's return value.
    const tracked = await PerformanceTracker.track(
      'confidence-scoring-high',
      async () => FormatDetector.detectFormat(sample.xml)
    );
    const detected = tracked.result;

    console.log(`${sample.name}: ${detected}`);

    // For now only detection itself is exercised: compare case-insensitively
    // and report the outcome without failing the test.
    const matchesExpectation = detected
      .toString()
      .toLowerCase()
      .includes(sample.expectedFormat);

    console.log(
      matchesExpectation
        ? ` ✓ High confidence detection successful`
        : ` ○ Expected ${sample.expectedFormat}, got ${detected}`
    );

    // Note: once confidence scoring is implemented, assert on it here, e.g.
    // expect(result.confidence).toBeGreaterThan(0.9);
  }
});
|
||||
|
||||
// FD-11 (ambiguous inputs): runs detection on documents that lack clear format
// markers and logs whether the result's lower-cased string form is 'unknown'.
// Either outcome is accepted; nothing is asserted.
tap.test('FD-11: Low Confidence Cases - should handle ambiguous formats with lower confidence', async () => {
  const { FormatDetector } = await import('../../../ts/formats/utils/format.detector.js');

  // `expectedConfidence` is informational only; the loop below never reads it.
  const ambiguousCases = [
    {
      name: 'Minimal XML without clear indicators',
      xml: [
        '<?xml version="1.0"?>',
        '<Document>',
        '<ID>AMBIGUOUS-001</ID>',
        '<Date>2024-01-01</Date>',
        '</Document>'
      ].join('\n'),
      expectedConfidence: 'low'
    },
    {
      name: 'Mixed namespace elements',
      xml: [
        '<?xml version="1.0"?>',
        '<Invoice xmlns="http://example.com/custom-namespace">',
        '<ID>MIXED-001</ID>',
        '<Elements>',
        '<Element1>Value1</Element1>',
        '<Element2>Value2</Element2>',
        '</Elements>',
        '</Invoice>'
      ].join('\n'),
      expectedConfidence: 'low'
    },
    {
      name: 'Partial UBL structure',
      xml: [
        '<?xml version="1.0"?>',
        '<Invoice>',
        '<ID>PARTIAL-UBL</ID>',
        '<!-- Missing namespace declarations -->',
        '</Invoice>'
      ].join('\n'),
      expectedConfidence: 'medium'
    }
  ];

  for (const sample of ambiguousCases) {
    const { result: detected } = await PerformanceTracker.track(
      'confidence-scoring-low',
      async () => FormatDetector.detectFormat(sample.xml)
    );

    console.log(`${sample.name}: ${detected}`);

    // Something is always reported; only the wording differs by outcome.
    const normalized = detected.toString().toLowerCase();
    if (normalized !== 'unknown') {
      console.log(` ○ Detected as ${detected} (confidence scoring would help here)`);
    } else {
      console.log(` ✓ Correctly identified as unknown for ambiguous input`);
    }

    // Note: once confidence scoring is implemented, assert on it here, e.g.
    // expect(result.confidence).toBeLessThan(0.7);
  }
});
|
||||
|
||||
// FD-11 (documentation placeholder): prints the factors a future confidence
// score should weigh. The only assertion is that the list has five entries.
tap.test('FD-11: Confidence Scoring Algorithm - should test confidence calculation factors', async () => {
  console.log('Testing confidence scoring factors (placeholder for future implementation)');

  // Each entry names a factor, explains it, and gives a coarse weight label.
  const confidenceFactors = [
    {
      factor: 'Namespace presence and correctness',
      description: 'Strong namespace match should increase confidence',
      weight: 'high'
    },
    {
      factor: 'Root element name match',
      description: 'Correct root element increases confidence',
      weight: 'high'
    },
    {
      factor: 'Required child elements present',
      description: 'Expected structure elements boost confidence',
      weight: 'medium'
    },
    {
      factor: 'Profile/customization IDs',
      description: 'Specific profile markers provide high confidence',
      weight: 'high'
    },
    {
      factor: 'Document completeness',
      description: 'More complete documents have higher confidence',
      weight: 'low'
    }
  ];

  console.log('\nConfidence Scoring Factors (for future implementation):');
  for (let position = 0; position < confidenceFactors.length; position++) {
    const entry = confidenceFactors[position];
    // Numbering shown to the reader is 1-based.
    console.log(` ${position + 1}. ${entry.factor} (${entry.weight} weight)`);
    console.log(` ${entry.description}`);
  }

  // Placeholder assertion so the test records a pass.
  expect(confidenceFactors.length).toEqual(5);
});
|
||||
|
||||
// FD-11 (threshold placeholder): runs detection on a near-empty UBL document
// and asserts only that a truthy result comes back. Threshold behaviour
// cannot be checked until confidence scoring exists.
tap.test('FD-11: Format Detection with Confidence Thresholds - should respect confidence thresholds', async () => {
  const { FormatDetector } = await import('../../../ts/formats/utils/format.detector.js');

  // A case where confidence might eventually change the outcome: the UBL
  // namespace is present but the invoice has no content.
  const borderlineCase = {
    name: 'Borderline UBL case',
    xml: [
      '<?xml version="1.0"?>',
      '<Invoice xmlns="urn:oasis:names:specification:ubl:schema:xsd:Invoice-2">',
      '<!-- Very minimal UBL - might have low confidence -->',
      '</Invoice>'
    ].join('\n')
  };

  const tracked = await PerformanceTracker.track(
    'confidence-threshold-test',
    async () => FormatDetector.detectFormat(borderlineCase.xml)
  );
  const detected = tracked.result;

  console.log(`${borderlineCase.name}: ${detected}`);

  // For now the only requirement is that detection completes with a result.
  expect(detected).toBeTruthy();

  // Possible future checks:
  // - High threshold: might return UNKNOWN for low confidence
  // - Low threshold: would return detected format even with low confidence
  // - Medium threshold: balanced approach

  console.log('Note: Confidence threshold testing requires confidence scoring implementation');
});
|
||||
|
||||
// FD-11 (corpus sample): runs detection on up to two CII and two UBL corpus
// files, logs the detected format, size and duration per file, then prints how
// the detected formats are distributed. Returns early without asserting when
// the corpus yields no files. Otherwise at least one file must have been
// processed successfully.
tap.test('FD-11: Real File Confidence Distribution - should show confidence patterns in real files', async () => {
  // Test confidence distribution across real corpus files
  const ciiFiles = await CorpusLoader.getFiles('CII_XMLRECHNUNG');
  const ublFiles = await CorpusLoader.getFiles('UBL_XMLRECHNUNG');

  // Keep the sample small: at most two files from each category.
  const testFiles = [
    ...ciiFiles.slice(0, 2),
    ...ublFiles.slice(0, 2)
  ];

  if (testFiles.length === 0) {
    console.log('No test files available for confidence distribution test');
    return;
  }

  console.log(`Analyzing confidence patterns in ${testFiles.length} real files`);

  const { FormatDetector } = await import('../../../ts/formats/utils/format.detector.js');
  const { promises: fs } = await import('fs');
  const path = await import('path');

  const results: { file: string; format: string; size: number }[] = [];

  for (const filePath of testFiles) {
    try {
      const xmlContent = await fs.readFile(filePath, 'utf-8');
      const fileName = path.basename(filePath);

      const { result: format, metric } = await PerformanceTracker.track(
        'real-file-confidence',
        async () => FormatDetector.detectFormat(xmlContent)
      );

      results.push({
        file: fileName,
        format: format.toString(),
        // Length of the decoded string (UTF-16 code units), not the byte size on disk.
        size: xmlContent.length
      });

      console.log(` ${fileName}: ${format} (${Math.round(xmlContent.length/1024)}KB, ${metric.duration.toFixed(1)}ms)`);

    } catch (error: unknown) {
      // The catch variable is `unknown` under strict settings, so narrow it
      // before reading `.message`; non-Error throwables are stringified
      // instead of printing "undefined".
      const reason = error instanceof Error ? error.message : String(error);
      // A failing file is reported and skipped so the remaining files still run.
      console.log(` ${path.basename(filePath)}: Error - ${reason}`);
    }
  }

  // Analyze format distribution (keys are lower-cased format names)
  const formatCounts: Record<string, number> = {};
  results.forEach(r => {
    const format = r.format.toLowerCase();
    formatCounts[format] = (formatCounts[format] || 0) + 1;
  });

  console.log('\nFormat Distribution:');
  Object.entries(formatCounts).forEach(([format, count]) => {
    // results is non-empty whenever formatCounts has entries, so no division by zero.
    const percentage = (count / results.length * 100).toFixed(1);
    console.log(` ${format}: ${count} files (${percentage}%)`);
  });

  // Fails when every sampled file errored out.
  expect(results.length).toBeGreaterThan(0);
});
|
||||
|
||||
// Start the tap runner now that all FD-11 tests above have been registered.
tap.start();
|
Reference in New Issue
Block a user