// Listing metadata: 412 lines, 13 KiB, TypeScript
import { expect, tap } from '@git.zone/tstest/tapbundle';
|
|
import * as plugins from '../plugins.js';
|
|
import { EInvoice } from '../../../ts/index.js';
|
|
import { CorpusLoader } from '../corpus.loader.js';
|
|
import { PerformanceTracker } from '../performance.tracker.js';
|
|
|
|
tap.test('PDF-06: Multiple Attachments - should handle PDFs with multiple embedded files', async (t) => {
|
|
/*
 * PDF-06: verify handling of PDFs containing multiple attachments.
 * The sub-tests below exercise extraction and management of several
 * embedded files inside one PDF.
 */

// Collects the per-sub-test timings that are summarised at the end of this test.
const performanceTracker = new PerformanceTracker('PDF-06: Multiple Attachments');
// Gives the corpus sub-test access to the sample files.
const corpusLoader = new CorpusLoader();
|
|
|
|
// Sub-test: builds an in-memory A4 PDF with three XML attachments (main
// invoice, supplementary data, signature stub), saves it, and tries to read
// the invoice XML back through EInvoice. The elapsed time is recorded under
// the 'detect-multiple' measurement.
t.test('Detect multiple attachments in PDF', async () => {
  const startTime = performance.now();

  // Create a test PDF with multiple attachments
  const { PDFDocument, AFRelationship } = plugins;
  const pdfDoc = await PDFDocument.create();

  // Add first page
  const page = pdfDoc.addPage([595, 842]); // A4
  page.drawText('Invoice with Multiple Attachments', {
    x: 50,
    y: 750,
    size: 20
  });

  // Add multiple XML attachments
  const attachments = [
    {
      name: 'invoice.xml',
      content: `<?xml version="1.0" encoding="UTF-8"?>
<Invoice xmlns="urn:oasis:names:specification:ubl:schema:xsd:Invoice-2">
<ID>MULTI-ATTACH-001</ID>
<IssueDate>2025-01-25</IssueDate>
<Note>Main invoice document</Note>
</Invoice>`,
      relationship: AFRelationship.Data,
      description: 'Main invoice XML'
    },
    {
      name: 'supplementary.xml',
      content: `<?xml version="1.0" encoding="UTF-8"?>
<SupplementaryData>
<InvoiceRef>MULTI-ATTACH-001</InvoiceRef>
<AdditionalInfo>Extra invoice details</AdditionalInfo>
</SupplementaryData>`,
      relationship: AFRelationship.Supplement,
      description: 'Supplementary invoice data'
    },
    {
      name: 'signature.xml',
      content: `<?xml version="1.0" encoding="UTF-8"?>
<Signature xmlns="http://www.w3.org/2000/09/xmldsig#">
<SignedInfo>
<Reference URI="#invoice">
<DigestValue>abc123...</DigestValue>
</Reference>
</SignedInfo>
</Signature>`,
      relationship: AFRelationship.Source,
      description: 'Digital signature'
    }
  ];

  // Embed each attachment
  for (const attachment of attachments) {
    await pdfDoc.attach(
      Buffer.from(attachment.content, 'utf8'),
      attachment.name,
      {
        mimeType: 'application/xml',
        description: attachment.description,
        creationDate: new Date(),
        modificationDate: new Date(),
        afRelationship: attachment.relationship
      }
    );
  }

  // Add metadata
  pdfDoc.setTitle('Multi-attachment Invoice');
  pdfDoc.setSubject('Invoice with multiple embedded files');
  pdfDoc.setKeywords(['invoice', 'multiple-attachments', 'xml']);

  // Save PDF
  const pdfBytes = await pdfDoc.save();

  // Test extraction.
  // Loading/extraction stays best-effort: a failure there is reported, not
  // fatal. The content assertion is kept OUTSIDE the try block so that a
  // successful extraction returning the wrong XML fails the test instead of
  // being caught and merely logged.
  const einvoice = new EInvoice();
  let xmlContent;
  let extracted = false;
  try {
    await einvoice.loadFromPdfBuffer(pdfBytes);
    // Note: The API might not expose all attachments directly
    xmlContent = einvoice.getXmlString();
    extracted = true;
  } catch (error) {
    // A thrown value is not guaranteed to be an Error instance.
    const reason = error instanceof Error ? error.message : String(error);
    console.log('Multi-attachment extraction not fully supported:', reason);
  }

  if (extracted) {
    expect(xmlContent).toContain('MULTI-ATTACH-001');
    console.log('Successfully extracted primary attachment from multi-attachment PDF');
  }

  const elapsed = performance.now() - startTime;
  performanceTracker.addMeasurement('detect-multiple', elapsed);
});
|
|
|
|
// Sub-test: attaches four files with different MIME types (XML, plain text,
// CSS, JSON) to a one-page PDF and then tries to load the result through
// EInvoice. The outcome is only logged; the elapsed time is recorded under
// the 'extract-all' measurement.
t.test('Extract all attachments from PDF', async () => {
  const startTime = performance.now();

  // Create PDF with various attachment types
  const { PDFDocument } = plugins;
  const pdfDoc = await PDFDocument.create();
  // The page object itself is not needed afterwards, only its presence in the document.
  pdfDoc.addPage();

  // Different file types as attachments
  const mixedAttachments = [
    {
      name: 'invoice_data.xml',
      content: '<?xml version="1.0"?><invoice><id>TEST-001</id></invoice>',
      mimeType: 'application/xml'
    },
    {
      name: 'invoice_image.txt',
      content: 'BASE64_ENCODED_IMAGE_DATA_HERE',
      mimeType: 'text/plain'
    },
    {
      name: 'invoice_style.css',
      content: '.invoice { font-family: Arial; }',
      mimeType: 'text/css'
    },
    {
      name: 'invoice_meta.json',
      content: '{"version":"1.0","format":"UBL"}',
      mimeType: 'application/json'
    }
  ];

  for (const attach of mixedAttachments) {
    await pdfDoc.attach(
      Buffer.from(attach.content, 'utf8'),
      attach.name,
      {
        mimeType: attach.mimeType,
        description: `${attach.name} attachment`
      }
    );
  }

  const pdfBytes = await pdfDoc.save();

  // Test if we can identify all attachments
  const einvoice = new EInvoice();
  try {
    await einvoice.loadFromPdfBuffer(pdfBytes);

    // The library might only extract XML attachments
    console.log('Extracted attachment from PDF with mixed file types');
  } catch (error) {
    // A thrown value is not guaranteed to be an Error instance.
    const reason = error instanceof Error ? error.message : String(error);
    console.log('Mixed attachment handling:', reason);
  }

  const elapsed = performance.now() - startTime;
  performanceTracker.addMeasurement('extract-all', elapsed);
});
|
|
|
|
// Sub-test: attaches one small XML document per AFRelationship value and
// checks that the saved PDF is non-empty. Timing goes to 'relationships'.
t.test('Handle attachment relationships', async () => {
  const startedAt = performance.now();

  const { PDFDocument, AFRelationship } = plugins;
  const doc = await PDFDocument.create();
  doc.addPage();

  // One entry per AFRelationship type under test
  const cases = [
    { rel: AFRelationship.Source, desc: 'Source document' },
    { rel: AFRelationship.Data, desc: 'Data file' },
    { rel: AFRelationship.Alternative, desc: 'Alternative representation' },
    { rel: AFRelationship.Supplement, desc: 'Supplementary data' },
    { rel: AFRelationship.Unspecified, desc: 'Unspecified relationship' }
  ];

  for (const { rel, desc } of cases) {
    const body = `<?xml version="1.0"?>
<Document type="${desc}">
<Relationship>${rel}</Relationship>
</Document>`;
    const fileName = `${rel}_document.xml`;
    const options = {
      mimeType: 'application/xml',
      description: desc,
      afRelationship: rel
    };

    await doc.attach(Buffer.from(body, 'utf8'), fileName, options);
  }

  const savedBytes = await doc.save();
  expect(savedBytes.length).toBeGreaterThan(0);

  console.log('Created PDF with various attachment relationships');

  performanceTracker.addMeasurement('relationships', performance.now() - startedAt);
});
|
|
|
|
// Sub-test: attaches generated XML payloads of roughly 1 KB, 10 KB, 100 KB
// and 1 MB to a single PDF, saves it, and logs the resulting file size.
// Timing goes to 'size-limits'.
t.test('Attachment size limits', async () => {
  const startTime = performance.now();

  const { PDFDocument } = plugins;
  const pdfDoc = await PDFDocument.create();
  // The page object itself is not needed afterwards, only its presence in the document.
  pdfDoc.addPage();

  // Test with increasingly large attachments
  const sizes = [
    { size: 1024, name: '1KB' }, // 1 KB
    { size: 10 * 1024, name: '10KB' }, // 10 KB
    { size: 100 * 1024, name: '100KB' }, // 100 KB
    { size: 1024 * 1024, name: '1MB' } // 1 MB
  ];

  // Filler element appended repeatedly until the payload approaches the target size.
  const dataRow = `<Data>${'x'.repeat(80)}</Data>\n`;

  for (const sizeTest of sizes) {
    // Generate XML content of specified size; the 100-character margin
    // leaves room for the closing element.
    let content = '<?xml version="1.0" encoding="UTF-8"?>\n<LargeInvoice>\n';
    while (content.length < sizeTest.size - 100) {
      content += dataRow;
    }
    content += '</LargeInvoice>';

    try {
      await pdfDoc.attach(
        Buffer.from(content, 'utf8'),
        `large_${sizeTest.name}.xml`,
        {
          mimeType: 'application/xml',
          description: `Large attachment test ${sizeTest.name}`
        }
      );
      console.log(`Successfully attached ${sizeTest.name} file`);
    } catch (error) {
      // A thrown value is not guaranteed to be an Error instance.
      const reason = error instanceof Error ? error.message : String(error);
      console.log(`Failed to attach ${sizeTest.name}:`, reason);
    }
  }

  const pdfBytes = await pdfDoc.save();
  // Minimal sanity check so the sub-test asserts something about the saved document.
  expect(pdfBytes.length).toBeGreaterThan(0);
  console.log(`Final PDF size with attachments: ${(pdfBytes.length / 1024).toFixed(2)} KB`);

  const elapsed = performance.now() - startTime;
  performanceTracker.addMeasurement('size-limits', elapsed);
});
|
|
|
|
// Sub-test: attaches three different payloads under the same file name
// ('invoice.xml'), saves the PDF, and tries to load it through EInvoice.
// Both the attach step and the load step only log their outcome.
// Timing goes to 'duplicate-names'.
t.test('Duplicate attachment names', async () => {
  const startTime = performance.now();

  const { PDFDocument } = plugins;
  const pdfDoc = await PDFDocument.create();
  // The page object itself is not needed afterwards, only its presence in the document.
  pdfDoc.addPage();

  // Try to add multiple attachments with same name
  const attachmentName = 'invoice.xml';
  const versions = [
    { content: '<invoice version="1.0"/>', desc: 'Version 1.0' },
    { content: '<invoice version="2.0"/>', desc: 'Version 2.0' },
    { content: '<invoice version="3.0"/>', desc: 'Version 3.0' }
  ];

  for (const version of versions) {
    try {
      await pdfDoc.attach(
        Buffer.from(version.content, 'utf8'),
        attachmentName,
        {
          mimeType: 'application/xml',
          description: version.desc
        }
      );
      console.log(`Attached: ${version.desc}`);
    } catch (error) {
      // A thrown value is not guaranteed to be an Error instance.
      const reason = error instanceof Error ? error.message : String(error);
      console.log(`Duplicate name handling for ${version.desc}:`, reason);
    }
  }

  const pdfBytes = await pdfDoc.save();

  // Check if duplicates are handled
  const einvoice = new EInvoice();
  try {
    await einvoice.loadFromPdfBuffer(pdfBytes);
    console.log('Handled PDF with duplicate attachment names');
  } catch (error) {
    const reason = error instanceof Error ? error.message : String(error);
    console.log('Duplicate name error:', reason);
  }

  const elapsed = performance.now() - startTime;
  performanceTracker.addMeasurement('duplicate-names', elapsed);
});
|
|
|
|
// Sub-test: samples up to 30 corpus files ending in '.pdf', tries to load
// each through EInvoice, and uses a raw-byte heuristic to estimate how many
// of the loadable ones carry more than one embedded file.
// Timing goes to 'corpus-multi-attach'.
t.test('Corpus PDFs with multiple attachments', async () => {
  const startTime = performance.now();
  let multiAttachmentCount = 0;
  let processedCount = 0;
  let unloadableCount = 0;

  const files = await corpusLoader.getAllFiles();
  const pdfFiles = files.filter(f => f.endsWith('.pdf'));

  // Sample PDFs to check for multiple attachments
  const sampleSize = Math.min(30, pdfFiles.length);
  const sample = pdfFiles.slice(0, sampleSize);

  // Each embedded file is described by its own file specification dictionary
  // (/Type /Filespec), whereas the /EmbeddedFiles key normally occurs once per
  // document no matter how many files are attached. Counting Filespec
  // dictionaries is therefore the closer approximation. It is still only a
  // heuristic: dictionaries stored inside compressed object streams are not
  // visible in the raw bytes.
  const fileSpecPattern = /\/Type\s*\/Filespec/g;

  for (const file of sample) {
    try {
      const content = await corpusLoader.readFile(file);
      const einvoice = new EInvoice();

      // Try to load and check for attachments
      try {
        await einvoice.loadFromPdfBuffer(content);

        // Check if PDF might have multiple attachments
        // This is approximate since we can't directly query attachment count
        const pdfString = content.toString('binary');
        const fileSpecMatches = pdfString.match(fileSpecPattern);

        if (fileSpecMatches && fileSpecMatches.length > 1) {
          multiAttachmentCount++;
          console.log(`Multiple attachments detected in: ${file}`);
        }
      } catch {
        // PDFs that can't be processed are skipped, but counted so the
        // skip is visible in the output instead of disappearing silently.
        unloadableCount++;
      }

      processedCount++;
    } catch (error) {
      // A thrown value is not guaranteed to be an Error instance.
      const reason = error instanceof Error ? error.message : String(error);
      console.log(`Error reading ${file}:`, reason);
    }
  }

  console.log(`Corpus analysis: ${multiAttachmentCount}/${processedCount} PDFs may have multiple attachments`);
  if (unloadableCount > 0) {
    console.log(`Corpus analysis: ${unloadableCount} PDFs could not be loaded and were skipped`);
  }

  const elapsed = performance.now() - startTime;
  performanceTracker.addMeasurement('corpus-multi-attach', elapsed);
});
|
|
|
|
// Sub-test: attaches three XML documents in a fixed order, each with a
// different AFRelationship, then loads the PDF through EInvoice and logs
// whether the first (Data) attachment is the one that was extracted.
// Timing goes to 'extraction-order'.
t.test('Attachment extraction order', async () => {
  const startTime = performance.now();

  const { PDFDocument, AFRelationship } = plugins;
  const pdfDoc = await PDFDocument.create();
  // The page object itself is not needed afterwards, only its presence in the document.
  pdfDoc.addPage();

  // Add attachments in specific order
  const orderedAttachments = [
    { name: '1_first.xml', priority: 'high', afRel: AFRelationship.Data },
    { name: '2_second.xml', priority: 'medium', afRel: AFRelationship.Supplement },
    { name: '3_third.xml', priority: 'low', afRel: AFRelationship.Alternative }
  ];

  for (const attach of orderedAttachments) {
    const content = `<?xml version="1.0"?>
<Document>
<Order>${attach.name}</Order>
<Priority>${attach.priority}</Priority>
</Document>`;

    await pdfDoc.attach(
      Buffer.from(content, 'utf8'),
      attach.name,
      {
        mimeType: 'application/xml',
        description: `Priority: ${attach.priority}`,
        afRelationship: attach.afRel
      }
    );
  }

  const pdfBytes = await pdfDoc.save();

  // Test extraction order
  const einvoice = new EInvoice();
  try {
    await einvoice.loadFromPdfBuffer(pdfBytes);

    // Check which attachment was extracted
    const xmlContent = einvoice.getXmlString();
    console.log('Extraction order test completed');

    // Library likely extracts based on AFRelationship priority
    if (xmlContent.includes('1_first.xml')) {
      console.log('Extracted primary (Data) attachment first');
    }
  } catch (error) {
    // A thrown value is not guaranteed to be an Error instance.
    const reason = error instanceof Error ? error.message : String(error);
    console.log('Order extraction error:', reason);
  }

  const elapsed = performance.now() - startTime;
  performanceTracker.addMeasurement('extraction-order', elapsed);
});
|
|
|
|
// Report the collected timings, then enforce the performance budget.
performanceTracker.printSummary();

// Multiple attachments may take longer, hence the 500 threshold on the average.
expect(performanceTracker.getAverageTime()).toBeLessThan(500);
|
|
});
|
|
|
|
// Start the tap runner.
tap.start();