fix(core): Improve PDF XML extraction, embedding, and format detection; update loadPdf/exportPdf error handling; add new validator implementations and enhance IPdf metadata.

2025-04-04 12:14:41 +00:00
parent 68fd50fd4c
commit 5d43c1ce4e
15 changed files with 1957 additions and 418 deletions
--- a/ts/formats/pdf/pdf.extractor.ts
+++ b/ts/formats/pdf/pdf.extractor.ts
@@ -4,6 +4,32 @@ import {
  AssociatedFilesExtractor,
  TextXMLExtractor
 } from './extractors/index.js';
+import { FormatDetector } from '../utils/format.detector.js';
+import { InvoiceFormat } from '../../interfaces/common.js';
+
+/**
+ * Error categories reported in PDFExtractResult.error.type by PDFExtractor.extractXml
+ */
+export enum PDFExtractError {
+  EXTRACT_ERROR = 'XML extraction failed',    // unexpected exception caught by extractXml's outer try/catch
+  INVALID_INPUT = 'Invalid input parameters', // PDF buffer was empty or undefined
+  NO_XML_FOUND = 'No XML found in PDF'        // no extractor yielded XML (each returned nothing or threw)
+}
+
+/**
+ * Outcome of PDFExtractor.extractXml: the extracted XML on success, error details on failure
+ */
+export interface PDFExtractResult {
+  success: boolean;               // true only when an extractor returned XML
+  xml?: string;                   // extracted XML; set on success
+  format?: InvoiceFormat;         // value of FormatDetector.detectFormat(xml); set on success
+  extractorUsed?: string;         // constructor name of the extractor that produced the XML
+  error?: {                       // set when success is false
+    type: PDFExtractError;        // failure category
+    message: string;              // human-readable description of the failure
+    originalError?: Error;        // underlying Error instance, when one was caught
+  };
+}

 /**
 * Main PDF extractor class that orchestrates the extraction process
@@ -18,9 +44,9 @@ export class PDFExtractor {
  constructor() {
    // Add extractors in order of preference/likelihood of success
    this.extractors.push(
-      new StandardXMLExtractor(),    // Standard PDF/A-3 embedded files
-      new AssociatedFilesExtractor(), // Associated files (ZUGFeRD v1, some Factur-X)
-      new TextXMLExtractor()          // Text-based extraction (fallback)
+      new StandardXMLExtractor(),      // Standard PDF/A-3 embedded files
+      new AssociatedFilesExtractor(),  // Associated files (ZUGFeRD v1, some Factur-X)
+      new TextXMLExtractor()           // Text-based extraction (fallback)
    );
  }

@@ -28,36 +54,88 @@ export class PDFExtractor {
   * Extract XML from a PDF buffer
   * Tries multiple extraction methods in sequence
   * @param pdfBuffer PDF buffer
-   * @returns XML content or null if not found
+   * @returns Result with either the extracted XML or error information
   */
-  public async extractXml(pdfBuffer: Uint8Array | Buffer): Promise<string | null> {
+  public async extractXml(pdfBuffer: Uint8Array | Buffer): Promise<PDFExtractResult> {
    try {
      console.log('Starting XML extraction from PDF...');

+      // Validate input: reject a missing or zero-length buffer before running any extractor
+      if (!pdfBuffer || pdfBuffer.length === 0) {
+        return this.createErrorResult(PDFExtractError.INVALID_INPUT, 'PDF buffer is empty or undefined');
+      }
+
+      // Node Buffers are copied into a plain Uint8Array; other inputs pass through unchanged
+      const pdfBufferArray = Buffer.isBuffer(pdfBuffer) ? new Uint8Array(pdfBuffer) : pdfBuffer;
+
      // Try each extractor in sequence
      for (const extractor of this.extractors) {
        const extractorName = extractor.constructor.name;
        console.log(`Trying extraction with ${extractorName}...`);

-        const xml = await extractor.extractXml(pdfBuffer);
-        if (xml) {
-          console.log(`Successfully extracted XML using ${extractorName}`);
-          return xml;
+        try {
+          const xml = await extractor.extractXml(pdfBufferArray);
+          
+          if (xml) {
+            console.log(`Successfully extracted XML using ${extractorName}`);
+            
+            // Detect the format of the extracted XML so it is returned alongside the XML itself
+            const format = FormatDetector.detectFormat(xml);
+            
+            return {
+              success: true,
+              xml,
+              format,
+              extractorUsed: extractorName
+            };
+          }
+          
+          console.log(`Extraction with ${extractorName} failed, trying next method...`);
+        } catch (error) {
+          // A throwing extractor is treated like a miss: log a warning and move on to the next one
+          console.warn(`Error using ${extractorName}: ${error instanceof Error ? error.message : String(error)}`);
        }
-
-        console.log(`Extraction with ${extractorName} failed, trying next method...`);
      }

-      // If all extractors fail, return null
-      console.warn('All extraction methods failed, no valid XML found in PDF');
-      return null;
+      // Every extractor either returned nothing or threw: report NO_XML_FOUND
+      return this.createErrorResult(
+        PDFExtractError.NO_XML_FOUND,
+        'All extraction methods failed, no valid XML found in PDF'
+      );
    } catch (error) {
-      console.error('Error extracting XML from PDF:', error);
-      return null;
+      // Anything thrown outside the per-extractor try/catch is reported as EXTRACT_ERROR
+      return this.createErrorResult(
+        PDFExtractError.EXTRACT_ERROR,
+        `Unexpected error during XML extraction: ${error instanceof Error ? error.message : String(error)}`,
+        error instanceof Error ? error : undefined
+      );
    }
  }

-
-
-
-}
+  /**
+   * Build a failed PDFExtractResult, logging the failure via console.error first
+   * @param type Error category stored in result.error.type and included in the log line
+   * @param message Human-readable description stored in result.error.message and logged
+   * @param originalError Underlying Error, if any; logged separately and attached to the result
+   * @returns Result with success set to false and the error details populated
+   */
+  private createErrorResult(
+    type: PDFExtractError,
+    message: string,
+    originalError?: Error
+  ): PDFExtractResult {
+    console.error(`PDF Extractor Error (${type}): ${message}`);
+    if (originalError) {
+      console.error(originalError);
+    }
+    
+    return {
+      success: false,
+      error: {
+        type,
+        message,
+        originalError
+      }
+    };
+  }
+}