xinvoice/ts/formats/pdf/extractors/associated.extractor.ts

import { PDFDocument, PDFDict, PDFName, PDFRawStream, PDFArray, PDFString } from '../../../plugins.js';
import { BaseXMLExtractor } from './base.extractor.js';

/**
 * Associated files extractor for PDF/A-3 documents.
 *
 * Looks for embedded XML through the `AF` (Associated Files) array of the
 * document catalog. Particularly useful for ZUGFeRD v1 and some Factur-X
 * documents.
 */
export class AssociatedFilesExtractor extends BaseXMLExtractor {
  /**
   * Extract XML from a PDF buffer using the catalog's associated files.
   *
   * Each file specification in the `AF` array is inspected in order; the
   * first one whose name looks like an invoice XML and whose embedded stream
   * yields content (via `extractXmlFromStream`) wins.
   *
   * @param pdfBuffer PDF buffer
   * @returns XML content, or null when the catalog has no `AF` array, no
   *          candidate produced XML, or any error occurred (errors are
   *          logged, never rethrown)
   */
  public async extractXml(pdfBuffer: Uint8Array | Buffer): Promise<string | null> {
    try {
      const pdfDoc = await PDFDocument.load(pdfBuffer);

      // Associated files are referenced from the AF entry in the catalog
      const afArray = pdfDoc.catalog.lookup(PDFName.of('AF'));
      if (!(afArray instanceof PDFArray)) {
        console.warn('No AF (Associated Files) entry found in PDF catalog');
        return null;
      }

      for (let i = 0; i < afArray.size(); i++) {
        const fileSpec = afArray.lookup(i);
        if (!(fileSpec instanceof PDFDict)) {
          continue;
        }

        // Prefer /F, but fall back to /UF whenever /F is missing OR is not a
        // PDFString. A plain `F || UF` would skip the entry if /F exists in
        // an unusable form even though /UF holds a perfectly good name.
        const primaryNameObj = fileSpec.lookup(PDFName.of('F'));
        const fileNameObj = primaryNameObj instanceof PDFString
          ? primaryNameObj
          : fileSpec.lookup(PDFName.of('UF'));
        if (!(fileNameObj instanceof PDFString)) {
          continue;
        }

        const fileName = fileNameObj.decodeText();
        // Lower-case once; every name check below is case-insensitive
        const lowerName = fileName.toLowerCase();

        // Exact (case-insensitive) match against the known invoice file names
        // exposed by this class through `knownFileNames`
        const isKnownFileName = this.knownFileNames.some(
          knownName => lowerName === knownName.toLowerCase()
        );

        // Heuristic match: any .xml file, or a name carrying an
        // invoice-related keyword
        const looksLikeInvoiceXml = lowerName.endsWith('.xml') ||
                                    lowerName.includes('zugferd') ||
                                    lowerName.includes('factur-x') ||
                                    lowerName.includes('xrechnung') ||
                                    lowerName.includes('invoice');

        if (!isKnownFileName && !looksLikeInvoiceXml) {
          continue;
        }

        // The EF dictionary maps to the embedded file stream(s)
        const efDict = fileSpec.lookup(PDFName.of('EF'));
        if (!(efDict instanceof PDFDict)) {
          continue;
        }

        // Only raw streams are handed to the stream extractor
        const fileStream = efDict.lookup(PDFName.of('F'));
        if (fileStream instanceof PDFRawStream) {
          const xmlContent = await this.extractXmlFromStream(fileStream, fileName);
          if (xmlContent) {
            return xmlContent;
          }
          // Empty/invalid result: keep scanning the remaining associated files
        }
      }

      console.warn('No valid XML found in associated files');
      return null;
    } catch (error) {
      // Best-effort extractor: report the failure and signal "not found"
      console.error('Error in associated files extraction:', error);
      return null;
    }
  }
}
fix(core): Refactor module imports to use the centralized plugins module and update relative paths across the codebase. Also remove the obsolete test file (test/test.other-formats-corpus.ts) and update file metadata in test outputs. 2025-04-03 21:07:21 +00:00			`import { PDFDocument, PDFDict, PDFName, PDFRawStream, PDFArray, PDFString } from '../../../plugins.js';`
feat(ZUGFERD): Add dedicated ZUGFERD v1/v2 support and refine invoice format detection logic 2025-04-03 20:08:02 +00:00			`import { BaseXMLExtractor } from './base.extractor.js';`

			`/**`
			`* Associated files extractor for PDF/A-3 documents`
			`* Extracts XML from associated files (AF entry in the catalog)`
			`* Particularly useful for ZUGFeRD v1 and some Factur-X documents`
			`*/`
			`export class AssociatedFilesExtractor extends BaseXMLExtractor {`
			`/**`
			`* Extract XML from a PDF buffer using associated files`
			`* @param pdfBuffer PDF buffer`
			`* @returns XML content or null if not found`
			`*/`
			`public async extractXml(pdfBuffer: Uint8Array \| Buffer): Promise<string \| null> {`
			`try {`
			`const pdfDoc = await PDFDocument.load(pdfBuffer);`
fix(core): Refactor module imports to use the centralized plugins module and update relative paths across the codebase. Also remove the obsolete test file (test/test.other-formats-corpus.ts) and update file metadata in test outputs. 2025-04-03 21:07:21 +00:00
feat(ZUGFERD): Add dedicated ZUGFERD v1/v2 support and refine invoice format detection logic 2025-04-03 20:08:02 +00:00			`// Try to find associated files via the AF entry in the catalog`
			`const afArray = pdfDoc.catalog.lookup(PDFName.of('AF'));`
			`if (!(afArray instanceof PDFArray)) {`
			`console.warn('No AF (Associated Files) entry found in PDF catalog');`
			`return null;`
			`}`
fix(core): Refactor module imports to use the centralized plugins module and update relative paths across the codebase. Also remove the obsolete test file (test/test.other-formats-corpus.ts) and update file metadata in test outputs. 2025-04-03 21:07:21 +00:00
feat(ZUGFERD): Add dedicated ZUGFERD v1/v2 support and refine invoice format detection logic 2025-04-03 20:08:02 +00:00			`// Process each associated file`
			`for (let i = 0; i < afArray.size(); i++) {`
			`const fileSpec = afArray.lookup(i);`
			`if (!(fileSpec instanceof PDFDict)) {`
			`continue;`
			`}`
fix(core): Refactor module imports to use the centralized plugins module and update relative paths across the codebase. Also remove the obsolete test file (test/test.other-formats-corpus.ts) and update file metadata in test outputs. 2025-04-03 21:07:21 +00:00
feat(ZUGFERD): Add dedicated ZUGFERD v1/v2 support and refine invoice format detection logic 2025-04-03 20:08:02 +00:00			`// Get the file name`
			`const fileNameObj = fileSpec.lookup(PDFName.of('F')) \|\| fileSpec.lookup(PDFName.of('UF'));`
			`if (!(fileNameObj instanceof PDFString)) {`
			`continue;`
			`}`
fix(core): Refactor module imports to use the centralized plugins module and update relative paths across the codebase. Also remove the obsolete test file (test/test.other-formats-corpus.ts) and update file metadata in test outputs. 2025-04-03 21:07:21 +00:00
feat(ZUGFERD): Add dedicated ZUGFERD v1/v2 support and refine invoice format detection logic 2025-04-03 20:08:02 +00:00			`const fileName = fileNameObj.decodeText();`
fix(core): Refactor module imports to use the centralized plugins module and update relative paths across the codebase. Also remove the obsolete test file (test/test.other-formats-corpus.ts) and update file metadata in test outputs. 2025-04-03 21:07:21 +00:00
feat(ZUGFERD): Add dedicated ZUGFERD v1/v2 support and refine invoice format detection logic 2025-04-03 20:08:02 +00:00			`// Check if it's a known invoice XML file name`
			`const isKnownFileName = this.knownFileNames.some(`
			`knownName => fileName.toLowerCase() === knownName.toLowerCase()`
			`);`
fix(core): Refactor module imports to use the centralized plugins module and update relative paths across the codebase. Also remove the obsolete test file (test/test.other-formats-corpus.ts) and update file metadata in test outputs. 2025-04-03 21:07:21 +00:00
feat(ZUGFERD): Add dedicated ZUGFERD v1/v2 support and refine invoice format detection logic 2025-04-03 20:08:02 +00:00			`// Check if it's any XML file or has invoice-related keywords`
fix(core): Refactor module imports to use the centralized plugins module and update relative paths across the codebase. Also remove the obsolete test file (test/test.other-formats-corpus.ts) and update file metadata in test outputs. 2025-04-03 21:07:21 +00:00			`const isXmlFile = fileName.toLowerCase().endsWith('.xml') \|\|`
feat(ZUGFERD): Add dedicated ZUGFERD v1/v2 support and refine invoice format detection logic 2025-04-03 20:08:02 +00:00			`fileName.toLowerCase().includes('zugferd') \|\|`
			`fileName.toLowerCase().includes('factur-x') \|\|`
			`fileName.toLowerCase().includes('xrechnung') \|\|`
			`fileName.toLowerCase().includes('invoice');`
fix(core): Refactor module imports to use the centralized plugins module and update relative paths across the codebase. Also remove the obsolete test file (test/test.other-formats-corpus.ts) and update file metadata in test outputs. 2025-04-03 21:07:21 +00:00
feat(ZUGFERD): Add dedicated ZUGFERD v1/v2 support and refine invoice format detection logic 2025-04-03 20:08:02 +00:00			`if (isKnownFileName \|\| isXmlFile) {`
			`// Get the embedded file dictionary`
			`const efDict = fileSpec.lookup(PDFName.of('EF'));`
			`if (!(efDict instanceof PDFDict)) {`
			`continue;`
			`}`
fix(core): Refactor module imports to use the centralized plugins module and update relative paths across the codebase. Also remove the obsolete test file (test/test.other-formats-corpus.ts) and update file metadata in test outputs. 2025-04-03 21:07:21 +00:00
feat(ZUGFERD): Add dedicated ZUGFERD v1/v2 support and refine invoice format detection logic 2025-04-03 20:08:02 +00:00			`// Get the file stream`
			`const fileStream = efDict.lookup(PDFName.of('F'));`
			`if (fileStream instanceof PDFRawStream) {`
			`const xmlContent = await this.extractXmlFromStream(fileStream, fileName);`
			`if (xmlContent) {`
			`return xmlContent;`
			`}`
			`}`
			`}`
			`}`
fix(core): Refactor module imports to use the centralized plugins module and update relative paths across the codebase. Also remove the obsolete test file (test/test.other-formats-corpus.ts) and update file metadata in test outputs. 2025-04-03 21:07:21 +00:00
feat(ZUGFERD): Add dedicated ZUGFERD v1/v2 support and refine invoice format detection logic 2025-04-03 20:08:02 +00:00			`console.warn('No valid XML found in associated files');`
			`return null;`
			`} catch (error) {`
			`console.error('Error in associated files extraction:', error);`
			`return null;`
			`}`
			`}`
			`}`