// dcrouter/ts/security/classes.contentscanner.ts

import * as plugins from '../plugins.js';
import * as paths from '../paths.js';
import { logger } from '../logger.js';
import { Email } from '../mail/core/classes.email.js';
import type { IAttachment } from '../mail/core/classes.email.js';
import { SecurityLogger, SecurityLogLevel, SecurityEventType } from './classes.securitylogger.js';
import { LRUCache } from 'lru-cache';

/**
 * Outcome of a content scan.
 */
export interface IScanResult {
  /** Whether the content is clean (no threats detected). */
  isClean: boolean;
  /** Type of threat, if one was detected. */
  threatType?: string;
  /** Details about the detected threat. */
  threatDetails?: string;
  /** Threat score: 0 (clean) to 100 (definitely malicious). */
  threatScore: number;
  /** What was scanned (subject, body, attachments, etc.). */
  scannedElements: Array<string>;
  /** When this scan was performed. */
  timestamp: number;
}

/**
 * Configuration accepted by the content scanner. Every field is optional.
 */
export interface IContentScannerOptions {
  /** Maximum number of entries to cache. */
  maxCacheSize?: number;
  /** TTL for cache entries in ms. */
  cacheTTL?: number;
  /** Whether to scan email subjects. */
  scanSubject?: boolean;
  /** Whether to scan email bodies. */
  scanBody?: boolean;
  /** Whether to scan attachments. */
  scanAttachments?: boolean;
  /** Max size of attachments to scan, in bytes. */
  maxAttachmentSizeToScan?: number;
  /** Whether to scan attachment filenames. */
  scanAttachmentNames?: boolean;
  /** Whether to block executable attachments. */
  blockExecutables?: boolean;
  /** Whether to block documents with macros. */
  blockMacros?: boolean;
  /** Custom scanning rules. */
  customRules?: {
    /** Pattern to match. */
    pattern: string | RegExp;
    /** Type of threat. */
    type: string;
    /** Threat score. */
    score: number;
    /** Description of the threat. */
    description: string;
  }[];
  /** Minimum score to consider content as a threat. */
  minThreatScore?: number;
  /** Score above which content is considered high threat. */
  highThreatScore?: number;
}

/**
 * Threat categories.
 *
 * The string values are what ends up in a scan result's `threatType` field
 * when a built-in heuristic matches.
 */
export enum ThreatCategory {
  /** Content matched a spam heuristic. */
  SPAM = 'spam',
  /** Content matched a phishing heuristic. */
  PHISHING = 'phishing',
  /** Text matched a malware-lure heuristic. */
  MALWARE = 'malware',
  /** Attachment has an executable extension or executable file header. */
  EXECUTABLE = 'executable',
  /** Content contains a link matching a suspicious-link heuristic. */
  SUSPICIOUS_LINK = 'suspicious_link',
  /** Office-style attachment that appears to contain macros. */
  MALICIOUS_MACRO = 'malicious_macro',
  /** HTML matched a script-injection heuristic. */
  XSS = 'xss',
  /** Content matched a sensitive-data heuristic. */
  SENSITIVE_DATA = 'sensitive_data',
  /** Fallback used when a pattern category has no dedicated threat type. */
  BLACKLISTED_CONTENT = 'blacklisted_content',
  /** NOTE(review): not referenced by the scanner in this file; presumably meant for custom rules. */
  CUSTOM_RULE = 'custom_rule'
}

/**
 * Content Scanner for detecting malicious email content.
 *
 * Subjects, text/HTML bodies and attachments are checked against heuristic
 * patterns. Every match adds to a cumulative threat score; the final score is
 * clamped to 0-100 and compared against `minThreatScore` / `highThreatScore`
 * to decide the verdict and the severity of the security log entry.
 * Results are cached in an LRU cache keyed by a digest of the inspected content.
 */
export class ContentScanner {
  private static instance: ContentScanner;
  private scanCache: LRUCache<string, IScanResult>;
  private options: Required<IContentScannerOptions>;

  /** Number of leading attachment bytes that are decoded to text and inspected. */
  private static readonly ATTACHMENT_SAMPLE_BYTES = 100 * 1024;

  // Predefined patterns for common threats.
  // None of these may carry the `g` or `y` flag: they are shared across scans
  // and `RegExp.prototype.test` would otherwise become stateful.
  private static readonly MALICIOUS_PATTERNS = {
    // Phishing patterns
    phishing: [
      /(?:verify|confirm|update|login).*(?:account|password|details)/i,
      /urgent.*(?:action|attention|required)/i,
      /(?:paypal|apple|microsoft|amazon|google|bank).*(?:verify|confirm|suspend)/i,
      /your.*(?:account).*(?:suspended|compromised|locked)/i,
      /\b(?:password reset|security alert|security notice)\b/i
    ],

    // Spam indicators
    spam: [
      /\b(?:viagra|cialis|enlargement|diet pill|lose weight fast|cheap meds)\b/i,
      /\b(?:million dollars|lottery winner|prize claim|inheritance|rich widow)\b/i,
      /\b(?:earn from home|make money fast|earn \$\d{3,}\/day)\b/i,
      /\b(?:limited time offer|act now|exclusive deal|only \d+ left)\b/i,
      /\b(?:forex|stock tip|investment opportunity|cryptocurrency|bitcoin)\b/i
    ],

    // Malware indicators in text
    malware: [
      /(?:attached file|see attachment).*(?:invoice|receipt|statement|document)/i,
      /open.*(?:the attached|this attachment)/i,
      /(?:enable|allow).*(?:macros|content|editing)/i,
      /download.*(?:attachment|file|document)/i,
      /\b(?:ransomware protection|virus alert|malware detected)\b/i
    ],

    // Suspicious links
    suspiciousLinks: [
      /https?:\/\/bit\.ly\//i,
      /https?:\/\/goo\.gl\//i,
      /https?:\/\/t\.co\//i,
      /https?:\/\/tinyurl\.com\//i,
      /https?:\/\/(?:\d{1,3}\.){3}\d{1,3}/i, // IP address URLs
      /https?:\/\/.*\.(?:xyz|top|club|gq|cf)\//i, // Suspicious TLDs
      /(?:login|account|signin|auth).*\.(?!gov|edu|com|org|net)\w+\.\w+/i, // Login pages on unusual domains
    ],

    // XSS and script injection
    scriptInjection: [
      /<script.*>.*<\/script>/is,
      /javascript:/i,
      /on(?:click|load|mouse|error|focus|blur)=".*"/i,
      /document\.(?:cookie|write|location)/i,
      /eval\s*\(/i
    ],

    // Sensitive data patterns
    sensitiveData: [
      /\b(?:\d{3}-\d{2}-\d{4}|\d{9})\b/, // SSN
      /\b\d{13,16}\b/, // Credit card numbers
      // Possible Base64: a run of at least 40 base64 characters (10 quads) with
      // optional padding. The minimum length keeps ordinary 4/8/12-letter words
      // such as "this" from being reported as encoded data.
      /(?<![A-Za-z0-9+/=])(?:[A-Za-z0-9+/]{4}){10,}(?:[A-Za-z0-9+/]{2}==|[A-Za-z0-9+/]{3}=)?(?![A-Za-z0-9+/=])/
    ]
  };

  // Common executable extensions
  private static readonly EXECUTABLE_EXTENSIONS = [
    '.exe', '.dll', '.bat', '.cmd', '.msi', '.js', '.vbs', '.ps1',
    '.sh', '.jar', '.py', '.com', '.scr', '.pif', '.hta', '.cpl',
    '.reg', '.vba', '.lnk', '.wsf', '.msp', '.mst'
  ];

  // Document formats that may contain macros
  private static readonly MACRO_DOCUMENT_EXTENSIONS = [
    '.doc', '.docm', '.xls', '.xlsm', '.ppt', '.pptm', '.dotm', '.xlsb', '.ppam', '.potm'
  ];

  // Strings whose presence in a document sample suggests embedded VBA/macros
  private static readonly MACRO_INDICATORS = [
    /vbaProject\.bin/i,
    /Microsoft VBA/i,
    /\bVBA\b/,
    /Auto_Open/i,
    /AutoExec/i,
    /DocumentOpen/i,
    /AutoOpen/i,
    /\bExecute\(/i,
    /\bShell\(/i,
    /\bCreateObject\(/i
  ];

  /**
   * Default options for the content scanner
   */
  private static readonly DEFAULT_OPTIONS: Required<IContentScannerOptions> = {
    maxCacheSize: 10000,
    cacheTTL: 24 * 60 * 60 * 1000, // 24 hours
    scanSubject: true,
    scanBody: true,
    scanAttachments: true,
    maxAttachmentSizeToScan: 10 * 1024 * 1024, // 10MB
    scanAttachmentNames: true,
    blockExecutables: true,
    blockMacros: true,
    customRules: [],
    minThreatScore: 30, // Minimum score to consider content as a threat
    highThreatScore: 70  // Score above which content is considered high threat
  };

  /**
   * Constructor for the ContentScanner
   * @param options Configuration options; unspecified fields fall back to the defaults
   */
  constructor(options: IContentScannerOptions = {}) {
    // Merge with default options
    this.options = {
      ...ContentScanner.DEFAULT_OPTIONS,
      ...options
    };

    // Initialize cache
    this.scanCache = new LRUCache<string, IScanResult>({
      max: this.options.maxCacheSize,
      ttl: this.options.cacheTTL,
    });

    logger.log('info', 'ContentScanner initialized');
  }

  /**
   * Get the singleton instance of the scanner.
   *
   * The options are only applied when the singleton is first created; calls
   * made after that return the existing instance and ignore `options`.
   * @param options Configuration options for the first creation
   * @returns Singleton scanner instance
   */
  public static getInstance(options: IContentScannerOptions = {}): ContentScanner {
    if (!ContentScanner.instance) {
      ContentScanner.instance = new ContentScanner(options);
    }
    return ContentScanner.instance;
  }

  /**
   * Scan an email for malicious content.
   *
   * If scanning itself throws, the error is logged and a result with
   * `isClean: true` and `threatType: 'scan_error'` is returned (fail-open).
   * @param email The email to scan
   * @returns Scan result (possibly served from the cache)
   */
  public async scanEmail(email: Email): Promise<IScanResult> {
    try {
      // Generate a cache key from the email
      const cacheKey = this.generateCacheKey(email);

      // Check cache first
      const cachedResult = this.scanCache.get(cacheKey);
      if (cachedResult) {
        logger.log('info', `Using cached scan result for email ${email.getMessageId()}`);
        return cachedResult;
      }

      // Initialize scan result
      const result: IScanResult = {
        isClean: true,
        threatScore: 0,
        scannedElements: [],
        timestamp: Date.now()
      };

      // List of scan promises
      const scanPromises: Array<Promise<void>> = [];

      // Scan subject
      if (this.options.scanSubject && email.subject) {
        scanPromises.push(this.scanSubject(email.subject, result));
      }

      // Scan body content
      if (this.options.scanBody) {
        if (email.text) {
          scanPromises.push(this.scanTextContent(email.text, result));
        }

        if (email.html) {
          scanPromises.push(this.scanHtmlContent(email.html, result));
        }
      }

      // Scan attachments
      if (this.options.scanAttachments && email.attachments && email.attachments.length > 0) {
        for (const attachment of email.attachments) {
          scanPromises.push(this.scanAttachment(attachment, result));
        }
      }

      await Promise.all(scanPromises);

      // Individual heuristics simply add up; keep the reported score inside
      // the documented 0-100 range.
      result.threatScore = Math.max(0, Math.min(100, result.threatScore));

      // Determine if the email is clean based on threat score
      result.isClean = result.threatScore < this.options.minThreatScore;

      // Save to cache
      this.scanCache.set(cacheKey, result);

      this.reportFindings(email, result);

      return result;
    } catch (error) {
      const { message, stack } = ContentScanner.describeError(error);
      logger.log('error', `Error scanning email: ${message}`, {
        messageId: email.getMessageId(),
        error: stack
      });

      // Return a safe default with error indication
      return {
        isClean: true, // Let it pass if scanner fails (configure as desired)
        threatScore: 0,
        scannedElements: ['error'],
        timestamp: Date.now(),
        threatType: 'scan_error',
        threatDetails: `Scan error: ${message}`
      };
    }
  }

  /**
   * Write the security log entry that matches the verdict.
   *
   * Logging is best-effort: a failure here is reported as a warning and must
   * not replace an already computed verdict with the fail-open error result.
   * @param email The scanned email
   * @param result The final scan result
   */
  private reportFindings(email: Email, result: IScanResult): void {
    try {
      if (result.threatScore >= this.options.highThreatScore) {
        this.logHighThreatFound(email, result);
      } else if (!result.isClean) {
        this.logThreatFound(email, result);
      }
    } catch (error) {
      const { message } = ContentScanner.describeError(error);
      logger.log('warn', `Failed to log scan findings: ${message}`);
    }
  }

  /**
   * Generate a cache key from an email.
   *
   * The key is a SHA-256 digest over the message ID and everything the scanner
   * inspects (sender, subject, bodies, and per attachment: name, size and the
   * sampled leading bytes). The message ID alone is not used as the key because
   * it is chosen by the sender, so a reused ID must not return a verdict that
   * was computed for different content.
   * @param email The email to generate a key for
   * @returns Cache key
   */
  private generateCacheKey(email: Email): string {
    const hash = plugins.crypto.createHash('sha256');

    // Length-prefix every field so adjacent fields cannot bleed into each other.
    const feed = (label: string, value: string | Buffer): void => {
      hash.update(`${label}:${value.length}:`);
      hash.update(value);
    };

    feed('id', email.getMessageId() || '');
    feed('from', String(email.from ?? ''));
    feed('subject', email.subject || '');
    feed('text', email.text ?? '');
    feed('html', email.html ?? '');

    for (const attachment of email.attachments ?? []) {
      feed('attachment', attachment.filename ?? '');
      if (attachment.content) {
        feed('size', String(attachment.content.length));
        feed('sample', attachment.content.subarray(0, ContentScanner.ATTACHMENT_SAMPLE_BYTES));
      }
    }

    return `email:${hash.digest('hex')}`;
  }

  /**
   * Test a custom-rule pattern against some input.
   *
   * String patterns are compiled case-insensitively. For caller-supplied
   * RegExp objects `lastIndex` is reset around the test so that patterns with
   * the `g` or `y` flag give the same answer on every scan.
   * @param pattern Custom rule pattern
   * @param input Text to test
   * @returns Whether the pattern matches
   */
  private static matchesRulePattern(pattern: string | RegExp, input: string): boolean {
    if (!(pattern instanceof RegExp)) {
      return new RegExp(pattern, 'i').test(input);
    }
    pattern.lastIndex = 0;
    const matched = pattern.test(input);
    pattern.lastIndex = 0;
    return matched;
  }

  /**
   * Turn a caught value into a loggable message and stack.
   * @param error Whatever was thrown
   * @returns Message and (for Error instances) stack trace
   */
  private static describeError(error: unknown): { message: string; stack: string | undefined } {
    if (error instanceof Error) {
      return { message: error.message, stack: error.stack };
    }
    return { message: String(error), stack: undefined };
  }

  /**
   * Scan email subject for threats.
   *
   * Only the first matching heuristic counts, checked in the order
   * phishing, spam, custom rules.
   * @param subject The subject to scan
   * @param result The scan result to update
   */
  private async scanSubject(subject: string, result: IScanResult): Promise<void> {
    result.scannedElements.push('subject');

    // Check against phishing patterns
    if (ContentScanner.MALICIOUS_PATTERNS.phishing.some((pattern) => pattern.test(subject))) {
      result.threatScore += 25;
      result.threatType = ThreatCategory.PHISHING;
      result.threatDetails = `Subject contains potential phishing indicators: ${subject}`;
      return;
    }

    // Check against spam patterns
    if (ContentScanner.MALICIOUS_PATTERNS.spam.some((pattern) => pattern.test(subject))) {
      result.threatScore += 15;
      result.threatType = ThreatCategory.SPAM;
      result.threatDetails = `Subject contains potential spam indicators: ${subject}`;
      return;
    }

    // Check custom rules
    for (const rule of this.options.customRules) {
      if (ContentScanner.matchesRulePattern(rule.pattern, subject)) {
        result.threatScore += rule.score;
        result.threatType = rule.type;
        result.threatDetails = rule.description;
        return;
      }
    }
  }

  /**
   * Scan plain text content for threats.
   *
   * Every matching pattern of every category adds that category's score.
   * @param text The text content to scan
   * @param result The scan result to update
   */
  private async scanTextContent(text: string, result: IScanResult): Promise<void> {
    result.scannedElements.push('text');

    const builtIn = ContentScanner.MALICIOUS_PATTERNS;
    // Evaluated in this order; the order decides which threat type ends up reported.
    const checks = [
      { patterns: builtIn.suspiciousLinks, score: 20, type: ThreatCategory.SUSPICIOUS_LINK, details: `Text contains suspicious links` },
      { patterns: builtIn.phishing, score: 25, type: ThreatCategory.PHISHING, details: `Text contains potential phishing indicators` },
      { patterns: builtIn.spam, score: 15, type: ThreatCategory.SPAM, details: `Text contains potential spam indicators` },
      { patterns: builtIn.malware, score: 30, type: ThreatCategory.MALWARE, details: `Text contains potential malware indicators` },
      { patterns: builtIn.sensitiveData, score: 25, type: ThreatCategory.SENSITIVE_DATA, details: `Text contains potentially sensitive data patterns` }
    ];

    for (const check of checks) {
      for (const pattern of check.patterns) {
        if (!pattern.test(text)) {
          continue;
        }
        result.threatScore += check.score;
        // Take over the reported type unless another category is already set and
        // the running score does not exceed this category's own weight.
        const threshold = result.threatType === check.type ? 0 : check.score;
        if (!result.threatType || result.threatScore > threshold) {
          result.threatType = check.type;
          result.threatDetails = check.details;
        }
      }
    }

    // Check custom rules
    for (const rule of this.options.customRules) {
      if (ContentScanner.matchesRulePattern(rule.pattern, text)) {
        result.threatScore += rule.score;
        if (!result.threatType || result.threatScore > 20) {
          result.threatType = rule.type;
          result.threatDetails = rule.description;
        }
      }
    }
  }

  /**
   * Scan HTML content for threats: script injection in the markup, the text
   * heuristics on the tag-stripped content (at half weight), and the share of
   * suspicious links among all `href` targets.
   * @param html The HTML content to scan
   * @param result The scan result to update
   */
  private async scanHtmlContent(html: string, result: IScanResult): Promise<void> {
    result.scannedElements.push('html');

    // Check for script injection
    for (const pattern of ContentScanner.MALICIOUS_PATTERNS.scriptInjection) {
      if (pattern.test(html)) {
        result.threatScore += 40;
        if (!result.threatType || result.threatType !== ThreatCategory.XSS) {
          result.threatType = ThreatCategory.XSS;
          result.threatDetails = `HTML contains potentially malicious script content`;
        }
      }
    }

    // Extract text content from HTML for further scanning
    const textContent = this.extractTextFromHtml(html);
    if (textContent) {
      // Run the text heuristics against a scratch result so their score can be
      // weighted before it is merged.
      const tempResult: IScanResult = {
        isClean: true,
        threatScore: 0,
        scannedElements: [],
        timestamp: Date.now()
      };

      await this.scanTextContent(textContent, tempResult);

      if (tempResult.threatType && tempResult.threatScore > 0) {
        // Add half of the text content score to avoid double counting
        result.threatScore += Math.floor(tempResult.threatScore / 2);

        // Adopt the threat type if more severe or no existing type
        if (!result.threatType || tempResult.threatScore > result.threatScore) {
          result.threatType = tempResult.threatType;
          result.threatDetails = tempResult.threatDetails;
        }
      }
    }

    // Extract and check links from HTML
    const links = this.extractLinksFromHtml(html);
    if (links.length > 0) {
      const suspiciousLinks = links.filter((link) =>
        ContentScanner.MALICIOUS_PATTERNS.suspiciousLinks.some((pattern) => pattern.test(link))
      ).length;

      if (suspiciousLinks > 0) {
        // Add score based on percentage of suspicious links (capped at 40)
        const suspiciousPercentage = (suspiciousLinks / links.length) * 100;
        const additionalScore = Math.min(40, Math.floor(suspiciousPercentage / 2.5));
        result.threatScore += additionalScore;

        if (!result.threatType || additionalScore > 20) {
          result.threatType = ThreatCategory.SUSPICIOUS_LINK;
          result.threatDetails = `HTML contains ${suspiciousLinks} suspicious links out of ${links.length} total links`;
        }
      }
    }
  }

  /**
   * Scan an attachment for threats.
   *
   * The filename extension check and the executable-header check do not depend
   * on the attachment size and always run. Only the sampled content analysis
   * (macro detection and pattern matching) is skipped for attachments larger
   * than `maxAttachmentSizeToScan`, so an oversized executable is still flagged.
   * @param attachment The attachment to scan
   * @param result The scan result to update
   */
  private async scanAttachment(attachment: IAttachment, result: IScanResult): Promise<void> {
    const filename = (attachment.filename ?? '').toLowerCase();
    result.scannedElements.push(`attachment:${filename}`);

    // Check filename for executable extensions
    if (
      this.options.blockExecutables &&
      ContentScanner.EXECUTABLE_EXTENSIONS.some((ext) => filename.endsWith(ext))
    ) {
      result.threatScore += 70; // High score for executable attachments
      result.threatType = ThreatCategory.EXECUTABLE;
      result.threatDetails = `Attachment has a potentially dangerous extension: ${filename}`;
      return; // No need to scan contents if filename already flagged
    }

    // Everything below needs the content buffer
    const content = attachment.content;
    if (!content) {
      return;
    }

    if (content.length > this.options.maxAttachmentSizeToScan) {
      logger.log('info', `Skipping scan of large attachment: ${filename} (${content.length} bytes)`);
    } else {
      // Check for Office documents with macros.
      // This is a simplified check - a real implementation would use specialized
      // libraries to detect macros in Office documents.
      if (
        this.options.blockMacros &&
        ContentScanner.MACRO_DOCUMENT_EXTENSIONS.some((ext) => filename.endsWith(ext)) &&
        this.likelyContainsMacros(attachment)
      ) {
        result.threatScore += 60;
        result.threatType = ThreatCategory.MALICIOUS_MACRO;
        result.threatDetails = `Attachment appears to contain macros: ${filename}`;
        return;
      }

      // Basic content analysis on a text rendering of the leading bytes
      const textContent = this.extractTextFromBuffer(content);
      if (textContent) {
        // Each pattern category contributes at most once per attachment
        for (const [category, patterns] of Object.entries(ContentScanner.MALICIOUS_PATTERNS)) {
          if (patterns.some((pattern) => pattern.test(textContent))) {
            result.threatScore += 30;

            if (!result.threatType) {
              result.threatType = this.mapCategoryToThreatType(category);
              result.threatDetails = `Attachment content contains suspicious patterns: ${filename}`;
            }
          }
        }
      }
    }

    // Check for PE headers (Windows executables)
    if (ContentScanner.hasPeHeader(content)) {
      result.threatScore += 80;
      result.threatType = ThreatCategory.EXECUTABLE;
      result.threatDetails = `Attachment contains executable code: ${filename}`;
    }
  }

  /**
   * Check whether a buffer longer than 64 bytes starts with the 'MZ' signature.
   * @param content Attachment content
   * @returns Whether the content starts with an 'MZ' header
   */
  private static hasPeHeader(content: Buffer): boolean {
    return content.length > 64 && content[0] === 0x4D && content[1] === 0x5A;
  }

  /**
   * Extract links from HTML content.
   *
   * Simple regex-based extraction of quoted http(s) `href` values - a real
   * implementation might use a proper HTML parser.
   * @param html HTML content
   * @returns Array of extracted links, in document order
   */
  private extractLinksFromHtml(html: string): string[] {
    const links: string[] = [];

    for (const match of html.matchAll(/href=["'](https?:\/\/[^"']+)["']/gi)) {
      const url = match[1];
      if (url) {
        links.push(url);
      }
    }

    return links;
  }

  /**
   * Extract plain text from HTML (simplified: regex tag stripping plus a
   * handful of named entities).
   * @param html HTML content
   * @returns Extracted text with whitespace collapsed
   */
  private extractTextFromHtml(html: string): string {
    return html
      // Drop style/script elements including their content, whatever the tag case
      .replace(/<style[^>]*>.*?<\/style>/gis, '')
      .replace(/<script[^>]*>.*?<\/script>/gis, '')
      .replace(/<[^>]+>/g, ' ')
      .replace(/&nbsp;/g, ' ')
      .replace(/&lt;/g, '<')
      .replace(/&gt;/g, '>')
      .replace(/&quot;/g, '"')
      .replace(/&apos;/g, "'")
      // '&amp;' goes last so that e.g. '&amp;quot;' decodes to '&quot;' rather than '"'
      .replace(/&amp;/g, '&')
      .replace(/\s+/g, ' ')
      .trim();
  }

  /**
   * Extract text from a binary buffer for scanning.
   * @param buffer Binary content
   * @returns Text decoded from the leading sample of the buffer, or '' on failure
   */
  private extractTextFromBuffer(buffer: Buffer): string {
    try {
      // Limit the amount we convert to avoid memory issues
      const sample = buffer.subarray(0, ContentScanner.ATTACHMENT_SAMPLE_BYTES);

      // Try to convert to string, filtering out non-printable chars
      return sample.toString('utf8')
        .replace(/[\x00-\x09\x0B-\x1F\x7F-\x9F]/g, '') // Remove control chars
        .replace(/\uFFFD/g, ''); // Remove replacement char
    } catch (error) {
      const { message } = ContentScanner.describeError(error);
      logger.log('warn', `Error extracting text from buffer: ${message}`);
      return '';
    }
  }

  /**
   * Check if an Office document likely contains macros.
   *
   * Simple heuristic that looks for VBA/macro related strings in the sampled
   * content; it is not comprehensive - a real implementation would use
   * specialized libraries.
   * @param attachment The attachment to check
   * @returns Whether the file likely contains macros
   */
  private likelyContainsMacros(attachment: IAttachment): boolean {
    const content = this.extractTextFromBuffer(attachment.content);
    return ContentScanner.MACRO_INDICATORS.some((indicator) => indicator.test(content));
  }

  /**
   * Map a pattern category to a threat type
   * @param category The pattern category (a key of the built-in pattern table)
   * @returns The corresponding threat type; BLACKLISTED_CONTENT for unknown categories
   */
  private mapCategoryToThreatType(category: string): string {
    switch (category) {
      case 'phishing': return ThreatCategory.PHISHING;
      case 'spam': return ThreatCategory.SPAM;
      case 'malware': return ThreatCategory.MALWARE;
      case 'suspiciousLinks': return ThreatCategory.SUSPICIOUS_LINK;
      case 'scriptInjection': return ThreatCategory.XSS;
      case 'sensitiveData': return ThreatCategory.SENSITIVE_DATA;
      default: return ThreatCategory.BLACKLISTED_CONTENT;
    }
  }

  /**
   * Build the `details` payload shared by both security log entries.
   * @param email The scanned email
   * @param result The scan result
   * @returns Details object for the security event
   */
  private static buildEventDetails(email: Email, result: IScanResult) {
    return {
      messageId: email.getMessageId(),
      threatType: result.threatType,
      threatDetails: result.threatDetails,
      threatScore: result.threatScore,
      scannedElements: result.scannedElements,
      subject: email.subject
    };
  }

  /**
   * Log a high threat finding to the security logger (ERROR level)
   * @param email The email containing the threat
   * @param result The scan result
   */
  private logHighThreatFound(email: Email, result: IScanResult): void {
    SecurityLogger.getInstance().logEvent({
      level: SecurityLogLevel.ERROR,
      type: SecurityEventType.MALWARE,
      message: `High threat content detected in email from ${email.from} to ${email.to.join(', ')}`,
      details: ContentScanner.buildEventDetails(email, result),
      success: false,
      domain: email.getFromDomain()
    });
  }

  /**
   * Log a threat finding to the security logger (WARN level)
   * @param email The email containing the threat
   * @param result The scan result
   */
  private logThreatFound(email: Email, result: IScanResult): void {
    SecurityLogger.getInstance().logEvent({
      level: SecurityLogLevel.WARN,
      type: SecurityEventType.SPAM,
      message: `Suspicious content detected in email from ${email.from} to ${email.to.join(', ')}`,
      details: ContentScanner.buildEventDetails(email, result),
      success: false,
      domain: email.getFromDomain()
    });
  }

  /**
   * Get threat level description based on score
   * @param score Threat score
   * @returns 'none' below 20, 'low' below 40, 'medium' below 70, otherwise 'high'
   */
  public static getThreatLevel(score: number): 'none' | 'low' | 'medium' | 'high' {
    if (score < 20) {
      return 'none';
    } else if (score < 40) {
      return 'low';
    } else if (score < 70) {
      return 'medium';
    } else {
      return 'high';
    }
  }
}