feat(security): migrate content scanning and bounce detection to Rust security bridge; add scanContent IPC command and Rust content scanner with tests; update TS RustSecurityBridge and callers, and adjust CI package references

2026-02-10 21:19:13 +00:00
parent b82468ab1e
commit 15a45089aa
21 changed files with 844 additions and 1530 deletions
@@ -17,3 +17,4 @@ hickory-resolver.workspace = true
 ipnet.workspace = true
 rustls-pki-types.workspace = true
 psl.workspace = true
+regex.workspace = true
@@ -0,0 +1,515 @@
+//! Content scanning for email threat detection.
+//!
+//! Provides pattern-based scanning of email subjects, text bodies, HTML bodies,
+//! and attachment filenames for phishing, spam, malware, suspicious links,
+//! script injection, and sensitive data patterns.
+
+use regex::Regex;
+use serde::Serialize;
+use std::sync::LazyLock;
+
+// ---------------------------------------------------------------------------
+// Result types
+// ---------------------------------------------------------------------------
+
+/// Aggregate outcome of one `scan_content` call.
+///
+/// Serialized with camelCase field names (`threatScore`, `threatType`, ...).
+#[derive(Debug, Clone, Serialize)]
+#[serde(rename_all = "camelCase")]
+pub struct ContentScanResult {
+    /// Sum of the weights of every indicator that fired; `0` when nothing matched.
+    pub threat_score: u32,
+    /// Category label of the reported finding (e.g. `"phishing"`, `"spam"`,
+    /// `"suspicious_link"`, `"malware"`, `"sensitive_data"`, `"xss"`,
+    /// `"executable"`, `"malicious_macro"`), or `None` when nothing matched.
+    pub threat_type: Option<String>,
+    /// Human-readable explanation accompanying `threat_type`.
+    pub threat_details: Option<String>,
+    /// Which inputs were inspected: `"subject"`, `"text"`, `"html"` and one
+    /// `"attachment:<lowercased name>"` entry per attachment.
+    pub scanned_elements: Vec<String>,
+}
+
+// ---------------------------------------------------------------------------
+// Pattern definitions (compiled once via LazyLock)
+// ---------------------------------------------------------------------------
+
+/// Case-insensitive phrases used as phishing indicators (account
+/// verification requests, urgency wording, brand + verify/suspend, ...).
+static PHISHING_PATTERNS: LazyLock<Vec<Regex>> = LazyLock::new(|| {
+    [
+        r"(?i)(?:verify|confirm|update|login).*(?:account|password|details)",
+        r"(?i)urgent.*(?:action|attention|required)",
+        r"(?i)(?:paypal|apple|microsoft|amazon|google|bank).*(?:verify|confirm|suspend)",
+        r"(?i)your.*(?:account).*(?:suspended|compromised|locked)",
+        r"(?i)\b(?:password reset|security alert|security notice)\b",
+    ]
+    .iter()
+    .map(|source| Regex::new(source).unwrap())
+    .collect()
+});
+
+/// Case-insensitive, word-bounded phrases used as spam indicators
+/// (pharmaceuticals, lottery/inheritance bait, get-rich offers, ...).
+static SPAM_PATTERNS: LazyLock<Vec<Regex>> = LazyLock::new(|| {
+    [
+        r"(?i)\b(?:viagra|cialis|enlargement|diet pill|lose weight fast|cheap meds)\b",
+        r"(?i)\b(?:million dollars|lottery winner|prize claim|inheritance|rich widow)\b",
+        r"(?i)\b(?:earn from home|make money fast|earn \$\d{3,}/day)\b",
+        r"(?i)\b(?:limited time offer|act now|exclusive deal|only \d+ left)\b",
+        r"(?i)\b(?:forex|stock tip|investment opportunity|cryptocurrency|bitcoin)\b",
+    ]
+    .iter()
+    .map(|source| Regex::new(source).unwrap())
+    .collect()
+});
+
+/// Case-insensitive wording used as malware-delivery indicators (prompts to
+/// open attachments, enable macros, download files, fake virus alerts).
+static MALWARE_PATTERNS: LazyLock<Vec<Regex>> = LazyLock::new(|| {
+    [
+        r"(?i)(?:attached file|see attachment).*(?:invoice|receipt|statement|document)",
+        r"(?i)open.*(?:the attached|this attachment)",
+        r"(?i)(?:enable|allow).*(?:macros|content|editing)",
+        r"(?i)download.*(?:attachment|file|document)",
+        r"(?i)\b(?:ransomware protection|virus alert|malware detected)\b",
+    ]
+    .iter()
+    .map(|source| Regex::new(source).unwrap())
+    .collect()
+});
+
+/// URL shapes treated as suspicious: the shortener hosts bit.ly, goo.gl,
+/// t.co and tinyurl.com, URLs whose host is a dotted-quad number, and the
+/// listed top-level domains (alone or combined with login/account wording).
+static SUSPICIOUS_LINK_PATTERNS: LazyLock<Vec<Regex>> = LazyLock::new(|| {
+    [
+        r"(?i)https?://bit\.ly/",
+        r"(?i)https?://goo\.gl/",
+        r"(?i)https?://t\.co/",
+        r"(?i)https?://tinyurl\.com/",
+        r"(?i)https?://(?:\d{1,3}\.){3}\d{1,3}",
+        r"(?i)https?://.*\.(?:xyz|top|club|gq|cf)/",
+        r"(?i)(?:login|account|signin|auth).*\.(?:xyz|top|club|gq|cf|tk|ml|ga|pw|ws|buzz)\b",
+    ]
+    .iter()
+    .map(|source| Regex::new(source).unwrap())
+    .collect()
+});
+
+/// Markup and script fragments treated as script-injection indicators in
+/// HTML bodies: `<script>` blocks, `javascript:` URLs, inline event-handler
+/// attributes, `document.*` access and `eval(` calls.
+///
+/// NOTE(review): the handler pattern requires `=` directly after the listed
+/// name and a double-quoted value, so `onmouse="..."` matches while
+/// `onmouseover="..."` or single-quoted handlers do not — confirm intended.
+static SCRIPT_INJECTION_PATTERNS: LazyLock<Vec<Regex>> = LazyLock::new(|| {
+    [
+        r"(?is)<script.*>.*</script>",
+        r"(?i)javascript:",
+        r#"(?i)on(?:click|load|mouse|error|focus|blur)=".*""#,
+        r"(?i)document\.(?:cookie|write|location)",
+        r"(?i)eval\s*\(",
+    ]
+    .iter()
+    .map(|source| Regex::new(source).unwrap())
+    .collect()
+});
+
+/// Shapes treated as potentially sensitive data: `ddd-dd-dddd` or nine
+/// consecutive digits (presumably SSN-like numbers), runs of 13 to 16 digits
+/// (presumably payment card numbers), and base64-shaped tokens.
+///
+/// NOTE(review): the base64 pattern has no minimum length, so it also
+/// matches any ordinary alphanumeric word whose length is a multiple of
+/// four (e.g. "this", "document") — confirm this is acceptable.
+static SENSITIVE_DATA_PATTERNS: LazyLock<Vec<Regex>> = LazyLock::new(|| {
+    [
+        r"\b(?:\d{3}-\d{2}-\d{4}|\d{9})\b",
+        r"\b\d{13,16}\b",
+        r"\b(?:[A-Za-z0-9+/]{4})*(?:[A-Za-z0-9+/]{2}==|[A-Za-z0-9+/]{3}=|[A-Za-z0-9+/]{4})\b",
+    ]
+    .iter()
+    .map(|source| Regex::new(source).unwrap())
+    .collect()
+});
+
+/// Link extraction from HTML href attributes.
+///
+/// Matches `href=` (case-insensitive) followed by a single- or double-quoted
+/// absolute `http://` / `https://` URL; capture group 1 holds the URL
+/// without its quotes. Unquoted or relative hrefs are not matched.
+static HREF_PATTERN: LazyLock<Regex> = LazyLock::new(|| {
+    Regex::new(r#"(?i)href=["'](https?://[^"']+)["']"#).unwrap()
+});
+
+/// Executable file extensions that are considered dangerous.
+///
+/// Entries are lowercase and include the leading dot, so they can be
+/// compared with `ends_with` against a lowercased file name. The list is a
+/// compile-time constant, so it is a plain static slice rather than a
+/// lazily initialised, heap-allocated `Vec`.
+static EXECUTABLE_EXTENSIONS: &[&str] = &[
+    ".exe", ".dll", ".bat", ".cmd", ".msi", ".vbs", ".ps1",
+    ".sh", ".jar", ".py", ".com", ".scr", ".pif", ".hta", ".cpl",
+    ".reg", ".vba", ".lnk", ".wsf", ".msp", ".mst",
+];
+
+/// Document extensions that may contain macros.
+///
+/// Entries are lowercase and include the leading dot, so they can be
+/// compared with `ends_with` against a lowercased file name. The list is a
+/// compile-time constant, so it is a plain static slice rather than a
+/// lazily initialised, heap-allocated `Vec`.
+static MACRO_DOCUMENT_EXTENSIONS: &[&str] = &[
+    ".doc", ".docm", ".xls", ".xlsm", ".ppt", ".pptm",
+    ".dotm", ".xlsb", ".ppam", ".potm",
+];
+
+// ---------------------------------------------------------------------------
+// HTML helpers
+// ---------------------------------------------------------------------------
+
+/// Strip HTML tags and decode common entities to produce plain text.
+///
+/// `<style>` and `<script>` blocks are removed together with their contents,
+/// every remaining tag is replaced by a space, the entities `&nbsp;`, `&lt;`,
+/// `&gt;`, `&quot;`, `&apos;` and `&amp;` are decoded, and runs of whitespace
+/// are collapsed to single spaces (leading/trailing whitespace is dropped).
+fn extract_text_from_html(html: &str) -> String {
+    // Compiled once and reused, like the other pattern tables in this file,
+    // instead of being rebuilt on every call.
+    static STYLE_BLOCK: LazyLock<Regex> =
+        LazyLock::new(|| Regex::new(r"(?is)<style[^>]*>.*?</style>").unwrap());
+    static SCRIPT_BLOCK: LazyLock<Regex> =
+        LazyLock::new(|| Regex::new(r"(?is)<script[^>]*>.*?</script>").unwrap());
+    static ANY_TAG: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"<[^>]+>").unwrap());
+
+    // Remove style and script blocks first so their contents never reach the text.
+    let text = STYLE_BLOCK.replace_all(html, " ");
+    let text = SCRIPT_BLOCK.replace_all(&text, " ");
+    let text = ANY_TAG.replace_all(&text, " ");
+
+    // `&amp;` is decoded last so an escaped entity such as `&amp;quot;`
+    // becomes the literal text `&quot;` instead of being decoded twice.
+    text.replace("&nbsp;", " ")
+        .replace("&lt;", "<")
+        .replace("&gt;", ">")
+        .replace("&quot;", "\"")
+        .replace("&apos;", "'")
+        .replace("&amp;", "&")
+        .split_whitespace()
+        .collect::<Vec<_>>()
+        .join(" ")
+}
+
+/// Collect the URL of every `href` attribute that `HREF_PATTERN` recognises,
+/// in document order.
+fn extract_links_from_html(html: &str) -> Vec<String> {
+    let mut links = Vec::new();
+    for captures in HREF_PATTERN.captures_iter(html) {
+        // Group 1 is the URL between the quotes.
+        if let Some(url) = captures.get(1) {
+            links.push(url.as_str().to_string());
+        }
+    }
+    links
+}
+
+// ---------------------------------------------------------------------------
+// Scoring helpers
+// ---------------------------------------------------------------------------
+
+/// Returns `true` as soon as one of `patterns` matches `text`, `false` when
+/// none does (including when `patterns` is empty).
+fn matches_any(text: &str, patterns: &[Regex]) -> bool {
+    for pattern in patterns {
+        if pattern.is_match(text) {
+            return true;
+        }
+    }
+    false
+}
+
+// ---------------------------------------------------------------------------
+// Main scan entry point
+// ---------------------------------------------------------------------------
+
+/// Scores `text` against every text-level pattern category.
+///
+/// Each matching pattern adds its category weight: suspicious links 20,
+/// phishing 25, spam 15, malware 30, sensitive data 25. Categories are
+/// evaluated in that order. Suspicious-link, phishing and malware findings
+/// always replace the current threat type; spam and sensitive-data findings
+/// only set it when no threat type has been recorded yet.
+///
+/// `threat_type` / `threat_details` are the finding recorded so far; the
+/// (possibly updated) pair is returned together with the score gained.
+fn scan_text_categories(
+    text: &str,
+    mut threat_type: Option<String>,
+    mut threat_details: Option<String>,
+) -> (u32, Option<String>, Option<String>) {
+    // (patterns, weight per matching pattern, threat type, details,
+    //  whether the finding overrides an earlier one)
+    let categories: [(&[Regex], u32, &str, &str, bool); 5] = [
+        (
+            SUSPICIOUS_LINK_PATTERNS.as_slice(),
+            20,
+            "suspicious_link",
+            "Text contains suspicious links",
+            true,
+        ),
+        (
+            PHISHING_PATTERNS.as_slice(),
+            25,
+            "phishing",
+            "Text contains potential phishing indicators",
+            true,
+        ),
+        (
+            SPAM_PATTERNS.as_slice(),
+            15,
+            "spam",
+            "Text contains potential spam indicators",
+            false,
+        ),
+        (
+            MALWARE_PATTERNS.as_slice(),
+            30,
+            "malware",
+            "Text contains potential malware indicators",
+            true,
+        ),
+        (
+            SENSITIVE_DATA_PATTERNS.as_slice(),
+            25,
+            "sensitive_data",
+            "Text contains potentially sensitive data patterns",
+            false,
+        ),
+    ];
+
+    let mut score: u32 = 0;
+    for (patterns, weight, kind, details, overrides) in categories {
+        // One weight per matching pattern in the category.
+        let category_score: u32 = patterns
+            .iter()
+            .filter(|pattern| pattern.is_match(text))
+            .map(|_| weight)
+            .sum();
+        if category_score == 0 {
+            continue;
+        }
+
+        score += category_score;
+        if overrides || threat_type.is_none() {
+            threat_type = Some(kind.into());
+            threat_details = Some(details.into());
+        }
+    }
+
+    (score, threat_type, threat_details)
+}
+
+/// Scan email content for threats.
+///
+/// This mirrors the TypeScript ContentScanner logic — scanning the subject,
+/// text body, HTML body, and attachment filenames against predefined patterns.
+///
+/// Scoring (all contributions are summed into `threat_score`):
+/// - Subject: +25 if any phishing pattern matches, otherwise +15 if any spam
+///   pattern matches.
+/// - Text body: per matching pattern, see `scan_text_categories`.
+/// - HTML body: +40 per matching script-injection pattern; half (integer
+///   division) of the text-category score of the tag-stripped HTML; and up
+///   to +40 based on the share of extracted links that look suspicious.
+/// - Attachments: +70 for an executable extension, +20 for a macro-capable
+///   document extension.
+///
+/// The reported `threat_type` / `threat_details` follow the per-stage
+/// override rules in the body; they describe the last finding allowed to
+/// overwrite earlier ones, not necessarily the highest-scoring one.
+/// `scanned_elements` lists every input that was inspected.
+pub fn scan_content(
+    subject: Option<&str>,
+    text_body: Option<&str>,
+    html_body: Option<&str>,
+    attachment_names: &[String],
+) -> ContentScanResult {
+    let mut score: u32 = 0;
+    let mut threat_type: Option<String> = None;
+    let mut threat_details: Option<String> = None;
+    let mut scanned: Vec<String> = Vec::new();
+
+    // ── Subject scanning ──────────────────────────────────────────────
+    // Phishing takes precedence over spam; the subject counts once per
+    // category regardless of how many patterns match.
+    if let Some(subj) = subject {
+        scanned.push("subject".into());
+
+        if matches_any(subj, &PHISHING_PATTERNS) {
+            score += 25;
+            threat_type = Some("phishing".into());
+            threat_details = Some(format!(
+                "Subject contains potential phishing indicators: {}",
+                subj
+            ));
+        } else if matches_any(subj, &SPAM_PATTERNS) {
+            score += 15;
+            threat_type = Some("spam".into());
+            threat_details = Some(format!(
+                "Subject contains potential spam indicators: {}",
+                subj
+            ));
+        }
+    }
+
+    // ── Text body scanning ────────────────────────────────────────────
+    // Continues from the subject's finding, so spam / sensitive-data hits
+    // in the body do not replace a threat type found in the subject.
+    if let Some(text) = text_body {
+        scanned.push("text".into());
+
+        let (text_score, updated_type, updated_details) =
+            scan_text_categories(text, threat_type.take(), threat_details.take());
+        score += text_score;
+        threat_type = updated_type;
+        threat_details = updated_details;
+    }
+
+    // ── HTML body scanning ────────────────────────────────────────────
+    if let Some(html) = html_body {
+        scanned.push("html".into());
+
+        // Script injection check: +40 per matching pattern.
+        let injection_score: u32 = SCRIPT_INJECTION_PATTERNS
+            .iter()
+            .filter(|pattern| pattern.is_match(html))
+            .map(|_| 40)
+            .sum();
+        if injection_score > 0 {
+            score += injection_score;
+            if threat_type.as_deref() != Some("xss") {
+                threat_type = Some("xss".into());
+                threat_details =
+                    Some("HTML contains potentially malicious script content".into());
+            }
+        }
+
+        // Extract text from HTML and scan it independently of earlier findings.
+        let text_content = extract_text_from_html(html);
+        if !text_content.is_empty() {
+            let (html_text_score, html_text_type, html_text_details) =
+                scan_text_categories(&text_content, None, None);
+
+            if html_text_score > 0 {
+                // Add half of the text content score to avoid double counting.
+                score += html_text_score / 2;
+                if let Some(found_type) = html_text_type {
+                    // NOTE(review): the comparison uses the total *after* the
+                    // half-score was added; presumably this mirrors the
+                    // TypeScript scanner — confirm.
+                    if threat_type.is_none() || html_text_score > score {
+                        threat_type = Some(found_type);
+                        threat_details = html_text_details;
+                    }
+                }
+            }
+        }
+
+        // Extract and check links from HTML.
+        let links = extract_links_from_html(html);
+        if !links.is_empty() {
+            let suspicious_count = links
+                .iter()
+                .filter(|link| matches_any(link.as_str(), &SUSPICIOUS_LINK_PATTERNS))
+                .count();
+
+            if suspicious_count > 0 {
+                // 100% suspicious links yields the +40 maximum.
+                let pct = (suspicious_count as f64 / links.len() as f64) * 100.0;
+                let additional = ((pct / 2.5) as u32).min(40);
+                score += additional;
+
+                if additional > 20 || threat_type.is_none() {
+                    threat_type = Some("suspicious_link".into());
+                    threat_details = Some(format!(
+                        "HTML contains {} suspicious links out of {} total links",
+                        suspicious_count,
+                        links.len()
+                    ));
+                }
+            }
+        }
+    }
+
+    // ── Attachment filename scanning ──────────────────────────────────
+    for name in attachment_names {
+        // Extension checks are case-insensitive; details keep the original name.
+        let lower = name.to_lowercase();
+        scanned.push(format!("attachment:{}", lower));
+
+        // Executable extensions always take over the reported threat type.
+        if EXECUTABLE_EXTENSIONS.iter().any(|ext| lower.ends_with(ext)) {
+            score += 70;
+            threat_type = Some("executable".into());
+            threat_details = Some(format!(
+                "Attachment has a potentially dangerous extension: {}",
+                name
+            ));
+        }
+
+        // Macro-capable documents score lower and never replace an
+        // existing threat type.
+        if MACRO_DOCUMENT_EXTENSIONS.iter().any(|ext| lower.ends_with(ext)) {
+            score += 20;
+            if threat_type.is_none() {
+                threat_type = Some("malicious_macro".into());
+                threat_details = Some(format!(
+                    "Attachment is a macro-capable document: {}",
+                    name
+                ));
+            }
+        }
+    }
+
+    ContentScanResult {
+        threat_score: score,
+        threat_type,
+        threat_details,
+        scanned_elements: scanned,
+    }
+}
+
+// ---------------------------------------------------------------------------
+// Tests
+// ---------------------------------------------------------------------------
+
+// Unit tests for `scan_content`: one scenario per threat category plus a
+// combined scenario. Most assertions use lower bounds on the score because
+// several patterns may fire on the same input.
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    /// Benign subject and body produce a zero score and no threat type.
+    #[test]
+    fn test_clean_content() {
+        let result = scan_content(
+            Some("Project Update"),
+            Some("The project is on track."),
+            None,
+            &[],
+        );
+        assert_eq!(result.threat_score, 0);
+        assert!(result.threat_type.is_none());
+    }
+
+    /// A subject with phishing wording scores at least the subject phishing weight.
+    #[test]
+    fn test_phishing_subject() {
+        let result = scan_content(
+            Some("URGENT: Verify your bank account details immediately"),
+            None,
+            None,
+            &[],
+        );
+        assert!(result.threat_score >= 25);
+        assert_eq!(result.threat_type.as_deref(), Some("phishing"));
+    }
+
+    /// Spam phrases in the text body are reported as spam.
+    #[test]
+    fn test_spam_body() {
+        let result = scan_content(
+            None,
+            Some("Win a million dollars in the lottery winner contest!"),
+            None,
+            &[],
+        );
+        assert!(result.threat_score >= 15);
+        assert_eq!(result.threat_type.as_deref(), Some("spam"));
+    }
+
+    /// A URL-shortener link in the text body is reported as a suspicious link.
+    #[test]
+    fn test_suspicious_links() {
+        let result = scan_content(
+            None,
+            Some("Check out https://bit.ly/2x3F5 for more info"),
+            None,
+            &[],
+        );
+        assert!(result.threat_score >= 20);
+        assert_eq!(result.threat_type.as_deref(), Some("suspicious_link"));
+    }
+
+    /// A `<script>` block in the HTML body is reported as XSS.
+    #[test]
+    fn test_script_injection() {
+        let result = scan_content(
+            None,
+            None,
+            Some("<p>Hello</p><script>document.cookie='steal';</script>"),
+            &[],
+        );
+        assert!(result.threat_score >= 40);
+        assert_eq!(result.threat_type.as_deref(), Some("xss"));
+    }
+
+    /// An attachment with an executable extension is reported as executable.
+    #[test]
+    fn test_executable_attachment() {
+        let result = scan_content(
+            None,
+            None,
+            None,
+            &["update.exe".into()],
+        );
+        assert!(result.threat_score >= 70);
+        assert_eq!(result.threat_type.as_deref(), Some("executable"));
+    }
+
+    /// A macro-capable document attachment is reported as a malicious macro.
+    #[test]
+    fn test_macro_document() {
+        let result = scan_content(
+            None,
+            None,
+            None,
+            &["report.docm".into()],
+        );
+        assert!(result.threat_score >= 20);
+        assert_eq!(result.threat_type.as_deref(), Some("malicious_macro"));
+    }
+
+    /// Wording that asks the reader to enable macros is reported as malware.
+    #[test]
+    fn test_malware_indicators() {
+        let result = scan_content(
+            None,
+            Some("Please enable macros to view this document properly."),
+            None,
+            &[],
+        );
+        assert!(result.threat_score >= 30);
+        assert_eq!(result.threat_type.as_deref(), Some("malware"));
+    }
+
+    /// Shortener links inside `href` attributes contribute to the score.
+    #[test]
+    fn test_html_link_extraction() {
+        let result = scan_content(
+            None,
+            None,
+            Some(r#"<a href="https://bit.ly/abc">click</a> and <a href="https://t.co/xyz">here</a>"#),
+            &[],
+        );
+        assert!(result.threat_score > 0);
+    }
+
+    /// Indicators in subject, text, HTML and attachment accumulate into one score.
+    #[test]
+    fn test_compound_threats() {
+        let result = scan_content(
+            Some("URGENT: Verify your account details immediately"),
+            Some("Your account will be suspended unless you verify at https://bit.ly/2x3F5"),
+            Some(r#"<a href="https://bit.ly/2x3F5">verify</a>"#),
+            &["verification.exe".into()],
+        );
+        assert!(result.threat_score > 70);
+    }
+}
@@ -1,5 +1,6 @@
 //! mailer-security: DKIM, SPF, DMARC verification, and IP reputation checking.

+pub mod content_scanner;
 pub mod dkim;
 pub mod dmarc;
 pub mod error;