//! Content scanning for email threat detection. //! //! Provides pattern-based scanning of email subjects, text bodies, HTML bodies, //! and attachment filenames for phishing, spam, malware, suspicious links, //! script injection, and sensitive data patterns. use regex::Regex; use serde::Serialize; use std::sync::LazyLock; // --------------------------------------------------------------------------- // Result types // --------------------------------------------------------------------------- #[derive(Debug, Clone, Serialize)] #[serde(rename_all = "camelCase")] pub struct ContentScanResult { pub threat_score: u32, pub threat_type: Option, pub threat_details: Option, pub scanned_elements: Vec, } // --------------------------------------------------------------------------- // Pattern definitions (compiled once via LazyLock) // --------------------------------------------------------------------------- static PHISHING_PATTERNS: LazyLock> = LazyLock::new(|| { vec![ Regex::new(r"(?i)(?:verify|confirm|update|login).*(?:account|password|details)").unwrap(), Regex::new(r"(?i)urgent.*(?:action|attention|required)").unwrap(), Regex::new(r"(?i)(?:paypal|apple|microsoft|amazon|google|bank).*(?:verify|confirm|suspend)").unwrap(), Regex::new(r"(?i)your.*(?:account).*(?:suspended|compromised|locked)").unwrap(), Regex::new(r"(?i)\b(?:password reset|security alert|security notice)\b").unwrap(), ] }); static SPAM_PATTERNS: LazyLock> = LazyLock::new(|| { vec![ Regex::new(r"(?i)\b(?:viagra|cialis|enlargement|diet pill|lose weight fast|cheap meds)\b").unwrap(), Regex::new(r"(?i)\b(?:million dollars|lottery winner|prize claim|inheritance|rich widow)\b").unwrap(), Regex::new(r"(?i)\b(?:earn from home|make money fast|earn \$\d{3,}/day)\b").unwrap(), Regex::new(r"(?i)\b(?:limited time offer|act now|exclusive deal|only \d+ left)\b").unwrap(), Regex::new(r"(?i)\b(?:forex|stock tip|investment opportunity|cryptocurrency|bitcoin)\b").unwrap(), ] }); static MALWARE_PATTERNS: LazyLock> = LazyLock::new(|| { vec![ Regex::new(r"(?i)(?:attached file|see attachment).*(?:invoice|receipt|statement|document)").unwrap(), Regex::new(r"(?i)open.*(?:the attached|this attachment)").unwrap(), Regex::new(r"(?i)(?:enable|allow).*(?:macros|content|editing)").unwrap(), Regex::new(r"(?i)download.*(?:attachment|file|document)").unwrap(), Regex::new(r"(?i)\b(?:ransomware protection|virus alert|malware detected)\b").unwrap(), ] }); static SUSPICIOUS_LINK_PATTERNS: LazyLock> = LazyLock::new(|| { vec![ Regex::new(r"(?i)https?://bit\.ly/").unwrap(), Regex::new(r"(?i)https?://goo\.gl/").unwrap(), Regex::new(r"(?i)https?://t\.co/").unwrap(), Regex::new(r"(?i)https?://tinyurl\.com/").unwrap(), Regex::new(r"(?i)https?://(?:\d{1,3}\.){3}\d{1,3}").unwrap(), Regex::new(r"(?i)https?://.*\.(?:xyz|top|club|gq|cf)/").unwrap(), Regex::new(r"(?i)(?:login|account|signin|auth).*\.(?:xyz|top|club|gq|cf|tk|ml|ga|pw|ws|buzz)\b").unwrap(), ] }); static SCRIPT_INJECTION_PATTERNS: LazyLock> = LazyLock::new(|| { vec![ Regex::new(r"(?is).*").unwrap(), Regex::new(r"(?i)javascript:").unwrap(), Regex::new(r#"(?i)on(?:click|load|mouse|error|focus|blur)=".*""#).unwrap(), Regex::new(r"(?i)document\.(?:cookie|write|location)").unwrap(), Regex::new(r"(?i)eval\s*\(").unwrap(), ] }); static SENSITIVE_DATA_PATTERNS: LazyLock> = LazyLock::new(|| { vec![ Regex::new(r"\b(?:\d{3}-\d{2}-\d{4}|\d{9})\b").unwrap(), Regex::new(r"\b\d{13,16}\b").unwrap(), Regex::new(r"\b(?:[A-Za-z0-9+/]{4})*(?:[A-Za-z0-9+/]{2}==|[A-Za-z0-9+/]{3}=|[A-Za-z0-9+/]{4})\b").unwrap(), ] }); /// Link extraction from HTML href attributes. static HREF_PATTERN: LazyLock = LazyLock::new(|| { Regex::new(r#"(?i)href=["'](https?://[^"']+)["']"#).unwrap() }); /// Executable file extensions that are considered dangerous. static EXECUTABLE_EXTENSIONS: LazyLock> = LazyLock::new(|| { vec![ ".exe", ".dll", ".bat", ".cmd", ".msi", ".vbs", ".ps1", ".sh", ".jar", ".py", ".com", ".scr", ".pif", ".hta", ".cpl", ".reg", ".vba", ".lnk", ".wsf", ".msp", ".mst", ] }); /// Document extensions that may contain macros. static MACRO_DOCUMENT_EXTENSIONS: LazyLock> = LazyLock::new(|| { vec![ ".doc", ".docm", ".xls", ".xlsm", ".ppt", ".pptm", ".dotm", ".xlsb", ".ppam", ".potm", ] }); // --------------------------------------------------------------------------- // HTML helpers // --------------------------------------------------------------------------- /// Strip HTML tags and decode common entities to produce plain text. fn extract_text_from_html(html: &str) -> String { // Remove style and script blocks first let no_style = Regex::new(r"(?is)]*>.*?").unwrap(); let no_script = Regex::new(r"(?is)]*>.*?").unwrap(); let no_tags = Regex::new(r"<[^>]+>").unwrap(); let text = no_style.replace_all(html, " "); let text = no_script.replace_all(&text, " "); let text = no_tags.replace_all(&text, " "); text.replace(" ", " ") .replace("<", "<") .replace(">", ">") .replace("&", "&") .replace(""", "\"") .replace("'", "'") .split_whitespace() .collect::>() .join(" ") } /// Extract all href links from HTML. fn extract_links_from_html(html: &str) -> Vec { HREF_PATTERN .captures_iter(html) .filter_map(|cap| cap.get(1).map(|m| m.as_str().to_string())) .collect() } // --------------------------------------------------------------------------- // Scoring helpers // --------------------------------------------------------------------------- fn matches_any(text: &str, patterns: &[Regex]) -> bool { patterns.iter().any(|p| p.is_match(text)) } // --------------------------------------------------------------------------- // Main scan entry point // --------------------------------------------------------------------------- /// Scan email content for threats. /// /// This mirrors the TypeScript ContentScanner logic — scanning the subject, /// text body, HTML body, and attachment filenames against predefined patterns. /// Returns an aggregate threat score and the highest-severity threat type. pub fn scan_content( subject: Option<&str>, text_body: Option<&str>, html_body: Option<&str>, attachment_names: &[String], ) -> ContentScanResult { let mut score: u32 = 0; let mut threat_type: Option = None; let mut threat_details: Option = None; let mut scanned: Vec = Vec::new(); // Helper: upgrade threat info only if the new finding is more severe. macro_rules! record { ($new_score:expr, $ttype:expr, $details:expr) => { score += $new_score; // Always adopt the threat type from the highest-scoring match. threat_type = Some($ttype.to_string()); threat_details = Some($details.to_string()); }; } // ── Subject scanning ────────────────────────────────────────────── if let Some(subj) = subject { scanned.push("subject".into()); if matches_any(subj, &PHISHING_PATTERNS) { record!(25, "phishing", format!("Subject contains potential phishing indicators: {}", subj)); } else if matches_any(subj, &SPAM_PATTERNS) { record!(15, "spam", format!("Subject contains potential spam indicators: {}", subj)); } } // ── Text body scanning ──────────────────────────────────────────── if let Some(text) = text_body { scanned.push("text".into()); // Check each category and accumulate score (same order as TS) for pat in SUSPICIOUS_LINK_PATTERNS.iter() { if pat.is_match(text) { score += 20; if threat_type.as_deref() != Some("suspicious_link") { threat_type = Some("suspicious_link".into()); threat_details = Some("Text contains suspicious links".into()); } } } for pat in PHISHING_PATTERNS.iter() { if pat.is_match(text) { score += 25; threat_type = Some("phishing".into()); threat_details = Some("Text contains potential phishing indicators".into()); } } for pat in SPAM_PATTERNS.iter() { if pat.is_match(text) { score += 15; if threat_type.is_none() { threat_type = Some("spam".into()); threat_details = Some("Text contains potential spam indicators".into()); } } } for pat in MALWARE_PATTERNS.iter() { if pat.is_match(text) { score += 30; threat_type = Some("malware".into()); threat_details = Some("Text contains potential malware indicators".into()); } } for pat in SENSITIVE_DATA_PATTERNS.iter() { if pat.is_match(text) { score += 25; if threat_type.is_none() { threat_type = Some("sensitive_data".into()); threat_details = Some("Text contains potentially sensitive data patterns".into()); } } } } // ── HTML body scanning ──────────────────────────────────────────── if let Some(html) = html_body { scanned.push("html".into()); // Script injection check for pat in SCRIPT_INJECTION_PATTERNS.iter() { if pat.is_match(html) { score += 40; if threat_type.as_deref() != Some("xss") { threat_type = Some("xss".into()); threat_details = Some("HTML contains potentially malicious script content".into()); } } } // Extract text from HTML and scan (half score to avoid double counting) let text_content = extract_text_from_html(html); if !text_content.is_empty() { let mut html_text_score: u32 = 0; let mut html_text_type: Option = None; let mut html_text_details: Option = None; // Re-run text patterns on extracted HTML text for pat in SUSPICIOUS_LINK_PATTERNS.iter() { if pat.is_match(&text_content) { html_text_score += 20; html_text_type = Some("suspicious_link".into()); html_text_details = Some("Text contains suspicious links".into()); } } for pat in PHISHING_PATTERNS.iter() { if pat.is_match(&text_content) { html_text_score += 25; html_text_type = Some("phishing".into()); html_text_details = Some("Text contains potential phishing indicators".into()); } } for pat in SPAM_PATTERNS.iter() { if pat.is_match(&text_content) { html_text_score += 15; if html_text_type.is_none() { html_text_type = Some("spam".into()); html_text_details = Some("Text contains potential spam indicators".into()); } } } for pat in MALWARE_PATTERNS.iter() { if pat.is_match(&text_content) { html_text_score += 30; html_text_type = Some("malware".into()); html_text_details = Some("Text contains potential malware indicators".into()); } } for pat in SENSITIVE_DATA_PATTERNS.iter() { if pat.is_match(&text_content) { html_text_score += 25; if html_text_type.is_none() { html_text_type = Some("sensitive_data".into()); html_text_details = Some("Text contains potentially sensitive data patterns".into()); } } } if html_text_score > 0 { // Add half of the text content score to avoid double counting score += html_text_score / 2; if let Some(t) = html_text_type { if threat_type.is_none() || html_text_score > score { threat_type = Some(t); threat_details = html_text_details; } } } } // Extract and check links from HTML let links = extract_links_from_html(html); if !links.is_empty() { let mut suspicious_count = 0u32; for link in &links { if matches_any(link, &SUSPICIOUS_LINK_PATTERNS) { suspicious_count += 1; } } if suspicious_count > 0 { let pct = (suspicious_count as f64 / links.len() as f64) * 100.0; let additional = std::cmp::min(40, (pct / 2.5) as u32); score += additional; if additional > 20 || threat_type.is_none() { threat_type = Some("suspicious_link".into()); threat_details = Some(format!( "HTML contains {} suspicious links out of {} total links", suspicious_count, links.len() )); } } } } // ── Attachment filename scanning ────────────────────────────────── for name in attachment_names { let lower = name.to_lowercase(); scanned.push(format!("attachment:{}", lower)); // Check executable extensions for ext in EXECUTABLE_EXTENSIONS.iter() { if lower.ends_with(ext) { score += 70; threat_type = Some("executable".into()); threat_details = Some(format!( "Attachment has a potentially dangerous extension: {}", name )); break; } } // Check macro document extensions for ext in MACRO_DOCUMENT_EXTENSIONS.iter() { if lower.ends_with(ext) { // Flag macro-capable documents (lower score than executables) score += 20; if threat_type.is_none() { threat_type = Some("malicious_macro".into()); threat_details = Some(format!( "Attachment is a macro-capable document: {}", name )); } break; } } } ContentScanResult { threat_score: score, threat_type, threat_details, scanned_elements: scanned, } } // --------------------------------------------------------------------------- // Tests // --------------------------------------------------------------------------- #[cfg(test)] mod tests { use super::*; #[test] fn test_clean_content() { let result = scan_content( Some("Project Update"), Some("The project is on track."), None, &[], ); assert_eq!(result.threat_score, 0); assert!(result.threat_type.is_none()); } #[test] fn test_phishing_subject() { let result = scan_content( Some("URGENT: Verify your bank account details immediately"), None, None, &[], ); assert!(result.threat_score >= 25); assert_eq!(result.threat_type.as_deref(), Some("phishing")); } #[test] fn test_spam_body() { let result = scan_content( None, Some("Win a million dollars in the lottery winner contest!"), None, &[], ); assert!(result.threat_score >= 15); assert_eq!(result.threat_type.as_deref(), Some("spam")); } #[test] fn test_suspicious_links() { let result = scan_content( None, Some("Check out https://bit.ly/2x3F5 for more info"), None, &[], ); assert!(result.threat_score >= 20); assert_eq!(result.threat_type.as_deref(), Some("suspicious_link")); } #[test] fn test_script_injection() { let result = scan_content( None, None, Some("

Hello

"), &[], ); assert!(result.threat_score >= 40); assert_eq!(result.threat_type.as_deref(), Some("xss")); } #[test] fn test_executable_attachment() { let result = scan_content( None, None, None, &["update.exe".into()], ); assert!(result.threat_score >= 70); assert_eq!(result.threat_type.as_deref(), Some("executable")); } #[test] fn test_macro_document() { let result = scan_content( None, None, None, &["report.docm".into()], ); assert!(result.threat_score >= 20); assert_eq!(result.threat_type.as_deref(), Some("malicious_macro")); } #[test] fn test_malware_indicators() { let result = scan_content( None, Some("Please enable macros to view this document properly."), None, &[], ); assert!(result.threat_score >= 30); assert_eq!(result.threat_type.as_deref(), Some("malware")); } #[test] fn test_html_link_extraction() { let result = scan_content( None, None, Some(r#"click and here"#), &[], ); assert!(result.threat_score > 0); } #[test] fn test_compound_threats() { let result = scan_content( Some("URGENT: Verify your account details immediately"), Some("Your account will be suspended unless you verify at https://bit.ly/2x3F5"), Some(r#"verify"#), &["verification.exe".into()], ); assert!(result.threat_score > 70); } }