//! Content scanning for email threat detection.
//!
//! Provides pattern-based scanning of email subjects, text bodies, HTML bodies,
//! and attachment filenames for phishing, spam, malware, suspicious links,
//! script injection, and sensitive data patterns.

use regex::Regex;
use serde::Serialize;
use std::sync::LazyLock;

// ---------------------------------------------------------------------------
// Result types
// ---------------------------------------------------------------------------

/// Outcome of a content scan, serialized with camelCase field names.
#[derive(Debug, Clone, Serialize)]
#[serde(rename_all = "camelCase")]
pub struct ContentScanResult {
    /// Sum of the scores of every finding across all scanned elements; 0 when nothing matched.
    pub threat_score: u32,
    /// Label of the reported threat (`"phishing"`, `"spam"`, `"suspicious_link"`, `"malware"`,
    /// `"sensitive_data"`, `"xss"`, `"executable"` or `"malicious_macro"`), or `None` when
    /// nothing matched.
    pub threat_type: Option<String>,
    /// Human-readable description accompanying `threat_type`.
    pub threat_details: Option<String>,
    /// Names of the elements that were inspected: `"subject"`, `"text"`, `"html"`, and one
    /// `"attachment:<lowercased filename>"` entry per attachment.
    pub scanned_elements: Vec<String>,
}

// ---------------------------------------------------------------------------
// Pattern definitions (compiled once via LazyLock)
// ---------------------------------------------------------------------------

/// Case-insensitive wording typical of credential-phishing messages: account
/// verification requests, urgency cues, brand names paired with verify/suspend
/// verbs, and security-alert phrases.
static PHISHING_PATTERNS: LazyLock<Vec<Regex>> = LazyLock::new(|| {
    let sources = [
        r"(?i)(?:verify|confirm|update|login).*(?:account|password|details)",
        r"(?i)urgent.*(?:action|attention|required)",
        r"(?i)(?:paypal|apple|microsoft|amazon|google|bank).*(?:verify|confirm|suspend)",
        r"(?i)your.*(?:account).*(?:suspended|compromised|locked)",
        r"(?i)\b(?:password reset|security alert|security notice)\b",
    ];

    // The sources are fixed literals, so a compile failure is a programming error.
    sources
        .iter()
        .map(|source| Regex::new(source).unwrap())
        .collect()
});

/// Case-insensitive phrases typical of bulk spam: pharmaceuticals, lottery and
/// inheritance scams, work-from-home offers, pressure selling, and investment pitches.
static SPAM_PATTERNS: LazyLock<Vec<Regex>> = LazyLock::new(|| {
    let sources = [
        r"(?i)\b(?:viagra|cialis|enlargement|diet pill|lose weight fast|cheap meds)\b",
        r"(?i)\b(?:million dollars|lottery winner|prize claim|inheritance|rich widow)\b",
        r"(?i)\b(?:earn from home|make money fast|earn \$\d{3,}/day)\b",
        r"(?i)\b(?:limited time offer|act now|exclusive deal|only \d+ left)\b",
        r"(?i)\b(?:forex|stock tip|investment opportunity|cryptocurrency|bitcoin)\b",
    ];

    // The sources are fixed literals, so a compile failure is a programming error.
    sources
        .iter()
        .map(|source| Regex::new(source).unwrap())
        .collect()
});

/// Case-insensitive wording that nudges the reader to open attachments, enable
/// macros, or download files, plus fake antivirus-style alerts.
static MALWARE_PATTERNS: LazyLock<Vec<Regex>> = LazyLock::new(|| {
    let sources = [
        r"(?i)(?:attached file|see attachment).*(?:invoice|receipt|statement|document)",
        r"(?i)open.*(?:the attached|this attachment)",
        r"(?i)(?:enable|allow).*(?:macros|content|editing)",
        r"(?i)download.*(?:attachment|file|document)",
        r"(?i)\b(?:ransomware protection|virus alert|malware detected)\b",
    ];

    // The sources are fixed literals, so a compile failure is a programming error.
    sources
        .iter()
        .map(|source| Regex::new(source).unwrap())
        .collect()
});

/// Case-insensitive URL shapes treated as suspicious: well-known shorteners
/// (bit.ly, goo.gl, t.co, tinyurl.com), raw dotted-quad hosts, and a fixed set
/// of top-level domains, optionally combined with login/account keywords.
static SUSPICIOUS_LINK_PATTERNS: LazyLock<Vec<Regex>> = LazyLock::new(|| {
    let sources = [
        r"(?i)https?://bit\.ly/",
        r"(?i)https?://goo\.gl/",
        r"(?i)https?://t\.co/",
        r"(?i)https?://tinyurl\.com/",
        r"(?i)https?://(?:\d{1,3}\.){3}\d{1,3}",
        r"(?i)https?://.*\.(?:xyz|top|club|gq|cf)/",
        r"(?i)(?:login|account|signin|auth).*\.(?:xyz|top|club|gq|cf|tk|ml|ga|pw|ws|buzz)\b",
    ];

    // The sources are fixed literals, so a compile failure is a programming error.
    sources
        .iter()
        .map(|source| Regex::new(source).unwrap())
        .collect()
});

static SCRIPT_INJECTION_PATTERNS: LazyLock<Vec<Regex>> = LazyLock::new(|| {
    vec![
        Regex::new(r"(?is)<script.*>.*</script>").unwrap(),
        Regex::new(r"(?i)javascript:").unwrap(),
        Regex::new(r#"(?i)on(?:click|load|mouse|error|focus|blur)=".*""#).unwrap(),
        Regex::new(r"(?i)document\.(?:cookie|write|location)").unwrap(),
        Regex::new(r"(?i)eval\s*\(").unwrap(),
    ]
});

/// Shapes of data that should not normally travel by email: SSN-like numbers,
/// 13-16 digit runs (payment-card length), and long base64-looking blobs.
static SENSITIVE_DATA_PATTERNS: LazyLock<Vec<Regex>> = LazyLock::new(|| {
    vec![
        // `ddd-dd-dddd` or a bare 9-digit number.
        Regex::new(r"\b(?:\d{3}-\d{2}-\d{4}|\d{9})\b").unwrap(),
        // 13 to 16 consecutive digits.
        Regex::new(r"\b\d{13,16}\b").unwrap(),
        // Base64-looking run of at least 20 characters. Without a minimum length
        // every ordinary word whose length is a multiple of four ("this", "view",
        // "document") would count as sensitive data. The closing word boundary is
        // applied only to the unpadded form: after `=` padding there is no word
        // boundary before whitespace or end of input, so a trailing `\b` would
        // prevent padded blobs from ever matching there.
        Regex::new(r"\b(?:[A-Za-z0-9+/]{4}){4,}(?:[A-Za-z0-9+/]{2}==|[A-Za-z0-9+/]{3}=|[A-Za-z0-9+/]{4}\b)").unwrap(),
    ]
});

/// Matches a quoted `href` attribute whose value is an http(s) URL; capture
/// group 1 holds the URL without the surrounding quotes.
static HREF_PATTERN: LazyLock<Regex> = LazyLock::new(|| {
    let source = r#"(?i)href=["'](https?://[^"']+)["']"#;
    Regex::new(source).unwrap()
});

/// Executable file extensions that are considered dangerous.
///
/// Entries are lowercase and include the leading dot, so they are meant to be
/// compared against a lowercased filename with `ends_with`. The list is a plain
/// static slice: the data is known at compile time, so no lazy initialization
/// or heap allocation is needed.
static EXECUTABLE_EXTENSIONS: &[&str] = &[
    ".exe", ".dll", ".bat", ".cmd", ".msi", ".vbs", ".ps1",
    ".sh", ".jar", ".py", ".com", ".scr", ".pif", ".hta", ".cpl",
    ".reg", ".vba", ".lnk", ".wsf", ".msp", ".mst",
];

/// Document extensions that may contain macros.
///
/// Entries are lowercase and include the leading dot, so they are meant to be
/// compared against a lowercased filename with `ends_with`. The list is a plain
/// static slice: the data is known at compile time, so no lazy initialization
/// or heap allocation is needed.
static MACRO_DOCUMENT_EXTENSIONS: &[&str] = &[
    ".doc", ".docm", ".xls", ".xlsm", ".ppt", ".pptm",
    ".dotm", ".xlsb", ".ppam", ".potm",
];

// ---------------------------------------------------------------------------
// HTML helpers
// ---------------------------------------------------------------------------

/// Strip HTML tags and decode common entities to produce plain text.
fn extract_text_from_html(html: &str) -> String {
    // Remove style and script blocks first
    let no_style = Regex::new(r"(?is)<style[^>]*>.*?</style>").unwrap();
    let no_script = Regex::new(r"(?is)<script[^>]*>.*?</script>").unwrap();
    let no_tags = Regex::new(r"<[^>]+>").unwrap();

    let text = no_style.replace_all(html, " ");
    let text = no_script.replace_all(&text, " ");
    let text = no_tags.replace_all(&text, " ");

    text.replace("&nbsp;", " ")
        .replace("&lt;", "<")
        .replace("&gt;", ">")
        .replace("&amp;", "&")
        .replace("&quot;", "\"")
        .replace("&apos;", "'")
        .split_whitespace()
        .collect::<Vec<_>>()
        .join(" ")
}

/// Collect the URL of every quoted http(s) `href` attribute in `html`, in
/// document order.
fn extract_links_from_html(html: &str) -> Vec<String> {
    let mut urls = Vec::new();
    for captures in HREF_PATTERN.captures_iter(html) {
        // Group 1 is the URL between the quotes.
        if let Some(url) = captures.get(1) {
            urls.push(url.as_str().to_string());
        }
    }
    urls
}

// ---------------------------------------------------------------------------
// Scoring helpers
// ---------------------------------------------------------------------------

/// Returns `true` as soon as one of `patterns` matches `text`; `false` when
/// none match or the list is empty.
fn matches_any(text: &str, patterns: &[Regex]) -> bool {
    for pattern in patterns {
        if pattern.is_match(text) {
            return true;
        }
    }
    false
}

// ---------------------------------------------------------------------------
// Main scan entry point
// ---------------------------------------------------------------------------

/// Scan email content for threats.
///
/// This mirrors the TypeScript ContentScanner logic — scanning the subject,
/// text body, HTML body, and attachment filenames against predefined patterns.
/// Returns an aggregate threat score and the highest-severity threat type.
pub fn scan_content(
    subject: Option<&str>,
    text_body: Option<&str>,
    html_body: Option<&str>,
    attachment_names: &[String],
) -> ContentScanResult {
    let mut score: u32 = 0;
    let mut threat_type: Option<String> = None;
    let mut threat_details: Option<String> = None;
    let mut scanned: Vec<String> = Vec::new();

    // Helper: upgrade threat info only if the new finding is more severe.
    macro_rules! record {
        ($new_score:expr, $ttype:expr, $details:expr) => {
            score += $new_score;
            // Always adopt the threat type from the highest-scoring match.
            threat_type = Some($ttype.to_string());
            threat_details = Some($details.to_string());
        };
    }

    // ── Subject scanning ──────────────────────────────────────────────
    if let Some(subj) = subject {
        scanned.push("subject".into());

        if matches_any(subj, &PHISHING_PATTERNS) {
            record!(25, "phishing", format!("Subject contains potential phishing indicators: {}", subj));
        } else if matches_any(subj, &SPAM_PATTERNS) {
            record!(15, "spam", format!("Subject contains potential spam indicators: {}", subj));
        }
    }

    // ── Text body scanning ────────────────────────────────────────────
    if let Some(text) = text_body {
        scanned.push("text".into());

        // Check each category and accumulate score (same order as TS)
        for pat in SUSPICIOUS_LINK_PATTERNS.iter() {
            if pat.is_match(text) {
                score += 20;
                if threat_type.as_deref() != Some("suspicious_link") {
                    threat_type = Some("suspicious_link".into());
                    threat_details = Some("Text contains suspicious links".into());
                }
            }
        }

        for pat in PHISHING_PATTERNS.iter() {
            if pat.is_match(text) {
                score += 25;
                threat_type = Some("phishing".into());
                threat_details = Some("Text contains potential phishing indicators".into());
            }
        }

        for pat in SPAM_PATTERNS.iter() {
            if pat.is_match(text) {
                score += 15;
                if threat_type.is_none() {
                    threat_type = Some("spam".into());
                    threat_details = Some("Text contains potential spam indicators".into());
                }
            }
        }

        for pat in MALWARE_PATTERNS.iter() {
            if pat.is_match(text) {
                score += 30;
                threat_type = Some("malware".into());
                threat_details = Some("Text contains potential malware indicators".into());
            }
        }

        for pat in SENSITIVE_DATA_PATTERNS.iter() {
            if pat.is_match(text) {
                score += 25;
                if threat_type.is_none() {
                    threat_type = Some("sensitive_data".into());
                    threat_details = Some("Text contains potentially sensitive data patterns".into());
                }
            }
        }
    }

    // ── HTML body scanning ────────────────────────────────────────────
    if let Some(html) = html_body {
        scanned.push("html".into());

        // Script injection check
        for pat in SCRIPT_INJECTION_PATTERNS.iter() {
            if pat.is_match(html) {
                score += 40;
                if threat_type.as_deref() != Some("xss") {
                    threat_type = Some("xss".into());
                    threat_details = Some("HTML contains potentially malicious script content".into());
                }
            }
        }

        // Extract text from HTML and scan (half score to avoid double counting)
        let text_content = extract_text_from_html(html);
        if !text_content.is_empty() {
            let mut html_text_score: u32 = 0;
            let mut html_text_type: Option<String> = None;
            let mut html_text_details: Option<String> = None;

            // Re-run text patterns on extracted HTML text
            for pat in SUSPICIOUS_LINK_PATTERNS.iter() {
                if pat.is_match(&text_content) {
                    html_text_score += 20;
                    html_text_type = Some("suspicious_link".into());
                    html_text_details = Some("Text contains suspicious links".into());
                }
            }
            for pat in PHISHING_PATTERNS.iter() {
                if pat.is_match(&text_content) {
                    html_text_score += 25;
                    html_text_type = Some("phishing".into());
                    html_text_details = Some("Text contains potential phishing indicators".into());
                }
            }
            for pat in SPAM_PATTERNS.iter() {
                if pat.is_match(&text_content) {
                    html_text_score += 15;
                    if html_text_type.is_none() {
                        html_text_type = Some("spam".into());
                        html_text_details = Some("Text contains potential spam indicators".into());
                    }
                }
            }
            for pat in MALWARE_PATTERNS.iter() {
                if pat.is_match(&text_content) {
                    html_text_score += 30;
                    html_text_type = Some("malware".into());
                    html_text_details = Some("Text contains potential malware indicators".into());
                }
            }
            for pat in SENSITIVE_DATA_PATTERNS.iter() {
                if pat.is_match(&text_content) {
                    html_text_score += 25;
                    if html_text_type.is_none() {
                        html_text_type = Some("sensitive_data".into());
                        html_text_details = Some("Text contains potentially sensitive data patterns".into());
                    }
                }
            }

            if html_text_score > 0 {
                // Add half of the text content score to avoid double counting
                score += html_text_score / 2;
                if let Some(t) = html_text_type {
                    if threat_type.is_none() || html_text_score > score {
                        threat_type = Some(t);
                        threat_details = html_text_details;
                    }
                }
            }
        }

        // Extract and check links from HTML
        let links = extract_links_from_html(html);
        if !links.is_empty() {
            let mut suspicious_count = 0u32;
            for link in &links {
                if matches_any(link, &SUSPICIOUS_LINK_PATTERNS) {
                    suspicious_count += 1;
                }
            }

            if suspicious_count > 0 {
                let pct = (suspicious_count as f64 / links.len() as f64) * 100.0;
                let additional = std::cmp::min(40, (pct / 2.5) as u32);
                score += additional;

                if additional > 20 || threat_type.is_none() {
                    threat_type = Some("suspicious_link".into());
                    threat_details = Some(format!(
                        "HTML contains {} suspicious links out of {} total links",
                        suspicious_count,
                        links.len()
                    ));
                }
            }
        }
    }

    // ── Attachment filename scanning ──────────────────────────────────
    for name in attachment_names {
        let lower = name.to_lowercase();
        scanned.push(format!("attachment:{}", lower));

        // Check executable extensions
        for ext in EXECUTABLE_EXTENSIONS.iter() {
            if lower.ends_with(ext) {
                score += 70;
                threat_type = Some("executable".into());
                threat_details = Some(format!(
                    "Attachment has a potentially dangerous extension: {}",
                    name
                ));
                break;
            }
        }

        // Check macro document extensions
        for ext in MACRO_DOCUMENT_EXTENSIONS.iter() {
            if lower.ends_with(ext) {
                // Flag macro-capable documents (lower score than executables)
                score += 20;
                if threat_type.is_none() {
                    threat_type = Some("malicious_macro".into());
                    threat_details = Some(format!(
                        "Attachment is a macro-capable document: {}",
                        name
                    ));
                }
                break;
            }
        }
    }

    ContentScanResult {
        threat_score: score,
        threat_type,
        threat_details,
        scanned_elements: scanned,
    }
}

// ---------------------------------------------------------------------------
// Tests
// ---------------------------------------------------------------------------

#[cfg(test)]
mod tests {
    use super::*;

    /// Run a scan with attachment names given as plain string slices.
    fn run_scan(
        subject: Option<&str>,
        text: Option<&str>,
        html: Option<&str>,
        attachments: &[&str],
    ) -> ContentScanResult {
        let names: Vec<String> = attachments.iter().map(|name| name.to_string()).collect();
        scan_content(subject, text, html, &names)
    }

    /// Assert a minimum score together with the reported threat label.
    fn assert_flagged(outcome: &ContentScanResult, min_score: u32, label: &str) {
        assert!(outcome.threat_score >= min_score);
        assert_eq!(outcome.threat_type.as_deref(), Some(label));
    }

    #[test]
    fn test_clean_content() {
        let outcome = run_scan(
            Some("Project Update"),
            Some("The project is on track."),
            None,
            &[],
        );
        assert_eq!(outcome.threat_score, 0);
        assert!(outcome.threat_type.is_none());
    }

    #[test]
    fn test_phishing_subject() {
        let subject = "URGENT: Verify your bank account details immediately";
        let outcome = run_scan(Some(subject), None, None, &[]);
        assert_flagged(&outcome, 25, "phishing");
    }

    #[test]
    fn test_spam_body() {
        let body = "Win a million dollars in the lottery winner contest!";
        let outcome = run_scan(None, Some(body), None, &[]);
        assert_flagged(&outcome, 15, "spam");
    }

    #[test]
    fn test_suspicious_links() {
        let body = "Check out https://bit.ly/2x3F5 for more info";
        let outcome = run_scan(None, Some(body), None, &[]);
        assert_flagged(&outcome, 20, "suspicious_link");
    }

    #[test]
    fn test_script_injection() {
        let html = "<p>Hello</p><script>document.cookie='steal';</script>";
        let outcome = run_scan(None, None, Some(html), &[]);
        assert_flagged(&outcome, 40, "xss");
    }

    #[test]
    fn test_executable_attachment() {
        let outcome = run_scan(None, None, None, &["update.exe"]);
        assert_flagged(&outcome, 70, "executable");
    }

    #[test]
    fn test_macro_document() {
        let outcome = run_scan(None, None, None, &["report.docm"]);
        assert_flagged(&outcome, 20, "malicious_macro");
    }

    #[test]
    fn test_malware_indicators() {
        let body = "Please enable macros to view this document properly.";
        let outcome = run_scan(None, Some(body), None, &[]);
        assert_flagged(&outcome, 30, "malware");
    }

    #[test]
    fn test_html_link_extraction() {
        let html =
            r#"<a href="https://bit.ly/abc">click</a> and <a href="https://t.co/xyz">here</a>"#;
        let outcome = run_scan(None, None, Some(html), &[]);
        assert!(outcome.threat_score > 0);
    }

    #[test]
    fn test_compound_threats() {
        let outcome = run_scan(
            Some("URGENT: Verify your account details immediately"),
            Some("Your account will be suspended unless you verify at https://bit.ly/2x3F5"),
            Some(r#"<a href="https://bit.ly/2x3F5">verify</a>"#),
            &["verification.exe"],
        );
        assert!(outcome.threat_score > 70);
    }
}