mod_smtp_response_normalize/
lib.rs

1use chrono::DateTime;
2use config::get_or_create_sub_module;
3use mlua::Lua;
4use regex::{RegexSet, RegexSetBuilder};
5use std::borrow::Cow;
6use std::sync::LazyLock;
7use uuid::Uuid;
8mod dict;
9
/// Signature shared by every word-level normalizer in this file.
///
/// A normalizer inspects a single word and returns `Some(replacement)`
/// when it recognizes the word, or `None` to let the next normalizer
/// have a try.  The replacement is a `Cow` so that a normalizer can hand
/// back a static placeholder or the input itself without allocating.
type Normalizer = for<'a> fn(word: &'a str) -> Option<Cow<'a, str>>;
11
12fn tokenize_timestamp_3339<'a>(word: &'a str) -> Option<Cow<'a, str>> {
13    DateTime::parse_from_rfc3339(word)
14        .ok()
15        .map(|_| Cow::Borrowed("{timestamp}"))
16}
17
18fn tokenize_uuid<'a>(word: &'a str) -> Option<Cow<'a, str>> {
19    Uuid::try_parse(word).ok().map(|_| Cow::Borrowed("{uuid}"))
20}
21
22/// A number of dictionary words are technically valid base64 (eg: "duration")
23/// and we don't want them to be flagged as base64.  This recognizes
24/// a dictionary word and returns that word as the token, preventing
25/// further tokenization
26fn tokenize_dictionary_word_phf<'a>(word: &'a str) -> Option<Cow<'a, str>> {
27    if crate::dict::DICT.contains(word) {
28        Some(Cow::Borrowed(word))
29    } else {
30        None
31    }
32}
33
/// Match either base64 or base64-url.
///
/// The whole word must consist of one or more 4-character groups drawn
/// from the combined standard (`+`, `/`) and URL-safe (`-`, `_`)
/// alphabets, optionally followed by a single padded tail group
/// (`xx==` or `xxx=`).
///
/// Both groups are non-capturing (`(?:...)`).  They were previously
/// spelled `(:?...)`, which is a capturing group with an optional
/// leading `:` and therefore also accepted words such as `:abcd`.
const BASE64_RE: &str =
    r"^(?:[a-zA-Z0-9+/_\-]{4})+(?:[a-zA-Z0-9+/_\-]{2}==|[a-zA-Z0-9+/_\-]{3}=)?$";
37
/// Match ipv4 or ipv6 addresses, followed by optional :port.
/// This doesn't do anything about the ipv6 .port syntax.
/// ipv6 portion of this is taken from
/// <https://stackoverflow.com/a/17871737/149111>
///
/// The outer address group and the port suffix are non-capturing groups
/// (`(?:...)`).  They were previously spelled `(:?...)`, which made a
/// stray `:` optional in front of an ipv4 address and in front of the
/// port separator, so words like `:10.0.0.1` or `10.0.0.1::25` were
/// classified as addresses.
const IP_RE: &str = r"^(?:\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}|([0-9a-fA-F]{1,4}:){7,7}[0-9a-fA-F]{1,4}|([0-9a-fA-F]{1,4}:){1,7}:|([0-9a-fA-F]{1,4}:){1,6}:[0-9a-fA-F]{1,4}|([0-9a-fA-F]{1,4}:){1,5}(:[0-9a-fA-F]{1,4}){1,2}|([0-9a-fA-F]{1,4}:){1,4}(:[0-9a-fA-F]{1,4}){1,3}|([0-9a-fA-F]{1,4}:){1,3}(:[0-9a-fA-F]{1,4}){1,4}|([0-9a-fA-F]{1,4}:){1,2}(:[0-9a-fA-F]{1,4}){1,5}|[0-9a-fA-F]{1,4}:((:[0-9a-fA-F]{1,4}){1,6})|:((:[0-9a-fA-F]{1,4}){1,7}|:)|fe80:(:[0-9a-fA-F]{0,4}){0,4}%[0-9a-zA-Z]{1,}|::(ffff(:0{1,4}){0,1}:){0,1}((25[0-5]|(2[0-4]|1{0,1}[0-9]){0,1}[0-9])\.){3,3}(25[0-5]|(2[0-4]|1{0,1}[0-9]){0,1}[0-9])|([0-9a-fA-F]{1,4}:){1,4}:((25[0-5]|(2[0-4]|1{0,1}[0-9]){0,1}[0-9])\.){3,3}(25[0-5]|(2[0-4]|1{0,1}[0-9]){0,1}[0-9]))(?::\d{1,5})?$";
43
/// Match email addresses.
/// The complicated regex here outperforms the more simplistic and
/// obvious regex that you might otherwise be inclined to write.
/// <https://stackoverflow.com/a/201378/149111>
///
/// The pattern is anchored at both ends, so the whole word must be an
/// address.  The local part is either a dot-separated atom or a quoted
/// string; the domain is either a dotted hostname or a bracketed
/// literal.
///
/// NOTE(review): the character classes only list lowercase `a-z`, and
/// the `RegexSet` in `tokenize_re` is built without enabling
/// case-insensitive matching, so an address containing uppercase letters
/// (outside of a quoted local part) does not match as written.  Confirm
/// whether that is intended.
const EMAIL_RE: &str = r#"^(?:[a-z0-9!#$%&'*+\x2f=?^_`\x7b-\x7d~\x2d]+(?:\.[a-z0-9!#$%&'*+\x2f=?^_`\x7b-\x7d~\x2d]+)*|"(?:[\x01-\x08\x0b\x0c\x0e-\x1f\x21\x23-\x5b\x5d-\x7f]|\\[\x01-\x09\x0b\x0c\x0e-\x7f])*")@(?:(?:[a-z0-9](?:[a-z0-9\x2d]*[a-z0-9])?\.)+[a-z0-9](?:[a-z0-9\x2d]*[a-z0-9])?|\[(?:(?:(2(5[0-5]|[0-4][0-9])|1[0-9][0-9]|[1-9]?[0-9]))\.){3}(?:(2(5[0-5]|[0-4][0-9])|1[0-9][0-9]|[1-9]?[0-9])|[a-z0-9\x2d]*[a-z0-9]:(?:[\x01-\x08\x0b\x0c\x0e-\x1f\x21-\x5a\x53-\x7f]|\\[\x01-\x09\x0b\x0c\x0e-\x7f])+)\])$"#;
49
50fn tokenize_re<'a>(word: &'a str) -> Option<Cow<'a, str>> {
51    static MAPPING: &[(&str, &str)] = &[
52        (IP_RE, "{ipaddr}"),
53        (BASE64_RE, "{base64}"),
54        (EMAIL_RE, "{email}"),
55    ];
56    static SET: LazyLock<RegexSet> = LazyLock::new(|| {
57        RegexSetBuilder::new(MAPPING.iter().map(|(re, _label)| re))
58            .build()
59            .unwrap()
60    });
61
62    let matching_idx: usize = SET.matches(word).into_iter().next()?;
63
64    Some(Cow::Borrowed(MAPPING[matching_idx].1))
65}
66
/// Tokenize things that look a bit like some kind of hash or opaque
/// identifier but that are not matchable as base64 or a uuid.
///
/// A word qualifies when all of the following hold:
///
/// * it is at least 8 bytes long,
/// * every character is an ASCII letter, an ASCII digit, or one of
///   `-`, `.` and `_`,
/// * it contains more than two ASCII letters and more than two ASCII
///   digits (that is, at least three of each).
///
/// Qualifying words are replaced by `{hash}`; everything else yields
/// `None`.
///
/// This is hand-rolled rather than expressed as a regex because the
/// regex crate doesn't support the lookaround assertions needed to keep
/// dictionary words and plain numbers from matching, and the fancy-regex
/// crate, which does support them, doesn't support the regex set
/// builder.
fn tokenize_hash<'a>(word: &'a str) -> Option<Cow<'a, str>> {
    const MIN_LEN: usize = 8;

    if word.len() < MIN_LEN {
        return None;
    }

    // Count letters and digits in a single pass; the fold bails out with
    // `None` as soon as a character outside the allowed alphabet shows up.
    let (letters, digits) =
        word.chars()
            .try_fold((0usize, 0usize), |(letters, digits), ch| match ch {
                _ if ch.is_ascii_alphabetic() => Some((letters + 1, digits)),
                _ if ch.is_ascii_digit() => Some((letters, digits + 1)),
                '-' | '.' | '_' => Some((letters, digits)),
                _ => None,
            })?;

    (letters > 2 && digits > 2).then_some(Cow::Borrowed("{hash}"))
}
105
106/// Look for `something=token` and replace the RHS with a token.
107/// This recurses on the RHS of the equals sign.
108fn tokenize_compound<'a>(word: &'a str) -> Option<Cow<'a, str>> {
109    if let Some((lhs, rhs)) = word.split_once('=') {
110        let tokenized = normalize_word(rhs)?;
111        Some(format!("{lhs}={tokenized}").into())
112    } else {
113        None
114    }
115}
116
// Annotated here with bench throughput on my 7965WX.
// Overall is 290 MiB/s 1.4us. Contrast with NOP (empty table)
// throughput of 1.6GiB/s 244ns.
// The numbers next to the entries below are the throughput
// when just that particular item is enabled.

/// Ordered table of word normalizers.
///
/// `normalize_word` walks this table front to back and stops at the
/// first entry that returns `Some`, so the order is part of the
/// behavior:
///
/// * the dictionary check comes first so that ordinary words which also
///   happen to be valid base64 are claimed before the regex matcher can
///   see them;
/// * `tokenize_compound` comes last because it calls back into
///   `normalize_word` on the right-hand side of an `=`; presumably this
///   also gives whole-word matchers (eg: `=`-padded base64) the first
///   chance at the word -- confirm before reordering.
const FUNCS: &[Normalizer] = &[
    // Should always be first
    tokenize_dictionary_word_phf, // 1.4266 GiB/s 272ns
    tokenize_timestamp_3339,      // 1017MiB/s 392ns
    tokenize_uuid,                // 1.13GiB/s 342ns
    tokenize_hash,                // 1.19GiB/s 325ns
    tokenize_re,                  // 233MiB/s 1.7us
    // Should always be last
    tokenize_compound,
];
132
133fn normalize_word<'a>(word: &'a str) -> Option<Cow<'a, str>> {
134    for func in FUNCS {
135        let res = (func)(word);
136        if res.is_some() {
137            return res;
138        }
139    }
140    None
141}
142
143pub fn normalize(s: &str) -> String {
144    let mut result = String::with_capacity(s.len());
145
146    for word in s.split_ascii_whitespace() {
147        let word = match normalize_word(word) {
148            Some(tokenized) => tokenized,
149            None => Cow::Borrowed(word),
150        };
151
152        if !result.is_empty() {
153            result.push(' ');
154        }
155        result.push_str(&word);
156    }
157
158    result
159}
160
161pub fn register(lua: &Lua) -> anyhow::Result<()> {
162    let string_mod = get_or_create_sub_module(lua, "string")?;
163
164    string_mod.set(
165        "normalize_smtp_response",
166        lua.create_function(move |_, text: String| Ok(normalize(&text)))?,
167    )?;
168    Ok(())
169}
170
#[cfg(test)]
mod tests {
    use super::*;

    /// Table-driven check of `normalize`: each entry is an
    /// `(input, expected_output)` pair.
    #[test]
    fn various() {
        const CASES: &[(&str, &str)] = &[
            // RFC 3339 timestamp.
            (
                "retry again at 2025-11-06T17:11:34.261306612Z",
                "retry again at {timestamp}",
            ),
            // Hyphenated UUID.
            (
                "a uuid 10aa5da5-3f3b-4176-beb9-32875830f082",
                "a uuid {uuid}",
            ),
            // base64, with and without `=` padding, alone and mixed with
            // ordinary words.
            ("aGVsbG8uCg==", "{base64}"),
            ("aGVsbG8K", "{base64}"),
            ("aGVsbG8K aGVsbG8K", "{base64} {base64}"),
            ("hello aGVsbG8uCg==", "hello {base64}"),
            ("hello", "hello"),
            ("hello aGVsbG8K", "hello {base64}"),
            // ipv4, ipv6, and ipv4 with a port suffix.
            (
                "421 4.1.0 10.0.0.1 throttled try later",
                "421 4.1.0 {ipaddr} throttled try later",
            ),
            (
                "421 4.1.0 ::1 throttled try later",
                "421 4.1.0 {ipaddr} throttled try later",
            ),
            (
                "Accepting connection from 42.69.10.20:25",
                "Accepting connection from {ipaddr}",
            ),
            // Neither the word "duration" nor an hh:mm:ss value may be
            // tokenized.
            ("duration 00:10:34", "duration 00:10:34"),
            // Email address.
            (
                "rejecting mail for some.body@gmail.com",
                "rejecting mail for {email}",
            ),
            // Full provider responses: the trailing opaque id becomes
            // {hash}, the URLs survive, and the doubled spaces in the
            // input collapse to single spaces in the output.
            (
                "Your email has been rate limited because the From: header (RFC5322) in this message isn't aligned with either the authenticated SPF or DKIM organizational domain. To learn more about DMARC alignment, visit  https://support.google.com/a?p=dmarc-alignment  To learn more about Gmail requirements for bulk senders, visit  https://support.google.com/a?p=sender-guidelines. a640c23a62f3a-ab67626ed70si756442266b.465 - gsmtp",
                "Your email has been rate limited because the From: header (RFC5322) in this message isn't aligned with either the authenticated SPF or DKIM organizational domain. To learn more about DMARC alignment, visit https://support.google.com/a?p=dmarc-alignment To learn more about Gmail requirements for bulk senders, visit https://support.google.com/a?p=sender-guidelines. {hash} - gsmtp",
            ),
            (
                "550 5.1.1 The email account that you tried to reach does not exist. Please try double-checking the recipient's email address for typos or unnecessary spaces. For more information, go to  https://support.google.com/mail/?p=NoSuchUser 41be03b00d2f7-b93bf44f0c0si6882731a12.803 - gsmtp",
                "550 5.1.1 The email account that you tried to reach does not exist. Please try double-checking the recipient's email address for typos or unnecessary spaces. For more information, go to https://support.google.com/mail/?p=NoSuchUser {hash} - gsmtp",
            ),
            // `key=value` compound: only the right-hand side is replaced.
            ("OK ids=8a5475ccbbc611eda12250ebf67f93bd", "OK ids={uuid}"),
        ];

        for (input, expected_output) in CASES {
            let output = normalize(input);

            // Include the input in the failure message so the offending
            // case is easy to spot.
            assert_eq!(output, *expected_output, "input={input}");
        }
    }
}