// mod_smtp_response_normalize/lib.rs

1use chrono::DateTime;
2#[cfg(feature = "lua")]
3use config::get_or_create_sub_module;
4#[cfg(feature = "lua")]
5use mlua::Lua;
6use regex::{RegexSet, RegexSetBuilder};
7use std::borrow::Cow;
8use std::sync::LazyLock;
9use uuid::Uuid;
10mod dict;
11
/// Signature shared by every word tokenizer: given one whitespace-free word,
/// return the replacement token (or, for dictionary hits, the word itself
/// borrowed back) when it matches, or `None` to let the next tokenizer try.
type Normalizer = for<'a> fn(word: &'a str) -> Option<Cow<'a, str>>;
13
14fn tokenize_timestamp_3339<'a>(word: &'a str) -> Option<Cow<'a, str>> {
15    DateTime::parse_from_rfc3339(word)
16        .ok()
17        .map(|_| Cow::Borrowed("{timestamp}"))
18}
19
20fn tokenize_uuid<'a>(word: &'a str) -> Option<Cow<'a, str>> {
21    Uuid::try_parse(word).ok().map(|_| Cow::Borrowed("{uuid}"))
22}
23
24/// A number of dictionary words are technically valid base64 (eg: "duration")
25/// and we don't want them to be flagged as base64.  This recognizes
26/// a dictionary word and returns that word as the token, preventing
27/// further tokenization
28fn tokenize_dictionary_word_phf<'a>(word: &'a str) -> Option<Cow<'a, str>> {
29    if crate::dict::DICT.contains(word) {
30        Some(Cow::Borrowed(word))
31    } else {
32        None
33    }
34}
35
/// Match either base64 or base64-url.
/// Accepts one or more complete 4-character groups, optionally followed by
/// a padded tail (`xx==` or `xxx=`).
/// NOTE(review): because the tail alternatives require `=` padding,
/// unpadded input whose length is not a multiple of 4 will not match —
/// confirm that is intended.
const BASE64_RE: &str =
    r"^(?:[a-zA-Z0-9+/_\-]{4})+(?:[a-zA-Z0-9+/_\-]{2}==|[a-zA-Z0-9+/_\-]{3}=)?$";
39
/// Match ipv4 or ipv6 addresses, followed by optional :port.
/// This doesn't do anything about the ipv6 .port syntax.
/// ipv6 portion of this is taken from
/// <https://stackoverflow.com/a/17871737/149111>
///
/// The trailing port suffix is a non-capturing group `(?::\d{1,5})?`.
/// (A previous revision had the typo `(:?:...)`, which made the leading
/// colon optional and so also accepted a bogus `::port` suffix.)
const IP_RE: &str = r"^(?:\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}|(?:[0-9a-fA-F]{1,4}:){7,7}[0-9a-fA-F]{1,4}|(?:[0-9a-fA-F]{1,4}:){1,7}:|(?:[0-9a-fA-F]{1,4}:){1,6}:[0-9a-fA-F]{1,4}|(?:[0-9a-fA-F]{1,4}:){1,5}(?::[0-9a-fA-F]{1,4}){1,2}|(?:[0-9a-fA-F]{1,4}:){1,4}(?::[0-9a-fA-F]{1,4}){1,3}|(?:[0-9a-fA-F]{1,4}:){1,3}(?::[0-9a-fA-F]{1,4}){1,4}|(?:[0-9a-fA-F]{1,4}:){1,2}(?::[0-9a-fA-F]{1,4}){1,5}|[0-9a-fA-F]{1,4}:(?:(?::[0-9a-fA-F]{1,4}){1,6})|:(?:(?::[0-9a-fA-F]{1,4}){1,7}|:)|fe80:(?::[0-9a-fA-F]{0,4}){0,4}%[0-9a-zA-Z]{1,}|::(?:ffff(?::0{1,4}){0,1}:){0,1}(?:(?:25[0-5]|(?:2[0-4]|1{0,1}[0-9]){0,1}[0-9])\.){3,3}(?:25[0-5]|(?:2[0-4]|1{0,1}[0-9]){0,1}[0-9])|(?:[0-9a-fA-F]{1,4}:){1,4}:(?:(?:25[0-5]|(?:2[0-4]|1{0,1}[0-9]){0,1}[0-9])\.){3,3}(?:25[0-5]|(?:2[0-4]|1{0,1}[0-9]){0,1}[0-9]))(?::\d{1,5})?$";
45
/// Match email addresses.
/// The complicated regex here outperforms the more simplistic and
/// obvious regex that you might otherwise be inclined to write.
/// <https://stackoverflow.com/a/201378/149111>
/// NOTE(review): the character classes cover only lowercase `a-z`, so an
/// address containing uppercase letters will not match — presumably input
/// is lowercased (or case-insensitive matching applies) upstream; confirm.
const EMAIL_RE: &str = r#"^(?:[a-z0-9!#$%&'*+\x2f=?^_`\x7b-\x7d~\x2d]+(?:\.[a-z0-9!#$%&'*+\x2f=?^_`\x7b-\x7d~\x2d]+)*|"(?:[\x01-\x08\x0b\x0c\x0e-\x1f\x21\x23-\x5b\x5d-\x7f]|\\[\x01-\x09\x0b\x0c\x0e-\x7f])*")@(?:(?:[a-z0-9](?:[a-z0-9\x2d]*[a-z0-9])?\.)+[a-z0-9](?:[a-z0-9\x2d]*[a-z0-9])?|\[(?:(?:(?:2(?:5[0-5]|[0-4][0-9])|1[0-9][0-9]|[1-9]?[0-9]))\.){3}(?:(?:2(?:5[0-5]|[0-4][0-9])|1[0-9][0-9]|[1-9]?[0-9])|[a-z0-9\x2d]*[a-z0-9]:(?:[\x01-\x08\x0b\x0c\x0e-\x1f\x21-\x5a\x53-\x7f]|\\[\x01-\x09\x0b\x0c\x0e-\x7f])+)\])$"#;
51
/// Match ISO 8601 duration strings.
/// Format: P[n]Y[n]M[n]DT[n]H[n]M[n]S where P is the duration designator,
/// T separates date and time components, and each component is optional.
/// Examples: "P23DT23H", "P4Y", "P1Y2M3DT4H5M6S"
/// NOTE(review): since every component is optional, a lone "P" also
/// satisfies this pattern — confirm that is acceptable.
const ISO8601_DURATION_RE: &str = r"^P(?:\d+(?:\.\d+)?Y)?(?:\d+(?:\.\d+)?M)?(?:\d+(?:\.\d+)?W)?(?:\d+(?:\.\d+)?D)?(?:T(?:\d+(?:\.\d+)?H)?(?:\d+(?:\.\d+)?M)?(?:\d+(?:\.\d+)?S)?)?$";
57
58fn tokenize_re<'a>(word: &'a str) -> Option<Cow<'a, str>> {
59    static MAPPING: &[(&str, &str)] = &[
60        (IP_RE, "{ipaddr}"),
61        (BASE64_RE, "{base64}"),
62        (EMAIL_RE, "{email}"),
63    ];
64    static SET: LazyLock<RegexSet> = LazyLock::new(|| {
65        RegexSetBuilder::new(MAPPING.iter().map(|(re, _label)| re))
66            .build()
67            .unwrap()
68    });
69
70    let matching_idx: usize = SET.matches(word).into_iter().next()?;
71
72    Some(Cow::Borrowed(MAPPING[matching_idx].1))
73}
74
/// Tokenize things that look a bit like some kind of hash that are
/// not otherwise matchable as base64 or a uuid.
/// Only words of 8 or more characters are considered, composed solely of
/// ASCII alphanumerics plus the punctuation `-`, `.` and `_`, and
/// containing more than two alphabetic and more than two numeric
/// characters to be considered hashy enough.
/// You might wonder why we can't encode this as a regex; the answer
/// is that the regex crate doesn't support the lookaround assertions
/// required to prevent this from matching dictionary words or simple
/// numbers, and the fancy-regex crate, which does support those
/// assertions, doesn't support the regex set builder.
fn tokenize_hash<'a>(word: &'a str) -> Option<Cow<'a, str>> {
    if word.len() < 8 {
        return None;
    }

    let mut alpha = 0usize;
    let mut digit = 0usize;

    for ch in word.chars() {
        match ch {
            'a'..='z' | 'A'..='Z' => alpha += 1,
            '0'..='9' => digit += 1,
            // Allowed separators; counted as neither class
            '-' | '.' | '_' => {}
            // Anything else means this is not hash-y
            _ => return None,
        }
    }

    (alpha > 2 && digit > 2).then(|| Cow::Borrowed("{hash}"))
}
113
114/// Look for `something=token` and replace the RHS with a token.
115/// This recurses on the RHS of the equals sign.
116fn tokenize_compound<'a>(word: &'a str) -> Option<Cow<'a, str>> {
117    if let Some((lhs, rhs)) = word.split_once('=') {
118        let tokenized = normalize_word(rhs)?;
119        Some(format!("{lhs}={tokenized}").into())
120    } else {
121        None
122    }
123}
124
125/// Preprocess the input string to replace duration strings with a placeholder.
126/// Duration strings are sequences like "11s 999ms 990us 55ns".
127/// This must happen before splitting by whitespace since the duration itself
128/// contains whitespace.
129/// We use a placeholder without special characters so the bracket processing doesn't
130/// strip braces from it.
131fn preprocess_duration<'a>(s: &'a str) -> Cow<'a, str> {
132    // Pattern: number followed by time unit, optionally repeated with whitespace
133    // Units: ns, us, ms, s, m, h, day, month, year
134    // This regex matches sequences like "11s 999ms 990us 55ns"
135    static RE: LazyLock<regex::Regex> = LazyLock::new(|| {
136        // Match: number + unit, optionally followed by (whitespace + number + unit) repeated
137        // Example: "11s 999ms 990us 55ns"
138        // Word boundaries ensure we don't match "70s" inside "70si756..." or "abc11s"
139        let pattern =
140            r"\b\d+(?:ns|us|ms|s|m|h|day|month|year)(?:\s+\d+(?:ns|us|ms|s|m|h|day|month|year))*\b";
141        regex::Regex::new(pattern).unwrap()
142    });
143
144    // Replace all duration matches with a simple placeholder
145    let result = RE.replace_all(s, "__DURATION__");
146    Cow::Owned(result.into_owned())
147}
148
149/// Tokenize duration placeholders like "__DURATION__" to {duration}
150/// Also recognizes ISO 8601 duration strings (e.g., "P23DT23H", "P4Y", "P1Y2M3DT4H5M6S")
151fn tokenize_duration<'a>(word: &'a str) -> Option<Cow<'a, str>> {
152    if word == "__DURATION__" {
153        Some(Cow::Borrowed("{duration}"))
154    } else if word.starts_with('P') {
155        // ISO 8601 duration format: P[n]Y[n]M[n]DT[n]H[n]M[n]S
156        // T separates date and time components
157        // Check if it matches the ISO 8601 pattern
158        static ISO8601_RE: LazyLock<regex::Regex> =
159            LazyLock::new(|| regex::Regex::new(ISO8601_DURATION_RE).unwrap());
160        if ISO8601_RE.is_match(word) {
161            Some(Cow::Borrowed("{duration}"))
162        } else {
163            None
164        }
165    } else {
166        None
167    }
168}
169
// The tokenizer table consulted by normalize_word: entries are tried in
// order and the first one to return Some wins, so more specific/cheaper
// recognizers come earlier.
//
// Annotated here with bench throughput on my 7965WX.
// Overall is 290 MiB/s 1.4us. Contrast with NOP (empty table)
// throughput of 1.6GiB/s 244ns.
// The numbers next to the entries below are the throughput
// when just that particular item is enabled.
const FUNCS: &[Normalizer] = &[
    // Should always be first
    tokenize_dictionary_word_phf, // 1.4266 GiB/s 272ns
    tokenize_duration,            // duration placeholder
    tokenize_timestamp_3339,      // 1017MiB/s 392ns
    tokenize_uuid,                // 1.13GiB/s 342ns
    tokenize_hash,                // 1.19GiB/s 325ns
    tokenize_re,                  // 233MiB/s 1.7us
    // Should always be last
    tokenize_compound,
];
186
187fn normalize_word<'a>(word: &'a str) -> Option<Cow<'a, str>> {
188    for func in FUNCS {
189        let res = (func)(word);
190        if res.is_some() {
191            return res;
192        }
193    }
194    None
195}
196
197pub fn normalize(s: &str) -> String {
198    let mut result = String::with_capacity(s.len());
199
200    // Preprocess to replace duration strings before splitting
201    // We need to do this first because duration strings contain whitespace
202    let s = match preprocess_duration(s) {
203        Cow::Borrowed(b) => b.to_string(),
204        Cow::Owned(o) => o,
205    };
206
207    let mut processed;
208
209    // pre-process to remove parenthetical delimited sequences and replace
210    // that punctuation with whitespace.  That allows the tokenizer to
211    // see more tokens and do a better job, without harming the prose
212    // in the response text.
213    // Do a quick test to see if any opening parens are present so that
214    // we can avoid allocating an additional string in the more common case
215    // where they are not present.
216    //
217    // This transforms eg: " [" -> " " and "] " -> " ",
218    // for each ASCII bracket character.
219    //
220    // To spell that out a bit more clearly, this transformation has the
221    // side effect of changing " (RFC5322) " into " RFC5322 "
222    // in the normalized output.
223    let needs_process = memchr::memchr3(b'[', b'(', b'{', s.as_bytes()).is_some();
224    let s = if needs_process {
225        processed = String::with_capacity(s.len());
226        let mut iter = s.chars().peekable();
227        while let Some(c) = iter.next() {
228            if (c.is_ascii_whitespace() || processed.is_empty())
229                && matches!(iter.peek(), Some('[' | '(' | '{'))
230            {
231                iter.next();
232                processed.push(' ');
233                continue;
234            }
235
236            if matches!(c, ']' | ')' | '}')
237                && iter
238                    .peek()
239                    .map(|c| c.is_ascii_whitespace() || c.is_ascii_punctuation())
240                    .unwrap_or(true)
241            {
242                iter.next();
243                processed.push(' ');
244                continue;
245            }
246
247            processed.push(c);
248        }
249        &processed
250    } else {
251        &s
252    };
253
254    for word in s.split_ascii_whitespace() {
255        let word = match normalize_word(word) {
256            Some(tokenized) => tokenized,
257            None => Cow::Borrowed(word),
258        };
259
260        // Collapse runs of 1+ spaces (implied between the split iter)
261        // into a single space character
262        if !result.is_empty() {
263            result.push(' ');
264        }
265        result.push_str(&word);
266    }
267
268    result
269}
270
/// Expose [`normalize`] to Lua policy code as `normalize_smtp_response`
/// on the `string` sub-module.
#[cfg(feature = "lua")]
pub fn register(lua: &Lua) -> anyhow::Result<()> {
    let string_mod = get_or_create_sub_module(lua, "string")?;

    string_mod.set(
        "normalize_smtp_response",
        lua.create_function(move |_, text: String| Ok(normalize(&text)))?,
    )?;
    Ok(())
}
281
#[cfg(test)]
mod tests {
    use super::*;

    /// Table-driven check of normalize() across the supported token
    /// classes (timestamp, uuid, base64, ipaddr, email, duration, hash),
    /// plus bracket stripping and whitespace collapsing on realistic
    /// SMTP response texts.
    #[test]
    fn various() {
        // (input, expected normalized output)
        const CASES: &[(&str, &str)] = &[
            (
                "retry again at 2025-11-06T17:11:34.261306612Z",
                "retry again at {timestamp}",
            ),
            (
                "a uuid 10aa5da5-3f3b-4176-beb9-32875830f082",
                "a uuid {uuid}",
            ),
            ("aGVsbG8uCg==", "{base64}"),
            ("aGVsbG8K", "{base64}"),
            ("aGVsbG8K aGVsbG8K", "{base64} {base64}"),
            ("hello aGVsbG8uCg==", "hello {base64}"),
            ("hello", "hello"),
            ("hello aGVsbG8K", "hello {base64}"),
            (
                "421 4.1.0 10.0.0.1 throttled try later",
                "421 4.1.0 {ipaddr} throttled try later",
            ),
            (
                "421 4.1.0 ::1 throttled try later",
                "421 4.1.0 {ipaddr} throttled try later",
            ),
            (
                "Accepting connection from 42.69.10.20:25",
                "Accepting connection from {ipaddr}",
            ),
            ("duration 00:10:34", "duration 00:10:34"),
            (
                "rejecting mail for some.body@gmail.com",
                "rejecting mail for {email}",
            ),
            (
                "Your email has been rate limited because the From: header (RFC5322) in this message isn't aligned with either the authenticated SPF or DKIM organizational domain. To learn more about DMARC alignment, visit  https://support.google.com/a?p=dmarc-alignment  To learn more about Gmail requirements for bulk senders, visit  https://support.google.com/a?p=sender-guidelines. a640c23a62f3a-ab67626ed70si756442266b.465 - gsmtp",
                "Your email has been rate limited because the From: header RFC5322 in this message isn't aligned with either the authenticated SPF or DKIM organizational domain. To learn more about DMARC alignment, visit https://support.google.com/a?p=dmarc-alignment To learn more about Gmail requirements for bulk senders, visit https://support.google.com/a?p=sender-guidelines. {hash} - gsmtp",
            ),
            (
                "550 5.1.1 The email account that you tried to reach does not exist. Please try double-checking the recipient's email address for typos or unnecessary spaces. For more information, go to  https://support.google.com/mail/?p=NoSuchUser 41be03b00d2f7-b93bf44f0c0si6882731a12.803 - gsmtp",
                "550 5.1.1 The email account that you tried to reach does not exist. Please try double-checking the recipient's email address for typos or unnecessary spaces. For more information, go to https://support.google.com/mail/?p=NoSuchUser {hash} - gsmtp",
            ),
            ("OK ids=8a5475ccbbc611eda12250ebf67f93bd", "OK ids={uuid}"),
            (
                "550 Mail is rejected by recipients [aGVsbG8uCg== IP: 10.10.10.10]. https://service.mail.qq.com/detail/0/92.",
                "550 Mail is rejected by recipients {base64} IP: {ipaddr} https://service.mail.qq.com/detail/0/92.",
            ),
            (
                "Context: DispatcherDrop. Next due in 11s 999ms 990us 55ns at 2026-04-05T07:34:04.198063031Z",
                "Context: DispatcherDrop. Next due in {duration} at {timestamp}",
            ),
            ("P23DT23H", "{duration}"),
            ("P4Y", "{duration}"),
            ("P1Y2M3DT4H5M6S", "{duration}"),
            ("P1Y2M3DT4H5M6Shello", "{hash}"),
            ("abc11s 999ms", "abc11s {duration}"),
            ("2year", "{duration}"),
            ("1month", "{duration}"),
            ("3day", "{duration}"),
            ("5h 30m", "{duration}"),
            ("2yearhello", "2yearhello"),
        ];

        for (input, expected_output) in CASES {
            let output = normalize(input);

            k9::assert_equal!(output, *expected_output, "input={input}");
        }
    }
}