mod_smtp_response_normalize/
lib.rs

1use chrono::DateTime;
2use config::get_or_create_sub_module;
3use mlua::Lua;
4use regex::{RegexSet, RegexSetBuilder};
5use std::borrow::Cow;
6use std::sync::LazyLock;
7use uuid::Uuid;
8mod dict;
9
/// Signature shared by every word tokenizer in this module: given a single
/// whitespace-delimited word, return `Some(token)` (either a borrowed
/// placeholder like `{uuid}` or a rewritten word) when it matches, or
/// `None` to let the next tokenizer in `FUNCS` try.
type Normalizer = for<'a> fn(word: &'a str) -> Option<Cow<'a, str>>;
11
12fn tokenize_timestamp_3339<'a>(word: &'a str) -> Option<Cow<'a, str>> {
13    DateTime::parse_from_rfc3339(word)
14        .ok()
15        .map(|_| Cow::Borrowed("{timestamp}"))
16}
17
18fn tokenize_uuid<'a>(word: &'a str) -> Option<Cow<'a, str>> {
19    Uuid::try_parse(word).ok().map(|_| Cow::Borrowed("{uuid}"))
20}
21
22/// A number of dictionary words are technically valid base64 (eg: "duration")
23/// and we don't want them to be flagged as base64.  This recognizes
24/// a dictionary word and returns that word as the token, preventing
25/// further tokenization
26fn tokenize_dictionary_word_phf<'a>(word: &'a str) -> Option<Cow<'a, str>> {
27    if crate::dict::DICT.contains(word) {
28        Some(Cow::Borrowed(word))
29    } else {
30        None
31    }
32}
33
/// Match either base64 or base64-url: runs of 4 characters from the
/// combined alphabet (`+/` for standard, `-_` for url-safe), with
/// optional trailing `=`/`==` padding.
///
/// Note that any alphanumeric word whose length is a multiple of 4 also
/// matches; `tokenize_dictionary_word_phf` runs first (see FUNCS) to
/// keep ordinary words from being flagged.
const BASE64_RE: &str =
    r"^(?:[a-zA-Z0-9+/_\-]{4})+(?:[a-zA-Z0-9+/_\-]{2}==|[a-zA-Z0-9+/_\-]{3}=)?$";
37
/// Match ipv4 or ipv6 addresses, followed by optional :port.
/// This doesn't do anything about the ipv6 .port syntax.
/// ipv6 portion of this is taken from
/// <https://stackoverflow.com/a/17871737/149111>
///
/// The trailing port group is `(?::\d{1,5})?` — a non-capturing group
/// for a single `:` plus up to five digits. (It was previously written
/// `(:?:\d{1,5})?`, the classic `(?:` transposition: a capturing group
/// whose leading `:?` made a *second* colon optional, so strings like
/// `1.2.3.4::25` were accepted too.)
const IP_RE: &str = r"^(?:\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}|(?:[0-9a-fA-F]{1,4}:){7,7}[0-9a-fA-F]{1,4}|(?:[0-9a-fA-F]{1,4}:){1,7}:|(?:[0-9a-fA-F]{1,4}:){1,6}:[0-9a-fA-F]{1,4}|(?:[0-9a-fA-F]{1,4}:){1,5}(?::[0-9a-fA-F]{1,4}){1,2}|(?:[0-9a-fA-F]{1,4}:){1,4}(?::[0-9a-fA-F]{1,4}){1,3}|(?:[0-9a-fA-F]{1,4}:){1,3}(?::[0-9a-fA-F]{1,4}){1,4}|(?:[0-9a-fA-F]{1,4}:){1,2}(?::[0-9a-fA-F]{1,4}){1,5}|[0-9a-fA-F]{1,4}:(?:(?::[0-9a-fA-F]{1,4}){1,6})|:(?:(?::[0-9a-fA-F]{1,4}){1,7}|:)|fe80:(?::[0-9a-fA-F]{0,4}){0,4}%[0-9a-zA-Z]{1,}|::(?:ffff(?::0{1,4}){0,1}:){0,1}(?:(?:25[0-5]|(?:2[0-4]|1{0,1}[0-9]){0,1}[0-9])\.){3,3}(?:25[0-5]|(?:2[0-4]|1{0,1}[0-9]){0,1}[0-9])|(?:[0-9a-fA-F]{1,4}:){1,4}:(?:(?:25[0-5]|(?:2[0-4]|1{0,1}[0-9]){0,1}[0-9])\.){3,3}(?:25[0-5]|(?:2[0-4]|1{0,1}[0-9]){0,1}[0-9]))(?::\d{1,5})?$";
43
/// Match email addresses.
/// The complicated regex here outperforms the more simplistic and
/// obvious regex that you might otherwise be inclined to write.
/// <https://stackoverflow.com/a/201378/149111>
///
/// Kept verbatim from the upstream answer, including its quirks (eg the
/// overlapping `\x21-\x5a\x53-\x7f` ranges in the quoted-string and
/// IP-literal branches) — NOTE(review): confirm against upstream before
/// "fixing" anything here.
const EMAIL_RE: &str = r#"^(?:[a-z0-9!#$%&'*+\x2f=?^_`\x7b-\x7d~\x2d]+(?:\.[a-z0-9!#$%&'*+\x2f=?^_`\x7b-\x7d~\x2d]+)*|"(?:[\x01-\x08\x0b\x0c\x0e-\x1f\x21\x23-\x5b\x5d-\x7f]|\\[\x01-\x09\x0b\x0c\x0e-\x7f])*")@(?:(?:[a-z0-9](?:[a-z0-9\x2d]*[a-z0-9])?\.)+[a-z0-9](?:[a-z0-9\x2d]*[a-z0-9])?|\[(?:(?:(?:2(?:5[0-5]|[0-4][0-9])|1[0-9][0-9]|[1-9]?[0-9]))\.){3}(?:(?:2(?:5[0-5]|[0-4][0-9])|1[0-9][0-9]|[1-9]?[0-9])|[a-z0-9\x2d]*[a-z0-9]:(?:[\x01-\x08\x0b\x0c\x0e-\x1f\x21-\x5a\x53-\x7f]|\\[\x01-\x09\x0b\x0c\x0e-\x7f])+)\])$"#;
49
50fn tokenize_re<'a>(word: &'a str) -> Option<Cow<'a, str>> {
51    static MAPPING: &[(&str, &str)] = &[
52        (IP_RE, "{ipaddr}"),
53        (BASE64_RE, "{base64}"),
54        (EMAIL_RE, "{email}"),
55    ];
56    static SET: LazyLock<RegexSet> = LazyLock::new(|| {
57        RegexSetBuilder::new(MAPPING.iter().map(|(re, _label)| re))
58            .build()
59            .unwrap()
60    });
61
62    let matching_idx: usize = SET.matches(word).into_iter().next()?;
63
64    Some(Cow::Borrowed(MAPPING[matching_idx].1))
65}
66
/// Tokenize things that look a bit like some kind of hash that are
/// not otherwise matchable as base64 or a uuid.
/// We only consider words that are 8 or more bytes long, made up solely
/// of alphanumerics and the punctuation `-._`, containing more than two
/// alpha AND more than two digit characters (ie: at least three of each).
/// You might wonder why we can't encode this as a regex; the answer
/// is that the regex crate doesn't support the lookaround assertions
/// required to prevent this from matching dictionary words or simple
/// numbers, and the fancy-regex crate, which does support those
/// assertions, doesn't support the regex set builder.
fn tokenize_hash<'a>(word: &'a str) -> Option<Cow<'a, str>> {
    if word.len() < 8 {
        return None;
    }

    let (mut alpha, mut digit) = (0usize, 0usize);

    for ch in word.chars() {
        match ch {
            'a'..='z' | 'A'..='Z' => alpha += 1,
            '0'..='9' => digit += 1,
            // Permitted separators; counted as neither alpha nor digit.
            '-' | '.' | '_' => {}
            // Anything else means this isn't hash-y.
            _ => return None,
        }
    }

    (alpha > 2 && digit > 2).then(|| Cow::Borrowed("{hash}"))
}
105
106/// Look for `something=token` and replace the RHS with a token.
107/// This recurses on the RHS of the equals sign.
108fn tokenize_compound<'a>(word: &'a str) -> Option<Cow<'a, str>> {
109    if let Some((lhs, rhs)) = word.split_once('=') {
110        let tokenized = normalize_word(rhs)?;
111        Some(format!("{lhs}={tokenized}").into())
112    } else {
113        None
114    }
115}
116
// Annotated here with bench throughput on my 7965WX.
// Overall is 290 MiB/s 1.4us. Contrast with NOP (empty table)
// throughput of 1.6GiB/s 244ns.
// The numbers next to the entries below are the throughput
// when just that particular item is enabled.
//
// Ordering is significant: normalize_word tries these in sequence and
// the first tokenizer to return Some wins.
const FUNCS: &[Normalizer] = &[
    // Should always be first: keeps dictionary words (eg: "duration")
    // from being misclassified by the pattern matchers below.
    tokenize_dictionary_word_phf, // 1.4266 GiB/s 272ns
    tokenize_timestamp_3339,      // 1017MiB/s 392ns
    tokenize_uuid,                // 1.13GiB/s 342ns
    tokenize_hash,                // 1.19GiB/s 325ns
    tokenize_re,                  // 233MiB/s 1.7us
    // Should always be last: it recurses into normalize_word for the
    // RHS of `name=value` words, so everything above must be registered.
    tokenize_compound,
];
132
133fn normalize_word<'a>(word: &'a str) -> Option<Cow<'a, str>> {
134    for func in FUNCS {
135        let res = (func)(word);
136        if res.is_some() {
137            return res;
138        }
139    }
140    None
141}
142
/// Normalize an SMTP response string: whitespace runs collapse to a
/// single space, bracket characters adjacent to whitespace are stripped,
/// and words recognized by the tokenizers in FUNCS are replaced by
/// placeholder tokens such as `{timestamp}`, `{uuid}`, `{base64}`,
/// `{hash}`, `{ipaddr}` and `{email}`.
pub fn normalize(s: &str) -> String {
    let mut result = String::with_capacity(s.len());

    // Holds the bracket-stripped copy of `s` when one is needed;
    // declared out here so the reborrowed `s` below can outlive the if.
    let mut processed;

    // pre-process to remove parenthetical delimited sequences and replace
    // that punctuation with whitespace.  That allows the tokenizer to
    // see more tokens and do a better job, without harming the prose
    // in the response text.
    // Do a quick test to see if any opening parens are present so that
    // we can avoid allocating an additional string in the more common case
    // where they are not present.
    //
    // This transforms eg: " [" -> " " and "] " -> " ",
    // for each ASCII bracket character.
    //
    // To spell that out a bit more clearly, this transformation has the
    // side effect of changing " (RFC5322) " into " RFC5322 "
    // in the normalized output.
    let needs_process = memchr::memchr3(b'[', b'(', b'{', s.as_bytes()).is_some();
    let s = if needs_process {
        processed = String::with_capacity(s.len());
        let mut iter = s.chars().peekable();
        while let Some(c) = iter.next() {
            // Opening bracket case: the current char (whitespace, or
            // whatever precedes the first emitted char) and the bracket
            // that follows it collapse into a single space.
            // NOTE(review): when `processed` is empty and `c` is a
            // non-whitespace char followed by a bracket (eg: "x(y"),
            // `c` itself is dropped here — confirm that is intended.
            if (c.is_ascii_whitespace() || processed.is_empty())
                && matches!(iter.peek(), Some('[' | '(' | '{'))
            {
                iter.next();
                processed.push(' ');
                continue;
            }

            // Closing bracket case: a `]`, `)` or `}` followed by
            // whitespace, punctuation, or end-of-string collapses —
            // together with that following char — into a single space.
            // Trailing punctuation such as "]." is intentionally
            // consumed (see the qq.com case in the tests below).
            if matches!(c, ']' | ')' | '}')
                && iter
                    .peek()
                    .map(|c| c.is_ascii_whitespace() || c.is_ascii_punctuation())
                    .unwrap_or(true)
            {
                iter.next();
                processed.push(' ');
                continue;
            }

            processed.push(c);
        }
        &processed
    } else {
        s
    };

    // Tokenize word by word; words no tokenizer recognizes pass through
    // unchanged.
    for word in s.split_ascii_whitespace() {
        let word = match normalize_word(word) {
            Some(tokenized) => tokenized,
            None => Cow::Borrowed(word),
        };

        // Collapse runs of 1+ spaces (implied between the split iter)
        // into a single space character
        if !result.is_empty() {
            result.push(' ');
        }
        result.push_str(&word);
    }

    result
}
209
210pub fn register(lua: &Lua) -> anyhow::Result<()> {
211    let string_mod = get_or_create_sub_module(lua, "string")?;
212
213    string_mod.set(
214        "normalize_smtp_response",
215        lua.create_function(move |_, text: String| Ok(normalize(&text)))?,
216    )?;
217    Ok(())
218}
219
#[cfg(test)]
mod tests {
    use super::*;

    /// End-to-end table test over `normalize`: each entry pairs a raw
    /// SMTP response with its expected normalized form, covering
    /// timestamps, uuids, base64, ip addresses (with and without port),
    /// email addresses, hash-like ids, dictionary-word protection
    /// ("duration 00:10:34"), and bracket stripping.
    #[test]
    fn various() {
        const CASES: &[(&str, &str)] = &[
            (
                "retry again at 2025-11-06T17:11:34.261306612Z",
                "retry again at {timestamp}",
            ),
            (
                "a uuid 10aa5da5-3f3b-4176-beb9-32875830f082",
                "a uuid {uuid}",
            ),
            ("aGVsbG8uCg==", "{base64}"),
            ("aGVsbG8K", "{base64}"),
            ("aGVsbG8K aGVsbG8K", "{base64} {base64}"),
            ("hello aGVsbG8uCg==", "hello {base64}"),
            ("hello", "hello"),
            ("hello aGVsbG8K", "hello {base64}"),
            (
                "421 4.1.0 10.0.0.1 throttled try later",
                "421 4.1.0 {ipaddr} throttled try later",
            ),
            (
                "421 4.1.0 ::1 throttled try later",
                "421 4.1.0 {ipaddr} throttled try later",
            ),
            (
                "Accepting connection from 42.69.10.20:25",
                "Accepting connection from {ipaddr}",
            ),
            // "duration" is a dictionary word so it must NOT be flagged
            // as base64; "00:10:34" matches no tokenizer.
            ("duration 00:10:34", "duration 00:10:34"),
            (
                "rejecting mail for some.body@gmail.com",
                "rejecting mail for {email}",
            ),
            (
                "Your email has been rate limited because the From: header (RFC5322) in this message isn't aligned with either the authenticated SPF or DKIM organizational domain. To learn more about DMARC alignment, visit  https://support.google.com/a?p=dmarc-alignment  To learn more about Gmail requirements for bulk senders, visit  https://support.google.com/a?p=sender-guidelines. a640c23a62f3a-ab67626ed70si756442266b.465 - gsmtp",
                "Your email has been rate limited because the From: header RFC5322 in this message isn't aligned with either the authenticated SPF or DKIM organizational domain. To learn more about DMARC alignment, visit https://support.google.com/a?p=dmarc-alignment To learn more about Gmail requirements for bulk senders, visit https://support.google.com/a?p=sender-guidelines. {hash} - gsmtp",
            ),
            (
                "550 5.1.1 The email account that you tried to reach does not exist. Please try double-checking the recipient's email address for typos or unnecessary spaces. For more information, go to  https://support.google.com/mail/?p=NoSuchUser 41be03b00d2f7-b93bf44f0c0si6882731a12.803 - gsmtp",
                "550 5.1.1 The email account that you tried to reach does not exist. Please try double-checking the recipient's email address for typos or unnecessary spaces. For more information, go to https://support.google.com/mail/?p=NoSuchUser {hash} - gsmtp",
            ),
            // tokenize_compound: only the RHS of `ids=...` is tokenized.
            ("OK ids=8a5475ccbbc611eda12250ebf67f93bd", "OK ids={uuid}"),
            // Bracket stripping: "[" after a space and "]." both collapse
            // to a single space before tokenization.
            (
                "550 Mail is rejected by recipients [aGVsbG8uCg== IP: 10.10.10.10]. https://service.mail.qq.com/detail/0/92.",
                "550 Mail is rejected by recipients {base64} IP: {ipaddr} https://service.mail.qq.com/detail/0/92.",
            ),
        ];

        for (input, expected_output) in CASES {
            let output = normalize(input);

            k9::assert_equal!(output, *expected_output, "input={input}");
        }
    }
}