mod_smtp_response_normalize/
lib.rs1use chrono::DateTime;
2use config::get_or_create_sub_module;
3use mlua::Lua;
4use regex::{RegexSet, RegexSetBuilder};
5use std::borrow::Cow;
6use std::sync::LazyLock;
7use uuid::Uuid;
8mod dict;
9
/// Signature shared by every per-word normalizer.
///
/// A normalizer inspects a single word and returns `Some(replacement)` when
/// it recognises the word, or `None` to let another normalizer try. The
/// replacement may borrow from the input word or be a freshly built string.
type Normalizer = for<'a> fn(&'a str) -> Option<Cow<'a, str>>;
11
12fn tokenize_timestamp_3339<'a>(word: &'a str) -> Option<Cow<'a, str>> {
13 DateTime::parse_from_rfc3339(word)
14 .ok()
15 .map(|_| Cow::Borrowed("{timestamp}"))
16}
17
18fn tokenize_uuid<'a>(word: &'a str) -> Option<Cow<'a, str>> {
19 Uuid::try_parse(word).ok().map(|_| Cow::Borrowed("{uuid}"))
20}
21
22fn tokenize_dictionary_word_phf<'a>(word: &'a str) -> Option<Cow<'a, str>> {
27 if crate::dict::DICT.contains(word) {
28 Some(Cow::Borrowed(word))
29 } else {
30 None
31 }
32}
33
/// Anchored pattern for a word that looks like base64 data.
///
/// Accepts both the standard (`+`, `/`) and URL-safe (`-`, `_`) alphabets.
/// The word must contain at least one full four-character group, optionally
/// followed by a final group padded with `=` or `==`.
const BASE64_RE: &str = concat!(
    // One or more complete 4-character groups.
    r"^(?:[a-zA-Z0-9+/_\-]{4})+",
    // Optional trailing group carrying one or two padding characters.
    r"(?:[a-zA-Z0-9+/_\-]{2}==|[a-zA-Z0-9+/_\-]{3}=)?$",
);
37
/// Anchored pattern for a word that is an IPv4 or IPv6 address, optionally
/// followed by a `:port` suffix of one to five digits.
///
/// The IPv4 alternative only checks the dotted-quad shape (one to three
/// digits per octet); it does not range-check the octets. The IPv6
/// alternatives cover the full eight-group form, the `::`-compressed forms,
/// `fe80:` link-local addresses with a `%zone` suffix, and addresses that
/// embed a dotted-quad IPv4 tail.
///
/// The port suffix is a non-capturing `(?::\d{1,5})?` group, so exactly one
/// colon separates the address from the port.
const IP_RE: &str = concat!(
    r"^(?:",
    // Dotted-quad IPv4.
    r"\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}",
    // Full, uncompressed IPv6 (eight groups).
    r"|(?:[0-9a-fA-F]{1,4}:){7,7}[0-9a-fA-F]{1,4}",
    // IPv6 with a `::` compression at each possible position.
    r"|(?:[0-9a-fA-F]{1,4}:){1,7}:",
    r"|(?:[0-9a-fA-F]{1,4}:){1,6}:[0-9a-fA-F]{1,4}",
    r"|(?:[0-9a-fA-F]{1,4}:){1,5}(?::[0-9a-fA-F]{1,4}){1,2}",
    r"|(?:[0-9a-fA-F]{1,4}:){1,4}(?::[0-9a-fA-F]{1,4}){1,3}",
    r"|(?:[0-9a-fA-F]{1,4}:){1,3}(?::[0-9a-fA-F]{1,4}){1,4}",
    r"|(?:[0-9a-fA-F]{1,4}:){1,2}(?::[0-9a-fA-F]{1,4}){1,5}",
    r"|[0-9a-fA-F]{1,4}:(?:(?::[0-9a-fA-F]{1,4}){1,6})",
    r"|:(?:(?::[0-9a-fA-F]{1,4}){1,7}|:)",
    // `fe80:` link-local address with a `%zone` suffix.
    r"|fe80:(?::[0-9a-fA-F]{0,4}){0,4}%[0-9a-zA-Z]{1,}",
    // IPv6 forms that end in a dotted-quad IPv4 address.
    r"|::(?:ffff(?::0{1,4}){0,1}:){0,1}(?:(?:25[0-5]|(?:2[0-4]|1{0,1}[0-9]){0,1}[0-9])\.){3,3}(?:25[0-5]|(?:2[0-4]|1{0,1}[0-9]){0,1}[0-9])",
    r"|(?:[0-9a-fA-F]{1,4}:){1,4}:(?:(?:25[0-5]|(?:2[0-4]|1{0,1}[0-9]){0,1}[0-9])\.){3,3}(?:25[0-5]|(?:2[0-4]|1{0,1}[0-9]){0,1}[0-9])",
    r")",
    // Optional `:port`.
    r"(?::\d{1,5})?$",
);
43
/// Anchored pattern for a word that is an email address.
///
/// The local part is either a dot-separated atom or a double-quoted string;
/// the domain is either a dotted host name or a bracketed address literal.
/// The character classes list lowercase letters only, so the pattern is
/// case-sensitive unless it is compiled with a case-insensitive flag.
const EMAIL_RE: &str = concat!(
    r#"^(?:"#,
    // Local part, unquoted: atoms separated by single dots.
    r#"[a-z0-9!#$%&'*+\x2f=?^_`\x7b-\x7d~\x2d]+(?:\.[a-z0-9!#$%&'*+\x2f=?^_`\x7b-\x7d~\x2d]+)*"#,
    // Local part, quoted: double-quoted string with backslash escapes.
    r#"|"(?:[\x01-\x08\x0b\x0c\x0e-\x1f\x21\x23-\x5b\x5d-\x7f]|\\[\x01-\x09\x0b\x0c\x0e-\x7f])*""#,
    r#")@(?:"#,
    // Domain as dot-separated labels.
    r#"(?:[a-z0-9](?:[a-z0-9\x2d]*[a-z0-9])?\.)+[a-z0-9](?:[a-z0-9\x2d]*[a-z0-9])?"#,
    // Domain as a bracketed address literal.
    r#"|\[(?:(?:(?:2(?:5[0-5]|[0-4][0-9])|1[0-9][0-9]|[1-9]?[0-9]))\.){3}(?:(?:2(?:5[0-5]|[0-4][0-9])|1[0-9][0-9]|[1-9]?[0-9])|[a-z0-9\x2d]*[a-z0-9]:(?:[\x01-\x08\x0b\x0c\x0e-\x1f\x21-\x5a\x53-\x7f]|\\[\x01-\x09\x0b\x0c\x0e-\x7f])+)\]"#,
    r#")$"#,
);
49
50fn tokenize_re<'a>(word: &'a str) -> Option<Cow<'a, str>> {
51 static MAPPING: &[(&str, &str)] = &[
52 (IP_RE, "{ipaddr}"),
53 (BASE64_RE, "{base64}"),
54 (EMAIL_RE, "{email}"),
55 ];
56 static SET: LazyLock<RegexSet> = LazyLock::new(|| {
57 RegexSetBuilder::new(MAPPING.iter().map(|(re, _label)| re))
58 .build()
59 .unwrap()
60 });
61
62 let matching_idx: usize = SET.matches(word).into_iter().next()?;
63
64 Some(Cow::Borrowed(MAPPING[matching_idx].1))
65}
66
/// Replaces a word with the `{hash}` placeholder when it looks like an
/// opaque identifier.
///
/// A word qualifies when it is at least 8 bytes long, consists solely of
/// ASCII letters, ASCII digits and the separators `-`, `.`, `_`, and
/// contains more than two letters as well as more than two digits.
/// Any other character disqualifies the word immediately.
fn tokenize_hash<'a>(word: &'a str) -> Option<Cow<'a, str>> {
    if word.len() < 8 {
        return None;
    }

    let mut letters = 0;
    let mut digits = 0;

    for ch in word.chars() {
        match ch {
            'a'..='z' | 'A'..='Z' => letters += 1,
            '0'..='9' => digits += 1,
            // Separators are allowed but do not count towards either tally.
            '-' | '.' | '_' => {}
            _ => return None,
        }
    }

    (letters > 2 && digits > 2).then_some(Cow::Borrowed("{hash}"))
}
105
106fn tokenize_compound<'a>(word: &'a str) -> Option<Cow<'a, str>> {
109 if let Some((lhs, rhs)) = word.split_once('=') {
110 let tokenized = normalize_word(rhs)?;
111 Some(format!("{lhs}={tokenized}").into())
112 } else {
113 None
114 }
115}
116
117const FUNCS: &[Normalizer] = &[
123 tokenize_dictionary_word_phf, tokenize_timestamp_3339, tokenize_uuid, tokenize_hash, tokenize_re, tokenize_compound,
131];
132
133fn normalize_word<'a>(word: &'a str) -> Option<Cow<'a, str>> {
134 for func in FUNCS {
135 let res = (func)(word);
136 if res.is_some() {
137 return res;
138 }
139 }
140 None
141}
142
143pub fn normalize(s: &str) -> String {
144 let mut result = String::with_capacity(s.len());
145
146 let mut processed;
147
148 let needs_process = memchr::memchr3(b'[', b'(', b'{', s.as_bytes()).is_some();
163 let s = if needs_process {
164 processed = String::with_capacity(s.len());
165 let mut iter = s.chars().peekable();
166 while let Some(c) = iter.next() {
167 if (c.is_ascii_whitespace() || processed.is_empty())
168 && matches!(iter.peek(), Some('[' | '(' | '{'))
169 {
170 iter.next();
171 processed.push(' ');
172 continue;
173 }
174
175 if matches!(c, ']' | ')' | '}')
176 && iter
177 .peek()
178 .map(|c| c.is_ascii_whitespace() || c.is_ascii_punctuation())
179 .unwrap_or(true)
180 {
181 iter.next();
182 processed.push(' ');
183 continue;
184 }
185
186 processed.push(c);
187 }
188 &processed
189 } else {
190 s
191 };
192
193 for word in s.split_ascii_whitespace() {
194 let word = match normalize_word(word) {
195 Some(tokenized) => tokenized,
196 None => Cow::Borrowed(word),
197 };
198
199 if !result.is_empty() {
202 result.push(' ');
203 }
204 result.push_str(&word);
205 }
206
207 result
208}
209
210pub fn register(lua: &Lua) -> anyhow::Result<()> {
211 let string_mod = get_or_create_sub_module(lua, "string")?;
212
213 string_mod.set(
214 "normalize_smtp_response",
215 lua.create_function(move |_, text: String| Ok(normalize(&text)))?,
216 )?;
217 Ok(())
218}
219
#[cfg(test)]
mod tests {
    use super::*;

    /// Table-driven check: every `(input, expected)` pair is passed through
    /// `normalize` and the output is compared with the expected text.
    #[test]
    fn various() {
        const CASES: &[(&str, &str)] = &[
            // Timestamps and UUIDs.
            (
                "retry again at 2025-11-06T17:11:34.261306612Z",
                "retry again at {timestamp}",
            ),
            (
                "a uuid 10aa5da5-3f3b-4176-beb9-32875830f082",
                "a uuid {uuid}",
            ),
            // Base64, alone and mixed with dictionary words.
            ("aGVsbG8uCg==", "{base64}"),
            ("aGVsbG8K", "{base64}"),
            ("aGVsbG8K aGVsbG8K", "{base64} {base64}"),
            ("hello aGVsbG8uCg==", "hello {base64}"),
            ("hello", "hello"),
            ("hello aGVsbG8K", "hello {base64}"),
            // IP addresses, with and without a port.
            (
                "421 4.1.0 10.0.0.1 throttled try later",
                "421 4.1.0 {ipaddr} throttled try later",
            ),
            (
                "421 4.1.0 ::1 throttled try later",
                "421 4.1.0 {ipaddr} throttled try later",
            ),
            (
                "Accepting connection from 42.69.10.20:25",
                "Accepting connection from {ipaddr}",
            ),
            // A duration must not be mistaken for an address.
            ("duration 00:10:34", "duration 00:10:34"),
            // Email addresses.
            (
                "rejecting mail for some.body@gmail.com",
                "rejecting mail for {email}",
            ),
            // Full provider responses with trailing opaque ids.
            (
                "Your email has been rate limited because the From: header (RFC5322) in this message isn't aligned with either the authenticated SPF or DKIM organizational domain. To learn more about DMARC alignment, visit https://support.google.com/a?p=dmarc-alignment To learn more about Gmail requirements for bulk senders, visit https://support.google.com/a?p=sender-guidelines. a640c23a62f3a-ab67626ed70si756442266b.465 - gsmtp",
                "Your email has been rate limited because the From: header RFC5322 in this message isn't aligned with either the authenticated SPF or DKIM organizational domain. To learn more about DMARC alignment, visit https://support.google.com/a?p=dmarc-alignment To learn more about Gmail requirements for bulk senders, visit https://support.google.com/a?p=sender-guidelines. {hash} - gsmtp",
            ),
            (
                "550 5.1.1 The email account that you tried to reach does not exist. Please try double-checking the recipient's email address for typos or unnecessary spaces. For more information, go to https://support.google.com/mail/?p=NoSuchUser 41be03b00d2f7-b93bf44f0c0si6882731a12.803 - gsmtp",
                "550 5.1.1 The email account that you tried to reach does not exist. Please try double-checking the recipient's email address for typos or unnecessary spaces. For more information, go to https://support.google.com/mail/?p=NoSuchUser {hash} - gsmtp",
            ),
            // key=value compound.
            ("OK ids=8a5475ccbbc611eda12250ebf67f93bd", "OK ids={uuid}"),
            // Bracketed section.
            (
                "550 Mail is rejected by recipients [aGVsbG8uCg== IP: 10.10.10.10]. https://service.mail.qq.com/detail/0/92.",
                "550 Mail is rejected by recipients {base64} IP: {ipaddr} https://service.mail.qq.com/detail/0/92.",
            ),
        ];

        for &(input, expected_output) in CASES {
            let output = normalize(input);

            k9::assert_equal!(output, expected_output, "input={input}");
        }
    }
}