mod_smtp_response_normalize/
lib.rs1use chrono::DateTime;
2use config::get_or_create_sub_module;
3use mlua::Lua;
4use regex::{RegexSet, RegexSetBuilder};
5use std::borrow::Cow;
6use std::sync::LazyLock;
7use uuid::Uuid;
8mod dict;
9
/// Signature shared by every per-word normalizer.
///
/// A normalizer inspects a single whitespace-delimited word and returns
/// `Some(replacement)` when it recognizes the word, or `None` to let the
/// next normalizer have a look. The replacement is a `Cow` so that a
/// normalizer can hand back either a borrowed string or a freshly built one.
type Normalizer = for<'a> fn(word: &'a str) -> Option<Cow<'a, str>>;
11
12fn tokenize_timestamp_3339<'a>(word: &'a str) -> Option<Cow<'a, str>> {
13 DateTime::parse_from_rfc3339(word)
14 .ok()
15 .map(|_| Cow::Borrowed("{timestamp}"))
16}
17
18fn tokenize_uuid<'a>(word: &'a str) -> Option<Cow<'a, str>> {
19 Uuid::try_parse(word).ok().map(|_| Cow::Borrowed("{uuid}"))
20}
21
22fn tokenize_dictionary_word_phf<'a>(word: &'a str) -> Option<Cow<'a, str>> {
27 if crate::dict::DICT.contains(word) {
28 Some(Cow::Borrowed(word))
29 } else {
30 None
31 }
32}
33
/// Anchored pattern for a base64-looking word.
///
/// Matches one or more groups of four characters drawn from the standard
/// and URL-safe base64 alphabets (`A-Z a-z 0-9 + / _ -`), optionally
/// followed by a final padded group (`xx==` or `xxx=`).
///
/// Both groups are non-capturing, written `(?:...)`. The earlier spelling
/// `(:?...)` was a capture group starting with an optional literal `:`,
/// which let colon-separated strings such as `:abcd:efgh` be classified
/// as base64.
const BASE64_RE: &str =
    r"^(?:[a-zA-Z0-9+/_\-]{4})+(?:[a-zA-Z0-9+/_\-]{2}==|[a-zA-Z0-9+/_\-]{3}=)?$";
37
/// Anchored pattern for an IPv4 or IPv6 address with an optional `:port`
/// suffix of one to five digits.
///
/// The outer alternation and the port suffix are non-capturing groups,
/// written `(?:...)`. The earlier spelling `(:?...)` was a capture group
/// starting with an optional literal `:`, which accepted a stray leading
/// colon before a dotted quad (`:10.0.0.1`) and a doubled colon before the
/// port (`10.0.0.1::25`).
const IP_RE: &str = r"^(?:\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}|([0-9a-fA-F]{1,4}:){7,7}[0-9a-fA-F]{1,4}|([0-9a-fA-F]{1,4}:){1,7}:|([0-9a-fA-F]{1,4}:){1,6}:[0-9a-fA-F]{1,4}|([0-9a-fA-F]{1,4}:){1,5}(:[0-9a-fA-F]{1,4}){1,2}|([0-9a-fA-F]{1,4}:){1,4}(:[0-9a-fA-F]{1,4}){1,3}|([0-9a-fA-F]{1,4}:){1,3}(:[0-9a-fA-F]{1,4}){1,4}|([0-9a-fA-F]{1,4}:){1,2}(:[0-9a-fA-F]{1,4}){1,5}|[0-9a-fA-F]{1,4}:((:[0-9a-fA-F]{1,4}){1,6})|:((:[0-9a-fA-F]{1,4}){1,7}|:)|fe80:(:[0-9a-fA-F]{0,4}){0,4}%[0-9a-zA-Z]{1,}|::(ffff(:0{1,4}){0,1}:){0,1}((25[0-5]|(2[0-4]|1{0,1}[0-9]){0,1}[0-9])\.){3,3}(25[0-5]|(2[0-4]|1{0,1}[0-9]){0,1}[0-9])|([0-9a-fA-F]{1,4}:){1,4}:((25[0-5]|(2[0-4]|1{0,1}[0-9]){0,1}[0-9])\.){3,3}(25[0-5]|(2[0-4]|1{0,1}[0-9]){0,1}[0-9]))(?::\d{1,5})?$";
43
/// Anchored pattern for an e-mail address: a dot-atom or quoted-string
/// local part, an `@`, then either a dotted domain name or a bracketed
/// address literal.
///
/// NOTE(review): the character classes list lowercase letters only and no
/// case-insensitive flag appears in this pattern, so addresses containing
/// uppercase letters presumably do not match — confirm whether that is
/// intended.
const EMAIL_RE: &str = r#"^(?:[a-z0-9!#$%&'*+\x2f=?^_`\x7b-\x7d~\x2d]+(?:\.[a-z0-9!#$%&'*+\x2f=?^_`\x7b-\x7d~\x2d]+)*|"(?:[\x01-\x08\x0b\x0c\x0e-\x1f\x21\x23-\x5b\x5d-\x7f]|\\[\x01-\x09\x0b\x0c\x0e-\x7f])*")@(?:(?:[a-z0-9](?:[a-z0-9\x2d]*[a-z0-9])?\.)+[a-z0-9](?:[a-z0-9\x2d]*[a-z0-9])?|\[(?:(?:(2(5[0-5]|[0-4][0-9])|1[0-9][0-9]|[1-9]?[0-9]))\.){3}(?:(2(5[0-5]|[0-4][0-9])|1[0-9][0-9]|[1-9]?[0-9])|[a-z0-9\x2d]*[a-z0-9]:(?:[\x01-\x08\x0b\x0c\x0e-\x1f\x21-\x5a\x53-\x7f]|\\[\x01-\x09\x0b\x0c\x0e-\x7f])+)\])$"#;
49
50fn tokenize_re<'a>(word: &'a str) -> Option<Cow<'a, str>> {
51 static MAPPING: &[(&str, &str)] = &[
52 (IP_RE, "{ipaddr}"),
53 (BASE64_RE, "{base64}"),
54 (EMAIL_RE, "{email}"),
55 ];
56 static SET: LazyLock<RegexSet> = LazyLock::new(|| {
57 RegexSetBuilder::new(MAPPING.iter().map(|(re, _label)| re))
58 .build()
59 .unwrap()
60 });
61
62 let matching_idx: usize = SET.matches(word).into_iter().next()?;
63
64 Some(Cow::Borrowed(MAPPING[matching_idx].1))
65}
66
/// Heuristically recognizes opaque identifiers (hash-like tokens) and
/// replaces them with the `{hash}` placeholder.
///
/// A word qualifies when all of the following hold:
/// * it is at least 8 bytes long,
/// * it consists solely of ASCII letters, ASCII digits, `-`, `.` and `_`,
/// * it contains more than two letters and more than two digits.
///
/// Anything else yields `None`.
fn tokenize_hash<'a>(word: &'a str) -> Option<Cow<'a, str>> {
    if word.len() < 8 {
        return None;
    }

    let mut letters = 0;
    let mut digits = 0;

    for ch in word.chars() {
        match ch {
            'a'..='z' | 'A'..='Z' => letters += 1,
            '0'..='9' => digits += 1,
            // Separators are tolerated but do not count towards either tally.
            '-' | '.' | '_' => {}
            // Any other character disqualifies the whole word.
            _ => return None,
        }
    }

    (letters > 2 && digits > 2).then_some(Cow::Borrowed("{hash}"))
}
105
106fn tokenize_compound<'a>(word: &'a str) -> Option<Cow<'a, str>> {
109 if let Some((lhs, rhs)) = word.split_once('=') {
110 let tokenized = normalize_word(rhs)?;
111 Some(format!("{lhs}={tokenized}").into())
112 } else {
113 None
114 }
115}
116
/// The normalizers tried for each word, in priority order.
///
/// `normalize_word` returns the result of the first entry that yields
/// `Some`, so the order of this list is significant: the dictionary lookup
/// comes first so that known words are kept as-is, and the `key=value`
/// handler comes last so it only sees words nothing else recognized.
const FUNCS: &[Normalizer] = &[
    tokenize_dictionary_word_phf,
    tokenize_timestamp_3339,
    tokenize_uuid,
    tokenize_hash,
    tokenize_re,
    tokenize_compound,
];
132
133fn normalize_word<'a>(word: &'a str) -> Option<Cow<'a, str>> {
134 for func in FUNCS {
135 let res = (func)(word);
136 if res.is_some() {
137 return res;
138 }
139 }
140 None
141}
142
143pub fn normalize(s: &str) -> String {
144 let mut result = String::with_capacity(s.len());
145
146 for word in s.split_ascii_whitespace() {
147 let word = match normalize_word(word) {
148 Some(tokenized) => tokenized,
149 None => Cow::Borrowed(word),
150 };
151
152 if !result.is_empty() {
153 result.push(' ');
154 }
155 result.push_str(&word);
156 }
157
158 result
159}
160
161pub fn register(lua: &Lua) -> anyhow::Result<()> {
162 let string_mod = get_or_create_sub_module(lua, "string")?;
163
164 string_mod.set(
165 "normalize_smtp_response",
166 lua.create_function(move |_, text: String| Ok(normalize(&text)))?,
167 )?;
168 Ok(())
169}
170
#[cfg(test)]
mod tests {
    use super::*;

    /// Table-driven check of `normalize`: each entry pairs an input string
    /// with the exact normalized output expected for it.
    #[test]
    fn various() {
        const CASES: &[(&str, &str)] = &[
            // Timestamps and UUIDs.
            (
                "retry again at 2025-11-06T17:11:34.261306612Z",
                "retry again at {timestamp}",
            ),
            (
                "a uuid 10aa5da5-3f3b-4176-beb9-32875830f082",
                "a uuid {uuid}",
            ),
            // Base64, with and without padding, alone and next to plain words.
            ("aGVsbG8uCg==", "{base64}"),
            ("aGVsbG8K", "{base64}"),
            ("aGVsbG8K aGVsbG8K", "{base64} {base64}"),
            ("hello aGVsbG8uCg==", "hello {base64}"),
            ("hello", "hello"),
            ("hello aGVsbG8K", "hello {base64}"),
            // IP addresses: IPv4, IPv6, and IPv4 with a port.
            (
                "421 4.1.0 10.0.0.1 throttled try later",
                "421 4.1.0 {ipaddr} throttled try later",
            ),
            (
                "421 4.1.0 ::1 throttled try later",
                "421 4.1.0 {ipaddr} throttled try later",
            ),
            (
                "Accepting connection from 42.69.10.20:25",
                "Accepting connection from {ipaddr}",
            ),
            // A colon-separated duration must not be mistaken for an address.
            ("duration 00:10:34", "duration 00:10:34"),
            // E-mail addresses.
            (
                "rejecting mail for some.body@gmail.com",
                "rejecting mail for {email}",
            ),
            // Long responses: only the trailing opaque identifier changes.
            (
                "Your email has been rate limited because the From: header (RFC5322) in this message isn't aligned with either the authenticated SPF or DKIM organizational domain. To learn more about DMARC alignment, visit https://support.google.com/a?p=dmarc-alignment To learn more about Gmail requirements for bulk senders, visit https://support.google.com/a?p=sender-guidelines. a640c23a62f3a-ab67626ed70si756442266b.465 - gsmtp",
                "Your email has been rate limited because the From: header (RFC5322) in this message isn't aligned with either the authenticated SPF or DKIM organizational domain. To learn more about DMARC alignment, visit https://support.google.com/a?p=dmarc-alignment To learn more about Gmail requirements for bulk senders, visit https://support.google.com/a?p=sender-guidelines. {hash} - gsmtp",
            ),
            (
                "550 5.1.1 The email account that you tried to reach does not exist. Please try double-checking the recipient's email address for typos or unnecessary spaces. For more information, go to https://support.google.com/mail/?p=NoSuchUser 41be03b00d2f7-b93bf44f0c0si6882731a12.803 - gsmtp",
                "550 5.1.1 The email account that you tried to reach does not exist. Please try double-checking the recipient's email address for typos or unnecessary spaces. For more information, go to https://support.google.com/mail/?p=NoSuchUser {hash} - gsmtp",
            ),
            // `key=value`: the key is kept, the value is normalized.
            ("OK ids=8a5475ccbbc611eda12250ebf67f93bd", "OK ids={uuid}"),
        ];

        for &(input, expected_output) in CASES {
            let output = normalize(input);

            assert_eq!(output, expected_output, "input={input}");
        }
    }
}