1use chrono::DateTime;
2#[cfg(feature = "lua")]
3use config::get_or_create_sub_module;
4#[cfg(feature = "lua")]
5use mlua::Lua;
6use regex::{RegexSet, RegexSetBuilder};
7use std::borrow::Cow;
8use std::sync::LazyLock;
9use uuid::Uuid;
10mod dict;
11
/// Signature shared by every tokenizer in this module.
///
/// A tokenizer inspects a single whitespace-delimited word and returns
/// `Some(replacement)` when it recognizes the word, or `None` to let the next
/// tokenizer try. The replacement may borrow from the input word.
type Normalizer = fn(&str) -> Option<Cow<'_, str>>;
13
14fn tokenize_timestamp_3339<'a>(word: &'a str) -> Option<Cow<'a, str>> {
15 DateTime::parse_from_rfc3339(word)
16 .ok()
17 .map(|_| Cow::Borrowed("{timestamp}"))
18}
19
20fn tokenize_uuid<'a>(word: &'a str) -> Option<Cow<'a, str>> {
21 Uuid::try_parse(word).ok().map(|_| Cow::Borrowed("{uuid}"))
22}
23
24fn tokenize_dictionary_word_phf<'a>(word: &'a str) -> Option<Cow<'a, str>> {
29 if crate::dict::DICT.contains(word) {
30 Some(Cow::Borrowed(word))
31 } else {
32 None
33 }
34}
35
/// Anchored pattern for a whole word that looks like base64.
///
/// Accepts one or more 4-character groups drawn from `A-Z a-z 0-9 + / _ -`
/// (both the standard and URL-safe alphabets), optionally followed by a final
/// group padded with `=` or `==`.
const BASE64_RE: &str =
    r"^(?:[a-zA-Z0-9+/_\-]{4})+(?:[a-zA-Z0-9+/_\-]{2}==|[a-zA-Z0-9+/_\-]{3}=)?$";
39
/// Anchored pattern for a whole word that is an IP address, optionally
/// followed by `:port`.
///
/// The alternation covers dotted-quad IPv4, the various full and `::`-compressed
/// IPv6 spellings, `fe80:` link-local addresses with a `%zone` suffix, and
/// IPv6 forms that embed a dotted-quad IPv4 tail. The trailing non-capturing
/// group `(?::\d{1,5})?` accepts a single colon followed by 1-5 digits.
const IP_RE: &str = r"^(?:\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}|(?:[0-9a-fA-F]{1,4}:){7,7}[0-9a-fA-F]{1,4}|(?:[0-9a-fA-F]{1,4}:){1,7}:|(?:[0-9a-fA-F]{1,4}:){1,6}:[0-9a-fA-F]{1,4}|(?:[0-9a-fA-F]{1,4}:){1,5}(?::[0-9a-fA-F]{1,4}){1,2}|(?:[0-9a-fA-F]{1,4}:){1,4}(?::[0-9a-fA-F]{1,4}){1,3}|(?:[0-9a-fA-F]{1,4}:){1,3}(?::[0-9a-fA-F]{1,4}){1,4}|(?:[0-9a-fA-F]{1,4}:){1,2}(?::[0-9a-fA-F]{1,4}){1,5}|[0-9a-fA-F]{1,4}:(?:(?::[0-9a-fA-F]{1,4}){1,6})|:(?:(?::[0-9a-fA-F]{1,4}){1,7}|:)|fe80:(?::[0-9a-fA-F]{0,4}){0,4}%[0-9a-zA-Z]{1,}|::(?:ffff(?::0{1,4}){0,1}:){0,1}(?:(?:25[0-5]|(?:2[0-4]|1{0,1}[0-9]){0,1}[0-9])\.){3,3}(?:25[0-5]|(?:2[0-4]|1{0,1}[0-9]){0,1}[0-9])|(?:[0-9a-fA-F]{1,4}:){1,4}:(?:(?:25[0-5]|(?:2[0-4]|1{0,1}[0-9]){0,1}[0-9])\.){3,3}(?:25[0-5]|(?:2[0-4]|1{0,1}[0-9]){0,1}[0-9]))(?::\d{1,5})?$";
45
/// Anchored pattern for a whole word that is an email address.
///
/// The local part is either a dot-separated sequence of atom characters or a
/// double-quoted string; the domain is either a sequence of dot-separated
/// labels or a bracketed address literal.
///
/// NOTE(review): the character classes list only lowercase letters and the
/// regex set in `tokenize_re` is built without a case-insensitive flag, so
/// mixed-case addresses presumably do not match — confirm that is intended.
const EMAIL_RE: &str = r#"^(?:[a-z0-9!#$%&'*+\x2f=?^_`\x7b-\x7d~\x2d]+(?:\.[a-z0-9!#$%&'*+\x2f=?^_`\x7b-\x7d~\x2d]+)*|"(?:[\x01-\x08\x0b\x0c\x0e-\x1f\x21\x23-\x5b\x5d-\x7f]|\\[\x01-\x09\x0b\x0c\x0e-\x7f])*")@(?:(?:[a-z0-9](?:[a-z0-9\x2d]*[a-z0-9])?\.)+[a-z0-9](?:[a-z0-9\x2d]*[a-z0-9])?|\[(?:(?:(?:2(?:5[0-5]|[0-4][0-9])|1[0-9][0-9]|[1-9]?[0-9]))\.){3}(?:(?:2(?:5[0-5]|[0-4][0-9])|1[0-9][0-9]|[1-9]?[0-9])|[a-z0-9\x2d]*[a-z0-9]:(?:[\x01-\x08\x0b\x0c\x0e-\x1f\x21-\x5a\x53-\x7f]|\\[\x01-\x09\x0b\x0c\x0e-\x7f])+)\])$"#;
51
/// Anchored pattern for an ISO 8601 style duration such as `P1Y2M3DT4H5M6S`.
///
/// After the leading `P` come optional year, month, week and day components,
/// then an optional `T` section with optional hour, minute and second
/// components; each number may carry a decimal fraction. Every component is
/// optional, so the pattern on its own also matches a bare `P` or `PT`.
const ISO8601_DURATION_RE: &str = r"^P(?:\d+(?:\.\d+)?Y)?(?:\d+(?:\.\d+)?M)?(?:\d+(?:\.\d+)?W)?(?:\d+(?:\.\d+)?D)?(?:T(?:\d+(?:\.\d+)?H)?(?:\d+(?:\.\d+)?M)?(?:\d+(?:\.\d+)?S)?)?$";
57
58fn tokenize_re<'a>(word: &'a str) -> Option<Cow<'a, str>> {
59 static MAPPING: &[(&str, &str)] = &[
60 (IP_RE, "{ipaddr}"),
61 (BASE64_RE, "{base64}"),
62 (EMAIL_RE, "{email}"),
63 ];
64 static SET: LazyLock<RegexSet> = LazyLock::new(|| {
65 RegexSetBuilder::new(MAPPING.iter().map(|(re, _label)| re))
66 .build()
67 .unwrap()
68 });
69
70 let matching_idx: usize = SET.matches(word).into_iter().next()?;
71
72 Some(Cow::Borrowed(MAPPING[matching_idx].1))
73}
74
/// Heuristic for opaque identifiers such as queue ids and message hashes.
///
/// A word is reported as `{hash}` when it is at least 8 bytes long, consists
/// solely of ASCII letters, ASCII digits, `-`, `.` and `_`, and contains at
/// least three letters and at least three digits. Anything else yields `None`.
fn tokenize_hash<'a>(word: &'a str) -> Option<Cow<'a, str>> {
    const MIN_LEN: usize = 8;
    const MIN_PER_CLASS: usize = 3;

    if word.len() < MIN_LEN {
        return None;
    }

    // Any byte outside the allowed set (including every byte of a non-ASCII
    // character) aborts the fold and rejects the word.
    let (letters, digits) =
        word.bytes()
            .try_fold((0usize, 0usize), |(letters, digits), byte| match byte {
                b'a'..=b'z' | b'A'..=b'Z' => Some((letters + 1, digits)),
                b'0'..=b'9' => Some((letters, digits + 1)),
                b'-' | b'.' | b'_' => Some((letters, digits)),
                _ => None,
            })?;

    (letters >= MIN_PER_CLASS && digits >= MIN_PER_CLASS).then_some(Cow::Borrowed("{hash}"))
}
113
114fn tokenize_compound<'a>(word: &'a str) -> Option<Cow<'a, str>> {
117 if let Some((lhs, rhs)) = word.split_once('=') {
118 let tokenized = normalize_word(rhs)?;
119 Some(format!("{lhs}={tokenized}").into())
120 } else {
121 None
122 }
123}
124
125fn preprocess_duration<'a>(s: &'a str) -> Cow<'a, str> {
132 static RE: LazyLock<regex::Regex> = LazyLock::new(|| {
136 let pattern =
140 r"\b\d+(?:ns|us|ms|s|m|h|day|month|year)(?:\s+\d+(?:ns|us|ms|s|m|h|day|month|year))*\b";
141 regex::Regex::new(pattern).unwrap()
142 });
143
144 let result = RE.replace_all(s, "__DURATION__");
146 Cow::Owned(result.into_owned())
147}
148
149fn tokenize_duration<'a>(word: &'a str) -> Option<Cow<'a, str>> {
152 if word == "__DURATION__" {
153 Some(Cow::Borrowed("{duration}"))
154 } else if word.starts_with('P') {
155 static ISO8601_RE: LazyLock<regex::Regex> =
159 LazyLock::new(|| regex::Regex::new(ISO8601_DURATION_RE).unwrap());
160 if ISO8601_RE.is_match(word) {
161 Some(Cow::Borrowed("{duration}"))
162 } else {
163 None
164 }
165 } else {
166 None
167 }
168}
169
/// Tokenizers tried by `normalize_word`, in priority order: the first one
/// that returns `Some` decides the replacement for a word.
const FUNCS: &[Normalizer] = &[
    // Dictionary words are returned unchanged, which keeps the pattern-based
    // tokenizers below from ever seeing them.
    tokenize_dictionary_word_phf,
    tokenize_duration,
    tokenize_timestamp_3339,
    tokenize_uuid,
    // Runs ahead of the regex set, so a word that satisfies both the hash
    // heuristic and one of the regex patterns is reported as `{hash}`.
    tokenize_hash,
    tokenize_re,
    // Last: splits `key=value` and feeds the value back through this list.
    tokenize_compound,
];
186
187fn normalize_word<'a>(word: &'a str) -> Option<Cow<'a, str>> {
188 for func in FUNCS {
189 let res = (func)(word);
190 if res.is_some() {
191 return res;
192 }
193 }
194 None
195}
196
197pub fn normalize(s: &str) -> String {
198 let mut result = String::with_capacity(s.len());
199
200 let s = match preprocess_duration(s) {
203 Cow::Borrowed(b) => b.to_string(),
204 Cow::Owned(o) => o,
205 };
206
207 let mut processed;
208
209 let needs_process = memchr::memchr3(b'[', b'(', b'{', s.as_bytes()).is_some();
224 let s = if needs_process {
225 processed = String::with_capacity(s.len());
226 let mut iter = s.chars().peekable();
227 while let Some(c) = iter.next() {
228 if (c.is_ascii_whitespace() || processed.is_empty())
229 && matches!(iter.peek(), Some('[' | '(' | '{'))
230 {
231 iter.next();
232 processed.push(' ');
233 continue;
234 }
235
236 if matches!(c, ']' | ')' | '}')
237 && iter
238 .peek()
239 .map(|c| c.is_ascii_whitespace() || c.is_ascii_punctuation())
240 .unwrap_or(true)
241 {
242 iter.next();
243 processed.push(' ');
244 continue;
245 }
246
247 processed.push(c);
248 }
249 &processed
250 } else {
251 &s
252 };
253
254 for word in s.split_ascii_whitespace() {
255 let word = match normalize_word(word) {
256 Some(tokenized) => tokenized,
257 None => Cow::Borrowed(word),
258 };
259
260 if !result.is_empty() {
263 result.push(' ');
264 }
265 result.push_str(&word);
266 }
267
268 result
269}
270
/// Exposes `normalize` to Lua.
///
/// Obtains the `string` sub-module via `get_or_create_sub_module` and installs
/// a function named `normalize_smtp_response` on it that takes a string and
/// returns its normalized form.
///
/// # Errors
///
/// Propagates any error from obtaining the sub-module, creating the Lua
/// function, or setting the field.
#[cfg(feature = "lua")]
pub fn register(lua: &Lua) -> anyhow::Result<()> {
    let string_mod = get_or_create_sub_module(lua, "string")?;
    let normalize_fn = lua.create_function(|_, text: String| Ok(normalize(&text)))?;

    string_mod.set("normalize_smtp_response", normalize_fn)?;
    Ok(())
}
281
#[cfg(test)]
mod tests {
    use super::*;

    /// Table-driven check of `normalize`: each entry pairs an input with the
    /// exact normalized string that is expected for it.
    #[test]
    fn various() {
        const CASES: &[(&str, &str)] = &[
            // Timestamps and UUIDs.
            (
                "retry again at 2025-11-06T17:11:34.261306612Z",
                "retry again at {timestamp}",
            ),
            (
                "a uuid 10aa5da5-3f3b-4176-beb9-32875830f082",
                "a uuid {uuid}",
            ),
            // Base64, with and without padding; ordinary words are untouched.
            ("aGVsbG8uCg==", "{base64}"),
            ("aGVsbG8K", "{base64}"),
            ("aGVsbG8K aGVsbG8K", "{base64} {base64}"),
            ("hello aGVsbG8uCg==", "hello {base64}"),
            ("hello", "hello"),
            ("hello aGVsbG8K", "hello {base64}"),
            // IPv4, IPv6 and address:port.
            (
                "421 4.1.0 10.0.0.1 throttled try later",
                "421 4.1.0 {ipaddr} throttled try later",
            ),
            (
                "421 4.1.0 ::1 throttled try later",
                "421 4.1.0 {ipaddr} throttled try later",
            ),
            (
                "Accepting connection from 42.69.10.20:25",
                "Accepting connection from {ipaddr}",
            ),
            // Clock-style values are expected to be left alone.
            ("duration 00:10:34", "duration 00:10:34"),
            (
                "rejecting mail for some.body@gmail.com",
                "rejecting mail for {email}",
            ),
            // Real-world responses: parentheses are stripped and the trailing
            // tracking id becomes `{hash}`.
            (
                "Your email has been rate limited because the From: header (RFC5322) in this message isn't aligned with either the authenticated SPF or DKIM organizational domain. To learn more about DMARC alignment, visit https://support.google.com/a?p=dmarc-alignment To learn more about Gmail requirements for bulk senders, visit https://support.google.com/a?p=sender-guidelines. a640c23a62f3a-ab67626ed70si756442266b.465 - gsmtp",
                "Your email has been rate limited because the From: header RFC5322 in this message isn't aligned with either the authenticated SPF or DKIM organizational domain. To learn more about DMARC alignment, visit https://support.google.com/a?p=dmarc-alignment To learn more about Gmail requirements for bulk senders, visit https://support.google.com/a?p=sender-guidelines. {hash} - gsmtp",
            ),
            (
                "550 5.1.1 The email account that you tried to reach does not exist. Please try double-checking the recipient's email address for typos or unnecessary spaces. For more information, go to https://support.google.com/mail/?p=NoSuchUser 41be03b00d2f7-b93bf44f0c0si6882731a12.803 - gsmtp",
                "550 5.1.1 The email account that you tried to reach does not exist. Please try double-checking the recipient's email address for typos or unnecessary spaces. For more information, go to https://support.google.com/mail/?p=NoSuchUser {hash} - gsmtp",
            ),
            // `key=value`: only the value is replaced.
            ("OK ids=8a5475ccbbc611eda12250ebf67f93bd", "OK ids={uuid}"),
            // Square brackets (and the `.` after the closing one) are removed
            // so the words inside can be tokenized.
            (
                "550 Mail is rejected by recipients [aGVsbG8uCg== IP: 10.10.10.10]. https://service.mail.qq.com/detail/0/92.",
                "550 Mail is rejected by recipients {base64} IP: {ipaddr} https://service.mail.qq.com/detail/0/92.",
            ),
            // A run of human-style durations collapses to one token.
            (
                "Context: DispatcherDrop. Next due in 11s 999ms 990us 55ns at 2026-04-05T07:34:04.198063031Z",
                "Context: DispatcherDrop. Next due in {duration} at {timestamp}",
            ),
            // ISO 8601 style durations.
            ("P23DT23H", "{duration}"),
            ("P4Y", "{duration}"),
            ("P1Y2M3DT4H5M6S", "{duration}"),
            // Trailing text makes it fail the duration pattern; the hash
            // heuristic picks it up instead.
            ("P1Y2M3DT4H5M6Shello", "{hash}"),
            // `abc11s` is not collapsed; only `999ms` is.
            ("abc11s 999ms", "abc11s {duration}"),
            ("2year", "{duration}"),
            ("1month", "{duration}"),
            ("3day", "{duration}"),
            ("5h 30m", "{duration}"),
            ("2yearhello", "2yearhello"),
        ];

        for (input, expected_output) in CASES {
            let output = normalize(input);

            k9::assert_equal!(output, *expected_output, "input={input}");
        }
    }
}