From 5c1c4033de7c2d7e1b77e60c02a52ecedb85cc18 Mon Sep 17 00:00:00 2001 From: Harlan Lieberman-Berg Date: Sat, 9 Mar 2024 22:46:57 -0500 Subject: [PATCH] Rework regex in xx-Remove-emails This changes the regex to be stricter about what it matches in terms of email addresses. That should cut down on the number of false positives from censored swearing (e.g., "f!@#$%%"). Unscientifically, it's also faster, at least on significantly sized archives. --- xx-Remove-emails-from-Open-Doors-Tables.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/xx-Remove-emails-from-Open-Doors-Tables.py b/xx-Remove-emails-from-Open-Doors-Tables.py index 6a718a1..ebd34b3 100755 --- a/xx-Remove-emails-from-Open-Doors-Tables.py +++ b/xx-Remove-emails-from-Open-Doors-Tables.py @@ -6,8 +6,11 @@ from prompt_toolkit.formatted_text import FormattedText from prompt_toolkit.shortcuts import clear +# This regex is pulled from the HTML5 spec. Though it is technically not +# compliant with RFC 5322 ("a willful violation"), it's good enough for our +# purposes. email_regex = re.compile( - r"([-!#-'*+/-9=?A-Z^-~]+(\.[-!#-'*+/-9=?A-Z^-~]+)*|\"([]!#-[^-~ \t]|(\\[\t -~]))+\")@([-!#-'*+/-9=?A-Z^-~]+(\.[-!#-'*+/-9=?A-Z^-~]+)*|\[[\t -Z^-~]*])" + r"([a-zA-Z0-9.!#$%&'*+\/=?^_`{|}~-]+)@([a-zA-Z0-9](?:[a-zA-Z0-9-]{0,61}[a-zA-Z0-9])?(?:\.[a-zA-Z0-9](?:[a-zA-Z0-9-]{0,61}[a-zA-Z0-9])?)*)" ) @@ -44,7 +47,7 @@ def is_mailto(match) -> bool: def ask_user_for_action(match) -> str: start, end = match.span() raw_email = match.string[start:end] - domain = match.group(5) + domain = match.group(2) clear() print_context(match, 50) while True: @@ -85,7 +88,7 @@ def return_from_list(match) -> str: return raw_email elif address_entry is not None: return address_entry - domain = match.group(5) + domain = match.group(2) domain_entry = domains.get(domain) if domain_entry is True: return raw_email