From fe2a03e9062e2bda446e63a02fe4e5d03494e98d Mon Sep 17 00:00:00 2001 From: Harlan Lieberman-Berg Date: Sat, 9 Mar 2024 22:46:57 -0500 Subject: [PATCH 1/2] Rework regex in xx-Remove-emails This changes the regex to be stricter about what it matches in terms of email addresses. That should cut down on the number of false positives from censored swearing (e.g., "f!@#$%%"). Unscientifically, it's also faster, at least on significantly sized archives. --- xx-Remove-emails-from-Open-Doors-Tables.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/xx-Remove-emails-from-Open-Doors-Tables.py b/xx-Remove-emails-from-Open-Doors-Tables.py index 6a718a1..ebd34b3 100755 --- a/xx-Remove-emails-from-Open-Doors-Tables.py +++ b/xx-Remove-emails-from-Open-Doors-Tables.py @@ -6,8 +6,11 @@ from prompt_toolkit.formatted_text import FormattedText from prompt_toolkit.shortcuts import clear +# This regex is pulled from the HTML5 spec. Though it is technically not +# compliant with RFC 5322 ("a willful violation"), it's good enough for our +# purposes. 
email_regex = re.compile( - r"([-!#-'*+/-9=?A-Z^-~]+(\.[-!#-'*+/-9=?A-Z^-~]+)*|\"([]!#-[^-~ \t]|(\\[\t -~]))+\")@([-!#-'*+/-9=?A-Z^-~]+(\.[-!#-'*+/-9=?A-Z^-~]+)*|\[[\t -Z^-~]*])" + r"([a-zA-Z0-9.!#$%&'*+\/=?^_`{|}~-]+)@([a-zA-Z0-9](?:[a-zA-Z0-9-]{0,61}[a-zA-Z0-9])?(?:\.[a-zA-Z0-9](?:[a-zA-Z0-9-]{0,61}[a-zA-Z0-9])?)*)" ) @@ -44,7 +47,7 @@ def is_mailto(match) -> bool: def ask_user_for_action(match) -> str: start, end = match.span() raw_email = match.string[start:end] - domain = match.group(5) + domain = match.group(2) clear() print_context(match, 50) while True: @@ -85,7 +88,7 @@ def return_from_list(match) -> str: return raw_email elif address_entry is not None: return address_entry - domain = match.group(5) + domain = match.group(2) domain_entry = domains.get(domain) if domain_entry is True: return raw_email From 633748b2f91aaa8d5f7d1a754b95894b7d35ca94 Mon Sep 17 00:00:00 2001 From: Harlan Lieberman-Berg Date: Sat, 16 Mar 2024 22:06:03 -0400 Subject: [PATCH 2/2] Add check for stories without any chapters. Thanks, @minjonet, for the idea! --- 08-Check-ODAP-Tables.py | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/08-Check-ODAP-Tables.py b/08-Check-ODAP-Tables.py index 7ffec17..8f6b0fd 100755 --- a/08-Check-ODAP-Tables.py +++ b/08-Check-ODAP-Tables.py @@ -211,4 +211,24 @@ log.error("Found at least one bad author email; ending audit here.") sys.exit(7) + ## + ## Check for stories without chapters + ## + + log.debug("Checking for stories without any chapters.") + found_error = False + + empty_stories = sql.execute_dict( + "SELECT s.id as sid FROM stories s LEFT JOIN chapters c ON c.story_id = s.id WHERE c.story_id IS NULL" + ) + + if empty_stories: + found_error = True + for story in empty_stories: + log.error(f"Found story with no chapters: {story['sid']}") + + if found_error: + log.error("Found at least one story with no chapters; ending audit here.") + sys.exit(8) + log.info("All checks completed successfully.")