diff --git a/.github/workflows/python-app-macos-windows.yml b/.github/workflows/python-app-macos-windows.yml index 9d0d720..ec824a5 100644 --- a/.github/workflows/python-app-macos-windows.yml +++ b/.github/workflows/python-app-macos-windows.yml @@ -15,7 +15,7 @@ jobs: runs-on: ${{ matrix.os }} strategy: matrix: - os: [ macos-latest, windows-latest ] + os: [ macos-13, windows-latest ] # Using macos-13 since macos-latest no longer supports 3.8 steps: - uses: actions/checkout@v2 diff --git a/02b-Extract-Tags-From-Stories.py b/02b-Extract-Tags-From-Stories.py index dce273e..0b6fa77 100755 --- a/02b-Extract-Tags-From-Stories.py +++ b/02b-Extract-Tags-From-Stories.py @@ -22,7 +22,6 @@ args.temp_db_database ) ) - tags.create_tags_table() tag_col_list = {} stories_id_name = "" diff --git a/automated_archive/aa.py b/automated_archive/aa.py index 44b4b4d..6d86166 100755 --- a/automated_archive/aa.py +++ b/automated_archive/aa.py @@ -1,10 +1,10 @@ # -- coding: utf-8 -- -import datetime +from datetime import datetime import codecs import re -import os -from html.parser import HTMLParser +import html +import urllib.request from pymysql import connect @@ -22,11 +22,15 @@ def _clean_file(filepath, log): :param filepath: Path to ARCHIVE_DB.pl :return: Python dictionary keyed by original story id """ - h = HTMLParser() - archive_db = codecs.open(filepath, "r", encoding="utf-8").read() + encoding = input( + 'Encoding for the ARCHIVE_DB.pl file, e.g. "utf-8", "latin_1", "cp1252" (default: "utf-8"): ' + ) + if encoding is None or encoding == "": + encoding = "utf-8" + archive_db = codecs.open(filepath, "r", encoding=encoding).read() # Manually escape single quote entity and reformat file as a Python dictionary - step1 = h.unescape(archive_db.replace("'", "\\'")) + step1 = html.unescape(archive_db.replace("'", "\\'")) # Indent the file with a single tab instead of whatever is currently used step15 = re.sub(r"^\s+", "\t", step1) @@ -122,8 +126,32 @@ def _extract_fandoms(args, record): return tags.strip(", ") +def _extract_date(args, record, log): + date_string = record.get( + "PrintTime", + record.get( + "DatePrint", + record.get("Date", str(datetime.now().strftime("%m/%d/%y"))), + ), + ) + + dt = None + try: + # If the date is in the form of a Unix timestamp + if date_string.isdigit(): + dt = datetime.fromtimestamp(int(date_string)) + else: + dt = datetime.strptime(date_string, "%m/%d/%y") + except Exception as e: + log.error( + f"Failed to parse date value '{date_string}' due to exception: {str(e)}" + ) + + return dt.strftime("%Y-%m-%d") if dt else "" + + def _create_mysql(args, FILES, log): - db = connect(args.db_host, args.db_user, args.db_password, "") + db = connect(host=args.db_host, user=args.db_user, password=args.db_password, db="") cursor = db.cursor() DATABASE_NAME = args.temp_db_database @@ -132,12 +160,13 @@ def _create_mysql(args, FILES, log): cursor.execute("create database {0};".format(DATABASE_NAME)) cursor.execute("use {0}".format(DATABASE_NAME)) - sql = Sql(args) - codepath = os.path.dirname(os.path.realpath(__file__)) + # Instead of duplicating this file in the repo grab it from the master branch of eFiction + url = "https://raw.githubusercontent.com/otwcode/open-doors-eFiction/refs/heads/master/opendoors/open-doors-tables-working.sql" + with urllib.request.urlopen(url) as response: + script = response.read().decode() - sql.run_script_from_file( - codepath + "/shared_python/create-open-doors-tables.sql", database=DATABASE_NAME - ) + sql = Sql(args, log) + sql.run_sql_file(script, database=DATABASE_NAME) db.commit() authors = [ @@ -164,18 +193,7 @@ def _create_mysql(args, FILES, log): FILES[i].get("Summary", "").replace("'", "\\'"), _extract_tags(args, FILES[i]), _extract_characters(args, FILES[i]), - datetime.datetime.strptime( - FILES[i].get( - "PrintTime", - FILES[i].get( - "DatePrint", - FILES[i].get( - "Date", str(datetime.datetime.now().strftime("%m/%d/%y")) - ), - ), - ), - "%m/%d/%y", - ).strftime("%Y-%m-%d"), + _extract_date(args, FILES[i], log), FILES[i].get("Location", "").replace("'", "\\'"), FILES[i] .get("LocationURL", FILES[i].get("StoryURL", "")) @@ -183,7 +201,9 @@ def _create_mysql(args, FILES, log): FILES[i].get("Notes", "").replace("'", "\\'"), _extract_relationships(args, FILES[i]), FILES[i].get("Rating", ""), - FILES[i].get("Warnings", "").replace("'", "\\'"), + FILES[i] + .get("Warnings", FILES[i].get("OptionalWarnings", "")) + .replace("'", "\\'"), FILES[i].get("Author", "").strip(), FILES[i].get("Email", FILES[i].get("EmailAuthor", "")).lower().strip(), FILES[i].get("FileType", args.chapters_file_extensions) @@ -196,6 +216,7 @@ def _create_mysql(args, FILES, log): cur = 0 total = len(FILES) + item_dict = {} for ( original_id, title, @@ -225,7 +246,7 @@ def _create_mysql(args, FILES, log): table_name = "stories" else: filename = url - table_name = "bookmarks" + table_name = "story_links" # Clean up fandoms and add default fandom if it exists final_fandoms = fandoms.replace("'", r"\'") @@ -241,10 +262,14 @@ def _create_mysql(args, FILES, log): if element[1] == author and element[2] == email ] authorid = result[0][0] + item_dict[original_id] = { + "authorid": authorid, + "itemtype": "story_link" if table_name == "story_links" else "story", + } stor = """ - INSERT INTO {0} (id, fandoms, title, summary, tags, characters, date, url, notes, relationships, rating, warnings, author_id) - VALUES({1}, '{2}', '{3}', '{4}', '{5}', '{6}', '{7}', '{8}', '{9}', '{10}', '{11}', '{12}', '{13}');\n""".format( + INSERT INTO {0} (id, fandoms, title, summary, tags, characters, date, url, notes, relationships, rating, warnings) + VALUES({1}, '{2}', '{3}', '{4}', '{5}', '{6}', '{7}', '{8}', '{9}', '{10}', '{11}', '{12}');\n""".format( table_name, original_id, final_fandoms.replace(r"\\", "\\"), @@ -258,7 +283,6 @@ def _create_mysql(args, FILES, log): pairings, rating, warnings, - authorid, ) cursor.execute(stor) except: @@ -285,6 +309,21 @@ def _create_mysql(args, FILES, log): raise db.commit() + for itemid, item_info in item_dict.items(): + try: + item_auth = """ + INSERT INTO item_authors (author_id, item_id, item_type) + VALUES({0}, {1}, '{2}');\n""".format( + item_info["authorid"], itemid, item_info["itemtype"] + ) + cursor.execute(item_auth) + except: + log.error( + f"Failed to insert item_authors for {item_info['itemtype']} {itemid} with author {item_info['authorid']}" + ) + raise + db.commit() + def clean_and_load_data(args, log): data = _clean_file(args.db_input_file, log) diff --git a/shared_python/Chapters.py b/shared_python/Chapters.py index 7bd44f6..6d79b35 100755 --- a/shared_python/Chapters.py +++ b/shared_python/Chapters.py @@ -81,7 +81,8 @@ def _gather_and_dedupe(self, chapters_path, extensions, has_ids=False): for cid, duplicate in duplicate_chapters.items(): # look up the author id and add that one to the file_names list sql_author_id = self.sql.execute_and_fetchall( - "SELECT author_id FROM chapters WHERE id = {0}".format(cid) + self.sql.database, + "SELECT author_id FROM chapters WHERE id = {0}".format(cid), ) if len(sql_author_id) > 0: author_id = sql_author_id[0][0] @@ -142,6 +143,8 @@ def populate_chapters(self, folder=None, extensions=None): else: for _, chapter_path in file_paths.items(): path = chapter_path.replace(self.args.chapters_path, "")[1:] + if os.sep == "\\": # if this script is run on windows + path = path.replace("\\", "/") with codecs.open(chapter_path, "r", encoding=char_encoding) as c: try: cur = Common.print_progress(cur, total) diff --git a/shared_python/Sql.py b/shared_python/Sql.py index 9a5227b..54057aa 100755 --- a/shared_python/Sql.py +++ b/shared_python/Sql.py @@ -58,7 +58,9 @@ def run_script_from_file(self, filename, database, initial_load=False): fd = open(filename, "r") sqlFile = fd.read() fd.close() + self.run_sql_file(sqlFile, database, initial_load) + def run_sql_file(self, sqlFile, database, initial_load=False): # replace placeholders and return all SQL commands (split on ';') sqlCommands = sqlFile.replace("$DATABASE$", database).split(";\n") diff --git a/shared_python/Tags.py b/shared_python/Tags.py index e69068f..1dc780c 100755 --- a/shared_python/Tags.py +++ b/shared_python/Tags.py @@ -1,4 +1,5 @@ import re +from collections import defaultdict from html.parser import HTMLParser from logging import Logger @@ -83,8 +84,9 @@ def populate_tag_table( ) ) + tags_to_insert = {} + tags_to_story_ids = defaultdict(list) for story_tags_row in data: - values = [] for col in tag_columns: needs_fandom = col in tags_with_fandoms if story_tags_row[col] is not None: @@ -93,27 +95,42 @@ def populate_tag_table( if isinstance( tag_col_lookup[col], str ): # Probably AA or a custom archive - cleaned_tag = ( - val.encode("utf-8").replace("'", "'").strip() + cleaned_tag = re.sub( + r'(? 0: - self.sql.execute( - """ - INSERT INTO tags (storyid, original_tag, original_table, ao3_tag_fandom) VALUES {0} - """.format(", ".join(values)) - ) + if len(tags_to_insert) > 0: + self.sql.execute( + """ + INSERT INTO tags (original_tag, original_type, ao3_tag_fandom) VALUES {0} + """.format(", ".join(tags_to_insert.values())) + ) + + tag_data = self.sql.execute_dict("SELECT id, original_tag FROM tags") + for tag_row in tag_data: + story_ids = set(tags_to_story_ids[tag_row["original_tag"]]) + for story_id in story_ids: + self.sql.execute( + """ + INSERT INTO item_tags (item_id, item_type, tag_id) VALUES ({0}, "{1}", {2}) + """.format( + story_id, + "story_link" if table_name == "story_links" else "story", + tag_row["id"], + ) + ) def distinct_tags(self, database): """