add spellcheck workflow

yaugenst-flex · yaugenst-flex · commit 973241bb1577 · 2025-04-16T10:53:15.000+02:00
diff --git a/.github/workflows/lint-notebooks.yml b/.github/workflows/lint-notebooks.yml
@@ -1,27 +1,55 @@
-name: "notebooks-linting"
+name: "lint-notebooks"
 
 on:
-  workflow_dispatch:
   push:
-    branches: [ main, develop ]
+    branches: [ develop ]
   pull_request:
-    branches: [ main, develop ]
+    branches: [ develop ]
+
+permissions:
+  contents: read
+  pull-requests: write
 
 jobs:
   lint:
-    name: Run notebook linting
+    name: Run notebook linting and spell check
     runs-on: ubuntu-latest
     steps:
-      - uses: actions/checkout@v4
-        with:
-          fetch-depth: 1
-      
-      - uses: astral-sh/ruff-action@v3
+      - name: Checkout code
+        uses: actions/checkout@v4
         with:
-          version: 0.5.5
-      
+          fetch-depth: 2
+
+      - name: Install uv
+        uses: astral-sh/setup-uv@v5
+
       - name: Run ruff format
-        run: ruff format --check --diff .
-      
+        run: uvx ruff format --check --diff .
+
       - name: Run ruff check
-        run: ruff check .
+        run: uvx ruff check .
+
+      - name: Get changed notebook files
+        id: changed_notebooks
+        if: github.event_name == 'pull_request'
+        uses: tj-actions/changed-files@v46
+        with:
+          files: |
+            **.ipynb
+
+      - name: Run spell check on changed notebooks
+        id: spellcheck
+        if: github.event_name == 'pull_request' && steps.changed_notebooks.outputs.any_changed == 'true'
+        continue-on-error: true
+        run: |
+          uvx python spellcheck.py ${{ steps.changed_notebooks.outputs.all_changed_files }} > spellcheck_output.txt || true
+
+      - name: Post spell check comment
+        if: github.event_name == 'pull_request' && steps.changed_notebooks.outputs.any_changed == 'true' && steps.spellcheck.outcome != 'skipped' && hashFiles('spellcheck_output.txt') != ''
+        uses: peter-evans/create-or-update-comment@v4
+        with:
+          token: ${{ secrets.GITHUB_TOKEN }}
+          repository: ${{ github.repository }}
+          issue-number: ${{ github.event.pull_request.number }}
+          body-path: spellcheck_output.txt
+          edit-mode: replace
diff --git a/spellcheck.py b/spellcheck.py
@@ -5,75 +5,132 @@
 import re
 import subprocess
 import sys
+from typing import Optional
 
 
-def get_relative_path(notebook):
+def get_relative_path(notebook: str) -> str:
     """Get the relative path of the notebook from the current directory."""
     return os.path.relpath(notebook, os.getcwd())
 
 
-def check_spelling(notebook):
-    """Check spelling in a notebook and return any errors."""
-    rel_path = get_relative_path(notebook)
-
-    with open(notebook, encoding="utf-8") as f:
-        content = f.read()
-
-    # nbstripout to remove outputs
-    nbstripout_proc = subprocess.run(
-        ["uvx", "nbstripout"], input=content, capture_output=True, text=True
-    )
-
-    # remove image tags with base64 data
-    stripped_content = re.sub(
-        r'<img\s+src="data:image/[^"]+;base64,[^"]+"[^>]*>|<img\s+src="data:image/[^"]+;base64,[^"]+"[^/>]*/>',
-        "",
-        nbstripout_proc.stdout,
-        flags=re.DOTALL,
-    )
-
-    # remove any remaining base64 strings that might appear without proper HTML tags
-    stripped_content = re.sub(
-        r"data:image/[^;]+;base64,[A-Za-z0-9+/=]+",
-        "",
-        stripped_content,
-        flags=re.DOTALL,
-    )
-
-    codespell_proc = subprocess.run(
-        ["uvx", "codespell", "-"], input=stripped_content, capture_output=True, text=True
-    )
+def check_spelling(notebook: str) -> Optional[str]:
+    """
+    Check spelling in a notebook.
 
-    # sadly we can't get rid of the "Used config files: ..." so we filter it here
-    output_lines = []
-    for line in codespell_proc.stdout.splitlines():
-        if "Used config files:" in line or "    1: .codespellrc" in line:
-            continue
-        output_lines.append(line)
-
-    output = "\n".join(output_lines)
-
-    if output:
-        print(f"{rel_path}:")
-        print(output.replace("-", ""))
-        print("-------------------------------------------")
-
-    return bool(output)
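+    Returns:
+        A formatted Markdown string for the notebook: either codespell's findings
+        shown in a code block, or a message describing a processing failure
+        (file not found, failed command, or unexpected exception). Returns None
+        if no spelling errors were found and processing succeeded.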
+    """
+    rel_path = get_relative_path(notebook)
+    error_message_block = None
+
+    try:
+        with open(notebook, encoding="utf-8") as f:
+            content = f.read()
+
+        # pipe the notebook through nbstripout to remove cell outputs before spell checking
+        nbstripout_proc = subprocess.run(
+            ["uvx", "nbstripout"],
+            input=content,
+            capture_output=True,
+            text=True,
+            check=True,
+        )
+
+        # remove image tags with base64 data
+        stripped_content = re.sub(
+            r'<img\s+src="data:image/[^"]+;base64,[^"]+"[^>]*>|<img\s+src="data:image/[^"]+;base64,[^"]+"[^/>]*/>',
+            "",
+            nbstripout_proc.stdout,
+            flags=re.DOTALL,
+        )
+
+        # remove any remaining base64 image data URIs that appear outside <img> tags
+        stripped_content = re.sub(
+            r"data:image/[^;]+;base64,[A-Za-z0-9+/=]+",
+            "",
+            stripped_content,
+            flags=re.DOTALL,
+        )
+
+        codespell_proc = subprocess.run(
+            ["uvx", "codespell", "-"],
+            input=stripped_content,
+            capture_output=True,
+            text=True,
+            check=False,  # codespell exits non-zero when it finds misspellings, which is expected
+        )
+
+        # drop the "Used config files:" header and .codespellrc entry lines that codespell prints
+        output_lines = []
+        for line in codespell_proc.stdout.splitlines():
+            if line.strip().startswith("Used config files:") or re.match(
+                r"^\s+\d+:\s+\.codespellrc", line
+            ):
+                continue
+            output_lines.append(line.replace("-:", "Line ", 1))
+
+        filtered_output = "\n".join(output_lines).strip()
+
+        if filtered_output:
+            error_message_block = f"**{rel_path}**:\n```\n{filtered_output}\n```"
+
+    except FileNotFoundError:
+        error_message_block = f"**{rel_path}**: Error - File not found."
+    except subprocess.CalledProcessError as e:
+        cmd_str = " ".join(e.cmd)
+        error_message_block = (
+            f"**{rel_path}**: Error running command `{cmd_str}`:\n```\n{e.stderr}\n```"
+        )
+    except Exception as e:
+        error_message_block = f"**{rel_path}**: An unexpected error occurred:\n```\n{str(e)}\n```"
+
+    return error_message_block
 
 
 def main():
     parser = argparse.ArgumentParser(description="Check spelling in Jupyter notebooks")
     parser.add_argument("notebooks", nargs="+", help="List of notebook files to check")
     args = parser.parse_args()
 
-    has_errors = False
+    all_errors: list[str] = []
+    num_files_processed = 0
+    num_files_with_errors = 0
+    num_files_with_processing_errors = 0
 
     for notebook in args.notebooks:
-        if check_spelling(notebook):
-            has_errors = True
-
-    if has_errors:
+        num_files_processed += 1
+        error_output = check_spelling(notebook)
+        if error_output:
+            all_errors.append(error_output)
+            if (
+                "Error running command" in error_output
+                or "An unexpected error occurred" in error_output
+                or "Error - File not found" in error_output
+            ):
+                num_files_with_processing_errors += 1
+            else:
+                num_files_with_errors += 1
+
+    if all_errors:
+        print("## Spell Check Report\n")
+        print("\n\n---\n\n".join(all_errors))
+
+        summary_lines = []
+        if num_files_with_errors > 0:
+            summary_lines.append(f"Found spelling errors in {num_files_with_errors} file(s).")
+        if num_files_with_processing_errors > 0:
+            summary_lines.append(
+                f"Encountered processing errors in {num_files_with_processing_errors} file(s)."
+            )
+        if not summary_lines:
+            summary_lines.append(f"Found issues in {len(all_errors)} file(s).")
+
+        print(f"\n---\nChecked {num_files_processed} notebook(s). " + " ".join(summary_lines))
         sys.exit(1)
+    else:
+        print(f"Spell check passed successfully for {num_files_processed} notebook(s).")
+        sys.exit(0)
 
 
 if __name__ == "__main__":