|
5 | 5 | import re |
6 | 6 | import subprocess |
7 | 7 | import sys |
| 8 | +from typing import Optional |
8 | 9 |
|
9 | 10 |
|
10 | | -def get_relative_path(notebook): |
| 11 | +def get_relative_path(notebook: str) -> str: |
11 | 12 | """Get the relative path of the notebook from the current directory.""" |
12 | 13 | return os.path.relpath(notebook, os.getcwd()) |
13 | 14 |
|
14 | 15 |
|
15 | | -def check_spelling(notebook): |
16 | | - """Check spelling in a notebook and return any errors.""" |
17 | | - rel_path = get_relative_path(notebook) |
18 | | - |
19 | | - with open(notebook, encoding="utf-8") as f: |
20 | | - content = f.read() |
21 | | - |
22 | | - # nbstripout to remove outputs |
23 | | - nbstripout_proc = subprocess.run( |
24 | | - ["uvx", "nbstripout"], input=content, capture_output=True, text=True |
25 | | - ) |
26 | | - |
27 | | - # remove image tags with base64 data |
28 | | - stripped_content = re.sub( |
29 | | - r'<img\s+src="data:image/[^"]+;base64,[^"]+"[^>]*>|<img\s+src="data:image/[^"]+;base64,[^"]+"[^/>]*/>', |
30 | | - "", |
31 | | - nbstripout_proc.stdout, |
32 | | - flags=re.DOTALL, |
33 | | - ) |
34 | | - |
35 | | - # remove any remaining base64 strings that might appear without proper HTML tags |
36 | | - stripped_content = re.sub( |
37 | | - r"data:image/[^;]+;base64,[A-Za-z0-9+/=]+", |
38 | | - "", |
39 | | - stripped_content, |
40 | | - flags=re.DOTALL, |
41 | | - ) |
42 | | - |
43 | | - codespell_proc = subprocess.run( |
44 | | - ["uvx", "codespell", "-"], input=stripped_content, capture_output=True, text=True |
45 | | - ) |
| 16 | +def check_spelling(notebook: str) -> Optional[str]: |
| 17 | + """ |
| 18 | + Check spelling in a notebook. |
46 | 19 |
|
47 | | - # sadly we can't get rid of the "Used config files: ..." so we filter it here |
48 | | - output_lines = [] |
49 | | - for line in codespell_proc.stdout.splitlines(): |
50 | | - if "Used config files:" in line or " 1: .codespellrc" in line: |
51 | | - continue |
52 | | - output_lines.append(line) |
53 | | - |
54 | | - output = "\n".join(output_lines) |
55 | | - |
56 | | - if output: |
57 | | - print(f"{rel_path}:") |
58 | | - print(output.replace("-", "")) |
59 | | - print("-------------------------------------------") |
60 | | - |
61 | | - return bool(output) |
| 20 | + Returns: |
| 21 | + A formatted Markdown string containing spelling errors for the notebook, |
| 22 | + using a code block to show codespell's output, or None if no errors were found. |
| 23 | + """ |
| 24 | + rel_path = get_relative_path(notebook) |
| 25 | + error_message_block = None |
| 26 | + |
| 27 | + try: |
| 28 | + with open(notebook, encoding="utf-8") as f: |
| 29 | + content = f.read() |
| 30 | + |
| 31 | + # nbstripout to remove outputs |
| 32 | + nbstripout_proc = subprocess.run( |
| 33 | + ["uvx", "nbstripout"], |
| 34 | + input=content, |
| 35 | + capture_output=True, |
| 36 | + text=True, |
| 37 | + check=True, |
| 38 | + ) |
| 39 | + |
| 40 | + # remove image tags with base64 data |
| 41 | + stripped_content = re.sub( |
| 42 | + r'<img\s+src="data:image/[^"]+;base64,[^"]+"[^>]*>|<img\s+src="data:image/[^"]+;base64,[^"]+"[^/>]*/>', |
| 43 | + "", |
| 44 | + nbstripout_proc.stdout, |
| 45 | + flags=re.DOTALL, |
| 46 | + ) |
| 47 | + |
| 48 | + # remove any remaining base64 strings that might appear without proper HTML tags |
| 49 | + stripped_content = re.sub( |
| 50 | + r"data:image/[^;]+;base64,[A-Za-z0-9+/=]+", |
| 51 | + "", |
| 52 | + stripped_content, |
| 53 | + flags=re.DOTALL, |
| 54 | + ) |
| 55 | + |
| 56 | + codespell_proc = subprocess.run( |
| 57 | + ["uvx", "codespell", "-"], |
| 58 | + input=stripped_content, |
| 59 | + capture_output=True, |
| 60 | + text=True, |
| 61 | + check=False, # codespell exits non-zero on errors, which is expected |
| 62 | + ) |
| 63 | + |
| 64 | + # filter codespell's config file lines |
| 65 | + output_lines = [] |
| 66 | + for line in codespell_proc.stdout.splitlines(): |
| 67 | + if line.strip().startswith("Used config files:") or re.match( |
| 68 | + r"^\s+\d+:\s+\.codespellrc", line |
| 69 | + ): |
| 70 | + continue |
| 71 | + output_lines.append(line.replace("-:", "Line ", 1)) |
| 72 | + |
| 73 | + filtered_output = "\n".join(output_lines).strip() |
| 74 | + |
| 75 | + if filtered_output: |
| 76 | + error_message_block = f"**{rel_path}**:\n```\n{filtered_output}\n```" |
| 77 | + |
| 78 | + except FileNotFoundError: |
| 79 | + error_message_block = f"**{rel_path}**: Error - File not found." |
| 80 | + except subprocess.CalledProcessError as e: |
| 81 | + cmd_str = " ".join(e.cmd) |
| 82 | + error_message_block = ( |
| 83 | + f"**{rel_path}**: Error running command `{cmd_str}`:\n```\n{e.stderr}\n```" |
| 84 | + ) |
| 85 | + except Exception as e: |
| 86 | + error_message_block = f"**{rel_path}**: An unexpected error occurred:\n```\n{str(e)}\n```" |
| 87 | + |
| 88 | + return error_message_block |
62 | 89 |
|
63 | 90 |
|
64 | 91 | def main(): |
65 | 92 | parser = argparse.ArgumentParser(description="Check spelling in Jupyter notebooks") |
66 | 93 | parser.add_argument("notebooks", nargs="+", help="List of notebook files to check") |
67 | 94 | args = parser.parse_args() |
68 | 95 |
|
69 | | - has_errors = False |
| 96 | + all_errors: list[str] = [] |
| 97 | + num_files_processed = 0 |
| 98 | + num_files_with_errors = 0 |
| 99 | + num_files_with_processing_errors = 0 |
70 | 100 |
|
71 | 101 | for notebook in args.notebooks: |
72 | | - if check_spelling(notebook): |
73 | | - has_errors = True |
74 | | - |
75 | | - if has_errors: |
| 102 | + num_files_processed += 1 |
| 103 | + error_output = check_spelling(notebook) |
| 104 | + if error_output: |
| 105 | + all_errors.append(error_output) |
| 106 | + if ( |
| 107 | + "Error running command" in error_output |
| 108 | + or "An unexpected error occurred" in error_output |
| 109 | + or "Error - File not found" in error_output |
| 110 | + ): |
| 111 | + num_files_with_processing_errors += 1 |
| 112 | + else: |
| 113 | + num_files_with_errors += 1 |
| 114 | + |
| 115 | + if all_errors: |
| 116 | + print("## Spell Check Report\n") |
| 117 | + print("\n\n---\n\n".join(all_errors)) |
| 118 | + |
| 119 | + summary_lines = [] |
| 120 | + if num_files_with_errors > 0: |
| 121 | + summary_lines.append(f"Found spelling errors in {num_files_with_errors} file(s).") |
| 122 | + if num_files_with_processing_errors > 0: |
| 123 | + summary_lines.append( |
| 124 | + f"Encountered processing errors in {num_files_with_processing_errors} file(s)." |
| 125 | + ) |
| 126 | + if not summary_lines: |
| 127 | + summary_lines.append(f"Found issues in {len(all_errors)} file(s).") |
| 128 | + |
| 129 | + print(f"\n---\nChecked {num_files_processed} notebook(s). " + " ".join(summary_lines)) |
76 | 130 | sys.exit(1) |
| 131 | + else: |
| 132 | + print(f"Spell check passed successfully for {num_files_processed} notebook(s).") |
| 133 | + sys.exit(0) |
77 | 134 |
|
78 | 135 |
|
79 | 136 | if __name__ == "__main__": |
|
0 commit comments