diff --git a/execexam/advise.py b/execexam/advise.py index fb67f93..767eb03 100644 --- a/execexam/advise.py +++ b/execexam/advise.py @@ -127,6 +127,8 @@ def fix_failures( # noqa: PLR0913 filtered_test_output: str, exec_exam_test_assertion_details: str, test_overview: str, + traceback: List[str], + functions: List, failing_test_details: str, failing_test_code: str, advice_method: enumerations.AdviceMethod, @@ -148,18 +150,30 @@ def fix_failures( # noqa: PLR0913 test_overview = ( filtered_test_output + exec_exam_test_assertion_details ) + # create an LLM debugging request that contains all of the + # information that is needed to provide advice about how + # to fix the bug(s) in the program that are part of an + # executable examination; note that, essentially, an + # examination consists of Python functions that a student + # must complete and then test cases that confirm the correctness + # of the functions that are implemented; note also that + # ExecExam has a Pytest plugin that collects additional details llm_debugging_request = ( "I am an undergraduate student completing a programming examination." - + " You may never make suggestions to change the source code of the test cases." - + " Always make suggestions about how to improve the Python source code of the program under test." - + " Always give Python code in a Markdown fenced code block with your suggested program." - + " Always start your response with a friendly greeting and overview of what you will provide." - + " Always conclude by saying that you are making a helpful suggestion but could be wrong." - + " Always be helpful, upbeat, friendly, encouraging, and concise when making a response." - + " Your task is to suggest, in a step-by-step fashion, how to fix the bug(s) in the program?" 
- + f" Here is the test overview with test output and details about test assertions: {test_overview}" - + f" Here is a brief overview of the test failure information: {failing_test_details}" - + f" Here is the source code for the one or more failing test(s): {failing_test_code}" + + " You may never make suggestions to change the source code of the test cases." + + " Always make suggestions about how to improve the Python source code of the program under test." + + " Always give Python code in a Markdown fenced code block with your suggested program." + + " Always start your response with a friendly greeting and overview of what you will provide." + + " Always conclude by saying that you are making a helpful suggestion but could be wrong." + + " Always be helpful, upbeat, friendly, encouraging, and concise when making a response." + + " Your task is to suggest, in a step-by-step fashion, how to fix the bug(s) in the program." + + " What follows is all of the information you need to complete the debugging task." + + f" Here is the error traceback, which will guide you in identifying which functions to fix: {traceback}" + + f" Below is the source code for all functions that have failed; focus your suggestions on these functions: {functions}" + + f" Here is an overview of the test details and output, which will help you understand the issue: {test_overview}" + + f" A brief summary of the test failure information is provided here: {failing_test_details}" + + f" Finally, here is the source code for the failing test(s): {failing_test_code}" + + " Based on this, suggest what changes need to be made to fix the failing functions."
) if advice_method == enumerations.AdviceMethod.api_key: diff --git a/execexam/extract.py b/execexam/extract.py index ce416f5..f403fd0 100644 --- a/execexam/extract.py +++ b/execexam/extract.py @@ -1,7 +1,11 @@ """Extract contents from data structures.""" +import ast +import importlib +import inspect +import re from pathlib import Path -from typing import Any, Dict, List, Tuple +from typing import Any, Dict, List, Optional, Tuple from . import convert @@ -178,3 +182,269 @@ def extract_test_output_multiple_labels( filtered_output += line + "\n" # return the filtered output return filtered_output + + +def extract_tested_functions(failing_test_code: str) -> Any: + """Extract all functions being tested from the failing test code.""" + # Find all function calls in the code + function_calls = re.findall(r"(\w+)\(", failing_test_code) + # List of prefixes for functions we want to ignore + ignore_prefixes = ["assert", "test_"] + # Initialize a list to store valid function names + tested_functions = set() + # Check each function call + for func_name in function_calls: + # If the function name doesn't start with any ignore prefix, add it to the list + if not any(func_name.startswith(prefix) for prefix in ignore_prefixes): + tested_functions.add(func_name) + # If no matching functions are found, return the full failing_test_code + return tested_functions if tested_functions else failing_test_code + + +def get_called_functions_from_test(test_path: str) -> list[str]: + """Get the functions called in a test from the test path.""" + # Extract the module name and function name from test_path + module_name, func_name = test_path.split("::") + # Import the test module + test_module = importlib.import_module( + module_name.replace("/", ".").replace(".py", "") + ) + # Get the function object + test_function = getattr(test_module, func_name) + # Get the source code of the function + source_code = inspect.getsource(test_function) + # Use regex to find called functions in the source code 
+ called_functions = re.findall(r"\b(\w+)\s*\(", source_code) + return called_functions + + +def function_exists_in_file(file_path: str, function_name: str) -> bool: + """Check if a function with the given name is defined in the source file.""" + try: + with open(file_path, "r") as file: + file_contents = file.read() + # Parse file contents + tree = ast.parse(file_contents) + # Search for the function definition + for node in ast.walk(tree): + if ( + isinstance(node, ast.FunctionDef) + and node.name == function_name + ): + return True + except Exception: + return False + return False + + +def find_source_file(test_path: str, function: str) -> str: + """Find the source file being tested using imports""" + test_file = test_path.split("::")[0] + try: + with open(test_file, "r") as f: + for line in f: + if "import" in line: + # Extract the module being imported + imported = line.split("import")[-1].strip() + if "." in imported: + imported = imported.split(".")[-1] + if "from" in line: + imported = ( + line.split("from")[-1].split("import")[0].strip() + ) + # Skip if 'pytest' is imported + if imported == "pytest": + continue + # Convert module name to potential file path + file_path = f"{imported.replace('.', '/')}.py" + if file_path != "pytest.py": + if function_exists_in_file(file_path, function): + return file_path + except Exception as e: + return f"Error reading file {test_file}: {e}" + return "" + + +def extract_tracebacks(json_report: Optional[dict], failing_code: str) -> list: + """Extract comprehensive test failure information from pytest JSON report including test details, assertions, variables, and complete stack traces. 
Handles if JSON report returns string or dictionary""" + # Handle the case where there is no json_report + if not json_report: + return ["No Traceback Found"] + traceback_info_list = [] + tests = json_report.get("tests", []) + # Go through all the tests and pull out which ones failed + for test in tests: + if test.get("outcome") in ("failed", "error"): + test_path = test.get("nodeid", "") + call = test.get("call", {}) + traceback_info = { + "test_path": test_path, + "source_file": "", + "tested_function": "", + "full_traceback": "", + "error_type": "", + "error_message": "", + "stack_trace": [], + "variables": {}, + "assertion_detail": "", + "expected_value": None, + "actual_value": None, + } + longrepr = call.get("longrepr", {}) + # Handle string longrepr + if isinstance(longrepr, str): + process_string_longrepr( + longrepr, traceback_info, test_path, failing_code + ) + # Handle dictionary of longrepr + elif isinstance(longrepr, dict): + process_dict_longrepr( + longrepr, traceback_info, test_path, failing_code + ) + # Ensure we have a full traceback + if not traceback_info["full_traceback"] and "log" in call: + traceback_info["full_traceback"] = call["log"] + # Append if there is information + if ( + traceback_info["full_traceback"] + or traceback_info["error_message"] + or traceback_info["stack_trace"] + ): + traceback_info_list.append(traceback_info) + return traceback_info_list + + +def process_string_longrepr( + longrepr: str, traceback_info: dict, test_path: str, failing_code: str +) -> None: + """Process traceback when longrepr is a string.""" + traceback_info["full_traceback"] = longrepr + lines = longrepr.split("\n") + # Get the name of the actual function being tested + called_functions = get_called_functions_from_test(test_path) + tested_funcs = extract_tested_functions(failing_code) + func = "" + for func in tested_funcs: + if func in called_functions: + traceback_info["tested_function"] = func + break + # Find source file from imports + source_file = 
find_source_file(test_path, func) + if source_file: + traceback_info["source_file"] = source_file + for line in lines: + # Look for file locations in traceback + if "File " in line and ", line " in line: + loc = line.strip() + traceback_info["stack_trace"].append(loc) + # Extract error type and message + elif line.startswith("E "): + if not traceback_info["error_message"]: + error_parts = line[4:].split(": ", 1) + if len(error_parts) > 1: + traceback_info["error_type"] = error_parts[0] + traceback_info["error_message"] = error_parts[1] + else: + traceback_info["error_message"] = error_parts[0] + # Look for assertion details + if "assert" in line: + traceback_info["assertion_detail"] = line.strip() + try: + if "==" in line: + expr = line.split("assert")[-1].strip() + actual, expected = expr.split("==", 1) + # use ast.literal_eval, never eval: the text comes from an + # arbitrary traceback line, so eval would execute untrusted code; + # literal_eval raises on non-literals, caught below + traceback_info["actual_value"] = ast.literal_eval(actual.strip("() ")) + traceback_info["expected_value"] = ast.literal_eval( + expected.strip("() ") + ) + except Exception: + pass + + + def process_dict_longrepr( + longrepr: dict, traceback_info: dict, test_path: str, failing_code: str + ) -> None: + """Process traceback when longrepr is a dictionary.""" + crash = longrepr.get("reprcrash", {}) + entries = longrepr.get("reprtraceback", {}).get("reprentries", []) + # Initialize stack_trace if it doesn't exist + if "stack_trace" not in traceback_info: + traceback_info["stack_trace"] = [] + # Get the name of the actual function being tested + tested_funcs = extract_tested_functions(failing_code) + called_functions = get_called_functions_from_test(test_path) + func = "" + # Find the function name from the tested and called functions + for func in tested_funcs: + if func in called_functions: + traceback_info["tested_function"] = func + break + # First try to find source file from traceback entries + source_file = "" + try: + source_file = find_source_file(test_path, func) + except Exception: + pass + # If no source file is found, set the default value + if not source_file: +
source_file = "File not found" + traceback_info["source_file"] = source_file + # Get error type and message (split based on the first occurrence of ": ") + message = crash.get("message", "") + if ": " in message: + error_type, error_msg = message.split(": ", 1) + traceback_info["error_type"] = error_type + traceback_info["error_message"] = error_msg + else: + traceback_info["error_message"] = message + # Build stack trace + for entry in entries: + if isinstance(entry, dict): + loc = entry.get("reprfileloc", {}) + if loc: + file_path = loc.get("path", "") + line_no = loc.get("lineno", "") + if file_path and line_no: + stack_entry = f"File {file_path}, line {line_no}" + traceback_info["stack_trace"].append(stack_entry) + + +def extract_function_code_from_traceback( + traceback_info_list: list, +) -> List[List[str]]: + """Extracts function code from a traceback information list.""" + # Check if the list is empty + if not traceback_info_list: + return [["No Functions Found"]] + functions = [] + for test_info in traceback_info_list: + source_file = test_info.get("source_file", "") + tested_function = test_info.get("tested_function", "") + # Proceed if the source file and function name are provided + if source_file and tested_function: + try: + # Read the file contents + with open(source_file, "r") as file: + file_contents = file.read() + # Parse the file contents to find the function definition + tree = ast.parse(file_contents) + for node in ast.walk(tree): + if ( + isinstance(node, ast.FunctionDef) + and node.name == tested_function + ): + # Ensure end_lineno is accessible + if hasattr(node, "end_lineno"): + function_lines = [ + line.strip() + for line in file_contents.splitlines()[ + node.lineno - 1 : node.end_lineno + ] + ] + functions.append(function_lines) + break + except FileNotFoundError: + functions.append([f"File not found: {source_file}"]) + except Exception as e: + functions.append([f"Error: {e}"]) + return functions diff --git a/execexam/main.py 
b/execexam/main.py index c6ba59f..a7a61e6 100644 --- a/execexam/main.py +++ b/execexam/main.py @@ -286,7 +286,7 @@ def run( # noqa: PLR0913, PLR0915 # build the command for running symbex; this tool can # perform static analysis of Python source code and # extract the code of a function inside of a file - command = f"symbex {test_name} -f {failing_test_path}" + command = f'symbex "{test_name}" -f "{failing_test_path}"' # run the symbex command and collect its output process = subprocess.run( command, @@ -335,6 +335,10 @@ def run( # noqa: PLR0913, PLR0915 # litellm module has been loaded in a separate thread litellm_thread.join() debugger.debug(debug, debugger.Debug.stopped_litellm_thread.value) + tracebacks = extract.extract_tracebacks( + json_report_plugin.report, failing_test_code_overall + ) + functions = extract.extract_function_code_from_traceback(tracebacks) # provide advice about how to fix the failing tests # because the non-zero return code indicates that # there was a test failure and that overall there @@ -346,6 +350,8 @@ def run( # noqa: PLR0913, PLR0915 filtered_test_output, exec_exam_test_assertion_details, filtered_test_output + exec_exam_test_assertion_details, + tracebacks, + functions, failing_test_details, failing_test_code_overall, advice_method, diff --git a/pyproject.toml b/pyproject.toml index 11bb85d..15c7771 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,8 +1,8 @@ [tool.poetry] name = "execexam" -version = "0.3.2" +version = "0.3.3" description = "ExecExam runs executable examinations, providing feedback and assistance!" -authors = ["Hemani Alaparthi ","Gregory M. Kapfhammer "] +authors = ["Hemani Alaparthi ","Pallas-Athena Cain ","Gregory M. 
Kapfhammer "] readme = "README.md" [tool.poetry.scripts] diff --git a/tests/test_extract.py b/tests/test_extract.py index 25cdeee..6c60ad0 100644 --- a/tests/test_extract.py +++ b/tests/test_extract.py @@ -1,5 +1,8 @@ """Test cases for the extract.py file.""" +import importlib +import os +import sys from pathlib import Path import pytest @@ -9,12 +12,18 @@ from execexam.extract import ( extract_details, extract_failing_test_details, + extract_function_code_from_traceback, extract_test_assertion_details, extract_test_assertion_details_list, extract_test_assertions_details, extract_test_output, extract_test_output_multiple_labels, extract_test_run_details, + extract_tested_functions, + extract_tracebacks, + find_source_file, + function_exists_in_file, + get_called_functions_from_test, is_failing_test_details_empty, ) @@ -254,3 +263,165 @@ def test_is_failing_test_details_empty_with_empty_string(): result = is_failing_test_details_empty(details) # check the result assert result is False + + +def test_extract_tested_functions_no_calls(): + """Test extract_tested_functions with no function calls.""" + failing_code = "assert 1 == 1" + result = extract_tested_functions(failing_code) + assert ( + result == failing_code + ), "Should return the full code when no functions are called." + + +def test_extract_tested_functions_with_calls(): + """Test extract_tested_functions with multiple function calls.""" + failing_code = "func1()\nfunc2()\nassert test_function()" + result = extract_tested_functions(failing_code) + assert result == { + "func1", + "func2", + }, "Should extract only the non-test/assert functions." 
+ + +def test_get_called_functions_from_test_simple(): + """Test get_called_functions_from_test with a simple test function.""" + module_name = "temp_test_module" + try: + # Write the temporary module file + with open(f"{module_name}.py", "w") as f: + f.write(""" +def test_sample(): + func_a() + func_b() +""") + # Add the current directory to sys.path temporarily + sys.path.insert(0, os.getcwd()) + # Call function and check the result + result = get_called_functions_from_test(f"{module_name}::test_sample") + assert result == ["test_sample", "func_a", "func_b"] + finally: + # Remove the temporary module and reset sys.path + if os.path.exists(f"{module_name}.py"): + os.remove(f"{module_name}.py") + sys.path.pop(0) + # Clear the module from import cache to avoid stale imports in future tests + if module_name in sys.modules: + del sys.modules[module_name] + + +def test_function_exists_in_file_exists(): + """Test function_exists_in_file when the function exists in the file.""" + # Create a temporary Python file to use for testing + with open("temp_module.py", "w") as f: + f.write("def existing_function(): pass") + result = function_exists_in_file("temp_module.py", "existing_function") + assert result, "Should return True when function exists in the file." + os.remove("temp_module.py") + + +def test_function_exists_in_file_not_exists(): + """Test function_exists_in_file when the function does not exist in the file.""" + # Create a temporary Python file to use for testing + with open("temp_module.py", "w") as f: + f.write("def some_other_function(): pass") + result = function_exists_in_file("temp_module.py", "non_existing_function") + assert ( + not result + ), "Should return False when function does not exist in the file." 
+ os.remove("temp_module.py") + + +def test_find_source_file_simple_import(): + """Test find_source_file with a simple import.""" + # Create a test file with an import statement + with open("test_file.py", "w") as f: + f.write("import module_a\n") + with open("module_a.py", "w") as f: + f.write("def test_func(): pass") + result = find_source_file("test_file.py::test_func", "test_func") + assert ( + result == "module_a.py" + ), "Should return the correct source file when found." + os.remove("test_file.py") + os.remove("module_a.py") + + +def test_extract_tracebacks_no_failures(): + """Test extract_tracebacks with no failures in the JSON report.""" + # Create a simple JSON report for testing that passes + json_report = { + "tests": [ + {"outcome": "passed", "nodeid": "test_module.py::test_function"} + ] + } + # Check the results are empty when the report passed + result = extract_tracebacks(json_report, "sample failing code") + assert ( + result == [] + ), "Should return an empty list when no failures are present." 
+ + +def test_extract_tracebacks_with_failures(): + """Test extract_tracebacks with a failure in the JSON report.""" + module_name = "my_tests" + try: + # Create a test file `my_tests.py` with a failing test + with open(f"{module_name}.py", "w") as f: + f.write(""" +def test_sample(): + assert False, "test failed" +""") + # Add the current directory to sys.path temporarily + sys.path.insert(0, os.getcwd()) + importlib.invalidate_caches() # Ensure the new module can be found + # Create a test JSON report with a failure + json_report = { + "tests": [ + { + "outcome": "failed", + "nodeid": f"{module_name}.py::test_sample", + "call": { + "longrepr": "E AssertionError: test failed\nFile 'my_tests.py', line 3" + }, + } + ] + } + # Run the function and check the result + result = extract_tracebacks(json_report, "def func_a(): pass") + assert isinstance( + result, list + ), "The result should be a list of tracebacks" + assert len(result) == 1, "There should be one traceback in the result" + assert ( + result[0]["error_type"] == "AssertionError" + ), "The error_type should be 'AssertionError'" + assert ( + "test failed" in result[0]["full_traceback"] + ), "The traceback should contain 'test failed'" + finally: + # Clean up the temporary module and reset sys.path + if os.path.exists(f"{module_name}.py"): + os.remove(f"{module_name}.py") + sys.path.pop(0) + if module_name in sys.modules: + del sys.modules[module_name] + + +def test_extract_function_code_from_traceback(): + """Test extract_function_code_from_traceback with a sample function.""" + # Create a source file with a sample function + with open("source_file.py", "w") as f: + f.write("""\ +def sample_func(): + return True +""") + # Prepare traceback info list for testing + traceback_info_list = [ + {"source_file": "source_file.py", "tested_function": "sample_func"} + ] + # Extract the function code from the traceback + result = extract_function_code_from_traceback(traceback_info_list) + assert result is not None + 
assert any("sample_func" in line for sublist in result for line in sublist) + os.remove("source_file.py")