Commit 3c0ea36

add bandit external mode
1 parent bddb070 commit 3c0ea36

File tree: 3 files changed (+70, -3 lines)

README.md

Lines changed: 7 additions & 2 deletions

@@ -56,9 +56,14 @@ We do not apply the importance sampling ratio because the policy changes slowly
 
 ### External Modes
 
-`external.mode` is set to be 'level_feedback' by default. This gives additional information from external to prompts in the following turns; 'level_feedback' attaches test-driven diagnostics, while alternatives include 'expert_edits' (an LLM proposes edits), 'level_passed'/'passed' (binary outcomes), and 'plain' (no signals).
+`external.mode` is set to 'level_feedback' by default. This adds information from the external environment to the prompts in the following turns; 'level_feedback' attaches test-driven diagnostics, while the alternatives are:
 
-Specific settings for 'level_feedback' is `external.sandbox_slice`, which controls how many eval tests to include in the feedback. By default, sandbox executes only the first assert (sandbox_slice=1). Use all eval tests by setting `external.sandbox_slice` to 0, None, or 'all'. Negative values use the last asserts. `external.sandbox_slice` only affects analysis-based modes ('level_feedback', 'level_passed', 'passed'), and it has no effect on 'expert_edits'.
+- `expert_edits`: an LLM proposes edits; prompts include the edit suggestions plus context.
+- `level_passed` / `passed`: binary outcome-oriented prompts with minimal context.
+- `plain`: no diagnostics, but still includes the previous response (unless disabled) and a "Revise ..." instruction.
+- `bandit`: returns the first-turn prompts every turn; it automatically enforces `external.original_prompt=true` and `external.previous_response=false` so that turn 1 and later turns receive the same prompt text.
+
+The setting specific to 'level_feedback' is `external.sandbox_slice`, which controls how many eval tests to include in the feedback. By default, the sandbox executes only the first assert (sandbox_slice=1). Use all eval tests by setting `external.sandbox_slice` to 0, None, or 'all'. Negative values use the last asserts. `external.sandbox_slice` only affects the analysis-based modes ('level_feedback', 'level_passed', 'passed') and has no effect on 'expert_edits' or 'bandit'.
 
 The setting specific to 'expert_edits' is `external.expert_edits_model`, which controls which LLM proposes the edits. By default, it uses DeepSeek-Coder; you can switch to Claude-3 or GPT-4 once the corresponding keys/tokens are set in your environment variables.
 
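As a quick illustration of the `external.sandbox_slice` semantics documented above, here is a minimal sketch; the helper name `select_asserts` and the list-of-asserts input are assumptions made for illustration, not code from this commit:

    # Hypothetical helper mirroring the documented sandbox_slice semantics;
    # select_asserts and its list-of-strings input are illustrative only.
    def select_asserts(asserts, sandbox_slice=1):
        if sandbox_slice in (0, None, "all"):
            return asserts                  # run every eval test
        if sandbox_slice < 0:
            return asserts[sandbox_slice:]  # negative: keep the last asserts
        return asserts[:sandbox_slice]      # default 1: only the first assert

For instance, select_asserts(tests, -1) would feed back only the last assert, while the default keeps only the first.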

external/__init__.py

Lines changed: 34 additions & 1 deletion

@@ -6,6 +6,7 @@
 from . import level_passed
 from . import passed
 from . import plain
+from . import bandit
 import builtins
 
 # Verbose toggle for external previews

@@ -246,7 +247,39 @@ def print(*args, **kwargs):  # type: ignore
         print("=" * 60 + "\n")
         return (aux_prompt, main_prompt) if int(num_agents) > 1 else [main_prompt]
 
-    supported = ["expert_edits", "level_feedback", "level_passed", "passed", "plain"]
+    if mode == "bandit":
+        # Enforce flags: original_prompt=True, previous_response=False
+        original_prompt_flag = True
+        previous_response_flag = False
+        if int(num_agents) == 1:
+            main_comp = agent_completions[0]
+            aux_comp = ""
+        else:
+            aux_comp, main_comp = agent_completions[0], agent_completions[1]
+        ctx = get_context(prompt) or {}
+        entry_point = ctx.get("entry_point", "")
+        test_code = ctx.get("tests_sandbox") or ctx.get("tests_eval", "")
+        aux_prompt, main_prompt = bandit.format_followup_prompts(
+            original_prompt=prompt,
+            aux_completion=aux_comp,
+            main_completion=main_comp,
+            test_code=test_code,
+            entry_point=entry_point,
+            original_prompt_flag=original_prompt_flag,
+            previous_response_flag=previous_response_flag,
+            num_agent=int(num_agents),
+        )
+        print("\n" + "=" * 60)
+        print("EXTERNAL MODE PREVIEW: bandit")
+        print("-" * 60)
+        if int(num_agents) > 1:
+            print("AUX PROMPT:\n" + aux_prompt)
+            print("-" * 60)
+        print("MAIN PROMPT:\n" + main_prompt)
+        print("=" * 60 + "\n")
+        return (aux_prompt, main_prompt) if int(num_agents) > 1 else [main_prompt]
+
+    supported = ["expert_edits", "level_feedback", "level_passed", "passed", "plain", "bandit"]
     raise NotImplementedError(
         f"External transition mode '{mode}' is not implemented yet. Supported: {', '.join(supported)}"
     )
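For readers skimming the diff: the bandit branch mirrors the return convention the other modes use, a (aux_prompt, main_prompt) tuple when num_agents > 1 and a one-element list otherwise. A minimal sketch of that convention, with the prompt construction stubbed out (build_prompts is a stand-in, not a function in this repo):

    # Illustrative stub of the dispatcher's return-shape convention;
    # build_prompts is a placeholder for the per-mode prompt builder.
    def dispatch_shape(num_agents, build_prompts):
        aux_prompt, main_prompt = build_prompts()
        if int(num_agents) > 1:
            return (aux_prompt, main_prompt)  # multi-agent: tuple of both prompts
        return [main_prompt]                  # single-agent: one-element list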

external/bandit.py

Lines changed: 29 additions & 0 deletions

@@ -0,0 +1,29 @@
+from typing import Tuple
+
+from .common import build_first_turn_prompts
+
+
+def format_followup_prompts(
+    original_prompt: str,
+    aux_completion: str,
+    main_completion: str,
+    test_code: str,
+    entry_point: str,
+    original_prompt_flag: bool = True,
+    previous_response_flag: bool = False,
+    num_agent: int = 2,
+) -> Tuple[str, str]:
+    """
+    Bandit mode: make follow-up prompts identical to the canonical first-turn
+    prompts. No analysis and no "Revise ..." instructions. Ignores completions.
+
+    Returns (aux_prompt, main_prompt). For single-agent, aux will be an empty string
+    and the caller should use only the main prompt.
+    """
+    # Build the canonical first-turn prompts (context + instructions)
+    aux_base, main_base = build_first_turn_prompts(original_prompt, entry_point)
+    if int(num_agent) == 1:
+        # For single-agent, aux prompt is unused; return an empty aux and main_base
+        return "", main_base
+    return aux_base, main_base
+
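A short usage sketch of the new module, assuming the package is importable as external and that build_first_turn_prompts exists in external/common.py as the diff implies; the task, completion, and test strings below are made up:

    from external import bandit

    # Completions and tests are accepted but ignored in bandit mode.
    aux, main = bandit.format_followup_prompts(
        original_prompt="Write a function add(a, b) that returns their sum.",
        aux_completion="",                                    # ignored
        main_completion="def add(a, b):\n    return a - b",   # ignored, even if wrong
        test_code="assert add(1, 2) == 3",                    # ignored
        entry_point="add",
        num_agent=2,
    )
    # aux and main are exactly the canonical first-turn prompts, so turn 2+
    # receives the same prompt text as turn 1.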
