revert the last commit (dummy)

LovelyBuggies · LovelyBuggies · commit 81cb0b6fddbc · 2025-10-03T21:05:13.000-04:00
diff --git a/README.md b/README.md
@@ -61,9 +61,8 @@ We do not apply the importance sampling ratio because the policy changes slowly
 - `expert_edits`: an LLM proposes edits; prompts include edit suggestions plus context.
 - `level_passed` / `passed`: binary outcome oriented prompts with minimal context.
 - `plain`: no diagnostics, but still includes previous response (unless disabled) and a "Revise ..." instruction.
-- `bandit`: returns the first‑turn prompts every turn, which overrides `external.original_prompt=true` and `external.previous_response=false` automatically so that turn 1 and later turns receive the same prompt text.
 
-Specific settings for 'level_feedback' is `external.sandbox_slice`, which controls how many eval tests to include in the feedback. By default, sandbox executes only the first assert (sandbox_slice=1). Use all eval tests by setting `external.sandbox_slice` to 0, None, or 'all'. Negative values use the last asserts. `external.sandbox_slice` only affects analysis-based modes ('level_feedback', 'level_passed', 'passed'), and it has no effect on 'expert_edits' or 'bandit'.
+The setting specific to 'level_feedback' is `external.sandbox_slice`, which controls how many eval tests are included in the feedback. By default, the sandbox executes only the first assert (sandbox_slice=1). To use all eval tests, set `external.sandbox_slice` to 0, None, or 'all'. Negative values use the last asserts. `external.sandbox_slice` only affects the analysis-based modes ('level_feedback', 'level_passed', 'passed'); it has no effect on 'expert_edits'.
 
 Specific settings for 'expert_edits' is `external.expert_edits_model`, which controls which LLM to use for proposing edits. By default, it uses DeepSeek-Coder. You can also change it to Claude-3, GPT-4, once you have keys/tokens in your global environment variables.
 
diff --git a/configs/grpo_che_config.yaml b/configs/grpo_che_config.yaml
@@ -36,7 +36,7 @@ grpo:
   num_turns: 2
   num_train_epochs: 8
   per_device_train_batch_size: 1
-  learning_rate: 3.0e-5
+  learning_rate: 2.0e-5
   logging_steps: 50
   save_steps: 200
   num_generations: 4
diff --git a/configs/grpo_he_config.yaml b/configs/grpo_he_config.yaml
@@ -36,7 +36,7 @@ grpo:
   num_turns: 2
   num_train_epochs: 6
   per_device_train_batch_size: 1
-  learning_rate: 3.0e-5
+  learning_rate: 2.0e-5
   logging_steps: 50
   save_steps: 200
   num_generations: 4
diff --git a/configs/magrpo_che_config.yaml b/configs/magrpo_che_config.yaml
@@ -36,7 +36,7 @@ magrpo:
   num_turns: 2
   num_train_epochs: 8
   per_device_train_batch_size: 1
-  learning_rate: 3.0e-5
+  learning_rate: 2.0e-5
   logging_steps: 50
   save_steps: 200
   num_generations: 4
diff --git a/configs/magrpo_he_config.yaml b/configs/magrpo_he_config.yaml
@@ -36,7 +36,7 @@ magrpo:
   num_turns: 2
   num_train_epochs: 6
   per_device_train_batch_size: 1
-  learning_rate: 3.0e-5
+  learning_rate: 2.0e-5
   logging_steps: 50
   save_steps: 200
   num_generations: 4
diff --git a/external/__init__.py b/external/__init__.py
@@ -6,7 +6,6 @@
 from . import level_passed
 from . import passed
 from . import plain
-from . import bandit
 import builtins
 
 # Verbose toggle for external previews
@@ -247,39 +246,7 @@ def print(*args, **kwargs):  # type: ignore
         print("=" * 60 + "\n")
         return (aux_prompt, main_prompt) if int(num_agents) > 1 else [main_prompt]
 
-    if mode == "bandit":
-        # Enforce flags: original_prompt=True, previous_response=False
-        original_prompt_flag = True
-        previous_response_flag = False
-        if int(num_agents) == 1:
-            main_comp = agent_completions[0]
-            aux_comp = ""
-        else:
-            aux_comp, main_comp = agent_completions[0], agent_completions[1]
-        ctx = get_context(prompt) or {}
-        entry_point = ctx.get("entry_point", "")
-        test_code = ctx.get("tests_sandbox") or ctx.get("tests_eval", "")
-        aux_prompt, main_prompt = bandit.format_followup_prompts(
-            original_prompt=prompt,
-            aux_completion=aux_comp,
-            main_completion=main_comp,
-            test_code=test_code,
-            entry_point=entry_point,
-            original_prompt_flag=original_prompt_flag,
-            previous_response_flag=previous_response_flag,
-            num_agent=int(num_agents),
-        )
-        print("\n" + "=" * 60)
-        print("EXTERNAL MODE PREVIEW: bandit")
-        print("-" * 60)
-        if int(num_agents) > 1:
-            print("AUX PROMPT:\n" + aux_prompt)
-            print("-" * 60)
-        print("MAIN PROMPT:\n" + main_prompt)
-        print("=" * 60 + "\n")
-        return (aux_prompt, main_prompt) if int(num_agents) > 1 else [main_prompt]
-
-    supported = ["expert_edits", "level_feedback", "level_passed", "passed", "plain", "bandit"]
+    supported = ["expert_edits", "level_feedback", "level_passed", "passed", "plain"]
     raise NotImplementedError(
         f"External transition mode '{mode}' is not implemented yet. Supported: {', '.join(supported)}"
     )
diff --git a/external/bandit.py b/external/bandit.py
diff --git a/train_grpo.py b/train_grpo.py
@@ -344,7 +344,7 @@ def _resolver(prompt: str):
         output_dir=output_dir,
         num_train_epochs=grpo_config.get("num_train_epochs", 10),
         per_device_train_batch_size=grpo_config.get("per_device_train_batch_size", 1),
-        learning_rate=grpo_config.get("learning_rate", 3e-5),
+        learning_rate=grpo_config.get("learning_rate", 2e-5),
         logging_steps=grpo_config.get("logging_steps", 50),
         save_steps=grpo_config.get("save_steps", 200),
         num_generations=grpo_config.get("num_generations", 4),
diff --git a/train_magrpo.py b/train_magrpo.py
@@ -396,7 +396,7 @@ def _resolver(prompt: str):
             "num_train_epochs", 10 if not is_multi_turn else 7
         ),
         per_device_train_batch_size=magrpo_config.get("per_device_train_batch_size", 1),
-        learning_rate=magrpo_config.get("learning_rate", 3e-5),
+        learning_rate=magrpo_config.get("learning_rate", 2e-5),
         logging_steps=magrpo_config.get("logging_steps", 50),
         save_steps=magrpo_config.get("save_steps", 200),
         num_generations=magrpo_config.get("num_generations", 4),