Skip to content

Commit 81cb0b6

Browse files
committed
revert the last commit (dummy)
1 parent 5f8f1a3 commit 81cb0b6

File tree

9 files changed

+8
-71
lines changed

9 files changed

+8
-71
lines changed

README.md

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -61,9 +61,8 @@ We do not apply the importance sampling ratio because the policy changes slowly
6161
- `expert_edits`: an LLM proposes edits; prompts include edit suggestions plus context.
6262
- `level_passed` / `passed`: binary outcome oriented prompts with minimal context.
6363
- `plain`: no diagnostics, but still includes previous response (unless disabled) and a "Revise ..." instruction.
64-
- `bandit`: returns the first‑turn prompts every turn, which overrides `external.original_prompt=true` and `external.previous_response=false` automatically so that turn 1 and later turns receive the same prompt text.
6564

66-
Specific settings for 'level_feedback' is `external.sandbox_slice`, which controls how many eval tests to include in the feedback. By default, sandbox executes only the first assert (sandbox_slice=1). Use all eval tests by setting `external.sandbox_slice` to 0, None, or 'all'. Negative values use the last asserts. `external.sandbox_slice` only affects analysis-based modes ('level_feedback', 'level_passed', 'passed'), and it has no effect on 'expert_edits' or 'bandit'.
65+
The mode-specific setting for 'level_feedback' is `external.sandbox_slice`, which controls how many eval tests to include in the feedback. By default, the sandbox executes only the first assert (sandbox_slice=1). To use all eval tests, set `external.sandbox_slice` to 0, None, or 'all'. Negative values use the last asserts. `external.sandbox_slice` only affects analysis-based modes ('level_feedback', 'level_passed', 'passed'); it has no effect on 'expert_edits'.
6766

6867
The mode-specific setting for 'expert_edits' is `external.expert_edits_model`, which controls which LLM is used to propose edits. By default, it uses DeepSeek-Coder. You can also change it to Claude-3 or GPT-4 once you have the required keys/tokens in your global environment variables.
6968

configs/grpo_che_config.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -36,7 +36,7 @@ grpo:
3636
num_turns: 2
3737
num_train_epochs: 8
3838
per_device_train_batch_size: 1
39-
learning_rate: 3.0e-5
39+
learning_rate: 2.0e-5
4040
logging_steps: 50
4141
save_steps: 200
4242
num_generations: 4

configs/grpo_he_config.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -36,7 +36,7 @@ grpo:
3636
num_turns: 2
3737
num_train_epochs: 6
3838
per_device_train_batch_size: 1
39-
learning_rate: 3.0e-5
39+
learning_rate: 2.0e-5
4040
logging_steps: 50
4141
save_steps: 200
4242
num_generations: 4

configs/magrpo_che_config.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -36,7 +36,7 @@ magrpo:
3636
num_turns: 2
3737
num_train_epochs: 8
3838
per_device_train_batch_size: 1
39-
learning_rate: 3.0e-5
39+
learning_rate: 2.0e-5
4040
logging_steps: 50
4141
save_steps: 200
4242
num_generations: 4

configs/magrpo_he_config.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -36,7 +36,7 @@ magrpo:
3636
num_turns: 2
3737
num_train_epochs: 6
3838
per_device_train_batch_size: 1
39-
learning_rate: 3.0e-5
39+
learning_rate: 2.0e-5
4040
logging_steps: 50
4141
save_steps: 200
4242
num_generations: 4

external/__init__.py

Lines changed: 1 addition & 34 deletions
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,6 @@
66
from . import level_passed
77
from . import passed
88
from . import plain
9-
from . import bandit
109
import builtins
1110

1211
# Verbose toggle for external previews
@@ -247,39 +246,7 @@ def print(*args, **kwargs): # type: ignore
247246
print("=" * 60 + "\n")
248247
return (aux_prompt, main_prompt) if int(num_agents) > 1 else [main_prompt]
249248

250-
if mode == "bandit":
251-
# Enforce flags: original_prompt=True, previous_response=False
252-
original_prompt_flag = True
253-
previous_response_flag = False
254-
if int(num_agents) == 1:
255-
main_comp = agent_completions[0]
256-
aux_comp = ""
257-
else:
258-
aux_comp, main_comp = agent_completions[0], agent_completions[1]
259-
ctx = get_context(prompt) or {}
260-
entry_point = ctx.get("entry_point", "")
261-
test_code = ctx.get("tests_sandbox") or ctx.get("tests_eval", "")
262-
aux_prompt, main_prompt = bandit.format_followup_prompts(
263-
original_prompt=prompt,
264-
aux_completion=aux_comp,
265-
main_completion=main_comp,
266-
test_code=test_code,
267-
entry_point=entry_point,
268-
original_prompt_flag=original_prompt_flag,
269-
previous_response_flag=previous_response_flag,
270-
num_agent=int(num_agents),
271-
)
272-
print("\n" + "=" * 60)
273-
print("EXTERNAL MODE PREVIEW: bandit")
274-
print("-" * 60)
275-
if int(num_agents) > 1:
276-
print("AUX PROMPT:\n" + aux_prompt)
277-
print("-" * 60)
278-
print("MAIN PROMPT:\n" + main_prompt)
279-
print("=" * 60 + "\n")
280-
return (aux_prompt, main_prompt) if int(num_agents) > 1 else [main_prompt]
281-
282-
supported = ["expert_edits", "level_feedback", "level_passed", "passed", "plain", "bandit"]
249+
supported = ["expert_edits", "level_feedback", "level_passed", "passed", "plain"]
283250
raise NotImplementedError(
284251
f"External transition mode '{mode}' is not implemented yet. Supported: {', '.join(supported)}"
285252
)

external/bandit.py

Lines changed: 0 additions & 29 deletions
This file was deleted.

train_grpo.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -344,7 +344,7 @@ def _resolver(prompt: str):
344344
output_dir=output_dir,
345345
num_train_epochs=grpo_config.get("num_train_epochs", 10),
346346
per_device_train_batch_size=grpo_config.get("per_device_train_batch_size", 1),
347-
learning_rate=grpo_config.get("learning_rate", 3e-5),
347+
learning_rate=grpo_config.get("learning_rate", 2e-5),
348348
logging_steps=grpo_config.get("logging_steps", 50),
349349
save_steps=grpo_config.get("save_steps", 200),
350350
num_generations=grpo_config.get("num_generations", 4),

train_magrpo.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -396,7 +396,7 @@ def _resolver(prompt: str):
396396
"num_train_epochs", 10 if not is_multi_turn else 7
397397
),
398398
per_device_train_batch_size=magrpo_config.get("per_device_train_batch_size", 1),
399-
learning_rate=magrpo_config.get("learning_rate", 3e-5),
399+
learning_rate=magrpo_config.get("learning_rate", 2e-5),
400400
logging_steps=magrpo_config.get("logging_steps", 50),
401401
save_steps=magrpo_config.get("save_steps", 200),
402402
num_generations=magrpo_config.get("num_generations", 4),

0 commit comments

Comments
 (0)