
Feature/reward task #53

Open
wants to merge 14 commits into base: main
Binary file modified .coverage
Binary file not shown.
4 changes: 2 additions & 2 deletions README.md
@@ -1,6 +1,6 @@
![promptolution](https://github.com/user-attachments/assets/84c050bd-61a1-4f2e-bc4e-874d9b4a69af)

![Coverage](https://img.shields.io/badge/Coverage-87%25-green)
![Coverage](https://img.shields.io/badge/Coverage-92%25-brightgreen)
[![CI](https://github.com/finitearth/promptolution/actions/workflows/ci.yml/badge.svg?branch=main)](https://github.com/finitearth/promptolution/actions/workflows/ci.yml)
[![Docs](https://github.com/finitearth/promptolution/actions/workflows/docs.yml/badge.svg?branch=main)](https://github.com/finitearth/promptolution/actions/workflows/docs.yml)
![Code Style](https://img.shields.io/badge/Code%20Style-black-black)
Expand Down Expand Up @@ -36,7 +36,7 @@ to install the necessary dependencies. You might need to install [pipx](https://

## Usage

To get started right away, take a look at our [getting started notebook](https://github.com/finitearth/promptolution/blob/main/notebooks/getting_started.ipynb).
To get started right away, take a look at our [getting started notebook](https://github.com/finitearth/promptolution/blob/main/tutorials/getting_started.ipynb) and our [other demos and tutorials](https://github.com/finitearth/promptolution/blob/main/tutorials).
For more details, comprehensive **documentation** with an API reference is available at https://finitearth.github.io/promptolution/.

### Featured Optimizers
@@ -5,7 +5,7 @@

from typing import TYPE_CHECKING

if TYPE_CHECKING:
if TYPE_CHECKING: # pragma: no cover
from promptolution.predictors.base_predictor import BasePredictor
from promptolution.tasks.base_task import BaseTask
from promptolution.utils.config import ExperimentConfig
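Note on the recurring `# pragma: no cover` change: imports guarded by `if TYPE_CHECKING:` are resolved by the type checker only and never execute at runtime, so coverage tools typically report those lines as missed. A minimal sketch of the pattern as applied throughout this PR (the `apply_config` function is illustrative, not taken from the codebase):

```python
from typing import TYPE_CHECKING

if TYPE_CHECKING:  # pragma: no cover
    # Type-only import: never executed at runtime, excluded from the
    # coverage report by the pragma on the guarding if-statement.
    from promptolution.utils.config import ExperimentConfig


def apply_config(config: "ExperimentConfig") -> None:
    """Use the imported name only as a string (forward-reference) annotation."""
    ...
```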
2 changes: 1 addition & 1 deletion promptolution/exemplar_selectors/random_selector.py
@@ -4,7 +4,7 @@

from promptolution.exemplar_selectors.base_exemplar_selector import BaseExemplarSelector

if TYPE_CHECKING:
if TYPE_CHECKING: # pragma: no cover
from promptolution.predictors.base_predictor import BasePredictor
from promptolution.tasks.base_task import BaseTask
from promptolution.utils.config import ExperimentConfig
31 changes: 26 additions & 5 deletions promptolution/helpers.py
@@ -1,9 +1,12 @@
"""Helper functions for the usage of the libary."""


from typing import TYPE_CHECKING, List, Literal
from typing import TYPE_CHECKING, Callable, List, Literal

if TYPE_CHECKING:
from promptolution.tasks.judge_tasks import JudgeTask
from promptolution.tasks.reward_tasks import RewardTask

if TYPE_CHECKING: # pragma: no cover
from promptolution.exemplar_selectors.base_exemplar_selector import BaseExemplarSelector
from promptolution.llms.base_llm import BaseLLM
from promptolution.optimizers.base_optimizer import BaseOptimizer
@@ -75,7 +78,7 @@ def run_optimization(df: pd.DataFrame, config: "ExperimentConfig") -> List[str]:
logger.warning("📌 CAPO requires block evaluation strategy. Setting it to 'sequential_block'.")
config.eval_strategy = "sequential_block"

task = get_task(df, config)
task = get_task(df, config, judge_llm=llm)
optimizer = get_optimizer(
predictor=predictor,
meta_llm=llm,
@@ -103,8 +106,8 @@ def run_evaluation(df: pd.DataFrame, config: "ExperimentConfig", prompts: List[s
Returns:
pd.DataFrame: A DataFrame containing the prompts and their scores.
"""
task = get_task(df, config)
llm = get_llm(config=config)
task = get_task(df, config, judge_llm=llm)
predictor = get_predictor(llm, config=config)
logger.warning("📊 Starting evaluation...")
scores = task.evaluate(prompts, predictor, eval_strategy="full")
@@ -144,7 +147,13 @@ def get_llm(model_id: str = None, config: "ExperimentConfig" = None) -> "BaseLLM
return APILLM(model_id=model_id, config=config)


def get_task(df: pd.DataFrame, config: "ExperimentConfig") -> "BaseTask":
def get_task(
df: pd.DataFrame,
config: "ExperimentConfig",
task_type: Literal["classification", "reward", "judge"] = None,
judge_llm: "BaseLLM" = None,
reward_function: Callable = None,
) -> "BaseTask":
"""Get the task based on the provided DataFrame and configuration.

Supported task types are classification, reward, and judge; classification is the default.
@@ -156,6 +165,18 @@ def get_task(df: pd.DataFrame, config: "ExperimentConfig") -> "BaseTask":
Returns:
BaseTask: An instance of a task class based on the provided DataFrame and configuration.
"""
if task_type is None:
task_type = config.task_type

if task_type == "reward":
return RewardTask(
df=df,
reward_function=reward_function,
config=config,
)
elif task_type == "judge":
return JudgeTask(df, judge_llm=judge_llm, config=config)

return ClassificationTask(df, config=config)


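A hedged sketch of how the extended `get_task` might be called for the two new task types. Only the `get_task` parameters themselves come from the diff above; the DataFrame column name, the `reward_function` signature, the model id, and passing `config=None` are illustrative assumptions:

```python
import pandas as pd

from promptolution.helpers import get_llm, get_task

# Unlabeled inputs; the column name "x" is an assumption, not taken from this PR.
df = pd.DataFrame({"x": ["Summarize: The cat sat on the mat.", "Summarize: It rained all day."]})


def length_reward(outputs):
    # Hypothetical reward: favor shorter generations. The exact callable
    # signature RewardTask expects is not visible in this diff.
    return [1.0 / (1.0 + len(o)) for o in outputs]


# Reward task: scored by the callable, no gold labels required.
reward_task = get_task(df, config=None, task_type="reward", reward_function=length_reward)

# Judge task: a judge LLM grades the predictor's outputs.
judge_llm = get_llm(model_id="meta-llama/Llama-3.1-8B-Instruct")  # illustrative model id
judge_task = get_task(df, config=None, task_type="judge", judge_llm=judge_llm)
```

When `task_type` is omitted, `get_task` falls back to `config.task_type`, so an `ExperimentConfig` can carry the choice instead of the explicit keyword.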
2 changes: 1 addition & 1 deletion promptolution/llms/api_llm.py
@@ -15,7 +15,7 @@

from promptolution.llms.base_llm import BaseLLM

if TYPE_CHECKING:
if TYPE_CHECKING: # pragma: no cover
from promptolution.utils.config import ExperimentConfig

from promptolution.utils.logging import get_logger
2 changes: 1 addition & 1 deletion promptolution/llms/base_llm.py
@@ -5,7 +5,7 @@

from typing import TYPE_CHECKING, List

if TYPE_CHECKING:
if TYPE_CHECKING: # pragma: no cover
from promptolution.utils.config import ExperimentConfig

from promptolution.optimizers.templates import DEFAULT_SYS_PROMPT
2 changes: 1 addition & 1 deletion promptolution/llms/local_llm.py
@@ -9,7 +9,7 @@

from typing import TYPE_CHECKING

if TYPE_CHECKING:
if TYPE_CHECKING: # pragma: no cover
from promptolution.utils.config import ExperimentConfig


2 changes: 1 addition & 1 deletion promptolution/llms/vllm.py
@@ -3,7 +3,7 @@

from typing import TYPE_CHECKING, List

if TYPE_CHECKING:
if TYPE_CHECKING: # pragma: no cover
from promptolution.utils.config import ExperimentConfig


2 changes: 1 addition & 1 deletion promptolution/optimizers/base_optimizer.py
@@ -5,7 +5,7 @@

from typing import TYPE_CHECKING, Callable, List

if TYPE_CHECKING:
if TYPE_CHECKING: # pragma: no cover
from promptolution.tasks.base_task import BaseTask
from promptolution.utils.config import ExperimentConfig

25 changes: 20 additions & 5 deletions promptolution/optimizers/capo.py
@@ -8,7 +8,9 @@

from typing import TYPE_CHECKING, Callable, List, Tuple

if TYPE_CHECKING:
from promptolution.utils.formatting import extract_from_tag

if TYPE_CHECKING: # pragma: no cover
from promptolution.llms.base_llm import BaseLLM
from promptolution.predictors.base_predictor import BasePredictor
from promptolution.tasks.base_task import BaseTask
@@ -83,6 +85,8 @@ def __init__(
test_statistic: "TestStatistics" = "paired_t_test",
alpha: float = 0.2,
length_penalty: float = 0.05,
check_fs_accuracy: bool = True,
create_fs_reasoning: bool = True,
df_few_shots: pd.DataFrame = None,
crossover_template: str = None,
mutation_template: str = None,
@@ -103,6 +107,10 @@ def __init__(
test_statistic (TestStatistics): Statistical test to compare prompt performance. Default is "paired_t_test".
alpha (float): Significance level for the statistical test.
length_penalty (float): Penalty factor for prompt length.
check_fs_accuracy (bool): Whether to check the accuracy of few-shot examples before appending them to the prompt.
In cases such as reward tasks, this can be set to False, as no ground truth is available. Default is True.
create_fs_reasoning (bool): Whether to create reasoning for few-shot examples using the downstream model,
instead of simply using input-output pairs from the few shots DataFrame. Default is True.
df_few_shots (pd.DataFrame): DataFrame containing few-shot examples. If None, will pop 10% of datapoints from task.
crossover_template (str, optional): Template for crossover instructions.
mutation_template (str, optional): Template for mutation instructions.
@@ -124,6 +132,9 @@ def __init__(
self.length_penalty = length_penalty
self.token_counter = get_token_counter(self.downstream_llm)

self.check_fs_accuracy = check_fs_accuracy
self.create_fs_reasoning = create_fs_reasoning

self.scores = np.empty(0)
super().__init__(predictor, task, initial_prompts, callbacks, config)
self.df_few_shots = df_few_shots if df_few_shots is not None else task.pop_datapoints(frac=0.1)
@@ -172,7 +183,11 @@ def _create_few_shot_examples(self, instruction: str, num_examples: int) -> List
)
for i, t in zip(sample_inputs, sample_targets)
]
# Select partition of the examples to generate reasoning from downstream model

if not self.create_fs_reasoning:
# If we do not create reasoning, return the few-shot examples directly
return few_shots

preds, seqs = self.predictor.predict(
[instruction] * num_examples,
sample_inputs,
Expand All @@ -184,7 +199,7 @@ def _create_few_shot_examples(self, instruction: str, num_examples: int) -> List
# Process and clean up the generated sequences
seqs[j] = seqs[j].replace(sample_inputs[j], "").strip()
# Check if the prediction is correct and add reasoning if so
if preds[j] == sample_targets[j]:
if preds[j] == sample_targets[j] or not self.check_fs_accuracy:
few_shots[j] = CAPO_FEWSHOT_TEMPLATE.replace("<input>", sample_inputs[j]).replace("<output>", seqs[j])

return few_shots
@@ -218,7 +233,7 @@ def _crossover(self, parents: List[CAPOPrompt]) -> List[CAPOPrompt]:

offsprings = []
for instruction, examples in zip(child_instructions, offspring_few_shots):
instruction = instruction.split("<prompt>")[-1].split("</prompt>")[0].strip()
instruction = extract_from_tag(instruction, "<prompt>", "</prompt>")
offsprings.append(CAPOPrompt(instruction, examples))

return offsprings
@@ -240,7 +255,7 @@ def _mutate(self, offsprings: List[CAPOPrompt]) -> List[CAPOPrompt]:

mutated = []
for new_instruction, prompt in zip(new_instructions, offsprings):
new_instruction = new_instruction.split("<prompt>")[-1].split("</prompt>")[0].strip()
new_instruction = extract_from_tag(new_instruction, "<prompt>", "</prompt>")
p = random.random()

if p < 1 / 3 and len(prompt.few_shots) < self.upper_shots: # add a random few shot
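One way the two new CAPO flags might be combined for a task without ground-truth labels (e.g., a reward task). This is a sketch only: the optimizer class name and every constructor argument other than `check_fs_accuracy` and `create_fs_reasoning` are assumptions inferred from the signature fragment above, and `predictor`, `task`, and `meta_llm` stand for already-constructed promptolution objects:

```python
from promptolution.optimizers.capo import CAPO  # class name assumed from the module name


def build_label_free_capo(predictor, task, meta_llm, config=None):
    """Sketch: configure CAPO for a task with no gold labels (e.g. a RewardTask)."""
    return CAPO(
        predictor=predictor,        # assumed BasePredictor, e.g. from get_predictor
        task=task,                  # e.g. a RewardTask with no ground-truth targets
        meta_llm=meta_llm,          # assumed BaseLLM used for crossover/mutation
        initial_prompts=["Summarize the following text."],
        check_fs_accuracy=False,    # no gold labels, so skip the correctness filter on few-shots
        create_fs_reasoning=False,  # keep raw input-output pairs instead of model-generated reasoning
        config=config,
    )
```

With both flags left at their default `True`, the code path shown above reduces to the pre-PR behavior: few-shot examples receive model-generated reasoning and are kept only when the prediction matches the target.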
5 changes: 3 additions & 2 deletions promptolution/optimizers/evoprompt_de.py
@@ -6,8 +6,9 @@
from typing import TYPE_CHECKING, List

from promptolution.optimizers.base_optimizer import BaseOptimizer
from promptolution.utils.formatting import extract_from_tag

if TYPE_CHECKING:
if TYPE_CHECKING: # pragma: no cover
from promptolution.llms.base_llm import BaseLLM
from promptolution.predictors.base_predictor import BasePredictor
from promptolution.tasks.base_task import BaseTask
@@ -94,7 +95,7 @@ def _step(self) -> List[str]:
meta_prompts.append(meta_prompt)

child_prompts = self.meta_llm.get_response(meta_prompts)
child_prompts = [prompt.split("<prompt>")[-1].split("</prompt>")[0].strip() for prompt in child_prompts]
child_prompts = extract_from_tag(child_prompts, "<prompt>", "</prompt>")

child_scores = self.task.evaluate(child_prompts, self.predictor, return_agg_scores=True)

5 changes: 3 additions & 2 deletions promptolution/optimizers/evoprompt_ga.py
@@ -7,13 +7,14 @@

from promptolution.optimizers.base_optimizer import BaseOptimizer

if TYPE_CHECKING:
if TYPE_CHECKING: # pragma: no cover
from promptolution.llms.base_llm import BaseLLM
from promptolution.predictors.base_predictor import BasePredictor
from promptolution.tasks.base_task import BaseTask
from promptolution.utils.callbacks import BaseCallback
from promptolution.utils.config import ExperimentConfig

from promptolution.utils.formatting import extract_from_tag
from promptolution.utils.logging import get_logger

logger = get_logger(__name__)
@@ -126,6 +127,6 @@ def _crossover(self, prompts, scores) -> str:
meta_prompts.append(meta_prompt)

child_prompts = self.meta_llm.get_response(meta_prompts)
child_prompts = [prompt.split("<prompt>")[-1].split("</prompt>")[0].strip() for prompt in child_prompts]
child_prompts = extract_from_tag(child_prompts, "<prompt>", "</prompt>")

return child_prompts
5 changes: 3 additions & 2 deletions promptolution/optimizers/opro.py
@@ -7,8 +7,9 @@

from promptolution.optimizers.base_optimizer import BaseOptimizer
from promptolution.optimizers.templates import OPRO_TEMPLATE
from promptolution.utils.formatting import extract_from_tag

if TYPE_CHECKING:
if TYPE_CHECKING: # pragma: no cover
from promptolution.llms.base_llm import BaseLLM
from promptolution.predictors.base_predictor import BasePredictor
from promptolution.tasks.base_task import BaseTask
@@ -119,7 +120,7 @@ def _step(self) -> List[str]:

response = self.meta_llm.get_response([self.meta_prompt])[0]

prompt = response.split("<prompt>")[-1].split("</prompt>")[0].strip()
prompt = extract_from_tag(response, "<prompt>", "</prompt>")

if prompt in self.prompts:
duplicate_prompts += 1
2 changes: 1 addition & 1 deletion promptolution/predictors/base_predictor.py
@@ -7,7 +7,7 @@

from promptolution.llms.base_llm import BaseLLM

if TYPE_CHECKING:
if TYPE_CHECKING: # pragma: no cover
from promptolution.utils.config import ExperimentConfig

import numpy as np
5 changes: 3 additions & 2 deletions promptolution/predictors/classifier.py
@@ -6,8 +6,9 @@
from typing import TYPE_CHECKING, List

from promptolution.predictors.base_predictor import BasePredictor
from promptolution.utils.formatting import extract_from_tag

if TYPE_CHECKING:
if TYPE_CHECKING: # pragma: no cover
from promptolution.utils.config import ExperimentConfig


@@ -124,7 +125,7 @@ def _extract_preds(self, preds: List[str]) -> np.ndarray:
"""
response = []
for pred in preds:
pred = pred.split(self.begin_marker)[-1].split(self.end_marker)[0].strip().lower()
pred = extract_from_tag(pred, self.begin_marker, self.end_marker).lower()
if self.classes is not None and pred not in self.classes:
pred = self.classes[0]

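`extract_from_tag` from `promptolution.utils.formatting` is introduced by this PR, but its implementation is not part of the visible diff. A minimal sketch consistent with the call sites above (single strings in OPRO, CAPO, and the classifier; a list of responses in EvoPrompt), with assumed parameter names:

```python
from typing import List, Union


def extract_from_tag(
    text: Union[str, List[str]], start_tag: str, end_tag: str
) -> Union[str, List[str]]:
    """Return the stripped content between the last start_tag and the following end_tag.

    Mirrors the split-based parsing it replaces; applied element-wise to lists.
    """
    if isinstance(text, list):
        return [extract_from_tag(t, start_tag, end_tag) for t in text]
    return text.split(start_tag)[-1].split(end_tag)[0].strip()
```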