
Feature/reward task #53

Open
wants to merge 14 commits into base: main
Binary file modified .coverage
Binary file not shown.
4 changes: 2 additions & 2 deletions README.md
@@ -1,6 +1,6 @@
![promptolution](https://github.com/user-attachments/assets/84c050bd-61a1-4f2e-bc4e-874d9b4a69af)

![Coverage](https://img.shields.io/badge/Coverage-87%25-green)
![Coverage](https://img.shields.io/badge/Coverage-92%25-brightgreen)
[![CI](https://github.com/finitearth/promptolution/actions/workflows/ci.yml/badge.svg?branch=main)](https://github.com/finitearth/promptolution/actions/workflows/ci.yml)
[![Docs](https://github.com/finitearth/promptolution/actions/workflows/docs.yml/badge.svg?branch=main)](https://github.com/finitearth/promptolution/actions/workflows/docs.yml)
![Code Style](https://img.shields.io/badge/Code%20Style-black-black)
Expand Down Expand Up @@ -36,7 +36,7 @@ to install the necessary dependencies. You might need to install [pipx](https://

## Usage

To get started right away, take a look at our [getting started notebook](https://github.com/finitearth/promptolution/blob/main/notebooks/getting_started.ipynb).
To get started right away, take a look at our [getting started notebook](https://github.com/finitearth/promptolution/blob/main/tutorials/getting_started.ipynb) and our [other demos and tutorials](https://github.com/finitearth/promptolution/blob/main/tutorials).
For more details, comprehensive **documentation** with an API reference is available at https://finitearth.github.io/promptolution/.

### Featured Optimizers
@@ -5,7 +5,7 @@

from typing import TYPE_CHECKING

if TYPE_CHECKING:
if TYPE_CHECKING: # pragma: no cover
from promptolution.predictors.base_predictor import BasePredictor
from promptolution.tasks.base_task import BaseTask
from promptolution.utils.config import ExperimentConfig
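Note on the recurring `# pragma: no cover` change: imports guarded by `if TYPE_CHECKING:` are resolved by the type checker only and never execute at runtime, so coverage tools typically report those lines as missed. A minimal sketch of the pattern as applied throughout this PR (the `apply_config` function is illustrative, not taken from the codebase):

```python
from typing import TYPE_CHECKING

if TYPE_CHECKING:  # pragma: no cover
    # Type-only import: never executed at runtime, excluded from the
    # coverage report by the pragma on the guarding if-statement.
    from promptolution.utils.config import ExperimentConfig


def apply_config(config: "ExperimentConfig") -> None:
    """Use the imported name only as a string (forward-reference) annotation."""
    ...
```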
2 changes: 1 addition & 1 deletion promptolution/exemplar_selectors/random_selector.py
@@ -4,7 +4,7 @@

from promptolution.exemplar_selectors.base_exemplar_selector import BaseExemplarSelector

if TYPE_CHECKING:
if TYPE_CHECKING: # pragma: no cover
from promptolution.predictors.base_predictor import BasePredictor
from promptolution.tasks.base_task import BaseTask
from promptolution.utils.config import ExperimentConfig
31 changes: 26 additions & 5 deletions promptolution/helpers.py
@@ -1,9 +1,12 @@
"""Helper functions for the usage of the libary."""


from typing import TYPE_CHECKING, List, Literal
from typing import TYPE_CHECKING, Callable, List, Literal

if TYPE_CHECKING:
from promptolution.tasks.judge_tasks import JudgeTask
from promptolution.tasks.reward_tasks import RewardTask

if TYPE_CHECKING: # pragma: no cover
from promptolution.exemplar_selectors.base_exemplar_selector import BaseExemplarSelector
from promptolution.llms.base_llm import BaseLLM
from promptolution.optimizers.base_optimizer import BaseOptimizer
@@ -75,7 +78,7 @@ def run_optimization(df: pd.DataFrame, config: "ExperimentConfig") -> List[str]:
logger.warning("📌 CAPO requires block evaluation strategy. Setting it to 'sequential_block'.")
config.eval_strategy = "sequential_block"

task = get_task(df, config)
task = get_task(df, config, judge_llm=llm)
optimizer = get_optimizer(
predictor=predictor,
meta_llm=llm,
@@ -103,8 +106,8 @@ def run_evaluation(df: pd.DataFrame, config: "ExperimentConfig", prompts: List[s
Returns:
pd.DataFrame: A DataFrame containing the prompts and their scores.
"""
task = get_task(df, config)
llm = get_llm(config=config)
task = get_task(df, config, judge_llm=llm)
predictor = get_predictor(llm, config=config)
logger.warning("📊 Starting evaluation...")
scores = task.evaluate(prompts, predictor, eval_strategy="full")
@@ -144,7 +147,13 @@ def get_llm(model_id: str = None, config: "ExperimentConfig" = None) -> "BaseLLM
return APILLM(model_id=model_id, config=config)


def get_task(df: pd.DataFrame, config: "ExperimentConfig") -> "BaseTask":
def get_task(
df: pd.DataFrame,
config: "ExperimentConfig",
task_type: Literal["classification", "reward", "judge"] = None,
judge_llm: "BaseLLM" = None,
reward_function: Callable = None,
) -> "BaseTask":
"""Get the task based on the provided DataFrame and configuration.

Supported task types are classification, reward, and judge; classification is the default.
@@ -156,6 +165,18 @@ def get_task(df: pd.DataFrame, config: "ExperimentConfig") -> "BaseTask":
Returns:
BaseTask: An instance of a task class based on the provided DataFrame and configuration.
"""
if task_type is None:
task_type = config.task_type

if task_type == "reward":
return RewardTask(
df=df,
reward_function=reward_function,
config=config,
)
elif task_type == "judge":
return JudgeTask(df, judge_llm=judge_llm, config=config)

return ClassificationTask(df, config=config)


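A hedged sketch of how the extended `get_task` might be called for the two new task types. Only the `get_task` parameters themselves come from the diff above; the DataFrame column name, the `reward_function` signature, the model id, and passing `config=None` are illustrative assumptions:

```python
import pandas as pd

from promptolution.helpers import get_llm, get_task

# Unlabeled inputs; the column name "x" is an assumption, not taken from this PR.
df = pd.DataFrame({"x": ["Summarize: The cat sat on the mat.", "Summarize: It rained all day."]})


def length_reward(outputs):
    # Hypothetical reward: favor shorter generations. The exact callable
    # signature RewardTask expects is not visible in this diff.
    return [1.0 / (1.0 + len(o)) for o in outputs]


# Reward task: scored by the callable, no gold labels required.
reward_task = get_task(df, config=None, task_type="reward", reward_function=length_reward)

# Judge task: a judge LLM grades the predictor's outputs.
judge_llm = get_llm(model_id="meta-llama/Llama-3.1-8B-Instruct")  # illustrative model id
judge_task = get_task(df, config=None, task_type="judge", judge_llm=judge_llm)
```

When `task_type` is omitted, `get_task` falls back to `config.task_type`, so an `ExperimentConfig` can carry the choice instead of the explicit keyword.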
2 changes: 1 addition & 1 deletion promptolution/llms/api_llm.py
@@ -15,7 +15,7 @@

from promptolution.llms.base_llm import BaseLLM

if TYPE_CHECKING:
if TYPE_CHECKING: # pragma: no cover
from promptolution.utils.config import ExperimentConfig

from promptolution.utils.logging import get_logger
2 changes: 1 addition & 1 deletion promptolution/llms/base_llm.py
@@ -5,7 +5,7 @@

from typing import TYPE_CHECKING, List

if TYPE_CHECKING:
if TYPE_CHECKING: # pragma: no cover
from promptolution.utils.config import ExperimentConfig

from promptolution.optimizers.templates import DEFAULT_SYS_PROMPT
2 changes: 1 addition & 1 deletion promptolution/llms/local_llm.py
@@ -9,7 +9,7 @@

from typing import TYPE_CHECKING

if TYPE_CHECKING:
if TYPE_CHECKING: # pragma: no cover
from promptolution.utils.config import ExperimentConfig


2 changes: 1 addition & 1 deletion promptolution/llms/vllm.py
@@ -3,7 +3,7 @@

from typing import TYPE_CHECKING, List

if TYPE_CHECKING:
if TYPE_CHECKING: # pragma: no cover
from promptolution.utils.config import ExperimentConfig


2 changes: 1 addition & 1 deletion promptolution/optimizers/base_optimizer.py
@@ -5,7 +5,7 @@

from typing import TYPE_CHECKING, Callable, List

if TYPE_CHECKING:
if TYPE_CHECKING: # pragma: no cover
from promptolution.tasks.base_task import BaseTask
from promptolution.utils.config import ExperimentConfig

25 changes: 20 additions & 5 deletions promptolution/optimizers/capo.py
@@ -8,7 +8,9 @@

from typing import TYPE_CHECKING, Callable, List, Tuple

if TYPE_CHECKING:
from promptolution.utils.formatting import extract_from_tag

if TYPE_CHECKING: # pragma: no cover
from promptolution.llms.base_llm import BaseLLM
from promptolution.predictors.base_predictor import BasePredictor
from promptolution.tasks.base_task import BaseTask
@@ -83,6 +85,8 @@ def __init__(
test_statistic: "TestStatistics" = "paired_t_test",
alpha: float = 0.2,
length_penalty: float = 0.05,
check_fs_accuracy: bool = True,
create_fs_reasoning: bool = True,
df_few_shots: pd.DataFrame = None,
crossover_template: str = None,
mutation_template: str = None,
@@ -103,6 +107,10 @@ def __init__(
test_statistic (TestStatistics): Statistical test to compare prompt performance. Default is "paired_t_test".
alpha (float): Significance level for the statistical test.
length_penalty (float): Penalty factor for prompt length.
check_fs_accuracy (bool): Whether to check the accuracy of few-shot examples before appending them to the prompt.
In cases such as reward tasks, this can be set to False, as no ground truth is available. Default is True.
create_fs_reasoning (bool): Whether to create reasoning for few-shot examples using the downstream model,
instead of simply using input-output pairs from the few shots DataFrame. Default is True.
df_few_shots (pd.DataFrame): DataFrame containing few-shot examples. If None, will pop 10% of datapoints from task.
crossover_template (str, optional): Template for crossover instructions.
mutation_template (str, optional): Template for mutation instructions.
@@ -124,6 +132,9 @@ def __init__(
self.length_penalty = length_penalty
self.token_counter = get_token_counter(self.downstream_llm)

self.check_fs_accuracy = check_fs_accuracy
self.create_fs_reasoning = create_fs_reasoning

self.scores = np.empty(0)
super().__init__(predictor, task, initial_prompts, callbacks, config)
self.df_few_shots = df_few_shots if df_few_shots is not None else task.pop_datapoints(frac=0.1)
@@ -172,7 +183,11 @@ def _create_few_shot_examples(self, instruction: str, num_examples: int) -> List
)
for i, t in zip(sample_inputs, sample_targets)
]
# Select partition of the examples to generate reasoning from downstream model

if not self.create_fs_reasoning:
# If we do not create reasoning, return the few-shot examples directly
return few_shots

preds, seqs = self.predictor.predict(
[instruction] * num_examples,
sample_inputs,
Expand All @@ -184,7 +199,7 @@ def _create_few_shot_examples(self, instruction: str, num_examples: int) -> List
# Process and clean up the generated sequences
seqs[j] = seqs[j].replace(sample_inputs[j], "").strip()
# Check if the prediction is correct and add reasoning if so
if preds[j] == sample_targets[j]:
if preds[j] == sample_targets[j] or not self.check_fs_accuracy:
few_shots[j] = CAPO_FEWSHOT_TEMPLATE.replace("<input>", sample_inputs[j]).replace("<output>", seqs[j])

return few_shots
@@ -218,7 +233,7 @@ def _crossover(self, parents: List[CAPOPrompt]) -> List[CAPOPrompt]:

offsprings = []
for instruction, examples in zip(child_instructions, offspring_few_shots):
instruction = instruction.split("<prompt>")[-1].split("</prompt>")[0].strip()
instruction = extract_from_tag(instruction, "<prompt>", "</prompt>")
offsprings.append(CAPOPrompt(instruction, examples))

return offsprings
@@ -240,7 +255,7 @@ def _mutate(self, offsprings: List[CAPOPrompt]) -> List[CAPOPrompt]:

mutated = []
for new_instruction, prompt in zip(new_instructions, offsprings):
new_instruction = new_instruction.split("<prompt>")[-1].split("</prompt>")[0].strip()
new_instruction = extract_from_tag(new_instruction, "<prompt>", "</prompt>")
p = random.random()

if p < 1 / 3 and len(prompt.few_shots) < self.upper_shots: # add a random few shot
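One way the two new CAPO flags might be combined for a task without ground-truth labels (e.g., a reward task). This is a sketch only: the optimizer class name and every constructor argument other than `check_fs_accuracy` and `create_fs_reasoning` are assumptions inferred from the signature fragment above, and `predictor`, `task`, and `meta_llm` stand for already-constructed promptolution objects:

```python
from promptolution.optimizers.capo import CAPO  # class name assumed from the module name


def build_label_free_capo(predictor, task, meta_llm, config=None):
    """Sketch: configure CAPO for a task with no gold labels (e.g. a RewardTask)."""
    return CAPO(
        predictor=predictor,        # assumed BasePredictor, e.g. from get_predictor
        task=task,                  # e.g. a RewardTask with no ground-truth targets
        meta_llm=meta_llm,          # assumed BaseLLM used for crossover/mutation
        initial_prompts=["Summarize the following text."],
        check_fs_accuracy=False,    # no gold labels, so skip the correctness filter on few-shots
        create_fs_reasoning=False,  # keep raw input-output pairs instead of model-generated reasoning
        config=config,
    )
```

With both flags left at their default `True`, the code path shown above reduces to the pre-PR behavior: few-shot examples receive model-generated reasoning and are kept only when the prediction matches the target.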
5 changes: 3 additions & 2 deletions promptolution/optimizers/evoprompt_de.py
@@ -6,8 +6,9 @@
from typing import TYPE_CHECKING, List

from promptolution.optimizers.base_optimizer import BaseOptimizer
from promptolution.utils.formatting import extract_from_tag

if TYPE_CHECKING:
if TYPE_CHECKING: # pragma: no cover
from promptolution.llms.base_llm import BaseLLM
from promptolution.predictors.base_predictor import BasePredictor
from promptolution.tasks.base_task import BaseTask
@@ -94,7 +95,7 @@ def _step(self) -> List[str]:
meta_prompts.append(meta_prompt)

child_prompts = self.meta_llm.get_response(meta_prompts)
child_prompts = [prompt.split("<prompt>")[-1].split("</prompt>")[0].strip() for prompt in child_prompts]
child_prompts = extract_from_tag(child_prompts, "<prompt>", "</prompt>")

child_scores = self.task.evaluate(child_prompts, self.predictor, return_agg_scores=True)

5 changes: 3 additions & 2 deletions promptolution/optimizers/evoprompt_ga.py
@@ -7,13 +7,14 @@

from promptolution.optimizers.base_optimizer import BaseOptimizer

if TYPE_CHECKING:
if TYPE_CHECKING: # pragma: no cover
from promptolution.llms.base_llm import BaseLLM
from promptolution.predictors.base_predictor import BasePredictor
from promptolution.tasks.base_task import BaseTask
from promptolution.utils.callbacks import BaseCallback
from promptolution.utils.config import ExperimentConfig

from promptolution.utils.formatting import extract_from_tag
from promptolution.utils.logging import get_logger

logger = get_logger(__name__)
@@ -126,6 +127,6 @@ def _crossover(self, prompts, scores) -> str:
meta_prompts.append(meta_prompt)

child_prompts = self.meta_llm.get_response(meta_prompts)
child_prompts = [prompt.split("<prompt>")[-1].split("</prompt>")[0].strip() for prompt in child_prompts]
child_prompts = extract_from_tag(child_prompts, "<prompt>", "</prompt>")

return child_prompts
5 changes: 3 additions & 2 deletions promptolution/optimizers/opro.py
@@ -7,8 +7,9 @@

from promptolution.optimizers.base_optimizer import BaseOptimizer
from promptolution.optimizers.templates import OPRO_TEMPLATE
from promptolution.utils.formatting import extract_from_tag

if TYPE_CHECKING:
if TYPE_CHECKING: # pragma: no cover
from promptolution.llms.base_llm import BaseLLM
from promptolution.predictors.base_predictor import BasePredictor
from promptolution.tasks.base_task import BaseTask
@@ -119,7 +120,7 @@ def _step(self) -> List[str]:

response = self.meta_llm.get_response([self.meta_prompt])[0]

prompt = response.split("<prompt>")[-1].split("</prompt>")[0].strip()
prompt = extract_from_tag(response, "<prompt>", "</prompt>")

if prompt in self.prompts:
duplicate_prompts += 1
2 changes: 1 addition & 1 deletion promptolution/predictors/base_predictor.py
@@ -7,7 +7,7 @@

from promptolution.llms.base_llm import BaseLLM

if TYPE_CHECKING:
if TYPE_CHECKING: # pragma: no cover
from promptolution.utils.config import ExperimentConfig

import numpy as np
5 changes: 3 additions & 2 deletions promptolution/predictors/classifier.py
@@ -6,8 +6,9 @@
from typing import TYPE_CHECKING, List

from promptolution.predictors.base_predictor import BasePredictor
from promptolution.utils.formatting import extract_from_tag

if TYPE_CHECKING:
if TYPE_CHECKING: # pragma: no cover
from promptolution.utils.config import ExperimentConfig


@@ -124,7 +125,7 @@ def _extract_preds(self, preds: List[str]) -> np.ndarray:
"""
response = []
for pred in preds:
pred = pred.split(self.begin_marker)[-1].split(self.end_marker)[0].strip().lower()
pred = extract_from_tag(pred, self.begin_marker, self.end_marker).lower()
if self.classes is not None and pred not in self.classes:
pred = self.classes[0]

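`extract_from_tag` from `promptolution.utils.formatting` is introduced by this PR, but its implementation is not part of the visible diff. A minimal sketch consistent with the call sites above (single strings in OPRO, CAPO, and the classifier; a list of responses in EvoPrompt), with assumed parameter names:

```python
from typing import List, Union


def extract_from_tag(
    text: Union[str, List[str]], start_tag: str, end_tag: str
) -> Union[str, List[str]]:
    """Return the stripped content between the last start_tag and the following end_tag.

    Mirrors the split-based parsing it replaces; applied element-wise to lists.
    """
    if isinstance(text, list):
        return [extract_from_tag(t, start_tag, end_tag) for t in text]
    return text.split(start_tag)[-1].split(end_tag)[0].strip()
```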