diff --git a/.coverage b/.coverage
index 96f4053..7ac1d3f 100644
Binary files a/.coverage and b/.coverage differ
diff --git a/README.md b/README.md
index 92bd4b7..0a2ae87 100644
--- a/README.md
+++ b/README.md
@@ -1,6 +1,6 @@
 ![promptolution](https://github.com/user-attachments/assets/84c050bd-61a1-4f2e-bc4e-874d9b4a69af)
 
-![Coverage](https://img.shields.io/badge/Coverage-87%25-green)
+![Coverage](https://img.shields.io/badge/Coverage-92%25-brightgreen)
 [![CI](https://github.com/finitearth/promptolution/actions/workflows/ci.yml/badge.svg?branch=main)](https://github.com/finitearth/promptolution/actions/workflows/ci.yml)
 [![Docs](https://github.com/finitearth/promptolution/actions/workflows/docs.yml/badge.svg?branch=main)](https://github.com/finitearth/promptolution/actions/workflows/docs.yml)
 ![Code Style](https://img.shields.io/badge/Code%20Style-black-black)
@@ -36,7 +36,7 @@ to install the necessary dependencies. You might need to install [pipx](https://
 
 ## Usage
 
-To get started right away, take a look at our [getting started notebook](https://github.com/finitearth/promptolution/blob/main/notebooks/getting_started.ipynb).
+To get started right away, take a look at our [getting started notebook](https://github.com/finitearth/promptolution/blob/main/tutorials/getting_started.ipynb) and our [other demos and tutorials](https://github.com/finitearth/promptolution/blob/main/tutorials).
 For more details, a comprehensive **documentation** with API reference is availabe at https://finitearth.github.io/promptolution/.
 
 ### Featured Optimizers
diff --git a/promptolution/exemplar_selectors/base_exemplar_selector.py b/promptolution/exemplar_selectors/base_exemplar_selector.py
index 1f52ccb..b65b3c5 100644
--- a/promptolution/exemplar_selectors/base_exemplar_selector.py
+++ b/promptolution/exemplar_selectors/base_exemplar_selector.py
@@ -5,7 +5,7 @@
 
 from typing import TYPE_CHECKING
 
-if TYPE_CHECKING:
+if TYPE_CHECKING:  # pragma: no cover
     from promptolution.predictors.base_predictor import BasePredictor
     from promptolution.tasks.base_task import BaseTask
     from promptolution.utils.config import ExperimentConfig
diff --git a/promptolution/exemplar_selectors/random_selector.py b/promptolution/exemplar_selectors/random_selector.py
index 730e4d6..c6f98e8 100644
--- a/promptolution/exemplar_selectors/random_selector.py
+++ b/promptolution/exemplar_selectors/random_selector.py
@@ -4,7 +4,7 @@
 
 from promptolution.exemplar_selectors.base_exemplar_selector import BaseExemplarSelector
 
-if TYPE_CHECKING:
+if TYPE_CHECKING:  # pragma: no cover
     from promptolution.predictors.base_predictor import BasePredictor
     from promptolution.tasks.base_task import BaseTask
     from promptolution.utils.config import ExperimentConfig
diff --git a/promptolution/helpers.py b/promptolution/helpers.py
index ec603a4..fda15e5 100644
--- a/promptolution/helpers.py
+++ b/promptolution/helpers.py
@@ -1,9 +1,12 @@
 """Helper functions for the usage of the libary."""
 
 
-from typing import TYPE_CHECKING, List, Literal
+from typing import TYPE_CHECKING, Callable, List, Literal
 
-if TYPE_CHECKING:
+from promptolution.tasks.judge_tasks import JudgeTask
+from promptolution.tasks.reward_tasks import RewardTask
+
+if TYPE_CHECKING:  # pragma: no cover
     from promptolution.exemplar_selectors.base_exemplar_selector import BaseExemplarSelector
     from promptolution.llms.base_llm import BaseLLM
     from promptolution.optimizers.base_optimizer import BaseOptimizer
@@ -75,7 +78,7 @@ def run_optimization(df: pd.DataFrame, config: "ExperimentConfig") -> List[str]:
         logger.warning("📌 CAPO requires block evaluation strategy. Setting it to 'sequential_block'.")
         config.eval_strategy = "sequential_block"
 
-    task = get_task(df, config)
+    task = get_task(df, config, judge_llm=llm)
     optimizer = get_optimizer(
         predictor=predictor,
         meta_llm=llm,
@@ -103,8 +106,8 @@ def run_evaluation(df: pd.DataFrame, config: "ExperimentConfig", prompts: List[s
     Returns:
         pd.DataFrame: A DataFrame containing the prompts and their scores.
     """
-    task = get_task(df, config)
     llm = get_llm(config=config)
+    task = get_task(df, config, judge_llm=llm)
     predictor = get_predictor(llm, config=config)
     logger.warning("📊 Starting evaluation...")
     scores = task.evaluate(prompts, predictor, eval_strategy="full")
@@ -144,7 +147,13 @@ def get_llm(model_id: str = None, config: "ExperimentConfig" = None) -> "BaseLLM
     return APILLM(model_id=model_id, config=config)
 
 
-def get_task(df: pd.DataFrame, config: "ExperimentConfig") -> "BaseTask":
+def get_task(
+    df: pd.DataFrame,
+    config: "ExperimentConfig",
+    task_type: Literal["classification", "reward", "judge"] = None,
+    judge_llm: "BaseLLM" = None,
+    reward_function: Callable = None,
+) -> "BaseTask":
     """Get the task based on the provided DataFrame and configuration.
 
     So far only ClassificationTask is supported.
@@ -156,6 +165,18 @@ def get_task(df: pd.DataFrame, config: "ExperimentConfig") -> "BaseTask":
     Returns:
         BaseTask: An instance of a task class based on the provided DataFrame and configuration.
     """
+    if task_type is None:
+        task_type = config.task_type
+
+    if task_type == "reward":
+        return RewardTask(
+            df=df,
+            reward_function=reward_function,
+            config=config,
+        )
+    elif task_type == "judge":
+        return JudgeTask(df, judge_llm=judge_llm, config=config)
+
     return ClassificationTask(df, config=config)
 
 
diff --git a/promptolution/llms/api_llm.py b/promptolution/llms/api_llm.py
index 330b3c8..13d37f2 100644
--- a/promptolution/llms/api_llm.py
+++ b/promptolution/llms/api_llm.py
@@ -15,7 +15,7 @@
 
 from promptolution.llms.base_llm import BaseLLM
 
-if TYPE_CHECKING:
+if TYPE_CHECKING:  # pragma: no cover
     from promptolution.utils.config import ExperimentConfig
 
 from promptolution.utils.logging import get_logger
diff --git a/promptolution/llms/base_llm.py b/promptolution/llms/base_llm.py
index 704942b..aa2382d 100644
--- a/promptolution/llms/base_llm.py
+++ b/promptolution/llms/base_llm.py
@@ -5,7 +5,7 @@
 
 from typing import TYPE_CHECKING, List
 
-if TYPE_CHECKING:
+if TYPE_CHECKING:  # pragma: no cover
     from promptolution.utils.config import ExperimentConfig
 
 from promptolution.optimizers.templates import DEFAULT_SYS_PROMPT
diff --git a/promptolution/llms/local_llm.py b/promptolution/llms/local_llm.py
index 33f489e..dc180a6 100644
--- a/promptolution/llms/local_llm.py
+++ b/promptolution/llms/local_llm.py
@@ -9,7 +9,7 @@
 
 from typing import TYPE_CHECKING
 
-if TYPE_CHECKING:
+if TYPE_CHECKING:  # pragma: no cover
     from promptolution.utils.config import ExperimentConfig
 
 
diff --git a/promptolution/llms/vllm.py b/promptolution/llms/vllm.py
index 1806f6b..e57dd4d 100644
--- a/promptolution/llms/vllm.py
+++ b/promptolution/llms/vllm.py
@@ -3,7 +3,7 @@
 
 from typing import TYPE_CHECKING, List
 
-if TYPE_CHECKING:
+if TYPE_CHECKING:  # pragma: no cover
     from promptolution.utils.config import ExperimentConfig
 
 
diff --git a/promptolution/optimizers/base_optimizer.py b/promptolution/optimizers/base_optimizer.py
index 710701a..67a85b5 100644
--- a/promptolution/optimizers/base_optimizer.py
+++ b/promptolution/optimizers/base_optimizer.py
@@ -5,7 +5,7 @@
 
 from typing import TYPE_CHECKING, Callable, List
 
-if TYPE_CHECKING:
+if TYPE_CHECKING:  # pragma: no cover
     from promptolution.tasks.base_task import BaseTask
     from promptolution.utils.config import ExperimentConfig
 
diff --git a/promptolution/optimizers/capo.py b/promptolution/optimizers/capo.py
index 60f8a12..49114b5 100644
--- a/promptolution/optimizers/capo.py
+++ b/promptolution/optimizers/capo.py
@@ -8,7 +8,9 @@
 
 from typing import TYPE_CHECKING, Callable, List, Tuple
 
-if TYPE_CHECKING:
+from promptolution.utils.formatting import extract_from_tag
+
+if TYPE_CHECKING:  # pragma: no cover
     from promptolution.llms.base_llm import BaseLLM
     from promptolution.predictors.base_predictor import BasePredictor
     from promptolution.tasks.base_task import BaseTask
@@ -83,6 +85,8 @@ def __init__(
         test_statistic: "TestStatistics" = "paired_t_test",
         alpha: float = 0.2,
         length_penalty: float = 0.05,
+        check_fs_accuracy: bool = True,
+        create_fs_reasoning: bool = True,
         df_few_shots: pd.DataFrame = None,
         crossover_template: str = None,
         mutation_template: str = None,
@@ -103,6 +107,10 @@ def __init__(
             test_statistic (TestStatistics): Statistical test to compare prompt performance. Default is "paired_t_test".
             alpha (float): Significance level for the statistical test.
             length_penalty (float): Penalty factor for prompt length.
+            check_fs_accuracy (bool): Whether to check the accuracy of few-shot examples before appending them to the prompt.
+                In cases such as reward tasks, this can be set to False, as no ground truth is available. Default is True.
+            create_fs_reasoning (bool): Whether to create reasoning for few-shot examples using the downstream model,
+                instead of simply using input-output pairs from the few shots DataFrame. Default is True.
             df_few_shots (pd.DataFrame): DataFrame containing few-shot examples. If None, will pop 10% of datapoints from task.
             crossover_template (str, optional): Template for crossover instructions.
             mutation_template (str, optional): Template for mutation instructions.
@@ -124,6 +132,9 @@ def __init__(
         self.length_penalty = length_penalty
         self.token_counter = get_token_counter(self.downstream_llm)
 
+        self.check_fs_accuracy = check_fs_accuracy
+        self.create_fs_reasoning = create_fs_reasoning
+
         self.scores = np.empty(0)
         super().__init__(predictor, task, initial_prompts, callbacks, config)
         self.df_few_shots = df_few_shots if df_few_shots is not None else task.pop_datapoints(frac=0.1)
@@ -172,7 +183,11 @@ def _create_few_shot_examples(self, instruction: str, num_examples: int) -> List
             )
             for i, t in zip(sample_inputs, sample_targets)
         ]
-        # Select partition of the examples to generate reasoning from downstream model
+
+        if not self.create_fs_reasoning:
+            # If we do not create reasoning, return the few-shot examples directly
+            return few_shots
+
         preds, seqs = self.predictor.predict(
             [instruction] * num_examples,
             sample_inputs,
@@ -184,7 +199,7 @@ def _create_few_shot_examples(self, instruction: str, num_examples: int) -> List
             # Process and clean up the generated sequences
             seqs[j] = seqs[j].replace(sample_inputs[j], "").strip()
             # Check if the prediction is correct and add reasoning if so
-            if preds[j] == sample_targets[j]:
+            if preds[j] == sample_targets[j] or not self.check_fs_accuracy:
                 few_shots[j] = CAPO_FEWSHOT_TEMPLATE.replace("<input>", sample_inputs[j]).replace("<output>", seqs[j])
 
         return few_shots
@@ -218,7 +233,7 @@ def _crossover(self, parents: List[CAPOPrompt]) -> List[CAPOPrompt]:
 
         offsprings = []
         for instruction, examples in zip(child_instructions, offspring_few_shots):
-            instruction = instruction.split("<prompt>")[-1].split("</prompt>")[0].strip()
+            instruction = extract_from_tag(instruction, "<prompt>", "</prompt>")
             offsprings.append(CAPOPrompt(instruction, examples))
 
         return offsprings
@@ -240,7 +255,7 @@ def _mutate(self, offsprings: List[CAPOPrompt]) -> List[CAPOPrompt]:
 
         mutated = []
         for new_instruction, prompt in zip(new_instructions, offsprings):
-            new_instruction = new_instruction.split("<prompt>")[-1].split("</prompt>")[0].strip()
+            new_instruction = extract_from_tag(new_instruction, "<prompt>", "</prompt>")
             p = random.random()
 
             if p < 1 / 3 and len(prompt.few_shots) < self.upper_shots:  # add a random few shot
diff --git a/promptolution/optimizers/evoprompt_de.py b/promptolution/optimizers/evoprompt_de.py
index 426e973..e561096 100644
--- a/promptolution/optimizers/evoprompt_de.py
+++ b/promptolution/optimizers/evoprompt_de.py
@@ -6,8 +6,9 @@
 from typing import TYPE_CHECKING, List
 
 from promptolution.optimizers.base_optimizer import BaseOptimizer
+from promptolution.utils.formatting import extract_from_tag
 
-if TYPE_CHECKING:
+if TYPE_CHECKING:  # pragma: no cover
     from promptolution.llms.base_llm import BaseLLM
     from promptolution.predictors.base_predictor import BasePredictor
     from promptolution.tasks.base_task import BaseTask
@@ -94,7 +95,7 @@ def _step(self) -> List[str]:
             meta_prompts.append(meta_prompt)
 
         child_prompts = self.meta_llm.get_response(meta_prompts)
-        child_prompts = [prompt.split("<prompt>")[-1].split("</prompt>")[0].strip() for prompt in child_prompts]
+        child_prompts = extract_from_tag(child_prompts, "<prompt>", "</prompt>")
 
         child_scores = self.task.evaluate(child_prompts, self.predictor, return_agg_scores=True)
 
diff --git a/promptolution/optimizers/evoprompt_ga.py b/promptolution/optimizers/evoprompt_ga.py
index 6fc1215..80bc63f 100644
--- a/promptolution/optimizers/evoprompt_ga.py
+++ b/promptolution/optimizers/evoprompt_ga.py
@@ -7,13 +7,14 @@
 
 from promptolution.optimizers.base_optimizer import BaseOptimizer
 
-if TYPE_CHECKING:
+if TYPE_CHECKING:  # pragma: no cover
     from promptolution.llms.base_llm import BaseLLM
     from promptolution.predictors.base_predictor import BasePredictor
     from promptolution.tasks.base_task import BaseTask
     from promptolution.utils.callbacks import BaseCallback
     from promptolution.utils.config import ExperimentConfig
 
+from promptolution.utils.formatting import extract_from_tag
 from promptolution.utils.logging import get_logger
 
 logger = get_logger(__name__)
@@ -126,6 +127,6 @@ def _crossover(self, prompts, scores) -> str:
             meta_prompts.append(meta_prompt)
 
         child_prompts = self.meta_llm.get_response(meta_prompts)
-        child_prompts = [prompt.split("<prompt>")[-1].split("</prompt>")[0].strip() for prompt in child_prompts]
+        child_prompts = extract_from_tag(child_prompts, "<prompt>", "</prompt>")
 
         return child_prompts
diff --git a/promptolution/optimizers/opro.py b/promptolution/optimizers/opro.py
index 0e3f892..8bb53d8 100644
--- a/promptolution/optimizers/opro.py
+++ b/promptolution/optimizers/opro.py
@@ -7,8 +7,9 @@
 
 from promptolution.optimizers.base_optimizer import BaseOptimizer
 from promptolution.optimizers.templates import OPRO_TEMPLATE
+from promptolution.utils.formatting import extract_from_tag
 
-if TYPE_CHECKING:
+if TYPE_CHECKING:  # pragma: no cover
     from promptolution.llms.base_llm import BaseLLM
     from promptolution.predictors.base_predictor import BasePredictor
     from promptolution.tasks.base_task import BaseTask
@@ -119,7 +120,7 @@ def _step(self) -> List[str]:
 
             response = self.meta_llm.get_response([self.meta_prompt])[0]
 
-            prompt = response.split("<prompt>")[-1].split("</prompt>")[0].strip()
+            prompt = extract_from_tag(response, "<prompt>", "</prompt>")
 
             if prompt in self.prompts:
                 duplicate_prompts += 1
diff --git a/promptolution/predictors/base_predictor.py b/promptolution/predictors/base_predictor.py
index ffcfc15..cec50c6 100644
--- a/promptolution/predictors/base_predictor.py
+++ b/promptolution/predictors/base_predictor.py
@@ -7,7 +7,7 @@
 
 from promptolution.llms.base_llm import BaseLLM
 
-if TYPE_CHECKING:
+if TYPE_CHECKING:  # pragma: no cover
     from promptolution.utils.config import ExperimentConfig
 
 import numpy as np
diff --git a/promptolution/predictors/classifier.py b/promptolution/predictors/classifier.py
index 3fca57f..a950a42 100644
--- a/promptolution/predictors/classifier.py
+++ b/promptolution/predictors/classifier.py
@@ -6,8 +6,9 @@
 from typing import TYPE_CHECKING, List
 
 from promptolution.predictors.base_predictor import BasePredictor
+from promptolution.utils.formatting import extract_from_tag
 
-if TYPE_CHECKING:
+if TYPE_CHECKING:  # pragma: no cover
     from promptolution.utils.config import ExperimentConfig
 
 
@@ -124,7 +125,7 @@ def _extract_preds(self, preds: List[str]) -> np.ndarray:
         """
         response = []
         for pred in preds:
-            pred = pred.split(self.begin_marker)[-1].split(self.end_marker)[0].strip().lower()
+            pred = extract_from_tag(pred, self.begin_marker, self.end_marker).lower()
             if self.classes is not None and pred not in self.classes:
                 pred = self.classes[0]
 
diff --git a/promptolution/tasks/base_task.py b/promptolution/tasks/base_task.py
index 418ab87..2b67495 100644
--- a/promptolution/tasks/base_task.py
+++ b/promptolution/tasks/base_task.py
@@ -4,41 +4,260 @@
 from abc import ABC, abstractmethod
 
 import numpy as np
+import pandas as pd
 
-from typing import TYPE_CHECKING, List
+from typing import TYPE_CHECKING, Any, List, Literal, Optional, Tuple, Union
 
-if TYPE_CHECKING:
+if TYPE_CHECKING:  # pragma: no cover
+    from promptolution.predictors.base_predictor import BasePredictor
     from promptolution.utils.config import ExperimentConfig
 
 
 class BaseTask(ABC):
-    """Abstract base class for tasks in the promptolution library.
+    """Abstract base class for tasks in the promptolution library."""
 
-    This class defines the interface that all concrete task implementations should follow.
+    def __init__(
+        self,
+        df: pd.DataFrame,
+        x_column: str,
+        y_column: Optional[str] = None,
+        task_description: str = None,
+        n_subsamples: int = 30,
+        eval_strategy: Literal["full", "subsample", "sequential_block", "random_block", "evaluated"] = "full",
+        seed: int = 42,
+        config: "ExperimentConfig" = None,
+    ):
+        """Initialize the BaseTask.
 
-    Methods:
-        evaluate: An abstract method that should be implemented by subclasses
-                  to evaluate prompts using a given predictor.
-    """
+        Args:
+            df (pd.DataFrame): The input DataFrame containing the data.
+            x_column (str): Name of the column containing input texts.
+            y_column (Optional[str]): Name of the column containing labels/ground truth (if applicable).
+            task_description (str): Description of the task.
+            n_subsamples (int): Number of subsamples to use for evaluation.
+            eval_strategy (Literal): Subsampling strategy ("full", "subsample", "sequential_block", "random_block", "evaluated").
+            seed (int): Random seed for reproducibility.
+            config (ExperimentConfig, optional): Configuration for the task, overriding defaults.
+        """
+        self.df = df
+        self.x_column = x_column
+        self.y_column = y_column
+        self.task_description = task_description
+        self.n_subsamples = n_subsamples
+        self.eval_strategy = eval_strategy
+        self.seed = seed
 
-    def __init__(self, config: "ExperimentConfig" = None):
-        """Initialize the BaseTask."""
+        super().__init__()
         if config is not None:
             config.apply_to(self)
 
+        self.xs = df[self.x_column].values
+        self.has_y = y_column is not None
+        if self.has_y:
+            self.ys = df[self.y_column].values
+        else:
+            # If no y_column is provided, create a dummy y array
+            self.ys = np.array([None] * len(self.xs))
+
+        self.block_idx = 0
+        self.n_blocks = len(self.xs) // self.n_subsamples if self.n_subsamples > 0 else 1
+        self.rng = np.random.default_rng(seed)
+
+        self.eval_cache = {}  # (prompt, x, y): scores per datapoint
+        self.seq_cache = {}  # (prompt, x, y): generating sequence per datapoint
+
+    def subsample(
+        self, eval_strategy: Literal["full", "subsample", "sequential_block", "random_block"] = None
+    ) -> Tuple[np.ndarray, np.ndarray]:
+        """Subsample the dataset based on the specified parameters.
+
+        Args:
+            eval_strategy (str, optional): Subsampling strategy to use instead of self.eval_strategy. Defaults to None.
+
+        Returns:
+            Tuple[np.ndarray, np.ndarray]: Subsampled input data and labels.
+        """
+        if eval_strategy is None:
+            eval_strategy = self.eval_strategy
+
+        if eval_strategy in ["full", "evaluated"]:
+            return self.xs, self.ys
+        elif eval_strategy == "subsample":
+            indices = self.rng.choice(len(self.xs), min(self.n_subsamples, len(self.xs)), replace=False)
+            return self.xs[indices], self.ys[indices]
+        elif eval_strategy == "random_block":
+            block_id = self.rng.integers(0, self.n_blocks)
+            start_idx = block_id * self.n_subsamples
+            end_idx = min((block_id + 1) * self.n_subsamples, len(self.xs))
+            indices = np.arange(start_idx, end_idx)
+            return self.xs[indices], self.ys[indices]
+        elif eval_strategy == "sequential_block":
+            start_idx = self.block_idx * self.n_subsamples
+            end_idx = min((self.block_idx + 1) * self.n_subsamples, len(self.xs))
+            indices = np.arange(start_idx, end_idx)
+            return self.xs[indices], self.ys[indices]
+        else:
+            raise ValueError(f"Unknown subsampling strategy: '{eval_strategy}'")
+
+    def _prepare_batch(
+        self, prompts: List[str], xs: np.ndarray, ys: np.ndarray, eval_strategy: str
+    ) -> List[Tuple[str, str, Any]]:
+        """Generates (prompt, x, y) keys that require prediction.
+
+        Returns keys not found in eval_cache.
+        """
+        if eval_strategy == "evaluated":
+            return []
+        keys_to_predict = []
+        for prompt in prompts:
+            for x, y in zip(xs, ys):
+                cache_key = (prompt, x, y)
+                if cache_key not in self.eval_cache:
+                    keys_to_predict.append(cache_key)
+        return keys_to_predict
+
+    def _collect_results_from_cache(
+        self,
+        prompts: List[str],
+        xs: np.ndarray,
+        ys: np.ndarray,
+        return_agg_scores: bool,
+        return_seq: bool,
+    ) -> Union[np.ndarray, Tuple[np.ndarray, Union[List[Any], np.ndarray]]]:
+        """Collects all results for the current batch from the cache and formats them."""
+        scores = []
+        seqs = []
+
+        for prompt in prompts:
+            datapoint_scores = []
+            datapoint_seqs = []
+            for x, y in zip(xs, ys):
+                cache_key = (prompt, x, y)
+                datapoint_scores.append(self.eval_cache.get(cache_key, np.nan))
+                datapoint_seqs.append(self.seq_cache.get(cache_key))
+            scores.append(datapoint_scores)
+            seqs.append(datapoint_seqs)
+
+        if return_agg_scores:
+            scores = [np.nanmean(s) for s in scores]
+
+        scores = np.array(scores)
+        seqs = np.array(seqs)
+
+        return scores if not return_seq else (scores, seqs)
+
     @abstractmethod
-    def evaluate(self, prompts: List[str], predictor, system_prompts: List[str] = None) -> np.ndarray:
-        """Abstract method to evaluate prompts using a given predictor.
+    def _evaluate(self, xs: np.ndarray, ys: np.ndarray, preds: np.ndarray) -> List[float]:
+        """Abstract method to calculate the scores for a set of predictions.
+
+        This method should be implemented by subclasses based on their specific evaluation logic.
+        """
+        raise NotImplementedError
+
+    def evaluate(
+        self,
+        prompts: Union[str, List[str]],
+        predictor: "BasePredictor",
+        system_prompts: List[str] = None,
+        return_agg_scores: bool = True,
+        return_seq: bool = False,
+        eval_strategy: str = None,
+    ) -> Union[np.ndarray, Tuple[np.ndarray, Union[List[Any], np.ndarray]]]:
+        """Evaluate a set of prompts using a given predictor.
+
+        This method orchestrates subsampling, prediction, caching, and result collection.
+        """
+        prompts = [prompts] if isinstance(prompts, str) else prompts
+        eval_strategy = eval_strategy or self.eval_strategy
+
+        xs, ys = self.subsample(eval_strategy=eval_strategy)
+        batches = self._prepare_batch(prompts, xs, ys, eval_strategy=eval_strategy)
+        (prompts_to_evaluate, xs_to_evaluate, ys_to_evaluate) = zip(*batches) if batches else ([], [], [])
+
+        preds = predictor.predict(
+            prompts=prompts_to_evaluate,
+            xs=xs_to_evaluate,
+            system_prompts=system_prompts,
+            return_seq=return_seq,
+        )
+
+        if return_seq:
+            preds, seqs = preds
+        scores = self._evaluate(xs_to_evaluate, ys_to_evaluate, preds)
+        for i, cache_key in enumerate(batches):
+            self.eval_cache[cache_key] = scores[i]
+
+            if return_seq:
+                self.seq_cache[cache_key] = seqs[i]
+
+        return self._collect_results_from_cache(
+            prompts,
+            xs,
+            ys,
+            return_agg_scores,
+            return_seq,
+        )
+
+    def pop_datapoints(self, n: int = None, frac: float = None) -> pd.DataFrame:
+        """Pop a number of datapoints from the dataset.
 
         Args:
-            prompts (List[str]): List of prompts to evaluate.
-            predictor: The predictor to use for evaluation.
-            system_prompts (List[str]): List of system prompts to evaluate.
+            n (int, optional): Number of datapoints to pop. Defaults to None.
+            frac (float, optional): Fraction of datapoints to pop. Defaults to None.
 
         Returns:
-            np.ndarray: Array of evaluation scores for each prompt.
+            pd.DataFrame: DataFrame containing the popped datapoints.
+        """
+        assert n is None or frac is None, "Only one of n or frac can be specified."
+        if n is not None:
+            indices = self.rng.choice(len(self.xs), n, replace=False)
+        elif frac is not None:
+            indices = self.rng.choice(len(self.xs), int(len(self.xs) * frac), replace=False)
+        else:
+            raise ValueError("Either n or frac must be specified.")
+
+        popped_xs = self.xs[indices]
+        popped_ys = self.ys[indices]
+        df_popped = pd.DataFrame({self.x_column: popped_xs, self.y_column: popped_ys})
+
+        self.xs = np.delete(self.xs, indices)
+        self.ys = np.delete(self.ys, indices)
+
+        # Update n_blocks and block_idx based on the new dataset size
+        self.n_blocks = len(self.xs) // self.n_subsamples if self.n_subsamples > 0 else 1
+        self.block_idx = min(self.block_idx, self.n_blocks - 1) if self.n_blocks > 0 else 0
+
+        # Clear cache for popped items (optional, but good practice if memory is a concern)
+        keys_to_remove = []
+        for key in self.eval_cache:
+            if key[1] in popped_xs and key[2] in popped_ys:  # Check if the x and y correspond to popped data
+                keys_to_remove.append(key)
+        for key in keys_to_remove:
+            self.eval_cache.pop(key, None)
+            self.seq_cache.pop(key, None)
+
+        return df_popped
+
+    def increment_block_idx(self) -> None:
+        """Increment the block index for subsampling.
 
         Raises:
-            NotImplementedError: If not implemented by a subclass.
+            ValueError: If the eval_strategy does not contain "block".
         """
-        raise NotImplementedError
+        if "block" not in self.eval_strategy:
+            raise ValueError("Block increment is only valid for block subsampling.")
+        self.block_idx += 1
+        if self.n_blocks > 0:  # n_blocks is 0 when len(xs) < n_subsamples; avoid modulo by zero
+            self.block_idx %= self.n_blocks
+        else:
+            self.block_idx = 0  # If no blocks, reset to 0
+
+    def reset_block_idx(self) -> None:
+        """Reset the block index for subsampling.
+
+        Raises:
+            ValueError: If the eval_strategy does not contain "block".
+        """
+        if "block" not in self.eval_strategy:
+            raise ValueError("Block reset is only valid for block subsampling.")
+        self.block_idx = 0
diff --git a/promptolution/tasks/classification_tasks.py b/promptolution/tasks/classification_tasks.py
index 9ff156f..6ca509c 100644
--- a/promptolution/tasks/classification_tasks.py
+++ b/promptolution/tasks/classification_tasks.py
@@ -5,12 +5,11 @@
 import pandas as pd
 from sklearn.metrics import accuracy_score
 
-from typing import TYPE_CHECKING, Any, Callable, List, Literal, Tuple, Union
+from typing import TYPE_CHECKING, Callable, List, Literal
 
 from promptolution.tasks.base_task import BaseTask
 
-if TYPE_CHECKING:
-    from promptolution.predictors.base_predictor import BasePredictor
+if TYPE_CHECKING:  # pragma: no cover
     from promptolution.utils.config import ExperimentConfig
 
 
@@ -24,7 +23,7 @@ class ClassificationTask(BaseTask):
     def __init__(
         self,
         df: pd.DataFrame,
-        description: str = None,
+        task_description: str = None,
         x_column: str = "x",
         y_column: str = "y",
         n_subsamples: int = 30,
@@ -37,7 +36,7 @@ def __init__(
 
         Args:
             df (pd.DataFrame): Input DataFrame containing the data
-            description (str): Description of the task
+            task_description (str): Description of the task
             x_column (str, optional): Name of the column containing input texts. Defaults to "x".
             y_column (str, optional): Name of the column containing labels. Defaults to "y".
             n_subsamples (int, optional): Number of subsamples to use. No subsampling if None. Defaults to None.
@@ -52,193 +51,23 @@ def __init__(
             metric (Callable, optional): Metric to use for evaluation. Defaults to accuracy_score.
             config (ExperimentConfig, optional): Configuration for the task, overriding defaults.
         """
-        self.description = description
         self.metric = metric
-
-        self.x_column = x_column
-        self.y_column = y_column
-        self.eval_strategy = eval_strategy
-        self.n_subsamples = n_subsamples
-        super().__init__(config)
-
-        self.xs = df[self.x_column].values
-        self.ys = df[self.y_column].str.lower().values
+        super().__init__(
+            df=df,
+            x_column=x_column,
+            y_column=y_column,
+            task_description=task_description,
+            n_subsamples=n_subsamples,
+            eval_strategy=eval_strategy,
+            seed=seed,
+            config=config,
+        )
+        self.ys = df[self.y_column].str.lower().values  # Ensure y values are lowercase for consistent comparison
         self.classes = np.unique(self.ys)
 
-        self.block_idx = 0
-        self.n_blocks = len(self.xs) // self.n_subsamples
-        self.rng = np.random.default_rng(seed)
-
-        self.eval_cache = {}  # (prompt, x, y): scores per datapoint
-        self.seq_cache = {}  # (prompt, x, y): generating sequence per datapoint
-
-    def subsample(
-        self, eval_strategy: Literal["full", "subsample", "sequential_block", "random_block"] = None
-    ) -> Tuple[np.ndarray, np.ndarray]:
-        """Subsample the dataset based on the specified parameters.
-
-        Args:
-            strategy (str, optional): Subsampling strategy to use instead of self.subsample_strategy. Defaults to None.
-
-        Returns:
-            Tuple[np.ndarray, np.ndarray]: Subsampled input data and labels.
-        """
-        if eval_strategy is None:
-            eval_strategy = self.eval_strategy
-
-        if eval_strategy in ["full", "evaluated"]:
-            return self.xs, self.ys
-
-        elif eval_strategy == "subsample":
-            indices = self.rng.choice(len(self.xs), self.n_subsamples, replace=False)
-            return self.xs[indices], self.ys[indices]
-
-        elif eval_strategy == "random_block":
-            block_id = self.rng.integers(0, len(self.xs) // self.n_subsamples)
-            indices = np.arange(block_id * self.n_subsamples, (block_id + 1) * self.n_subsamples)
-            return self.xs[indices], self.ys[indices]
-
-        elif eval_strategy == "sequential_block":
-            indices = np.arange(self.block_idx * self.n_subsamples, (self.block_idx + 1) * self.n_subsamples)
-            return self.xs[indices], self.ys[indices]
-
-        else:
-            raise ValueError(f"Unknown subsampling strategy: '{eval_strategy}")
-
-    def _prepare_batch(
-        self, prompts: List[str], xs: np.ndarray, ys: np.ndarray, eval_strategy: str
-    ) -> List[Tuple[str, str, str]]:
-        """Generates (prompt, x, y) keys that require prediction.
-
-        If strategy is "evaluated", returns an empty list.
-        Otherwise, returns keys not found in eval_cache.
-        """
-        if eval_strategy == "evaluated":
-            return []
-
-        keys_to_predict = []
-        for prompt in prompts:
-            for x, y in zip(xs, ys):
-                cache_key = (prompt, x, y)
-                if cache_key not in self.eval_cache:
-                    keys_to_predict.append(cache_key)
-        return keys_to_predict
-
-    def _collect_results_from_cache(
-        self,
-        prompts: List[str],
-        xs: np.ndarray,
-        ys: np.ndarray,
-        return_agg_scores: bool,
-        return_seq: bool,
-    ) -> Union[np.ndarray, Tuple[np.ndarray, Union[List[Any], np.ndarray]]]:
-        """Collects all results for the current batch from the cache and formats them."""
+    def _evaluate(self, xs: np.ndarray, ys: np.ndarray, preds: np.ndarray) -> List[float]:
+        """Calculate the score for a single prediction."""
         scores = []
-        seqs = []
-
-        for prompt in prompts:
-            cache_keys = [(prompt, x, y) for x, y in zip(xs, ys)]
-            scores += [[self.eval_cache.get(key, np.nan) for key in cache_keys]]
-            seqs += [[self.seq_cache.get(key) for key in cache_keys]]
-        if return_agg_scores:
-            scores = [np.nanmean(s) for s in scores]
-        scores = np.array(scores)
-        seqs = np.array(seqs)
-
-        return scores if not return_seq else (scores, seqs)
-
-    def evaluate(
-        self,
-        prompts: Union[str, List[str]],
-        predictor: "BasePredictor",
-        system_prompts: List[str] = None,
-        return_agg_scores: bool = True,
-        return_seq: bool = False,
-        eval_strategy: str = None,
-    ) -> Union[np.ndarray, Tuple[np.ndarray, Union[List[Any], np.ndarray]]]:
-        """Evaluate a set of prompts using a given predictor.
-
-        This method orchestrates subsampling, prediction, caching, and result collection.
-        """
-        prompts = [prompts] if isinstance(prompts, str) else prompts
-        eval_strategy = eval_strategy or self.eval_strategy
-
-        xs, ys = self.subsample(eval_strategy=eval_strategy)
-        batches = self._prepare_batch(prompts, xs, ys, eval_strategy)
-        prompts_to_evaluate, xs_to_evaluate, ys_to_evaluate = zip(*batches) if batches else ([], [], [])
-
-        preds = predictor.predict(
-            prompts=prompts_to_evaluate,
-            xs=xs_to_evaluate,
-            system_prompts=system_prompts,
-            return_seq=return_seq,
-        )
-
-        if return_seq:
-            preds, seqs = preds
-
-        for i, cache_key in enumerate(batches):
-            y_pred, y_true = preds[i], ys_to_evaluate[i]
-            if return_seq:
-                self.seq_cache[cache_key] = seqs[i]
-            self.eval_cache[cache_key] = self.metric([y_pred], [y_true])
-
-        return self._collect_results_from_cache(
-            prompts,
-            xs,
-            ys,
-            return_agg_scores,
-            return_seq,
-        )
-
-    def pop_datapoints(self, n: int = None, frac: float = None) -> pd.DataFrame:
-        """Pop a number of datapoints from the dataset.
-
-        Args:
-            n (int, optional): Number of datapoints to pop. Defaults to None.
-            frac (float, optional): Fraction of datapoints to pop. Defaults to None.
-
-        Returns:
-            pd.DataFrame: DataFrame containing the popped datapoints.
-        """
-        assert n is None or frac is None, "Only one of n or frac can be specified."
-        if n is not None:
-            indices = self.rng.choice(len(self.xs), n, replace=False)
-        elif frac is not None:
-            indices = self.rng.choice(len(self.xs), int(len(self.xs) * frac), replace=False)
-        else:
-            raise ValueError("Either n or frac must be specified.")
-
-        xs = self.xs[indices]
-        ys = self.ys[indices]
-        df = pd.DataFrame({self.x_column: xs, self.y_column: ys})
-
-        self.xs = np.delete(self.xs, indices)
-        self.ys = np.delete(self.ys, indices)
-
-        self.n_blocks = len(self.xs) // self.n_subsamples
-        self.block_idx = min(self.block_idx, self.n_blocks - 1)
-
-        return df
-
-    def increment_block_idx(self) -> None:
-        """Increment the block index for subsampling.
-
-        Raises:
-            ValueError: If the eval_strategy does not contain "block".
-        """
-        if "block" not in self.eval_strategy:
-            raise ValueError("Block increment is only valid for block subsampling.")
-        self.block_idx += 1
-        if self.block_idx >= self.n_blocks:
-            self.block_idx = 0
-
-    def reset_block_idx(self) -> None:
-        """Reset the block index for subsampling.
-
-        Raises:
-            ValueError: If the eval_strategy does not contain "block".
-        """
-        if "block" not in self.eval_strategy:
-            raise ValueError("Block reset is only valid for block subsampling.")
-        self.block_idx = 0
+        for pred, y in zip(preds, ys):
+            scores.append(self.metric([y], [pred]))
+        return scores
diff --git a/promptolution/tasks/judge_tasks.py b/promptolution/tasks/judge_tasks.py
new file mode 100644
index 0000000..9d98006
--- /dev/null
+++ b/promptolution/tasks/judge_tasks.py
@@ -0,0 +1,133 @@
+"""Module for judge tasks."""
+
+import numpy as np
+import pandas as pd
+
+from typing import TYPE_CHECKING, List, Literal, Optional
+
+from promptolution.llms.base_llm import BaseLLM
+from promptolution.tasks.base_task import BaseTask
+from promptolution.utils.formatting import extract_from_tag
+from promptolution.utils.logging import get_logger
+
+if TYPE_CHECKING:  # pragma: no cover
+    from promptolution.utils.config import ExperimentConfig
+
+logger = get_logger(__name__)
+
+JUDGE_PROMPT_WITH_GROUND_TRUTH = """You are an expert evaluator. Judge how well the prediction matches the ground truth, for the given task.
+
+Task:
+{task}
+
+Input:
+{input}
+
+Ground Truth:
+{ground_truth}
+
+Prediction:
+{prediction}
+
+Evaluate how closely the prediction aligns with the ground truth. Consider correctness, completeness, and accuracy of the match.
+
+Provide a score from -5 to +5 where:
+- -5: Completely incorrect/opposite
+- 0: Partially correct
+- +5: Perfect match
+
+Return your answer encompased by <final_score></final_score>"""
+
+JUDGE_PROMPT_WITHOUT_GROUND_TRUTH = """You are an expert evaluator. Judge the quality of the response, for the given task.
+
+Task:
+{task}
+
+Input:
+{input}
+
+Prediction:
+{prediction}
+
+Evaluate how well the response addresses the input for the given task. Consider correctness, quality, relevance, completeness, and excellence of execution.
+
+Provide a score from -5 to +5 where:
+- -5: Completely wrong/inappropriate
+- 0: Partially addresses the task with mixed quality
+- +5: Exceptional response that brilliantly solves the task with creativity, insight, or outstanding execution that goes beyond basic correctness
+
+Return your answer encompased by <final_score></final_score>"""
+
+
+class JudgeTask(BaseTask):
+    """Task that evaluates a predictor using an LLM as a judge, optionally accepting a ground truth."""
+
+    def __init__(
+        self,
+        df: pd.DataFrame,
+        judge_llm: "BaseLLM",
+        x_column: str = "x",
+        y_column: Optional[str] = None,
+        task_description: Optional[str] = None,
+        n_subsamples: int = 30,
+        eval_strategy: Literal["full", "subsample", "sequential_block", "random_block"] = "full",
+        seed: int = 42,
+        judge_prompt: Optional[str] = None,
+        min_score: float = -5.0,
+        max_score: float = 5.0,
+        config: "ExperimentConfig" = None,
+    ):
+        """Initialize the JudgeTask."""
+        if judge_prompt is None:
+            judge_prompt = JUDGE_PROMPT_WITH_GROUND_TRUTH if y_column else JUDGE_PROMPT_WITHOUT_GROUND_TRUTH
+        self.judge_prompt = judge_prompt
+        self.min_score = min_score
+        self.max_score = max_score
+
+        super().__init__(
+            df=df,
+            x_column=x_column,
+            y_column=y_column,
+            task_description=task_description,
+            n_subsamples=n_subsamples,
+            eval_strategy=eval_strategy,
+            seed=seed,
+            config=config,
+        )
+        assert judge_llm is not None, "judge_llm must be provided for JudgeTask"
+        self.judge_llm = judge_llm
+
+    def _construct_judge_prompt(self, x: str, pred: str, y: Optional[str] = None) -> str:
+        """Fill the judge template's {task}, {input}, {prediction} and (if y is given) {ground_truth} slots."""
+        if y is not None:
+            prompt = self.judge_prompt.replace("{ground_truth}", str(y))
+        else:
+            prompt = self.judge_prompt
+        # str.replace needs str arguments: coerce values; a missing description becomes empty
+        task = str(self.task_description or "")
+        return prompt.replace("{task}", task).replace("{input}", str(x)).replace("{prediction}", str(pred))
+
+    def _evaluate(self, xs: np.ndarray, ys: np.ndarray, preds: np.ndarray) -> List[float]:
+        """Score each prediction with the LLM judge; returns one value in [0, 1] per prediction."""
+        prompts: List[str] = []
+        for x, y, pred in zip(xs, ys, preds):
+            judge_prompt = self._construct_judge_prompt(x, pred, y)
+            prompts.append(judge_prompt)
+        judge_responses = self.judge_llm.get_response(prompts)
+        scores_str = extract_from_tag(judge_responses, "<final_score>", "</final_score>")
+        scores = []
+        for score in scores_str:
+            try:
+                # only numeric chars, - or . are allowed
+                score = "".join(filter(lambda c: c.isdigit() or c in "-.", score))
+                score = float(score)
+                # min-max normalize from [min_score, max_score] to [0, 1], then clamp
+                score = (score - self.min_score) / (self.max_score - self.min_score)
+                score = max(0.0, min(1.0, score))
+            except ValueError:
+                logger.warning(f"Failed to parse score '{score}' as float. Defaulting to 0.0.")
+                score = 0.0
+
+            scores.append(score)
+
+        return scores
diff --git a/promptolution/tasks/reward_tasks.py b/promptolution/tasks/reward_tasks.py
new file mode 100644
index 0000000..c7c36db
--- /dev/null
+++ b/promptolution/tasks/reward_tasks.py
@@ -0,0 +1,59 @@
+"""Module for Reward tasks."""
+
+
+import numpy as np
+import pandas as pd
+
+from typing import TYPE_CHECKING, Callable, List, Literal, Optional
+
+from promptolution.tasks.base_task import BaseTask
+
+if TYPE_CHECKING:  # pragma: no cover
+    from promptolution.utils.config import ExperimentConfig
+
+
+class RewardTask(BaseTask):
+    """A task that evaluates a predictor using a reward function.
+
+    This task takes a DataFrame, a column name for input data, and a reward function.
+    The reward function should take a prediction and return a reward.
+    """
+
+    def __init__(
+        self,
+        df: pd.DataFrame,
+        reward_function: Callable,
+        x_column: str = "x",
+        task_description: Optional[str] = None,
+        n_subsamples: int = 30,
+        eval_strategy: Literal["full", "subsample", "sequential_block", "random_block"] = "full",
+        seed: int = 42,
+        config: "ExperimentConfig" = None,
+    ):
+        """Initialize the RewardTask.
+
+        Args:
+            df (pd.DataFrame): Input DataFrame containing the data.
+            reward_function (Callable): Function that takes a prediction and returns a reward score.
+            x_column (str, optional): Name of the column containing input texts. Defaults to "x".
+            task_description (str, optional): Description of the task.
+            n_subsamples (int, optional): Number of subsamples to use. Defaults to 30.
+            eval_strategy (str, optional): Subsampling strategy to use. Defaults to "full".
+            seed (int, optional): Random seed for reproducibility. Defaults to 42.
+            config (ExperimentConfig, optional): Configuration for the task, overriding defaults.
+        """
+        self.reward_function = reward_function
+        super().__init__(
+            df=df,
+            x_column=x_column,
+            task_description=task_description,
+            n_subsamples=n_subsamples,
+            eval_strategy=eval_strategy,
+            seed=seed,
+            config=config,
+        )
+
+    def _evaluate(self, xs: np.ndarray, ys: np.ndarray, preds: np.ndarray) -> List[float]:
+        """Apply the reward function to every prediction; xs and ys are not consulted."""
+        score_one = self.reward_function
+        return [score_one(pred) for pred in preds]
diff --git a/promptolution/utils/formatting.py b/promptolution/utils/formatting.py
new file mode 100644
index 0000000..89426a3
--- /dev/null
+++ b/promptolution/utils/formatting.py
@@ -0,0 +1,18 @@
+"""Utils for formatting prompts and outputs."""
+from typing import List, Union
+
+
+def extract_from_tag(text: Union[str, List[str]], start_tag: str, end_tag: str) -> Union[List[str], str]:
+    """Return the stripped text between start_tag and end_tag; a str input gives a str, a list gives a list."""
+    was_list = True
+    if isinstance(text, str):
+        text = [text]
+        was_list = False
+
+    outs = []
+    for t in text:
+        # take what follows the last start_tag, up to the next end_tag; missing tags leave that side uncut
+        outs.append(t.split(start_tag)[-1].split(end_tag)[0].strip())
+    if was_list:
+        return outs
+    return outs[0]
diff --git a/promptolution/utils/prompt_creation.py b/promptolution/utils/prompt_creation.py
index d4764df..8f0388c 100644
--- a/promptolution/utils/prompt_creation.py
+++ b/promptolution/utils/prompt_creation.py
@@ -5,7 +5,9 @@
 
 from typing import TYPE_CHECKING, List, Union
 
-if TYPE_CHECKING:
+from promptolution.utils.formatting import extract_from_tag
+
+if TYPE_CHECKING:  # pragma: no cover
     from promptolution.llms.base_llm import BaseLLM
     from promptolution.tasks.base_task import BaseTask
 
@@ -36,8 +38,7 @@ def create_prompt_variation(prompt: Union[List[str], str], llm: "BaseLLM", meta_
     if isinstance(prompt, str):
         prompt = [prompt]
     varied_prompts = llm.get_response([meta_prompt.replace("<prev_prompt>", p) for p in prompt])
-
-    varied_prompts = [p.split("</prompt>")[0].split("<prompt>")[-1] for p in varied_prompts]
+    varied_prompts = extract_from_tag(varied_prompts, "<prompt>", "</prompt>")
 
     return varied_prompts
 
@@ -110,6 +111,6 @@ def create_prompts_from_samples(
         meta_prompts.append(meta_prompt)
 
     prompts = llm.get_response(meta_prompts)
-    prompts = [prompt.split("</prompt>")[0].split("<prompt>")[-1].strip() for prompt in prompts]
+    prompts = extract_from_tag(prompts, "<prompt>", "</prompt>")
 
     return prompts
diff --git a/tests/conftest.py b/tests/conftest.py
index e2099d1..b5c023e 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -7,6 +7,8 @@
 from mocks.mock_task import MockTask
 
 from promptolution.tasks import ClassificationTask
+from promptolution.tasks.judge_tasks import JudgeTask
+from promptolution.tasks.reward_tasks import RewardTask
 from promptolution.utils import ExperimentConfig
 
 
@@ -31,9 +33,9 @@ def experiment_config():
 def mock_task():
     """Fixture providing a MockTask with predetermined scoring behavior."""
 
-    def score_function(prompt):
+    def score_function(pred):
         # Prefer longer prompts for testing purposes
-        return min(0.9, 0.5 + 0.01 * len(prompt))
+        return len(pred)
 
     return MockTask(predetermined_scores=score_function)
 
@@ -92,9 +94,104 @@ def mock_classification_task_with_subsampling(mock_df):
     """Fixture providing a ClassificationTask instance with subsampling."""
     return ClassificationTask(
         df=mock_df,
-        description="Sentiment classification task",
+        task_description="Sentiment classification task",
         x_column="x",
         y_column="y",
         eval_strategy="subsample",
         n_subsamples=2,
     )
+
+
+@pytest.fixture
+def simple_reward_function():
+    """A simple reward function for testing RewardTask."""
+
+    def reward_func(prediction: str) -> float:
+        if "great" in prediction.lower() or "perfect" in prediction.lower():
+            return 1.0
+        elif "ok" in prediction.lower():
+            return 0.5
+        else:
+            return 0.0
+
+    return reward_func
+
+
+@pytest.fixture
+def mock_reward_task(mock_df, simple_reward_function):
+    """Fixture providing a RewardTask instance."""
+    return RewardTask(
+        df=mock_df,
+        reward_function=simple_reward_function,
+        x_column="x",
+        task_description="Evaluate text quality",
+        n_subsamples=2,
+        eval_strategy="full",  # Using "full" for initial clarity, can be changed in specific tests
+        seed=42,
+    )
+
+
+@pytest.fixture
+def mock_reward_task_no_x_column(simple_reward_function):
+    """Fixture providing a RewardTask instance without a meaningful x_column."""
+    # Create a DataFrame where 'x' is just a placeholder, not used for prompt construction directly
+    df_no_x_data = {
+        "id_col": list(range(5)),
+        "dummy_input": ["", "", "", "", ""],  # Or just 0, 1, 2, 3, 4
+        "some_attribute": ["A", "B", "C", "D", "E"],
+    }
+    df_no_x = pd.DataFrame(df_no_x_data)
+    return RewardTask(
+        df=df_no_x,
+        reward_function=simple_reward_function,
+        x_column="dummy_input",  # The x_column is still technically provided but contains empty strings or Nones
+        task_description="Generate and evaluate jokes without explicit input text.",
+        n_subsamples=3,
+        eval_strategy="subsample",
+        seed=42,
+    )
+
+
+@pytest.fixture
+def mock_judge_llm():
+    """Fixture providing a MockLLM configured for judge responses."""
+    # Responses containing the final_score tag
+    responses = [
+        "<final_score>5.0</final_score>",  # Perfect match
+        "<final_score>-5.0</final_score>",  # Completely incorrect
+        "<final_score>0.0</final_score>",  # Partially correct
+        "<final_score>1.0</final_score>",  # Default/Other
+        "<final_score>3.0</final_score>",  # Another specific score
+        "This response does not contain a score tag.",  # For parsing error test
+    ]
+    return MockLLM(predetermined_responses=responses)
+
+
+@pytest.fixture
+def mock_judge_task_with_y(mock_df, mock_judge_llm):
+    """Fixture providing a JudgeTask instance with y_column."""
+    return JudgeTask(
+        df=mock_df,
+        x_column="x",
+        y_column="y",
+        judge_llm=mock_judge_llm,
+        task_description="Evaluate sentiment prediction quality.",
+        n_subsamples=2,
+        eval_strategy="full",
+        seed=42,
+    )
+
+
+@pytest.fixture
+def mock_judge_task_no_y(mock_df, mock_judge_llm):
+    """Fixture providing a JudgeTask instance without y_column."""
+    # Use mock_df, but ensure y_column is explicitly None for this task instance
+    return JudgeTask(
+        df=mock_df,
+        x_column="x",
+        judge_llm=mock_judge_llm,
+        task_description="Evaluate joke quality (no ground truth).",
+        n_subsamples=2,
+        eval_strategy="subsample",  # Test with subsampling here
+        seed=42,
+    )
diff --git a/tests/helpers/test_helpers.py b/tests/helpers/test_helpers.py
index 909da42..03231a6 100644
--- a/tests/helpers/test_helpers.py
+++ b/tests/helpers/test_helpers.py
@@ -98,7 +98,7 @@ def test_run_optimization(
     # Verify mocks were called
     mock_get_llm.assert_called_once_with(config=experiment_config)
     mock_get_predictor.assert_called_once_with(mock_llm, config=experiment_config)
-    mock_get_task.assert_called_once_with(sample_df, experiment_config)
+    mock_get_task.assert_called_once_with(sample_df, experiment_config, judge_llm=mock_llm)
     mock_get_optimizer.assert_called_once_with(
         predictor=mock_predictor, meta_llm=mock_llm, task=mock_task, config=experiment_config
     )
@@ -158,7 +158,7 @@ def test_run_optimization_with_exemplars(
     # Verify mocks were called
     mock_get_llm.assert_called_once_with(config=experiment_config_with_exemplars)
     mock_get_predictor.assert_called_once_with(mock_llm, config=experiment_config_with_exemplars)
-    mock_get_task.assert_called_once_with(sample_df, experiment_config_with_exemplars)
+    mock_get_task.assert_called_once_with(sample_df, experiment_config_with_exemplars, judge_llm=mock_llm)
     mock_get_optimizer.assert_called_once_with(
         predictor=mock_predictor, meta_llm=mock_llm, task=mock_task, config=experiment_config_with_exemplars
     )
@@ -216,7 +216,7 @@ def test_run_evaluation(mock_get_task, mock_get_predictor, mock_get_llm, sample_
     # Verify mocks were called
     mock_get_llm.assert_called_once_with(config=experiment_config)
     mock_get_predictor.assert_called_once_with(mock_llm, config=experiment_config)
-    mock_get_task.assert_called_once_with(sample_df, experiment_config)
+    mock_get_task.assert_called_once_with(sample_df, experiment_config, judge_llm=mock_llm)
     mock_task.evaluate.assert_called_once_with(prompts, mock_predictor, eval_strategy="full")
 
 
diff --git a/tests/mocks/mock_llm.py b/tests/mocks/mock_llm.py
index cb8ef61..367e204 100644
--- a/tests/mocks/mock_llm.py
+++ b/tests/mocks/mock_llm.py
@@ -23,11 +23,7 @@ def __init__(self, predetermined_responses=None, add_prompt_tags=False, *args, *
         """
         super().__init__(*args, **kwargs)
 
-        # Set up response list
-        if predetermined_responses is None:
-            self.responses = []
-        else:
-            self.responses = list(predetermined_responses)  # Ensure it's a list
+        self.responses = predetermined_responses or []
 
         # Add prompt tags if requested
         if add_prompt_tags:
@@ -58,9 +54,11 @@ def _get_response(self, prompts: List[str], system_prompts: Optional[List[str]]
         results = []
         for i, prompt in enumerate(prompts):
             # Return the next response from the list if available
-            if self.response_index < len(self.responses):
+            if self.response_index < len(self.responses) and isinstance(self.responses, list):
                 results.append(self.responses[self.response_index])
                 self.response_index += 1
+            elif prompt in self.responses and isinstance(self.responses, dict):
+                results.append(self.responses[prompt])
             else:
                 # Default response if we've exhausted the list
                 if hasattr(self, "add_prompt_tags") and getattr(self, "add_prompt_tags"):
diff --git a/tests/mocks/mock_task.py b/tests/mocks/mock_task.py
index 488c3d1..cfda55a 100644
--- a/tests/mocks/mock_task.py
+++ b/tests/mocks/mock_task.py
@@ -3,6 +3,7 @@
 from unittest.mock import MagicMock
 
 import numpy as np
+import pandas as pd
 
 from typing import List
 
@@ -24,7 +25,13 @@ def __init__(self, predetermined_scores=None):
                 or a list of scores to return in sequence, or a function
                 that generates scores based on prompts.
         """
-        super().__init__()
+        super().__init__(
+            df=pd.DataFrame(
+                {"x": ["Sample text 1", "Sample text 2", "Sample text 3"], "y": ["positive", "negative", "neutral"]}
+            ),
+            x_column="x",
+            y_column="y",
+        )
         self.predetermined_scores = predetermined_scores or {}
         self.call_history = []
         self.score_index = 0
@@ -34,76 +41,95 @@ def __init__(self, predetermined_scores=None):
         # Default attributes similar to ClassificationTask
         self.description = "Mock classification task"
         self.classes = ["positive", "neutral", "negative"]
-        self.xs = np.array(["Sample text 1", "Sample text 2", "Sample text 3"])
-        self.ys = np.array(["positive", "negative", "neutral"])
         self.initial_prompts = ["Classify:", "Determine:"]
         self.n_blocks = 10
 
         self.increment_block_idx = MagicMock()
         self.reset_block_idx = MagicMock()
 
-    def evaluate(
-        self,
-        prompts: List[str],
-        predictor,
-        eval_strategy: str = "subsample",
-        system_prompts: List[str] = None,
-        return_agg_scores: bool = False,
-        return_seq: bool = False,
-    ) -> np.ndarray:
-        """Evaluate prompts with predetermined scores.
+    def _evaluate(self, x: np.ndarray, y: np.ndarray, pred: np.ndarray, **kwargs) -> float:
+        """Calculate the score for a single prediction.
 
         Args:
-            prompts: List of prompts to evaluate
-            predictor: Predictor (ignored in mock)
-            system_prompts: System prompts (ignored in mock)
-            subsample: Whether to subsample (ignored in mock)
-            n_samples: Number of samples (ignored in mock)
-            return_seq: Whether to return sequences
+            x: Input data (not used in mock)
+            y: Ground truth labels (not used in mock)
+            pred: Predicted labels
 
         Returns:
-            np.ndarray of scores, and optionally sequences
+            Score based on predetermined scores or a default logic.
         """
-        # Record the call
-        self.call_history.append(
-            {
-                "prompts": prompts,
-                "predictor": predictor,
-                "system_prompts": system_prompts,
-                "eval_strategy": eval_strategy,
-                "return_agg_scores": return_agg_scores,
-                "return_seq": return_seq,
-            }
-        )
-
-        scores = []
-        for prompt in prompts:
-            # Handle different types of predetermined_scores
-            if callable(self.predetermined_scores):
-                # If it's a function, call it with the prompt
-                score = self.predetermined_scores(prompt)
-            elif isinstance(self.predetermined_scores, dict) and prompt in self.predetermined_scores:
-                # If it's a dict, look up the prompt
-                score = self.predetermined_scores[prompt]
-            elif isinstance(self.predetermined_scores, list):
-                # If it's a list, return items in sequence (cycling if needed)
-                if self.score_index < len(self.predetermined_scores):
-                    score = self.predetermined_scores[self.score_index]
-                    self.score_index = (self.score_index + 1) % len(self.predetermined_scores)
-                else:
-                    score = 0.5  # Default score
-            else:
-                # Generate a somewhat predictable score based on prompt length
-                # (longer prompts get slightly higher scores)
-                score = 0.5 + 0.01 * (len(prompt) % 10)
-
-            scores.append(score)
-
-        scores_array = np.array(scores)
-
-        if return_seq:
-            # Generate dummy sequences
-            seqs = [[f"Input: {x}\nOutput: {prompt}" for x in self.xs] for prompt in prompts]
-            return scores_array, seqs
-
-        return scores_array
+        if isinstance(self.predetermined_scores, dict):
+            return self.predetermined_scores.get(pred, 0.5)
+        elif isinstance(self.predetermined_scores, list):
+            self.score_index += 1
+            return self.predetermined_scores[(self.score_index - 1) % len(self.predetermined_scores)]
+        elif callable(self.predetermined_scores):
+            return self.predetermined_scores(x)
+        else:
+            return len(pred)
+
+    # def evaluate(
+    #     self,
+    #     prompts: List[str],
+    #     predictor,
+    #     eval_strategy: str = "subsample",
+    #     system_prompts: List[str] = None,
+    #     return_agg_scores: bool = False,
+    #     return_seq: bool = False,
+    # ) -> np.ndarray:
+    #     """Evaluate prompts with predetermined scores.
+
+    #     Args:
+    #         prompts: List of prompts to evaluate
+    #         predictor: Predictor (ignored in mock)
+    #         system_prompts: System prompts (ignored in mock)
+    #         subsample: Whether to subsample (ignored in mock)
+    #         n_samples: Number of samples (ignored in mock)
+    #         return_seq: Whether to return sequences
+
+    #     Returns:
+    #         np.ndarray of scores, and optionally sequences
+    #     """
+    #     # Record the call
+    #     self.call_history.append(
+    #         {
+    #             "prompts": prompts,
+    #             "predictor": predictor,
+    #             "system_prompts": system_prompts,
+    #             "eval_strategy": eval_strategy,
+    #             "return_agg_scores": return_agg_scores,
+    #             "return_seq": return_seq,
+    #         }
+    #     )
+
+    #     scores = []
+    #     for prompt in prompts:
+    #         # Handle different types of predetermined_scores
+    #         if callable(self.predetermined_scores):
+    #             # If it's a function, call it with the prompt
+    #             score = self.predetermined_scores(prompt)
+    #         elif isinstance(self.predetermined_scores, dict) and prompt in self.predetermined_scores:
+    #             # If it's a dict, look up the prompt
+    #             score = self.predetermined_scores[prompt]
+    #         elif isinstance(self.predetermined_scores, list):
+    #             # If it's a list, return items in sequence (cycling if needed)
+    #             if self.score_index < len(self.predetermined_scores):
+    #                 score = self.predetermined_scores[self.score_index]
+    #                 self.score_index = (self.score_index + 1) % len(self.predetermined_scores)
+    #             else:
+    #                 score = 0.5  # Default score
+    #         else:
+    #             # Generate a somewhat predictable score based on prompt length
+    #             # (longer prompts get slightly higher scores)
+    #             score = 0.5 + 0.01 * (len(prompt) % 10)
+
+    #         scores.append(score)
+
+    #     scores_array = np.array(scores)
+
+    #     if return_seq:
+    #         # Generate dummy sequences
+    #         seqs = [[f"Input: {x}\nOutput: {prompt}" for x in self.xs] for prompt in prompts]
+    #         return scores_array, seqs
+
+    #     return scores_array
diff --git a/tests/optimizers/test_capo.py b/tests/optimizers/test_capo.py
index b0c9ae2..b0d69a0 100644
--- a/tests/optimizers/test_capo.py
+++ b/tests/optimizers/test_capo.py
@@ -2,6 +2,8 @@
 
 import pandas as pd
 
+from tests.mocks.mock_task import MockTask
+
 from promptolution.optimizers.capo import CAPO, CAPOPrompt
 
 
@@ -170,7 +172,6 @@ def test_crossover(mock_meta_llm, mock_predictor, initial_prompts, mock_task, mo
     offsprings = optimizer._crossover(
         [CAPOPrompt("Instruction 1", ["Example 1"]), CAPOPrompt("Instruction 2", ["Example 2"])]
     )
-    print(offsprings)
     assert len(offsprings) == 5
 
 
@@ -189,7 +190,8 @@ def test_mutate(mock_meta_llm, mock_predictor, initial_prompts, mock_task, mock_
     assert len(mutated) == 2
 
 
-def test_do_racing(mock_meta_llm, mock_predictor, initial_prompts, mock_task, mock_df):
+def test_do_racing(mock_meta_llm, mock_predictor, initial_prompts, mock_df):
+    mock_task = MockTask(predetermined_scores=[0.89, 0.9])
     optimizer = CAPO(
         predictor=mock_predictor,
         task=mock_task,
@@ -206,4 +208,4 @@ def test_do_racing(mock_meta_llm, mock_predictor, initial_prompts, mock_task, mo
 
     # check that mocktask.reset_blocks was called
     assert mock_task.reset_block_idx.call_count == 2
-    assert mock_task.increment_block_idx.call_count == 10
+    assert mock_task.increment_block_idx.call_count == 3
diff --git a/tests/tasks/test_classifications_tasks.py b/tests/tasks/test_classifications_tasks.py
index d72a7e6..8cd6e22 100644
--- a/tests/tasks/test_classifications_tasks.py
+++ b/tests/tasks/test_classifications_tasks.py
@@ -7,10 +7,9 @@
 
 def test_classification_task_initialization(mock_df):
     """Test that ClassificationTask initializes correctly."""
-    task = ClassificationTask(df=mock_df, description="Sentiment classification task", x_column="x", y_column="y")
+    task = ClassificationTask(df=mock_df, task_description="Sentiment classification task", x_column="x", y_column="y")
 
-    # Verify attributes
-    assert task.description == "Sentiment classification task"
+    assert task.task_description == "Sentiment classification task"
     assert len(task.classes) == 3
     assert set(task.classes) == set(["positive", "neutral", "negative"])
     assert len(task.xs) == 3
@@ -20,21 +19,17 @@ def test_classification_task_initialization(mock_df):
 
 def test_task_evaluate(mock_classification_task_with_subsampling, mock_predictor):
     """Test the evaluate method of ClassificationTask."""
-    # Evaluate with a single prompt
     prompts = ["Classify sentiment:"]
     scores = mock_classification_task_with_subsampling.evaluate(prompts, mock_predictor)
 
-    # Verify scores
     assert isinstance(scores, np.ndarray)
-    assert scores.shape == (1,)  # One score per prompt
-    assert 0 <= scores[0] <= 1  # Score should be between 0 and 1
+    assert scores.shape == (1,)
+    assert 0 <= scores[0] <= 1
 
-    # Evaluate with multiple prompts
     prompts = ["Classify sentiment:", "Rate the text:"]
     scores = mock_classification_task_with_subsampling.evaluate(prompts, mock_predictor)
 
-    # Verify scores for multiple prompts
-    assert scores.shape == (2,)  # Two scores, one per prompt
+    assert scores.shape == (2,)
     assert all(0 <= score <= 1 for score in scores)
 
 
@@ -42,18 +37,14 @@ def test_task_evaluate_with_subsampling(mock_classification_task_with_subsamplin
     """Test the evaluate method with subsampling."""
     prompts = ["Classify sentiment:"]
 
-    # Evaluate with subsampling
     scores = mock_classification_task_with_subsampling.evaluate(
         prompts,
         mock_predictor,
     )
 
-    # Verify scores
-    assert scores.shape == (1,)  # One score per prompt
+    assert scores.shape == (1,)
 
-    # Test with a different random seed to ensure different subsamples
     with pytest.raises(AssertionError, match=r".*Arrays are not equal.*"):
-        # Use a different random seed to force different subsampling
         np.random.seed(42)
         scores1 = mock_classification_task_with_subsampling.evaluate(
             prompts,
@@ -66,7 +57,6 @@ def test_task_evaluate_with_subsampling(mock_classification_task_with_subsamplin
             mock_predictor,
         )
 
-        # This should fail because the subsamples should be different
         np.testing.assert_array_equal(scores1, scores2)
 
 
@@ -74,14 +64,11 @@ def test_task_evaluate_with_return_seq(mock_classification_task_with_subsampling
     """Test the evaluate method with return_seq=True."""
     prompts = ["Classify sentiment:"]
 
-    # Evaluate with return_seq=True
     scores, seqs = mock_classification_task_with_subsampling.evaluate(prompts, mock_predictor, return_seq=True)
 
-    # Verify scores and sequences
-    assert scores.shape == (1,)  # One score per prompt
-    assert len(seqs) == 1  # One list of sequences per prompt
+    assert scores.shape == (1,)
+    assert len(seqs) == 1
 
-    # Check that sequences contain input text
     for seq in seqs[0]:
         assert any(sample_text in seq for sample_text in mock_classification_task_with_subsampling.xs)
 
@@ -94,22 +81,18 @@ def test_task_evaluate_with_system_prompts(
     prompts = ["Classify sentiment:"]
     system_prompts = ["Be concise"]
 
-    # Evaluate with system prompts
     scores = mock_classification_task_with_subsampling.evaluate(
         prompts, mock_predictor, system_prompts=system_prompts, return_agg_scores=True
     )
 
-    # Verify scores
     assert scores.shape == (1,)
-
-    # Verify that system prompts were passed through to the LLM
     assert any(call["system_prompts"] == system_prompts for call in mock_downstream_llm.call_history)
 
 
 def test_pop_datapoints(mock_df):
     task = ClassificationTask(
         df=mock_df,
-        description="Sentiment classification task",
+        task_description="Sentiment classification task",
         eval_strategy="sequential_blocks",
     )
 
@@ -121,13 +104,77 @@ def test_pop_datapoints(mock_df):
 
 def test_blocks(mock_df):
     task = ClassificationTask(
-        df=mock_df, description="Sentiment classification task", eval_strategy="sequential_blocks", n_subsamples=1
+        df=mock_df, task_description="Sentiment classification task", eval_strategy="sequential_blocks", n_subsamples=1
     )
 
-    # Increment blocks
     task.increment_block_idx()
     assert task.block_idx == 1
 
-    # Reset blocks
     task.reset_block_idx()
     assert task.block_idx == 0
+
+
+def test_classification_task_evaluate_random_block(mock_df, mock_predictor):
+    """Test the evaluate method with 'random_block' subsampling for ClassificationTask."""
+    task = ClassificationTask(
+        df=mock_df,
+        task_description="Sentiment classification",
+        x_column="x",
+        y_column="y",
+        n_subsamples=1,
+        eval_strategy="random_block",
+        seed=42,
+    )
+    prompts = ["Classify sentiment:"]
+
+    evaluated_x_sets = []
+    for _ in range(5):
+        mock_predictor.call_history = []
+        task.evaluate(prompts, mock_predictor)
+        if mock_predictor.call_history:
+            evaluated_x_sets.append(tuple(mock_predictor.call_history[0]["preds"]))
+        else:
+            evaluated_x_sets.append(tuple())
+
+    assert len(set(evaluated_x_sets)) > 1, "Should select different random blocks across evaluations"
+
+
+def test_classification_task_evaluate_sequential_block(mock_df, mock_predictor):
+    """Test the evaluate method with 'sequential_block' subsampling for ClassificationTask."""
+    task = ClassificationTask(
+        df=mock_df,
+        task_description="Sentiment classification",
+        x_column="x",
+        y_column="y",
+        n_subsamples=1,
+        eval_strategy="sequential_block",
+        seed=42,
+    )
+    prompts = ["Classify sentiment:"]
+
+    task.reset_block_idx()
+    assert task.block_idx == 0
+
+    expected_x_sequence = [
+        "This review is not negative, so my answer is <final_answer>positive</final_answer>",
+        "This review is not positive, so my answer is <final_answer>negative</final_answer>",
+        "This review is neither positive nor negative, so my answer is <final_answer>neutral</final_answer>",
+    ]
+
+    for i in range(task.n_blocks):
+        mock_predictor.call_history = []
+        task.evaluate(prompts, mock_predictor)
+
+        assert len(mock_predictor.call_history) == 1
+        assert mock_predictor.call_history[0]["preds"][0] == expected_x_sequence[i]
+
+        task.increment_block_idx()
+        # After the last block the index is expected to wrap back to 0.
+        expected_block_idx = i + 1 if i < task.n_blocks - 1 else 0
+        assert task.block_idx == expected_block_idx
+
+    task_full_strategy = ClassificationTask(df=mock_df, x_column="x", y_column="y", eval_strategy="full")
+    with pytest.raises(ValueError, match="Block increment is only valid for block subsampling."):
+        task_full_strategy.increment_block_idx()
+    with pytest.raises(ValueError, match="Block reset is only valid for block subsampling."):
+        task_full_strategy.reset_block_idx()
diff --git a/tests/tasks/test_judge_task.py b/tests/tasks/test_judge_task.py
new file mode 100644
index 0000000..a1b3ccd
--- /dev/null
+++ b/tests/tasks/test_judge_task.py
@@ -0,0 +1,106 @@
+import numpy as np
+
+
+def test_judge_task_initialization(mock_judge_task_with_y, mock_judge_llm):
+    """Test that JudgeTask initializes correctly with ground truth."""
+    assert mock_judge_task_with_y.task_description == "Evaluate sentiment prediction quality."
+    assert mock_judge_task_with_y.x_column == "x"
+    assert mock_judge_task_with_y.y_column == "y"
+    assert mock_judge_task_with_y.judge_llm == mock_judge_llm
+    assert mock_judge_task_with_y.has_y is True
+    assert len(mock_judge_task_with_y.xs) == len(mock_judge_task_with_y.df)
+    assert len(mock_judge_task_with_y.ys) == len(mock_judge_task_with_y.df)
+
+
+def test_judge_task_initialization_no_y(mock_judge_task_no_y, mock_judge_llm):
+    """Test JudgeTask initialization when no y_column is provided."""
+    assert mock_judge_task_no_y.y_column is None
+    assert mock_judge_task_no_y.has_y is False
+    assert len(mock_judge_task_no_y.xs) == len(mock_judge_task_no_y.df)
+    assert np.all(mock_judge_task_no_y.ys == None)  # noqa: E711
+
+
+def test_judge_task_construct_judge_prompt_with_ground_truth(mock_judge_task_with_y):
+    """Test _construct_judge_prompt generates correct prompt when ground truth is available."""
+    x_val = "This movie was great!"
+    pred_val = "positive"
+    y_val = "positive"
+    prompt = mock_judge_task_with_y._construct_judge_prompt(x_val, pred_val, y_val)
+
+    assert mock_judge_task_with_y.task_description in prompt
+    assert f"Input:\n{x_val}" in prompt
+    assert f"Ground Truth:\n{y_val}" in prompt
+    assert f"Prediction:\n{pred_val}" in prompt
+    assert "Response:" not in prompt
+    assert "<final_score>" in prompt
+
+
+def test_judge_task_construct_judge_prompt_without_ground_truth(mock_judge_task_no_y):
+    """Test _construct_judge_prompt generates correct prompt when no ground truth."""
+    x_val = "Tell me a joke."
+    pred_val = "Why did the scarecrow win an award? Because he was outstanding in his field!"
+    prompt = mock_judge_task_no_y._construct_judge_prompt(x_val, pred_val, None)
+
+    assert mock_judge_task_no_y.task_description in prompt
+    assert f"Input:\n{x_val}" in prompt
+    assert pred_val in prompt
+    assert "<final_score>" in prompt
+
+
+def test_judge_task_single_evaluate_successful_parse(mock_judge_task_with_y, mock_judge_llm):
+    """Test _single_evaluate correctly parses a valid score from judge LLM response."""
+    score = mock_judge_task_with_y._single_evaluate(x="any", y="any", pred="any")
+    assert score == 5.0
+
+
+def test_judge_task_evaluate_with_ground_truth(mock_judge_task_with_y, mock_predictor, mock_judge_llm):
+    """Test the evaluate method of JudgeTask with ground truth and full evaluation."""
+    prompts = ["Rate the sentiment:"]
+
+    mock_predictor.call_history = []
+    mock_judge_llm.call_history = []
+
+    scores_per_datapoint = mock_judge_task_with_y.evaluate(prompts, mock_predictor, return_agg_scores=False)
+
+    assert scores_per_datapoint.shape == (len(prompts), len(mock_judge_task_with_y.xs))
+
+    expected_scores = [5.0, -5.0, 0.0]
+    np.testing.assert_allclose(scores_per_datapoint[0], expected_scores)
+
+    assert len(mock_predictor.call_history) == 1
+    assert len(mock_judge_llm.call_history) == len(mock_judge_task_with_y.xs)
+
+    mock_predictor.call_history = []
+    mock_judge_llm.call_history = []
+
+    aggregated_scores = mock_judge_task_with_y.evaluate(prompts, mock_predictor, return_agg_scores=True)
+    assert aggregated_scores.shape == (len(prompts),)
+    np.testing.assert_allclose(aggregated_scores[0], np.mean(expected_scores))
+
+
+def test_judge_task_evaluate_no_ground_truth(mock_judge_task_no_y, mock_predictor, mock_judge_llm):
+    """Test the evaluate method of JudgeTask without a y_column (no ground truth)."""
+    prompts = ["Tell a funny joke:"]
+
+    mock_predictor.call_history = []
+    mock_judge_llm.call_history = []
+
+    scores_per_datapoint = mock_judge_task_no_y.evaluate(prompts, mock_predictor, return_agg_scores=False)
+
+    assert scores_per_datapoint.shape == (len(prompts), mock_judge_task_no_y.n_subsamples)
+
+    expected_scores = [5.0, -5.0]
+    np.testing.assert_allclose(scores_per_datapoint[0], expected_scores)
+
+    assert len(mock_predictor.call_history) == 1
+    assert len(mock_judge_llm.call_history) == mock_judge_task_no_y.n_subsamples
+
+
+def test_judge_task_evaluate_with_return_seq(mock_judge_task_with_y, mock_predictor):
+    """Test the evaluate method with return_seq=True for JudgeTask."""
+    prompts = ["Evaluate this text:"]
+    scores, seqs = mock_judge_task_with_y.evaluate(prompts, mock_predictor, return_seq=True)
+
+    assert scores.shape == (1,)
+    assert len(seqs) == 1
+    assert len(seqs[0]) == len(mock_judge_task_with_y.xs)
diff --git a/tests/tasks/test_reward_tasks.py b/tests/tasks/test_reward_tasks.py
new file mode 100644
index 0000000..dc19069
--- /dev/null
+++ b/tests/tasks/test_reward_tasks.py
@@ -0,0 +1,34 @@
+import numpy as np
+
+
+def test_reward_task_initialization(mock_reward_task, simple_reward_function):
+    """Test that RewardTask initializes correctly."""
+    assert mock_reward_task.task_description == "Evaluate text quality"
+    assert mock_reward_task.reward_function == simple_reward_function
+    assert mock_reward_task.x_column == "x"
+    assert not mock_reward_task.has_y
+    assert len(mock_reward_task.xs) == len(mock_reward_task.df)
+    assert np.all(mock_reward_task.ys == None)  # noqa: E711
+
+
+def test_reward_task_initialization_no_x_column(mock_reward_task_no_x_column, simple_reward_function):
+    """Test RewardTask initialization when a dummy x_column is provided (no semantic input)."""
+    assert mock_reward_task_no_x_column.x_column == "dummy_input"
+    assert not mock_reward_task_no_x_column.has_y
+    assert len(mock_reward_task_no_x_column.xs) == len(mock_reward_task_no_x_column.df)
+    assert all(x == "" for x in mock_reward_task_no_x_column.xs)
+    assert np.all(mock_reward_task_no_x_column.ys == None)  # noqa: E711
+
+
+def test_reward_task_evaluate_with_return_seq(mock_reward_task, mock_predictor):
+    """Test the evaluate method with return_seq=True for RewardTask."""
+    prompts = ["Generate a short text:"]
+
+    scores, seqs = mock_reward_task.evaluate(prompts, mock_predictor, return_seq=True, return_agg_scores=False)
+
+    assert scores.shape == (1, len(mock_reward_task.xs))
+    assert len(seqs) == 1
+    assert len(seqs[0]) == len(mock_reward_task.xs)
+
+    for seq in seqs[0]:
+        assert any(str(x_val) in seq for x_val in mock_reward_task.xs)
diff --git a/tests/utils/test_prompt_creation.py b/tests/utils/test_prompt_creation.py
new file mode 100644
index 0000000..91d739d
--- /dev/null
+++ b/tests/utils/test_prompt_creation.py
@@ -0,0 +1,145 @@
+from promptolution.tasks.base_task import BaseTask
+from promptolution.tasks.classification_tasks import ClassificationTask
+from promptolution.utils.prompt_creation import create_prompt_variation, create_prompts_from_samples
+
+
+def test_create_prompt_variation_single_prompt(mock_meta_llm):
+    """Test create_prompt_variation with a single string prompt and default meta-prompt."""
+    original_prompt = "Analyze the sentiment of the following text."
+
+    mock_meta_llm.call_history = []
+
+    varied_prompts = create_prompt_variation(original_prompt, mock_meta_llm)
+
+    assert isinstance(varied_prompts, list)
+    assert len(varied_prompts) == 1
+    assert varied_prompts[0] == "Meta-generated prompt for input 0"
+
+    assert len(mock_meta_llm.call_history) == 1
+
+
+def test_create_prompt_variation_list_of_prompts(mock_meta_llm):
+    """Test create_prompt_variation with a list of prompts and custom meta-prompt."""
+    original_prompts = ["Prompt A.", "Prompt B."]
+    custom_meta_prompt = "Vary the following: <prev_prompt>"
+
+    mock_meta_llm.call_history = []
+
+    varied_prompts = create_prompt_variation(original_prompts, mock_meta_llm, meta_prompt=custom_meta_prompt)
+
+    assert isinstance(varied_prompts, list)
+    assert len(varied_prompts) == 2
+    assert varied_prompts[0] == "Meta-generated prompt for input 0"
+    assert varied_prompts[1] == "Meta-generated prompt for input 1"
+
+    assert len(mock_meta_llm.call_history) == 1
+
+
+def test_create_prompts_from_samples_default_meta_prompt(mock_df, mock_meta_llm):
+    """Test create_prompts_from_samples with default meta_prompt (no task_description)."""
+    task = ClassificationTask(df=mock_df, x_column="x", y_column="y", task_description="Dummy task")
+    n_samples = 2
+    n_prompts = 1
+
+    mock_meta_llm.call_history = []
+
+    generated_prompts = create_prompts_from_samples(task, mock_meta_llm, n_samples=n_samples, n_prompts=n_prompts)
+
+    assert isinstance(generated_prompts, list)
+    assert len(generated_prompts) == n_prompts
+    assert generated_prompts[0] == "Meta-generated prompt for input 0"
+
+    assert len(mock_meta_llm.call_history) == n_prompts
+
+
+def test_create_prompts_from_samples_with_task_description_only(mock_df, mock_meta_llm):
+    """Test create_prompts_from_samples with task_description and no meta_prompt."""
+    task = ClassificationTask(df=mock_df, x_column="x", y_column="y")
+    test_task_description = "Classify customer reviews into positive, negative, or neutral."
+    n_samples = 2
+    n_prompts = 1
+
+    mock_meta_llm.call_history = []
+
+    generated_prompts = create_prompts_from_samples(
+        task, mock_meta_llm, n_samples=n_samples, task_description=test_task_description, n_prompts=n_prompts
+    )
+
+    assert len(generated_prompts) == n_prompts
+    assert generated_prompts[0] == "Meta-generated prompt for input 0"
+
+
+def test_create_prompts_from_samples_with_custom_meta_prompt_only(mock_df, mock_meta_llm):
+    """Test create_prompts_from_samples with custom meta_prompt and no task_description."""
+    task = ClassificationTask(df=mock_df, x_column="x", y_column="y")
+    custom_meta_prompt = "Generate a prompt based on these examples: <input_output_pairs>"
+    n_samples = 2
+    n_prompts = 1
+
+    mock_meta_llm.call_history = []
+
+    generated_prompts = create_prompts_from_samples(
+        task, mock_meta_llm, meta_prompt=custom_meta_prompt, n_samples=n_samples, n_prompts=n_prompts
+    )
+
+    assert len(generated_prompts) == n_prompts
+    assert generated_prompts[0] == "Meta-generated prompt for input 0"
+
+
+def test_create_prompts_from_samples_with_both_meta_prompt_and_task_description(mock_df, mock_meta_llm):
+    """Test create_prompts_from_samples with both custom meta_prompt and task_description."""
+    task = ClassificationTask(df=mock_df, x_column="x", y_column="y")
+    custom_meta_prompt = "For <task_desc>, create a prompt using: <input_output_pairs>"
+    test_task_description = "Identify categories."
+    n_samples = 2
+    n_prompts = 1
+
+    mock_meta_llm.call_history = []
+
+    generated_prompts = create_prompts_from_samples(
+        task,
+        mock_meta_llm,
+        meta_prompt=custom_meta_prompt,
+        n_samples=n_samples,
+        task_description=test_task_description,
+        n_prompts=n_prompts,
+    )
+
+    assert len(generated_prompts) == n_prompts
+    assert generated_prompts[0] == "Meta-generated prompt for input 0"
+
+
+def test_create_prompts_from_samples_random_sampling(mock_df, mock_meta_llm):
+    """Test create_prompts_from_samples with random sampling (not ClassificationTask or get_uniform_labels=False)."""
+
+    class DummyTask(BaseTask):
+        def _evaluate(self, x, y, pred):
+            return 1.0
+
+    task = DummyTask(df=mock_df, x_column="x", y_column="y", task_description="Dummy task for random sampling")
+    n_samples = 2
+    n_prompts = 1
+
+    mock_meta_llm.call_history = []
+
+    generated_prompts = create_prompts_from_samples(
+        task, mock_meta_llm, n_samples=n_samples, get_uniform_labels=False, n_prompts=n_prompts
+    )
+
+    assert len(generated_prompts) == n_prompts
+
+
+def test_create_prompts_from_samples_multiple_prompts(mock_df, mock_meta_llm):
+    """Test create_prompts_from_samples generates multiple prompts."""
+    task = ClassificationTask(df=mock_df, x_column="x", y_column="y")
+    n_samples = 2
+    n_prompts = 3
+
+    mock_meta_llm.call_history = []
+
+    generated_prompts = create_prompts_from_samples(task, mock_meta_llm, n_samples=n_samples, n_prompts=n_prompts)
+
+    assert isinstance(generated_prompts, list)
+    assert len(generated_prompts) == n_prompts
+
+    assert len(mock_meta_llm.call_history) == 1
diff --git a/scripts/api_llm_demo.py b/tutorials/api_llm_demo.py
similarity index 92%
rename from scripts/api_llm_demo.py
rename to tutorials/api_llm_demo.py
index 13370ea..d369a1b 100644
--- a/scripts/api_llm_demo.py
+++ b/tutorials/api_llm_demo.py
@@ -33,7 +33,7 @@
 
 task = ClassificationTask(
     df,
-    description="The dataset contains news articles categorized into four classes: World, Sports, Business, and Tech. The task is to classify each news article into one of the four categories.",
+    task_description="The dataset contains news articles categorized into four classes: World, Sports, Business, and Tech. The task is to classify each news article into one of the four categories.",
     x_column="input",
     y_column="target",
 )
diff --git a/scripts/capo_demo.py b/tutorials/capo_demo.py
similarity index 92%
rename from scripts/capo_demo.py
rename to tutorials/capo_demo.py
index a03ec78..a7cc53f 100644
--- a/scripts/capo_demo.py
+++ b/tutorials/capo_demo.py
@@ -35,7 +35,7 @@
 
 task = ClassificationTask(
     df,
-    description="The dataset consists of elementary school math word problems that require multi-step reasoning to solve. The task is to solve each word problem and provide the final answer.",
+    task_description="The dataset consists of elementary school math word problems that require multi-step reasoning to solve. The task is to solve each word problem and provide the final answer.",
     x_column="input",
     y_column="target",
     eval_strategy="sequential_block",
diff --git a/scripts/evoprompt_demo.py b/tutorials/evoprompt_demo.py
similarity index 93%
rename from scripts/evoprompt_demo.py
rename to tutorials/evoprompt_demo.py
index 4177eb2..6568230 100644
--- a/scripts/evoprompt_demo.py
+++ b/tutorials/evoprompt_demo.py
@@ -34,7 +34,7 @@
 
 task = ClassificationTask(
     df,
-    description="The dataset contains news articles categorized into four classes: World, Sports, Business, and Tech. The task is to classify each news article into one of the four categories.",
+    task_description="The dataset contains news articles categorized into four classes: World, Sports, Business, and Tech. The task is to classify each news article into one of the four categories.",
     x_column="text",
     y_column="label_text",
     eval_strategy="subsample",
diff --git a/notebooks/getting_started.ipynb b/tutorials/getting_started.ipynb
similarity index 99%
rename from notebooks/getting_started.ipynb
rename to tutorials/getting_started.ipynb
index 54bf3f2..2c140f6 100644
--- a/notebooks/getting_started.ipynb
+++ b/tutorials/getting_started.ipynb
@@ -380,7 +380,7 @@
  ],
  "metadata": {
   "kernelspec": {
-   "display_name": ".venv",
+   "display_name": "promptolution-py3.12",
    "language": "python",
    "name": "python3"
   },
diff --git a/tutorials/llm_as_judge_tutorial.ipynb b/tutorials/llm_as_judge_tutorial.ipynb
new file mode 100644
index 0000000..cc26baa
--- /dev/null
+++ b/tutorials/llm_as_judge_tutorial.ipynb
@@ -0,0 +1,448 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Getting Started: LLM as a Judge with Promptolution\n",
+    "\n",
+    "## Welcome to Promptolution! \n",
+    "\n",
+    "Discover a powerful tool for evolving and optimizing your LLM prompts. This notebook provides a friendly introduction to one of Promptolution's most advanced features: LLM as a Judge.\n",
+    "\n",
+    "While the standard getting_started notebook shows how to optimize for classification tasks, this guide will focus on something different. We'll optimize prompts for a creative task where there's no single \"correct\" answer: *Finding an optimal argument for a statement*!"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Intro\n",
+    "In traditional machine learning and prompt optimization, we often rely on labeled data. For a classification task, you need an input (x) and a corresponding ground-truth label (y). The goal is to find a prompt that helps the model predict y correctly.\n",
+    "But what if your task is more subjective? How do you \"label\" things like:\n",
+    "\n",
+    "- The quality of a generated argument?\n",
+    "- The creativity of a story?\n",
+    "- The helpfulness of a summary?\n",
+    "- The persuasiveness of an essay?\n",
+    "\n",
+    "This is where LLM as a Judge comes in. Instead of relying on a pre-defined dataset of labels, we use another powerful Language Model (the \"judge\") to score the output of our prompts. The process looks like this:\n",
+    "\n",
+    "A candidate prompt is used to generate a response (e.g., an argument).\n",
+    "A \"judge\" LLM then evaluates this response based on the task provided and assigns a score.\n",
+    "Promptolution's optimizer uses these scores to identify which prompts are best and evolves them to generate even better responses.\n",
+    "\n",
+    "The beauty of this approach is its flexibility. While you can provide ground truths (in case there is a correct answer) and let the judge LLM decide whether the prediction and the correct answer are equivalent, you don't need to.\n",
+    "\n",
+    "*New to Promptolution? If you haven't seen our classification tutorial yet, check out `getting_started.ipynb` first! It covers the basics of prompt optimization with simpler tasks like text classification. This notebook builds on those concepts but tackles more complex, subjective tasks.*"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Installation\n",
+    "Install Promptolution with a single command"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "! pip install promptolution[api]"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "vscode": {
+     "languageId": "plaintext"
+    }
+   },
+   "source": [
+    "## Imports"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import pandas as pd\n",
+    "from promptolution.utils import ExperimentConfig\n",
+    "from promptolution.helpers import run_experiment\n",
+    "import nest_asyncio\n",
+    "\n",
+    "nest_asyncio.apply()  # Required for notebook environments"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Setting Up Your Experiment"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Prepare the data"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "For this tutorial, we're using IBM's Argument Quality Ranking dataset - a collection of crowd-sourced arguments on controversial topics like capital punishment, abortion rights, and climate change.\n",
+    "\n",
+    "Unlike classification tasks where you have clear input-output pairs, here we're working with debate topics that we want to generate compelling arguments for."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df = pd.read_csv(\"hf://datasets/ibm-research/argument_quality_ranking_30k/dev.csv\").sample(300)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 18,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "\n",
+      "Sample topics:\n",
+      "- We should adopt a zero-tolerance policy in schools\n",
+      "- Payday loans should be banned\n",
+      "- Intelligence tests bring more harm than good\n"
+     ]
+    }
+   ],
+   "source": [
+    "print(\"\\nSample topics:\")\n",
+    "for topic in df[\"topic\"].unique()[:3]:\n",
+    "    print(f\"- {topic}\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Our task: **Given a controversial statement, generate the strongest possible argument supporting that position.**"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Let's look at what we're working with:"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Creating Initial Prompts"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Here are some starter prompts for generating compelling arguments. Feel free to experiment with your own!"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "init_prompts = [\n",
+    "    \"Create a strong argument for this position with clear reasoning and examples:\",\n",
+    "    \"Write a persuasive argument supporting this statement. Include evidence and address counterarguments:\",\n",
+    "    \"Make a compelling case for this viewpoint using logical reasoning and real examples:\",\n",
+    "    \"Argue convincingly for this position. Provide supporting points and evidence:\",\n",
+    "    \"Build a strong argument for this statement with clear structure and solid reasoning:\",\n",
+    "    \"Generate a persuasive argument supporting this position. Use facts and logical flow:\",\n",
+    "    \"Create a well-reasoned argument for this viewpoint with supporting evidence:\",\n",
+    "    \"Write a convincing argument for this position. Include examples and counter opposing views:\",\n",
+    "    \"Develop a strong case supporting this statement using clear logic and evidence:\",\n",
+    "    \"Construct a persuasive argument for this position with solid reasoning and examples:\",\n",
+    "]"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Configure Your LLM"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "For this demonstration, we will again use the DeepInfra API, but you can easily switch to other providers like Anthropic or OpenAI by simply changing the `api_url` and `model_id`."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "api_key = \"YOUR_API_KEY\"  # Replace with your Promptolution API key"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Here are the key parameters for LLM-as-a-Judge tasks:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 13,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "config = ExperimentConfig(\n",
+    "    optimizer=\"evopromptga\",\n",
+    "    task_description=\"Given a statement, find the best argument supporting it.\",\n",
+    "    x_column=\"topic\",\n",
+    "    prompts=init_prompts,\n",
+    "    n_steps=3,\n",
+    "    n_subsamples=10,\n",
+    "    subsample_strategy=\"random_subsample\",\n",
+    "    api_url=\"https://api.deepinfra.com/v1/openai\",\n",
+    "    model_id=\"meta-llama/Meta-Llama-3-8B-Instruct\",\n",
+    "    api_key=api_key,\n",
+    "    task_type=\"judge\",\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "- `task_type=\"judge\"` - This tells Promptolution to use LLM evaluation instead of accuracy metrics\n",
+    "- `x_column=\"topic\"` - We specify which column contains our input (debate topics)\n",
+    "- `optimizer=\"evopromptga\"` - In the classification task we showcased CAPO; here we are using EvoPrompt, a strong evolutionary prompt optimizer.\n",
+    "- No y column needed - the judge will evaluate quality without ground truth labels!"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Run Your Experiment"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "With everything configured, you're ready to optimize your prompts! The run_experiment function will:\n",
+    "\n",
+    "1. Evaluate your initial prompts by generating arguments and having the judge LLM score them\n",
+    "1. Use evolutionary operators (mutation, crossover) to create new prompt variations from the best-performing ones\n",
+    "1. Test these new prompt candidates and select the fittest ones for the next generation\n",
+    "1. Repeat this evolutionary process for the specified number of steps, gradually improving prompt quality"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 14,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "🔥 Starting optimization...\n"
+     ]
+    }
+   ],
+   "source": [
+    "prompts = run_experiment(df, config)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "You can expect this to take several minutes as the optimizer generates arguments, evaluates them with the judge, and evolves the prompts."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>prompt</th>\n",
+       "      <th>score</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>0</th>\n",
+       "      <td>Construct a persuasive argument supporting the given statement, relying on logical coherence and evidence-based reasoning.</td>\n",
+       "      <td>0.931500</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1</th>\n",
+       "      <td>Develop a strong case supporting this statement using clear logic and evidence:</td>\n",
+       "      <td>0.924167</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2</th>\n",
+       "      <td>Construct a convincing case supporting the stated argument, providing evidence and responding to potential objections.</td>\n",
+       "      <td>0.915833</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>3</th>\n",
+       "      <td>Develop a well-reasoned argument in favor of the given statement, incorporating reliable examples and addressing potential counterpoints.</td>\n",
+       "      <td>0.913333</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>4</th>\n",
+       "      <td>Write a persuasive argument supporting this statement. Include evidence and address counterarguments:</td>\n",
+       "      <td>0.907500</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>5</th>\n",
+       "      <td>Present a convincing case for this assertion, incorporating logical premises and applicable examples.</td>\n",
+       "      <td>0.903333</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>6</th>\n",
+       "      <td>Fortify the provided statement with a robust and well-reasoned argument, underscoring logical relationships and leveraging empirical support to build a compelling case, while also anticipating and addressing potential counterpoints.</td>\n",
+       "      <td>0.902500</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>7</th>\n",
+       "      <td>Construct a strong claim in support of this statement, employing a logical framework and relevant examples to make a convincing case.</td>\n",
+       "      <td>0.891667</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>8</th>\n",
+       "      <td>Create a well-reasoned argument for this viewpoint with supporting evidence:</td>\n",
+       "      <td>0.888333</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>9</th>\n",
+       "      <td>Extract the most compelling supporting argument for this statement, grounding it in logical reasoning and bolstered by relevant evidence and examples.</td>\n",
+       "      <td>0.697500</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "                                                                                                                                                                                                                                     prompt  \\\n",
+       "0                                                                                                                Construct a persuasive argument supporting the given statement, relying on logical coherence and evidence-based reasoning.   \n",
+       "1                                                                                                                                                           Develop a strong case supporting this statement using clear logic and evidence:   \n",
+       "2                                                                                                                    Construct a convincing case supporting the stated argument, providing evidence and responding to potential objections.   \n",
+       "3                                                                                                 Develop a well-reasoned argument in favor of the given statement, incorporating reliable examples and addressing potential counterpoints.   \n",
+       "4                                                                                                                                     Write a persuasive argument supporting this statement. Include evidence and address counterarguments:   \n",
+       "5                                                                                                                                     Present a convincing case for this assertion, incorporating logical premises and applicable examples.   \n",
+       "6  Fortify the provided statement with a robust and well-reasoned argument, underscoring logical relationships and leveraging empirical support to build a compelling case, while also anticipating and addressing potential counterpoints.   \n",
+       "7                                                                                                     Construct a strong claim in support of this statement, employing a logical framework and relevant examples to make a convincing case.   \n",
+       "8                                                                                                                                                              Create a well-reasoned argument for this viewpoint with supporting evidence:   \n",
+       "9                                                                                    Extract the most compelling supporting argument for this statement, grounding it in logical reasoning and bolstered by relevant evidence and examples.   \n",
+       "\n",
+       "      score  \n",
+       "0  0.931500  \n",
+       "1  0.924167  \n",
+       "2  0.915833  \n",
+       "3  0.913333  \n",
+       "4  0.907500  \n",
+       "5  0.903333  \n",
+       "6  0.902500  \n",
+       "7  0.891667  \n",
+       "8  0.888333  \n",
+       "9  0.697500  "
+      ]
+     },
+     "execution_count": 16,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "prompts"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "The best prompts aren't always the most obvious ones - let the optimizer surprise you with what works!\n",
+    "\n",
+    "\n",
+    "Happy prompt optimizing! 🚀✨ We can't wait to see what you build with Promptolution! 🤖💡"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "promptolution-py3.12",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.12.5"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
diff --git a/scripts/opro_demo.py b/tutorials/opro_demo.py
similarity index 93%
rename from scripts/opro_demo.py
rename to tutorials/opro_demo.py
index eec818f..2b6ea93 100644
--- a/scripts/opro_demo.py
+++ b/tutorials/opro_demo.py
@@ -34,7 +34,7 @@
 
 task = ClassificationTask(
     df,
-    description="The dataset contains news articles categorized into four classes: World, Sports, Business, and Tech. The task is to classify each news article into one of the four categories.",
+    task_description="The dataset contains news articles categorized into four classes: World, Sports, Business, and Tech. The task is to classify each news article into one of the four categories.",
     x_column="text",
     y_column="label_text",
 )
@@ -62,7 +62,7 @@
 
 optimizer = OPRO(
     task=task,
-    prompt_template=OPRO_TEMPLATE_TD.replace("<task_desc", task.description),
+    prompt_template=OPRO_TEMPLATE_TD.replace("<task_desc", task.task_description),
     predictor=predictor,
     meta_llm=meta_llm,
     initial_prompts=initial_prompts,
diff --git a/tutorials/reward_task_tutorial.ipynb b/tutorials/reward_task_tutorial.ipynb
new file mode 100644
index 0000000..ab26213
--- /dev/null
+++ b/tutorials/reward_task_tutorial.ipynb
@@ -0,0 +1,416 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Getting Started: Reward Tasks with Promptolution\n",
+    "\n",
+    "Welcome to the world of **reward-based prompt optimization**! If you've explored our classification tutorial (`getting_started.ipynb`) or our LLM-as-a-Judge notebook (`llm_judge_getting_started.ipynb`), you've seen how to optimize prompts for predicting labels or generating content that gets rated by AI judges.\n",
+    "\n",
+    "But what if you want to optimize for something completely different? What if you want to optimize for:\n",
+    "* **Objective, measurable outcomes** rather than subjective quality?\n",
+    "* **System compatibility** - does the output actually work with your software?\n",
+    "* **Concrete business metrics** that you can define and measure automatically?\n",
+    "\n",
+    "This is where **Reward Tasks** shine. Instead of relying on pre-labeled data or AI judges, you define your own reward function - a simple Python function that takes the model's output and returns a score. The optimizer then evolves prompts that maximize this reward.\n",
+    "\n",
+    "**The beauty of reward tasks**: You can optimize for literally anything you can measure! Valid JSON parsing, code execution success, mathematical correctness, format compliance, API compatibility - if you can write a function to evaluate it, you can optimize for it.\n",
+    "\n",
+    "> **New to Promptolution?** If you haven't seen our other tutorials yet, check out `getting_started.ipynb` (classification) and `llm_judge_getting_started.ipynb` (LLM evaluation) first! This notebook builds on those concepts but tackles objective, measurable outcomes."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Installation\n",
+    "Install Promptolution with a single command"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# ! pip install promptolution[api]"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "vscode": {
+     "languageId": "plaintext"
+    }
+   },
+   "source": [
+    "## Imports"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "%load_ext autoreload\n",
+    "%autoreload 2"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 23,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import pandas as pd\n",
+    "from promptolution.utils import ExperimentConfig\n",
+    "from promptolution.helpers import run_experiment\n",
+    "import nest_asyncio\n",
+    "\n",
+    "nest_asyncio.apply()  # Required for notebook environments"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Setting Up Your Experiment"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Prepare the data"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "For this tutorial, we're tackling a practical business problem: **converting unstructured text into valid JSON**. This is something every company deals with - extracting structured data from emails, documents, conversations, and web content.\n",
+    "\n",
+    "We're using a specialized dataset designed for JSON extraction tasks, containing diverse text examples across different domains and formats."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df = pd.read_parquet(\"hf://datasets/paraloq/json_data_extraction/data.parquet\").sample(300)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "**Key difference from other tasks**: Notice we're not using labeled \"correct\" outputs or asking an AI to judge quality. Instead, we'll define our own objective success criteria - does the output parse as valid JSON?\n",
+    "\n",
+    "Let's explore what we're working with:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "print(\"Dataset columns:\", df.columns.tolist())\n",
+    "print(f\"\\nDataset size: {len(df)} examples\")\n",
+    "print(\"\\nSample text:\")\n",
+    "print(df[\"text\"].iloc[0][:200] + \"...\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Creating Initial Prompts"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Here is a single starter prompt for JSON extraction. Feel free to experiment with your own approaches!"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 15,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "init_prompts = [\"Translate the provided information into a valid json schema!\"]"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Configure Your LLM"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Promptolution offers three flexible ways to access language models:\n",
+    "\n",
+    "1. Local LLMs (using the Transformers library)\n",
+    "1. vLLM backend (for efficient serving of large language models)\n",
+    "1. API-based LLMs (compatible with any provider following the OpenAI standard)\n",
+    "\n",
+    "For this demonstration, we'll use the DeepInfra API, but you can easily switch to other providers like Anthropic or OpenAI by simply changing the `api_url` and `model_id` in the configuration."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 16,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "api_key = \"YOUR_API_KEY\"  # Replace with your Promptolution API key\n",
+    "\n",
+    "with open(\"../deepinfratoken.txt\", \"r\") as f:\n",
+    "    api_key = f.read().strip()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Here's an explanation of each configuration parameter in the ExperimentConfig:\n",
+    "- `optimizer`: The algorithm used for prompt optimization. Currently we support \"capo\", \"evopromptga\", \"evopromptde\", and \"opro\". For this example, we use \"opro\", as it requires only a single initial prompt.\n",
+    "- `task_description`: A string describing the task you're optimizing prompts for. This is used to provide the meta-llm with context about your task.\n",
+    "- `prompts`: A list of initial prompt strings that will be used as the starting point for optimization.\n",
+    "- `n_steps`: The number of optimization steps to run. Higher values allow more exploration and refinement but require more API calls and computational resources.\n",
+    "- `api_url`: The API endpoint URL used to access the language model. This example uses DeepInfra's API which follows the OpenAI standard.\n",
+    "- `model_id`: The LLM to use for the experiment, as both downstream and meta LLM.\n",
+    "- `api_key`: Your API authentication token required to access the language model service."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Define Your Reward Function\n",
+    "\n",
+    "This is where the magic happens! Unlike classification (which needs labeled data) or judging (which uses AI evaluation), reward tasks let you define exactly what \"success\" means for your business case.\n",
+    "\n",
+    "Our reward function is beautifully simple and objective:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 17,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import json\n",
+    "\n",
+    "\n",
+    "def reward_function(prediction: str) -> float:\n",
+    "    try:\n",
+    "        json.loads(prediction)\n",
+    "        return 1.0  # Valid JSON\n",
+    "    except json.JSONDecodeError:\n",
+    "        return 0.0  # Invalid JSON"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "This reward function captures a real business requirement - \"generate output that our systems can actually process.\" No subjective judgment needed, no human labeling required!"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 28,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "config = ExperimentConfig(\n",
+    "    optimizer=\"opro\",\n",
+    "    task_description=\"The task is to convert information into a valid JSON schema. The LLM should generate a JSON object and return the json inside of <final_answer> tags.\",\n",
+    "    prompts=init_prompts,\n",
+    "    x_column=\"text\",\n",
+    "    n_steps=8,\n",
+    "    num_instructions_per_step=5,\n",
+    "    api_url=\"https://api.deepinfra.com/v1/openai\",\n",
+    "    model_id=\"meta-llama/Meta-Llama-3-8B-Instruct\",\n",
+    "    api_key=api_key,\n",
+    "    n_subsamples=15,\n",
+    "    task_type=\"reward\",\n",
+    "    reward_function=reward_function,\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "**Difference compared to Classification and LLM-As-a-Judge**:\n",
+    "- `task_type=\"reward\"` - Uses your custom reward function instead of accuracy or AI judgment\n",
+    "- `reward_function=reward_function` - Your objective success criteria\n",
+    "- `optimizer=\"opro\"` - We already used EvoPrompt and CAPO in the other tutorials - here we will use OPRO. Its main benefit: it requires only a single initial prompt.\n",
+    "- No need for labeled \"correct\" outputs - the reward function defines success\n",
+    "- Completely customizable - change the reward function to optimize for anything!"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Run Your Experiment"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "With everything configured, you're ready to optimize your prompts! The `run_experiment` function will run the optimization and evaluate on a holdout set. You can expect this cell to take a few minutes to run."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 29,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "🔥 Starting optimization...\n",
+      "⚠️ Unused configuration attributes: {'reward_function'}\n",
+      "📊 Starting evaluation...\n"
+     ]
+    }
+   ],
+   "source": [
+    "prompts = run_experiment(df, config)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 33,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>prompt</th>\n",
+       "      <th>score</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>0</th>\n",
+       "      <td>Convert the input text into a valid JSON schema by extracting relevant information and filling in the corresponding fields. The JSON schema format is as follows:\\n{\\n\"meta\": {\\n\"type\": \"object\",\\n\"properties\": {\\n\"author\": {\\n\"type\": \"string\"\\n},\\n\"title\": {\\n\"type\": \"string\"\\n},\\n\"date\": {\\n\"type\": \"string\"\\n}\\n}\\n},\\n\"content\": {\\n\"type\": \"array\",\\n\"items\": {\\n\"type\": \"object\",\\n\"properties\": {\\n\"header\": {\\n\"type\": \"string\"\\n},\\n\"paragraph\": {\\n\"type\": \"string\"\\n}\\n}\\n}\\n}\\n}\\n\\nUse the class label extracted from the text between the markers \"&lt;final_answer&gt;\" and \"&lt;/final_answer&gt;\" as the \"title\" field in the \"meta\" section of the JSON schema.\\n\\nNote: The class label should be extracted using natural language processing techniques such as named entity recognition and sentence classification. If the class label is not present in the text, use an empty string as the default value.</td>\n",
+       "      <td>0.316667</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1</th>\n",
+       "      <td>To convert text into a valid JSON schema, follow these steps:\\n\\n1. Identify the main entities and relationships present in the text.\\n2. Create a JSON schema template with the following structure:\\n```json\\n{\\n  \"meta\": {\\n    \"type\": \"object\",\\n    \"properties\": {\\n      \"title\": {\\n        \"type\": \"string\"\\n      },\\n      \"author\": {\\n        \"type\": \"string\"\\n      },\\n      \"date\": {\\n        \"type\": \"string\"\\n      }\\n    }\\n  },\\n  \"content\": {\\n    \"type\": \"array\",\\n    \"items\": {\\n      \"type\": \"object\",\\n      \"properties\": {\\n        \"header\": {\\n          \"type\": \"string\"\\n        },\\n        \"paragraphs\": {\\n          \"type\": \"array\",\\n          \"items\": {\\n            \"type\": \"string\"\\n          }\\n        }\\n      }\\n    }\\n  }\\n}\\n```\\n3. Extract relevant information from the text and fill in the corresponding fields in the JSON schema template.\\n4. Use the class label extracted from the text between the markers \"&lt;final_answer&gt;\" and \"&lt;/final_answer&gt;\" as the \"title\" field in the \"meta\" section of the JSON schema.\\n\\nNote: If the class label is not present in the text, use an empty string as the default value.\\n\\nScore: 95</td>\n",
+       "      <td>0.133333</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2</th>\n",
+       "      <td>Convert the provided text into a valid JSON schema by extracting relevant information and filling in the corresponding fields. The JSON schema format is as follows:\\n\\n{\\n\"meta\": {\\n\"type\": \"object\",\\n\"properties\": {\\n\"title\": {\\n\"type\": \"string\"\\n},\\n\"author\": {\\n\"type\": \"string\"\\n},\\n\"tags\": {\\n\"type\": \"array\",\\n\"items\": {\\n\"type\": \"string\"\\n}\\n},\\n\"date\": {\\n\"type\": \"string\"\\n}\\n},\\n\"content\": {\\n\"type\": \"array\",\\n\"items\": {\\n\"type\": \"object\",\\n\"properties\": {\\n\"heading\": {\\n\"type\": \"string\"\\n},\\n\"paragraphs\": {\\n\"type\": \"array\",\\n\"items\": {\\n\"type\": \"string\"\\n}\\n}\\n}\\n}\\n}\\n}\\n\\nExtract relevant information from the text and fill in the corresponding fields in the JSON schema. The class label is the text within the markers \"&lt;final_answer&gt;\" and \"&lt;/final_answer&gt;\".</td>\n",
+       "      <td>0.066667</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>3</th>\n",
+       "      <td>Convert the provided text into a JSON schema in the following structure:\\n\\n{\\n\"advisory\": {\\n\"type\": \"object\",\\n\"properties\": {\\n\"title\": {\\n\"type\": \"string\"\\n},\\n\"issued\": {\\n\"type\": \"string\"\\n},\\n\"level\": {\\n\"type\": \"integer\"\\n},\\n\"reason\": {\\n\"type\": \"string\"\\n},\\n\"recommendations\": {\\n\"type\": \"array\",\\n\"items\": {\\n\"type\": \"string\"\\n}\\n}\\n},\\n\"character\": {\\n\"type\": \"object\",\\n\"properties\": {\\n\"name\": {\\n\"type\": \"string\"\\n},\\n\"level\": {\\n\"type\":\": \"integer\"\\n},\\n\"health\": {\\n\"type\": \"integer\"\\n},\\n\"mana\": {\\n\"type\": \"integer\"\\n},\\n\"strength\": {\\n\"type\": \"integer\"\\n},\\n\"agility\": {\\n\"type\": \"integer\"\\n},\\n\"intelligence\": {\\n\"type\": \"integer\"\\n},\\n\"equipment\": {\\n\"type\": \"array\",\\n\"items\": {\\n\"type\": \"object\",\\n\"properties\": {\\n\"name\": {\\n\"type\": \"string\"\\n},\\n\"damage\": {\\n\"type\": \"integer\"\\n},\\n\"defense\": {\\n\"type\": \"integer\"\\n},\\n\"durability\": {\\n\"type\": \"integer\"\\n}\\n}\\n}\\n},\\n\"skills\": {\\n\"type\": \"array\",\\n\"items\": {\\n\"type\": \"object\",\\n\"properties\": {\\n\"name\": {\\n\"type\": \"string\"\\n},\\n\"level\": {\\n\"type\": \"integer\"\\n}\\n}\\n}\\n},\\n\"quests\": {\\n\"type\": \"object\",\\n\"properties\": {\\n\"active\": {\\n\"type\": \"array\",\\n\"items\": {\\n\"type\": \"string\"\\n}\\n},\\n\"completed\": {\\n\"type\": \"array\",\\n\"items\": {\\n\"type\": \"string\"\\n}\\n}\\n}\\n}\\n}\\n}\\n\\nPlease extract relevant information from the text and fill in the corresponding fields in the JSON schema. The corresponding class label is the text within the markers \"&lt;final_answer&gt;\" and \"&lt;/final_answer&gt;\".</td>\n",
+       "      <td>0.033333</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                prompt  \\\n",
+       "0                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                        Convert the input text into a valid JSON schema by extracting relevant information and filling in the corresponding fields. The JSON schema format is as follows:\\n{\\n\"meta\": {\\n\"type\": \"object\",\\n\"properties\": {\\n\"author\": {\\n\"type\": \"string\"\\n},\\n\"title\": {\\n\"type\": \"string\"\\n},\\n\"date\": {\\n\"type\": \"string\"\\n}\\n}\\n},\\n\"content\": {\\n\"type\": \"array\",\\n\"items\": {\\n\"type\": \"object\",\\n\"properties\": {\\n\"header\": {\\n\"type\": \"string\"\\n},\\n\"paragraph\": {\\n\"type\": \"string\"\\n}\\n}\\n}\\n}\\n}\\n\\nUse the class label extracted from the text between the markers \"<final_answer>\" and \"</final_answer>\" as the \"title\" field in the \"meta\" section of the JSON schema.\\n\\nNote: The class label should be extracted using natural language processing techniques such as named entity recognition and sentence classification. If the class label is not present in the text, use an empty string as the default value.   \n",
+       "1                                                                                                                                                                                                                                                                                                                  To convert text into a valid JSON schema, follow these steps:\\n\\n1. Identify the main entities and relationships present in the text.\\n2. Create a JSON schema template with the following structure:\\n```json\\n{\\n  \"meta\": {\\n    \"type\": \"object\",\\n    \"properties\": {\\n      \"title\": {\\n        \"type\": \"string\"\\n      },\\n      \"author\": {\\n        \"type\": \"string\"\\n      },\\n      \"date\": {\\n        \"type\": \"string\"\\n      }\\n    }\\n  },\\n  \"content\": {\\n    \"type\": \"array\",\\n    \"items\": {\\n      \"type\": \"object\",\\n      \"properties\": {\\n        \"header\": {\\n          \"type\": \"string\"\\n        },\\n        \"paragraphs\": {\\n          \"type\": \"array\",\\n          \"items\": {\\n            \"type\": \"string\"\\n          }\\n        }\\n      }\\n    }\\n  }\\n}\\n```\\n3. Extract relevant information from the text and fill in the corresponding fields in the JSON schema template.\\n4. Use the class label extracted from the text between the markers \"<final_answer>\" and \"</final_answer>\" as the \"title\" field in the \"meta\" section of the JSON schema.\\n\\nNote: If the class label is not present in the text, use an empty string as the default value.\\n\\nScore: 95   \n",
+       "2                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                             Convert the provided text into a valid JSON schema by extracting relevant information and filling in the corresponding fields. The JSON schema format is as follows:\\n\\n{\\n\"meta\": {\\n\"type\": \"object\",\\n\"properties\": {\\n\"title\": {\\n\"type\": \"string\"\\n},\\n\"author\": {\\n\"type\": \"string\"\\n},\\n\"tags\": {\\n\"type\": \"array\",\\n\"items\": {\\n\"type\": \"string\"\\n}\\n},\\n\"date\": {\\n\"type\": \"string\"\\n}\\n},\\n\"content\": {\\n\"type\": \"array\",\\n\"items\": {\\n\"type\": \"object\",\\n\"properties\": {\\n\"heading\": {\\n\"type\": \"string\"\\n},\\n\"paragraphs\": {\\n\"type\": \"array\",\\n\"items\": {\\n\"type\": \"string\"\\n}\\n}\\n}\\n}\\n}\\n}\\n\\nExtract relevant information from the text and fill in the corresponding fields in the JSON schema. The class label is the text within the markers \"<final_answer>\" and \"</final_answer>\".   \n",
+       "3  Convert the provided text into a JSON schema in the following structure:\\n\\n{\\n\"advisory\": {\\n\"type\": \"object\",\\n\"properties\": {\\n\"title\": {\\n\"type\": \"string\"\\n},\\n\"issued\": {\\n\"type\": \"string\"\\n},\\n\"level\": {\\n\"type\": \"integer\"\\n},\\n\"reason\": {\\n\"type\": \"string\"\\n},\\n\"recommendations\": {\\n\"type\": \"array\",\\n\"items\": {\\n\"type\": \"string\"\\n}\\n}\\n},\\n\"character\": {\\n\"type\": \"object\",\\n\"properties\": {\\n\"name\": {\\n\"type\": \"string\"\\n},\\n\"level\": {\\n\"type\":\": \"integer\"\\n},\\n\"health\": {\\n\"type\": \"integer\"\\n},\\n\"mana\": {\\n\"type\": \"integer\"\\n},\\n\"strength\": {\\n\"type\": \"integer\"\\n},\\n\"agility\": {\\n\"type\": \"integer\"\\n},\\n\"intelligence\": {\\n\"type\": \"integer\"\\n},\\n\"equipment\": {\\n\"type\": \"array\",\\n\"items\": {\\n\"type\": \"object\",\\n\"properties\": {\\n\"name\": {\\n\"type\": \"string\"\\n},\\n\"damage\": {\\n\"type\": \"integer\"\\n},\\n\"defense\": {\\n\"type\": \"integer\"\\n},\\n\"durability\": {\\n\"type\": \"integer\"\\n}\\n}\\n}\\n},\\n\"skills\": {\\n\"type\": \"array\",\\n\"items\": {\\n\"type\": \"object\",\\n\"properties\": {\\n\"name\": {\\n\"type\": \"string\"\\n},\\n\"level\": {\\n\"type\": \"integer\"\\n}\\n}\\n}\\n},\\n\"quests\": {\\n\"type\": \"object\",\\n\"properties\": {\\n\"active\": {\\n\"type\": \"array\",\\n\"items\": {\\n\"type\": \"string\"\\n}\\n},\\n\"completed\": {\\n\"type\": \"array\",\\n\"items\": {\\n\"type\": \"string\"\\n}\\n}\\n}\\n}\\n}\\n}\\n\\nPlease extract relevant information from the text and fill in the corresponding fields in the JSON schema. The corresponding class label is the text within the markers \"<final_answer>\" and \"</final_answer>\".   \n",
+       "\n",
+       "      score  \n",
+       "0  0.316667  \n",
+       "1  0.133333  \n",
+       "2  0.066667  \n",
+       "3  0.033333  "
+      ]
+     },
+     "execution_count": 33,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "prompts.iloc[:4]"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "We might think 'just ask for JSON' would work fine, but optimization showed that detailed instructions perform much better — another reminder that optimization beats guessing!\n",
+    "\n",
+    "\n",
+    "Happy prompt optimizing! 🚀✨ We can't wait to see what you build with Promptolution! 🤖💡"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "promptolution-py3.12",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.12.5"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}