generative-computing · guicho271828 · Sep 4, 2025 · Aug 28, 2025 · Aug 28, 2025 · Sep 2, 2025
diff --git a/mellea/backends/_utils.py b/mellea/backends/_utils.py
@@ -0,0 +1,107 @@
+from __future__ import annotations
+
+import inspect
+from collections.abc import Callable
+from typing import Any, Literal
+
+from mellea.backends.aloras import Alora
+from mellea.backends.formatter import Formatter
+from mellea.backends.tools import parse_tools
+from mellea.helpers.fancy_logger import FancyLogger
+from mellea.stdlib.base import CBlock, Component, Context, ModelToolCall
+from mellea.stdlib.chat import Message
+from mellea.stdlib.requirement import ALoraRequirement, LLMaJRequirement, Requirement
+
+# Stricter alternative: dict[Literal["role", "content"], str] -- unused because the external apply_chat_template type hint is weaker.
+# Multi-modal alternative: dict[str, str | list[dict[str, Any]]].
+Chat = dict[str, str]
+
+
+def to_chat(
+    action: Component | CBlock,
+    ctx: Context,
+    formatter: Formatter,
+    system_prompt: str | None,
+) -> list[Chat]:
+    """Flatten a chat context plus the pending action into role/content dicts.
+
+    The result is what local inference backends pass to `apply_chat_template`. When
+    `system_prompt` is not None, it is prepended as a `system` message.
+    """
+    assert ctx.is_chat_context
+
+    history = ctx.view_for_generation()
+    assert history is not None, (
+        "If ctx.is_chat_context, then the context should be linearizable."
+    )
+    # Context messages come first; the action's message(s) are appended after them.
+    messages: list[Message] = formatter.to_chat_messages(history)
+    messages.extend(formatter.to_chat_messages([action]))
+
+    ctx_as_conversation: list = [
+        {"role": msg.role, "content": msg.content} for msg in messages
+    ]
+
+    # Log (once per offending value) when a literal "CBlock" leaked into the chat.
+    for value in (v for entry in ctx_as_conversation for v in entry.values()):
+        if "CBlock" in value:
+            FancyLogger.get_logger().error(
+                f"Found the string `CBlock` in what should've been a stringified context: {ctx_as_conversation}"
+            )
+
+    # Put the custom system prompt first. NOTE(review): the original comment asks that
+    # this happen before the backend's model-option cleaning step -- confirm in callers.
+    if system_prompt is not None:
+        ctx_as_conversation.insert(0, {"role": "system", "content": system_prompt})
+
+    return ctx_as_conversation
+
+
+def use_alora(
+    action: Component | CBlock,
+    alora: Alora | None,
+    default_to_constraint_checking_alora: bool,
+) -> bool:
+    """Return True when `action` should be rerouted to the constraint aLoRA.
+
+    Only `Requirement` actions are ever rerouted. An `ALoraRequirement` is always
+    rerouted, even when `alora` is None or the default flag is off; any other
+    `LLMaJRequirement` is never rerouted. Every remaining requirement is rerouted
+    only when `alora` is not None and `default_to_constraint_checking_alora` is true.
+
+    See `docs/dev/requirement_aLoRA_rerouting.md` for the rationale behind these rules.
+    """
+    if not isinstance(action, Requirement):
+        return False
+    # Checked first because an explicit ALoraRequirement overrides every other rule.
+    if isinstance(action, ALoraRequirement):
+        return True
+    if isinstance(action, LLMaJRequirement):
+        return False
+    return bool(default_to_constraint_checking_alora) and alora is not None
+
+
+def to_tool_calls(
+    tools: dict[str, Callable], decoded_result: str
+) -> dict[str, ModelToolCall] | None:
+    """Build `ModelToolCall`s for the tool requests parsed out of `decoded_result`.
+
+    Each `(name, args)` pair yielded by `parse_tools` is looked up in `tools`; unknown
+    names are logged as a warning and skipped. Calls are keyed by tool name, so a
+    repeated name overwrites the earlier entry. Returns None if no usable call is found.
+    """
+    resolved: dict[str, ModelToolCall] = {}
+    for tool_name, call_args in parse_tools(decoded_result):
+        target = tools.get(tool_name)
+        if target is None:
+            FancyLogger.get_logger().warning(
+                f"model attempted to call a non-existing function: {tool_name}"
+            )
+            continue
+
+        # Some models hallucinate arguments for zero-parameter tools; discard them.
+        if not inspect.signature(target).parameters:
+            call_args = {}
+        resolved[tool_name] = ModelToolCall(tool_name, target, call_args)
+
+    return resolved or None
diff --git a/mellea/backends/huggingface.py b/mellea/backends/huggingface.py
@@ -30,6 +30,7 @@
 from transformers.generation.utils import GenerateDecoderOnlyOutput
 
 from mellea.backends import BaseModelSubclass
+from mellea.backends._utils import to_chat, to_tool_calls, use_alora
 from mellea.backends.aloras import Alora, AloraBackendMixin
 from mellea.backends.cache import Cache, SimpleLRUCache
 from mellea.backends.formatter import Formatter, FormatterBackend, TemplateFormatter
@@ -39,7 +40,6 @@
     add_tools_from_context_actions,
     add_tools_from_model_options,
     convert_tools_to_json,
-    parse_tools,
 )
 from mellea.backends.types import ModelOption
 from mellea.helpers.async_helpers import send_to_queue
@@ -198,26 +198,24 @@ def generate_from_context(
         # Upsert model options.
         model_opts = self._simplify_and_merge(model_options)
 
-        # See `docs/dev/requirement_aLoRA_rerouting.md` for an explanation of the following code block.
-        if issubclass(type(action), Requirement):
-            # The general rule is that we reroute to the alora if it exists.
-            reroute_to_alora = self.get_alora("constraint") is not None
-            # However, there are some exceptions:
-            if not self.default_to_constraint_checking_alora:
-                reroute_to_alora = False
-            if issubclass(type(action), LLMaJRequirement):
-                reroute_to_alora = False
-            if issubclass(type(action), ALoraRequirement):
-                reroute_to_alora = True
-            if reroute_to_alora:
-                mot = self._generate_from_context_alora(
-                    action, ctx, _format=format, model_options=model_opts
-                )
-                return mot, ctx.add(mot)
-        mot = self._generate_from_context_standard(
-            action, ctx, _format=format, model_options=model_opts, tool_calls=tool_calls
-        )
-        return mot, ctx.add(action).add(mot)
+        if use_alora(
+            action,
+            self.get_alora("constraint"),
+            self.default_to_constraint_checking_alora,
+        ):
+            mot = self._generate_from_context_alora(
+                action, ctx, _format=format, model_options=model_opts
+            )
+            return mot, ctx.add(mot)
+        else:
+            mot = self._generate_from_context_standard(
+                action,
+                ctx,
+                _format=format,
+                model_options=model_opts,
+                tool_calls=tool_calls,
+            )
+            return mot, ctx.add(action).add(mot)
 
     def _generate_from_context_alora(
         self,
@@ -279,35 +277,8 @@ def _generate_from_context_standard(
         # If the Context is a ChatHistory then we will pretty-print each content as a message and then use apply_chat_template.
         # Otherwise, we will linearize the context and treat it as a raw input.
         if ctx.is_chat_context:
-            linearized_ctx = ctx.view_for_generation()
-            assert linearized_ctx is not None, (
-                "If ctx.is_chat_context, then the context should be linearizable."
-            )
-            ctx_as_message_list: list[Message] = self.formatter.to_chat_messages(
-                linearized_ctx
-            )
-            # add action
-            ctx_as_message_list.extend(self.formatter.to_chat_messages([action]))
-            ctx_as_conversation = [
-                {"role": m.role, "content": m.content} for m in ctx_as_message_list
-            ]
-
-            # Check that we ddin't accidentally end up with CBlocks.
-            for msg in ctx_as_conversation:
-                for v in msg.values():
-                    if "CBlock" in v:
-                        FancyLogger.get_logger().error(
-                            f"Found the string `CBlock` in what should've been a stringified context: {ctx_as_conversation}"
-                        )
-
-            # handle custom system prompts. It's important that we do this before the _parse_and_**clean**_model_options step.
             system_prompt = model_options.get(ModelOption.SYSTEM_PROMPT, None)
-            if system_prompt is not None:
-                system_msg: dict[str, str] = {
-                    "role": "system",
-                    "content": system_prompt,
-                }
-                ctx_as_conversation.insert(0, system_msg)
+            ctx_as_chat = to_chat(action, ctx, self.formatter, system_prompt)
 
             # Append tool call information if applicable.
             tools: dict[str, Callable] = dict()
@@ -332,7 +303,7 @@ def _generate_from_context_standard(
                 set_seed(seed)
 
             input_ids = self._tokenizer.apply_chat_template(  # type: ignore
-                ctx_as_conversation,
+                ctx_as_chat,
                 tools=convert_tools_to_json(tools),  # type: ignore
                 add_generation_prompt=True,
                 return_tensors="pt",
@@ -397,7 +368,7 @@ def _generate_from_context_standard(
             )
 
             output = ModelOutputThunk(None)
-            output._context = linearized_ctx
+            output._context = ctx.view_for_generation()
             output._action = action
             output._model_options = model_options
 
@@ -406,7 +377,7 @@ def _generate_from_context_standard(
             output._process = functools.partial(self.processing, input_ids=input_ids)
             output._post_process = functools.partial(
                 self.post_processing,
-                conversation=ctx_as_conversation,
+                conversation=ctx_as_chat,
                 input_ids=input_ids,
                 _format=_format,
                 tool_calls=tool_calls,
@@ -497,7 +468,7 @@ async def post_processing(
 
         # Only scan for tools if we are not doing structured output and tool calls were provided to the model.
         if _format is None and tool_calls:
-            mot.tool_calls = self._extract_model_tool_requests(tools, mot.value)
+            mot.tool_calls = to_tool_calls(tools, mot.value)
 
         assert mot._action is not None, (
             "ModelOutputThunks should have their action assigned during generation"
@@ -698,30 +669,6 @@ def _filter_chat_template_only_options(
         }
         return {k: v for k, v in model_options.items() if k not in chat_template_only}
 
-    def _extract_model_tool_requests(
-        self, tools: dict[str, Callable], decoded_result: str
-    ) -> dict[str, ModelToolCall] | None:
-        model_tool_calls: dict[str, ModelToolCall] = dict()
-        for tool_name, tool_args in parse_tools(decoded_result):
-            func = tools.get(tool_name)
-            if func is None:
-                FancyLogger.get_logger().warning(
-                    f"model attempted to call a non-existing function: {tool_name}"
-                )
-                continue
-
-            # Clean up the function args slightly. Some models seem to
-            # hallucinate parameters when none are required.
-            sig = inspect.signature(func)
-            if len(sig.parameters) == 0:
-                tool_args = {}
-
-            model_tool_calls[tool_name] = ModelToolCall(tool_name, func, tool_args)
-
-        if len(model_tool_calls) > 0:
-            return model_tool_calls
-        return None
-
     # region ALora loading, unloading, and utility functions.
     def add_alora(self, alora: HFAlora):
         """Loads an ALora for this backend.

diff --git a/mellea/backends/model_ids.py b/mellea/backends/model_ids.py
@@ -126,6 +126,10 @@ class ModelIdentifier:
 #### Qwen models ####
 #####################
 
+QWEN3_0_6B = ModelIdentifier(hf_model_name="Qwen/Qwen3-0.6B", ollama_name="qwen3:0.6b")
+
+QWEN3_1_7B = ModelIdentifier(hf_model_name="Qwen/Qwen3-1.7B", ollama_name="qwen3:1.7b")
+
 QWEN3_8B = ModelIdentifier(hf_model_name="Qwen/Qwen3-8B", ollama_name="qwen3:8b")
 
 QWEN3_14B = ModelIdentifier(hf_model_name="Qwen/Qwen3-14B", ollama_name="qwen3:14b")