from __future__ import annotations

from threading import RLock
from typing import Any, Iterator

from llama_cpp import Llama


class CriticAdapter:
    """Send prompts to a ``Llama`` model under a fixed critic system prompt.

    Each request consists of two chat messages -- the system prompt followed
    by the caller's prompt as the user turn -- and is issued through
    ``create_chat_completion`` at a fixed temperature while the adapter's
    lock is held.
    """

    # Token budget used when the caller passes no (or a falsy) ``max_tokens``.
    _DEFAULT_MAX_TOKENS = 512

    def __init__(
        self,
        llm: Llama,
        system_prompt: str | None = None,
        lock: RLock | None = None,
    ) -> None:
        """Store the model, the lock and the system prompt.

        Args:
            llm: Model object whose ``create_chat_completion`` is called.
            system_prompt: Text of the system message. ``None`` or an empty
                string selects the built-in critic prompt.
            lock: Lock acquired around every model call. A new ``RLock`` is
                created when none is supplied.
        """
        self._llm = llm
        self._lock = lock or RLock()
        self._system_prompt = system_prompt or (
            "You are a critic model. Evaluate tool results and respond with JSON."
        )
        self._temperature = 0.1

    def _request_kwargs(self, prompt: str, max_tokens: int | None) -> dict[str, Any]:
        """Build the keyword arguments shared by ``generate`` and ``stream``."""
        return {
            "messages": [
                {"role": "system", "content": self._system_prompt},
                {"role": "user", "content": prompt},
            ],
            # ``or`` (not ``is None``) is kept on purpose: 0 also falls back
            # to the default budget, exactly as before.
            "max_tokens": max_tokens or self._DEFAULT_MAX_TOKENS,
            "temperature": self._temperature,
        }

    def generate(self, prompt: str, max_tokens: int | None = None) -> str:
        """Run one blocking chat completion and return the reply text.

        Args:
            prompt: Content of the user message.
            max_tokens: Completion token limit; ``None`` or ``0`` means 512.

        Returns:
            The ``content`` of the first choice's message, or ``""`` when the
            model reports no content (``None``).
        """
        request = self._request_kwargs(prompt, max_tokens)
        with self._lock:
            output = self._llm.create_chat_completion(**request)
        content = output["choices"][0]["message"]["content"]
        # The message content may be None; honour the ``-> str`` contract.
        return content or ""

    def stream(self, prompt: str, max_tokens: int | None = None) -> Iterator[str]:
        """Yield the reply text piece by piece as the model produces it.

        The lock is acquired when iteration starts and stays held while the
        generator is suspended between chunks; it is released only once the
        stream is exhausted or the generator is closed.

        Args:
            prompt: Content of the user message.
            max_tokens: Completion token limit; ``None`` or ``0`` means 512.

        Yields:
            Each non-empty ``delta.content`` string of the first choice.
        """
        request = self._request_kwargs(prompt, max_tokens)
        with self._lock:
            for chunk in self._llm.create_chat_completion(stream=True, **request):
                # Chunks without a delta or without content (e.g. role-only
                # or final chunks) are skipped.
                content = chunk["choices"][0].get("delta", {}).get("content")
                if content:
                    yield content