45 lines
1.5 KiB
Python
45 lines
1.5 KiB
Python
from __future__ import annotations
|
|
|
|
from threading import RLock
|
|
from typing import Any, Iterator
|
|
from llama_cpp import Llama
|
|
|
|
|
|
class CoderAdapter:
    """Adapter that runs a single prompt as a chat completion on a ``Llama`` model.

    Every request is sent as a two-message conversation (the system prompt
    followed by the user prompt). Each call into the model is made while
    holding a re-entrant lock, so callers sharing the same lock take turns.
    """

    # Used when no (or an empty) system prompt is supplied.
    _DEFAULT_SYSTEM_PROMPT = "You are an expert code generation model."
    # Used when ``max_tokens`` is ``None`` or ``0``.
    _DEFAULT_MAX_TOKENS = 1024

    def __init__(
        self,
        llm: Llama,
        system_prompt: str | None = None,
        lock: RLock | None = None,
        *,
        temperature: float = 0.2,
    ) -> None:
        """Store the model handle and the settings shared by every request.

        Args:
            llm: Object exposing ``create_chat_completion``.
            system_prompt: Text of the system message. ``None`` or an empty
                string selects the built-in default prompt.
            lock: Lock guarding calls into ``llm``. Pass a shared lock when
                several adapters use the same model; a private ``RLock`` is
                created when omitted.
            temperature: Sampling temperature forwarded on every request.
        """
        self._llm = llm
        # Explicit None check: a supplied lock is always honoured, whatever
        # its truth value.
        self._lock = lock if lock is not None else RLock()
        self._system_prompt = system_prompt or self._DEFAULT_SYSTEM_PROMPT
        self._temperature = temperature

    def _request_kwargs(self, prompt: str, max_tokens: int | None) -> dict[str, Any]:
        """Build the keyword arguments common to blocking and streamed calls."""
        return {
            "messages": [
                {"role": "system", "content": self._system_prompt},
                {"role": "user", "content": prompt},
            ],
            # A falsy limit (None or 0) falls back to the default budget.
            "max_tokens": max_tokens or self._DEFAULT_MAX_TOKENS,
            "temperature": self._temperature,
        }

    def generate(self, prompt: str, max_tokens: int | None = None) -> str:
        """Return the full completion text for ``prompt``.

        Args:
            prompt: Content of the user message.
            max_tokens: Generation limit; ``None`` or ``0`` means 1024.

        Returns:
            The content of the first choice's message, or ``""`` when the
            model reports no content (``None``).
        """
        request = self._request_kwargs(prompt, max_tokens)
        with self._lock:
            output = self._llm.create_chat_completion(**request)
        # Content may be None; keep the declared ``str`` return type, in line
        # with ``stream`` which also drops empty content.
        return output["choices"][0]["message"]["content"] or ""

    def stream(self, prompt: str, max_tokens: int | None = None) -> Iterator[str]:
        """Yield the completion for ``prompt`` piece by piece.

        This is a generator: nothing runs until it is first iterated. The lock
        is acquired at that point and stays held until the generator is
        exhausted or closed, so consumers should drain or close it promptly.

        Args:
            prompt: Content of the user message.
            max_tokens: Generation limit; ``None`` or ``0`` means 1024.

        Yields:
            Each non-empty ``content`` fragment found in the first choice's
            ``delta``; chunks without a delta or without content are skipped.
        """
        request = self._request_kwargs(prompt, max_tokens)
        with self._lock:
            for chunk in self._llm.create_chat_completion(**request, stream=True):
                content = chunk["choices"][0].get("delta", {}).get("content")
                if content:
                    yield content
|