ducklm/app/models/adapters.py

73 lines
1.9 KiB
Python

from __future__ import annotations
from pathlib import Path
from typing import Any, Protocol, Iterator
import os
# llama-cpp-python is an optional dependency: record whether it could be
# imported so the factory functions can fail with a clear error instead of a
# NameError at call time.
try:
    from llama_cpp import Llama
except ImportError:
    Llama = None
    LLAMA_AVAILABLE = False
else:
    LLAMA_AVAILABLE = True
class BaseModelAdapter(Protocol):
    """Structural interface for text-generation model adapters."""

    async def generate(self, prompt: str, **kwargs: Any) -> str:
        """Asynchronously produce the full completion text for *prompt*."""

    def stream(self, prompt: str, **kwargs: Any) -> Iterator[str]:
        """Return an iterator of text pieces generated for *prompt*."""
def _resolve_thread_count() -> int:
    """Return the thread count to hand to the model.

    Uses the ``DUCKLM_N_THREADS`` environment variable when it is set to a
    non-blank value; otherwise falls back to half the CPU count, clamped to
    the range 4..20.

    Raises:
        ValueError: If ``DUCKLM_N_THREADS`` is set but is not an integer.
    """
    fallback = max(4, min((os.cpu_count() or 4) // 2, 20))
    raw = os.environ.get("DUCKLM_N_THREADS", "").strip()
    if not raw:
        # Unset or blank: treat as "no override" rather than crashing on int("").
        return fallback
    try:
        return int(raw)
    except ValueError as err:
        raise ValueError(
            f"DUCKLM_N_THREADS must be an integer, got {raw!r}"
        ) from err


def create_llama_adapter(
    model_path: str,
    backend: str = "cpu",
    n_gpu_layers: int = 0,
    max_tokens: int = 2048,
    temperature: float = 0.2,
    base_dir: Path | None = None,
    n_ctx: int = 4096,
) -> "Llama":
    """Build a ``Llama`` instance for the model file at *model_path*.

    Args:
        model_path: Model file path, joined onto *base_dir* (or the current
            working directory when *base_dir* is ``None``).
        backend: Accepted for interface compatibility; this function does
            not read it.
        n_gpu_layers: Forwarded to ``Llama`` unchanged.
        max_tokens: Forwarded to the ``Llama`` constructor.
        temperature: Forwarded to the ``Llama`` constructor.
        base_dir: Directory that *model_path* is resolved against.
        n_ctx: Context size forwarded to ``Llama``; defaults to 4096.

    Returns:
        The constructed ``Llama`` object.

    Raises:
        RuntimeError: If llama-cpp-python could not be imported.
        ValueError: If ``DUCKLM_N_THREADS`` is set to a non-integer value.
    """
    if not LLAMA_AVAILABLE:
        raise RuntimeError("llama-cpp-python not installed")

    # Explicit None check (not truthiness); Path(...) also tolerates a str.
    root = Path(base_dir) if base_dir is not None else Path.cwd()
    resolved_path = str(root / model_path)

    return Llama(
        model_path=resolved_path,
        n_gpu_layers=n_gpu_layers,
        n_ctx=n_ctx,
        n_threads=_resolve_thread_count(),
        # NOTE(review): -1 is passed through as-is; confirm what the installed
        # llama-cpp-python version does with a negative batch thread count.
        n_threads_batch=-1,
        # NOTE(review): max_tokens/temperature look like generation-time
        # settings; confirm the constructor actually applies them rather than
        # ignoring them as extra keyword arguments.
        max_tokens=max_tokens,
        temperature=temperature,
        verbose=False,
    )
def create_adapter(
    model_type: str,
    config: dict[str, Any],
    base_dir: Path | None = None,
) -> "Llama":
    """Build a ``Llama`` instance from a model configuration mapping.

    Args:
        model_type: Accepted for interface compatibility; this function does
            not read it.
        config: Model settings. Keys read here: ``"path"`` (required),
            ``"backend"`` (default ``"cpu"``), ``"n_gpu_layers"`` (default 0),
            ``"max_tokens"`` (default 2048), ``"temperature"`` (default 0.2).
        base_dir: Directory the model path is resolved against; forwarded to
            ``create_llama_adapter``.

    Returns:
        The object returned by ``create_llama_adapter``.

    Raises:
        RuntimeError: If llama-cpp-python could not be imported.
        ValueError: If ``config`` has no non-empty ``"path"``.
    """
    if not LLAMA_AVAILABLE:
        raise RuntimeError("llama-cpp-python not installed")

    model_path = config.get("path", "")
    if not model_path:
        # An empty path would otherwise resolve to the base directory itself
        # and only fail later, inside the model loader, with an unclear error.
        raise ValueError("model config is missing a non-empty 'path'")

    backend = config.get("backend", "cpu")
    n_gpu_layers = config.get("n_gpu_layers", 0)
    max_tokens = config.get("max_tokens", 2048)
    temperature = config.get("temperature", 0.2)

    # With the vulkan backend any non-zero layer count is replaced by -1,
    # which llama-cpp-python documents as "offload all layers".
    if backend == "vulkan" and n_gpu_layers != 0:
        n_gpu_layers = -1

    return create_llama_adapter(
        model_path=model_path,
        backend=backend,
        n_gpu_layers=n_gpu_layers,
        max_tokens=max_tokens,
        temperature=temperature,
        base_dir=base_dir,
    )