from __future__ import annotations

import os
from pathlib import Path
from typing import Any, Iterator, Protocol

# llama-cpp-python is an optional dependency: keep this module importable
# without it and report the problem only when an adapter is requested.
try:
    from llama_cpp import Llama

    LLAMA_AVAILABLE = True
except ImportError:
    Llama = None
    LLAMA_AVAILABLE = False

# Environment variable that overrides the computed inference thread count.
_N_THREADS_ENV = "DUCKLM_N_THREADS"


class BaseModelAdapter(Protocol):
    """Structural interface for objects that turn a prompt into text."""

    async def generate(self, prompt: str, **kwargs: Any) -> str:
        """Return the generated text for ``prompt`` (awaitable)."""
        ...

    def stream(self, prompt: str, **kwargs: Any) -> Iterator[str]:
        """Return an iterator over string pieces generated for ``prompt``."""
        ...


def _resolve_thread_count() -> int:
    """Return the thread count, honouring the ``DUCKLM_N_THREADS`` override."""
    override = os.environ.get(_N_THREADS_ENV)
    if override is None:
        # Half of the reported CPUs, clamped to the range [4, 20]; falls back
        # to 4 CPUs when os.cpu_count() cannot determine the number.
        return max(4, min((os.cpu_count() or 4) // 2, 20))
    try:
        return int(override)
    except ValueError:
        raise ValueError(
            f"{_N_THREADS_ENV} must be an integer, got {override!r}"
        ) from None


def create_llama_adapter(
    model_path: str,
    backend: str = "cpu",
    n_gpu_layers: int = 0,
    max_tokens: int = 2048,
    temperature: float = 0.2,
    base_dir: Path | None = None,
    n_ctx: int = 4096,
) -> Llama:
    """Build a ``llama_cpp.Llama`` instance for a model file.

    Args:
        model_path: Model file location. It is joined onto ``base_dir`` (or
            the current working directory), so an absolute path is used as is.
        backend: Accepted for interface compatibility; not used here.
        n_gpu_layers: Forwarded to ``Llama`` unchanged.
        max_tokens: Forwarded to the ``Llama`` constructor (see note below).
        temperature: Forwarded to the ``Llama`` constructor (see note below).
        base_dir: Directory that relative model paths are resolved against.
            Defaults to the current working directory.
        n_ctx: Context size forwarded to ``Llama``. Defaults to 4096, the
            value that used to be hard-coded.

    Returns:
        The constructed ``Llama`` object.

    Raises:
        RuntimeError: If llama-cpp-python is not installed.
        ValueError: If ``model_path`` is empty, or ``DUCKLM_N_THREADS`` is set
            to something that is not an integer.
    """
    if not LLAMA_AVAILABLE:
        raise RuntimeError("llama-cpp-python not installed")
    if not model_path:
        # An empty path would otherwise resolve to the base directory itself
        # and be handed to the model loader.
        raise ValueError("model_path must be a non-empty path")

    root = Path(base_dir) if base_dir else Path.cwd()
    resolved_path = str(root / model_path)

    # NOTE(review): max_tokens and temperature are generation-time settings;
    # the Llama constructor appears to absorb them via **kwargs without
    # applying them - confirm against the installed llama-cpp-python version
    # and pass them again when generating.
    # NOTE(review): n_threads_batch=-1 is forwarded unchanged; confirm the
    # installed version interprets it as intended.
    return Llama(
        model_path=resolved_path,
        n_gpu_layers=n_gpu_layers,
        n_ctx=n_ctx,
        n_threads=_resolve_thread_count(),
        n_threads_batch=-1,
        max_tokens=max_tokens,
        temperature=temperature,
        verbose=False,
    )


def create_adapter(
    model_type: str,
    config: dict[str, Any],
    base_dir: Path | None = None,
) -> Llama:
    """Build a ``Llama`` instance from a model configuration mapping.

    Args:
        model_type: Accepted for interface compatibility; not used here.
        config: Settings read with these keys and defaults: ``"path"``
            (required), ``"backend"`` (``"cpu"``), ``"n_gpu_layers"`` (0),
            ``"max_tokens"`` (2048), ``"temperature"`` (0.2) and ``"n_ctx"``
            (4096).
        base_dir: Directory that a relative ``"path"`` is resolved against.

    Returns:
        The ``Llama`` object produced by :func:`create_llama_adapter`.

    Raises:
        RuntimeError: If llama-cpp-python is not installed.
        ValueError: If ``config`` has no non-empty ``"path"`` entry.
    """
    if not LLAMA_AVAILABLE:
        raise RuntimeError("llama-cpp-python not installed")

    model_path = config.get("path", "")
    if not model_path:
        raise ValueError("model config is missing a non-empty 'path' entry")

    backend = config.get("backend", "cpu")
    n_gpu_layers = config.get("n_gpu_layers", 0)
    if backend == "vulkan" and n_gpu_layers != 0:
        # On the vulkan backend any non-zero request is replaced by -1.
        n_gpu_layers = -1

    return create_llama_adapter(
        model_path=model_path,
        backend=backend,
        n_gpu_layers=n_gpu_layers,
        max_tokens=config.get("max_tokens", 2048),
        temperature=config.get("temperature", 0.2),
        base_dir=base_dir,
        n_ctx=config.get("n_ctx", 4096),
    )