# ducklm/app/models/adapters.py

from __future__ import annotations

from pathlib import Path
from typing import Any, Protocol, Iterator
import os

# llama-cpp-python is treated as an optional dependency: remember whether the
# import worked so the factory functions in this module can raise a clear
# RuntimeError when called, rather than the module failing at import time.
try:
    from llama_cpp import Llama
    LLAMA_AVAILABLE = True
except ImportError:
    # Keep the name bound so references to ``Llama`` never raise NameError.
    Llama = None
    LLAMA_AVAILABLE = False


class BaseModelAdapter(Protocol):
    """Structural interface for text-generation adapters.

    Any object providing both methods below with compatible signatures
    satisfies this protocol; no inheritance is required.
    """

    async def generate(self, prompt: str, **kwargs: Any) -> str:
        """Asynchronously produce the full completion text for ``prompt``."""

    def stream(self, prompt: str, **kwargs: Any) -> Iterator[str]:
        """Return an iterator over pieces of the completion for ``prompt``."""


def _resolve_n_threads() -> int:
    """Return the thread count to hand to llama.cpp.

    ``DUCKLM_N_THREADS`` takes precedence when it holds an integer. When it
    is unset or blank, the default is half the reported CPU count, clamped
    to the range 4..20.

    Raises:
        ValueError: If ``DUCKLM_N_THREADS`` is set to a non-integer value.
    """
    raw = os.environ.get("DUCKLM_N_THREADS", "").strip()
    if not raw:
        # A blank value (e.g. ``DUCKLM_N_THREADS=`` in an env file) is treated
        # as "not set". os.cpu_count() may return None, hence the ``or 4``.
        return max(4, min((os.cpu_count() or 4) // 2, 20))
    try:
        return int(raw)
    except ValueError:
        raise ValueError(
            f"DUCKLM_N_THREADS must be an integer, got {raw!r}"
        ) from None


def create_llama_adapter(
    model_path: str,
    backend: str = "cpu",
    n_gpu_layers: int = 0,
    max_tokens: int = 2048,
    temperature: float = 0.2,
    base_dir: Path | None = None,
) -> "Llama":
    """Load a model with llama-cpp-python and return the ``Llama`` instance.

    Args:
        model_path: Path to the model file. It is joined onto ``base_dir``
            when one is given, otherwise onto the current working directory;
            an absolute ``model_path`` therefore wins over either root.
        backend: Accepted for interface compatibility; not used here.
        n_gpu_layers: Forwarded to ``Llama`` unchanged.
        max_tokens: Forwarded to the ``Llama`` constructor.
        temperature: Forwarded to the ``Llama`` constructor.
        base_dir: Optional directory that relative model paths are resolved
            against.

    Returns:
        The constructed ``Llama`` object.

    Raises:
        RuntimeError: If llama-cpp-python could not be imported.
        ValueError: If the resolved model path is not an existing file, or if
            ``DUCKLM_N_THREADS`` is set to a non-integer value.
    """
    if not LLAMA_AVAILABLE:
        raise RuntimeError("llama-cpp-python not installed")

    # Path(...) also tolerates a base_dir supplied as a plain string.
    root = Path(base_dir) if base_dir else Path.cwd()
    resolved = root / model_path
    if not resolved.is_file():
        # Catches an empty model_path (which resolves to the root directory)
        # and typos before the native loader produces a less specific error.
        raise ValueError(f"Model file not found: {resolved}")

    # NOTE(review): max_tokens and temperature look like per-call generation
    # options in llama-cpp-python rather than constructor options -- confirm
    # that passing them here has any effect.
    # NOTE(review): n_threads_batch=-1 is passed through as-is; confirm the
    # installed llama-cpp-python version gives -1 the intended meaning.
    return Llama(
        model_path=str(resolved),
        n_gpu_layers=n_gpu_layers,
        n_ctx=4096,
        n_threads=_resolve_n_threads(),
        n_threads_batch=-1,
        max_tokens=max_tokens,
        temperature=temperature,
        verbose=False,
    )


def create_adapter(
    model_type: str,
    config: dict[str, Any],
    base_dir: Path | None = None,
) -> "Llama":
    """Build a ``Llama`` instance from a model configuration mapping.

    Args:
        model_type: Accepted for interface compatibility; not used here.
        config: Settings read with these keys and fallbacks: ``path`` (""),
            ``backend`` ("cpu"), ``n_gpu_layers`` (0), ``max_tokens`` (2048)
            and ``temperature`` (0.2).
        base_dir: Passed through to ``create_llama_adapter``.

    Returns:
        Whatever ``create_llama_adapter`` returns for the resolved settings.

    Raises:
        RuntimeError: If llama-cpp-python could not be imported.
    """
    if not LLAMA_AVAILABLE:
        raise RuntimeError("llama-cpp-python not installed")

    # Gather every setting with its fallback in one place.
    settings: dict[str, Any] = {
        "model_path": config.get("path", ""),
        "backend": config.get("backend", "cpu"),
        "n_gpu_layers": config.get("n_gpu_layers", 0),
        "max_tokens": config.get("max_tokens", 2048),
        "temperature": config.get("temperature", 0.2),
    }

    # On the Vulkan backend any non-zero layer request is replaced by -1
    # (llama-cpp-python documents -1 as "offload all layers").
    if settings["backend"] == "vulkan" and settings["n_gpu_layers"] != 0:
        settings["n_gpu_layers"] = -1

    return create_llama_adapter(base_dir=base_dir, **settings)