73 lines
1.9 KiB
Python
73 lines
1.9 KiB
Python
from __future__ import annotations
|
|
|
|
from pathlib import Path
|
|
from typing import Any, Protocol, Iterator
|
|
import os
|
|
|
|
# llama-cpp-python is an optional dependency: record whether it could be
# imported so the factory functions can fail with a clear error instead of
# the whole module failing at import time.
try:
    from llama_cpp import Llama
except ImportError:
    # Keep the name bound so annotations and availability checks still work.
    Llama = None
    LLAMA_AVAILABLE = False
else:
    LLAMA_AVAILABLE = True
|
|
|
|
|
|
class BaseModelAdapter(Protocol):
    """Structural interface for model adapters.

    Declares two methods: an awaitable ``generate`` that returns a string and
    a synchronous ``stream`` that returns an iterator of strings. Both take a
    prompt plus arbitrary keyword arguments.
    """

    async def generate(self, prompt: str, **kwargs: Any) -> str:
        """Return a string result for *prompt* (awaitable)."""

    def stream(self, prompt: str, **kwargs: Any) -> Iterator[str]:
        """Return an iterator of string pieces for *prompt*."""
|
|
|
|
|
|
def create_llama_adapter(
    model_path: str,
    backend: str = "cpu",
    n_gpu_layers: int = 0,
    max_tokens: int = 2048,
    temperature: float = 0.2,
    base_dir: Path | None = None,
) -> "Llama":
    """Build a ``Llama`` instance for the model file at *model_path*.

    Args:
        model_path: Path to the model file. It is joined onto *base_dir* when
            one is given, otherwise onto the current working directory (an
            absolute path is therefore used unchanged).
        backend: Accepted for interface compatibility; not used by this
            function.
        n_gpu_layers: Forwarded to ``Llama`` as ``n_gpu_layers``.
        max_tokens: Forwarded to ``Llama`` as ``max_tokens``.
        temperature: Forwarded to ``Llama`` as ``temperature``.
        base_dir: Optional directory that *model_path* is resolved against.

    Returns:
        The constructed ``Llama`` object.

    Raises:
        RuntimeError: If llama-cpp-python could not be imported.
        ValueError: If ``DUCKLM_N_THREADS`` is set to a non-integer value.
    """
    if not LLAMA_AVAILABLE:
        raise RuntimeError("llama-cpp-python not installed")

    root = base_dir if base_dir else Path.cwd()
    resolved_path = str(root / model_path)

    return Llama(
        model_path=resolved_path,
        n_gpu_layers=n_gpu_layers,
        n_ctx=4096,
        n_threads=_resolve_n_threads(),
        # NOTE(review): -1 presumably asks the library to pick the batch
        # thread count itself -- confirm against the llama-cpp-python docs.
        n_threads_batch=-1,
        # NOTE(review): max_tokens and temperature are normally arguments of
        # the completion calls in llama-cpp-python; confirm the constructor
        # honours them rather than silently ignoring them.
        max_tokens=max_tokens,
        temperature=temperature,
        verbose=False,
    )


def _resolve_n_threads() -> int:
    """Return the thread count from ``DUCKLM_N_THREADS`` or a CPU-based default.

    An unset or blank variable selects the default: half the detected CPU
    count (4 is assumed when it cannot be detected), clamped to 4..20.
    """
    raw = os.environ.get("DUCKLM_N_THREADS", "").strip()
    if not raw:
        # A blank value (e.g. ``DUCKLM_N_THREADS=`` in an env file) is treated
        # the same as an unset variable instead of crashing in int().
        return max(4, min((os.cpu_count() or 4) // 2, 20))
    try:
        return int(raw)
    except ValueError:
        raise ValueError(
            f"DUCKLM_N_THREADS must be an integer, got {raw!r}"
        ) from None
|
|
|
|
|
|
def create_adapter(
    model_type: str,
    config: dict[str, Any],
    base_dir: Path | None = None,
) -> "Llama":
    """Build a model instance from a configuration mapping.

    Reads ``path``, ``backend``, ``n_gpu_layers``, ``max_tokens`` and
    ``temperature`` from *config* (defaults: ``"cpu"``, ``0``, ``2048`` and
    ``0.2`` for the last four) and delegates to ``create_llama_adapter``.

    Args:
        model_type: Accepted for interface compatibility; not used by this
            function.
        config: Model settings; must contain a non-empty ``path``.
        base_dir: Optional directory the model path is resolved against.

    Returns:
        The object returned by ``create_llama_adapter``.

    Raises:
        RuntimeError: If llama-cpp-python could not be imported.
        ValueError: If *config* has no non-empty ``path`` entry.
    """
    if not LLAMA_AVAILABLE:
        raise RuntimeError("llama-cpp-python not installed")

    model_path = config.get("path", "")
    if not model_path:
        # Without this guard an empty path resolves to the base directory
        # itself, which is then handed to the loader as if it were a model.
        raise ValueError("model config is missing a non-empty 'path' entry")

    backend = config.get("backend", "cpu")
    n_gpu_layers = config.get("n_gpu_layers", 0)

    # With the Vulkan backend, any non-zero layer request becomes -1.
    # NOTE(review): -1 presumably means "offload every layer" -- confirm.
    if backend == "vulkan" and n_gpu_layers != 0:
        n_gpu_layers = -1

    return create_llama_adapter(
        model_path=model_path,
        backend=backend,
        n_gpu_layers=n_gpu_layers,
        max_tokens=config.get("max_tokens", 2048),
        temperature=config.get("temperature", 0.2),
        base_dir=base_dir,
    )
|