Expose DuckLM runtime status

2026-05-22 07:45:47 +08:00 · 2026-05-22 07:45:47 +08:00 · 8452673994
parent ff98224eb6
commit 8452673994
8 changed files with 319 additions and 9 deletions
--- a/CURRENT_STATE.md
+++ b/CURRENT_STATE.md
@ -1,6 +1,6 @@
 # DuckLM — текущее состояние проекта
-Дата обновления: 2026-05-21
+Дата обновления: 2026-05-22
 Рабочая копия: `/home/mirivlad/git/ducklm`
 Git remote: `origin/main`
@ -25,6 +25,7 @@ WebChat доступен через FastAPI на `http://127.0.0.1:8000/`.
 - TaskStore и EventStore в SQLite.
 - ModelClient с логическими ролями из `config/models.yaml`.
 - Роли: `thinker`, `critic`, `coder`, `action`, `summary`, `memory_policy`, `recall`.
 - Расширенный `/v1/status` с API paths, token budgets, model role map и optional live-probe для llama/Qdrant.
 - SSE streaming chat: reasoning/content deltas, runtime status events, final stats.
 - Runtime status в чате для долгих этапов: planning, running_tool(s), answering.
 - Min/avg/max token speed в конце ответа.
@ -65,6 +66,7 @@ WebChat доступен через FastAPI на `http://127.0.0.1:8000/`.
  - reflection
  - experience records
 - Skill candidate selection теперь используется в обычном и streaming chat.
 - `scripts/duck.sh status --probe` и `scripts/duck-mtp.sh status --probe` показывают live-состояние DuckLM runtime, model backend и vector memory.
 ## Соответствие этапам из Ducklm.md
@ -78,7 +80,7 @@ WebChat доступен через FastAPI на `http://127.0.0.1:8000/`.
 | 6. Approvals | Готово | UI и API approvals, allow_once/forever/deny |
 | 7. Skills | Готово | Registry, API/UI, candidate skill injection |
 | 8. Reflection/Experience | Готово | Reflection после completed задач, experience records |
-| 9. Memory/VectorMemory | Готово частично | SQLite memory готова; Qdrant зависит от запущенного сервиса и embeddings |
+| 9. Memory/VectorMemory | Готово частично | SQLite memory готова; `/v1/status?probe=true` показывает live health Qdrant; embeddings зависят от локальной модели/endpoint |
 | 10. MTP/benchmark | Готово как experimental | MTP script есть, action по умолчанию остаётся на main endpoint |
 ## Остаточные ограничения
@ -119,6 +121,7 @@ http://127.0.0.1:8000/
 ```bash
 curl --noproxy '*' http://127.0.0.1:8000/health
 curl --noproxy '*' 'http://127.0.0.1:8000/v1/status?probe=true'
 curl --noproxy '*' http://127.0.0.1:8000/v1/models/roles
 ```
@ -126,6 +129,7 @@ curl --noproxy '*' http://127.0.0.1:8000/v1/models/roles
 ```bash
 bash scripts/duck.sh status
 bash scripts/duck.sh status --probe
 bash scripts/duck.sh logs --follow
 bash scripts/duck.sh restart
 bash scripts/duck.sh stop
@ -143,6 +147,7 @@ bash scripts/duck-mtp.sh logs --follow
 ## Что делать следующим
 1. Пройти live E2E checklist в WebChat на реальной модели.
-2. Если Qdrant нужен постоянно, добавить отдельную health-индикацию vector memory в `/v1/status`.
+2. Вынести runtime/model role routing в явный конфиг с fallback-политикой, оставив Qwen основным backend для всех ролей.
-3. При необходимости заменить keyword skill selection на LLM-based selection.
+3. Добавить строгую JSON validation/fallback для structured utility-ролей.
-4. Позже мигрировать FastAPI startup на lifespan.
+4. При необходимости заменить keyword skill selection на LLM-based selection.
 5. Позже мигрировать FastAPI startup на lifespan.
--- a/docs/how_to_run.md
+++ b/docs/how_to_run.md
@ -36,6 +36,13 @@ bash scripts/duck.sh restart
 bash scripts/duck.sh stop
 ```
 Use live probes when you need backend diagnostics, not just process status:
 ```bash
 bash scripts/duck.sh status --probe
 curl --noproxy '*' 'http://127.0.0.1:8000/v1/status?probe=true'
 ```
 4. Open WebChat:
 ```text
--- a/docs/web_api.md
+++ b/docs/web_api.md
@ -24,6 +24,29 @@ GET  /v1/experience/{id}
 GET  /v1/memory/search?q=...
 ```
 `GET /v1/status` returns a fast runtime snapshot without live backend checks:
 ```json
 {
  "name": "DuckLM",
  "api": {"host": "127.0.0.1", "port": 8000},
  "paths": {"workspace": "./workspace", "db_path": "./data/duck.sqlite3"},
  "token_budget": {"ctx_size": 65536},
  "models": {
    "default_provider": "llama_server",
    "endpoints": ["llama_server:http://127.0.0.1:8081/v1:local-main"],
    "roles": {"thinker": {"model": "local-main"}}
  },
  "services": {
    "duck_api": {"ok": true, "probed": true},
    "llama": {"ok": null, "probed": false, "roles": {}},
    "vector_memory": {"ok": null, "probed": false}
  }
 }
 ```
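 Use `GET /v1/status?probe=true` to also call the model backend and Qdrant. Probed services report `"probed": true`; `services.llama.ok` is `true` only when every role responds successfully, and the per-role results are listed under `services.llama.roles`.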
 Chat requests accept optional `reasoning`:
 ```json
--- a/duck_core/api.py
+++ b/duck_core/api.py
@ -6,7 +6,7 @@ from pathlib import Path
 from typing import Any, Literal
 import uvicorn
-from fastapi import FastAPI, HTTPException, Request
+from fastapi import FastAPI, HTTPException, Query, Request
 from fastapi.responses import FileResponse, HTMLResponse, StreamingResponse
 from fastapi.staticfiles import StaticFiles
 from fastapi.templating import Jinja2Templates
@ -165,7 +165,31 @@ def create_app() -> FastAPI:
        return {"status": "ok"}
    @app.get("/v1/status")
-    async def status() -> dict[str, Any]:
+    async def status(probe: bool = Query(False)) -> dict[str, Any]:
        role_configs = model_client.list_roles()
        endpoints = sorted(
            {
                f"{role_config['provider']}:{role_config['base_url']}:{role_config['model']}"
                for role_config in role_configs.values()
            }
        )
        llama_status: dict[str, Any] = {"probed": False, "ok": None, "roles": {}}
        vector_status: dict[str, Any] = {
            **vector_memory.config_status(),
            "probed": False,
            "ok": None,
        }
        if probe:
            model_ping = await model_client.ping()
            llama_status = {
                "probed": True,
                "ok": all(role.get("ok") for role in model_ping.values()) if model_ping else False,
                "roles": model_ping,
            }
            vector_status = {
                **await vector_memory.health(),
                "probed": True,
            }
        return {
            "name": "DuckLM",
            "version": "0.1.0",
@ -173,6 +197,33 @@ def create_app() -> FastAPI:
            "api_port": settings.api_port,
            "workspace": settings.workspace,
            "db_path": settings.db_path,
            "api": {
                "host": settings.api_host,
                "port": settings.api_port,
                "base_url": f"http://{settings.api_host}:{settings.api_port}",
            },
            "paths": {
                "workspace": settings.workspace,
                "db_path": settings.db_path,
                "models_config": str(model_client.config_path),
            },
            "token_budget": {
                "ctx_size": settings.ctx_size,
                "max_input_tokens": settings.max_input_tokens,
                "max_recent_events_tokens": settings.max_recent_events_tokens,
                "max_memory_tokens": settings.max_memory_tokens,
                "max_skill_tokens": settings.max_skill_tokens,
            },
            "models": {
                "default_provider": model_client.default_provider,
                "roles": role_configs,
                "endpoints": endpoints,
            },
            "services": {
                "duck_api": {"ok": True, "probed": True},
                "llama": llama_status,
                "vector_memory": vector_status,
            },
        }
    @app.get("/v1/models/roles")
--- a/duck_core/memory/vector_memory.py
+++ b/duck_core/memory/vector_memory.py
@ -94,6 +94,43 @@ class VectorMemory:
            response.raise_for_status()
        return response.json().get("result", [])
    def config_status(self) -> dict[str, Any]:
        """Summarize the vector-memory configuration without any network I/O.

        Returns:
            A dict with ``configured`` (True only when a Qdrant URL and an
            embedding source are both set), ``qdrant_url``, ``collection``
            and ``embedding_source`` (``remote:<url>``, ``local:<path>`` or
            ``none``).
        """
        # A remote embeddings endpoint is reported ahead of a local model path.
        # NOTE(review): the comment in _embed says the local model is preferred
        # at embedding time; confirm the reported source matches the one used.
        source = "none"
        if self.embeddings_base_url:
            source = f"remote:{self.embeddings_base_url}"
        elif self._local_model_path:
            source = f"local:{self._local_model_path}"

        has_embeddings = source != "none"
        summary: dict[str, Any] = {
            "configured": bool(self.qdrant_url and has_embeddings),
            "qdrant_url": self.qdrant_url,
            "collection": self.collection_name,
            "embedding_source": source,
        }
        return summary
    async def health(self) -> dict[str, Any]:
        """Probe Qdrant without loading the local embedding model.

        Issues a single GET against the Qdrant root URL with a 5 second
        timeout and environment-based client configuration disabled
        (``trust_env=False``).

        Returns:
            The ``config_status()`` fields plus ``ok`` and ``latency_ms``
            (milliseconds, one decimal); failures also carry ``error``.
            Connection errors, non-2xx responses and malformed URLs are
            reported in the result instead of being raised.
        """
        import time

        status = self.config_status()
        if not self.qdrant_url:
            # Nothing to probe: say so instead of requesting the URL "None/".
            return {
                **status,
                "ok": False,
                "error": "qdrant_url is not configured",
                "latency_ms": 0.0,
            }

        started = time.perf_counter()
        try:
            async with httpx.AsyncClient(timeout=5.0, trust_env=False) as client:
                response = await client.get(f"{self.qdrant_url}/")
                response.raise_for_status()
        except (httpx.HTTPError, httpx.InvalidURL) as exc:
            # InvalidURL is not an HTTPError subclass, so it is caught explicitly
            # to keep the status endpoint from failing on a bad URL.
            return {
                **status,
                "ok": False,
                # Some httpx errors may stringify to an empty message; fall back
                # to the exception class name so the failure is still explained.
                "error": str(exc) or type(exc).__name__,
                "latency_ms": round((time.perf_counter() - started) * 1000, 1),
            }
        return {
            **status,
            "ok": True,
            "latency_ms": round((time.perf_counter() - started) * 1000, 1),
        }
    async def _embed(self, text: str) -> list[float]:
        """Generate embeddings using local model or remote endpoint."""
        # Prefer local model if available
--- a/scripts/duck-mtp.sh
+++ b/scripts/duck-mtp.sh
@ -53,6 +53,7 @@ Commands:
  stop        Stop DuckLM API and managed MTP llama-server
  restart     Stop and start the whole local DuckLM stack
  status      Print process and HTTP health status
              Use "status --probe" to include live model/vector checks
  logs        Show DuckLM API and llama-server logs; use --follow/-f and --lines N
  help        Show this help
@ -169,6 +170,20 @@ stop_stack() {
 }
 status_stack() {
  local probe=0
  while [[ $# -gt 0 ]]; do
    case "$1" in
      --probe)
        probe=1
        shift
        ;;
      *)
        echo "Unknown status argument: $1" >&2
        return 2
        ;;
    esac
  done
  local rc=0
  if api_is_running; then
    local pid
@ -176,6 +191,7 @@ status_stack() {
    echo "DuckLM API running: pid=${pid}"
    if command -v curl >/dev/null 2>&1 && curl --noproxy "*" -fsS "${API_URL}/health" >/dev/null 2>&1; then
      echo "DuckLM API health: ok (${API_URL})"
      print_runtime_status "${probe}"
    else
      echo "DuckLM API health: not ready (${API_URL})"
    fi
@ -197,6 +213,55 @@ status_stack() {
  return "${rc}"
 }
 # Print a human-readable summary of the DuckLM /v1/status payload.
 #
 # Arguments:
 #   $1 - "1" to request live backend probes (?probe=true); defaults to "0".
 #
 # Best-effort: returns 0 when the status endpoint or a Python interpreter is
 # unavailable, and reports an unparseable payload instead of a traceback.
 print_runtime_status() {
  local probe="${1:-0}"
  local status_url="${API_URL}/v1/status"
  if [[ "${probe}" == "1" ]]; then
    status_url="${status_url}?probe=true"
  fi
  local payload
  if ! payload="$(curl --noproxy "*" -fsS "${status_url}" 2>/dev/null)"; then
    echo "DuckLM runtime status: unavailable (${status_url})"
    return 0
  fi
  # command -v accepts both executable paths and bare command names, so a
  # DUCK_PYTHON_BIN such as "python3.12" is honored instead of being replaced.
  local python_bin="${DUCK_PYTHON_BIN:-${ROOT_DIR}/.venv/bin/python}"
  if ! command -v "${python_bin}" >/dev/null 2>&1; then
    python_bin="python3"
  fi
  if ! command -v "${python_bin}" >/dev/null 2>&1; then
    echo "DuckLM runtime status: ${status_url}"
    return 0
  fi
  printf '%s' "${payload}" | "${python_bin}" -c '
 import json
 import sys
 try:
    data = json.load(sys.stdin)
 except ValueError:
    print("DuckLM runtime status: invalid JSON response")
    sys.exit(0)
 models = data.get("models", {})
 roles = sorted((models.get("roles") or {}).keys())
 services = data.get("services", {})
 llama = services.get("llama", {})
 vector = services.get("vector_memory", {})
 def service_line(name, service):
    if not service.get("probed"):
        return f"{name}: not probed"
    ok = "ok" if service.get("ok") else "failed"
    error = service.get("error")
    if not error:
        # The llama service reports errors per role, not at the top level.
        failed = [
            "{}: {}".format(role, info.get("error") or "failed")
            for role, info in sorted((service.get("roles") or {}).items())
            if isinstance(info, dict) and not info.get("ok")
        ]
        error = "; ".join(failed)
    return f"{name}: {ok}" + (f" ({error})" if error else "")
 print("DuckLM runtime:")
 print("  workspace: {}".format(data.get("workspace")))
 print("  db: {}".format(data.get("db_path")))
 print("  model endpoints: {}".format(len(models.get("endpoints") or [])))
 print("  roles: {}".format(", ".join(roles)))
 print("  {}".format(service_line("llama", llama)))
 print("  {}".format(service_line("vector memory", vector)))
 '
 }
 logs_stack() {
  local follow=0
  local lines=100
@ -246,7 +311,8 @@ case "${ACTION}" in
    start_stack
    ;;
  status)
-    status_stack
+    shift || true
    status_stack "$@"
    ;;
  logs)
    logs_stack "$@"
--- a/scripts/duck.sh
+++ b/scripts/duck.sh
@ -53,6 +53,7 @@ Commands:
  stop        Stop DuckLM API and managed llama-server
  restart     Stop and start the whole local DuckLM stack
  status      Print process and HTTP health status
              Use "status --probe" to include live model/vector checks
  logs        Show DuckLM API and llama-server logs; use --follow/-f and --lines N
  help        Show this help
@ -169,6 +170,20 @@ stop_stack() {
 }
 status_stack() {
  local probe=0
  while [[ $# -gt 0 ]]; do
    case "$1" in
      --probe)
        probe=1
        shift
        ;;
      *)
        echo "Unknown status argument: $1" >&2
        return 2
        ;;
    esac
  done
  local rc=0
  if api_is_running; then
    local pid
@ -176,6 +191,7 @@ status_stack() {
    echo "DuckLM API running: pid=${pid}"
    if command -v curl >/dev/null 2>&1 && curl --noproxy "*" -fsS "${API_URL}/health" >/dev/null 2>&1; then
      echo "DuckLM API health: ok (${API_URL})"
      print_runtime_status "${probe}"
    else
      echo "DuckLM API health: not ready (${API_URL})"
    fi
@ -197,6 +213,55 @@ status_stack() {
  return "${rc}"
 }
 # Print a human-readable summary of the DuckLM /v1/status payload.
 #
 # Arguments:
 #   $1 - "1" to request live backend probes (?probe=true); defaults to "0".
 #
 # Best-effort: returns 0 when the status endpoint or a Python interpreter is
 # unavailable, and reports an unparseable payload instead of a traceback.
 print_runtime_status() {
  local probe="${1:-0}"
  local status_url="${API_URL}/v1/status"
  if [[ "${probe}" == "1" ]]; then
    status_url="${status_url}?probe=true"
  fi
  local payload
  if ! payload="$(curl --noproxy "*" -fsS "${status_url}" 2>/dev/null)"; then
    echo "DuckLM runtime status: unavailable (${status_url})"
    return 0
  fi
  # command -v accepts both executable paths and bare command names, so a
  # DUCK_PYTHON_BIN such as "python3.12" is honored instead of being replaced.
  local python_bin="${DUCK_PYTHON_BIN:-${ROOT_DIR}/.venv/bin/python}"
  if ! command -v "${python_bin}" >/dev/null 2>&1; then
    python_bin="python3"
  fi
  if ! command -v "${python_bin}" >/dev/null 2>&1; then
    echo "DuckLM runtime status: ${status_url}"
    return 0
  fi
  printf '%s' "${payload}" | "${python_bin}" -c '
 import json
 import sys
 try:
    data = json.load(sys.stdin)
 except ValueError:
    print("DuckLM runtime status: invalid JSON response")
    sys.exit(0)
 models = data.get("models", {})
 roles = sorted((models.get("roles") or {}).keys())
 services = data.get("services", {})
 llama = services.get("llama", {})
 vector = services.get("vector_memory", {})
 def service_line(name, service):
    if not service.get("probed"):
        return f"{name}: not probed"
    ok = "ok" if service.get("ok") else "failed"
    error = service.get("error")
    if not error:
        # The llama service reports errors per role, not at the top level.
        failed = [
            "{}: {}".format(role, info.get("error") or "failed")
            for role, info in sorted((service.get("roles") or {}).items())
            if isinstance(info, dict) and not info.get("ok")
        ]
        error = "; ".join(failed)
    return f"{name}: {ok}" + (f" ({error})" if error else "")
 print("DuckLM runtime:")
 print("  workspace: {}".format(data.get("workspace")))
 print("  db: {}".format(data.get("db_path")))
 print("  model endpoints: {}".format(len(models.get("endpoints") or [])))
 print("  roles: {}".format(", ".join(roles)))
 print("  {}".format(service_line("llama", llama)))
 print("  {}".format(service_line("vector memory", vector)))
 '
 }
 logs_stack() {
  local follow=0
  local lines=100
@ -246,7 +311,8 @@ case "${ACTION}" in
    start_stack
    ;;
  status)
-    status_stack
+    shift || true
    status_stack "$@"
    ;;
  logs)
    logs_stack "$@"
--- a/tests/smoke/test_api_health.py
+++ b/tests/smoke/test_api_health.py
@ -1,6 +1,8 @@
 from fastapi.testclient import TestClient
 from duck_core.api import create_app
 from duck_core.memory.vector_memory import VectorMemory
 from duck_core.model_client import ModelClient
 def test_health_and_status_endpoints(tmp_path, monkeypatch):
@ -12,6 +14,59 @@ def test_health_and_status_endpoints(tmp_path, monkeypatch):
    status = client.get("/v1/status").json()
    assert status["name"] == "DuckLM"
    assert status["api_host"] == "127.0.0.1"
    assert status["api"]["host"] == "127.0.0.1"
    assert status["api"]["port"] == 8000
    assert status["paths"]["db_path"] == str(tmp_path / "duck.sqlite3")
    assert status["models"]["default_provider"] == "llama_server"
    assert status["models"]["roles"]["thinker"]["model"] == "local-main"
    assert status["services"]["duck_api"]["ok"] is True
    assert status["services"]["llama"]["probed"] is False
    assert status["services"]["vector_memory"]["probed"] is False
 def test_status_endpoint_can_probe_backends(tmp_path, monkeypatch):
    """``/v1/status?probe=true`` reports stubbed model and vector-memory probes.

    One healthy and one failing role must yield an overall llama ``ok`` of
    False, while the stubbed vector-memory health is passed through as ok.
    """
    db_file = tmp_path / "duck.sqlite3"
    monkeypatch.setenv("DUCK_DB_PATH", str(db_file))

    async def stub_ping(self):
        # One role up, one role down.
        thinker = {
            "ok": True,
            "base_url": "http://127.0.0.1:8081/v1",
            "model": "local-main",
            "latency_ms": 1.2,
        }
        critic = {
            "ok": False,
            "base_url": "http://127.0.0.1:8081/v1",
            "model": "local-main",
            "error": "offline",
        }
        return {"thinker": thinker, "critic": critic}

    async def stub_vector_health(self):
        return {
            "configured": True,
            "ok": True,
            "qdrant_url": "http://127.0.0.1:6333",
            "collection": "duck_memory",
            "embedding_source": "local:./models/all-MiniLM-L6-v2",
            "latency_ms": 2.3,
        }

    monkeypatch.setattr(ModelClient, "ping", stub_ping)
    monkeypatch.setattr(VectorMemory, "health", stub_vector_health)

    response = TestClient(create_app()).get("/v1/status?probe=true")
    services = response.json()["services"]

    llama = services["llama"]
    assert llama["probed"] is True
    assert llama["ok"] is False
    assert llama["roles"]["thinker"]["ok"] is True
    assert llama["roles"]["critic"]["ok"] is False

    vector = services["vector_memory"]
    assert vector["probed"] is True
    assert vector["ok"] is True
 def test_webchat_index_renders(tmp_path, monkeypatch):