Expose DuckLM runtime status

2026-05-22 07:45:47 +08:00 · 2026-05-22 07:45:47 +08:00 · 8452673994
parent ff98224eb6
commit 8452673994
8 changed files with 319 additions and 9 deletions
--- a/CURRENT_STATE.md
+++ b/CURRENT_STATE.md
@ -1,6 +1,6 @@
 # DuckLM — текущее состояние проекта

-Дата обновления: 2026-05-21
+Дата обновления: 2026-05-22
 Рабочая копия: `/home/mirivlad/git/ducklm`
 Git remote: `origin/main`

@ -25,6 +25,7 @@ WebChat доступен через FastAPI на `http://127.0.0.1:8000/`.
 - TaskStore и EventStore в SQLite.
 - ModelClient с логическими ролями из `config/models.yaml`.
 - Роли: `thinker`, `critic`, `coder`, `action`, `summary`, `memory_policy`, `recall`.
+- Расширенный `/v1/status` с API paths, token budgets, model role map и optional live-probe для llama/Qdrant.
 - SSE streaming chat: reasoning/content deltas, runtime status events, final stats.
 - Runtime status в чате для долгих этапов: planning, running_tool(s), answering.
 - Min/avg/max token speed в конце ответа.
@ -65,6 +66,7 @@ WebChat доступен через FastAPI на `http://127.0.0.1:8000/`.
  - reflection
  - experience records
 - Skill candidate selection теперь используется в обычном и streaming chat.
+- `scripts/duck.sh status --probe` и `scripts/duck-mtp.sh status --probe` показывают live-состояние DuckLM runtime, model backend и vector memory.

 ## Соответствие этапам из Ducklm.md

@ -78,7 +80,7 @@ WebChat доступен через FastAPI на `http://127.0.0.1:8000/`.
 | 6. Approvals | Готово | UI и API approvals, allow_once/forever/deny |
 | 7. Skills | Готово | Registry, API/UI, candidate skill injection |
 | 8. Reflection/Experience | Готово | Reflection после completed задач, experience records |
-| 9. Memory/VectorMemory | Готово частично | SQLite memory готова; Qdrant зависит от запущенного сервиса и embeddings |
+| 9. Memory/VectorMemory | Готово частично | SQLite memory готова; `/v1/status?probe=true` показывает live health Qdrant; embeddings зависят от локальной модели/endpoint |
 | 10. MTP/benchmark | Готово как experimental | MTP script есть, action по умолчанию остаётся на main endpoint |

 ## Остаточные ограничения
@ -119,6 +121,7 @@ http://127.0.0.1:8000/

 ```bash
 curl --noproxy '*' http://127.0.0.1:8000/health
+curl --noproxy '*' 'http://127.0.0.1:8000/v1/status?probe=true'
 curl --noproxy '*' http://127.0.0.1:8000/v1/models/roles
 ```

@ -126,6 +129,7 @@ curl --noproxy '*' http://127.0.0.1:8000/v1/models/roles

 ```bash
 bash scripts/duck.sh status
+bash scripts/duck.sh status --probe
 bash scripts/duck.sh logs --follow
 bash scripts/duck.sh restart
 bash scripts/duck.sh stop
@ -143,6 +147,7 @@ bash scripts/duck-mtp.sh logs --follow
 ## Что делать следующим

 1. Пройти live E2E checklist в WebChat на реальной модели.
-2. Если Qdrant нужен постоянно, добавить отдельную health-индикацию vector memory в `/v1/status`.
-3. При необходимости заменить keyword skill selection на LLM-based selection.
-4. Позже мигрировать FastAPI startup на lifespan.
+2. Вынести runtime/model role routing в явный конфиг с fallback-политикой, оставив Qwen основным backend для всех ролей.
+3. Добавить строгую JSON validation/fallback для structured utility-ролей.
+4. При необходимости заменить keyword skill selection на LLM-based selection.
+5. Позже мигрировать FastAPI startup на lifespan.
--- a/docs/how_to_run.md
+++ b/docs/how_to_run.md
@ -36,6 +36,13 @@ bash scripts/duck.sh restart
 bash scripts/duck.sh stop
 ```

+Use live probes when you need backend diagnostics, not just process status:
+
+```bash
+bash scripts/duck.sh status --probe
+curl --noproxy '*' 'http://127.0.0.1:8000/v1/status?probe=true'
+```
+
 4. Open WebChat:

 ```text
--- a/docs/web_api.md
+++ b/docs/web_api.md
@ -24,6 +24,29 @@ GET  /v1/experience/{id}
 GET  /v1/memory/search?q=...
 ```

+`GET /v1/status` returns a fast runtime snapshot without live backend checks:
+
+```json
+{
+  "name": "DuckLM",
+  "api": {"host": "127.0.0.1", "port": 8000},
+  "paths": {"workspace": "./workspace", "db_path": "./data/duck.sqlite3"},
+  "token_budget": {"ctx_size": 65536},
+  "models": {
+    "default_provider": "llama_server",
+    "endpoints": ["llama_server:http://127.0.0.1:8081/v1:local-main"],
+    "roles": {"thinker": {"model": "local-main"}}
+  },
+  "services": {
+    "duck_api": {"ok": true, "probed": true},
+    "llama": {"ok": null, "probed": false, "roles": {}},
+    "vector_memory": {"ok": null, "probed": false}
+  }
+}
+```
+
+Use `GET /v1/status?probe=true` to also call the model backend and Qdrant.
+
 Chat requests accept optional `reasoning`:

 ```json
--- a/duck_core/api.py
+++ b/duck_core/api.py
@ -6,7 +6,7 @@ from pathlib import Path
 from typing import Any, Literal

 import uvicorn
-from fastapi import FastAPI, HTTPException, Request
+from fastapi import FastAPI, HTTPException, Query, Request
 from fastapi.responses import FileResponse, HTMLResponse, StreamingResponse
 from fastapi.staticfiles import StaticFiles
 from fastapi.templating import Jinja2Templates
@ -165,7 +165,31 @@ def create_app() -> FastAPI:
        return {"status": "ok"}

    @app.get("/v1/status")
-    async def status() -> dict[str, Any]:
+    async def status(probe: bool = Query(False)) -> dict[str, Any]:
+        role_configs = model_client.list_roles()
+        endpoints = sorted(
+            {
+                f"{role_config['provider']}:{role_config['base_url']}:{role_config['model']}"
+                for role_config in role_configs.values()
+            }
+        )
+        llama_status: dict[str, Any] = {"probed": False, "ok": None, "roles": {}}
+        vector_status: dict[str, Any] = {
+            **vector_memory.config_status(),
+            "probed": False,
+            "ok": None,
+        }
+        if probe:
+            model_ping = await model_client.ping()
+            llama_status = {
+                "probed": True,
+                "ok": all(role.get("ok") for role in model_ping.values()) if model_ping else False,
+                "roles": model_ping,
+            }
+            vector_status = {
+                **await vector_memory.health(),
+                "probed": True,
+            }
        return {
            "name": "DuckLM",
            "version": "0.1.0",
@ -173,6 +197,33 @@ def create_app() -> FastAPI:
            "api_port": settings.api_port,
            "workspace": settings.workspace,
            "db_path": settings.db_path,
+            "api": {
+                "host": settings.api_host,
+                "port": settings.api_port,
+                "base_url": f"http://{settings.api_host}:{settings.api_port}",
+            },
+            "paths": {
+                "workspace": settings.workspace,
+                "db_path": settings.db_path,
+                "models_config": str(model_client.config_path),
+            },
+            "token_budget": {
+                "ctx_size": settings.ctx_size,
+                "max_input_tokens": settings.max_input_tokens,
+                "max_recent_events_tokens": settings.max_recent_events_tokens,
+                "max_memory_tokens": settings.max_memory_tokens,
+                "max_skill_tokens": settings.max_skill_tokens,
+            },
+            "models": {
+                "default_provider": model_client.default_provider,
+                "roles": role_configs,
+                "endpoints": endpoints,
+            },
+            "services": {
+                "duck_api": {"ok": True, "probed": True},
+                "llama": llama_status,
+                "vector_memory": vector_status,
+            },
        }

    @app.get("/v1/models/roles")
--- a/duck_core/memory/vector_memory.py
+++ b/duck_core/memory/vector_memory.py
@ -94,6 +94,43 @@ class VectorMemory:
            response.raise_for_status()
        return response.json().get("result", [])

+    def config_status(self) -> dict[str, Any]:
+        if self.embeddings_base_url:
+            embedding_source = f"remote:{self.embeddings_base_url}"
+        elif self._local_model_path:
+            embedding_source = f"local:{self._local_model_path}"
+        else:
+            embedding_source = "none"
+        return {
+            "configured": bool(self.qdrant_url and embedding_source != "none"),
+            "qdrant_url": self.qdrant_url,
+            "collection": self.collection_name,
+            "embedding_source": embedding_source,
+        }
+
+    async def health(self) -> dict[str, Any]:
+        """Probe Qdrant without loading the local embedding model."""
+        import time
+
+        status = self.config_status()
+        started = time.perf_counter()
+        try:
+            async with httpx.AsyncClient(timeout=5.0, trust_env=False) as client:
+                response = await client.get(f"{self.qdrant_url}/")
+                response.raise_for_status()
+            return {
+                **status,
+                "ok": True,
+                "latency_ms": round((time.perf_counter() - started) * 1000, 1),
+            }
+        except httpx.HTTPError as exc:
+            return {
+                **status,
+                "ok": False,
+                "error": str(exc),
+                "latency_ms": round((time.perf_counter() - started) * 1000, 1),
+            }
+
    async def _embed(self, text: str) -> list[float]:
        """Generate embeddings using local model or remote endpoint."""
        # Prefer local model if available
--- a/scripts/duck-mtp.sh
+++ b/scripts/duck-mtp.sh
@ -53,6 +53,7 @@ Commands:
  stop        Stop DuckLM API and managed MTP llama-server
  restart     Stop and start the whole local DuckLM stack
  status      Print process and HTTP health status
+              Use "status --probe" to include live model/vector checks
  logs        Show DuckLM API and llama-server logs; use --follow/-f and --lines N
  help        Show this help

@ -169,6 +170,20 @@ stop_stack() {
 }

 status_stack() {
+  local probe=0
+  while [[ $# -gt 0 ]]; do
+    case "$1" in
+      --probe)
+        probe=1
+        shift
+        ;;
+      *)
+        echo "Unknown status argument: $1" >&2
+        return 2
+        ;;
+    esac
+  done
+
  local rc=0
  if api_is_running; then
    local pid
@ -176,6 +191,7 @@ status_stack() {
    echo "DuckLM API running: pid=${pid}"
    if command -v curl >/dev/null 2>&1 && curl --noproxy "*" -fsS "${API_URL}/health" >/dev/null 2>&1; then
      echo "DuckLM API health: ok (${API_URL})"
+      print_runtime_status "${probe}"
    else
      echo "DuckLM API health: not ready (${API_URL})"
    fi
@ -197,6 +213,55 @@ status_stack() {
  return "${rc}"
 }

+print_runtime_status() {
+  local probe="${1:-0}"
+  local status_url="${API_URL}/v1/status"
+  if [[ "${probe}" == "1" ]]; then
+    status_url="${status_url}?probe=true"
+  fi
+  local payload
+  if ! payload="$(curl --noproxy "*" -fsS "${status_url}" 2>/dev/null)"; then
+    echo "DuckLM runtime status: unavailable (${status_url})"
+    return 0
+  fi
+
+  local python_bin="${DUCK_PYTHON_BIN:-${ROOT_DIR}/.venv/bin/python}"
+  if [[ ! -x "${python_bin}" ]]; then
+    python_bin="python3"
+  fi
+  if ! command -v "${python_bin}" >/dev/null 2>&1 && [[ ! -x "${python_bin}" ]]; then
+    echo "DuckLM runtime status: ${status_url}"
+    return 0
+  fi
+
+  printf '%s' "${payload}" | "${python_bin}" -c '
+import json
+import sys
+
+data = json.load(sys.stdin)
+models = data.get("models", {})
+roles = sorted((models.get("roles") or {}).keys())
+services = data.get("services", {})
+llama = services.get("llama", {})
+vector = services.get("vector_memory", {})
+
+def service_line(name, service):
+    if not service.get("probed"):
+        return f"{name}: not probed"
+    ok = "ok" if service.get("ok") else "failed"
+    error = service.get("error")
+    return f"{name}: {ok}" + (f" ({error})" if error else "")
+
+print("DuckLM runtime:")
+print("  workspace: {}".format(data.get("workspace")))
+print("  db: {}".format(data.get("db_path")))
+print("  model endpoints: {}".format(len(models.get("endpoints") or [])))
+print("  roles: {}".format(", ".join(roles)))
+print("  {}".format(service_line("llama", llama)))
+print("  {}".format(service_line("vector memory", vector)))
+'
+}
+
 logs_stack() {
  local follow=0
  local lines=100
@ -246,7 +311,8 @@ case "${ACTION}" in
    start_stack
    ;;
  status)
-    status_stack
+    shift || true
+    status_stack "$@"
    ;;
  logs)
    logs_stack "$@"
--- a/scripts/duck.sh
+++ b/scripts/duck.sh
@ -53,6 +53,7 @@ Commands:
  stop        Stop DuckLM API and managed llama-server
  restart     Stop and start the whole local DuckLM stack
  status      Print process and HTTP health status
+              Use "status --probe" to include live model/vector checks
  logs        Show DuckLM API and llama-server logs; use --follow/-f and --lines N
  help        Show this help

@ -169,6 +170,20 @@ stop_stack() {
 }

 status_stack() {
+  local probe=0
+  while [[ $# -gt 0 ]]; do
+    case "$1" in
+      --probe)
+        probe=1
+        shift
+        ;;
+      *)
+        echo "Unknown status argument: $1" >&2
+        return 2
+        ;;
+    esac
+  done
+
  local rc=0
  if api_is_running; then
    local pid
@ -176,6 +191,7 @@ status_stack() {
    echo "DuckLM API running: pid=${pid}"
    if command -v curl >/dev/null 2>&1 && curl --noproxy "*" -fsS "${API_URL}/health" >/dev/null 2>&1; then
      echo "DuckLM API health: ok (${API_URL})"
+      print_runtime_status "${probe}"
    else
      echo "DuckLM API health: not ready (${API_URL})"
    fi
@ -197,6 +213,55 @@ status_stack() {
  return "${rc}"
 }

+print_runtime_status() {
+  local probe="${1:-0}"
+  local status_url="${API_URL}/v1/status"
+  if [[ "${probe}" == "1" ]]; then
+    status_url="${status_url}?probe=true"
+  fi
+  local payload
+  if ! payload="$(curl --noproxy "*" -fsS "${status_url}" 2>/dev/null)"; then
+    echo "DuckLM runtime status: unavailable (${status_url})"
+    return 0
+  fi
+
+  local python_bin="${DUCK_PYTHON_BIN:-${ROOT_DIR}/.venv/bin/python}"
+  if [[ ! -x "${python_bin}" ]]; then
+    python_bin="python3"
+  fi
+  if ! command -v "${python_bin}" >/dev/null 2>&1 && [[ ! -x "${python_bin}" ]]; then
+    echo "DuckLM runtime status: ${status_url}"
+    return 0
+  fi
+
+  printf '%s' "${payload}" | "${python_bin}" -c '
+import json
+import sys
+
+data = json.load(sys.stdin)
+models = data.get("models", {})
+roles = sorted((models.get("roles") or {}).keys())
+services = data.get("services", {})
+llama = services.get("llama", {})
+vector = services.get("vector_memory", {})
+
+def service_line(name, service):
+    if not service.get("probed"):
+        return f"{name}: not probed"
+    ok = "ok" if service.get("ok") else "failed"
+    error = service.get("error")
+    return f"{name}: {ok}" + (f" ({error})" if error else "")
+
+print("DuckLM runtime:")
+print("  workspace: {}".format(data.get("workspace")))
+print("  db: {}".format(data.get("db_path")))
+print("  model endpoints: {}".format(len(models.get("endpoints") or [])))
+print("  roles: {}".format(", ".join(roles)))
+print("  {}".format(service_line("llama", llama)))
+print("  {}".format(service_line("vector memory", vector)))
+'
+}
+
 logs_stack() {
  local follow=0
  local lines=100
@ -246,7 +311,8 @@ case "${ACTION}" in
    start_stack
    ;;
  status)
-    status_stack
+    shift || true
+    status_stack "$@"
    ;;
  logs)
    logs_stack "$@"
--- a/tests/smoke/test_api_health.py
+++ b/tests/smoke/test_api_health.py
@ -1,6 +1,8 @@
 from fastapi.testclient import TestClient

 from duck_core.api import create_app
+from duck_core.memory.vector_memory import VectorMemory
+from duck_core.model_client import ModelClient


 def test_health_and_status_endpoints(tmp_path, monkeypatch):
@ -12,6 +14,59 @@ def test_health_and_status_endpoints(tmp_path, monkeypatch):
    status = client.get("/v1/status").json()
    assert status["name"] == "DuckLM"
    assert status["api_host"] == "127.0.0.1"
+    assert status["api"]["host"] == "127.0.0.1"
+    assert status["api"]["port"] == 8000
+    assert status["paths"]["db_path"] == str(tmp_path / "duck.sqlite3")
+    assert status["models"]["default_provider"] == "llama_server"
+    assert status["models"]["roles"]["thinker"]["model"] == "local-main"
+    assert status["services"]["duck_api"]["ok"] is True
+    assert status["services"]["llama"]["probed"] is False
+    assert status["services"]["vector_memory"]["probed"] is False
+
+
+def test_status_endpoint_can_probe_backends(tmp_path, monkeypatch):
+    monkeypatch.setenv("DUCK_DB_PATH", str(tmp_path / "duck.sqlite3"))
+
+    async def fake_ping(self):
+        return {
+            "thinker": {
+                "ok": True,
+                "base_url": "http://127.0.0.1:8081/v1",
+                "model": "local-main",
+                "latency_ms": 1.2,
+            },
+            "critic": {
+                "ok": False,
+                "base_url": "http://127.0.0.1:8081/v1",
+                "model": "local-main",
+                "error": "offline",
+            },
+        }
+
+    async def fake_vector_health(self):
+        return {
+            "configured": True,
+            "ok": True,
+            "qdrant_url": "http://127.0.0.1:6333",
+            "collection": "duck_memory",
+            "embedding_source": "local:./models/all-MiniLM-L6-v2",
+            "latency_ms": 2.3,
+        }
+
+    monkeypatch.setattr(ModelClient, "ping", fake_ping)
+    monkeypatch.setattr(VectorMemory, "health", fake_vector_health)
+
+    app = create_app()
+    client = TestClient(app)
+
+    status = client.get("/v1/status?probe=true").json()
+
+    assert status["services"]["llama"]["probed"] is True
+    assert status["services"]["llama"]["ok"] is False
+    assert status["services"]["llama"]["roles"]["thinker"]["ok"] is True
+    assert status["services"]["llama"]["roles"]["critic"]["ok"] is False
+    assert status["services"]["vector_memory"]["probed"] is True
+    assert status["services"]["vector_memory"]["ok"] is True


 def test_webchat_index_renders(tmp_path, monkeypatch):