Expose DuckLM runtime status

This commit is contained in:
mirivlad 2026-05-22 07:45:47 +08:00
parent ff98224eb6
commit 8452673994
8 changed files with 319 additions and 9 deletions

View File

@ -1,6 +1,6 @@
# DuckLM — текущее состояние проекта
Дата обновления: 2026-05-21
Дата обновления: 2026-05-22
Рабочая копия: `/home/mirivlad/git/ducklm`
Git remote: `origin/main`
@ -25,6 +25,7 @@ WebChat доступен через FastAPI на `http://127.0.0.1:8000/`.
- TaskStore и EventStore в SQLite.
- ModelClient с логическими ролями из `config/models.yaml`.
- Роли: `thinker`, `critic`, `coder`, `action`, `summary`, `memory_policy`, `recall`.
- Расширенный `/v1/status` с API paths, token budgets, model role map и optional live-probe для llama/Qdrant.
- SSE streaming chat: reasoning/content deltas, runtime status events, final stats.
- Runtime status в чате для долгих этапов: planning, running_tool(s), answering.
- Min/avg/max token speed в конце ответа.
@ -65,6 +66,7 @@ WebChat доступен через FastAPI на `http://127.0.0.1:8000/`.
- reflection
- experience records
- Skill candidate selection теперь используется в обычном и streaming chat.
- `scripts/duck.sh status --probe` и `scripts/duck-mtp.sh status --probe` показывают live-состояние DuckLM runtime, model backend и vector memory.
## Соответствие этапам из Ducklm.md
@ -78,7 +80,7 @@ WebChat доступен через FastAPI на `http://127.0.0.1:8000/`.
| 6. Approvals | Готово | UI и API approvals, allow_once/forever/deny |
| 7. Skills | Готово | Registry, API/UI, candidate skill injection |
| 8. Reflection/Experience | Готово | Reflection после completed задач, experience records |
| 9. Memory/VectorMemory | Готово частично | SQLite memory готова; Qdrant зависит от запущенного сервиса и embeddings |
| 9. Memory/VectorMemory | Готово частично | SQLite memory готова; `/v1/status?probe=true` показывает live health Qdrant; embeddings зависят от локальной модели/endpoint |
| 10. MTP/benchmark | Готово как experimental | MTP script есть, action по умолчанию остаётся на main endpoint |
## Остаточные ограничения
@ -119,6 +121,7 @@ http://127.0.0.1:8000/
```bash
curl --noproxy '*' http://127.0.0.1:8000/health
curl --noproxy '*' 'http://127.0.0.1:8000/v1/status?probe=true'
curl --noproxy '*' http://127.0.0.1:8000/v1/models/roles
```
@ -126,6 +129,7 @@ curl --noproxy '*' http://127.0.0.1:8000/v1/models/roles
```bash
bash scripts/duck.sh status
bash scripts/duck.sh status --probe
bash scripts/duck.sh logs --follow
bash scripts/duck.sh restart
bash scripts/duck.sh stop
@ -143,6 +147,7 @@ bash scripts/duck-mtp.sh logs --follow
## Что делать следующим
1. Пройти live E2E checklist в WebChat на реальной модели.
2. Если Qdrant нужен постоянно, добавить отдельную health-индикацию vector memory в `/v1/status`.
3. При необходимости заменить keyword skill selection на LLM-based selection.
4. Позже мигрировать FastAPI startup на lifespan.
2. Вынести runtime/model role routing в явный конфиг с fallback-политикой, оставив Qwen основным backend для всех ролей.
3. Добавить строгую JSON validation/fallback для structured utility-ролей.
4. При необходимости заменить keyword skill selection на LLM-based selection.
5. Позже мигрировать FastAPI startup на lifespan.

View File

@ -36,6 +36,13 @@ bash scripts/duck.sh restart
bash scripts/duck.sh stop
```
Use live probes when you need backend diagnostics, not just process status:
```bash
bash scripts/duck.sh status --probe
curl --noproxy '*' 'http://127.0.0.1:8000/v1/status?probe=true'
```
4. Open WebChat:
```text

View File

@ -24,6 +24,29 @@ GET /v1/experience/{id}
GET /v1/memory/search?q=...
```
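`GET /v1/status` returns a fast runtime snapshot without live backend checks. The example below is abbreviated; the full response also includes top-level fields such as `version`, `api_host`, `api_port`, `workspace` and `db_path`: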
```json
{
"name": "DuckLM",
"api": {"host": "127.0.0.1", "port": 8000},
"paths": {"workspace": "./workspace", "db_path": "./data/duck.sqlite3"},
"token_budget": {"ctx_size": 65536},
"models": {
"default_provider": "llama_server",
"endpoints": ["llama_server:http://127.0.0.1:8081/v1:local-main"],
"roles": {"thinker": {"model": "local-main"}}
},
"services": {
"duck_api": {"ok": true, "probed": true},
"llama": {"ok": null, "probed": false, "roles": {}},
"vector_memory": {"ok": null, "probed": false}
}
}
```
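Use `GET /v1/status?probe=true` to also call the model backend and Qdrant. In that case `services.llama` and `services.vector_memory` report `"probed": true` with a boolean `ok`; `services.llama.ok` is `true` only when every model role responds successfully.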
Chat requests accept optional `reasoning`:
```json

View File

@ -6,7 +6,7 @@ from pathlib import Path
from typing import Any, Literal
import uvicorn
from fastapi import FastAPI, HTTPException, Request
from fastapi import FastAPI, HTTPException, Query, Request
from fastapi.responses import FileResponse, HTMLResponse, StreamingResponse
from fastapi.staticfiles import StaticFiles
from fastapi.templating import Jinja2Templates
@ -165,7 +165,31 @@ def create_app() -> FastAPI:
return {"status": "ok"}
@app.get("/v1/status")
async def status() -> dict[str, Any]:
async def status(probe: bool = Query(False)) -> dict[str, Any]:
role_configs = model_client.list_roles()
endpoints = sorted(
{
f"{role_config['provider']}:{role_config['base_url']}:{role_config['model']}"
for role_config in role_configs.values()
}
)
llama_status: dict[str, Any] = {"probed": False, "ok": None, "roles": {}}
vector_status: dict[str, Any] = {
**vector_memory.config_status(),
"probed": False,
"ok": None,
}
if probe:
model_ping = await model_client.ping()
llama_status = {
"probed": True,
"ok": all(role.get("ok") for role in model_ping.values()) if model_ping else False,
"roles": model_ping,
}
vector_status = {
**await vector_memory.health(),
"probed": True,
}
return {
"name": "DuckLM",
"version": "0.1.0",
@ -173,6 +197,33 @@ def create_app() -> FastAPI:
"api_port": settings.api_port,
"workspace": settings.workspace,
"db_path": settings.db_path,
"api": {
"host": settings.api_host,
"port": settings.api_port,
"base_url": f"http://{settings.api_host}:{settings.api_port}",
},
"paths": {
"workspace": settings.workspace,
"db_path": settings.db_path,
"models_config": str(model_client.config_path),
},
"token_budget": {
"ctx_size": settings.ctx_size,
"max_input_tokens": settings.max_input_tokens,
"max_recent_events_tokens": settings.max_recent_events_tokens,
"max_memory_tokens": settings.max_memory_tokens,
"max_skill_tokens": settings.max_skill_tokens,
},
"models": {
"default_provider": model_client.default_provider,
"roles": role_configs,
"endpoints": endpoints,
},
"services": {
"duck_api": {"ok": True, "probed": True},
"llama": llama_status,
"vector_memory": vector_status,
},
}
@app.get("/v1/models/roles")

View File

@ -94,6 +94,43 @@ class VectorMemory:
response.raise_for_status()
return response.json().get("result", [])
def config_status(self) -> dict[str, Any]:
    """Summarise the vector-memory configuration without any network I/O.

    Returns:
        A dict with the Qdrant URL, the collection name, a label describing
        where embeddings come from (``remote:<url>``, ``local:<path>`` or
        ``none``) and a ``configured`` flag that is true only when both a
        Qdrant URL and an embedding source are set.
    """
    remote_url = self.embeddings_base_url
    if remote_url:
        # A remote embeddings endpoint wins over a local model path.
        source_label = f"remote:{remote_url}"
    else:
        local_path = self._local_model_path
        source_label = f"local:{local_path}" if local_path else "none"

    has_embeddings = source_label != "none"
    summary: dict[str, Any] = {
        "configured": bool(self.qdrant_url and has_embeddings),
        "qdrant_url": self.qdrant_url,
        "collection": self.collection_name,
        "embedding_source": source_label,
    }
    return summary
async def health(self) -> dict[str, Any]:
    """Probe Qdrant without loading the local embedding model.

    Sends a single ``GET <qdrant_url>/`` request with a 5 second timeout.

    Returns:
        The ``config_status()`` dict extended with ``ok`` (bool) and
        ``latency_ms`` (float, rounded to 0.1 ms). On failure an ``error``
        string is included as well. When no Qdrant URL is set the probe is
        skipped and a failure with ``latency_ms`` of 0.0 is reported.
    """
    import time

    status = self.config_status()

    # Normalise the base URL: a trailing slash would otherwise produce a
    # "//" request path, and a missing URL would only surface as an opaque
    # transport error from the HTTP client.
    base_url = str(self.qdrant_url or "").rstrip("/")
    if not base_url:
        return {
            **status,
            "ok": False,
            "error": "qdrant_url is not configured",
            "latency_ms": 0.0,
        }

    started = time.perf_counter()
    try:
        # trust_env=False keeps the probe from picking up proxy settings
        # from the environment.
        async with httpx.AsyncClient(timeout=5.0, trust_env=False) as client:
            response = await client.get(f"{base_url}/")
            response.raise_for_status()
        return {
            **status,
            "ok": True,
            "latency_ms": round((time.perf_counter() - started) * 1000, 1),
        }
    except httpx.HTTPError as exc:
        return {
            **status,
            "ok": False,
            # Some transport errors carry an empty message; fall back to the
            # exception class name so the failure is never reported blank.
            "error": str(exc) or type(exc).__name__,
            "latency_ms": round((time.perf_counter() - started) * 1000, 1),
        }
async def _embed(self, text: str) -> list[float]:
"""Generate embeddings using local model or remote endpoint."""
# Prefer local model if available

View File

@ -53,6 +53,7 @@ Commands:
stop Stop DuckLM API and managed MTP llama-server
restart Stop and start the whole local DuckLM stack
status Print process and HTTP health status
Use "status --probe" to include live model/vector checks
logs Show DuckLM API and llama-server logs; use --follow/-f and --lines N
help Show this help
@ -169,6 +170,20 @@ stop_stack() {
}
status_stack() {
local probe=0
while [[ $# -gt 0 ]]; do
case "$1" in
--probe)
probe=1
shift
;;
*)
echo "Unknown status argument: $1" >&2
return 2
;;
esac
done
local rc=0
if api_is_running; then
local pid
@ -176,6 +191,7 @@ status_stack() {
echo "DuckLM API running: pid=${pid}"
if command -v curl >/dev/null 2>&1 && curl --noproxy "*" -fsS "${API_URL}/health" >/dev/null 2>&1; then
echo "DuckLM API health: ok (${API_URL})"
print_runtime_status "${probe}"
else
echo "DuckLM API health: not ready (${API_URL})"
fi
@ -197,6 +213,55 @@ status_stack() {
return "${rc}"
}
# Print a short human-readable summary of the DuckLM /v1/status response.
# Arguments:
#   $1 - "1" to request live backend probes (?probe=true); defaults to 0.
# Always returns 0 on the early-exit paths, so a missing status endpoint or
# missing Python interpreter does not by itself fail the caller.
print_runtime_status() {
local probe="${1:-0}"
local status_url="${API_URL}/v1/status"
if [[ "${probe}" == "1" ]]; then
status_url="${status_url}?probe=true"
fi
local payload
# Fetch the status JSON; on any curl failure just report it as unavailable.
if ! payload="$(curl --noproxy "*" -fsS "${status_url}" 2>/dev/null)"; then
echo "DuckLM runtime status: unavailable (${status_url})"
return 0
fi
# Prefer DUCK_PYTHON_BIN or the project virtualenv interpreter; fall back to
# python3 when that path is not executable.
local python_bin="${DUCK_PYTHON_BIN:-${ROOT_DIR}/.venv/bin/python}"
if [[ ! -x "${python_bin}" ]]; then
python_bin="python3"
fi
# Without any usable interpreter, only point the user at the status URL.
if ! command -v "${python_bin}" >/dev/null 2>&1 && [[ ! -x "${python_bin}" ]]; then
echo "DuckLM runtime status: ${status_url}"
return 0
fi
# Pipe the JSON into an inline Python program that prints workspace, db path,
# the number of model endpoints, the role names and one line each for the
# llama and vector memory services.
printf '%s' "${payload}" | "${python_bin}" -c '
import json
import sys
data = json.load(sys.stdin)
models = data.get("models", {})
roles = sorted((models.get("roles") or {}).keys())
services = data.get("services", {})
llama = services.get("llama", {})
vector = services.get("vector_memory", {})
def service_line(name, service):
if not service.get("probed"):
return f"{name}: not probed"
ok = "ok" if service.get("ok") else "failed"
error = service.get("error")
return f"{name}: {ok}" + (f" ({error})" if error else "")
print("DuckLM runtime:")
print(" workspace: {}".format(data.get("workspace")))
print(" db: {}".format(data.get("db_path")))
print(" model endpoints: {}".format(len(models.get("endpoints") or [])))
print(" roles: {}".format(", ".join(roles)))
print(" {}".format(service_line("llama", llama)))
print(" {}".format(service_line("vector memory", vector)))
'
}
logs_stack() {
local follow=0
local lines=100
@ -246,7 +311,8 @@ case "${ACTION}" in
start_stack
;;
status)
status_stack
shift || true
status_stack "$@"
;;
logs)
logs_stack "$@"

View File

@ -53,6 +53,7 @@ Commands:
stop Stop DuckLM API and managed llama-server
restart Stop and start the whole local DuckLM stack
status Print process and HTTP health status
Use "status --probe" to include live model/vector checks
logs Show DuckLM API and llama-server logs; use --follow/-f and --lines N
help Show this help
@ -169,6 +170,20 @@ stop_stack() {
}
status_stack() {
local probe=0
while [[ $# -gt 0 ]]; do
case "$1" in
--probe)
probe=1
shift
;;
*)
echo "Unknown status argument: $1" >&2
return 2
;;
esac
done
local rc=0
if api_is_running; then
local pid
@ -176,6 +191,7 @@ status_stack() {
echo "DuckLM API running: pid=${pid}"
if command -v curl >/dev/null 2>&1 && curl --noproxy "*" -fsS "${API_URL}/health" >/dev/null 2>&1; then
echo "DuckLM API health: ok (${API_URL})"
print_runtime_status "${probe}"
else
echo "DuckLM API health: not ready (${API_URL})"
fi
@ -197,6 +213,55 @@ status_stack() {
return "${rc}"
}
# Print a short human-readable summary of the DuckLM /v1/status response.
# Arguments:
#   $1 - "1" to request live backend probes (?probe=true); defaults to 0.
# Always returns 0 on the early-exit paths, so a missing status endpoint or
# missing Python interpreter does not by itself fail the caller.
print_runtime_status() {
local probe="${1:-0}"
local status_url="${API_URL}/v1/status"
if [[ "${probe}" == "1" ]]; then
status_url="${status_url}?probe=true"
fi
local payload
# Fetch the status JSON; on any curl failure just report it as unavailable.
if ! payload="$(curl --noproxy "*" -fsS "${status_url}" 2>/dev/null)"; then
echo "DuckLM runtime status: unavailable (${status_url})"
return 0
fi
# Prefer DUCK_PYTHON_BIN or the project virtualenv interpreter; fall back to
# python3 when that path is not executable.
local python_bin="${DUCK_PYTHON_BIN:-${ROOT_DIR}/.venv/bin/python}"
if [[ ! -x "${python_bin}" ]]; then
python_bin="python3"
fi
# Without any usable interpreter, only point the user at the status URL.
if ! command -v "${python_bin}" >/dev/null 2>&1 && [[ ! -x "${python_bin}" ]]; then
echo "DuckLM runtime status: ${status_url}"
return 0
fi
# Pipe the JSON into an inline Python program that prints workspace, db path,
# the number of model endpoints, the role names and one line each for the
# llama and vector memory services.
printf '%s' "${payload}" | "${python_bin}" -c '
import json
import sys
data = json.load(sys.stdin)
models = data.get("models", {})
roles = sorted((models.get("roles") or {}).keys())
services = data.get("services", {})
llama = services.get("llama", {})
vector = services.get("vector_memory", {})
def service_line(name, service):
if not service.get("probed"):
return f"{name}: not probed"
ok = "ok" if service.get("ok") else "failed"
error = service.get("error")
return f"{name}: {ok}" + (f" ({error})" if error else "")
print("DuckLM runtime:")
print(" workspace: {}".format(data.get("workspace")))
print(" db: {}".format(data.get("db_path")))
print(" model endpoints: {}".format(len(models.get("endpoints") or [])))
print(" roles: {}".format(", ".join(roles)))
print(" {}".format(service_line("llama", llama)))
print(" {}".format(service_line("vector memory", vector)))
'
}
logs_stack() {
local follow=0
local lines=100
@ -246,7 +311,8 @@ case "${ACTION}" in
start_stack
;;
status)
status_stack
shift || true
status_stack "$@"
;;
logs)
logs_stack "$@"

View File

@ -1,6 +1,8 @@
from fastapi.testclient import TestClient
from duck_core.api import create_app
from duck_core.memory.vector_memory import VectorMemory
from duck_core.model_client import ModelClient
def test_health_and_status_endpoints(tmp_path, monkeypatch):
@ -12,6 +14,59 @@ def test_health_and_status_endpoints(tmp_path, monkeypatch):
status = client.get("/v1/status").json()
assert status["name"] == "DuckLM"
assert status["api_host"] == "127.0.0.1"
assert status["api"]["host"] == "127.0.0.1"
assert status["api"]["port"] == 8000
assert status["paths"]["db_path"] == str(tmp_path / "duck.sqlite3")
assert status["models"]["default_provider"] == "llama_server"
assert status["models"]["roles"]["thinker"]["model"] == "local-main"
assert status["services"]["duck_api"]["ok"] is True
assert status["services"]["llama"]["probed"] is False
assert status["services"]["vector_memory"]["probed"] is False
def test_status_endpoint_can_probe_backends(tmp_path, monkeypatch):
    """``/v1/status?probe=true`` surfaces per-role llama and Qdrant health.

    The model ping and the vector-memory health check are replaced with
    canned results: one healthy role, one failing role and a healthy Qdrant.
    The aggregated llama status must then be reported as not ok.
    """
    db_file = tmp_path / "duck.sqlite3"
    monkeypatch.setenv("DUCK_DB_PATH", str(db_file))

    async def fake_ping(self):
        healthy_role = {
            "ok": True,
            "base_url": "http://127.0.0.1:8081/v1",
            "model": "local-main",
            "latency_ms": 1.2,
        }
        failing_role = {
            "ok": False,
            "base_url": "http://127.0.0.1:8081/v1",
            "model": "local-main",
            "error": "offline",
        }
        return {"thinker": healthy_role, "critic": failing_role}

    async def fake_vector_health(self):
        qdrant_report = {
            "configured": True,
            "ok": True,
            "qdrant_url": "http://127.0.0.1:6333",
            "collection": "duck_memory",
            "embedding_source": "local:./models/all-MiniLM-L6-v2",
            "latency_ms": 2.3,
        }
        return qdrant_report

    monkeypatch.setattr(ModelClient, "ping", fake_ping)
    monkeypatch.setattr(VectorMemory, "health", fake_vector_health)

    client = TestClient(create_app())
    payload = client.get("/v1/status?probe=true").json()
    services = payload["services"]

    llama = services["llama"]
    assert llama["probed"] is True
    # A single failing role makes the aggregated llama status fail.
    assert llama["ok"] is False
    assert llama["roles"]["thinker"]["ok"] is True
    assert llama["roles"]["critic"]["ok"] is False

    vector = services["vector_memory"]
    assert vector["probed"] is True
    assert vector["ok"] is True
def test_webchat_index_renders(tmp_path, monkeypatch):