diff --git a/CURRENT_STATE.md b/CURRENT_STATE.md index 9f82fe2..b12e803 100644 --- a/CURRENT_STATE.md +++ b/CURRENT_STATE.md @@ -1,6 +1,6 @@ # DuckLM — текущее состояние проекта -Дата обновления: 2026-05-21 +Дата обновления: 2026-05-22 Рабочая копия: `/home/mirivlad/git/ducklm` Git remote: `origin/main` @@ -25,6 +25,7 @@ WebChat доступен через FastAPI на `http://127.0.0.1:8000/`. - TaskStore и EventStore в SQLite. - ModelClient с логическими ролями из `config/models.yaml`. - Роли: `thinker`, `critic`, `coder`, `action`, `summary`, `memory_policy`, `recall`. +- Расширенный `/v1/status` с API paths, token budgets, model role map и optional live-probe для llama/Qdrant. - SSE streaming chat: reasoning/content deltas, runtime status events, final stats. - Runtime status в чате для долгих этапов: planning, running_tool(s), answering. - Min/avg/max token speed в конце ответа. @@ -65,6 +66,7 @@ WebChat доступен через FastAPI на `http://127.0.0.1:8000/`. - reflection - experience records - Skill candidate selection теперь используется в обычном и streaming chat. +- `scripts/duck.sh status --probe` и `scripts/duck-mtp.sh status --probe` показывают live-состояние DuckLM runtime, model backend и vector memory. ## Соответствие этапам из Ducklm.md @@ -78,7 +80,7 @@ WebChat доступен через FastAPI на `http://127.0.0.1:8000/`. | 6. Approvals | Готово | UI и API approvals, allow_once/forever/deny | | 7. Skills | Готово | Registry, API/UI, candidate skill injection | | 8. Reflection/Experience | Готово | Reflection после completed задач, experience records | -| 9. Memory/VectorMemory | Готово частично | SQLite memory готова; Qdrant зависит от запущенного сервиса и embeddings | +| 9. Memory/VectorMemory | Готово частично | SQLite memory готова; `/v1/status?probe=true` показывает live health Qdrant; embeddings зависят от локальной модели/endpoint | | 10. MTP/benchmark | Готово как experimental | MTP script есть, action по умолчанию остаётся на main endpoint | ## Остаточные ограничения @@ -119,6 +121,7 @@ http://127.0.0.1:8000/ ```bash curl --noproxy '*' http://127.0.0.1:8000/health +curl --noproxy '*' 'http://127.0.0.1:8000/v1/status?probe=true' curl --noproxy '*' http://127.0.0.1:8000/v1/models/roles ``` @@ -126,6 +129,7 @@ curl --noproxy '*' http://127.0.0.1:8000/v1/models/roles ```bash bash scripts/duck.sh status +bash scripts/duck.sh status --probe bash scripts/duck.sh logs --follow bash scripts/duck.sh restart bash scripts/duck.sh stop @@ -143,6 +147,7 @@ bash scripts/duck-mtp.sh logs --follow ## Что делать следующим 1. Пройти live E2E checklist в WebChat на реальной модели. -2. Если Qdrant нужен постоянно, добавить отдельную health-индикацию vector memory в `/v1/status`. -3. При необходимости заменить keyword skill selection на LLM-based selection. -4. Позже мигрировать FastAPI startup на lifespan. +2. Вынести runtime/model role routing в явный конфиг с fallback-политикой, оставив Qwen основным backend для всех ролей. +3. Добавить строгую JSON validation/fallback для structured utility-ролей. +4. При необходимости заменить keyword skill selection на LLM-based selection. +5. Позже мигрировать FastAPI startup на lifespan. diff --git a/docs/how_to_run.md b/docs/how_to_run.md index 533c31e..904939e 100644 --- a/docs/how_to_run.md +++ b/docs/how_to_run.md @@ -36,6 +36,13 @@ bash scripts/duck.sh restart bash scripts/duck.sh stop ``` +Use live probes when you need backend diagnostics, not just process status: + +```bash +bash scripts/duck.sh status --probe +curl --noproxy '*' 'http://127.0.0.1:8000/v1/status?probe=true' +``` + 4. Open WebChat: ```text diff --git a/docs/web_api.md b/docs/web_api.md index 3f73041..e21ca28 100644 --- a/docs/web_api.md +++ b/docs/web_api.md @@ -24,6 +24,29 @@ GET /v1/experience/{id} GET /v1/memory/search?q=... ``` +`GET /v1/status` returns a fast runtime snapshot without live backend checks: + +```json +{ + "name": "DuckLM", + "api": {"host": "127.0.0.1", "port": 8000}, + "paths": {"workspace": "./workspace", "db_path": "./data/duck.sqlite3"}, + "token_budget": {"ctx_size": 65536}, + "models": { + "default_provider": "llama_server", + "endpoints": ["llama_server:http://127.0.0.1:8081/v1:local-main"], + "roles": {"thinker": {"model": "local-main"}} + }, + "services": { + "duck_api": {"ok": true, "probed": true}, + "llama": {"ok": null, "probed": false, "roles": {}}, + "vector_memory": {"ok": null, "probed": false} + } +} +``` + +Use `GET /v1/status?probe=true` to also call the model backend and Qdrant. + Chat requests accept optional `reasoning`: ```json diff --git a/duck_core/api.py b/duck_core/api.py index b211446..7a70ff5 100644 --- a/duck_core/api.py +++ b/duck_core/api.py @@ -6,7 +6,7 @@ from pathlib import Path from typing import Any, Literal import uvicorn -from fastapi import FastAPI, HTTPException, Request +from fastapi import FastAPI, HTTPException, Query, Request from fastapi.responses import FileResponse, HTMLResponse, StreamingResponse from fastapi.staticfiles import StaticFiles from fastapi.templating import Jinja2Templates @@ -165,7 +165,31 @@ def create_app() -> FastAPI: return {"status": "ok"} @app.get("/v1/status") - async def status() -> dict[str, Any]: + async def status(probe: bool = Query(False)) -> dict[str, Any]: + role_configs = model_client.list_roles() + endpoints = sorted( + { + f"{role_config['provider']}:{role_config['base_url']}:{role_config['model']}" + for role_config in role_configs.values() + } + ) + llama_status: dict[str, Any] = {"probed": False, "ok": None, "roles": {}} + vector_status: dict[str, Any] = { + **vector_memory.config_status(), + "probed": False, + "ok": None, + } + if probe: + model_ping = await model_client.ping() + llama_status = { + "probed": True, + "ok": all(role.get("ok") for role in model_ping.values()) if model_ping else False, + "roles": model_ping, + } + vector_status = { + **await vector_memory.health(), + "probed": True, + } return { "name": "DuckLM", "version": "0.1.0", @@ -173,6 +197,33 @@ def create_app() -> FastAPI: "api_port": settings.api_port, "workspace": settings.workspace, "db_path": settings.db_path, + "api": { + "host": settings.api_host, + "port": settings.api_port, + "base_url": f"http://{settings.api_host}:{settings.api_port}", + }, + "paths": { + "workspace": settings.workspace, + "db_path": settings.db_path, + "models_config": str(model_client.config_path), + }, + "token_budget": { + "ctx_size": settings.ctx_size, + "max_input_tokens": settings.max_input_tokens, + "max_recent_events_tokens": settings.max_recent_events_tokens, + "max_memory_tokens": settings.max_memory_tokens, + "max_skill_tokens": settings.max_skill_tokens, + }, + "models": { + "default_provider": model_client.default_provider, + "roles": role_configs, + "endpoints": endpoints, + }, + "services": { + "duck_api": {"ok": True, "probed": True}, + "llama": llama_status, + "vector_memory": vector_status, + }, } @app.get("/v1/models/roles") diff --git a/duck_core/memory/vector_memory.py b/duck_core/memory/vector_memory.py index e509c6d..bdc1f4f 100644 --- a/duck_core/memory/vector_memory.py +++ b/duck_core/memory/vector_memory.py @@ -94,6 +94,43 @@ class VectorMemory: response.raise_for_status() return response.json().get("result", []) + def config_status(self) -> dict[str, Any]: + if self.embeddings_base_url: + embedding_source = f"remote:{self.embeddings_base_url}" + elif self._local_model_path: + embedding_source = f"local:{self._local_model_path}" + else: + embedding_source = "none" + return { + "configured": bool(self.qdrant_url and embedding_source != "none"), + "qdrant_url": self.qdrant_url, + "collection": self.collection_name, + "embedding_source": embedding_source, + } + + async def health(self) -> dict[str, Any]: + """Probe Qdrant without loading the local embedding model.""" + import time + + status = self.config_status() + started = time.perf_counter() + try: + async with httpx.AsyncClient(timeout=5.0, trust_env=False) as client: + response = await client.get(f"{self.qdrant_url}/") + response.raise_for_status() + return { + **status, + "ok": True, + "latency_ms": round((time.perf_counter() - started) * 1000, 1), + } + except httpx.HTTPError as exc: + return { + **status, + "ok": False, + "error": str(exc), + "latency_ms": round((time.perf_counter() - started) * 1000, 1), + } + async def _embed(self, text: str) -> list[float]: """Generate embeddings using local model or remote endpoint.""" # Prefer local model if available diff --git a/scripts/duck-mtp.sh b/scripts/duck-mtp.sh index 387f23b..213aa48 100755 --- a/scripts/duck-mtp.sh +++ b/scripts/duck-mtp.sh @@ -53,6 +53,7 @@ Commands: stop Stop DuckLM API and managed MTP llama-server restart Stop and start the whole local DuckLM stack status Print process and HTTP health status + Use "status --probe" to include live model/vector checks logs Show DuckLM API and llama-server logs; use --follow/-f and --lines N help Show this help @@ -169,6 +170,20 @@ stop_stack() { } status_stack() { + local probe=0 + while [[ $# -gt 0 ]]; do + case "$1" in + --probe) + probe=1 + shift + ;; + *) + echo "Unknown status argument: $1" >&2 + return 2 + ;; + esac + done + local rc=0 if api_is_running; then local pid @@ -176,6 +191,7 @@ status_stack() { echo "DuckLM API running: pid=${pid}" if command -v curl >/dev/null 2>&1 && curl --noproxy "*" -fsS "${API_URL}/health" >/dev/null 2>&1; then echo "DuckLM API health: ok (${API_URL})" + print_runtime_status "${probe}" else echo "DuckLM API health: not ready (${API_URL})" fi @@ -197,6 +213,55 @@ status_stack() { return "${rc}" } +print_runtime_status() { + local probe="${1:-0}" + local status_url="${API_URL}/v1/status" + if [[ "${probe}" == "1" ]]; then + status_url="${status_url}?probe=true" + fi + local payload + if ! payload="$(curl --noproxy "*" -fsS "${status_url}" 2>/dev/null)"; then + echo "DuckLM runtime status: unavailable (${status_url})" + return 0 + fi + + local python_bin="${DUCK_PYTHON_BIN:-${ROOT_DIR}/.venv/bin/python}" + if [[ ! -x "${python_bin}" ]]; then + python_bin="python3" + fi + if ! command -v "${python_bin}" >/dev/null 2>&1 && [[ ! -x "${python_bin}" ]]; then + echo "DuckLM runtime status: ${status_url}" + return 0 + fi + + printf '%s' "${payload}" | "${python_bin}" -c ' +import json +import sys + +data = json.load(sys.stdin) +models = data.get("models", {}) +roles = sorted((models.get("roles") or {}).keys()) +services = data.get("services", {}) +llama = services.get("llama", {}) +vector = services.get("vector_memory", {}) + +def service_line(name, service): + if not service.get("probed"): + return f"{name}: not probed" + ok = "ok" if service.get("ok") else "failed" + error = service.get("error") + return f"{name}: {ok}" + (f" ({error})" if error else "") + +print("DuckLM runtime:") +print(" workspace: {}".format(data.get("workspace"))) +print(" db: {}".format(data.get("db_path"))) +print(" model endpoints: {}".format(len(models.get("endpoints") or []))) +print(" roles: {}".format(", ".join(roles))) +print(" {}".format(service_line("llama", llama))) +print(" {}".format(service_line("vector memory", vector))) +' +} + logs_stack() { local follow=0 local lines=100 @@ -246,7 +311,8 @@ case "${ACTION}" in start_stack ;; status) - status_stack + shift || true + status_stack "$@" ;; logs) logs_stack "$@" diff --git a/scripts/duck.sh b/scripts/duck.sh index 26bcc16..40f076f 100755 --- a/scripts/duck.sh +++ b/scripts/duck.sh @@ -53,6 +53,7 @@ Commands: stop Stop DuckLM API and managed llama-server restart Stop and start the whole local DuckLM stack status Print process and HTTP health status + Use "status --probe" to include live model/vector checks logs Show DuckLM API and llama-server logs; use --follow/-f and --lines N help Show this help @@ -169,6 +170,20 @@ stop_stack() { } status_stack() { + local probe=0 + while [[ $# -gt 0 ]]; do + case "$1" in + --probe) + probe=1 + shift + ;; + *) + echo "Unknown status argument: $1" >&2 + return 2 + ;; + esac + done + local rc=0 if api_is_running; then local pid @@ -176,6 +191,7 @@ status_stack() { echo "DuckLM API running: pid=${pid}" if command -v curl >/dev/null 2>&1 && curl --noproxy "*" -fsS "${API_URL}/health" >/dev/null 2>&1; then echo "DuckLM API health: ok (${API_URL})" + print_runtime_status "${probe}" else echo "DuckLM API health: not ready (${API_URL})" fi @@ -197,6 +213,55 @@ status_stack() { return "${rc}" } +print_runtime_status() { + local probe="${1:-0}" + local status_url="${API_URL}/v1/status" + if [[ "${probe}" == "1" ]]; then + status_url="${status_url}?probe=true" + fi + local payload + if ! payload="$(curl --noproxy "*" -fsS "${status_url}" 2>/dev/null)"; then + echo "DuckLM runtime status: unavailable (${status_url})" + return 0 + fi + + local python_bin="${DUCK_PYTHON_BIN:-${ROOT_DIR}/.venv/bin/python}" + if [[ ! -x "${python_bin}" ]]; then + python_bin="python3" + fi + if ! command -v "${python_bin}" >/dev/null 2>&1 && [[ ! -x "${python_bin}" ]]; then + echo "DuckLM runtime status: ${status_url}" + return 0 + fi + + printf '%s' "${payload}" | "${python_bin}" -c ' +import json +import sys + +data = json.load(sys.stdin) +models = data.get("models", {}) +roles = sorted((models.get("roles") or {}).keys()) +services = data.get("services", {}) +llama = services.get("llama", {}) +vector = services.get("vector_memory", {}) + +def service_line(name, service): + if not service.get("probed"): + return f"{name}: not probed" + ok = "ok" if service.get("ok") else "failed" + error = service.get("error") + return f"{name}: {ok}" + (f" ({error})" if error else "") + +print("DuckLM runtime:") +print(" workspace: {}".format(data.get("workspace"))) +print(" db: {}".format(data.get("db_path"))) +print(" model endpoints: {}".format(len(models.get("endpoints") or []))) +print(" roles: {}".format(", ".join(roles))) +print(" {}".format(service_line("llama", llama))) +print(" {}".format(service_line("vector memory", vector))) +' +} + logs_stack() { local follow=0 local lines=100 @@ -246,7 +311,8 @@ case "${ACTION}" in start_stack ;; status) - status_stack + shift || true + status_stack "$@" ;; logs) logs_stack "$@" diff --git a/tests/smoke/test_api_health.py b/tests/smoke/test_api_health.py index 8558ef2..dea3ed5 100644 --- a/tests/smoke/test_api_health.py +++ b/tests/smoke/test_api_health.py @@ -1,6 +1,8 @@ from fastapi.testclient import TestClient from duck_core.api import create_app +from duck_core.memory.vector_memory import VectorMemory +from duck_core.model_client import ModelClient def test_health_and_status_endpoints(tmp_path, monkeypatch): @@ -12,6 +14,59 @@ def test_health_and_status_endpoints(tmp_path, monkeypatch): status = client.get("/v1/status").json() assert status["name"] == "DuckLM" assert status["api_host"] == "127.0.0.1" + assert status["api"]["host"] == "127.0.0.1" + assert status["api"]["port"] == 8000 + assert status["paths"]["db_path"] == str(tmp_path / "duck.sqlite3") + assert status["models"]["default_provider"] == "llama_server" + assert status["models"]["roles"]["thinker"]["model"] == "local-main" + assert status["services"]["duck_api"]["ok"] is True + assert status["services"]["llama"]["probed"] is False + assert status["services"]["vector_memory"]["probed"] is False + + +def test_status_endpoint_can_probe_backends(tmp_path, monkeypatch): + monkeypatch.setenv("DUCK_DB_PATH", str(tmp_path / "duck.sqlite3")) + + async def fake_ping(self): + return { + "thinker": { + "ok": True, + "base_url": "http://127.0.0.1:8081/v1", + "model": "local-main", + "latency_ms": 1.2, + }, + "critic": { + "ok": False, + "base_url": "http://127.0.0.1:8081/v1", + "model": "local-main", + "error": "offline", + }, + } + + async def fake_vector_health(self): + return { + "configured": True, + "ok": True, + "qdrant_url": "http://127.0.0.1:6333", + "collection": "duck_memory", + "embedding_source": "local:./models/all-MiniLM-L6-v2", + "latency_ms": 2.3, + } + + monkeypatch.setattr(ModelClient, "ping", fake_ping) + monkeypatch.setattr(VectorMemory, "health", fake_vector_health) + + app = create_app() + client = TestClient(app) + + status = client.get("/v1/status?probe=true").json() + + assert status["services"]["llama"]["probed"] is True + assert status["services"]["llama"]["ok"] is False + assert status["services"]["llama"]["roles"]["thinker"]["ok"] is True + assert status["services"]["llama"]["roles"]["critic"]["ok"] is False + assert status["services"]["vector_memory"]["probed"] is True + assert status["services"]["vector_memory"]["ok"] is True def test_webchat_index_renders(tmp_path, monkeypatch):