From f63125e9457a41859b085d56f18350480bb33747 Mon Sep 17 00:00:00 2001 From: mirivlad Date: Sat, 23 May 2026 18:18:58 +0800 Subject: [PATCH] many fixes --- CURRENT_STATE.md | 1 + README.md | 4 + docs/how_to_run.md | 4 +- docs/memory_architecture.md | 67 ++++++++++++++++- duck_core/api.py | 30 +++++++- duck_core/context_builder.py | 53 +++++++++---- duck_core/runtime_loop.py | 68 ++++++++++++++++- duck_core/web/static/app.js | 48 ++++++++++-- duck_core/web/templates/index.html | 21 +++++- duck_core/web/templates/memory.html | 4 +- prompts/roles/recall.md | 3 + scripts/duck-mtp.sh | 90 +++++++++++++++++++++- scripts/duck.sh | 90 +++++++++++++++++++++- tests/smoke/test_api_stream_chat.py | 71 ++++++++++++++++++ tests/smoke/test_context_builder.py | 28 ++++++- tests/smoke/test_duck_service_script.py | 97 ++++++++++++++++++++++++ tests/smoke/test_memory_policy.py | 82 ++++++++++++++++++++ tests/smoke/test_runtime_tools.py | 99 +++++++++++++++++++++++++ tests/smoke/test_vector_memory_live.py | 39 ++++++++++ 19 files changed, 858 insertions(+), 41 deletions(-) create mode 100644 tests/smoke/test_vector_memory_live.py diff --git a/CURRENT_STATE.md b/CURRENT_STATE.md index 0258725..9351b05 100644 --- a/CURRENT_STATE.md +++ b/CURRENT_STATE.md @@ -69,6 +69,7 @@ WebChat доступен через FastAPI на `http://127.0.0.1:8000/`. - reflection - experience records - Skill candidate selection теперь используется в обычном и streaming chat. +- `scripts/duck.sh` и `scripts/duck-mtp.sh` управляют всем локальным стеком: Qdrant, llama-server и DuckLM API. - `scripts/duck.sh status --probe` и `scripts/duck-mtp.sh status --probe` показывают live-состояние DuckLM runtime, model backend и vector memory. - Structured utility-outputs валидируются локально по JSON schema; это защищает tool loop и memory writes от мусора модели. - Live E2E выявил и исправил два runtime-дефекта: большие stdout больше не раздувают следующий planning prompt, повторяющиеся identical actions больше не исполняются повторно. diff --git a/README.md b/README.md index a511f72..165a2b5 100644 --- a/README.md +++ b/README.md @@ -14,10 +14,14 @@ bash scripts/duck.sh start Open `http://127.0.0.1:8000/`. +`duck.sh` starts and stops the local Qdrant vector memory service together with +`llama-server` and the DuckLM API. Use `status --probe` for live backend checks. + Useful commands: ```bash bash scripts/duck.sh status +bash scripts/duck.sh status --probe bash scripts/duck.sh logs --follow bash scripts/duck.sh restart bash scripts/duck.sh stop diff --git a/docs/how_to_run.md b/docs/how_to_run.md index 61bb29f..603397b 100644 --- a/docs/how_to_run.md +++ b/docs/how_to_run.md @@ -26,8 +26,9 @@ chat and accept that it can slow down the next request. bash scripts/duck.sh start ``` -This starts both processes: +This starts the local stack: +- Qdrant vector memory on `http://127.0.0.1:6333/` - `llama-server` on `http://127.0.0.1:8081/v1` - DuckLM API/WebChat on `http://127.0.0.1:8000/` @@ -97,5 +98,4 @@ curl http://127.0.0.1:8000/v1/approvals/pending ```bash bash scripts/duck.sh stop -docker compose -f docker-compose.memory.yml down ``` diff --git a/docs/memory_architecture.md b/docs/memory_architecture.md index 4f40f7f..87c7e4c 100644 --- a/docs/memory_architecture.md +++ b/docs/memory_architecture.md @@ -1,5 +1,68 @@ # Memory Architecture -Semantic memory uses Qdrant as the vector store. Embeddings come from `/v1/embeddings` when the model backend supports it. +DuckLM currently has two memory layers: -If embeddings are unavailable, `VectorMemory` fails explicitly with `EmbeddingsUnavailableError`; it does not invent a local embedding algorithm. +- SQLite memory in `duck_core.memory.store.MemoryStore` for durable structured records. +- Vector memory in `duck_core.memory.vector_memory.VectorMemory` for semantic search through Qdrant. + +## SQLite Memory + +SQLite is the primary durable store. Runtime writes memory records after +`memory_policy` decides that a completed task contains reusable information. +Manual memory records can also be added through `/v1/memory` and the WebChat +memory drawer. + +SQLite memory remains available even when Qdrant is down. + +## Vector Memory + +Vector memory stores the same useful memory summaries in Qdrant when vector +storage is configured and reachable. Qdrant is managed by the local service +scripts: + +```bash +bash scripts/duck.sh start +bash scripts/duck.sh status --probe +bash scripts/duck.sh stop +``` + +The MTP stack uses the same memory lifecycle through `scripts/duck-mtp.sh`. + +## Embeddings + +The default embedding source is a local `sentence-transformers` model: + +```text +./models/all-MiniLM-L6-v2 +``` + +`VectorMemory` lazy-loads that model only when it needs to write or search +vectors. Health checks do not load the embedding model; they only probe Qdrant. + +A remote OpenAI-compatible embeddings endpoint can be used by setting +`embeddings_base_url`, but the normal local stack does not rely on +`llama-server` embeddings. + +If no embedding source is configured, `VectorMemory` raises +`EmbeddingsUnavailableError`. It does not silently invent fallback embeddings. + +## Status And Verification + +Runtime status is available through: + +```bash +curl --noproxy '*' 'http://127.0.0.1:8000/v1/status?probe=true' +``` + +`scripts/duck.sh status --probe` prints the same backend result plus Docker +Compose state for Qdrant. WebChat also shows model and vector memory state in +the Runtime panel. + +The live smoke test for Qdrant write/search is: + +```bash +.venv/bin/python -m pytest tests/smoke/test_vector_memory_live.py -q +``` + +The test skips when Qdrant is not reachable, and runs a real add/search cycle +when the local stack is up. diff --git a/duck_core/api.py b/duck_core/api.py index a2f4296..1c428dd 100644 --- a/duck_core/api.py +++ b/duck_core/api.py @@ -245,12 +245,15 @@ def create_app() -> FastAPI: memory_records = await relevant_memory( body.message, conversation.workspace, conversation.conversation_id ) + memory_sufficient_to_answer = False # Use recall-role to filter relevant memories via LLM if memory_records and runtime.context_builder._model_client is not None: try: - memory_records = await runtime.context_builder.recall_relevant_memory( + recall_decision = await runtime.context_builder.recall_relevant_memory_decision( body.message, memory_records ) + memory_records = recall_decision.records + memory_sufficient_to_answer = recall_decision.sufficient_to_answer except Exception: pass # Fallback to unfiltered memory_records result = await runtime.run_chat( @@ -262,6 +265,7 @@ def create_app() -> FastAPI: skill_summary=await selected_skill_summary(body.message), reasoning=body.reasoning, reflect=bool(settings.enable_reflection), + skip_action_loop=memory_sufficient_to_answer, ) await conversations.add_message( conversation.conversation_id, @@ -377,6 +381,16 @@ def create_app() -> FastAPI: memory_records = await relevant_memory( body.message, conversation.workspace, conversation.conversation_id ) + memory_sufficient_to_answer = False + if memory_records and runtime.context_builder._model_client is not None: + try: + recall_decision = await runtime.context_builder.recall_relevant_memory_decision( + body.message, memory_records + ) + memory_records = recall_decision.records + memory_sufficient_to_answer = recall_decision.sufficient_to_answer + except Exception: + pass task = await task_store.create_task( body.message, conversation.workspace, body.debug ) @@ -407,9 +421,17 @@ def create_app() -> FastAPI: "planning", "Планирую, нужны ли локальные действия...", ) - tool_observations = await runtime._run_action_loop( - task.task_id, messages, conversation.workspace - ) + tool_observations = [] + if not memory_sufficient_to_answer: + tool_observations = await runtime._run_action_loop( + task.task_id, messages, conversation.workspace + ) + else: + await event_store.append( + task.task_id, + "action_loop_skipped", + {"reason": "recall_sufficient_to_answer"}, + ) if tool_observations: yield runtime_status( task.task_id, diff --git a/duck_core/context_builder.py b/duck_core/context_builder.py index 6c50757..aafcc4e 100644 --- a/duck_core/context_builder.py +++ b/duck_core/context_builder.py @@ -2,6 +2,7 @@ from __future__ import annotations import json import logging +from dataclasses import dataclass from typing import Any from duck_core.tasks.state import TaskState @@ -12,6 +13,13 @@ logger = logging.getLogger(__name__) _CHARS_PER_TOKEN = 4 +@dataclass +class RecallDecision: + records: list[dict[str, str]] + sufficient_to_answer: bool = False + reasoning: str = "" + + def estimate_tokens(text: str) -> int: """Rough token estimate based on character count.""" return max(len(text) // _CHARS_PER_TOKEN, 1) @@ -64,20 +72,27 @@ class ContextBuilder: Returns only the memories that are relevant to the query. Falls back to returning all records if LLM is unavailable. """ + return (await self.recall_relevant_memory_decision(query, memory_records)).records + + async def recall_relevant_memory_decision( + self, + query: str, + memory_records: list[dict[str, str]], + ) -> RecallDecision: if not memory_records or self._model_client is None: - return memory_records + return RecallDecision(records=memory_records, sufficient_to_answer=False) try: return await self._llm_recall(query, memory_records) except Exception as exc: logger.warning("Recall failed, using all memories: %s", exc) - return memory_records + return RecallDecision(records=memory_records, sufficient_to_answer=False) async def _llm_recall( self, query: str, memory_records: list[dict[str, str]], - ) -> list[dict[str, str]]: + ) -> RecallDecision: """Call recall-role LLM to identify relevant memories.""" memories_text = "\n".join( f"[{m.get('memory_id', i)}] {m.get('text', '')}" @@ -98,15 +113,19 @@ class ContextBuilder: "name": "recall_result", "schema": { "type": "object", - "required": ["relevant_ids", "reasoning"], - "additionalProperties": False, - "properties": { - "relevant_ids": { - "type": "array", - "items": {"type": "string"}, - }, - "reasoning": {"type": "string"}, + "required": ["relevant_ids", "reasoning"], + "additionalProperties": False, + "properties": { + "relevant_ids": { + "type": "array", + "items": {"type": "string"}, }, + "sufficient_to_answer": { + "type": "boolean", + "description": "True when selected memories are enough to answer without local tools/actions.", + }, + "reasoning": {"type": "string"}, + }, }, "strict": True, }, @@ -115,8 +134,16 @@ class ContextBuilder: data = json.loads(response.content) relevant_ids = set(data.get("relevant_ids", [])) if not relevant_ids: - return [] - return [m for i, m in enumerate(memory_records) if m.get("memory_id", str(i)) in relevant_ids] + return RecallDecision(records=[], sufficient_to_answer=False, reasoning=str(data.get("reasoning", ""))) + records = [ + m for i, m in enumerate(memory_records) + if m.get("memory_id", str(i)) in relevant_ids + ] + return RecallDecision( + records=records, + sufficient_to_answer=bool(data.get("sufficient_to_answer", False)) and bool(records), + reasoning=str(data.get("reasoning", "")), + ) def build_basic_messages( self, diff --git a/duck_core/runtime_loop.py b/duck_core/runtime_loop.py index f2d8541..43bc9eb 100644 --- a/duck_core/runtime_loop.py +++ b/duck_core/runtime_loop.py @@ -20,6 +20,7 @@ from duck_core.tools.gateway import ToolGateway logger = logging.getLogger(__name__) ACTION_DIRECTIVE_SCHEMA = load_json_schema("duck_core/schemas/action_directive.schema.json") MAX_TOOL_OBSERVATION_TEXT_CHARS = 2000 +MAX_MEMORY_TRANSCRIPT_CHARS = 6000 @dataclass @@ -67,6 +68,7 @@ class RuntimeLoop: skill_summary: str | None = None, reflect: bool = True, reasoning: ReasoningMode | None = None, + skip_action_loop: bool = False, ) -> ChatResult: task = await self.task_store.create_task(message, workspace, debug) await self.event_store.append( @@ -78,7 +80,15 @@ class RuntimeLoop: messages = await self.context_builder.build_async_messages( task, history_messages, memory_records, skill_summary=skill_summary ) - tool_observations = await self._run_action_loop(task.task_id, messages, workspace) + tool_observations = [] + if not skip_action_loop: + tool_observations = await self._run_action_loop(task.task_id, messages, workspace) + else: + await self.event_store.append( + task.task_id, + "action_loop_skipped", + {"reason": "recall_sufficient_to_answer"}, + ) if any(observation.get("requires_approval") for observation in tool_observations): await self.task_store.waiting_for_approval(task.task_id) await self.event_store.append( @@ -288,7 +298,8 @@ class RuntimeLoop: if self.memory_store is None: return try: - decision = await self.memory_policy.classify(final_response, task_id) + transcript = await self._build_memory_policy_transcript(task_id, final_response) + decision = await self.memory_policy.classify(transcript, task_id) await self.event_store.append( task_id, "memory_policy_decision", @@ -326,6 +337,47 @@ class RuntimeLoop: {"error": str(exc)}, ) + async def _build_memory_policy_transcript(self, task_id: str, final_response: str) -> str: + task = await self.task_store.get_task(task_id) + parts = [] + if task is not None: + parts.append(f"User message:\n{task.user_message}") + if task.workspace: + parts.append(f"Workspace:\n{task.workspace}") + parts.append(f"Assistant final response:\n{final_response}") + + event_summaries = [] + for event in await self.event_store.list_events(task_id): + summary = self._summarize_memory_event(event.event_type, event.payload) + if summary: + event_summaries.append(f"- {event.event_type}: {summary}") + if event_summaries: + parts.append("Relevant runtime events:\n" + "\n".join(event_summaries)) + + transcript = "\n\n".join(parts) + if len(transcript) > MAX_MEMORY_TRANSCRIPT_CHARS: + return transcript[:MAX_MEMORY_TRANSCRIPT_CHARS] + "\n...[truncated]" + return transcript + + def _summarize_memory_event(self, event_type: str, payload: dict[str, Any]) -> str: + if event_type == "action_directive": + intent = payload.get("intent") + hints = payload.get("memory_hints") or [] + if hints: + return f"intent={intent}; memory_hints={hints}" + return f"intent={intent}" if intent else "" + if event_type == "tool_call_finished": + result = payload.get("result") or {} + output = str(result.get("output") or result.get("error") or "").strip() + if len(output) > 500: + output = output[:500] + "...[truncated]" + return "{} ok={} {}".format(payload.get("tool"), result.get("ok"), output).strip() + if event_type == "tool_approval_requested": + return str(payload.get("reason") or payload.get("tool") or "") + if event_type == "tool_call_skipped": + return str(payload.get("reason") or "") + return "" + async def _run_reflection(self, task_id: str) -> None: """Run critic reflection on completed task and record experience.""" if self.experience_recorder is None: @@ -463,6 +515,18 @@ class RuntimeLoop: if seen_action_keys is not None: seen_action_keys.add(action_key) tool_name = str(action.get("tool", "")) + if tool_name not in gateway.tools: + await self.event_store.append( + task_id, + "tool_call_skipped", + { + "index": index, + "tool": tool_name, + "reason": "unknown_tool", + "action": action, + }, + ) + continue await self.event_store.append( task_id, "tool_call_started", diff --git a/duck_core/web/static/app.js b/duck_core/web/static/app.js index d2c46e1..4f3ad08 100644 --- a/duck_core/web/static/app.js +++ b/duck_core/web/static/app.js @@ -894,19 +894,46 @@ async function sendMessage() { async function checkRuntime() { try { - await jsonFetch("/health"); + const status = await jsonFetch("/v1/status?probe=true"); + const services = status.services || {}; + const llama = services.llama || {}; + const vector = services.vector_memory || {}; + const roles = Object.keys(status.models?.roles || {}).sort(); + setStatus("#api-status", "online", "ok"); + setStatus("#model-status", serviceStatusText(llama), serviceTone(llama)); + setStatus("#vector-status", serviceStatusText(vector), serviceTone(vector)); + setStatus("#embedding-status", compactEmbeddingSource(vector.embedding_source), vector.configured ? "ok" : "warn"); + setStatus("#roles-status", roles.length ? String(roles.length) : "none", roles.length ? "ok" : "warn"); } catch { setStatus("#api-status", "offline", "bad"); + setStatus("#model-status", "unknown", "bad"); + setStatus("#vector-status", "unknown", "bad"); + setStatus("#embedding-status", "unknown", "bad"); + setStatus("#roles-status", "unknown", "bad"); } +} - try { - const roles = await jsonFetch("/v1/models/ping"); - const ok = Object.values(roles).every((item) => item.ok); - setStatus("#model-status", ok ? "online" : "degraded", ok ? "ok" : "warn"); - } catch { - setStatus("#model-status", "offline", "bad"); +function serviceStatusText(service) { + if (!service || service.probed === false) return "not probed"; + if (service.ok === true) { + return service.latency_ms !== undefined ? `ok ${Math.round(service.latency_ms)}ms` : "ok"; } + if (service.ok === false) return "failed"; + return service.configured === false ? "disabled" : "unknown"; +} + +function serviceTone(service) { + if (!service || service.ok === false) return "bad"; + if (service.ok === true) return "ok"; + return service.configured === false ? "warn" : "neutral"; +} + +function compactEmbeddingSource(source) { + if (!source) return "unknown"; + if (source.startsWith("local:")) return source.slice(6).split("/").filter(Boolean).pop() || "local"; + if (source.startsWith("remote:")) return "remote"; + return source; } function bindChat() { @@ -928,6 +955,9 @@ function bindChat() { document.querySelector("#reload-chat")?.addEventListener("click", () => { if (state.currentConversationId) selectConversation(state.currentConversationId).catch(console.error); }); + document.querySelector("#refresh-runtime")?.addEventListener("click", () => { + checkRuntime().catch(console.error); + }); document.querySelector("#activity-open")?.addEventListener("click", () => { openActivity("events"); }); @@ -1114,7 +1144,9 @@ async function renderMemoryPageResults(query) { if (!results.length) { const empty = document.createElement("p"); empty.className = "compact-empty"; - empty.textContent = "No memories found."; + empty.textContent = query.trim() + ? "No matching memories." + : "No memories yet. Add one here or let DuckLM store useful task results automatically."; container.append(empty); return; } diff --git a/duck_core/web/templates/index.html b/duck_core/web/templates/index.html index 3d02776..dfc97f8 100644 --- a/duck_core/web/templates/index.html +++ b/duck_core/web/templates/index.html @@ -4,7 +4,7 @@ DuckLM WebChat - +
@@ -53,7 +53,10 @@
-

Runtime

+
+

Runtime

+ +
API
@@ -63,6 +66,18 @@
Model
checking
+
+
Vector
+
checking
+
+
+
Embed
+
checking
+
+
+
Roles
+
checking
+
Last task
none
@@ -146,6 +161,6 @@
- + diff --git a/duck_core/web/templates/memory.html b/duck_core/web/templates/memory.html index a40a5d1..af0a208 100644 --- a/duck_core/web/templates/memory.html +++ b/duck_core/web/templates/memory.html @@ -3,7 +3,7 @@ DuckLM Memory - +
@@ -28,6 +28,6 @@
- + diff --git a/prompts/roles/recall.md b/prompts/roles/recall.md index 984180b..9f2fb98 100644 --- a/prompts/roles/recall.md +++ b/prompts/roles/recall.md @@ -3,6 +3,7 @@ You are DuckLM recall role. Given a user query and a list of memory records, ide Return ONLY valid JSON: { "relevant_ids": ["memory_id_1", "memory_id_2"], + "sufficient_to_answer": false, "reasoning": "brief explanation of why these memories were selected" } @@ -11,4 +12,6 @@ Rules: - Prefer specific memories over general ones - Include global memories if they apply to the current context - If no memories are relevant, return empty relevant_ids array +- Set sufficient_to_answer=true only when the selected memories alone are enough to answer the user directly without local tools, files, commands, repository inspection, web/API calls, or other actions +- Set sufficient_to_answer=false when the user asks to inspect, run, change, search, verify, debug, edit, or otherwise act on local/external state - Be conservative — better to include too few than too many irrelevant memories diff --git a/scripts/duck-mtp.sh b/scripts/duck-mtp.sh index 213aa48..f05ac6e 100755 --- a/scripts/duck-mtp.sh +++ b/scripts/duck-mtp.sh @@ -20,6 +20,9 @@ ENV_KEYS=( DUCK_API_PID_FILE DUCK_API_LOG_FILE DUCK_API_COMMAND + DUCK_QDRANT_MANAGED + DUCK_QDRANT_COMPOSE_FILE + DUCK_QDRANT_SERVICE ) declare -A ENV_OVERRIDES=() for key in "${ENV_KEYS[@]}"; do @@ -43,14 +46,17 @@ API_PID_FILE="${DUCK_API_PID_FILE:-${ROOT_DIR}/data/duck-api.pid}" API_LOG_FILE="${DUCK_API_LOG_FILE:-${ROOT_DIR}/data/duck-api.log}" API_URL="http://${DUCK_API_HOST:-127.0.0.1}:${DUCK_API_PORT:-8000}" LLAMA_SCRIPT="${ROOT_DIR}/scripts/llama/start_mtp_main.sh" +QDRANT_MANAGED="${DUCK_QDRANT_MANAGED:-1}" +QDRANT_COMPOSE_FILE="${DUCK_QDRANT_COMPOSE_FILE:-${ROOT_DIR}/docker-compose.memory.yml}" +QDRANT_SERVICE="${DUCK_QDRANT_SERVICE:-qdrant}" usage() { cat <<'EOF' Usage: scripts/duck-mtp.sh Commands: - start Start MTP llama-server and DuckLM API in the background - stop Stop DuckLM API and managed MTP llama-server + start Start Qdrant, MTP llama-server, and DuckLM API in the background + stop Stop DuckLM API, managed MTP llama-server, and Qdrant restart Stop and start the whole local DuckLM stack status Print process and HTTP health status Use "status --probe" to include live model/vector checks @@ -63,11 +69,77 @@ Environment: DUCK_API_PID_FILE API PID file path DUCK_API_LOG_FILE API log file path DUCK_API_COMMAND API command override, default ".venv/bin/python -m duck_core.api" + DUCK_QDRANT_MANAGED Set to 0 to skip Docker Compose Qdrant lifecycle + DUCK_QDRANT_COMPOSE_FILE + Compose file path, default docker-compose.memory.yml + DUCK_QDRANT_SERVICE Compose service name, default qdrant MTP llama-server environment is handled by scripts/llama/start_mtp_main.sh. EOF } +qdrant_enabled() { + [[ "${QDRANT_MANAGED}" != "0" && "${QDRANT_MANAGED,,}" != "false" && "${QDRANT_MANAGED,,}" != "no" ]] +} + +docker_compose() { + docker compose -f "${QDRANT_COMPOSE_FILE}" "$@" +} + +start_qdrant() { + if ! qdrant_enabled; then + echo "Qdrant management: disabled" + return 0 + fi + if ! command -v docker >/dev/null 2>&1; then + echo "Docker not found; cannot start Qdrant" >&2 + return 1 + fi + if [[ ! -f "${QDRANT_COMPOSE_FILE}" ]]; then + echo "Qdrant compose file not found: ${QDRANT_COMPOSE_FILE}" >&2 + return 1 + fi + + echo "Starting Qdrant..." + docker_compose up -d "${QDRANT_SERVICE}" +} + +stop_qdrant() { + if ! qdrant_enabled; then + echo "Qdrant management: disabled" + return 0 + fi + if ! command -v docker >/dev/null 2>&1; then + echo "Docker not found; cannot stop Qdrant" >&2 + return 1 + fi + if [[ ! -f "${QDRANT_COMPOSE_FILE}" ]]; then + echo "Qdrant compose file not found: ${QDRANT_COMPOSE_FILE}" >&2 + return 1 + fi + + echo "Stopping Qdrant..." + docker_compose down +} + +status_qdrant() { + if ! qdrant_enabled; then + echo "Qdrant status: disabled" + return 0 + fi + if ! command -v docker >/dev/null 2>&1; then + echo "Qdrant status: Docker not found" + return 1 + fi + if [[ ! -f "${QDRANT_COMPOSE_FILE}" ]]; then + echo "Qdrant status: compose file not found (${QDRANT_COMPOSE_FILE})" + return 1 + fi + + echo "Qdrant status:" + docker_compose ps "${QDRANT_SERVICE}" +} + api_is_running() { [[ -f "${API_PID_FILE}" ]] || return 1 local pid @@ -157,6 +229,7 @@ stop_api() { } start_stack() { + start_qdrant "${LLAMA_SCRIPT}" start start_api echo @@ -167,6 +240,7 @@ start_stack() { stop_stack() { stop_api "${LLAMA_SCRIPT}" stop + stop_qdrant } status_stack() { @@ -210,6 +284,11 @@ status_stack() { if [[ "${llama_rc}" != "0" && "${rc}" == "0" ]]; then rc="${llama_rc}" fi + local qdrant_rc=0 + status_qdrant || qdrant_rc=$? + if [[ "${qdrant_rc}" != "0" && "${rc}" == "0" ]]; then + rc="${qdrant_rc}" + fi return "${rc}" } @@ -296,6 +375,13 @@ logs_stack() { echo echo "==> llama-server log: ${DUCK_LLAMA_LOG_FILE:-${ROOT_DIR}/data/llama-mtp.log} <==" "${LLAMA_SCRIPT}" logs --lines "${lines}" + echo + echo "==> Qdrant log <==" + if qdrant_enabled && command -v docker >/dev/null 2>&1 && [[ -f "${QDRANT_COMPOSE_FILE}" ]]; then + docker_compose logs --tail "${lines}" "${QDRANT_SERVICE}" + else + status_qdrant || true + fi fi } diff --git a/scripts/duck.sh b/scripts/duck.sh index 40f076f..f941c58 100755 --- a/scripts/duck.sh +++ b/scripts/duck.sh @@ -20,6 +20,9 @@ ENV_KEYS=( DUCK_API_PID_FILE DUCK_API_LOG_FILE DUCK_API_COMMAND + DUCK_QDRANT_MANAGED + DUCK_QDRANT_COMPOSE_FILE + DUCK_QDRANT_SERVICE ) declare -A ENV_OVERRIDES=() for key in "${ENV_KEYS[@]}"; do @@ -43,14 +46,17 @@ API_PID_FILE="${DUCK_API_PID_FILE:-${ROOT_DIR}/data/duck-api.pid}" API_LOG_FILE="${DUCK_API_LOG_FILE:-${ROOT_DIR}/data/duck-api.log}" API_URL="http://${DUCK_API_HOST:-127.0.0.1}:${DUCK_API_PORT:-8000}" LLAMA_SCRIPT="${ROOT_DIR}/scripts/llama/start_main.sh" +QDRANT_MANAGED="${DUCK_QDRANT_MANAGED:-1}" +QDRANT_COMPOSE_FILE="${DUCK_QDRANT_COMPOSE_FILE:-${ROOT_DIR}/docker-compose.memory.yml}" +QDRANT_SERVICE="${DUCK_QDRANT_SERVICE:-qdrant}" usage() { cat <<'EOF' Usage: scripts/duck.sh Commands: - start Start llama-server and DuckLM API in the background - stop Stop DuckLM API and managed llama-server + start Start Qdrant, llama-server, and DuckLM API in the background + stop Stop DuckLM API, managed llama-server, and Qdrant restart Stop and start the whole local DuckLM stack status Print process and HTTP health status Use "status --probe" to include live model/vector checks @@ -63,11 +69,77 @@ Environment: DUCK_API_PID_FILE API PID file path DUCK_API_LOG_FILE API log file path DUCK_API_COMMAND API command override, default ".venv/bin/python -m duck_core.api" + DUCK_QDRANT_MANAGED Set to 0 to skip Docker Compose Qdrant lifecycle + DUCK_QDRANT_COMPOSE_FILE + Compose file path, default docker-compose.memory.yml + DUCK_QDRANT_SERVICE Compose service name, default qdrant llama-server environment is handled by scripts/llama/start_main.sh. EOF } +qdrant_enabled() { + [[ "${QDRANT_MANAGED}" != "0" && "${QDRANT_MANAGED,,}" != "false" && "${QDRANT_MANAGED,,}" != "no" ]] +} + +docker_compose() { + docker compose -f "${QDRANT_COMPOSE_FILE}" "$@" +} + +start_qdrant() { + if ! qdrant_enabled; then + echo "Qdrant management: disabled" + return 0 + fi + if ! command -v docker >/dev/null 2>&1; then + echo "Docker not found; cannot start Qdrant" >&2 + return 1 + fi + if [[ ! -f "${QDRANT_COMPOSE_FILE}" ]]; then + echo "Qdrant compose file not found: ${QDRANT_COMPOSE_FILE}" >&2 + return 1 + fi + + echo "Starting Qdrant..." + docker_compose up -d "${QDRANT_SERVICE}" +} + +stop_qdrant() { + if ! qdrant_enabled; then + echo "Qdrant management: disabled" + return 0 + fi + if ! command -v docker >/dev/null 2>&1; then + echo "Docker not found; cannot stop Qdrant" >&2 + return 1 + fi + if [[ ! -f "${QDRANT_COMPOSE_FILE}" ]]; then + echo "Qdrant compose file not found: ${QDRANT_COMPOSE_FILE}" >&2 + return 1 + fi + + echo "Stopping Qdrant..." + docker_compose down +} + +status_qdrant() { + if ! qdrant_enabled; then + echo "Qdrant status: disabled" + return 0 + fi + if ! command -v docker >/dev/null 2>&1; then + echo "Qdrant status: Docker not found" + return 1 + fi + if [[ ! -f "${QDRANT_COMPOSE_FILE}" ]]; then + echo "Qdrant status: compose file not found (${QDRANT_COMPOSE_FILE})" + return 1 + fi + + echo "Qdrant status:" + docker_compose ps "${QDRANT_SERVICE}" +} + api_is_running() { [[ -f "${API_PID_FILE}" ]] || return 1 local pid @@ -157,6 +229,7 @@ stop_api() { } start_stack() { + start_qdrant "${LLAMA_SCRIPT}" start start_api echo @@ -167,6 +240,7 @@ start_stack() { stop_stack() { stop_api "${LLAMA_SCRIPT}" stop + stop_qdrant } status_stack() { @@ -210,6 +284,11 @@ status_stack() { if [[ "${llama_rc}" != "0" && "${rc}" == "0" ]]; then rc="${llama_rc}" fi + local qdrant_rc=0 + status_qdrant || qdrant_rc=$? + if [[ "${qdrant_rc}" != "0" && "${rc}" == "0" ]]; then + rc="${qdrant_rc}" + fi return "${rc}" } @@ -296,6 +375,13 @@ logs_stack() { echo echo "==> llama-server log: ${DUCK_LLAMA_LOG_FILE:-${ROOT_DIR}/data/llama-main.log} <==" "${LLAMA_SCRIPT}" logs --lines "${lines}" + echo + echo "==> Qdrant log <==" + if qdrant_enabled && command -v docker >/dev/null 2>&1 && [[ -f "${QDRANT_COMPOSE_FILE}" ]]; then + docker_compose logs --tail "${lines}" "${QDRANT_SERVICE}" + else + status_qdrant || true + fi fi } diff --git a/tests/smoke/test_api_stream_chat.py b/tests/smoke/test_api_stream_chat.py index 833d58d..71a07f9 100644 --- a/tests/smoke/test_api_stream_chat.py +++ b/tests/smoke/test_api_stream_chat.py @@ -111,6 +111,77 @@ def test_stream_chat_forwards_reasoning_toggle_to_thinker(tmp_path, monkeypatch) assert captured == {"role": "thinker", "reasoning": "off"} +def test_stream_chat_skips_action_role_for_direct_memory_question(tmp_path, monkeypatch): + monkeypatch.setenv("DUCK_DB_PATH", str(tmp_path / "duck.sqlite3")) + calls = [] + + async def fake_chat(self, role, messages, temperature=None, max_output_tokens=None, response_format=None): + calls.append(role) + if role == "action": + raise AssertionError("direct memory question should skip action role") + if role == "recall": + content = { + "relevant_ids": ["0"], + "sufficient_to_answer": True, + "reasoning": "name memory is relevant and sufficient", + } + elif role == "memory_policy": + content = { + "should_store": False, + "memory_type": "note", + "summary": "Recall answer, no new memory.", + "importance": 0.1, + "scope": "conversation", + "metadata": {}, + } + else: + content = { + "kind": "action_directive", + "intent": "unused", + "risk_level": "none", + "actions": [], + } + return ModelResponse( + role=role, + model="local-main", + content=json.dumps(content), + reasoning_content=None, + raw={}, + latency_ms=1.0, + ) + + async def fake_stream_chat(self, role, messages, **kwargs): + assert role == "thinker" + yield {"type": "content_delta", "delta": "Вас зовут Владимир."} + + monkeypatch.setattr("duck_core.model_client.ModelClient.chat", fake_chat) + monkeypatch.setattr("duck_core.model_client.ModelClient.stream_chat", fake_stream_chat) + with TestClient(create_app()) as client: + client.post( + "/v1/memory", + json={ + "text": "Пользователя зовут Владимир.", + "workspace": "./workspace", + "scope": "global", + "memory_type": "fact", + "importance": 0.8, + }, + ) + with client.stream( + "POST", + "/v1/chat/stream", + json={"message": "Как меня зовут? Ответь коротко.", "workspace": "./workspace"}, + ) as response: + body = "".join(response.iter_text()) + task_id = re.search(r'"task_id"\s*:\s*"([^"]+)"', body).group(1) + events = client.get(f"/v1/tasks/{task_id}/events").json() + + assert response.status_code == 200 + assert "Вас зовут Владимир." in body + assert "action" not in calls + assert any(event["event_type"] == "action_loop_skipped" for event in events) + + def test_stream_chat_runs_memory_policy_and_reflection_after_completion(tmp_path, monkeypatch): monkeypatch.setenv("DUCK_DB_PATH", str(tmp_path / "duck.sqlite3")) monkeypatch.setenv("DUCK_ENABLE_REFLECTION", "1") diff --git a/tests/smoke/test_context_builder.py b/tests/smoke/test_context_builder.py index 183ed81..c6ada51 100644 --- a/tests/smoke/test_context_builder.py +++ b/tests/smoke/test_context_builder.py @@ -156,7 +156,7 @@ async def test_context_builder_recall_awaits_model_client(): return_value=ModelResponse( role="recall", model="local-main", - content='{"relevant_ids":["mem_1"],"reasoning":"matches query"}', + content='{"relevant_ids":["mem_1"],"sufficient_to_answer":true,"reasoning":"matches query"}', reasoning_content=None, raw={}, latency_ms=1.0, @@ -174,6 +174,32 @@ async def test_context_builder_recall_awaits_model_client(): model_client.chat.assert_awaited_once() +@pytest.mark.asyncio +async def test_context_builder_recall_returns_sufficiency_decision(): + model_client = AsyncMock() + model_client.chat = AsyncMock( + return_value=ModelResponse( + role="recall", + model="local-main", + content='{"relevant_ids":["mem_1"],"sufficient_to_answer":true,"reasoning":"memory answers directly"}', + reasoning_content=None, + raw={}, + latency_ms=1.0, + ) + ) + builder = ContextBuilder(model_client=model_client) + records = [ + {"memory_id": "mem_1", "text": "User name is Vladimir."}, + {"memory_id": "mem_2", "text": "Unrelated."}, + ] + + decision = await builder.recall_relevant_memory_decision("What is my name?", records) + + assert decision.records == [records[0]] + assert decision.sufficient_to_answer is True + assert decision.reasoning == "memory answers directly" + + @pytest.mark.asyncio async def test_context_builder_summary_awaits_model_client(): model_client = AsyncMock() diff --git a/tests/smoke/test_duck_service_script.py b/tests/smoke/test_duck_service_script.py index 6bcfca8..d5f838a 100644 --- a/tests/smoke/test_duck_service_script.py +++ b/tests/smoke/test_duck_service_script.py @@ -50,6 +50,7 @@ def test_duck_script_manages_llama_and_api(tmp_path): "DUCK_API_LOG_FILE": str(api_log_file), "DUCK_API_COMMAND": str(fake_api), "DUCK_API_PORT": "18000", + "DUCK_QDRANT_MANAGED": "0", } script = "scripts/duck.sh" @@ -142,6 +143,7 @@ def test_duck_mtp_script_starts_mtp_llama_for_duck_api(tmp_path): "DUCK_API_LOG_FILE": str(api_log_file), "DUCK_API_COMMAND": str(fake_api), "DUCK_API_PORT": "18000", + "DUCK_QDRANT_MANAGED": "0", } script = "scripts/duck-mtp.sh" @@ -225,6 +227,7 @@ def test_duck_mtp_script_sets_llama_bin_dir_library_path_for_help_check(tmp_path "DUCK_API_LOG_FILE": str(tmp_path / "duck-api-mtp.log"), "DUCK_API_COMMAND": str(fake_api), "DUCK_API_PORT": "18000", + "DUCK_QDRANT_MANAGED": "0", } started = subprocess.run( @@ -244,3 +247,97 @@ def test_duck_mtp_script_sets_llama_bin_dir_library_path_for_help_check(tmp_path assert "--model-draft" not in logs.stdout finally: subprocess.run(["scripts/duck-mtp.sh", "stop"], env=env, text=True, capture_output=True) + + +def test_duck_script_manages_qdrant_compose_service(tmp_path): + docker_log = tmp_path / "docker.log" + fake_docker = tmp_path / "docker" + fake_docker.write_text( + textwrap.dedent( + f"""\ + #!/usr/bin/env bash + echo "$*" >> "{docker_log}" + if [[ "$*" == *"compose"* && "$*" == *"ps qdrant"* ]]; then + echo "ducklm-qdrant-1 running" + fi + if [[ "$*" == *"compose"* && "$*" == *"logs"* ]]; then + echo "fake qdrant log" + fi + exit 0 + """ + ) + ) + fake_docker.chmod(0o755) + + fake_llama = tmp_path / "llama-server" + fake_llama.write_text( + textwrap.dedent( + """\ + #!/usr/bin/env bash + echo "fake llama-server $*" >&2 + trap 'exit 0' TERM INT + while true; do sleep 1; done + """ + ) + ) + fake_llama.chmod(0o755) + + fake_api = tmp_path / "fake-api.sh" + fake_api.write_text( + textwrap.dedent( + """\ + #!/usr/bin/env bash + trap 'exit 0' TERM INT + while true; do sleep 1; done + """ + ) + ) + fake_api.chmod(0o755) + + api_pid_file = tmp_path / "duck-api.pid" + env = { + **os.environ, + "PATH": f"{tmp_path}:{os.environ['PATH']}", + "DUCK_LLAMA_SERVER_BIN": str(fake_llama), + "DUCK_MAIN_MODEL_PATH": str(tmp_path / "model.gguf"), + "DUCK_LLAMA_PID_FILE": str(tmp_path / "llama.pid"), + "DUCK_LLAMA_LOG_FILE": str(tmp_path / "llama.log"), + "DUCK_MAIN_PORT": "18081", + "DUCK_API_PID_FILE": str(api_pid_file), + "DUCK_API_LOG_FILE": str(tmp_path / "duck-api.log"), + "DUCK_API_COMMAND": str(fake_api), + "DUCK_API_PORT": "18000", + } + Path(env["DUCK_MAIN_MODEL_PATH"]).write_text("fake") + + started = subprocess.run( + ["scripts/duck.sh", "start"], env=env, text=True, capture_output=True + ) + assert started.returncode == 0 + assert "Starting Qdrant" in started.stdout + assert "Qdrant status:" in started.stdout + + running = subprocess.run( + ["scripts/duck.sh", "status"], env=env, text=True, capture_output=True + ) + assert running.returncode == 0 + assert "Qdrant status:" in running.stdout + assert "ducklm-qdrant-1 running" in running.stdout + + logs = subprocess.run( + ["scripts/duck.sh", "logs", "--lines", "20"], env=env, text=True, capture_output=True + ) + assert logs.returncode == 0 + assert "Qdrant log" in logs.stdout + assert "fake qdrant log" in logs.stdout + + stopped = subprocess.run(["scripts/duck.sh", "stop"], env=env, text=True, capture_output=True) + assert stopped.returncode == 0 + assert "Stopping Qdrant" in stopped.stdout + + calls = docker_log.read_text() + assert "compose -f" in calls + assert "docker-compose.memory.yml up -d qdrant" in calls + assert "docker-compose.memory.yml ps qdrant" in calls + assert "docker-compose.memory.yml logs --tail 20 qdrant" in calls + assert "docker-compose.memory.yml down" in calls diff --git a/tests/smoke/test_memory_policy.py b/tests/smoke/test_memory_policy.py index 46ec2c5..51f8d33 100644 --- a/tests/smoke/test_memory_policy.py +++ b/tests/smoke/test_memory_policy.py @@ -4,7 +4,11 @@ from unittest.mock import AsyncMock import pytest from duck_core.memory.policy import MemoryPolicy +from duck_core.memory.store import MemoryStore from duck_core.model_client import ModelClient, ModelResponse +from duck_core.events.store import EventStore +from duck_core.runtime_loop import RuntimeLoop +from duck_core.tasks.store import TaskStore @pytest.fixture @@ -145,3 +149,81 @@ async def test_llm_memory_policy_schema_violation_falls_back(mock_model_client): assert decision.should_store is False assert decision.metadata["source"] == "llm_policy_fallback" assert "schema violation" in decision.metadata["error"] + + +class FakeRuntimeMemoryModelClient: + def __init__(self): + self.memory_policy_prompt = "" + + async def chat(self, role, messages, **kwargs): + if role == "action": + return ModelResponse( + role=role, + model="local-main", + content=json.dumps({ + "kind": "action_directive", + "intent": "direct answer", + "risk_level": "none", + "actions": [], + }), + reasoning_content=None, + raw={}, + latency_ms=1.0, + ) + if role == "thinker": + return ModelResponse( + role=role, + model="local-main", + content="Приятно познакомиться. Чем помочь?", + reasoning_content=None, + raw={}, + latency_ms=1.0, + ) + if role == "memory_policy": + self.memory_policy_prompt = messages[0]["content"] + should_store = "Меня зовут Владимир" in self.memory_policy_prompt + return ModelResponse( + role=role, + model="local-main", + content=json.dumps({ + "should_store": should_store, + "memory_type": "fact", + "summary": "User's name is Vladimir.", + "importance": 0.7, + "scope": "global", + "metadata": {}, + }), + reasoning_content=None, + raw={}, + latency_ms=1.0, + ) + raise AssertionError(f"unexpected role: {role}") + + +@pytest.mark.asyncio +async def test_runtime_memory_policy_sees_user_message_not_only_final_answer(tmp_path): + db_path = str(tmp_path / "duck.sqlite3") + task_store = TaskStore(db_path) + event_store = EventStore(db_path) + memory_store = MemoryStore(db_path) + model_client = FakeRuntimeMemoryModelClient() + loop = RuntimeLoop( + task_store, + event_store, + model_client, + memory_store=memory_store, + ) + + result = await loop.run_chat( + "Меня зовут Владимир. Я работаю системным администратором.", + str(tmp_path), + debug=True, + reflect=False, + ) + memories = await memory_store.list(workspace=str(tmp_path)) + + assert result.status == "completed" + assert "User message:" in model_client.memory_policy_prompt + assert "Меня зовут Владимир" in model_client.memory_policy_prompt + assert "Assistant final response:" in model_client.memory_policy_prompt + assert memories[0].text == "User's name is Vladimir." diff --git a/tests/smoke/test_runtime_tools.py b/tests/smoke/test_runtime_tools.py index 06df179..c204119 100644 --- a/tests/smoke/test_runtime_tools.py +++ b/tests/smoke/test_runtime_tools.py @@ -211,6 +211,61 @@ class FakeRepeatingActionModelClient: ) +class FakeUnknownToolActionModelClient: + async def chat(self, role, messages): + if role == "action": + return ModelResponse( + role=role, + model="local-main", + content=json.dumps( + { + "kind": "action_directive", + "intent": "answer from context", + "risk_level": "low", + "actions": [ + { + "tool": "answer", + "args": {"text": "This is not a real tool."}, + "reason": "Model attempted to answer as a tool", + } + ], + } + ), + reasoning_content=None, + raw={}, + latency_ms=5.0, + ) + assert role == "thinker" + return ModelResponse( + role=role, + model="local-main", + content="Answered normally without unknown tool execution.", + reasoning_content=None, + raw={}, + latency_ms=12.0, + ) + + +class FakeDirectMemoryQuestionModelClient: + def __init__(self): + self.roles = [] + + async def chat(self, role, messages, **kwargs): + self.roles.append(role) + if role == "action": + raise AssertionError("direct memory question should skip action role") + assert role == "thinker" + assert any("Known memory" in message["content"] for message in messages) + return ModelResponse( + role=role, + model="local-main", + content="Вас зовут Владимир.", + reasoning_content=None, + raw={}, + latency_ms=12.0, + ) + + @pytest.mark.asyncio async def test_runtime_executes_action_directive_tool_and_finishes_with_observation(tmp_path): (tmp_path / "note.txt").write_text("hello from tool") @@ -330,6 +385,50 @@ async def test_runtime_skips_duplicate_action_within_same_task(tmp_path): assert skipped_tools[0].payload["reason"] == "duplicate_action" +@pytest.mark.asyncio +async def test_runtime_skips_unknown_action_tools_before_gateway(tmp_path): + db_path = str(tmp_path / "duck.sqlite3") + task_store = TaskStore(db_path) + event_store = EventStore(db_path) + loop = RuntimeLoop(task_store, event_store, FakeUnknownToolActionModelClient()) + + result = await loop.run_chat("answer from known context", str(tmp_path), debug=True) + events = await event_store.list_events(result.task_id) + skipped_tools = [event for event in events if event.event_type == "tool_call_skipped"] + + assert result.status == "completed" + assert result.final_response == "Answered normally without unknown tool execution." + assert len(skipped_tools) == 1 + assert skipped_tools[0].payload["reason"] == "unknown_tool" + assert skipped_tools[0].payload["tool"] == "answer" + assert not any(event.event_type == "tool_call_started" for event in events) + + +@pytest.mark.asyncio +async def test_runtime_skips_action_loop_for_direct_memory_question(tmp_path): + db_path = str(tmp_path / "duck.sqlite3") + task_store = TaskStore(db_path) + event_store = EventStore(db_path) + model_client = FakeDirectMemoryQuestionModelClient() + loop = RuntimeLoop(task_store, event_store, model_client) + + result = await loop.run_chat( + "Как меня зовут? Ответь коротко.", + str(tmp_path), + debug=True, + memory_records=[{"text": "Known memory: user's name is Vladimir."}], + skip_action_loop=True, + reflect=False, + ) + events = await event_store.list_events(result.task_id) + + assert result.status == "completed" + assert result.final_response == "Вас зовут Владимир." + assert model_client.roles == ["thinker"] + assert any(event.event_type == "action_loop_skipped" for event in events) + assert not any(event.event_type == "model_call_started" and event.payload["role"] == "action" for event in events) + + class FakeApprovalModelClient: async def chat(self, role, messages): if role == "action": diff --git a/tests/smoke/test_vector_memory_live.py b/tests/smoke/test_vector_memory_live.py new file mode 100644 index 0000000..09006c1 --- /dev/null +++ b/tests/smoke/test_vector_memory_live.py @@ -0,0 +1,39 @@ +import os +from uuid import uuid4 + +import httpx +import pytest + +from duck_core.memory.vector_memory import VectorMemory + + +async def qdrant_available(url: str) -> bool: + try: + async with httpx.AsyncClient(timeout=2.0, trust_env=False) as client: + response = await client.get(url) + response.raise_for_status() + return True + except httpx.HTTPError: + return False + + +@pytest.mark.asyncio +async def test_vector_memory_live_qdrant_write_and_search(): + qdrant_url = os.environ.get("DUCK_QDRANT_URL", "http://127.0.0.1:6333") + if not await qdrant_available(qdrant_url): + pytest.skip(f"Qdrant is not running at {qdrant_url}") + + embedding_model = os.environ.get("DUCK_LOCAL_EMBEDDING_MODEL", "./models/all-MiniLM-L6-v2") + collection = f"duck_memory_smoke_{uuid4().hex[:12]}" + memory = VectorMemory( + qdrant_url=qdrant_url, + collection_name=collection, + local_embedding_model=embedding_model, + ) + marker = f"DuckLM vector memory live smoke {uuid4().hex}" + + point_id = await memory.add_memory(marker, {"kind": "live_smoke"}) + results = await memory.search_memory(marker, limit=3) + + assert point_id + assert any(item.get("payload", {}).get("text") == marker for item in results)