Add DuckLM service scripts and utility model benchmark

2026-05-22 07:27:59 +08:00 · 2026-05-22 07:27:59 +08:00 · ff98224eb6
parent e6b82f0376
commit ff98224eb6
24 changed files with 2492 additions and 74 deletions
--- a/CURRENT_STATE.md
+++ b/CURRENT_STATE.md
@@ -106,8 +106,7 @@ git diff --check
 ```bash
 . .venv/bin/activate
-bash scripts/llama/start_main.sh start
+bash scripts/duck.sh start
 python -m duck_core.api
 ```
 Открыть WebChat:
@@ -123,6 +122,24 @@ curl --noproxy '*' http://127.0.0.1:8000/health
 curl --noproxy '*' http://127.0.0.1:8000/v1/models/roles
 ```
 Управление процессами:
 ```bash
 bash scripts/duck.sh status
 bash scripts/duck.sh logs --follow
 bash scripts/duck.sh restart
 bash scripts/duck.sh stop
 ```
 MTP/speculative-вариант:
 ```bash
 bash scripts/duck.sh stop
 bash scripts/duck-mtp.sh start
 bash scripts/duck-mtp.sh status
 bash scripts/duck-mtp.sh logs --follow
 ```
 ## Что делать следующим
 1. Пройти live E2E checklist в WebChat на реальной модели.
--- a/Makefile
+++ b/Makefile
@@ -1,8 +1,32 @@
 duck-up:
-	docker compose -f docker-compose.memory.yml up -d
+	bash scripts/duck.sh start
-	@echo "Memory services started."
+
-	@echo "Start llama-server:"
+duck-stop:
-	@echo "bash scripts/llama/start_main.sh start"
+	bash scripts/duck.sh stop
 duck-restart:
 	bash scripts/duck.sh restart
 duck-status:
 	bash scripts/duck.sh status
 duck-logs:
 	bash scripts/duck.sh logs --follow
 duck-mtp-up:
 	bash scripts/duck-mtp.sh start
 duck-mtp-stop:
 	bash scripts/duck-mtp.sh stop
 duck-mtp-restart:
 	bash scripts/duck-mtp.sh restart
 duck-mtp-status:
 	bash scripts/duck-mtp.sh status
 duck-mtp-logs:
 	bash scripts/duck-mtp.sh logs --follow
 duck-llama-main:
 	bash scripts/llama/start_main.sh start
@@ -26,11 +50,7 @@ duck-api:
 	python3 -m duck_core.api
 duck-dev:
-	docker compose -f docker-compose.memory.yml up -d
+	bash scripts/duck.sh start
 	@echo "Start llama-server in another terminal:"
 	@echo "bash scripts/llama/start_main.sh start"
 	@echo "Then run:"
 	@echo "make duck-api"
 	@echo "Open:"
 	@echo "http://127.0.0.1:8000/"
--- a/README.md
+++ b/README.md
@@ -9,14 +9,25 @@ python3 -m venv .venv
 . .venv/bin/activate
 python -m pip install -e ".[dev]"
 cp .env.example .env
-bash scripts/llama/start_main.sh
+bash scripts/duck.sh start
 ```
 In another terminal:
 ```bash
 . .venv/bin/activate
 python -m duck_core.api
 ```
 Open `http://127.0.0.1:8000/`.
 Useful commands:
 ```bash
 bash scripts/duck.sh status
 bash scripts/duck.sh logs --follow
 bash scripts/duck.sh restart
 bash scripts/duck.sh stop
 ```
 MTP/speculative variant:
 ```bash
 bash scripts/duck-mtp.sh start
 bash scripts/duck-mtp.sh status
 bash scripts/duck-mtp.sh logs --follow
 bash scripts/duck-mtp.sh stop
 ```
--- a/docs/bench/utility_model_bench_20260522_044407.json
+++ b/docs/bench/utility_model_bench_20260522_044407.json
@@ -0,0 +1,80 @@
 [
  {
    "model": "Qwen3.6-35B nonMTP GPU baseline",
    "quality": 0.971,
    "avg_latency_seconds": 17.935,
    "avg_tokens_per_second": 4.51,
    "cases": [
      {
        "role": "action",
        "case": "direct_answer_no_tools",
        "score": 1.0,
        "note": "ok",
        "elapsed_seconds": 15.321,
        "completion_tokens": 45,
        "tokens_per_second": 2.94,
        "content_preview": "{\n  \"kind\": \"action_directive\",\n  \"intent\": \"direct_answer\",\n  \"risk_level\": \"low\",\n  \"actions\": []\n}\n"
      },
      {
        "role": "action",
        "case": "read_specific_file",
        "score": 1.0,
        "note": "ok",
        "elapsed_seconds": 19.638,
        "completion_tokens": 81,
        "tokens_per_second": 4.12,
        "content_preview": "{\n  \"kind\": \"action_directive\",\n  \"intent\": \"read_file\",\n  \"risk_level\": \"low\",\n  \"actions\": [\n    {\n      \"tool\": \"file_read\",\n      \"args\": {\n        \"path\": \"CURRENT_STATE.md\"\n      }\n    }\n  ]\n}\n"
      },
      {
        "role": "memory_policy",
        "case": "store_user_preference",
        "score": 1.0,
        "note": "ok",
        "elapsed_seconds": 18.421,
        "completion_tokens": 88,
        "tokens_per_second": 4.78,
        "content_preview": "{\n  \"should_store\": true,\n  \"memory_type\": \"preference\",\n  \"summary\": \"User prefers responses in Russian and requires explicit confirmation before executing sudo commands.\",\n  \"importance\": 0.9,\n  \"scope\": \"global\",\n  \"metadata\": {\n    \"language\": \"ru\",\n    \"security_policy\": \"sudo_confirmation_required\"\n  }\n}"
      },
      {
        "role": "memory_policy",
        "case": "ignore_trivial_tool_call",
        "score": 1.0,
        "note": "ok",
        "elapsed_seconds": 14.978,
        "completion_tokens": 61,
        "tokens_per_second": 4.07,
        "content_preview": "{\n  \"should_store\": false,\n  \"memory_type\": \"note\",\n  \"summary\": \"Routine execution of pwd command returning /tmp/project.\",\n  \"importance\": 0.1,\n  \"scope\": \"conversation\",\n  \"metadata\": {}\n}"
      },
      {
        "role": "recall",
        "case": "select_relevant_memory",
        "score": 1.0,
        "note": "ok",
        "elapsed_seconds": 15.038,
        "completion_tokens": 66,
        "tokens_per_second": 4.39,
        "content_preview": "{\n  \"relevant_ids\": [\n    \"m1\"\n  ],\n  \"reasoning\": \"Memory m1 directly addresses the user's preference regarding the execution of sudo commands, which is the core of the query. Memories m2 and m3 are unrelated to sudo or command execution preferences.\"\n}"
      },
      {
        "role": "summary",
        "case": "preserve_decisions",
        "score": 1.0,
        "note": "ok",
        "elapsed_seconds": 9.991,
        "completion_tokens": 44,
        "tokens_per_second": 4.4,
        "content_preview": "1. WebChat работает на 8000, llama-server на 8081.\n2. Для внешних путей нужен approval.\n3. allow_forever хранится по normalized action hash."
      },
      {
        "role": "critic",
        "case": "reflection_quality",
        "score": 0.8,
        "note": "missing=['lesson']",
        "elapsed_seconds": 32.16,
        "completion_tokens": 220,
        "tokens_per_second": 6.84,
        "content_preview": "**Critic Reflection: WebChat API Incident**\n\n**1. Risk Assessment**\n\n*   **Operational Fragility (High):** The incident revealed a critical dependency on manual intervention. If the API crashes or the server reboots, WebChat will silently fail or return errors until an engineer notices and manually restarts the service. This creates a \"single point of failure\" in the operational process.\n*   **Meaning Time to Recovery (MTTR) Variance:** Recovery time is currently dependent on human availability "
      }
    ]
  }
 ]
--- a/docs/bench/utility_model_bench_20260522_044407.md
+++ b/docs/bench/utility_model_bench_20260522_044407.md
@@ -0,0 +1,21 @@
 # Utility Role Model Benchmark
 Scope: service roles only (`action`, `memory_policy`, `recall`, `summary`, `critic`).
 The main user-facing thinker is not evaluated for replacement here.
 | Model | Quality | Avg latency, s | Avg tok/s | Notes |
 | --- | ---: | ---: | ---: | --- |
 | Qwen3.6-35B nonMTP GPU baseline | 0.97 | 17.93 | 4.51 | critic/reflection_quality: missing=['lesson'] |
 ## Case Details
 ### Qwen3.6-35B nonMTP GPU baseline
 | Role | Case | Score | Latency, s | tok/s | Note |
 | --- | --- | ---: | ---: | ---: | --- |
 | action | direct_answer_no_tools | 1.00 | 15.32 | 2.94 | ok |
 | action | read_specific_file | 1.00 | 19.64 | 4.12 | ok |
 | memory_policy | store_user_preference | 1.00 | 18.42 | 4.78 | ok |
 | memory_policy | ignore_trivial_tool_call | 1.00 | 14.98 | 4.07 | ok |
 | recall | select_relevant_memory | 1.00 | 15.04 | 4.39 | ok |
 | summary | preserve_decisions | 1.00 | 9.99 | 4.40 | ok |
 | critic | reflection_quality | 0.80 | 32.16 | 6.84 | missing=['lesson'] |
--- a/docs/bench/utility_model_bench_20260522_050427.json
+++ b/docs/bench/utility_model_bench_20260522_050427.json
@@ -0,0 +1,392 @@
 [
  {
    "model": "Qwen3.6-35B nonMTP GPU baseline",
    "quality": 0.971,
    "avg_latency_seconds": 17.94,
    "avg_tokens_per_second": 4.51,
    "cases": [
      {
        "role": "action",
        "case": "direct_answer_no_tools",
        "score": 1.0,
        "note": "ok",
        "elapsed_seconds": 15.307,
        "completion_tokens": 45,
        "tokens_per_second": 2.94,
        "content_preview": "{\n  \"kind\": \"action_directive\",\n  \"intent\": \"direct_answer\",\n  \"risk_level\": \"low\",\n  \"actions\": []\n}\n"
      },
      {
        "role": "action",
        "case": "read_specific_file",
        "score": 1.0,
        "note": "ok",
        "elapsed_seconds": 19.612,
        "completion_tokens": 81,
        "tokens_per_second": 4.13,
        "content_preview": "{\n  \"kind\": \"action_directive\",\n  \"intent\": \"read_file\",\n  \"risk_level\": \"low\",\n  \"actions\": [\n    {\n      \"tool\": \"file_read\",\n      \"args\": {\n        \"path\": \"CURRENT_STATE.md\"\n      }\n    }\n  ]\n}\n"
      },
      {
        "role": "memory_policy",
        "case": "store_user_preference",
        "score": 1.0,
        "note": "ok",
        "elapsed_seconds": 18.533,
        "completion_tokens": 88,
        "tokens_per_second": 4.75,
        "content_preview": "{\n  \"should_store\": true,\n  \"memory_type\": \"preference\",\n  \"summary\": \"User prefers responses in Russian and requires explicit confirmation before executing sudo commands.\",\n  \"importance\": 0.9,\n  \"scope\": \"global\",\n  \"metadata\": {\n    \"language\": \"ru\",\n    \"security_policy\": \"sudo_confirmation_required\"\n  }\n}"
      },
      {
        "role": "memory_policy",
        "case": "ignore_trivial_tool_call",
        "score": 1.0,
        "note": "ok",
        "elapsed_seconds": 15.0,
        "completion_tokens": 61,
        "tokens_per_second": 4.07,
        "content_preview": "{\n  \"should_store\": false,\n  \"memory_type\": \"note\",\n  \"summary\": \"Routine execution of pwd command returning /tmp/project.\",\n  \"importance\": 0.1,\n  \"scope\": \"conversation\",\n  \"metadata\": {}\n}"
      },
      {
        "role": "recall",
        "case": "select_relevant_memory",
        "score": 1.0,
        "note": "ok",
        "elapsed_seconds": 15.085,
        "completion_tokens": 66,
        "tokens_per_second": 4.38,
        "content_preview": "{\n  \"relevant_ids\": [\n    \"m1\"\n  ],\n  \"reasoning\": \"Memory m1 directly addresses the user's preference regarding the execution of sudo commands, which is the core of the query. Memories m2 and m3 are unrelated to sudo or command execution preferences.\"\n}"
      },
      {
        "role": "summary",
        "case": "preserve_decisions",
        "score": 1.0,
        "note": "ok",
        "elapsed_seconds": 9.954,
        "completion_tokens": 44,
        "tokens_per_second": 4.42,
        "content_preview": "1. WebChat работает на 8000, llama-server на 8081.\n2. Для внешних путей нужен approval.\n3. allow_forever хранится по normalized action hash."
      },
      {
        "role": "critic",
        "case": "reflection_quality",
        "score": 0.8,
        "note": "missing=['lesson']",
        "elapsed_seconds": 32.088,
        "completion_tokens": 220,
        "tokens_per_second": 6.86,
        "content_preview": "**Critic Reflection: WebChat API Incident**\n\n**1. Risk Assessment**\n\n*   **Operational Fragility (High):** The incident revealed a critical dependency on manual intervention. If the API crashes or the server reboots, WebChat will silently fail or return errors until an engineer notices and manually restarts the service. This creates a \"single point of failure\" in the operational process.\n*   **Meaning Time to Recovery (MTTR) Variance:** Recovery time is currently dependent on human availability "
      }
    ]
  },
  {
    "model": "Menlo_Lucy-Q4_K_M CPU",
    "quality": 0.771,
    "avg_latency_seconds": 4.406,
    "avg_tokens_per_second": 16.21,
    "cases": [
      {
        "role": "action",
        "case": "direct_answer_no_tools",
        "score": 1.0,
        "note": "ok",
        "elapsed_seconds": 3.228,
        "completion_tokens": 31,
        "tokens_per_second": 9.6,
        "content_preview": "{\"kind\": \"action_directive\", \"intent\": \"answer_question\", \"risk_level\": \"low\", \"actions\": []}"
      },
      {
        "role": "action",
        "case": "read_specific_file",
        "score": 1.0,
        "note": "ok",
        "elapsed_seconds": 3.031,
        "completion_tokens": 48,
        "tokens_per_second": 15.84,
        "content_preview": "{\"kind\": \"action_directive\", \"intent\": \"file_read\", \"risk_level\": \"low\", \"actions\": [{\"tool\": \"file_read\", \"args\": {\"path\": \"CURRENT_STATE.md\"}}]}\n\n"
      },
      {
        "role": "memory_policy",
        "case": "store_user_preference",
        "score": 1.0,
        "note": "ok",
        "elapsed_seconds": 3.62,
        "completion_tokens": 54,
        "tokens_per_second": 14.92,
        "content_preview": "{\n  \"should_store\": true,\n  \"memory_type\": \"preference\",\n  \"summary\": \"User prefers Russian and requires sudo confirmation.\",\n  \"importance\": 0.7,\n  \"scope\": \"global\",\n  \"metadata\": {}\n}"
      },
      {
        "role": "memory_policy",
        "case": "ignore_trivial_tool_call",
        "score": 0.3,
        "note": "stored_trivial={'should_store': True, 'memory_type': 'fact', 'summary': 'Password was successfully launched and user was informed.', 'importance': 0.7, 'scope': 'global', 'metadata': {}}",
        "elapsed_seconds": 3.192,
        "completion_tokens": 58,
        "tokens_per_second": 18.17,
        "content_preview": "{\n  \"should_store\": true,\n  \"memory_type\": \"fact\",\n  \"summary\": \"Password was successfully launched and user was informed.\",\n  \"importance\": 0.7,\n  \"scope\": \"global\",\n  \"metadata\": {}\n}\n"
      },
      {
        "role": "recall",
        "case": "select_relevant_memory",
        "score": 0.3,
        "note": "wrong_ids=[]",
        "elapsed_seconds": 3.737,
        "completion_tokens": 60,
        "tokens_per_second": 16.05,
        "content_preview": "{\n  \"relevant_ids\": [],\n  \"reasoning\": \"The query is about how the user wants to run sudo, but none of the provided memories are related to sudo or user preferences for running commands. The memories are about SQLite and weather answers, which are unrelated to the query.\"\n}"
      },
      {
        "role": "summary",
        "case": "preserve_decisions",
        "score": 0.8,
        "note": "missing=['approval']",
        "elapsed_seconds": 3.334,
        "completion_tokens": 61,
        "tokens_per_second": 18.29,
        "content_preview": "1. WebChat работает на портах 8000, а LLaMA-Server на 8081.  \n2. Для внешних путей требуется подтверждение.  \n3. allow_forever хранится по hash-коду действий."
      },
      {
        "role": "critic",
        "case": "reflection_quality",
        "score": 1.0,
        "note": "ok",
        "elapsed_seconds": 10.697,
        "completion_tokens": 220,
        "tokens_per_second": 20.57,
        "content_preview": "Okay, the user wants me to reflect on the risk and reusable lessons from fixing the WebChat issue. Let me break this down.\n\nFirst, the root cause was the API not running. They manually started the API but didn't have a unified service script. So the main risk here is that without a unified script, there's a chance the API might not be running consistently or could be misconfigured.\n\nReusability is key here. Maybe we can create a script that starts the API and monitors its status. That way, if th"
      }
    ]
  },
  {
    "model": "Qwen3.5-9B-GLM5.1-Distill-v1-Q4_K_M CPU",
    "quality": 0.4,
    "avg_latency_seconds": 61.939,
    "avg_tokens_per_second": 2.56,
    "cases": [
      {
        "role": "action",
        "case": "direct_answer_no_tools",
        "score": 1.0,
        "note": "ok",
        "elapsed_seconds": 68.075,
        "completion_tokens": 72,
        "tokens_per_second": 1.06,
        "content_preview": "{\"kind\": \"action_directive\", \"intent\": \"answer\", \"risk_level\": \"high\", \"actions\": []}"
      },
      {
        "role": "action",
        "case": "read_specific_file",
        "score": 1.0,
        "note": "ok",
        "elapsed_seconds": 72.149,
        "completion_tokens": 86,
        "tokens_per_second": 1.19,
        "content_preview": "{\"kind\": \"action_directive\", \"intent\": \"read file\", \"risk_level\": \"low\", \"actions\": [{\"tool\": \"file_read\", \"args\": {\"path\": \"CURRENT_STATE.md\"}}]}\n"
      },
      {
        "role": "memory_policy",
        "case": "store_user_preference",
        "score": 0.0,
        "note": "invalid_json: Expecting value: line 1 column 1 (char 0)",
        "elapsed_seconds": 67.764,
        "completion_tokens": 180,
        "tokens_per_second": 2.66,
        "content_preview": ""
      },
      {
        "role": "memory_policy",
        "case": "ignore_trivial_tool_call",
        "score": 0.0,
        "note": "invalid_json: Expecting value: line 1 column 1 (char 0)",
        "elapsed_seconds": 64.653,
        "completion_tokens": 160,
        "tokens_per_second": 2.47,
        "content_preview": ""
      },
      {
        "role": "recall",
        "case": "select_relevant_memory",
        "score": 0.0,
        "note": "invalid_json: Expecting property name enclosed in double quotes: line 1 column 2 (char 1)",
        "elapsed_seconds": 59.451,
        "completion_tokens": 160,
        "tokens_per_second": 2.69,
        "content_preview": "{\n"
      },
      {
        "role": "summary",
        "case": "preserve_decisions",
        "score": 0.2,
        "note": "missing=['8000', '8081', 'approval', 'allow_forever']",
        "elapsed_seconds": 47.05,
        "completion_tokens": 180,
        "tokens_per_second": 3.83,
        "content_preview": ""
      },
      {
        "role": "critic",
        "case": "reflection_quality",
        "score": 0.6,
        "note": "missing=['risk', 'lesson']",
        "elapsed_seconds": 54.433,
        "completion_tokens": 220,
        "tokens_per_second": 4.04,
        "content_preview": ""
      }
    ]
  },
  {
    "model": "X-Coder-SFT-Qwen3-8B.Q6_K CPU",
    "quality": 0.757,
    "avg_latency_seconds": 60.117,
    "avg_tokens_per_second": 2.51,
    "cases": [
      {
        "role": "action",
        "case": "direct_answer_no_tools",
        "score": 0.0,
        "note": "invalid_json: Expecting ',' delimiter: line 13 column 6 (char 632)",
        "elapsed_seconds": 121.049,
        "completion_tokens": 180,
        "tokens_per_second": 1.49,
        "content_preview": "{\n  \"kind\": \"action_directive\",\n  \"intent\": \"answer\",\n  \"risk_level\": \"low\",\n  \"actions\": [\n    {\n      \"tool\": \"coder\",\n      \"args\": {\n        \"task_description\": \"Объясните коротко, что такое DuckLM\",\n        \"language\": \"python\",\n        \"context\": \"Пользователь запросил краткое описание DuckLM. DuckLM - это проект, который развивается сообществом, направленный на создание открытой и этичной языковой модели. Мы можем сказать, что это модель на основе Llama, созданная Meta, но адаптированная "
      },
      {
        "role": "action",
        "case": "read_specific_file",
        "score": 1.0,
        "note": "ok",
        "elapsed_seconds": 37.562,
        "completion_tokens": 134,
        "tokens_per_second": 3.57,
        "content_preview": "{\n  \"kind\": \"action_directive\",\n  \"intent\": \"read_current_state\",\n  \"risk_level\": \"low\",\n  \"actions\": [\n    {\n      \"tool\": \"file_read\",\n      \"args\": {\n        \"path\": \"CURRENT_STATE.md\"\n      },\n      \"reason\": \"We need to read the content of CURRENT_STATE.md to provide a summary of the project status.\"\n    }\n  ],\n  \"expected_observations\": [\n    \"file_content\"\n  ],\n  \"stop_reason\": \"After reading the file, we can directly generate the response without further tools.\"\n}\n"
      },
      {
        "role": "memory_policy",
        "case": "store_user_preference",
        "score": 1.0,
        "note": "ok",
        "elapsed_seconds": 66.979,
        "completion_tokens": 80,
        "tokens_per_second": 1.19,
        "content_preview": "{\n  \"should_store\": true,\n  \"memory_type\": \"preference\",\n  \"summary\": \"User prefers responses in Russian and explicit sudo confirmation.\",\n  \"importance\": 0.9,\n  \"scope\": \"global\",\n  \"metadata\": {\n      \"language\": \"Russian\",\n      \"sudo_policy\": \"explicit_confirmation_required\"\n  }\n}\n"
      },
      {
        "role": "memory_policy",
        "case": "ignore_trivial_tool_call",
        "score": 0.3,
        "note": "stored_trivial={'should_store': True, 'memory_type': 'event', 'summary': 'User executed pwd command and received /tmp/project as output.', 'importance': 0.8, 'scope': 'conversation', 'metadata': {}}",
        "elapsed_seconds": 21.772,
        "completion_tokens": 62,
        "tokens_per_second": 2.85,
        "content_preview": "{\n  \"should_store\": true,\n  \"memory_type\": \"event\",\n  \"summary\": \"User executed pwd command and received /tmp/project as output.\",\n  \"importance\": 0.8,\n  \"scope\": \"conversation\",\n  \"metadata\": {}\n}\n"
      },
      {
        "role": "recall",
        "case": "select_relevant_memory",
        "score": 1.0,
        "note": "ok",
        "elapsed_seconds": 58.656,
        "completion_tokens": 90,
        "tokens_per_second": 1.53,
        "content_preview": "{\n  \"relevant_ids\": [\"m1\"],\n  \"reasoning\": \"The query is about how the user wants to run 'sudo'. The only memory that directly relates to this is memory m1, which states that the user does not want to run sudo without explicit confirmation. The other memories (m2 and m3) are about unrelated topics (database and weather). Therefore, only m1 is relevant.\"\n}\n"
      },
      {
        "role": "summary",
        "case": "preserve_decisions",
        "score": 1.0,
        "note": "ok",
        "elapsed_seconds": 53.245,
        "completion_tokens": 180,
        "tokens_per_second": 3.38,
        "content_preview": "We are going to compress the task context into 3 key points while preserving the decisions and outcomes.\n\nKey Points:\n1. **Service Endpoints**: \n   - WebChat is running on port 8000.\n   - llama-server is running on port 8081.\n\n2. **External Paths Approval**:\n   - Any external path that requires access must be approved. This implies that we need to have a mechanism (like a firewall or access control list) to check and approve external requests.\n\n3. **Forever Allowance Storage**:\n   - The `allow_f"
      },
      {
        "role": "critic",
        "case": "reflection_quality",
        "score": 1.0,
        "note": "ok",
        "elapsed_seconds": 61.555,
        "completion_tokens": 220,
        "tokens_per_second": 3.57,
        "content_preview": "We are reflecting on the task: fixing the WebChat which had an API that wasn't running.\n The root cause was that the API was not running. We started it manually but there was no unified service script.\n\n Reflection:\n - Risk: The manual start might be error-prone and not scalable. If multiple components, hard to manage.\n - Waste: We ended up having to start manually, which is not efficient and might lead to repeated issues.\n - Reusable Lesson: We should create a unified service script that starts"
      }
    ]
  },
  {
    "model": "gemma-4-E4B-it-Q4_K_M CPU",
    "quality": 0.971,
    "avg_latency_seconds": 21.235,
    "avg_tokens_per_second": 5.36,
    "cases": [
      {
        "role": "action",
        "case": "direct_answer_no_tools",
        "score": 1.0,
        "note": "ok",
        "elapsed_seconds": 35.717,
        "completion_tokens": 53,
        "tokens_per_second": 1.48,
        "content_preview": "{\n  \"kind\": \"action_directive\",\n  \"intent\": \"answer_directly\"\n  ,\n  \"risk_level\": \"low\"\n  ,\n  \"actions\": []\n}\n"
      },
      {
        "role": "action",
        "case": "read_specific_file",
        "score": 1.0,
        "note": "ok",
        "elapsed_seconds": 13.325,
        "completion_tokens": 88,
        "tokens_per_second": 6.6,
        "content_preview": "{\n  \"kind\": \"action_directive\",\n  \"intent\": \"tool_use\",\n  \"risk_level\": \"low\",\n  \"actions\": [\n    {\n      \"tool\": \"file_read\",\n      \"args\": {\n        \"path\": \"CURRENT_STATE.md\"\n      }\n    }\n  ]\n}\n"
      },
      {
        "role": "memory_policy",
        "case": "store_user_preference",
        "score": 1.0,
        "note": "ok",
        "elapsed_seconds": 27.13,
        "completion_tokens": 98,
        "tokens_per_second": 3.61,
        "content_preview": "{\n  \"should_store\": true,\n  \"memory_type\": \"preference\",\n  \"summary\": \"Пользователь требует, чтобы ответы всегда были на русском языке и чтобы не запускать sudo без явного подтверждения.\",\n  \"importance\": 0.95,\n  \"scope\": \"global\",\n  \"metadata\": {\n    \"task_id\": \"task_pref\"\n  }\n}\n"
      },
      {
        "role": "memory_policy",
        "case": "ignore_trivial_tool_call",
        "score": 1.0,
        "note": "ok",
        "elapsed_seconds": 10.229,
        "completion_tokens": 90,
        "tokens_per_second": 8.8,
        "content_preview": "{\n  \"should_store\": false,\n  \"memory_type\": \"note\",\n  \"summary\": \"The 'pwd' command was executed, and the output was sent to the user.\",\n  \"importance\": 0.1,\n  \"scope\": \"conversation\",\n  \"metadata\": {\n    \"task_id\": \"task_tmp\"\n  }\n}\n"
      },
      {
        "role": "recall",
        "case": "select_relevant_memory",
        "score": 1.0,
        "note": "ok",
        "elapsed_seconds": 19.391,
        "completion_tokens": 62,
        "tokens_per_second": 3.2,
        "content_preview": "{\n  \"relevant_ids\": [\n    \"m1\"\n  ],\n  \"reasoning\": \"Воспоминание m1 напрямую отвечает на вопрос о том, как пользователь хочет, чтобы запускался sudo (с требованием отдельного подтверждения).\"\n}\n"
      },
      {
        "role": "summary",
        "case": "preserve_decisions",
        "score": 1.0,
        "note": "ok",
        "elapsed_seconds": 14.371,
        "completion_tokens": 88,
        "tokens_per_second": 6.12,
        "content_preview": "Вот сжатый контекст в 3 пунктах:\n\n1. **Порты:** WebChat использует порт 8000, llama-server — 8081.\n2. **Безопасность:** Для внешних путей требуется предварительное одобрение (approval).\n3. **Хранение разрешений:** `allow_forever` сохраняется на основе хеша нормализованного действия."
      },
      {
        "role": "critic",
        "case": "reflection_quality",
        "score": 0.8,
        "note": "missing=['lesson']",
        "elapsed_seconds": 28.484,
        "completion_tokens": 220,
        "tokens_per_second": 7.72,
        "content_preview": "## DuckLM: Critic Reflection\n\n**Task:** Fix WebChat.\n**Observed Root Cause:** API was not running.\n**Action Taken:** Started API manually.\n**Observed Deficiency:** Lack of a unified service script.\n\n---\n\n### 🔍 Reflection Analysis\n\n#### 1. Risk Assessment (What went wrong/could go wrong?)\n\n*   **Operational Risk (High):** The immediate risk was service unavailability (WebChat down). Manually starting the API is a brittle, high-touch workaround. If the system restarts, or if the API needs to be re"
      }
    ]
  }
 ]
--- a/docs/bench/utility_model_bench_20260522_050427.md
+++ b/docs/bench/utility_model_bench_20260522_050427.md
@@ -0,0 +1,69 @@
 # Utility Role Model Benchmark
 Scope: service roles only (`action`, `memory_policy`, `recall`, `summary`, `critic`).
 The main user-facing thinker is not evaluated for replacement here.
 | Model | Quality | Avg latency, s | Avg tok/s | Notes |
 | --- | ---: | ---: | ---: | --- |
 | Qwen3.6-35B nonMTP GPU baseline | 0.97 | 17.94 | 4.51 | critic/reflection_quality: missing=['lesson'] |
 | Menlo_Lucy-Q4_K_M CPU | 0.77 | 4.41 | 16.21 | memory_policy/ignore_trivial_tool_call: stored_trivial={'should_store': True, 'memory_type': 'fact', 'summary': 'Password was successfully launched and user was informed.', 'importance': 0.7, 'scope': 'global', 'metadata': {}}; recall/select_relevant_memory: wrong_ids=[]; summary/preserve_decisions: missing=['approval'] |
 | Qwen3.5-9B-GLM5.1-Distill-v1-Q4_K_M CPU | 0.40 | 61.94 | 2.56 | memory_policy/store_user_preference: invalid_json: Expecting value: line 1 column 1 (char 0); memory_policy/ignore_trivial_tool_call: invalid_json: Expecting value: line 1 column 1 (char 0); recall/select_relevant_memory: invalid_json: Expecting property name enclosed in double quotes: line 1 column 2 (char 1) |
 | X-Coder-SFT-Qwen3-8B.Q6_K CPU | 0.76 | 60.12 | 2.51 | action/direct_answer_no_tools: invalid_json: Expecting ',' delimiter: line 13 column 6 (char 632); memory_policy/ignore_trivial_tool_call: stored_trivial={'should_store': True, 'memory_type': 'event', 'summary': 'User executed pwd command and received /tmp/project as output.', 'importance': 0.8, 'scope': 'conversation', 'metadata': {}} |
 | gemma-4-E4B-it-Q4_K_M CPU | 0.97 | 21.23 | 5.36 | critic/reflection_quality: missing=['lesson'] |
 ## Case Details
 ### Qwen3.6-35B nonMTP GPU baseline
 | Role | Case | Score | Latency, s | tok/s | Note |
 | --- | --- | ---: | ---: | ---: | --- |
 | action | direct_answer_no_tools | 1.00 | 15.31 | 2.94 | ok |
 | action | read_specific_file | 1.00 | 19.61 | 4.13 | ok |
 | memory_policy | store_user_preference | 1.00 | 18.53 | 4.75 | ok |
 | memory_policy | ignore_trivial_tool_call | 1.00 | 15.00 | 4.07 | ok |
 | recall | select_relevant_memory | 1.00 | 15.09 | 4.38 | ok |
 | summary | preserve_decisions | 1.00 | 9.95 | 4.42 | ok |
 | critic | reflection_quality | 0.80 | 32.09 | 6.86 | missing=['lesson'] |
 ### Menlo_Lucy-Q4_K_M CPU
 | Role | Case | Score | Latency, s | tok/s | Note |
 | --- | --- | ---: | ---: | ---: | --- |
 | action | direct_answer_no_tools | 1.00 | 3.23 | 9.60 | ok |
 | action | read_specific_file | 1.00 | 3.03 | 15.84 | ok |
 | memory_policy | store_user_preference | 1.00 | 3.62 | 14.92 | ok |
 | memory_policy | ignore_trivial_tool_call | 0.30 | 3.19 | 18.17 | stored_trivial={'should_store': True, 'memory_type': 'fact', 'summary': 'Password was successfully launched and user was informed.', 'importance': 0.7, 'scope': 'global', 'metadata': {}} |
 | recall | select_relevant_memory | 0.30 | 3.74 | 16.05 | wrong_ids=[] |
 | summary | preserve_decisions | 0.80 | 3.33 | 18.29 | missing=['approval'] |
 | critic | reflection_quality | 1.00 | 10.70 | 20.57 | ok |
 ### Qwen3.5-9B-GLM5.1-Distill-v1-Q4_K_M CPU
 | Role | Case | Score | Latency, s | tok/s | Note |
 | --- | --- | ---: | ---: | ---: | --- |
 | action | direct_answer_no_tools | 1.00 | 68.08 | 1.06 | ok |
 | action | read_specific_file | 1.00 | 72.15 | 1.19 | ok |
 | memory_policy | store_user_preference | 0.00 | 67.76 | 2.66 | invalid_json: Expecting value: line 1 column 1 (char 0) |
 | memory_policy | ignore_trivial_tool_call | 0.00 | 64.65 | 2.47 | invalid_json: Expecting value: line 1 column 1 (char 0) |
 | recall | select_relevant_memory | 0.00 | 59.45 | 2.69 | invalid_json: Expecting property name enclosed in double quotes: line 1 column 2 (char 1) |
 | summary | preserve_decisions | 0.20 | 47.05 | 3.83 | missing=['8000', '8081', 'approval', 'allow_forever'] |
 | critic | reflection_quality | 0.60 | 54.43 | 4.04 | missing=['risk', 'lesson'] |
 ### X-Coder-SFT-Qwen3-8B.Q6_K CPU
 | Role | Case | Score | Latency, s | tok/s | Note |
 | --- | --- | ---: | ---: | ---: | --- |
 | action | direct_answer_no_tools | 0.00 | 121.05 | 1.49 | invalid_json: Expecting ',' delimiter: line 13 column 6 (char 632) |
 | action | read_specific_file | 1.00 | 37.56 | 3.57 | ok |
 | memory_policy | store_user_preference | 1.00 | 66.98 | 1.19 | ok |
 | memory_policy | ignore_trivial_tool_call | 0.30 | 21.77 | 2.85 | stored_trivial={'should_store': True, 'memory_type': 'event', 'summary': 'User executed pwd command and received /tmp/project as output.', 'importance': 0.8, 'scope': 'conversation', 'metadata': {}} |
 | recall | select_relevant_memory | 1.00 | 58.66 | 1.53 | ok |
 | summary | preserve_decisions | 1.00 | 53.24 | 3.38 | ok |
 | critic | reflection_quality | 1.00 | 61.55 | 3.57 | ok |
 ### gemma-4-E4B-it-Q4_K_M CPU
 | Role | Case | Score | Latency, s | tok/s | Note |
 | --- | --- | ---: | ---: | ---: | --- |
 | action | direct_answer_no_tools | 1.00 | 35.72 | 1.48 | ok |
 | action | read_specific_file | 1.00 | 13.32 | 6.60 | ok |
 | memory_policy | store_user_preference | 1.00 | 27.13 | 3.61 | ok |
 | memory_policy | ignore_trivial_tool_call | 1.00 | 10.23 | 8.80 | ok |
 | recall | select_relevant_memory | 1.00 | 19.39 | 3.20 | ok |
 | summary | preserve_decisions | 1.00 | 14.37 | 6.12 | ok |
 | critic | reflection_quality | 0.80 | 28.48 | 7.72 | missing=['lesson'] |
--- a/docs/how_to_run.md
+++ b/docs/how_to_run.md
@ -16,34 +16,53 @@ cp .env.example .env
 The default `DUCK_MAIN_MODEL_PATH` points to `./models/Qwen3.6/nonMTP/Qwen3.6-35B-A3B-UD-Q4_K_M.gguf`.
-3. Start `llama-server`:
+3. Start DuckLM:
 ```bash
-bash scripts/llama/start_main.sh start
+bash scripts/duck.sh start
 ```
 This starts both processes:
 - `llama-server` on `http://127.0.0.1:8081/v1`
 - DuckLM API/WebChat on `http://127.0.0.1:8000/`
 Useful process commands:
 ```bash
-bash scripts/llama/start_main.sh status
+bash scripts/duck.sh status
-bash scripts/llama/start_main.sh logs --follow
+bash scripts/duck.sh logs --follow
-bash scripts/llama/start_main.sh restart
+bash scripts/duck.sh restart
-bash scripts/llama/start_main.sh stop
+bash scripts/duck.sh stop
 ```
-4. Start DuckLM API:
+4. Open WebChat:
 ```bash
 python -m duck_core.api
 ```
 5. Open WebChat:
 ```text
 http://127.0.0.1:8000/
 ```
-6. Send a task:
+Low-level llama-only commands are still available when needed:
 ```bash
 bash scripts/llama/start_main.sh status
 bash scripts/llama/start_main.sh logs --follow
 ```
 MTP/speculative variant:
 ```bash
 bash scripts/duck.sh stop
 bash scripts/duck-mtp.sh start
 bash scripts/duck-mtp.sh status
 bash scripts/duck-mtp.sh logs --follow
 ```
 `duck-mtp.sh` keeps DuckLM on `http://127.0.0.1:8000/` and starts the MTP-backed
 `llama-server` on the normal role endpoint `http://127.0.0.1:8081/v1`, so
 `config/models.yaml` does not need to change.
 5. Send a task:
 ```bash
 curl -X POST http://127.0.0.1:8000/v1/chat \
@ -51,21 +70,21 @@ curl -X POST http://127.0.0.1:8000/v1/chat \
  -d '{"message":"Скажи коротко, что ты DuckLM","workspace":"./workspace","debug":true}'
 ```
-7. Inspect events:
+6. Inspect events:
 ```bash
 curl http://127.0.0.1:8000/v1/tasks/<task_id>/events
 ```
-8. Approvals:
+7. Approvals:
 ```bash
 curl http://127.0.0.1:8000/v1/approvals/pending
 ```
-9. Stop services:
+8. Stop services:
 ```bash
-bash scripts/llama/start_main.sh stop
+bash scripts/duck.sh stop
 docker compose -f docker-compose.memory.yml down
 ```
--- a/docs/web_api.md
+++ b/docs/web_api.md
@ -23,3 +23,18 @@ GET  /v1/experience
 GET  /v1/experience/{id}
 GET  /v1/memory/search?q=...
 ```
 Chat requests accept optional `reasoning`:
 ```json
 {
  "message": "hello",
  "reasoning": "auto"
 }
 ```
 Allowed values:
 - `auto`: use the `llama-server` default.
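 - `on`: pass `reasoning_format="deepseek"` and `chat_template_kwargs.enable_thinking=true` for the thinker response.
 - `off`: pass `reasoning_format="deepseek"`, `chat_template_kwargs.enable_thinking=false` and `thinking_budget_tokens=0`.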
--- a/duck_core/api.py
+++ b/duck_core/api.py
@ -3,7 +3,7 @@ import json
 import logging
 import time
 from pathlib import Path
-from typing import Any
+from typing import Any, Literal
 import uvicorn
 from fastapi import FastAPI, HTTPException, Request
@ -60,6 +60,7 @@ class ChatRequest(BaseModel):
    conversation_id: str | None = None
    workspace: str | None = None
    debug: bool = False
    reasoning: Literal["auto", "on", "off"] = "auto"
 class ConversationRequest(BaseModel):
@ -208,6 +209,7 @@ def create_app() -> FastAPI:
            history_messages=history,
            memory_records=memory_records,
            skill_summary=await selected_skill_summary(body.message),
            reasoning=body.reasoning,
        )
        await conversations.add_message(
            conversation.conversation_id,
@ -411,7 +413,12 @@ def create_app() -> FastAPI:
                    "Формирую ответ...",
                )
                await event_store.append(task.task_id, "model_call_started", {"role": "thinker"})
-                async for chunk in model_client.stream_chat("thinker", messages):
+                stream = (
                    model_client.stream_chat("thinker", messages, reasoning=body.reasoning)
                    if body.reasoning in {"on", "off"}
                    else model_client.stream_chat("thinker", messages)
                )
                async for chunk in stream:
                    delta = str(chunk.get("delta") or "")
                    if chunk.get("type") == "reasoning_delta":
                        generation_stats.record(delta)
--- a/duck_core/model_client.py
+++ b/duck_core/model_client.py
@ -3,13 +3,15 @@ import logging
 import time
 from dataclasses import dataclass
 from pathlib import Path
-from typing import Any
+from typing import Any, Literal
 import httpx
 import yaml
 logger = logging.getLogger(__name__)
 ReasoningMode = Literal["auto", "on", "off"]
@dataclass(frozen=True)
 class RoleConfig:
@ -92,6 +94,20 @@ class ModelClient:
            }
        return {"type": "json_object"}
    def _reasoning_options(self, reasoning: ReasoningMode | None) -> dict[str, Any]:
        if reasoning == "on":
            return {
                "reasoning_format": "deepseek",
                "chat_template_kwargs": {"enable_thinking": True},
            }
        if reasoning == "off":
            return {
                "reasoning_format": "deepseek",
                "chat_template_kwargs": {"enable_thinking": False},
                "thinking_budget_tokens": 0,
            }
        return {}
    async def chat(
        self,
        role: str,
@ -99,6 +115,7 @@ class ModelClient:
        temperature: float | None = None,
        max_output_tokens: int | None = None,
        response_format: dict[str, Any] | None = None,
        reasoning: ReasoningMode | None = None,
    ) -> ModelResponse:
        cfg = self.get_role_config(role)
        outbound = list(messages)
@ -115,6 +132,7 @@ class ModelClient:
        fmt = self._response_format(cfg, response_format)
        if fmt is not None:
            payload["response_format"] = fmt
        payload.update(self._reasoning_options(reasoning))
        start = time.perf_counter()
        try:
@ -150,6 +168,7 @@ class ModelClient:
        temperature: float | None = None,
        max_output_tokens: int | None = None,
        response_format: dict[str, Any] | None = None,
        reasoning: ReasoningMode | None = None,
    ):
        cfg = self.get_role_config(role)
        outbound = list(messages)
@ -167,6 +186,7 @@ class ModelClient:
        fmt = self._response_format(cfg, response_format)
        if fmt is not None:
            payload["response_format"] = fmt
        payload.update(self._reasoning_options(reasoning))
        try:
            async with httpx.AsyncClient(timeout=self.timeout, trust_env=False) as client:
--- a/duck_core/runtime_loop.py
+++ b/duck_core/runtime_loop.py
@ -10,7 +10,7 @@ from duck_core.experience.recorder import ExperienceRecorder
 from duck_core.memory.policy import MemoryPolicy
 from duck_core.memory.store import MemoryStore
 from duck_core.memory.vector_memory import VectorMemory
-from duck_core.model_client import ModelClient
+from duck_core.model_client import ModelClient, ReasoningMode
 from duck_core.reflection import Reflection
 from duck_core.tasks.store import TaskStore
 from duck_core.tools.base import ToolResult
@ -63,6 +63,7 @@ class RuntimeLoop:
        memory_records: list[dict[str, str]] | None = None,
        skill_summary: str | None = None,
        reflect: bool = True,
        reasoning: ReasoningMode | None = None,
    ) -> ChatResult:
        task = await self.task_store.create_task(message, workspace, debug)
        await self.event_store.append(
@ -100,7 +101,10 @@ class RuntimeLoop:
            await self.event_store.append(
                task.task_id, "model_call_started", {"role": "thinker"}
            )
-            response = await self.model_client.chat("thinker", messages)
+            if reasoning in {"on", "off"}:
                response = await self.model_client.chat("thinker", messages, reasoning=reasoning)
            else:
                response = await self.model_client.chat("thinker", messages)
            await self.event_store.append(
                task.task_id,
                "cognition_response",
--- a/duck_core/web/static/app.js
+++ b/duck_core/web/static/app.js
@ -16,6 +16,100 @@ function escapeText(value) {
  return String(value ?? "");
 }
 /**
  * Escape the five HTML-significant characters (& < > " ') in a value.
  * null/undefined become the empty string; other values are stringified.
  */
 function escapeHtml(value) {
  const entities = {
    "&": "&amp;",
    "<": "&lt;",
    ">": "&gt;",
    '"': "&quot;",
    "'": "&#39;",
  };
  // Single pass over the input, so an already-produced "&amp;" is never re-escaped.
  return String(value ?? "").replace(/[&<>"']/g, (character) => entities[character]);
 }
 /**
  * Render inline Markdown (code spans, **bold**, *emphasis*, bare http(s) links)
  * to HTML. The input is HTML-escaped first, so only the tags produced here
  * reach the output.
  *
  * Inline code spans are lifted out before emphasis and link rewriting and put
  * back afterwards, so text such as `a*b*c` or a URL inside backticks is shown
  * literally instead of being turned into <em> or <a> markup.
  */
 function renderInlineMarkdown(text) {
  const codeSpans = [];
  // NUL cannot be represented in HTML, so it is dropped from the input and is
  // then free to delimit the temporary code-span placeholders.
  let html = escapeHtml(text).replaceAll("\u0000", "");
  html = html.replace(/`([^`]+)`/g, (_match, code) => {
    codeSpans.push(`<code>${code}</code>`);
    return `\u0000${codeSpans.length - 1}\u0000`;
  });
  html = html.replace(/\*\*([^*]+)\*\*/g, "<strong>$1</strong>");
  html = html.replace(/\*([^*]+)\*/g, "<em>$1</em>");
  // A URL stops at whitespace, at a tag, and at a code-span placeholder so a
  // placeholder is never swallowed into an href attribute.
  html = html.replace(
    /(https?:\/\/[^\s<\u0000]+)/g,
    '<a href="$1" target="_blank" rel="noreferrer">$1</a>',
  );
  return html.replace(/\u0000(\d+)\u0000/g, (_match, index) => codeSpans[Number(index)]);
 }
 /**
  * Convert a small Markdown subset to an HTML string.
  *
  * Handled line by line: ``` fenced code blocks, headings of one to four "#",
  * unordered list items starting with "-" or "*", and paragraphs separated by
  * blank lines. Inline formatting is delegated to renderInlineMarkdown; fenced
  * code is only HTML-escaped. null/undefined input renders as an empty string.
  */
 function renderMarkdown(markdown) {
  const blocks = [];
  // Normalise CRLF so the split below sees one entry per logical line.
  const lines = String(markdown ?? "").replace(/\r\n/g, "\n").split("\n");
  // Pending content that has not been emitted as a block yet.
  let paragraph = [];
  let listItems = [];
  let inFence = false;
  let fenceLines = [];
  // Emit the buffered paragraph lines (joined with spaces) as one <p>.
  const flushParagraph = () => {
    if (!paragraph.length) return;
    blocks.push(`<p>${renderInlineMarkdown(paragraph.join(" "))}</p>`);
    paragraph = [];
  };
  // Emit the buffered list items as one <ul>.
  const flushList = () => {
    if (!listItems.length) return;
    blocks.push(`<ul>${listItems.map((item) => `<li>${renderInlineMarkdown(item)}</li>`).join("")}</ul>`);
    listItems = [];
  };
  for (const line of lines) {
    // A line starting with ``` toggles fence mode; the rest of that line is ignored.
    if (line.startsWith("```")) {
      if (inFence) {
        blocks.push(`<pre><code>${escapeHtml(fenceLines.join("\n"))}</code></pre>`);
        fenceLines = [];
        inFence = false;
      } else {
        flushParagraph();
        flushList();
        inFence = true;
      }
      continue;
    }
    // Inside a fence every line is kept verbatim, including blank ones.
    if (inFence) {
      fenceLines.push(line);
      continue;
    }
    const heading = /^(#{1,4})\s+(.+)$/.exec(line);
    if (heading) {
      flushParagraph();
      flushList();
      // "#" maps to <h3>, "####" to <h6>: heading levels are shifted down by two.
      const level = heading[1].length + 2;
      blocks.push(`<h${level}>${renderInlineMarkdown(heading[2])}</h${level}>`);
      continue;
    }
    const listItem = /^\s*[-*]\s+(.+)$/.exec(line);
    if (listItem) {
      flushParagraph();
      listItems.push(listItem[1]);
      continue;
    }
    // A blank line ends whatever paragraph or list is in progress.
    if (!line.trim()) {
      flushParagraph();
      flushList();
      continue;
    }
    // Plain text: ends a list in progress and extends the current paragraph.
    flushList();
    paragraph.push(line.trim());
  }
  // An unterminated fence is still rendered as a code block.
  if (inFence) blocks.push(`<pre><code>${escapeHtml(fenceLines.join("\n"))}</code></pre>`);
  flushParagraph();
  flushList();
  return blocks.join("");
 }
 /**
  * Store the raw Markdown on the node (data-markdown) and replace the node's
  * HTML with the rendered result. Does nothing when the node is missing.
  */
 function setMarkdownContent(node, content) {
  if (node) {
    const source = String(content ?? "");
    // The raw text is kept so later deltas can be appended and re-rendered.
    node.dataset.markdown = source;
    node.innerHTML = renderMarkdown(source);
  }
 }
 function setStatus(id, text, tone = "neutral") {
  const node = document.querySelector(id);
  if (!node) return;
@ -67,7 +161,12 @@ function addMessage(role, content, meta = "", options = {}) {
  messageMeta.innerHTML = `<strong>${role === "user" ? "You" : "DuckLM"}</strong><span>${escapeText(meta)}</span>`;
  const text = document.createElement("p");
-  text.textContent = content;
+  text.className = role === "assistant" ? "message-body markdown-body" : "message-body";
  if (role === "assistant") {
    setMarkdownContent(text, content);
  } else {
    text.textContent = content;
  }
  bubble.append(messageMeta);
  if (role === "assistant" && options.reasoning) {
@ -204,11 +303,10 @@ function updateToolTerminal(article, eventPayload) {
  terminal.classList.remove("is-waiting");
  status.textContent = result.ok ? "ok" : "error";
-  const title = terminal.querySelector(".tool-terminal-title")?.textContent || body.textContent.trim();
+  const parts = [];
-  const parts = [title];
+  if (result.output) parts.push("stdout\n" + result.output.trimEnd());
-  if (result.output) parts.push("\nstdout\n" + result.output.trimEnd());
+  if (result.error) parts.push("stderr\n" + result.error.trimEnd());
-  if (result.error) parts.push("\nstderr\n" + result.error.trimEnd());
+  body.textContent = parts.join("\n\n") || "completed with no output";
  body.textContent = parts.join("\n");
  document.querySelector("#messages").scrollTop = document.querySelector("#messages").scrollHeight;
 }
@ -359,13 +457,22 @@ function humanApprovalDecision(action) {
 function setMessagePending(article, text) {
  const paragraph = article?.querySelector("p");
-  if (paragraph) paragraph.textContent = text;
+  if (!paragraph) return;
  if (paragraph.classList.contains("markdown-body")) {
    setMarkdownContent(paragraph, text);
  } else {
    paragraph.textContent = text;
  }
 }
 function appendMessageText(article, delta) {
  const paragraph = article?.querySelector("p");
  if (!paragraph) return;
-  paragraph.textContent += delta;
+  if (paragraph.classList.contains("markdown-body")) {
    setMarkdownContent(paragraph, `${paragraph.dataset.markdown || ""}${delta}`);
  } else {
    paragraph.textContent += delta;
  }
  document.querySelector("#messages").scrollTop = document.querySelector("#messages").scrollHeight;
 }
@ -758,7 +865,8 @@ async function sendMessage() {
  setStatus("#task-status", "running", "warn");
  addMessage("user", message, "submitted");
  input.value = "";
-  const pending = addMessage("assistant", "", "thinking", {reasoning: true});
+  const reasoningEnabled = document.querySelector("#reasoning")?.checked || false;
  const pending = addMessage("assistant", "", "thinking", {reasoning: reasoningEnabled});
  const context = {taskId: "", contentStarted: false};
  try {
@ -767,6 +875,7 @@ async function sendMessage() {
      conversation_id: state.currentConversationId || null,
      workspace: document.querySelector("#workspace").value,
      debug: document.querySelector("#debug").checked,
      reasoning: reasoningEnabled ? "on" : "off",
    }, async ({name, data}) => {
      await handleAssistantStreamEvent(pending, name, data, context);
    });
--- a/duck_core/web/static/style.css
+++ b/duck_core/web/static/style.css
@ -500,6 +500,60 @@ dd {
  line-height: 1.5;
 }
 .bubble .markdown-body {
  white-space: normal;
 }
 /* renderMarkdown in app.js emits h3 through h6 (one to four "#"), so every
    one of those levels is given the compact spacing and an explicit size. */
 .markdown-body h3,
 .markdown-body h4,
 .markdown-body h5,
 .markdown-body h6,
 .markdown-body p,
 .markdown-body ul,
 .markdown-body pre {
  margin: 8px 0 0;
 }
 .markdown-body h3 {
  font-size: 17px;
 }
 .markdown-body h4,
 .markdown-body h5,
 .markdown-body h6 {
  font-size: 15px;
 }
 .markdown-body ul {
  padding-left: 22px;
 }
 .markdown-body code {
  padding: 1px 4px;
  border-radius: 5px;
  background: #e2e8f0;
  font-family: ui-monospace, SFMono-Regular, Menlo, Consolas, monospace;
  font-size: 0.92em;
 }
 .markdown-body pre {
  max-width: 100%;
  overflow: auto;
  padding: 10px 12px;
  border-radius: 8px;
  background: #0f172a;
  color: #d1fae5;
  white-space: pre;
 }
 .markdown-body pre code {
  padding: 0;
  background: transparent;
  color: inherit;
 }
 .markdown-body a {
  color: var(--accent);
 }
 .message-reasoning {
  display: grid;
  gap: 8px;
--- a/duck_core/web/templates/index.html
+++ b/duck_core/web/templates/index.html
@ -46,6 +46,10 @@
            <input id="debug" type="checkbox" checked>
            <span>Debug mode</span>
          </label>
          <label class="toggle-row">
            <input id="reasoning" type="checkbox">
            <span>Reasoning</span>
          </label>
        </section>
        <section class="status-panel" aria-labelledby="status-title">
--- a/scripts/bench/bench_runtime.py
+++ b/scripts/bench/bench_runtime.py
@ -1,33 +1,434 @@
 import argparse
 import asyncio
 import json
 import os
 import signal
 import subprocess
 import time
 from dataclasses import dataclass, field
 from pathlib import Path
 from statistics import mean
 from typing import Any
-from duck_core.model_client import ModelClient
+import httpx
 import jsonschema
-TASKS = [
+ROOT = Path(__file__).resolve().parents[2]
-    "Скажи коротко, что ты DuckLM.",
+LLAMA_BIN = ROOT / "vendor/llama.cpp/build/bin/llama-server"
-    "Создай tmp/duck_test_note.md с текстом hello duck и прочитай его обратно.",
+BASELINE_URL = "http://127.0.0.1:8081/v1"
-    "Посмотри структуру проекта и кратко опиши модули.",
+BENCH_PORT = 18091
-    "Найди TODO/FIXME в проекте.",
+BENCH_URL = f"http://127.0.0.1:{BENCH_PORT}/v1"
-    "Запусти тесты и кратко объясни результат.",
+RESULTS_DIR = ROOT / "docs/bench"
 CANDIDATES = [
    ROOT / "models/Menlo_Lucy-Q4_K_M.gguf",
    ROOT / "models/Qwen3.5-9B-GLM5.1-Distill-v1-Q4_K_M.gguf",
    ROOT / "models/X-Coder-SFT-Qwen3-8B.Q6_K.gguf",
    ROOT / "models/gemma-4-E4B-it-Q4_K_M.gguf",
 ]
-async def main() -> None:
+@dataclass
-    client = ModelClient()
+class BenchCase:
-    print("role -> base_url/model")
+    role: str
-    for role, cfg in client._roles.items():
+    name: str
-        print(f"{role} -> {cfg.base_url}/{cfg.model}")
+    system_prompt: str
    user_prompt: str
    max_tokens: int
    response_format: dict[str, Any] | None = None
    required_keywords: list[str] = field(default_factory=list)
 def read(path: str) -> str:
    """Return the text of a repository file.

    Args:
        path: File path relative to ``ROOT``.

    Returns:
        The file content decoded as UTF-8.  The encoding is passed explicitly
        so the result does not depend on the host's locale settings.
    """
    return (ROOT / path).read_text(encoding="utf-8")
 # JSON schema for the action role's output, loaded from the repository.  The
 # file is decoded as UTF-8 explicitly rather than with the locale default.
 ACTION_SCHEMA = json.loads(
    (ROOT / "duck_core/schemas/action_directive.schema.json").read_text(encoding="utf-8")
 )
 # Schema used for the memory_policy cases: all six keys are required, extra
 # keys are tolerated.
 MEMORY_SCHEMA = {
    "type": "object",
    "required": ["should_store", "memory_type", "summary", "importance", "scope", "metadata"],
    "additionalProperties": True,
    "properties": {
        "should_store": {"type": "boolean"},
        "memory_type": {"type": "string"},
        "summary": {"type": "string"},
        "importance": {"type": "number"},
        "scope": {"type": "string"},
        "metadata": {"type": "object"},
    },
 }
 # Schema used for the recall case: a list of selected memory ids plus a
 # free-text justification.
 RECALL_SCHEMA = {
    "type": "object",
    "required": ["relevant_ids", "reasoning"],
    "additionalProperties": True,
    "properties": {
        "relevant_ids": {"type": "array", "items": {"type": "string"}},
        "reasoning": {"type": "string"},
    },
 }
 CASES = [
    BenchCase(
        role="action",
        name="direct_answer_no_tools",
        system_prompt=read("prompts/roles/action.md"),
        user_prompt="User request: Скажи коротко, что такое DuckLM.\nWorkspace: /tmp/duck",
        max_tokens=180,
        response_format={
            "type": "json_schema",
            "json_schema": {"name": "action_directive", "schema": ACTION_SCHEMA, "strict": True},
        },
    ),
    BenchCase(
        role="action",
        name="read_specific_file",
        system_prompt=read("prompts/roles/action.md"),
        user_prompt="User request: Прочитай файл CURRENT_STATE.md и кратко скажи статус проекта.\nWorkspace: /home/mirivlad/git/ducklm",
        max_tokens=220,
        response_format={
            "type": "json_schema",
            "json_schema": {"name": "action_directive", "schema": ACTION_SCHEMA, "strict": True},
        },
    ),
    BenchCase(
        role="memory_policy",
        name="store_user_preference",
        system_prompt=read("prompts/roles/memory_policy.md"),
        user_prompt="Task ID: task_pref\n\nTranscript:\nПользователь сказал: всегда отвечай мне по-русски и не запускай sudo без отдельного подтверждения.",
        max_tokens=180,
        response_format={
            "type": "json_schema",
            "json_schema": {"name": "memory_decision", "schema": MEMORY_SCHEMA, "strict": True},
        },
    ),
    BenchCase(
        role="memory_policy",
        name="ignore_trivial_tool_call",
        system_prompt=read("prompts/roles/memory_policy.md"),
        user_prompt="Task ID: task_tmp\n\nTranscript:\nЗапущен pwd, stdout: /tmp/project. Ответ отправлен пользователю.",
        max_tokens=160,
        response_format={
            "type": "json_schema",
            "json_schema": {"name": "memory_decision", "schema": MEMORY_SCHEMA, "strict": True},
        },
    ),
    BenchCase(
        role="recall",
        name="select_relevant_memory",
        system_prompt=read("prompts/roles/recall.md"),
        user_prompt=(
            "Query: Как пользователь хочет, чтобы я запускал sudo?\n\n"
            "Memories:\n"
            "- id: m1 | text: Пользователь просит не запускать sudo без отдельного подтверждения.\n"
            "- id: m2 | text: Проект использует SQLite для событий.\n"
            "- id: m3 | text: Пользователь любит краткие ответы о погоде.\n"
        ),
        max_tokens=160,
        response_format={
            "type": "json_schema",
            "json_schema": {"name": "recall_decision", "schema": RECALL_SCHEMA, "strict": True},
        },
    ),
    BenchCase(
        role="summary",
        name="preserve_decisions",
        system_prompt=read("prompts/roles/summary.md"),
        user_prompt=(
            "Сожми контекст до 3 пунктов. Сохрани решения:\n"
            "1. WebChat работает на 8000, llama-server на 8081.\n"
            "2. Для внешних путей нужен approval.\n"
            "3. allow_forever хранится по normalized action hash.\n"
        ),
        max_tokens=180,
        required_keywords=["8000", "8081", "approval", "allow_forever"],
    ),
    BenchCase(
        role="critic",
        name="reflection_quality",
        system_prompt=read("prompts/roles/critic.md"),
        user_prompt=(
            "Task transcript:\n"
            "User asked to fix WebChat. Root cause was API not running. "
            "We started API manually but had no unified service script. Reflect on risk and reusable lesson."
        ),
        max_tokens=220,
        required_keywords=["risk", "lesson"],
    ),
 ]
 def parse_args() -> argparse.Namespace:
    """Parse the benchmark's command-line options.

    Defaults: the module-level baseline URL and bench port, every path in
    ``CANDIDATES``, half of the detected CPU count for ``--threads`` (at least
    one; eight CPUs are assumed when the count is unknown), a 4096-token
    context and a 180 second timeout.
    """
    half_of_cpus = max(1, (os.cpu_count() or 8) // 2)
    candidate_paths = [str(path) for path in CANDIDATES]

    cli = argparse.ArgumentParser()
    cli.add_argument("--baseline-url", default=BASELINE_URL)
    cli.add_argument("--port", type=int, default=BENCH_PORT)
    cli.add_argument("--models", nargs="*", default=candidate_paths)
    cli.add_argument("--threads", type=int, default=half_of_cpus)
    cli.add_argument("--ctx-size", type=int, default=4096)
    cli.add_argument("--timeout", type=float, default=180.0)
    cli.add_argument("--skip-cpu", action="store_true")
    return cli.parse_args()
 def llama_env() -> dict[str, str]:
    """Return a copy of the environment with llama-server's directory on ``LD_LIBRARY_PATH``.

    The directory containing ``LLAMA_BIN`` is placed first; an existing,
    non-empty ``LD_LIBRARY_PATH`` is kept after it, separated by a colon.
    """
    environment = dict(os.environ)
    llama_dir = str(LLAMA_BIN.parent)
    inherited = environment.get("LD_LIBRARY_PATH")
    if inherited:
        environment["LD_LIBRARY_PATH"] = f"{llama_dir}:{inherited}"
    else:
        environment["LD_LIBRARY_PATH"] = llama_dir
    return environment
 async def wait_ready(base_url: str, timeout: float) -> None:
    """Poll ``{base_url}/models`` once per second until it answers HTTP 200.

    Args:
        base_url: API base URL; ``/models`` is appended to it.
        timeout: Maximum number of seconds to keep polling.

    Raises:
        TimeoutError: If no HTTP 200 response arrived before the deadline.
            The message names the last failure that was observed.
    """
    deadline = time.perf_counter() + timeout
    async with httpx.AsyncClient(timeout=5.0, trust_env=False) as client:
        last_error = "no request attempted"
        while time.perf_counter() < deadline:
            try:
                response = await client.get(f"{base_url}/models")
                if response.status_code == 200:
                    return
                last_error = f"HTTP {response.status_code}: {response.text[:120]}"
            except Exception as exc:
                # Best-effort polling: any failure just means "not ready yet".
                # The exception type is recorded as well because an exception
                # raised without a message stringifies to "", which would
                # leave the final TimeoutError without any hint.
                last_error = f"{type(exc).__name__}: {exc}"
            await asyncio.sleep(1.0)
    raise TimeoutError(f"{base_url} not ready: {last_error}")
 def start_cpu_server(model_path: Path, port: int, threads: int, ctx_size: int) -> subprocess.Popen:
    """Launch ``llama-server`` for one candidate model and return its process.

    The server is bound to 127.0.0.1 on ``port`` under the alias ``bench-cpu``
    and started in its own session so the whole process group can be signalled
    later.  Its stdout and stderr are appended to
    ``data/bench/<model stem>.log`` under ``ROOT``.

    Args:
        model_path: Path of the model file passed to ``-m``.
        port: TCP port for the server.
        threads: Value used for both ``--threads`` and ``--threads-batch``.
        ctx_size: Value passed to ``-c``.

    Returns:
        The running process.  The open log file is attached to it as
        ``_duck_log_handle`` so ``stop_process`` can close it.

    Raises:
        OSError: If the log file cannot be opened or the server binary cannot
            be started; the log file is closed again before the error
            propagates.
    """
    log_dir = ROOT / "data/bench"
    log_dir.mkdir(parents=True, exist_ok=True)
    log_file = log_dir / f"{model_path.stem}.log"
    command = [
        str(LLAMA_BIN),
        "-m",
        str(model_path),
        "--alias",
        "bench-cpu",
        "--host",
        "127.0.0.1",
        "--port",
        str(port),
        "-c",
        str(ctx_size),
        "--parallel",
        "1",
        "-ngl",
        "0",
        "--threads",
        str(threads),
        "--threads-batch",
        str(threads),
        "--reasoning",
        "off",
        "--cache-ram",
        "0",
    ]
    handle = log_file.open("a")
    try:
        handle.write("Command: " + " ".join(command) + "\n")
        handle.flush()
        process = subprocess.Popen(
            command,
            cwd=ROOT,
            env=llama_env(),
            stdout=handle,
            stderr=subprocess.STDOUT,
            start_new_session=True,
        )
    except BaseException:
        # Nobody else holds the handle yet, so it must be closed here or it
        # would stay open for the rest of the run.
        handle.close()
        raise
    process._duck_log_handle = handle  # type: ignore[attr-defined]
    return process
 def stop_process(process: subprocess.Popen | None) -> None:
    if process is None:
        return
    if process.poll() is None:
        os.killpg(process.pid, signal.SIGTERM)
        try:
            process.wait(timeout=15)
        except subprocess.TimeoutExpired:
            os.killpg(process.pid, signal.SIGKILL)
            process.wait(timeout=10)
    handle = getattr(process, "_duck_log_handle", None)
    if handle:
        handle.close()
 def safe_json(content: str) -> tuple[dict[str, Any] | None, str | None]:
    text = content.strip()
    if text.startswith("```"):
        text = text.strip("`")
        text = text.removeprefix("json").strip()
    try:
        return json.loads(text), None
    except json.JSONDecodeError as exc:
        return None, str(exc)
 def score_case(case: BenchCase, content: str) -> tuple[float, str]:
    """Grade one model reply and return ``(score, note)``.

    Cases with a ``response_format`` must produce parseable JSON (else 0.0)
    that validates against the case's schema (else 0.2); the parsed value is
    then checked with role-specific rules for ``action``, ``memory_policy`` and
    ``recall``.  Everything that is not decided by those rules is graded on
    ``required_keywords``: each keyword missing from the reply
    (case-insensitive) costs 0.2, with a floor of 0.2.
    """
    parsed = None
    if case.response_format:
        parsed, parse_error = safe_json(content)
        if parsed is None:
            return 0.0, f"invalid_json: {parse_error}"
        expected_schema = case.response_format["json_schema"]["schema"]
        try:
            jsonschema.validate(parsed, expected_schema)
        except jsonschema.ValidationError as exc:
            return 0.2, f"schema_error: {exc.message}"

    if parsed is not None:
        if case.role == "action":
            planned = parsed.get("actions") or []
            if case.name == "direct_answer_no_tools":
                if not planned:
                    return 1.0, "ok"
                return 0.3, f"unexpected_actions={planned}"
            if case.name == "read_specific_file":
                # Only the first planned action is inspected.
                reads_target = (
                    bool(planned)
                    and planned[0].get("tool") == "file_read"
                    and planned[0].get("args", {}).get("path") == "CURRENT_STATE.md"
                )
                if reads_target:
                    return 1.0, "ok"
                return 0.4, f"wrong_action={planned}"
        elif case.role == "memory_policy":
            if case.name == "store_user_preference":
                stored_as_preference = (
                    parsed.get("should_store") is True
                    and parsed.get("memory_type") == "preference"
                    and parsed.get("scope") == "global"
                )
                if stored_as_preference:
                    return 1.0, "ok"
                return 0.4, f"wrong_memory_decision={parsed}"
            if case.name == "ignore_trivial_tool_call":
                if parsed.get("should_store") is False:
                    return 1.0, "ok"
                return 0.3, f"stored_trivial={parsed}"
        elif case.role == "recall":
            chosen = set(parsed.get("relevant_ids") or [])
            # Of the three offered memories exactly m1 must be selected.
            if chosen & {"m1", "m2", "m3"} == {"m1"}:
                return 1.0, "ok"
            return 0.3, f"wrong_ids={sorted(chosen)}"

    haystack = content.lower()
    absent = [keyword for keyword in case.required_keywords if keyword.lower() not in haystack]
    if not absent:
        return 1.0, "ok"
    return max(0.2, 1.0 - 0.2 * len(absent)), f"missing={absent}"
 async def run_case(base_url: str, model: str, case: BenchCase, timeout: float) -> dict[str, Any]:
    """Send one benchmark case to ``{base_url}/chat/completions`` and grade the reply.

    The request uses temperature 0.0, the case's token limit and, when the case
    defines one, its ``response_format``.  The returned row holds the score and
    note from ``score_case``, the elapsed time, the completion token count and
    derived tokens-per-second, and the first 500 characters of the reply.
    HTTP error statuses raise via ``raise_for_status``.
    """
    payload: dict[str, Any] = {
        "model": model,
        "messages": [
            {"role": "system", "content": case.system_prompt},
            {"role": "user", "content": case.user_prompt},
        ],
        "temperature": 0.0,
        "max_tokens": case.max_tokens,
    }
    if case.response_format:
        payload["response_format"] = case.response_format
    started = time.perf_counter()
-    print(f"test_tasks={len(TASKS)}")
+    async with httpx.AsyncClient(timeout=timeout, trust_env=False) as client:
-    print("llm_calls=0")
+        response = await client.post(f"{base_url}/chat/completions", json=payload)
-    print("tool_calls=0")
+        elapsed = time.perf_counter() - started
-    print("json_directive_validity=not_run")
+        response.raise_for_status()
-    print("retry_count=0")
+        raw = response.json()
-    print("memory_writes=0")
+    message = raw.get("choices", [{}])[0].get("message", {})
-    print("experience_record_created=no")
+    content = message.get("content") or ""
-    print("selected_skill=not_run")
+    score, note = score_case(case, content)
-    print(f"total_runtime_seconds={time.perf_counter() - started:.3f}")
+    usage = raw.get("usage") or {}
    # When the response reports no completion token count, fall back to the
    # number of whitespace-separated words in the reply (at least 1).
    completion_tokens = usage.get("completion_tokens") or max(1, len(content.split()))
    return {
        "role": case.role,
        "case": case.name,
        "score": score,
        "note": note,
        "elapsed_seconds": round(elapsed, 3),
        "completion_tokens": completion_tokens,
        # The divisor is clamped so a near-zero elapsed time cannot divide by zero.
        "tokens_per_second": round(completion_tokens / max(elapsed, 0.001), 2),
        "content_preview": content[:500],
    }
 async def run_model(label: str, base_url: str, model: str, timeout: float) -> dict[str, Any]:
    """Run every case in ``CASES`` against one model and aggregate the rows.

    A case that raises does not abort the run: it is recorded with score 0.0,
    zero tokens, the full ``timeout`` as its latency, and a note naming the
    exception.

    Args:
        label: Display name stored under ``"model"`` in the result.
        base_url: API base URL handed to ``run_case``.
        model: Model name sent in each request.
        timeout: Per-request timeout in seconds.

    Returns:
        A dict with the label, the mean score (``quality``), mean latency,
        mean tokens-per-second and the per-case rows.
    """
    rows = []
    for case in CASES:
        try:
            rows.append(await run_case(base_url, model, case, timeout))
        except Exception as exc:
            rows.append({
                "role": case.role,
                "case": case.name,
                "score": 0.0,
                # The type name is included because an exception raised
                # without a message stringifies to "", which would leave the
                # note as a bare "error: ".
                "note": f"error: {type(exc).__name__}: {exc}",
                "elapsed_seconds": timeout,
                "completion_tokens": 0,
                "tokens_per_second": 0.0,
                "content_preview": "",
            })
    scores = [row["score"] for row in rows]
    return {
        "model": label,
        "quality": round(mean(scores), 3),
        "avg_latency_seconds": round(mean(row["elapsed_seconds"] for row in rows), 3),
        "avg_tokens_per_second": round(mean(row["tokens_per_second"] for row in rows), 2),
        "cases": rows,
    }
 def _md_cell(value: Any) -> str:
    """Return ``value`` as single-line text that cannot break a Markdown table row."""
    # A raw line break ends the row and a raw pipe opens a new column, so line
    # breaks become spaces and pipes are backslash-escaped.
    single_line = " ".join(str(value).splitlines())
    return single_line.replace("|", "\\|")


 def markdown_report(results: list[dict[str, Any]]) -> str:
    """Render benchmark results as a Markdown document.

    The document has a summary table with one row per model (quality, average
    latency, average tokens-per-second and the notes of up to three cases that
    scored below 1.0) followed by a "Case Details" section holding one table
    per model.  Text cells are sanitised so that notes containing line breaks
    or ``|`` keep the tables intact.

    Args:
        results: Result dicts with the keys ``model``, ``quality``,
            ``avg_latency_seconds``, ``avg_tokens_per_second`` and ``cases``;
            each case needs ``role``, ``case``, ``score``, ``note``,
            ``elapsed_seconds`` and ``tokens_per_second``.

    Returns:
        The Markdown text, terminated by a newline.
    """
    lines = [
        "# Utility Role Model Benchmark",
        "",
        "Scope: service roles only (`action`, `memory_policy`, `recall`, `summary`, `critic`).",
        "The main user-facing thinker is not evaluated for replacement here.",
        "",
        "| Model | Quality | Avg latency, s | Avg tok/s | Notes |",
        "| --- | ---: | ---: | ---: | --- |",
    ]
    for result in results:
        failed = [case for case in result["cases"] if case["score"] < 1.0]
        note = "all checks passed" if not failed else "; ".join(
            f"{case['role']}/{case['case']}: {case['note']}" for case in failed[:3]
        )
        lines.append(
            f"| {_md_cell(result['model'])} | {result['quality']:.2f} | "
            f"{result['avg_latency_seconds']:.2f} | {result['avg_tokens_per_second']:.2f} | "
            f"{_md_cell(note)} |"
        )
    lines.append("")
    lines.append("## Case Details")
    for result in results:
        lines.append(f"\n### {result['model']}")
        lines.append("| Role | Case | Score | Latency, s | tok/s | Note |")
        lines.append("| --- | --- | ---: | ---: | ---: | --- |")
        for case in result["cases"]:
            lines.append(
                f"| {_md_cell(case['role'])} | {_md_cell(case['case'])} | {case['score']:.2f} | "
                f"{case['elapsed_seconds']:.2f} | {case['tokens_per_second']:.2f} | "
                f"{_md_cell(case['note'])} |"
            )
    return "\n".join(lines) + "\n"
 async def main() -> None:
    """Benchmark the baseline server and each CPU candidate, then write reports.

    The baseline at ``--baseline-url`` is only waited for, not started.  Unless
    ``--skip-cpu`` is given, every model in ``--models`` is served by its own
    ``llama-server`` process, benchmarked, and stopped again even when the run
    fails.  The results are written to ``RESULTS_DIR`` as timestamped JSON and
    Markdown files, and the Markdown report is printed as well.
    """
    args = parse_args()
    RESULTS_DIR.mkdir(parents=True, exist_ok=True)
    results: list[dict[str, Any]] = []
    print("Checking Qwen GPU baseline at", args.baseline_url)
    await wait_ready(args.baseline_url, args.timeout)
    results.append(await run_model("Qwen3.6-35B nonMTP GPU baseline", args.baseline_url, "local-main", args.timeout))
    if not args.skip_cpu:
        bench_url = f"http://127.0.0.1:{args.port}/v1"
        for raw_model in args.models:
            model_path = Path(raw_model).resolve()
            label = f"{model_path.stem} CPU"
            print("Starting", label)
            process = start_cpu_server(model_path, args.port, args.threads, args.ctx_size)
            try:
                await wait_ready(bench_url, args.timeout)
                results.append(await run_model(label, bench_url, "bench-cpu", args.timeout))
            finally:
                stop_process(process)
    timestamp = time.strftime("%Y%m%d_%H%M%S")
    json_path = RESULTS_DIR / f"utility_model_bench_{timestamp}.json"
    md_path = RESULTS_DIR / f"utility_model_bench_{timestamp}.md"
    report = markdown_report(results)
    # ensure_ascii=False keeps non-ASCII characters as-is in the JSON, so both
    # files are written as UTF-8 explicitly instead of with the locale encoding.
    json_path.write_text(json.dumps(results, ensure_ascii=False, indent=2), encoding="utf-8")
    md_path.write_text(report, encoding="utf-8")
    print(report)
    print(f"Wrote {json_path}")
    print(f"Wrote {md_path}")
 if __name__ == "__main__":
--- a/scripts/duck-mtp.sh
+++ b/scripts/duck-mtp.sh
@ -0,0 +1,262 @@
 #!/usr/bin/env bash
 # Control script for the MTP variant of the local stack: it drives
 # scripts/llama/start_mtp_main.sh and a background DuckLM API process.
 set -euo pipefail
 # Repository root: the parent of the directory holding this script.
 ROOT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)"
 # Variables whose values from the calling environment must win over .env.
 ENV_KEYS=(
  DUCK_LLAMA_SERVER_BIN
  DUCK_MTP_MODEL_PATH
  DUCK_MAIN_PORT
  DUCK_CTX_SIZE
  DUCK_N_GPU_LAYERS
  DUCK_LLAMA_DEVICE
  DUCK_PARALLEL
  DUCK_LLAMA_PID_FILE
  DUCK_LLAMA_LOG_FILE
  DUCK_MTP_FLAGS
  DUCK_HOST
  DUCK_API_HOST
  DUCK_API_PORT
  DUCK_API_PID_FILE
  DUCK_API_LOG_FILE
  DUCK_API_COMMAND
 )
 # Snapshot the keys that are already set (even if empty) BEFORE sourcing .env.
 declare -A ENV_OVERRIDES=()
 for key in "${ENV_KEYS[@]}"; do
  if [[ -v "${key}" ]]; then
    ENV_OVERRIDES["${key}"]="${!key}"
  fi
 done
 # Load .env with auto-export so child scripts inherit its values.
 if [[ -f "${ROOT_DIR}/.env" ]]; then
  set -a
  # shellcheck disable=SC1091
  source "${ROOT_DIR}/.env"
  set +a
 fi
 # Re-apply the snapshot so caller-supplied values override .env.
 for key in "${!ENV_OVERRIDES[@]}"; do
  export "${key}=${ENV_OVERRIDES[${key}]}"
 done
 # First argument selects the command; defaults to "start".
 ACTION="${1:-start}"
 API_PID_FILE="${DUCK_API_PID_FILE:-${ROOT_DIR}/data/duck-api.pid}"
 API_LOG_FILE="${DUCK_API_LOG_FILE:-${ROOT_DIR}/data/duck-api.log}"
 API_URL="http://${DUCK_API_HOST:-127.0.0.1}:${DUCK_API_PORT:-8000}"
 # Script that manages the MTP llama-server process.
 LLAMA_SCRIPT="${ROOT_DIR}/scripts/llama/start_mtp_main.sh"
 # Print the command and environment reference for this script to stdout.
 # The heredoc delimiter is quoted, so the text is emitted without expansion.
 usage() {
  cat <<'EOF'
 Usage: scripts/duck-mtp.sh <command>
 Commands:
  start       Start MTP llama-server and DuckLM API in the background
  stop        Stop DuckLM API and managed MTP llama-server
  restart     Stop and start the whole local DuckLM stack
  status      Print process and HTTP health status
  logs        Show DuckLM API and llama-server logs; use --follow/-f and --lines N
  help        Show this help
 Environment:
  DUCK_API_HOST       API bind host, default 127.0.0.1
  DUCK_API_PORT       API port, default 8000
  DUCK_API_PID_FILE   API PID file path
  DUCK_API_LOG_FILE   API log file path
  DUCK_API_COMMAND    API command override, default ".venv/bin/python -m duck_core.api"
 MTP llama-server environment is handled by scripts/llama/start_mtp_main.sh.
 EOF
 }
 # Succeed only when the PID file exists, holds a decimal PID, and that
 # process can be signalled.
 api_is_running() {
  local recorded_pid
  if [[ ! -f "${API_PID_FILE}" ]]; then
    return 1
  fi
  recorded_pid="$(<"${API_PID_FILE}")"
  if [[ ! "${recorded_pid}" =~ ^[0-9]+$ ]]; then
    return 1
  fi
  # Signal 0 delivers nothing; it only probes the process.
  kill -0 "${recorded_pid}" 2>/dev/null
 }
 # Print the contents of the API PID file; print nothing when it is absent.
 api_pid_value() {
  [[ ! -f "${API_PID_FILE}" ]] || cat "${API_PID_FILE}"
 }
 # Launch the DuckLM API in the background, record its PID and poll /health.
 #
 # Returns 0 when the process is alive shortly after launch (even if /health is
 # not answering yet) or when it was already running; returns 1 when the
 # process died immediately.
 #
 # The PID file and the log are opened relative to the caller's working
 # directory, the same base used by mkdir above and by api_is_running. Only
 # the API command itself runs inside ROOT_DIR, so relative DUCK_API_PID_FILE /
 # DUCK_API_LOG_FILE values stay consistent across every command.
 start_api() {
  if api_is_running; then
    echo "DuckLM API already running: pid=$(api_pid_value)"
    return 0
  fi
  mkdir -p "$(dirname "${API_PID_FILE}")" "$(dirname "${API_LOG_FILE}")"
  rm -f "${API_PID_FILE}"
  local command_string="${DUCK_API_COMMAND:-.venv/bin/python -m duck_core.api}"
  # Run under setsid when it is available, otherwise plain nohup.
  local launcher=(nohup)
  if command -v setsid >/dev/null 2>&1; then
    launcher+=(setsid)
  fi
  echo "Starting DuckLM API..."
  echo "Command: ${command_string}" >> "${API_LOG_FILE}"
  (
    # Open the log and detach stdin BEFORE cd so a relative log path resolves
    # against the caller's directory, not ROOT_DIR.
    exec >> "${API_LOG_FILE}" 2>&1 < /dev/null
    cd "${ROOT_DIR}"
    exec "${launcher[@]}" bash -lc "${command_string}"
  ) &
  # The subshell replaces itself via exec, so $! is the launcher's PID.
  echo "$!" > "${API_PID_FILE}"
  sleep 0.2
  if api_is_running; then
    echo "DuckLM API started: pid=$(api_pid_value)"
    echo "WebChat: ${API_URL}/"
    echo "Log: ${API_LOG_FILE}"
    # Poll /health for up to about five seconds (20 x 0.25 s).
    for _ in {1..20}; do
      if command -v curl >/dev/null 2>&1 && curl --noproxy "*" -fsS "${API_URL}/health" >/dev/null 2>&1; then
        echo "DuckLM API health: ok (${API_URL})"
        return 0
      fi
      sleep 0.25
    done
    echo "DuckLM API health: not ready yet (${API_URL})"
    return 0
  fi
  echo "DuckLM API failed to start. See ${API_LOG_FILE}" >&2
  rm -f "${API_PID_FILE}"
  return 1
 }
 # Stop the DuckLM API: send SIGTERM, wait up to 30 x 0.2 s for it to exit,
 # then fall back to SIGKILL. The PID file is removed in every path.
 stop_api() {
  local api_pid attempt
  if ! api_is_running; then
    rm -f "${API_PID_FILE}"
    echo "DuckLM API not running"
    return 0
  fi
  api_pid="$(api_pid_value)"
  echo "Stopping DuckLM API: pid=${api_pid}"
  kill "${api_pid}" 2>/dev/null || true
  for ((attempt = 0; attempt < 30; attempt++)); do
    if kill -0 "${api_pid}" 2>/dev/null; then
      # Still alive: wait and probe again.
      sleep 0.2
      continue
    fi
    rm -f "${API_PID_FILE}"
    echo "DuckLM API stopped"
    return 0
  done
  echo "DuckLM API did not stop after SIGTERM; sending SIGKILL"
  kill -9 "${api_pid}" 2>/dev/null || true
  rm -f "${API_PID_FILE}"
  echo "DuckLM API stopped"
 }
 # Bring up the llama-server first, then the API, and finish with a status
 # report whose exit code becomes the function's result.
 start_stack() {
  "${LLAMA_SCRIPT}" start
  start_api
  printf '\n%s\n' "Status:"
  status_stack
 }
 # Stop the whole stack: the DuckLM API first, then the llama-server via its
 # own management script.
 stop_stack() {
  stop_api
  "${LLAMA_SCRIPT}" stop
 }
 # Report API and llama-server state.
 # Exit code: 3 when the API is down; otherwise the llama-server status code.
 status_stack() {
  local exit_code=0
  local backend_rc=0
  local api_pid
  if ! api_is_running; then
    exit_code=3
    if [[ -f "${API_PID_FILE}" ]]; then
      echo "DuckLM API not running; removing stale pid file ${API_PID_FILE}"
      rm -f "${API_PID_FILE}"
    else
      echo "DuckLM API not running"
    fi
  else
    api_pid="$(api_pid_value)"
    echo "DuckLM API running: pid=${api_pid}"
    if command -v curl >/dev/null 2>&1 && curl --noproxy "*" -fsS "${API_URL}/health" >/dev/null 2>&1; then
      echo "DuckLM API health: ok (${API_URL})"
    else
      echo "DuckLM API health: not ready (${API_URL})"
    fi
  fi
  # Capture the backend status without tripping errexit.
  "${LLAMA_SCRIPT}" status || backend_rc=$?
  # An API failure takes precedence over the backend's code.
  if [[ "${exit_code}" == "0" ]]; then
    exit_code="${backend_rc}"
  fi
  return "${exit_code}"
 }
 # Show the API and llama-server logs.
 # Arguments: the original script arguments (the leading command word is
 # dropped), optionally followed by -f/--follow and --lines N (default 100).
 # Returns 2 on an unknown argument.
 logs_stack() {
  local tail_count=100
  local follow_mode=0
  local backend_log="${DUCK_LLAMA_LOG_FILE:-${ROOT_DIR}/data/llama-mtp.log}"
  shift || true
  while (( $# > 0 )); do
    case "$1" in
      --lines)
        tail_count="${2:?--lines requires a value}"
        shift 2
        ;;
      -f|--follow)
        follow_mode=1
        shift
        ;;
      *)
        echo "Unknown logs argument: $1" >&2
        return 2
        ;;
    esac
  done
  mkdir -p "$(dirname "${API_LOG_FILE}")"
  touch "${API_LOG_FILE}"
  if [[ "${follow_mode}" != "1" ]]; then
    echo "==> DuckLM API log: ${API_LOG_FILE} <=="
    tail -n "${tail_count}" "${API_LOG_FILE}"
    echo
    echo "==> llama-server log: ${backend_log} <=="
    "${LLAMA_SCRIPT}" logs --lines "${tail_count}"
  else
    # Follow both files with a single tail so their output interleaves.
    mkdir -p "$(dirname "${backend_log}")"
    touch "${backend_log}"
    tail -n "${tail_count}" -f "${API_LOG_FILE}" "${backend_log}"
  fi
 }
 # Dispatch the requested command; unknown commands print usage and exit 2.
 case "${ACTION}" in
  start) start_stack ;;
  stop) stop_stack ;;
  restart)
    stop_stack
    start_stack
    ;;
  status) status_stack ;;
  logs) logs_stack "$@" ;;
  help|-h|--help) usage ;;
  *)
    echo "Unknown command: ${ACTION}" >&2
    usage >&2
    exit 2
    ;;
 esac
--- a/scripts/duck.sh
+++ b/scripts/duck.sh
@ -0,0 +1,262 @@
 #!/usr/bin/env bash
 # Control script for the local stack: it drives scripts/llama/start_main.sh
 # and a background DuckLM API process.
 set -euo pipefail
 # Repository root: the parent of the directory holding this script.
 ROOT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)"
 # Variables whose values from the calling environment must win over .env.
 ENV_KEYS=(
  DUCK_LLAMA_SERVER_BIN
  DUCK_MAIN_MODEL_PATH
  DUCK_MAIN_PORT
  DUCK_CTX_SIZE
  DUCK_N_GPU_LAYERS
  DUCK_LLAMA_DEVICE
  DUCK_PARALLEL
  DUCK_LLAMA_PID_FILE
  DUCK_LLAMA_LOG_FILE
  DUCK_LLAMA_EXTRA_ARGS
  DUCK_HOST
  DUCK_API_HOST
  DUCK_API_PORT
  DUCK_API_PID_FILE
  DUCK_API_LOG_FILE
  DUCK_API_COMMAND
 )
 # Snapshot the keys that are already set (even if empty) BEFORE sourcing .env.
 declare -A ENV_OVERRIDES=()
 for key in "${ENV_KEYS[@]}"; do
  if [[ -v "${key}" ]]; then
    ENV_OVERRIDES["${key}"]="${!key}"
  fi
 done
 # Load .env with auto-export so child scripts inherit its values.
 if [[ -f "${ROOT_DIR}/.env" ]]; then
  set -a
  # shellcheck disable=SC1091
  source "${ROOT_DIR}/.env"
  set +a
 fi
 # Re-apply the snapshot so caller-supplied values override .env.
 for key in "${!ENV_OVERRIDES[@]}"; do
  export "${key}=${ENV_OVERRIDES[${key}]}"
 done
 # First argument selects the command; defaults to "start".
 ACTION="${1:-start}"
 API_PID_FILE="${DUCK_API_PID_FILE:-${ROOT_DIR}/data/duck-api.pid}"
 API_LOG_FILE="${DUCK_API_LOG_FILE:-${ROOT_DIR}/data/duck-api.log}"
 API_URL="http://${DUCK_API_HOST:-127.0.0.1}:${DUCK_API_PORT:-8000}"
 # Script that manages the llama-server process.
 LLAMA_SCRIPT="${ROOT_DIR}/scripts/llama/start_main.sh"
 # Print the command and environment reference for this script to stdout.
 # The heredoc delimiter is quoted, so the text is emitted without expansion.
 usage() {
  cat <<'EOF'
 Usage: scripts/duck.sh <command>
 Commands:
  start       Start llama-server and DuckLM API in the background
  stop        Stop DuckLM API and managed llama-server
  restart     Stop and start the whole local DuckLM stack
  status      Print process and HTTP health status
  logs        Show DuckLM API and llama-server logs; use --follow/-f and --lines N
  help        Show this help
 Environment:
  DUCK_API_HOST       API bind host, default 127.0.0.1
  DUCK_API_PORT       API port, default 8000
  DUCK_API_PID_FILE   API PID file path
  DUCK_API_LOG_FILE   API log file path
  DUCK_API_COMMAND    API command override, default ".venv/bin/python -m duck_core.api"
 llama-server environment is handled by scripts/llama/start_main.sh.
 EOF
 }
 # Succeed only when the PID file exists, holds a decimal PID, and that
 # process can be signalled.
 api_is_running() {
  local recorded_pid
  if [[ ! -f "${API_PID_FILE}" ]]; then
    return 1
  fi
  recorded_pid="$(<"${API_PID_FILE}")"
  if [[ ! "${recorded_pid}" =~ ^[0-9]+$ ]]; then
    return 1
  fi
  # Signal 0 delivers nothing; it only probes the process.
  kill -0 "${recorded_pid}" 2>/dev/null
 }
 # Print the contents of the API PID file; print nothing when it is absent.
 api_pid_value() {
  [[ ! -f "${API_PID_FILE}" ]] || cat "${API_PID_FILE}"
 }
 # Launch the DuckLM API in the background, record its PID and poll /health.
 #
 # Returns 0 when the process is alive shortly after launch (even if /health is
 # not answering yet) or when it was already running; returns 1 when the
 # process died immediately.
 #
 # The PID file and the log are opened relative to the caller's working
 # directory, the same base used by mkdir above and by api_is_running. Only
 # the API command itself runs inside ROOT_DIR, so relative DUCK_API_PID_FILE /
 # DUCK_API_LOG_FILE values stay consistent across every command.
 start_api() {
  if api_is_running; then
    echo "DuckLM API already running: pid=$(api_pid_value)"
    return 0
  fi
  mkdir -p "$(dirname "${API_PID_FILE}")" "$(dirname "${API_LOG_FILE}")"
  rm -f "${API_PID_FILE}"
  local command_string="${DUCK_API_COMMAND:-.venv/bin/python -m duck_core.api}"
  # Run under setsid when it is available, otherwise plain nohup.
  local launcher=(nohup)
  if command -v setsid >/dev/null 2>&1; then
    launcher+=(setsid)
  fi
  echo "Starting DuckLM API..."
  echo "Command: ${command_string}" >> "${API_LOG_FILE}"
  (
    # Open the log and detach stdin BEFORE cd so a relative log path resolves
    # against the caller's directory, not ROOT_DIR.
    exec >> "${API_LOG_FILE}" 2>&1 < /dev/null
    cd "${ROOT_DIR}"
    exec "${launcher[@]}" bash -lc "${command_string}"
  ) &
  # The subshell replaces itself via exec, so $! is the launcher's PID.
  echo "$!" > "${API_PID_FILE}"
  sleep 0.2
  if api_is_running; then
    echo "DuckLM API started: pid=$(api_pid_value)"
    echo "WebChat: ${API_URL}/"
    echo "Log: ${API_LOG_FILE}"
    # Poll /health for up to about five seconds (20 x 0.25 s).
    for _ in {1..20}; do
      if command -v curl >/dev/null 2>&1 && curl --noproxy "*" -fsS "${API_URL}/health" >/dev/null 2>&1; then
        echo "DuckLM API health: ok (${API_URL})"
        return 0
      fi
      sleep 0.25
    done
    echo "DuckLM API health: not ready yet (${API_URL})"
    return 0
  fi
  echo "DuckLM API failed to start. See ${API_LOG_FILE}" >&2
  rm -f "${API_PID_FILE}"
  return 1
 }
 # Stop the DuckLM API: send SIGTERM, wait up to 30 x 0.2 s for it to exit,
 # then fall back to SIGKILL. The PID file is removed in every path.
 stop_api() {
  local api_pid attempt
  if ! api_is_running; then
    rm -f "${API_PID_FILE}"
    echo "DuckLM API not running"
    return 0
  fi
  api_pid="$(api_pid_value)"
  echo "Stopping DuckLM API: pid=${api_pid}"
  kill "${api_pid}" 2>/dev/null || true
  for ((attempt = 0; attempt < 30; attempt++)); do
    if kill -0 "${api_pid}" 2>/dev/null; then
      # Still alive: wait and probe again.
      sleep 0.2
      continue
    fi
    rm -f "${API_PID_FILE}"
    echo "DuckLM API stopped"
    return 0
  done
  echo "DuckLM API did not stop after SIGTERM; sending SIGKILL"
  kill -9 "${api_pid}" 2>/dev/null || true
  rm -f "${API_PID_FILE}"
  echo "DuckLM API stopped"
 }
 # Bring up the llama-server first, then the API, and finish with a status
 # report whose exit code becomes the function's result.
 start_stack() {
  "${LLAMA_SCRIPT}" start
  start_api
  printf '\n%s\n' "Status:"
  status_stack
 }
 # Stop the whole stack: the DuckLM API first, then the llama-server via its
 # own management script.
 stop_stack() {
  stop_api
  "${LLAMA_SCRIPT}" stop
 }
 # Report API and llama-server state.
 # Exit code: 3 when the API is down; otherwise the llama-server status code.
 status_stack() {
  local exit_code=0
  local backend_rc=0
  local api_pid
  if ! api_is_running; then
    exit_code=3
    if [[ -f "${API_PID_FILE}" ]]; then
      echo "DuckLM API not running; removing stale pid file ${API_PID_FILE}"
      rm -f "${API_PID_FILE}"
    else
      echo "DuckLM API not running"
    fi
  else
    api_pid="$(api_pid_value)"
    echo "DuckLM API running: pid=${api_pid}"
    if command -v curl >/dev/null 2>&1 && curl --noproxy "*" -fsS "${API_URL}/health" >/dev/null 2>&1; then
      echo "DuckLM API health: ok (${API_URL})"
    else
      echo "DuckLM API health: not ready (${API_URL})"
    fi
  fi
  # Capture the backend status without tripping errexit.
  "${LLAMA_SCRIPT}" status || backend_rc=$?
  # An API failure takes precedence over the backend's code.
  if [[ "${exit_code}" == "0" ]]; then
    exit_code="${backend_rc}"
  fi
  return "${exit_code}"
 }
 # Show the API and llama-server logs.
 # Arguments: the original script arguments (the leading command word is
 # dropped), optionally followed by -f/--follow and --lines N (default 100).
 # Returns 2 on an unknown argument.
 logs_stack() {
  local tail_count=100
  local follow_mode=0
  local backend_log="${DUCK_LLAMA_LOG_FILE:-${ROOT_DIR}/data/llama-main.log}"
  shift || true
  while (( $# > 0 )); do
    case "$1" in
      --lines)
        tail_count="${2:?--lines requires a value}"
        shift 2
        ;;
      -f|--follow)
        follow_mode=1
        shift
        ;;
      *)
        echo "Unknown logs argument: $1" >&2
        return 2
        ;;
    esac
  done
  mkdir -p "$(dirname "${API_LOG_FILE}")"
  touch "${API_LOG_FILE}"
  if [[ "${follow_mode}" != "1" ]]; then
    echo "==> DuckLM API log: ${API_LOG_FILE} <=="
    tail -n "${tail_count}" "${API_LOG_FILE}"
    echo
    echo "==> llama-server log: ${backend_log} <=="
    "${LLAMA_SCRIPT}" logs --lines "${tail_count}"
  else
    # Follow both files with a single tail so their output interleaves.
    mkdir -p "$(dirname "${backend_log}")"
    touch "${backend_log}"
    tail -n "${tail_count}" -f "${API_LOG_FILE}" "${backend_log}"
  fi
 }
 # Dispatch the requested command; unknown commands print usage and exit 2.
 case "${ACTION}" in
  start) start_stack ;;
  stop) stop_stack ;;
  restart)
    stop_stack
    start_stack
    ;;
  status) status_stack ;;
  logs) logs_stack "$@" ;;
  help|-h|--help) usage ;;
  *)
    echo "Unknown command: ${ACTION}" >&2
    usage >&2
    exit 2
    ;;
 esac
--- a/scripts/llama/start_main.sh
+++ b/scripts/llama/start_main.sh
@ -37,6 +37,7 @@ ACTION="${1:-start}"
 PID_FILE="${DUCK_LLAMA_PID_FILE:-${ROOT_DIR}/data/llama-main.pid}"
 LOG_FILE="${DUCK_LLAMA_LOG_FILE:-${ROOT_DIR}/data/llama-main.log}"
 BASE_URL="http://${DUCK_HOST:-127.0.0.1}:${DUCK_MAIN_PORT:-8081}/v1"
 LLAMA_BIN_DIR=""
 resolve_project_path() {
  local value="$1"
@ -125,6 +126,7 @@ start() {
  llama_bin="${DUCK_LLAMA_SERVER_BIN:-llama-server}"
  if [[ "${llama_bin}" == */* ]]; then
    llama_bin="$(resolve_project_path "${llama_bin}")"
    LLAMA_BIN_DIR="$(dirname "${llama_bin}")"
  fi
  model_path="$(resolve_project_path "${DUCK_MAIN_MODEL_PATH}")"
  local command=(
@ -152,9 +154,9 @@ start() {
  echo "Starting llama-server..."
  echo "Command: ${command[*]}" >> "${LOG_FILE}"
  if command -v setsid >/dev/null 2>&1; then
-    nohup setsid "${command[@]}" >> "${LOG_FILE}" 2>&1 &
+    nohup setsid env LD_LIBRARY_PATH="${LLAMA_BIN_DIR}${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}" "${command[@]}" >> "${LOG_FILE}" 2>&1 &
  else
-    nohup "${command[@]}" >> "${LOG_FILE}" 2>&1 &
+    nohup env LD_LIBRARY_PATH="${LLAMA_BIN_DIR}${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}" "${command[@]}" >> "${LOG_FILE}" 2>&1 &
  fi
  local pid=$!
  echo "${pid}" > "${PID_FILE}"
--- a/scripts/llama/start_mtp_main.sh
+++ b/scripts/llama/start_mtp_main.sh
@ -0,0 +1,271 @@
 #!/usr/bin/env bash
 # Manage a background llama-server launched with --spec-type draft-mtp.
 set -euo pipefail
 # Repository root: two directory levels above this script.
 ROOT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")/../.." && pwd)"
 # Variables whose values from the calling environment must win over .env.
 ENV_KEYS=(
  DUCK_LLAMA_SERVER_BIN
  DUCK_MTP_MODEL_PATH
  DUCK_MAIN_PORT
  DUCK_CTX_SIZE
  DUCK_N_GPU_LAYERS
  DUCK_LLAMA_DEVICE
  DUCK_PARALLEL
  DUCK_LLAMA_PID_FILE
  DUCK_LLAMA_LOG_FILE
  DUCK_MTP_FLAGS
  DUCK_HOST
 )
 # Snapshot the keys that are already set (even if empty) BEFORE sourcing .env.
 declare -A ENV_OVERRIDES=()
 for key in "${ENV_KEYS[@]}"; do
  if [[ -v "${key}" ]]; then
    ENV_OVERRIDES["${key}"]="${!key}"
  fi
 done
 # Load .env with auto-export so every assignment in it becomes an
 # environment variable.
 if [[ -f "${ROOT_DIR}/.env" ]]; then
  set -a
  # shellcheck disable=SC1091
  source "${ROOT_DIR}/.env"
  set +a
 fi
 # Re-apply the snapshot so caller-supplied values override .env.
 for key in "${!ENV_OVERRIDES[@]}"; do
  export "${key}=${ENV_OVERRIDES[${key}]}"
 done
 # First argument selects the command; defaults to "start".
 ACTION="${1:-start}"
 PID_FILE="${DUCK_LLAMA_PID_FILE:-${ROOT_DIR}/data/llama-mtp.pid}"
 LOG_FILE="${DUCK_LLAMA_LOG_FILE:-${ROOT_DIR}/data/llama-mtp.log}"
 BASE_URL="http://${DUCK_HOST:-127.0.0.1}:${DUCK_MAIN_PORT:-8081}/v1"
 # Filled in by start() when the binary is given as a path; used to extend
 # LD_LIBRARY_PATH with the binary's own directory.
 LLAMA_BIN_DIR=""
 # Print $1 unchanged when it is absolute; otherwise print it anchored at
 # ROOT_DIR with a single leading "./" removed.
 resolve_project_path() {
  local candidate="$1"
  case "${candidate}" in
    /*)
      printf '%s\n' "${candidate}"
      ;;
    *)
      printf '%s\n' "${ROOT_DIR}/${candidate#./}"
      ;;
  esac
 }
 # Print the command and environment reference for this script to stdout.
 # The heredoc delimiter is quoted, so the text is emitted without expansion.
 usage() {
  cat <<'EOF'
 Usage: scripts/llama/start_mtp_main.sh <command>
 Commands:
  start       Start MTP llama-server in the background
  stop        Stop the managed MTP llama-server process
  restart     Stop and start MTP llama-server
  status      Print process and HTTP health status
  logs        Show logs; use --follow/-f and --lines N
  help        Show this help
 Environment:
  DUCK_LLAMA_SERVER_BIN  Path to llama-server binary
  DUCK_MTP_MODEL_PATH    Path to MTP GGUF model
  DUCK_HOST              Bind host, default 127.0.0.1
  DUCK_MAIN_PORT         Port, default 8081
  DUCK_CTX_SIZE          Context size, default 65536
  DUCK_N_GPU_LAYERS      GPU layers, default auto
  DUCK_LLAMA_DEVICE      Device name, for example Vulkan0
  DUCK_PARALLEL          Server slots, default 1
  DUCK_LLAMA_PID_FILE    PID file path
  DUCK_LLAMA_LOG_FILE    Log file path
  DUCK_MTP_FLAGS         Extra MTP llama-server args
 EOF
 }
 # Succeed only when the PID file exists, holds a decimal PID, and that
 # process can be signalled.
 is_running() {
  local recorded_pid
  if [[ ! -f "${PID_FILE}" ]]; then
    return 1
  fi
  recorded_pid="$(<"${PID_FILE}")"
  if [[ ! "${recorded_pid}" =~ ^[0-9]+$ ]]; then
    return 1
  fi
  # Signal 0 delivers nothing; it only probes the process.
  kill -0 "${recorded_pid}" 2>/dev/null
 }
 # Print the contents of the PID file; print nothing when it is absent.
 pid_value() {
  [[ ! -f "${PID_FILE}" ]] || cat "${PID_FILE}"
 }
 # Report whether the managed llama-server is alive and whether its
 # /models endpoint answers. Returns 0 when the process runs, 3 otherwise;
 # a stale PID file is removed along the way.
 status() {
  local live_pid
  if ! is_running; then
    if [[ -f "${PID_FILE}" ]]; then
      echo "llama-server not running; removing stale pid file ${PID_FILE}"
      rm -f "${PID_FILE}"
    else
      echo "llama-server not running"
    fi
    return 3
  fi
  live_pid="$(pid_value)"
  echo "llama-server running: pid=${live_pid}"
  if command -v curl >/dev/null 2>&1 && curl --noproxy "*" -fsS "${BASE_URL}/models" >/dev/null 2>&1; then
    echo "HTTP health: ok (${BASE_URL})"
  else
    echo "HTTP health: not ready (${BASE_URL})"
  fi
  return 0
 }
 # Start the MTP llama-server in the background and record its PID.
 #
 # Requires DUCK_MTP_MODEL_PATH. Returns 0 when the server is already running
 # or is alive shortly after launch; returns 1 when the binary's --help output
 # does not mention draft-mtp or the process dies immediately.
 #
 # LD_LIBRARY_PATH handling: the binary's directory is prepended only when the
 # binary was given as a path. An empty LLAMA_BIN_DIR must never be joined
 # with ":" because a zero-length LD_LIBRARY_PATH entry makes the dynamic
 # loader search the current working directory.
 start() {
  if is_running; then
    echo "MTP llama-server already running: pid=$(pid_value)"
    return 0
  fi
  : "${DUCK_MTP_MODEL_PATH:?DUCK_MTP_MODEL_PATH is required}"
  mkdir -p "$(dirname "${PID_FILE}")" "$(dirname "${LOG_FILE}")"
  rm -f "${PID_FILE}"
  local llama_bin mtp_model_path
  llama_bin="${DUCK_LLAMA_SERVER_BIN:-llama-server}"
  if [[ "${llama_bin}" == */* ]]; then
    llama_bin="$(resolve_project_path "${llama_bin}")"
    LLAMA_BIN_DIR="$(dirname "${llama_bin}")"
  fi
  mtp_model_path="$(resolve_project_path "${DUCK_MTP_MODEL_PATH}")"
  # Start from the inherited search path and prepend the binary directory
  # only when there is one, so no empty path element is ever produced.
  local library_path="${LD_LIBRARY_PATH:-}"
  if [[ -n "${LLAMA_BIN_DIR}" ]]; then
    library_path="${LLAMA_BIN_DIR}${library_path:+:${library_path}}"
  fi
  # Probe the build before launching: --help must mention draft-mtp.
  local help_text
  help_text="$(LD_LIBRARY_PATH="${library_path}" "${llama_bin}" --help 2>&1 || true)"
  if ! grep -qi "draft-mtp" <<< "${help_text}"; then
    echo "This llama-server build does not expose draft-mtp speculative decoding." >&2
    return 1
  fi
  local command=(
    "${llama_bin}"
    -m "${mtp_model_path}"
    --alias local-main
    --host "${DUCK_HOST:-127.0.0.1}"
    --port "${DUCK_MAIN_PORT:-8081}"
    -c "${DUCK_CTX_SIZE:-65536}"
    --parallel "${DUCK_PARALLEL:-1}"
    -ngl "${DUCK_N_GPU_LAYERS:-auto}"
    --flash-attn on
    --cache-prompt
    --metrics
    --spec-type draft-mtp
  )
  if [[ -n "${DUCK_LLAMA_DEVICE:-}" ]]; then
    command+=(--device "${DUCK_LLAMA_DEVICE}")
  fi
  if [[ -n "${DUCK_MTP_FLAGS:-}" ]]; then
    # Intentional word splitting: DUCK_MTP_FLAGS is a space-separated arg list.
    # shellcheck disable=SC2206
    local extra_args=( ${DUCK_MTP_FLAGS} )
    command+=("${extra_args[@]}")
  fi
  # Run under setsid when it is available, otherwise plain nohup.
  local launcher=(nohup)
  if command -v setsid >/dev/null 2>&1; then
    launcher+=(setsid)
  fi
  echo "Starting MTP llama-server..."
  echo "Command: ${command[*]}" >> "${LOG_FILE}"
  "${launcher[@]}" env LD_LIBRARY_PATH="${library_path}" "${command[@]}" >> "${LOG_FILE}" 2>&1 &
  local pid=$!
  echo "${pid}" > "${PID_FILE}"
  # Give the process a moment to fail fast (bad flags, missing libraries).
  sleep 0.2
  if is_running; then
    echo "MTP llama-server started: pid=${pid}"
    echo "Log: ${LOG_FILE}"
    return 0
  fi
  echo "MTP llama-server failed to start. See ${LOG_FILE}" >&2
  rm -f "${PID_FILE}"
  return 1
 }
 # Stop the managed llama-server: send SIGTERM, wait up to 30 x 0.2 s for it
 # to exit, then fall back to SIGKILL. The PID file is removed in every path.
 stop() {
  local server_pid attempt
  if ! is_running; then
    rm -f "${PID_FILE}"
    echo "llama-server not running"
    return 0
  fi
  server_pid="$(pid_value)"
  echo "Stopping MTP llama-server: pid=${server_pid}"
  kill "${server_pid}" 2>/dev/null || true
  for ((attempt = 0; attempt < 30; attempt++)); do
    if kill -0 "${server_pid}" 2>/dev/null; then
      # Still alive: wait and probe again.
      sleep 0.2
      continue
    fi
    rm -f "${PID_FILE}"
    echo "MTP llama-server stopped"
    return 0
  done
  echo "MTP llama-server did not stop after SIGTERM; sending SIGKILL"
  kill -9 "${server_pid}" 2>/dev/null || true
  rm -f "${PID_FILE}"
  echo "MTP llama-server stopped"
 }
 # Stop the managed llama-server (a no-op when it is not running), then start
 # it again.
 restart() {
  stop
  start
 }
 # Show the llama-server log.
 # Arguments: the original script arguments (the leading command word is
 # dropped), optionally followed by -f/--follow and --lines N (default 100).
 # Returns 2 on an unknown argument.
 logs() {
  local tail_count=100
  local follow_mode=0
  shift || true
  while (( $# > 0 )); do
    case "$1" in
      --lines)
        tail_count="${2:?--lines requires a value}"
        shift 2
        ;;
      -f|--follow)
        follow_mode=1
        shift
        ;;
      *)
        echo "Unknown logs argument: $1" >&2
        return 2
        ;;
    esac
  done
  # Make sure the file exists so tail does not fail on a fresh checkout.
  mkdir -p "$(dirname "${LOG_FILE}")"
  touch "${LOG_FILE}"
  if [[ "${follow_mode}" != "1" ]]; then
    tail -n "${tail_count}" "${LOG_FILE}"
  else
    tail -n "${tail_count}" -f "${LOG_FILE}"
  fi
 }
 # Dispatch the requested command; unknown commands print usage and exit 2.
 case "${ACTION}" in
  start) start ;;
  stop) stop ;;
  restart) restart ;;
  status) status ;;
  logs) logs "$@" ;;
  help|-h|--help) usage ;;
  *)
    echo "Unknown command: ${ACTION}" >&2
    usage >&2
    exit 2
    ;;
 esac
--- a/tests/smoke/test_api_stream_chat.py
+++ b/tests/smoke/test_api_stream_chat.py
@ -56,6 +56,61 @@ def test_stream_chat_endpoint_emits_sse_reasoning_and_content(tmp_path, monkeypa
    assert "answer" in body
 def test_stream_chat_forwards_reasoning_toggle_to_thinker(tmp_path, monkeypatch):
    """The ``reasoning`` field of a stream request must reach the thinker stream call."""
    monkeypatch.setenv("DUCK_DB_PATH", str(tmp_path / "duck.sqlite3"))
    seen = {}
    # Canned non-streaming reply: a directive with no actions.
    directive = json.dumps(
        {
            "kind": "action_directive",
            "intent": "answer directly",
            "risk_level": "none",
            "actions": [],
        }
    )
    async def fake_chat(self, role, messages, temperature=None, max_output_tokens=None, response_format=None):
        return ModelResponse(
            role=role,
            model="local-main",
            content=directive,
            reasoning_content=None,
            raw={},
            latency_ms=1.0,
        )
    async def fake_stream_chat(
        self,
        role,
        messages,
        temperature=None,
        max_output_tokens=None,
        response_format=None,
        reasoning=None,
    ):
        # Record what the endpoint forwarded, then emit one content chunk.
        seen.update(role=role, reasoning=reasoning)
        yield {"type": "content_delta", "delta": "answer"}
    monkeypatch.setattr("duck_core.model_client.ModelClient.chat", fake_chat)
    monkeypatch.setattr("duck_core.model_client.ModelClient.stream_chat", fake_stream_chat)
    request_body = {
        "message": "hello",
        "workspace": "./workspace",
        "debug": True,
        "reasoning": "off",
    }
    client = TestClient(create_app())
    with client.stream("POST", "/v1/chat/stream", json=request_body) as response:
        body = "".join(response.iter_text())
    assert response.status_code == 200
    assert "event: done" in body
    assert seen == {"role": "thinker", "reasoning": "off"}
 def test_stream_chat_runs_memory_policy_and_reflection_after_completion(tmp_path, monkeypatch):
    monkeypatch.setenv("DUCK_DB_PATH", str(tmp_path / "duck.sqlite3"))
--- a/tests/smoke/test_duck_service_script.py
+++ b/tests/smoke/test_duck_service_script.py
@ -0,0 +1,246 @@
 import os
 import subprocess
 import textwrap
 import time
 from pathlib import Path
 def test_duck_script_manages_llama_and_api(tmp_path):
    """Drive scripts/duck.sh through status, start, logs and stop with fake binaries.

    ``start`` and every assertion made while the stack may be running live
    inside the ``try`` block, so the ``finally`` clause stops the background
    processes even when an early assertion fails.
    """
    fake_bin = tmp_path / "llama-server"
    fake_bin.write_text(
        textwrap.dedent(
            """\
            #!/usr/bin/env bash
            echo "fake llama-server $*" >&2
            trap 'exit 0' TERM INT
            while true; do sleep 1; done
            """
        )
    )
    fake_bin.chmod(0o755)
    fake_api = tmp_path / "fake-api.sh"
    fake_api.write_text(
        textwrap.dedent(
            """\
            #!/usr/bin/env bash
            echo "fake duck api $*" >&2
            trap 'exit 0' TERM INT
            while true; do sleep 1; done
            """
        )
    )
    fake_api.chmod(0o755)
    pid_file = tmp_path / "llama.pid"
    log_file = tmp_path / "llama.log"
    api_pid_file = tmp_path / "duck-api.pid"
    api_log_file = tmp_path / "duck-api.log"
    model_path = tmp_path / "model.gguf"
    model_path.write_text("fake")
    env = {
        **os.environ,
        "DUCK_LLAMA_SERVER_BIN": str(fake_bin),
        "DUCK_MAIN_MODEL_PATH": str(model_path),
        "DUCK_LLAMA_PID_FILE": str(pid_file),
        "DUCK_LLAMA_LOG_FILE": str(log_file),
        "DUCK_MAIN_PORT": "18081",
        "DUCK_API_PID_FILE": str(api_pid_file),
        "DUCK_API_LOG_FILE": str(api_log_file),
        "DUCK_API_COMMAND": str(fake_api),
        "DUCK_API_PORT": "18000",
    }
    script = "scripts/duck.sh"
    # Nothing has been started yet: status must report both parts down.
    stopped = subprocess.run([script, "status"], env=env, text=True, capture_output=True)
    assert stopped.returncode == 3
    assert "DuckLM API not running" in stopped.stdout
    assert "llama-server not running" in stopped.stdout
    try:
        started = subprocess.run([script, "start"], env=env, text=True, capture_output=True)
        assert started.returncode == 0
        assert "Starting llama-server" in started.stdout
        assert "Starting DuckLM API" in started.stdout
        assert "Status:" in started.stdout
        assert "DuckLM API running" in started.stdout
        assert "llama-server running" in started.stdout
        assert api_pid_file.exists()
        assert pid_file.exists()
        running = subprocess.run([script, "status"], env=env, text=True, capture_output=True)
        assert running.returncode == 0
        assert "DuckLM API running" in running.stdout
        assert "llama-server running" in running.stdout
        # Give the fake processes a moment to write their first log line.
        time.sleep(0.2)
        logs = subprocess.run(
            [script, "logs", "--lines", "20"], env=env, text=True, capture_output=True
        )
        assert logs.returncode == 0
        assert "DuckLM API log" in logs.stdout
        assert "fake duck api" in logs.stdout
        assert "llama-server log" in logs.stdout
        assert "--alias local-main" in logs.stdout
    finally:
        stopped = subprocess.run([script, "stop"], env=env, text=True, capture_output=True)
        assert stopped.returncode == 0
        assert not api_pid_file.exists()
        assert not pid_file.exists()
 def test_duck_mtp_script_starts_mtp_llama_for_duck_api(tmp_path):
    """scripts/duck-mtp.sh must launch llama-server with the MTP model and flags.

    ``start`` runs inside the ``try`` block so the ``finally`` clause stops any
    background process even when the start assertions fail.
    """
    fake_bin = tmp_path / "llama-server"
    fake_bin.write_text(
        textwrap.dedent(
            """\
            #!/usr/bin/env bash
            if [[ "${1:-}" == "--help" ]]; then
              echo "supports --spec-type draft-mtp"
              exit 0
            fi
            echo "fake mtp llama-server $*" >&2
            trap 'exit 0' TERM INT
            while true; do sleep 1; done
            """
        )
    )
    fake_bin.chmod(0o755)
    fake_api = tmp_path / "fake-api.sh"
    fake_api.write_text(
        textwrap.dedent(
            """\
            #!/usr/bin/env bash
            echo "fake duck api $*" >&2
            trap 'exit 0' TERM INT
            while true; do sleep 1; done
            """
        )
    )
    fake_api.chmod(0o755)
    main_model_path = tmp_path / "main.gguf"
    mtp_model_path = tmp_path / "mtp.gguf"
    main_model_path.write_text("main")
    mtp_model_path.write_text("mtp")
    pid_file = tmp_path / "llama-mtp.pid"
    log_file = tmp_path / "llama-mtp.log"
    api_pid_file = tmp_path / "duck-api-mtp.pid"
    api_log_file = tmp_path / "duck-api-mtp.log"
    env = {
        **os.environ,
        "DUCK_LLAMA_SERVER_BIN": str(fake_bin),
        "DUCK_MAIN_MODEL_PATH": str(main_model_path),
        "DUCK_MTP_MODEL_PATH": str(mtp_model_path),
        "DUCK_LLAMA_PID_FILE": str(pid_file),
        "DUCK_LLAMA_LOG_FILE": str(log_file),
        "DUCK_MAIN_PORT": "18081",
        "DUCK_API_PID_FILE": str(api_pid_file),
        "DUCK_API_LOG_FILE": str(api_log_file),
        "DUCK_API_COMMAND": str(fake_api),
        "DUCK_API_PORT": "18000",
    }
    script = "scripts/duck-mtp.sh"
    try:
        started = subprocess.run([script, "start"], env=env, text=True, capture_output=True)
        assert started.returncode == 0
        assert "Starting MTP llama-server" in started.stdout
        assert "DuckLM API running" in started.stdout
        assert "llama-server running" in started.stdout
        logs = subprocess.run(
            [script, "logs", "--lines", "30"], env=env, text=True, capture_output=True
        )
        assert logs.returncode == 0
        assert "--alias local-main" in logs.stdout
        assert "--spec-type draft-mtp" in logs.stdout
        # Only the MTP model may appear on the command line.
        assert f"-m {mtp_model_path}" in logs.stdout
        assert "--model-draft" not in logs.stdout
        assert str(main_model_path) not in logs.stdout
    finally:
        stopped = subprocess.run([script, "stop"], env=env, text=True, capture_output=True)
        assert stopped.returncode == 0
        assert not api_pid_file.exists()
        assert not pid_file.exists()
 def test_duck_mtp_script_sets_llama_bin_dir_library_path_for_help_check(tmp_path):
    """The binary's directory must be on LD_LIBRARY_PATH for --help and for launch.

    The fake binary exits 127 unless its own directory is listed in
    ``LD_LIBRARY_PATH``. ``start`` runs inside the ``try`` block so the
    ``finally`` clause stops the llama-server even when the API start fails.
    """
    bin_dir = tmp_path / "build" / "bin"
    bin_dir.mkdir(parents=True)
    fake_bin = bin_dir / "llama-server"
    fake_bin.write_text(
        textwrap.dedent(
            """\
            #!/usr/bin/env bash
            case ":${LD_LIBRARY_PATH:-}:" in
              *":$(dirname "$0"):"*) ;;
              *)
                echo "error while loading shared libraries: libllama-common.so.0" >&2
                exit 127
                ;;
            esac
            if [[ "${1:-}" == "--help" ]]; then
              echo "supports --spec-type draft-mtp"
              exit 0
            fi
            echo "fake mtp llama-server $*" >&2
            trap 'exit 0' TERM INT
            while true; do sleep 1; done
            """
        )
    )
    fake_bin.chmod(0o755)
    fake_api = tmp_path / "fake-api.sh"
    fake_api.write_text(
        textwrap.dedent(
            """\
            #!/usr/bin/env bash
            trap 'exit 0' TERM INT
            while true; do sleep 1; done
            """
        )
    )
    fake_api.chmod(0o755)
    main_model_path = tmp_path / "main.gguf"
    mtp_model_path = tmp_path / "mtp.gguf"
    main_model_path.write_text("main")
    mtp_model_path.write_text("mtp")
    env = {
        **os.environ,
        "LD_LIBRARY_PATH": "",
        "DUCK_LLAMA_SERVER_BIN": str(fake_bin),
        "DUCK_MAIN_MODEL_PATH": str(main_model_path),
        "DUCK_MTP_MODEL_PATH": str(mtp_model_path),
        "DUCK_LLAMA_PID_FILE": str(tmp_path / "llama-mtp.pid"),
        "DUCK_LLAMA_LOG_FILE": str(tmp_path / "llama-mtp.log"),
        "DUCK_MAIN_PORT": "18081",
        "DUCK_API_PID_FILE": str(tmp_path / "duck-api-mtp.pid"),
        "DUCK_API_LOG_FILE": str(tmp_path / "duck-api-mtp.log"),
        "DUCK_API_COMMAND": str(fake_api),
        "DUCK_API_PORT": "18000",
    }
    try:
        started = subprocess.run(
            ["scripts/duck-mtp.sh", "start"], env=env, text=True, capture_output=True
        )
        assert started.returncode == 0
        logs = subprocess.run(
            ["scripts/duck-mtp.sh", "logs", "--lines", "30"],
            env=env,
            text=True,
            capture_output=True,
        )
        assert logs.returncode == 0
        assert "--spec-type draft-mtp" in logs.stdout
        assert "--model-draft" not in logs.stdout
    finally:
        subprocess.run(["scripts/duck-mtp.sh", "stop"], env=env, text=True, capture_output=True)
--- a/tests/smoke/test_llama_service_script.py
+++ b/tests/smoke/test_llama_service_script.py
@ -55,3 +55,54 @@ def test_start_main_script_manages_pid_status_stop_and_logs(tmp_path):
        stopped = subprocess.run([script, "stop"], env=env, text=True, capture_output=True)
        assert stopped.returncode == 0
        assert not pid_file.exists()
 def test_start_main_script_sets_llama_bin_dir_library_path(tmp_path):
    """start_main.sh must put the binary's directory on LD_LIBRARY_PATH.

    The fake binary exits 127 with a loader-style error unless its own
    directory is listed in ``LD_LIBRARY_PATH``.
    """
    server_dir = tmp_path / "build" / "bin"
    server_dir.mkdir(parents=True)
    server_stub = server_dir / "llama-server"
    server_stub.write_text(
        textwrap.dedent(
            """\
            #!/usr/bin/env bash
            case ":${LD_LIBRARY_PATH:-}:" in
              *":$(dirname "$0"):"*) ;;
              *)
                echo "error while loading shared libraries: libllama-common.so.0" >&2
                exit 127
                ;;
            esac
            echo "fake llama-server $*" >&2
            trap 'exit 0' TERM INT
            while true; do sleep 1; done
            """
        )
    )
    server_stub.chmod(0o755)
    gguf = tmp_path / "model.gguf"
    gguf.write_text("fake")
    env = dict(os.environ)
    env.update(
        {
            "LD_LIBRARY_PATH": "",
            "DUCK_LLAMA_SERVER_BIN": str(server_stub),
            "DUCK_MAIN_MODEL_PATH": str(gguf),
            "DUCK_LLAMA_PID_FILE": str(tmp_path / "llama.pid"),
            "DUCK_LLAMA_LOG_FILE": str(tmp_path / "llama.log"),
            "DUCK_MAIN_PORT": "18081",
        }
    )
    script = "scripts/llama/start_main.sh"
    def run_script(*script_args):
        # Every invocation shares the same environment and capture settings.
        return subprocess.run([script, *script_args], env=env, text=True, capture_output=True)
    started = run_script("start")
    assert started.returncode == 0
    try:
        logs = run_script("logs", "--lines", "20")
        assert logs.returncode == 0
        assert "--alias local-main" in logs.stdout
        assert "error while loading shared libraries" not in logs.stdout
    finally:
        run_script("stop")
--- a/tests/smoke/test_model_client.py
+++ b/tests/smoke/test_model_client.py
@ -57,6 +57,32 @@ async def test_model_client_preserves_reasoning_content(monkeypatch):
    assert response.reasoning_content == "private reasoning"
@pytest.mark.asyncio
 async def test_model_client_adds_request_reasoning_options(monkeypatch):
    payloads = []
    async def fake_post(self, url, json):
        payloads.append(json)
        return httpx.Response(
            200,
            json={"choices": [{"message": {"role": "assistant", "content": "ok"}}]},
            request=httpx.Request("POST", url),
        )
    monkeypatch.setattr(httpx.AsyncClient, "post", fake_post)
    client = ModelClient("config/models.yaml")
    await client.chat("thinker", [{"role": "user", "content": "hello"}], reasoning="on")
    await client.chat("thinker", [{"role": "user", "content": "hello"}], reasoning="off")
    assert payloads[0]["reasoning_format"] == "deepseek"
    assert payloads[0]["chat_template_kwargs"] == {"enable_thinking": True}
    assert "thinking_budget_tokens" not in payloads[0]
    assert payloads[1]["reasoning_format"] == "deepseek"
    assert payloads[1]["chat_template_kwargs"] == {"enable_thinking": False}
    assert payloads[1]["thinking_budget_tokens"] == 0
@pytest.mark.asyncio
 async def test_model_client_stream_chat_yields_reasoning_then_content(monkeypatch):
    class FakeStreamResponse: