Add DuckLM service scripts and utility model benchmark

2026-05-22 07:27:59 +08:00 · 2026-05-22 07:27:59 +08:00 · ff98224eb6
parent e6b82f0376
commit ff98224eb6
24 changed files with 2492 additions and 74 deletions
--- a/CURRENT_STATE.md
+++ b/CURRENT_STATE.md
@@ -106,8 +106,7 @@ git diff --check

 ```bash
 . .venv/bin/activate
-bash scripts/llama/start_main.sh start
-python -m duck_core.api
+bash scripts/duck.sh start
 ```

 Открыть WebChat:
@@ -123,6 +122,24 @@ curl --noproxy '*' http://127.0.0.1:8000/health
 curl --noproxy '*' http://127.0.0.1:8000/v1/models/roles
 ```

+Управление процессами:
+
+```bash
+bash scripts/duck.sh status
+bash scripts/duck.sh logs --follow
+bash scripts/duck.sh restart
+bash scripts/duck.sh stop
+```
+
+MTP/speculative-вариант:
+
+```bash
+bash scripts/duck.sh stop
+bash scripts/duck-mtp.sh start
+bash scripts/duck-mtp.sh status
+bash scripts/duck-mtp.sh logs --follow
+```
+
 ## Что делать следующим

 1. Пройти live E2E checklist в WebChat на реальной модели.
--- a/Makefile
+++ b/Makefile
@@ -1,8 +1,32 @@
 duck-up:
-	docker compose -f docker-compose.memory.yml up -d
-	@echo "Memory services started."
-	@echo "Start llama-server:"
-	@echo "bash scripts/llama/start_main.sh start"
+	bash scripts/duck.sh start
+
+duck-stop:
+	bash scripts/duck.sh stop
+
+duck-restart:
+	bash scripts/duck.sh restart
+
+duck-status:
+	bash scripts/duck.sh status
+
+duck-logs:
+	bash scripts/duck.sh logs --follow
+
+duck-mtp-up:
+	bash scripts/duck-mtp.sh start
+
+duck-mtp-stop:
+	bash scripts/duck-mtp.sh stop
+
+duck-mtp-restart:
+	bash scripts/duck-mtp.sh restart
+
+duck-mtp-status:
+	bash scripts/duck-mtp.sh status
+
+duck-mtp-logs:
+	bash scripts/duck-mtp.sh logs --follow

 duck-llama-main:
 	bash scripts/llama/start_main.sh start
@@ -26,11 +50,7 @@ duck-api:
 	python3 -m duck_core.api

 duck-dev:
-	docker compose -f docker-compose.memory.yml up -d
-	@echo "Start llama-server in another terminal:"
-	@echo "bash scripts/llama/start_main.sh start"
-	@echo "Then run:"
-	@echo "make duck-api"
+	bash scripts/duck.sh start
 	@echo "Open:"
 	@echo "http://127.0.0.1:8000/"

--- a/README.md
+++ b/README.md
@@ -9,14 +9,25 @@ python3 -m venv .venv
 . .venv/bin/activate
 python -m pip install -e ".[dev]"
 cp .env.example .env
-bash scripts/llama/start_main.sh
-```
-
-In another terminal:
-
-```bash
-. .venv/bin/activate
-python -m duck_core.api
+bash scripts/duck.sh start
 ```

 Open `http://127.0.0.1:8000/`.
+
+Useful commands:
+
+```bash
+bash scripts/duck.sh status
+bash scripts/duck.sh logs --follow
+bash scripts/duck.sh restart
+bash scripts/duck.sh stop
+```
+
+MTP/speculative variant:
+
+```bash
+bash scripts/duck-mtp.sh start
+bash scripts/duck-mtp.sh status
+bash scripts/duck-mtp.sh logs --follow
+bash scripts/duck-mtp.sh stop
+```
--- a/docs/bench/utility_model_bench_20260522_044407.json
+++ b/docs/bench/utility_model_bench_20260522_044407.json
@@ -0,0 +1,80 @@
+[
+  {
+    "model": "Qwen3.6-35B nonMTP GPU baseline",
+    "quality": 0.971,
+    "avg_latency_seconds": 17.935,
+    "avg_tokens_per_second": 4.51,
+    "cases": [
+      {
+        "role": "action",
+        "case": "direct_answer_no_tools",
+        "score": 1.0,
+        "note": "ok",
+        "elapsed_seconds": 15.321,
+        "completion_tokens": 45,
+        "tokens_per_second": 2.94,
+        "content_preview": "{\n  \"kind\": \"action_directive\",\n  \"intent\": \"direct_answer\",\n  \"risk_level\": \"low\",\n  \"actions\": []\n}\n"
+      },
+      {
+        "role": "action",
+        "case": "read_specific_file",
+        "score": 1.0,
+        "note": "ok",
+        "elapsed_seconds": 19.638,
+        "completion_tokens": 81,
+        "tokens_per_second": 4.12,
+        "content_preview": "{\n  \"kind\": \"action_directive\",\n  \"intent\": \"read_file\",\n  \"risk_level\": \"low\",\n  \"actions\": [\n    {\n      \"tool\": \"file_read\",\n      \"args\": {\n        \"path\": \"CURRENT_STATE.md\"\n      }\n    }\n  ]\n}\n"
+      },
+      {
+        "role": "memory_policy",
+        "case": "store_user_preference",
+        "score": 1.0,
+        "note": "ok",
+        "elapsed_seconds": 18.421,
+        "completion_tokens": 88,
+        "tokens_per_second": 4.78,
+        "content_preview": "{\n  \"should_store\": true,\n  \"memory_type\": \"preference\",\n  \"summary\": \"User prefers responses in Russian and requires explicit confirmation before executing sudo commands.\",\n  \"importance\": 0.9,\n  \"scope\": \"global\",\n  \"metadata\": {\n    \"language\": \"ru\",\n    \"security_policy\": \"sudo_confirmation_required\"\n  }\n}"
+      },
+      {
+        "role": "memory_policy",
+        "case": "ignore_trivial_tool_call",
+        "score": 1.0,
+        "note": "ok",
+        "elapsed_seconds": 14.978,
+        "completion_tokens": 61,
+        "tokens_per_second": 4.07,
+        "content_preview": "{\n  \"should_store\": false,\n  \"memory_type\": \"note\",\n  \"summary\": \"Routine execution of pwd command returning /tmp/project.\",\n  \"importance\": 0.1,\n  \"scope\": \"conversation\",\n  \"metadata\": {}\n}"
+      },
+      {
+        "role": "recall",
+        "case": "select_relevant_memory",
+        "score": 1.0,
+        "note": "ok",
+        "elapsed_seconds": 15.038,
+        "completion_tokens": 66,
+        "tokens_per_second": 4.39,
+        "content_preview": "{\n  \"relevant_ids\": [\n    \"m1\"\n  ],\n  \"reasoning\": \"Memory m1 directly addresses the user's preference regarding the execution of sudo commands, which is the core of the query. Memories m2 and m3 are unrelated to sudo or command execution preferences.\"\n}"
+      },
+      {
+        "role": "summary",
+        "case": "preserve_decisions",
+        "score": 1.0,
+        "note": "ok",
+        "elapsed_seconds": 9.991,
+        "completion_tokens": 44,
+        "tokens_per_second": 4.4,
+        "content_preview": "1. WebChat работает на 8000, llama-server на 8081.\n2. Для внешних путей нужен approval.\n3. allow_forever хранится по normalized action hash."
+      },
+      {
+        "role": "critic",
+        "case": "reflection_quality",
+        "score": 0.8,
+        "note": "missing=['lesson']",
+        "elapsed_seconds": 32.16,
+        "completion_tokens": 220,
+        "tokens_per_second": 6.84,
+        "content_preview": "**Critic Reflection: WebChat API Incident**\n\n**1. Risk Assessment**\n\n*   **Operational Fragility (High):** The incident revealed a critical dependency on manual intervention. If the API crashes or the server reboots, WebChat will silently fail or return errors until an engineer notices and manually restarts the service. This creates a \"single point of failure\" in the operational process.\n*   **Meaning Time to Recovery (MTTR) Variance:** Recovery time is currently dependent on human availability "
+      }
+    ]
+  }
+]
--- a/docs/bench/utility_model_bench_20260522_044407.md
+++ b/docs/bench/utility_model_bench_20260522_044407.md
@@ -0,0 +1,21 @@
+# Utility Role Model Benchmark
+
+Scope: service roles only (`action`, `memory_policy`, `recall`, `summary`, `critic`).
+The main user-facing thinker is not evaluated for replacement here.
+
+| Model | Quality | Avg latency, s | Avg tok/s | Notes |
+| --- | ---: | ---: | ---: | --- |
+| Qwen3.6-35B nonMTP GPU baseline | 0.97 | 17.93 | 4.51 | critic/reflection_quality: missing=['lesson'] |
+
+## Case Details
+
+### Qwen3.6-35B nonMTP GPU baseline
+| Role | Case | Score | Latency, s | tok/s | Note |
+| --- | --- | ---: | ---: | ---: | --- |
+| action | direct_answer_no_tools | 1.00 | 15.32 | 2.94 | ok |
+| action | read_specific_file | 1.00 | 19.64 | 4.12 | ok |
+| memory_policy | store_user_preference | 1.00 | 18.42 | 4.78 | ok |
+| memory_policy | ignore_trivial_tool_call | 1.00 | 14.98 | 4.07 | ok |
+| recall | select_relevant_memory | 1.00 | 15.04 | 4.39 | ok |
+| summary | preserve_decisions | 1.00 | 9.99 | 4.40 | ok |
+| critic | reflection_quality | 0.80 | 32.16 | 6.84 | missing=['lesson'] |
--- a/docs/bench/utility_model_bench_20260522_050427.json
+++ b/docs/bench/utility_model_bench_20260522_050427.json
@@ -0,0 +1,392 @@
+[
+  {
+    "model": "Qwen3.6-35B nonMTP GPU baseline",
+    "quality": 0.971,
+    "avg_latency_seconds": 17.94,
+    "avg_tokens_per_second": 4.51,
+    "cases": [
+      {
+        "role": "action",
+        "case": "direct_answer_no_tools",
+        "score": 1.0,
+        "note": "ok",
+        "elapsed_seconds": 15.307,
+        "completion_tokens": 45,
+        "tokens_per_second": 2.94,
+        "content_preview": "{\n  \"kind\": \"action_directive\",\n  \"intent\": \"direct_answer\",\n  \"risk_level\": \"low\",\n  \"actions\": []\n}\n"
+      },
+      {
+        "role": "action",
+        "case": "read_specific_file",
+        "score": 1.0,
+        "note": "ok",
+        "elapsed_seconds": 19.612,
+        "completion_tokens": 81,
+        "tokens_per_second": 4.13,
+        "content_preview": "{\n  \"kind\": \"action_directive\",\n  \"intent\": \"read_file\",\n  \"risk_level\": \"low\",\n  \"actions\": [\n    {\n      \"tool\": \"file_read\",\n      \"args\": {\n        \"path\": \"CURRENT_STATE.md\"\n      }\n    }\n  ]\n}\n"
+      },
+      {
+        "role": "memory_policy",
+        "case": "store_user_preference",
+        "score": 1.0,
+        "note": "ok",
+        "elapsed_seconds": 18.533,
+        "completion_tokens": 88,
+        "tokens_per_second": 4.75,
+        "content_preview": "{\n  \"should_store\": true,\n  \"memory_type\": \"preference\",\n  \"summary\": \"User prefers responses in Russian and requires explicit confirmation before executing sudo commands.\",\n  \"importance\": 0.9,\n  \"scope\": \"global\",\n  \"metadata\": {\n    \"language\": \"ru\",\n    \"security_policy\": \"sudo_confirmation_required\"\n  }\n}"
+      },
+      {
+        "role": "memory_policy",
+        "case": "ignore_trivial_tool_call",
+        "score": 1.0,
+        "note": "ok",
+        "elapsed_seconds": 15.0,
+        "completion_tokens": 61,
+        "tokens_per_second": 4.07,
+        "content_preview": "{\n  \"should_store\": false,\n  \"memory_type\": \"note\",\n  \"summary\": \"Routine execution of pwd command returning /tmp/project.\",\n  \"importance\": 0.1,\n  \"scope\": \"conversation\",\n  \"metadata\": {}\n}"
+      },
+      {
+        "role": "recall",
+        "case": "select_relevant_memory",
+        "score": 1.0,
+        "note": "ok",
+        "elapsed_seconds": 15.085,
+        "completion_tokens": 66,
+        "tokens_per_second": 4.38,
+        "content_preview": "{\n  \"relevant_ids\": [\n    \"m1\"\n  ],\n  \"reasoning\": \"Memory m1 directly addresses the user's preference regarding the execution of sudo commands, which is the core of the query. Memories m2 and m3 are unrelated to sudo or command execution preferences.\"\n}"
+      },
+      {
+        "role": "summary",
+        "case": "preserve_decisions",
+        "score": 1.0,
+        "note": "ok",
+        "elapsed_seconds": 9.954,
+        "completion_tokens": 44,
+        "tokens_per_second": 4.42,
+        "content_preview": "1. WebChat работает на 8000, llama-server на 8081.\n2. Для внешних путей нужен approval.\n3. allow_forever хранится по normalized action hash."
+      },
+      {
+        "role": "critic",
+        "case": "reflection_quality",
+        "score": 0.8,
+        "note": "missing=['lesson']",
+        "elapsed_seconds": 32.088,
+        "completion_tokens": 220,
+        "tokens_per_second": 6.86,
+        "content_preview": "**Critic Reflection: WebChat API Incident**\n\n**1. Risk Assessment**\n\n*   **Operational Fragility (High):** The incident revealed a critical dependency on manual intervention. If the API crashes or the server reboots, WebChat will silently fail or return errors until an engineer notices and manually restarts the service. This creates a \"single point of failure\" in the operational process.\n*   **Meaning Time to Recovery (MTTR) Variance:** Recovery time is currently dependent on human availability "
+      }
+    ]
+  },
+  {
+    "model": "Menlo_Lucy-Q4_K_M CPU",
+    "quality": 0.771,
+    "avg_latency_seconds": 4.406,
+    "avg_tokens_per_second": 16.21,
+    "cases": [
+      {
+        "role": "action",
+        "case": "direct_answer_no_tools",
+        "score": 1.0,
+        "note": "ok",
+        "elapsed_seconds": 3.228,
+        "completion_tokens": 31,
+        "tokens_per_second": 9.6,
+        "content_preview": "{\"kind\": \"action_directive\", \"intent\": \"answer_question\", \"risk_level\": \"low\", \"actions\": []}"
+      },
+      {
+        "role": "action",
+        "case": "read_specific_file",
+        "score": 1.0,
+        "note": "ok",
+        "elapsed_seconds": 3.031,
+        "completion_tokens": 48,
+        "tokens_per_second": 15.84,
+        "content_preview": "{\"kind\": \"action_directive\", \"intent\": \"file_read\", \"risk_level\": \"low\", \"actions\": [{\"tool\": \"file_read\", \"args\": {\"path\": \"CURRENT_STATE.md\"}}]}\n\n"
+      },
+      {
+        "role": "memory_policy",
+        "case": "store_user_preference",
+        "score": 1.0,
+        "note": "ok",
+        "elapsed_seconds": 3.62,
+        "completion_tokens": 54,
+        "tokens_per_second": 14.92,
+        "content_preview": "{\n  \"should_store\": true,\n  \"memory_type\": \"preference\",\n  \"summary\": \"User prefers Russian and requires sudo confirmation.\",\n  \"importance\": 0.7,\n  \"scope\": \"global\",\n  \"metadata\": {}\n}"
+      },
+      {
+        "role": "memory_policy",
+        "case": "ignore_trivial_tool_call",
+        "score": 0.3,
+        "note": "stored_trivial={'should_store': True, 'memory_type': 'fact', 'summary': 'Password was successfully launched and user was informed.', 'importance': 0.7, 'scope': 'global', 'metadata': {}}",
+        "elapsed_seconds": 3.192,
+        "completion_tokens": 58,
+        "tokens_per_second": 18.17,
+        "content_preview": "{\n  \"should_store\": true,\n  \"memory_type\": \"fact\",\n  \"summary\": \"Password was successfully launched and user was informed.\",\n  \"importance\": 0.7,\n  \"scope\": \"global\",\n  \"metadata\": {}\n}\n"
+      },
+      {
+        "role": "recall",
+        "case": "select_relevant_memory",
+        "score": 0.3,
+        "note": "wrong_ids=[]",
+        "elapsed_seconds": 3.737,
+        "completion_tokens": 60,
+        "tokens_per_second": 16.05,
+        "content_preview": "{\n  \"relevant_ids\": [],\n  \"reasoning\": \"The query is about how the user wants to run sudo, but none of the provided memories are related to sudo or user preferences for running commands. The memories are about SQLite and weather answers, which are unrelated to the query.\"\n}"
+      },
+      {
+        "role": "summary",
+        "case": "preserve_decisions",
+        "score": 0.8,
+        "note": "missing=['approval']",
+        "elapsed_seconds": 3.334,
+        "completion_tokens": 61,
+        "tokens_per_second": 18.29,
+        "content_preview": "1. WebChat работает на портах 8000, а LLaMA-Server на 8081.  \n2. Для внешних путей требуется подтверждение.  \n3. allow_forever хранится по hash-коду действий."
+      },
+      {
+        "role": "critic",
+        "case": "reflection_quality",
+        "score": 1.0,
+        "note": "ok",
+        "elapsed_seconds": 10.697,
+        "completion_tokens": 220,
+        "tokens_per_second": 20.57,
+        "content_preview": "Okay, the user wants me to reflect on the risk and reusable lessons from fixing the WebChat issue. Let me break this down.\n\nFirst, the root cause was the API not running. They manually started the API but didn't have a unified service script. So the main risk here is that without a unified script, there's a chance the API might not be running consistently or could be misconfigured.\n\nReusability is key here. Maybe we can create a script that starts the API and monitors its status. That way, if th"
+      }
+    ]
+  },
+  {
+    "model": "Qwen3.5-9B-GLM5.1-Distill-v1-Q4_K_M CPU",
+    "quality": 0.4,
+    "avg_latency_seconds": 61.939,
+    "avg_tokens_per_second": 2.56,
+    "cases": [
+      {
+        "role": "action",
+        "case": "direct_answer_no_tools",
+        "score": 1.0,
+        "note": "ok",
+        "elapsed_seconds": 68.075,
+        "completion_tokens": 72,
+        "tokens_per_second": 1.06,
+        "content_preview": "{\"kind\": \"action_directive\", \"intent\": \"answer\", \"risk_level\": \"high\", \"actions\": []}"
+      },
+      {
+        "role": "action",
+        "case": "read_specific_file",
+        "score": 1.0,
+        "note": "ok",
+        "elapsed_seconds": 72.149,
+        "completion_tokens": 86,
+        "tokens_per_second": 1.19,
+        "content_preview": "{\"kind\": \"action_directive\", \"intent\": \"read file\", \"risk_level\": \"low\", \"actions\": [{\"tool\": \"file_read\", \"args\": {\"path\": \"CURRENT_STATE.md\"}}]}\n"
+      },
+      {
+        "role": "memory_policy",
+        "case": "store_user_preference",
+        "score": 0.0,
+        "note": "invalid_json: Expecting value: line 1 column 1 (char 0)",
+        "elapsed_seconds": 67.764,
+        "completion_tokens": 180,
+        "tokens_per_second": 2.66,
+        "content_preview": ""
+      },
+      {
+        "role": "memory_policy",
+        "case": "ignore_trivial_tool_call",
+        "score": 0.0,
+        "note": "invalid_json: Expecting value: line 1 column 1 (char 0)",
+        "elapsed_seconds": 64.653,
+        "completion_tokens": 160,
+        "tokens_per_second": 2.47,
+        "content_preview": ""
+      },
+      {
+        "role": "recall",
+        "case": "select_relevant_memory",
+        "score": 0.0,
+        "note": "invalid_json: Expecting property name enclosed in double quotes: line 1 column 2 (char 1)",
+        "elapsed_seconds": 59.451,
+        "completion_tokens": 160,
+        "tokens_per_second": 2.69,
+        "content_preview": "{\n"
+      },
+      {
+        "role": "summary",
+        "case": "preserve_decisions",
+        "score": 0.2,
+        "note": "missing=['8000', '8081', 'approval', 'allow_forever']",
+        "elapsed_seconds": 47.05,
+        "completion_tokens": 180,
+        "tokens_per_second": 3.83,
+        "content_preview": ""
+      },
+      {
+        "role": "critic",
+        "case": "reflection_quality",
+        "score": 0.6,
+        "note": "missing=['risk', 'lesson']",
+        "elapsed_seconds": 54.433,
+        "completion_tokens": 220,
+        "tokens_per_second": 4.04,
+        "content_preview": ""
+      }
+    ]
+  },
+  {
+    "model": "X-Coder-SFT-Qwen3-8B.Q6_K CPU",
+    "quality": 0.757,
+    "avg_latency_seconds": 60.117,
+    "avg_tokens_per_second": 2.51,
+    "cases": [
+      {
+        "role": "action",
+        "case": "direct_answer_no_tools",
+        "score": 0.0,
+        "note": "invalid_json: Expecting ',' delimiter: line 13 column 6 (char 632)",
+        "elapsed_seconds": 121.049,
+        "completion_tokens": 180,
+        "tokens_per_second": 1.49,
+        "content_preview": "{\n  \"kind\": \"action_directive\",\n  \"intent\": \"answer\",\n  \"risk_level\": \"low\",\n  \"actions\": [\n    {\n      \"tool\": \"coder\",\n      \"args\": {\n        \"task_description\": \"Объясните коротко, что такое DuckLM\",\n        \"language\": \"python\",\n        \"context\": \"Пользователь запросил краткое описание DuckLM. DuckLM - это проект, который развивается сообществом, направленный на создание открытой и этичной языковой модели. Мы можем сказать, что это модель на основе Llama, созданная Meta, но адаптированная "
+      },
+      {
+        "role": "action",
+        "case": "read_specific_file",
+        "score": 1.0,
+        "note": "ok",
+        "elapsed_seconds": 37.562,
+        "completion_tokens": 134,
+        "tokens_per_second": 3.57,
+        "content_preview": "{\n  \"kind\": \"action_directive\",\n  \"intent\": \"read_current_state\",\n  \"risk_level\": \"low\",\n  \"actions\": [\n    {\n      \"tool\": \"file_read\",\n      \"args\": {\n        \"path\": \"CURRENT_STATE.md\"\n      },\n      \"reason\": \"We need to read the content of CURRENT_STATE.md to provide a summary of the project status.\"\n    }\n  ],\n  \"expected_observations\": [\n    \"file_content\"\n  ],\n  \"stop_reason\": \"After reading the file, we can directly generate the response without further tools.\"\n}\n"
+      },
+      {
+        "role": "memory_policy",
+        "case": "store_user_preference",
+        "score": 1.0,
+        "note": "ok",
+        "elapsed_seconds": 66.979,
+        "completion_tokens": 80,
+        "tokens_per_second": 1.19,
+        "content_preview": "{\n  \"should_store\": true,\n  \"memory_type\": \"preference\",\n  \"summary\": \"User prefers responses in Russian and explicit sudo confirmation.\",\n  \"importance\": 0.9,\n  \"scope\": \"global\",\n  \"metadata\": {\n      \"language\": \"Russian\",\n      \"sudo_policy\": \"explicit_confirmation_required\"\n  }\n}\n"
+      },
+      {
+        "role": "memory_policy",
+        "case": "ignore_trivial_tool_call",
+        "score": 0.3,
+        "note": "stored_trivial={'should_store': True, 'memory_type': 'event', 'summary': 'User executed pwd command and received /tmp/project as output.', 'importance': 0.8, 'scope': 'conversation', 'metadata': {}}",
+        "elapsed_seconds": 21.772,
+        "completion_tokens": 62,
+        "tokens_per_second": 2.85,
+        "content_preview": "{\n  \"should_store\": true,\n  \"memory_type\": \"event\",\n  \"summary\": \"User executed pwd command and received /tmp/project as output.\",\n  \"importance\": 0.8,\n  \"scope\": \"conversation\",\n  \"metadata\": {}\n}\n"
+      },
+      {
+        "role": "recall",
+        "case": "select_relevant_memory",
+        "score": 1.0,
+        "note": "ok",
+        "elapsed_seconds": 58.656,
+        "completion_tokens": 90,
+        "tokens_per_second": 1.53,
+        "content_preview": "{\n  \"relevant_ids\": [\"m1\"],\n  \"reasoning\": \"The query is about how the user wants to run 'sudo'. The only memory that directly relates to this is memory m1, which states that the user does not want to run sudo without explicit confirmation. The other memories (m2 and m3) are about unrelated topics (database and weather). Therefore, only m1 is relevant.\"\n}\n"
+      },
+      {
+        "role": "summary",
+        "case": "preserve_decisions",
+        "score": 1.0,
+        "note": "ok",
+        "elapsed_seconds": 53.245,
+        "completion_tokens": 180,
+        "tokens_per_second": 3.38,
+        "content_preview": "We are going to compress the task context into 3 key points while preserving the decisions and outcomes.\n\nKey Points:\n1. **Service Endpoints**: \n   - WebChat is running on port 8000.\n   - llama-server is running on port 8081.\n\n2. **External Paths Approval**:\n   - Any external path that requires access must be approved. This implies that we need to have a mechanism (like a firewall or access control list) to check and approve external requests.\n\n3. **Forever Allowance Storage**:\n   - The `allow_f"
+      },
+      {
+        "role": "critic",
+        "case": "reflection_quality",
+        "score": 1.0,
+        "note": "ok",
+        "elapsed_seconds": 61.555,
+        "completion_tokens": 220,
+        "tokens_per_second": 3.57,
+        "content_preview": "We are reflecting on the task: fixing the WebChat which had an API that wasn't running.\n The root cause was that the API was not running. We started it manually but there was no unified service script.\n\n Reflection:\n - Risk: The manual start might be error-prone and not scalable. If multiple components, hard to manage.\n - Waste: We ended up having to start manually, which is not efficient and might lead to repeated issues.\n - Reusable Lesson: We should create a unified service script that starts"
+      }
+    ]
+  },
+  {
+    "model": "gemma-4-E4B-it-Q4_K_M CPU",
+    "quality": 0.971,
+    "avg_latency_seconds": 21.235,
+    "avg_tokens_per_second": 5.36,
+    "cases": [
+      {
+        "role": "action",
+        "case": "direct_answer_no_tools",
+        "score": 1.0,
+        "note": "ok",
+        "elapsed_seconds": 35.717,
+        "completion_tokens": 53,
+        "tokens_per_second": 1.48,
+        "content_preview": "{\n  \"kind\": \"action_directive\",\n  \"intent\": \"answer_directly\"\n  ,\n  \"risk_level\": \"low\"\n  ,\n  \"actions\": []\n}\n"
+      },
+      {
+        "role": "action",
+        "case": "read_specific_file",
+        "score": 1.0,
+        "note": "ok",
+        "elapsed_seconds": 13.325,
+        "completion_tokens": 88,
+        "tokens_per_second": 6.6,
+        "content_preview": "{\n  \"kind\": \"action_directive\",\n  \"intent\": \"tool_use\",\n  \"risk_level\": \"low\",\n  \"actions\": [\n    {\n      \"tool\": \"file_read\",\n      \"args\": {\n        \"path\": \"CURRENT_STATE.md\"\n      }\n    }\n  ]\n}\n"
+      },
+      {
+        "role": "memory_policy",
+        "case": "store_user_preference",
+        "score": 1.0,
+        "note": "ok",
+        "elapsed_seconds": 27.13,
+        "completion_tokens": 98,
+        "tokens_per_second": 3.61,
+        "content_preview": "{\n  \"should_store\": true,\n  \"memory_type\": \"preference\",\n  \"summary\": \"Пользователь требует, чтобы ответы всегда были на русском языке и чтобы не запускать sudo без явного подтверждения.\",\n  \"importance\": 0.95,\n  \"scope\": \"global\",\n  \"metadata\": {\n    \"task_id\": \"task_pref\"\n  }\n}\n"
+      },
+      {
+        "role": "memory_policy",
+        "case": "ignore_trivial_tool_call",
+        "score": 1.0,
+        "note": "ok",
+        "elapsed_seconds": 10.229,
+        "completion_tokens": 90,
+        "tokens_per_second": 8.8,
+        "content_preview": "{\n  \"should_store\": false,\n  \"memory_type\": \"note\",\n  \"summary\": \"The 'pwd' command was executed, and the output was sent to the user.\",\n  \"importance\": 0.1,\n  \"scope\": \"conversation\",\n  \"metadata\": {\n    \"task_id\": \"task_tmp\"\n  }\n}\n"
+      },
+      {
+        "role": "recall",
+        "case": "select_relevant_memory",
+        "score": 1.0,
+        "note": "ok",
+        "elapsed_seconds": 19.391,
+        "completion_tokens": 62,
+        "tokens_per_second": 3.2,
+        "content_preview": "{\n  \"relevant_ids\": [\n    \"m1\"\n  ],\n  \"reasoning\": \"Воспоминание m1 напрямую отвечает на вопрос о том, как пользователь хочет, чтобы запускался sudo (с требованием отдельного подтверждения).\"\n}\n"
+      },
+      {
+        "role": "summary",
+        "case": "preserve_decisions",
+        "score": 1.0,
+        "note": "ok",
+        "elapsed_seconds": 14.371,
+        "completion_tokens": 88,
+        "tokens_per_second": 6.12,
+        "content_preview": "Вот сжатый контекст в 3 пунктах:\n\n1. **Порты:** WebChat использует порт 8000, llama-server — 8081.\n2. **Безопасность:** Для внешних путей требуется предварительное одобрение (approval).\n3. **Хранение разрешений:** `allow_forever` сохраняется на основе хеша нормализованного действия."
+      },
+      {
+        "role": "critic",
+        "case": "reflection_quality",
+        "score": 0.8,
+        "note": "missing=['lesson']",
+        "elapsed_seconds": 28.484,
+        "completion_tokens": 220,
+        "tokens_per_second": 7.72,
+        "content_preview": "## DuckLM: Critic Reflection\n\n**Task:** Fix WebChat.\n**Observed Root Cause:** API was not running.\n**Action Taken:** Started API manually.\n**Observed Deficiency:** Lack of a unified service script.\n\n---\n\n### 🔍 Reflection Analysis\n\n#### 1. Risk Assessment (What went wrong/could go wrong?)\n\n*   **Operational Risk (High):** The immediate risk was service unavailability (WebChat down). Manually starting the API is a brittle, high-touch workaround. If the system restarts, or if the API needs to be re"
+      }
+    ]
+  }
+]
--- a/docs/bench/utility_model_bench_20260522_050427.md
+++ b/docs/bench/utility_model_bench_20260522_050427.md
@@ -0,0 +1,69 @@
+# Utility Role Model Benchmark
+
+Scope: service roles only (`action`, `memory_policy`, `recall`, `summary`, `critic`).
+The main user-facing thinker is not evaluated for replacement here.
+
+| Model | Quality | Avg latency, s | Avg tok/s | Notes |
+| --- | ---: | ---: | ---: | --- |
+| Qwen3.6-35B nonMTP GPU baseline | 0.97 | 17.94 | 4.51 | critic/reflection_quality: missing=['lesson'] |
+| Menlo_Lucy-Q4_K_M CPU | 0.77 | 4.41 | 16.21 | memory_policy/ignore_trivial_tool_call: stored_trivial={'should_store': True, 'memory_type': 'fact', 'summary': 'Password was successfully launched and user was informed.', 'importance': 0.7, 'scope': 'global', 'metadata': {}}; recall/select_relevant_memory: wrong_ids=[]; summary/preserve_decisions: missing=['approval'] |
+| Qwen3.5-9B-GLM5.1-Distill-v1-Q4_K_M CPU | 0.40 | 61.94 | 2.56 | memory_policy/store_user_preference: invalid_json: Expecting value: line 1 column 1 (char 0); memory_policy/ignore_trivial_tool_call: invalid_json: Expecting value: line 1 column 1 (char 0); recall/select_relevant_memory: invalid_json: Expecting property name enclosed in double quotes: line 1 column 2 (char 1) |
+| X-Coder-SFT-Qwen3-8B.Q6_K CPU | 0.76 | 60.12 | 2.51 | action/direct_answer_no_tools: invalid_json: Expecting ',' delimiter: line 13 column 6 (char 632); memory_policy/ignore_trivial_tool_call: stored_trivial={'should_store': True, 'memory_type': 'event', 'summary': 'User executed pwd command and received /tmp/project as output.', 'importance': 0.8, 'scope': 'conversation', 'metadata': {}} |
+| gemma-4-E4B-it-Q4_K_M CPU | 0.97 | 21.23 | 5.36 | critic/reflection_quality: missing=['lesson'] |
+
+## Case Details
+
+### Qwen3.6-35B nonMTP GPU baseline
+| Role | Case | Score | Latency, s | tok/s | Note |
+| --- | --- | ---: | ---: | ---: | --- |
+| action | direct_answer_no_tools | 1.00 | 15.31 | 2.94 | ok |
+| action | read_specific_file | 1.00 | 19.61 | 4.13 | ok |
+| memory_policy | store_user_preference | 1.00 | 18.53 | 4.75 | ok |
+| memory_policy | ignore_trivial_tool_call | 1.00 | 15.00 | 4.07 | ok |
+| recall | select_relevant_memory | 1.00 | 15.09 | 4.38 | ok |
+| summary | preserve_decisions | 1.00 | 9.95 | 4.42 | ok |
+| critic | reflection_quality | 0.80 | 32.09 | 6.86 | missing=['lesson'] |
+
+### Menlo_Lucy-Q4_K_M CPU
+| Role | Case | Score | Latency, s | tok/s | Note |
+| --- | --- | ---: | ---: | ---: | --- |
+| action | direct_answer_no_tools | 1.00 | 3.23 | 9.60 | ok |
+| action | read_specific_file | 1.00 | 3.03 | 15.84 | ok |
+| memory_policy | store_user_preference | 1.00 | 3.62 | 14.92 | ok |
+| memory_policy | ignore_trivial_tool_call | 0.30 | 3.19 | 18.17 | stored_trivial={'should_store': True, 'memory_type': 'fact', 'summary': 'Password was successfully launched and user was informed.', 'importance': 0.7, 'scope': 'global', 'metadata': {}} |
+| recall | select_relevant_memory | 0.30 | 3.74 | 16.05 | wrong_ids=[] |
+| summary | preserve_decisions | 0.80 | 3.33 | 18.29 | missing=['approval'] |
+| critic | reflection_quality | 1.00 | 10.70 | 20.57 | ok |
+
+### Qwen3.5-9B-GLM5.1-Distill-v1-Q4_K_M CPU
+| Role | Case | Score | Latency, s | tok/s | Note |
+| --- | --- | ---: | ---: | ---: | --- |
+| action | direct_answer_no_tools | 1.00 | 68.08 | 1.06 | ok |
+| action | read_specific_file | 1.00 | 72.15 | 1.19 | ok |
+| memory_policy | store_user_preference | 0.00 | 67.76 | 2.66 | invalid_json: Expecting value: line 1 column 1 (char 0) |
+| memory_policy | ignore_trivial_tool_call | 0.00 | 64.65 | 2.47 | invalid_json: Expecting value: line 1 column 1 (char 0) |
+| recall | select_relevant_memory | 0.00 | 59.45 | 2.69 | invalid_json: Expecting property name enclosed in double quotes: line 1 column 2 (char 1) |
+| summary | preserve_decisions | 0.20 | 47.05 | 3.83 | missing=['8000', '8081', 'approval', 'allow_forever'] |
+| critic | reflection_quality | 0.60 | 54.43 | 4.04 | missing=['risk', 'lesson'] |
+
+### X-Coder-SFT-Qwen3-8B.Q6_K CPU
+| Role | Case | Score | Latency, s | tok/s | Note |
+| --- | --- | ---: | ---: | ---: | --- |
+| action | direct_answer_no_tools | 0.00 | 121.05 | 1.49 | invalid_json: Expecting ',' delimiter: line 13 column 6 (char 632) |
+| action | read_specific_file | 1.00 | 37.56 | 3.57 | ok |
+| memory_policy | store_user_preference | 1.00 | 66.98 | 1.19 | ok |
+| memory_policy | ignore_trivial_tool_call | 0.30 | 21.77 | 2.85 | stored_trivial={'should_store': True, 'memory_type': 'event', 'summary': 'User executed pwd command and received /tmp/project as output.', 'importance': 0.8, 'scope': 'conversation', 'metadata': {}} |
+| recall | select_relevant_memory | 1.00 | 58.66 | 1.53 | ok |
+| summary | preserve_decisions | 1.00 | 53.24 | 3.38 | ok |
+| critic | reflection_quality | 1.00 | 61.55 | 3.57 | ok |
+
+### gemma-4-E4B-it-Q4_K_M CPU
+| Role | Case | Score | Latency, s | tok/s | Note |
+| --- | --- | ---: | ---: | ---: | --- |
+| action | direct_answer_no_tools | 1.00 | 35.72 | 1.48 | ok |
+| action | read_specific_file | 1.00 | 13.32 | 6.60 | ok |
+| memory_policy | store_user_preference | 1.00 | 27.13 | 3.61 | ok |
+| memory_policy | ignore_trivial_tool_call | 1.00 | 10.23 | 8.80 | ok |
+| recall | select_relevant_memory | 1.00 | 19.39 | 3.20 | ok |
+| summary | preserve_decisions | 1.00 | 14.37 | 6.12 | ok |
+| critic | reflection_quality | 0.80 | 28.48 | 7.72 | missing=['lesson'] |
--- a/docs/how_to_run.md
+++ b/docs/how_to_run.md
@ -16,34 +16,53 @@ cp .env.example .env

 The default `DUCK_MAIN_MODEL_PATH` points to `./models/Qwen3.6/nonMTP/Qwen3.6-35B-A3B-UD-Q4_K_M.gguf`.

-3. Start `llama-server`:
+3. Start DuckLM:

 ```bash
-bash scripts/llama/start_main.sh start
+bash scripts/duck.sh start
 ```

+This starts both processes:
+
+- `llama-server` on `http://127.0.0.1:8081/v1`
+- DuckLM API/WebChat on `http://127.0.0.1:8000/`
+
 Useful process commands:

 ```bash
-bash scripts/llama/start_main.sh status
-bash scripts/llama/start_main.sh logs --follow
-bash scripts/llama/start_main.sh restart
-bash scripts/llama/start_main.sh stop
+bash scripts/duck.sh status
+bash scripts/duck.sh logs --follow
+bash scripts/duck.sh restart
+bash scripts/duck.sh stop
 ```

-4. Start DuckLM API:
-
-```bash
-python -m duck_core.api
-```
-
-5. Open WebChat:
+4. Open WebChat:

 ```text
 http://127.0.0.1:8000/
 ```

-6. Send a task:
+Low-level llama-only commands are still available when needed:
+
+```bash
+bash scripts/llama/start_main.sh status
+bash scripts/llama/start_main.sh logs --follow
+```
+
+MTP/speculative variant:
+
+```bash
+bash scripts/duck.sh stop
+bash scripts/duck-mtp.sh start
+bash scripts/duck-mtp.sh status
+bash scripts/duck-mtp.sh logs --follow
+```
+
+`duck-mtp.sh` keeps DuckLM on `http://127.0.0.1:8000/` and starts the MTP-backed
+`llama-server` on the normal role endpoint `http://127.0.0.1:8081/v1`, so
+`config/models.yaml` does not need to change.
+
+5. Send a task:

 ```bash
 curl -X POST http://127.0.0.1:8000/v1/chat \
@ -51,21 +70,21 @@ curl -X POST http://127.0.0.1:8000/v1/chat \
  -d '{"message":"Скажи коротко, что ты DuckLM","workspace":"./workspace","debug":true}'
 ```

-7. Inspect events:
+6. Inspect events:

 ```bash
 curl http://127.0.0.1:8000/v1/tasks/<task_id>/events
 ```

-8. Approvals:
+7. Approvals:

 ```bash
 curl http://127.0.0.1:8000/v1/approvals/pending
 ```

-9. Stop services:
+8. Stop services:

 ```bash
-bash scripts/llama/start_main.sh stop
+bash scripts/duck.sh stop
 docker compose -f docker-compose.memory.yml down
 ```
--- a/docs/web_api.md
+++ b/docs/web_api.md
@ -23,3 +23,18 @@ GET  /v1/experience
 GET  /v1/experience/{id}
 GET  /v1/memory/search?q=...
 ```
+
+Chat requests accept optional `reasoning`:
+
+```json
+{
+  "message": "hello",
+  "reasoning": "auto"
+}
+```
+
+Allowed values:
+
+- `auto`: use the `llama-server` default.
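+- `on`: pass `reasoning_format=deepseek` and `chat_template_kwargs.enable_thinking=true` for the thinker response.
+- `off`: pass `reasoning_format=deepseek`, `chat_template_kwargs.enable_thinking=false` and `thinking_budget_tokens=0`.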
--- a/duck_core/api.py
+++ b/duck_core/api.py
@ -3,7 +3,7 @@ import json
 import logging
 import time
 from pathlib import Path
-from typing import Any
+from typing import Any, Literal

 import uvicorn
 from fastapi import FastAPI, HTTPException, Request
@ -60,6 +60,7 @@ class ChatRequest(BaseModel):
    conversation_id: str | None = None
    workspace: str | None = None
    debug: bool = False
+    reasoning: Literal["auto", "on", "off"] = "auto"


 class ConversationRequest(BaseModel):
@ -208,6 +209,7 @@ def create_app() -> FastAPI:
            history_messages=history,
            memory_records=memory_records,
            skill_summary=await selected_skill_summary(body.message),
+            reasoning=body.reasoning,
        )
        await conversations.add_message(
            conversation.conversation_id,
@ -411,7 +413,12 @@ def create_app() -> FastAPI:
                    "Формирую ответ...",
                )
                await event_store.append(task.task_id, "model_call_started", {"role": "thinker"})
-                async for chunk in model_client.stream_chat("thinker", messages):
+                stream = (
+                    model_client.stream_chat("thinker", messages, reasoning=body.reasoning)
+                    if body.reasoning in {"on", "off"}
+                    else model_client.stream_chat("thinker", messages)
+                )
+                async for chunk in stream:
                    delta = str(chunk.get("delta") or "")
                    if chunk.get("type") == "reasoning_delta":
                        generation_stats.record(delta)
--- a/duck_core/model_client.py
+++ b/duck_core/model_client.py
@ -3,13 +3,15 @@ import logging
 import time
 from dataclasses import dataclass
 from pathlib import Path
-from typing import Any
+from typing import Any, Literal

 import httpx
 import yaml

 logger = logging.getLogger(__name__)

+ReasoningMode = Literal["auto", "on", "off"]
+

@dataclass(frozen=True)
 class RoleConfig:
@ -92,6 +94,20 @@ class ModelClient:
            }
        return {"type": "json_object"}

+    def _reasoning_options(self, reasoning: ReasoningMode | None) -> dict[str, Any]:
+        if reasoning == "on":
+            return {
+                "reasoning_format": "deepseek",
+                "chat_template_kwargs": {"enable_thinking": True},
+            }
+        if reasoning == "off":
+            return {
+                "reasoning_format": "deepseek",
+                "chat_template_kwargs": {"enable_thinking": False},
+                "thinking_budget_tokens": 0,
+            }
+        return {}
+
    async def chat(
        self,
        role: str,
@ -99,6 +115,7 @@ class ModelClient:
        temperature: float | None = None,
        max_output_tokens: int | None = None,
        response_format: dict[str, Any] | None = None,
+        reasoning: ReasoningMode | None = None,
    ) -> ModelResponse:
        cfg = self.get_role_config(role)
        outbound = list(messages)
@ -115,6 +132,7 @@ class ModelClient:
        fmt = self._response_format(cfg, response_format)
        if fmt is not None:
            payload["response_format"] = fmt
+        payload.update(self._reasoning_options(reasoning))

        start = time.perf_counter()
        try:
@ -150,6 +168,7 @@ class ModelClient:
        temperature: float | None = None,
        max_output_tokens: int | None = None,
        response_format: dict[str, Any] | None = None,
+        reasoning: ReasoningMode | None = None,
    ):
        cfg = self.get_role_config(role)
        outbound = list(messages)
@ -167,6 +186,7 @@ class ModelClient:
        fmt = self._response_format(cfg, response_format)
        if fmt is not None:
            payload["response_format"] = fmt
+        payload.update(self._reasoning_options(reasoning))

        try:
            async with httpx.AsyncClient(timeout=self.timeout, trust_env=False) as client:
--- a/duck_core/runtime_loop.py
+++ b/duck_core/runtime_loop.py
@ -10,7 +10,7 @@ from duck_core.experience.recorder import ExperienceRecorder
 from duck_core.memory.policy import MemoryPolicy
 from duck_core.memory.store import MemoryStore
 from duck_core.memory.vector_memory import VectorMemory
-from duck_core.model_client import ModelClient
+from duck_core.model_client import ModelClient, ReasoningMode
 from duck_core.reflection import Reflection
 from duck_core.tasks.store import TaskStore
 from duck_core.tools.base import ToolResult
@ -63,6 +63,7 @@ class RuntimeLoop:
        memory_records: list[dict[str, str]] | None = None,
        skill_summary: str | None = None,
        reflect: bool = True,
+        reasoning: ReasoningMode | None = None,
    ) -> ChatResult:
        task = await self.task_store.create_task(message, workspace, debug)
        await self.event_store.append(
@ -100,6 +101,9 @@ class RuntimeLoop:
            await self.event_store.append(
                task.task_id, "model_call_started", {"role": "thinker"}
            )
+            if reasoning in {"on", "off"}:
+                response = await self.model_client.chat("thinker", messages, reasoning=reasoning)
+            else:
                response = await self.model_client.chat("thinker", messages)
            await self.event_store.append(
                task.task_id,
--- a/duck_core/web/static/app.js
+++ b/duck_core/web/static/app.js
@ -16,6 +16,100 @@ function escapeText(value) {
  return String(value ?? "");
 }

+function escapeHtml(value) {
+  return String(value ?? "")
+    .replaceAll("&", "&amp;")
+    .replaceAll("<", "&lt;")
+    .replaceAll(">", "&gt;")
+    .replaceAll('"', "&quot;")
+    .replaceAll("'", "&#39;");
+}
+
+function renderInlineMarkdown(text) {
+  let html = escapeHtml(text);
+  html = html.replace(/`([^`]+)`/g, "<code>$1</code>");
+  html = html.replace(/\*\*([^*]+)\*\*/g, "<strong>$1</strong>");
+  html = html.replace(/\*([^*]+)\*/g, "<em>$1</em>");
+  html = html.replace(
+    /(https?:\/\/[^\s<]+)/g,
+    '<a href="$1" target="_blank" rel="noreferrer">$1</a>',
+  );
+  return html;
+}
+
+// Renders a small markdown subset to an HTML string: fenced code blocks,
+// headings (#..####), unordered lists (- or *), and paragraphs. Inline markup is
+// delegated to renderInlineMarkdown; fenced code is only HTML-escaped.
+function renderMarkdown(markdown) {
+  const blocks = [];
+  const lines = String(markdown ?? "").replace(/\r\n/g, "\n").split("\n");
+  // Line accumulators for the block currently being built.
+  let paragraph = [];
+  let listItems = [];
+  let inFence = false;
+  let fenceLines = [];
+
+  // Emit the pending paragraph (its lines joined with spaces), if any.
+  const flushParagraph = () => {
+    if (!paragraph.length) return;
+    blocks.push(`<p>${renderInlineMarkdown(paragraph.join(" "))}</p>`);
+    paragraph = [];
+  };
+  // Emit the pending list as a single <ul>, if any.
+  const flushList = () => {
+    if (!listItems.length) return;
+    blocks.push(`<ul>${listItems.map((item) => `<li>${renderInlineMarkdown(item)}</li>`).join("")}</ul>`);
+    listItems = [];
+  };
+
+  for (const line of lines) {
+    // A line starting with ``` toggles fence mode; the fence line itself is dropped.
+    if (line.startsWith("```")) {
+      if (inFence) {
+        blocks.push(`<pre><code>${escapeHtml(fenceLines.join("\n"))}</code></pre>`);
+        fenceLines = [];
+        inFence = false;
+      } else {
+        flushParagraph();
+        flushList();
+        inFence = true;
+      }
+      continue;
+    }
+    if (inFence) {
+      fenceLines.push(line);
+      continue;
+    }
+
+    const heading = /^(#{1,4})\s+(.+)$/.exec(line);
+    if (heading) {
+      flushParagraph();
+      flushList();
+      // Headings are shifted down two levels: "#" becomes <h3>, "####" becomes <h6>.
+      const level = heading[1].length + 2;
+      blocks.push(`<h${level}>${renderInlineMarkdown(heading[2])}</h${level}>`);
+      continue;
+    }
+
+    const listItem = /^\s*[-*]\s+(.+)$/.exec(line);
+    if (listItem) {
+      flushParagraph();
+      listItems.push(listItem[1]);
+      continue;
+    }
+
+    // A blank line ends whatever paragraph or list is open.
+    if (!line.trim()) {
+      flushParagraph();
+      flushList();
+      continue;
+    }
+    flushList();
+    paragraph.push(line.trim());
+  }
+  // An unterminated fence is still rendered as a code block.
+  if (inFence) blocks.push(`<pre><code>${escapeHtml(fenceLines.join("\n"))}</code></pre>`);
+  flushParagraph();
+  flushList();
+  return blocks.join("");
+}
+
+function setMarkdownContent(node, content) {
+  if (!node) return;
+  node.dataset.markdown = String(content ?? "");
+  node.innerHTML = renderMarkdown(node.dataset.markdown);
+}
+
 function setStatus(id, text, tone = "neutral") {
  const node = document.querySelector(id);
  if (!node) return;
@ -67,7 +161,12 @@ function addMessage(role, content, meta = "", options = {}) {
  messageMeta.innerHTML = `<strong>${role === "user" ? "You" : "DuckLM"}</strong><span>${escapeText(meta)}</span>`;

  const text = document.createElement("p");
+  text.className = role === "assistant" ? "message-body markdown-body" : "message-body";
+  if (role === "assistant") {
+    setMarkdownContent(text, content);
+  } else {
    text.textContent = content;
+  }

  bubble.append(messageMeta);
  if (role === "assistant" && options.reasoning) {
@ -204,11 +303,10 @@ function updateToolTerminal(article, eventPayload) {
  terminal.classList.remove("is-waiting");
  status.textContent = result.ok ? "ok" : "error";

-  const title = terminal.querySelector(".tool-terminal-title")?.textContent || body.textContent.trim();
-  const parts = [title];
-  if (result.output) parts.push("\nstdout\n" + result.output.trimEnd());
-  if (result.error) parts.push("\nstderr\n" + result.error.trimEnd());
-  body.textContent = parts.join("\n");
+  const parts = [];
+  if (result.output) parts.push("stdout\n" + result.output.trimEnd());
+  if (result.error) parts.push("stderr\n" + result.error.trimEnd());
+  body.textContent = parts.join("\n\n") || "completed with no output";
  document.querySelector("#messages").scrollTop = document.querySelector("#messages").scrollHeight;
 }

@ -359,13 +457,22 @@ function humanApprovalDecision(action) {

 function setMessagePending(article, text) {
  const paragraph = article?.querySelector("p");
-  if (paragraph) paragraph.textContent = text;
+  if (!paragraph) return;
+  if (paragraph.classList.contains("markdown-body")) {
+    setMarkdownContent(paragraph, text);
+  } else {
+    paragraph.textContent = text;
+  }
 }

 function appendMessageText(article, delta) {
  const paragraph = article?.querySelector("p");
  if (!paragraph) return;
+  if (paragraph.classList.contains("markdown-body")) {
+    setMarkdownContent(paragraph, `${paragraph.dataset.markdown || ""}${delta}`);
+  } else {
    paragraph.textContent += delta;
+  }
  document.querySelector("#messages").scrollTop = document.querySelector("#messages").scrollHeight;
 }

@ -758,7 +865,8 @@ async function sendMessage() {
  setStatus("#task-status", "running", "warn");
  addMessage("user", message, "submitted");
  input.value = "";
-  const pending = addMessage("assistant", "", "thinking", {reasoning: true});
+  const reasoningEnabled = document.querySelector("#reasoning")?.checked || false;
+  const pending = addMessage("assistant", "", "thinking", {reasoning: reasoningEnabled});
  const context = {taskId: "", contentStarted: false};

  try {
@ -767,6 +875,7 @@ async function sendMessage() {
      conversation_id: state.currentConversationId || null,
      workspace: document.querySelector("#workspace").value,
      debug: document.querySelector("#debug").checked,
+      reasoning: reasoningEnabled ? "on" : "off",
    }, async ({name, data}) => {
      await handleAssistantStreamEvent(pending, name, data, context);
    });
--- a/duck_core/web/static/style.css
+++ b/duck_core/web/static/style.css
@ -500,6 +500,60 @@ dd {
  line-height: 1.5;
 }

+.bubble .markdown-body {
+  white-space: normal;
+}
+
+.markdown-body h3,
+.markdown-body h4,
+.markdown-body h5,
+.markdown-body p,
+.markdown-body ul,
+.markdown-body pre {
+  margin: 8px 0 0;
+}
+
+.markdown-body h3 {
+  font-size: 17px;
+}
+
+.markdown-body h4,
+.markdown-body h5 {
+  font-size: 15px;
+}
+
+.markdown-body ul {
+  padding-left: 22px;
+}
+
+.markdown-body code {
+  padding: 1px 4px;
+  border-radius: 5px;
+  background: #e2e8f0;
+  font-family: ui-monospace, SFMono-Regular, Menlo, Consolas, monospace;
+  font-size: 0.92em;
+}
+
+.markdown-body pre {
+  max-width: 100%;
+  overflow: auto;
+  padding: 10px 12px;
+  border-radius: 8px;
+  background: #0f172a;
+  color: #d1fae5;
+  white-space: pre;
+}
+
+.markdown-body pre code {
+  padding: 0;
+  background: transparent;
+  color: inherit;
+}
+
+.markdown-body a {
+  color: var(--accent);
+}
+
 .message-reasoning {
  display: grid;
  gap: 8px;
--- a/duck_core/web/templates/index.html
+++ b/duck_core/web/templates/index.html
@ -46,6 +46,10 @@
            <input id="debug" type="checkbox" checked>
            <span>Debug mode</span>
          </label>
+          <label class="toggle-row">
+            <input id="reasoning" type="checkbox">
+            <span>Reasoning</span>
+          </label>
        </section>

        <section class="status-panel" aria-labelledby="status-title">
--- a/scripts/bench/bench_runtime.py
+++ b/scripts/bench/bench_runtime.py
@ -1,33 +1,434 @@
+import argparse
 import asyncio
+import json
+import os
+import signal
+import subprocess
 import time
+from dataclasses import dataclass, field
+from pathlib import Path
+from statistics import mean
+from typing import Any

-from duck_core.model_client import ModelClient
+import httpx
+import jsonschema


-TASKS = [
-    "Скажи коротко, что ты DuckLM.",
-    "Создай tmp/duck_test_note.md с текстом hello duck и прочитай его обратно.",
-    "Посмотри структуру проекта и кратко опиши модули.",
-    "Найди TODO/FIXME в проекте.",
-    "Запусти тесты и кратко объясни результат.",
+ROOT = Path(__file__).resolve().parents[2]
+LLAMA_BIN = ROOT / "vendor/llama.cpp/build/bin/llama-server"
+BASELINE_URL = "http://127.0.0.1:8081/v1"
+BENCH_PORT = 18091
+BENCH_URL = f"http://127.0.0.1:{BENCH_PORT}/v1"
+RESULTS_DIR = ROOT / "docs/bench"
+
+CANDIDATES = [
+    ROOT / "models/Menlo_Lucy-Q4_K_M.gguf",
+    ROOT / "models/Qwen3.5-9B-GLM5.1-Distill-v1-Q4_K_M.gguf",
+    ROOT / "models/X-Coder-SFT-Qwen3-8B.Q6_K.gguf",
+    ROOT / "models/gemma-4-E4B-it-Q4_K_M.gguf",
 ]


-async def main() -> None:
-    client = ModelClient()
-    print("role -> base_url/model")
-    for role, cfg in client._roles.items():
-        print(f"{role} -> {cfg.base_url}/{cfg.model}")
+@dataclass
+class BenchCase:
+    """One benchmark prompt for a service role plus what is needed to score it."""
+
+    # Role under test; score_case branches on this value.
+    role: str
+    # Case identifier; score_case also uses it to pick the role-specific check.
+    name: str
+    # Sent as the system message of the chat request.
+    system_prompt: str
+    # Sent as the user message of the chat request.
+    user_prompt: str
+    # Passed as "max_tokens" in the request payload.
+    max_tokens: int
+    # Optional "response_format" payload; when set the reply is parsed as JSON
+    # and validated against the embedded schema.
+    response_format: dict[str, Any] | None = None
+    # Words that must appear (case-insensitively) in the reply.
+    required_keywords: list[str] = field(default_factory=list)
+
+
+def read(path: str) -> str:
+    return (ROOT / path).read_text()
+
+
+# JSON schemas used both as the "response_format" sent to the server and as the
+# validation target in score_case.
+# The action schema is loaded from the project's schema file.
+ACTION_SCHEMA = json.loads((ROOT / "duck_core/schemas/action_directive.schema.json").read_text())
+# Memory-policy decision: all six fields are required, extra fields are allowed.
+MEMORY_SCHEMA = {
+    "type": "object",
+    "required": ["should_store", "memory_type", "summary", "importance", "scope", "metadata"],
+    "additionalProperties": True,
+    "properties": {
+        "should_store": {"type": "boolean"},
+        "memory_type": {"type": "string"},
+        "summary": {"type": "string"},
+        "importance": {"type": "number"},
+        "scope": {"type": "string"},
+        "metadata": {"type": "object"},
+    },
+}
+# Recall decision: a list of memory ids plus a free-text justification.
+RECALL_SCHEMA = {
+    "type": "object",
+    "required": ["relevant_ids", "reasoning"],
+    "additionalProperties": True,
+    "properties": {
+        "relevant_ids": {"type": "array", "items": {"type": "string"}},
+        "reasoning": {"type": "string"},
+    },
+}
+
+
+CASES = [
+    BenchCase(
+        role="action",
+        name="direct_answer_no_tools",
+        system_prompt=read("prompts/roles/action.md"),
+        user_prompt="User request: Скажи коротко, что такое DuckLM.\nWorkspace: /tmp/duck",
+        max_tokens=180,
+        response_format={
+            "type": "json_schema",
+            "json_schema": {"name": "action_directive", "schema": ACTION_SCHEMA, "strict": True},
+        },
+    ),
+    BenchCase(
+        role="action",
+        name="read_specific_file",
+        system_prompt=read("prompts/roles/action.md"),
+        user_prompt="User request: Прочитай файл CURRENT_STATE.md и кратко скажи статус проекта.\nWorkspace: /home/mirivlad/git/ducklm",
+        max_tokens=220,
+        response_format={
+            "type": "json_schema",
+            "json_schema": {"name": "action_directive", "schema": ACTION_SCHEMA, "strict": True},
+        },
+    ),
+    BenchCase(
+        role="memory_policy",
+        name="store_user_preference",
+        system_prompt=read("prompts/roles/memory_policy.md"),
+        user_prompt="Task ID: task_pref\n\nTranscript:\nПользователь сказал: всегда отвечай мне по-русски и не запускай sudo без отдельного подтверждения.",
+        max_tokens=180,
+        response_format={
+            "type": "json_schema",
+            "json_schema": {"name": "memory_decision", "schema": MEMORY_SCHEMA, "strict": True},
+        },
+    ),
+    BenchCase(
+        role="memory_policy",
+        name="ignore_trivial_tool_call",
+        system_prompt=read("prompts/roles/memory_policy.md"),
+        user_prompt="Task ID: task_tmp\n\nTranscript:\nЗапущен pwd, stdout: /tmp/project. Ответ отправлен пользователю.",
+        max_tokens=160,
+        response_format={
+            "type": "json_schema",
+            "json_schema": {"name": "memory_decision", "schema": MEMORY_SCHEMA, "strict": True},
+        },
+    ),
+    BenchCase(
+        role="recall",
+        name="select_relevant_memory",
+        system_prompt=read("prompts/roles/recall.md"),
+        user_prompt=(
+            "Query: Как пользователь хочет, чтобы я запускал sudo?\n\n"
+            "Memories:\n"
+            "- id: m1 | text: Пользователь просит не запускать sudo без отдельного подтверждения.\n"
+            "- id: m2 | text: Проект использует SQLite для событий.\n"
+            "- id: m3 | text: Пользователь любит краткие ответы о погоде.\n"
+        ),
+        max_tokens=160,
+        response_format={
+            "type": "json_schema",
+            "json_schema": {"name": "recall_decision", "schema": RECALL_SCHEMA, "strict": True},
+        },
+    ),
+    BenchCase(
+        role="summary",
+        name="preserve_decisions",
+        system_prompt=read("prompts/roles/summary.md"),
+        user_prompt=(
+            "Сожми контекст до 3 пунктов. Сохрани решения:\n"
+            "1. WebChat работает на 8000, llama-server на 8081.\n"
+            "2. Для внешних путей нужен approval.\n"
+            "3. allow_forever хранится по normalized action hash.\n"
+        ),
+        max_tokens=180,
+        required_keywords=["8000", "8081", "approval", "allow_forever"],
+    ),
+    BenchCase(
+        role="critic",
+        name="reflection_quality",
+        system_prompt=read("prompts/roles/critic.md"),
+        user_prompt=(
+            "Task transcript:\n"
+            "User asked to fix WebChat. Root cause was API not running. "
+            "We started API manually but had no unified service script. Reflect on risk and reusable lesson."
+        ),
+        max_tokens=220,
+        required_keywords=["risk", "lesson"],
+    ),
+]
+
+
+def parse_args() -> argparse.Namespace:
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--baseline-url", default=BASELINE_URL)
+    parser.add_argument("--port", type=int, default=BENCH_PORT)
+    parser.add_argument("--models", nargs="*", default=[str(path) for path in CANDIDATES])
+    parser.add_argument("--threads", type=int, default=max(1, (os.cpu_count() or 8) // 2))
+    parser.add_argument("--ctx-size", type=int, default=4096)
+    parser.add_argument("--timeout", type=float, default=180.0)
+    parser.add_argument("--skip-cpu", action="store_true")
+    return parser.parse_args()
+
+
+def llama_env() -> dict[str, str]:
+    env = os.environ.copy()
+    bin_dir = str(LLAMA_BIN.parent)
+    env["LD_LIBRARY_PATH"] = f"{bin_dir}{':' + env['LD_LIBRARY_PATH'] if env.get('LD_LIBRARY_PATH') else ''}"
+    return env
+
+
+async def wait_ready(base_url: str, timeout: float) -> None:
+    deadline = time.perf_counter() + timeout
+    async with httpx.AsyncClient(timeout=5.0, trust_env=False) as client:
+        last_error = ""
+        while time.perf_counter() < deadline:
+            try:
+                response = await client.get(f"{base_url}/models")
+                if response.status_code == 200:
+                    return
+                last_error = f"HTTP {response.status_code}: {response.text[:120]}"
+            except Exception as exc:
+                last_error = str(exc)
+            await asyncio.sleep(1.0)
+    raise TimeoutError(f"{base_url} not ready: {last_error}")
+
+
+def start_cpu_server(model_path: Path, port: int, threads: int, ctx_size: int) -> subprocess.Popen:
+    log_dir = ROOT / "data/bench"
+    log_dir.mkdir(parents=True, exist_ok=True)
+    log_file = log_dir / f"{model_path.stem}.log"
+    command = [
+        str(LLAMA_BIN),
+        "-m",
+        str(model_path),
+        "--alias",
+        "bench-cpu",
+        "--host",
+        "127.0.0.1",
+        "--port",
+        str(port),
+        "-c",
+        str(ctx_size),
+        "--parallel",
+        "1",
+        "-ngl",
+        "0",
+        "--threads",
+        str(threads),
+        "--threads-batch",
+        str(threads),
+        "--reasoning",
+        "off",
+        "--cache-ram",
+        "0",
+    ]
+    handle = log_file.open("a")
+    handle.write("Command: " + " ".join(command) + "\n")
+    handle.flush()
+    process = subprocess.Popen(
+        command,
+        cwd=ROOT,
+        env=llama_env(),
+        stdout=handle,
+        stderr=subprocess.STDOUT,
+        start_new_session=True,
+    )
+    process._duck_log_handle = handle  # type: ignore[attr-defined]
+    return process
+
+
+def stop_process(process: subprocess.Popen | None) -> None:
+    if process is None:
+        return
+    if process.poll() is None:
+        os.killpg(process.pid, signal.SIGTERM)
+        try:
+            process.wait(timeout=15)
+        except subprocess.TimeoutExpired:
+            os.killpg(process.pid, signal.SIGKILL)
+            process.wait(timeout=10)
+    handle = getattr(process, "_duck_log_handle", None)
+    if handle:
+        handle.close()
+
+
+def safe_json(content: str) -> tuple[dict[str, Any] | None, str | None]:
+    text = content.strip()
+    if text.startswith("```"):
+        text = text.strip("`")
+        text = text.removeprefix("json").strip()
+    try:
+        return json.loads(text), None
+    except json.JSONDecodeError as exc:
+        return None, str(exc)
+
+
+def score_case(case: BenchCase, content: str) -> tuple[float, str]:
+    """Score one model reply for *case*.
+
+    Returns a ``(score, note)`` pair where the score is between 0.0 and 1.0 and
+    the note is ``"ok"`` or a short description of what went wrong.
+    """
+    data = None
+    # Structured cases: the reply must parse as JSON and satisfy the schema
+    # before any role-specific check is applied.
+    if case.response_format:
+        data, error = safe_json(content)
+        if data is None:
+            return 0.0, f"invalid_json: {error}"
+        schema = case.response_format["json_schema"]["schema"]
+        try:
+            jsonschema.validate(data, schema)
+        except jsonschema.ValidationError as exc:
+            return 0.2, f"schema_error: {exc.message}"
+
+    if case.role == "action" and data is not None:
+        actions = data.get("actions") or []
+        if case.name == "direct_answer_no_tools":
+            # A direct answer must not request any tool.
+            return (1.0, "ok") if actions == [] else (0.3, f"unexpected_actions={actions}")
+        if case.name == "read_specific_file":
+            # Only the first action is inspected: it must read CURRENT_STATE.md.
+            if actions and actions[0].get("tool") == "file_read" and actions[0].get("args", {}).get("path") == "CURRENT_STATE.md":
+                return 1.0, "ok"
+            return 0.4, f"wrong_action={actions}"
+
+    if case.role == "memory_policy" and data is not None:
+        if case.name == "store_user_preference":
+            ok = data.get("should_store") is True and data.get("memory_type") == "preference" and data.get("scope") == "global"
+            return (1.0, "ok") if ok else (0.4, f"wrong_memory_decision={data}")
+        if case.name == "ignore_trivial_tool_call":
+            ok = data.get("should_store") is False
+            return (1.0, "ok") if ok else (0.3, f"stored_trivial={data}")
+
+    if case.role == "recall" and data is not None:
+        # Exactly m1 must be selected out of the three offered memories.
+        ids = set(data.get("relevant_ids") or [])
+        if "m1" in ids and "m2" not in ids and "m3" not in ids:
+            return 1.0, "ok"
+        return 0.3, f"wrong_ids={sorted(ids)}"
+
+    # Free-text cases (and any case not handled above): keyword coverage,
+    # losing 0.2 per missing keyword with a floor of 0.2.
+    lowered = content.lower()
+    missing = [word for word in case.required_keywords if word.lower() not in lowered]
+    if missing:
+        return max(0.2, 1.0 - 0.2 * len(missing)), f"missing={missing}"
+    return 1.0, "ok"
+
+
+async def run_case(base_url: str, model: str, case: BenchCase, timeout: float) -> dict[str, Any]:
+    payload: dict[str, Any] = {
+        "model": model,
+        "messages": [
+            {"role": "system", "content": case.system_prompt},
+            {"role": "user", "content": case.user_prompt},
+        ],
+        "temperature": 0.0,
+        "max_tokens": case.max_tokens,
+    }
+    if case.response_format:
+        payload["response_format"] = case.response_format
    started = time.perf_counter()
-    print(f"test_tasks={len(TASKS)}")
-    print("llm_calls=0")
-    print("tool_calls=0")
-    print("json_directive_validity=not_run")
-    print("retry_count=0")
-    print("memory_writes=0")
-    print("experience_record_created=no")
-    print("selected_skill=not_run")
-    print(f"total_runtime_seconds={time.perf_counter() - started:.3f}")
+    async with httpx.AsyncClient(timeout=timeout, trust_env=False) as client:
+        response = await client.post(f"{base_url}/chat/completions", json=payload)
+        elapsed = time.perf_counter() - started
+        response.raise_for_status()
+        raw = response.json()
+    message = raw.get("choices", [{}])[0].get("message", {})
+    content = message.get("content") or ""
+    score, note = score_case(case, content)
+    usage = raw.get("usage") or {}
+    completion_tokens = usage.get("completion_tokens") or max(1, len(content.split()))
+    return {
+        "role": case.role,
+        "case": case.name,
+        "score": score,
+        "note": note,
+        "elapsed_seconds": round(elapsed, 3),
+        "completion_tokens": completion_tokens,
+        "tokens_per_second": round(completion_tokens / max(elapsed, 0.001), 2),
+        "content_preview": content[:500],
+    }
+
+
+async def run_model(label: str, base_url: str, model: str, timeout: float) -> dict[str, Any]:
+    rows = []
+    for case in CASES:
+        try:
+            rows.append(await run_case(base_url, model, case, timeout))
+        except Exception as exc:
+            rows.append({
+                "role": case.role,
+                "case": case.name,
+                "score": 0.0,
+                "note": f"error: {exc}",
+                "elapsed_seconds": timeout,
+                "completion_tokens": 0,
+                "tokens_per_second": 0.0,
+                "content_preview": "",
+            })
+    scores = [row["score"] for row in rows]
+    return {
+        "model": label,
+        "quality": round(mean(scores), 3),
+        "avg_latency_seconds": round(mean(row["elapsed_seconds"] for row in rows), 3),
+        "avg_tokens_per_second": round(mean(row["tokens_per_second"] for row in rows), 2),
+        "cases": rows,
+    }
+
+
+def markdown_report(results: list[dict[str, Any]]) -> str:
+    lines = [
+        "# Utility Role Model Benchmark",
+        "",
+        "Scope: service roles only (`action`, `memory_policy`, `recall`, `summary`, `critic`).",
+        "The main user-facing thinker is not evaluated for replacement here.",
+        "",
+        "| Model | Quality | Avg latency, s | Avg tok/s | Notes |",
+        "| --- | ---: | ---: | ---: | --- |",
+    ]
+    for result in results:
+        failed = [case for case in result["cases"] if case["score"] < 1.0]
+        note = "all checks passed" if not failed else "; ".join(
+            f"{case['role']}/{case['case']}: {case['note']}" for case in failed[:3]
+        )
+        lines.append(
+            f"| {result['model']} | {result['quality']:.2f} | "
+            f"{result['avg_latency_seconds']:.2f} | {result['avg_tokens_per_second']:.2f} | {note} |"
+        )
+    lines.append("")
+    lines.append("## Case Details")
+    for result in results:
+        lines.append(f"\n### {result['model']}")
+        lines.append("| Role | Case | Score | Latency, s | tok/s | Note |")
+        lines.append("| --- | --- | ---: | ---: | ---: | --- |")
+        for case in result["cases"]:
+            lines.append(
+                f"| {case['role']} | {case['case']} | {case['score']:.2f} | "
+                f"{case['elapsed_seconds']:.2f} | {case['tokens_per_second']:.2f} | {case['note']} |"
+            )
+    return "\n".join(lines) + "\n"
+
+
+async def main() -> None:
+    args = parse_args()
+    RESULTS_DIR.mkdir(parents=True, exist_ok=True)
+    results: list[dict[str, Any]] = []
+
+    print("Checking Qwen GPU baseline at", args.baseline_url)
+    await wait_ready(args.baseline_url, args.timeout)
+    results.append(await run_model("Qwen3.6-35B nonMTP GPU baseline", args.baseline_url, "local-main", args.timeout))
+
+    if not args.skip_cpu:
+        for raw_model in args.models:
+            model_path = Path(raw_model).resolve()
+            label = f"{model_path.stem} CPU"
+            print("Starting", label)
+            process = start_cpu_server(model_path, args.port, args.threads, args.ctx_size)
+            try:
+                await wait_ready(f"http://127.0.0.1:{args.port}/v1", args.timeout)
+                results.append(await run_model(label, f"http://127.0.0.1:{args.port}/v1", "bench-cpu", args.timeout))
+            finally:
+                stop_process(process)
+
+    timestamp = time.strftime("%Y%m%d_%H%M%S")
+    json_path = RESULTS_DIR / f"utility_model_bench_{timestamp}.json"
+    md_path = RESULTS_DIR / f"utility_model_bench_{timestamp}.md"
+    json_path.write_text(json.dumps(results, ensure_ascii=False, indent=2))
+    md_path.write_text(markdown_report(results))
+    print(markdown_report(results))
+    print(f"Wrote {json_path}")
+    print(f"Wrote {md_path}")


 if __name__ == "__main__":
--- a/scripts/duck-mtp.sh
+++ b/scripts/duck-mtp.sh
@ -0,0 +1,262 @@
+#!/usr/bin/env bash
+# Service wrapper for the MTP variant of the local DuckLM stack: manages the
+# DuckLM API process itself and delegates llama-server handling to
+# scripts/llama/start_mtp_main.sh.
+set -euo pipefail
+
+# Repository root, resolved from this script's location (one level above scripts/).
+ROOT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)"
+
+# Variables whose caller-provided values must win over anything set in .env.
+ENV_KEYS=(
+  DUCK_LLAMA_SERVER_BIN
+  DUCK_MTP_MODEL_PATH
+  DUCK_MAIN_PORT
+  DUCK_CTX_SIZE
+  DUCK_N_GPU_LAYERS
+  DUCK_LLAMA_DEVICE
+  DUCK_PARALLEL
+  DUCK_LLAMA_PID_FILE
+  DUCK_LLAMA_LOG_FILE
+  DUCK_MTP_FLAGS
+  DUCK_HOST
+  DUCK_API_HOST
+  DUCK_API_PORT
+  DUCK_API_PID_FILE
+  DUCK_API_LOG_FILE
+  DUCK_API_COMMAND
+)
+# Snapshot the values already present in the environment before .env is sourced.
+declare -A ENV_OVERRIDES=()
+for key in "${ENV_KEYS[@]}"; do
+  if [[ -v "${key}" ]]; then
+    ENV_OVERRIDES["${key}"]="${!key}"
+  fi
+done
+
+# Load project defaults from .env; `set -a` exports every variable it assigns.
+if [[ -f "${ROOT_DIR}/.env" ]]; then
+  set -a
+  # shellcheck disable=SC1091
+  source "${ROOT_DIR}/.env"
+  set +a
+fi
+# Re-apply the snapshot so the caller's environment overrides .env.
+for key in "${!ENV_OVERRIDES[@]}"; do
+  export "${key}=${ENV_OVERRIDES[${key}]}"
+done
+
+# First CLI argument selects the command; "start" when omitted.
+ACTION="${1:-start}"
+API_PID_FILE="${DUCK_API_PID_FILE:-${ROOT_DIR}/data/duck-api.pid}"
+API_LOG_FILE="${DUCK_API_LOG_FILE:-${ROOT_DIR}/data/duck-api.log}"
+API_URL="http://${DUCK_API_HOST:-127.0.0.1}:${DUCK_API_PORT:-8000}"
+# Script that owns the MTP llama-server lifecycle (start/stop/status/logs).
+LLAMA_SCRIPT="${ROOT_DIR}/scripts/llama/start_mtp_main.sh"
+
+# Print the command and environment reference to stdout.
+# The heredoc delimiter is quoted, so the text is emitted verbatim with no
+# variable expansion.
+usage() {
+  cat <<'EOF'
+Usage: scripts/duck-mtp.sh <command>
+
+Commands:
+  start       Start MTP llama-server and DuckLM API in the background
+  stop        Stop DuckLM API and managed MTP llama-server
+  restart     Stop and start the whole local DuckLM stack
+  status      Print process and HTTP health status
+  logs        Show DuckLM API and llama-server logs; use --follow/-f and --lines N
+  help        Show this help
+
+Environment:
+  DUCK_API_HOST       API bind host, default 127.0.0.1
+  DUCK_API_PORT       API port, default 8000
+  DUCK_API_PID_FILE   API PID file path
+  DUCK_API_LOG_FILE   API log file path
+  DUCK_API_COMMAND    API command override, default ".venv/bin/python -m duck_core.api"
+
+MTP llama-server environment is handled by scripts/llama/start_mtp_main.sh.
+EOF
+}
+
+# Succeed only when API_PID_FILE exists, holds a purely numeric PID, and that
+# PID can be signalled (kill -0).
+api_is_running() {
+  local recorded_pid
+  if [[ ! -f "${API_PID_FILE}" ]]; then
+    return 1
+  fi
+  recorded_pid="$(cat "${API_PID_FILE}")"
+  if [[ ! "${recorded_pid}" =~ ^[0-9]+$ ]]; then
+    return 1
+  fi
+  kill -0 "${recorded_pid}" 2>/dev/null
+}
+
+# Print the contents of API_PID_FILE; print nothing (and still succeed) when
+# the file does not exist.
+api_pid_value() {
+  [[ -f "${API_PID_FILE}" ]] || return 0
+  cat "${API_PID_FILE}"
+}
+
+# Start the DuckLM API in the background and record its PID in API_PID_FILE.
+#
+# Runs DUCK_API_COMMAND (default ".venv/bin/python -m duck_core.api") through
+# `bash -lc` from ROOT_DIR, detached with nohup (plus setsid when available),
+# appending its output to API_LOG_FILE. Then probes ${API_URL}/health up to
+# 20 times, 0.25 s apart.
+#
+# Returns 0 when the API was already running or the new process is alive
+# (healthy or not ready yet); returns 1 when the process exits during startup.
+start_api() {
+  if api_is_running; then
+    echo "DuckLM API already running: pid=$(api_pid_value)"
+    return 0
+  fi
+
+  mkdir -p "$(dirname "${API_PID_FILE}")" "$(dirname "${API_LOG_FILE}")"
+  rm -f "${API_PID_FILE}"
+
+  local command_string="${DUCK_API_COMMAND:-.venv/bin/python -m duck_core.api}"
+  # Detach from the terminal; put the process in its own session when possible.
+  local launcher=(nohup)
+  if command -v setsid >/dev/null 2>&1; then
+    launcher+=(setsid)
+  fi
+
+  echo "Starting DuckLM API..."
+  echo "Command: ${command_string}" >> "${API_LOG_FILE}"
+  (
+    # Subshell keeps the directory change local to the launch.
+    cd "${ROOT_DIR}"
+    "${launcher[@]}" bash -lc "${command_string}" >> "${API_LOG_FILE}" 2>&1 < /dev/null &
+    echo "$!" > "${API_PID_FILE}"
+  )
+
+  sleep 0.2
+  if ! api_is_running; then
+    echo "DuckLM API failed to start. See ${API_LOG_FILE}" >&2
+    rm -f "${API_PID_FILE}"
+    return 1
+  fi
+
+  echo "DuckLM API started: pid=$(api_pid_value)"
+  echo "WebChat: ${API_URL}/"
+  echo "Log: ${API_LOG_FILE}"
+
+  # Without curl the health endpoint cannot be probed, so waiting is pointless.
+  if command -v curl >/dev/null 2>&1; then
+    for _ in {1..20}; do
+      if curl --noproxy "*" -fsS "${API_URL}/health" >/dev/null 2>&1; then
+        echo "DuckLM API health: ok (${API_URL})"
+        return 0
+      fi
+      # A process that died while we were waiting is a failed start, not a
+      # slow one; report it instead of claiming success.
+      if ! api_is_running; then
+        echo "DuckLM API failed to start. See ${API_LOG_FILE}" >&2
+        rm -f "${API_PID_FILE}"
+        return 1
+      fi
+      sleep 0.25
+    done
+  fi
+  echo "DuckLM API health: not ready yet (${API_URL})"
+  return 0
+}
+
+# Stop the DuckLM API recorded in API_PID_FILE.
+# Sends SIGTERM, checks up to 30 times (0.2 s apart) for the process to exit,
+# then falls back to SIGKILL. The PID file is removed on every path.
+stop_api() {
+  local target_pid attempt
+
+  if api_is_running; then
+    target_pid="$(api_pid_value)"
+  else
+    rm -f "${API_PID_FILE}"
+    echo "DuckLM API not running"
+    return 0
+  fi
+
+  echo "Stopping DuckLM API: pid=${target_pid}"
+  kill "${target_pid}" 2>/dev/null || true
+
+  attempt=0
+  while (( attempt < 30 )); do
+    if ! kill -0 "${target_pid}" 2>/dev/null; then
+      rm -f "${API_PID_FILE}"
+      echo "DuckLM API stopped"
+      return 0
+    fi
+    sleep 0.2
+    attempt=$(( attempt + 1 ))
+  done
+
+  echo "DuckLM API did not stop after SIGTERM; sending SIGKILL"
+  kill -9 "${target_pid}" 2>/dev/null || true
+  rm -f "${API_PID_FILE}"
+  echo "DuckLM API stopped"
+}
+
+# Bring up the whole stack: llama-server first (via LLAMA_SCRIPT), then the
+# API, then print the combined status. Under `set -e` a failing step aborts
+# the script before the later steps run.
+start_stack() {
+  "${LLAMA_SCRIPT}" start
+  start_api
+  echo
+  echo "Status:"
+  status_stack
+}
+
+# Tear down the stack in reverse start order: the API first, then the
+# llama-server managed by LLAMA_SCRIPT.
+stop_stack() {
+  stop_api
+  "${LLAMA_SCRIPT}" stop
+}
+
+# Report the state of the API and of llama-server.
+# Returns 3 when the API is not running; otherwise returns whatever
+# `LLAMA_SCRIPT status` returned (0 when both are up). A stale API PID file is
+# removed as a side effect.
+status_stack() {
+  local api_rc=0
+  local llama_rc=0
+
+  if ! api_is_running; then
+    api_rc=3
+    if [[ -f "${API_PID_FILE}" ]]; then
+      echo "DuckLM API not running; removing stale pid file ${API_PID_FILE}"
+      rm -f "${API_PID_FILE}"
+    else
+      echo "DuckLM API not running"
+    fi
+  else
+    echo "DuckLM API running: pid=$(api_pid_value)"
+    if command -v curl >/dev/null 2>&1 && curl --noproxy "*" -fsS "${API_URL}/health" >/dev/null 2>&1; then
+      echo "DuckLM API health: ok (${API_URL})"
+    else
+      echo "DuckLM API health: not ready (${API_URL})"
+    fi
+  fi
+
+  # Always query llama-server too, so both components are reported.
+  "${LLAMA_SCRIPT}" status || llama_rc=$?
+
+  # The API's failure code takes precedence over llama-server's.
+  if [[ "${api_rc}" == "0" ]]; then
+    return "${llama_rc}"
+  fi
+  return "${api_rc}"
+}
+
+# Show the DuckLM API and MTP llama-server logs.
+#
+# Arguments: the command word (dropped by the leading shift), followed by any
+# of -f/--follow and --lines N (default 100).
+# In follow mode both log files are tailed together; otherwise the API log is
+# printed here and the llama-server log is printed by LLAMA_SCRIPT.
+# Returns 2 on an unknown argument or a non-numeric --lines value.
+logs_stack() {
+  local follow=0
+  local lines=100
+  shift || true
+  while [[ $# -gt 0 ]]; do
+    case "$1" in
+      -f|--follow)
+        follow=1
+        shift
+        ;;
+      --lines)
+        lines="${2:?--lines requires a value}"
+        shift 2
+        ;;
+      *)
+        echo "Unknown logs argument: $1" >&2
+        return 2
+        ;;
+    esac
+  done
+
+  # Reject a bad count up front; otherwise the header would be printed and the
+  # failure would only surface from inside tail.
+  local count_re='^[+]?[0-9]+$'
+  if [[ ! "${lines}" =~ ${count_re} ]]; then
+    echo "--lines expects a number, got: ${lines}" >&2
+    return 2
+  fi
+
+  local llama_log_file="${DUCK_LLAMA_LOG_FILE:-${ROOT_DIR}/data/llama-mtp.log}"
+
+  # Create the API log if needed so tail does not fail on a fresh checkout.
+  mkdir -p "$(dirname "${API_LOG_FILE}")"
+  touch "${API_LOG_FILE}"
+  if [[ "${follow}" == "1" ]]; then
+    mkdir -p "$(dirname "${llama_log_file}")"
+    touch "${llama_log_file}"
+    tail -n "${lines}" -f "${API_LOG_FILE}" "${llama_log_file}"
+  else
+    echo "==> DuckLM API log: ${API_LOG_FILE} <=="
+    tail -n "${lines}" "${API_LOG_FILE}"
+    echo
+    echo "==> llama-server log: ${llama_log_file} <=="
+    "${LLAMA_SCRIPT}" logs --lines "${lines}"
+  fi
+}
+
+# Dispatch on the first CLI argument. `logs` receives the full argument list
+# (including the command word, which logs_stack drops). Unknown commands print
+# the usage text to stderr and exit with status 2.
+case "${ACTION}" in
+  start)
+    start_stack
+    ;;
+  stop)
+    stop_stack
+    ;;
+  restart)
+    stop_stack
+    start_stack
+    ;;
+  status)
+    status_stack
+    ;;
+  logs)
+    logs_stack "$@"
+    ;;
+  help|-h|--help)
+    usage
+    ;;
+  *)
+    echo "Unknown command: ${ACTION}" >&2
+    usage >&2
+    exit 2
+    ;;
+esac
--- a/scripts/duck.sh
+++ b/scripts/duck.sh
@ -0,0 +1,262 @@
+#!/usr/bin/env bash
+# Service wrapper for the local DuckLM stack: manages the DuckLM API process
+# itself and delegates llama-server handling to scripts/llama/start_main.sh.
+set -euo pipefail
+
+# Repository root, resolved from this script's location (one level above scripts/).
+ROOT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)"
+
+# Variables whose caller-provided values must win over anything set in .env.
+ENV_KEYS=(
+  DUCK_LLAMA_SERVER_BIN
+  DUCK_MAIN_MODEL_PATH
+  DUCK_MAIN_PORT
+  DUCK_CTX_SIZE
+  DUCK_N_GPU_LAYERS
+  DUCK_LLAMA_DEVICE
+  DUCK_PARALLEL
+  DUCK_LLAMA_PID_FILE
+  DUCK_LLAMA_LOG_FILE
+  DUCK_LLAMA_EXTRA_ARGS
+  DUCK_HOST
+  DUCK_API_HOST
+  DUCK_API_PORT
+  DUCK_API_PID_FILE
+  DUCK_API_LOG_FILE
+  DUCK_API_COMMAND
+)
+# Snapshot the values already present in the environment before .env is sourced.
+declare -A ENV_OVERRIDES=()
+for key in "${ENV_KEYS[@]}"; do
+  if [[ -v "${key}" ]]; then
+    ENV_OVERRIDES["${key}"]="${!key}"
+  fi
+done
+
+# Load project defaults from .env; `set -a` exports every variable it assigns.
+if [[ -f "${ROOT_DIR}/.env" ]]; then
+  set -a
+  # shellcheck disable=SC1091
+  source "${ROOT_DIR}/.env"
+  set +a
+fi
+# Re-apply the snapshot so the caller's environment overrides .env.
+for key in "${!ENV_OVERRIDES[@]}"; do
+  export "${key}=${ENV_OVERRIDES[${key}]}"
+done
+
+# First CLI argument selects the command; "start" when omitted.
+ACTION="${1:-start}"
+API_PID_FILE="${DUCK_API_PID_FILE:-${ROOT_DIR}/data/duck-api.pid}"
+API_LOG_FILE="${DUCK_API_LOG_FILE:-${ROOT_DIR}/data/duck-api.log}"
+API_URL="http://${DUCK_API_HOST:-127.0.0.1}:${DUCK_API_PORT:-8000}"
+# Script that owns the llama-server lifecycle (start/stop/status/logs).
+LLAMA_SCRIPT="${ROOT_DIR}/scripts/llama/start_main.sh"
+
+# Print the command and environment reference to stdout.
+# The heredoc delimiter is quoted, so the text is emitted verbatim with no
+# variable expansion.
+usage() {
+  cat <<'EOF'
+Usage: scripts/duck.sh <command>
+
+Commands:
+  start       Start llama-server and DuckLM API in the background
+  stop        Stop DuckLM API and managed llama-server
+  restart     Stop and start the whole local DuckLM stack
+  status      Print process and HTTP health status
+  logs        Show DuckLM API and llama-server logs; use --follow/-f and --lines N
+  help        Show this help
+
+Environment:
+  DUCK_API_HOST       API bind host, default 127.0.0.1
+  DUCK_API_PORT       API port, default 8000
+  DUCK_API_PID_FILE   API PID file path
+  DUCK_API_LOG_FILE   API log file path
+  DUCK_API_COMMAND    API command override, default ".venv/bin/python -m duck_core.api"
+
+llama-server environment is handled by scripts/llama/start_main.sh.
+EOF
+}
+
+# Succeed only when API_PID_FILE exists, holds a purely numeric PID, and that
+# PID can be signalled (kill -0).
+api_is_running() {
+  local recorded_pid
+  if [[ ! -f "${API_PID_FILE}" ]]; then
+    return 1
+  fi
+  recorded_pid="$(cat "${API_PID_FILE}")"
+  if [[ ! "${recorded_pid}" =~ ^[0-9]+$ ]]; then
+    return 1
+  fi
+  kill -0 "${recorded_pid}" 2>/dev/null
+}
+
+# Print the contents of API_PID_FILE; print nothing (and still succeed) when
+# the file does not exist.
+api_pid_value() {
+  [[ -f "${API_PID_FILE}" ]] || return 0
+  cat "${API_PID_FILE}"
+}
+
+# Start the DuckLM API in the background and record its PID in API_PID_FILE.
+#
+# Runs DUCK_API_COMMAND (default ".venv/bin/python -m duck_core.api") through
+# `bash -lc` from ROOT_DIR, detached with nohup (plus setsid when available),
+# appending its output to API_LOG_FILE. Then probes ${API_URL}/health up to
+# 20 times, 0.25 s apart.
+#
+# Returns 0 when the API was already running or the new process is alive
+# (healthy or not ready yet); returns 1 when the process exits during startup.
+start_api() {
+  if api_is_running; then
+    echo "DuckLM API already running: pid=$(api_pid_value)"
+    return 0
+  fi
+
+  mkdir -p "$(dirname "${API_PID_FILE}")" "$(dirname "${API_LOG_FILE}")"
+  rm -f "${API_PID_FILE}"
+
+  local command_string="${DUCK_API_COMMAND:-.venv/bin/python -m duck_core.api}"
+  # Detach from the terminal; put the process in its own session when possible.
+  local launcher=(nohup)
+  if command -v setsid >/dev/null 2>&1; then
+    launcher+=(setsid)
+  fi
+
+  echo "Starting DuckLM API..."
+  echo "Command: ${command_string}" >> "${API_LOG_FILE}"
+  (
+    # Subshell keeps the directory change local to the launch.
+    cd "${ROOT_DIR}"
+    "${launcher[@]}" bash -lc "${command_string}" >> "${API_LOG_FILE}" 2>&1 < /dev/null &
+    echo "$!" > "${API_PID_FILE}"
+  )
+
+  sleep 0.2
+  if ! api_is_running; then
+    echo "DuckLM API failed to start. See ${API_LOG_FILE}" >&2
+    rm -f "${API_PID_FILE}"
+    return 1
+  fi
+
+  echo "DuckLM API started: pid=$(api_pid_value)"
+  echo "WebChat: ${API_URL}/"
+  echo "Log: ${API_LOG_FILE}"
+
+  # Without curl the health endpoint cannot be probed, so waiting is pointless.
+  if command -v curl >/dev/null 2>&1; then
+    for _ in {1..20}; do
+      if curl --noproxy "*" -fsS "${API_URL}/health" >/dev/null 2>&1; then
+        echo "DuckLM API health: ok (${API_URL})"
+        return 0
+      fi
+      # A process that died while we were waiting is a failed start, not a
+      # slow one; report it instead of claiming success.
+      if ! api_is_running; then
+        echo "DuckLM API failed to start. See ${API_LOG_FILE}" >&2
+        rm -f "${API_PID_FILE}"
+        return 1
+      fi
+      sleep 0.25
+    done
+  fi
+  echo "DuckLM API health: not ready yet (${API_URL})"
+  return 0
+}
+
+# Stop the DuckLM API recorded in API_PID_FILE.
+# Sends SIGTERM, checks up to 30 times (0.2 s apart) for the process to exit,
+# then falls back to SIGKILL. The PID file is removed on every path.
+stop_api() {
+  local target_pid attempt
+
+  if api_is_running; then
+    target_pid="$(api_pid_value)"
+  else
+    rm -f "${API_PID_FILE}"
+    echo "DuckLM API not running"
+    return 0
+  fi
+
+  echo "Stopping DuckLM API: pid=${target_pid}"
+  kill "${target_pid}" 2>/dev/null || true
+
+  attempt=0
+  while (( attempt < 30 )); do
+    if ! kill -0 "${target_pid}" 2>/dev/null; then
+      rm -f "${API_PID_FILE}"
+      echo "DuckLM API stopped"
+      return 0
+    fi
+    sleep 0.2
+    attempt=$(( attempt + 1 ))
+  done
+
+  echo "DuckLM API did not stop after SIGTERM; sending SIGKILL"
+  kill -9 "${target_pid}" 2>/dev/null || true
+  rm -f "${API_PID_FILE}"
+  echo "DuckLM API stopped"
+}
+
+# Bring up the whole stack: llama-server first (via LLAMA_SCRIPT), then the
+# API, then print the combined status. Under `set -e` a failing step aborts
+# the script before the later steps run.
+start_stack() {
+  "${LLAMA_SCRIPT}" start
+  start_api
+  echo
+  echo "Status:"
+  status_stack
+}
+
+# Tear down the stack in reverse start order: the API first, then the
+# llama-server managed by LLAMA_SCRIPT.
+stop_stack() {
+  stop_api
+  "${LLAMA_SCRIPT}" stop
+}
+
+# Report the state of the API and of llama-server.
+# Returns 3 when the API is not running; otherwise returns whatever
+# `LLAMA_SCRIPT status` returned (0 when both are up). A stale API PID file is
+# removed as a side effect.
+status_stack() {
+  local api_rc=0
+  local llama_rc=0
+
+  if ! api_is_running; then
+    api_rc=3
+    if [[ -f "${API_PID_FILE}" ]]; then
+      echo "DuckLM API not running; removing stale pid file ${API_PID_FILE}"
+      rm -f "${API_PID_FILE}"
+    else
+      echo "DuckLM API not running"
+    fi
+  else
+    echo "DuckLM API running: pid=$(api_pid_value)"
+    if command -v curl >/dev/null 2>&1 && curl --noproxy "*" -fsS "${API_URL}/health" >/dev/null 2>&1; then
+      echo "DuckLM API health: ok (${API_URL})"
+    else
+      echo "DuckLM API health: not ready (${API_URL})"
+    fi
+  fi
+
+  # Always query llama-server too, so both components are reported.
+  "${LLAMA_SCRIPT}" status || llama_rc=$?
+
+  # The API's failure code takes precedence over llama-server's.
+  if [[ "${api_rc}" == "0" ]]; then
+    return "${llama_rc}"
+  fi
+  return "${api_rc}"
+}
+
+# Show the DuckLM API and llama-server logs.
+#
+# Arguments: the command word (dropped by the leading shift), followed by any
+# of -f/--follow and --lines N (default 100).
+# In follow mode both log files are tailed together; otherwise the API log is
+# printed here and the llama-server log is printed by LLAMA_SCRIPT.
+# Returns 2 on an unknown argument or a non-numeric --lines value.
+logs_stack() {
+  local follow=0
+  local lines=100
+  shift || true
+  while [[ $# -gt 0 ]]; do
+    case "$1" in
+      -f|--follow)
+        follow=1
+        shift
+        ;;
+      --lines)
+        lines="${2:?--lines requires a value}"
+        shift 2
+        ;;
+      *)
+        echo "Unknown logs argument: $1" >&2
+        return 2
+        ;;
+    esac
+  done
+
+  # Reject a bad count up front; otherwise the header would be printed and the
+  # failure would only surface from inside tail.
+  local count_re='^[+]?[0-9]+$'
+  if [[ ! "${lines}" =~ ${count_re} ]]; then
+    echo "--lines expects a number, got: ${lines}" >&2
+    return 2
+  fi
+
+  local llama_log_file="${DUCK_LLAMA_LOG_FILE:-${ROOT_DIR}/data/llama-main.log}"
+
+  # Create the API log if needed so tail does not fail on a fresh checkout.
+  mkdir -p "$(dirname "${API_LOG_FILE}")"
+  touch "${API_LOG_FILE}"
+  if [[ "${follow}" == "1" ]]; then
+    mkdir -p "$(dirname "${llama_log_file}")"
+    touch "${llama_log_file}"
+    tail -n "${lines}" -f "${API_LOG_FILE}" "${llama_log_file}"
+  else
+    echo "==> DuckLM API log: ${API_LOG_FILE} <=="
+    tail -n "${lines}" "${API_LOG_FILE}"
+    echo
+    echo "==> llama-server log: ${llama_log_file} <=="
+    "${LLAMA_SCRIPT}" logs --lines "${lines}"
+  fi
+}
+
+# Dispatch on the first CLI argument. `logs` receives the full argument list
+# (including the command word, which logs_stack drops). Unknown commands print
+# the usage text to stderr and exit with status 2.
+case "${ACTION}" in
+  start)
+    start_stack
+    ;;
+  stop)
+    stop_stack
+    ;;
+  restart)
+    stop_stack
+    start_stack
+    ;;
+  status)
+    status_stack
+    ;;
+  logs)
+    logs_stack "$@"
+    ;;
+  help|-h|--help)
+    usage
+    ;;
+  *)
+    echo "Unknown command: ${ACTION}" >&2
+    usage >&2
+    exit 2
+    ;;
+esac
--- a/scripts/llama/start_main.sh
+++ b/scripts/llama/start_main.sh
@ -37,6 +37,7 @@ ACTION="${1:-start}"
 PID_FILE="${DUCK_LLAMA_PID_FILE:-${ROOT_DIR}/data/llama-main.pid}"
 LOG_FILE="${DUCK_LLAMA_LOG_FILE:-${ROOT_DIR}/data/llama-main.log}"
 BASE_URL="http://${DUCK_HOST:-127.0.0.1}:${DUCK_MAIN_PORT:-8081}/v1"
+LLAMA_BIN_DIR=""

 resolve_project_path() {
  local value="$1"
@ -125,6 +126,7 @@ start() {
  llama_bin="${DUCK_LLAMA_SERVER_BIN:-llama-server}"
  if [[ "${llama_bin}" == */* ]]; then
    llama_bin="$(resolve_project_path "${llama_bin}")"
+    LLAMA_BIN_DIR="$(dirname "${llama_bin}")"
  fi
  model_path="$(resolve_project_path "${DUCK_MAIN_MODEL_PATH}")"
  local command=(
@ -152,9 +154,9 @@ start() {
  echo "Starting llama-server..."
  echo "Command: ${command[*]}" >> "${LOG_FILE}"
  if command -v setsid >/dev/null 2>&1; then
-    nohup setsid "${command[@]}" >> "${LOG_FILE}" 2>&1 &
+    nohup setsid env LD_LIBRARY_PATH="${LLAMA_BIN_DIR}${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}" "${command[@]}" >> "${LOG_FILE}" 2>&1 &
  else
-    nohup "${command[@]}" >> "${LOG_FILE}" 2>&1 &
+    nohup env LD_LIBRARY_PATH="${LLAMA_BIN_DIR}${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}" "${command[@]}" >> "${LOG_FILE}" 2>&1 &
  fi
  local pid=$!
  echo "${pid}" > "${PID_FILE}"
--- a/scripts/llama/start_mtp_main.sh
+++ b/scripts/llama/start_mtp_main.sh
@ -0,0 +1,271 @@
+#!/usr/bin/env bash
+# Lifecycle manager (start/stop/restart/status/logs) for a llama-server
+# launched with `--spec-type draft-mtp`.
+set -euo pipefail
+
+# Repository root, resolved from this script's location (two levels up).
+ROOT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")/../.." && pwd)"
+
+# Variables whose caller-provided values must win over anything set in .env.
+ENV_KEYS=(
+  DUCK_LLAMA_SERVER_BIN
+  DUCK_MTP_MODEL_PATH
+  DUCK_MAIN_PORT
+  DUCK_CTX_SIZE
+  DUCK_N_GPU_LAYERS
+  DUCK_LLAMA_DEVICE
+  DUCK_PARALLEL
+  DUCK_LLAMA_PID_FILE
+  DUCK_LLAMA_LOG_FILE
+  DUCK_MTP_FLAGS
+  DUCK_HOST
+)
+# Snapshot the values already present in the environment before .env is sourced.
+declare -A ENV_OVERRIDES=()
+for key in "${ENV_KEYS[@]}"; do
+  if [[ -v "${key}" ]]; then
+    ENV_OVERRIDES["${key}"]="${!key}"
+  fi
+done
+
+# Load project defaults from .env; `set -a` exports every variable it assigns.
+if [[ -f "${ROOT_DIR}/.env" ]]; then
+  set -a
+  # shellcheck disable=SC1091
+  source "${ROOT_DIR}/.env"
+  set +a
+fi
+# Re-apply the snapshot so the caller's environment overrides .env.
+for key in "${!ENV_OVERRIDES[@]}"; do
+  export "${key}=${ENV_OVERRIDES[${key}]}"
+done
+
+# First CLI argument selects the command; "start" when omitted.
+ACTION="${1:-start}"
+PID_FILE="${DUCK_LLAMA_PID_FILE:-${ROOT_DIR}/data/llama-mtp.pid}"
+LOG_FILE="${DUCK_LLAMA_LOG_FILE:-${ROOT_DIR}/data/llama-mtp.log}"
+BASE_URL="http://${DUCK_HOST:-127.0.0.1}:${DUCK_MAIN_PORT:-8081}/v1"
+# Directory of the llama-server binary; filled in by start() when the binary
+# is given as a path, and then prepended to LD_LIBRARY_PATH for the server.
+LLAMA_BIN_DIR=""
+
+# Print $1 unchanged when it is absolute; otherwise print it relative to
+# ROOT_DIR, with a single leading "./" removed.
+resolve_project_path() {
+  local candidate="$1"
+  case "${candidate}" in
+    /*)
+      printf '%s\n' "${candidate}"
+      ;;
+    *)
+      printf '%s\n' "${ROOT_DIR}/${candidate#./}"
+      ;;
+  esac
+}
+
+# Print the command and environment reference to stdout.
+# The heredoc delimiter is quoted, so the text is emitted verbatim with no
+# variable expansion.
+usage() {
+  cat <<'EOF'
+Usage: scripts/llama/start_mtp_main.sh <command>
+
+Commands:
+  start       Start MTP llama-server in the background
+  stop        Stop the managed MTP llama-server process
+  restart     Stop and start MTP llama-server
+  status      Print process and HTTP health status
+  logs        Show logs; use --follow/-f and --lines N
+  help        Show this help
+
+Environment:
+  DUCK_LLAMA_SERVER_BIN  Path to llama-server binary
+  DUCK_MTP_MODEL_PATH    Path to MTP GGUF model
+  DUCK_HOST              Bind host, default 127.0.0.1
+  DUCK_MAIN_PORT         Port, default 8081
+  DUCK_CTX_SIZE          Context size, default 65536
+  DUCK_N_GPU_LAYERS      GPU layers, default auto
+  DUCK_LLAMA_DEVICE      Device name, for example Vulkan0
+  DUCK_PARALLEL          Server slots, default 1
+  DUCK_LLAMA_PID_FILE    PID file path
+  DUCK_LLAMA_LOG_FILE    Log file path
+  DUCK_MTP_FLAGS         Extra MTP llama-server args
+EOF
+}
+
+# Succeed only when PID_FILE exists, holds a purely numeric PID, and that PID
+# can be signalled (kill -0).
+is_running() {
+  local recorded_pid
+  if [[ ! -f "${PID_FILE}" ]]; then
+    return 1
+  fi
+  recorded_pid="$(cat "${PID_FILE}")"
+  if [[ ! "${recorded_pid}" =~ ^[0-9]+$ ]]; then
+    return 1
+  fi
+  kill -0 "${recorded_pid}" 2>/dev/null
+}
+
+# Print the contents of PID_FILE; print nothing (and still succeed) when the
+# file does not exist.
+pid_value() {
+  [[ -f "${PID_FILE}" ]] || return 0
+  cat "${PID_FILE}"
+}
+
+# Report whether the managed llama-server is running and whether
+# ${BASE_URL}/models answers.
+# Returns 0 when the process is alive (regardless of HTTP readiness) and 3
+# when it is not; a stale PID file is removed in the latter case.
+status() {
+  if ! is_running; then
+    if [[ -f "${PID_FILE}" ]]; then
+      echo "llama-server not running; removing stale pid file ${PID_FILE}"
+      rm -f "${PID_FILE}"
+    else
+      echo "llama-server not running"
+    fi
+    return 3
+  fi
+
+  echo "llama-server running: pid=$(pid_value)"
+  if command -v curl >/dev/null 2>&1 && curl --noproxy "*" -fsS "${BASE_URL}/models" >/dev/null 2>&1; then
+    echo "HTTP health: ok (${BASE_URL})"
+  else
+    echo "HTTP health: not ready (${BASE_URL})"
+  fi
+  return 0
+}
+
+# Start the MTP llama-server in the background and record its PID in PID_FILE.
+#
+# Requires DUCK_MTP_MODEL_PATH. The binary comes from DUCK_LLAMA_SERVER_BIN
+# (default: "llama-server" looked up on PATH); when it is given as a path, its
+# directory is prepended to LD_LIBRARY_PATH so shared libraries located next
+# to the binary can be found. The binary's --help output must mention
+# "draft-mtp", otherwise the start is refused.
+#
+# Returns 0 when the server was already running or is alive 0.2 s after
+# launch; returns 1 when the binary is missing, lacks draft-mtp support, or
+# exits immediately.
+start() {
+  if is_running; then
+    echo "MTP llama-server already running: pid=$(pid_value)"
+    return 0
+  fi
+
+  : "${DUCK_MTP_MODEL_PATH:?DUCK_MTP_MODEL_PATH is required}"
+
+  mkdir -p "$(dirname "${PID_FILE}")" "$(dirname "${LOG_FILE}")"
+  rm -f "${PID_FILE}"
+
+  local llama_bin mtp_model_path
+  llama_bin="${DUCK_LLAMA_SERVER_BIN:-llama-server}"
+  if [[ "${llama_bin}" == */* ]]; then
+    llama_bin="$(resolve_project_path "${llama_bin}")"
+    LLAMA_BIN_DIR="$(dirname "${llama_bin}")"
+  fi
+  mtp_model_path="$(resolve_project_path "${DUCK_MTP_MODEL_PATH}")"
+
+  # Fail with an accurate message instead of letting a missing binary be
+  # misreported as "no draft-mtp support" by the --help probe below.
+  if ! command -v "${llama_bin}" >/dev/null 2>&1; then
+    echo "llama-server binary not found or not executable: ${llama_bin}" >&2
+    return 1
+  fi
+
+  # Build LD_LIBRARY_PATH without empty entries: the dynamic loader treats an
+  # empty entry (such as a leading ':') as the current working directory.
+  local library_path="${LD_LIBRARY_PATH:-}"
+  if [[ -n "${LLAMA_BIN_DIR}" ]]; then
+    library_path="${LLAMA_BIN_DIR}${library_path:+:${library_path}}"
+  fi
+  local run_env=(env)
+  if [[ -n "${library_path}" ]]; then
+    run_env+=("LD_LIBRARY_PATH=${library_path}")
+  fi
+
+  local help_text
+  help_text="$("${run_env[@]}" "${llama_bin}" --help 2>&1 || true)"
+  if ! grep -qi "draft-mtp" <<< "${help_text}"; then
+    echo "This llama-server build does not expose draft-mtp speculative decoding." >&2
+    return 1
+  fi
+
+  local command=(
+    "${llama_bin}"
+    -m "${mtp_model_path}"
+    --alias local-main
+    --host "${DUCK_HOST:-127.0.0.1}"
+    --port "${DUCK_MAIN_PORT:-8081}"
+    -c "${DUCK_CTX_SIZE:-65536}"
+    --parallel "${DUCK_PARALLEL:-1}"
+    -ngl "${DUCK_N_GPU_LAYERS:-auto}"
+    --flash-attn on
+    --cache-prompt
+    --metrics
+    --spec-type draft-mtp
+  )
+  if [[ -n "${DUCK_LLAMA_DEVICE:-}" ]]; then
+    command+=(--device "${DUCK_LLAMA_DEVICE}")
+  fi
+  if [[ -n "${DUCK_MTP_FLAGS:-}" ]]; then
+    # Split the flags on whitespace only; globbing is switched off so that a
+    # value containing '*' or '?' is never expanded against the filesystem.
+    local extra_args
+    set -f
+    # shellcheck disable=SC2206
+    extra_args=( ${DUCK_MTP_FLAGS} )
+    set +f
+    command+=("${extra_args[@]}")
+  fi
+
+  # Detach from the terminal; put the server in its own session when possible.
+  local launcher=(nohup)
+  if command -v setsid >/dev/null 2>&1; then
+    launcher+=(setsid)
+  fi
+
+  echo "Starting MTP llama-server..."
+  echo "Command: ${command[*]}" >> "${LOG_FILE}"
+  "${launcher[@]}" "${run_env[@]}" "${command[@]}" >> "${LOG_FILE}" 2>&1 &
+  local pid=$!
+  echo "${pid}" > "${PID_FILE}"
+  sleep 0.2
+
+  if is_running; then
+    echo "MTP llama-server started: pid=${pid}"
+    echo "Log: ${LOG_FILE}"
+    return 0
+  fi
+
+  echo "MTP llama-server failed to start. See ${LOG_FILE}" >&2
+  rm -f "${PID_FILE}"
+  return 1
+}
+
+# Stop the llama-server recorded in PID_FILE.
+# Sends SIGTERM, checks up to 30 times (0.2 s apart) for the process to exit,
+# then falls back to SIGKILL. The PID file is removed on every path.
+stop() {
+  local target_pid attempt
+
+  if is_running; then
+    target_pid="$(pid_value)"
+  else
+    rm -f "${PID_FILE}"
+    echo "llama-server not running"
+    return 0
+  fi
+
+  echo "Stopping MTP llama-server: pid=${target_pid}"
+  kill "${target_pid}" 2>/dev/null || true
+
+  attempt=0
+  while (( attempt < 30 )); do
+    if ! kill -0 "${target_pid}" 2>/dev/null; then
+      rm -f "${PID_FILE}"
+      echo "MTP llama-server stopped"
+      return 0
+    fi
+    sleep 0.2
+    attempt=$(( attempt + 1 ))
+  done
+
+  echo "MTP llama-server did not stop after SIGTERM; sending SIGKILL"
+  kill -9 "${target_pid}" 2>/dev/null || true
+  rm -f "${PID_FILE}"
+  echo "MTP llama-server stopped"
+}
+
+# Stop the managed llama-server (a no-op when it is not running), then start it.
+restart() {
+  stop
+  start
+}
+
+# Show the llama-server log.
+#
+# Arguments: the command word (dropped by the leading shift), followed by any
+# of -f/--follow and --lines N (default 100).
+# Returns 2 on an unknown argument or a non-numeric --lines value.
+logs() {
+  local follow=0
+  local lines=100
+  shift || true
+  while [[ $# -gt 0 ]]; do
+    case "$1" in
+      -f|--follow)
+        follow=1
+        shift
+        ;;
+      --lines)
+        lines="${2:?--lines requires a value}"
+        shift 2
+        ;;
+      *)
+        echo "Unknown logs argument: $1" >&2
+        return 2
+        ;;
+    esac
+  done
+
+  # Reject a bad count here with a clear message rather than letting tail fail.
+  local count_re='^[+]?[0-9]+$'
+  if [[ ! "${lines}" =~ ${count_re} ]]; then
+    echo "--lines expects a number, got: ${lines}" >&2
+    return 2
+  fi
+
+  # Create the log if needed so tail does not fail before the first start.
+  mkdir -p "$(dirname "${LOG_FILE}")"
+  touch "${LOG_FILE}"
+  if [[ "${follow}" == "1" ]]; then
+    tail -n "${lines}" -f "${LOG_FILE}"
+  else
+    tail -n "${lines}" "${LOG_FILE}"
+  fi
+}
+
+# Dispatch on the first CLI argument. `logs` receives the full argument list
+# (including the command word, which logs() drops). Unknown commands print the
+# usage text to stderr and exit with status 2.
+case "${ACTION}" in
+  start)
+    start
+    ;;
+  stop)
+    stop
+    ;;
+  restart)
+    restart
+    ;;
+  status)
+    status
+    ;;
+  logs)
+    logs "$@"
+    ;;
+  help|-h|--help)
+    usage
+    ;;
+  *)
+    echo "Unknown command: ${ACTION}" >&2
+    usage >&2
+    exit 2
+    ;;
+esac
--- a/tests/smoke/test_api_stream_chat.py
+++ b/tests/smoke/test_api_stream_chat.py
@ -56,6 +56,61 @@ def test_stream_chat_endpoint_emits_sse_reasoning_and_content(tmp_path, monkeypa
    assert "answer" in body


+def test_stream_chat_forwards_reasoning_toggle_to_thinker(tmp_path, monkeypatch):
+    """The request's ``reasoning`` value reaches the thinker's streaming call."""
+    monkeypatch.setenv("DUCK_DB_PATH", str(tmp_path / "duck.sqlite3"))
+    seen = {}
+    # Directive returned by the stubbed non-streaming call: answer directly,
+    # with no actions.
+    directive = json.dumps(
+        {
+            "kind": "action_directive",
+            "intent": "answer directly",
+            "risk_level": "none",
+            "actions": [],
+        }
+    )
+
+    async def fake_chat(self, role, messages, temperature=None, max_output_tokens=None, response_format=None):
+        return ModelResponse(
+            role=role,
+            model="local-main",
+            content=directive,
+            reasoning_content=None,
+            raw={},
+            latency_ms=1.0,
+        )
+
+    async def fake_stream_chat(
+        self,
+        role,
+        messages,
+        temperature=None,
+        max_output_tokens=None,
+        response_format=None,
+        reasoning=None,
+    ):
+        # Record what the endpoint passed down so it can be asserted on below.
+        seen.update(role=role, reasoning=reasoning)
+        yield {"type": "content_delta", "delta": "answer"}
+
+    monkeypatch.setattr("duck_core.model_client.ModelClient.chat", fake_chat)
+    monkeypatch.setattr("duck_core.model_client.ModelClient.stream_chat", fake_stream_chat)
+
+    request_payload = {
+        "message": "hello",
+        "workspace": "./workspace",
+        "debug": True,
+        "reasoning": "off",
+    }
+    api = TestClient(create_app())
+    with api.stream("POST", "/v1/chat/stream", json=request_payload) as response:
+        chunks = list(response.iter_text())
+    body = "".join(chunks)
+
+    assert response.status_code == 200
+    assert "event: done" in body
+    assert seen == {"role": "thinker", "reasoning": "off"}
+
+
 def test_stream_chat_runs_memory_policy_and_reflection_after_completion(tmp_path, monkeypatch):
    monkeypatch.setenv("DUCK_DB_PATH", str(tmp_path / "duck.sqlite3"))

--- a/tests/smoke/test_duck_service_script.py
+++ b/tests/smoke/test_duck_service_script.py
@ -0,0 +1,246 @@
+import os
+import subprocess
+import textwrap
+import time
+from pathlib import Path
+
+
+def test_duck_script_manages_llama_and_api(tmp_path):
+    """Drive status/start/logs/stop of scripts/duck.sh against fake executables."""
+    fake_bin = tmp_path / "llama-server"
+    fake_bin.write_text(
+        textwrap.dedent(
+            """\
+            #!/usr/bin/env bash
+            echo "fake llama-server $*" >&2
+            trap 'exit 0' TERM INT
+            while true; do sleep 1; done
+            """
+        )
+    )
+    fake_bin.chmod(0o755)
+
+    fake_api = tmp_path / "fake-api.sh"
+    fake_api.write_text(
+        textwrap.dedent(
+            """\
+            #!/usr/bin/env bash
+            echo "fake duck api $*" >&2
+            trap 'exit 0' TERM INT
+            while true; do sleep 1; done
+            """
+        )
+    )
+    fake_api.chmod(0o755)
+
+    llama_pid_file = tmp_path / "llama.pid"
+    llama_log_file = tmp_path / "llama.log"
+    api_pid_file = tmp_path / "duck-api.pid"
+    api_log_file = tmp_path / "duck-api.log"
+    model_path = tmp_path / "model.gguf"
+    model_path.write_text("fake")
+
+    env = {
+        **os.environ,
+        "DUCK_LLAMA_SERVER_BIN": str(fake_bin),
+        "DUCK_MAIN_MODEL_PATH": str(model_path),
+        "DUCK_LLAMA_PID_FILE": str(llama_pid_file),
+        "DUCK_LLAMA_LOG_FILE": str(llama_log_file),
+        "DUCK_MAIN_PORT": "18081",
+        "DUCK_API_PID_FILE": str(api_pid_file),
+        "DUCK_API_LOG_FILE": str(api_log_file),
+        "DUCK_API_COMMAND": str(fake_api),
+        "DUCK_API_PORT": "18000",
+    }
+
+    def duck(*argv):
+        # Every invocation shares the same environment and captures its output.
+        return subprocess.run(["scripts/duck.sh", *argv], env=env, text=True, capture_output=True)
+
+    # Nothing has been started yet, so status must report both parts as down.
+    before = duck("status")
+    assert before.returncode == 3
+    for fragment in ("DuckLM API not running", "llama-server not running"):
+        assert fragment in before.stdout
+
+    started = duck("start")
+    assert started.returncode == 0
+    for fragment in (
+        "Starting llama-server",
+        "Starting DuckLM API",
+        "Status:",
+        "DuckLM API running",
+        "llama-server running",
+    ):
+        assert fragment in started.stdout
+    assert api_pid_file.exists()
+    assert llama_pid_file.exists()
+
+    try:
+        running = duck("status")
+        assert running.returncode == 0
+        for fragment in ("DuckLM API running", "llama-server running"):
+            assert fragment in running.stdout
+
+        # Give the fake processes a moment to write their first log line.
+        time.sleep(0.2)
+        logs = duck("logs", "--lines", "20")
+        assert logs.returncode == 0
+        for fragment in (
+            "DuckLM API log",
+            "fake duck api",
+            "llama-server log",
+            "--alias local-main",
+        ):
+            assert fragment in logs.stdout
+    finally:
+        after = duck("stop")
+        assert after.returncode == 0
+        assert not api_pid_file.exists()
+        assert not llama_pid_file.exists()
+
+
+def test_duck_mtp_script_starts_mtp_llama_for_duck_api(tmp_path):
+    """duck-mtp.sh launches llama-server with the MTP model and the draft-mtp flag."""
+    fake_bin = tmp_path / "llama-server"
+    fake_bin.write_text(
+        textwrap.dedent(
+            """\
+            #!/usr/bin/env bash
+            if [[ "${1:-}" == "--help" ]]; then
+              echo "supports --spec-type draft-mtp"
+              exit 0
+            fi
+            echo "fake mtp llama-server $*" >&2
+            trap 'exit 0' TERM INT
+            while true; do sleep 1; done
+            """
+        )
+    )
+    fake_bin.chmod(0o755)
+
+    fake_api = tmp_path / "fake-api.sh"
+    fake_api.write_text(
+        textwrap.dedent(
+            """\
+            #!/usr/bin/env bash
+            echo "fake duck api $*" >&2
+            trap 'exit 0' TERM INT
+            while true; do sleep 1; done
+            """
+        )
+    )
+    fake_api.chmod(0o755)
+
+    main_model_path = tmp_path / "main.gguf"
+    mtp_model_path = tmp_path / "mtp.gguf"
+    main_model_path.write_text("main")
+    mtp_model_path.write_text("mtp")
+    llama_pid_file = tmp_path / "llama-mtp.pid"
+    llama_log_file = tmp_path / "llama-mtp.log"
+    api_pid_file = tmp_path / "duck-api-mtp.pid"
+    api_log_file = tmp_path / "duck-api-mtp.log"
+
+    env = {
+        **os.environ,
+        "DUCK_LLAMA_SERVER_BIN": str(fake_bin),
+        "DUCK_MAIN_MODEL_PATH": str(main_model_path),
+        "DUCK_MTP_MODEL_PATH": str(mtp_model_path),
+        "DUCK_LLAMA_PID_FILE": str(llama_pid_file),
+        "DUCK_LLAMA_LOG_FILE": str(llama_log_file),
+        "DUCK_MAIN_PORT": "18081",
+        "DUCK_API_PID_FILE": str(api_pid_file),
+        "DUCK_API_LOG_FILE": str(api_log_file),
+        "DUCK_API_COMMAND": str(fake_api),
+        "DUCK_API_PORT": "18000",
+    }
+
+    def duck_mtp(*argv):
+        # Every invocation shares the same environment and captures its output.
+        return subprocess.run(["scripts/duck-mtp.sh", *argv], env=env, text=True, capture_output=True)
+
+    started = duck_mtp("start")
+    assert started.returncode == 0
+    for fragment in (
+        "Starting MTP llama-server",
+        "DuckLM API running",
+        "llama-server running",
+    ):
+        assert fragment in started.stdout
+
+    try:
+        logs = duck_mtp("logs", "--lines", "30")
+        assert logs.returncode == 0
+        for fragment in (
+            "--alias local-main",
+            "--spec-type draft-mtp",
+            f"-m {mtp_model_path}",
+        ):
+            assert fragment in logs.stdout
+        # The MTP launch must use neither a separate draft model nor the main model.
+        for fragment in ("--model-draft", str(main_model_path)):
+            assert fragment not in logs.stdout
+    finally:
+        stopped = duck_mtp("stop")
+        assert stopped.returncode == 0
+        assert not api_pid_file.exists()
+        assert not llama_pid_file.exists()
+
+
+def test_duck_mtp_script_sets_llama_bin_dir_library_path_for_help_check(tmp_path):
+    """duck-mtp.sh puts the binary's directory on LD_LIBRARY_PATH.
+
+    The fake llama-server exits with 127 unless its own directory is listed in
+    LD_LIBRARY_PATH, so a successful start shows the directory was supplied
+    for the --help probe as well as for the real launch.
+    """
+    bin_dir = tmp_path / "build" / "bin"
+    bin_dir.mkdir(parents=True)
+    fake_bin = bin_dir / "llama-server"
+    fake_bin.write_text(
+        textwrap.dedent(
+            """\
+            #!/usr/bin/env bash
+            case ":${LD_LIBRARY_PATH:-}:" in
+              *":$(dirname "$0"):"*) ;;
+              *)
+                echo "error while loading shared libraries: libllama-common.so.0" >&2
+                exit 127
+                ;;
+            esac
+            if [[ "${1:-}" == "--help" ]]; then
+              echo "supports --spec-type draft-mtp"
+              exit 0
+            fi
+            echo "fake mtp llama-server $*" >&2
+            trap 'exit 0' TERM INT
+            while true; do sleep 1; done
+            """
+        )
+    )
+    fake_bin.chmod(0o755)
+
+    fake_api = tmp_path / "fake-api.sh"
+    fake_api.write_text(
+        textwrap.dedent(
+            """\
+            #!/usr/bin/env bash
+            trap 'exit 0' TERM INT
+            while true; do sleep 1; done
+            """
+        )
+    )
+    fake_api.chmod(0o755)
+
+    main_model_path = tmp_path / "main.gguf"
+    mtp_model_path = tmp_path / "mtp.gguf"
+    main_model_path.write_text("main")
+    mtp_model_path.write_text("mtp")
+
+    env = {
+        **os.environ,
+        "LD_LIBRARY_PATH": "",
+        "DUCK_LLAMA_SERVER_BIN": str(fake_bin),
+        "DUCK_MAIN_MODEL_PATH": str(main_model_path),
+        "DUCK_MTP_MODEL_PATH": str(mtp_model_path),
+        "DUCK_LLAMA_PID_FILE": str(tmp_path / "llama-mtp.pid"),
+        "DUCK_LLAMA_LOG_FILE": str(tmp_path / "llama-mtp.log"),
+        "DUCK_MAIN_PORT": "18081",
+        "DUCK_API_PID_FILE": str(tmp_path / "duck-api-mtp.pid"),
+        "DUCK_API_LOG_FILE": str(tmp_path / "duck-api-mtp.log"),
+        "DUCK_API_COMMAND": str(fake_api),
+        "DUCK_API_PORT": "18000",
+    }
+
+    def duck_mtp(*argv):
+        # Every invocation shares the same environment and captures its output.
+        return subprocess.run(["scripts/duck-mtp.sh", *argv], env=env, text=True, capture_output=True)
+
+    started = duck_mtp("start")
+    assert started.returncode == 0
+
+    try:
+        logs = duck_mtp("logs", "--lines", "30")
+        assert logs.returncode == 0
+        assert "--spec-type draft-mtp" in logs.stdout
+        assert "--model-draft" not in logs.stdout
+    finally:
+        # Best-effort cleanup; the result is intentionally not asserted on.
+        duck_mtp("stop")
--- a/tests/smoke/test_llama_service_script.py
+++ b/tests/smoke/test_llama_service_script.py
@ -55,3 +55,54 @@ def test_start_main_script_manages_pid_status_stop_and_logs(tmp_path):
        stopped = subprocess.run([script, "stop"], env=env, text=True, capture_output=True)
        assert stopped.returncode == 0
        assert not pid_file.exists()
+
+
+def test_start_main_script_sets_llama_bin_dir_library_path(tmp_path):
+    bin_dir = tmp_path / "build" / "bin"
+    bin_dir.mkdir(parents=True)
+    fake_bin = bin_dir / "llama-server"
+    fake_bin.write_text(
+        textwrap.dedent(
+            """\
+            #!/usr/bin/env bash
+            case ":${LD_LIBRARY_PATH:-}:" in
+              *":$(dirname "$0"):"*) ;;
+              *)
+                echo "error while loading shared libraries: libllama-common.so.0" >&2
+                exit 127
+                ;;
+            esac
+            echo "fake llama-server $*" >&2
+            trap 'exit 0' TERM INT
+            while true; do sleep 1; done
+            """
+        )
+    )
+    fake_bin.chmod(0o755)
+    model_path = tmp_path / "model.gguf"
+    model_path.write_text("fake")
+    pid_file = tmp_path / "llama.pid"
+    log_file = tmp_path / "llama.log"
+
+    env = {
+        **os.environ,
+        "LD_LIBRARY_PATH": "",
+        "DUCK_LLAMA_SERVER_BIN": str(fake_bin),
+        "DUCK_MAIN_MODEL_PATH": str(model_path),
+        "DUCK_LLAMA_PID_FILE": str(pid_file),
+        "DUCK_LLAMA_LOG_FILE": str(log_file),
+        "DUCK_MAIN_PORT": "18081",
+    }
+    script = "scripts/llama/start_main.sh"
+
+    started = subprocess.run([script, "start"], env=env, text=True, capture_output=True)
+    assert started.returncode == 0
+    try:
+        logs = subprocess.run(
+            [script, "logs", "--lines", "20"], env=env, text=True, capture_output=True
+        )
+        assert logs.returncode == 0
+        assert "--alias local-main" in logs.stdout
+        assert "error while loading shared libraries" not in logs.stdout
+    finally:
+        subprocess.run([script, "stop"], env=env, text=True, capture_output=True)
--- a/tests/smoke/test_model_client.py
+++ b/tests/smoke/test_model_client.py
@ -57,6 +57,32 @@ async def test_model_client_preserves_reasoning_content(monkeypatch):
    assert response.reasoning_content == "private reasoning"


+@pytest.mark.asyncio
+async def test_model_client_adds_request_reasoning_options(monkeypatch):
+    payloads = []
+
+    async def fake_post(self, url, json):
+        payloads.append(json)
+        return httpx.Response(
+            200,
+            json={"choices": [{"message": {"role": "assistant", "content": "ok"}}]},
+            request=httpx.Request("POST", url),
+        )
+
+    monkeypatch.setattr(httpx.AsyncClient, "post", fake_post)
+    client = ModelClient("config/models.yaml")
+
+    await client.chat("thinker", [{"role": "user", "content": "hello"}], reasoning="on")
+    await client.chat("thinker", [{"role": "user", "content": "hello"}], reasoning="off")
+
+    assert payloads[0]["reasoning_format"] == "deepseek"
+    assert payloads[0]["chat_template_kwargs"] == {"enable_thinking": True}
+    assert "thinking_budget_tokens" not in payloads[0]
+    assert payloads[1]["reasoning_format"] == "deepseek"
+    assert payloads[1]["chat_template_kwargs"] == {"enable_thinking": False}
+    assert payloads[1]["thinking_budget_tokens"] == 0
+
+
@pytest.mark.asyncio
 async def test_model_client_stream_chat_yields_reasoning_then_content(monkeypatch):
    class FakeStreamResponse: