From ff98224eb67516a957331ef302bacf9ff2a67c77 Mon Sep 17 00:00:00 2001
From: mirivlad <mirvtop@yandex.ru>
Date: Fri, 22 May 2026 07:27:59 +0800
Subject: [PATCH] Add DuckLM service scripts and utility model benchmark

---
 CURRENT_STATE.md                              |  21 +-
 Makefile                                      |  38 +-
 README.md                                     |  27 +-
 .../utility_model_bench_20260522_044407.json  |  80 ++++
 .../utility_model_bench_20260522_044407.md    |  21 +
 .../utility_model_bench_20260522_050427.json  | 392 ++++++++++++++++
 .../utility_model_bench_20260522_050427.md    |  69 +++
 docs/how_to_run.md                            |  55 ++-
 docs/web_api.md                               |  15 +
 duck_core/api.py                              |  11 +-
 duck_core/model_client.py                     |  22 +-
 duck_core/runtime_loop.py                     |   8 +-
 duck_core/web/static/app.js                   | 127 ++++-
 duck_core/web/static/style.css                |  54 +++
 duck_core/web/templates/index.html            |   4 +
 scripts/bench/bench_runtime.py                | 443 +++++++++++++++++-
 scripts/duck-mtp.sh                           | 262 +++++++++++
 scripts/duck.sh                               | 262 +++++++++++
 scripts/llama/start_main.sh                   |   6 +-
 scripts/llama/start_mtp_main.sh               | 271 +++++++++++
 tests/smoke/test_api_stream_chat.py           |  55 +++
 tests/smoke/test_duck_service_script.py       | 246 ++++++++++
 tests/smoke/test_llama_service_script.py      |  51 ++
 tests/smoke/test_model_client.py              |  26 +
 24 files changed, 2492 insertions(+), 74 deletions(-)
 create mode 100644 docs/bench/utility_model_bench_20260522_044407.json
 create mode 100644 docs/bench/utility_model_bench_20260522_044407.md
 create mode 100644 docs/bench/utility_model_bench_20260522_050427.json
 create mode 100644 docs/bench/utility_model_bench_20260522_050427.md
 create mode 100755 scripts/duck-mtp.sh
 create mode 100755 scripts/duck.sh
 create mode 100755 scripts/llama/start_mtp_main.sh
 create mode 100644 tests/smoke/test_duck_service_script.py

diff --git a/CURRENT_STATE.md b/CURRENT_STATE.md
index 9be8b48..9f82fe2 100644
--- a/CURRENT_STATE.md
+++ b/CURRENT_STATE.md
@@ -106,8 +106,7 @@ git diff --check
 
 ```bash
 . .venv/bin/activate
-bash scripts/llama/start_main.sh start
-python -m duck_core.api
+bash scripts/duck.sh start
 ```
 
 Открыть WebChat:
@@ -123,6 +122,24 @@ curl --noproxy '*' http://127.0.0.1:8000/health
 curl --noproxy '*' http://127.0.0.1:8000/v1/models/roles
 ```
 
+Управление процессами:
+
+```bash
+bash scripts/duck.sh status
+bash scripts/duck.sh logs --follow
+bash scripts/duck.sh restart
+bash scripts/duck.sh stop
+```
+
+MTP/speculative-вариант:
+
+```bash
+bash scripts/duck.sh stop
+bash scripts/duck-mtp.sh start
+bash scripts/duck-mtp.sh status
+bash scripts/duck-mtp.sh logs --follow
+```
+
 ## Что делать следующим
 
 1. Пройти live E2E checklist в WebChat на реальной модели.
diff --git a/Makefile b/Makefile
index ca06184..1a22cb9 100644
--- a/Makefile
+++ b/Makefile
@@ -1,8 +1,32 @@
 duck-up:
-	docker compose -f docker-compose.memory.yml up -d
-	@echo "Memory services started."
-	@echo "Start llama-server:"
-	@echo "bash scripts/llama/start_main.sh start"
+	bash scripts/duck.sh start
+
+duck-stop:
+	bash scripts/duck.sh stop
+
+duck-restart:
+	bash scripts/duck.sh restart
+
+duck-status:
+	bash scripts/duck.sh status
+
+duck-logs:
+	bash scripts/duck.sh logs --follow
+
+duck-mtp-up:
+	bash scripts/duck-mtp.sh start
+
+duck-mtp-stop:
+	bash scripts/duck-mtp.sh stop
+
+duck-mtp-restart:
+	bash scripts/duck-mtp.sh restart
+
+duck-mtp-status:
+	bash scripts/duck-mtp.sh status
+
+duck-mtp-logs:
+	bash scripts/duck-mtp.sh logs --follow
 
 duck-llama-main:
 	bash scripts/llama/start_main.sh start
@@ -26,11 +50,7 @@ duck-api:
 	python3 -m duck_core.api
 
 duck-dev:
-	docker compose -f docker-compose.memory.yml up -d
-	@echo "Start llama-server in another terminal:"
-	@echo "bash scripts/llama/start_main.sh start"
-	@echo "Then run:"
-	@echo "make duck-api"
+	bash scripts/duck.sh start
 	@echo "Open:"
 	@echo "http://127.0.0.1:8000/"
 
diff --git a/README.md b/README.md
index 2d0982f..a511f72 100644
--- a/README.md
+++ b/README.md
@@ -9,14 +9,25 @@ python3 -m venv .venv
 . .venv/bin/activate
 python -m pip install -e ".[dev]"
 cp .env.example .env
-bash scripts/llama/start_main.sh
-```
-
-In another terminal:
-
-```bash
-. .venv/bin/activate
-python -m duck_core.api
+bash scripts/duck.sh start
 ```
 
 Open `http://127.0.0.1:8000/`.
+
+Useful commands:
+
+```bash
+bash scripts/duck.sh status
+bash scripts/duck.sh logs --follow
+bash scripts/duck.sh restart
+bash scripts/duck.sh stop
+```
+
+MTP/speculative variant (stop the default stack first with `bash scripts/duck.sh stop`):
+
+```bash
+bash scripts/duck-mtp.sh start
+bash scripts/duck-mtp.sh status
+bash scripts/duck-mtp.sh logs --follow
+bash scripts/duck-mtp.sh stop
+```
diff --git a/docs/bench/utility_model_bench_20260522_044407.json b/docs/bench/utility_model_bench_20260522_044407.json
new file mode 100644
index 0000000..c9c1696
--- /dev/null
+++ b/docs/bench/utility_model_bench_20260522_044407.json
@@ -0,0 +1,80 @@
+[
+  {
+    "model": "Qwen3.6-35B nonMTP GPU baseline",
+    "quality": 0.971,
+    "avg_latency_seconds": 17.935,
+    "avg_tokens_per_second": 4.51,
+    "cases": [
+      {
+        "role": "action",
+        "case": "direct_answer_no_tools",
+        "score": 1.0,
+        "note": "ok",
+        "elapsed_seconds": 15.321,
+        "completion_tokens": 45,
+        "tokens_per_second": 2.94,
+        "content_preview": "{\n  \"kind\": \"action_directive\",\n  \"intent\": \"direct_answer\",\n  \"risk_level\": \"low\",\n  \"actions\": []\n}\n"
+      },
+      {
+        "role": "action",
+        "case": "read_specific_file",
+        "score": 1.0,
+        "note": "ok",
+        "elapsed_seconds": 19.638,
+        "completion_tokens": 81,
+        "tokens_per_second": 4.12,
+        "content_preview": "{\n  \"kind\": \"action_directive\",\n  \"intent\": \"read_file\",\n  \"risk_level\": \"low\",\n  \"actions\": [\n    {\n      \"tool\": \"file_read\",\n      \"args\": {\n        \"path\": \"CURRENT_STATE.md\"\n      }\n    }\n  ]\n}\n"
+      },
+      {
+        "role": "memory_policy",
+        "case": "store_user_preference",
+        "score": 1.0,
+        "note": "ok",
+        "elapsed_seconds": 18.421,
+        "completion_tokens": 88,
+        "tokens_per_second": 4.78,
+        "content_preview": "{\n  \"should_store\": true,\n  \"memory_type\": \"preference\",\n  \"summary\": \"User prefers responses in Russian and requires explicit confirmation before executing sudo commands.\",\n  \"importance\": 0.9,\n  \"scope\": \"global\",\n  \"metadata\": {\n    \"language\": \"ru\",\n    \"security_policy\": \"sudo_confirmation_required\"\n  }\n}"
+      },
+      {
+        "role": "memory_policy",
+        "case": "ignore_trivial_tool_call",
+        "score": 1.0,
+        "note": "ok",
+        "elapsed_seconds": 14.978,
+        "completion_tokens": 61,
+        "tokens_per_second": 4.07,
+        "content_preview": "{\n  \"should_store\": false,\n  \"memory_type\": \"note\",\n  \"summary\": \"Routine execution of pwd command returning /tmp/project.\",\n  \"importance\": 0.1,\n  \"scope\": \"conversation\",\n  \"metadata\": {}\n}"
+      },
+      {
+        "role": "recall",
+        "case": "select_relevant_memory",
+        "score": 1.0,
+        "note": "ok",
+        "elapsed_seconds": 15.038,
+        "completion_tokens": 66,
+        "tokens_per_second": 4.39,
+        "content_preview": "{\n  \"relevant_ids\": [\n    \"m1\"\n  ],\n  \"reasoning\": \"Memory m1 directly addresses the user's preference regarding the execution of sudo commands, which is the core of the query. Memories m2 and m3 are unrelated to sudo or command execution preferences.\"\n}"
+      },
+      {
+        "role": "summary",
+        "case": "preserve_decisions",
+        "score": 1.0,
+        "note": "ok",
+        "elapsed_seconds": 9.991,
+        "completion_tokens": 44,
+        "tokens_per_second": 4.4,
+        "content_preview": "1. WebChat работает на 8000, llama-server на 8081.\n2. Для внешних путей нужен approval.\n3. allow_forever хранится по normalized action hash."
+      },
+      {
+        "role": "critic",
+        "case": "reflection_quality",
+        "score": 0.8,
+        "note": "missing=['lesson']",
+        "elapsed_seconds": 32.16,
+        "completion_tokens": 220,
+        "tokens_per_second": 6.84,
+        "content_preview": "**Critic Reflection: WebChat API Incident**\n\n**1. Risk Assessment**\n\n*   **Operational Fragility (High):** The incident revealed a critical dependency on manual intervention. If the API crashes or the server reboots, WebChat will silently fail or return errors until an engineer notices and manually restarts the service. This creates a \"single point of failure\" in the operational process.\n*   **Meaning Time to Recovery (MTTR) Variance:** Recovery time is currently dependent on human availability "
+      }
+    ]
+  }
+]
\ No newline at end of file
diff --git a/docs/bench/utility_model_bench_20260522_044407.md b/docs/bench/utility_model_bench_20260522_044407.md
new file mode 100644
index 0000000..d7da514
--- /dev/null
+++ b/docs/bench/utility_model_bench_20260522_044407.md
@@ -0,0 +1,21 @@
+# Utility Role Model Benchmark
+
+Scope: service roles only (`action`, `memory_policy`, `recall`, `summary`, `critic`).
+The main user-facing thinker is not evaluated for replacement here.
+
+| Model | Quality | Avg latency, s | Avg tok/s | Notes |
+| --- | ---: | ---: | ---: | --- |
+| Qwen3.6-35B nonMTP GPU baseline | 0.97 | 17.93 | 4.51 | critic/reflection_quality: missing=['lesson'] |
+
+## Case Details
+
+### Qwen3.6-35B nonMTP GPU baseline
+| Role | Case | Score | Latency, s | tok/s | Note |
+| --- | --- | ---: | ---: | ---: | --- |
+| action | direct_answer_no_tools | 1.00 | 15.32 | 2.94 | ok |
+| action | read_specific_file | 1.00 | 19.64 | 4.12 | ok |
+| memory_policy | store_user_preference | 1.00 | 18.42 | 4.78 | ok |
+| memory_policy | ignore_trivial_tool_call | 1.00 | 14.98 | 4.07 | ok |
+| recall | select_relevant_memory | 1.00 | 15.04 | 4.39 | ok |
+| summary | preserve_decisions | 1.00 | 9.99 | 4.40 | ok |
+| critic | reflection_quality | 0.80 | 32.16 | 6.84 | missing=['lesson'] |
diff --git a/docs/bench/utility_model_bench_20260522_050427.json b/docs/bench/utility_model_bench_20260522_050427.json
new file mode 100644
index 0000000..5371e9e
--- /dev/null
+++ b/docs/bench/utility_model_bench_20260522_050427.json
@@ -0,0 +1,392 @@
+[
+  {
+    "model": "Qwen3.6-35B nonMTP GPU baseline",
+    "quality": 0.971,
+    "avg_latency_seconds": 17.94,
+    "avg_tokens_per_second": 4.51,
+    "cases": [
+      {
+        "role": "action",
+        "case": "direct_answer_no_tools",
+        "score": 1.0,
+        "note": "ok",
+        "elapsed_seconds": 15.307,
+        "completion_tokens": 45,
+        "tokens_per_second": 2.94,
+        "content_preview": "{\n  \"kind\": \"action_directive\",\n  \"intent\": \"direct_answer\",\n  \"risk_level\": \"low\",\n  \"actions\": []\n}\n"
+      },
+      {
+        "role": "action",
+        "case": "read_specific_file",
+        "score": 1.0,
+        "note": "ok",
+        "elapsed_seconds": 19.612,
+        "completion_tokens": 81,
+        "tokens_per_second": 4.13,
+        "content_preview": "{\n  \"kind\": \"action_directive\",\n  \"intent\": \"read_file\",\n  \"risk_level\": \"low\",\n  \"actions\": [\n    {\n      \"tool\": \"file_read\",\n      \"args\": {\n        \"path\": \"CURRENT_STATE.md\"\n      }\n    }\n  ]\n}\n"
+      },
+      {
+        "role": "memory_policy",
+        "case": "store_user_preference",
+        "score": 1.0,
+        "note": "ok",
+        "elapsed_seconds": 18.533,
+        "completion_tokens": 88,
+        "tokens_per_second": 4.75,
+        "content_preview": "{\n  \"should_store\": true,\n  \"memory_type\": \"preference\",\n  \"summary\": \"User prefers responses in Russian and requires explicit confirmation before executing sudo commands.\",\n  \"importance\": 0.9,\n  \"scope\": \"global\",\n  \"metadata\": {\n    \"language\": \"ru\",\n    \"security_policy\": \"sudo_confirmation_required\"\n  }\n}"
+      },
+      {
+        "role": "memory_policy",
+        "case": "ignore_trivial_tool_call",
+        "score": 1.0,
+        "note": "ok",
+        "elapsed_seconds": 15.0,
+        "completion_tokens": 61,
+        "tokens_per_second": 4.07,
+        "content_preview": "{\n  \"should_store\": false,\n  \"memory_type\": \"note\",\n  \"summary\": \"Routine execution of pwd command returning /tmp/project.\",\n  \"importance\": 0.1,\n  \"scope\": \"conversation\",\n  \"metadata\": {}\n}"
+      },
+      {
+        "role": "recall",
+        "case": "select_relevant_memory",
+        "score": 1.0,
+        "note": "ok",
+        "elapsed_seconds": 15.085,
+        "completion_tokens": 66,
+        "tokens_per_second": 4.38,
+        "content_preview": "{\n  \"relevant_ids\": [\n    \"m1\"\n  ],\n  \"reasoning\": \"Memory m1 directly addresses the user's preference regarding the execution of sudo commands, which is the core of the query. Memories m2 and m3 are unrelated to sudo or command execution preferences.\"\n}"
+      },
+      {
+        "role": "summary",
+        "case": "preserve_decisions",
+        "score": 1.0,
+        "note": "ok",
+        "elapsed_seconds": 9.954,
+        "completion_tokens": 44,
+        "tokens_per_second": 4.42,
+        "content_preview": "1. WebChat работает на 8000, llama-server на 8081.\n2. Для внешних путей нужен approval.\n3. allow_forever хранится по normalized action hash."
+      },
+      {
+        "role": "critic",
+        "case": "reflection_quality",
+        "score": 0.8,
+        "note": "missing=['lesson']",
+        "elapsed_seconds": 32.088,
+        "completion_tokens": 220,
+        "tokens_per_second": 6.86,
+        "content_preview": "**Critic Reflection: WebChat API Incident**\n\n**1. Risk Assessment**\n\n*   **Operational Fragility (High):** The incident revealed a critical dependency on manual intervention. If the API crashes or the server reboots, WebChat will silently fail or return errors until an engineer notices and manually restarts the service. This creates a \"single point of failure\" in the operational process.\n*   **Meaning Time to Recovery (MTTR) Variance:** Recovery time is currently dependent on human availability "
+      }
+    ]
+  },
+  {
+    "model": "Menlo_Lucy-Q4_K_M CPU",
+    "quality": 0.771,
+    "avg_latency_seconds": 4.406,
+    "avg_tokens_per_second": 16.21,
+    "cases": [
+      {
+        "role": "action",
+        "case": "direct_answer_no_tools",
+        "score": 1.0,
+        "note": "ok",
+        "elapsed_seconds": 3.228,
+        "completion_tokens": 31,
+        "tokens_per_second": 9.6,
+        "content_preview": "{\"kind\": \"action_directive\", \"intent\": \"answer_question\", \"risk_level\": \"low\", \"actions\": []}"
+      },
+      {
+        "role": "action",
+        "case": "read_specific_file",
+        "score": 1.0,
+        "note": "ok",
+        "elapsed_seconds": 3.031,
+        "completion_tokens": 48,
+        "tokens_per_second": 15.84,
+        "content_preview": "{\"kind\": \"action_directive\", \"intent\": \"file_read\", \"risk_level\": \"low\", \"actions\": [{\"tool\": \"file_read\", \"args\": {\"path\": \"CURRENT_STATE.md\"}}]}\n\n"
+      },
+      {
+        "role": "memory_policy",
+        "case": "store_user_preference",
+        "score": 1.0,
+        "note": "ok",
+        "elapsed_seconds": 3.62,
+        "completion_tokens": 54,
+        "tokens_per_second": 14.92,
+        "content_preview": "{\n  \"should_store\": true,\n  \"memory_type\": \"preference\",\n  \"summary\": \"User prefers Russian and requires sudo confirmation.\",\n  \"importance\": 0.7,\n  \"scope\": \"global\",\n  \"metadata\": {}\n}"
+      },
+      {
+        "role": "memory_policy",
+        "case": "ignore_trivial_tool_call",
+        "score": 0.3,
+        "note": "stored_trivial={'should_store': True, 'memory_type': 'fact', 'summary': 'Password was successfully launched and user was informed.', 'importance': 0.7, 'scope': 'global', 'metadata': {}}",
+        "elapsed_seconds": 3.192,
+        "completion_tokens": 58,
+        "tokens_per_second": 18.17,
+        "content_preview": "{\n  \"should_store\": true,\n  \"memory_type\": \"fact\",\n  \"summary\": \"Password was successfully launched and user was informed.\",\n  \"importance\": 0.7,\n  \"scope\": \"global\",\n  \"metadata\": {}\n}\n"
+      },
+      {
+        "role": "recall",
+        "case": "select_relevant_memory",
+        "score": 0.3,
+        "note": "wrong_ids=[]",
+        "elapsed_seconds": 3.737,
+        "completion_tokens": 60,
+        "tokens_per_second": 16.05,
+        "content_preview": "{\n  \"relevant_ids\": [],\n  \"reasoning\": \"The query is about how the user wants to run sudo, but none of the provided memories are related to sudo or user preferences for running commands. The memories are about SQLite and weather answers, which are unrelated to the query.\"\n}"
+      },
+      {
+        "role": "summary",
+        "case": "preserve_decisions",
+        "score": 0.8,
+        "note": "missing=['approval']",
+        "elapsed_seconds": 3.334,
+        "completion_tokens": 61,
+        "tokens_per_second": 18.29,
+        "content_preview": "1. WebChat работает на портах 8000, а LLaMA-Server на 8081.  \n2. Для внешних путей требуется подтверждение.  \n3. allow_forever хранится по hash-коду действий."
+      },
+      {
+        "role": "critic",
+        "case": "reflection_quality",
+        "score": 1.0,
+        "note": "ok",
+        "elapsed_seconds": 10.697,
+        "completion_tokens": 220,
+        "tokens_per_second": 20.57,
+        "content_preview": "Okay, the user wants me to reflect on the risk and reusable lessons from fixing the WebChat issue. Let me break this down.\n\nFirst, the root cause was the API not running. They manually started the API but didn't have a unified service script. So the main risk here is that without a unified script, there's a chance the API might not be running consistently or could be misconfigured.\n\nReusability is key here. Maybe we can create a script that starts the API and monitors its status. That way, if th"
+      }
+    ]
+  },
+  {
+    "model": "Qwen3.5-9B-GLM5.1-Distill-v1-Q4_K_M CPU",
+    "quality": 0.4,
+    "avg_latency_seconds": 61.939,
+    "avg_tokens_per_second": 2.56,
+    "cases": [
+      {
+        "role": "action",
+        "case": "direct_answer_no_tools",
+        "score": 1.0,
+        "note": "ok",
+        "elapsed_seconds": 68.075,
+        "completion_tokens": 72,
+        "tokens_per_second": 1.06,
+        "content_preview": "{\"kind\": \"action_directive\", \"intent\": \"answer\", \"risk_level\": \"high\", \"actions\": []}"
+      },
+      {
+        "role": "action",
+        "case": "read_specific_file",
+        "score": 1.0,
+        "note": "ok",
+        "elapsed_seconds": 72.149,
+        "completion_tokens": 86,
+        "tokens_per_second": 1.19,
+        "content_preview": "{\"kind\": \"action_directive\", \"intent\": \"read file\", \"risk_level\": \"low\", \"actions\": [{\"tool\": \"file_read\", \"args\": {\"path\": \"CURRENT_STATE.md\"}}]}\n"
+      },
+      {
+        "role": "memory_policy",
+        "case": "store_user_preference",
+        "score": 0.0,
+        "note": "invalid_json: Expecting value: line 1 column 1 (char 0)",
+        "elapsed_seconds": 67.764,
+        "completion_tokens": 180,
+        "tokens_per_second": 2.66,
+        "content_preview": ""
+      },
+      {
+        "role": "memory_policy",
+        "case": "ignore_trivial_tool_call",
+        "score": 0.0,
+        "note": "invalid_json: Expecting value: line 1 column 1 (char 0)",
+        "elapsed_seconds": 64.653,
+        "completion_tokens": 160,
+        "tokens_per_second": 2.47,
+        "content_preview": ""
+      },
+      {
+        "role": "recall",
+        "case": "select_relevant_memory",
+        "score": 0.0,
+        "note": "invalid_json: Expecting property name enclosed in double quotes: line 1 column 2 (char 1)",
+        "elapsed_seconds": 59.451,
+        "completion_tokens": 160,
+        "tokens_per_second": 2.69,
+        "content_preview": "{\n"
+      },
+      {
+        "role": "summary",
+        "case": "preserve_decisions",
+        "score": 0.2,
+        "note": "missing=['8000', '8081', 'approval', 'allow_forever']",
+        "elapsed_seconds": 47.05,
+        "completion_tokens": 180,
+        "tokens_per_second": 3.83,
+        "content_preview": ""
+      },
+      {
+        "role": "critic",
+        "case": "reflection_quality",
+        "score": 0.6,
+        "note": "missing=['risk', 'lesson']",
+        "elapsed_seconds": 54.433,
+        "completion_tokens": 220,
+        "tokens_per_second": 4.04,
+        "content_preview": ""
+      }
+    ]
+  },
+  {
+    "model": "X-Coder-SFT-Qwen3-8B.Q6_K CPU",
+    "quality": 0.757,
+    "avg_latency_seconds": 60.117,
+    "avg_tokens_per_second": 2.51,
+    "cases": [
+      {
+        "role": "action",
+        "case": "direct_answer_no_tools",
+        "score": 0.0,
+        "note": "invalid_json: Expecting ',' delimiter: line 13 column 6 (char 632)",
+        "elapsed_seconds": 121.049,
+        "completion_tokens": 180,
+        "tokens_per_second": 1.49,
+        "content_preview": "{\n  \"kind\": \"action_directive\",\n  \"intent\": \"answer\",\n  \"risk_level\": \"low\",\n  \"actions\": [\n    {\n      \"tool\": \"coder\",\n      \"args\": {\n        \"task_description\": \"Объясните коротко, что такое DuckLM\",\n        \"language\": \"python\",\n        \"context\": \"Пользователь запросил краткое описание DuckLM. DuckLM - это проект, который развивается сообществом, направленный на создание открытой и этичной языковой модели. Мы можем сказать, что это модель на основе Llama, созданная Meta, но адаптированная "
+      },
+      {
+        "role": "action",
+        "case": "read_specific_file",
+        "score": 1.0,
+        "note": "ok",
+        "elapsed_seconds": 37.562,
+        "completion_tokens": 134,
+        "tokens_per_second": 3.57,
+        "content_preview": "{\n  \"kind\": \"action_directive\",\n  \"intent\": \"read_current_state\",\n  \"risk_level\": \"low\",\n  \"actions\": [\n    {\n      \"tool\": \"file_read\",\n      \"args\": {\n        \"path\": \"CURRENT_STATE.md\"\n      },\n      \"reason\": \"We need to read the content of CURRENT_STATE.md to provide a summary of the project status.\"\n    }\n  ],\n  \"expected_observations\": [\n    \"file_content\"\n  ],\n  \"stop_reason\": \"After reading the file, we can directly generate the response without further tools.\"\n}\n"
+      },
+      {
+        "role": "memory_policy",
+        "case": "store_user_preference",
+        "score": 1.0,
+        "note": "ok",
+        "elapsed_seconds": 66.979,
+        "completion_tokens": 80,
+        "tokens_per_second": 1.19,
+        "content_preview": "{\n  \"should_store\": true,\n  \"memory_type\": \"preference\",\n  \"summary\": \"User prefers responses in Russian and explicit sudo confirmation.\",\n  \"importance\": 0.9,\n  \"scope\": \"global\",\n  \"metadata\": {\n      \"language\": \"Russian\",\n      \"sudo_policy\": \"explicit_confirmation_required\"\n  }\n}\n"
+      },
+      {
+        "role": "memory_policy",
+        "case": "ignore_trivial_tool_call",
+        "score": 0.3,
+        "note": "stored_trivial={'should_store': True, 'memory_type': 'event', 'summary': 'User executed pwd command and received /tmp/project as output.', 'importance': 0.8, 'scope': 'conversation', 'metadata': {}}",
+        "elapsed_seconds": 21.772,
+        "completion_tokens": 62,
+        "tokens_per_second": 2.85,
+        "content_preview": "{\n  \"should_store\": true,\n  \"memory_type\": \"event\",\n  \"summary\": \"User executed pwd command and received /tmp/project as output.\",\n  \"importance\": 0.8,\n  \"scope\": \"conversation\",\n  \"metadata\": {}\n}\n"
+      },
+      {
+        "role": "recall",
+        "case": "select_relevant_memory",
+        "score": 1.0,
+        "note": "ok",
+        "elapsed_seconds": 58.656,
+        "completion_tokens": 90,
+        "tokens_per_second": 1.53,
+        "content_preview": "{\n  \"relevant_ids\": [\"m1\"],\n  \"reasoning\": \"The query is about how the user wants to run 'sudo'. The only memory that directly relates to this is memory m1, which states that the user does not want to run sudo without explicit confirmation. The other memories (m2 and m3) are about unrelated topics (database and weather). Therefore, only m1 is relevant.\"\n}\n"
+      },
+      {
+        "role": "summary",
+        "case": "preserve_decisions",
+        "score": 1.0,
+        "note": "ok",
+        "elapsed_seconds": 53.245,
+        "completion_tokens": 180,
+        "tokens_per_second": 3.38,
+        "content_preview": "We are going to compress the task context into 3 key points while preserving the decisions and outcomes.\n\nKey Points:\n1. **Service Endpoints**: \n   - WebChat is running on port 8000.\n   - llama-server is running on port 8081.\n\n2. **External Paths Approval**:\n   - Any external path that requires access must be approved. This implies that we need to have a mechanism (like a firewall or access control list) to check and approve external requests.\n\n3. **Forever Allowance Storage**:\n   - The `allow_f"
+      },
+      {
+        "role": "critic",
+        "case": "reflection_quality",
+        "score": 1.0,
+        "note": "ok",
+        "elapsed_seconds": 61.555,
+        "completion_tokens": 220,
+        "tokens_per_second": 3.57,
+        "content_preview": "We are reflecting on the task: fixing the WebChat which had an API that wasn't running.\n The root cause was that the API was not running. We started it manually but there was no unified service script.\n\n Reflection:\n - Risk: The manual start might be error-prone and not scalable. If multiple components, hard to manage.\n - Waste: We ended up having to start manually, which is not efficient and might lead to repeated issues.\n - Reusable Lesson: We should create a unified service script that starts"
+      }
+    ]
+  },
+  {
+    "model": "gemma-4-E4B-it-Q4_K_M CPU",
+    "quality": 0.971,
+    "avg_latency_seconds": 21.235,
+    "avg_tokens_per_second": 5.36,
+    "cases": [
+      {
+        "role": "action",
+        "case": "direct_answer_no_tools",
+        "score": 1.0,
+        "note": "ok",
+        "elapsed_seconds": 35.717,
+        "completion_tokens": 53,
+        "tokens_per_second": 1.48,
+        "content_preview": "{\n  \"kind\": \"action_directive\",\n  \"intent\": \"answer_directly\"\n  ,\n  \"risk_level\": \"low\"\n  ,\n  \"actions\": []\n}\n"
+      },
+      {
+        "role": "action",
+        "case": "read_specific_file",
+        "score": 1.0,
+        "note": "ok",
+        "elapsed_seconds": 13.325,
+        "completion_tokens": 88,
+        "tokens_per_second": 6.6,
+        "content_preview": "{\n  \"kind\": \"action_directive\",\n  \"intent\": \"tool_use\",\n  \"risk_level\": \"low\",\n  \"actions\": [\n    {\n      \"tool\": \"file_read\",\n      \"args\": {\n        \"path\": \"CURRENT_STATE.md\"\n      }\n    }\n  ]\n}\n"
+      },
+      {
+        "role": "memory_policy",
+        "case": "store_user_preference",
+        "score": 1.0,
+        "note": "ok",
+        "elapsed_seconds": 27.13,
+        "completion_tokens": 98,
+        "tokens_per_second": 3.61,
+        "content_preview": "{\n  \"should_store\": true,\n  \"memory_type\": \"preference\",\n  \"summary\": \"Пользователь требует, чтобы ответы всегда были на русском языке и чтобы не запускать sudo без явного подтверждения.\",\n  \"importance\": 0.95,\n  \"scope\": \"global\",\n  \"metadata\": {\n    \"task_id\": \"task_pref\"\n  }\n}\n"
+      },
+      {
+        "role": "memory_policy",
+        "case": "ignore_trivial_tool_call",
+        "score": 1.0,
+        "note": "ok",
+        "elapsed_seconds": 10.229,
+        "completion_tokens": 90,
+        "tokens_per_second": 8.8,
+        "content_preview": "{\n  \"should_store\": false,\n  \"memory_type\": \"note\",\n  \"summary\": \"The 'pwd' command was executed, and the output was sent to the user.\",\n  \"importance\": 0.1,\n  \"scope\": \"conversation\",\n  \"metadata\": {\n    \"task_id\": \"task_tmp\"\n  }\n}\n"
+      },
+      {
+        "role": "recall",
+        "case": "select_relevant_memory",
+        "score": 1.0,
+        "note": "ok",
+        "elapsed_seconds": 19.391,
+        "completion_tokens": 62,
+        "tokens_per_second": 3.2,
+        "content_preview": "{\n  \"relevant_ids\": [\n    \"m1\"\n  ],\n  \"reasoning\": \"Воспоминание m1 напрямую отвечает на вопрос о том, как пользователь хочет, чтобы запускался sudo (с требованием отдельного подтверждения).\"\n}\n"
+      },
+      {
+        "role": "summary",
+        "case": "preserve_decisions",
+        "score": 1.0,
+        "note": "ok",
+        "elapsed_seconds": 14.371,
+        "completion_tokens": 88,
+        "tokens_per_second": 6.12,
+        "content_preview": "Вот сжатый контекст в 3 пунктах:\n\n1. **Порты:** WebChat использует порт 8000, llama-server — 8081.\n2. **Безопасность:** Для внешних путей требуется предварительное одобрение (approval).\n3. **Хранение разрешений:** `allow_forever` сохраняется на основе хеша нормализованного действия."
+      },
+      {
+        "role": "critic",
+        "case": "reflection_quality",
+        "score": 0.8,
+        "note": "missing=['lesson']",
+        "elapsed_seconds": 28.484,
+        "completion_tokens": 220,
+        "tokens_per_second": 7.72,
+        "content_preview": "## DuckLM: Critic Reflection\n\n**Task:** Fix WebChat.\n**Observed Root Cause:** API was not running.\n**Action Taken:** Started API manually.\n**Observed Deficiency:** Lack of a unified service script.\n\n---\n\n### 🔍 Reflection Analysis\n\n#### 1. Risk Assessment (What went wrong/could go wrong?)\n\n*   **Operational Risk (High):** The immediate risk was service unavailability (WebChat down). Manually starting the API is a brittle, high-touch workaround. If the system restarts, or if the API needs to be re"
+      }
+    ]
+  }
+]
\ No newline at end of file
diff --git a/docs/bench/utility_model_bench_20260522_050427.md b/docs/bench/utility_model_bench_20260522_050427.md
new file mode 100644
index 0000000..323c493
--- /dev/null
+++ b/docs/bench/utility_model_bench_20260522_050427.md
@@ -0,0 +1,69 @@
+# Utility Role Model Benchmark
+
+Scope: service roles only (`action`, `memory_policy`, `recall`, `summary`, `critic`).
+The main user-facing thinker is not evaluated for replacement here.
+
+| Model | Quality | Avg latency, s | Avg tok/s | Notes |
+| --- | ---: | ---: | ---: | --- |
+| Qwen3.6-35B nonMTP GPU baseline | 0.97 | 17.94 | 4.51 | critic/reflection_quality: missing=['lesson'] |
+| Menlo_Lucy-Q4_K_M CPU | 0.77 | 4.41 | 16.21 | memory_policy/ignore_trivial_tool_call: stored_trivial={'should_store': True, 'memory_type': 'fact', 'summary': 'Password was successfully launched and user was informed.', 'importance': 0.7, 'scope': 'global', 'metadata': {}}; recall/select_relevant_memory: wrong_ids=[]; summary/preserve_decisions: missing=['approval'] |
+| Qwen3.5-9B-GLM5.1-Distill-v1-Q4_K_M CPU | 0.40 | 61.94 | 2.56 | memory_policy/store_user_preference: invalid_json: Expecting value: line 1 column 1 (char 0); memory_policy/ignore_trivial_tool_call: invalid_json: Expecting value: line 1 column 1 (char 0); recall/select_relevant_memory: invalid_json: Expecting property name enclosed in double quotes: line 1 column 2 (char 1) |
+| X-Coder-SFT-Qwen3-8B.Q6_K CPU | 0.76 | 60.12 | 2.51 | action/direct_answer_no_tools: invalid_json: Expecting ',' delimiter: line 13 column 6 (char 632); memory_policy/ignore_trivial_tool_call: stored_trivial={'should_store': True, 'memory_type': 'event', 'summary': 'User executed pwd command and received /tmp/project as output.', 'importance': 0.8, 'scope': 'conversation', 'metadata': {}} |
+| gemma-4-E4B-it-Q4_K_M CPU | 0.97 | 21.23 | 5.36 | critic/reflection_quality: missing=['lesson'] |
+
+## Case Details
+
+### Qwen3.6-35B nonMTP GPU baseline
+| Role | Case | Score | Latency, s | tok/s | Note |
+| --- | --- | ---: | ---: | ---: | --- |
+| action | direct_answer_no_tools | 1.00 | 15.31 | 2.94 | ok |
+| action | read_specific_file | 1.00 | 19.61 | 4.13 | ok |
+| memory_policy | store_user_preference | 1.00 | 18.53 | 4.75 | ok |
+| memory_policy | ignore_trivial_tool_call | 1.00 | 15.00 | 4.07 | ok |
+| recall | select_relevant_memory | 1.00 | 15.09 | 4.38 | ok |
+| summary | preserve_decisions | 1.00 | 9.95 | 4.42 | ok |
+| critic | reflection_quality | 0.80 | 32.09 | 6.86 | missing=['lesson'] |
+
+### Menlo_Lucy-Q4_K_M CPU
+| Role | Case | Score | Latency, s | tok/s | Note |
+| --- | --- | ---: | ---: | ---: | --- |
+| action | direct_answer_no_tools | 1.00 | 3.23 | 9.60 | ok |
+| action | read_specific_file | 1.00 | 3.03 | 15.84 | ok |
+| memory_policy | store_user_preference | 1.00 | 3.62 | 14.92 | ok |
+| memory_policy | ignore_trivial_tool_call | 0.30 | 3.19 | 18.17 | stored_trivial={'should_store': True, 'memory_type': 'fact', 'summary': 'Password was successfully launched and user was informed.', 'importance': 0.7, 'scope': 'global', 'metadata': {}} |
+| recall | select_relevant_memory | 0.30 | 3.74 | 16.05 | wrong_ids=[] |
+| summary | preserve_decisions | 0.80 | 3.33 | 18.29 | missing=['approval'] |
+| critic | reflection_quality | 1.00 | 10.70 | 20.57 | ok |
+
+### Qwen3.5-9B-GLM5.1-Distill-v1-Q4_K_M CPU
+| Role | Case | Score | Latency, s | tok/s | Note |
+| --- | --- | ---: | ---: | ---: | --- |
+| action | direct_answer_no_tools | 1.00 | 68.08 | 1.06 | ok |
+| action | read_specific_file | 1.00 | 72.15 | 1.19 | ok |
+| memory_policy | store_user_preference | 0.00 | 67.76 | 2.66 | invalid_json: Expecting value: line 1 column 1 (char 0) |
+| memory_policy | ignore_trivial_tool_call | 0.00 | 64.65 | 2.47 | invalid_json: Expecting value: line 1 column 1 (char 0) |
+| recall | select_relevant_memory | 0.00 | 59.45 | 2.69 | invalid_json: Expecting property name enclosed in double quotes: line 1 column 2 (char 1) |
+| summary | preserve_decisions | 0.20 | 47.05 | 3.83 | missing=['8000', '8081', 'approval', 'allow_forever'] |
+| critic | reflection_quality | 0.60 | 54.43 | 4.04 | missing=['risk', 'lesson'] |
+
+### X-Coder-SFT-Qwen3-8B.Q6_K CPU
+| Role | Case | Score | Latency, s | tok/s | Note |
+| --- | --- | ---: | ---: | ---: | --- |
+| action | direct_answer_no_tools | 0.00 | 121.05 | 1.49 | invalid_json: Expecting ',' delimiter: line 13 column 6 (char 632) |
+| action | read_specific_file | 1.00 | 37.56 | 3.57 | ok |
+| memory_policy | store_user_preference | 1.00 | 66.98 | 1.19 | ok |
+| memory_policy | ignore_trivial_tool_call | 0.30 | 21.77 | 2.85 | stored_trivial={'should_store': True, 'memory_type': 'event', 'summary': 'User executed pwd command and received /tmp/project as output.', 'importance': 0.8, 'scope': 'conversation', 'metadata': {}} |
+| recall | select_relevant_memory | 1.00 | 58.66 | 1.53 | ok |
+| summary | preserve_decisions | 1.00 | 53.24 | 3.38 | ok |
+| critic | reflection_quality | 1.00 | 61.55 | 3.57 | ok |
+
+### gemma-4-E4B-it-Q4_K_M CPU
+| Role | Case | Score | Latency, s | tok/s | Note |
+| --- | --- | ---: | ---: | ---: | --- |
+| action | direct_answer_no_tools | 1.00 | 35.72 | 1.48 | ok |
+| action | read_specific_file | 1.00 | 13.32 | 6.60 | ok |
+| memory_policy | store_user_preference | 1.00 | 27.13 | 3.61 | ok |
+| memory_policy | ignore_trivial_tool_call | 1.00 | 10.23 | 8.80 | ok |
+| recall | select_relevant_memory | 1.00 | 19.39 | 3.20 | ok |
+| summary | preserve_decisions | 1.00 | 14.37 | 6.12 | ok |
+| critic | reflection_quality | 0.80 | 28.48 | 7.72 | missing=['lesson'] |
diff --git a/docs/how_to_run.md b/docs/how_to_run.md
index b185a8f..533c31e 100644
--- a/docs/how_to_run.md
+++ b/docs/how_to_run.md
@@ -16,34 +16,53 @@ cp .env.example .env
 
 The default `DUCK_MAIN_MODEL_PATH` points to `./models/Qwen3.6/nonMTP/Qwen3.6-35B-A3B-UD-Q4_K_M.gguf`.
 
-3. Start `llama-server`:
+3. Start DuckLM:
 
 ```bash
-bash scripts/llama/start_main.sh start
+bash scripts/duck.sh start
 ```
 
+This starts both processes:
+
+- `llama-server` on `http://127.0.0.1:8081/v1`
+- DuckLM API/WebChat on `http://127.0.0.1:8000/`
+
 Useful process commands:
 
 ```bash
-bash scripts/llama/start_main.sh status
-bash scripts/llama/start_main.sh logs --follow
-bash scripts/llama/start_main.sh restart
-bash scripts/llama/start_main.sh stop
+bash scripts/duck.sh status
+bash scripts/duck.sh logs --follow
+bash scripts/duck.sh restart
+bash scripts/duck.sh stop
 ```
 
-4. Start DuckLM API:
-
-```bash
-python -m duck_core.api
-```
-
-5. Open WebChat:
+4. Open WebChat:
 
 ```text
 http://127.0.0.1:8000/
 ```
 
-6. Send a task:
+Low-level llama-only commands are still available when needed:
+
+```bash
+bash scripts/llama/start_main.sh status
+bash scripts/llama/start_main.sh logs --follow
+```
+
+MTP/speculative variant:
+
+```bash
+bash scripts/duck.sh stop
+bash scripts/duck-mtp.sh start
+bash scripts/duck-mtp.sh status
+bash scripts/duck-mtp.sh logs --follow
+```
+
+`duck-mtp.sh` keeps DuckLM on `http://127.0.0.1:8000/` and starts the MTP-backed
+`llama-server` on the normal role endpoint `http://127.0.0.1:8081/v1`, so
+`config/models.yaml` does not need to change.
+
+5. Send a task:
 
 ```bash
 curl -X POST http://127.0.0.1:8000/v1/chat \
@@ -51,21 +70,21 @@ curl -X POST http://127.0.0.1:8000/v1/chat \
   -d '{"message":"Скажи коротко, что ты DuckLM","workspace":"./workspace","debug":true}'
 ```
 
-7. Inspect events:
+6. Inspect events:
 
 ```bash
 curl http://127.0.0.1:8000/v1/tasks/<task_id>/events
 ```
 
-8. Approvals:
+7. Approvals:
 
 ```bash
 curl http://127.0.0.1:8000/v1/approvals/pending
 ```
 
-9. Stop services:
+8. Stop services:
 
 ```bash
-bash scripts/llama/start_main.sh stop
+bash scripts/duck.sh stop
 docker compose -f docker-compose.memory.yml down
 ```
diff --git a/docs/web_api.md b/docs/web_api.md
index edd83a5..3f73041 100644
--- a/docs/web_api.md
+++ b/docs/web_api.md
@@ -23,3 +23,18 @@ GET  /v1/experience
 GET  /v1/experience/{id}
 GET  /v1/memory/search?q=...
 ```
+
+Chat requests accept an optional `reasoning` field:
+
+```json
+{
+  "message": "hello",
+  "reasoning": "auto"
+}
+```
+
+Allowed values:
+
+- `auto` (default): send no reasoning options and use the `llama-server` default.
+- `on`: pass `enable_thinking=true` for the thinker response.
+- `off`: pass `enable_thinking=false` and `thinking_budget_tokens=0`.
diff --git a/duck_core/api.py b/duck_core/api.py
index 3bfb4ea..b211446 100644
--- a/duck_core/api.py
+++ b/duck_core/api.py
@@ -3,7 +3,7 @@ import json
 import logging
 import time
 from pathlib import Path
-from typing import Any
+from typing import Any, Literal
 
 import uvicorn
 from fastapi import FastAPI, HTTPException, Request
@@ -60,6 +60,7 @@ class ChatRequest(BaseModel):
     conversation_id: str | None = None
     workspace: str | None = None
     debug: bool = False
+    reasoning: Literal["auto", "on", "off"] = "auto"
 
 
 class ConversationRequest(BaseModel):
@@ -208,6 +209,7 @@ def create_app() -> FastAPI:
             history_messages=history,
             memory_records=memory_records,
             skill_summary=await selected_skill_summary(body.message),
+            reasoning=body.reasoning,
         )
         await conversations.add_message(
             conversation.conversation_id,
@@ -411,7 +413,12 @@ def create_app() -> FastAPI:
                     "Формирую ответ...",
                 )
                 await event_store.append(task.task_id, "model_call_started", {"role": "thinker"})
-                async for chunk in model_client.stream_chat("thinker", messages):
+                stream = (
+                    model_client.stream_chat("thinker", messages, reasoning=body.reasoning)
+                    if body.reasoning in {"on", "off"}
+                    else model_client.stream_chat("thinker", messages)
+                )
+                async for chunk in stream:
                     delta = str(chunk.get("delta") or "")
                     if chunk.get("type") == "reasoning_delta":
                         generation_stats.record(delta)
diff --git a/duck_core/model_client.py b/duck_core/model_client.py
index 4c01d5d..17e73fd 100644
--- a/duck_core/model_client.py
+++ b/duck_core/model_client.py
@@ -3,13 +3,15 @@ import logging
 import time
 from dataclasses import dataclass
 from pathlib import Path
-from typing import Any
+from typing import Any, Literal
 
 import httpx
 import yaml
 
 logger = logging.getLogger(__name__)
 
+ReasoningMode = Literal["auto", "on", "off"]
+
 
 @dataclass(frozen=True)
 class RoleConfig:
@@ -92,6 +94,20 @@ class ModelClient:
             }
         return {"type": "json_object"}
 
+    def _reasoning_options(self, reasoning: ReasoningMode | None) -> dict[str, Any]:
+        if reasoning == "on":
+            return {
+                "reasoning_format": "deepseek",
+                "chat_template_kwargs": {"enable_thinking": True},
+            }
+        if reasoning == "off":
+            return {
+                "reasoning_format": "deepseek",
+                "chat_template_kwargs": {"enable_thinking": False},
+                "thinking_budget_tokens": 0,
+            }
+        return {}
+
     async def chat(
         self,
         role: str,
@@ -99,6 +115,7 @@ class ModelClient:
         temperature: float | None = None,
         max_output_tokens: int | None = None,
         response_format: dict[str, Any] | None = None,
+        reasoning: ReasoningMode | None = None,
     ) -> ModelResponse:
         cfg = self.get_role_config(role)
         outbound = list(messages)
@@ -115,6 +132,7 @@ class ModelClient:
         fmt = self._response_format(cfg, response_format)
         if fmt is not None:
             payload["response_format"] = fmt
+        payload.update(self._reasoning_options(reasoning))
 
         start = time.perf_counter()
         try:
@@ -150,6 +168,7 @@ class ModelClient:
         temperature: float | None = None,
         max_output_tokens: int | None = None,
         response_format: dict[str, Any] | None = None,
+        reasoning: ReasoningMode | None = None,
     ):
         cfg = self.get_role_config(role)
         outbound = list(messages)
@@ -167,6 +186,7 @@ class ModelClient:
         fmt = self._response_format(cfg, response_format)
         if fmt is not None:
             payload["response_format"] = fmt
+        payload.update(self._reasoning_options(reasoning))
 
         try:
             async with httpx.AsyncClient(timeout=self.timeout, trust_env=False) as client:
diff --git a/duck_core/runtime_loop.py b/duck_core/runtime_loop.py
index efcb8d7..b448eee 100644
--- a/duck_core/runtime_loop.py
+++ b/duck_core/runtime_loop.py
@@ -10,7 +10,7 @@ from duck_core.experience.recorder import ExperienceRecorder
 from duck_core.memory.policy import MemoryPolicy
 from duck_core.memory.store import MemoryStore
 from duck_core.memory.vector_memory import VectorMemory
-from duck_core.model_client import ModelClient
+from duck_core.model_client import ModelClient, ReasoningMode
 from duck_core.reflection import Reflection
 from duck_core.tasks.store import TaskStore
 from duck_core.tools.base import ToolResult
@@ -63,6 +63,7 @@ class RuntimeLoop:
         memory_records: list[dict[str, str]] | None = None,
         skill_summary: str | None = None,
         reflect: bool = True,
+        reasoning: ReasoningMode | None = None,
     ) -> ChatResult:
         task = await self.task_store.create_task(message, workspace, debug)
         await self.event_store.append(
@@ -100,7 +101,10 @@ class RuntimeLoop:
             await self.event_store.append(
                 task.task_id, "model_call_started", {"role": "thinker"}
             )
-            response = await self.model_client.chat("thinker", messages)
+            if reasoning in {"on", "off"}:
+                response = await self.model_client.chat("thinker", messages, reasoning=reasoning)
+            else:
+                response = await self.model_client.chat("thinker", messages)
             await self.event_store.append(
                 task.task_id,
                 "cognition_response",
diff --git a/duck_core/web/static/app.js b/duck_core/web/static/app.js
index d6556a5..d2c46e1 100644
--- a/duck_core/web/static/app.js
+++ b/duck_core/web/static/app.js
@@ -16,6 +16,100 @@ function escapeText(value) {
   return String(value ?? "");
 }
 
+function escapeHtml(value) {
+  return String(value ?? "")
+    .replaceAll("&", "&amp;")
+    .replaceAll("<", "&lt;")
+    .replaceAll(">", "&gt;")
+    .replaceAll('"', "&quot;")
+    .replaceAll("'", "&#39;");
+}
+
+// Render inline Markdown (code, bold, italic, bare URLs) as HTML; the input is HTML-escaped first.
+function renderInlineMarkdown(text) {
+  let html = escapeHtml(text);
+  html = html.replace(/`([^`]+)`/g, "<code>$1</code>");
+  html = html.replace(/\*\*([^*]+)\*\*/g, "<strong>$1</strong>");
+  html = html.replace(/\*([^*]+)\*/g, "<em>$1</em>");
+  // Quotes are already escaped here, so end the URL before &quot; / &#39; to keep a quoted URL's href clean.
+  const urlPattern = /(https?:\/\/[^\s<]+?)(?=&quot;|&#39;|[\s<]|$)/g;
+  html = html.replace(urlPattern, '<a href="$1" target="_blank" rel="noreferrer">$1</a>');
+  return html;
+}
+
+function renderMarkdown(markdown) {
+  const blocks = [];
+  const lines = String(markdown ?? "").replace(/\r\n/g, "\n").split("\n");
+  let paragraph = [];
+  let listItems = [];
+  let inFence = false;
+  let fenceLines = [];
+
+  const flushParagraph = () => {
+    if (!paragraph.length) return;
+    blocks.push(`<p>${renderInlineMarkdown(paragraph.join(" "))}</p>`);
+    paragraph = [];
+  };
+  const flushList = () => {
+    if (!listItems.length) return;
+    blocks.push(`<ul>${listItems.map((item) => `<li>${renderInlineMarkdown(item)}</li>`).join("")}</ul>`);
+    listItems = [];
+  };
+
+  for (const line of lines) {
+    if (line.startsWith("```")) {
+      if (inFence) {
+        blocks.push(`<pre><code>${escapeHtml(fenceLines.join("\n"))}</code></pre>`);
+        fenceLines = [];
+        inFence = false;
+      } else {
+        flushParagraph();
+        flushList();
+        inFence = true;
+      }
+      continue;
+    }
+    if (inFence) {
+      fenceLines.push(line);
+      continue;
+    }
+
+    const heading = /^(#{1,4})\s+(.+)$/.exec(line);
+    if (heading) {
+      flushParagraph();
+      flushList();
+      const level = heading[1].length + 2;
+      blocks.push(`<h${level}>${renderInlineMarkdown(heading[2])}</h${level}>`);
+      continue;
+    }
+
+    const listItem = /^\s*[-*]\s+(.+)$/.exec(line);
+    if (listItem) {
+      flushParagraph();
+      listItems.push(listItem[1]);
+      continue;
+    }
+
+    if (!line.trim()) {
+      flushParagraph();
+      flushList();
+      continue;
+    }
+    flushList();
+    paragraph.push(line.trim());
+  }
+  if (inFence) blocks.push(`<pre><code>${escapeHtml(fenceLines.join("\n"))}</code></pre>`);
+  flushParagraph();
+  flushList();
+  return blocks.join("");
+}
+
+function setMarkdownContent(node, content) {
+  if (!node) return;
+  node.dataset.markdown = String(content ?? "");
+  node.innerHTML = renderMarkdown(node.dataset.markdown);
+}
+
 function setStatus(id, text, tone = "neutral") {
   const node = document.querySelector(id);
   if (!node) return;
@@ -67,7 +161,12 @@ function addMessage(role, content, meta = "", options = {}) {
   messageMeta.innerHTML = `<strong>${role === "user" ? "You" : "DuckLM"}</strong><span>${escapeText(meta)}</span>`;
 
   const text = document.createElement("p");
-  text.textContent = content;
+  text.className = role === "assistant" ? "message-body markdown-body" : "message-body";
+  if (role === "assistant") {
+    setMarkdownContent(text, content);
+  } else {
+    text.textContent = content;
+  }
 
   bubble.append(messageMeta);
   if (role === "assistant" && options.reasoning) {
@@ -204,11 +303,10 @@ function updateToolTerminal(article, eventPayload) {
   terminal.classList.remove("is-waiting");
   status.textContent = result.ok ? "ok" : "error";
 
-  const title = terminal.querySelector(".tool-terminal-title")?.textContent || body.textContent.trim();
-  const parts = [title];
-  if (result.output) parts.push("\nstdout\n" + result.output.trimEnd());
-  if (result.error) parts.push("\nstderr\n" + result.error.trimEnd());
-  body.textContent = parts.join("\n");
+  const parts = [];
+  if (result.output) parts.push("stdout\n" + result.output.trimEnd());
+  if (result.error) parts.push("stderr\n" + result.error.trimEnd());
+  body.textContent = parts.join("\n\n") || "completed with no output";
   document.querySelector("#messages").scrollTop = document.querySelector("#messages").scrollHeight;
 }
 
@@ -359,13 +457,22 @@ function humanApprovalDecision(action) {
 
 function setMessagePending(article, text) {
   const paragraph = article?.querySelector("p");
-  if (paragraph) paragraph.textContent = text;
+  if (!paragraph) return;
+  if (paragraph.classList.contains("markdown-body")) {
+    setMarkdownContent(paragraph, text);
+  } else {
+    paragraph.textContent = text;
+  }
 }
 
 function appendMessageText(article, delta) {
   const paragraph = article?.querySelector("p");
   if (!paragraph) return;
-  paragraph.textContent += delta;
+  if (paragraph.classList.contains("markdown-body")) {
+    setMarkdownContent(paragraph, `${paragraph.dataset.markdown || ""}${delta}`);
+  } else {
+    paragraph.textContent += delta;
+  }
   document.querySelector("#messages").scrollTop = document.querySelector("#messages").scrollHeight;
 }
 
@@ -758,7 +865,8 @@ async function sendMessage() {
   setStatus("#task-status", "running", "warn");
   addMessage("user", message, "submitted");
   input.value = "";
-  const pending = addMessage("assistant", "", "thinking", {reasoning: true});
+  const reasoningEnabled = document.querySelector("#reasoning")?.checked || false;
+  const pending = addMessage("assistant", "", "thinking", {reasoning: reasoningEnabled});
   const context = {taskId: "", contentStarted: false};
 
   try {
@@ -767,6 +875,7 @@ async function sendMessage() {
       conversation_id: state.currentConversationId || null,
       workspace: document.querySelector("#workspace").value,
       debug: document.querySelector("#debug").checked,
+      reasoning: reasoningEnabled ? "on" : "off",
     }, async ({name, data}) => {
       await handleAssistantStreamEvent(pending, name, data, context);
     });
diff --git a/duck_core/web/static/style.css b/duck_core/web/static/style.css
index 9fea56a..2f21094 100644
--- a/duck_core/web/static/style.css
+++ b/duck_core/web/static/style.css
@@ -500,6 +500,60 @@ dd {
   line-height: 1.5;
 }
 
+.bubble .markdown-body {
+  white-space: normal;
+}
+
+.markdown-body h3,
+.markdown-body h4,
+.markdown-body h5,
+.markdown-body p,
+.markdown-body ul,
+.markdown-body pre {
+  margin: 8px 0 0;
+}
+
+.markdown-body h3 {
+  font-size: 17px;
+}
+
+.markdown-body h4,
+.markdown-body h5 {
+  font-size: 15px;
+}
+
+.markdown-body ul {
+  padding-left: 22px;
+}
+
+.markdown-body code {
+  padding: 1px 4px;
+  border-radius: 5px;
+  background: #e2e8f0;
+  font-family: ui-monospace, SFMono-Regular, Menlo, Consolas, monospace;
+  font-size: 0.92em;
+}
+
+.markdown-body pre {
+  max-width: 100%;
+  overflow: auto;
+  padding: 10px 12px;
+  border-radius: 8px;
+  background: #0f172a;
+  color: #d1fae5;
+  white-space: pre;
+}
+
+.markdown-body pre code {
+  padding: 0;
+  background: transparent;
+  color: inherit;
+}
+
+.markdown-body a {
+  color: var(--accent);
+}
+
 .message-reasoning {
   display: grid;
   gap: 8px;
diff --git a/duck_core/web/templates/index.html b/duck_core/web/templates/index.html
index 6021e09..3d02776 100644
--- a/duck_core/web/templates/index.html
+++ b/duck_core/web/templates/index.html
@@ -46,6 +46,10 @@
             <input id="debug" type="checkbox" checked>
             <span>Debug mode</span>
           </label>
+          <label class="toggle-row">
+            <input id="reasoning" type="checkbox">
+            <span>Reasoning</span>
+          </label>
         </section>
 
         <section class="status-panel" aria-labelledby="status-title">
diff --git a/scripts/bench/bench_runtime.py b/scripts/bench/bench_runtime.py
index 2c491b8..e87a722 100644
--- a/scripts/bench/bench_runtime.py
+++ b/scripts/bench/bench_runtime.py
@@ -1,33 +1,434 @@
+import argparse
 import asyncio
+import json
+import os
+import signal
+import subprocess
 import time
+from dataclasses import dataclass, field
+from pathlib import Path
+from statistics import mean
+from typing import Any
 
-from duck_core.model_client import ModelClient
+import httpx
+import jsonschema
 
 
-TASKS = [
-    "Скажи коротко, что ты DuckLM.",
-    "Создай tmp/duck_test_note.md с текстом hello duck и прочитай его обратно.",
-    "Посмотри структуру проекта и кратко опиши модули.",
-    "Найди TODO/FIXME в проекте.",
-    "Запусти тесты и кратко объясни результат.",
+ROOT = Path(__file__).resolve().parents[2]
+LLAMA_BIN = ROOT / "vendor/llama.cpp/build/bin/llama-server"
+BASELINE_URL = "http://127.0.0.1:8081/v1"
+BENCH_PORT = 18091
+BENCH_URL = f"http://127.0.0.1:{BENCH_PORT}/v1"
+RESULTS_DIR = ROOT / "docs/bench"
+
+CANDIDATES = [
+    ROOT / "models/Menlo_Lucy-Q4_K_M.gguf",
+    ROOT / "models/Qwen3.5-9B-GLM5.1-Distill-v1-Q4_K_M.gguf",
+    ROOT / "models/X-Coder-SFT-Qwen3-8B.Q6_K.gguf",
+    ROOT / "models/gemma-4-E4B-it-Q4_K_M.gguf",
 ]
 
 
-async def main() -> None:
-    client = ModelClient()
-    print("role -> base_url/model")
-    for role, cfg in client._roles.items():
-        print(f"{role} -> {cfg.base_url}/{cfg.model}")
+@dataclass
+class BenchCase:
+    role: str
+    name: str
+    system_prompt: str
+    user_prompt: str
+    max_tokens: int
+    response_format: dict[str, Any] | None = None
+    required_keywords: list[str] = field(default_factory=list)
+
+
+def read(path: str) -> str:  # return the text of a file addressed relative to the repository ROOT
+    return (ROOT / path).read_text(encoding="utf-8")  # explicit UTF-8: do not depend on the locale default
+
+
+ACTION_SCHEMA = json.loads((ROOT / "duck_core/schemas/action_directive.schema.json").read_text(encoding="utf-8"))
+MEMORY_SCHEMA = {
+    "type": "object",
+    "required": ["should_store", "memory_type", "summary", "importance", "scope", "metadata"],
+    "additionalProperties": True,
+    "properties": {
+        "should_store": {"type": "boolean"},
+        "memory_type": {"type": "string"},
+        "summary": {"type": "string"},
+        "importance": {"type": "number"},
+        "scope": {"type": "string"},
+        "metadata": {"type": "object"},
+    },
+}
+RECALL_SCHEMA = {
+    "type": "object",
+    "required": ["relevant_ids", "reasoning"],
+    "additionalProperties": True,
+    "properties": {
+        "relevant_ids": {"type": "array", "items": {"type": "string"}},
+        "reasoning": {"type": "string"},
+    },
+}
+
+
+CASES = [
+    BenchCase(
+        role="action",
+        name="direct_answer_no_tools",
+        system_prompt=read("prompts/roles/action.md"),
+        user_prompt="User request: Скажи коротко, что такое DuckLM.\nWorkspace: /tmp/duck",
+        max_tokens=180,
+        response_format={
+            "type": "json_schema",
+            "json_schema": {"name": "action_directive", "schema": ACTION_SCHEMA, "strict": True},
+        },
+    ),
+    BenchCase(
+        role="action",
+        name="read_specific_file",
+        system_prompt=read("prompts/roles/action.md"),
+        user_prompt="User request: Прочитай файл CURRENT_STATE.md и кратко скажи статус проекта.\nWorkspace: /home/mirivlad/git/ducklm",
+        max_tokens=220,
+        response_format={
+            "type": "json_schema",
+            "json_schema": {"name": "action_directive", "schema": ACTION_SCHEMA, "strict": True},
+        },
+    ),
+    BenchCase(
+        role="memory_policy",
+        name="store_user_preference",
+        system_prompt=read("prompts/roles/memory_policy.md"),
+        user_prompt="Task ID: task_pref\n\nTranscript:\nПользователь сказал: всегда отвечай мне по-русски и не запускай sudo без отдельного подтверждения.",
+        max_tokens=180,
+        response_format={
+            "type": "json_schema",
+            "json_schema": {"name": "memory_decision", "schema": MEMORY_SCHEMA, "strict": True},
+        },
+    ),
+    BenchCase(
+        role="memory_policy",
+        name="ignore_trivial_tool_call",
+        system_prompt=read("prompts/roles/memory_policy.md"),
+        user_prompt="Task ID: task_tmp\n\nTranscript:\nЗапущен pwd, stdout: /tmp/project. Ответ отправлен пользователю.",
+        max_tokens=160,
+        response_format={
+            "type": "json_schema",
+            "json_schema": {"name": "memory_decision", "schema": MEMORY_SCHEMA, "strict": True},
+        },
+    ),
+    BenchCase(
+        role="recall",
+        name="select_relevant_memory",
+        system_prompt=read("prompts/roles/recall.md"),
+        user_prompt=(
+            "Query: Как пользователь хочет, чтобы я запускал sudo?\n\n"
+            "Memories:\n"
+            "- id: m1 | text: Пользователь просит не запускать sudo без отдельного подтверждения.\n"
+            "- id: m2 | text: Проект использует SQLite для событий.\n"
+            "- id: m3 | text: Пользователь любит краткие ответы о погоде.\n"
+        ),
+        max_tokens=160,
+        response_format={
+            "type": "json_schema",
+            "json_schema": {"name": "recall_decision", "schema": RECALL_SCHEMA, "strict": True},
+        },
+    ),
+    BenchCase(
+        role="summary",
+        name="preserve_decisions",
+        system_prompt=read("prompts/roles/summary.md"),
+        user_prompt=(
+            "Сожми контекст до 3 пунктов. Сохрани решения:\n"
+            "1. WebChat работает на 8000, llama-server на 8081.\n"
+            "2. Для внешних путей нужен approval.\n"
+            "3. allow_forever хранится по normalized action hash.\n"
+        ),
+        max_tokens=180,
+        required_keywords=["8000", "8081", "approval", "allow_forever"],
+    ),
+    BenchCase(
+        role="critic",
+        name="reflection_quality",
+        system_prompt=read("prompts/roles/critic.md"),
+        user_prompt=(
+            "Task transcript:\n"
+            "User asked to fix WebChat. Root cause was API not running. "
+            "We started API manually but had no unified service script. Reflect on risk and reusable lesson."
+        ),
+        max_tokens=220,
+        required_keywords=["risk", "lesson"],
+    ),
+]
+
+
+def parse_args() -> argparse.Namespace:
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--baseline-url", default=BASELINE_URL)
+    parser.add_argument("--port", type=int, default=BENCH_PORT)
+    parser.add_argument("--models", nargs="*", default=[str(path) for path in CANDIDATES])
+    parser.add_argument("--threads", type=int, default=max(1, (os.cpu_count() or 8) // 2))
+    parser.add_argument("--ctx-size", type=int, default=4096)
+    parser.add_argument("--timeout", type=float, default=180.0)
+    parser.add_argument("--skip-cpu", action="store_true")
+    return parser.parse_args()
+
+
+def llama_env() -> dict[str, str]:
+    env = os.environ.copy()
+    bin_dir = str(LLAMA_BIN.parent)
+    env["LD_LIBRARY_PATH"] = f"{bin_dir}{':' + env['LD_LIBRARY_PATH'] if env.get('LD_LIBRARY_PATH') else ''}"
+    return env
+
+
+async def wait_ready(base_url: str, timeout: float) -> None:
+    deadline = time.perf_counter() + timeout
+    async with httpx.AsyncClient(timeout=5.0, trust_env=False) as client:
+        last_error = ""
+        while time.perf_counter() < deadline:
+            try:
+                response = await client.get(f"{base_url}/models")
+                if response.status_code == 200:
+                    return
+                last_error = f"HTTP {response.status_code}: {response.text[:120]}"
+            except Exception as exc:
+                last_error = str(exc)
+            await asyncio.sleep(1.0)
+    raise TimeoutError(f"{base_url} not ready: {last_error}")
+
+
+def start_cpu_server(model_path: Path, port: int, threads: int, ctx_size: int) -> subprocess.Popen:
+    log_dir = ROOT / "data/bench"
+    log_dir.mkdir(parents=True, exist_ok=True)
+    log_file = log_dir / f"{model_path.stem}.log"
+    command = [
+        str(LLAMA_BIN),
+        "-m",
+        str(model_path),
+        "--alias",
+        "bench-cpu",
+        "--host",
+        "127.0.0.1",
+        "--port",
+        str(port),
+        "-c",
+        str(ctx_size),
+        "--parallel",
+        "1",
+        "-ngl",
+        "0",
+        "--threads",
+        str(threads),
+        "--threads-batch",
+        str(threads),
+        "--reasoning",
+        "off",
+        "--cache-ram",
+        "0",
+    ]
+    handle = log_file.open("a")
+    handle.write("Command: " + " ".join(command) + "\n")
+    handle.flush()
+    process = subprocess.Popen(
+        command,
+        cwd=ROOT,
+        env=llama_env(),
+        stdout=handle,
+        stderr=subprocess.STDOUT,
+        start_new_session=True,
+    )
+    process._duck_log_handle = handle  # type: ignore[attr-defined]
+    return process
+
+
+def stop_process(process: subprocess.Popen | None) -> None:
+    if process is None:
+        return
+    if process.poll() is None:
+        os.killpg(process.pid, signal.SIGTERM)
+        try:
+            process.wait(timeout=15)
+        except subprocess.TimeoutExpired:
+            os.killpg(process.pid, signal.SIGKILL)
+            process.wait(timeout=10)
+    handle = getattr(process, "_duck_log_handle", None)
+    if handle:
+        handle.close()
+
+
+def safe_json(content: str) -> tuple[dict[str, Any] | None, str | None]:
+    text = content.strip()
+    if text.startswith("```"):
+        text = text.strip("`")
+        text = text.removeprefix("json").strip()
+    try:
+        return json.loads(text), None
+    except json.JSONDecodeError as exc:
+        return None, str(exc)
+
+
+def score_case(case: BenchCase, content: str) -> tuple[float, str]:
+    """Grade one model reply and return ``(score, note)``.
+
+    Cases with a ``response_format`` must parse as JSON (score 0.0 otherwise)
+    and satisfy the embedded JSON schema (score 0.2 otherwise). Known
+    ``action``, ``memory_policy`` and ``recall`` cases then get a role-specific
+    check. Everything else falls back to a case-insensitive keyword check that
+    subtracts 0.2 per missing keyword, with a floor of 0.2.
+    """
+    parsed = None
+    if case.response_format:
+        parsed, parse_error = safe_json(content)
+        if parsed is None:
+            return 0.0, f"invalid_json: {parse_error}"
+        schema = case.response_format["json_schema"]["schema"]
+        try:
+            jsonschema.validate(parsed, schema)
+        except jsonschema.ValidationError as exc:
+            return 0.2, f"schema_error: {exc.message}"
+
+    if parsed is not None:
+        if case.role == "action":
+            actions = parsed.get("actions") or []
+            if case.name == "direct_answer_no_tools":
+                if actions == []:
+                    return 1.0, "ok"
+                return 0.3, f"unexpected_actions={actions}"
+            if case.name == "read_specific_file":
+                reads_target = (
+                    bool(actions)
+                    and actions[0].get("tool") == "file_read"
+                    and actions[0].get("args", {}).get("path") == "CURRENT_STATE.md"
+                )
+                if reads_target:
+                    return 1.0, "ok"
+                return 0.4, f"wrong_action={actions}"
+        elif case.role == "memory_policy":
+            if case.name == "store_user_preference":
+                stored_as_global_preference = (
+                    parsed.get("should_store") is True
+                    and parsed.get("memory_type") == "preference"
+                    and parsed.get("scope") == "global"
+                )
+                if stored_as_global_preference:
+                    return 1.0, "ok"
+                return 0.4, f"wrong_memory_decision={parsed}"
+            if case.name == "ignore_trivial_tool_call":
+                if parsed.get("should_store") is False:
+                    return 1.0, "ok"
+                return 0.3, f"stored_trivial={parsed}"
+        elif case.role == "recall":
+            # Only "m1" may be selected; "m2" and "m3" are distractors.
+            chosen = set(parsed.get("relevant_ids") or [])
+            if "m1" in chosen and not ({"m2", "m3"} & chosen):
+                return 1.0, "ok"
+            return 0.3, f"wrong_ids={sorted(chosen)}"
+
+    # Fallback for free-text roles and for unrecognised case names.
+    haystack = content.lower()
+    missing = [word for word in case.required_keywords if word.lower() not in haystack]
+    if not missing:
+        return 1.0, "ok"
+    return max(0.2, 1.0 - 0.2 * len(missing)), f"missing={missing}"
+
+
+async def run_case(base_url: str, model: str, case: BenchCase, timeout: float) -> dict[str, Any]:
+    """Send one benchmark case to ``{base_url}/chat/completions`` and score the reply.
+
+    Returns a row holding the case role and name, the score and note from
+    ``score_case``, elapsed seconds, completion token count, tokens per second
+    and the first 500 characters of the reply. A non-success HTTP status
+    raises through ``raise_for_status``.
+    """
+    payload: dict[str, Any] = {
+        "model": model,
+        "messages": [
+            {"role": "system", "content": case.system_prompt},
+            {"role": "user", "content": case.user_prompt},
+        ],
+        "temperature": 0.0,
+        "max_tokens": case.max_tokens,
+    }
+    # Structured-output cases forward their response_format to the server.
+    if case.response_format:
+        payload["response_format"] = case.response_format
     started = time.perf_counter()
-    print(f"test_tasks={len(TASKS)}")
-    print("llm_calls=0")
-    print("tool_calls=0")
-    print("json_directive_validity=not_run")
-    print("retry_count=0")
-    print("memory_writes=0")
-    print("experience_record_created=no")
-    print("selected_skill=not_run")
-    print(f"total_runtime_seconds={time.perf_counter() - started:.3f}")
+    async with httpx.AsyncClient(timeout=timeout, trust_env=False) as client:
+        response = await client.post(f"{base_url}/chat/completions", json=payload)
+        # The clock stops before status checking and JSON decoding.
+        elapsed = time.perf_counter() - started
+        response.raise_for_status()
+        raw = response.json()
+    message = raw.get("choices", [{}])[0].get("message", {})
+    content = message.get("content") or ""
+    score, note = score_case(case, content)
+    usage = raw.get("usage") or {}
+    # Without a usable server token count, fall back to a whitespace word
+    # count of the reply, never below 1.
+    completion_tokens = usage.get("completion_tokens") or max(1, len(content.split()))
+    return {
+        "role": case.role,
+        "case": case.name,
+        "score": score,
+        "note": note,
+        "elapsed_seconds": round(elapsed, 3),
+        "completion_tokens": completion_tokens,
+        # The 1 ms floor avoids division by zero.
+        "tokens_per_second": round(completion_tokens / max(elapsed, 0.001), 2),
+        "content_preview": content[:500],
+    }
+
+
+async def run_model(label: str, base_url: str, model: str, timeout: float) -> dict[str, Any]:
+    """Run every case in ``CASES`` against one endpoint and aggregate the rows.
+
+    A case that raises is kept as a zero-score row whose note carries the
+    error and whose latency is recorded as the full ``timeout``. The summary
+    holds the mean score, mean latency, mean tokens per second and all rows.
+    """
+    case_rows: list[dict[str, Any]] = []
+    for case in CASES:
+        try:
+            row = await run_case(base_url, model, case, timeout)
+        except Exception as exc:
+            row = {
+                "role": case.role,
+                "case": case.name,
+                "score": 0.0,
+                "note": f"error: {exc}",
+                "elapsed_seconds": timeout,
+                "completion_tokens": 0,
+                "tokens_per_second": 0.0,
+                "content_preview": "",
+            }
+        case_rows.append(row)
+
+    quality = mean([row["score"] for row in case_rows])
+    avg_latency = mean(row["elapsed_seconds"] for row in case_rows)
+    avg_tps = mean(row["tokens_per_second"] for row in case_rows)
+    return {
+        "model": label,
+        "quality": round(quality, 3),
+        "avg_latency_seconds": round(avg_latency, 3),
+        "avg_tokens_per_second": round(avg_tps, 2),
+        "cases": case_rows,
+    }
+
+
+def _table_cell(value: Any) -> str:
+    """Return ``value`` as text that fits inside one Markdown table cell.
+
+    Pipes are escaped and line breaks become spaces. Without this, a note such
+    as a schema error or exception message can split the row.
+    """
+    return " ".join(str(value).replace("|", "\\|").splitlines())
+
+
+def markdown_report(results: list[dict[str, Any]]) -> str:
+    """Render benchmark results as a Markdown report.
+
+    The report has a summary table with one row per model, followed by a
+    per-model table with one row per case. Free-text values (model label,
+    role, case name, note) are escaped so they cannot break the tables.
+    The returned text ends with a newline.
+    """
+    lines = [
+        "# Utility Role Model Benchmark",
+        "",
+        "Scope: service roles only (`action`, `memory_policy`, `recall`, `summary`, `critic`).",
+        "The main user-facing thinker is not evaluated for replacement here.",
+        "",
+        "| Model | Quality | Avg latency, s | Avg tok/s | Notes |",
+        "| --- | ---: | ---: | ---: | --- |",
+    ]
+    for result in results:
+        failed = [case for case in result["cases"] if case["score"] < 1.0]
+        # Only the first three failures are summarised per model.
+        note = "all checks passed" if not failed else "; ".join(
+            f"{case['role']}/{case['case']}: {case['note']}" for case in failed[:3]
+        )
+        lines.append(
+            f"| {_table_cell(result['model'])} | {result['quality']:.2f} | "
+            f"{result['avg_latency_seconds']:.2f} | {result['avg_tokens_per_second']:.2f} | "
+            f"{_table_cell(note)} |"
+        )
+    lines.append("")
+    lines.append("## Case Details")
+    for result in results:
+        lines.append(f"\n### {result['model']}")
+        lines.append("| Role | Case | Score | Latency, s | tok/s | Note |")
+        lines.append("| --- | --- | ---: | ---: | ---: | --- |")
+        for case in result["cases"]:
+            lines.append(
+                f"| {_table_cell(case['role'])} | {_table_cell(case['case'])} | {case['score']:.2f} | "
+                f"{case['elapsed_seconds']:.2f} | {case['tokens_per_second']:.2f} | "
+                f"{_table_cell(case['note'])} |"
+            )
+    return "\n".join(lines) + "\n"
+
+
+async def main() -> None:
+    """Benchmark the GPU baseline and each CPU model, then write the reports.
+
+    The baseline endpoint is measured first. Unless ``--skip-cpu`` is given,
+    every model path gets its own temporary CPU server, which is always
+    stopped afterwards. Results are written to ``RESULTS_DIR`` as JSON and
+    Markdown files sharing one timestamp, and the Markdown is also printed.
+    """
+    args = parse_args()
+    RESULTS_DIR.mkdir(parents=True, exist_ok=True)
+    results: list[dict[str, Any]] = []
+
+    print("Checking Qwen GPU baseline at", args.baseline_url)
+    await wait_ready(args.baseline_url, args.timeout)
+    results.append(await run_model("Qwen3.6-35B nonMTP GPU baseline", args.baseline_url, "local-main", args.timeout))
+
+    if not args.skip_cpu:
+        cpu_url = f"http://127.0.0.1:{args.port}/v1"
+        for raw_model in args.models:
+            model_path = Path(raw_model).resolve()
+            label = f"{model_path.stem} CPU"
+            print("Starting", label)
+            process = start_cpu_server(model_path, args.port, args.threads, args.ctx_size)
+            try:
+                await wait_ready(cpu_url, args.timeout)
+                results.append(await run_model(label, cpu_url, "bench-cpu", args.timeout))
+            finally:
+                stop_process(process)
+
+    timestamp = time.strftime("%Y%m%d_%H%M%S")
+    json_path = RESULTS_DIR / f"utility_model_bench_{timestamp}.json"
+    md_path = RESULTS_DIR / f"utility_model_bench_{timestamp}.md"
+    report = markdown_report(results)
+    # ensure_ascii=False keeps non-ASCII model output readable, so the files
+    # must be written as UTF-8 rather than in the locale's default encoding.
+    json_path.write_text(json.dumps(results, ensure_ascii=False, indent=2), encoding="utf-8")
+    md_path.write_text(report, encoding="utf-8")
+    print(report)
+    print(f"Wrote {json_path}")
+    print(f"Wrote {md_path}")
 
 
 if __name__ == "__main__":
diff --git a/scripts/duck-mtp.sh b/scripts/duck-mtp.sh
new file mode 100755
index 0000000..387f23b
--- /dev/null
+++ b/scripts/duck-mtp.sh
@@ -0,0 +1,262 @@
+#!/usr/bin/env bash
+# Manage the local DuckLM stack in MTP mode: the llama-server handled by
+# scripts/llama/start_mtp_main.sh plus the DuckLM API process.
+set -euo pipefail
+
+# Repository root: the parent of the directory holding this script.
+ROOT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)"
+
+# Variables whose values from the caller's environment take precedence
+# over anything assigned in .env.
+ENV_KEYS=(
+  DUCK_LLAMA_SERVER_BIN
+  DUCK_MTP_MODEL_PATH
+  DUCK_MAIN_PORT
+  DUCK_CTX_SIZE
+  DUCK_N_GPU_LAYERS
+  DUCK_LLAMA_DEVICE
+  DUCK_PARALLEL
+  DUCK_LLAMA_PID_FILE
+  DUCK_LLAMA_LOG_FILE
+  DUCK_MTP_FLAGS
+  DUCK_HOST
+  DUCK_API_HOST
+  DUCK_API_PORT
+  DUCK_API_PID_FILE
+  DUCK_API_LOG_FILE
+  DUCK_API_COMMAND
+)
+# Snapshot the caller-provided values before .env can overwrite them.
+declare -A ENV_OVERRIDES=()
+for key in "${ENV_KEYS[@]}"; do
+  if [[ -v "${key}" ]]; then
+    ENV_OVERRIDES["${key}"]="${!key}"
+  fi
+done
+
+# Source .env with auto-export on, so every variable it assigns is exported.
+if [[ -f "${ROOT_DIR}/.env" ]]; then
+  set -a
+  # shellcheck disable=SC1091
+  source "${ROOT_DIR}/.env"
+  set +a
+fi
+# Re-apply the snapshot so explicit environment values override .env.
+for key in "${!ENV_OVERRIDES[@]}"; do
+  export "${key}=${ENV_OVERRIDES[${key}]}"
+done
+
+# The first CLI argument selects the command; the default is "start".
+ACTION="${1:-start}"
+API_PID_FILE="${DUCK_API_PID_FILE:-${ROOT_DIR}/data/duck-api.pid}"
+API_LOG_FILE="${DUCK_API_LOG_FILE:-${ROOT_DIR}/data/duck-api.log}"
+API_URL="http://${DUCK_API_HOST:-127.0.0.1}:${DUCK_API_PORT:-8000}"
+LLAMA_SCRIPT="${ROOT_DIR}/scripts/llama/start_mtp_main.sh"
+
+# Print the command-line help text to stdout.
+usage() {
+  cat <<'EOF'
+Usage: scripts/duck-mtp.sh <command>
+
+Commands:
+  start       Start MTP llama-server and DuckLM API in the background
+  stop        Stop DuckLM API and managed MTP llama-server
+  restart     Stop and start the whole local DuckLM stack
+  status      Print process and HTTP health status
+  logs        Show DuckLM API and llama-server logs; use --follow/-f and --lines N
+  help        Show this help
+
+Environment:
+  DUCK_API_HOST       API bind host, default 127.0.0.1
+  DUCK_API_PORT       API port, default 8000
+  DUCK_API_PID_FILE   API PID file path
+  DUCK_API_LOG_FILE   API log file path
+  DUCK_API_COMMAND    API command override, default ".venv/bin/python -m duck_core.api"
+
+MTP llama-server environment is handled by scripts/llama/start_mtp_main.sh.
+EOF
+}
+
+# Succeed only when the PID file exists, holds a purely numeric PID, and
+# that process can be signalled (kill -0 sends no signal, it only checks).
+api_is_running() {
+  [[ -f "${API_PID_FILE}" ]] || return 1
+  local pid
+  pid="$(cat "${API_PID_FILE}")"
+  [[ "${pid}" =~ ^[0-9]+$ ]] || return 1
+  kill -0 "${pid}" 2>/dev/null
+}
+
+# Print the contents of the API PID file; print nothing when it is absent.
+api_pid_value() {
+  if [[ -f "${API_PID_FILE}" ]]; then
+    cat "${API_PID_FILE}"
+  fi
+}
+
+# Start the DuckLM API in the background and record its PID.
+#
+# Returns 0 when the API is already running or was started (healthy or still
+# warming up), and 1 when the process is gone right after launch or dies
+# while its health endpoint is being polled.
+start_api() {
+  if api_is_running; then
+    echo "DuckLM API already running: pid=$(api_pid_value)"
+    return 0
+  fi
+
+  mkdir -p "$(dirname "${API_PID_FILE}")" "$(dirname "${API_LOG_FILE}")"
+  rm -f "${API_PID_FILE}"
+
+  local command_string="${DUCK_API_COMMAND:-.venv/bin/python -m duck_core.api}"
+  echo "Starting DuckLM API..."
+  echo "Command: ${command_string}" >> "${API_LOG_FILE}"
+  # The subshell keeps the `cd` local; setsid, when available, detaches the
+  # API into its own session.
+  if command -v setsid >/dev/null 2>&1; then
+    (
+      cd "${ROOT_DIR}"
+      nohup setsid bash -lc "${command_string}" >> "${API_LOG_FILE}" 2>&1 < /dev/null &
+      echo "$!" > "${API_PID_FILE}"
+    )
+  else
+    (
+      cd "${ROOT_DIR}"
+      nohup bash -lc "${command_string}" >> "${API_LOG_FILE}" 2>&1 < /dev/null &
+      echo "$!" > "${API_PID_FILE}"
+    )
+  fi
+
+  sleep 0.2
+  if api_is_running; then
+    echo "DuckLM API started: pid=$(api_pid_value)"
+    echo "WebChat: ${API_URL}/"
+    echo "Log: ${API_LOG_FILE}"
+    # Polling is pointless without curl, so skip the wait entirely then.
+    if command -v curl >/dev/null 2>&1; then
+      for _ in {1..20}; do
+        if curl --noproxy "*" -fsS "${API_URL}/health" >/dev/null 2>&1; then
+          echo "DuckLM API health: ok (${API_URL})"
+          return 0
+        fi
+        # The process can still exit while booting (import error, port in
+        # use); report that as a failed start instead of a success.
+        if ! api_is_running; then
+          echo "DuckLM API exited during startup. See ${API_LOG_FILE}" >&2
+          rm -f "${API_PID_FILE}"
+          return 1
+        fi
+        sleep 0.25
+      done
+    fi
+    echo "DuckLM API health: not ready yet (${API_URL})"
+    return 0
+  fi
+
+  echo "DuckLM API failed to start. See ${API_LOG_FILE}" >&2
+  rm -f "${API_PID_FILE}"
+  return 1
+}
+
+# Stop the DuckLM API: SIGTERM first, SIGKILL after about six seconds.
+#
+# Signals go to the process group led by the recorded PID when one exists
+# (the setsid launch path), so children of a compound DUCK_API_COMMAND are
+# stopped too. Otherwise only the recorded PID is signalled. Always returns 0.
+stop_api() {
+  if ! api_is_running; then
+    rm -f "${API_PID_FILE}"
+    echo "DuckLM API not running"
+    return 0
+  fi
+
+  local pid
+  pid="$(api_pid_value)"
+  echo "Stopping DuckLM API: pid=${pid}"
+  # A negative PID addresses the whole process group; fall back to the single
+  # process when no such group exists.
+  kill -TERM -- "-${pid}" 2>/dev/null || kill -TERM "${pid}" 2>/dev/null || true
+
+  for _ in {1..30}; do
+    if ! kill -0 "${pid}" 2>/dev/null; then
+      rm -f "${API_PID_FILE}"
+      echo "DuckLM API stopped"
+      return 0
+    fi
+    sleep 0.2
+  done
+
+  echo "DuckLM API did not stop after SIGTERM; sending SIGKILL"
+  kill -KILL -- "-${pid}" 2>/dev/null || kill -KILL "${pid}" 2>/dev/null || true
+  rm -f "${API_PID_FILE}"
+  echo "DuckLM API stopped"
+}
+
+# Start llama-server first, then the API, then print the combined status.
+# The function's exit code is the one returned by status_stack.
+start_stack() {
+  "${LLAMA_SCRIPT}" start
+  start_api
+  echo
+  echo "Status:"
+  status_stack
+}
+
+# Stop in reverse start order: the API first, then llama-server.
+stop_stack() {
+  stop_api
+  "${LLAMA_SCRIPT}" stop
+}
+
+# Report API and llama-server status.
+#
+# Returns 3 when the API is not running (a stale PID file is removed);
+# otherwise returns whatever the llama script's `status` command returned.
+status_stack() {
+  local rc=0
+  if api_is_running; then
+    local pid
+    pid="$(api_pid_value)"
+    echo "DuckLM API running: pid=${pid}"
+    if command -v curl >/dev/null 2>&1 && curl --noproxy "*" -fsS "${API_URL}/health" >/dev/null 2>&1; then
+      echo "DuckLM API health: ok (${API_URL})"
+    else
+      echo "DuckLM API health: not ready (${API_URL})"
+    fi
+  else
+    if [[ -f "${API_PID_FILE}" ]]; then
+      echo "DuckLM API not running; removing stale pid file ${API_PID_FILE}"
+      rm -f "${API_PID_FILE}"
+    else
+      echo "DuckLM API not running"
+    fi
+    rc=3
+  fi
+
+  # `|| llama_rc=$?` captures the exit code without tripping `set -e`.
+  local llama_rc=0
+  "${LLAMA_SCRIPT}" status || llama_rc=$?
+  # An API failure code takes precedence over the llama-server one.
+  if [[ "${llama_rc}" != "0" && "${rc}" == "0" ]]; then
+    rc="${llama_rc}"
+  fi
+  return "${rc}"
+}
+
+# Show the API and llama-server logs.
+#
+# Receives the script's full argument list; options are -f/--follow and
+# --lines N (default 100). Returns 2 on an unknown option.
+logs_stack() {
+  local follow=0
+  local lines=100
+  # Drop the leading command word ("logs"); tolerate an empty list.
+  shift || true
+  while [[ $# -gt 0 ]]; do
+    case "$1" in
+      -f|--follow)
+        follow=1
+        shift
+        ;;
+      --lines)
+        lines="${2:?--lines requires a value}"
+        shift 2
+        ;;
+      *)
+        echo "Unknown logs argument: $1" >&2
+        return 2
+        ;;
+    esac
+  done
+
+  # Create the log file if missing so tail has something to open.
+  mkdir -p "$(dirname "${API_LOG_FILE}")"
+  touch "${API_LOG_FILE}"
+  if [[ "${follow}" == "1" ]]; then
+    # Follow mode tails both files in one tail process.
+    local llama_log_file="${DUCK_LLAMA_LOG_FILE:-${ROOT_DIR}/data/llama-mtp.log}"
+    mkdir -p "$(dirname "${llama_log_file}")"
+    touch "${llama_log_file}"
+    tail -n "${lines}" -f "${API_LOG_FILE}" "${llama_log_file}"
+  else
+    echo "==> DuckLM API log: ${API_LOG_FILE} <=="
+    tail -n "${lines}" "${API_LOG_FILE}"
+    echo
+    echo "==> llama-server log: ${DUCK_LLAMA_LOG_FILE:-${ROOT_DIR}/data/llama-mtp.log} <=="
+    # One-shot output of the llama log is delegated to the llama script.
+    "${LLAMA_SCRIPT}" logs --lines "${lines}"
+  fi
+}
+
+# Dispatch on the selected command; unknown commands print usage and exit 2.
+case "${ACTION}" in
+  start)
+    start_stack
+    ;;
+  stop)
+    stop_stack
+    ;;
+  restart)
+    stop_stack
+    start_stack
+    ;;
+  status)
+    status_stack
+    ;;
+  logs)
+    # Pass all arguments; logs_stack drops the leading command word itself.
+    logs_stack "$@"
+    ;;
+  help|-h|--help)
+    usage
+    ;;
+  *)
+    echo "Unknown command: ${ACTION}" >&2
+    usage >&2
+    exit 2
+    ;;
+esac
diff --git a/scripts/duck.sh b/scripts/duck.sh
new file mode 100755
index 0000000..26bcc16
--- /dev/null
+++ b/scripts/duck.sh
@@ -0,0 +1,262 @@
+#!/usr/bin/env bash
+# Manage the local DuckLM stack: the llama-server handled by
+# scripts/llama/start_main.sh plus the DuckLM API process.
+set -euo pipefail
+
+# Repository root: the parent of the directory holding this script.
+ROOT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)"
+
+# Variables whose values from the caller's environment take precedence
+# over anything assigned in .env.
+ENV_KEYS=(
+  DUCK_LLAMA_SERVER_BIN
+  DUCK_MAIN_MODEL_PATH
+  DUCK_MAIN_PORT
+  DUCK_CTX_SIZE
+  DUCK_N_GPU_LAYERS
+  DUCK_LLAMA_DEVICE
+  DUCK_PARALLEL
+  DUCK_LLAMA_PID_FILE
+  DUCK_LLAMA_LOG_FILE
+  DUCK_LLAMA_EXTRA_ARGS
+  DUCK_HOST
+  DUCK_API_HOST
+  DUCK_API_PORT
+  DUCK_API_PID_FILE
+  DUCK_API_LOG_FILE
+  DUCK_API_COMMAND
+)
+# Snapshot the caller-provided values before .env can overwrite them.
+declare -A ENV_OVERRIDES=()
+for key in "${ENV_KEYS[@]}"; do
+  if [[ -v "${key}" ]]; then
+    ENV_OVERRIDES["${key}"]="${!key}"
+  fi
+done
+
+# Source .env with auto-export on, so every variable it assigns is exported.
+if [[ -f "${ROOT_DIR}/.env" ]]; then
+  set -a
+  # shellcheck disable=SC1091
+  source "${ROOT_DIR}/.env"
+  set +a
+fi
+# Re-apply the snapshot so explicit environment values override .env.
+for key in "${!ENV_OVERRIDES[@]}"; do
+  export "${key}=${ENV_OVERRIDES[${key}]}"
+done
+
+# The first CLI argument selects the command; the default is "start".
+ACTION="${1:-start}"
+API_PID_FILE="${DUCK_API_PID_FILE:-${ROOT_DIR}/data/duck-api.pid}"
+API_LOG_FILE="${DUCK_API_LOG_FILE:-${ROOT_DIR}/data/duck-api.log}"
+API_URL="http://${DUCK_API_HOST:-127.0.0.1}:${DUCK_API_PORT:-8000}"
+LLAMA_SCRIPT="${ROOT_DIR}/scripts/llama/start_main.sh"
+
+# Print the command-line help text to stdout.
+usage() {
+  cat <<'EOF'
+Usage: scripts/duck.sh <command>
+
+Commands:
+  start       Start llama-server and DuckLM API in the background
+  stop        Stop DuckLM API and managed llama-server
+  restart     Stop and start the whole local DuckLM stack
+  status      Print process and HTTP health status
+  logs        Show DuckLM API and llama-server logs; use --follow/-f and --lines N
+  help        Show this help
+
+Environment:
+  DUCK_API_HOST       API bind host, default 127.0.0.1
+  DUCK_API_PORT       API port, default 8000
+  DUCK_API_PID_FILE   API PID file path
+  DUCK_API_LOG_FILE   API log file path
+  DUCK_API_COMMAND    API command override, default ".venv/bin/python -m duck_core.api"
+
+llama-server environment is handled by scripts/llama/start_main.sh.
+EOF
+}
+
+# Succeed only when the PID file exists, holds a purely numeric PID, and
+# that process can be signalled (kill -0 sends no signal, it only checks).
+api_is_running() {
+  [[ -f "${API_PID_FILE}" ]] || return 1
+  local pid
+  pid="$(cat "${API_PID_FILE}")"
+  [[ "${pid}" =~ ^[0-9]+$ ]] || return 1
+  kill -0 "${pid}" 2>/dev/null
+}
+
+# Print the contents of the API PID file; print nothing when it is absent.
+api_pid_value() {
+  if [[ -f "${API_PID_FILE}" ]]; then
+    cat "${API_PID_FILE}"
+  fi
+}
+
+# Start the DuckLM API in the background and record its PID.
+#
+# Returns 0 when the API is already running or was started (healthy or still
+# warming up), and 1 when the process is gone right after launch or dies
+# while its health endpoint is being polled.
+start_api() {
+  if api_is_running; then
+    echo "DuckLM API already running: pid=$(api_pid_value)"
+    return 0
+  fi
+
+  mkdir -p "$(dirname "${API_PID_FILE}")" "$(dirname "${API_LOG_FILE}")"
+  rm -f "${API_PID_FILE}"
+
+  local command_string="${DUCK_API_COMMAND:-.venv/bin/python -m duck_core.api}"
+  echo "Starting DuckLM API..."
+  echo "Command: ${command_string}" >> "${API_LOG_FILE}"
+  # The subshell keeps the `cd` local; setsid, when available, detaches the
+  # API into its own session.
+  if command -v setsid >/dev/null 2>&1; then
+    (
+      cd "${ROOT_DIR}"
+      nohup setsid bash -lc "${command_string}" >> "${API_LOG_FILE}" 2>&1 < /dev/null &
+      echo "$!" > "${API_PID_FILE}"
+    )
+  else
+    (
+      cd "${ROOT_DIR}"
+      nohup bash -lc "${command_string}" >> "${API_LOG_FILE}" 2>&1 < /dev/null &
+      echo "$!" > "${API_PID_FILE}"
+    )
+  fi
+
+  sleep 0.2
+  if api_is_running; then
+    echo "DuckLM API started: pid=$(api_pid_value)"
+    echo "WebChat: ${API_URL}/"
+    echo "Log: ${API_LOG_FILE}"
+    # Polling is pointless without curl, so skip the wait entirely then.
+    if command -v curl >/dev/null 2>&1; then
+      for _ in {1..20}; do
+        if curl --noproxy "*" -fsS "${API_URL}/health" >/dev/null 2>&1; then
+          echo "DuckLM API health: ok (${API_URL})"
+          return 0
+        fi
+        # The process can still exit while booting (import error, port in
+        # use); report that as a failed start instead of a success.
+        if ! api_is_running; then
+          echo "DuckLM API exited during startup. See ${API_LOG_FILE}" >&2
+          rm -f "${API_PID_FILE}"
+          return 1
+        fi
+        sleep 0.25
+      done
+    fi
+    echo "DuckLM API health: not ready yet (${API_URL})"
+    return 0
+  fi
+
+  echo "DuckLM API failed to start. See ${API_LOG_FILE}" >&2
+  rm -f "${API_PID_FILE}"
+  return 1
+}
+
+# Stop the DuckLM API: SIGTERM first, SIGKILL after about six seconds.
+#
+# Signals go to the process group led by the recorded PID when one exists
+# (the setsid launch path), so children of a compound DUCK_API_COMMAND are
+# stopped too. Otherwise only the recorded PID is signalled. Always returns 0.
+stop_api() {
+  if ! api_is_running; then
+    rm -f "${API_PID_FILE}"
+    echo "DuckLM API not running"
+    return 0
+  fi
+
+  local pid
+  pid="$(api_pid_value)"
+  echo "Stopping DuckLM API: pid=${pid}"
+  # A negative PID addresses the whole process group; fall back to the single
+  # process when no such group exists.
+  kill -TERM -- "-${pid}" 2>/dev/null || kill -TERM "${pid}" 2>/dev/null || true
+
+  for _ in {1..30}; do
+    if ! kill -0 "${pid}" 2>/dev/null; then
+      rm -f "${API_PID_FILE}"
+      echo "DuckLM API stopped"
+      return 0
+    fi
+    sleep 0.2
+  done
+
+  echo "DuckLM API did not stop after SIGTERM; sending SIGKILL"
+  kill -KILL -- "-${pid}" 2>/dev/null || kill -KILL "${pid}" 2>/dev/null || true
+  rm -f "${API_PID_FILE}"
+  echo "DuckLM API stopped"
+}
+
+# Start llama-server first, then the API, then print the combined status.
+# The function's exit code is the one returned by status_stack.
+start_stack() {
+  "${LLAMA_SCRIPT}" start
+  start_api
+  echo
+  echo "Status:"
+  status_stack
+}
+
+# Stop in reverse start order: the API first, then llama-server.
+stop_stack() {
+  stop_api
+  "${LLAMA_SCRIPT}" stop
+}
+
+# Report API and llama-server status.
+#
+# Returns 3 when the API is not running (a stale PID file is removed);
+# otherwise returns whatever the llama script's `status` command returned.
+status_stack() {
+  local rc=0
+  if api_is_running; then
+    local pid
+    pid="$(api_pid_value)"
+    echo "DuckLM API running: pid=${pid}"
+    if command -v curl >/dev/null 2>&1 && curl --noproxy "*" -fsS "${API_URL}/health" >/dev/null 2>&1; then
+      echo "DuckLM API health: ok (${API_URL})"
+    else
+      echo "DuckLM API health: not ready (${API_URL})"
+    fi
+  else
+    if [[ -f "${API_PID_FILE}" ]]; then
+      echo "DuckLM API not running; removing stale pid file ${API_PID_FILE}"
+      rm -f "${API_PID_FILE}"
+    else
+      echo "DuckLM API not running"
+    fi
+    rc=3
+  fi
+
+  # `|| llama_rc=$?` captures the exit code without tripping `set -e`.
+  local llama_rc=0
+  "${LLAMA_SCRIPT}" status || llama_rc=$?
+  # An API failure code takes precedence over the llama-server one.
+  if [[ "${llama_rc}" != "0" && "${rc}" == "0" ]]; then
+    rc="${llama_rc}"
+  fi
+  return "${rc}"
+}
+
+# Show the API and llama-server logs.
+#
+# Receives the script's full argument list; options are -f/--follow and
+# --lines N (default 100). Returns 2 on an unknown option.
+logs_stack() {
+  local follow=0
+  local lines=100
+  # Drop the leading command word ("logs"); tolerate an empty list.
+  shift || true
+  while [[ $# -gt 0 ]]; do
+    case "$1" in
+      -f|--follow)
+        follow=1
+        shift
+        ;;
+      --lines)
+        lines="${2:?--lines requires a value}"
+        shift 2
+        ;;
+      *)
+        echo "Unknown logs argument: $1" >&2
+        return 2
+        ;;
+    esac
+  done
+
+  # Create the log file if missing so tail has something to open.
+  mkdir -p "$(dirname "${API_LOG_FILE}")"
+  touch "${API_LOG_FILE}"
+  if [[ "${follow}" == "1" ]]; then
+    # Follow mode tails both files in one tail process.
+    local llama_log_file="${DUCK_LLAMA_LOG_FILE:-${ROOT_DIR}/data/llama-main.log}"
+    mkdir -p "$(dirname "${llama_log_file}")"
+    touch "${llama_log_file}"
+    tail -n "${lines}" -f "${API_LOG_FILE}" "${llama_log_file}"
+  else
+    echo "==> DuckLM API log: ${API_LOG_FILE} <=="
+    tail -n "${lines}" "${API_LOG_FILE}"
+    echo
+    echo "==> llama-server log: ${DUCK_LLAMA_LOG_FILE:-${ROOT_DIR}/data/llama-main.log} <=="
+    # One-shot output of the llama log is delegated to the llama script.
+    "${LLAMA_SCRIPT}" logs --lines "${lines}"
+  fi
+}
+
+# Dispatch on the selected command; unknown commands print usage and exit 2.
+case "${ACTION}" in
+  start)
+    start_stack
+    ;;
+  stop)
+    stop_stack
+    ;;
+  restart)
+    stop_stack
+    start_stack
+    ;;
+  status)
+    status_stack
+    ;;
+  logs)
+    # Pass all arguments; logs_stack drops the leading command word itself.
+    logs_stack "$@"
+    ;;
+  help|-h|--help)
+    usage
+    ;;
+  *)
+    echo "Unknown command: ${ACTION}" >&2
+    usage >&2
+    exit 2
+    ;;
+esac
diff --git a/scripts/llama/start_main.sh b/scripts/llama/start_main.sh
index 09c1191..62bd70b 100755
--- a/scripts/llama/start_main.sh
+++ b/scripts/llama/start_main.sh
@@ -37,6 +37,7 @@ ACTION="${1:-start}"
 PID_FILE="${DUCK_LLAMA_PID_FILE:-${ROOT_DIR}/data/llama-main.pid}"
 LOG_FILE="${DUCK_LLAMA_LOG_FILE:-${ROOT_DIR}/data/llama-main.log}"
 BASE_URL="http://${DUCK_HOST:-127.0.0.1}:${DUCK_MAIN_PORT:-8081}/v1"
+LLAMA_BIN_DIR=""
 
 resolve_project_path() {
   local value="$1"
@@ -125,6 +126,7 @@ start() {
   llama_bin="${DUCK_LLAMA_SERVER_BIN:-llama-server}"
   if [[ "${llama_bin}" == */* ]]; then
     llama_bin="$(resolve_project_path "${llama_bin}")"
+    LLAMA_BIN_DIR="$(dirname "${llama_bin}")"
   fi
   model_path="$(resolve_project_path "${DUCK_MAIN_MODEL_PATH}")"
   local command=(
@@ -152,9 +154,9 @@ start() {
   echo "Starting llama-server..."
   echo "Command: ${command[*]}" >> "${LOG_FILE}"
   if command -v setsid >/dev/null 2>&1; then
-    nohup setsid "${command[@]}" >> "${LOG_FILE}" 2>&1 &
+    nohup setsid env LD_LIBRARY_PATH="${LLAMA_BIN_DIR}${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}" "${command[@]}" >> "${LOG_FILE}" 2>&1 &
   else
-    nohup "${command[@]}" >> "${LOG_FILE}" 2>&1 &
+    nohup env LD_LIBRARY_PATH="${LLAMA_BIN_DIR}${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}" "${command[@]}" >> "${LOG_FILE}" 2>&1 &
   fi
   local pid=$!
   echo "${pid}" > "${PID_FILE}"
diff --git a/scripts/llama/start_mtp_main.sh b/scripts/llama/start_mtp_main.sh
new file mode 100755
index 0000000..db56a20
--- /dev/null
+++ b/scripts/llama/start_mtp_main.sh
@@ -0,0 +1,271 @@
+#!/usr/bin/env bash
+# Manage a background llama-server launched with `--spec-type draft-mtp`.
+set -euo pipefail
+
+# Repository root: two levels above the directory holding this script.
+ROOT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")/../.." && pwd)"
+
+# Variables whose values from the caller's environment take precedence
+# over anything assigned in .env.
+ENV_KEYS=(
+  DUCK_LLAMA_SERVER_BIN
+  DUCK_MTP_MODEL_PATH
+  DUCK_MAIN_PORT
+  DUCK_CTX_SIZE
+  DUCK_N_GPU_LAYERS
+  DUCK_LLAMA_DEVICE
+  DUCK_PARALLEL
+  DUCK_LLAMA_PID_FILE
+  DUCK_LLAMA_LOG_FILE
+  DUCK_MTP_FLAGS
+  DUCK_HOST
+)
+# Snapshot the caller-provided values before .env can overwrite them.
+declare -A ENV_OVERRIDES=()
+for key in "${ENV_KEYS[@]}"; do
+  if [[ -v "${key}" ]]; then
+    ENV_OVERRIDES["${key}"]="${!key}"
+  fi
+done
+
+# Source .env with auto-export on, so every variable it assigns is exported.
+if [[ -f "${ROOT_DIR}/.env" ]]; then
+  set -a
+  # shellcheck disable=SC1091
+  source "${ROOT_DIR}/.env"
+  set +a
+fi
+# Re-apply the snapshot so explicit environment values override .env.
+for key in "${!ENV_OVERRIDES[@]}"; do
+  export "${key}=${ENV_OVERRIDES[${key}]}"
+done
+
+# The first CLI argument selects the command; the default is "start".
+ACTION="${1:-start}"
+PID_FILE="${DUCK_LLAMA_PID_FILE:-${ROOT_DIR}/data/llama-mtp.pid}"
+LOG_FILE="${DUCK_LLAMA_LOG_FILE:-${ROOT_DIR}/data/llama-mtp.log}"
+BASE_URL="http://${DUCK_HOST:-127.0.0.1}:${DUCK_MAIN_PORT:-8081}/v1"
+# Set by start() to the binary's directory when the binary is given as a path.
+LLAMA_BIN_DIR=""
+
+# Print $1 unchanged when it is absolute; otherwise print it relative to
+# ROOT_DIR, with one leading "./" removed.
+resolve_project_path() {
+  local value="$1"
+  if [[ "${value}" == /* ]]; then
+    printf '%s\n' "${value}"
+  else
+    printf '%s\n' "${ROOT_DIR}/${value#./}"
+  fi
+}
+
+# Print the command-line help text to stdout.
+usage() {
+  cat <<'EOF'
+Usage: scripts/llama/start_mtp_main.sh <command>
+
+Commands:
+  start       Start MTP llama-server in the background
+  stop        Stop the managed MTP llama-server process
+  restart     Stop and start MTP llama-server
+  status      Print process and HTTP health status
+  logs        Show logs; use --follow/-f and --lines N
+  help        Show this help
+
+Environment:
+  DUCK_LLAMA_SERVER_BIN  Path to llama-server binary
+  DUCK_MTP_MODEL_PATH    Path to MTP GGUF model
+  DUCK_HOST              Bind host, default 127.0.0.1
+  DUCK_MAIN_PORT         Port, default 8081
+  DUCK_CTX_SIZE          Context size, default 65536
+  DUCK_N_GPU_LAYERS      GPU layers, default auto
+  DUCK_LLAMA_DEVICE      Device name, for example Vulkan0
+  DUCK_PARALLEL          Server slots, default 1
+  DUCK_LLAMA_PID_FILE    PID file path
+  DUCK_LLAMA_LOG_FILE    Log file path
+  DUCK_MTP_FLAGS         Extra MTP llama-server args
+EOF
+}
+
+# Succeed only when the PID file exists, holds a purely numeric PID, and
+# that process can be signalled (kill -0 sends no signal, it only checks).
+is_running() {
+  [[ -f "${PID_FILE}" ]] || return 1
+  local pid
+  pid="$(cat "${PID_FILE}")"
+  [[ "${pid}" =~ ^[0-9]+$ ]] || return 1
+  kill -0 "${pid}" 2>/dev/null
+}
+
+# Print the contents of the PID file; print nothing when it is absent.
+pid_value() {
+  if [[ -f "${PID_FILE}" ]]; then
+    cat "${PID_FILE}"
+  fi
+}
+
+# Report whether llama-server is running and whether ${BASE_URL}/models
+# answers. Returns 0 when the process is alive (even if HTTP is not ready)
+# and 3 otherwise, removing a stale PID file on the way.
+status() {
+  if is_running; then
+    local pid
+    pid="$(pid_value)"
+    echo "llama-server running: pid=${pid}"
+    if command -v curl >/dev/null 2>&1 && curl --noproxy "*" -fsS "${BASE_URL}/models" >/dev/null 2>&1; then
+      echo "HTTP health: ok (${BASE_URL})"
+    else
+      echo "HTTP health: not ready (${BASE_URL})"
+    fi
+    return 0
+  fi
+
+  if [[ -f "${PID_FILE}" ]]; then
+    echo "llama-server not running; removing stale pid file ${PID_FILE}"
+    rm -f "${PID_FILE}"
+  else
+    echo "llama-server not running"
+  fi
+  return 3
+}
+
+# Launch the MTP llama-server in the background and record its PID.
+#
+# Requires DUCK_MTP_MODEL_PATH. Returns 0 when the server is already running
+# or is alive 0.2 s after launch, and 1 when the binary's --help output does
+# not mention draft-mtp or the process is gone right after launch.
+start() {
+  if is_running; then
+    echo "MTP llama-server already running: pid=$(pid_value)"
+    return 0
+  fi
+
+  : "${DUCK_MTP_MODEL_PATH:?DUCK_MTP_MODEL_PATH is required}"
+
+  mkdir -p "$(dirname "${PID_FILE}")" "$(dirname "${LOG_FILE}")"
+  rm -f "${PID_FILE}"
+
+  local llama_bin mtp_model_path
+  llama_bin="${DUCK_LLAMA_SERVER_BIN:-llama-server}"
+  # Only a value containing a slash is a path; a bare name is left for PATH
+  # lookup and has no directory to add to the library path.
+  if [[ "${llama_bin}" == */* ]]; then
+    llama_bin="$(resolve_project_path "${llama_bin}")"
+    LLAMA_BIN_DIR="$(dirname "${llama_bin}")"
+  fi
+  mtp_model_path="$(resolve_project_path "${DUCK_MTP_MODEL_PATH}")"
+
+  # Prepend the binary's directory only when there is one. An unconditional
+  # "dir:existing" join yields ":existing" for a PATH-resolved binary, and the
+  # empty first entry makes the loader search the current directory.
+  local lib_path="${LD_LIBRARY_PATH:-}"
+  if [[ -n "${LLAMA_BIN_DIR}" ]]; then
+    lib_path="${LLAMA_BIN_DIR}${lib_path:+:${lib_path}}"
+  fi
+
+  # Refuse to start when the help text does not mention the MTP spec type.
+  local help_text
+  help_text="$(LD_LIBRARY_PATH="${lib_path}" "${llama_bin}" --help 2>&1 || true)"
+  if ! grep -qi "draft-mtp" <<< "${help_text}"; then
+    echo "This llama-server build does not expose draft-mtp speculative decoding." >&2
+    return 1
+  fi
+
+  local command=(
+    "${llama_bin}"
+    -m "${mtp_model_path}"
+    --alias local-main
+    --host "${DUCK_HOST:-127.0.0.1}"
+    --port "${DUCK_MAIN_PORT:-8081}"
+    -c "${DUCK_CTX_SIZE:-65536}"
+    --parallel "${DUCK_PARALLEL:-1}"
+    -ngl "${DUCK_N_GPU_LAYERS:-auto}"
+    --flash-attn on
+    --cache-prompt
+    --metrics
+    --spec-type draft-mtp
+  )
+  if [[ -n "${DUCK_LLAMA_DEVICE:-}" ]]; then
+    command+=(--device "${DUCK_LLAMA_DEVICE}")
+  fi
+  if [[ -n "${DUCK_MTP_FLAGS:-}" ]]; then
+    # Deliberate word splitting: DUCK_MTP_FLAGS is a space-separated list.
+    # shellcheck disable=SC2206
+    local extra_args=( ${DUCK_MTP_FLAGS} )
+    command+=("${extra_args[@]}")
+  fi
+
+  echo "Starting MTP llama-server..."
+  echo "Command: ${command[*]}" >> "${LOG_FILE}"
+  if command -v setsid >/dev/null 2>&1; then
+    nohup setsid env LD_LIBRARY_PATH="${lib_path}" "${command[@]}" >> "${LOG_FILE}" 2>&1 &
+  else
+    nohup env LD_LIBRARY_PATH="${lib_path}" "${command[@]}" >> "${LOG_FILE}" 2>&1 &
+  fi
+  local pid=$!
+  echo "${pid}" > "${PID_FILE}"
+  sleep 0.2
+
+  if is_running; then
+    echo "MTP llama-server started: pid=${pid}"
+    echo "Log: ${LOG_FILE}"
+    return 0
+  fi
+
+  echo "MTP llama-server failed to start. See ${LOG_FILE}" >&2
+  rm -f "${PID_FILE}"
+  return 1
+}
+
+# Stop the managed llama-server: SIGTERM first, SIGKILL after about six
+# seconds (30 polls of 0.2 s). The PID file is removed in every path and
+# the function always returns 0.
+stop() {
+  if ! is_running; then
+    rm -f "${PID_FILE}"
+    echo "llama-server not running"
+    return 0
+  fi
+
+  local pid
+  pid="$(pid_value)"
+  echo "Stopping MTP llama-server: pid=${pid}"
+  kill "${pid}" 2>/dev/null || true
+
+  for _ in {1..30}; do
+    if ! kill -0 "${pid}" 2>/dev/null; then
+      rm -f "${PID_FILE}"
+      echo "MTP llama-server stopped"
+      return 0
+    fi
+    sleep 0.2
+  done
+
+  echo "MTP llama-server did not stop after SIGTERM; sending SIGKILL"
+  kill -9 "${pid}" 2>/dev/null || true
+  rm -f "${PID_FILE}"
+  echo "MTP llama-server stopped"
+}
+
+# Stop the server (a no-op when it is not running), then start it again.
+restart() {
+  stop
+  start
+}
+
+# Show the llama-server log.
+#
+# Receives the script's full argument list; options are -f/--follow and
+# --lines N (default 100). Returns 2 on an unknown option.
+logs() {
+  local follow=0
+  local lines=100
+  # Drop the leading command word ("logs"); tolerate an empty list.
+  shift || true
+  while [[ $# -gt 0 ]]; do
+    case "$1" in
+      -f|--follow)
+        follow=1
+        shift
+        ;;
+      --lines)
+        lines="${2:?--lines requires a value}"
+        shift 2
+        ;;
+      *)
+        echo "Unknown logs argument: $1" >&2
+        return 2
+        ;;
+    esac
+  done
+
+  # Create the log file if missing so tail has something to open.
+  mkdir -p "$(dirname "${LOG_FILE}")"
+  touch "${LOG_FILE}"
+  if [[ "${follow}" == "1" ]]; then
+    tail -n "${lines}" -f "${LOG_FILE}"
+  else
+    tail -n "${lines}" "${LOG_FILE}"
+  fi
+}
+
+# Dispatch on the selected command; unknown commands print usage and exit 2.
+case "${ACTION}" in
+  start)
+    start
+    ;;
+  stop)
+    stop
+    ;;
+  restart)
+    restart
+    ;;
+  status)
+    status
+    ;;
+  logs)
+    # Pass all arguments; logs drops the leading command word itself.
+    logs "$@"
+    ;;
+  help|-h|--help)
+    usage
+    ;;
+  *)
+    echo "Unknown command: ${ACTION}" >&2
+    usage >&2
+    exit 2
+    ;;
+esac
diff --git a/tests/smoke/test_api_stream_chat.py b/tests/smoke/test_api_stream_chat.py
index 4d24dc8..911c9d4 100644
--- a/tests/smoke/test_api_stream_chat.py
+++ b/tests/smoke/test_api_stream_chat.py
@@ -56,6 +56,61 @@ def test_stream_chat_endpoint_emits_sse_reasoning_and_content(tmp_path, monkeypa
     assert "answer" in body
 
 
+def test_stream_chat_forwards_reasoning_toggle_to_thinker(tmp_path, monkeypatch):
+    """A stream request's ``reasoning`` value must reach the thinker's ``stream_chat`` call."""
+    monkeypatch.setenv("DUCK_DB_PATH", str(tmp_path / "duck.sqlite3"))
+    # Filled in by fake_stream_chat with the arguments it was called with.
+    captured = {}
+
+    # Stand-in for ModelClient.chat: always returns an action directive
+    # with an empty action list.
+    async def fake_chat(self, role, messages, temperature=None, max_output_tokens=None, response_format=None):
+        return ModelResponse(
+            role=role,
+            model="local-main",
+            content=json.dumps(
+                {
+                    "kind": "action_directive",
+                    "intent": "answer directly",
+                    "risk_level": "none",
+                    "actions": [],
+                }
+            ),
+            reasoning_content=None,
+            raw={},
+            latency_ms=1.0,
+        )
+
+    # Stand-in for ModelClient.stream_chat: records the role and reasoning
+    # argument, then yields a single content delta.
+    async def fake_stream_chat(
+        self,
+        role,
+        messages,
+        temperature=None,
+        max_output_tokens=None,
+        response_format=None,
+        reasoning=None,
+    ):
+        captured["role"] = role
+        captured["reasoning"] = reasoning
+        yield {"type": "content_delta", "delta": "answer"}
+
+    monkeypatch.setattr("duck_core.model_client.ModelClient.chat", fake_chat)
+    monkeypatch.setattr("duck_core.model_client.ModelClient.stream_chat", fake_stream_chat)
+    client = TestClient(create_app())
+
+    with client.stream(
+        "POST",
+        "/v1/chat/stream",
+        json={
+            "message": "hello",
+            "workspace": "./workspace",
+            "debug": True,
+            "reasoning": "off",
+        },
+    ) as response:
+        body = "".join(response.iter_text())
+
+    assert response.status_code == 200
+    assert "event: done" in body
+    # Exactly one streaming call is expected: role "thinker" with the toggle.
+    assert captured == {"role": "thinker", "reasoning": "off"}
+
+
 def test_stream_chat_runs_memory_policy_and_reflection_after_completion(tmp_path, monkeypatch):
     monkeypatch.setenv("DUCK_DB_PATH", str(tmp_path / "duck.sqlite3"))
 
diff --git a/tests/smoke/test_duck_service_script.py b/tests/smoke/test_duck_service_script.py
new file mode 100644
index 0000000..6bcfca8
--- /dev/null
+++ b/tests/smoke/test_duck_service_script.py
@@ -0,0 +1,246 @@
+import os
+import subprocess
+import textwrap
+import time
+from pathlib import Path
+
+
+def test_duck_script_manages_llama_and_api(tmp_path):
+    """Drive duck.sh through status, start, logs and stop using fake binaries."""
+    fake_bin = tmp_path / "llama-server"
+    fake_bin.write_text(
+        textwrap.dedent(
+            """\
+            #!/usr/bin/env bash
+            echo "fake llama-server $*" >&2
+            trap 'exit 0' TERM INT
+            while true; do sleep 1; done
+            """
+        )
+    )
+    fake_bin.chmod(0o755)
+
+    fake_api = tmp_path / "fake-api.sh"
+    fake_api.write_text(
+        textwrap.dedent(
+            """\
+            #!/usr/bin/env bash
+            echo "fake duck api $*" >&2
+            trap 'exit 0' TERM INT
+            while true; do sleep 1; done
+            """
+        )
+    )
+    fake_api.chmod(0o755)
+
+    pid_file = tmp_path / "llama.pid"
+    log_file = tmp_path / "llama.log"
+    api_pid_file = tmp_path / "duck-api.pid"
+    api_log_file = tmp_path / "duck-api.log"
+    model_path = tmp_path / "model.gguf"
+    model_path.write_text("fake")
+
+    env = {
+        **os.environ,
+        "DUCK_LLAMA_SERVER_BIN": str(fake_bin),
+        "DUCK_MAIN_MODEL_PATH": str(model_path),
+        "DUCK_LLAMA_PID_FILE": str(pid_file),
+        "DUCK_LLAMA_LOG_FILE": str(log_file),
+        "DUCK_MAIN_PORT": "18081",
+        "DUCK_API_PID_FILE": str(api_pid_file),
+        "DUCK_API_LOG_FILE": str(api_log_file),
+        "DUCK_API_COMMAND": str(fake_api),
+        "DUCK_API_PORT": "18000",
+    }
+    script = "scripts/duck.sh"
+
+    stopped = subprocess.run([script, "status"], env=env, text=True, capture_output=True)
+    assert stopped.returncode == 3
+    assert "DuckLM API not running" in stopped.stdout
+    assert "llama-server not running" in stopped.stdout
+    # Start inside try: the finally block must stop the services if a startup check fails.
+    try:
+        started = subprocess.run([script, "start"], env=env, text=True, capture_output=True)
+        assert started.returncode == 0
+        assert "Starting llama-server" in started.stdout
+        assert "Starting DuckLM API" in started.stdout
+        assert "Status:" in started.stdout
+        assert "DuckLM API running" in started.stdout
+        assert "llama-server running" in started.stdout
+        assert api_pid_file.exists()
+        assert pid_file.exists()
+        running = subprocess.run([script, "status"], env=env, text=True, capture_output=True)
+        assert running.returncode == 0
+        assert "DuckLM API running" in running.stdout
+        assert "llama-server running" in running.stdout
+
+        time.sleep(0.2)
+        logs = subprocess.run(
+            [script, "logs", "--lines", "20"], env=env, text=True, capture_output=True
+        )
+        assert logs.returncode == 0
+        assert "DuckLM API log" in logs.stdout
+        assert "fake duck api" in logs.stdout
+        assert "llama-server log" in logs.stdout
+        assert "--alias local-main" in logs.stdout
+    finally:
+        stopped = subprocess.run([script, "stop"], env=env, text=True, capture_output=True)
+        assert stopped.returncode == 0
+        assert not api_pid_file.exists()
+        assert not pid_file.exists()
+
+
+def test_duck_mtp_script_starts_mtp_llama_for_duck_api(tmp_path):
+    """Check duck-mtp.sh starts llama-server with MTP flags and only the MTP model."""
+    fake_bin = tmp_path / "llama-server"
+    fake_bin.write_text(
+        textwrap.dedent(
+            """\
+            #!/usr/bin/env bash
+            if [[ "${1:-}" == "--help" ]]; then
+              echo "supports --spec-type draft-mtp"
+              exit 0
+            fi
+            echo "fake mtp llama-server $*" >&2
+            trap 'exit 0' TERM INT
+            while true; do sleep 1; done
+            """
+        )
+    )
+    fake_bin.chmod(0o755)
+
+    fake_api = tmp_path / "fake-api.sh"
+    fake_api.write_text(
+        textwrap.dedent(
+            """\
+            #!/usr/bin/env bash
+            echo "fake duck api $*" >&2
+            trap 'exit 0' TERM INT
+            while true; do sleep 1; done
+            """
+        )
+    )
+    fake_api.chmod(0o755)
+
+    main_model_path = tmp_path / "main.gguf"
+    mtp_model_path = tmp_path / "mtp.gguf"
+    main_model_path.write_text("main")
+    mtp_model_path.write_text("mtp")
+    pid_file = tmp_path / "llama-mtp.pid"
+    log_file = tmp_path / "llama-mtp.log"
+    api_pid_file = tmp_path / "duck-api-mtp.pid"
+    api_log_file = tmp_path / "duck-api-mtp.log"
+
+    env = {
+        **os.environ,
+        "DUCK_LLAMA_SERVER_BIN": str(fake_bin),
+        "DUCK_MAIN_MODEL_PATH": str(main_model_path),
+        "DUCK_MTP_MODEL_PATH": str(mtp_model_path),
+        "DUCK_LLAMA_PID_FILE": str(pid_file),
+        "DUCK_LLAMA_LOG_FILE": str(log_file),
+        "DUCK_MAIN_PORT": "18081",
+        "DUCK_API_PID_FILE": str(api_pid_file),
+        "DUCK_API_LOG_FILE": str(api_log_file),
+        "DUCK_API_COMMAND": str(fake_api),
+        "DUCK_API_PORT": "18000",
+    }
+    script = "scripts/duck-mtp.sh"
+    # Start inside try: the finally block must stop the services if a startup check fails.
+    try:
+        started = subprocess.run([script, "start"], env=env, text=True, capture_output=True)
+        assert started.returncode == 0
+        assert "Starting MTP llama-server" in started.stdout
+        assert "DuckLM API running" in started.stdout
+        assert "llama-server running" in started.stdout
+        logs = subprocess.run(
+            [script, "logs", "--lines", "30"], env=env, text=True, capture_output=True
+        )
+        assert logs.returncode == 0
+        assert "--alias local-main" in logs.stdout
+        assert "--spec-type draft-mtp" in logs.stdout
+        assert f"-m {mtp_model_path}" in logs.stdout
+        assert "--model-draft" not in logs.stdout
+        assert str(main_model_path) not in logs.stdout
+    finally:
+        stopped = subprocess.run([script, "stop"], env=env, text=True, capture_output=True)
+        assert stopped.returncode == 0
+        assert not api_pid_file.exists()
+        assert not pid_file.exists()
+
+
+def test_duck_mtp_script_sets_llama_bin_dir_library_path_for_help_check(tmp_path):
+    """Verify duck-mtp.sh puts the llama-server directory on LD_LIBRARY_PATH."""
+    bin_dir = tmp_path / "build" / "bin"
+    bin_dir.mkdir(parents=True)
+    fake_bin = bin_dir / "llama-server"
+    # The fake binary exits 127 with a loader-style error unless its own directory
+    # is listed in LD_LIBRARY_PATH; that check runs before the --help branch.
+    fake_bin.write_text(
+        textwrap.dedent(
+            """\
+            #!/usr/bin/env bash
+            case ":${LD_LIBRARY_PATH:-}:" in
+              *":$(dirname "$0"):"*) ;;
+              *)
+                echo "error while loading shared libraries: libllama-common.so.0" >&2
+                exit 127
+                ;;
+            esac
+            if [[ "${1:-}" == "--help" ]]; then
+              echo "supports --spec-type draft-mtp"
+              exit 0
+            fi
+            echo "fake mtp llama-server $*" >&2
+            trap 'exit 0' TERM INT
+            while true; do sleep 1; done
+            """
+        )
+    )
+    fake_bin.chmod(0o755)
+
+    fake_api = tmp_path / "fake-api.sh"
+    fake_api.write_text(
+        textwrap.dedent(
+            """\
+            #!/usr/bin/env bash
+            trap 'exit 0' TERM INT
+            while true; do sleep 1; done
+            """
+        )
+    )
+    fake_api.chmod(0o755)
+
+    main_model_path = tmp_path / "main.gguf"
+    mtp_model_path = tmp_path / "mtp.gguf"
+    main_model_path.write_text("main")
+    mtp_model_path.write_text("mtp")
+
+    env = {
+        **os.environ,
+        "LD_LIBRARY_PATH": "",
+        "DUCK_LLAMA_SERVER_BIN": str(fake_bin),
+        "DUCK_MAIN_MODEL_PATH": str(main_model_path),
+        "DUCK_MTP_MODEL_PATH": str(mtp_model_path),
+        "DUCK_LLAMA_PID_FILE": str(tmp_path / "llama-mtp.pid"),
+        "DUCK_LLAMA_LOG_FILE": str(tmp_path / "llama-mtp.log"),
+        "DUCK_MAIN_PORT": "18081",
+        "DUCK_API_PID_FILE": str(tmp_path / "duck-api-mtp.pid"),
+        "DUCK_API_LOG_FILE": str(tmp_path / "duck-api-mtp.log"),
+        "DUCK_API_COMMAND": str(fake_api),
+        "DUCK_API_PORT": "18000",
+    }
+
+    script = "scripts/duck-mtp.sh"
+    # Start inside try so the finally block stops the services even if startup fails.
+    try:
+        started = subprocess.run([script, "start"], env=env, text=True, capture_output=True)
+        assert started.returncode == 0
+
+        logs = subprocess.run(
+            [script, "logs", "--lines", "30"], env=env, text=True, capture_output=True
+        )
+        assert logs.returncode == 0
+        assert "--spec-type draft-mtp" in logs.stdout
+        assert "--model-draft" not in logs.stdout
+    finally:
+        subprocess.run([script, "stop"], env=env, text=True, capture_output=True)
diff --git a/tests/smoke/test_llama_service_script.py b/tests/smoke/test_llama_service_script.py
index b0472c8..ecc29d6 100644
--- a/tests/smoke/test_llama_service_script.py
+++ b/tests/smoke/test_llama_service_script.py
@@ -55,3 +55,54 @@ def test_start_main_script_manages_pid_status_stop_and_logs(tmp_path):
         stopped = subprocess.run([script, "stop"], env=env, text=True, capture_output=True)
         assert stopped.returncode == 0
         assert not pid_file.exists()
+
+
+def test_start_main_script_sets_llama_bin_dir_library_path(tmp_path):
+    """Verify start_main.sh puts the llama-server directory on LD_LIBRARY_PATH."""
+    bin_dir = tmp_path / "build" / "bin"
+    bin_dir.mkdir(parents=True)
+    fake_bin = bin_dir / "llama-server"
+    fake_bin.write_text(
+        textwrap.dedent(
+            """\
+            #!/usr/bin/env bash
+            case ":${LD_LIBRARY_PATH:-}:" in
+              *":$(dirname "$0"):"*) ;;
+              *)
+                echo "error while loading shared libraries: libllama-common.so.0" >&2
+                exit 127
+                ;;
+            esac
+            echo "fake llama-server $*" >&2
+            trap 'exit 0' TERM INT
+            while true; do sleep 1; done
+            """
+        )
+    )
+    fake_bin.chmod(0o755)
+    model_path = tmp_path / "model.gguf"
+    model_path.write_text("fake")
+    pid_file = tmp_path / "llama.pid"
+    log_file = tmp_path / "llama.log"
+
+    env = {
+        **os.environ,
+        "LD_LIBRARY_PATH": "",
+        "DUCK_LLAMA_SERVER_BIN": str(fake_bin),
+        "DUCK_MAIN_MODEL_PATH": str(model_path),
+        "DUCK_LLAMA_PID_FILE": str(pid_file),
+        "DUCK_LLAMA_LOG_FILE": str(log_file),
+        "DUCK_MAIN_PORT": "18081",
+    }
+    script = "scripts/llama/start_main.sh"
+    try:  # start inside try so the finally block stops the server even if startup fails
+        started = subprocess.run([script, "start"], env=env, text=True, capture_output=True)
+        assert started.returncode == 0
+        logs = subprocess.run(
+            [script, "logs", "--lines", "20"], env=env, text=True, capture_output=True
+        )
+        assert logs.returncode == 0
+        assert "--alias local-main" in logs.stdout
+        assert "error while loading shared libraries" not in logs.stdout
+    finally:
+        subprocess.run([script, "stop"], env=env, text=True, capture_output=True)
diff --git a/tests/smoke/test_model_client.py b/tests/smoke/test_model_client.py
index 5901b29..8749ff6 100644
--- a/tests/smoke/test_model_client.py
+++ b/tests/smoke/test_model_client.py
@@ -57,6 +57,32 @@ async def test_model_client_preserves_reasoning_content(monkeypatch):
     assert response.reasoning_content == "private reasoning"
 
 
+@pytest.mark.asyncio
+async def test_model_client_adds_request_reasoning_options(monkeypatch):
+    """Check the payload fields ``chat`` sends for ``reasoning="on"`` and ``"off"``."""
+    recorded = []
+
+    async def capture_post(self, url, json):
+        # Record the outgoing body and answer with a minimal successful completion.
+        recorded.append(json)
+        reply = {"choices": [{"message": {"role": "assistant", "content": "ok"}}]}
+        return httpx.Response(200, json=reply, request=httpx.Request("POST", url))
+
+    monkeypatch.setattr(httpx.AsyncClient, "post", capture_post)
+    client = ModelClient("config/models.yaml")
+    for mode in ("on", "off"):
+        await client.chat("thinker", [{"role": "user", "content": "hello"}], reasoning=mode)
+
+    enabled, disabled = recorded[0], recorded[1]
+    # Both modes send reasoning_format; "off" additionally zeroes the thinking budget.
+    assert enabled["reasoning_format"] == "deepseek"
+    assert enabled["chat_template_kwargs"] == {"enable_thinking": True}
+    assert "thinking_budget_tokens" not in enabled
+    assert disabled["reasoning_format"] == "deepseek"
+    assert disabled["chat_template_kwargs"] == {"enable_thinking": False}
+    assert disabled["thinking_budget_tokens"] == 0
+
+
 @pytest.mark.asyncio
 async def test_model_client_stream_chat_yields_reasoning_then_content(monkeypatch):
     class FakeStreamResponse: