Add DuckLM service scripts and utility model benchmark

This commit is contained in:
mirivlad 2026-05-22 07:27:59 +08:00
parent e6b82f0376
commit ff98224eb6
24 changed files with 2492 additions and 74 deletions

View File

@ -106,8 +106,7 @@ git diff --check
```bash
. .venv/bin/activate
bash scripts/llama/start_main.sh start
python -m duck_core.api
bash scripts/duck.sh start
```
Открыть WebChat:
@ -123,6 +122,24 @@ curl --noproxy '*' http://127.0.0.1:8000/health
curl --noproxy '*' http://127.0.0.1:8000/v1/models/roles
```
Управление процессами:
```bash
bash scripts/duck.sh status
bash scripts/duck.sh logs --follow
bash scripts/duck.sh restart
bash scripts/duck.sh stop
```
MTP/speculative-вариант:
```bash
bash scripts/duck.sh stop
bash scripts/duck-mtp.sh start
bash scripts/duck-mtp.sh status
bash scripts/duck-mtp.sh logs --follow
```
## Что делать следующим
1. Пройти live E2E checklist в WebChat на реальной модели.

View File

@ -1,8 +1,32 @@
duck-up:
docker compose -f docker-compose.memory.yml up -d
@echo "Memory services started."
@echo "Start llama-server:"
@echo "bash scripts/llama/start_main.sh start"
bash scripts/duck.sh start
duck-stop:
bash scripts/duck.sh stop
duck-restart:
bash scripts/duck.sh restart
duck-status:
bash scripts/duck.sh status
duck-logs:
bash scripts/duck.sh logs --follow
duck-mtp-up:
bash scripts/duck-mtp.sh start
duck-mtp-stop:
bash scripts/duck-mtp.sh stop
duck-mtp-restart:
bash scripts/duck-mtp.sh restart
duck-mtp-status:
bash scripts/duck-mtp.sh status
duck-mtp-logs:
bash scripts/duck-mtp.sh logs --follow
duck-llama-main:
bash scripts/llama/start_main.sh start
@ -26,11 +50,7 @@ duck-api:
python3 -m duck_core.api
duck-dev:
docker compose -f docker-compose.memory.yml up -d
@echo "Start llama-server in another terminal:"
@echo "bash scripts/llama/start_main.sh start"
@echo "Then run:"
@echo "make duck-api"
bash scripts/duck.sh start
@echo "Open:"
@echo "http://127.0.0.1:8000/"

View File

@ -9,14 +9,25 @@ python3 -m venv .venv
. .venv/bin/activate
python -m pip install -e ".[dev]"
cp .env.example .env
bash scripts/llama/start_main.sh
```
In another terminal:
```bash
. .venv/bin/activate
python -m duck_core.api
bash scripts/duck.sh start
```
Open `http://127.0.0.1:8000/`.
Useful commands:
```bash
bash scripts/duck.sh status
bash scripts/duck.sh logs --follow
bash scripts/duck.sh restart
bash scripts/duck.sh stop
```
MTP/speculative variant:
```bash
bash scripts/duck-mtp.sh start
bash scripts/duck-mtp.sh status
bash scripts/duck-mtp.sh logs --follow
bash scripts/duck-mtp.sh stop
```

View File

@ -0,0 +1,80 @@
[
{
"model": "Qwen3.6-35B nonMTP GPU baseline",
"quality": 0.971,
"avg_latency_seconds": 17.935,
"avg_tokens_per_second": 4.51,
"cases": [
{
"role": "action",
"case": "direct_answer_no_tools",
"score": 1.0,
"note": "ok",
"elapsed_seconds": 15.321,
"completion_tokens": 45,
"tokens_per_second": 2.94,
"content_preview": "{\n \"kind\": \"action_directive\",\n \"intent\": \"direct_answer\",\n \"risk_level\": \"low\",\n \"actions\": []\n}\n"
},
{
"role": "action",
"case": "read_specific_file",
"score": 1.0,
"note": "ok",
"elapsed_seconds": 19.638,
"completion_tokens": 81,
"tokens_per_second": 4.12,
"content_preview": "{\n \"kind\": \"action_directive\",\n \"intent\": \"read_file\",\n \"risk_level\": \"low\",\n \"actions\": [\n {\n \"tool\": \"file_read\",\n \"args\": {\n \"path\": \"CURRENT_STATE.md\"\n }\n }\n ]\n}\n"
},
{
"role": "memory_policy",
"case": "store_user_preference",
"score": 1.0,
"note": "ok",
"elapsed_seconds": 18.421,
"completion_tokens": 88,
"tokens_per_second": 4.78,
"content_preview": "{\n \"should_store\": true,\n \"memory_type\": \"preference\",\n \"summary\": \"User prefers responses in Russian and requires explicit confirmation before executing sudo commands.\",\n \"importance\": 0.9,\n \"scope\": \"global\",\n \"metadata\": {\n \"language\": \"ru\",\n \"security_policy\": \"sudo_confirmation_required\"\n }\n}"
},
{
"role": "memory_policy",
"case": "ignore_trivial_tool_call",
"score": 1.0,
"note": "ok",
"elapsed_seconds": 14.978,
"completion_tokens": 61,
"tokens_per_second": 4.07,
"content_preview": "{\n \"should_store\": false,\n \"memory_type\": \"note\",\n \"summary\": \"Routine execution of pwd command returning /tmp/project.\",\n \"importance\": 0.1,\n \"scope\": \"conversation\",\n \"metadata\": {}\n}"
},
{
"role": "recall",
"case": "select_relevant_memory",
"score": 1.0,
"note": "ok",
"elapsed_seconds": 15.038,
"completion_tokens": 66,
"tokens_per_second": 4.39,
"content_preview": "{\n \"relevant_ids\": [\n \"m1\"\n ],\n \"reasoning\": \"Memory m1 directly addresses the user's preference regarding the execution of sudo commands, which is the core of the query. Memories m2 and m3 are unrelated to sudo or command execution preferences.\"\n}"
},
{
"role": "summary",
"case": "preserve_decisions",
"score": 1.0,
"note": "ok",
"elapsed_seconds": 9.991,
"completion_tokens": 44,
"tokens_per_second": 4.4,
"content_preview": "1. WebChat работает на 8000, llama-server на 8081.\n2. Для внешних путей нужен approval.\n3. allow_forever хранится по normalized action hash."
},
{
"role": "critic",
"case": "reflection_quality",
"score": 0.8,
"note": "missing=['lesson']",
"elapsed_seconds": 32.16,
"completion_tokens": 220,
"tokens_per_second": 6.84,
"content_preview": "**Critic Reflection: WebChat API Incident**\n\n**1. Risk Assessment**\n\n* **Operational Fragility (High):** The incident revealed a critical dependency on manual intervention. If the API crashes or the server reboots, WebChat will silently fail or return errors until an engineer notices and manually restarts the service. This creates a \"single point of failure\" in the operational process.\n* **Meaning Time to Recovery (MTTR) Variance:** Recovery time is currently dependent on human availability "
}
]
}
]

View File

@ -0,0 +1,21 @@
# Utility Role Model Benchmark
Scope: service roles only (`action`, `memory_policy`, `recall`, `summary`, `critic`).
The main user-facing thinker is not evaluated for replacement here.
| Model | Quality | Avg latency, s | Avg tok/s | Notes |
| --- | ---: | ---: | ---: | --- |
| Qwen3.6-35B nonMTP GPU baseline | 0.97 | 17.93 | 4.51 | critic/reflection_quality: missing=['lesson'] |
## Case Details
### Qwen3.6-35B nonMTP GPU baseline
| Role | Case | Score | Latency, s | tok/s | Note |
| --- | --- | ---: | ---: | ---: | --- |
| action | direct_answer_no_tools | 1.00 | 15.32 | 2.94 | ok |
| action | read_specific_file | 1.00 | 19.64 | 4.12 | ok |
| memory_policy | store_user_preference | 1.00 | 18.42 | 4.78 | ok |
| memory_policy | ignore_trivial_tool_call | 1.00 | 14.98 | 4.07 | ok |
| recall | select_relevant_memory | 1.00 | 15.04 | 4.39 | ok |
| summary | preserve_decisions | 1.00 | 9.99 | 4.40 | ok |
| critic | reflection_quality | 0.80 | 32.16 | 6.84 | missing=['lesson'] |

View File

@ -0,0 +1,392 @@
[
{
"model": "Qwen3.6-35B nonMTP GPU baseline",
"quality": 0.971,
"avg_latency_seconds": 17.94,
"avg_tokens_per_second": 4.51,
"cases": [
{
"role": "action",
"case": "direct_answer_no_tools",
"score": 1.0,
"note": "ok",
"elapsed_seconds": 15.307,
"completion_tokens": 45,
"tokens_per_second": 2.94,
"content_preview": "{\n \"kind\": \"action_directive\",\n \"intent\": \"direct_answer\",\n \"risk_level\": \"low\",\n \"actions\": []\n}\n"
},
{
"role": "action",
"case": "read_specific_file",
"score": 1.0,
"note": "ok",
"elapsed_seconds": 19.612,
"completion_tokens": 81,
"tokens_per_second": 4.13,
"content_preview": "{\n \"kind\": \"action_directive\",\n \"intent\": \"read_file\",\n \"risk_level\": \"low\",\n \"actions\": [\n {\n \"tool\": \"file_read\",\n \"args\": {\n \"path\": \"CURRENT_STATE.md\"\n }\n }\n ]\n}\n"
},
{
"role": "memory_policy",
"case": "store_user_preference",
"score": 1.0,
"note": "ok",
"elapsed_seconds": 18.533,
"completion_tokens": 88,
"tokens_per_second": 4.75,
"content_preview": "{\n \"should_store\": true,\n \"memory_type\": \"preference\",\n \"summary\": \"User prefers responses in Russian and requires explicit confirmation before executing sudo commands.\",\n \"importance\": 0.9,\n \"scope\": \"global\",\n \"metadata\": {\n \"language\": \"ru\",\n \"security_policy\": \"sudo_confirmation_required\"\n }\n}"
},
{
"role": "memory_policy",
"case": "ignore_trivial_tool_call",
"score": 1.0,
"note": "ok",
"elapsed_seconds": 15.0,
"completion_tokens": 61,
"tokens_per_second": 4.07,
"content_preview": "{\n \"should_store\": false,\n \"memory_type\": \"note\",\n \"summary\": \"Routine execution of pwd command returning /tmp/project.\",\n \"importance\": 0.1,\n \"scope\": \"conversation\",\n \"metadata\": {}\n}"
},
{
"role": "recall",
"case": "select_relevant_memory",
"score": 1.0,
"note": "ok",
"elapsed_seconds": 15.085,
"completion_tokens": 66,
"tokens_per_second": 4.38,
"content_preview": "{\n \"relevant_ids\": [\n \"m1\"\n ],\n \"reasoning\": \"Memory m1 directly addresses the user's preference regarding the execution of sudo commands, which is the core of the query. Memories m2 and m3 are unrelated to sudo or command execution preferences.\"\n}"
},
{
"role": "summary",
"case": "preserve_decisions",
"score": 1.0,
"note": "ok",
"elapsed_seconds": 9.954,
"completion_tokens": 44,
"tokens_per_second": 4.42,
"content_preview": "1. WebChat работает на 8000, llama-server на 8081.\n2. Для внешних путей нужен approval.\n3. allow_forever хранится по normalized action hash."
},
{
"role": "critic",
"case": "reflection_quality",
"score": 0.8,
"note": "missing=['lesson']",
"elapsed_seconds": 32.088,
"completion_tokens": 220,
"tokens_per_second": 6.86,
"content_preview": "**Critic Reflection: WebChat API Incident**\n\n**1. Risk Assessment**\n\n* **Operational Fragility (High):** The incident revealed a critical dependency on manual intervention. If the API crashes or the server reboots, WebChat will silently fail or return errors until an engineer notices and manually restarts the service. This creates a \"single point of failure\" in the operational process.\n* **Meaning Time to Recovery (MTTR) Variance:** Recovery time is currently dependent on human availability "
}
]
},
{
"model": "Menlo_Lucy-Q4_K_M CPU",
"quality": 0.771,
"avg_latency_seconds": 4.406,
"avg_tokens_per_second": 16.21,
"cases": [
{
"role": "action",
"case": "direct_answer_no_tools",
"score": 1.0,
"note": "ok",
"elapsed_seconds": 3.228,
"completion_tokens": 31,
"tokens_per_second": 9.6,
"content_preview": "{\"kind\": \"action_directive\", \"intent\": \"answer_question\", \"risk_level\": \"low\", \"actions\": []}"
},
{
"role": "action",
"case": "read_specific_file",
"score": 1.0,
"note": "ok",
"elapsed_seconds": 3.031,
"completion_tokens": 48,
"tokens_per_second": 15.84,
"content_preview": "{\"kind\": \"action_directive\", \"intent\": \"file_read\", \"risk_level\": \"low\", \"actions\": [{\"tool\": \"file_read\", \"args\": {\"path\": \"CURRENT_STATE.md\"}}]}\n\n"
},
{
"role": "memory_policy",
"case": "store_user_preference",
"score": 1.0,
"note": "ok",
"elapsed_seconds": 3.62,
"completion_tokens": 54,
"tokens_per_second": 14.92,
"content_preview": "{\n \"should_store\": true,\n \"memory_type\": \"preference\",\n \"summary\": \"User prefers Russian and requires sudo confirmation.\",\n \"importance\": 0.7,\n \"scope\": \"global\",\n \"metadata\": {}\n}"
},
{
"role": "memory_policy",
"case": "ignore_trivial_tool_call",
"score": 0.3,
"note": "stored_trivial={'should_store': True, 'memory_type': 'fact', 'summary': 'Password was successfully launched and user was informed.', 'importance': 0.7, 'scope': 'global', 'metadata': {}}",
"elapsed_seconds": 3.192,
"completion_tokens": 58,
"tokens_per_second": 18.17,
"content_preview": "{\n \"should_store\": true,\n \"memory_type\": \"fact\",\n \"summary\": \"Password was successfully launched and user was informed.\",\n \"importance\": 0.7,\n \"scope\": \"global\",\n \"metadata\": {}\n}\n"
},
{
"role": "recall",
"case": "select_relevant_memory",
"score": 0.3,
"note": "wrong_ids=[]",
"elapsed_seconds": 3.737,
"completion_tokens": 60,
"tokens_per_second": 16.05,
"content_preview": "{\n \"relevant_ids\": [],\n \"reasoning\": \"The query is about how the user wants to run sudo, but none of the provided memories are related to sudo or user preferences for running commands. The memories are about SQLite and weather answers, which are unrelated to the query.\"\n}"
},
{
"role": "summary",
"case": "preserve_decisions",
"score": 0.8,
"note": "missing=['approval']",
"elapsed_seconds": 3.334,
"completion_tokens": 61,
"tokens_per_second": 18.29,
"content_preview": "1. WebChat работает на портах 8000, а LLaMA-Server на 8081. \n2. Для внешних путей требуется подтверждение. \n3. allow_forever хранится по hash-коду действий."
},
{
"role": "critic",
"case": "reflection_quality",
"score": 1.0,
"note": "ok",
"elapsed_seconds": 10.697,
"completion_tokens": 220,
"tokens_per_second": 20.57,
"content_preview": "Okay, the user wants me to reflect on the risk and reusable lessons from fixing the WebChat issue. Let me break this down.\n\nFirst, the root cause was the API not running. They manually started the API but didn't have a unified service script. So the main risk here is that without a unified script, there's a chance the API might not be running consistently or could be misconfigured.\n\nReusability is key here. Maybe we can create a script that starts the API and monitors its status. That way, if th"
}
]
},
{
"model": "Qwen3.5-9B-GLM5.1-Distill-v1-Q4_K_M CPU",
"quality": 0.4,
"avg_latency_seconds": 61.939,
"avg_tokens_per_second": 2.56,
"cases": [
{
"role": "action",
"case": "direct_answer_no_tools",
"score": 1.0,
"note": "ok",
"elapsed_seconds": 68.075,
"completion_tokens": 72,
"tokens_per_second": 1.06,
"content_preview": "{\"kind\": \"action_directive\", \"intent\": \"answer\", \"risk_level\": \"high\", \"actions\": []}"
},
{
"role": "action",
"case": "read_specific_file",
"score": 1.0,
"note": "ok",
"elapsed_seconds": 72.149,
"completion_tokens": 86,
"tokens_per_second": 1.19,
"content_preview": "{\"kind\": \"action_directive\", \"intent\": \"read file\", \"risk_level\": \"low\", \"actions\": [{\"tool\": \"file_read\", \"args\": {\"path\": \"CURRENT_STATE.md\"}}]}\n"
},
{
"role": "memory_policy",
"case": "store_user_preference",
"score": 0.0,
"note": "invalid_json: Expecting value: line 1 column 1 (char 0)",
"elapsed_seconds": 67.764,
"completion_tokens": 180,
"tokens_per_second": 2.66,
"content_preview": ""
},
{
"role": "memory_policy",
"case": "ignore_trivial_tool_call",
"score": 0.0,
"note": "invalid_json: Expecting value: line 1 column 1 (char 0)",
"elapsed_seconds": 64.653,
"completion_tokens": 160,
"tokens_per_second": 2.47,
"content_preview": ""
},
{
"role": "recall",
"case": "select_relevant_memory",
"score": 0.0,
"note": "invalid_json: Expecting property name enclosed in double quotes: line 1 column 2 (char 1)",
"elapsed_seconds": 59.451,
"completion_tokens": 160,
"tokens_per_second": 2.69,
"content_preview": "{\n"
},
{
"role": "summary",
"case": "preserve_decisions",
"score": 0.2,
"note": "missing=['8000', '8081', 'approval', 'allow_forever']",
"elapsed_seconds": 47.05,
"completion_tokens": 180,
"tokens_per_second": 3.83,
"content_preview": ""
},
{
"role": "critic",
"case": "reflection_quality",
"score": 0.6,
"note": "missing=['risk', 'lesson']",
"elapsed_seconds": 54.433,
"completion_tokens": 220,
"tokens_per_second": 4.04,
"content_preview": ""
}
]
},
{
"model": "X-Coder-SFT-Qwen3-8B.Q6_K CPU",
"quality": 0.757,
"avg_latency_seconds": 60.117,
"avg_tokens_per_second": 2.51,
"cases": [
{
"role": "action",
"case": "direct_answer_no_tools",
"score": 0.0,
"note": "invalid_json: Expecting ',' delimiter: line 13 column 6 (char 632)",
"elapsed_seconds": 121.049,
"completion_tokens": 180,
"tokens_per_second": 1.49,
"content_preview": "{\n \"kind\": \"action_directive\",\n \"intent\": \"answer\",\n \"risk_level\": \"low\",\n \"actions\": [\n {\n \"tool\": \"coder\",\n \"args\": {\n \"task_description\": \"Объясните коротко, что такое DuckLM\",\n \"language\": \"python\",\n \"context\": \"Пользователь запросил краткое описание DuckLM. DuckLM - это проект, который развивается сообществом, направленный на создание открытой и этичной языковой модели. Мы можем сказать, что это модель на основе Llama, созданная Meta, но адаптированная "
},
{
"role": "action",
"case": "read_specific_file",
"score": 1.0,
"note": "ok",
"elapsed_seconds": 37.562,
"completion_tokens": 134,
"tokens_per_second": 3.57,
"content_preview": "{\n \"kind\": \"action_directive\",\n \"intent\": \"read_current_state\",\n \"risk_level\": \"low\",\n \"actions\": [\n {\n \"tool\": \"file_read\",\n \"args\": {\n \"path\": \"CURRENT_STATE.md\"\n },\n \"reason\": \"We need to read the content of CURRENT_STATE.md to provide a summary of the project status.\"\n }\n ],\n \"expected_observations\": [\n \"file_content\"\n ],\n \"stop_reason\": \"After reading the file, we can directly generate the response without further tools.\"\n}\n"
},
{
"role": "memory_policy",
"case": "store_user_preference",
"score": 1.0,
"note": "ok",
"elapsed_seconds": 66.979,
"completion_tokens": 80,
"tokens_per_second": 1.19,
"content_preview": "{\n \"should_store\": true,\n \"memory_type\": \"preference\",\n \"summary\": \"User prefers responses in Russian and explicit sudo confirmation.\",\n \"importance\": 0.9,\n \"scope\": \"global\",\n \"metadata\": {\n \"language\": \"Russian\",\n \"sudo_policy\": \"explicit_confirmation_required\"\n }\n}\n"
},
{
"role": "memory_policy",
"case": "ignore_trivial_tool_call",
"score": 0.3,
"note": "stored_trivial={'should_store': True, 'memory_type': 'event', 'summary': 'User executed pwd command and received /tmp/project as output.', 'importance': 0.8, 'scope': 'conversation', 'metadata': {}}",
"elapsed_seconds": 21.772,
"completion_tokens": 62,
"tokens_per_second": 2.85,
"content_preview": "{\n \"should_store\": true,\n \"memory_type\": \"event\",\n \"summary\": \"User executed pwd command and received /tmp/project as output.\",\n \"importance\": 0.8,\n \"scope\": \"conversation\",\n \"metadata\": {}\n}\n"
},
{
"role": "recall",
"case": "select_relevant_memory",
"score": 1.0,
"note": "ok",
"elapsed_seconds": 58.656,
"completion_tokens": 90,
"tokens_per_second": 1.53,
"content_preview": "{\n \"relevant_ids\": [\"m1\"],\n \"reasoning\": \"The query is about how the user wants to run 'sudo'. The only memory that directly relates to this is memory m1, which states that the user does not want to run sudo without explicit confirmation. The other memories (m2 and m3) are about unrelated topics (database and weather). Therefore, only m1 is relevant.\"\n}\n"
},
{
"role": "summary",
"case": "preserve_decisions",
"score": 1.0,
"note": "ok",
"elapsed_seconds": 53.245,
"completion_tokens": 180,
"tokens_per_second": 3.38,
"content_preview": "We are going to compress the task context into 3 key points while preserving the decisions and outcomes.\n\nKey Points:\n1. **Service Endpoints**: \n - WebChat is running on port 8000.\n - llama-server is running on port 8081.\n\n2. **External Paths Approval**:\n - Any external path that requires access must be approved. This implies that we need to have a mechanism (like a firewall or access control list) to check and approve external requests.\n\n3. **Forever Allowance Storage**:\n - The `allow_f"
},
{
"role": "critic",
"case": "reflection_quality",
"score": 1.0,
"note": "ok",
"elapsed_seconds": 61.555,
"completion_tokens": 220,
"tokens_per_second": 3.57,
"content_preview": "We are reflecting on the task: fixing the WebChat which had an API that wasn't running.\n The root cause was that the API was not running. We started it manually but there was no unified service script.\n\n Reflection:\n - Risk: The manual start might be error-prone and not scalable. If multiple components, hard to manage.\n - Waste: We ended up having to start manually, which is not efficient and might lead to repeated issues.\n - Reusable Lesson: We should create a unified service script that starts"
}
]
},
{
"model": "gemma-4-E4B-it-Q4_K_M CPU",
"quality": 0.971,
"avg_latency_seconds": 21.235,
"avg_tokens_per_second": 5.36,
"cases": [
{
"role": "action",
"case": "direct_answer_no_tools",
"score": 1.0,
"note": "ok",
"elapsed_seconds": 35.717,
"completion_tokens": 53,
"tokens_per_second": 1.48,
"content_preview": "{\n \"kind\": \"action_directive\",\n \"intent\": \"answer_directly\"\n ,\n \"risk_level\": \"low\"\n ,\n \"actions\": []\n}\n"
},
{
"role": "action",
"case": "read_specific_file",
"score": 1.0,
"note": "ok",
"elapsed_seconds": 13.325,
"completion_tokens": 88,
"tokens_per_second": 6.6,
"content_preview": "{\n \"kind\": \"action_directive\",\n \"intent\": \"tool_use\",\n \"risk_level\": \"low\",\n \"actions\": [\n {\n \"tool\": \"file_read\",\n \"args\": {\n \"path\": \"CURRENT_STATE.md\"\n }\n }\n ]\n}\n"
},
{
"role": "memory_policy",
"case": "store_user_preference",
"score": 1.0,
"note": "ok",
"elapsed_seconds": 27.13,
"completion_tokens": 98,
"tokens_per_second": 3.61,
"content_preview": "{\n \"should_store\": true,\n \"memory_type\": \"preference\",\n \"summary\": \"Пользователь требует, чтобы ответы всегда были на русском языке и чтобы не запускать sudo без явного подтверждения.\",\n \"importance\": 0.95,\n \"scope\": \"global\",\n \"metadata\": {\n \"task_id\": \"task_pref\"\n }\n}\n"
},
{
"role": "memory_policy",
"case": "ignore_trivial_tool_call",
"score": 1.0,
"note": "ok",
"elapsed_seconds": 10.229,
"completion_tokens": 90,
"tokens_per_second": 8.8,
"content_preview": "{\n \"should_store\": false,\n \"memory_type\": \"note\",\n \"summary\": \"The 'pwd' command was executed, and the output was sent to the user.\",\n \"importance\": 0.1,\n \"scope\": \"conversation\",\n \"metadata\": {\n \"task_id\": \"task_tmp\"\n }\n}\n"
},
{
"role": "recall",
"case": "select_relevant_memory",
"score": 1.0,
"note": "ok",
"elapsed_seconds": 19.391,
"completion_tokens": 62,
"tokens_per_second": 3.2,
"content_preview": "{\n \"relevant_ids\": [\n \"m1\"\n ],\n \"reasoning\": \"Воспоминание m1 напрямую отвечает на вопрос о том, как пользователь хочет, чтобы запускался sudo (с требованием отдельного подтверждения).\"\n}\n"
},
{
"role": "summary",
"case": "preserve_decisions",
"score": 1.0,
"note": "ok",
"elapsed_seconds": 14.371,
"completion_tokens": 88,
"tokens_per_second": 6.12,
"content_preview": "Вот сжатый контекст в 3 пунктах:\n\n1. **Порты:** WebChat использует порт 8000, llama-server — 8081.\n2. **Безопасность:** Для внешних путей требуется предварительное одобрение (approval).\n3. **Хранение разрешений:** `allow_forever` сохраняется на основе хеша нормализованного действия."
},
{
"role": "critic",
"case": "reflection_quality",
"score": 0.8,
"note": "missing=['lesson']",
"elapsed_seconds": 28.484,
"completion_tokens": 220,
"tokens_per_second": 7.72,
"content_preview": "## DuckLM: Critic Reflection\n\n**Task:** Fix WebChat.\n**Observed Root Cause:** API was not running.\n**Action Taken:** Started API manually.\n**Observed Deficiency:** Lack of a unified service script.\n\n---\n\n### 🔍 Reflection Analysis\n\n#### 1. Risk Assessment (What went wrong/could go wrong?)\n\n* **Operational Risk (High):** The immediate risk was service unavailability (WebChat down). Manually starting the API is a brittle, high-touch workaround. If the system restarts, or if the API needs to be re"
}
]
}
]

View File

@ -0,0 +1,69 @@
# Utility Role Model Benchmark
Scope: service roles only (`action`, `memory_policy`, `recall`, `summary`, `critic`).
The main user-facing thinker is not evaluated for replacement here.
| Model | Quality | Avg latency, s | Avg tok/s | Notes |
| --- | ---: | ---: | ---: | --- |
| Qwen3.6-35B nonMTP GPU baseline | 0.97 | 17.94 | 4.51 | critic/reflection_quality: missing=['lesson'] |
| Menlo_Lucy-Q4_K_M CPU | 0.77 | 4.41 | 16.21 | memory_policy/ignore_trivial_tool_call: stored_trivial={'should_store': True, 'memory_type': 'fact', 'summary': 'Password was successfully launched and user was informed.', 'importance': 0.7, 'scope': 'global', 'metadata': {}}; recall/select_relevant_memory: wrong_ids=[]; summary/preserve_decisions: missing=['approval'] |
| Qwen3.5-9B-GLM5.1-Distill-v1-Q4_K_M CPU | 0.40 | 61.94 | 2.56 | memory_policy/store_user_preference: invalid_json: Expecting value: line 1 column 1 (char 0); memory_policy/ignore_trivial_tool_call: invalid_json: Expecting value: line 1 column 1 (char 0); recall/select_relevant_memory: invalid_json: Expecting property name enclosed in double quotes: line 1 column 2 (char 1) |
| X-Coder-SFT-Qwen3-8B.Q6_K CPU | 0.76 | 60.12 | 2.51 | action/direct_answer_no_tools: invalid_json: Expecting ',' delimiter: line 13 column 6 (char 632); memory_policy/ignore_trivial_tool_call: stored_trivial={'should_store': True, 'memory_type': 'event', 'summary': 'User executed pwd command and received /tmp/project as output.', 'importance': 0.8, 'scope': 'conversation', 'metadata': {}} |
| gemma-4-E4B-it-Q4_K_M CPU | 0.97 | 21.23 | 5.36 | critic/reflection_quality: missing=['lesson'] |
## Case Details
### Qwen3.6-35B nonMTP GPU baseline
| Role | Case | Score | Latency, s | tok/s | Note |
| --- | --- | ---: | ---: | ---: | --- |
| action | direct_answer_no_tools | 1.00 | 15.31 | 2.94 | ok |
| action | read_specific_file | 1.00 | 19.61 | 4.13 | ok |
| memory_policy | store_user_preference | 1.00 | 18.53 | 4.75 | ok |
| memory_policy | ignore_trivial_tool_call | 1.00 | 15.00 | 4.07 | ok |
| recall | select_relevant_memory | 1.00 | 15.09 | 4.38 | ok |
| summary | preserve_decisions | 1.00 | 9.95 | 4.42 | ok |
| critic | reflection_quality | 0.80 | 32.09 | 6.86 | missing=['lesson'] |
### Menlo_Lucy-Q4_K_M CPU
| Role | Case | Score | Latency, s | tok/s | Note |
| --- | --- | ---: | ---: | ---: | --- |
| action | direct_answer_no_tools | 1.00 | 3.23 | 9.60 | ok |
| action | read_specific_file | 1.00 | 3.03 | 15.84 | ok |
| memory_policy | store_user_preference | 1.00 | 3.62 | 14.92 | ok |
| memory_policy | ignore_trivial_tool_call | 0.30 | 3.19 | 18.17 | stored_trivial={'should_store': True, 'memory_type': 'fact', 'summary': 'Password was successfully launched and user was informed.', 'importance': 0.7, 'scope': 'global', 'metadata': {}} |
| recall | select_relevant_memory | 0.30 | 3.74 | 16.05 | wrong_ids=[] |
| summary | preserve_decisions | 0.80 | 3.33 | 18.29 | missing=['approval'] |
| critic | reflection_quality | 1.00 | 10.70 | 20.57 | ok |
### Qwen3.5-9B-GLM5.1-Distill-v1-Q4_K_M CPU
| Role | Case | Score | Latency, s | tok/s | Note |
| --- | --- | ---: | ---: | ---: | --- |
| action | direct_answer_no_tools | 1.00 | 68.08 | 1.06 | ok |
| action | read_specific_file | 1.00 | 72.15 | 1.19 | ok |
| memory_policy | store_user_preference | 0.00 | 67.76 | 2.66 | invalid_json: Expecting value: line 1 column 1 (char 0) |
| memory_policy | ignore_trivial_tool_call | 0.00 | 64.65 | 2.47 | invalid_json: Expecting value: line 1 column 1 (char 0) |
| recall | select_relevant_memory | 0.00 | 59.45 | 2.69 | invalid_json: Expecting property name enclosed in double quotes: line 1 column 2 (char 1) |
| summary | preserve_decisions | 0.20 | 47.05 | 3.83 | missing=['8000', '8081', 'approval', 'allow_forever'] |
| critic | reflection_quality | 0.60 | 54.43 | 4.04 | missing=['risk', 'lesson'] |
### X-Coder-SFT-Qwen3-8B.Q6_K CPU
| Role | Case | Score | Latency, s | tok/s | Note |
| --- | --- | ---: | ---: | ---: | --- |
| action | direct_answer_no_tools | 0.00 | 121.05 | 1.49 | invalid_json: Expecting ',' delimiter: line 13 column 6 (char 632) |
| action | read_specific_file | 1.00 | 37.56 | 3.57 | ok |
| memory_policy | store_user_preference | 1.00 | 66.98 | 1.19 | ok |
| memory_policy | ignore_trivial_tool_call | 0.30 | 21.77 | 2.85 | stored_trivial={'should_store': True, 'memory_type': 'event', 'summary': 'User executed pwd command and received /tmp/project as output.', 'importance': 0.8, 'scope': 'conversation', 'metadata': {}} |
| recall | select_relevant_memory | 1.00 | 58.66 | 1.53 | ok |
| summary | preserve_decisions | 1.00 | 53.24 | 3.38 | ok |
| critic | reflection_quality | 1.00 | 61.55 | 3.57 | ok |
### gemma-4-E4B-it-Q4_K_M CPU
| Role | Case | Score | Latency, s | tok/s | Note |
| --- | --- | ---: | ---: | ---: | --- |
| action | direct_answer_no_tools | 1.00 | 35.72 | 1.48 | ok |
| action | read_specific_file | 1.00 | 13.32 | 6.60 | ok |
| memory_policy | store_user_preference | 1.00 | 27.13 | 3.61 | ok |
| memory_policy | ignore_trivial_tool_call | 1.00 | 10.23 | 8.80 | ok |
| recall | select_relevant_memory | 1.00 | 19.39 | 3.20 | ok |
| summary | preserve_decisions | 1.00 | 14.37 | 6.12 | ok |
| critic | reflection_quality | 0.80 | 28.48 | 7.72 | missing=['lesson'] |

View File

@ -16,34 +16,53 @@ cp .env.example .env
The default `DUCK_MAIN_MODEL_PATH` points to `./models/Qwen3.6/nonMTP/Qwen3.6-35B-A3B-UD-Q4_K_M.gguf`.
3. Start `llama-server`:
3. Start DuckLM:
```bash
bash scripts/llama/start_main.sh start
bash scripts/duck.sh start
```
This starts both processes:
- `llama-server` on `http://127.0.0.1:8081/v1`
- DuckLM API/WebChat on `http://127.0.0.1:8000/`
Useful process commands:
```bash
bash scripts/llama/start_main.sh status
bash scripts/llama/start_main.sh logs --follow
bash scripts/llama/start_main.sh restart
bash scripts/llama/start_main.sh stop
bash scripts/duck.sh status
bash scripts/duck.sh logs --follow
bash scripts/duck.sh restart
bash scripts/duck.sh stop
```
4. Start DuckLM API:
```bash
python -m duck_core.api
```
5. Open WebChat:
4. Open WebChat:
```text
http://127.0.0.1:8000/
```
6. Send a task:
Low-level llama-only commands are still available when needed:
```bash
bash scripts/llama/start_main.sh status
bash scripts/llama/start_main.sh logs --follow
```
MTP/speculative variant:
```bash
bash scripts/duck.sh stop
bash scripts/duck-mtp.sh start
bash scripts/duck-mtp.sh status
bash scripts/duck-mtp.sh logs --follow
```
`duck-mtp.sh` keeps DuckLM on `http://127.0.0.1:8000/` and starts the MTP-backed
`llama-server` on the normal role endpoint `http://127.0.0.1:8081/v1`, so
`config/models.yaml` does not need to change.
5. Send a task:
```bash
curl -X POST http://127.0.0.1:8000/v1/chat \
@ -51,21 +70,21 @@ curl -X POST http://127.0.0.1:8000/v1/chat \
-d '{"message":"Скажи коротко, что ты DuckLM","workspace":"./workspace","debug":true}'
```
7. Inspect events:
6. Inspect events:
```bash
curl http://127.0.0.1:8000/v1/tasks/<task_id>/events
```
8. Approvals:
7. Approvals:
```bash
curl http://127.0.0.1:8000/v1/approvals/pending
```
9. Stop services:
8. Stop services:
```bash
bash scripts/llama/start_main.sh stop
bash scripts/duck.sh stop
docker compose -f docker-compose.memory.yml down
```

View File

@ -23,3 +23,18 @@ GET /v1/experience
GET /v1/experience/{id}
GET /v1/memory/search?q=...
```
Chat requests accept an optional `reasoning` field (defaults to `auto`):
```json
{
"message": "hello",
"reasoning": "auto"
}
```
Allowed values:
- `auto`: use the `llama-server` default.
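- `on`: pass `reasoning_format=deepseek` and `chat_template_kwargs.enable_thinking=true` for the thinker response.
- `off`: pass `reasoning_format=deepseek`, `chat_template_kwargs.enable_thinking=false` and `thinking_budget_tokens=0`.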

View File

@ -3,7 +3,7 @@ import json
import logging
import time
from pathlib import Path
from typing import Any
from typing import Any, Literal
import uvicorn
from fastapi import FastAPI, HTTPException, Request
@ -60,6 +60,7 @@ class ChatRequest(BaseModel):
conversation_id: str | None = None
workspace: str | None = None
debug: bool = False
reasoning: Literal["auto", "on", "off"] = "auto"
class ConversationRequest(BaseModel):
@ -208,6 +209,7 @@ def create_app() -> FastAPI:
history_messages=history,
memory_records=memory_records,
skill_summary=await selected_skill_summary(body.message),
reasoning=body.reasoning,
)
await conversations.add_message(
conversation.conversation_id,
@ -411,7 +413,12 @@ def create_app() -> FastAPI:
"Формирую ответ...",
)
await event_store.append(task.task_id, "model_call_started", {"role": "thinker"})
async for chunk in model_client.stream_chat("thinker", messages):
stream = (
model_client.stream_chat("thinker", messages, reasoning=body.reasoning)
if body.reasoning in {"on", "off"}
else model_client.stream_chat("thinker", messages)
)
async for chunk in stream:
delta = str(chunk.get("delta") or "")
if chunk.get("type") == "reasoning_delta":
generation_stats.record(delta)

View File

@ -3,13 +3,15 @@ import logging
import time
from dataclasses import dataclass
from pathlib import Path
from typing import Any
from typing import Any, Literal
import httpx
import yaml
logger = logging.getLogger(__name__)
ReasoningMode = Literal["auto", "on", "off"]
@dataclass(frozen=True)
class RoleConfig:
@ -92,6 +94,20 @@ class ModelClient:
}
return {"type": "json_object"}
def _reasoning_options(self, reasoning: ReasoningMode | None) -> dict[str, Any]:
if reasoning == "on":
return {
"reasoning_format": "deepseek",
"chat_template_kwargs": {"enable_thinking": True},
}
if reasoning == "off":
return {
"reasoning_format": "deepseek",
"chat_template_kwargs": {"enable_thinking": False},
"thinking_budget_tokens": 0,
}
return {}
async def chat(
self,
role: str,
@ -99,6 +115,7 @@ class ModelClient:
temperature: float | None = None,
max_output_tokens: int | None = None,
response_format: dict[str, Any] | None = None,
reasoning: ReasoningMode | None = None,
) -> ModelResponse:
cfg = self.get_role_config(role)
outbound = list(messages)
@ -115,6 +132,7 @@ class ModelClient:
fmt = self._response_format(cfg, response_format)
if fmt is not None:
payload["response_format"] = fmt
payload.update(self._reasoning_options(reasoning))
start = time.perf_counter()
try:
@ -150,6 +168,7 @@ class ModelClient:
temperature: float | None = None,
max_output_tokens: int | None = None,
response_format: dict[str, Any] | None = None,
reasoning: ReasoningMode | None = None,
):
cfg = self.get_role_config(role)
outbound = list(messages)
@ -167,6 +186,7 @@ class ModelClient:
fmt = self._response_format(cfg, response_format)
if fmt is not None:
payload["response_format"] = fmt
payload.update(self._reasoning_options(reasoning))
try:
async with httpx.AsyncClient(timeout=self.timeout, trust_env=False) as client:

View File

@ -10,7 +10,7 @@ from duck_core.experience.recorder import ExperienceRecorder
from duck_core.memory.policy import MemoryPolicy
from duck_core.memory.store import MemoryStore
from duck_core.memory.vector_memory import VectorMemory
from duck_core.model_client import ModelClient
from duck_core.model_client import ModelClient, ReasoningMode
from duck_core.reflection import Reflection
from duck_core.tasks.store import TaskStore
from duck_core.tools.base import ToolResult
@ -63,6 +63,7 @@ class RuntimeLoop:
memory_records: list[dict[str, str]] | None = None,
skill_summary: str | None = None,
reflect: bool = True,
reasoning: ReasoningMode | None = None,
) -> ChatResult:
task = await self.task_store.create_task(message, workspace, debug)
await self.event_store.append(
@ -100,6 +101,9 @@ class RuntimeLoop:
await self.event_store.append(
task.task_id, "model_call_started", {"role": "thinker"}
)
if reasoning in {"on", "off"}:
response = await self.model_client.chat("thinker", messages, reasoning=reasoning)
else:
response = await self.model_client.chat("thinker", messages)
await self.event_store.append(
task.task_id,

View File

@ -16,6 +16,100 @@ function escapeText(value) {
return String(value ?? "");
}
/**
 * Escape the five HTML-significant characters (& < > " ') in a value.
 * null/undefined become the empty string; everything else goes through String().
 */
function escapeHtml(value) {
  const entities = {
    "&": "&amp;",
    "<": "&lt;",
    ">": "&gt;",
    '"': "&quot;",
    "'": "&#39;",
  };
  // Single pass: each special character is looked up once in the table.
  return String(value ?? "").replace(/[&<>"']/g, (char) => entities[char]);
}
/**
 * Render inline Markdown (code spans, **bold**, *emphasis*, bare http(s) URLs)
 * to HTML. The input is HTML-escaped first, so only the tags produced here
 * appear in the result.
 *
 * @param {*} text - Raw inline Markdown; stringified by escapeHtml.
 * @returns {string} HTML fragment.
 */
function renderInlineMarkdown(text) {
  // Code spans are lifted out behind NUL-delimited placeholders so the
  // emphasis and link passes below cannot rewrite what is inside them
  // (e.g. `a**b**c` or a URL quoted as code must stay literal).
  const codeSpans = [];
  let html = escapeHtml(text).replace(/`([^`]+)`/g, (_match, code) => {
    codeSpans.push(`<code>${code}</code>`);
    return `\u0000${codeSpans.length - 1}\u0000`;
  });

  html = html.replace(/\*\*([^*]+)\*\*/g, "<strong>$1</strong>");
  html = html.replace(/\*([^*]+)\*/g, "<em>$1</em>");

  html = html.replace(/https?:\/\/[^\s<\u0000]+/g, (candidate) => {
    // Sentence punctuation and escaped quotes/brackets that directly follow a
    // URL belong to the surrounding text, not to the link target.
    const tail = /(?:&quot;|&#39;|&gt;|[.,:!?])+$/.exec(candidate);
    const url = tail ? candidate.slice(0, tail.index) : candidate;
    if (!/^https?:\/\/./.test(url)) return candidate;
    return `<a href="${url}" target="_blank" rel="noreferrer">${url}</a>${tail ? tail[0] : ""}`;
  });

  // Put the protected code spans back; an unknown index (a stray NUL sequence
  // that was already in the input) is left exactly as it was.
  return html.replace(/\u0000(\d+)\u0000/g, (match, index) => codeSpans[Number(index)] ?? match);
}
/**
 * Convert a small Markdown subset to an HTML string, line by line.
 * Supported: ``` fenced code, #..#### headings (emitted as h3..h6),
 * "-"/"*" bullet lists, and paragraphs separated by blank lines.
 * Inline markup is delegated to renderInlineMarkdown; fenced code is only escaped.
 */
function renderMarkdown(markdown) {
  const html = [];
  let paragraphLines = [];
  let bullets = [];
  let codeLines = null; // an array while inside a fence, null otherwise

  const closeParagraph = () => {
    if (paragraphLines.length === 0) return;
    html.push(`<p>${renderInlineMarkdown(paragraphLines.join(" "))}</p>`);
    paragraphLines = [];
  };

  const closeList = () => {
    if (bullets.length === 0) return;
    const items = bullets.map((item) => `<li>${renderInlineMarkdown(item)}</li>`).join("");
    html.push(`<ul>${items}</ul>`);
    bullets = [];
  };

  const closeFence = () => {
    html.push(`<pre><code>${escapeHtml(codeLines.join("\n"))}</code></pre>`);
    codeLines = null;
  };

  const source = String(markdown ?? "").replace(/\r\n/g, "\n");
  for (const line of source.split("\n")) {
    if (line.startsWith("```")) {
      if (codeLines !== null) {
        closeFence();
      } else {
        closeParagraph();
        closeList();
        codeLines = [];
      }
      continue;
    }

    if (codeLines !== null) {
      codeLines.push(line);
      continue;
    }

    const heading = /^(#{1,4})\s+(.+)$/.exec(line);
    if (heading) {
      closeParagraph();
      closeList();
      // Headings are shifted down two levels: "#" becomes <h3>.
      const tag = `h${heading[1].length + 2}`;
      html.push(`<${tag}>${renderInlineMarkdown(heading[2])}</${tag}>`);
      continue;
    }

    const bullet = /^\s*[-*]\s+(.+)$/.exec(line);
    if (bullet) {
      closeParagraph();
      bullets.push(bullet[1]);
      continue;
    }

    if (line.trim() === "") {
      closeParagraph();
      closeList();
      continue;
    }

    closeList();
    paragraphLines.push(line.trim());
  }

  // An unterminated fence is still emitted as a code block.
  if (codeLines !== null) closeFence();
  closeParagraph();
  closeList();
  return html.join("");
}
/**
 * Store the raw Markdown on the node (data-markdown) and show its rendered HTML.
 * Does nothing when node is falsy.
 */
function setMarkdownContent(node, content) {
  if (node) {
    const source = String(content ?? "");
    node.dataset.markdown = source;
    node.innerHTML = renderMarkdown(source);
  }
}
function setStatus(id, text, tone = "neutral") {
const node = document.querySelector(id);
if (!node) return;
@ -67,7 +161,12 @@ function addMessage(role, content, meta = "", options = {}) {
messageMeta.innerHTML = `<strong>${role === "user" ? "You" : "DuckLM"}</strong><span>${escapeText(meta)}</span>`;
const text = document.createElement("p");
text.className = role === "assistant" ? "message-body markdown-body" : "message-body";
if (role === "assistant") {
setMarkdownContent(text, content);
} else {
text.textContent = content;
}
bubble.append(messageMeta);
if (role === "assistant" && options.reasoning) {
@ -204,11 +303,10 @@ function updateToolTerminal(article, eventPayload) {
terminal.classList.remove("is-waiting");
status.textContent = result.ok ? "ok" : "error";
const title = terminal.querySelector(".tool-terminal-title")?.textContent || body.textContent.trim();
const parts = [title];
if (result.output) parts.push("\nstdout\n" + result.output.trimEnd());
if (result.error) parts.push("\nstderr\n" + result.error.trimEnd());
body.textContent = parts.join("\n");
const parts = [];
if (result.output) parts.push("stdout\n" + result.output.trimEnd());
if (result.error) parts.push("stderr\n" + result.error.trimEnd());
body.textContent = parts.join("\n\n") || "completed with no output";
document.querySelector("#messages").scrollTop = document.querySelector("#messages").scrollHeight;
}
@ -359,13 +457,22 @@ function humanApprovalDecision(action) {
/**
 * Replace the text of a message's first <p>.
 * Markdown-rendered bodies (class "markdown-body") go through
 * setMarkdownContent; all others are set as plain text.
 *
 * @param {Element|null|undefined} article - Message container; ignored when missing.
 * @param {string} text - New content.
 */
function setMessagePending(article, text) {
  const paragraph = article?.querySelector("p");
  if (!paragraph) return;
  // A single write per call: the earlier unconditional textContent assignment
  // was immediately overwritten by the branch below and only caused an extra
  // DOM update.
  if (paragraph.classList.contains("markdown-body")) {
    setMarkdownContent(paragraph, text);
  } else {
    paragraph.textContent = text;
  }
}
/**
 * Append a streamed text delta to a message's first <p> and scroll the
 * #messages container to the bottom.
 * Markdown bodies are re-rendered from the accumulated data-markdown source;
 * plain bodies get the delta appended to textContent.
 */
function appendMessageText(article, delta) {
  const target = article?.querySelector("p");
  if (!target) return;

  const isMarkdown = target.classList.contains("markdown-body");
  if (!isMarkdown) {
    target.textContent += delta;
  } else {
    const previous = target.dataset.markdown || "";
    setMarkdownContent(target, `${previous}${delta}`);
  }

  const scroller = document.querySelector("#messages");
  scroller.scrollTop = scroller.scrollHeight;
}
@ -758,7 +865,8 @@ async function sendMessage() {
setStatus("#task-status", "running", "warn");
addMessage("user", message, "submitted");
input.value = "";
const pending = addMessage("assistant", "", "thinking", {reasoning: true});
const reasoningEnabled = document.querySelector("#reasoning")?.checked || false;
const pending = addMessage("assistant", "", "thinking", {reasoning: reasoningEnabled});
const context = {taskId: "", contentStarted: false};
try {
@ -767,6 +875,7 @@ async function sendMessage() {
conversation_id: state.currentConversationId || null,
workspace: document.querySelector("#workspace").value,
debug: document.querySelector("#debug").checked,
reasoning: reasoningEnabled ? "on" : "off",
}, async ({name, data}) => {
await handleAssistantStreamEvent(pending, name, data, context);
});

View File

@ -500,6 +500,60 @@ dd {
line-height: 1.5;
}
.bubble .markdown-body {
white-space: normal;
}
.markdown-body h3,
.markdown-body h4,
.markdown-body h5,
.markdown-body p,
.markdown-body ul,
.markdown-body pre {
margin: 8px 0 0;
}
.markdown-body h3 {
font-size: 17px;
}
.markdown-body h4,
.markdown-body h5 {
font-size: 15px;
}
.markdown-body ul {
padding-left: 22px;
}
.markdown-body code {
padding: 1px 4px;
border-radius: 5px;
background: #e2e8f0;
font-family: ui-monospace, SFMono-Regular, Menlo, Consolas, monospace;
font-size: 0.92em;
}
.markdown-body pre {
max-width: 100%;
overflow: auto;
padding: 10px 12px;
border-radius: 8px;
background: #0f172a;
color: #d1fae5;
white-space: pre;
}
.markdown-body pre code {
padding: 0;
background: transparent;
color: inherit;
}
.markdown-body a {
color: var(--accent);
}
.message-reasoning {
display: grid;
gap: 8px;

View File

@ -46,6 +46,10 @@
<input id="debug" type="checkbox" checked>
<span>Debug mode</span>
</label>
<label class="toggle-row">
<input id="reasoning" type="checkbox">
<span>Reasoning</span>
</label>
</section>
<section class="status-panel" aria-labelledby="status-title">

View File

@ -1,33 +1,434 @@
import argparse
import asyncio
import json
import os
import signal
import subprocess
import time
from dataclasses import dataclass, field
from pathlib import Path
from statistics import mean
from typing import Any
from duck_core.model_client import ModelClient
import httpx
import jsonschema
TASKS = [
"Скажи коротко, что ты DuckLM.",
"Создай tmp/duck_test_note.md с текстом hello duck и прочитай его обратно.",
"Посмотри структуру проекта и кратко опиши модули.",
"Найди TODO/FIXME в проекте.",
"Запусти тесты и кратко объясни результат.",
ROOT = Path(__file__).resolve().parents[2]
LLAMA_BIN = ROOT / "vendor/llama.cpp/build/bin/llama-server"
BASELINE_URL = "http://127.0.0.1:8081/v1"
BENCH_PORT = 18091
BENCH_URL = f"http://127.0.0.1:{BENCH_PORT}/v1"
RESULTS_DIR = ROOT / "docs/bench"
CANDIDATES = [
ROOT / "models/Menlo_Lucy-Q4_K_M.gguf",
ROOT / "models/Qwen3.5-9B-GLM5.1-Distill-v1-Q4_K_M.gguf",
ROOT / "models/X-Coder-SFT-Qwen3-8B.Q6_K.gguf",
ROOT / "models/gemma-4-E4B-it-Q4_K_M.gguf",
]
async def main() -> None:
client = ModelClient()
print("role -> base_url/model")
for role, cfg in client._roles.items():
print(f"{role} -> {cfg.base_url}/{cfg.model}")
@dataclass
class BenchCase:
role: str
name: str
system_prompt: str
user_prompt: str
max_tokens: int
response_format: dict[str, Any] | None = None
required_keywords: list[str] = field(default_factory=list)
def read(path: str) -> str:
    """Return the text of a file inside the repository.

    Args:
        path: Location relative to ``ROOT``.

    Returns:
        The file contents decoded as UTF-8.
    """
    # Decode explicitly: without an encoding, read_text() follows the locale,
    # so the same file could load differently (or fail) on another machine.
    return (ROOT / path).read_text(encoding="utf-8")
ACTION_SCHEMA = json.loads((ROOT / "duck_core/schemas/action_directive.schema.json").read_text())
MEMORY_SCHEMA = {
"type": "object",
"required": ["should_store", "memory_type", "summary", "importance", "scope", "metadata"],
"additionalProperties": True,
"properties": {
"should_store": {"type": "boolean"},
"memory_type": {"type": "string"},
"summary": {"type": "string"},
"importance": {"type": "number"},
"scope": {"type": "string"},
"metadata": {"type": "object"},
},
}
RECALL_SCHEMA = {
"type": "object",
"required": ["relevant_ids", "reasoning"],
"additionalProperties": True,
"properties": {
"relevant_ids": {"type": "array", "items": {"type": "string"}},
"reasoning": {"type": "string"},
},
}
CASES = [
BenchCase(
role="action",
name="direct_answer_no_tools",
system_prompt=read("prompts/roles/action.md"),
user_prompt="User request: Скажи коротко, что такое DuckLM.\nWorkspace: /tmp/duck",
max_tokens=180,
response_format={
"type": "json_schema",
"json_schema": {"name": "action_directive", "schema": ACTION_SCHEMA, "strict": True},
},
),
BenchCase(
role="action",
name="read_specific_file",
system_prompt=read("prompts/roles/action.md"),
user_prompt="User request: Прочитай файл CURRENT_STATE.md и кратко скажи статус проекта.\nWorkspace: /home/mirivlad/git/ducklm",
max_tokens=220,
response_format={
"type": "json_schema",
"json_schema": {"name": "action_directive", "schema": ACTION_SCHEMA, "strict": True},
},
),
BenchCase(
role="memory_policy",
name="store_user_preference",
system_prompt=read("prompts/roles/memory_policy.md"),
user_prompt="Task ID: task_pref\n\nTranscript:\nПользователь сказал: всегда отвечай мне по-русски и не запускай sudo без отдельного подтверждения.",
max_tokens=180,
response_format={
"type": "json_schema",
"json_schema": {"name": "memory_decision", "schema": MEMORY_SCHEMA, "strict": True},
},
),
BenchCase(
role="memory_policy",
name="ignore_trivial_tool_call",
system_prompt=read("prompts/roles/memory_policy.md"),
user_prompt="Task ID: task_tmp\n\nTranscript:\nЗапущен pwd, stdout: /tmp/project. Ответ отправлен пользователю.",
max_tokens=160,
response_format={
"type": "json_schema",
"json_schema": {"name": "memory_decision", "schema": MEMORY_SCHEMA, "strict": True},
},
),
BenchCase(
role="recall",
name="select_relevant_memory",
system_prompt=read("prompts/roles/recall.md"),
user_prompt=(
"Query: Как пользователь хочет, чтобы я запускал sudo?\n\n"
"Memories:\n"
"- id: m1 | text: Пользователь просит не запускать sudo без отдельного подтверждения.\n"
"- id: m2 | text: Проект использует SQLite для событий.\n"
"- id: m3 | text: Пользователь любит краткие ответы о погоде.\n"
),
max_tokens=160,
response_format={
"type": "json_schema",
"json_schema": {"name": "recall_decision", "schema": RECALL_SCHEMA, "strict": True},
},
),
BenchCase(
role="summary",
name="preserve_decisions",
system_prompt=read("prompts/roles/summary.md"),
user_prompt=(
"Сожми контекст до 3 пунктов. Сохрани решения:\n"
"1. WebChat работает на 8000, llama-server на 8081.\n"
"2. Для внешних путей нужен approval.\n"
"3. allow_forever хранится по normalized action hash.\n"
),
max_tokens=180,
required_keywords=["8000", "8081", "approval", "allow_forever"],
),
BenchCase(
role="critic",
name="reflection_quality",
system_prompt=read("prompts/roles/critic.md"),
user_prompt=(
"Task transcript:\n"
"User asked to fix WebChat. Root cause was API not running. "
"We started API manually but had no unified service script. Reflect on risk and reusable lesson."
),
max_tokens=220,
required_keywords=["risk", "lesson"],
),
]
def parse_args() -> argparse.Namespace:
    """Parse the benchmark's command-line options.

    Options: ``--baseline-url``, ``--port``, ``--models`` (zero or more paths),
    ``--threads`` (defaults to half the reported CPU count, at least 1),
    ``--ctx-size``, ``--timeout`` and the ``--skip-cpu`` flag.
    """
    half_the_cores = max(1, (os.cpu_count() or 8) // 2)
    option_specs: list[tuple[str, dict[str, Any]]] = [
        ("--baseline-url", {"default": BASELINE_URL}),
        ("--port", {"type": int, "default": BENCH_PORT}),
        ("--models", {"nargs": "*", "default": [str(path) for path in CANDIDATES]}),
        ("--threads", {"type": int, "default": half_the_cores}),
        ("--ctx-size", {"type": int, "default": 4096}),
        ("--timeout", {"type": float, "default": 180.0}),
        ("--skip-cpu", {"action": "store_true"}),
    ]
    cli = argparse.ArgumentParser()
    for flag, spec in option_specs:
        cli.add_argument(flag, **spec)
    return cli.parse_args()
def llama_env() -> dict[str, str]:
    """Return a copy of the environment with the llama-server directory on ``LD_LIBRARY_PATH``.

    The directory containing ``LLAMA_BIN`` is placed first; an existing,
    non-empty ``LD_LIBRARY_PATH`` is kept after it, separated by ``:``.
    """
    environment = dict(os.environ)
    search_path = [str(LLAMA_BIN.parent)]
    inherited = environment.get("LD_LIBRARY_PATH")
    if inherited:
        search_path.append(inherited)
    environment["LD_LIBRARY_PATH"] = ":".join(search_path)
    return environment
async def wait_ready(base_url: str, timeout: float) -> None:
    """Poll ``{base_url}/models`` once per second until it answers HTTP 200.

    Args:
        base_url: Base URL of the server to probe.
        timeout: Overall budget in seconds; each probe uses a 5-second client timeout.

    Raises:
        TimeoutError: The budget ran out; the message carries the last failure seen.
    """
    models_url = f"{base_url}/models"
    failure = ""
    give_up_at = time.perf_counter() + timeout
    async with httpx.AsyncClient(timeout=5.0, trust_env=False) as probe:
        while time.perf_counter() < give_up_at:
            try:
                reply = await probe.get(models_url)
                if reply.status_code == 200:
                    return
                failure = f"HTTP {reply.status_code}: {reply.text[:120]}"
            except Exception as exc:
                # Any failure is expected while the server is still starting;
                # keep it only for the final error message.
                failure = str(exc)
            await asyncio.sleep(1.0)
    raise TimeoutError(f"{base_url} not ready: {failure}")
def start_cpu_server(model_path: Path, port: int, threads: int, ctx_size: int) -> subprocess.Popen:
    """Launch a llama-server for ``model_path`` with ``-ngl 0`` in its own session.

    Output is appended to ``data/bench/<model stem>.log`` under ``ROOT``.  The
    open log handle is attached to the returned process as ``_duck_log_handle``
    so that ``stop_process`` can close it.

    Args:
        model_path: GGUF model file to serve.
        port: Port to listen on (bound to 127.0.0.1).
        threads: Value for both ``--threads`` and ``--threads-batch``.
        ctx_size: Context size passed as ``-c``.

    Returns:
        The running ``subprocess.Popen`` object.

    Raises:
        OSError: The log file could not be opened or the binary could not be started.
    """
    log_dir = ROOT / "data/bench"
    log_dir.mkdir(parents=True, exist_ok=True)
    log_file = log_dir / f"{model_path.stem}.log"
    command = [
        str(LLAMA_BIN),
        "-m",
        str(model_path),
        "--alias",
        "bench-cpu",
        "--host",
        "127.0.0.1",
        "--port",
        str(port),
        "-c",
        str(ctx_size),
        "--parallel",
        "1",
        "-ngl",
        "0",
        "--threads",
        str(threads),
        "--threads-batch",
        str(threads),
        "--reasoning",
        "off",
        "--cache-ram",
        "0",
    ]
    handle = log_file.open("a")
    try:
        handle.write("Command: " + " ".join(command) + "\n")
        handle.flush()
        process = subprocess.Popen(
            command,
            cwd=ROOT,
            env=llama_env(),
            stdout=handle,
            stderr=subprocess.STDOUT,
            # Own session/process group, so the whole group can be signalled later.
            start_new_session=True,
        )
    except BaseException:
        # No process object exists to own the handle (e.g. the binary is
        # missing), so close it here instead of leaking it.
        handle.close()
        raise
    process._duck_log_handle = handle  # type: ignore[attr-defined]
    return process
def stop_process(process: subprocess.Popen | None) -> None:
if process is None:
return
if process.poll() is None:
os.killpg(process.pid, signal.SIGTERM)
try:
process.wait(timeout=15)
except subprocess.TimeoutExpired:
os.killpg(process.pid, signal.SIGKILL)
process.wait(timeout=10)
handle = getattr(process, "_duck_log_handle", None)
if handle:
handle.close()
def safe_json(content: str) -> tuple[dict[str, Any] | None, str | None]:
text = content.strip()
if text.startswith("```"):
text = text.strip("`")
text = text.removeprefix("json").strip()
try:
return json.loads(text), None
except json.JSONDecodeError as exc:
return None, str(exc)
def score_case(case: BenchCase, content: str) -> tuple[float, str]:
    """Score one model reply for a benchmark case.

    Structured cases (those with a ``response_format``) must parse as JSON
    (else 0.0) and validate against the case's schema (else 0.2); after that,
    role- and name-specific expectations are checked.  Cases not handled by a
    role-specific rule are scored on ``required_keywords``: each missing
    keyword costs 0.2, with a floor of 0.2.

    Returns:
        ``(score, note)`` where ``note`` is ``"ok"`` or a short failure reason.
    """
    passed = (1.0, "ok")
    data = None

    if case.response_format:
        data, error = safe_json(content)
        if data is None:
            return 0.0, f"invalid_json: {error}"
        try:
            jsonschema.validate(data, case.response_format["json_schema"]["schema"])
        except jsonschema.ValidationError as exc:
            return 0.2, f"schema_error: {exc.message}"

    if data is not None:
        if case.role == "action":
            actions = data.get("actions") or []
            if case.name == "direct_answer_no_tools":
                if actions == []:
                    return passed
                return 0.3, f"unexpected_actions={actions}"
            if case.name == "read_specific_file":
                reads_target = bool(
                    actions
                    and actions[0].get("tool") == "file_read"
                    and actions[0].get("args", {}).get("path") == "CURRENT_STATE.md"
                )
                if reads_target:
                    return passed
                return 0.4, f"wrong_action={actions}"
        elif case.role == "memory_policy":
            if case.name == "store_user_preference":
                stored_as_preference = (
                    data.get("should_store") is True
                    and data.get("memory_type") == "preference"
                    and data.get("scope") == "global"
                )
                if stored_as_preference:
                    return passed
                return 0.4, f"wrong_memory_decision={data}"
            if case.name == "ignore_trivial_tool_call":
                if data.get("should_store") is False:
                    return passed
                return 0.3, f"stored_trivial={data}"
        elif case.role == "recall":
            chosen = set(data.get("relevant_ids") or [])
            # Only m1 is relevant; picking m2 or m3 counts as a miss.
            if "m1" in chosen and not ({"m2", "m3"} & chosen):
                return passed
            return 0.3, f"wrong_ids={sorted(chosen)}"

    haystack = content.lower()
    missing = [word for word in case.required_keywords if word.lower() not in haystack]
    if not missing:
        return passed
    return max(0.2, 1.0 - 0.2 * len(missing)), f"missing={missing}"
async def run_case(base_url: str, model: str, case: BenchCase, timeout: float) -> dict[str, Any]:
    """Send one benchmark case to ``{base_url}/chat/completions`` and score the reply.

    Args:
        base_url: Base URL of the server under test.
        model: Model name placed in the request payload.
        case: Prompt, token limit and optional response format to send.
        timeout: HTTP client timeout in seconds.

    Returns:
        A result row with the score, note, latency, completion token count,
        tokens per second and the first 500 characters of the reply.

    Raises:
        httpx.HTTPStatusError: The server answered with an error status.
    """
    payload: dict[str, Any] = {
        "model": model,
        "messages": [
            {"role": "system", "content": case.system_prompt},
            {"role": "user", "content": case.user_prompt},
        ],
        "temperature": 0.0,
        "max_tokens": case.max_tokens,
    }
    if case.response_format:
        payload["response_format"] = case.response_format

    # The placeholder prints of the old stub (llm_calls=0, tool_calls=0, ...)
    # are gone: they referenced the removed TASKS list and reported a runtime
    # measured before any request had been sent.
    started = time.perf_counter()
    async with httpx.AsyncClient(timeout=timeout, trust_env=False) as client:
        response = await client.post(f"{base_url}/chat/completions", json=payload)
    elapsed = time.perf_counter() - started
    response.raise_for_status()
    raw = response.json()

    # Treat a missing or empty "choices" list and a null "message" as an empty
    # reply, so the case is scored normally instead of raising.
    choices = raw.get("choices") or [{}]
    message = choices[0].get("message") or {}
    content = message.get("content") or ""
    score, note = score_case(case, content)

    usage = raw.get("usage") or {}
    # Without a reported token count, fall back to a whitespace word count.
    completion_tokens = usage.get("completion_tokens") or max(1, len(content.split()))
    return {
        "role": case.role,
        "case": case.name,
        "score": score,
        "note": note,
        "elapsed_seconds": round(elapsed, 3),
        "completion_tokens": completion_tokens,
        "tokens_per_second": round(completion_tokens / max(elapsed, 0.001), 2),
        "content_preview": content[:500],
    }
async def run_model(label: str, base_url: str, model: str, timeout: float) -> dict[str, Any]:
    """Run every case in ``CASES`` against one model and aggregate the rows.

    A case that raises is recorded as a zero-score row whose latency equals
    ``timeout``, so one failure does not stop the remaining cases.

    Returns:
        A dict with the model label, mean quality, mean latency, mean
        tokens per second and the per-case rows.
    """
    rows: list[dict[str, Any]] = []
    for case in CASES:
        try:
            row = await run_case(base_url, model, case, timeout)
        except Exception as exc:
            row = {
                "role": case.role,
                "case": case.name,
                "score": 0.0,
                "note": f"error: {exc}",
                "elapsed_seconds": timeout,
                "completion_tokens": 0,
                "tokens_per_second": 0.0,
                "content_preview": "",
            }
        rows.append(row)

    def average(key: str) -> float:
        return mean(row[key] for row in rows)

    return {
        "model": label,
        "quality": round(average("score"), 3),
        "avg_latency_seconds": round(average("elapsed_seconds"), 3),
        "avg_tokens_per_second": round(average("tokens_per_second"), 2),
        "cases": rows,
    }
def _table_cell(value: Any) -> str:
    """Make ``value`` safe for a one-line Markdown table cell."""
    # A raw "|" would start a new column and a line break would end the row.
    return str(value).replace("|", "\\|").replace("\r", " ").replace("\n", " ")


def markdown_report(results: list[dict[str, Any]]) -> str:
    """Render benchmark results as a Markdown document.

    The report has a summary table (one row per model, listing up to three
    failed cases) followed by a per-case table for every model.  Free-text
    cells are escaped so model output or error messages cannot break the
    table layout.

    Args:
        results: Aggregates shaped like the return value of ``run_model``.

    Returns:
        The Markdown text, ending with a newline.
    """
    lines = [
        "# Utility Role Model Benchmark",
        "",
        "Scope: service roles only (`action`, `memory_policy`, `recall`, `summary`, `critic`).",
        "The main user-facing thinker is not evaluated for replacement here.",
        "",
        "| Model | Quality | Avg latency, s | Avg tok/s | Notes |",
        "| --- | ---: | ---: | ---: | --- |",
    ]
    for result in results:
        failed = [case for case in result["cases"] if case["score"] < 1.0]
        note = "all checks passed" if not failed else "; ".join(
            f"{case['role']}/{case['case']}: {case['note']}" for case in failed[:3]
        )
        lines.append(
            f"| {_table_cell(result['model'])} | {result['quality']:.2f} | "
            f"{result['avg_latency_seconds']:.2f} | {result['avg_tokens_per_second']:.2f} | "
            f"{_table_cell(note)} |"
        )
    lines.append("")
    lines.append("## Case Details")
    for result in results:
        lines.append(f"\n### {result['model']}")
        lines.append("| Role | Case | Score | Latency, s | tok/s | Note |")
        lines.append("| --- | --- | ---: | ---: | ---: | --- |")
        for case in result["cases"]:
            lines.append(
                f"| {_table_cell(case['role'])} | {_table_cell(case['case'])} | {case['score']:.2f} | "
                f"{case['elapsed_seconds']:.2f} | {case['tokens_per_second']:.2f} | "
                f"{_table_cell(case['note'])} |"
            )
    return "\n".join(lines) + "\n"
async def main() -> None:
    """Benchmark the running baseline server and each CPU candidate, then write reports.

    The baseline at ``--baseline-url`` must already be up.  Unless
    ``--skip-cpu`` is given, each model in ``--models`` is served by a
    temporary local llama-server that is stopped again after its run.
    Results are written as timestamped JSON and Markdown files in
    ``RESULTS_DIR`` and the Markdown report is printed.
    """
    args = parse_args()
    RESULTS_DIR.mkdir(parents=True, exist_ok=True)
    results: list[dict[str, Any]] = []

    print("Checking Qwen GPU baseline at", args.baseline_url)
    await wait_ready(args.baseline_url, args.timeout)
    results.append(await run_model("Qwen3.6-35B nonMTP GPU baseline", args.baseline_url, "local-main", args.timeout))

    if not args.skip_cpu:
        bench_url = f"http://127.0.0.1:{args.port}/v1"
        for raw_model in args.models:
            model_path = Path(raw_model).resolve()
            label = f"{model_path.stem} CPU"
            print("Starting", label)
            process = start_cpu_server(model_path, args.port, args.threads, args.ctx_size)
            try:
                await wait_ready(bench_url, args.timeout)
                results.append(await run_model(label, bench_url, "bench-cpu", args.timeout))
            finally:
                # Always free the port before the next candidate starts.
                stop_process(process)

    timestamp = time.strftime("%Y%m%d_%H%M%S")
    json_path = RESULTS_DIR / f"utility_model_bench_{timestamp}.json"
    md_path = RESULTS_DIR / f"utility_model_bench_{timestamp}.md"
    report = markdown_report(results)
    # ensure_ascii=False keeps non-ASCII text verbatim, so the files must be
    # written as UTF-8 explicitly rather than in the locale's encoding.
    json_path.write_text(json.dumps(results, ensure_ascii=False, indent=2), encoding="utf-8")
    md_path.write_text(report, encoding="utf-8")
    print(report)
    print(f"Wrote {json_path}")
    print(f"Wrote {md_path}")
if __name__ == "__main__":

262
scripts/duck-mtp.sh Executable file
View File

@ -0,0 +1,262 @@
#!/usr/bin/env bash
# Manage the local DuckLM stack (MTP llama-server + DuckLM API) as background processes.
set -euo pipefail
# Repository root: the parent of the directory that holds this script.
ROOT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)"
# Variables whose values from the caller's environment must win over .env.
ENV_KEYS=(
DUCK_LLAMA_SERVER_BIN
DUCK_MTP_MODEL_PATH
DUCK_MAIN_PORT
DUCK_CTX_SIZE
DUCK_N_GPU_LAYERS
DUCK_LLAMA_DEVICE
DUCK_PARALLEL
DUCK_LLAMA_PID_FILE
DUCK_LLAMA_LOG_FILE
DUCK_MTP_FLAGS
DUCK_HOST
DUCK_API_HOST
DUCK_API_PORT
DUCK_API_PID_FILE
DUCK_API_LOG_FILE
DUCK_API_COMMAND
)
# Remember every listed variable that is already set before .env is sourced.
declare -A ENV_OVERRIDES=()
for key in "${ENV_KEYS[@]}"; do
if [[ -v "${key}" ]]; then
ENV_OVERRIDES["${key}"]="${!key}"
fi
done
# Load .env with auto-export enabled so its assignments become environment variables.
if [[ -f "${ROOT_DIR}/.env" ]]; then
set -a
# shellcheck disable=SC1091
source "${ROOT_DIR}/.env"
set +a
fi
# Re-apply the remembered values so they take precedence over .env.
for key in "${!ENV_OVERRIDES[@]}"; do
export "${key}=${ENV_OVERRIDES[${key}]}"
done
# First CLI argument selects the command; "start" when omitted.
ACTION="${1:-start}"
API_PID_FILE="${DUCK_API_PID_FILE:-${ROOT_DIR}/data/duck-api.pid}"
API_LOG_FILE="${DUCK_API_LOG_FILE:-${ROOT_DIR}/data/duck-api.log}"
API_URL="http://${DUCK_API_HOST:-127.0.0.1}:${DUCK_API_PORT:-8000}"
# llama-server lifecycle is delegated to this script.
LLAMA_SCRIPT="${ROOT_DIR}/scripts/llama/start_mtp_main.sh"
# Print the command and environment reference to stdout.
# The heredoc delimiter is quoted, so the text is emitted verbatim with no expansion.
usage() {
cat <<'EOF'
Usage: scripts/duck-mtp.sh <command>
Commands:
start Start MTP llama-server and DuckLM API in the background
stop Stop DuckLM API and managed MTP llama-server
restart Stop and start the whole local DuckLM stack
status Print process and HTTP health status
logs Show DuckLM API and llama-server logs; use --follow/-f and --lines N
help Show this help
Environment:
DUCK_API_HOST API bind host, default 127.0.0.1
DUCK_API_PORT API port, default 8000
DUCK_API_PID_FILE API PID file path
DUCK_API_LOG_FILE API log file path
DUCK_API_COMMAND API command override, default ".venv/bin/python -m duck_core.api"
MTP llama-server environment is handled by scripts/llama/start_mtp_main.sh.
EOF
}
# Succeed only when the PID file exists, holds nothing but digits, and the
# process with that PID accepts signal 0.
api_is_running() {
  local recorded_pid
  if [[ ! -f "${API_PID_FILE}" ]]; then
    return 1
  fi
  recorded_pid="$(<"${API_PID_FILE}")"
  if [[ ! "${recorded_pid}" =~ ^[0-9]+$ ]]; then
    return 1
  fi
  kill -0 "${recorded_pid}" 2>/dev/null
}
# Print the contents of the PID file; print nothing (and still succeed) when it is absent.
api_pid_value() {
  [[ ! -f "${API_PID_FILE}" ]] || cat "${API_PID_FILE}"
}
# Start the DuckLM API in the background and record its PID.
# Returns 0 when the API is already running or was started (even if /health
# has not answered yet) and 1 when the process exits during startup.
start_api() {
  if api_is_running; then
    echo "DuckLM API already running: pid=$(api_pid_value)"
    return 0
  fi

  mkdir -p "$(dirname "${API_PID_FILE}")" "$(dirname "${API_LOG_FILE}")"
  rm -f "${API_PID_FILE}"

  local command_string="${DUCK_API_COMMAND:-.venv/bin/python -m duck_core.api}"
  # setsid is used when available; otherwise plain nohup is the fallback.
  local -a launcher=(nohup)
  if command -v setsid >/dev/null 2>&1; then
    launcher+=(setsid)
  fi

  echo "Starting DuckLM API..."
  echo "Command: ${command_string}" >> "${API_LOG_FILE}"
  (
    cd "${ROOT_DIR}"
    "${launcher[@]}" bash -lc "${command_string}" >> "${API_LOG_FILE}" 2>&1 < /dev/null &
    echo "$!" > "${API_PID_FILE}"
  )

  sleep 0.2
  if ! api_is_running; then
    echo "DuckLM API failed to start. See ${API_LOG_FILE}" >&2
    rm -f "${API_PID_FILE}"
    return 1
  fi

  echo "DuckLM API started: pid=$(api_pid_value)"
  echo "WebChat: ${API_URL}/"
  echo "Log: ${API_LOG_FILE}"

  local have_curl=0
  if command -v curl >/dev/null 2>&1; then
    have_curl=1
  fi

  for _ in {1..20}; do
    # A process that survives the first 0.2 s can still die while loading;
    # report that as a failure instead of "not ready yet".
    if ! api_is_running; then
      echo "DuckLM API failed to start. See ${API_LOG_FILE}" >&2
      rm -f "${API_PID_FILE}"
      return 1
    fi
    if [[ "${have_curl}" == "1" ]] && curl --noproxy "*" -fsS "${API_URL}/health" >/dev/null 2>&1; then
      echo "DuckLM API health: ok (${API_URL})"
      return 0
    fi
    sleep 0.25
  done

  echo "DuckLM API health: not ready yet (${API_URL})"
  return 0
}
# Stop the DuckLM API: SIGTERM first, then SIGKILL if it is still alive after
# 30 checks spaced 0.2 s apart. The PID file is removed in every case.
stop_api() {
  if ! api_is_running; then
    rm -f "${API_PID_FILE}"
    echo "DuckLM API not running"
    return 0
  fi

  local pid
  pid="$(api_pid_value)"
  echo "Stopping DuckLM API: pid=${pid}"
  kill "${pid}" 2>/dev/null || true

  local checks_left=30
  while [[ "${checks_left}" -gt 0 ]]; do
    if ! kill -0 "${pid}" 2>/dev/null; then
      rm -f "${API_PID_FILE}"
      echo "DuckLM API stopped"
      return 0
    fi
    sleep 0.2
    checks_left=$(( checks_left - 1 ))
  done

  echo "DuckLM API did not stop after SIGTERM; sending SIGKILL"
  kill -9 "${pid}" 2>/dev/null || true
  rm -f "${API_PID_FILE}"
  echo "DuckLM API stopped"
}
# Bring up llama-server first, then the API, and finish with a status summary.
start_stack() {
  "${LLAMA_SCRIPT}" start
  start_api
  printf '\nStatus:\n'
  status_stack
}
# Shut the stack down in reverse start order: the API first, then llama-server.
stop_stack() {
stop_api
"${LLAMA_SCRIPT}" stop
}
# Report API and llama-server status.
# Exit status: 3 when the API is not running; otherwise whatever the
# llama-server status command returned.
status_stack() {
  local api_rc=0
  local llama_rc=0

  if ! api_is_running; then
    api_rc=3
    if [[ -f "${API_PID_FILE}" ]]; then
      echo "DuckLM API not running; removing stale pid file ${API_PID_FILE}"
      rm -f "${API_PID_FILE}"
    else
      echo "DuckLM API not running"
    fi
  else
    local pid
    pid="$(api_pid_value)"
    echo "DuckLM API running: pid=${pid}"
    if command -v curl >/dev/null 2>&1 && curl --noproxy "*" -fsS "${API_URL}/health" >/dev/null 2>&1; then
      echo "DuckLM API health: ok (${API_URL})"
    else
      echo "DuckLM API health: not ready (${API_URL})"
    fi
  fi

  "${LLAMA_SCRIPT}" status || llama_rc=$?

  # An API failure takes precedence over the llama-server result.
  if [[ "${api_rc}" == "0" ]]; then
    return "${llama_rc}"
  fi
  return "${api_rc}"
}
# Show the API and llama-server logs.
# Arguments: the command word itself (dropped), then any of
#   -f | --follow   keep following both log files
#   --lines N       number of trailing lines to show (default 100)
# Returns 2 on an unknown argument.
logs_stack() {
  local follow=0
  local lines=100
  shift || true

  while (( $# > 0 )); do
    case "$1" in
      -f|--follow)
        follow=1
        shift
        ;;
      --lines)
        lines="${2:?--lines requires a value}"
        shift 2
        ;;
      *)
        echo "Unknown logs argument: $1" >&2
        return 2
        ;;
    esac
  done

  mkdir -p "$(dirname "${API_LOG_FILE}")"
  touch "${API_LOG_FILE}"

  if [[ "${follow}" != "1" ]]; then
    echo "==> DuckLM API log: ${API_LOG_FILE} <=="
    tail -n "${lines}" "${API_LOG_FILE}"
    echo
    echo "==> llama-server log: ${DUCK_LLAMA_LOG_FILE:-${ROOT_DIR}/data/llama-mtp.log} <=="
    "${LLAMA_SCRIPT}" logs --lines "${lines}"
  else
    # Following needs both files to exist, so create the llama log if missing.
    local llama_log_file="${DUCK_LLAMA_LOG_FILE:-${ROOT_DIR}/data/llama-mtp.log}"
    mkdir -p "$(dirname "${llama_log_file}")"
    touch "${llama_log_file}"
    tail -n "${lines}" -f "${API_LOG_FILE}" "${llama_log_file}"
  fi
}
# Dispatch on the requested command; anything unrecognised prints usage to
# stderr and exits with status 2.
case "${ACTION}" in
start)
start_stack
;;
stop)
stop_stack
;;
restart)
stop_stack
start_stack
;;
status)
status_stack
;;
logs)
# Pass all CLI arguments through; logs_stack drops the leading command word itself.
logs_stack "$@"
;;
help|-h|--help)
usage
;;
*)
echo "Unknown command: ${ACTION}" >&2
usage >&2
exit 2
;;
esac

262
scripts/duck.sh Executable file
View File

@ -0,0 +1,262 @@
#!/usr/bin/env bash
# Manage the local DuckLM stack (llama-server + DuckLM API) as background processes.
set -euo pipefail
# Repository root: the parent of the directory that holds this script.
ROOT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)"
# Variables whose values from the caller's environment must win over .env.
ENV_KEYS=(
DUCK_LLAMA_SERVER_BIN
DUCK_MAIN_MODEL_PATH
DUCK_MAIN_PORT
DUCK_CTX_SIZE
DUCK_N_GPU_LAYERS
DUCK_LLAMA_DEVICE
DUCK_PARALLEL
DUCK_LLAMA_PID_FILE
DUCK_LLAMA_LOG_FILE
DUCK_LLAMA_EXTRA_ARGS
DUCK_HOST
DUCK_API_HOST
DUCK_API_PORT
DUCK_API_PID_FILE
DUCK_API_LOG_FILE
DUCK_API_COMMAND
)
# Remember every listed variable that is already set before .env is sourced.
declare -A ENV_OVERRIDES=()
for key in "${ENV_KEYS[@]}"; do
if [[ -v "${key}" ]]; then
ENV_OVERRIDES["${key}"]="${!key}"
fi
done
# Load .env with auto-export enabled so its assignments become environment variables.
if [[ -f "${ROOT_DIR}/.env" ]]; then
set -a
# shellcheck disable=SC1091
source "${ROOT_DIR}/.env"
set +a
fi
# Re-apply the remembered values so they take precedence over .env.
for key in "${!ENV_OVERRIDES[@]}"; do
export "${key}=${ENV_OVERRIDES[${key}]}"
done
# First CLI argument selects the command; "start" when omitted.
ACTION="${1:-start}"
API_PID_FILE="${DUCK_API_PID_FILE:-${ROOT_DIR}/data/duck-api.pid}"
API_LOG_FILE="${DUCK_API_LOG_FILE:-${ROOT_DIR}/data/duck-api.log}"
API_URL="http://${DUCK_API_HOST:-127.0.0.1}:${DUCK_API_PORT:-8000}"
# llama-server lifecycle is delegated to this script.
LLAMA_SCRIPT="${ROOT_DIR}/scripts/llama/start_main.sh"
# Print the command and environment reference to stdout.
# The heredoc delimiter is quoted, so the text is emitted verbatim with no
# variable expansion.
usage() {
cat <<'EOF'
Usage: scripts/duck.sh <command>
Commands:
start Start llama-server and DuckLM API in the background
stop Stop DuckLM API and managed llama-server
restart Stop and start the whole local DuckLM stack
status Print process and HTTP health status
logs Show DuckLM API and llama-server logs; use --follow/-f and --lines N
help Show this help
Environment:
DUCK_API_HOST API bind host, default 127.0.0.1
DUCK_API_PORT API port, default 8000
DUCK_API_PID_FILE API PID file path
DUCK_API_LOG_FILE API log file path
DUCK_API_COMMAND API command override, default ".venv/bin/python -m duck_core.api"
llama-server environment is handled by scripts/llama/start_main.sh.
EOF
}
# Succeed only when API_PID_FILE exists, holds a purely numeric PID, and that
# PID can currently be signalled (`kill -0`).
api_is_running() {
  local recorded_pid

  if [[ ! -f "${API_PID_FILE}" ]]; then
    return 1
  fi

  recorded_pid="$(cat "${API_PID_FILE}")"
  if ! [[ "${recorded_pid}" =~ ^[0-9]+$ ]]; then
    return 1
  fi

  # The exit status of the probe is the function's result.
  kill -0 "${recorded_pid}" 2>/dev/null
}
# Print the contents of API_PID_FILE; print nothing (and still succeed) when
# the file does not exist.
api_pid_value() {
  [[ -f "${API_PID_FILE}" ]] || return 0
  cat "${API_PID_FILE}"
}
# Start the DuckLM API in the background and record its PID.
#
# The command is DUCK_API_COMMAND (default ".venv/bin/python -m duck_core.api"),
# run through `bash -lc` from ROOT_DIR, detached with nohup and, when
# available, setsid.
#
# Returns 0 when the API is already running or was started (even if the
# /health endpoint has not answered yet), 1 when the process is gone shortly
# after launch.
start_api() {
  if api_is_running; then
    echo "DuckLM API already running: pid=$(api_pid_value)"
    return 0
  fi

  mkdir -p "$(dirname "${API_PID_FILE}")" "$(dirname "${API_LOG_FILE}")"
  rm -f "${API_PID_FILE}"

  local command_string="${DUCK_API_COMMAND:-.venv/bin/python -m duck_core.api}"

  # nohup is always used; setsid is added only where it exists.
  local launcher=(nohup)
  if command -v setsid >/dev/null 2>&1; then
    launcher+=(setsid)
  fi

  echo "Starting DuckLM API..."
  echo "Command: ${command_string}" >> "${API_LOG_FILE}"

  # The redirections and the PID file are opened from the caller's working
  # directory, *before* the child changes into ROOT_DIR. That keeps relative
  # DUCK_API_PID_FILE / DUCK_API_LOG_FILE values pointing at the same files
  # that the mkdir above and api_is_running/stop_api use. `exec` makes the
  # launched command take over the background subshell, so $! is its PID.
  (
    cd "${ROOT_DIR}"
    exec "${launcher[@]}" bash -lc "${command_string}"
  ) >> "${API_LOG_FILE}" 2>&1 < /dev/null &
  echo "$!" > "${API_PID_FILE}"

  # Give the process a moment to fail fast (bad command, import error, ...).
  sleep 0.2
  if ! api_is_running; then
    echo "DuckLM API failed to start. See ${API_LOG_FILE}" >&2
    rm -f "${API_PID_FILE}"
    return 1
  fi

  echo "DuckLM API started: pid=$(api_pid_value)"
  echo "WebChat: ${API_URL}/"
  echo "Log: ${API_LOG_FILE}"

  # Poll /health for up to ~5 s. Without curl there is nothing to poll, so
  # skip the wait instead of sleeping through twenty no-op iterations.
  if command -v curl >/dev/null 2>&1; then
    local attempt
    for attempt in {1..20}; do
      if curl --noproxy "*" -fsS "${API_URL}/health" >/dev/null 2>&1; then
        echo "DuckLM API health: ok (${API_URL})"
        return 0
      fi
      sleep 0.25
    done
  fi

  echo "DuckLM API health: not ready yet (${API_URL})"
  return 0
}
# Stop the DuckLM API recorded in API_PID_FILE.
#
# Sends SIGTERM, polls for up to 30 x 0.2 s, then falls back to SIGKILL.
# The PID file is removed on every path.
stop_api() {
  if ! api_is_running; then
    # Also clears a stale PID file left behind by a dead process.
    rm -f "${API_PID_FILE}"
    echo "DuckLM API not running"
    return 0
  fi

  local target_pid
  target_pid="$(api_pid_value)"
  echo "Stopping DuckLM API: pid=${target_pid}"
  kill "${target_pid}" 2>/dev/null || true

  local remaining=30
  while (( remaining > 0 )); do
    if ! kill -0 "${target_pid}" 2>/dev/null; then
      rm -f "${API_PID_FILE}"
      echo "DuckLM API stopped"
      return 0
    fi
    sleep 0.2
    remaining=$(( remaining - 1 ))
  done

  echo "DuckLM API did not stop after SIGTERM; sending SIGKILL"
  kill -9 "${target_pid}" 2>/dev/null || true
  rm -f "${API_PID_FILE}"
  echo "DuckLM API stopped"
}
# Bring up the whole stack: llama-server first, then the API, then print a
# status summary. The function's exit status is that of the status summary.
start_stack() {
  "${LLAMA_SCRIPT}" start
  start_api
  printf '\n%s\n' "Status:"
  status_stack
}
# Tear down the stack in reverse start order: the API first, then the
# llama-server managed by LLAMA_SCRIPT.
stop_stack() {
stop_api
"${LLAMA_SCRIPT}" stop
}
# Report the state of the API and of llama-server.
#
# Exit status: 3 when the API is not running; otherwise the exit status of
# `LLAMA_SCRIPT status` (0 when both are up). A stale API PID file is removed
# as a side effect.
status_stack() {
  local exit_code=0
  local llama_exit=0

  if api_is_running; then
    local api_pid
    api_pid="$(api_pid_value)"
    echo "DuckLM API running: pid=${api_pid}"
    if command -v curl >/dev/null 2>&1 && curl --noproxy "*" -fsS "${API_URL}/health" >/dev/null 2>&1; then
      echo "DuckLM API health: ok (${API_URL})"
    else
      echo "DuckLM API health: not ready (${API_URL})"
    fi
  elif [[ -f "${API_PID_FILE}" ]]; then
    echo "DuckLM API not running; removing stale pid file ${API_PID_FILE}"
    rm -f "${API_PID_FILE}"
    exit_code=3
  else
    echo "DuckLM API not running"
    exit_code=3
  fi

  # Always query llama-server too; `||` keeps `set -e` from aborting here.
  "${LLAMA_SCRIPT}" status || llama_exit=$?

  # An API failure takes precedence over a llama-server failure.
  if [[ "${exit_code}" == "0" && "${llama_exit}" != "0" ]]; then
    exit_code="${llama_exit}"
  fi
  return "${exit_code}"
}
# Show the DuckLM API log together with the llama-server log.
#
# Arguments: the script's full argument list; the leading action word ("logs",
# forwarded by the dispatcher) is discarded.
#   -f, --follow   stream both log files through a single `tail -f`
#   --lines N      number of trailing lines to print (default 100)
# Returns 2 on an unrecognised argument.
logs_stack() {
  shift || true   # tolerate a call without any argument

  local follow=0 lines=100
  while (( $# > 0 )); do
    case "$1" in
      -f|--follow)
        follow=1
        shift
        ;;
      --lines)
        lines="${2:?--lines requires a value}"
        shift 2
        ;;
      *)
        echo "Unknown logs argument: $1" >&2
        return 2
        ;;
    esac
  done

  local llama_log_file="${DUCK_LLAMA_LOG_FILE:-${ROOT_DIR}/data/llama-main.log}"

  # Create the API log up front so the `tail` calls below have a file to read.
  mkdir -p "$(dirname "${API_LOG_FILE}")"
  touch "${API_LOG_FILE}"

  if [[ "${follow}" != "1" ]]; then
    echo "==> DuckLM API log: ${API_LOG_FILE} <=="
    tail -n "${lines}" "${API_LOG_FILE}"
    echo
    echo "==> llama-server log: ${llama_log_file} <=="
    # The llama script prints its own log.
    "${LLAMA_SCRIPT}" logs --lines "${lines}"
  else
    mkdir -p "$(dirname "${llama_log_file}")"
    touch "${llama_log_file}"
    tail -n "${lines}" -f "${API_LOG_FILE}" "${llama_log_file}"
  fi
}
# Entry point: run the handler that matches ACTION (first CLI argument,
# default "start"). "restart" is a stop followed by a start; "logs" receives
# the full argument list so it can parse its own flags. Any other word prints
# the usage text to stderr and exits with status 2.
case "${ACTION}" in
start)
start_stack
;;
stop)
stop_stack
;;
restart)
stop_stack
start_stack
;;
status)
status_stack
;;
logs)
logs_stack "$@"
;;
help|-h|--help)
usage
;;
*)
echo "Unknown command: ${ACTION}" >&2
usage >&2
exit 2
;;
esac

View File

@ -37,6 +37,7 @@ ACTION="${1:-start}"
PID_FILE="${DUCK_LLAMA_PID_FILE:-${ROOT_DIR}/data/llama-main.pid}"
LOG_FILE="${DUCK_LLAMA_LOG_FILE:-${ROOT_DIR}/data/llama-main.log}"
BASE_URL="http://${DUCK_HOST:-127.0.0.1}:${DUCK_MAIN_PORT:-8081}/v1"
LLAMA_BIN_DIR=""
resolve_project_path() {
local value="$1"
@ -125,6 +126,7 @@ start() {
llama_bin="${DUCK_LLAMA_SERVER_BIN:-llama-server}"
if [[ "${llama_bin}" == */* ]]; then
llama_bin="$(resolve_project_path "${llama_bin}")"
LLAMA_BIN_DIR="$(dirname "${llama_bin}")"
fi
model_path="$(resolve_project_path "${DUCK_MAIN_MODEL_PATH}")"
local command=(
@ -152,9 +154,9 @@ start() {
echo "Starting llama-server..."
echo "Command: ${command[*]}" >> "${LOG_FILE}"
if command -v setsid >/dev/null 2>&1; then
nohup setsid "${command[@]}" >> "${LOG_FILE}" 2>&1 &
nohup setsid env LD_LIBRARY_PATH="${LLAMA_BIN_DIR}${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}" "${command[@]}" >> "${LOG_FILE}" 2>&1 &
else
nohup "${command[@]}" >> "${LOG_FILE}" 2>&1 &
nohup env LD_LIBRARY_PATH="${LLAMA_BIN_DIR}${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}" "${command[@]}" >> "${LOG_FILE}" 2>&1 &
fi
local pid=$!
echo "${pid}" > "${PID_FILE}"

271
scripts/llama/start_mtp_main.sh Executable file
View File

@ -0,0 +1,271 @@
#!/usr/bin/env bash
# Lifecycle manager for a llama-server instance started with
# `--spec-type draft-mtp`.
set -euo pipefail
# Repository root: two levels above the directory that holds this script.
ROOT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")/../.." && pwd)"
# Variables whose caller-supplied values must win over anything set in .env.
ENV_KEYS=(
DUCK_LLAMA_SERVER_BIN
DUCK_MTP_MODEL_PATH
DUCK_MAIN_PORT
DUCK_CTX_SIZE
DUCK_N_GPU_LAYERS
DUCK_LLAMA_DEVICE
DUCK_PARALLEL
DUCK_LLAMA_PID_FILE
DUCK_LLAMA_LOG_FILE
DUCK_MTP_FLAGS
DUCK_HOST
)
# Snapshot every listed variable that is already set (even if empty) before
# .env is sourced.
declare -A ENV_OVERRIDES=()
for key in "${ENV_KEYS[@]}"; do
if [[ -v "${key}" ]]; then
ENV_OVERRIDES["${key}"]="${!key}"
fi
done
# Load .env with auto-export enabled so its assignments reach child processes.
if [[ -f "${ROOT_DIR}/.env" ]]; then
set -a
# shellcheck disable=SC1091
source "${ROOT_DIR}/.env"
set +a
fi
# Re-apply the snapshot so the caller's environment overrides .env.
for key in "${!ENV_OVERRIDES[@]}"; do
export "${key}=${ENV_OVERRIDES[${key}]}"
done
# First positional argument selects the action; it defaults to "start".
ACTION="${1:-start}"
PID_FILE="${DUCK_LLAMA_PID_FILE:-${ROOT_DIR}/data/llama-mtp.pid}"
LOG_FILE="${DUCK_LLAMA_LOG_FILE:-${ROOT_DIR}/data/llama-mtp.log}"
# Base URL probed by `status` (it requests "${BASE_URL}/models").
BASE_URL="http://${DUCK_HOST:-127.0.0.1}:${DUCK_MAIN_PORT:-8081}/v1"
# Set by start() to the binary's directory when DUCK_LLAMA_SERVER_BIN contains
# a slash; used there as an LD_LIBRARY_PATH entry.
LLAMA_BIN_DIR=""
# Print $1 as an absolute path: absolute inputs are echoed unchanged, anything
# else is anchored at ROOT_DIR (a single leading "./" is dropped first).
resolve_project_path() {
  local candidate="$1"

  case "${candidate}" in
    /*)
      ;;
    *)
      candidate="${ROOT_DIR}/${candidate#./}"
      ;;
  esac

  printf '%s\n' "${candidate}"
}
# Print the command and environment reference to stdout.
# The heredoc delimiter is quoted, so the text is emitted verbatim with no
# variable expansion.
usage() {
cat <<'EOF'
Usage: scripts/llama/start_mtp_main.sh <command>
Commands:
start Start MTP llama-server in the background
stop Stop the managed MTP llama-server process
restart Stop and start MTP llama-server
status Print process and HTTP health status
logs Show logs; use --follow/-f and --lines N
help Show this help
Environment:
DUCK_LLAMA_SERVER_BIN Path to llama-server binary
DUCK_MTP_MODEL_PATH Path to MTP GGUF model
DUCK_HOST Bind host, default 127.0.0.1
DUCK_MAIN_PORT Port, default 8081
DUCK_CTX_SIZE Context size, default 65536
DUCK_N_GPU_LAYERS GPU layers, default auto
DUCK_LLAMA_DEVICE Device name, for example Vulkan0
DUCK_PARALLEL Server slots, default 1
DUCK_LLAMA_PID_FILE PID file path
DUCK_LLAMA_LOG_FILE Log file path
DUCK_MTP_FLAGS Extra MTP llama-server args
EOF
}
# Succeed only when PID_FILE exists, holds a purely numeric PID, and that PID
# can currently be signalled (`kill -0`).
is_running() {
  local recorded_pid

  if [[ ! -f "${PID_FILE}" ]]; then
    return 1
  fi

  recorded_pid="$(cat "${PID_FILE}")"
  if ! [[ "${recorded_pid}" =~ ^[0-9]+$ ]]; then
    return 1
  fi

  # The exit status of the probe is the function's result.
  kill -0 "${recorded_pid}" 2>/dev/null
}
# Print the contents of PID_FILE; print nothing (and still succeed) when the
# file does not exist.
pid_value() {
  [[ -f "${PID_FILE}" ]] || return 0
  cat "${PID_FILE}"
}
# Report whether the managed llama-server is alive.
#
# Returns 0 when the recorded process is running (the HTTP probe result is
# only printed, it does not affect the status) and 3 otherwise. A stale PID
# file is removed as a side effect.
status() {
  if ! is_running; then
    if [[ ! -f "${PID_FILE}" ]]; then
      echo "llama-server not running"
    else
      echo "llama-server not running; removing stale pid file ${PID_FILE}"
      rm -f "${PID_FILE}"
    fi
    return 3
  fi

  local server_pid
  server_pid="$(pid_value)"
  echo "llama-server running: pid=${server_pid}"

  # The probe needs curl; without it the server is reported as "not ready".
  if command -v curl >/dev/null 2>&1 && curl --noproxy "*" -fsS "${BASE_URL}/models" >/dev/null 2>&1; then
    echo "HTTP health: ok (${BASE_URL})"
  else
    echo "HTTP health: not ready (${BASE_URL})"
  fi
  return 0
}
# Start llama-server with `--spec-type draft-mtp` in the background.
#
# Requires DUCK_MTP_MODEL_PATH. The binary comes from DUCK_LLAMA_SERVER_BIN
# (default: "llama-server" looked up on PATH); when that value contains a
# slash it is resolved against the project root and its directory is put at
# the front of LD_LIBRARY_PATH so shared libraries next to the binary load.
# Extra whitespace-separated arguments may be passed via DUCK_MTP_FLAGS.
#
# Side effects: sets the global LLAMA_BIN_DIR, writes PID_FILE, appends to
# LOG_FILE.
# Returns 0 when the server is already running or was started, 1 when the
# binary is missing, lacks draft-mtp support, or exits right after launch.
start() {
  if is_running; then
    echo "MTP llama-server already running: pid=$(pid_value)"
    return 0
  fi

  : "${DUCK_MTP_MODEL_PATH:?DUCK_MTP_MODEL_PATH is required}"
  mkdir -p "$(dirname "${PID_FILE}")" "$(dirname "${LOG_FILE}")"
  rm -f "${PID_FILE}"

  local llama_bin mtp_model_path
  llama_bin="${DUCK_LLAMA_SERVER_BIN:-llama-server}"
  if [[ "${llama_bin}" == */* ]]; then
    llama_bin="$(resolve_project_path "${llama_bin}")"
    LLAMA_BIN_DIR="$(dirname "${llama_bin}")"
  fi
  mtp_model_path="$(resolve_project_path "${DUCK_MTP_MODEL_PATH}")"

  # Fail with an accurate message instead of letting the --help probe below
  # misreport a missing binary as "no draft-mtp support".
  if ! command -v "${llama_bin}" >/dev/null 2>&1; then
    echo "llama-server binary not found or not executable: ${llama_bin}" >&2
    return 1
  fi

  # Build the library search path without ever producing an empty entry:
  # the dynamic loader treats a zero-length LD_LIBRARY_PATH element as the
  # current working directory, which a bare ":<old value>" would introduce
  # whenever the binary is taken from PATH (LLAMA_BIN_DIR empty).
  local lib_path="${LD_LIBRARY_PATH:-}"
  if [[ -n "${LLAMA_BIN_DIR}" ]]; then
    lib_path="${LLAMA_BIN_DIR}${lib_path:+:${lib_path}}"
  fi
  local run_env=(env)
  if [[ -n "${lib_path}" ]]; then
    run_env+=("LD_LIBRARY_PATH=${lib_path}")
  fi

  # Refuse to start on builds whose --help output does not mention draft-mtp.
  local help_text
  help_text="$("${run_env[@]}" "${llama_bin}" --help 2>&1 || true)"
  if ! grep -qi "draft-mtp" <<< "${help_text}"; then
    echo "This llama-server build does not expose draft-mtp speculative decoding." >&2
    return 1
  fi

  local command=(
    "${llama_bin}"
    -m "${mtp_model_path}"
    --alias local-main
    --host "${DUCK_HOST:-127.0.0.1}"
    --port "${DUCK_MAIN_PORT:-8081}"
    -c "${DUCK_CTX_SIZE:-65536}"
    --parallel "${DUCK_PARALLEL:-1}"
    -ngl "${DUCK_N_GPU_LAYERS:-auto}"
    --flash-attn on
    --cache-prompt
    --metrics
    --spec-type draft-mtp
  )
  if [[ -n "${DUCK_LLAMA_DEVICE:-}" ]]; then
    command+=(--device "${DUCK_LLAMA_DEVICE}")
  fi
  if [[ -n "${DUCK_MTP_FLAGS:-}" ]]; then
    # Split on whitespace only. `read -a` performs no pathname expansion, so
    # a flag value containing * or ? is passed through literally. `-d ''`
    # reads across newlines and reports EOF as failure, hence `|| true`.
    local extra_args=()
    read -r -d '' -a extra_args <<< "${DUCK_MTP_FLAGS}" || true
    if (( ${#extra_args[@]} > 0 )); then
      command+=("${extra_args[@]}")
    fi
  fi

  # nohup is always used; setsid is added only where it exists.
  local launcher=(nohup)
  if command -v setsid >/dev/null 2>&1; then
    launcher+=(setsid)
  fi

  echo "Starting MTP llama-server..."
  echo "Command: ${command[*]}" >> "${LOG_FILE}"
  "${launcher[@]}" "${run_env[@]}" "${command[@]}" >> "${LOG_FILE}" 2>&1 &
  local pid=$!
  echo "${pid}" > "${PID_FILE}"

  # Give the process a moment to fail fast (bad flags, missing model, ...).
  sleep 0.2
  if is_running; then
    echo "MTP llama-server started: pid=${pid}"
    echo "Log: ${LOG_FILE}"
    return 0
  fi

  echo "MTP llama-server failed to start. See ${LOG_FILE}" >&2
  rm -f "${PID_FILE}"
  return 1
}
# Stop the llama-server recorded in PID_FILE.
#
# Sends SIGTERM, polls for up to 30 x 0.2 s, then falls back to SIGKILL.
# The PID file is removed on every path.
stop() {
  if ! is_running; then
    # Also clears a stale PID file left behind by a dead process.
    rm -f "${PID_FILE}"
    echo "llama-server not running"
    return 0
  fi

  local target_pid
  target_pid="$(pid_value)"
  echo "Stopping MTP llama-server: pid=${target_pid}"
  kill "${target_pid}" 2>/dev/null || true

  local remaining=30
  while (( remaining > 0 )); do
    if ! kill -0 "${target_pid}" 2>/dev/null; then
      rm -f "${PID_FILE}"
      echo "MTP llama-server stopped"
      return 0
    fi
    sleep 0.2
    remaining=$(( remaining - 1 ))
  done

  echo "MTP llama-server did not stop after SIGTERM; sending SIGKILL"
  kill -9 "${target_pid}" 2>/dev/null || true
  rm -f "${PID_FILE}"
  echo "MTP llama-server stopped"
}
# Stop the managed server (a no-op when it is not running), then start it
# again.
restart() {
stop
start
}
# Print or follow LOG_FILE.
#
# Arguments: the script's full argument list; the leading action word ("logs",
# forwarded by the dispatcher) is discarded.
#   -f, --follow   keep streaming with `tail -f`
#   --lines N      number of trailing lines to print (default 100)
# Returns 2 on an unrecognised argument.
logs() {
  shift || true   # tolerate a call without any argument

  local follow=0 lines=100
  while (( $# > 0 )); do
    case "$1" in
      -f|--follow)
        follow=1
        shift
        ;;
      --lines)
        lines="${2:?--lines requires a value}"
        shift 2
        ;;
      *)
        echo "Unknown logs argument: $1" >&2
        return 2
        ;;
    esac
  done

  # Create the log up front so `tail` has a file to read.
  mkdir -p "$(dirname "${LOG_FILE}")"
  touch "${LOG_FILE}"

  if [[ "${follow}" != "1" ]]; then
    tail -n "${lines}" "${LOG_FILE}"
  else
    tail -n "${lines}" -f "${LOG_FILE}"
  fi
}
# Entry point: run the handler that matches ACTION (first CLI argument,
# default "start"). "logs" receives the full argument list so it can parse its
# own flags. Any other word prints the usage text to stderr and exits with
# status 2.
case "${ACTION}" in
start)
start
;;
stop)
stop
;;
restart)
restart
;;
status)
status
;;
logs)
logs "$@"
;;
help|-h|--help)
usage
;;
*)
echo "Unknown command: ${ACTION}" >&2
usage >&2
exit 2
;;
esac

View File

@ -56,6 +56,61 @@ def test_stream_chat_endpoint_emits_sse_reasoning_and_content(tmp_path, monkeypa
assert "answer" in body
def test_stream_chat_forwards_reasoning_toggle_to_thinker(tmp_path, monkeypatch):
    """The stream endpoint passes the request's ``reasoning`` value to the thinker stream call."""
    monkeypatch.setenv("DUCK_DB_PATH", str(tmp_path / "duck.sqlite3"))
    captured = {}

    # Canned directive returned by the non-streaming model call.
    directive = {
        "kind": "action_directive",
        "intent": "answer directly",
        "risk_level": "none",
        "actions": [],
    }

    async def fake_chat(
        self,
        role,
        messages,
        temperature=None,
        max_output_tokens=None,
        response_format=None,
    ):
        return ModelResponse(
            role=role,
            model="local-main",
            content=json.dumps(directive),
            reasoning_content=None,
            raw={},
            latency_ms=1.0,
        )

    async def fake_stream_chat(
        self,
        role,
        messages,
        temperature=None,
        max_output_tokens=None,
        response_format=None,
        reasoning=None,
    ):
        # Record what the endpoint forwarded, then emit one content chunk.
        captured.update(role=role, reasoning=reasoning)
        yield {"type": "content_delta", "delta": "answer"}

    monkeypatch.setattr("duck_core.model_client.ModelClient.chat", fake_chat)
    monkeypatch.setattr("duck_core.model_client.ModelClient.stream_chat", fake_stream_chat)

    request_body = {
        "message": "hello",
        "workspace": "./workspace",
        "debug": True,
        "reasoning": "off",
    }
    client = TestClient(create_app())
    with client.stream("POST", "/v1/chat/stream", json=request_body) as response:
        chunks = list(response.iter_text())
    body = "".join(chunks)

    assert response.status_code == 200
    assert "event: done" in body
    assert captured == {"role": "thinker", "reasoning": "off"}
def test_stream_chat_runs_memory_policy_and_reflection_after_completion(tmp_path, monkeypatch):
monkeypatch.setenv("DUCK_DB_PATH", str(tmp_path / "duck.sqlite3"))

View File

@ -0,0 +1,246 @@
import os
import subprocess
import textwrap
import time
from pathlib import Path
def test_duck_script_manages_llama_and_api(tmp_path):
    """Drive ``scripts/duck.sh`` through status/start/status/logs/stop with fake binaries.

    ``start`` and every assertion that follows it run inside the ``try`` block
    so the detached fake processes are always stopped, even when an assertion
    fails right after start-up.
    """
    fake_bin = tmp_path / "llama-server"
    fake_bin.write_text(
        textwrap.dedent(
            """\
            #!/usr/bin/env bash
            echo "fake llama-server $*" >&2
            trap 'exit 0' TERM INT
            while true; do sleep 1; done
            """
        )
    )
    fake_bin.chmod(0o755)

    fake_api = tmp_path / "fake-api.sh"
    fake_api.write_text(
        textwrap.dedent(
            """\
            #!/usr/bin/env bash
            echo "fake duck api $*" >&2
            trap 'exit 0' TERM INT
            while true; do sleep 1; done
            """
        )
    )
    fake_api.chmod(0o755)

    pid_file = tmp_path / "llama.pid"
    log_file = tmp_path / "llama.log"
    api_pid_file = tmp_path / "duck-api.pid"
    api_log_file = tmp_path / "duck-api.log"
    model_path = tmp_path / "model.gguf"
    model_path.write_text("fake")

    env = {
        **os.environ,
        "DUCK_LLAMA_SERVER_BIN": str(fake_bin),
        "DUCK_MAIN_MODEL_PATH": str(model_path),
        "DUCK_LLAMA_PID_FILE": str(pid_file),
        "DUCK_LLAMA_LOG_FILE": str(log_file),
        "DUCK_MAIN_PORT": "18081",
        "DUCK_API_PID_FILE": str(api_pid_file),
        "DUCK_API_LOG_FILE": str(api_log_file),
        "DUCK_API_COMMAND": str(fake_api),
        "DUCK_API_PORT": "18000",
    }
    script = "scripts/duck.sh"

    # Nothing is running yet: status must report both services down (exit 3).
    stopped = subprocess.run([script, "status"], env=env, text=True, capture_output=True)
    assert stopped.returncode == 3
    assert "DuckLM API not running" in stopped.stdout
    assert "llama-server not running" in stopped.stdout

    try:
        started = subprocess.run([script, "start"], env=env, text=True, capture_output=True)
        assert started.returncode == 0
        assert "Starting llama-server" in started.stdout
        assert "Starting DuckLM API" in started.stdout
        assert "Status:" in started.stdout
        assert "DuckLM API running" in started.stdout
        assert "llama-server running" in started.stdout
        assert api_pid_file.exists()
        assert pid_file.exists()

        running = subprocess.run([script, "status"], env=env, text=True, capture_output=True)
        assert running.returncode == 0
        assert "DuckLM API running" in running.stdout
        assert "llama-server running" in running.stdout

        # Let the fake processes flush their first log lines.
        time.sleep(0.2)
        logs = subprocess.run(
            [script, "logs", "--lines", "20"], env=env, text=True, capture_output=True
        )
        assert logs.returncode == 0
        assert "DuckLM API log" in logs.stdout
        assert "fake duck api" in logs.stdout
        assert "llama-server log" in logs.stdout
        assert "--alias local-main" in logs.stdout
    finally:
        stopped = subprocess.run([script, "stop"], env=env, text=True, capture_output=True)
        assert stopped.returncode == 0
        assert not api_pid_file.exists()
        assert not pid_file.exists()
def test_duck_mtp_script_starts_mtp_llama_for_duck_api(tmp_path):
    """``scripts/duck-mtp.sh start`` launches llama-server on the MTP model with draft-mtp flags.

    ``start`` and every assertion that follows it run inside the ``try`` block
    so the detached fake processes are always stopped, even when an assertion
    fails right after start-up.
    """
    fake_bin = tmp_path / "llama-server"
    fake_bin.write_text(
        textwrap.dedent(
            """\
            #!/usr/bin/env bash
            if [[ "${1:-}" == "--help" ]]; then
            echo "supports --spec-type draft-mtp"
            exit 0
            fi
            echo "fake mtp llama-server $*" >&2
            trap 'exit 0' TERM INT
            while true; do sleep 1; done
            """
        )
    )
    fake_bin.chmod(0o755)

    fake_api = tmp_path / "fake-api.sh"
    fake_api.write_text(
        textwrap.dedent(
            """\
            #!/usr/bin/env bash
            echo "fake duck api $*" >&2
            trap 'exit 0' TERM INT
            while true; do sleep 1; done
            """
        )
    )
    fake_api.chmod(0o755)

    main_model_path = tmp_path / "main.gguf"
    mtp_model_path = tmp_path / "mtp.gguf"
    main_model_path.write_text("main")
    mtp_model_path.write_text("mtp")
    pid_file = tmp_path / "llama-mtp.pid"
    log_file = tmp_path / "llama-mtp.log"
    api_pid_file = tmp_path / "duck-api-mtp.pid"
    api_log_file = tmp_path / "duck-api-mtp.log"

    env = {
        **os.environ,
        "DUCK_LLAMA_SERVER_BIN": str(fake_bin),
        "DUCK_MAIN_MODEL_PATH": str(main_model_path),
        "DUCK_MTP_MODEL_PATH": str(mtp_model_path),
        "DUCK_LLAMA_PID_FILE": str(pid_file),
        "DUCK_LLAMA_LOG_FILE": str(log_file),
        "DUCK_MAIN_PORT": "18081",
        "DUCK_API_PID_FILE": str(api_pid_file),
        "DUCK_API_LOG_FILE": str(api_log_file),
        "DUCK_API_COMMAND": str(fake_api),
        "DUCK_API_PORT": "18000",
    }
    script = "scripts/duck-mtp.sh"

    try:
        started = subprocess.run([script, "start"], env=env, text=True, capture_output=True)
        assert started.returncode == 0
        assert "Starting MTP llama-server" in started.stdout
        assert "DuckLM API running" in started.stdout
        assert "llama-server running" in started.stdout

        logs = subprocess.run(
            [script, "logs", "--lines", "30"], env=env, text=True, capture_output=True
        )
        assert logs.returncode == 0
        assert "--alias local-main" in logs.stdout
        assert "--spec-type draft-mtp" in logs.stdout
        # The MTP model is the one loaded; no separate draft model and no main model.
        assert f"-m {mtp_model_path}" in logs.stdout
        assert "--model-draft" not in logs.stdout
        assert str(main_model_path) not in logs.stdout
    finally:
        stopped = subprocess.run([script, "stop"], env=env, text=True, capture_output=True)
        assert stopped.returncode == 0
        assert not api_pid_file.exists()
        assert not pid_file.exists()
def test_duck_mtp_script_sets_llama_bin_dir_library_path_for_help_check(tmp_path):
    """The MTP start path puts the binary's directory on ``LD_LIBRARY_PATH``.

    The fake binary exits 127 unless its own directory appears in
    ``LD_LIBRARY_PATH``, both for the ``--help`` probe and for the real launch.
    ``start`` runs inside the ``try`` block so that a partially started stack
    is always stopped.
    """
    bin_dir = tmp_path / "build" / "bin"
    bin_dir.mkdir(parents=True)
    fake_bin = bin_dir / "llama-server"
    fake_bin.write_text(
        textwrap.dedent(
            """\
            #!/usr/bin/env bash
            case ":${LD_LIBRARY_PATH:-}:" in
            *":$(dirname "$0"):"*) ;;
            *)
            echo "error while loading shared libraries: libllama-common.so.0" >&2
            exit 127
            ;;
            esac
            if [[ "${1:-}" == "--help" ]]; then
            echo "supports --spec-type draft-mtp"
            exit 0
            fi
            echo "fake mtp llama-server $*" >&2
            trap 'exit 0' TERM INT
            while true; do sleep 1; done
            """
        )
    )
    fake_bin.chmod(0o755)

    fake_api = tmp_path / "fake-api.sh"
    fake_api.write_text(
        textwrap.dedent(
            """\
            #!/usr/bin/env bash
            trap 'exit 0' TERM INT
            while true; do sleep 1; done
            """
        )
    )
    fake_api.chmod(0o755)

    main_model_path = tmp_path / "main.gguf"
    mtp_model_path = tmp_path / "mtp.gguf"
    main_model_path.write_text("main")
    mtp_model_path.write_text("mtp")

    env = {
        **os.environ,
        # Start from an empty search path so only the script can add the bin dir.
        "LD_LIBRARY_PATH": "",
        "DUCK_LLAMA_SERVER_BIN": str(fake_bin),
        "DUCK_MAIN_MODEL_PATH": str(main_model_path),
        "DUCK_MTP_MODEL_PATH": str(mtp_model_path),
        "DUCK_LLAMA_PID_FILE": str(tmp_path / "llama-mtp.pid"),
        "DUCK_LLAMA_LOG_FILE": str(tmp_path / "llama-mtp.log"),
        "DUCK_MAIN_PORT": "18081",
        "DUCK_API_PID_FILE": str(tmp_path / "duck-api-mtp.pid"),
        "DUCK_API_LOG_FILE": str(tmp_path / "duck-api-mtp.log"),
        "DUCK_API_COMMAND": str(fake_api),
        "DUCK_API_PORT": "18000",
    }
    script = "scripts/duck-mtp.sh"

    try:
        started = subprocess.run([script, "start"], env=env, text=True, capture_output=True)
        assert started.returncode == 0

        logs = subprocess.run(
            [script, "logs", "--lines", "30"],
            env=env,
            text=True,
            capture_output=True,
        )
        assert logs.returncode == 0
        assert "--spec-type draft-mtp" in logs.stdout
        assert "--model-draft" not in logs.stdout
    finally:
        subprocess.run([script, "stop"], env=env, text=True, capture_output=True)

View File

@ -55,3 +55,54 @@ def test_start_main_script_manages_pid_status_stop_and_logs(tmp_path):
stopped = subprocess.run([script, "stop"], env=env, text=True, capture_output=True)
assert stopped.returncode == 0
assert not pid_file.exists()
def test_start_main_script_sets_llama_bin_dir_library_path(tmp_path):
    """``scripts/llama/start_main.sh`` puts the binary's directory on ``LD_LIBRARY_PATH``.

    The fake binary exits 127 with a loader-style error unless its own
    directory appears in ``LD_LIBRARY_PATH``. ``start`` runs inside the ``try``
    block so the detached process is always stopped, even if the start
    assertion fails.
    """
    bin_dir = tmp_path / "build" / "bin"
    bin_dir.mkdir(parents=True)
    fake_bin = bin_dir / "llama-server"
    fake_bin.write_text(
        textwrap.dedent(
            """\
            #!/usr/bin/env bash
            case ":${LD_LIBRARY_PATH:-}:" in
            *":$(dirname "$0"):"*) ;;
            *)
            echo "error while loading shared libraries: libllama-common.so.0" >&2
            exit 127
            ;;
            esac
            echo "fake llama-server $*" >&2
            trap 'exit 0' TERM INT
            while true; do sleep 1; done
            """
        )
    )
    fake_bin.chmod(0o755)

    model_path = tmp_path / "model.gguf"
    model_path.write_text("fake")
    pid_file = tmp_path / "llama.pid"
    log_file = tmp_path / "llama.log"

    env = {
        **os.environ,
        # Start from an empty search path so only the script can add the bin dir.
        "LD_LIBRARY_PATH": "",
        "DUCK_LLAMA_SERVER_BIN": str(fake_bin),
        "DUCK_MAIN_MODEL_PATH": str(model_path),
        "DUCK_LLAMA_PID_FILE": str(pid_file),
        "DUCK_LLAMA_LOG_FILE": str(log_file),
        "DUCK_MAIN_PORT": "18081",
    }
    script = "scripts/llama/start_main.sh"

    try:
        started = subprocess.run([script, "start"], env=env, text=True, capture_output=True)
        assert started.returncode == 0

        logs = subprocess.run(
            [script, "logs", "--lines", "20"], env=env, text=True, capture_output=True
        )
        assert logs.returncode == 0
        assert "--alias local-main" in logs.stdout
        assert "error while loading shared libraries" not in logs.stdout
    finally:
        subprocess.run([script, "stop"], env=env, text=True, capture_output=True)

View File

@ -57,6 +57,32 @@ async def test_model_client_preserves_reasoning_content(monkeypatch):
assert response.reasoning_content == "private reasoning"
@pytest.mark.asyncio
async def test_model_client_adds_request_reasoning_options(monkeypatch):
    """``reasoning="on"``/``"off"`` is reflected in the request payload sent by ``chat``."""
    payloads = []

    async def fake_post(self, url, json):
        # Capture the outgoing body and answer with a minimal chat completion.
        payloads.append(json)
        completion = {"choices": [{"message": {"role": "assistant", "content": "ok"}}]}
        return httpx.Response(200, json=completion, request=httpx.Request("POST", url))

    monkeypatch.setattr(httpx.AsyncClient, "post", fake_post)
    client = ModelClient("config/models.yaml")

    for mode in ("on", "off"):
        await client.chat("thinker", [{"role": "user", "content": "hello"}], reasoning=mode)

    thinking_on = payloads[0]
    thinking_off = payloads[1]

    assert thinking_on["reasoning_format"] == "deepseek"
    assert thinking_on["chat_template_kwargs"] == {"enable_thinking": True}
    assert "thinking_budget_tokens" not in thinking_on

    assert thinking_off["reasoning_format"] == "deepseek"
    assert thinking_off["chat_template_kwargs"] == {"enable_thinking": False}
    assert thinking_off["thinking_budget_tokens"] == 0
@pytest.mark.asyncio
async def test_model_client_stream_chat_yields_reasoning_then_content(monkeypatch):
class FakeStreamResponse: