[ { "model": "Qwen3.6-35B nonMTP GPU baseline", "quality": 0.971, "avg_latency_seconds": 17.935, "avg_tokens_per_second": 4.51, "cases": [ { "role": "action", "case": "direct_answer_no_tools", "score": 1.0, "note": "ok", "elapsed_seconds": 15.321, "completion_tokens": 45, "tokens_per_second": 2.94, "content_preview": "{\n \"kind\": \"action_directive\",\n \"intent\": \"direct_answer\",\n \"risk_level\": \"low\",\n \"actions\": []\n}\n" }, { "role": "action", "case": "read_specific_file", "score": 1.0, "note": "ok", "elapsed_seconds": 19.638, "completion_tokens": 81, "tokens_per_second": 4.12, "content_preview": "{\n \"kind\": \"action_directive\",\n \"intent\": \"read_file\",\n \"risk_level\": \"low\",\n \"actions\": [\n {\n \"tool\": \"file_read\",\n \"args\": {\n \"path\": \"CURRENT_STATE.md\"\n }\n }\n ]\n}\n" }, { "role": "memory_policy", "case": "store_user_preference", "score": 1.0, "note": "ok", "elapsed_seconds": 18.421, "completion_tokens": 88, "tokens_per_second": 4.78, "content_preview": "{\n \"should_store\": true,\n \"memory_type\": \"preference\",\n \"summary\": \"User prefers responses in Russian and requires explicit confirmation before executing sudo commands.\",\n \"importance\": 0.9,\n \"scope\": \"global\",\n \"metadata\": {\n \"language\": \"ru\",\n \"security_policy\": \"sudo_confirmation_required\"\n }\n}" }, { "role": "memory_policy", "case": "ignore_trivial_tool_call", "score": 1.0, "note": "ok", "elapsed_seconds": 14.978, "completion_tokens": 61, "tokens_per_second": 4.07, "content_preview": "{\n \"should_store\": false,\n \"memory_type\": \"note\",\n \"summary\": \"Routine execution of pwd command returning /tmp/project.\",\n \"importance\": 0.1,\n \"scope\": \"conversation\",\n \"metadata\": {}\n}" }, { "role": "recall", "case": "select_relevant_memory", "score": 1.0, "note": "ok", "elapsed_seconds": 15.038, "completion_tokens": 66, "tokens_per_second": 4.39, "content_preview": "{\n \"relevant_ids\": [\n \"m1\"\n ],\n \"reasoning\": \"Memory m1 directly addresses the user's preference regarding the execution of sudo commands, which is the core of the query. Memories m2 and m3 are unrelated to sudo or command execution preferences.\"\n}" }, { "role": "summary", "case": "preserve_decisions", "score": 1.0, "note": "ok", "elapsed_seconds": 9.991, "completion_tokens": 44, "tokens_per_second": 4.4, "content_preview": "1. WebChat работает на 8000, llama-server на 8081.\n2. Для внешних путей нужен approval.\n3. allow_forever хранится по normalized action hash." }, { "role": "critic", "case": "reflection_quality", "score": 0.8, "note": "missing=['lesson']", "elapsed_seconds": 32.16, "completion_tokens": 220, "tokens_per_second": 6.84, "content_preview": "**Critic Reflection: WebChat API Incident**\n\n**1. Risk Assessment**\n\n* **Operational Fragility (High):** The incident revealed a critical dependency on manual intervention. If the API crashes or the server reboots, WebChat will silently fail or return errors until an engineer notices and manually restarts the service. This creates a \"single point of failure\" in the operational process.\n* **Meaning Time to Recovery (MTTR) Variance:** Recovery time is currently dependent on human availability " } ] } ]