# ducklm/tests/smoke/test_runtime_tools.py
#
# 574 lines
# 21 KiB
# Python

import json
import pytest
from duck_core.events.store import EventStore
from duck_core.model_client import ModelResponse
from duck_core.approvals.service import ApprovalService
from duck_core.runtime_loop import RuntimeLoop
from duck_core.tasks.store import TaskStore
class FakeToolModelClient:
    """Scripted model client: one ``file_read`` request, then a final answer.

    For the "action" role it asks to read ``note.txt`` until some message
    content mentions ``tool_observations``; after that it returns an empty
    action list. Every other call must use the "thinker" role and must
    already carry tool observations.
    """

    @staticmethod
    def _has_observations(messages):
        """Return True if any message content mentions ``tool_observations``."""
        for entry in messages:
            if "tool_observations" in entry["content"]:
                return True
        return False

    async def chat(self, role, messages):
        """Return the canned ``ModelResponse`` for ``role``."""
        if role != "action":
            assert role == "thinker"
            assert self._has_observations(messages)
            return ModelResponse(
                role=role,
                model="local-main",
                content="The file says: hello from tool",
                reasoning_content="used file_read",
                raw={},
                latency_ms=12.0,
            )

        # Only ask for the file while no observation has been fed back yet.
        requested = []
        if not self._has_observations(messages):
            requested.append(
                {
                    "tool": "file_read",
                    "args": {"path": "note.txt"},
                    "reason": "User asked for file contents",
                }
            )
        directive = {
            "kind": "action_directive",
            "intent": "read requested file",
            "risk_level": "low",
            "actions": requested,
        }
        return ModelResponse(
            role=role,
            model="local-main",
            content=json.dumps(directive),
            reasoning_content=None,
            raw={},
            latency_ms=5.0,
        )
class FakeMultiStepToolModelClient:
    """Scripted model client that drives a two-step tool sequence.

    The "action" role first requests ``list_dir`` on ``.``; once the joined
    message text mentions ``README.md`` but not yet ``readme contents`` it
    requests ``file_read`` on ``README.md``; otherwise it returns no actions.
    The "thinker" role asserts that both tools and the file contents appear
    in the messages before answering.
    """

    @staticmethod
    def _transcript(messages):
        """Join every message content with newlines into one searchable string."""
        return "\n".join(entry["content"] for entry in messages)

    async def chat(self, role, messages):
        """Return the canned ``ModelResponse`` for ``role``."""
        if role != "action":
            assert role == "thinker"
            transcript = self._transcript(messages)
            assert "list_dir" in transcript
            assert "file_read" in transcript
            assert "readme contents" in transcript
            return ModelResponse(
                role=role,
                model="local-main",
                content="Readme inspected",
                reasoning_content=None,
                raw={},
                latency_ms=12.0,
            )

        transcript = self._transcript(messages)
        readme_listed = "README.md" in transcript
        readme_read = "readme contents" in transcript
        if "tool_observations" not in transcript:
            step = [
                {
                    "tool": "list_dir",
                    "args": {"path": "."},
                    "reason": "Find available files",
                }
            ]
        elif readme_listed and not readme_read:
            step = [
                {
                    "tool": "file_read",
                    "args": {"path": "README.md"},
                    "reason": "Read discovered README",
                }
            ]
        else:
            step = []

        directive = {
            "kind": "action_directive",
            "intent": "multi-step file inspection",
            "risk_level": "low",
            "actions": step,
        }
        return ModelResponse(
            role=role,
            model="local-main",
            content=json.dumps(directive),
            reasoning_content=None,
            raw={},
            latency_ms=5.0,
        )
class FakeUpdateCheckModelClient:
    """Scripted model client that requests ``apt list --upgradable`` once.

    The "action" role issues the ``shell_exec_safe`` request only while no
    message mentions ``tool_observations``. The "thinker" role asserts that
    the command shows up in the messages and that ``requires_approval`` does
    not.
    """

    async def chat(self, role, messages):
        """Return the canned ``ModelResponse`` for ``role``."""
        if role != "action":
            assert role == "thinker"
            transcript = "\n".join(entry["content"] for entry in messages)
            assert "apt list --upgradable" in transcript
            assert "requires_approval" not in transcript
            return ModelResponse(
                role=role,
                model="local-main",
                content="Updates checked without approval loop.",
                reasoning_content=None,
                raw={},
                latency_ms=12.0,
            )

        already_observed = any(
            "tool_observations" in entry["content"] for entry in messages
        )
        if already_observed:
            planned = []
        else:
            planned = [
                {
                    "tool": "shell_exec_safe",
                    "args": {"command": "apt list --upgradable"},
                    "reason": "Check OS updates",
                }
            ]
        payload = json.dumps(
            {
                "kind": "action_directive",
                "intent": "check system updates",
                "risk_level": "low",
                "actions": planned,
            }
        )
        return ModelResponse(
            role=role,
            model="local-main",
            content=payload,
            reasoning_content=None,
            raw={},
            latency_ms=5.0,
        )
class FakeMalformedActionModelClient:
    """Scripted model client whose action directive omits the ``args`` key.

    The "action" role always returns a ``file_read`` action without ``args``;
    the "thinker" role returns a fixed answer.
    """

    async def chat(self, role, messages):
        """Return the canned ``ModelResponse`` for ``role``."""
        if role != "action":
            assert role == "thinker"
            return ModelResponse(
                role=role,
                model="local-main",
                content="Answered without tool execution.",
                reasoning_content=None,
                raw={},
                latency_ms=12.0,
            )

        # Deliberately incomplete action: there is no "args" entry.
        broken_action = {
            "tool": "file_read",
            "reason": "Missing args must fail schema validation",
        }
        directive = {
            "kind": "action_directive",
            "intent": "broken action",
            "risk_level": "low",
            "actions": [broken_action],
        }
        return ModelResponse(
            role=role,
            model="local-main",
            content=json.dumps(directive),
            reasoning_content=None,
            raw={},
            latency_ms=5.0,
        )
class FakeRepeatingActionModelClient:
    """Scripted model client that requests the same ``file_read`` every time.

    The "action" role ignores the messages and always asks to read
    ``note.txt``; the "thinker" role returns a fixed final answer.
    """

    async def chat(self, role, messages):
        """Return the canned ``ModelResponse`` for ``role``."""
        if role != "action":
            assert role == "thinker"
            return ModelResponse(
                role=role,
                model="local-main",
                content="Final answer from first observation.",
                reasoning_content=None,
                raw={},
                latency_ms=12.0,
            )

        same_action = {
            "tool": "file_read",
            "args": {"path": "note.txt"},
            "reason": "Read requested file",
        }
        directive = {
            "kind": "action_directive",
            "intent": "repeat same action",
            "risk_level": "low",
            "actions": [same_action],
        }
        return ModelResponse(
            role=role,
            model="local-main",
            content=json.dumps(directive),
            reasoning_content=None,
            raw={},
            latency_ms=5.0,
        )
@pytest.mark.asyncio
async def test_runtime_executes_action_directive_tool_and_finishes_with_observation(tmp_path):
    """A ``file_read`` directive is executed and its output reaches the final answer."""
    (tmp_path / "note.txt").write_text("hello from tool")
    db_path = str(tmp_path / "duck.sqlite3")
    task_store = TaskStore(db_path)
    event_store = EventStore(db_path)
    loop = RuntimeLoop(task_store, event_store, FakeToolModelClient())

    result = await loop.run_chat("read note.txt", str(tmp_path), debug=True)

    events = await event_store.list_events(result.task_id)
    event_types = [event.event_type for event in events]
    # next() without a default raises StopIteration, which a coroutine
    # re-raises as an opaque RuntimeError (PEP 479). Use a default and an
    # explicit assertion so a missing event fails with a readable message.
    tool_finished = next(
        (event for event in events if event.event_type == "tool_call_finished"),
        None,
    )
    assert tool_finished is not None, "expected a tool_call_finished event"

    assert result.status == "completed"
    assert result.final_response == "The file says: hello from tool"
    assert "action_directive" in event_types
    assert "tool_call_started" in event_types
    assert tool_finished.payload["tool"] == "file_read"
    assert tool_finished.payload["result"]["ok"] is True
    assert tool_finished.payload["result"]["output"] == "hello from tool"
@pytest.mark.asyncio
async def test_runtime_runs_multiple_tool_steps_before_final_answer(tmp_path):
    """``list_dir`` and then ``file_read`` both finish before the final answer."""
    readme = tmp_path / "README.md"
    readme.write_text("readme contents")
    db_path = str(tmp_path / "duck.sqlite3")
    task_store = TaskStore(db_path)
    event_store = EventStore(db_path)
    loop = RuntimeLoop(task_store, event_store, FakeMultiStepToolModelClient())

    result = await loop.run_chat("inspect the workspace readme", str(tmp_path), debug=True)

    # Collect the tool names of finished calls in recorded order.
    finished_tools = []
    for recorded in await event_store.list_events(result.task_id):
        if recorded.event_type == "tool_call_finished":
            finished_tools.append(recorded.payload["tool"])

    assert result.status == "completed"
    assert result.final_response == "Readme inspected"
    assert finished_tools == ["list_dir", "file_read"]
@pytest.mark.asyncio
async def test_runtime_checks_system_updates_without_approval_loop(tmp_path):
    """The update-check command finishes and no approval request is recorded."""
    db_path = str(tmp_path / "duck.sqlite3")
    task_store = TaskStore(db_path)
    event_store = EventStore(db_path)
    loop = RuntimeLoop(task_store, event_store, FakeUpdateCheckModelClient())

    result = await loop.run_chat("Привет. Проверь обновления в системе", str(tmp_path), debug=True)
    events = await event_store.list_events(result.task_id)

    assert result.status == "completed"
    assert all(recorded.event_type != "tool_approval_requested" for recorded in events)
    shell_call_finished = any(
        recorded.event_type == "tool_call_finished"
        and recorded.payload["tool"] == "shell_exec_safe"
        for recorded in events
    )
    assert shell_call_finished
@pytest.mark.asyncio
async def test_runtime_rejects_malformed_action_directive_before_tools(tmp_path):
    """A directive missing ``args`` is rejected and no tool call is started."""
    (tmp_path / "note.txt").write_text("hello")
    db_path = str(tmp_path / "duck.sqlite3")
    task_store = TaskStore(db_path)
    event_store = EventStore(db_path)
    loop = RuntimeLoop(task_store, event_store, FakeMalformedActionModelClient())

    result = await loop.run_chat("read note.txt", str(tmp_path), debug=True)

    events = await event_store.list_events(result.task_id)
    # next() without a default raises StopIteration, which a coroutine
    # re-raises as an opaque RuntimeError (PEP 479). Use a default and an
    # explicit assertion so a missing event fails with a readable message.
    failed = next(
        (event for event in events if event.event_type == "action_directive_failed"),
        None,
    )
    assert failed is not None, "expected an action_directive_failed event"

    assert result.status == "completed"
    assert "schema violation" in failed.payload["error"]
    assert not any(event.event_type == "tool_call_started" for event in events)
def test_runtime_compacts_large_tool_observations_for_model_context(tmp_path):
    """A large tool output is shortened, marked truncated, and keeps its tail."""
    db_path = str(tmp_path / "duck.sqlite3")
    task_store = TaskStore(db_path)
    event_store = EventStore(db_path)
    loop = RuntimeLoop(task_store, event_store, FakeToolModelClient())

    oversized_output = "A" * 2500 + "KEEP_TAIL"
    observation = {
        "tool": "shell_exec_safe",
        "result": {
            "ok": True,
            "output": oversized_output,
            "metadata": {"command": "ls /tmp"},
        },
    }
    compact = loop.format_tool_observations_for_model([observation])

    for marker in ("tool_observations", "truncated", "KEEP_TAIL"):
        assert marker in compact
    assert len(compact) < 2300
@pytest.mark.asyncio
async def test_runtime_skips_duplicate_action_within_same_task(tmp_path):
    """A repeated identical action runs once and is then skipped as a duplicate."""
    note = tmp_path / "note.txt"
    note.write_text("hello once")
    db_path = str(tmp_path / "duck.sqlite3")
    task_store = TaskStore(db_path)
    event_store = EventStore(db_path)
    loop = RuntimeLoop(task_store, event_store, FakeRepeatingActionModelClient())

    result = await loop.run_chat("read note.txt", str(tmp_path), debug=True)
    events = await event_store.list_events(result.task_id)

    # Bucket the two event kinds of interest in a single pass.
    finished_tools = []
    skipped_tools = []
    for recorded in events:
        if recorded.event_type == "tool_call_finished":
            finished_tools.append(recorded)
        if recorded.event_type == "tool_call_skipped":
            skipped_tools.append(recorded)

    assert result.status == "completed"
    assert len(finished_tools) == 1
    assert len(skipped_tools) == 1
    assert skipped_tools[0].payload["reason"] == "duplicate_action"
class FakeApprovalModelClient:
    """Scripted model client that requests ``uname -a`` at medium risk.

    The "action" role issues the ``shell_exec_safe`` request until a message
    mentions ``tool_observations``. Any other role raises ``AssertionError``.
    """

    async def chat(self, role, messages):
        """Return the action directive, or raise for any non-action role."""
        if role != "action":
            raise AssertionError("thinker must not be called while approval is pending")

        observed = False
        for entry in messages:
            if "tool_observations" in entry["content"]:
                observed = True
                break

        planned = []
        if not observed:
            planned.append(
                {
                    "tool": "shell_exec_safe",
                    "args": {"command": "uname -a"},
                    "reason": "User requested system information",
                }
            )
        directive = {
            "kind": "action_directive",
            "intent": "run command",
            "risk_level": "medium",
            "actions": planned,
        }
        return ModelResponse(
            role=role,
            model="local-main",
            content=json.dumps(directive),
            reasoning_content=None,
            raw={},
            latency_ms=5.0,
        )
@pytest.mark.asyncio
async def test_runtime_creates_pending_approval_when_tool_requires_it(tmp_path):
    """The run stops in ``waiting_for_approval`` with a pending approval recorded."""
    db_path = str(tmp_path / "duck.sqlite3")
    task_store = TaskStore(db_path)
    event_store = EventStore(db_path)
    approvals = ApprovalService(db_path)
    loop = RuntimeLoop(
        task_store,
        event_store,
        FakeApprovalModelClient(),
        approval_service=approvals,
    )

    result = await loop.run_chat("run uname", str(tmp_path), debug=True)
    pending = await approvals.pending()
    events = await event_store.list_events(result.task_id)

    assert result.status == "waiting_for_approval"
    first_pending = pending[0]
    assert first_pending.task_id == result.task_id
    assert first_pending.normalized_action["tool"] == "shell_exec_safe"
    recorded_types = [recorded.event_type for recorded in events]
    assert "tool_approval_requested" in recorded_types
class FakeApprovalContinuationModelClient:
    """Scripted model client for approval flows that records thinker input.

    The "action" role requests ``uname -a`` via ``shell_exec_safe`` until a
    message mentions ``tool_observations``. The "thinker" role stores the
    messages it received on ``thinker_messages`` and answers
    ``uname completed``.
    """

    def __init__(self):
        """Start with an empty record of thinker messages."""
        self.thinker_messages = []

    @staticmethod
    def _has_observations(messages):
        """Return True if any message content mentions ``tool_observations``."""
        return any("tool_observations" in entry["content"] for entry in messages)

    async def chat(self, role, messages):
        """Return the canned ``ModelResponse`` for ``role``."""
        if role != "action":
            assert role == "thinker"
            # Record the input before validating it.
            self.thinker_messages = messages
            assert self._has_observations(messages)
            return ModelResponse(
                role=role,
                model="local-main",
                content="uname completed",
                reasoning_content="used approved shell command",
                raw={},
                latency_ms=10.0,
            )

        if self._has_observations(messages):
            planned = []
        else:
            planned = [
                {
                    "tool": "shell_exec_safe",
                    "args": {"command": "uname -a"},
                    "reason": "User requested system information",
                }
            ]
        directive = {
            "kind": "action_directive",
            "intent": "run command",
            "risk_level": "medium",
            "actions": planned,
        }
        return ModelResponse(
            role=role,
            model="local-main",
            content=json.dumps(directive),
            reasoning_content=None,
            raw={},
            latency_ms=5.0,
        )
class FakeApprovalThenSecondToolModelClient:
    """Scripted model client: a shell command first, then a follow-up file read.

    The "action" role requests ``uname -a`` while no observations exist,
    requests ``file_read`` on ``second.txt`` once observations exist but
    ``second step content`` has not appeared, and returns no actions after
    that. The "thinker" role asserts that both tools and the file contents
    appear in the messages.
    """

    async def chat(self, role, messages):
        """Return the canned ``ModelResponse`` for ``role``."""
        transcript = "\n".join(entry["content"] for entry in messages)

        if role != "action":
            assert role == "thinker"
            assert "shell_exec_safe" in transcript
            assert "file_read" in transcript
            assert "second step content" in transcript
            return ModelResponse(
                role=role,
                model="local-main",
                content="approved command and second tool completed",
                reasoning_content=None,
                raw={},
                latency_ms=10.0,
            )

        if "tool_observations" not in transcript:
            # Nothing has run yet: start with the shell command.
            planned = [
                {
                    "tool": "shell_exec_safe",
                    "args": {"command": "uname -a"},
                    "reason": "User requested system information",
                }
            ]
        elif "second step content" in transcript:
            # The follow-up file has already been read.
            planned = []
        else:
            planned = [
                {
                    "tool": "file_read",
                    "args": {"path": "second.txt"},
                    "reason": "Read follow-up file after approved command",
                }
            ]

        directive = {
            "kind": "action_directive",
            "intent": "approval then follow-up",
            "risk_level": "medium",
            "actions": planned,
        }
        return ModelResponse(
            role=role,
            model="local-main",
            content=json.dumps(directive),
            reasoning_content=None,
            raw={},
            latency_ms=5.0,
        )
@pytest.mark.asyncio
async def test_runtime_continues_after_approved_tool_call(tmp_path):
    """After ``allow_once`` the pending shell command runs and the task completes."""
    db_path = str(tmp_path / "duck.sqlite3")
    task_store = TaskStore(db_path)
    event_store = EventStore(db_path)
    approvals = ApprovalService(db_path)
    model_client = FakeApprovalContinuationModelClient()
    loop = RuntimeLoop(task_store, event_store, model_client, approval_service=approvals)

    pending_result = await loop.run_chat("run uname", str(tmp_path), debug=True)
    pending = await approvals.pending()
    await approvals.allow_once(pending[0].approval_id)
    result = await loop.continue_after_approval(pending_result.task_id, pending[0].approval_id)

    events = await event_store.list_events(result.task_id)
    # next() without a default raises StopIteration, which a coroutine
    # re-raises as an opaque RuntimeError (PEP 479). Use a default and an
    # explicit assertion so a missing event fails with a readable message.
    finished = next(
        (event for event in events if event.event_type == "tool_call_finished"),
        None,
    )
    assert finished is not None, "expected a tool_call_finished event"

    assert result.status == "completed"
    assert result.final_response == "uname completed"
    assert finished.payload["tool"] == "shell_exec_safe"
    assert finished.payload["result"]["ok"] is True
    assert "uname" in finished.payload["result"]["metadata"]["command"]
    assert any(event.event_type == "task_completed" for event in events)
@pytest.mark.asyncio
async def test_runtime_can_run_followup_tool_after_approval(tmp_path):
    """Once the shell command is approved, a follow-up ``file_read`` also runs."""
    second_file = tmp_path / "second.txt"
    second_file.write_text("second step content")
    db_path = str(tmp_path / "duck.sqlite3")
    task_store = TaskStore(db_path)
    event_store = EventStore(db_path)
    approvals = ApprovalService(db_path)
    model_client = FakeApprovalThenSecondToolModelClient()
    loop = RuntimeLoop(task_store, event_store, model_client, approval_service=approvals)

    pending_result = await loop.run_chat(
        "run uname then inspect second file", str(tmp_path), debug=True
    )
    pending = await approvals.pending()
    await approvals.allow_once(pending[0].approval_id)
    result = await loop.continue_after_approval(pending_result.task_id, pending[0].approval_id)

    # Collect the tool names of finished calls in recorded order.
    finished_tools = []
    for recorded in await event_store.list_events(result.task_id):
        if recorded.event_type == "tool_call_finished":
            finished_tools.append(recorded.payload["tool"])

    assert result.status == "completed"
    assert finished_tools == ["shell_exec_safe", "file_read"]
@pytest.mark.asyncio
async def test_runtime_continues_after_denied_tool_call_without_execution(tmp_path):
    """After ``deny`` the task still completes and the tool result reports the denial."""
    db_path = str(tmp_path / "duck.sqlite3")
    task_store = TaskStore(db_path)
    event_store = EventStore(db_path)
    approvals = ApprovalService(db_path)
    model_client = FakeApprovalContinuationModelClient()
    loop = RuntimeLoop(task_store, event_store, model_client, approval_service=approvals)

    pending_result = await loop.run_chat("run uname", str(tmp_path), debug=True)
    pending = await approvals.pending()
    await approvals.deny(pending[0].approval_id)
    result = await loop.continue_after_approval(pending_result.task_id, pending[0].approval_id)

    events = await event_store.list_events(result.task_id)
    # next() without a default raises StopIteration, which a coroutine
    # re-raises as an opaque RuntimeError (PEP 479). Use a default and an
    # explicit assertion so a missing event fails with a readable message.
    finished = next(
        (event for event in events if event.event_type == "tool_call_finished"),
        None,
    )
    assert finished is not None, "expected a tool_call_finished event"

    assert result.status == "completed"
    assert finished.payload["result"]["ok"] is False
    assert finished.payload["result"]["metadata"]["decision"] == "deny"
    assert "denied" in finished.payload["result"]["error"].lower()
@pytest.mark.asyncio
async def test_runtime_reuses_allow_forever_for_matching_action(tmp_path):
    """After ``allow_forever`` a second run of the same action needs no approval."""
    db_path = str(tmp_path / "duck.sqlite3")
    task_store = TaskStore(db_path)
    event_store = EventStore(db_path)
    approvals = ApprovalService(db_path)
    model_client = FakeApprovalContinuationModelClient()
    loop = RuntimeLoop(task_store, event_store, model_client, approval_service=approvals)

    # First run: approve the action permanently and let the task continue.
    first_result = await loop.run_chat("run uname", str(tmp_path), debug=True)
    first_pending = await approvals.pending()
    approval_id = first_pending[0].approval_id
    await approvals.allow_forever(approval_id)
    await loop.continue_after_approval(first_result.task_id, approval_id)

    # Second run: the same action should go straight through.
    second_result = await loop.run_chat("run uname again", str(tmp_path), debug=True)
    second_events = await event_store.list_events(second_result.task_id)

    assert second_result.status == "completed"
    assert second_result.final_response == "uname completed"
    assert all(
        recorded.event_type != "tool_approval_requested" for recorded in second_events
    )
    assert any(recorded.event_type == "tool_call_finished" for recorded in second_events)