# ducklm/tests/test_tools_flow.py
#
# 490 lines
# 18 KiB
# Python

import json
from pathlib import Path
from app.core.contracts import ExecutionDirective, UserTask
from app.core.contracts import PermissionDecision
from app.core.contracts import ToolResult
from app.events.event_types import TOOL_OUTPUT_CHUNK
from app.runtime.runtime_controller import RuntimeController
from app.tools.sandbox import ToolSandbox
def _write_config_tree(base_dir: Path) -> None:
    """Create the directory layout and JSON config files used by these tests.

    Creates ``config``, ``data/events``, ``data/state``, ``data/permissions``
    and ``models`` under *base_dir*, then writes ``models.json``,
    ``prompts.json``, ``permissions.json`` and ``runtime.json`` into
    ``config``.

    The helper is idempotent: every directory is created with
    ``exist_ok=True`` and the config files are simply overwritten, so calling
    it twice on the same directory does not raise.

    Args:
        base_dir: Root directory that receives the tree.
    """
    for relative in (
        "config",
        "data/events",
        "data/state",
        "data/permissions",
        "models",
    ):
        (base_dir / relative).mkdir(parents=True, exist_ok=True)

    configs = {
        "models.json": {
            "orchestrator_path": "models/llama.gguf",
            "coder_path": "models/xcoder.gguf",
            "critic_path": "models/gemma.gguf",
            "embeddings_path": "models/all-MiniLM-L6-v2",
            "inference": {},
        },
        "prompts.json": {
            "orchestration_prompt": "",
            "planning_prompt": "",
            "coder_prompt": "",
            "critic_prompt": "",
        },
        "permissions.json": {
            "settings": {
                "allow_caching": True,
                "cache_file": str(base_dir / "data/runtime/allowed_commands.json"),
                "normalize_commands": True,
                "split_chained": True,
            },
            "command_categories": {
                "hard_stop": {
                    "commands": ["rm -rf /", "rm -rf /*", "dd if=/dev/zero of=/dev/sd*"],
                },
                "no_always": {
                    "allow_once": True,
                    "allow_always": False,
                    "commands": [
                        "rm -rf *", "rm -rf .*", "shutdown", "reboot", "halt",
                        "apt", "apt-get", "dpkg", "yum", "dnf", "pacman",
                        "systemctl stop", "systemctl start", "systemctl restart",
                        "service stop", "service start", "killall", "pkill -9",
                    ],
                },
                "normal": {
                    "allow_once": True,
                    "allow_always": True,
                    "commands": ["shell_exec", "file_write"],
                },
            },
            "path_settings": {
                "allow_read_outside": True,
                "allow_write_paths": [str(base_dir), "/tmp"],
                "require_confirmation_for_write": True,
                "require_confirmation_for_shell": True,
            },
        },
        "runtime.json": {
            "step_timeout_ms": 5000,
            "task_timeout_ms": 30000,
            "planner_retry_limit": 1,
            "tool_retry_limit": 0,
            "replan_limit": 0,
            "max_execution_steps": 5,
            "retrieval_top_k": 3,
            "memory_thresholds": {},
            "critic_fallback_policy": "continue_without_critic",
            "checkpoint_policy": {"save_on_transition": True},
            "event_retention_policy": {"keep_all": True},
            "streaming_settings": {"enabled": True},
        },
    }

    config_dir = base_dir / "config"
    for name, payload in configs.items():
        (config_dir / name).write_text(json.dumps(payload), encoding="utf-8")
def test_file_write_and_read_tool_flow(tmp_path: Path) -> None:
    """Write a file via ``file_write`` and read it back via ``file_read``."""
    _write_config_tree(tmp_path)
    runtime = RuntimeController(base_dir=tmp_path)
    destination = tmp_path / "notes" / "test.txt"

    # Step 1: the write task must complete and leave the content on disk.
    write_task = UserTask(
        input="write a file",
        context={
            "requested_tool": "file_write",
            "tool_args": {"path": str(destination), "content": "hello from ducklm"},
        },
    )
    written = runtime.handle_task(write_task)
    assert written["status"] == "completed"
    assert destination.read_text(encoding="utf-8") == "hello from ducklm"

    # Step 2: reading the same path must return exactly what was written.
    read_task = UserTask(
        input="read the file",
        context={
            "requested_tool": "file_read",
            "tool_args": {"path": str(destination)},
        },
    )
    fetched = runtime.handle_task(read_task)
    assert fetched["status"] == "completed"
    assert fetched["result"]["output"] == "hello from ducklm"
def test_shell_exec_requires_permission_for_dangerous_command(tmp_path: Path) -> None:
    """A destructive ``rm -rf`` shell command must stop at a permission request."""
    _write_config_tree(tmp_path)
    runtime = RuntimeController(base_dir=tmp_path)
    dangerous_task = UserTask(
        input="run dangerous shell command",
        context={
            "requested_tool": "shell_exec",
            "tool_args": {"command": "rm -rf /tmp/nonexistent"},
        },
    )

    outcome = runtime.handle_task(dangerous_task)

    # Per the fixture config, "rm -rf /tmp/nonexistent" is not a hard_stop
    # entry (only the exact "rm -rf /" is); it is expected to match the
    # "rm -rf *" entry of the no_always category instead.
    assert outcome["status"] == "awaiting_permission"
    assert "permission_request" in outcome["result"]
def test_shell_exec_allows_safe_command(tmp_path: Path) -> None:
    """``pwd`` asks for permission first and runs once ``allow_once`` is granted."""
    _write_config_tree(tmp_path)
    runtime = RuntimeController(base_dir=tmp_path)
    safe_task = UserTask(
        input="run safe shell command",
        context={
            "requested_tool": "shell_exec",
            "tool_args": {"command": "pwd"},
        },
    )

    pending = runtime.handle_task(safe_task)

    # In the new permission model even safe commands require permission.
    assert pending["status"] == "awaiting_permission"
    assert "permission_request" in pending["result"]

    # After the grant the command executes and its output contains tmp_path.
    after_grant = runtime.resolve_permission(
        task_id=pending["task_id"],
        decision="allow_once",
    )
    assert after_grant["status"] == "completed"
    assert str(tmp_path) in after_grant["result"]["output"]
def test_shell_exec_publishes_output_chunks_before_completion(tmp_path: Path) -> None:
    """Output chunk events are published, in order, before ``tool_completed``."""
    _write_config_tree(tmp_path)
    runtime = RuntimeController(base_dir=tmp_path)
    grant = PermissionDecision(
        action_type="shell_command",
        pattern="printf",
        decision="allow_always",
    )
    # Two writes separated by a short pause; the test expects two chunks.
    shell_command = "printf 'first\\n'; sleep 0.1; printf 'second\\n'"
    task = UserTask(
        input="stream shell output",
        context={
            "requested_tool": "shell_exec",
            "tool_args": {"command": shell_command},
        },
    )
    directive = ExecutionDirective(
        type="tool",
        payload={
            "tool": "shell_exec",
            "args": {"command": shell_command},
        },
    )

    result = runtime.execution_engine.execute(
        task,
        directive,
        permission_override=grant,
    )

    events = runtime.event_bus.list_for_task(task.task_id)
    chunk_events = [evt for evt in events if evt.type == TOOL_OUTPUT_CHUNK]
    completed_index = next(
        position for position, evt in enumerate(events) if evt.type == "tool_completed"
    )
    first_chunk_index = next(
        position for position, evt in enumerate(events) if evt.type == TOOL_OUTPUT_CHUNK
    )

    assert result["status"] == "completed"
    assert [evt.payload["chunk"] for evt in chunk_events] == ["first\n", "second\n"]
    assert first_chunk_index < completed_index
def test_streaming_shell_uses_idle_timeout_not_step_timeout(tmp_path: Path) -> None:
    """A streamed command with a 0.2 s pause succeeds despite ``timeout_ms=100``.

    The pause is longer than ``timeout_ms`` but shorter than both
    ``idle_timeout_ms`` and ``command_timeout_ms``.
    """
    sandbox = ToolSandbox(
        allowed_root=tmp_path,
        timeout_ms=100,
        command_timeout_ms=2000,
        idle_timeout_ms=500,
    )
    received: list[str] = []

    def collect(_stream, chunk):
        # Record each chunk in arrival order; the stream name is not needed.
        received.append(chunk)

    outcome = sandbox.run_shell(
        command="printf 'first\\n'; sleep 0.2; printf 'second\\n'",
        output_callback=collect,
    )

    assert outcome.returncode == 0
    assert outcome.stdout == "first\nsecond\n"
    assert received == ["first\n", "second\n"]
def test_streaming_shell_timeout_kills_child_process_group(tmp_path: Path) -> None:
    """A timed-out command must not leave a child behind that keeps running.

    The command spawns a nested ``sh`` that sleeps and then touches a marker
    file. With ``command_timeout_ms=100`` the run is expected to end with
    return code ``-9`` well before the child's sleep finishes. The test then
    waits past the moment the child *would* have touched the marker, so a
    surviving child actually makes the final assertion fail.
    """
    import time  # local import keeps this test self-contained

    marker = tmp_path / "child-survived"
    sandbox = ToolSandbox(
        allowed_root=tmp_path,
        timeout_ms=100,
        command_timeout_ms=100,
        idle_timeout_ms=1000,
    )

    result = sandbox.run_shell(
        command=f"sh -c 'sleep 0.5; touch {marker}'",
        output_callback=lambda _stream, _chunk: None,
    )

    assert result.returncode == -9
    # Checking the marker right away proves nothing: the child would still be
    # sleeping. Wait until after its 0.5 s sleep would have elapsed.
    time.sleep(0.7)
    assert not marker.exists()
class _RecoveryCritic:
async def generate(self, prompt: str, max_tokens: int | None = None) -> str:
return '{"action":"continue","reason":"No matches is acceptable information for this exploratory check."}'
def test_failed_shell_step_can_recover_and_continue(tmp_path: Path) -> None:
    """A plan step that exits non-zero can still end in a completed task.

    The single plan step pipes into ``grep`` with a pattern that does not
    match. The critic is replaced by ``_RecoveryCritic`` (always "continue")
    and the engine's recovery limit is set to 1. The test asserts the overall
    status is ``completed`` while the step's recorded exit code is 1.
    """
    _write_config_tree(tmp_path)
    controller = RuntimeController(base_dir=tmp_path)
    controller.execution_engine.set_critic(_RecoveryCritic())
    controller.execution_engine._recovery_limit = 1

    # Bypass the permission check: this test is about recovery, not
    # permissions. PermissionDecision comes from the module-level import.
    perm_override = PermissionDecision(
        action_type="shell_command",
        pattern="grep",
        decision="allow_always",
    )

    result = controller.execution_engine.execute(
        UserTask(
            input="run grep with no matches and recover",
        ),
        ExecutionDirective(
            type="plan",
            payload={
                "steps": [
                    {
                        "id": "1",
                        "tool": "shell_exec",
                        "args": {"command": "printf 'abc\\n' | grep definitely_missing"},
                        "depends_on": [],
                    }
                ]
            },
        ),
        permission_override=perm_override,
    )

    assert result["status"] == "completed"
    failed_result = result["result"]["step_results"][0]["result"]["result"]
    assert failed_result["metadata"]["exit_code"] == 1
def test_privilege_scope_failure_awaits_user_review_before_replan(tmp_path: Path) -> None:
    """A dpkg-lock "are you root?" failure ends in ``awaiting_review``.

    The real shell tool is swapped for a stub that always fails with exit
    code 100. After permission and a secret are supplied, the result must
    carry a ``privilege_scope_error`` diagnosis and a
    ``model_planning_error`` critic classification.
    """
    _write_config_tree(tmp_path)
    runtime = RuntimeController(base_dir=tmp_path)

    class FailingShellTool:
        """Shell stub that always reports the dpkg frontend lock error."""

        def execute(self, task: UserTask, args: dict[str, object]) -> ToolResult:
            return ToolResult(
                tool="shell_exec",
                ok=False,
                output="Error: Unable to acquire the dpkg frontend lock (/var/lib/dpkg/lock-frontend), are you root?",
                error="Command failed with exit code 100",
                metadata={"exit_code": 100},
            )

    upgrade_task = UserTask(
        input="обнови систему",
        context={
            "requested_tool": "shell_exec",
            "tool_args": {"command": "sudo apt update && apt upgrade -y"},
        },
    )
    runtime.tool_registry._tools["shell_exec"] = FailingShellTool()

    first_response = runtime.handle_task(upgrade_task)
    assert first_response["status"] == "awaiting_permission"

    runtime.resolve_permission(task_id=upgrade_task.task_id, decision="allow_once")
    reviewed = runtime.resolve_secret(task_id=upgrade_task.task_id, secret="secret")

    assert reviewed["status"] == "awaiting_review"
    review = reviewed["result"]["review"]
    assert review["diagnosis"]["type"] == "privilege_scope_error"
    assert review["critic_assessment"]["classification"] == "model_planning_error"
def test_plan_pauses_on_privilege_scope_review_instead_of_completing(tmp_path: Path) -> None:
    """A plan whose shell step hits the dpkg-lock error pauses for review.

    The engine is driven directly with permission and secret overrides, and
    the shell tool is a stub that always fails with exit code 100.
    """
    _write_config_tree(tmp_path)
    runtime = RuntimeController(base_dir=tmp_path)

    class FailingShellTool:
        """Shell stub that always reports the dpkg frontend lock error."""

        def execute(self, task: UserTask, args: dict[str, object]) -> ToolResult:
            return ToolResult(
                tool="shell_exec",
                ok=False,
                output="Error: Unable to acquire the dpkg frontend lock (/var/lib/dpkg/lock-frontend), are you root?",
                error="Command failed with exit code 100",
                metadata={"exit_code": 100},
            )

    runtime.tool_registry._tools["shell_exec"] = FailingShellTool()

    upgrade_step = {
        "id": "1",
        "tool": "shell_exec",
        "args": {"command": "sudo apt update && apt upgrade -y"},
        "depends_on": [],
    }
    outcome = runtime.execution_engine.execute(
        UserTask(input="обнови систему"),
        ExecutionDirective(type="plan", payload={"steps": [upgrade_step]}),
        permission_override=PermissionDecision(
            action_type="shell_command",
            pattern="apt",
            decision="allow_once",
        ),
        secret_override="secret",
    )

    assert outcome["status"] == "awaiting_review"
    assert outcome["result"]["review"]["diagnosis"]["type"] == "privilege_scope_error"
def test_sudo_auth_failure_requests_secret_retry_not_review(tmp_path: Path) -> None:
    """A failed sudo authentication asks for the password again.

    The shell stub returns a result flagged ``sudo_auth_failed``. The engine
    must answer with ``awaiting_input`` and a ``sudo_password`` secret
    request marked as a failed attempt.
    """
    _write_config_tree(tmp_path)
    runtime = RuntimeController(base_dir=tmp_path)

    class BadPasswordShellTool:
        """Shell stub that always reports an incorrect sudo password."""

        def execute(self, task: UserTask, args: dict[str, object]) -> ToolResult:
            return ToolResult(
                tool="shell_exec",
                ok=False,
                output="Sorry, try again.\nsudo: no password was provided\nsudo: 1 incorrect password attempt\n",
                error="Command failed with exit code 1",
                metadata={"exit_code": 1, "sudo_auth_failed": True},
            )

    runtime.tool_registry._tools["shell_exec"] = BadPasswordShellTool()

    upgrade_step = {
        "id": "1",
        "tool": "shell_exec",
        "args": {"command": "sudo apt update && apt upgrade -y"},
        "depends_on": [],
    }
    outcome = runtime.execution_engine.execute(
        UserTask(input="обнови систему"),
        ExecutionDirective(type="plan", payload={"steps": [upgrade_step]}),
        permission_override=PermissionDecision(
            action_type="shell_command",
            pattern="apt",
            decision="allow_once",
        ),
        secret_override="wrong",
    )

    assert outcome["status"] == "awaiting_input"
    payload = outcome["result"]
    assert payload["secret_request"]["kind"] == "sudo_password"
    assert payload["secret_request"]["prompt"] == "Sudo password incorrect. Try again"
    assert payload["attempt_failed"] is True
def test_runtime_keeps_secret_state_after_bad_sudo_password(tmp_path: Path) -> None:
    """After one rejected sudo password the task can still finish with a second.

    Flow: permission request -> ``allow_once`` -> secret request -> wrong
    secret (another secret request, flagged as failed) -> correct secret
    (completed with the stub's ``root`` output).
    """
    _write_config_tree(tmp_path)
    runtime = RuntimeController(base_dir=tmp_path)

    class RetryPasswordShellTool:
        """Shell stub: the first call is a sudo auth failure, later calls succeed."""

        def __init__(self) -> None:
            self.calls = 0

        def execute(self, task: UserTask, args: dict[str, object]) -> ToolResult:
            self.calls += 1
            if self.calls > 1:
                return ToolResult(
                    tool="shell_exec",
                    ok=True,
                    output="root\n",
                    metadata={"exit_code": 0},
                )
            return ToolResult(
                tool="shell_exec",
                ok=False,
                output="Sorry, try again.\nsudo: no password was provided\nsudo: 1 incorrect password attempt\n",
                error="Command failed with exit code 1",
                metadata={"exit_code": 1, "sudo_auth_failed": True},
            )

    runtime.tool_registry._tools["shell_exec"] = RetryPasswordShellTool()
    whoami_task = UserTask(
        input="кто root",
        context={
            "requested_tool": "shell_exec",
            "tool_args": {"command": "sudo whoami"},
        },
    )
    task_id = whoami_task.task_id

    first_response = runtime.handle_task(whoami_task)
    assert first_response["status"] == "awaiting_permission"

    after_grant = runtime.resolve_permission(task_id=task_id, decision="allow_once")
    assert after_grant["status"] == "awaiting_input"

    after_wrong = runtime.resolve_secret(task_id=task_id, secret="wrong")
    assert after_wrong["status"] == "awaiting_input"
    assert after_wrong["result"]["attempt_failed"] is True

    after_correct = runtime.resolve_secret(task_id=task_id, secret="correct")
    assert after_correct["status"] == "completed"
    assert after_correct["result"]["output"] == "root\n"
def test_permission_resolution_can_resume_task(tmp_path: Path) -> None:
    """Resolving a pending permission request with ``deny`` fails the task."""
    _write_config_tree(tmp_path)
    runtime = RuntimeController(base_dir=tmp_path)
    sudo_task = UserTask(
        input="запусти sudo apt update",
    )

    pending = runtime.handle_task(sudo_task)
    assert pending["status"] == "awaiting_permission"

    denied = runtime.resolve_permission(task_id=pending["task_id"], decision="deny")
    assert denied["status"] == "failed"
    assert denied["result"]["error"] == "Permission denied by user."
def test_sudo_permission_resolution_requests_secret_input(tmp_path: Path) -> None:
    """Granting permission for a sudo request leads to a sudo password prompt."""
    _write_config_tree(tmp_path)
    runtime = RuntimeController(base_dir=tmp_path)
    sudo_task = UserTask(input="запусти sudo apt update")

    pending = runtime.handle_task(sudo_task)
    assert pending["status"] == "awaiting_permission"

    after_grant = runtime.resolve_permission(
        task_id=pending["task_id"],
        decision="allow_once",
    )
    assert after_grant["status"] == "awaiting_input"
    assert after_grant["result"]["secret_request"]["kind"] == "sudo_password"
def test_implicit_sudo_command_requests_password(tmp_path: Path) -> None:
    """A bare ``apt`` command still triggers a sudo password request.

    ``apt list --upgradable`` does not start with ``sudo``, yet once
    permission is granted the runtime is expected to ask for the sudo
    password, just as it does for explicit ``sudo`` commands.
    """
    _write_config_tree(tmp_path)
    runtime = RuntimeController(base_dir=tmp_path)
    # The command carries no "sudo" prefix.
    check_updates = UserTask(
        input="проверь обновления",
        context={
            "requested_tool": "shell_exec",
            "tool_args": {"command": "apt list --upgradable"},
        },
    )

    pending = runtime.handle_task(check_updates)
    assert pending["status"] == "awaiting_permission"

    # Granting permission should be followed by a sudo password request.
    after_grant = runtime.resolve_permission(
        task_id=pending["task_id"],
        decision="allow_once",
    )
    assert after_grant["status"] == "awaiting_input"
    assert after_grant["result"]["secret_request"]["kind"] == "sudo_password"
def test_secret_resolution_continues_after_pending_secret_saved(tmp_path: Path) -> None:
    """Supplying a secret after the permission grant moves the task forward.

    The final assertions are deliberately loose: any of ``completed``,
    ``failed`` or ``awaiting_input`` is accepted, as long as the result
    carries an ``error`` or an ``output`` key.
    """
    _write_config_tree(tmp_path)
    runtime = RuntimeController(base_dir=tmp_path)

    pending = runtime.handle_task(UserTask(input="запусти sudo apt update"))
    assert pending["status"] == "awaiting_permission"
    task_id = pending["task_id"]

    after_grant = runtime.resolve_permission(task_id=task_id, decision="allow_once")
    assert after_grant["status"] == "awaiting_input"

    after_secret = runtime.resolve_secret(task_id=task_id, secret="wrongpass")
    assert after_secret["status"] in {"completed", "failed", "awaiting_input"}
    payload = after_secret["result"]
    assert "error" in payload or "output" in payload