# ducklm/tests/smoke/test_runtime_tools.py

import json

import pytest

from duck_core.events.store import EventStore
from duck_core.model_client import ModelResponse
from duck_core.approvals.service import ApprovalService
from duck_core.runtime_loop import RuntimeLoop
from duck_core.tasks.store import TaskStore


class FakeToolModelClient:
    """Scripted model client that drives a single ``file_read`` round trip.

    For the "action" role it requests ``file_read`` on ``note.txt`` until some
    message mentions ``tool_observations``; after that it returns an empty
    action list. Every other role must be "thinker" and must already see
    ``tool_observations`` in its messages.
    """

    async def chat(self, role, messages):
        """Return the canned ``ModelResponse`` for ``role``."""
        if role != "action":
            assert role == "thinker"
            assert any("tool_observations" in message["content"] for message in messages)
            return ModelResponse(
                role=role,
                model="local-main",
                content="The file says: hello from tool",
                reasoning_content="used file_read",
                raw={},
                latency_ms=12.0,
            )

        already_observed = any("tool_observations" in entry["content"] for entry in messages)
        read_note = {
            "tool": "file_read",
            "args": {"path": "note.txt"},
            "reason": "User asked for file contents",
        }
        directive = {
            "kind": "action_directive",
            "intent": "read requested file",
            "risk_level": "low",
            # Stop asking for tools once an observation is in the context.
            "actions": [] if already_observed else [read_note],
        }
        return ModelResponse(
            role=role,
            model="local-main",
            content=json.dumps(directive),
            reasoning_content=None,
            raw={},
            latency_ms=5.0,
        )


class FakeMultiStepToolModelClient:
    """Scripted model client that walks through two tool steps.

    The "action" role first asks for ``list_dir`` on ``.``; once the joined
    messages mention ``tool_observations`` and ``README.md`` but not yet
    ``readme contents`` it asks for ``file_read`` on ``README.md``; otherwise
    it returns no actions. The "thinker" role asserts that both tool names and
    the README text are present before answering.
    """

    async def chat(self, role, messages):
        """Return the canned ``ModelResponse`` for ``role``."""
        if role != "action":
            assert role == "thinker"
            transcript = "\n".join(message["content"] for message in messages)
            assert "list_dir" in transcript
            assert "file_read" in transcript
            assert "readme contents" in transcript
            return ModelResponse(
                role=role,
                model="local-main",
                content="Readme inspected",
                reasoning_content=None,
                raw={},
                latency_ms=12.0,
            )

        transcript = "\n".join(entry["content"] for entry in messages)
        if "tool_observations" not in transcript:
            step = ("list_dir", ".", "Find available files")
        elif "README.md" in transcript and "readme contents" not in transcript:
            step = ("file_read", "README.md", "Read discovered README")
        else:
            step = None

        if step is None:
            planned = []
        else:
            tool_name, target, why = step
            planned = [{"tool": tool_name, "args": {"path": target}, "reason": why}]

        directive = {
            "kind": "action_directive",
            "intent": "multi-step file inspection",
            "risk_level": "low",
            "actions": planned,
        }
        return ModelResponse(
            role=role,
            model="local-main",
            content=json.dumps(directive),
            reasoning_content=None,
            raw={},
            latency_ms=5.0,
        )


class FakeUpdateCheckModelClient:
    """Scripted model client that requests one ``apt list --upgradable`` run.

    The "action" role emits a ``shell_exec_safe`` action until a message
    mentions ``tool_observations``, then an empty action list. The "thinker"
    role asserts the command appears in the joined messages and that
    ``requires_approval`` does not.
    """

    async def chat(self, role, messages):
        """Return the canned ``ModelResponse`` for ``role``."""
        if role != "action":
            assert role == "thinker"
            transcript = "\n".join(message["content"] for message in messages)
            assert "apt list --upgradable" in transcript
            assert "requires_approval" not in transcript
            return ModelResponse(
                role=role,
                model="local-main",
                content="Updates checked without approval loop.",
                reasoning_content=None,
                raw={},
                latency_ms=12.0,
            )

        seen_observations = any("tool_observations" in entry["content"] for entry in messages)
        update_check = {
            "tool": "shell_exec_safe",
            "args": {"command": "apt list --upgradable"},
            "reason": "Check OS updates",
        }
        directive = {
            "kind": "action_directive",
            "intent": "check system updates",
            "risk_level": "low",
            "actions": [] if seen_observations else [update_check],
        }
        return ModelResponse(
            role=role,
            model="local-main",
            content=json.dumps(directive),
            reasoning_content=None,
            raw={},
            latency_ms=5.0,
        )


class FakeMalformedActionModelClient:
    """Scripted model client whose action directive omits ``args``.

    The "action" role always returns a ``file_read`` action with no ``args``
    key; any other role must be "thinker" and gets a fixed plain answer.
    """

    async def chat(self, role, messages):
        """Return the canned ``ModelResponse`` for ``role``."""
        if role != "action":
            assert role == "thinker"
            return ModelResponse(
                role=role,
                model="local-main",
                content="Answered without tool execution.",
                reasoning_content=None,
                raw={},
                latency_ms=12.0,
            )

        # Deliberately incomplete: the action carries no "args" entry.
        broken_action = {
            "tool": "file_read",
            "reason": "Missing args must fail schema validation",
        }
        directive = {
            "kind": "action_directive",
            "intent": "broken action",
            "risk_level": "low",
            "actions": [broken_action],
        }
        return ModelResponse(
            role=role,
            model="local-main",
            content=json.dumps(directive),
            reasoning_content=None,
            raw={},
            latency_ms=5.0,
        )


class FakeRepeatingActionModelClient:
    """Scripted model client that repeats the same ``file_read`` action.

    The "action" role returns an identical ``file_read`` of ``note.txt`` on
    every call, regardless of the messages; any other role must be "thinker"
    and gets a fixed final answer.
    """

    async def chat(self, role, messages):
        """Return the canned ``ModelResponse`` for ``role``."""
        if role != "action":
            assert role == "thinker"
            return ModelResponse(
                role=role,
                model="local-main",
                content="Final answer from first observation.",
                reasoning_content=None,
                raw={},
                latency_ms=12.0,
            )

        same_action = {
            "tool": "file_read",
            "args": {"path": "note.txt"},
            "reason": "Read requested file",
        }
        directive = {
            "kind": "action_directive",
            "intent": "repeat same action",
            "risk_level": "low",
            "actions": [same_action],
        }
        return ModelResponse(
            role=role,
            model="local-main",
            content=json.dumps(directive),
            reasoning_content=None,
            raw={},
            latency_ms=5.0,
        )


class FakeUnknownToolActionModelClient:
    """Scripted model client that names a tool called ``answer``.

    The "action" role always returns one action whose tool is ``answer``; any
    other role must be "thinker" and gets a fixed plain answer.
    """

    async def chat(self, role, messages):
        """Return the canned ``ModelResponse`` for ``role``."""
        if role != "action":
            assert role == "thinker"
            return ModelResponse(
                role=role,
                model="local-main",
                content="Answered normally without unknown tool execution.",
                reasoning_content=None,
                raw={},
                latency_ms=12.0,
            )

        bogus_action = {
            "tool": "answer",
            "args": {"text": "This is not a real tool."},
            "reason": "Model attempted to answer as a tool",
        }
        directive = {
            "kind": "action_directive",
            "intent": "answer from context",
            "risk_level": "low",
            "actions": [bogus_action],
        }
        return ModelResponse(
            role=role,
            model="local-main",
            content=json.dumps(directive),
            reasoning_content=None,
            raw={},
            latency_ms=5.0,
        )


class FakeDirectMemoryQuestionModelClient:
    """Scripted model client that fails if the "action" role is ever used.

    Every requested role is appended to ``roles`` so a test can check which
    roles were called. The "thinker" role asserts that some message contains
    ``Known memory`` before returning a fixed answer.
    """

    def __init__(self):
        # Roles passed to chat(), in call order.
        self.roles = []

    async def chat(self, role, messages, **kwargs):
        """Record ``role`` and return the canned thinker ``ModelResponse``."""
        self.roles.append(role)
        if role == "action":
            raise AssertionError("direct memory question should skip action role")
        assert role == "thinker"
        memory_visible = any("Known memory" in entry["content"] for entry in messages)
        assert memory_visible
        return ModelResponse(
            role=role,
            model="local-main",
            content="Вас зовут Владимир.",
            reasoning_content=None,
            raw={},
            latency_ms=12.0,
        )


@pytest.mark.asyncio
async def test_runtime_executes_action_directive_tool_and_finishes_with_observation(tmp_path):
    """Run one chat that reads ``note.txt`` through the ``file_read`` tool.

    Checks that the task completes with the thinker's answer, that
    ``action_directive`` and ``tool_call_started`` events were recorded, and
    that the ``tool_call_finished`` payload reports a successful ``file_read``
    whose output is the file's text.
    """
    (tmp_path / "note.txt").write_text("hello from tool")
    db_path = str(tmp_path / "duck.sqlite3")
    task_store = TaskStore(db_path)
    event_store = EventStore(db_path)
    loop = RuntimeLoop(task_store, event_store, FakeToolModelClient())

    result = await loop.run_chat("read note.txt", str(tmp_path), debug=True)
    events = await event_store.list_events(result.task_id)
    event_types = [event.event_type for event in events]
    # Use a default: a bare next() raising StopIteration inside a coroutine is
    # re-raised as an opaque RuntimeError instead of a readable test failure.
    tool_finished = next(
        (event for event in events if event.event_type == "tool_call_finished"),
        None,
    )

    assert result.status == "completed"
    assert result.final_response == "The file says: hello from tool"
    assert "action_directive" in event_types
    assert "tool_call_started" in event_types
    assert tool_finished is not None, f"no tool_call_finished event; saw {event_types}"
    assert tool_finished.payload["tool"] == "file_read"
    assert tool_finished.payload["result"]["ok"] is True
    assert tool_finished.payload["result"]["output"] == "hello from tool"


@pytest.mark.asyncio
async def test_runtime_runs_multiple_tool_steps_before_final_answer(tmp_path):
    """Check that ``list_dir`` then ``file_read`` both finish before the answer."""
    (tmp_path / "README.md").write_text("readme contents")
    database = str(tmp_path / "duck.sqlite3")
    tasks = TaskStore(database)
    recorder = EventStore(database)
    runtime = RuntimeLoop(tasks, recorder, FakeMultiStepToolModelClient())

    outcome = await runtime.run_chat("inspect the workspace readme", str(tmp_path), debug=True)
    history = await recorder.list_events(outcome.task_id)

    # Collect finished tool names in the order the events were recorded.
    finished_tools = []
    for entry in history:
        if entry.event_type == "tool_call_finished":
            finished_tools.append(entry.payload["tool"])

    assert outcome.status == "completed"
    assert outcome.final_response == "Readme inspected"
    assert finished_tools == ["list_dir", "file_read"]


@pytest.mark.asyncio
async def test_runtime_checks_system_updates_without_approval_loop(tmp_path):
    """Check that the update-check command finishes with no approval request."""
    database = str(tmp_path / "duck.sqlite3")
    tasks = TaskStore(database)
    recorder = EventStore(database)
    runtime = RuntimeLoop(tasks, recorder, FakeUpdateCheckModelClient())

    outcome = await runtime.run_chat(
        "Привет. Проверь обновления в системе",
        str(tmp_path),
        debug=True,
    )
    history = await recorder.list_events(outcome.task_id)

    assert outcome.status == "completed"
    assert not any(entry.event_type == "tool_approval_requested" for entry in history)
    assert any(
        entry.event_type == "tool_call_finished" and entry.payload["tool"] == "shell_exec_safe"
        for entry in history
    )


@pytest.mark.asyncio
async def test_runtime_rejects_malformed_action_directive_before_tools(tmp_path):
    """Run a chat whose action directive has a ``file_read`` action without ``args``.

    Checks that the task still completes, that an ``action_directive_failed``
    event mentions ``schema violation``, and that no ``tool_call_started``
    event was recorded.
    """
    (tmp_path / "note.txt").write_text("hello")
    db_path = str(tmp_path / "duck.sqlite3")
    task_store = TaskStore(db_path)
    event_store = EventStore(db_path)
    loop = RuntimeLoop(task_store, event_store, FakeMalformedActionModelClient())

    result = await loop.run_chat("read note.txt", str(tmp_path), debug=True)
    events = await event_store.list_events(result.task_id)
    # Use a default: a bare next() raising StopIteration inside a coroutine is
    # re-raised as an opaque RuntimeError instead of a readable test failure.
    failed = next(
        (event for event in events if event.event_type == "action_directive_failed"),
        None,
    )

    assert result.status == "completed"
    assert failed is not None, (
        f"no action_directive_failed event; saw {[event.event_type for event in events]}"
    )
    assert "schema violation" in failed.payload["error"]
    assert not any(event.event_type == "tool_call_started" for event in events)


def test_runtime_compacts_large_tool_observations_for_model_context(tmp_path):
    """Check that an oversized tool output is shortened but keeps its tail."""
    database = str(tmp_path / "duck.sqlite3")
    tasks = TaskStore(database)
    recorder = EventStore(database)
    runtime = RuntimeLoop(tasks, recorder, FakeToolModelClient())

    # 2500 filler characters followed by a marker that must remain visible.
    oversized_output = "A" * 2500 + "KEEP_TAIL"
    observation = {
        "tool": "shell_exec_safe",
        "result": {
            "ok": True,
            "output": oversized_output,
            "metadata": {"command": "ls /tmp"},
        },
    }

    compact = runtime.format_tool_observations_for_model([observation])

    assert "tool_observations" in compact
    assert "truncated" in compact
    assert "KEEP_TAIL" in compact
    assert len(compact) < 2300


@pytest.mark.asyncio
async def test_runtime_skips_duplicate_action_within_same_task(tmp_path):
    """Check that a repeated action finishes once and is then skipped once."""
    (tmp_path / "note.txt").write_text("hello once")
    database = str(tmp_path / "duck.sqlite3")
    tasks = TaskStore(database)
    recorder = EventStore(database)
    runtime = RuntimeLoop(tasks, recorder, FakeRepeatingActionModelClient())

    outcome = await runtime.run_chat("read note.txt", str(tmp_path), debug=True)
    history = await recorder.list_events(outcome.task_id)

    # Split the tool events by type in a single pass over the history.
    finished_tools, skipped_tools = [], []
    for entry in history:
        if entry.event_type == "tool_call_finished":
            finished_tools.append(entry)
        elif entry.event_type == "tool_call_skipped":
            skipped_tools.append(entry)

    assert outcome.status == "completed"
    assert len(finished_tools) == 1
    assert len(skipped_tools) == 1
    assert skipped_tools[0].payload["reason"] == "duplicate_action"


@pytest.mark.asyncio
async def test_runtime_skips_unknown_action_tools_before_gateway(tmp_path):
    """Check that an action naming the tool ``answer`` is skipped, not started."""
    database = str(tmp_path / "duck.sqlite3")
    tasks = TaskStore(database)
    recorder = EventStore(database)
    runtime = RuntimeLoop(tasks, recorder, FakeUnknownToolActionModelClient())

    outcome = await runtime.run_chat("answer from known context", str(tmp_path), debug=True)
    history = await recorder.list_events(outcome.task_id)
    skipped_tools = list(
        filter(lambda entry: entry.event_type == "tool_call_skipped", history)
    )

    assert outcome.status == "completed"
    assert outcome.final_response == "Answered normally without unknown tool execution."
    assert len(skipped_tools) == 1
    only_skip = skipped_tools[0].payload
    assert only_skip["reason"] == "unknown_tool"
    assert only_skip["tool"] == "answer"
    assert not any(entry.event_type == "tool_call_started" for entry in history)


@pytest.mark.asyncio
async def test_runtime_skips_action_loop_for_direct_memory_question(tmp_path):
    """Check that ``skip_action_loop=True`` leads to a thinker-only run."""
    database = str(tmp_path / "duck.sqlite3")
    tasks = TaskStore(database)
    recorder = EventStore(database)
    fake_model = FakeDirectMemoryQuestionModelClient()
    runtime = RuntimeLoop(tasks, recorder, fake_model)

    remembered = [{"text": "Known memory: user's name is Vladimir."}]
    outcome = await runtime.run_chat(
        "Как меня зовут? Ответь коротко.",
        str(tmp_path),
        debug=True,
        memory_records=remembered,
        skip_action_loop=True,
        reflect=False,
    )
    history = await recorder.list_events(outcome.task_id)

    assert outcome.status == "completed"
    assert outcome.final_response == "Вас зовут Владимир."
    assert fake_model.roles == ["thinker"]
    assert any(entry.event_type == "action_loop_skipped" for entry in history)
    assert not any(
        entry.event_type == "model_call_started" and entry.payload["role"] == "action"
        for entry in history
    )


class FakeApprovalModelClient:
    """Scripted model client that only ever serves the "action" role.

    It requests ``shell_exec_safe`` with ``uname -a`` until a message mentions
    ``tool_observations``, then returns no actions. Any other role raises
    ``AssertionError``.
    """

    async def chat(self, role, messages):
        """Return the canned action ``ModelResponse`` or raise for other roles."""
        if role != "action":
            raise AssertionError("thinker must not be called while approval is pending")

        seen_observations = any("tool_observations" in entry["content"] for entry in messages)
        run_uname = {
            "tool": "shell_exec_safe",
            "args": {"command": "uname -a"},
            "reason": "User requested system information",
        }
        directive = {
            "kind": "action_directive",
            "intent": "run command",
            "risk_level": "medium",
            "actions": [] if seen_observations else [run_uname],
        }
        return ModelResponse(
            role=role,
            model="local-main",
            content=json.dumps(directive),
            reasoning_content=None,
            raw={},
            latency_ms=5.0,
        )


@pytest.mark.asyncio
async def test_runtime_creates_pending_approval_when_tool_requires_it(tmp_path):
    """Check that the run stops in ``waiting_for_approval`` with a pending entry."""
    database = str(tmp_path / "duck.sqlite3")
    tasks = TaskStore(database)
    recorder = EventStore(database)
    approval_service = ApprovalService(database)
    runtime = RuntimeLoop(
        tasks,
        recorder,
        FakeApprovalModelClient(),
        approval_service=approval_service,
    )

    outcome = await runtime.run_chat("run uname", str(tmp_path), debug=True)
    waiting = await approval_service.pending()
    history = await recorder.list_events(outcome.task_id)

    assert outcome.status == "waiting_for_approval"
    first_request = waiting[0]
    assert first_request.task_id == outcome.task_id
    assert first_request.normalized_action["tool"] == "shell_exec_safe"
    assert any(entry.event_type == "tool_approval_requested" for entry in history)


class FakeApprovalContinuationModelClient:
    """Scripted model client for an approve-then-answer flow.

    The "action" role requests ``shell_exec_safe`` with ``uname -a`` until a
    message mentions ``tool_observations``, then returns no actions. The
    "thinker" role stores the messages it received in ``thinker_messages`` and
    asserts that ``tool_observations`` is among them.
    """

    def __init__(self):
        # Messages from the most recent thinker call.
        self.thinker_messages = []

    async def chat(self, role, messages):
        """Return the canned ``ModelResponse`` for ``role``."""
        if role != "action":
            assert role == "thinker"
            # Capture before asserting so the messages remain inspectable.
            self.thinker_messages = messages
            assert any("tool_observations" in message["content"] for message in messages)
            return ModelResponse(
                role=role,
                model="local-main",
                content="uname completed",
                reasoning_content="used approved shell command",
                raw={},
                latency_ms=10.0,
            )

        seen_observations = any("tool_observations" in entry["content"] for entry in messages)
        run_uname = {
            "tool": "shell_exec_safe",
            "args": {"command": "uname -a"},
            "reason": "User requested system information",
        }
        directive = {
            "kind": "action_directive",
            "intent": "run command",
            "risk_level": "medium",
            "actions": [] if seen_observations else [run_uname],
        }
        return ModelResponse(
            role=role,
            model="local-main",
            content=json.dumps(directive),
            reasoning_content=None,
            raw={},
            latency_ms=5.0,
        )


class FakeApprovalThenSecondToolModelClient:
    """Scripted model client for a shell command followed by a file read.

    The "action" role requests ``shell_exec_safe`` with ``uname -a`` while the
    joined messages lack ``tool_observations``; then ``file_read`` on
    ``second.txt`` until ``second step content`` appears; then no actions. The
    "thinker" role asserts both tool names and the file text are present.
    """

    async def chat(self, role, messages):
        """Return the canned ``ModelResponse`` for ``role``."""
        # Joined up front, before the role is inspected.
        transcript = "\n".join(entry["content"] for entry in messages)

        if role != "action":
            assert role == "thinker"
            assert "shell_exec_safe" in transcript
            assert "file_read" in transcript
            assert "second step content" in transcript
            return ModelResponse(
                role=role,
                model="local-main",
                content="approved command and second tool completed",
                reasoning_content=None,
                raw={},
                latency_ms=10.0,
            )

        if "tool_observations" not in transcript:
            planned = [
                {
                    "tool": "shell_exec_safe",
                    "args": {"command": "uname -a"},
                    "reason": "User requested system information",
                }
            ]
        elif "second step content" in transcript:
            planned = []
        else:
            planned = [
                {
                    "tool": "file_read",
                    "args": {"path": "second.txt"},
                    "reason": "Read follow-up file after approved command",
                }
            ]

        directive = {
            "kind": "action_directive",
            "intent": "approval then follow-up",
            "risk_level": "medium",
            "actions": planned,
        }
        return ModelResponse(
            role=role,
            model="local-main",
            content=json.dumps(directive),
            reasoning_content=None,
            raw={},
            latency_ms=5.0,
        )


@pytest.mark.asyncio
async def test_runtime_continues_after_approved_tool_call(tmp_path):
    """Approve a pending ``shell_exec_safe`` call once and resume the task.

    Checks that the resumed task completes with the thinker's answer, that the
    ``tool_call_finished`` payload reports a successful ``shell_exec_safe``
    whose metadata command contains ``uname``, and that a ``task_completed``
    event was recorded.
    """
    db_path = str(tmp_path / "duck.sqlite3")
    task_store = TaskStore(db_path)
    event_store = EventStore(db_path)
    approvals = ApprovalService(db_path)
    model_client = FakeApprovalContinuationModelClient()
    loop = RuntimeLoop(task_store, event_store, model_client, approval_service=approvals)

    pending_result = await loop.run_chat("run uname", str(tmp_path), debug=True)
    pending = await approvals.pending()
    # Fail with a clear message rather than an IndexError on pending[0].
    assert pending, "expected a pending approval after the first run"
    await approvals.allow_once(pending[0].approval_id)

    result = await loop.continue_after_approval(pending_result.task_id, pending[0].approval_id)
    events = await event_store.list_events(result.task_id)
    # Use a default: a bare next() raising StopIteration inside a coroutine is
    # re-raised as an opaque RuntimeError instead of a readable test failure.
    finished = next(
        (event for event in events if event.event_type == "tool_call_finished"),
        None,
    )

    assert result.status == "completed"
    assert result.final_response == "uname completed"
    assert finished is not None, (
        f"no tool_call_finished event; saw {[event.event_type for event in events]}"
    )
    assert finished.payload["tool"] == "shell_exec_safe"
    assert finished.payload["result"]["ok"] is True
    assert "uname" in finished.payload["result"]["metadata"]["command"]
    assert any(event.event_type == "task_completed" for event in events)


@pytest.mark.asyncio
async def test_runtime_can_run_followup_tool_after_approval(tmp_path):
    """Check that ``file_read`` finishes after the approved ``shell_exec_safe``."""
    (tmp_path / "second.txt").write_text("second step content")
    database = str(tmp_path / "duck.sqlite3")
    tasks = TaskStore(database)
    recorder = EventStore(database)
    approval_service = ApprovalService(database)
    fake_model = FakeApprovalThenSecondToolModelClient()
    runtime = RuntimeLoop(tasks, recorder, fake_model, approval_service=approval_service)

    paused = await runtime.run_chat(
        "run uname then inspect second file",
        str(tmp_path),
        debug=True,
    )
    waiting = await approval_service.pending()
    await approval_service.allow_once(waiting[0].approval_id)

    outcome = await runtime.continue_after_approval(paused.task_id, waiting[0].approval_id)
    history = await recorder.list_events(outcome.task_id)

    # Collect finished tool names in the order the events were recorded.
    finished_tools = []
    for entry in history:
        if entry.event_type == "tool_call_finished":
            finished_tools.append(entry.payload["tool"])

    assert outcome.status == "completed"
    assert finished_tools == ["shell_exec_safe", "file_read"]


@pytest.mark.asyncio
async def test_runtime_continues_after_denied_tool_call_without_execution(tmp_path):
    """Deny a pending ``shell_exec_safe`` call and resume the task.

    Checks that the resumed task completes and that the ``tool_call_finished``
    payload reports a failed result whose metadata decision is ``deny`` and
    whose error text mentions "denied".
    """
    db_path = str(tmp_path / "duck.sqlite3")
    task_store = TaskStore(db_path)
    event_store = EventStore(db_path)
    approvals = ApprovalService(db_path)
    model_client = FakeApprovalContinuationModelClient()
    loop = RuntimeLoop(task_store, event_store, model_client, approval_service=approvals)

    pending_result = await loop.run_chat("run uname", str(tmp_path), debug=True)
    pending = await approvals.pending()
    # Fail with a clear message rather than an IndexError on pending[0].
    assert pending, "expected a pending approval after the first run"
    await approvals.deny(pending[0].approval_id)

    result = await loop.continue_after_approval(pending_result.task_id, pending[0].approval_id)
    events = await event_store.list_events(result.task_id)
    # Use a default: a bare next() raising StopIteration inside a coroutine is
    # re-raised as an opaque RuntimeError instead of a readable test failure.
    finished = next(
        (event for event in events if event.event_type == "tool_call_finished"),
        None,
    )

    assert result.status == "completed"
    assert finished is not None, (
        f"no tool_call_finished event; saw {[event.event_type for event in events]}"
    )
    assert finished.payload["result"]["ok"] is False
    assert finished.payload["result"]["metadata"]["decision"] == "deny"
    assert "denied" in finished.payload["result"]["error"].lower()


@pytest.mark.asyncio
async def test_runtime_reuses_allow_forever_for_matching_action(tmp_path):
    """Check that a second run needs no approval after ``allow_forever``."""
    database = str(tmp_path / "duck.sqlite3")
    tasks = TaskStore(database)
    recorder = EventStore(database)
    approval_service = ApprovalService(database)
    fake_model = FakeApprovalContinuationModelClient()
    runtime = RuntimeLoop(tasks, recorder, fake_model, approval_service=approval_service)

    # First run: pauses for approval, which is then granted permanently.
    initial = await runtime.run_chat("run uname", str(tmp_path), debug=True)
    waiting = await approval_service.pending()
    granted_id = waiting[0].approval_id
    await approval_service.allow_forever(granted_id)
    await runtime.continue_after_approval(initial.task_id, granted_id)

    # Second run: the same command is requested again.
    repeat = await runtime.run_chat("run uname again", str(tmp_path), debug=True)
    repeat_history = await recorder.list_events(repeat.task_id)

    assert repeat.status == "completed"
    assert repeat.final_response == "uname completed"
    assert not any(entry.event_type == "tool_approval_requested" for entry in repeat_history)
    assert any(entry.event_type == "tool_call_finished" for entry in repeat_history)