# ducklm/tests/smoke/test_reflection.py

import json
from unittest.mock import AsyncMock

import pytest

from duck_core.events.store import EventStore
from duck_core.experience.recorder import ExperienceRecorder
from duck_core.memory.policy import MemoryPolicy
from duck_core.memory.store import MemoryStore
from duck_core.model_client import ModelClient, ModelResponse
from duck_core.runtime_loop import RuntimeLoop
from duck_core.tasks.store import TaskStore


@pytest.fixture
def task_store(tmp_path):
    """Provide a ``TaskStore`` pointed at ``duck.sqlite3`` inside the per-test temp dir."""
    db_file = tmp_path / "duck.sqlite3"
    return TaskStore(str(db_file))


@pytest.fixture
def event_store(tmp_path):
    """Provide an ``EventStore`` pointed at ``duck.sqlite3`` inside the per-test temp dir."""
    db_file = tmp_path / "duck.sqlite3"
    return EventStore(str(db_file))


@pytest.fixture
def memory_store(tmp_path):
    """Provide a ``MemoryStore`` pointed at ``duck.sqlite3`` inside the per-test temp dir."""
    db_file = tmp_path / "duck.sqlite3"
    return MemoryStore(str(db_file))


@pytest.fixture
def experience_recorder(tmp_path):
    """Provide an ``ExperienceRecorder`` pointed at ``duck.sqlite3`` inside the per-test temp dir."""
    db_file = tmp_path / "duck.sqlite3"
    return ExperienceRecorder(str(db_file))


@pytest.fixture
def mock_model_client():
    """Return a ``ModelClient`` mock whose ``chat`` replays four scripted responses.

    Each ``chat`` call consumes the next response, in this order:
    an action directive with no actions, a thinker answer, a memory-policy
    decision that declines to store anything, and a critic reflection text.
    """
    # 1st call — action role: directive with an empty action list.
    action_reply = ModelResponse(
        role="action",
        model="local-main",
        content=json.dumps({
            "kind": "action_directive",
            "intent": "answer directly",
            "risk_level": "none",
            "actions": [],
        }),
        reasoning_content=None,
        raw={},
        latency_ms=1.0,
    )

    # 2nd call — thinker role: the final answer text.
    thinker_reply = ModelResponse(
        role="thinker",
        model="local-main",
        content="DuckLM is a local cognitive runtime.",
        reasoning_content=None,
        raw={},
        latency_ms=1.0,
    )

    # 3rd call — memory-policy decision (the response is tagged role="critic").
    memory_reply = ModelResponse(
        role="critic",
        model="local-main",
        content=json.dumps({
            "should_store": False,
            "memory_type": "event",
            "summary": "Routine answer, nothing to remember.",
            "importance": 0.1,
            "scope": "workspace",
            "metadata": {},
        }),
        reasoning_content=None,
        raw={},
        latency_ms=1.0,
    )

    # 4th call — critic role: free-text reflection on the finished task.
    reflection_reply = ModelResponse(
        role="critic",
        model="local-main",
        content="Task completed successfully. No issues found. Reusable lesson: direct answers work well for simple queries.",
        reasoning_content=None,
        raw={},
        latency_ms=1.0,
    )

    scripted_replies = [action_reply, thinker_reply, memory_reply, reflection_reply]

    client = AsyncMock(spec=ModelClient)
    client.chat = AsyncMock(side_effect=scripted_replies)
    return client


@pytest.mark.asyncio
async def test_reflection_is_called_after_task_completion(
    task_store, event_store, memory_store, experience_recorder, mock_model_client
):
    """A completed chat run should trigger reflection and persist its outcome.

    Checks that exactly one experience record is written for the task, that
    its ``reusable_lesson`` carries the scripted critic text, and that a
    ``reflection_completed`` event is recorded for the task.
    """
    policy = MemoryPolicy(model_client=mock_model_client)
    runtime = RuntimeLoop(
        task_store=task_store,
        event_store=event_store,
        model_client=mock_model_client,
        memory_policy=policy,
        memory_store=memory_store,
        experience_recorder=experience_recorder,
    )

    result = await runtime.run_chat("What is DuckLM?", workspace="/tmp/test")

    assert result.status == "completed"
    assert "DuckLM" in result.final_response

    # Reflection ran: a single experience record tied to this task exists.
    records = await experience_recorder.list_records()
    assert len(records) == 1
    assert records[0].task_id == result.task_id
    assert "completed successfully" in records[0].reusable_lesson

    # The reflection_completed event was recorded for this task.
    events = await event_store.list_events(result.task_id)
    event_types = [e.event_type for e in events]
    assert "reflection_completed" in event_types


@pytest.mark.asyncio
async def test_reflection_failure_does_not_break_task(
    task_store, event_store, memory_store, experience_recorder
):
    """A failing reflection call must not stop the task from completing.

    The mocked ``chat`` replays three normal responses and then raises
    ``ConnectionError`` on the fourth (reflection) call.
    """
    client = AsyncMock(spec=ModelClient)

    # Action step: directive with no actions.
    empty_action = ModelResponse(
        role="action",
        model="local-main",
        content=json.dumps({
            "kind": "action_directive",
            "intent": "answer",
            "risk_level": "none",
            "actions": [],
        }),
        reasoning_content=None,
        raw={},
        latency_ms=1.0,
    )

    # Thinker step: the final answer.
    answer = ModelResponse(
        role="thinker",
        model="local-main",
        content="Answer.",
        reasoning_content=None,
        raw={},
        latency_ms=1.0,
    )

    # Memory-policy step: decline to store anything.
    memory_verdict = ModelResponse(
        role="critic",
        model="local-main",
        content=json.dumps({
            "should_store": False,
            "memory_type": "event",
            "summary": "Routine.",
            "importance": 0.1,
            "scope": "workspace",
            "metadata": {},
        }),
        reasoning_content=None,
        raw={},
        latency_ms=1.0,
    )

    # Reflection step: an exception instance in side_effect is raised by the mock.
    reflection_error = ConnectionError("LLM unavailable")

    client.chat = AsyncMock(
        side_effect=[empty_action, answer, memory_verdict, reflection_error]
    )

    runtime = RuntimeLoop(
        task_store=task_store,
        event_store=event_store,
        model_client=client,
        memory_policy=MemoryPolicy(model_client=client),
        memory_store=memory_store,
        experience_recorder=experience_recorder,
    )

    outcome = await runtime.run_chat("test", workspace="/tmp/test")

    # The task itself still finishes normally.
    assert outcome.status == "completed"

    # The failure is surfaced as a reflection_failed event on the task.
    logged = await event_store.list_events(outcome.task_id)
    logged_types = [evt.event_type for evt in logged]
    assert "reflection_failed" in logged_types


@pytest.mark.asyncio
async def test_reflection_not_called_when_disabled(
    task_store, event_store, memory_store, mock_model_client
):
    """With ``reflect=False`` the run must finish without a reflection model call."""
    runtime = RuntimeLoop(
        task_store=task_store,
        event_store=event_store,
        model_client=mock_model_client,
        memory_policy=MemoryPolicy(model_client=mock_model_client),
        memory_store=memory_store,
    )

    outcome = await runtime.run_chat(
        "What is DuckLM?", workspace="/tmp/test", reflect=False
    )

    assert outcome.status == "completed"
    # Expect exactly three chat calls (action, thinker, memory policy);
    # a fourth would be the critic/reflection call, which must be skipped.
    assert mock_model_client.chat.call_count == 3