# ducklm/tests/smoke/test_reflection.py
#
# File-viewer metadata carried over with the source: 228 lines, 7.1 KiB, Python.

import json
from unittest.mock import AsyncMock
import pytest
from duck_core.events.store import EventStore
from duck_core.experience.recorder import ExperienceRecorder
from duck_core.memory.policy import MemoryPolicy
from duck_core.memory.store import MemoryStore
from duck_core.model_client import ModelClient, ModelResponse
from duck_core.runtime_loop import RuntimeLoop
from duck_core.tasks.store import TaskStore
@pytest.fixture
def task_store(tmp_path):
    """Provide a TaskStore built from ``duck.sqlite3`` inside ``tmp_path``."""
    db_path = tmp_path / "duck.sqlite3"
    return TaskStore(str(db_path))
@pytest.fixture
def event_store(tmp_path):
    """Provide an EventStore built from ``duck.sqlite3`` inside ``tmp_path``."""
    db_path = tmp_path / "duck.sqlite3"
    return EventStore(str(db_path))
@pytest.fixture
def memory_store(tmp_path):
    """Provide a MemoryStore built from ``duck.sqlite3`` inside ``tmp_path``."""
    db_path = tmp_path / "duck.sqlite3"
    return MemoryStore(str(db_path))
@pytest.fixture
def experience_recorder(tmp_path):
    """Provide an ExperienceRecorder built from ``duck.sqlite3`` inside ``tmp_path``."""
    db_path = tmp_path / "duck.sqlite3"
    return ExperienceRecorder(str(db_path))
@pytest.fixture
def mock_model_client():
    """Provide a mocked ModelClient whose ``chat`` returns four scripted replies.

    The replies are consumed in order, one per ``chat`` call:

    1. an ``action`` reply carrying an action directive with no actions,
    2. a ``thinker`` reply carrying the final answer text,
    3. a ``critic`` reply carrying a memory-policy decision (do not store),
    4. a ``critic`` reply carrying the reflection text.
    """

    def scripted(role, content):
        # Every scripted reply shares the same model name, raw payload and latency;
        # only the role and the content differ.
        return ModelResponse(
            role=role,
            model="local-main",
            content=content,
            reasoning_content=None,
            raw={},
            latency_ms=1.0,
        )

    action_directive = {
        "kind": "action_directive",
        "intent": "answer directly",
        "risk_level": "none",
        "actions": [],
    }
    memory_decision = {
        "should_store": False,
        "memory_type": "event",
        "summary": "Routine answer, nothing to remember.",
        "importance": 0.1,
        "scope": "workspace",
        "metadata": {},
    }
    replies = [
        # First call: action role, empty action list.
        scripted("action", json.dumps(action_directive)),
        # Second call: thinker role, final answer.
        scripted("thinker", "DuckLM is a local cognitive runtime."),
        # Third call: memory policy decision.
        scripted("critic", json.dumps(memory_decision)),
        # Fourth call: critic role (reflection).
        scripted(
            "critic",
            "Task completed successfully. No issues found. Reusable lesson: direct answers work well for simple queries.",
        ),
    ]

    client = AsyncMock(spec=ModelClient)
    client.chat = AsyncMock(side_effect=replies)
    return client
@pytest.mark.asyncio
async def test_reflection_is_called_after_task_completion(
    task_store, event_store, memory_store, experience_recorder, mock_model_client
):
    """A completed chat run should leave one experience record and a reflection event."""
    policy = MemoryPolicy(model_client=mock_model_client)
    runtime = RuntimeLoop(
        task_store=task_store,
        event_store=event_store,
        model_client=mock_model_client,
        memory_policy=policy,
        memory_store=memory_store,
        experience_recorder=experience_recorder,
    )

    result = await runtime.run_chat("What is DuckLM?", workspace="/tmp/test")

    assert result.status == "completed"
    assert "DuckLM" in result.final_response

    # Check that reflection was called: exactly one experience record exists,
    # it belongs to this task, and its lesson carries the scripted critic text.
    records = await experience_recorder.list_records()
    assert len(records) == 1
    assert records[0].task_id == result.task_id
    assert "completed successfully" in records[0].reusable_lesson

    # Check that a reflection_completed event was recorded for this task.
    events = await event_store.list_events(result.task_id)
    event_types = [e.event_type for e in events]
    assert "reflection_completed" in event_types
@pytest.mark.asyncio
async def test_reflection_failure_does_not_break_task(
    task_store, event_store, memory_store, experience_recorder
):
    """A failing reflection call must not stop the task from completing."""

    def scripted(role, content):
        # Shared shape for the three successful model replies.
        return ModelResponse(
            role=role,
            model="local-main",
            content=content,
            reasoning_content=None,
            raw={},
            latency_ms=1.0,
        )

    empty_directive = json.dumps({
        "kind": "action_directive",
        "intent": "answer",
        "risk_level": "none",
        "actions": [],
    })
    skip_memory = json.dumps({
        "should_store": False,
        "memory_type": "event",
        "summary": "Routine.",
        "importance": 0.1,
        "scope": "workspace",
        "metadata": {},
    })

    client = AsyncMock(spec=ModelClient)
    client.chat = AsyncMock(
        side_effect=[
            scripted("action", empty_directive),  # Action: empty
            scripted("thinker", "Answer."),  # Thinker: answer
            scripted("critic", skip_memory),  # Memory policy
            # Critic (reflection): the mock raises this on the fourth call.
            ConnectionError("LLM unavailable"),
        ]
    )

    runtime = RuntimeLoop(
        task_store=task_store,
        event_store=event_store,
        model_client=client,
        memory_policy=MemoryPolicy(model_client=client),
        memory_store=memory_store,
        experience_recorder=experience_recorder,
    )

    outcome = await runtime.run_chat("test", workspace="/tmp/test")

    # The task itself still completes.
    assert outcome.status == "completed"

    # The reflection failure is recorded as an event on the task.
    recorded = await event_store.list_events(outcome.task_id)
    seen_types = [entry.event_type for entry in recorded]
    assert "reflection_failed" in seen_types
@pytest.mark.asyncio
async def test_reflection_not_called_when_disabled(
    task_store, event_store, memory_store, mock_model_client
):
    """With ``reflect=False`` the run completes without a reflection model call."""
    runtime = RuntimeLoop(
        task_store=task_store,
        event_store=event_store,
        model_client=mock_model_client,
        memory_policy=MemoryPolicy(model_client=mock_model_client),
        memory_store=memory_store,
    )

    outcome = await runtime.run_chat(
        "What is DuckLM?", workspace="/tmp/test", reflect=False
    )

    assert outcome.status == "completed"

    # Three chat calls are expected (action, thinker, memory policy);
    # a fourth would mean the critic/reflection call was made.
    chat_calls = mock_model_client.chat.call_count
    assert chat_calls == 3