Vendored deer-flow upstream (bytedance/deer-flow) plus prompt-injection hardening: - New deerflow.security package: content_delimiter, html_cleaner, sanitizer (8 layers — invisible chars, control chars, symbols, NFC, PUA, tag chars, horizontal whitespace collapse with newline/tab preservation, length cap) - New deerflow.community.searx package: web_search, web_fetch, image_search backed by a private SearX instance, every external string sanitized and wrapped in <<<EXTERNAL_UNTRUSTED_CONTENT>>> delimiters - All native community web providers (ddg_search, tavily, exa, firecrawl, jina_ai, infoquest, image_search) replaced with hard-fail stubs that raise NativeWebToolDisabledError at import time, so a misconfigured tool.use path fails loud rather than silently falling back to unsanitized output - Native client back-doors (jina_client.py, infoquest_client.py) stubbed too - Native-tool tests quarantined under tests/_disabled_native/ (collect_ignore_glob via local conftest.py) - Sanitizer Layer 7 fix: only collapse horizontal whitespace, preserve newlines and tabs so list/table structure survives - Hardened runtime config.yaml references only the searx-backed tools - Factory overlay (backend/) kept in sync with deer-flow tree as a reference / source See HARDENING.md for the full audit trail and verification steps.
128 lines
4.7 KiB
Python
128 lines
4.7 KiB
Python
"""Regression tests for ToolMessage content normalization in serialization.
|
|
|
|
Ensures that structured content (list-of-blocks) is properly extracted to
|
|
plain text, preventing raw Python repr strings from reaching the UI.
|
|
|
|
See: https://github.com/bytedance/deer-flow/issues/1149
|
|
"""
|
|
|
|
from langchain_core.messages import ToolMessage
|
|
|
|
from deerflow.client import DeerFlowClient
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# _serialize_message
|
|
# ---------------------------------------------------------------------------
|
|
|
|
|
|
class TestSerializeToolMessageContent:
|
|
"""DeerFlowClient._serialize_message should normalize ToolMessage content."""
|
|
|
|
def test_string_content(self):
|
|
msg = ToolMessage(content="ok", tool_call_id="tc1", name="search")
|
|
result = DeerFlowClient._serialize_message(msg)
|
|
assert result["content"] == "ok"
|
|
assert result["type"] == "tool"
|
|
|
|
def test_list_of_blocks_content(self):
|
|
"""List-of-blocks should be extracted, not repr'd."""
|
|
msg = ToolMessage(
|
|
content=[{"type": "text", "text": "hello world"}],
|
|
tool_call_id="tc1",
|
|
name="search",
|
|
)
|
|
result = DeerFlowClient._serialize_message(msg)
|
|
assert result["content"] == "hello world"
|
|
# Must NOT contain Python repr artifacts
|
|
assert "[" not in result["content"]
|
|
assert "{" not in result["content"]
|
|
|
|
def test_multiple_text_blocks(self):
|
|
"""Multiple full text blocks should be joined with newlines."""
|
|
msg = ToolMessage(
|
|
content=[
|
|
{"type": "text", "text": "line 1"},
|
|
{"type": "text", "text": "line 2"},
|
|
],
|
|
tool_call_id="tc1",
|
|
name="search",
|
|
)
|
|
result = DeerFlowClient._serialize_message(msg)
|
|
assert result["content"] == "line 1\nline 2"
|
|
|
|
def test_string_chunks_are_joined_without_newlines(self):
|
|
"""Chunked string payloads should not get artificial separators."""
|
|
msg = ToolMessage(
|
|
content=['{"a"', ': "b"}'],
|
|
tool_call_id="tc1",
|
|
name="search",
|
|
)
|
|
result = DeerFlowClient._serialize_message(msg)
|
|
assert result["content"] == '{"a": "b"}'
|
|
|
|
def test_mixed_string_chunks_and_blocks(self):
|
|
"""String chunks stay contiguous, but text blocks remain separated."""
|
|
msg = ToolMessage(
|
|
content=["prefix", "-continued", {"type": "text", "text": "block text"}],
|
|
tool_call_id="tc1",
|
|
name="search",
|
|
)
|
|
result = DeerFlowClient._serialize_message(msg)
|
|
assert result["content"] == "prefix-continued\nblock text"
|
|
|
|
def test_mixed_blocks_with_non_text(self):
|
|
"""Non-text blocks (e.g. image) should be skipped gracefully."""
|
|
msg = ToolMessage(
|
|
content=[
|
|
{"type": "text", "text": "found results"},
|
|
{"type": "image_url", "image_url": {"url": "http://img.png"}},
|
|
],
|
|
tool_call_id="tc1",
|
|
name="view_image",
|
|
)
|
|
result = DeerFlowClient._serialize_message(msg)
|
|
assert result["content"] == "found results"
|
|
|
|
def test_empty_list_content(self):
|
|
msg = ToolMessage(content=[], tool_call_id="tc1", name="search")
|
|
result = DeerFlowClient._serialize_message(msg)
|
|
assert result["content"] == ""
|
|
|
|
def test_plain_string_in_list(self):
|
|
"""Bare strings inside a list should be kept."""
|
|
msg = ToolMessage(
|
|
content=["plain text block"],
|
|
tool_call_id="tc1",
|
|
name="search",
|
|
)
|
|
result = DeerFlowClient._serialize_message(msg)
|
|
assert result["content"] == "plain text block"
|
|
|
|
def test_unknown_content_type_falls_back(self):
|
|
"""Unexpected types should not crash — return str()."""
|
|
msg = ToolMessage(content=42, tool_call_id="tc1", name="calc")
|
|
result = DeerFlowClient._serialize_message(msg)
|
|
# int → not str, not list → falls to str()
|
|
assert result["content"] == "42"
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# _extract_text (already existed, but verify it also covers ToolMessage paths)
|
|
# ---------------------------------------------------------------------------
|
|
|
|
|
|
class TestExtractText:
|
|
"""DeerFlowClient._extract_text should handle all content shapes."""
|
|
|
|
def test_string_passthrough(self):
|
|
assert DeerFlowClient._extract_text("hello") == "hello"
|
|
|
|
def test_list_text_blocks(self):
|
|
assert DeerFlowClient._extract_text([{"type": "text", "text": "hi"}]) == "hi"
|
|
|
|
def test_empty_list(self):
|
|
assert DeerFlowClient._extract_text([]) == ""
|
|
|
|
def test_fallback_non_iterable(self):
|
|
assert DeerFlowClient._extract_text(123) == "123"
|