Vendored deer-flow upstream (bytedance/deer-flow) plus prompt-injection hardening: - New deerflow.security package: content_delimiter, html_cleaner, sanitizer (8 layers — invisible chars, control chars, symbols, NFC, PUA, tag chars, horizontal whitespace collapse with newline/tab preservation, length cap) - New deerflow.community.searx package: web_search, web_fetch, image_search backed by a private SearX instance, every external string sanitized and wrapped in <<<EXTERNAL_UNTRUSTED_CONTENT>>> delimiters - All native community web providers (ddg_search, tavily, exa, firecrawl, jina_ai, infoquest, image_search) replaced with hard-fail stubs that raise NativeWebToolDisabledError at import time, so a misconfigured tool.use path fails loud rather than silently falling back to unsanitized output - Native client back-doors (jina_client.py, infoquest_client.py) stubbed too - Native-tool tests quarantined under tests/_disabled_native/ (collect_ignore_glob via local conftest.py) - Sanitizer Layer 7 fix: only collapse horizontal whitespace, preserve newlines and tabs so list/table structure survives - Hardened runtime config.yaml references only the searx-backed tools - Factory overlay (backend/) kept in sync with deer-flow tree as a reference / source See HARDENING.md for the full audit trail and verification steps.
161 lines
5.3 KiB
Python
"""Hardened SearX web search, web fetch, and image search tools.
|
|
|
|
Every external response is sanitized and wrapped in security delimiters
|
|
before being returned to the LLM. See deerflow.security for the pipeline.
|
|
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
from urllib.parse import quote
|
|
|
|
import httpx
|
|
from langchain.tools import tool
|
|
|
|
from deerflow.config import get_app_config
|
|
from deerflow.security.content_delimiter import wrap_untrusted_content
|
|
from deerflow.security.html_cleaner import extract_secure_text
|
|
from deerflow.security.sanitizer import sanitizer
|
|
|
|
# Fallback SearX endpoint used when config.yaml supplies no `searx_url`.
DEFAULT_SEARX_URL = "http://localhost:8888"
# Per-request timeout (seconds) for every SearX HTTP call.
DEFAULT_TIMEOUT = 30.0
# Identifies this hardened client to the private SearX instance.
DEFAULT_USER_AGENT = "DeerFlow-Hardened/1.0 (+searx)"
|
|
|
|
|
|
def _tool_extra(name: str) -> dict:
    """Return the ``model_extra`` mapping for a tool config entry, or ``{}``.

    Defensive on two fronts: the tool may be missing from the app config
    entirely, and the config object may lack ``model_extra`` or hold a
    falsy value there.
    """
    tool_cfg = get_app_config().get_tool_config(name)
    if tool_cfg is None:
        return {}
    extra = getattr(tool_cfg, "model_extra", None)
    return extra if extra else {}
|
|
|
|
|
|
def _searx_url(tool_name: str = "web_search") -> str:
    """Resolve the SearX base URL configured for *tool_name* (or the default)."""
    extra = _tool_extra(tool_name)
    return extra.get("searx_url", DEFAULT_SEARX_URL)
|
|
|
|
|
|
def _http_get(url: str, params: dict, timeout: float = DEFAULT_TIMEOUT) -> dict:
    """GET a SearX endpoint and return parsed JSON. Raises on transport/HTTP error."""
    headers = {"User-Agent": DEFAULT_USER_AGENT}
    with httpx.Client(headers=headers) as http:
        resp = http.get(url, params=params, timeout=timeout)
        resp.raise_for_status()
        return resp.json()
|
|
|
|
|
|
@tool("web_search", parse_docstring=True)
def web_search_tool(query: str, max_results: int = 10) -> str:
    """Search the web via the private hardened SearX instance.

    All results are sanitized against prompt-injection vectors and
    wrapped in <<<EXTERNAL_UNTRUSTED_CONTENT>>> markers.

    Args:
        query: Search keywords.
        max_results: Maximum results to return (capped by config).
    """
    extra = _tool_extra("web_search")
    cap = int(extra.get("max_results", 10))
    searx_url = extra.get("searx_url", DEFAULT_SEARX_URL)
    # Clamp the caller-supplied count into [1, config cap].
    limit = max(1, min(int(max_results), cap))

    try:
        # Pass the raw query: httpx URL-encodes `params` values itself, so
        # pre-applying urllib.parse.quote() double-encoded the query (a space
        # reached SearX as the literal text "%20", corrupting searches).
        data = _http_get(
            f"{searx_url}/search",
            {"q": query, "format": "json"},
        )
    except Exception as exc:
        # Best-effort: surface the failure to the LLM inside the untrusted
        # wrapper rather than crashing the agent run.
        return wrap_untrusted_content({"error": f"Search failed: {exc}"})

    results = [
        {
            "title": sanitizer.sanitize(item.get("title", ""), max_length=200),
            "url": item.get("url", ""),
            "content": sanitizer.sanitize(item.get("content", ""), max_length=500),
        }
        for item in data.get("results", [])[:limit]
    ]

    return wrap_untrusted_content(
        {
            "query": query,
            "total_results": len(results),
            "results": results,
        }
    )
|
|
|
|
|
|
@tool("web_fetch", parse_docstring=True)
async def web_fetch_tool(url: str, max_chars: int = 10000) -> str:
    """Fetch a web page and return sanitized visible text.

    Dangerous HTML elements (script, style, iframe, form, ...) are stripped,
    invisible Unicode is removed, and the result is wrapped in security markers.
    Only call this for URLs returned by web_search or supplied directly by the
    user — do not invent URLs.

    Args:
        url: Absolute URL to fetch (must include scheme).
        max_chars: Maximum number of characters to return.
    """
    extra = _tool_extra("web_fetch")
    cap = int(extra.get("max_chars", max_chars))
    # Clamp the requested size into [256, config cap].
    limit = max(256, min(int(max_chars), cap))

    try:
        client_kwargs = {
            "headers": {"User-Agent": DEFAULT_USER_AGENT},
            "follow_redirects": True,
        }
        async with httpx.AsyncClient(**client_kwargs) as client:
            response = await client.get(url, timeout=DEFAULT_TIMEOUT)
            response.raise_for_status()
            html = response.text
    except Exception as exc:
        # Report the failure inside the untrusted wrapper instead of raising.
        return wrap_untrusted_content({"error": f"Fetch failed: {exc}", "url": url})

    # Strip dangerous markup first, then run the full sanitizer pipeline.
    visible = extract_secure_text(html)
    clean = sanitizer.sanitize(visible, max_length=limit)
    return wrap_untrusted_content({"url": url, "content": clean})
|
|
|
|
|
|
@tool("image_search", parse_docstring=True)
def image_search_tool(query: str, max_results: int = 5) -> str:
    """Search for images via the private hardened SearX instance.

    Returns sanitized title/url pairs (no inline image data). Wrapped in
    security delimiters.

    Args:
        query: Image search keywords.
        max_results: Maximum number of images to return.
    """
    extra = _tool_extra("image_search")
    cap = int(extra.get("max_results", 5))
    # Fall back to the web_search tool's URL so both tools hit the same instance.
    searx_url = extra.get("searx_url", _searx_url("web_search"))
    # Clamp the caller-supplied count into [1, config cap].
    limit = max(1, min(int(max_results), cap))

    try:
        # Pass the raw query: httpx URL-encodes `params` values itself, so
        # pre-applying urllib.parse.quote() double-encoded the query (a space
        # reached SearX as the literal text "%20", corrupting searches).
        data = _http_get(
            f"{searx_url}/search",
            {"q": query, "format": "json", "categories": "images"},
        )
    except Exception as exc:
        # Best-effort: surface the failure inside the untrusted wrapper.
        return wrap_untrusted_content({"error": f"Image search failed: {exc}"})

    results = [
        {
            "title": sanitizer.sanitize(item.get("title", ""), max_length=200),
            "url": item.get("url", ""),
            # Prefer the thumbnail; fall back to the full image URL.
            "thumbnail": item.get("thumbnail_src") or item.get("img_src", ""),
        }
        for item in data.get("results", [])[:limit]
    ]

    return wrap_untrusted_content(
        {
            "query": query,
            "total_results": len(results),
            "results": results,
        }
    )
|