diff --git a/HARDENING.md b/HARDENING.md index 41d8e61..35fabdb 100644 --- a/HARDENING.md +++ b/HARDENING.md @@ -46,8 +46,9 @@ The hardening below is a port of the OpenClaw approach (`searx-scripts/`, LangChain `@tool` exports: - `web_search_tool(query, max_results=10)` — calls a private SearX instance, sanitizes title + content, wraps results in security delimiters -- `web_fetch_tool(url, max_chars=10000)` — fetches URL, runs `extract_secure_text` then `sanitizer.sanitize`, wraps result -- `image_search_tool(query, max_results=5)` — SearX `categories=images`, sanitized title/url/thumbnail, wrapped +- `web_fetch_tool(url, max_chars=10000)` — streams response, refuses non-text Content-Type **before reading the body**, then runs `extract_secure_text` and `sanitizer.sanitize` over the head of the body, wraps result + +`image_search_tool` was removed on purpose — see section 2.8. Reads its config from `get_app_config().get_tool_config().model_extra`: `searx_url`, `max_results`, `max_chars`. @@ -203,6 +204,40 @@ curl -s -o /dev/null -w "%{http_code}\n" --max-time 5 http://192.168.3.1/ curl -s -o /dev/null -w "%{http_code}\n" --max-time 5 http://10.67.67.16/ # FAIL (blocked by 10/8 reject; .16 is not whitelisted) ``` +### 2.8 No-images policy + +Agents in this build are **text-only researchers**. They never need to +fetch image, audio, video, or binary content, so the entire pipeline is +hardened to refuse it: + +| Layer | What it does | +|---|---| +| `web_fetch_tool` | Streams the response and inspects the `Content-Type` header **before** reading the body. Anything that is not `text/*`, `application/json`, `application/xml`, `application/xhtml+xml`, `application/ld+json`, `application/atom+xml`, or `application/rss+xml` is refused with `wrap_untrusted_content({"error": "Refused: non-text response..."})`. The body bytes are never loaded into memory. | +| `image_search_tool` | **Removed**. The function no longer exists in `deerflow/community/searx/tools.py`. 
Any `tool.use: deerflow.community.searx.tools:image_search_tool` in `config.yaml` would fail with an attribute error during tool loading. | +| `config.yaml` | The `image_search` tool entry was deleted. Only `web_search` and `web_fetch` are registered in the `web` group. | + +**Why no allowlist?** A domain allowlist for image fetching would either +be impossible to maintain (research touches new domains every day) or +silently rot into a permanent allow-everything. Removing image fetching +entirely is the only honest answer for a text-only research use case. + +**Frontend caveat:** the LLM can still emit `![alt](https://...)` +markdown in its **answer**. If the deer-flow frontend renders that +markdown, the **user's browser** (not the container!) will load the +image and potentially leak referrer/timing data. The egress firewall +on data-nuc does not see this traffic. Mitigations: + +1. Best: configure the frontend's markdown renderer to disable images, + or replace `` tags with a placeholder. **Not yet implemented in + this repo** — needs a patch in the deer-flow frontend. +2. Workaround: render answers in a CSP-restricted iframe with + `img-src 'none'`. + +If you bring image fetching back, build a **separate** tool with an +explicit per-call allowlist and a server-side image proxy that runs +under the same egress firewall as the rest of the container. Do not +relax `web_fetch_tool`'s Content-Type check. + ## 3. Verification All checks below assume `PYTHONPATH=deer-flow/backend/packages/harness`. @@ -244,6 +279,24 @@ PYTHONPATH=deer-flow/backend/packages/harness pytest \ Expected: `8 passed`. 
+### 3.4 No-images verification + +```bash +PYTHONPATH=deer-flow/backend/packages/harness python3 -c " +import deerflow.community.searx.tools as t +assert hasattr(t, 'web_search_tool'), 'web_search_tool missing' +assert hasattr(t, 'web_fetch_tool'), 'web_fetch_tool missing' +assert not hasattr(t, 'image_search_tool'), 'image_search_tool must be removed' +from deerflow.community.searx.tools import _is_text_content_type +assert _is_text_content_type('text/html; charset=utf-8') +assert _is_text_content_type('application/json') +assert not _is_text_content_type('image/png') +assert not _is_text_content_type('application/octet-stream') +assert not _is_text_content_type('') +print('OK — no-images policy intact') +" +``` + ## 4. Adding a new web tool 1. Implement it in `deer-flow/backend/packages/harness/deerflow/community//tools.py`. @@ -273,7 +326,7 @@ deer-flow/backend/packages/harness/deerflow/security/html_cleaner.py (new) deer-flow/backend/packages/harness/deerflow/security/sanitizer.py (new, with newline-preserving fix) deer-flow/backend/packages/harness/deerflow/community/searx/__init__.py (new) -deer-flow/backend/packages/harness/deerflow/community/searx/tools.py (new) +deer-flow/backend/packages/harness/deerflow/community/searx/tools.py (new — web_search + web_fetch with Content-Type gate; image_search_tool intentionally absent) deer-flow/backend/packages/harness/deerflow/community/_disabled_native.py (new) deer-flow/backend/packages/harness/deerflow/community/ddg_search/tools.py (replaced with stub) diff --git a/backend/packages/harness/deerflow/community/searx/tools.py b/backend/packages/harness/deerflow/community/searx/tools.py index 58d7066..2dc5ffc 100644 --- a/backend/packages/harness/deerflow/community/searx/tools.py +++ b/backend/packages/harness/deerflow/community/searx/tools.py @@ -1,7 +1,17 @@ -"""Hardened SearX web search and fetch tools.""" +"""Hardened SearX web search and web fetch tools. 
+ +Every external response is sanitized and wrapped in security delimiters +before being returned to the LLM. See deerflow.security for the pipeline. + +Image fetching is intentionally NOT supported. Agents in this build are +text-only researchers; image_search_tool was removed and web_fetch_tool +refuses any response whose Content-Type is not a textual media type. If +you need an image-aware agent, add a dedicated tool with explicit user +review — do not lift these restrictions in place. +""" + +from __future__ import annotations -import json -import os from urllib.parse import quote import httpx @@ -9,90 +19,159 @@ from langchain.tools import tool from deerflow.config import get_app_config from deerflow.security.content_delimiter import wrap_untrusted_content -from deerflow.security.sanitizer import sanitizer from deerflow.security.html_cleaner import extract_secure_text +from deerflow.security.sanitizer import sanitizer + +DEFAULT_SEARX_URL = "http://localhost:8888" +DEFAULT_TIMEOUT = 30.0 +DEFAULT_USER_AGENT = "DeerFlow-Hardened/1.0 (+searx)" + +# Allowed Content-Type prefixes for web_fetch responses. Anything else +# (image/*, audio/*, video/*, application/octet-stream, font/*, ...) is +# rejected before its body is read into memory. +ALLOWED_CONTENT_TYPE_PREFIXES = ( + "text/", + "application/json", + "application/xml", + "application/xhtml+xml", + "application/ld+json", + "application/atom+xml", + "application/rss+xml", +) -def _get_searx_config() -> dict: - """Get SearX configuration from app config.""" - config = get_app_config().get_tool_config("web_search") - return { - "url": config.model_extra.get("searx_url", "http://localhost:8888"), - "max_results": config.model_extra.get("max_results", 10), - } +def _is_text_content_type(header_value: str) -> bool: + """True if the Content-Type header is a textual media type we're willing to read.""" + if not header_value: + # No header at all → refuse: we don't speculate. 
+ return False + media = header_value.split(";", 1)[0].strip().lower() + return any(media == prefix.rstrip("/") or media.startswith(prefix) for prefix in ALLOWED_CONTENT_TYPE_PREFIXES) + + +def _tool_extra(name: str) -> dict: + """Read the model_extra dict for a tool config entry, defensively.""" + cfg = get_app_config().get_tool_config(name) + if cfg is None: + return {} + return getattr(cfg, "model_extra", {}) or {} + + +def _searx_url(tool_name: str = "web_search") -> str: + return _tool_extra(tool_name).get("searx_url", DEFAULT_SEARX_URL) + + +def _http_get(url: str, params: dict, timeout: float = DEFAULT_TIMEOUT) -> dict: + """GET a SearX endpoint and return parsed JSON. Raises on transport/HTTP error.""" + with httpx.Client(headers={"User-Agent": DEFAULT_USER_AGENT}) as client: + response = client.get(url, params=params, timeout=timeout) + response.raise_for_status() + return response.json() @tool("web_search", parse_docstring=True) def web_search_tool(query: str, max_results: int = 10) -> str: - """Search the web using hardened SearX instance. - - All results are sanitized against prompt injection attacks. - + """Search the web via the private hardened SearX instance. + + All results are sanitized against prompt-injection vectors and + wrapped in <<>> markers. + Args: - query: Search keywords - max_results: Maximum results to return (default 10) + query: Search keywords. + max_results: Maximum results to return (capped by config). 
""" - cfg = _get_searx_config() - searx_url = cfg["url"] - - # URL-safe encoding - encoded_query = quote(query) - + extra = _tool_extra("web_search") + cap = int(extra.get("max_results", 10)) + searx_url = extra.get("searx_url", DEFAULT_SEARX_URL) + limit = max(1, min(int(max_results), cap)) + try: - response = httpx.get( + data = _http_get( f"{searx_url}/search", - params={ - "q": encoded_query, - "format": "json", - "max_results": min(max_results, cfg["max_results"]), - }, - timeout=30.0 + {"q": quote(query), "format": "json"}, ) - response.raise_for_status() - data = response.json() - except Exception as e: - return wrap_untrusted_content({"error": f"Search failed: {e}"}) - - # Sanitize and limit results + except Exception as exc: + return wrap_untrusted_content({"error": f"Search failed: {exc}"}) + results = [] - for r in data.get("results", [])[:max_results]: - results.append({ - "title": sanitizer.sanitize(r.get("title", "")), - "url": r.get("url", ""), # Keep URL intact - "content": sanitizer.sanitize(r.get("content", ""), max_length=500), - }) - - output = { - "query": query, - "total_results": len(results), - "results": results, - } - - # Wrap with security delimiters - return wrap_untrusted_content(output) + for item in data.get("results", [])[:limit]: + results.append( + { + "title": sanitizer.sanitize(item.get("title", ""), max_length=200), + "url": item.get("url", ""), + "content": sanitizer.sanitize(item.get("content", ""), max_length=500), + } + ) + + return wrap_untrusted_content( + { + "query": query, + "total_results": len(results), + "results": results, + } + ) @tool("web_fetch", parse_docstring=True) async def web_fetch_tool(url: str, max_chars: int = 10000) -> str: - """Fetch web page content with security hardening. - - Dangerous HTML elements are stripped and content is sanitized. - + """Fetch a web page and return sanitized visible text. + + Only textual responses are accepted (text/html, application/json, ...). 
+ Image, audio, video, and binary responses are refused before the body + is read into memory — this build is text-only by policy. + + Dangerous HTML elements (script, style, iframe, form, ...) are stripped, + invisible Unicode is removed, and the result is wrapped in security markers. + Only call this for URLs returned by web_search or supplied directly by the + user — do not invent URLs. + Args: - url: URL to fetch - max_chars: Maximum characters to return (default 10000) + url: Absolute URL to fetch (must include scheme). + max_chars: Maximum number of characters to return. """ + extra = _tool_extra("web_fetch") + cap = int(extra.get("max_chars", max_chars)) + limit = max(256, min(int(max_chars), cap)) + try: - async with httpx.AsyncClient() as client: - response = await client.get(url, timeout=30.0) - response.raise_for_status() - html = response.text - except Exception as e: - return wrap_untrusted_content({"error": f"Fetch failed: {e}"}) - - # Extract text and sanitize + async with httpx.AsyncClient( + headers={"User-Agent": DEFAULT_USER_AGENT}, + follow_redirects=True, + ) as client: + # Stream so we can inspect headers BEFORE reading the body. + # Refuses image/audio/video/binary responses without ever + # touching their bytes. + async with client.stream("GET", url, timeout=DEFAULT_TIMEOUT) as response: + response.raise_for_status() + content_type = response.headers.get("content-type", "") + if not _is_text_content_type(content_type): + return wrap_untrusted_content( + { + "error": "Refused: non-text response (this build does not fetch images, audio, video or binary content).", + "url": url, + "content_type": content_type or "", + } + ) + # Read at most ~4x the char limit in bytes to bound memory. + # extract_secure_text + sanitizer will trim further. 
+ max_bytes = max(4096, limit * 4) + buf = bytearray() + async for chunk in response.aiter_bytes(): + buf.extend(chunk) + if len(buf) >= max_bytes: + break + html = buf.decode(response.encoding or "utf-8", errors="replace") + except Exception as exc: + return wrap_untrusted_content({"error": f"Fetch failed: {exc}", "url": url}) + raw_text = extract_secure_text(html) - clean_text = sanitizer.sanitize(raw_text, max_length=max_chars) - - # Wrap with security delimiters - return wrap_untrusted_content(clean_text) \ No newline at end of file + clean_text = sanitizer.sanitize(raw_text, max_length=limit) + return wrap_untrusted_content({"url": url, "content": clean_text}) + + +# image_search_tool was intentionally removed in this hardened build. +# Agents are text-only researchers; image fetching has no business in the +# pipeline and only widens the attack surface (data exfiltration via +# rendered <img> tags, server-side image content, ...). If you need to +# bring it back, build a separate tool with explicit user-side allowlist +# and a render-side proxy — do not just paste the old function back. diff --git a/config.yaml b/config.yaml index 7adccd7..7fc627f 100644 --- a/config.yaml +++ b/config.yaml @@ -75,11 +75,8 @@ tools: use: deerflow.community.searx.tools:web_fetch_tool max_chars: 10000 - # Image search via SearX - - name: image_search - group: web - use: deerflow.community.searx.tools:image_search_tool - max_results: 5 + # NOTE: image_search is intentionally NOT registered in this build. + # Agents are text-only researchers. See HARDENING.md sec. 2.8. 
# File operations (standard) - name: ls @@ -128,7 +125,7 @@ guardrails: # Deny potentially dangerous tools denied_tools: [] # Or use allowlist approach (only these allowed): - # allowed_tools: ["web_search", "web_fetch", "image_search", "read_file", "write_file", "ls", "glob", "grep"] + # allowed_tools: ["web_search", "web_fetch", "read_file", "write_file", "ls", "glob", "grep"] # ============================================================================ # Sandbox Configuration diff --git a/deer-flow/backend/packages/harness/deerflow/community/searx/tools.py b/deer-flow/backend/packages/harness/deerflow/community/searx/tools.py index 7b0d2ba..2dc5ffc 100644 --- a/deer-flow/backend/packages/harness/deerflow/community/searx/tools.py +++ b/deer-flow/backend/packages/harness/deerflow/community/searx/tools.py @@ -1,7 +1,13 @@ -"""Hardened SearX web search, web fetch, and image search tools. +"""Hardened SearX web search and web fetch tools. Every external response is sanitized and wrapped in security delimiters before being returned to the LLM. See deerflow.security for the pipeline. + +Image fetching is intentionally NOT supported. Agents in this build are +text-only researchers; image_search_tool was removed and web_fetch_tool +refuses any response whose Content-Type is not a textual media type. If +you need an image-aware agent, add a dedicated tool with explicit user +review — do not lift these restrictions in place. """ from __future__ import annotations @@ -20,6 +26,28 @@ DEFAULT_SEARX_URL = "http://localhost:8888" DEFAULT_TIMEOUT = 30.0 DEFAULT_USER_AGENT = "DeerFlow-Hardened/1.0 (+searx)" +# Allowed Content-Type prefixes for web_fetch responses. Anything else +# (image/*, audio/*, video/*, application/octet-stream, font/*, ...) is +# rejected before its body is read into memory. 
+ALLOWED_CONTENT_TYPE_PREFIXES = ( + "text/", + "application/json", + "application/xml", + "application/xhtml+xml", + "application/ld+json", + "application/atom+xml", + "application/rss+xml", +) + + +def _is_text_content_type(header_value: str) -> bool: + """True if the Content-Type header is a textual media type we're willing to read.""" + if not header_value: + # No header at all → refuse: we don't speculate. + return False + media = header_value.split(";", 1)[0].strip().lower() + return any(media == prefix.rstrip("/") or media.startswith(prefix) for prefix in ALLOWED_CONTENT_TYPE_PREFIXES) + def _tool_extra(name: str) -> dict: """Read the model_extra dict for a tool config entry, defensively.""" @@ -88,6 +116,10 @@ def web_search_tool(query: str, max_results: int = 10) -> str: async def web_fetch_tool(url: str, max_chars: int = 10000) -> str: """Fetch a web page and return sanitized visible text. + Only textual responses are accepted (text/html, application/json, ...). + Image, audio, video, and binary responses are refused before the body + is read into memory — this build is text-only by policy. + Dangerous HTML elements (script, style, iframe, form, ...) are stripped, invisible Unicode is removed, and the result is wrapped in security markers. Only call this for URLs returned by web_search or supplied directly by the @@ -106,9 +138,29 @@ async def web_fetch_tool(url: str, max_chars: int = 10000) -> str: headers={"User-Agent": DEFAULT_USER_AGENT}, follow_redirects=True, ) as client: - response = await client.get(url, timeout=DEFAULT_TIMEOUT) - response.raise_for_status() - html = response.text + # Stream so we can inspect headers BEFORE reading the body. + # Refuses image/audio/video/binary responses without ever + # touching their bytes. 
+ async with client.stream("GET", url, timeout=DEFAULT_TIMEOUT) as response: + response.raise_for_status() + content_type = response.headers.get("content-type", "") + if not _is_text_content_type(content_type): + return wrap_untrusted_content( + { + "error": "Refused: non-text response (this build does not fetch images, audio, video or binary content).", + "url": url, + "content_type": content_type or "", + } + ) + # Read at most ~4x the char limit in bytes to bound memory. + # extract_secure_text + sanitizer will trim further. + max_bytes = max(4096, limit * 4) + buf = bytearray() + async for chunk in response.aiter_bytes(): + buf.extend(chunk) + if len(buf) >= max_bytes: + break + html = buf.decode(response.encoding or "utf-8", errors="replace") except Exception as exc: return wrap_untrusted_content({"error": f"Fetch failed: {exc}", "url": url}) @@ -117,44 +169,9 @@ async def web_fetch_tool(url: str, max_chars: int = 10000) -> str: return wrap_untrusted_content({"url": url, "content": clean_text}) -@tool("image_search", parse_docstring=True) -def image_search_tool(query: str, max_results: int = 5) -> str: - """Search for images via the private hardened SearX instance. - - Returns sanitized title/url pairs (no inline image data). Wrapped in - security delimiters. - - Args: - query: Image search keywords. - max_results: Maximum number of images to return. 
- """ - extra = _tool_extra("image_search") - cap = int(extra.get("max_results", 5)) - searx_url = extra.get("searx_url", _searx_url("web_search")) - limit = max(1, min(int(max_results), cap)) - - try: - data = _http_get( - f"{searx_url}/search", - {"q": quote(query), "format": "json", "categories": "images"}, - ) - except Exception as exc: - return wrap_untrusted_content({"error": f"Image search failed: {exc}"}) - - results = [] - for item in data.get("results", [])[:limit]: - results.append( - { - "title": sanitizer.sanitize(item.get("title", ""), max_length=200), - "url": item.get("url", ""), - "thumbnail": item.get("thumbnail_src") or item.get("img_src", ""), - } - ) - - return wrap_untrusted_content( - { - "query": query, - "total_results": len(results), - "results": results, - } - ) +# image_search_tool was intentionally removed in this hardened build. +# Agents are text-only researchers; image fetching has no business in the +# pipeline and only widens the attack surface (data exfiltration via +# rendered tags, server-side image content, ...). If you need to +# bring it back, build a separate tool with explicit user-side allowlist +# and a render-side proxy — do not just paste the old function back.