No-images policy: refuse non-text fetches, drop image_search_tool

Agents in this build are text-only researchers. Image, audio, video,
and binary content has no role in the pipeline and only widens the
attack surface (server-side image fetches, exfiltration via rendered
img tags, etc.). The cleanest answer is to never load such content in
the first place, rather than maintain a domain allowlist that nobody
can keep up to date.

- web_fetch_tool now uses httpx.AsyncClient.stream and inspects the
  Content-Type header BEFORE the body is read into memory. Only
  text/*, application/json, application/xml, application/xhtml+xml,
  application/ld+json, application/atom+xml, and application/rss+xml
  are accepted; everything else (image/*, audio/*, video/*,
  octet-stream, pdf, font, missing header, ...) is refused with a
  wrap_untrusted_content error reply, and the body bytes never enter
  the process for refused responses. Even for accepted responses, the
  read budget is bounded to roughly 4x max_chars. (See the first
  sketch below.)

- image_search_tool removed from deerflow.community.searx.tools
  (both the deer-flow runtime tree and the factory overlay). The
  function is gone, not stubbed: any tool registry or agent config
  that references it will raise AttributeError at tool-loading time.
  (See the second sketch below.)

- config.yaml: image_search tool entry removed; the example
  allowed_tools list updated to drop image_search.

- HARDENING.md: new section 2.8 explains the policy and the frontend
  caveat: the LLM can still emit ![](url) markdown, which the user's
  browser would render, and closing that hole requires a separate
  frontend patch that is not yet implemented. Section 3.4 adds a
  verification snippet for the policy (sketched below). The web_fetch
  entry in section 2.2 is updated to mention the streaming
  Content-Type gate.

Both source trees stay in sync.
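
For reference, the gate reduces to a plain media-type check on the
Content-Type header. A minimal sketch of the accept/refuse behavior,
assuming the module is importable from a checkout (helper name as in
the diff below):

    from deerflow.community.searx.tools import _is_text_content_type

    assert _is_text_content_type("text/html; charset=utf-8")      # accepted
    assert _is_text_content_type("application/json")              # accepted
    assert not _is_text_content_type("image/png")                 # refused
    assert not _is_text_content_type("application/octet-stream")  # refused
    assert not _is_text_content_type("")                          # missing header: refused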
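
The removal is deliberately loud. A sketch of the failure mode a stale
tool registry hits (module path as in this tree):

    import deerflow.community.searx.tools as searx_tools

    searx_tools.web_search_tool  # still present
    try:
        searx_tools.image_search_tool
    except AttributeError:
        print("image_search_tool is gone, as intended")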
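
And a sketch of the kind of verification section 3.4 describes; the
URL is illustrative, the .ainvoke() call assumes a recent
langchain-core, and the assertion assumes wrap_untrusted_content
serializes the error payload into the returned string:

    import asyncio
    from deerflow.community.searx.tools import web_fetch_tool

    reply = asyncio.run(
        web_fetch_tool.ainvoke({"url": "https://example.com/logo.png", "max_chars": 1000})
    )
    assert "Refused: non-text response" in reply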
commit e510f975f6 (parent 4237f03a83)
Date: 2026-04-12 15:59:55 +02:00
4 changed files with 269 additions and 123 deletions


@@ -1,7 +1,17 @@
-"""Hardened SearX web search and fetch tools."""
+"""Hardened SearX web search and web fetch tools.
+
+Every external response is sanitized and wrapped in security delimiters
+before being returned to the LLM. See deerflow.security for the pipeline.
+
+Image fetching is intentionally NOT supported. Agents in this build are
+text-only researchers; image_search_tool was removed and web_fetch_tool
+refuses any response whose Content-Type is not a textual media type. If
+you need an image-aware agent, add a dedicated tool with explicit user
+review — do not lift these restrictions in place.
+"""
 from __future__ import annotations
 
 import json
 import os
 from urllib.parse import quote
 
 import httpx
@@ -9,90 +19,159 @@ from langchain.tools import tool
 from deerflow.config import get_app_config
 from deerflow.security.content_delimiter import wrap_untrusted_content
-from deerflow.security.sanitizer import sanitizer
 from deerflow.security.html_cleaner import extract_secure_text
+from deerflow.security.sanitizer import sanitizer
+
+DEFAULT_SEARX_URL = "http://localhost:8888"
+DEFAULT_TIMEOUT = 30.0
+DEFAULT_USER_AGENT = "DeerFlow-Hardened/1.0 (+searx)"
+
+# Allowed Content-Type prefixes for web_fetch responses. Anything else
+# (image/*, audio/*, video/*, application/octet-stream, font/*, ...) is
+# rejected before its body is read into memory.
+ALLOWED_CONTENT_TYPE_PREFIXES = (
+    "text/",
+    "application/json",
+    "application/xml",
+    "application/xhtml+xml",
+    "application/ld+json",
+    "application/atom+xml",
+    "application/rss+xml",
+)
+
-def _get_searx_config() -> dict:
-    """Get SearX configuration from app config."""
-    config = get_app_config().get_tool_config("web_search")
-    return {
-        "url": config.model_extra.get("searx_url", "http://localhost:8888"),
-        "max_results": config.model_extra.get("max_results", 10),
-    }
+def _is_text_content_type(header_value: str) -> bool:
+    """True if the Content-Type header is a textual media type we're willing to read."""
+    if not header_value:
+        # No header at all → refuse: we don't speculate.
+        return False
+    media = header_value.split(";", 1)[0].strip().lower()
+    return any(media == prefix.rstrip("/") or media.startswith(prefix) for prefix in ALLOWED_CONTENT_TYPE_PREFIXES)
+
+
+def _tool_extra(name: str) -> dict:
+    """Read the model_extra dict for a tool config entry, defensively."""
+    cfg = get_app_config().get_tool_config(name)
+    if cfg is None:
+        return {}
+    return getattr(cfg, "model_extra", {}) or {}
+
+
+def _searx_url(tool_name: str = "web_search") -> str:
+    return _tool_extra(tool_name).get("searx_url", DEFAULT_SEARX_URL)
+
+
+def _http_get(url: str, params: dict, timeout: float = DEFAULT_TIMEOUT) -> dict:
+    """GET a SearX endpoint and return parsed JSON. Raises on transport/HTTP error."""
+    with httpx.Client(headers={"User-Agent": DEFAULT_USER_AGENT}) as client:
+        response = client.get(url, params=params, timeout=timeout)
+        response.raise_for_status()
+        return response.json()
+
+
@tool("web_search", parse_docstring=True)
def web_search_tool(query: str, max_results: int = 10) -> str:
"""Search the web using hardened SearX instance.
All results are sanitized against prompt injection attacks.
"""Search the web via the private hardened SearX instance.
All results are sanitized against prompt-injection vectors and
wrapped in <<<EXTERNAL_UNTRUSTED_CONTENT>>> markers.
Args:
query: Search keywords
max_results: Maximum results to return (default 10)
query: Search keywords.
max_results: Maximum results to return (capped by config).
"""
cfg = _get_searx_config()
searx_url = cfg["url"]
# URL-safe encoding
encoded_query = quote(query)
extra = _tool_extra("web_search")
cap = int(extra.get("max_results", 10))
searx_url = extra.get("searx_url", DEFAULT_SEARX_URL)
limit = max(1, min(int(max_results), cap))
try:
response = httpx.get(
data = _http_get(
f"{searx_url}/search",
params={
"q": encoded_query,
"format": "json",
"max_results": min(max_results, cfg["max_results"]),
},
timeout=30.0
{"q": quote(query), "format": "json"},
)
response.raise_for_status()
data = response.json()
except Exception as e:
return wrap_untrusted_content({"error": f"Search failed: {e}"})
# Sanitize and limit results
except Exception as exc:
return wrap_untrusted_content({"error": f"Search failed: {exc}"})
results = []
for r in data.get("results", [])[:max_results]:
results.append({
"title": sanitizer.sanitize(r.get("title", "")),
"url": r.get("url", ""), # Keep URL intact
"content": sanitizer.sanitize(r.get("content", ""), max_length=500),
})
output = {
"query": query,
"total_results": len(results),
"results": results,
}
# Wrap with security delimiters
return wrap_untrusted_content(output)
for item in data.get("results", [])[:limit]:
results.append(
{
"title": sanitizer.sanitize(item.get("title", ""), max_length=200),
"url": item.get("url", ""),
"content": sanitizer.sanitize(item.get("content", ""), max_length=500),
}
)
return wrap_untrusted_content(
{
"query": query,
"total_results": len(results),
"results": results,
}
)
@tool("web_fetch", parse_docstring=True)
async def web_fetch_tool(url: str, max_chars: int = 10000) -> str:
"""Fetch web page content with security hardening.
Dangerous HTML elements are stripped and content is sanitized.
"""Fetch a web page and return sanitized visible text.
Only textual responses are accepted (text/html, application/json, ...).
Image, audio, video, and binary responses are refused before the body
is read into memory — this build is text-only by policy.
Dangerous HTML elements (script, style, iframe, form, ...) are stripped,
invisible Unicode is removed, and the result is wrapped in security markers.
Only call this for URLs returned by web_search or supplied directly by the
user — do not invent URLs.
Args:
url: URL to fetch
max_chars: Maximum characters to return (default 10000)
url: Absolute URL to fetch (must include scheme).
max_chars: Maximum number of characters to return.
"""
extra = _tool_extra("web_fetch")
cap = int(extra.get("max_chars", max_chars))
limit = max(256, min(int(max_chars), cap))
try:
async with httpx.AsyncClient() as client:
response = await client.get(url, timeout=30.0)
response.raise_for_status()
html = response.text
except Exception as e:
return wrap_untrusted_content({"error": f"Fetch failed: {e}"})
# Extract text and sanitize
async with httpx.AsyncClient(
headers={"User-Agent": DEFAULT_USER_AGENT},
follow_redirects=True,
) as client:
# Stream so we can inspect headers BEFORE reading the body.
# Refuses image/audio/video/binary responses without ever
# touching their bytes.
async with client.stream("GET", url, timeout=DEFAULT_TIMEOUT) as response:
response.raise_for_status()
content_type = response.headers.get("content-type", "")
if not _is_text_content_type(content_type):
return wrap_untrusted_content(
{
"error": "Refused: non-text response (this build does not fetch images, audio, video or binary content).",
"url": url,
"content_type": content_type or "<missing>",
}
)
# Read at most ~4x the char limit in bytes to bound memory.
# extract_secure_text + sanitizer will trim further.
max_bytes = max(4096, limit * 4)
buf = bytearray()
async for chunk in response.aiter_bytes():
buf.extend(chunk)
if len(buf) >= max_bytes:
break
html = buf.decode(response.encoding or "utf-8", errors="replace")
except Exception as exc:
return wrap_untrusted_content({"error": f"Fetch failed: {exc}", "url": url})
raw_text = extract_secure_text(html)
clean_text = sanitizer.sanitize(raw_text, max_length=max_chars)
# Wrap with security delimiters
return wrap_untrusted_content(clean_text)
clean_text = sanitizer.sanitize(raw_text, max_length=limit)
return wrap_untrusted_content({"url": url, "content": clean_text})
+
+
+# image_search_tool was intentionally removed in this hardened build.
+# Agents are text-only researchers; image fetching has no business in the
+# pipeline and only widens the attack surface (data exfiltration via
+# rendered <img> tags, server-side image content, ...). If you need to
+# bring it back, build a separate tool with explicit user-side allowlist
+# and a render-side proxy — do not just paste the old function back.