Vendored deer-flow upstream (bytedance/deer-flow) plus prompt-injection hardening: - New deerflow.security package: content_delimiter, html_cleaner, sanitizer (8 layers — invisible chars, control chars, symbols, NFC, PUA, tag chars, horizontal whitespace collapse with newline/tab preservation, length cap) - New deerflow.community.searx package: web_search, web_fetch, image_search backed by a private SearX instance, every external string sanitized and wrapped in <<<EXTERNAL_UNTRUSTED_CONTENT>>> delimiters - All native community web providers (ddg_search, tavily, exa, firecrawl, jina_ai, infoquest, image_search) replaced with hard-fail stubs that raise NativeWebToolDisabledError at import time, so a misconfigured tool.use path fails loud rather than silently falling back to unsanitized output - Native client back-doors (jina_client.py, infoquest_client.py) stubbed too - Native-tool tests quarantined under tests/_disabled_native/ (collect_ignore_glob via local conftest.py) - Sanitizer Layer 7 fix: only collapse horizontal whitespace, preserve newlines and tabs so list/table structure survives - Hardened runtime config.yaml references only the searx-backed tools - Factory overlay (backend/) kept in sync with deer-flow tree as a reference / source See HARDENING.md for the full audit trail and verification steps.
98 lines
2.9 KiB
Python
98 lines
2.9 KiB
Python
"""Hardened SearX web search and fetch tools."""
|
|
|
|
import json
|
|
import os
|
|
from urllib.parse import quote
|
|
|
|
import httpx
|
|
from langchain.tools import tool
|
|
|
|
from deerflow.config import get_app_config
|
|
from deerflow.security.content_delimiter import wrap_untrusted_content
|
|
from deerflow.security.sanitizer import sanitizer
|
|
from deerflow.security.html_cleaner import extract_secure_text
|
|
|
|
|
|
def _get_searx_config() -> dict:
    """Read the SearX endpoint URL and result cap from the app tool config.

    Falls back to a local default instance and a cap of 10 results when the
    corresponding keys are absent from the ``web_search`` tool config.
    """
    tool_cfg = get_app_config().get_tool_config("web_search")
    extra = tool_cfg.model_extra
    return {
        "url": extra.get("searx_url", "http://localhost:8888"),
        "max_results": extra.get("max_results", 10),
    }
|
|
|
|
|
|
@tool("web_search", parse_docstring=True)
def web_search_tool(query: str, max_results: int = 10) -> str:
    """Search the web using hardened SearX instance.

    All results are sanitized against prompt injection attacks.

    Args:
        query: Search keywords
        max_results: Maximum results to return (default 10)
    """
    cfg = _get_searx_config()
    searx_url = cfg["url"]

    try:
        # Pass the raw query: httpx percent-encodes `params` values itself.
        # The previous pre-encoding with urllib.parse.quote() double-encoded
        # the query (a space went out as "%2520"), breaking any search with
        # spaces or non-ASCII characters.
        response = httpx.get(
            f"{searx_url}/search",
            params={
                "q": query,
                "format": "json",
                # Honor the configured server-side cap as an upper bound.
                "max_results": min(max_results, cfg["max_results"]),
            },
            timeout=30.0,
        )
        response.raise_for_status()
        data = response.json()
    except Exception as e:
        # Boundary tool: report failure as data instead of raising into the
        # agent loop, so a flaky search cannot crash the run.
        return wrap_untrusted_content({"error": f"Search failed: {e}"})

    # Sanitize titles/snippets against injection; URLs are kept intact so
    # the agent can still follow them.
    results = []
    for r in data.get("results", [])[:max_results]:
        results.append({
            "title": sanitizer.sanitize(r.get("title", "")),
            "url": r.get("url", ""),  # Keep URL intact
            "content": sanitizer.sanitize(r.get("content", ""), max_length=500),
        })

    output = {
        "query": query,
        "total_results": len(results),
        "results": results,
    }

    # Wrap with security delimiters so downstream prompts treat the payload
    # as untrusted. NOTE(review): the module imports `json` but never uses
    # it — presumably `output` was once serialized with json.dumps before
    # wrapping; confirm wrap_untrusted_content's expected input type.
    return wrap_untrusted_content(output)
|
|
|
|
|
|
@tool("web_fetch", parse_docstring=True)
async def web_fetch_tool(url: str, max_chars: int = 10000) -> str:
    """Fetch web page content with security hardening.

    Dangerous HTML elements are stripped and content is sanitized.

    Args:
        url: URL to fetch
        max_chars: Maximum characters to return (default 10000)
    """
    try:
        # NOTE(review): httpx does not follow redirects by default — this may
        # be a deliberate hardening choice (avoids redirect-based pivots);
        # confirm before enabling follow_redirects.
        async with httpx.AsyncClient() as client:
            resp = await client.get(url, timeout=30.0)
            resp.raise_for_status()
            page_html = resp.text
    except Exception as e:
        # Report fetch failures as wrapped data rather than raising.
        return wrap_untrusted_content({"error": f"Fetch failed: {e}"})

    # Strip dangerous markup first, then run the full sanitizer pipeline
    # with the caller's length cap.
    sanitized = sanitizer.sanitize(
        extract_secure_text(page_html),
        max_length=max_chars,
    )

    # Wrap with security delimiters
    return wrap_untrusted_content(sanitized)