Initial commit: hardened DeerFlow factory

Vendored deer-flow upstream (bytedance/deer-flow) plus prompt-injection hardening:

- New deerflow.security package: content_delimiter, html_cleaner, sanitizer (8 layers — invisible chars, control chars, symbols, NFC, PUA, tag chars, horizontal whitespace collapse with newline/tab preservation, length cap)
- New deerflow.community.searx package: web_search, web_fetch, image_search backed by a private SearX instance, every external string sanitized and wrapped in <<<EXTERNAL_UNTRUSTED_CONTENT>>> delimiters
- All native community web providers (ddg_search, tavily, exa, firecrawl, jina_ai, infoquest, image_search) replaced with hard-fail stubs that raise NativeWebToolDisabledError at import time, so a misconfigured tool.use path fails loud rather than silently falling back to unsanitized output
- Native client back-doors (jina_client.py, infoquest_client.py) stubbed too
- Native-tool tests quarantined under tests/_disabled_native/ (collect_ignore_glob via local conftest.py)
- Sanitizer Layer 7 fix: only collapse horizontal whitespace, preserve newlines and tabs so list/table structure survives
- Hardened runtime config.yaml references only the searx-backed tools
- Factory overlay (backend/) kept in sync with deer-flow tree as a reference / source

See HARDENING.md for the full audit trail and verification steps.
2026-04-12 14:23:57 +02:00
commit 6de0bf9f5b
889 changed files with 173052 additions and 0 deletions
--- a/deer-flow/backend/packages/harness/deerflow/mcp/cache.py
+++ b/deer-flow/backend/packages/harness/deerflow/mcp/cache.py
@@ -0,0 +1,138 @@
+"""Cache for MCP tools to avoid repeated loading."""
+
+import asyncio
+import logging
+import os
+
+from langchain_core.tools import BaseTool
+
+# Module-level logger named after this module.
+logger = logging.getLogger(__name__)
+
+# Process-wide cache state shared by the functions in this module.
+# Loaded tools; stays None until initialize_mcp_tools() stores a result.
+_mcp_tools_cache: list[BaseTool] | None = None
+# Set to True by initialize_mcp_tools() after the tools have been loaded;
+# set back to False by reset_mcp_tools_cache().
+_cache_initialized = False
+# Held by initialize_mcp_tools() for the whole load so concurrent callers
+# do not load the tools twice.
+# NOTE(review): the lock is created at import time but may be acquired from
+# event loops started via asyncio.run() — confirm this is safe under contention.
+_initialization_lock = asyncio.Lock()
+_config_mtime: float | None = None  # Config file mtime recorded at initialization; None if unknown
+
+
+def _get_config_mtime() -> float | None:
+    """Get the modification time of the extensions config file.
+
+    Returns:
+        The modification time as a float, or None if the file doesn't exist.
+    """
+    from deerflow.config.extensions_config import ExtensionsConfig
+
+    config_path = ExtensionsConfig.resolve_config_path()
+    if config_path and config_path.exists():
+        return os.path.getmtime(config_path)
+    return None
+
+
+def _is_cache_stale() -> bool:
+    """Check if the cache is stale due to config file changes.
+
+    Returns:
+        True if the cache should be invalidated, False otherwise.
+    """
+    global _config_mtime
+
+    if not _cache_initialized:
+        return False  # Not initialized yet, not stale
+
+    current_mtime = _get_config_mtime()
+
+    # If we couldn't get mtime before or now, assume not stale
+    if _config_mtime is None or current_mtime is None:
+        return False
+
+    # If the config file has been modified since we cached, it's stale
+    if current_mtime > _config_mtime:
+        logger.info(f"MCP config file has been modified (mtime: {_config_mtime} -> {current_mtime}), cache is stale")
+        return True
+
+    return False
+
+
+async def initialize_mcp_tools() -> list[BaseTool]:
+    """Initialize and cache MCP tools.
+
+    This should be called once at application startup.
+
+    Returns:
+        List of LangChain tools from all enabled MCP servers.
+    """
+    global _mcp_tools_cache, _cache_initialized, _config_mtime
+
+    async with _initialization_lock:
+        if _cache_initialized:
+            logger.info("MCP tools already initialized")
+            return _mcp_tools_cache or []
+
+        from deerflow.mcp.tools import get_mcp_tools
+
+        logger.info("Initializing MCP tools...")
+        _mcp_tools_cache = await get_mcp_tools()
+        _cache_initialized = True
+        _config_mtime = _get_config_mtime()  # Record config file mtime
+        logger.info(f"MCP tools initialized: {len(_mcp_tools_cache)} tool(s) loaded (config mtime: {_config_mtime})")
+
+        return _mcp_tools_cache
+
+
+def get_cached_mcp_tools() -> list[BaseTool]:
+    """Get cached MCP tools with lazy initialization.
+
+    If tools are not initialized, automatically initializes them.
+    This ensures MCP tools work in both FastAPI and LangGraph Studio contexts.
+
+    Also checks if the config file has been modified since last initialization,
+    and re-initializes if needed. This ensures that changes made through the
+    Gateway API (which runs in a separate process) are reflected in the
+    LangGraph Server.
+
+    Returns:
+        List of cached MCP tools.
+    """
+    global _cache_initialized
+
+    # Check if cache is stale due to config file changes
+    if _is_cache_stale():
+        logger.info("MCP cache is stale, resetting for re-initialization...")
+        reset_mcp_tools_cache()
+
+    if not _cache_initialized:
+        logger.info("MCP tools not initialized, performing lazy initialization...")
+        try:
+            # Try to initialize in the current event loop
+            loop = asyncio.get_event_loop()
+            if loop.is_running():
+                # If loop is already running (e.g., in LangGraph Studio),
+                # we need to create a new loop in a thread
+                import concurrent.futures
+
+                with concurrent.futures.ThreadPoolExecutor() as executor:
+                    future = executor.submit(asyncio.run, initialize_mcp_tools())
+                    future.result()
+            else:
+                # If no loop is running, we can use the current loop
+                loop.run_until_complete(initialize_mcp_tools())
+        except RuntimeError:
+            # No event loop exists, create one
+            asyncio.run(initialize_mcp_tools())
+        except Exception as e:
+            logger.error(f"Failed to lazy-initialize MCP tools: {e}")
+            return []
+
+    return _mcp_tools_cache or []
+
+
+def reset_mcp_tools_cache() -> None:
+    """Reset the MCP tools cache.
+
+    This is useful for testing or when you want to reload MCP tools.
+    """
+    global _mcp_tools_cache, _cache_initialized, _config_mtime
+    _mcp_tools_cache = None
+    _cache_initialized = False
+    _config_mtime = None
+    logger.info("MCP tools cache reset")