Initial commit: hardened DeerFlow factory

Vendored deer-flow upstream (bytedance/deer-flow) plus prompt-injection
hardening:

- New deerflow.security package: content_delimiter, html_cleaner,
  sanitizer (8 layers — invisible chars, control chars, symbols, NFC,
  PUA, tag chars, horizontal whitespace collapse with newline/tab
  preservation, length cap)
- New deerflow.community.searx package: web_search, web_fetch,
  image_search backed by a private SearX instance, every external
  string sanitized and wrapped in <<<EXTERNAL_UNTRUSTED_CONTENT>>>
  delimiters
- All native community web providers (ddg_search, tavily, exa,
  firecrawl, jina_ai, infoquest, image_search) replaced with hard-fail
  stubs that raise NativeWebToolDisabledError at import time, so a
misconfigured tool.use path fails loudly rather than silently falling
  back to unsanitized output
- Native client back-doors (jina_client.py, infoquest_client.py)
  stubbed too
- Native-tool tests quarantined under tests/_disabled_native/
  (collect_ignore_glob via local conftest.py)
- Sanitizer Layer 7 fix: only collapse horizontal whitespace, preserve
  newlines and tabs so list/table structure survives
- Hardened runtime config.yaml references only the searx-backed tools
- Factory overlay (backend/) kept in sync with deer-flow tree as a
  reference / source

See HARDENING.md for the full audit trail and verification steps.
This commit is contained in:
2026-04-12 14:23:57 +02:00
commit 6de0bf9f5b
889 changed files with 173052 additions and 0 deletions

View File

@@ -0,0 +1,8 @@
from .sandbox import Sandbox
from .sandbox_provider import SandboxProvider, get_sandbox_provider
__all__ = [
"Sandbox",
"SandboxProvider",
"get_sandbox_provider",
]

View File

@@ -0,0 +1,71 @@
"""Sandbox-related exceptions with structured error information."""
class SandboxError(Exception):
"""Base exception for all sandbox-related errors."""
def __init__(self, message: str, details: dict | None = None):
super().__init__(message)
self.message = message
self.details = details or {}
def __str__(self) -> str:
if self.details:
detail_str = ", ".join(f"{k}={v}" for k, v in self.details.items())
return f"{self.message} ({detail_str})"
return self.message
class SandboxNotFoundError(SandboxError):
    """Raised when a sandbox cannot be found or is not available."""

    def __init__(self, message: str = "Sandbox not found", sandbox_id: str | None = None):
        # Only attach structured details when a sandbox id was supplied.
        super().__init__(message, {"sandbox_id": sandbox_id} if sandbox_id else None)
        self.sandbox_id = sandbox_id
class SandboxRuntimeError(SandboxError):
    """Raised when sandbox runtime is not available or misconfigured."""

    # No extra structured fields; inherits message/details from SandboxError.
    pass
class SandboxCommandError(SandboxError):
    """Raised when a command execution fails in the sandbox."""

    def __init__(self, message: str, command: str | None = None, exit_code: int | None = None):
        details: dict = {}
        if command:
            # Shorten very long command lines so the structured details stay readable.
            shortened = command if len(command) <= 100 else command[:100] + "..."
            details["command"] = shortened
        if exit_code is not None:
            details["exit_code"] = exit_code
        super().__init__(message, details)
        # Full (untruncated) command is preserved on the instance.
        self.command = command
        self.exit_code = exit_code
class SandboxFileError(SandboxError):
    """Raised when a file operation fails in the sandbox."""

    def __init__(self, message: str, path: str | None = None, operation: str | None = None):
        # Include only the fields that were actually provided (truthy).
        details = {key: value for key, value in (("path", path), ("operation", operation)) if value}
        super().__init__(message, details)
        self.path = path
        self.operation = operation
class SandboxPermissionError(SandboxFileError):
    """Raised when a permission error occurs during file operations."""

    # Inherits path/operation details from SandboxFileError.
    pass
class SandboxFileNotFoundError(SandboxFileError):
    """Raised when a file or directory is not found."""

    # Inherits path/operation details from SandboxFileError.
    pass

View File

@@ -0,0 +1,27 @@
import threading
import weakref
from deerflow.sandbox.sandbox import Sandbox
# Use WeakValueDictionary to prevent a memory leak in long-running processes:
# an entry disappears as soon as no caller holds the lock object any more.
#
# BUG FIX: CPython's ``threading.Lock()`` returns a ``_thread.lock`` object
# that does NOT support weak references, so storing it in a
# WeakValueDictionary raised ``TypeError: cannot create weak reference to
# '_thread.lock' object`` on first use. ``_FileOperationLock`` is a thin,
# weak-referenceable wrapper exposing the usual lock interface.
_LockKey = tuple[str, str]


class _FileOperationLock:
    """Weak-referenceable wrapper around ``threading.Lock``."""

    __slots__ = ("_lock", "__weakref__")

    def __init__(self) -> None:
        self._lock = threading.Lock()

    def acquire(self, blocking: bool = True, timeout: float = -1) -> bool:
        return self._lock.acquire(blocking, timeout)

    def release(self) -> None:
        self._lock.release()

    def locked(self) -> bool:
        return self._lock.locked()

    def __enter__(self) -> bool:
        return self._lock.acquire()

    def __exit__(self, exc_type, exc_value, traceback) -> None:
        self._lock.release()


_FILE_OPERATION_LOCKS: weakref.WeakValueDictionary[_LockKey, _FileOperationLock] = weakref.WeakValueDictionary()
# Guards creation of new entries so two threads cannot race and end up
# holding two different locks for the same (sandbox, path) pair.
_FILE_OPERATION_LOCKS_GUARD = threading.Lock()


def get_file_operation_lock_key(sandbox: "Sandbox", path: str) -> tuple[str, str]:
    """Build the (sandbox identifier, path) key used to index file locks.

    Falls back to the instance identity when the sandbox has no truthy ``id``.
    """
    sandbox_id = getattr(sandbox, "id", None)
    if not sandbox_id:
        sandbox_id = f"instance:{id(sandbox)}"
    return sandbox_id, path


def get_file_operation_lock(sandbox: "Sandbox", path: str) -> _FileOperationLock:
    """Return the per-(sandbox, path) lock, creating it on first use.

    Callers must keep a reference to the returned lock for as long as they
    need it; registry entries are dropped once unreferenced.
    """
    lock_key = get_file_operation_lock_key(sandbox, path)
    with _FILE_OPERATION_LOCKS_GUARD:
        lock = _FILE_OPERATION_LOCKS.get(lock_key)
        if lock is None:
            lock = _FileOperationLock()
            _FILE_OPERATION_LOCKS[lock_key] = lock
        return lock

View File

@@ -0,0 +1,3 @@
from .local_sandbox_provider import LocalSandboxProvider
__all__ = ["LocalSandboxProvider"]

View File

@@ -0,0 +1,46 @@
from pathlib import Path
from deerflow.sandbox.search import should_ignore_name
def list_dir(path: str, max_depth: int = 2) -> list[str]:
    """
    List files and directories up to max_depth levels deep.

    Args:
        path: The root directory path to list.
        max_depth: Maximum depth to traverse (default: 2).
            1 = only direct children, 2 = children + grandchildren, etc.

    Returns:
        A sorted list of absolute paths for files and directories
        (directories carry a trailing "/"), excluding items matching
        IGNORE_PATTERNS.
    """
    collected: list[str] = []
    root = Path(path).resolve()
    if not root.is_dir():
        return collected

    def _walk(directory: Path, depth: int) -> None:
        """Append entries of *directory*, descending while depth < max_depth."""
        if depth > max_depth:
            return
        try:
            for entry in directory.iterdir():
                if should_ignore_name(entry.name):
                    continue
                is_directory = entry.is_dir()
                collected.append(str(entry.resolve()) + ("/" if is_directory else ""))
                if is_directory and depth < max_depth:
                    _walk(entry, depth + 1)
        except PermissionError:
            # Unreadable directories are silently skipped.
            pass

    _walk(root, 1)
    return sorted(collected)

View File

@@ -0,0 +1,398 @@
import errno
import ntpath
import os
import shutil
import subprocess
from dataclasses import dataclass
from pathlib import Path
from deerflow.sandbox.local.list_dir import list_dir
from deerflow.sandbox.sandbox import Sandbox
from deerflow.sandbox.search import GrepMatch, find_glob_matches, find_grep_matches
@dataclass(frozen=True)
class PathMapping:
    """A path mapping from a container path to a local path with optional read-only flag."""

    # Absolute path as seen from inside the (virtual) container, e.g. a skills mount.
    container_path: str
    # Absolute host-filesystem path that backs the container path.
    local_path: str
    # When True, write operations under this mount are refused (EROFS).
    read_only: bool = False
class LocalSandbox(Sandbox):
    """Sandbox implementation that runs directly on the host machine.

    Container-style paths are translated to host paths via ``PathMapping``
    entries before commands and file operations run, and host paths are
    translated back to container paths in command output and directory
    listings, so callers only ever observe container paths.

    NOTE(review): this class provides path virtualization, not isolation —
    commands execute with the privileges of the host process.
    """

    @staticmethod
    def _shell_name(shell: str) -> str:
        """Return the executable name for a shell path or command."""
        return shell.replace("\\", "/").rsplit("/", 1)[-1].lower()

    @staticmethod
    def _is_powershell(shell: str) -> bool:
        """Return whether the selected shell is a PowerShell executable."""
        return LocalSandbox._shell_name(shell) in {"powershell", "powershell.exe", "pwsh", "pwsh.exe"}

    @staticmethod
    def _is_cmd_shell(shell: str) -> bool:
        """Return whether the selected shell is cmd.exe."""
        return LocalSandbox._shell_name(shell) in {"cmd", "cmd.exe"}

    @staticmethod
    def _find_first_available_shell(candidates: tuple[str, ...]) -> str | None:
        """Return the first executable shell path or command found from candidates."""
        for shell in candidates:
            if os.path.isabs(shell):
                # Absolute candidates are checked directly; no PATH lookup.
                if os.path.isfile(shell) and os.access(shell, os.X_OK):
                    return shell
                continue
            # Bare names are resolved through PATH.
            shell_from_path = shutil.which(shell)
            if shell_from_path is not None:
                return shell_from_path
        return None

    def __init__(self, id: str, path_mappings: list[PathMapping] | None = None):
        """
        Initialize local sandbox with optional path mappings.

        Args:
            id: Sandbox identifier
            path_mappings: List of path mappings with optional read-only flag.
                Skills directory is read-only by default.
        """
        super().__init__(id)
        self.path_mappings = path_mappings or []
        # Track files written through write_file so read_file only
        # reverse-resolves paths in agent-authored content.
        self._agent_written_paths: set[str] = set()

    def _is_read_only_path(self, resolved_path: str) -> bool:
        """Check if a resolved path is under a read-only mount.

        When multiple mappings match (nested mounts), prefer the most specific
        mapping (i.e. the one whose local_path is the longest prefix of the
        resolved path), similar to how ``_resolve_path`` handles container paths.
        """
        resolved = str(Path(resolved_path).resolve())
        best_mapping: PathMapping | None = None
        best_prefix_len = -1
        for mapping in self.path_mappings:
            local_resolved = str(Path(mapping.local_path).resolve())
            # Match the mount root itself or anything strictly below it (os.sep-aware).
            if resolved == local_resolved or resolved.startswith(local_resolved + os.sep):
                prefix_len = len(local_resolved)
                if prefix_len > best_prefix_len:
                    best_prefix_len = prefix_len
                    best_mapping = mapping
        if best_mapping is None:
            return False
        return best_mapping.read_only

    def _resolve_path(self, path: str) -> str:
        """
        Resolve container path to actual local path using mappings.

        Args:
            path: Path that might be a container path

        Returns:
            Resolved local path
        """
        path_str = str(path)
        # Try each mapping (longest prefix first for more specific matches)
        for mapping in sorted(self.path_mappings, key=lambda m: len(m.container_path), reverse=True):
            container_path = mapping.container_path
            local_path = mapping.local_path
            if path_str == container_path or path_str.startswith(container_path + "/"):
                # Replace the container path prefix with local path
                relative = path_str[len(container_path) :].lstrip("/")
                resolved = str(Path(local_path) / relative) if relative else local_path
                return resolved
        # No mapping found, return original path
        return path_str

    def _reverse_resolve_path(self, path: str) -> str:
        """
        Reverse resolve local path back to container path using mappings.

        Args:
            path: Local path that might need to be mapped to container path

        Returns:
            Container path if mapping exists, otherwise original path

        NOTE(review): prefix checks below use "/" while ``Path.resolve()``
        produces backslash separators on Windows — confirm behavior on
        Windows hosts.
        """
        normalized_path = path.replace("\\", "/")
        path_str = str(Path(normalized_path).resolve())
        # Try each mapping (longest local path first for more specific matches)
        for mapping in sorted(self.path_mappings, key=lambda m: len(m.local_path), reverse=True):
            local_path_resolved = str(Path(mapping.local_path).resolve())
            if path_str == local_path_resolved or path_str.startswith(local_path_resolved + "/"):
                # Replace the local path prefix with container path
                relative = path_str[len(local_path_resolved) :].lstrip("/")
                resolved = f"{mapping.container_path}/{relative}" if relative else mapping.container_path
                return resolved
        # No mapping found, return original path
        return path_str

    def _reverse_resolve_paths_in_output(self, output: str) -> str:
        """
        Reverse resolve local paths back to container paths in output string.

        Args:
            output: Output string that may contain local paths

        Returns:
            Output with local paths resolved to container paths
        """
        import re

        # Sort mappings by local path length (longest first) for correct prefix matching
        sorted_mappings = sorted(self.path_mappings, key=lambda m: len(m.local_path), reverse=True)
        if not sorted_mappings:
            return output
        # Create pattern that matches absolute paths
        # Match paths like /Users/... or other absolute paths
        result = output
        for mapping in sorted_mappings:
            # Escape the local path for use in regex
            escaped_local = re.escape(str(Path(mapping.local_path).resolve()))
            # Match the local path followed by optional path components with either separator
            pattern = re.compile(escaped_local + r"(?:[/\\][^\s\"';&|<>()]*)?")

            def replace_match(match: re.Match) -> str:
                matched_path = match.group(0)
                return self._reverse_resolve_path(matched_path)

            result = pattern.sub(replace_match, result)
        return result

    def _resolve_paths_in_command(self, command: str) -> str:
        """
        Resolve container paths to local paths in a command string.

        Args:
            command: Command string that may contain container paths

        Returns:
            Command with container paths resolved to local paths
        """
        import re

        # Sort mappings by length (longest first) for correct prefix matching
        sorted_mappings = sorted(self.path_mappings, key=lambda m: len(m.container_path), reverse=True)
        # Build regex pattern to match all container paths
        # Match container path followed by optional path components
        if not sorted_mappings:
            return command
        # Create pattern that matches any of the container paths.
        # The lookahead (?=/|$|...) ensures we only match at a path-segment boundary,
        # preventing /mnt/skills from matching inside /mnt/skills-extra.
        patterns = [re.escape(m.container_path) + r"(?=/|$|[\s\"';&|<>()])(?:/[^\s\"';&|<>()]*)?" for m in sorted_mappings]
        pattern = re.compile("|".join(f"({p})" for p in patterns))

        def replace_match(match: re.Match) -> str:
            matched_path = match.group(0)
            return self._resolve_path(matched_path)

        return pattern.sub(replace_match, command)

    def _resolve_paths_in_content(self, content: str) -> str:
        """Resolve container paths to local paths in arbitrary file content.

        Unlike ``_resolve_paths_in_command`` which uses shell-aware boundary
        characters, this method treats the content as plain text and resolves
        every occurrence of a container path prefix. Resolved paths are
        normalized to forward slashes to avoid backslash-escape issues on
        Windows hosts (e.g. ``C:\\Users\\..`` breaking Python string literals).

        Args:
            content: File content that may contain container paths.

        Returns:
            Content with container paths resolved to local paths (forward slashes).
        """
        import re

        sorted_mappings = sorted(self.path_mappings, key=lambda m: len(m.container_path), reverse=True)
        if not sorted_mappings:
            return content
        # Boundary lookahead is looser than the command variant: any non-word,
        # non-path character ends a prefix match.
        patterns = [re.escape(m.container_path) + r"(?=/|$|[^\w./-])(?:/[^\s\"';&|<>()]*)?" for m in sorted_mappings]
        pattern = re.compile("|".join(f"({p})" for p in patterns))

        def replace_match(match: re.Match) -> str:
            matched_path = match.group(0)
            resolved = self._resolve_path(matched_path)
            # Normalize to forward slashes so that Windows backslash paths
            # don't create invalid escape sequences in source files.
            return resolved.replace("\\", "/")

        return pattern.sub(replace_match, content)

    @staticmethod
    def _get_shell() -> str:
        """Detect available shell executable with fallback.

        POSIX shells are preferred; on Windows, PowerShell variants and
        cmd.exe are tried when no POSIX shell is available.

        Raises:
            RuntimeError: when no candidate shell can be found.
        """
        shell = LocalSandbox._find_first_available_shell(("/bin/zsh", "/bin/bash", "/bin/sh", "sh"))
        if shell is not None:
            return shell
        if os.name == "nt":
            system_root = os.environ.get("SystemRoot", r"C:\Windows")
            shell = LocalSandbox._find_first_available_shell(
                (
                    "pwsh",
                    "pwsh.exe",
                    "powershell",
                    "powershell.exe",
                    ntpath.join(system_root, "System32", "WindowsPowerShell", "v1.0", "powershell.exe"),
                    "cmd.exe",
                )
            )
            if shell is not None:
                return shell
            raise RuntimeError("No suitable shell executable found. Tried /bin/zsh, /bin/bash, /bin/sh, `sh` on PATH, then PowerShell and cmd.exe fallbacks for Windows.")
        raise RuntimeError("No suitable shell executable found. Tried /bin/zsh, /bin/bash, /bin/sh, and `sh` on PATH.")

    def execute_command(self, command: str) -> str:
        """Run a shell command on the host, translating paths both ways.

        Output combines stdout, stderr (labelled) and a non-zero exit code;
        local paths in the output are mapped back to container paths.
        """
        # Resolve container paths in command before execution
        resolved_command = self._resolve_paths_in_command(command)
        shell = self._get_shell()
        if os.name == "nt":
            # Windows shells take the command as an argument; build argv explicitly.
            if self._is_powershell(shell):
                args = [shell, "-NoProfile", "-Command", resolved_command]
            elif self._is_cmd_shell(shell):
                args = [shell, "/c", resolved_command]
            else:
                args = [shell, "-c", resolved_command]
            result = subprocess.run(
                args,
                shell=False,
                capture_output=True,
                text=True,
                timeout=600,
            )
        else:
            result = subprocess.run(
                resolved_command,
                executable=shell,
                shell=True,
                capture_output=True,
                text=True,
                timeout=600,
            )
        output = result.stdout
        if result.stderr:
            output += f"\nStd Error:\n{result.stderr}" if output else result.stderr
        if result.returncode != 0:
            output += f"\nExit Code: {result.returncode}"
        final_output = output if output else "(no output)"
        # Reverse resolve local paths back to container paths in output
        return self._reverse_resolve_paths_in_output(final_output)

    def list_dir(self, path: str, max_depth: int = 2) -> list[str]:
        """List directory contents, with all entries shown as container paths."""
        resolved_path = self._resolve_path(path)
        # Calls the module-level list_dir helper (class attributes are not in
        # scope inside method bodies, so this is not a recursive call).
        entries = list_dir(resolved_path, max_depth)
        # Reverse resolve local paths back to container paths in output
        return [self._reverse_resolve_paths_in_output(entry) for entry in entries]

    def read_file(self, path: str) -> str:
        """Read a file, reverse-resolving paths only in agent-authored files."""
        resolved_path = self._resolve_path(path)
        try:
            with open(resolved_path, encoding="utf-8") as f:
                content = f.read()
            # Only reverse-resolve paths in files that were previously written
            # by write_file (agent-authored content). User-uploaded files,
            # external tool output, and other non-agent content should not be
            # silently rewritten — see discussion on PR #1935.
            if resolved_path in self._agent_written_paths:
                content = self._reverse_resolve_paths_in_output(content)
            return content
        except OSError as e:
            # Re-raise with the original path for clearer error messages, hiding internal resolved paths
            raise type(e)(e.errno, e.strerror, path) from None

    def write_file(self, path: str, content: str, append: bool = False) -> None:
        """Write (or append) text content, refusing writes under read-only mounts.

        Raises:
            OSError: EROFS for read-only mounts, or the underlying I/O error
                re-raised with the caller-facing path.
        """
        resolved_path = self._resolve_path(path)
        if self._is_read_only_path(resolved_path):
            raise OSError(errno.EROFS, "Read-only file system", path)
        try:
            dir_path = os.path.dirname(resolved_path)
            if dir_path:
                os.makedirs(dir_path, exist_ok=True)
            # Resolve container paths in content to local paths
            # using the content-specific resolver (forward-slash safe)
            resolved_content = self._resolve_paths_in_content(content)
            mode = "a" if append else "w"
            with open(resolved_path, mode, encoding="utf-8") as f:
                f.write(resolved_content)
            # Track this path so read_file knows to reverse-resolve on read.
            # Only agent-written files get reverse-resolved; user uploads and
            # external tool output are left untouched.
            self._agent_written_paths.add(resolved_path)
        except OSError as e:
            # Re-raise with the original path for clearer error messages, hiding internal resolved paths
            raise type(e)(e.errno, e.strerror, path) from None

    def glob(self, path: str, pattern: str, *, include_dirs: bool = False, max_results: int = 200) -> tuple[list[str], bool]:
        """Glob under a container path; matches are returned as container paths."""
        resolved_path = Path(self._resolve_path(path))
        matches, truncated = find_glob_matches(resolved_path, pattern, include_dirs=include_dirs, max_results=max_results)
        return [self._reverse_resolve_path(match) for match in matches], truncated

    def grep(
        self,
        path: str,
        pattern: str,
        *,
        glob: str | None = None,
        literal: bool = False,
        case_sensitive: bool = False,
        max_results: int = 100,
    ) -> tuple[list[GrepMatch], bool]:
        """Grep under a container path; match paths are mapped back to container paths."""
        resolved_path = Path(self._resolve_path(path))
        matches, truncated = find_grep_matches(
            resolved_path,
            pattern,
            glob_pattern=glob,
            literal=literal,
            case_sensitive=case_sensitive,
            max_results=max_results,
        )
        return [
            GrepMatch(
                path=self._reverse_resolve_path(match.path),
                line_number=match.line_number,
                line=match.line,
            )
            for match in matches
        ], truncated

    def update_file(self, path: str, content: bytes) -> None:
        """Write binary content verbatim (no path rewriting inside the bytes).

        Raises:
            OSError: EROFS for read-only mounts, or the underlying I/O error
                re-raised with the caller-facing path.
        """
        resolved_path = self._resolve_path(path)
        if self._is_read_only_path(resolved_path):
            raise OSError(errno.EROFS, "Read-only file system", path)
        try:
            dir_path = os.path.dirname(resolved_path)
            if dir_path:
                os.makedirs(dir_path, exist_ok=True)
            with open(resolved_path, "wb") as f:
                f.write(content)
        except OSError as e:
            # Re-raise with the original path for clearer error messages, hiding internal resolved paths
            raise type(e)(e.errno, e.strerror, path) from None

View File

@@ -0,0 +1,119 @@
import logging
from pathlib import Path
from deerflow.sandbox.local.local_sandbox import LocalSandbox, PathMapping
from deerflow.sandbox.sandbox import Sandbox
from deerflow.sandbox.sandbox_provider import SandboxProvider
logger = logging.getLogger(__name__)
# Process-wide singleton: all threads share one LocalSandbox instance.
_singleton: LocalSandbox | None = None


class LocalSandboxProvider(SandboxProvider):
    """Provider that hands out a single host-local sandbox instance."""

    def __init__(self):
        """Initialize the local sandbox provider with path mappings."""
        self._path_mappings = self._setup_path_mappings()

    def _setup_path_mappings(self) -> list[PathMapping]:
        """
        Setup path mappings for local sandbox.

        Maps container paths to actual local paths, including skills directory
        and any custom mounts configured in config.yaml.

        Returns:
            List of path mappings
        """
        mappings: list[PathMapping] = []
        # Map skills container path to local skills directory
        try:
            from deerflow.config import get_app_config

            config = get_app_config()
            skills_path = config.skills.get_skills_path()
            container_path = config.skills.container_path
            # Only add mapping if skills directory exists
            if skills_path.exists():
                mappings.append(
                    PathMapping(
                        container_path=container_path,
                        local_path=str(skills_path),
                        read_only=True,  # Skills directory is always read-only
                    )
                )
            # Map custom mounts from sandbox config.
            # Reserved prefixes may not be shadowed by user-configured mounts.
            _RESERVED_CONTAINER_PREFIXES = [container_path, "/mnt/acp-workspace", "/mnt/user-data"]
            sandbox_config = config.sandbox
            if sandbox_config and sandbox_config.mounts:
                for mount in sandbox_config.mounts:
                    host_path = Path(mount.host_path)
                    # NOTE: this rebinding shadows the skills container_path above;
                    # _RESERVED_CONTAINER_PREFIXES was already built, so that is safe.
                    container_path = mount.container_path.rstrip("/") or "/"
                    if not host_path.is_absolute():
                        logger.warning(
                            "Mount host_path must be absolute, skipping: %s -> %s",
                            mount.host_path,
                            mount.container_path,
                        )
                        continue
                    if not container_path.startswith("/"):
                        logger.warning(
                            "Mount container_path must be absolute, skipping: %s -> %s",
                            mount.host_path,
                            mount.container_path,
                        )
                        continue
                    # Reject mounts that conflict with reserved container paths
                    if any(container_path == p or container_path.startswith(p + "/") for p in _RESERVED_CONTAINER_PREFIXES):
                        logger.warning(
                            "Mount container_path conflicts with reserved prefix, skipping: %s",
                            mount.container_path,
                        )
                        continue
                    # Ensure the host path exists before adding mapping
                    if host_path.exists():
                        mappings.append(
                            PathMapping(
                                container_path=container_path,
                                local_path=str(host_path.resolve()),
                                read_only=mount.read_only,
                            )
                        )
                    else:
                        logger.warning(
                            "Mount host_path does not exist, skipping: %s -> %s",
                            mount.host_path,
                            mount.container_path,
                        )
        except Exception as e:
            # Log but don't fail if config loading fails
            logger.warning("Could not setup path mappings: %s", e, exc_info=True)
        return mappings

    def acquire(self, thread_id: str | None = None) -> str:
        """Return the singleton sandbox's id, creating it on first call.

        The thread_id is ignored: all threads share the single local sandbox.
        """
        global _singleton
        if _singleton is None:
            _singleton = LocalSandbox("local", path_mappings=self._path_mappings)
        return _singleton.id

    def get(self, sandbox_id: str) -> Sandbox | None:
        """Return the singleton for id "local" (creating it if needed), else None."""
        if sandbox_id == "local":
            if _singleton is None:
                self.acquire()
            return _singleton
        return None

    def release(self, sandbox_id: str) -> None:
        # LocalSandbox uses singleton pattern - no cleanup needed.
        # Note: This method is intentionally not called by SandboxMiddleware
        # to allow sandbox reuse across multiple turns in a thread.
        # For Docker-based providers (e.g., AioSandboxProvider), cleanup
        # happens at application shutdown via the shutdown() method.
        pass

View File

@@ -0,0 +1,83 @@
import logging
from typing import NotRequired, override
from langchain.agents import AgentState
from langchain.agents.middleware import AgentMiddleware
from langgraph.runtime import Runtime
from deerflow.agents.thread_state import SandboxState, ThreadDataState
from deerflow.sandbox import get_sandbox_provider
logger = logging.getLogger(__name__)
class SandboxMiddlewareState(AgentState):
    """Compatible with the `ThreadState` schema."""

    # Sandbox assignment for the current thread; written by SandboxMiddleware.
    sandbox: NotRequired[SandboxState | None]
    # Declared for schema compatibility; not read or written by this middleware.
    thread_data: NotRequired[ThreadDataState | None]
class SandboxMiddleware(AgentMiddleware[SandboxMiddlewareState]):
    """Create a sandbox environment and assign it to an agent.

    Lifecycle Management:
    - With lazy_init=True (default): Sandbox is acquired on first tool call
    - With lazy_init=False: Sandbox is acquired on first agent invocation (before_agent)
    - Sandbox is reused across multiple turns within the same thread
    - Sandbox is NOT released after each agent call to avoid wasteful recreation
    - Cleanup happens at application shutdown via SandboxProvider.shutdown()
    """

    state_schema = SandboxMiddlewareState

    def __init__(self, lazy_init: bool = True):
        """Initialize sandbox middleware.

        Args:
            lazy_init: If True, defer sandbox acquisition until first tool call.
                If False, acquire sandbox eagerly in before_agent().
                Default is True for optimal performance.
        """
        super().__init__()
        self._lazy_init = lazy_init

    def _acquire_sandbox(self, thread_id: str) -> str:
        """Acquire a sandbox from the configured provider and return its id."""
        provider = get_sandbox_provider()
        sandbox_id = provider.acquire(thread_id)
        logger.info(f"Acquiring sandbox {sandbox_id}")
        return sandbox_id

    @override
    def before_agent(self, state: SandboxMiddlewareState, runtime: Runtime) -> dict | None:
        """Eagerly assign a sandbox to the thread unless lazy_init is enabled."""
        # Skip acquisition if lazy_init is enabled
        if self._lazy_init:
            return super().before_agent(state, runtime)
        # Eager initialization (original behavior)
        if "sandbox" not in state or state["sandbox"] is None:
            thread_id = (runtime.context or {}).get("thread_id")
            if thread_id is None:
                # Without a thread id there is nothing to key the sandbox on.
                return super().before_agent(state, runtime)
            sandbox_id = self._acquire_sandbox(thread_id)
            logger.info(f"Assigned sandbox {sandbox_id} to thread {thread_id}")
            return {"sandbox": {"sandbox_id": sandbox_id}}
        return super().before_agent(state, runtime)

    @override
    def after_agent(self, state: SandboxMiddlewareState, runtime: Runtime) -> dict | None:
        """Release the thread's sandbox (from state, else from runtime context).

        NOTE(review): this calls provider.release() after every agent run,
        which appears to contradict the class docstring ("Sandbox is NOT
        released after each agent call") and the comment in
        LocalSandboxProvider.release ("intentionally not called by
        SandboxMiddleware"). Harmless for LocalSandboxProvider, whose
        release() is a no-op — confirm the intended lifecycle for providers
        with a real release implementation.
        """
        sandbox = state.get("sandbox")
        if sandbox is not None:
            sandbox_id = sandbox["sandbox_id"]
            logger.info(f"Releasing sandbox {sandbox_id}")
            get_sandbox_provider().release(sandbox_id)
            return None
        if (runtime.context or {}).get("sandbox_id") is not None:
            sandbox_id = runtime.context.get("sandbox_id")
            logger.info(f"Releasing sandbox {sandbox_id} from context")
            get_sandbox_provider().release(sandbox_id)
            return None
        # No sandbox to release
        return super().after_agent(state, runtime)

View File

@@ -0,0 +1,93 @@
from abc import ABC, abstractmethod
from deerflow.sandbox.search import GrepMatch
class Sandbox(ABC):
    """Abstract base class for sandbox environments"""

    # Stable identifier assigned at construction; exposed via the `id` property.
    _id: str

    def __init__(self, id: str):
        self._id = id

    @property
    def id(self) -> str:
        """The sandbox's stable identifier."""
        return self._id

    @abstractmethod
    def execute_command(self, command: str) -> str:
        """Execute bash command in sandbox.

        Args:
            command: The command to execute.

        Returns:
            The standard or error output of the command.
        """
        pass

    @abstractmethod
    def read_file(self, path: str) -> str:
        """Read the content of a file.

        Args:
            path: The absolute path of the file to read.

        Returns:
            The content of the file.
        """
        pass

    @abstractmethod
    def list_dir(self, path: str, max_depth: int = 2) -> list[str]:
        """List the contents of a directory.

        Args:
            path: The absolute path of the directory to list.
            max_depth: The maximum depth to traverse. Default is 2.

        Returns:
            The contents of the directory.
        """
        pass

    @abstractmethod
    def write_file(self, path: str, content: str, append: bool = False) -> None:
        """Write content to a file.

        Args:
            path: The absolute path of the file to write to.
            content: The text content to write to the file.
            append: Whether to append the content to the file. If False, the file will be created or overwritten.
        """
        pass

    @abstractmethod
    def glob(self, path: str, pattern: str, *, include_dirs: bool = False, max_results: int = 200) -> tuple[list[str], bool]:
        """Find paths that match a glob pattern under a root directory.

        Returns:
            (matches, truncated) — truncated is True when max_results was hit.
        """
        pass

    @abstractmethod
    def grep(
        self,
        path: str,
        pattern: str,
        *,
        glob: str | None = None,
        literal: bool = False,
        case_sensitive: bool = False,
        max_results: int = 100,
    ) -> tuple[list[GrepMatch], bool]:
        """Search for matches inside text files under a directory.

        Returns:
            (matches, truncated) — truncated is True when max_results was hit.
        """
        pass

    @abstractmethod
    def update_file(self, path: str, content: bytes) -> None:
        """Update a file with binary content.

        Args:
            path: The absolute path of the file to update.
            content: The binary content to write to the file.
        """
        pass

View File

@@ -0,0 +1,96 @@
from abc import ABC, abstractmethod
from deerflow.config import get_app_config
from deerflow.reflection import resolve_class
from deerflow.sandbox.sandbox import Sandbox
class SandboxProvider(ABC):
    """Abstract base class for sandbox providers"""

    @abstractmethod
    def acquire(self, thread_id: str | None = None) -> str:
        """Acquire a sandbox environment and return its ID.

        Args:
            thread_id: Optional thread identifier the sandbox may be keyed on.

        Returns:
            The ID of the acquired sandbox environment.
        """
        pass

    @abstractmethod
    def get(self, sandbox_id: str) -> Sandbox | None:
        """Get a sandbox environment by ID.

        Args:
            sandbox_id: The ID of the sandbox environment to look up.
        """
        pass

    @abstractmethod
    def release(self, sandbox_id: str) -> None:
        """Release a sandbox environment.

        Args:
            sandbox_id: The ID of the sandbox environment to release.
        """
        pass
# Process-wide provider singleton; managed by the get/reset/shutdown/set helpers.
_default_sandbox_provider: SandboxProvider | None = None


def get_sandbox_provider(**kwargs) -> SandboxProvider:
    """Get the sandbox provider singleton.

    Returns a cached singleton instance. Use `reset_sandbox_provider()` to clear
    the cache, or `shutdown_sandbox_provider()` to properly shutdown and clear.

    Note:
        ``kwargs`` are forwarded to the provider constructor only on first
        creation; later calls return the cached instance and silently ignore
        them.

    Returns:
        A sandbox provider instance.
    """
    global _default_sandbox_provider
    if _default_sandbox_provider is None:
        config = get_app_config()
        # Resolve the configured provider class (e.g. "pkg.module:ClassName").
        cls = resolve_class(config.sandbox.use, SandboxProvider)
        _default_sandbox_provider = cls(**kwargs)
    return _default_sandbox_provider
def reset_sandbox_provider() -> None:
    """Drop the cached sandbox provider without shutting it down.

    The next `get_sandbox_provider()` call builds a fresh instance — handy in
    tests or after switching configurations.

    Note: any sandboxes still held by the old provider are orphaned; prefer
    `shutdown_sandbox_provider()` when proper cleanup matters.
    """
    global _default_sandbox_provider
    _default_sandbox_provider = None
def shutdown_sandbox_provider() -> None:
    """Shut down the cached provider (if any) and clear the singleton.

    Providers exposing a `shutdown()` method get it invoked first so all of
    their sandboxes are released; afterwards the cache is emptied. Call this
    at application shutdown or to fully reset the sandbox subsystem.
    """
    global _default_sandbox_provider
    provider = _default_sandbox_provider
    if provider is not None:
        if hasattr(provider, "shutdown"):
            provider.shutdown()
        _default_sandbox_provider = None
def set_sandbox_provider(provider: SandboxProvider) -> None:
    """Install *provider* as the process-wide sandbox provider.

    Primarily intended for tests that need to inject a mock or custom
    implementation in place of the config-resolved one.

    Args:
        provider: The SandboxProvider instance to use.
    """
    global _default_sandbox_provider
    _default_sandbox_provider = provider

View File

@@ -0,0 +1,210 @@
import fnmatch
import os
import re
from dataclasses import dataclass
from pathlib import Path, PurePosixPath
# Names (fnmatch-style patterns) that directory walks skip entirely.
# Order does not matter: should_ignore_name() returns on the first match.
IGNORE_PATTERNS = [
    # Version-control metadata
    ".git",
    ".svn",
    ".hg",
    ".bzr",
    # Dependency / environment directories
    "node_modules",
    "__pycache__",
    ".venv",
    "venv",
    ".env",
    "env",
    ".tox",
    ".nox",
    ".eggs",
    "*.egg-info",
    "site-packages",
    # Build outputs
    "dist",
    "build",
    ".next",
    ".nuxt",
    ".output",
    ".turbo",
    "target",
    "out",
    # Editor / IDE artifacts
    ".idea",
    ".vscode",
    "*.swp",
    "*.swo",
    "*~",
    ".project",
    ".classpath",
    ".settings",
    # OS litter
    ".DS_Store",
    "Thumbs.db",
    "desktop.ini",
    "*.lnk",
    # Logs, temp files and caches
    "*.log",
    "*.tmp",
    "*.temp",
    "*.bak",
    "*.cache",
    ".cache",
    "logs",
    # Coverage / test-tool caches
    ".coverage",
    "coverage",
    ".nyc_output",
    "htmlcov",
    ".pytest_cache",
    ".mypy_cache",
    ".ruff_cache",
]
# Files larger than this many bytes are skipped by grep.
DEFAULT_MAX_FILE_SIZE_BYTES = 1_000_000
# Matched lines are truncated to this many characters in grep summaries.
DEFAULT_LINE_SUMMARY_LENGTH = 200
@dataclass(frozen=True)
class GrepMatch:
    """A single grep hit: which file, which line, and the (truncated) text."""

    # Absolute path of the file containing the match.
    path: str
    # 1-based line number of the match.
    line_number: int
    # Matched line, truncated to the configured summary length.
    line: str
def should_ignore_name(name: str) -> bool:
    """Return True when *name* matches any fnmatch pattern in IGNORE_PATTERNS."""
    return any(fnmatch.fnmatch(name, pattern) for pattern in IGNORE_PATTERNS)
def should_ignore_path(path: str) -> bool:
    """Return True when any segment of *path* (either separator) is ignorable."""
    for segment in path.replace("\\", "/").split("/"):
        if segment and should_ignore_name(segment):
            return True
    return False
def path_matches(pattern: str, rel_path: str) -> bool:
    """Return True when *rel_path* matches the glob *pattern*.

    Matching uses PurePosixPath.match (right-anchored for relative patterns).
    A leading "**/" is additionally tried without the prefix so that such
    patterns also match top-level entries.
    """
    candidate = PurePosixPath(rel_path)
    if candidate.match(pattern):
        return True
    return pattern.startswith("**/") and candidate.match(pattern[3:])
def truncate_line(line: str, max_chars: int = DEFAULT_LINE_SUMMARY_LENGTH) -> str:
    """Strip trailing newline characters and cap the line at *max_chars* chars.

    Truncated lines end with "..." and are exactly *max_chars* long.
    """
    stripped = line.rstrip("\n\r")
    if len(stripped) > max_chars:
        return stripped[: max_chars - 3] + "..."
    return stripped
def is_binary_file(path: Path, sample_size: int = 8192) -> bool:
    """Heuristic binary check: a NUL byte in the first *sample_size* bytes.

    Unreadable files are reported as binary so callers simply skip them.
    """
    try:
        with path.open("rb") as handle:
            sample = handle.read(sample_size)
    except OSError:
        return True
    return b"\0" in sample
def find_glob_matches(root: Path, pattern: str, *, include_dirs: bool = False, max_results: int = 200) -> tuple[list[str], bool]:
    """Walk *root* and collect paths whose root-relative path matches *pattern*.

    Ignored names are pruned from the walk. Collection stops as soon as
    *max_results* entries have been gathered.

    Returns:
        (matches, truncated) — truncated is True when the cap was hit.

    Raises:
        FileNotFoundError: root does not exist.
        NotADirectoryError: root is not a directory.
    """
    root = root.resolve()
    if not root.exists():
        raise FileNotFoundError(root)
    if not root.is_dir():
        raise NotADirectoryError(root)

    found: list[str] = []

    def _record(base: str, name: str, rel_dir: Path) -> bool:
        """Record a match; return True once the result cap is reached."""
        if path_matches(pattern, (rel_dir / name).as_posix()):
            found.append(str(Path(base) / name))
            if len(found) >= max_results:
                return True
        return False

    for current_root, dirs, files in os.walk(root):
        dirs[:] = [d for d in dirs if not should_ignore_name(d)]
        # root is already resolved; os.walk builds current_root by joining under
        # root, so relative_to() works without extra stat()/resolve() calls.
        rel_dir = Path(current_root).relative_to(root)
        if include_dirs:
            for name in dirs:
                if _record(current_root, name, rel_dir):
                    return found, True
        for name in files:
            if not should_ignore_name(name) and _record(current_root, name, rel_dir):
                return found, True
    return found, False
def find_grep_matches(
    root: Path,
    pattern: str,
    *,
    glob_pattern: str | None = None,
    literal: bool = False,
    case_sensitive: bool = False,
    max_results: int = 100,
    max_file_size: int = DEFAULT_MAX_FILE_SIZE_BYTES,
    line_summary_length: int = DEFAULT_LINE_SUMMARY_LENGTH,
) -> tuple[list[GrepMatch], bool]:
    """Search text files under *root* for lines matching *pattern*.

    Args:
        root: Directory to search (must exist and be a directory).
        pattern: Regex source (or a literal string when ``literal`` is True).
        glob_pattern: Optional glob filter applied to root-relative file paths.
        literal: Escape *pattern* and match it verbatim.
        case_sensitive: Match case-sensitively; the default is case-insensitive.
        max_results: Stop after this many matches.
        max_file_size: Skip files larger than this many bytes.
        line_summary_length: Maximum characters kept per matched line.

    Returns:
        (matches, truncated) — truncated is True when max_results was hit.

    Raises:
        FileNotFoundError: root does not exist.
        NotADirectoryError: root is not a directory.
    """
    matches: list[GrepMatch] = []
    truncated = False
    root = root.resolve()
    if not root.exists():
        raise FileNotFoundError(root)
    if not root.is_dir():
        raise NotADirectoryError(root)
    regex_source = re.escape(pattern) if literal else pattern
    flags = 0 if case_sensitive else re.IGNORECASE
    regex = re.compile(regex_source, flags)
    # Skip lines longer than this to prevent ReDoS on minified / no-newline files.
    _max_line_chars = line_summary_length * 10
    for current_root, dirs, files in os.walk(root):
        # Prune ignored directories in-place so os.walk never descends into them.
        dirs[:] = [name for name in dirs if not should_ignore_name(name)]
        rel_dir = Path(current_root).relative_to(root)
        for name in files:
            if should_ignore_name(name):
                continue
            candidate_path = Path(current_root) / name
            rel_path = (rel_dir / name).as_posix()
            if glob_pattern is not None and not path_matches(glob_pattern, rel_path):
                continue
            try:
                # Symlinks are skipped entirely; together with the
                # is_relative_to check below this keeps the search inside root.
                if candidate_path.is_symlink():
                    continue
                file_path = candidate_path.resolve()
                if not file_path.is_relative_to(root):
                    continue
                # Skip oversized and binary files — only plain text is grepped.
                if file_path.stat().st_size > max_file_size or is_binary_file(file_path):
                    continue
                with file_path.open(encoding="utf-8", errors="replace") as handle:
                    for line_number, line in enumerate(handle, start=1):
                        if len(line) > _max_line_chars:
                            continue
                        if regex.search(line):
                            matches.append(
                                GrepMatch(
                                    path=str(file_path),
                                    line_number=line_number,
                                    line=truncate_line(line, line_summary_length),
                                )
                            )
                            if len(matches) >= max_results:
                                truncated = True
                                return matches, truncated
            except OSError:
                # Unreadable entries are skipped, not fatal.
                continue
    return matches, truncated

View File

@@ -0,0 +1,45 @@
"""Security helpers for sandbox capability gating."""
from deerflow.config import get_app_config
# Config `sandbox.use` values that select the host-local (non-isolating)
# provider; consulted by uses_local_sandbox_provider().
_LOCAL_SANDBOX_PROVIDER_MARKERS = (
    "deerflow.sandbox.local:LocalSandboxProvider",
    "deerflow.sandbox.local.local_sandbox_provider:LocalSandboxProvider",
)
# User-facing refusal message when the bash tool is used on the local provider.
LOCAL_HOST_BASH_DISABLED_MESSAGE = (
    "Host bash execution is disabled for LocalSandboxProvider because it is not a secure "
    "sandbox boundary. Switch to AioSandboxProvider for isolated bash access, or set "
    "sandbox.allow_host_bash: true only in a fully trusted local environment."
)
# User-facing refusal message when the bash subagent is used on the local provider.
LOCAL_BASH_SUBAGENT_DISABLED_MESSAGE = (
    "Bash subagent is disabled for LocalSandboxProvider because host bash execution is not "
    "a secure sandbox boundary. Switch to AioSandboxProvider for isolated bash access, or "
    "set sandbox.allow_host_bash: true only in a fully trusted local environment."
)
def uses_local_sandbox_provider(config=None) -> bool:
    """Return True when the active sandbox provider is the host-local provider."""
    if config is None:
        config = get_app_config()
    # Tolerate missing config sections: fall back to an empty provider spec.
    sandbox_use = getattr(getattr(config, "sandbox", None), "use", "")
    if sandbox_use in _LOCAL_SANDBOX_PROVIDER_MARKERS:
        return True
    # Catch non-canonical spellings that still point at the local provider.
    return "deerflow.sandbox.local" in sandbox_use and sandbox_use.endswith(":LocalSandboxProvider")
def is_host_bash_allowed(config=None) -> bool:
    """Return whether host bash execution is explicitly allowed.

    Non-local providers are always allowed; the local provider requires the
    explicit ``sandbox.allow_host_bash`` opt-in. Missing sandbox config means
    not allowed.
    """
    if config is None:
        config = get_app_config()
    sandbox_cfg = getattr(config, "sandbox", None)
    if sandbox_cfg is None:
        return False
    if uses_local_sandbox_provider(config):
        return bool(getattr(sandbox_cfg, "allow_host_bash", False))
    return True

File diff suppressed because it is too large Load Diff