Initial commit: hardened DeerFlow factory

Vendored deer-flow upstream (bytedance/deer-flow) plus prompt-injection
hardening:

- New deerflow.security package: content_delimiter, html_cleaner,
  sanitizer (8 layers — invisible chars, control chars, symbols, NFC,
  PUA, tag chars, horizontal whitespace collapse with newline/tab
  preservation, length cap)
- New deerflow.community.searx package: web_search, web_fetch,
  image_search backed by a private SearX instance, every external
  string sanitized and wrapped in <<<EXTERNAL_UNTRUSTED_CONTENT>>>
  delimiters
- All native community web providers (ddg_search, tavily, exa,
  firecrawl, jina_ai, infoquest, image_search) replaced with hard-fail
  stubs that raise NativeWebToolDisabledError at import time, so a
  misconfigured tool.use path fails loud rather than silently falling
  back to unsanitized output
- Native client back-doors (jina_client.py, infoquest_client.py)
  stubbed too
- Native-tool tests quarantined under tests/_disabled_native/
  (collect_ignore_glob via local conftest.py)
- Sanitizer Layer 7 fix: only collapse horizontal whitespace, preserve
  newlines and tabs so list/table structure survives
- Hardened runtime config.yaml references only the searx-backed tools
- Factory overlay (backend/) kept in sync with deer-flow tree as a
  reference / source

See HARDENING.md for the full audit trail and verification steps.
This commit is contained in:
2026-04-12 14:23:57 +02:00
commit 6de0bf9f5b
889 changed files with 173052 additions and 0 deletions

View File

@@ -0,0 +1,15 @@
from .aio_sandbox import AioSandbox
from .aio_sandbox_provider import AioSandboxProvider
from .backend import SandboxBackend
from .local_backend import LocalContainerBackend
from .remote_backend import RemoteSandboxBackend
from .sandbox_info import SandboxInfo
__all__ = [
"AioSandbox",
"AioSandboxProvider",
"LocalContainerBackend",
"RemoteSandboxBackend",
"SandboxBackend",
"SandboxInfo",
]

View File

@@ -0,0 +1,232 @@
import base64
import logging
import shlex
import threading
import uuid
from agent_sandbox import Sandbox as AioSandboxClient
from deerflow.sandbox.sandbox import Sandbox
from deerflow.sandbox.search import GrepMatch, path_matches, should_ignore_path, truncate_line
logger = logging.getLogger(__name__)
_ERROR_OBSERVATION_SIGNATURE = "'ErrorObservation' object has no attribute 'exit_code'"
class AioSandbox(Sandbox):
"""Sandbox implementation using the agent-infra/sandbox Docker container.
This sandbox connects to a running AIO sandbox container via HTTP API.
A threading lock serializes shell commands to prevent concurrent requests
from corrupting the container's single persistent session (see #1433).
"""
def __init__(self, id: str, base_url: str, home_dir: str | None = None):
"""Initialize the AIO sandbox.
Args:
id: Unique identifier for this sandbox instance.
base_url: URL of the sandbox API (e.g., http://localhost:8080).
home_dir: Home directory inside the sandbox. If None, will be fetched from the sandbox.
"""
super().__init__(id)
self._base_url = base_url
self._client = AioSandboxClient(base_url=base_url, timeout=600)
self._home_dir = home_dir
self._lock = threading.Lock()
@property
def base_url(self) -> str:
return self._base_url
@property
def home_dir(self) -> str:
"""Get the home directory inside the sandbox."""
if self._home_dir is None:
context = self._client.sandbox.get_context()
self._home_dir = context.home_dir
return self._home_dir
def execute_command(self, command: str) -> str:
"""Execute a shell command in the sandbox.
Uses a lock to serialize concurrent requests. The AIO sandbox
container maintains a single persistent shell session that
corrupts when hit with concurrent exec_command calls (returns
``ErrorObservation`` instead of real output). If corruption is
detected despite the lock (e.g. multiple processes sharing a
sandbox), the command is retried on a fresh session.
Args:
command: The command to execute.
Returns:
The output of the command.
"""
with self._lock:
try:
result = self._client.shell.exec_command(command=command)
output = result.data.output if result.data else ""
if output and _ERROR_OBSERVATION_SIGNATURE in output:
logger.warning("ErrorObservation detected in sandbox output, retrying with a fresh session")
fresh_id = str(uuid.uuid4())
result = self._client.shell.exec_command(command=command, id=fresh_id)
output = result.data.output if result.data else ""
return output if output else "(no output)"
except Exception as e:
logger.error(f"Failed to execute command in sandbox: {e}")
return f"Error: {e}"
def read_file(self, path: str) -> str:
"""Read the content of a file in the sandbox.
Args:
path: The absolute path of the file to read.
Returns:
The content of the file.
"""
try:
result = self._client.file.read_file(file=path)
return result.data.content if result.data else ""
except Exception as e:
logger.error(f"Failed to read file in sandbox: {e}")
return f"Error: {e}"
def list_dir(self, path: str, max_depth: int = 2) -> list[str]:
"""List the contents of a directory in the sandbox.
Args:
path: The absolute path of the directory to list.
max_depth: The maximum depth to traverse. Default is 2.
Returns:
The contents of the directory.
"""
with self._lock:
try:
result = self._client.shell.exec_command(command=f"find {shlex.quote(path)} -maxdepth {max_depth} -type f -o -type d 2>/dev/null | head -500")
output = result.data.output if result.data else ""
if output:
return [line.strip() for line in output.strip().split("\n") if line.strip()]
return []
except Exception as e:
logger.error(f"Failed to list directory in sandbox: {e}")
return []
def write_file(self, path: str, content: str, append: bool = False) -> None:
"""Write content to a file in the sandbox.
Args:
path: The absolute path of the file to write to.
content: The text content to write to the file.
append: Whether to append the content to the file.
"""
with self._lock:
try:
if append:
existing = self.read_file(path)
if not existing.startswith("Error:"):
content = existing + content
self._client.file.write_file(file=path, content=content)
except Exception as e:
logger.error(f"Failed to write file in sandbox: {e}")
raise
def glob(self, path: str, pattern: str, *, include_dirs: bool = False, max_results: int = 200) -> tuple[list[str], bool]:
if not include_dirs:
result = self._client.file.find_files(path=path, glob=pattern)
files = result.data.files if result.data and result.data.files else []
filtered = [file_path for file_path in files if not should_ignore_path(file_path)]
truncated = len(filtered) > max_results
return filtered[:max_results], truncated
result = self._client.file.list_path(path=path, recursive=True, show_hidden=False)
entries = result.data.files if result.data and result.data.files else []
matches: list[str] = []
root_path = path.rstrip("/") or "/"
root_prefix = root_path if root_path == "/" else f"{root_path}/"
for entry in entries:
if entry.path != root_path and not entry.path.startswith(root_prefix):
continue
if should_ignore_path(entry.path):
continue
rel_path = entry.path[len(root_path) :].lstrip("/")
if path_matches(pattern, rel_path):
matches.append(entry.path)
if len(matches) >= max_results:
return matches, True
return matches, False
def grep(
self,
path: str,
pattern: str,
*,
glob: str | None = None,
literal: bool = False,
case_sensitive: bool = False,
max_results: int = 100,
) -> tuple[list[GrepMatch], bool]:
import re as _re
regex_source = _re.escape(pattern) if literal else pattern
# Validate the pattern locally so an invalid regex raises re.error
# (caught by grep_tool's except re.error handler) rather than a
# generic remote API error.
_re.compile(regex_source, 0 if case_sensitive else _re.IGNORECASE)
regex = regex_source if case_sensitive else f"(?i){regex_source}"
if glob is not None:
find_result = self._client.file.find_files(path=path, glob=glob)
candidate_paths = find_result.data.files if find_result.data and find_result.data.files else []
else:
list_result = self._client.file.list_path(path=path, recursive=True, show_hidden=False)
entries = list_result.data.files if list_result.data and list_result.data.files else []
candidate_paths = [entry.path for entry in entries if not entry.is_directory]
matches: list[GrepMatch] = []
truncated = False
for file_path in candidate_paths:
if should_ignore_path(file_path):
continue
search_result = self._client.file.search_in_file(file=file_path, regex=regex)
data = search_result.data
if data is None:
continue
line_numbers = data.line_numbers or []
matched_lines = data.matches or []
for line_number, line in zip(line_numbers, matched_lines):
matches.append(
GrepMatch(
path=file_path,
line_number=line_number if isinstance(line_number, int) else 0,
line=truncate_line(line),
)
)
if len(matches) >= max_results:
truncated = True
return matches, truncated
return matches, truncated
def update_file(self, path: str, content: bytes) -> None:
"""Update a file with binary content in the sandbox.
Args:
path: The absolute path of the file to update.
content: The binary content to write to the file.
"""
with self._lock:
try:
base64_content = base64.b64encode(content).decode("utf-8")
self._client.file.write_file(file=path, content=base64_content, encoding="base64")
except Exception as e:
logger.error(f"Failed to update file in sandbox: {e}")
raise

View File

@@ -0,0 +1,694 @@
"""AIO Sandbox Provider — orchestrates sandbox lifecycle with pluggable backends.
This provider composes:
- SandboxBackend: how sandboxes are provisioned (local container vs remote/K8s)
The provider itself handles:
- In-process caching for fast repeated access
- Idle timeout management
- Graceful shutdown with signal handling
- Mount computation (thread-specific, skills)
"""
import atexit
import hashlib
import logging
import os
import signal
import threading
import time
import uuid
try:
import fcntl
except ImportError: # pragma: no cover - Windows fallback
fcntl = None # type: ignore[assignment]
import msvcrt
from deerflow.config import get_app_config
from deerflow.config.paths import VIRTUAL_PATH_PREFIX, get_paths
from deerflow.sandbox.sandbox import Sandbox
from deerflow.sandbox.sandbox_provider import SandboxProvider
from .aio_sandbox import AioSandbox
from .backend import SandboxBackend, wait_for_sandbox_ready
from .local_backend import LocalContainerBackend
from .remote_backend import RemoteSandboxBackend
from .sandbox_info import SandboxInfo
logger = logging.getLogger(__name__)
# Default configuration
DEFAULT_IMAGE = "enterprise-public-cn-beijing.cr.volces.com/vefaas-public/all-in-one-sandbox:latest"
DEFAULT_PORT = 8080
DEFAULT_CONTAINER_PREFIX = "deer-flow-sandbox"
DEFAULT_IDLE_TIMEOUT = 600 # 10 minutes in seconds
DEFAULT_REPLICAS = 3 # Maximum concurrent sandbox containers
IDLE_CHECK_INTERVAL = 60 # Check every 60 seconds
def _lock_file_exclusive(lock_file) -> None:
if fcntl is not None:
fcntl.flock(lock_file, fcntl.LOCK_EX)
return
lock_file.seek(0)
msvcrt.locking(lock_file.fileno(), msvcrt.LK_LOCK, 1)
def _unlock_file(lock_file) -> None:
if fcntl is not None:
fcntl.flock(lock_file, fcntl.LOCK_UN)
return
lock_file.seek(0)
msvcrt.locking(lock_file.fileno(), msvcrt.LK_UNLCK, 1)
class AioSandboxProvider(SandboxProvider):
"""Sandbox provider that manages containers running the AIO sandbox.
Architecture:
This provider composes a SandboxBackend (how to provision), enabling:
- Local Docker/Apple Container mode (auto-start containers)
- Remote/K8s mode (connect to pre-existing sandbox URL)
Configuration options in config.yaml under sandbox:
use: deerflow.community.aio_sandbox:AioSandboxProvider
image: <container image>
port: 8080 # Base port for local containers
container_prefix: deer-flow-sandbox
idle_timeout: 600 # Idle timeout in seconds (0 to disable)
replicas: 3 # Max concurrent sandbox containers (LRU eviction when exceeded)
mounts: # Volume mounts for local containers
- host_path: /path/on/host
container_path: /path/in/container
read_only: false
environment: # Environment variables for containers
NODE_ENV: production
API_KEY: $MY_API_KEY
"""
def __init__(self):
self._lock = threading.Lock()
self._sandboxes: dict[str, AioSandbox] = {} # sandbox_id -> AioSandbox instance
self._sandbox_infos: dict[str, SandboxInfo] = {} # sandbox_id -> SandboxInfo (for destroy)
self._thread_sandboxes: dict[str, str] = {} # thread_id -> sandbox_id
self._thread_locks: dict[str, threading.Lock] = {} # thread_id -> in-process lock
self._last_activity: dict[str, float] = {} # sandbox_id -> last activity timestamp
# Warm pool: released sandboxes whose containers are still running.
# Maps sandbox_id -> (SandboxInfo, release_timestamp).
# Containers here can be reclaimed quickly (no cold-start) or destroyed
# when replicas capacity is exhausted.
self._warm_pool: dict[str, tuple[SandboxInfo, float]] = {}
self._shutdown_called = False
self._idle_checker_stop = threading.Event()
self._idle_checker_thread: threading.Thread | None = None
self._config = self._load_config()
self._backend: SandboxBackend = self._create_backend()
# Register shutdown handler
atexit.register(self.shutdown)
self._register_signal_handlers()
# Reconcile orphaned containers from previous process lifecycles
self._reconcile_orphans()
# Start idle checker if enabled
if self._config.get("idle_timeout", DEFAULT_IDLE_TIMEOUT) > 0:
self._start_idle_checker()
# ── Factory methods ──────────────────────────────────────────────────
def _create_backend(self) -> SandboxBackend:
"""Create the appropriate backend based on configuration.
Selection logic (checked in order):
1. ``provisioner_url`` set → RemoteSandboxBackend (provisioner mode)
Provisioner dynamically creates Pods + Services in k3s.
2. Default → LocalContainerBackend (local mode)
Local provider manages container lifecycle directly (start/stop).
"""
provisioner_url = self._config.get("provisioner_url")
if provisioner_url:
logger.info(f"Using remote sandbox backend with provisioner at {provisioner_url}")
return RemoteSandboxBackend(provisioner_url=provisioner_url)
logger.info("Using local container sandbox backend")
return LocalContainerBackend(
image=self._config["image"],
base_port=self._config["port"],
container_prefix=self._config["container_prefix"],
config_mounts=self._config["mounts"],
environment=self._config["environment"],
)
# ── Configuration ────────────────────────────────────────────────────
def _load_config(self) -> dict:
"""Load sandbox configuration from app config."""
config = get_app_config()
sandbox_config = config.sandbox
idle_timeout = getattr(sandbox_config, "idle_timeout", None)
replicas = getattr(sandbox_config, "replicas", None)
return {
"image": sandbox_config.image or DEFAULT_IMAGE,
"port": sandbox_config.port or DEFAULT_PORT,
"container_prefix": sandbox_config.container_prefix or DEFAULT_CONTAINER_PREFIX,
"idle_timeout": idle_timeout if idle_timeout is not None else DEFAULT_IDLE_TIMEOUT,
"replicas": replicas if replicas is not None else DEFAULT_REPLICAS,
"mounts": sandbox_config.mounts or [],
"environment": self._resolve_env_vars(sandbox_config.environment or {}),
# provisioner URL for dynamic pod management (e.g. http://provisioner:8002)
"provisioner_url": getattr(sandbox_config, "provisioner_url", None) or "",
}
@staticmethod
def _resolve_env_vars(env_config: dict[str, str]) -> dict[str, str]:
"""Resolve environment variable references (values starting with $)."""
resolved = {}
for key, value in env_config.items():
if isinstance(value, str) and value.startswith("$"):
env_name = value[1:]
resolved[key] = os.environ.get(env_name, "")
else:
resolved[key] = str(value)
return resolved
# ── Startup reconciliation ────────────────────────────────────────────
def _reconcile_orphans(self) -> None:
"""Reconcile orphaned containers left by previous process lifecycles.
On startup, enumerate all running containers matching our prefix
and adopt them all into the warm pool. The idle checker will reclaim
containers that nobody re-acquires within ``idle_timeout``.
All containers are adopted unconditionally because we cannot
distinguish "orphaned" from "actively used by another process"
based on age alone — ``idle_timeout`` represents inactivity, not
uptime. Adopting into the warm pool and letting the idle checker
decide avoids destroying containers that a concurrent process may
still be using.
This closes the fundamental gap where in-memory state loss (process
restart, crash, SIGKILL) leaves Docker containers running forever.
"""
try:
running = self._backend.list_running()
except Exception as e:
logger.warning(f"Failed to enumerate running containers during startup reconciliation: {e}")
return
if not running:
return
current_time = time.time()
adopted = 0
for info in running:
age = current_time - info.created_at if info.created_at > 0 else float("inf")
# Single lock acquisition per container: atomic check-and-insert.
# Avoids a TOCTOU window between the "already tracked?" check and
# the warm-pool insert.
with self._lock:
if info.sandbox_id in self._sandboxes or info.sandbox_id in self._warm_pool:
continue
self._warm_pool[info.sandbox_id] = (info, current_time)
adopted += 1
logger.info(f"Adopted container {info.sandbox_id} into warm pool (age: {age:.0f}s)")
logger.info(f"Startup reconciliation complete: {adopted} adopted into warm pool, {len(running)} total found")
# ── Deterministic ID ─────────────────────────────────────────────────
@staticmethod
def _deterministic_sandbox_id(thread_id: str) -> str:
"""Generate a deterministic sandbox ID from a thread ID.
Ensures all processes derive the same sandbox_id for a given thread,
enabling cross-process sandbox discovery without shared memory.
"""
return hashlib.sha256(thread_id.encode()).hexdigest()[:8]
# ── Mount helpers ────────────────────────────────────────────────────
def _get_extra_mounts(self, thread_id: str | None) -> list[tuple[str, str, bool]]:
"""Collect all extra mounts for a sandbox (thread-specific + skills)."""
mounts: list[tuple[str, str, bool]] = []
if thread_id:
mounts.extend(self._get_thread_mounts(thread_id))
logger.info(f"Adding thread mounts for thread {thread_id}: {mounts}")
skills_mount = self._get_skills_mount()
if skills_mount:
mounts.append(skills_mount)
logger.info(f"Adding skills mount: {skills_mount}")
return mounts
@staticmethod
def _get_thread_mounts(thread_id: str) -> list[tuple[str, str, bool]]:
"""Get volume mounts for a thread's data directories.
Creates directories if they don't exist (lazy initialization).
Mount sources use host_base_dir so that when running inside Docker with a
mounted Docker socket (DooD), the host Docker daemon can resolve the paths.
"""
paths = get_paths()
paths.ensure_thread_dirs(thread_id)
return [
(paths.host_sandbox_work_dir(thread_id), f"{VIRTUAL_PATH_PREFIX}/workspace", False),
(paths.host_sandbox_uploads_dir(thread_id), f"{VIRTUAL_PATH_PREFIX}/uploads", False),
(paths.host_sandbox_outputs_dir(thread_id), f"{VIRTUAL_PATH_PREFIX}/outputs", False),
# ACP workspace: read-only inside the sandbox (lead agent reads results;
# the ACP subprocess writes from the host side, not from within the container).
(paths.host_acp_workspace_dir(thread_id), "/mnt/acp-workspace", True),
]
@staticmethod
def _get_skills_mount() -> tuple[str, str, bool] | None:
"""Get the skills directory mount configuration.
Mount source uses DEER_FLOW_HOST_SKILLS_PATH when running inside Docker (DooD)
so the host Docker daemon can resolve the path.
"""
try:
config = get_app_config()
skills_path = config.skills.get_skills_path()
container_path = config.skills.container_path
if skills_path.exists():
# When running inside Docker with DooD, use host-side skills path.
host_skills = os.environ.get("DEER_FLOW_HOST_SKILLS_PATH") or str(skills_path)
return (host_skills, container_path, True) # Read-only for security
except Exception as e:
logger.warning(f"Could not setup skills mount: {e}")
return None
# ── Idle timeout management ──────────────────────────────────────────
def _start_idle_checker(self) -> None:
"""Start the background thread that checks for idle sandboxes."""
self._idle_checker_thread = threading.Thread(
target=self._idle_checker_loop,
name="sandbox-idle-checker",
daemon=True,
)
self._idle_checker_thread.start()
logger.info(f"Started idle checker thread (timeout: {self._config.get('idle_timeout', DEFAULT_IDLE_TIMEOUT)}s)")
def _idle_checker_loop(self) -> None:
idle_timeout = self._config.get("idle_timeout", DEFAULT_IDLE_TIMEOUT)
while not self._idle_checker_stop.wait(timeout=IDLE_CHECK_INTERVAL):
try:
self._cleanup_idle_sandboxes(idle_timeout)
except Exception as e:
logger.error(f"Error in idle checker loop: {e}")
def _cleanup_idle_sandboxes(self, idle_timeout: float) -> None:
current_time = time.time()
active_to_destroy = []
warm_to_destroy: list[tuple[str, SandboxInfo]] = []
with self._lock:
# Active sandboxes: tracked via _last_activity
for sandbox_id, last_activity in self._last_activity.items():
idle_duration = current_time - last_activity
if idle_duration > idle_timeout:
active_to_destroy.append(sandbox_id)
logger.info(f"Sandbox {sandbox_id} idle for {idle_duration:.1f}s, marking for destroy")
# Warm pool: tracked via release_timestamp stored in _warm_pool
for sandbox_id, (info, release_ts) in list(self._warm_pool.items()):
warm_duration = current_time - release_ts
if warm_duration > idle_timeout:
warm_to_destroy.append((sandbox_id, info))
del self._warm_pool[sandbox_id]
logger.info(f"Warm-pool sandbox {sandbox_id} idle for {warm_duration:.1f}s, marking for destroy")
# Destroy active sandboxes (re-verify still idle before acting)
for sandbox_id in active_to_destroy:
try:
# Re-verify the sandbox is still idle under the lock before destroying.
# Between the snapshot above and here, the sandbox may have been
# re-acquired (last_activity updated) or already released/destroyed.
with self._lock:
last_activity = self._last_activity.get(sandbox_id)
if last_activity is None:
# Already released or destroyed by another path — skip.
logger.info(f"Sandbox {sandbox_id} already gone before idle destroy, skipping")
continue
if (time.time() - last_activity) < idle_timeout:
# Re-acquired (activity updated) since the snapshot — skip.
logger.info(f"Sandbox {sandbox_id} was re-acquired before idle destroy, skipping")
continue
logger.info(f"Destroying idle sandbox {sandbox_id}")
self.destroy(sandbox_id)
except Exception as e:
logger.error(f"Failed to destroy idle sandbox {sandbox_id}: {e}")
# Destroy warm-pool sandboxes (already removed from _warm_pool under lock above)
for sandbox_id, info in warm_to_destroy:
try:
self._backend.destroy(info)
logger.info(f"Destroyed idle warm-pool sandbox {sandbox_id}")
except Exception as e:
logger.error(f"Failed to destroy idle warm-pool sandbox {sandbox_id}: {e}")
# ── Signal handling ──────────────────────────────────────────────────
def _register_signal_handlers(self) -> None:
"""Register signal handlers for graceful shutdown.
Handles SIGTERM, SIGINT, and SIGHUP (terminal close) to ensure
sandbox containers are cleaned up even when the user closes the terminal.
"""
self._original_sigterm = signal.getsignal(signal.SIGTERM)
self._original_sigint = signal.getsignal(signal.SIGINT)
self._original_sighup = signal.getsignal(signal.SIGHUP) if hasattr(signal, "SIGHUP") else None
def signal_handler(signum, frame):
self.shutdown()
if signum == signal.SIGTERM:
original = self._original_sigterm
elif hasattr(signal, "SIGHUP") and signum == signal.SIGHUP:
original = self._original_sighup
else:
original = self._original_sigint
if callable(original):
original(signum, frame)
elif original == signal.SIG_DFL:
signal.signal(signum, signal.SIG_DFL)
signal.raise_signal(signum)
try:
signal.signal(signal.SIGTERM, signal_handler)
signal.signal(signal.SIGINT, signal_handler)
if hasattr(signal, "SIGHUP"):
signal.signal(signal.SIGHUP, signal_handler)
except ValueError:
logger.debug("Could not register signal handlers (not main thread)")
# ── Thread locking (in-process) ──────────────────────────────────────
def _get_thread_lock(self, thread_id: str) -> threading.Lock:
"""Get or create an in-process lock for a specific thread_id."""
with self._lock:
if thread_id not in self._thread_locks:
self._thread_locks[thread_id] = threading.Lock()
return self._thread_locks[thread_id]
# ── Core: acquire / get / release / shutdown ─────────────────────────
def acquire(self, thread_id: str | None = None) -> str:
"""Acquire a sandbox environment and return its ID.
For the same thread_id, this method will return the same sandbox_id
across multiple turns, multiple processes, and (with shared storage)
multiple pods.
Thread-safe with both in-process and cross-process locking.
Args:
thread_id: Optional thread ID for thread-specific configurations.
Returns:
The ID of the acquired sandbox environment.
"""
if thread_id:
thread_lock = self._get_thread_lock(thread_id)
with thread_lock:
return self._acquire_internal(thread_id)
else:
return self._acquire_internal(thread_id)
def _acquire_internal(self, thread_id: str | None) -> str:
"""Internal sandbox acquisition with two-layer consistency.
Layer 1: In-process cache (fastest, covers same-process repeated access)
Layer 2: Backend discovery (covers containers started by other processes;
sandbox_id is deterministic from thread_id so no shared state file
is needed — any process can derive the same container name)
"""
# ── Layer 1: In-process cache (fast path) ──
if thread_id:
with self._lock:
if thread_id in self._thread_sandboxes:
existing_id = self._thread_sandboxes[thread_id]
if existing_id in self._sandboxes:
logger.info(f"Reusing in-process sandbox {existing_id} for thread {thread_id}")
self._last_activity[existing_id] = time.time()
return existing_id
else:
del self._thread_sandboxes[thread_id]
# Deterministic ID for thread-specific, random for anonymous
sandbox_id = self._deterministic_sandbox_id(thread_id) if thread_id else str(uuid.uuid4())[:8]
# ── Layer 1.5: Warm pool (container still running, no cold-start) ──
if thread_id:
with self._lock:
if sandbox_id in self._warm_pool:
info, _ = self._warm_pool.pop(sandbox_id)
sandbox = AioSandbox(id=sandbox_id, base_url=info.sandbox_url)
self._sandboxes[sandbox_id] = sandbox
self._sandbox_infos[sandbox_id] = info
self._last_activity[sandbox_id] = time.time()
self._thread_sandboxes[thread_id] = sandbox_id
logger.info(f"Reclaimed warm-pool sandbox {sandbox_id} for thread {thread_id} at {info.sandbox_url}")
return sandbox_id
# ── Layer 2: Backend discovery + create (protected by cross-process lock) ──
# Use a file lock so that two processes racing to create the same sandbox
# for the same thread_id serialize here: the second process will discover
# the container started by the first instead of hitting a name-conflict.
if thread_id:
return self._discover_or_create_with_lock(thread_id, sandbox_id)
return self._create_sandbox(thread_id, sandbox_id)
def _discover_or_create_with_lock(self, thread_id: str, sandbox_id: str) -> str:
"""Discover an existing sandbox or create a new one under a cross-process file lock.
The file lock serializes concurrent sandbox creation for the same thread_id
across multiple processes, preventing container-name conflicts.
"""
paths = get_paths()
paths.ensure_thread_dirs(thread_id)
lock_path = paths.thread_dir(thread_id) / f"{sandbox_id}.lock"
with open(lock_path, "a", encoding="utf-8") as lock_file:
locked = False
try:
_lock_file_exclusive(lock_file)
locked = True
# Re-check in-process caches under the file lock in case another
# thread in this process won the race while we were waiting.
with self._lock:
if thread_id in self._thread_sandboxes:
existing_id = self._thread_sandboxes[thread_id]
if existing_id in self._sandboxes:
logger.info(f"Reusing in-process sandbox {existing_id} for thread {thread_id} (post-lock check)")
self._last_activity[existing_id] = time.time()
return existing_id
if sandbox_id in self._warm_pool:
info, _ = self._warm_pool.pop(sandbox_id)
sandbox = AioSandbox(id=sandbox_id, base_url=info.sandbox_url)
self._sandboxes[sandbox_id] = sandbox
self._sandbox_infos[sandbox_id] = info
self._last_activity[sandbox_id] = time.time()
self._thread_sandboxes[thread_id] = sandbox_id
logger.info(f"Reclaimed warm-pool sandbox {sandbox_id} for thread {thread_id} (post-lock check)")
return sandbox_id
# Backend discovery: another process may have created the container.
discovered = self._backend.discover(sandbox_id)
if discovered is not None:
sandbox = AioSandbox(id=discovered.sandbox_id, base_url=discovered.sandbox_url)
with self._lock:
self._sandboxes[discovered.sandbox_id] = sandbox
self._sandbox_infos[discovered.sandbox_id] = discovered
self._last_activity[discovered.sandbox_id] = time.time()
self._thread_sandboxes[thread_id] = discovered.sandbox_id
logger.info(f"Discovered existing sandbox {discovered.sandbox_id} for thread {thread_id} at {discovered.sandbox_url}")
return discovered.sandbox_id
return self._create_sandbox(thread_id, sandbox_id)
finally:
if locked:
_unlock_file(lock_file)
def _evict_oldest_warm(self) -> str | None:
"""Destroy the oldest container in the warm pool to free capacity.
Returns:
The evicted sandbox_id, or None if warm pool is empty.
"""
with self._lock:
if not self._warm_pool:
return None
oldest_id = min(self._warm_pool, key=lambda sid: self._warm_pool[sid][1])
info, _ = self._warm_pool.pop(oldest_id)
try:
self._backend.destroy(info)
logger.info(f"Destroyed warm-pool sandbox {oldest_id}")
except Exception as e:
logger.error(f"Failed to destroy warm-pool sandbox {oldest_id}: {e}")
return None
return oldest_id
def _create_sandbox(self, thread_id: str | None, sandbox_id: str) -> str:
"""Create a new sandbox via the backend.
Args:
thread_id: Optional thread ID.
sandbox_id: The sandbox ID to use.
Returns:
The sandbox_id.
Raises:
RuntimeError: If sandbox creation or readiness check fails.
"""
extra_mounts = self._get_extra_mounts(thread_id)
# Enforce replicas: only warm-pool containers count toward eviction budget.
# Active sandboxes are in use by live threads and must not be forcibly stopped.
replicas = self._config.get("replicas", DEFAULT_REPLICAS)
with self._lock:
total = len(self._sandboxes) + len(self._warm_pool)
if total >= replicas:
evicted = self._evict_oldest_warm()
if evicted:
logger.info(f"Evicted warm-pool sandbox {evicted} to stay within replicas={replicas}")
else:
# All slots are occupied by active sandboxes — proceed anyway and log.
# The replicas limit is a soft cap; we never forcibly stop a container
# that is actively serving a thread.
logger.warning(f"All {replicas} replica slots are in active use; creating sandbox {sandbox_id} beyond the soft limit")
info = self._backend.create(thread_id, sandbox_id, extra_mounts=extra_mounts or None)
# Wait for sandbox to be ready
if not wait_for_sandbox_ready(info.sandbox_url, timeout=60):
self._backend.destroy(info)
raise RuntimeError(f"Sandbox {sandbox_id} failed to become ready within timeout at {info.sandbox_url}")
sandbox = AioSandbox(id=sandbox_id, base_url=info.sandbox_url)
with self._lock:
self._sandboxes[sandbox_id] = sandbox
self._sandbox_infos[sandbox_id] = info
self._last_activity[sandbox_id] = time.time()
if thread_id:
self._thread_sandboxes[thread_id] = sandbox_id
logger.info(f"Created sandbox {sandbox_id} for thread {thread_id} at {info.sandbox_url}")
return sandbox_id
def get(self, sandbox_id: str) -> Sandbox | None:
"""Get a sandbox by ID. Updates last activity timestamp.
Args:
sandbox_id: The ID of the sandbox.
Returns:
The sandbox instance if found, None otherwise.
"""
with self._lock:
sandbox = self._sandboxes.get(sandbox_id)
if sandbox is not None:
self._last_activity[sandbox_id] = time.time()
return sandbox
def release(self, sandbox_id: str) -> None:
"""Release a sandbox from active use into the warm pool.
The container is kept running so it can be reclaimed quickly by the same
thread on its next turn without a cold-start. The container will only be
stopped when the replicas limit forces eviction or during shutdown.
Args:
sandbox_id: The ID of the sandbox to release.
"""
info = None
thread_ids_to_remove: list[str] = []
with self._lock:
self._sandboxes.pop(sandbox_id, None)
info = self._sandbox_infos.pop(sandbox_id, None)
thread_ids_to_remove = [tid for tid, sid in self._thread_sandboxes.items() if sid == sandbox_id]
for tid in thread_ids_to_remove:
del self._thread_sandboxes[tid]
self._last_activity.pop(sandbox_id, None)
# Park in warm pool — container keeps running
if info and sandbox_id not in self._warm_pool:
self._warm_pool[sandbox_id] = (info, time.time())
logger.info(f"Released sandbox {sandbox_id} to warm pool (container still running)")
def destroy(self, sandbox_id: str) -> None:
"""Destroy a sandbox: stop the container and free all resources.
Unlike release(), this actually stops the container. Use this for
explicit cleanup, capacity-driven eviction, or shutdown.
Args:
sandbox_id: The ID of the sandbox to destroy.
"""
info = None
thread_ids_to_remove: list[str] = []
with self._lock:
self._sandboxes.pop(sandbox_id, None)
info = self._sandbox_infos.pop(sandbox_id, None)
thread_ids_to_remove = [tid for tid, sid in self._thread_sandboxes.items() if sid == sandbox_id]
for tid in thread_ids_to_remove:
del self._thread_sandboxes[tid]
self._last_activity.pop(sandbox_id, None)
# Also pull from warm pool if it was parked there
if info is None and sandbox_id in self._warm_pool:
info, _ = self._warm_pool.pop(sandbox_id)
else:
self._warm_pool.pop(sandbox_id, None)
if info:
self._backend.destroy(info)
logger.info(f"Destroyed sandbox {sandbox_id}")
def shutdown(self) -> None:
"""Shutdown all sandboxes. Thread-safe and idempotent."""
with self._lock:
if self._shutdown_called:
return
self._shutdown_called = True
sandbox_ids = list(self._sandboxes.keys())
warm_items = list(self._warm_pool.items())
self._warm_pool.clear()
# Stop idle checker
self._idle_checker_stop.set()
if self._idle_checker_thread is not None and self._idle_checker_thread.is_alive():
self._idle_checker_thread.join(timeout=5)
logger.info("Stopped idle checker thread")
logger.info(f"Shutting down {len(sandbox_ids)} active + {len(warm_items)} warm-pool sandbox(es)")
for sandbox_id in sandbox_ids:
try:
self.destroy(sandbox_id)
except Exception as e:
logger.error(f"Failed to destroy sandbox {sandbox_id} during shutdown: {e}")
for sandbox_id, (info, _) in warm_items:
try:
self._backend.destroy(info)
logger.info(f"Destroyed warm-pool sandbox {sandbox_id} during shutdown")
except Exception as e:
logger.error(f"Failed to destroy warm-pool sandbox {sandbox_id} during shutdown: {e}")

View File

@@ -0,0 +1,114 @@
"""Abstract base class for sandbox provisioning backends."""
from __future__ import annotations
import logging
import time
from abc import ABC, abstractmethod
import requests
from .sandbox_info import SandboxInfo
logger = logging.getLogger(__name__)
def wait_for_sandbox_ready(sandbox_url: str, timeout: int = 30) -> bool:
"""Poll sandbox health endpoint until ready or timeout.
Args:
sandbox_url: URL of the sandbox (e.g. http://k3s:30001).
timeout: Maximum time to wait in seconds.
Returns:
True if sandbox is ready, False otherwise.
"""
start_time = time.time()
while time.time() - start_time < timeout:
try:
response = requests.get(f"{sandbox_url}/v1/sandbox", timeout=5)
if response.status_code == 200:
return True
except requests.exceptions.RequestException:
pass
time.sleep(1)
return False
class SandboxBackend(ABC):
"""Abstract base for sandbox provisioning backends.
Two implementations:
- LocalContainerBackend: starts Docker/Apple Container locally, manages ports
- RemoteSandboxBackend: connects to a pre-existing URL (K8s service, external)
"""
@abstractmethod
def create(self, thread_id: str, sandbox_id: str, extra_mounts: list[tuple[str, str, bool]] | None = None) -> SandboxInfo:
"""Create/provision a new sandbox.
Args:
thread_id: Thread ID for which the sandbox is being created. Useful for backends that want to organize sandboxes by thread.
sandbox_id: Deterministic sandbox identifier.
extra_mounts: Additional volume mounts as (host_path, container_path, read_only) tuples.
Ignored by backends that don't manage containers (e.g., remote).
Returns:
SandboxInfo with connection details.
"""
...
@abstractmethod
def destroy(self, info: SandboxInfo) -> None:
"""Destroy/cleanup a sandbox and release its resources.
Args:
info: The sandbox metadata to destroy.
"""
...
@abstractmethod
def is_alive(self, info: SandboxInfo) -> bool:
"""Quick check whether a sandbox is still alive.
This should be a lightweight check (e.g., container inspect)
rather than a full health check.
Args:
info: The sandbox metadata to check.
Returns:
True if the sandbox appears to be alive.
"""
...
@abstractmethod
def discover(self, sandbox_id: str) -> SandboxInfo | None:
"""Try to discover an existing sandbox by its deterministic ID.
Used for cross-process recovery: when another process started a sandbox,
this process can discover it by the deterministic container name or URL.
Args:
sandbox_id: The deterministic sandbox ID to look for.
Returns:
SandboxInfo if found and healthy, None otherwise.
"""
...
def list_running(self) -> list[SandboxInfo]:
"""Enumerate all running sandboxes managed by this backend.
Used for startup reconciliation: when the process restarts, it needs
to discover containers started by previous processes so they can be
adopted into the warm pool or destroyed if idle too long.
The default implementation returns an empty list, which is correct
for backends that don't manage local containers (e.g., RemoteSandboxBackend
delegates lifecycle to the provisioner which handles its own cleanup).
Returns:
A list of SandboxInfo for all currently running sandboxes.
"""
return []

View File

@@ -0,0 +1,530 @@
"""Local container backend for sandbox provisioning.
Manages sandbox containers using Docker or Apple Container on the local machine.
Handles container lifecycle, port allocation, and cross-process container discovery.
"""
from __future__ import annotations
import json
import logging
import os
import subprocess
from datetime import datetime
from deerflow.utils.network import get_free_port, release_port
from .backend import SandboxBackend, wait_for_sandbox_ready
from .sandbox_info import SandboxInfo
logger = logging.getLogger(__name__)
def _parse_docker_timestamp(raw: str) -> float:
"""Parse Docker's ISO 8601 timestamp into a Unix epoch float.
Docker returns timestamps with nanosecond precision and a trailing ``Z``
(e.g. ``2026-04-08T01:22:50.123456789Z``). Python's ``fromisoformat``
accepts at most microseconds and (pre-3.11) does not accept ``Z``, so the
string is normalized before parsing. Returns ``0.0`` on empty input or
parse failure so callers can use ``0.0`` as a sentinel for "unknown age".
"""
if not raw:
return 0.0
try:
s = raw.strip()
if "." in s:
dot_pos = s.index(".")
tz_start = dot_pos + 1
while tz_start < len(s) and s[tz_start].isdigit():
tz_start += 1
frac = s[dot_pos + 1 : tz_start][:6] # truncate to microseconds
tz_suffix = s[tz_start:]
s = s[: dot_pos + 1] + frac + tz_suffix
if s.endswith("Z"):
s = s[:-1] + "+00:00"
return datetime.fromisoformat(s).timestamp()
except (ValueError, TypeError) as e:
logger.debug(f"Could not parse docker timestamp {raw!r}: {e}")
return 0.0
def _extract_host_port(inspect_entry: dict, container_port: int) -> int | None:
"""Extract the host port mapped to ``container_port/tcp`` from a docker inspect entry.
Returns None if the container has no port mapping for that port.
"""
try:
ports = (inspect_entry.get("NetworkSettings") or {}).get("Ports") or {}
bindings = ports.get(f"{container_port}/tcp") or []
if bindings:
host_port = bindings[0].get("HostPort")
if host_port:
return int(host_port)
except (ValueError, TypeError, AttributeError):
pass
return None
def _format_container_mount(runtime: str, host_path: str, container_path: str, read_only: bool) -> list[str]:
"""Format a bind-mount argument for the selected runtime.
Docker's ``-v host:container`` syntax is ambiguous for Windows drive-letter
paths like ``D:/...`` because ``:`` is both the drive separator and the
volume separator. Use ``--mount type=bind,...`` for Docker to avoid that
parsing ambiguity. Apple Container keeps using ``-v``.
"""
if runtime == "docker":
mount_spec = f"type=bind,src={host_path},dst={container_path}"
if read_only:
mount_spec += ",readonly"
return ["--mount", mount_spec]
mount_spec = f"{host_path}:{container_path}"
if read_only:
mount_spec += ":ro"
return ["-v", mount_spec]
class LocalContainerBackend(SandboxBackend):
"""Backend that manages sandbox containers locally using Docker or Apple Container.
On macOS, automatically prefers Apple Container if available, otherwise falls back to Docker.
On other platforms, uses Docker.
Features:
- Deterministic container naming for cross-process discovery
- Port allocation with thread-safe utilities
- Container lifecycle management (start/stop with --rm)
- Support for volume mounts and environment variables
"""
def __init__(
self,
*,
image: str,
base_port: int,
container_prefix: str,
config_mounts: list,
environment: dict[str, str],
):
"""Initialize the local container backend.
Args:
image: Container image to use.
base_port: Base port number to start searching for free ports.
container_prefix: Prefix for container names (e.g., "deer-flow-sandbox").
config_mounts: Volume mount configurations from config (list of VolumeMountConfig).
environment: Environment variables to inject into containers.
"""
self._image = image
self._base_port = base_port
self._container_prefix = container_prefix
self._config_mounts = config_mounts
self._environment = environment
self._runtime = self._detect_runtime()
@property
def runtime(self) -> str:
"""The detected container runtime ("docker" or "container")."""
return self._runtime
def _detect_runtime(self) -> str:
"""Detect which container runtime to use.
On macOS, prefer Apple Container if available, otherwise fall back to Docker.
On other platforms, use Docker.
Returns:
"container" for Apple Container, "docker" for Docker.
"""
import platform
if platform.system() == "Darwin":
try:
result = subprocess.run(
["container", "--version"],
capture_output=True,
text=True,
check=True,
timeout=5,
)
logger.info(f"Detected Apple Container: {result.stdout.strip()}")
return "container"
except (FileNotFoundError, subprocess.CalledProcessError, subprocess.TimeoutExpired):
logger.info("Apple Container not available, falling back to Docker")
return "docker"
# ── SandboxBackend interface ──────────────────────────────────────────
def create(self, thread_id: str, sandbox_id: str, extra_mounts: list[tuple[str, str, bool]] | None = None) -> SandboxInfo:
"""Start a new container and return its connection info.
Args:
thread_id: Thread ID for which the sandbox is being created. Useful for backends that want to organize sandboxes by thread.
sandbox_id: Deterministic sandbox identifier (used in container name).
extra_mounts: Additional volume mounts as (host_path, container_path, read_only) tuples.
Returns:
SandboxInfo with container details.
Raises:
RuntimeError: If the container fails to start.
"""
container_name = f"{self._container_prefix}-{sandbox_id}"
# Retry loop: if Docker rejects the port (e.g. a stale container still
# holds the binding after a process restart), skip that port and try the
# next one. The socket-bind check in get_free_port mirrors Docker's
# 0.0.0.0 bind, but Docker's port-release can be slightly asynchronous,
# so a reactive fallback here ensures we always make progress.
_next_start = self._base_port
container_id: str | None = None
port: int = 0
for _attempt in range(10):
port = get_free_port(start_port=_next_start)
try:
container_id = self._start_container(container_name, port, extra_mounts)
break
except RuntimeError as exc:
release_port(port)
err = str(exc)
err_lower = err.lower()
# Port already bound: skip this port and retry with the next one.
if "port is already allocated" in err or "address already in use" in err_lower:
logger.warning(f"Port {port} rejected by Docker (already allocated), retrying with next port")
_next_start = port + 1
continue
# Container-name conflict: another process may have already started
# the deterministic sandbox container for this sandbox_id. Try to
# discover and adopt the existing container instead of failing.
if "is already in use by container" in err_lower or "conflict. the container name" in err_lower:
logger.warning(f"Container name {container_name} already in use, attempting to discover existing sandbox instance")
existing = self.discover(sandbox_id)
if existing is not None:
return existing
raise
else:
raise RuntimeError("Could not start sandbox container: all candidate ports are already allocated by Docker")
# When running inside Docker (DooD), sandbox containers are reachable via
# host.docker.internal rather than localhost (they run on the host daemon).
sandbox_host = os.environ.get("DEER_FLOW_SANDBOX_HOST", "localhost")
return SandboxInfo(
sandbox_id=sandbox_id,
sandbox_url=f"http://{sandbox_host}:{port}",
container_name=container_name,
container_id=container_id,
)
def destroy(self, info: SandboxInfo) -> None:
"""Stop the container and release its port."""
# Prefer container_id, fall back to container_name (both accepted by docker stop).
# This ensures containers discovered via list_running() (which only has the name)
# can also be stopped.
stop_target = info.container_id or info.container_name
if stop_target:
self._stop_container(stop_target)
# Extract port from sandbox_url for release
try:
from urllib.parse import urlparse
port = urlparse(info.sandbox_url).port
if port:
release_port(port)
except Exception:
pass
def is_alive(self, info: SandboxInfo) -> bool:
"""Check if the container is still running (lightweight, no HTTP)."""
if info.container_name:
return self._is_container_running(info.container_name)
return False
def discover(self, sandbox_id: str) -> SandboxInfo | None:
"""Discover an existing container by its deterministic name.
Checks if a container with the expected name is running, retrieves its
port, and verifies it responds to health checks.
Args:
sandbox_id: The deterministic sandbox ID (determines container name).
Returns:
SandboxInfo if container found and healthy, None otherwise.
"""
container_name = f"{self._container_prefix}-{sandbox_id}"
if not self._is_container_running(container_name):
return None
port = self._get_container_port(container_name)
if port is None:
return None
sandbox_host = os.environ.get("DEER_FLOW_SANDBOX_HOST", "localhost")
sandbox_url = f"http://{sandbox_host}:{port}"
if not wait_for_sandbox_ready(sandbox_url, timeout=5):
return None
return SandboxInfo(
sandbox_id=sandbox_id,
sandbox_url=sandbox_url,
container_name=container_name,
)
def list_running(self) -> list[SandboxInfo]:
"""Enumerate all running containers matching the configured prefix.
Uses a single ``docker ps`` call to list container names, then a
single batched ``docker inspect`` call to retrieve creation timestamp
and port mapping for all containers at once. Total subprocess calls:
2 (down from 2N+1 in the naive per-container approach).
Note: Docker's ``--filter name=`` performs *substring* matching,
so a secondary ``startswith`` check is applied to ensure only
containers with the exact prefix are included.
Containers without port mappings are still included (with empty
sandbox_url) so that startup reconciliation can adopt orphans
regardless of their port state.
"""
# Step 1: enumerate container names via docker ps
try:
result = subprocess.run(
[
self._runtime,
"ps",
"--filter",
f"name={self._container_prefix}-",
"--format",
"{{.Names}}",
],
capture_output=True,
text=True,
timeout=10,
)
if result.returncode != 0:
stderr = (result.stderr or "").strip()
logger.warning(
"Failed to list running containers with %s ps (returncode=%s, stderr=%s)",
self._runtime,
result.returncode,
stderr or "<empty>",
)
return []
if not result.stdout.strip():
return []
except (subprocess.CalledProcessError, subprocess.TimeoutExpired, FileNotFoundError, OSError) as e:
logger.warning(f"Failed to list running containers: {e}")
return []
# Filter to names matching our exact prefix (docker filter is substring-based)
container_names = [name.strip() for name in result.stdout.strip().splitlines() if name.strip().startswith(self._container_prefix + "-")]
if not container_names:
return []
# Step 2: batched docker inspect — single subprocess call for all containers
inspections = self._batch_inspect(container_names)
infos: list[SandboxInfo] = []
sandbox_host = os.environ.get("DEER_FLOW_SANDBOX_HOST", "localhost")
for container_name in container_names:
data = inspections.get(container_name)
if data is None:
# Container disappeared between ps and inspect, or inspect failed
continue
created_at, host_port = data
sandbox_id = container_name[len(self._container_prefix) + 1 :]
sandbox_url = f"http://{sandbox_host}:{host_port}" if host_port else ""
infos.append(
SandboxInfo(
sandbox_id=sandbox_id,
sandbox_url=sandbox_url,
container_name=container_name,
created_at=created_at,
)
)
logger.info(f"Found {len(infos)} running sandbox container(s)")
return infos
def _batch_inspect(self, container_names: list[str]) -> dict[str, tuple[float, int | None]]:
"""Batch-inspect containers in a single subprocess call.
Returns a mapping of ``container_name -> (created_at, host_port)``.
Missing containers or parse failures are silently dropped from the result.
"""
if not container_names:
return {}
try:
result = subprocess.run(
[self._runtime, "inspect", *container_names],
capture_output=True,
text=True,
timeout=15,
)
except (subprocess.CalledProcessError, subprocess.TimeoutExpired, FileNotFoundError, OSError) as e:
logger.warning(f"Failed to batch-inspect containers: {e}")
return {}
if result.returncode != 0:
stderr = (result.stderr or "").strip()
logger.warning(
"Failed to batch-inspect containers with %s inspect (returncode=%s, stderr=%s)",
self._runtime,
result.returncode,
stderr or "<empty>",
)
return {}
try:
payload = json.loads(result.stdout or "[]")
except json.JSONDecodeError as e:
logger.warning(f"Failed to parse docker inspect output as JSON: {e}")
return {}
out: dict[str, tuple[float, int | None]] = {}
for entry in payload:
# ``Name`` is prefixed with ``/`` in the docker inspect response
name = (entry.get("Name") or "").lstrip("/")
if not name:
continue
created_at = _parse_docker_timestamp(entry.get("Created", ""))
host_port = _extract_host_port(entry, 8080)
out[name] = (created_at, host_port)
return out
# ── Container operations ─────────────────────────────────────────────
def _start_container(
self,
container_name: str,
port: int,
extra_mounts: list[tuple[str, str, bool]] | None = None,
) -> str:
"""Start a new container.
Args:
container_name: Name for the container.
port: Host port to map to container port 8080.
extra_mounts: Additional volume mounts.
Returns:
The container ID.
Raises:
RuntimeError: If container fails to start.
"""
cmd = [self._runtime, "run"]
# Docker-specific security options
if self._runtime == "docker":
cmd.extend(["--security-opt", "seccomp=unconfined"])
cmd.extend(
[
"--rm",
"-d",
"-p",
f"{port}:8080",
"--name",
container_name,
]
)
# Environment variables
for key, value in self._environment.items():
cmd.extend(["-e", f"{key}={value}"])
# Config-level volume mounts
for mount in self._config_mounts:
cmd.extend(
_format_container_mount(
self._runtime,
mount.host_path,
mount.container_path,
mount.read_only,
)
)
# Extra mounts (thread-specific, skills, etc.)
if extra_mounts:
for host_path, container_path, read_only in extra_mounts:
cmd.extend(
_format_container_mount(
self._runtime,
host_path,
container_path,
read_only,
)
)
cmd.append(self._image)
logger.info(f"Starting container using {self._runtime}: {' '.join(cmd)}")
try:
result = subprocess.run(cmd, capture_output=True, text=True, check=True)
container_id = result.stdout.strip()
logger.info(f"Started container {container_name} (ID: {container_id}) using {self._runtime}")
return container_id
except subprocess.CalledProcessError as e:
logger.error(f"Failed to start container using {self._runtime}: {e.stderr}")
raise RuntimeError(f"Failed to start sandbox container: {e.stderr}")
def _stop_container(self, container_id: str) -> None:
"""Stop a container (--rm ensures automatic removal)."""
try:
subprocess.run(
[self._runtime, "stop", container_id],
capture_output=True,
text=True,
check=True,
)
logger.info(f"Stopped container {container_id} using {self._runtime}")
except subprocess.CalledProcessError as e:
logger.warning(f"Failed to stop container {container_id}: {e.stderr}")
def _is_container_running(self, container_name: str) -> bool:
"""Check if a named container is currently running.
This enables cross-process container discovery — any process can detect
containers started by another process via the deterministic container name.
"""
try:
result = subprocess.run(
[self._runtime, "inspect", "-f", "{{.State.Running}}", container_name],
capture_output=True,
text=True,
timeout=5,
)
return result.returncode == 0 and result.stdout.strip().lower() == "true"
except (subprocess.CalledProcessError, subprocess.TimeoutExpired):
return False
def _get_container_port(self, container_name: str) -> int | None:
"""Get the host port of a running container.
Args:
container_name: The container name to inspect.
Returns:
The host port mapped to container port 8080, or None if not found.
"""
try:
result = subprocess.run(
[self._runtime, "port", container_name, "8080"],
capture_output=True,
text=True,
timeout=5,
)
if result.returncode == 0 and result.stdout.strip():
# Output format: "0.0.0.0:PORT" or ":::PORT"
port_str = result.stdout.strip().split(":")[-1]
return int(port_str)
except (subprocess.CalledProcessError, subprocess.TimeoutExpired, ValueError):
pass
return None

View File

@@ -0,0 +1,156 @@
"""Remote sandbox backend — delegates Pod lifecycle to the provisioner service.
The provisioner dynamically creates per-sandbox-id Pods + NodePort Services
in k3s. The backend accesses sandbox pods directly via ``k3s:{NodePort}``.
Architecture:
┌────────────┐ HTTP ┌─────────────┐ K8s API ┌──────────┐
│ this file │ ──────▸ │ provisioner │ ────────▸ │ k3s │
│ (backend) │ │ :8002 │ │ :6443 │
└────────────┘ └─────────────┘ └─────┬────┘
│ creates
┌─────────────┐ ┌─────▼──────┐
│ backend │ ────────▸ │ sandbox │
│ │ direct │ Pod(s) │
└─────────────┘ k3s:NPort └────────────┘
"""
from __future__ import annotations
import logging
import requests
from .backend import SandboxBackend
from .sandbox_info import SandboxInfo
logger = logging.getLogger(__name__)
class RemoteSandboxBackend(SandboxBackend):
"""Backend that delegates sandbox lifecycle to the provisioner service.
All Pod creation, destruction, and discovery are handled by the
provisioner. This backend is a thin HTTP client.
Typical config.yaml::
sandbox:
use: deerflow.community.aio_sandbox:AioSandboxProvider
provisioner_url: http://provisioner:8002
"""
def __init__(self, provisioner_url: str):
"""Initialize with the provisioner service URL.
Args:
provisioner_url: URL of the provisioner service
(e.g., ``http://provisioner:8002``).
"""
self._provisioner_url = provisioner_url.rstrip("/")
@property
def provisioner_url(self) -> str:
return self._provisioner_url
# ── SandboxBackend interface ──────────────────────────────────────────
def create(
self,
thread_id: str,
sandbox_id: str,
extra_mounts: list[tuple[str, str, bool]] | None = None,
) -> SandboxInfo:
"""Create a sandbox Pod + Service via the provisioner.
Calls ``POST /api/sandboxes`` which creates a dedicated Pod +
NodePort Service in k3s.
"""
return self._provisioner_create(thread_id, sandbox_id, extra_mounts)
def destroy(self, info: SandboxInfo) -> None:
"""Destroy a sandbox Pod + Service via the provisioner."""
self._provisioner_destroy(info.sandbox_id)
def is_alive(self, info: SandboxInfo) -> bool:
"""Check whether the sandbox Pod is running."""
return self._provisioner_is_alive(info.sandbox_id)
def discover(self, sandbox_id: str) -> SandboxInfo | None:
"""Discover an existing sandbox via the provisioner.
Calls ``GET /api/sandboxes/{sandbox_id}`` and returns info if
the Pod exists.
"""
return self._provisioner_discover(sandbox_id)
# ── Provisioner API calls ─────────────────────────────────────────────
def _provisioner_create(self, thread_id: str, sandbox_id: str, extra_mounts: list[tuple[str, str, bool]] | None = None) -> SandboxInfo:
"""POST /api/sandboxes → create Pod + Service."""
try:
resp = requests.post(
f"{self._provisioner_url}/api/sandboxes",
json={
"sandbox_id": sandbox_id,
"thread_id": thread_id,
},
timeout=30,
)
resp.raise_for_status()
data = resp.json()
logger.info(f"Provisioner created sandbox {sandbox_id}: sandbox_url={data['sandbox_url']}")
return SandboxInfo(
sandbox_id=sandbox_id,
sandbox_url=data["sandbox_url"],
)
except requests.RequestException as exc:
logger.error(f"Provisioner create failed for {sandbox_id}: {exc}")
raise RuntimeError(f"Provisioner create failed: {exc}") from exc
def _provisioner_destroy(self, sandbox_id: str) -> None:
"""DELETE /api/sandboxes/{sandbox_id} → destroy Pod + Service."""
try:
resp = requests.delete(
f"{self._provisioner_url}/api/sandboxes/{sandbox_id}",
timeout=15,
)
if resp.ok:
logger.info(f"Provisioner destroyed sandbox {sandbox_id}")
else:
logger.warning(f"Provisioner destroy returned {resp.status_code}: {resp.text}")
except requests.RequestException as exc:
logger.warning(f"Provisioner destroy failed for {sandbox_id}: {exc}")
def _provisioner_is_alive(self, sandbox_id: str) -> bool:
"""GET /api/sandboxes/{sandbox_id} → check Pod phase."""
try:
resp = requests.get(
f"{self._provisioner_url}/api/sandboxes/{sandbox_id}",
timeout=10,
)
if resp.ok:
data = resp.json()
return data.get("status") == "Running"
return False
except requests.RequestException:
return False
def _provisioner_discover(self, sandbox_id: str) -> SandboxInfo | None:
"""GET /api/sandboxes/{sandbox_id} → discover existing sandbox."""
try:
resp = requests.get(
f"{self._provisioner_url}/api/sandboxes/{sandbox_id}",
timeout=10,
)
if resp.status_code == 404:
return None
resp.raise_for_status()
data = resp.json()
return SandboxInfo(
sandbox_id=sandbox_id,
sandbox_url=data["sandbox_url"],
)
except requests.RequestException as exc:
logger.debug(f"Provisioner discover failed for {sandbox_id}: {exc}")
return None

View File

@@ -0,0 +1,41 @@
"""Sandbox metadata for cross-process discovery and state persistence."""
from __future__ import annotations
import time
from dataclasses import dataclass, field
@dataclass
class SandboxInfo:
"""Persisted sandbox metadata that enables cross-process discovery.
This dataclass holds all the information needed to reconnect to an
existing sandbox from a different process (e.g., gateway vs langgraph,
multiple workers, or across K8s pods with shared storage).
"""
sandbox_id: str
sandbox_url: str # e.g. http://localhost:8080 or http://k3s:30001
container_name: str | None = None # Only for local container backend
container_id: str | None = None # Only for local container backend
created_at: float = field(default_factory=time.time)
def to_dict(self) -> dict:
return {
"sandbox_id": self.sandbox_id,
"sandbox_url": self.sandbox_url,
"container_name": self.container_name,
"container_id": self.container_id,
"created_at": self.created_at,
}
@classmethod
def from_dict(cls, data: dict) -> SandboxInfo:
return cls(
sandbox_id=data["sandbox_id"],
sandbox_url=data.get("sandbox_url", data.get("base_url", "")),
container_name=data.get("container_name"),
container_id=data.get("container_id"),
created_at=data.get("created_at", time.time()),
)