Initial commit: hardened DeerFlow factory

Vendored deer-flow upstream (bytedance/deer-flow) plus prompt-injection
hardening:

- New deerflow.security package: content_delimiter, html_cleaner,
  sanitizer (8 layers — invisible chars, control chars, symbols, NFC,
  PUA, tag chars, horizontal whitespace collapse with newline/tab
  preservation, length cap)
- New deerflow.community.searx package: web_search, web_fetch,
  image_search backed by a private SearX instance, every external
  string sanitized and wrapped in <<<EXTERNAL_UNTRUSTED_CONTENT>>>
  delimiters
- All native community web providers (ddg_search, tavily, exa,
  firecrawl, jina_ai, infoquest, image_search) replaced with hard-fail
  stubs that raise NativeWebToolDisabledError at import time, so a
  misconfigured tool.use path fails loud rather than silently falling
  back to unsanitized output
- Native client back-doors (jina_client.py, infoquest_client.py)
  stubbed too
- Native-tool tests quarantined under tests/_disabled_native/
  (collect_ignore_glob via local conftest.py)
- Sanitizer Layer 7 fix: only collapse horizontal whitespace, preserve
  newlines and tabs so list/table structure survives
- Hardened runtime config.yaml references only the searx-backed tools
- Factory overlay (backend/) kept in sync with deer-flow tree as a
  reference / source

See HARDENING.md for the full audit trail and verification steps.
This commit is contained in:
2026-04-12 14:23:57 +02:00
commit 6de0bf9f5b
889 changed files with 173052 additions and 0 deletions

View File

@@ -0,0 +1,21 @@
"""Stream bridge — decouples agent workers from SSE endpoints.
A ``StreamBridge`` sits between the background task that runs an agent
(producer) and the HTTP endpoint that pushes Server-Sent Events to
the client (consumer). This package provides an abstract protocol
(:class:`StreamBridge`) plus a default in-memory implementation backed
by :mod:`asyncio.Queue`.
"""
from .async_provider import make_stream_bridge
from .base import END_SENTINEL, HEARTBEAT_SENTINEL, StreamBridge, StreamEvent
from .memory import MemoryStreamBridge
__all__ = [
"END_SENTINEL",
"HEARTBEAT_SENTINEL",
"MemoryStreamBridge",
"StreamBridge",
"StreamEvent",
"make_stream_bridge",
]

View File

@@ -0,0 +1,52 @@
"""Async stream bridge factory.
Provides an **async context manager** aligned with
:func:`deerflow.agents.checkpointer.async_provider.make_checkpointer`.
Usage (e.g. FastAPI lifespan)::
from deerflow.agents.stream_bridge import make_stream_bridge
async with make_stream_bridge() as bridge:
app.state.stream_bridge = bridge
"""
from __future__ import annotations
import contextlib
import logging
from collections.abc import AsyncIterator
from deerflow.config.stream_bridge_config import get_stream_bridge_config
from .base import StreamBridge
logger = logging.getLogger(__name__)
@contextlib.asynccontextmanager
async def make_stream_bridge(config=None) -> AsyncIterator[StreamBridge]:
    """Async context manager that yields a :class:`StreamBridge`.

    Falls back to :class:`MemoryStreamBridge` when no configuration is
    provided and nothing is set globally.

    Args:
        config: Optional stream-bridge configuration object (must expose
            ``type`` and, for the memory backend, ``queue_maxsize``).
            When ``None``, :func:`get_stream_bridge_config` is consulted;
            if that is also ``None``, an in-memory bridge with the default
            queue size (256) is created.

    Yields:
        A ready-to-use :class:`StreamBridge`; it is closed on exit.

    Raises:
        NotImplementedError: If ``config.type == "redis"`` (Phase 2).
        ValueError: For any other unrecognised ``config.type``.
    """
    if config is None:
        config = get_stream_bridge_config()
    if config is None or config.type == "memory":
        # Relative import for consistency with ``from .base import StreamBridge``
        # above — ``.memory`` is a sibling module of this one (the package
        # __init__ re-exports MemoryStreamBridge from it), so this resolves
        # correctly wherever the package is mounted. The import is deferred so
        # the memory backend is only loaded when actually selected.
        from .memory import MemoryStreamBridge

        maxsize = config.queue_maxsize if config is not None else 256
        bridge = MemoryStreamBridge(queue_maxsize=maxsize)
        logger.info("Stream bridge initialised: memory (queue_maxsize=%d)", maxsize)
        try:
            yield bridge
        finally:
            # Release backend resources even if the caller's block raised.
            await bridge.close()
        return
    if config.type == "redis":
        raise NotImplementedError("Redis stream bridge planned for Phase 2")
    raise ValueError(f"Unknown stream bridge type: {config.type!r}")

View File

@@ -0,0 +1,72 @@
"""Abstract stream bridge protocol.
StreamBridge decouples agent workers (producers) from SSE endpoints
(consumers), aligning with LangGraph Platform's Queue + StreamManager
architecture.
"""
from __future__ import annotations
import abc
from collections.abc import AsyncIterator
from dataclasses import dataclass
from typing import Any
@dataclass(frozen=True)
class StreamEvent:
    """Single immutable stream event.

    Attributes:
        id: Event ID string (used as the SSE ``id:`` field, supports
            ``Last-Event-ID`` reconnection). Empty for sentinel events.
        event: SSE event name, e.g. ``"metadata"``, ``"updates"``,
            ``"events"``, ``"error"``, ``"end"``.
        data: JSON-serialisable payload.
    """

    id: str
    event: str
    data: Any


# Sentinel events, recognised by identity (``is``) rather than by value:
# HEARTBEAT_SENTINEL signals that no event arrived within the heartbeat
# interval; END_SENTINEL signals that the producer has finished the run.
HEARTBEAT_SENTINEL = StreamEvent(id="", event="__heartbeat__", data=None)
END_SENTINEL = StreamEvent(id="", event="__end__", data=None)
class StreamBridge(abc.ABC):
    """Abstract base for stream bridges.

    A bridge carries :class:`StreamEvent` objects from a producer (the
    background task running an agent) to one or more consumers (SSE
    endpoints), keyed by ``run_id``.
    """

    @abc.abstractmethod
    async def publish(self, run_id: str, event: str, data: Any) -> None:
        """Enqueue a single event for *run_id* (producer side)."""

    @abc.abstractmethod
    async def publish_end(self, run_id: str) -> None:
        """Signal that no more events will be produced for *run_id*."""

    # NOTE: intentionally a plain ``def`` — implementations are expected to be
    # async generator functions, whose *call* already returns an AsyncIterator.
    @abc.abstractmethod
    def subscribe(
        self,
        run_id: str,
        *,
        last_event_id: str | None = None,
        heartbeat_interval: float = 15.0,
    ) -> AsyncIterator[StreamEvent]:
        """Async iterator that yields events for *run_id* (consumer side).

        Yields :data:`HEARTBEAT_SENTINEL` when no event arrives within
        *heartbeat_interval* seconds. Yields :data:`END_SENTINEL` once
        the producer calls :meth:`publish_end`.
        """

    @abc.abstractmethod
    async def cleanup(self, run_id: str, *, delay: float = 0) -> None:
        """Release resources associated with *run_id*.

        If *delay* > 0 the implementation should wait before releasing,
        giving late subscribers a chance to drain remaining events.
        """

    async def close(self) -> None:
        """Release backend resources. Default is a no-op."""

View File

@@ -0,0 +1,133 @@
"""In-memory stream bridge backed by an in-process event log."""
from __future__ import annotations
import asyncio
import logging
import time
from collections.abc import AsyncIterator
from dataclasses import dataclass, field
from typing import Any
from .base import END_SENTINEL, HEARTBEAT_SENTINEL, StreamBridge, StreamEvent
logger = logging.getLogger(__name__)
@dataclass
class _RunStream:
    """Mutable per-run stream state: a bounded, replayable event log."""

    # Retained events, oldest first; old entries are evicted by the bridge.
    events: list[StreamEvent] = field(default_factory=list)
    # Guards events/ended and wakes blocked subscribers on publish.
    condition: asyncio.Condition = field(default_factory=asyncio.Condition)
    # Set once the producer calls publish_end().
    ended: bool = False
    # Global offset of events[0]; grows as old events are evicted, so that
    # subscriber offsets stay valid across evictions.
    start_offset: int = 0
class MemoryStreamBridge(StreamBridge):
    """Per-run in-memory event log implementation.

    Events are retained for a bounded time window per run so late subscribers
    and reconnecting clients can replay buffered events from ``Last-Event-ID``.
    When the per-run buffer exceeds ``queue_maxsize`` the oldest events are
    evicted and ``start_offset`` advances, so subscriber offsets stay stable.
    """

    def __init__(self, *, queue_maxsize: int = 256) -> None:
        """Initialise the bridge.

        Args:
            queue_maxsize: Maximum number of events retained per run.
        """
        self._maxsize = queue_maxsize
        self._streams: dict[str, _RunStream] = {}
        self._counters: dict[str, int] = {}

    # -- helpers ---------------------------------------------------------------

    def _get_or_create_stream(self, run_id: str) -> _RunStream:
        """Return the stream for *run_id*, creating it on first access."""
        if run_id not in self._streams:
            self._streams[run_id] = _RunStream()
            self._counters[run_id] = 0
        return self._streams[run_id]

    def _next_id(self, run_id: str) -> str:
        """Return the next event ID for *run_id* as ``"<ms-timestamp>-<seq>"``."""
        self._counters[run_id] = self._counters.get(run_id, 0) + 1
        ts = int(time.time() * 1000)
        seq = self._counters[run_id] - 1
        return f"{ts}-{seq}"

    def _resolve_start_offset(self, stream: _RunStream, last_event_id: str | None) -> int:
        """Map *last_event_id* to the global offset of the next event to send.

        Returns the offset just past the matching retained event, or the
        earliest retained offset when the ID is unknown (``None``, or already
        evicted from the buffer).
        """
        if last_event_id is None:
            return stream.start_offset
        for index, entry in enumerate(stream.events):
            if entry.id == last_event_id:
                return stream.start_offset + index + 1
        if stream.events:
            logger.warning(
                "last_event_id=%s not found in retained buffer; replaying from earliest retained event",
                last_event_id,
            )
        return stream.start_offset

    # -- StreamBridge API ------------------------------------------------------

    async def publish(self, run_id: str, event: str, data: Any) -> None:
        """Append one event to the run's log and wake all waiting subscribers."""
        stream = self._get_or_create_stream(run_id)
        entry = StreamEvent(id=self._next_id(run_id), event=event, data=data)
        async with stream.condition:
            stream.events.append(entry)
            if len(stream.events) > self._maxsize:
                # Evict oldest events; advance start_offset so subscriber
                # offsets keep pointing at the same logical positions.
                overflow = len(stream.events) - self._maxsize
                del stream.events[:overflow]
                stream.start_offset += overflow
            stream.condition.notify_all()

    async def publish_end(self, run_id: str) -> None:
        """Mark the run as finished and wake subscribers so they can drain."""
        stream = self._get_or_create_stream(run_id)
        async with stream.condition:
            stream.ended = True
            stream.condition.notify_all()

    async def subscribe(
        self,
        run_id: str,
        *,
        last_event_id: str | None = None,
        heartbeat_interval: float = 15.0,
    ) -> AsyncIterator[StreamEvent]:
        """Yield events for *run_id*, replaying from *last_event_id* if given.

        Emits :data:`HEARTBEAT_SENTINEL` when *heartbeat_interval* seconds pass
        without a new event, and :data:`END_SENTINEL` (then returns) once the
        producer has called :meth:`publish_end` and the buffer is drained.
        """
        stream = self._get_or_create_stream(run_id)
        async with stream.condition:
            next_offset = self._resolve_start_offset(stream, last_event_id)
        while True:
            async with stream.condition:
                if next_offset < stream.start_offset:
                    # Eviction outran this subscriber; skip ahead to the
                    # earliest event still retained.
                    logger.warning(
                        "subscriber for run %s fell behind retained buffer; resuming from offset %s",
                        run_id,
                        stream.start_offset,
                    )
                    next_offset = stream.start_offset
                local_index = next_offset - stream.start_offset
                if 0 <= local_index < len(stream.events):
                    entry = stream.events[local_index]
                    next_offset += 1
                elif stream.ended:
                    entry = END_SENTINEL
                else:
                    try:
                        # Catch asyncio.TimeoutError, not the builtin: on
                        # Python 3.11+ they are the same class, but on older
                        # interpreters asyncio.wait_for raises the distinct
                        # asyncio.TimeoutError, which the builtin would miss
                        # and crash the subscriber instead of heartbeating.
                        await asyncio.wait_for(stream.condition.wait(), timeout=heartbeat_interval)
                    except asyncio.TimeoutError:
                        entry = HEARTBEAT_SENTINEL
                    else:
                        # Woken by a publish/end notification; re-check state.
                        continue
            # Yield outside the condition lock so slow consumers don't block
            # the producer.
            if entry is END_SENTINEL:
                yield END_SENTINEL
                return
            yield entry

    async def cleanup(self, run_id: str, *, delay: float = 0) -> None:
        """Drop all state for *run_id*, optionally after *delay* seconds."""
        if delay > 0:
            await asyncio.sleep(delay)
        self._streams.pop(run_id, None)
        self._counters.pop(run_id, None)

    async def close(self) -> None:
        """Drop all per-run state for every run."""
        self._streams.clear()
        self._counters.clear()