Initial commit: hardened DeerFlow factory
Vendored deer-flow upstream (bytedance/deer-flow) plus prompt-injection hardening: - New deerflow.security package: content_delimiter, html_cleaner, sanitizer (8 layers — invisible chars, control chars, symbols, NFC, PUA, tag chars, horizontal whitespace collapse with newline/tab preservation, length cap) - New deerflow.community.searx package: web_search, web_fetch, image_search backed by a private SearX instance, every external string sanitized and wrapped in <<<EXTERNAL_UNTRUSTED_CONTENT>>> delimiters - All native community web providers (ddg_search, tavily, exa, firecrawl, jina_ai, infoquest, image_search) replaced with hard-fail stubs that raise NativeWebToolDisabledError at import time, so a misconfigured tool.use path fails loud rather than silently falling back to unsanitized output - Native client back-doors (jina_client.py, infoquest_client.py) stubbed too - Native-tool tests quarantined under tests/_disabled_native/ (collect_ignore_glob via local conftest.py) - Sanitizer Layer 7 fix: only collapse horizontal whitespace, preserve newlines and tabs so list/table structure survives - Hardened runtime config.yaml references only the searx-backed tools - Factory overlay (backend/) kept in sync with deer-flow tree as a reference / source See HARDENING.md for the full audit trail and verification steps.
This commit is contained in:
@@ -0,0 +1,21 @@
|
||||
"""Stream bridge — decouples agent workers from SSE endpoints.
|
||||
|
||||
A ``StreamBridge`` sits between the background task that runs an agent
|
||||
(producer) and the HTTP endpoint that pushes Server-Sent Events to
|
||||
the client (consumer). This package provides an abstract protocol
|
||||
(:class:`StreamBridge`) plus a default in-memory implementation backed
|
||||
by :mod:`asyncio.Queue`.
|
||||
"""
|
||||
|
||||
from .async_provider import make_stream_bridge
|
||||
from .base import END_SENTINEL, HEARTBEAT_SENTINEL, StreamBridge, StreamEvent
|
||||
from .memory import MemoryStreamBridge
|
||||
|
||||
# Explicit public API of the stream_bridge package, kept in ASCII sort order
# (uppercase sentinel/class names precede the lowercase factory function).
__all__ = [
    "END_SENTINEL",
    "HEARTBEAT_SENTINEL",
    "MemoryStreamBridge",
    "StreamBridge",
    "StreamEvent",
    "make_stream_bridge",
]
|
||||
@@ -0,0 +1,52 @@
|
||||
"""Async stream bridge factory.
|
||||
|
||||
Provides an **async context manager** aligned with
|
||||
:func:`deerflow.agents.checkpointer.async_provider.make_checkpointer`.
|
||||
|
||||
Usage (e.g. FastAPI lifespan)::
|
||||
|
||||
from deerflow.agents.stream_bridge import make_stream_bridge
|
||||
|
||||
async with make_stream_bridge() as bridge:
|
||||
app.state.stream_bridge = bridge
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import contextlib
|
||||
import logging
|
||||
from collections.abc import AsyncIterator
|
||||
|
||||
from deerflow.config.stream_bridge_config import get_stream_bridge_config
|
||||
|
||||
from .base import StreamBridge
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
@contextlib.asynccontextmanager
async def make_stream_bridge(config=None) -> AsyncIterator[StreamBridge]:
    """Yield a configured :class:`StreamBridge` as an async context manager.

    When *config* is ``None`` the globally registered configuration is
    consulted; if that is also absent, an in-memory bridge with the
    default queue size is produced. The bridge is closed on exit.

    Raises:
        NotImplementedError: For the ``"redis"`` bridge type (Phase 2).
        ValueError: For any unrecognised bridge type.
    """
    if config is None:
        config = get_stream_bridge_config()

    # A missing configuration is treated as the in-memory default.
    bridge_type = "memory" if config is None else config.type

    if bridge_type == "memory":
        # Imported lazily to avoid a circular import at module load time.
        from deerflow.runtime.stream_bridge.memory import MemoryStreamBridge

        queue_maxsize = 256 if config is None else config.queue_maxsize
        bridge = MemoryStreamBridge(queue_maxsize=queue_maxsize)
        logger.info("Stream bridge initialised: memory (queue_maxsize=%d)", queue_maxsize)
        try:
            yield bridge
        finally:
            await bridge.close()
        return

    if bridge_type == "redis":
        raise NotImplementedError("Redis stream bridge planned for Phase 2")

    raise ValueError(f"Unknown stream bridge type: {bridge_type!r}")
|
||||
@@ -0,0 +1,72 @@
|
||||
"""Abstract stream bridge protocol.
|
||||
|
||||
StreamBridge decouples agent workers (producers) from SSE endpoints
|
||||
(consumers), aligning with LangGraph Platform's Queue + StreamManager
|
||||
architecture.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import abc
|
||||
from collections.abc import AsyncIterator
|
||||
from dataclasses import dataclass
|
||||
from typing import Any
|
||||
|
||||
|
||||
@dataclass(frozen=True)
class StreamEvent:
    """One immutable event flowing through a stream bridge.

    Attributes:
        id: Monotonically increasing event ID; emitted as the SSE ``id:``
            field so clients can reconnect with ``Last-Event-ID``.
        event: SSE event name such as ``"metadata"``, ``"updates"``,
            ``"events"``, ``"error"`` or ``"end"``.
        data: JSON-serialisable payload carried by the event.
    """

    id: str
    event: str
    data: Any


# Out-of-band markers yielded by subscribe(): a keep-alive tick and the
# end-of-stream signal. Both carry an empty ID and no payload.
HEARTBEAT_SENTINEL = StreamEvent(id="", event="__heartbeat__", data=None)
END_SENTINEL = StreamEvent(id="", event="__end__", data=None)
|
||||
|
||||
|
||||
class StreamBridge(abc.ABC):
    """Contract for bridging agent-produced events to SSE consumers.

    Producers call :meth:`publish` and finally :meth:`publish_end`;
    consumers iterate :meth:`subscribe`. Implementations own buffering,
    replay and per-run stream lifecycle.
    """

    @abc.abstractmethod
    async def publish(self, run_id: str, event: str, data: Any) -> None:
        """Append one event to the stream for *run_id* (producer side)."""
        ...

    @abc.abstractmethod
    async def publish_end(self, run_id: str) -> None:
        """Mark the stream for *run_id* as complete; no further events follow."""
        ...

    @abc.abstractmethod
    def subscribe(
        self,
        run_id: str,
        *,
        last_event_id: str | None = None,
        heartbeat_interval: float = 15.0,
    ) -> AsyncIterator[StreamEvent]:
        """Return an async iterator over the events of *run_id* (consumer side).

        Emits :data:`HEARTBEAT_SENTINEL` whenever *heartbeat_interval*
        seconds pass without a new event, and :data:`END_SENTINEL` after
        the producer has called :meth:`publish_end`.
        """
        ...

    @abc.abstractmethod
    async def cleanup(self, run_id: str, *, delay: float = 0) -> None:
        """Drop all state held for *run_id*.

        A positive *delay* asks the implementation to postpone the release
        so late subscribers may still drain buffered events.
        """
        ...

    async def close(self) -> None:
        """Tear down backend resources; the base implementation does nothing."""
|
||||
@@ -0,0 +1,133 @@
|
||||
"""In-memory stream bridge backed by an in-process event log."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import asyncio
|
||||
import logging
|
||||
import time
|
||||
from collections.abc import AsyncIterator
|
||||
from dataclasses import dataclass, field
|
||||
from typing import Any
|
||||
|
||||
from .base import END_SENTINEL, HEARTBEAT_SENTINEL, StreamBridge, StreamEvent
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
@dataclass
class _RunStream:
    """Mutable per-run state: the retained event log plus coordination bits."""

    # Events still retained for replay; trimmed from the front on overflow.
    events: list[StreamEvent] = field(default_factory=list)
    # Wakes blocked subscribers when events arrive or the stream ends.
    condition: asyncio.Condition = field(default_factory=asyncio.Condition)
    # True once the producer has signalled the end of the stream.
    ended: bool = False
    # Absolute offset of events[0] within the run's full event history.
    start_offset: int = 0
|
||||
|
||||
|
||||
class MemoryStreamBridge(StreamBridge):
    """Per-run in-memory event log implementation.

    Events are retained in a bounded buffer (``queue_maxsize`` entries) per
    run so late subscribers and reconnecting clients can replay buffered
    events from ``Last-Event-ID``. All state lives in this process; the
    bridge is suitable for single-worker deployments only.
    """

    def __init__(self, *, queue_maxsize: int = 256) -> None:
        """Create a bridge retaining at most *queue_maxsize* events per run."""
        self._maxsize = queue_maxsize
        # run_id -> retained event log plus its condition variable.
        self._streams: dict[str, _RunStream] = {}
        # run_id -> count of events ever published (drives ID sequencing).
        self._counters: dict[str, int] = {}

    # -- helpers ---------------------------------------------------------------

    def _get_or_create_stream(self, run_id: str) -> _RunStream:
        """Return the stream state for *run_id*, creating it on first use."""
        if run_id not in self._streams:
            self._streams[run_id] = _RunStream()
            self._counters[run_id] = 0
        return self._streams[run_id]

    def _next_id(self, run_id: str) -> str:
        """Return the next event ID for *run_id* as ``"<ms-timestamp>-<seq>"``.

        The per-run sequence number keeps IDs unique even when several
        events share the same millisecond timestamp.
        """
        self._counters[run_id] = self._counters.get(run_id, 0) + 1
        ts = int(time.time() * 1000)
        seq = self._counters[run_id] - 1
        return f"{ts}-{seq}"

    def _resolve_start_offset(self, stream: _RunStream, last_event_id: str | None) -> int:
        """Map *last_event_id* to the absolute offset of the next event to send.

        Falls back to the earliest retained event when the ID is unknown
        (e.g. already trimmed from the buffer). Callers must hold
        ``stream.condition``.
        """
        if last_event_id is None:
            return stream.start_offset

        for index, entry in enumerate(stream.events):
            if entry.id == last_event_id:
                return stream.start_offset + index + 1

        if stream.events:
            logger.warning(
                "last_event_id=%s not found in retained buffer; replaying from earliest retained event",
                last_event_id,
            )
        return stream.start_offset

    # -- StreamBridge API ------------------------------------------------------

    async def publish(self, run_id: str, event: str, data: Any) -> None:
        """Append one event to the run's log and wake all waiting subscribers."""
        stream = self._get_or_create_stream(run_id)
        entry = StreamEvent(id=self._next_id(run_id), event=event, data=data)
        async with stream.condition:
            stream.events.append(entry)
            if len(stream.events) > self._maxsize:
                # Trim the oldest events; start_offset records how many have
                # been dropped so subscribers' absolute offsets stay valid.
                overflow = len(stream.events) - self._maxsize
                del stream.events[:overflow]
                stream.start_offset += overflow
            stream.condition.notify_all()

    async def publish_end(self, run_id: str) -> None:
        """Mark the run's stream as ended and wake all waiting subscribers."""
        stream = self._get_or_create_stream(run_id)
        async with stream.condition:
            stream.ended = True
            stream.condition.notify_all()

    async def subscribe(
        self,
        run_id: str,
        *,
        last_event_id: str | None = None,
        heartbeat_interval: float = 15.0,
    ) -> AsyncIterator[StreamEvent]:
        """Yield events for *run_id*; heartbeats on idle, END_SENTINEL at end.

        Resumes after *last_event_id* when it is still retained, otherwise
        replays from the earliest retained event.
        """
        stream = self._get_or_create_stream(run_id)
        async with stream.condition:
            next_offset = self._resolve_start_offset(stream, last_event_id)

        while True:
            async with stream.condition:
                if next_offset < stream.start_offset:
                    # The buffer was trimmed past our position while we were
                    # yielding outside the lock; skip ahead rather than
                    # replaying events we can no longer see.
                    logger.warning(
                        "subscriber for run %s fell behind retained buffer; resuming from offset %s",
                        run_id,
                        stream.start_offset,
                    )
                    next_offset = stream.start_offset

                local_index = next_offset - stream.start_offset
                if 0 <= local_index < len(stream.events):
                    entry = stream.events[local_index]
                    next_offset += 1
                elif stream.ended:
                    entry = END_SENTINEL
                else:
                    try:
                        await asyncio.wait_for(stream.condition.wait(), timeout=heartbeat_interval)
                    except (TimeoutError, asyncio.TimeoutError):
                        # asyncio.wait_for raises asyncio.TimeoutError, which
                        # is NOT the builtin TimeoutError before Python 3.11;
                        # catch both so heartbeats work on every interpreter.
                        entry = HEARTBEAT_SENTINEL
                    else:
                        continue

            if entry is END_SENTINEL:
                yield END_SENTINEL
                return
            yield entry

    async def cleanup(self, run_id: str, *, delay: float = 0) -> None:
        """Forget all state for *run_id*, optionally after *delay* seconds."""
        if delay > 0:
            await asyncio.sleep(delay)
        self._streams.pop(run_id, None)
        self._counters.pop(run_id, None)

    async def close(self) -> None:
        """Drop every run's state; subscribers receive no further events."""
        self._streams.clear()
        self._counters.clear()
|
||||
Reference in New Issue
Block a user