Initial commit: hardened DeerFlow factory

Vendored deer-flow upstream (bytedance/deer-flow) plus prompt-injection
hardening:

- New deerflow.security package: content_delimiter, html_cleaner,
  sanitizer (8 layers — invisible chars, control chars, symbols, NFC,
  PUA, tag chars, horizontal whitespace collapse with newline/tab
  preservation, length cap)
- New deerflow.community.searx package: web_search, web_fetch,
  image_search backed by a private SearX instance, every external
  string sanitized and wrapped in <<<EXTERNAL_UNTRUSTED_CONTENT>>>
  delimiters
- All native community web providers (ddg_search, tavily, exa,
  firecrawl, jina_ai, infoquest, image_search) replaced with hard-fail
  stubs that raise NativeWebToolDisabledError at import time, so a
  misconfigured tool.use path fails loudly rather than silently falling
  back to unsanitized output
- Native client back-doors (jina_client.py, infoquest_client.py)
  stubbed too
- Native-tool tests quarantined under tests/_disabled_native/
  (collect_ignore_glob via local conftest.py)
- Sanitizer Layer 7 fix: only collapse horizontal whitespace, preserve
  newlines and tabs so list/table structure survives
- Hardened runtime config.yaml references only the searx-backed tools
- Factory overlay (backend/) kept in sync with deer-flow tree as a
  reference / source

See HARDENING.md for the full audit trail and verification steps.
This commit is contained in:
2026-04-12 14:23:57 +02:00
commit 6de0bf9f5b
889 changed files with 173052 additions and 0 deletions

View File

@@ -0,0 +1,31 @@
"""Store provider for the DeerFlow runtime.

Re-exports the public API of both the async provider (for long-running
servers) and the sync provider (for CLI tools and the embedded client).

Async usage (FastAPI lifespan)::

    from deerflow.runtime.store import make_store

    async with make_store() as store:
        app.state.store = store

Sync usage (CLI / DeerFlowClient)::

    from deerflow.runtime.store import get_store, store_context

    store = get_store()                 # singleton
    with store_context() as store: ...  # one-shot
"""
from .async_provider import make_store
from .provider import get_store, reset_store, store_context
# Public names of this package: everything imported above is re-exported.
__all__ = [
    # async
    "make_store",
    # sync
    "get_store",
    "reset_store",
    "store_context",
]

View File

@@ -0,0 +1,28 @@
"""Shared SQLite connection utilities for store and checkpointer providers."""
from __future__ import annotations
import pathlib
from deerflow.config.paths import resolve_path
def resolve_sqlite_conn_str(raw: str) -> str:
    """Normalize a configured SQLite connection string.

    Args:
        raw: The connection string as written in the configuration.

    Returns:
        ``raw`` itself when it is the in-memory marker ``":memory:"`` or
        begins with ``file:`` (a SQLite URI); otherwise the string form of
        ``resolve_path(raw)``.
    """
    # SQLite's own special forms must reach the driver untouched.
    is_special = raw == ":memory:" or raw.startswith("file:")
    return raw if is_special else str(resolve_path(raw))
def ensure_sqlite_parent_dir(conn_str: str) -> None:
    """Make sure the directory that will hold a SQLite database file exists.

    Args:
        conn_str: A SQLite connection string. ``":memory:"`` and strings
            starting with ``file:`` are ignored; anything else is treated
            as a filesystem path.

    The parent directory (including missing ancestors) is created; an
    already-existing directory is not an error.
    """
    # Nothing to create for in-memory databases or ``file:`` URIs.
    if conn_str == ":memory:" or conn_str.startswith("file:"):
        return

    parent_dir = pathlib.Path(conn_str).parent
    parent_dir.mkdir(parents=True, exist_ok=True)

View File

@@ -0,0 +1,113 @@
"""Async Store factory — backend mirrors the configured checkpointer.
The store and checkpointer share the same ``checkpointer`` section in
*config.yaml* so they always use the same persistence backend:
- ``type: memory`` → :class:`langgraph.store.memory.InMemoryStore`
- ``type: sqlite`` → :class:`langgraph.store.sqlite.aio.AsyncSqliteStore`
- ``type: postgres`` → :class:`langgraph.store.postgres.aio.AsyncPostgresStore`
Usage (e.g. FastAPI lifespan)::
from deerflow.runtime.store import make_store
async with make_store() as store:
app.state.store = store
"""
from __future__ import annotations
import contextlib
import logging
from collections.abc import AsyncIterator
from langgraph.store.base import BaseStore
from deerflow.config.app_config import get_app_config
from deerflow.runtime.store.provider import POSTGRES_CONN_REQUIRED, POSTGRES_STORE_INSTALL, SQLITE_STORE_INSTALL, ensure_sqlite_parent_dir, resolve_sqlite_conn_str
logger = logging.getLogger(__name__)
# ---------------------------------------------------------------------------
# Internal backend factory
# ---------------------------------------------------------------------------
@contextlib.asynccontextmanager
async def _async_store(config) -> AsyncIterator[BaseStore]:
    """Build a Store for ``config.type`` and release it when the block exits.

    Args:
        config: Checkpointer configuration object; only its ``type`` and
            ``connection_string`` attributes are read here.

    Yields:
        An ``InMemoryStore`` for ``"memory"``, an ``AsyncSqliteStore`` for
        ``"sqlite"`` or an ``AsyncPostgresStore`` for ``"postgres"``. The
        SQLite and Postgres stores have ``setup()`` awaited before being
        yielded.

    Raises:
        ImportError: The backend's store package cannot be imported.
        ValueError: ``connection_string`` is empty for postgres, or
            ``config.type`` is not one of the three supported values.
    """
    backend = config.type

    if backend == "memory":
        from langgraph.store.memory import InMemoryStore

        logger.info("Store: using InMemoryStore (in-process, not persistent)")
        yield InMemoryStore()

    elif backend == "sqlite":
        # Backend packages are imported lazily so they are only needed when selected.
        try:
            from langgraph.store.sqlite.aio import AsyncSqliteStore
        except ImportError as exc:
            raise ImportError(SQLITE_STORE_INSTALL) from exc

        # Fall back to "store.db" when no connection string is configured.
        db_target = resolve_sqlite_conn_str(config.connection_string or "store.db")
        ensure_sqlite_parent_dir(db_target)
        async with AsyncSqliteStore.from_conn_string(db_target) as sqlite_store:
            await sqlite_store.setup()
            logger.info("Store: using AsyncSqliteStore (%s)", db_target)
            yield sqlite_store

    elif backend == "postgres":
        try:
            from langgraph.store.postgres.aio import AsyncPostgresStore  # type: ignore[import]
        except ImportError as exc:
            raise ImportError(POSTGRES_STORE_INSTALL) from exc

        if not config.connection_string:
            raise ValueError(POSTGRES_CONN_REQUIRED)

        async with AsyncPostgresStore.from_conn_string(config.connection_string) as pg_store:
            await pg_store.setup()
            logger.info("Store: using AsyncPostgresStore")
            yield pg_store

    else:
        raise ValueError(f"Unknown store backend type: {config.type!r}")
# ---------------------------------------------------------------------------
# Public async context manager
# ---------------------------------------------------------------------------
@contextlib.asynccontextmanager
async def make_store() -> AsyncIterator[BaseStore]:
    """Yield a Store built from the ``checkpointer`` section of the app config.

    The section is read via :func:`get_app_config` and handed to
    :func:`_async_store`, which selects the backend::

        async with make_store() as store:
            app.state.store = store

    When the app config has no ``checkpointer`` section, a WARNING is
    logged and an :class:`~langgraph.store.memory.InMemoryStore` is yielded
    instead.
    """
    checkpointer_cfg = get_app_config().checkpointer

    if checkpointer_cfg is not None:
        async with _async_store(checkpointer_cfg) as configured_store:
            yield configured_store
        return

    # No persistence configured: fall back to a process-local store.
    from langgraph.store.memory import InMemoryStore

    logger.warning("No 'checkpointer' section in config.yaml — using InMemoryStore for the store. Thread list will be lost on server restart. Configure a sqlite or postgres backend for persistence.")
    yield InMemoryStore()

View File

@@ -0,0 +1,188 @@
"""Sync Store factory.
Provides a **sync singleton** and a **sync context manager** for CLI tools
and the embedded :class:`~deerflow.client.DeerFlowClient`.
The backend mirrors the configured checkpointer so that both always use the
same persistence technology. Supported backends: memory, sqlite, postgres.
Usage::
from deerflow.runtime.store.provider import get_store, store_context
# Singleton — reused across calls, closed on process exit
store = get_store()
# One-shot — fresh connection, closed on block exit
with store_context() as store:
store.put(("ns",), "key", {"value": 1})
"""
from __future__ import annotations
import contextlib
import logging
from collections.abc import Iterator
from langgraph.store.base import BaseStore
from deerflow.config.app_config import get_app_config
from deerflow.runtime.store._sqlite_utils import ensure_sqlite_parent_dir, resolve_sqlite_conn_str
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)
# ---------------------------------------------------------------------------
# Error message constants
# ---------------------------------------------------------------------------
# Message of the ImportError raised when the SQLite store classes cannot be imported.
SQLITE_STORE_INSTALL = "langgraph-checkpoint-sqlite is required for the SQLite store. Install it with: uv add langgraph-checkpoint-sqlite"
# Message of the ImportError raised when the PostgreSQL store classes cannot be imported.
POSTGRES_STORE_INSTALL = "langgraph-checkpoint-postgres is required for the PostgreSQL store. Install it with: uv add langgraph-checkpoint-postgres psycopg[binary] psycopg-pool"
# Message of the ValueError raised when the postgres backend has no connection string.
POSTGRES_CONN_REQUIRED = "checkpointer.connection_string is required for the postgres backend"
# ---------------------------------------------------------------------------
# Sync factory
# ---------------------------------------------------------------------------
@contextlib.contextmanager
def _sync_store_cm(config) -> Iterator[BaseStore]:
    """Build a sync Store for ``config.type`` and release it on exit.

    Args:
        config: Checkpointer configuration object; only its ``type`` and
            ``connection_string`` attributes are read here.

    Yields:
        An ``InMemoryStore`` for ``"memory"``, a ``SqliteStore`` for
        ``"sqlite"`` or a ``PostgresStore`` for ``"postgres"``. The SQLite
        and Postgres stores have ``setup()`` called before being yielded.

    Raises:
        ImportError: The backend's store package cannot be imported.
        ValueError: ``connection_string`` is empty for postgres, or
            ``config.type`` is not one of the three supported values.
    """
    backend = config.type

    if backend == "memory":
        from langgraph.store.memory import InMemoryStore

        logger.info("Store: using InMemoryStore (in-process, not persistent)")
        yield InMemoryStore()

    elif backend == "sqlite":
        # Backend packages are imported lazily so they are only needed when selected.
        try:
            from langgraph.store.sqlite import SqliteStore
        except ImportError as exc:
            raise ImportError(SQLITE_STORE_INSTALL) from exc

        # Fall back to "store.db" when no connection string is configured.
        db_target = resolve_sqlite_conn_str(config.connection_string or "store.db")
        ensure_sqlite_parent_dir(db_target)
        with SqliteStore.from_conn_string(db_target) as sqlite_store:
            sqlite_store.setup()
            logger.info("Store: using SqliteStore (%s)", db_target)
            yield sqlite_store

    elif backend == "postgres":
        try:
            from langgraph.store.postgres import PostgresStore  # type: ignore[import]
        except ImportError as exc:
            raise ImportError(POSTGRES_STORE_INSTALL) from exc

        if not config.connection_string:
            raise ValueError(POSTGRES_CONN_REQUIRED)

        with PostgresStore.from_conn_string(config.connection_string) as pg_store:
            pg_store.setup()
            logger.info("Store: using PostgresStore")
            yield pg_store

    else:
        raise ValueError(f"Unknown store backend type: {config.type!r}")
# ---------------------------------------------------------------------------
# Sync singleton
# ---------------------------------------------------------------------------
# Cached Store handed out by get_store(); None until the first call and
# again after reset_store().
_store: BaseStore | None = None
# Context manager entered by get_store() when a checkpointer config exists;
# reset_store() calls its __exit__ to release the backend.
_store_ctx = None  # open context manager keeping the connection alive
def get_store() -> BaseStore:
    """Return the process-wide sync Store, building it on the first call.

    Later calls return the cached instance until :func:`reset_store` clears
    it. When no checkpointer configuration is available, a WARNING is
    logged and an :class:`~langgraph.store.memory.InMemoryStore` is cached
    and returned.

    Raises:
        ImportError: If the required package for the configured backend is not installed.
        ValueError: If ``connection_string`` is missing for a backend that requires it.
    """
    global _store, _store_ctx

    if _store is not None:
        return _store

    # Imported here, at call time, so the current value of ``_app_config``
    # is observed rather than whatever it was when this module was loaded.
    from deerflow.config.app_config import _app_config
    from deerflow.config.checkpointer_config import get_checkpointer_config

    checkpointer_cfg = get_checkpointer_config()
    if checkpointer_cfg is None and _app_config is None:
        # Nothing loaded yet: try to load the app config, then look again.
        # A missing config file is tolerated and leads to the fallback below.
        with contextlib.suppress(FileNotFoundError):
            get_app_config()
        checkpointer_cfg = get_checkpointer_config()

    if checkpointer_cfg is not None:
        # Keep the entered context manager so reset_store() can close it later.
        _store_ctx = _sync_store_cm(checkpointer_cfg)
        _store = _store_ctx.__enter__()
        return _store

    from langgraph.store.memory import InMemoryStore

    logger.warning("No 'checkpointer' section in config.yaml — using InMemoryStore for the store. Thread list will be lost on server restart. Configure a sqlite or postgres backend for persistence.")
    _store = InMemoryStore()
    return _store
def reset_store() -> None:
    """Drop the cached sync Store so the next :func:`get_store` rebuilds it.

    If a backend context manager is open, its ``__exit__`` is called first;
    an ``Exception`` raised by that cleanup is logged as a warning and not
    propagated. Both module-level cache slots end up as ``None``.
    """
    global _store, _store_ctx

    open_ctx = _store_ctx
    if open_ctx is not None:
        try:
            open_ctx.__exit__(None, None, None)
        except Exception:
            # Best-effort cleanup: a failing close must not block the reset.
            logger.warning("Error during store cleanup", exc_info=True)

    _store_ctx = None
    _store = None
# ---------------------------------------------------------------------------
# Sync context manager
# ---------------------------------------------------------------------------
@contextlib.contextmanager
def store_context() -> Iterator[BaseStore]:
    """Yield a fresh sync Store for the duration of one ``with`` block.

    Nothing is cached here, in contrast to :func:`get_store`: every use
    builds its own Store through :func:`_sync_store_cm` and tears it down
    when the block exits::

        with store_context() as store:
            store.put(("threads",), thread_id, {...})

    When the app config has no ``checkpointer`` section, a WARNING is
    logged and an :class:`~langgraph.store.memory.InMemoryStore` is yielded
    instead.
    """
    checkpointer_cfg = get_app_config().checkpointer

    if checkpointer_cfg is not None:
        with _sync_store_cm(checkpointer_cfg) as configured_store:
            yield configured_store
        return

    # No persistence configured: fall back to a process-local store.
    from langgraph.store.memory import InMemoryStore

    logger.warning("No 'checkpointer' section in config.yaml — using InMemoryStore for the store. Thread list will be lost on server restart. Configure a sqlite or postgres backend for persistence.")
    yield InMemoryStore()