Initial commit: hardened DeerFlow factory

Vendored deer-flow upstream (bytedance/deer-flow) plus prompt-injection
hardening:

- New deerflow.security package: content_delimiter, html_cleaner,
  sanitizer (8 layers — invisible chars, control chars, symbols, NFC,
  PUA, tag chars, horizontal whitespace collapse with newline/tab
  preservation, length cap)
- New deerflow.community.searx package: web_search, web_fetch,
  image_search backed by a private SearX instance, every external
  string sanitized and wrapped in <<<EXTERNAL_UNTRUSTED_CONTENT>>>
  delimiters
- All native community web providers (ddg_search, tavily, exa,
  firecrawl, jina_ai, infoquest, image_search) replaced with hard-fail
  stubs that raise NativeWebToolDisabledError at import time, so a
  misconfigured tool.use path fails loud rather than silently falling
  back to unsanitized output
- Native client back-doors (jina_client.py, infoquest_client.py)
  stubbed too
- Native-tool tests quarantined under tests/_disabled_native/
  (collect_ignore_glob via local conftest.py)
- Sanitizer Layer 7 fix: only collapse horizontal whitespace, preserve
  newlines and tabs so list/table structure survives
- Hardened runtime config.yaml references only the searx-backed tools
- Factory overlay (backend/) kept in sync with deer-flow tree as a
  reference / source

See HARDENING.md for the full audit trail and verification steps.
This commit is contained in:
2026-04-12 14:23:57 +02:00
commit 6de0bf9f5b
889 changed files with 173052 additions and 0 deletions

View File

@@ -0,0 +1,81 @@
#!/usr/bin/env python3
"""Load the Memory Settings review sample into a local DeerFlow runtime."""
from __future__ import annotations
import argparse
import json
import shutil
from datetime import datetime
from pathlib import Path
def default_source(repo_root: Path) -> Path:
    """Return the location of the bundled sample JSON below *repo_root*.

    The path is built purely from *repo_root*; the file system is not touched.
    """
    return repo_root.joinpath("backend", "docs", "memory-settings-sample.json")
def default_target(repo_root: Path) -> Path:
    """Return the location of the runtime ``memory.json`` below *repo_root*.

    The path is built purely from *repo_root*; the file system is not touched.
    """
    return repo_root.joinpath("backend", ".deer-flow", "memory.json")
def parse_args(repo_root: Path, argv: list[str] | None = None) -> argparse.Namespace:
    """Parse the command-line options of the sample loader.

    Args:
        repo_root: Repository root used to compute the default values of
            ``--source`` and ``--target``.
        argv: Explicit argument list to parse. ``None`` (the default) makes
            argparse read ``sys.argv[1:]``, which is what the script did
            before this parameter existed.

    Returns:
        A namespace with ``source`` (Path), ``target`` (Path) and
        ``no_backup`` (bool) attributes.
    """
    parser = argparse.ArgumentParser(
        description="Copy the Memory Settings sample data into the local runtime memory file.",
    )
    parser.add_argument(
        "--source",
        type=Path,
        default=default_source(repo_root),
        help="Path to the sample JSON file.",
    )
    parser.add_argument(
        "--target",
        type=Path,
        default=default_target(repo_root),
        help="Path to the runtime memory.json file.",
    )
    parser.add_argument(
        "--no-backup",
        action="store_true",
        help="Overwrite the target without writing a backup copy first.",
    )
    # Forwarding argv lets callers (and tests) drive the parser without
    # mutating sys.argv; passing None preserves the original behaviour.
    return parser.parse_args(argv)
def validate_json_file(path: Path) -> None:
    """Check that *path* holds UTF-8 encoded, parseable JSON.

    The parsed value is discarded; the function only exists for its
    exceptions. Errors from reading the file or from the JSON parser
    (``json.JSONDecodeError``) propagate to the caller unchanged.
    """
    text = path.read_text(encoding="utf-8")
    json.loads(text)
def main() -> int:
    """Copy the sample memory file over the local runtime memory file.

    Steps: resolve ``--source`` and ``--target``, verify the source is a
    readable JSON file, create the target directory if needed, optionally
    back up an existing target next to it (``<name>.bak-<timestamp>``), then
    copy the source over the target.

    Returns:
        ``0`` on success.

    Raises:
        SystemExit: With a human-readable message when the source is missing
            or not a regular file, when it is not valid UTF-8 JSON, or when
            source and target resolve to the same file.
    """
    # The directory one level above this script's directory is used as the
    # repository root for the default paths.
    repo_root = Path(__file__).resolve().parents[1]
    args = parse_args(repo_root)
    source = args.source.resolve()
    target = args.target.resolve()

    # is_file() rather than exists(): a directory would pass exists() and
    # then fail later with an unrelated OS error while being opened.
    if not source.is_file():
        raise SystemExit(f"Sample file not found: {source}")

    # Copying a file onto itself would first write a pointless backup and
    # then fail inside shutil.copy2 with SameFileError.
    if source == target:
        raise SystemExit(f"Source and target are the same file: {source}")

    try:
        validate_json_file(source)
    except (json.JSONDecodeError, UnicodeDecodeError) as exc:
        # Report bad sample data as a normal CLI error, not a traceback.
        raise SystemExit(f"Sample file is not valid JSON: {source} ({exc})") from exc

    target.parent.mkdir(parents=True, exist_ok=True)

    backup_path: Path | None = None
    if target.exists() and not args.no_backup:
        timestamp = datetime.now().strftime("%Y%m%d-%H%M%S")
        backup_path = target.with_name(f"{target.name}.bak-{timestamp}")
        shutil.copy2(target, backup_path)

    shutil.copy2(source, target)

    print(f"Loaded sample memory into: {target}")
    if backup_path is not None:
        print(f"Backup created at: {backup_path}")
    else:
        print("No backup created.")
    return 0
if __name__ == "__main__":
    # Run only when executed as a script; main()'s return value becomes the
    # process exit status.
    exit_code = main()
    raise SystemExit(exit_code)