Initial commit: hardened DeerFlow factory

Vendored deer-flow upstream (bytedance/deer-flow) plus prompt-injection hardening:

- New deerflow.security package: content_delimiter, html_cleaner, sanitizer (8 layers — invisible chars, control chars, symbols, NFC, PUA, tag chars, horizontal whitespace collapse with newline/tab preservation, length cap)
- New deerflow.community.searx package: web_search, web_fetch, image_search backed by a private SearX instance, every external string sanitized and wrapped in <<<EXTERNAL_UNTRUSTED_CONTENT>>> delimiters
- All native community web providers (ddg_search, tavily, exa, firecrawl, jina_ai, infoquest, image_search) replaced with hard-fail stubs that raise NativeWebToolDisabledError at import time, so a misconfigured tool.use path fails loudly rather than silently falling back to unsanitized output
- Native client back-doors (jina_client.py, infoquest_client.py) stubbed too
- Native-tool tests quarantined under tests/_disabled_native/ (collect_ignore_glob via local conftest.py)
- Sanitizer Layer 7 fix: only collapse horizontal whitespace, preserve newlines and tabs so list/table structure survives
- Hardened runtime config.yaml references only the searx-backed tools
- Factory overlay (backend/) kept in sync with deer-flow tree as a reference / source

See HARDENING.md for the full audit trail and verification steps.
2026-04-12 14:23:57 +02:00
commit 6de0bf9f5b
889 changed files with 173052 additions and 0 deletions
--- a/deer-flow/backend/tests/test_local_sandbox_encoding.py
+++ b/deer-flow/backend/tests/test_local_sandbox_encoding.py
@@ -0,0 +1,164 @@
+import builtins
+from types import SimpleNamespace
+
+import deerflow.sandbox.local.local_sandbox as local_sandbox
+from deerflow.sandbox.local.local_sandbox import LocalSandbox
+
+
+def _open(base, file, mode="r", *args, **kwargs):
+    """Open ``file`` through ``base`` with GBK as the default text encoding.
+
+    Binary modes are forwarded unchanged. For text modes ``encoding="gbk"``
+    is supplied unless the caller passed an explicit ``encoding`` keyword,
+    so code that forgets to name an encoding gets GBK instead of UTF-8.
+    """
+    if "b" not in mode:
+        # Only fill the gap: an explicit encoding from the caller wins.
+        kwargs.setdefault("encoding", "gbk")
+    return base(file, mode, *args, **kwargs)
+
+
+def test_read_file_uses_utf8_on_windows_locale(tmp_path, monkeypatch):
+    """``read_file`` returns UTF-8 text even when ``open`` defaults to GBK."""
+    real_open = builtins.open
+
+    def gbk_default_open(file, mode="r", *args, **kwargs):
+        return _open(real_open, file, mode, *args, **kwargs)
+
+    # Curly quotes: their UTF-8 bytes do not decode back to the same text as GBK.
+    expected = "\u201cutf8\u201d"
+    target = tmp_path / "utf8.txt"
+    target.write_text(expected, encoding="utf-8")
+
+    # raising=False lets the attribute be created if the module has no ``open`` of its own.
+    monkeypatch.setattr(local_sandbox, "open", gbk_default_open, raising=False)
+
+    assert LocalSandbox("t").read_file(str(target)) == expected
+
+
+def test_write_file_uses_utf8_on_windows_locale(tmp_path, monkeypatch):
+    """``write_file`` stores UTF-8 bytes even when ``open`` defaults to GBK."""
+    real_open = builtins.open
+
+    def gbk_default_open(file, mode="r", *args, **kwargs):
+        return _open(real_open, file, mode, *args, **kwargs)
+
+    # An emoji outside the BMP, which the GBK codec cannot encode.
+    payload = "emoji \U0001f600"
+    target = tmp_path / "utf8.txt"
+
+    # raising=False lets the attribute be created if the module has no ``open`` of its own.
+    monkeypatch.setattr(local_sandbox, "open", gbk_default_open, raising=False)
+
+    LocalSandbox("t").write_file(str(target), payload)
+
+    assert target.read_text(encoding="utf-8") == payload
+
+
+def test_get_shell_prefers_posix_shell_from_path_before_windows_fallback(monkeypatch):
+    """With ``os.name == "nt"``, a hit on the POSIX candidates is returned directly."""
+    posix_candidates = ("/bin/zsh", "/bin/bash", "/bin/sh", "sh")
+    git_sh = r"C:\Program Files\Git\bin\sh.exe"
+
+    def fake_find(candidates):
+        # Only the POSIX candidate tuple resolves; any other lookup finds nothing.
+        if candidates == posix_candidates:
+            return git_sh
+        return None
+
+    monkeypatch.setattr(local_sandbox.os, "name", "nt")
+    monkeypatch.setattr(LocalSandbox, "_find_first_available_shell", fake_find)
+
+    assert LocalSandbox._get_shell() == git_sh
+
+
+def test_get_shell_uses_powershell_fallback_on_windows(monkeypatch):
+    """When no POSIX shell is found on Windows, the PowerShell/cmd candidates are tried next."""
+    posix_candidates = ("/bin/zsh", "/bin/bash", "/bin/sh", "sh")
+    powershell = r"C:\Windows\System32\WindowsPowerShell\v1.0\powershell.exe"
+    seen: list[tuple[str, ...]] = []
+
+    def fake_find(candidates: tuple[str, ...]) -> str | None:
+        # Record every lookup so the second (Windows) candidate list can be inspected.
+        seen.append(candidates)
+        return None if candidates == posix_candidates else powershell
+
+    monkeypatch.setattr(local_sandbox.os, "name", "nt")
+    monkeypatch.setattr(local_sandbox.os, "environ", {"SystemRoot": r"C:\Windows"})
+    monkeypatch.setattr(LocalSandbox, "_find_first_available_shell", fake_find)
+
+    expected_windows_candidates = (
+        "pwsh",
+        "pwsh.exe",
+        "powershell",
+        "powershell.exe",
+        powershell,
+        "cmd.exe",
+    )
+
+    assert LocalSandbox._get_shell() == powershell
+    assert seen[1] == expected_windows_candidates
+
+
+def test_get_shell_uses_cmd_as_last_windows_fallback(monkeypatch):
+    """``_get_shell`` returns ``cmd.exe`` when that is what the Windows lookup resolves to."""
+    posix_candidates = ("/bin/zsh", "/bin/bash", "/bin/sh", "sh")
+    cmd_exe = r"C:\Windows\System32\cmd.exe"
+
+    def fake_find(candidates: tuple[str, ...]) -> str | None:
+        # The POSIX lookup fails; every other lookup resolves to cmd.exe.
+        return None if candidates == posix_candidates else cmd_exe
+
+    monkeypatch.setattr(local_sandbox.os, "name", "nt")
+    monkeypatch.setattr(local_sandbox.os, "environ", {"SystemRoot": r"C:\Windows"})
+    monkeypatch.setattr(LocalSandbox, "_find_first_available_shell", fake_find)
+
+    assert LocalSandbox._get_shell() == cmd_exe
+
+
+def test_execute_command_uses_powershell_command_mode_on_windows(monkeypatch):
+    """A PowerShell shell is invoked as ``<shell> -NoProfile -Command <command>``."""
+    powershell = r"C:\Windows\System32\WindowsPowerShell\v1.0\powershell.exe"
+    recorded: list[tuple[object, dict]] = []
+
+    def fake_run(*args, **kwargs):
+        # Capture the argv and keyword options instead of spawning a process.
+        recorded.append((args[0], kwargs))
+        return SimpleNamespace(stdout="ok", stderr="", returncode=0)
+
+    monkeypatch.setattr(local_sandbox.os, "name", "nt")
+    monkeypatch.setattr(LocalSandbox, "_get_shell", staticmethod(lambda: powershell))
+    monkeypatch.setattr(local_sandbox.subprocess, "run", fake_run)
+
+    result = LocalSandbox("t").execute_command("Write-Output hello")
+
+    expected_argv = [powershell, "-NoProfile", "-Command", "Write-Output hello"]
+    expected_kwargs = {
+        "shell": False,
+        "capture_output": True,
+        "text": True,
+        "timeout": 600,
+    }
+    assert result == "ok"
+    assert recorded == [(expected_argv, expected_kwargs)]
+
+
+def test_execute_command_uses_posix_shell_command_mode_on_windows(monkeypatch):
+    """A POSIX-style shell on Windows is invoked as ``<shell> -c <command>``."""
+    git_sh = r"C:\Program Files\Git\bin\sh.exe"
+    recorded: list[tuple[object, dict]] = []
+
+    def fake_run(*args, **kwargs):
+        # Capture the argv and keyword options instead of spawning a process.
+        recorded.append((args[0], kwargs))
+        return SimpleNamespace(stdout="ok", stderr="", returncode=0)
+
+    monkeypatch.setattr(local_sandbox.os, "name", "nt")
+    monkeypatch.setattr(LocalSandbox, "_get_shell", staticmethod(lambda: git_sh))
+    monkeypatch.setattr(local_sandbox.subprocess, "run", fake_run)
+
+    result = LocalSandbox("t").execute_command("echo hello")
+
+    expected_argv = [git_sh, "-c", "echo hello"]
+    expected_kwargs = {
+        "shell": False,
+        "capture_output": True,
+        "text": True,
+        "timeout": 600,
+    }
+    assert result == "ok"
+    assert recorded == [(expected_argv, expected_kwargs)]
+
+
+def test_execute_command_uses_cmd_command_mode_on_windows(monkeypatch):
+    """``cmd.exe`` is invoked as ``<shell> /c <command>``."""
+    cmd_exe = r"C:\Windows\System32\cmd.exe"
+    recorded: list[tuple[object, dict]] = []
+
+    def fake_run(*args, **kwargs):
+        # Capture the argv and keyword options instead of spawning a process.
+        recorded.append((args[0], kwargs))
+        return SimpleNamespace(stdout="ok", stderr="", returncode=0)
+
+    monkeypatch.setattr(local_sandbox.os, "name", "nt")
+    monkeypatch.setattr(LocalSandbox, "_get_shell", staticmethod(lambda: cmd_exe))
+    monkeypatch.setattr(local_sandbox.subprocess, "run", fake_run)
+
+    result = LocalSandbox("t").execute_command("echo hello")
+
+    expected_argv = [cmd_exe, "/c", "echo hello"]
+    expected_kwargs = {
+        "shell": False,
+        "capture_output": True,
+        "text": True,
+        "timeout": 600,
+    }
+    assert result == "ok"
+    assert recorded == [(expected_argv, expected_kwargs)]