Vendored deer-flow upstream (bytedance/deer-flow) plus prompt-injection hardening: - New deerflow.security package: content_delimiter, html_cleaner, sanitizer (8 layers — invisible chars, control chars, symbols, NFC, PUA, tag chars, horizontal whitespace collapse with newline/tab preservation, length cap) - New deerflow.community.searx package: web_search, web_fetch, image_search backed by a private SearX instance, every external string sanitized and wrapped in <<<EXTERNAL_UNTRUSTED_CONTENT>>> delimiters - All native community web providers (ddg_search, tavily, exa, firecrawl, jina_ai, infoquest, image_search) replaced with hard-fail stubs that raise NativeWebToolDisabledError at import time, so a misconfigured tool.use path fails loud rather than silently falling back to unsanitized output - Native client back-doors (jina_client.py, infoquest_client.py) stubbed too - Native-tool tests quarantined under tests/_disabled_native/ (collect_ignore_glob via local conftest.py) - Sanitizer Layer 7 fix: only collapse horizontal whitespace, preserve newlines and tabs so list/table structure survives - Hardened runtime config.yaml references only the searx-backed tools - Factory overlay (backend/) kept in sync with deer-flow tree as a reference / source See HARDENING.md for the full audit trail and verification steps.
211 lines
5.9 KiB
Python
211 lines
5.9 KiB
Python
import fnmatch
|
|
import os
|
|
import re
|
|
from dataclasses import dataclass
|
|
from pathlib import Path, PurePosixPath
|
|
|
|
# fnmatch-style globs tested against individual path segments (directory or
# file names). Any segment matching one of these is skipped by the search
# helpers in this module.
IGNORE_PATTERNS = [
    # Version-control metadata
    ".git", ".svn", ".hg", ".bzr",
    # Dependency trees and language environments
    "node_modules", "__pycache__", ".venv", "venv", ".env", "env",
    ".tox", ".nox", ".eggs", "*.egg-info", "site-packages",
    # Build and framework output
    "dist", "build", ".next", ".nuxt", ".output", ".turbo", "target", "out",
    # Editor / IDE state
    ".idea", ".vscode", "*.swp", "*.swo", "*~",
    ".project", ".classpath", ".settings",
    # Operating-system artifacts
    ".DS_Store", "Thumbs.db", "desktop.ini", "*.lnk",
    # Logs, temporary files and caches
    "*.log", "*.tmp", "*.temp", "*.bak", "*.cache", ".cache", "logs",
    # Coverage reports and tool caches
    ".coverage", "coverage", ".nyc_output", "htmlcov",
    ".pytest_cache", ".mypy_cache", ".ruff_cache",
]

# Default ``max_file_size`` for find_grep_matches: larger files are not scanned.
DEFAULT_MAX_FILE_SIZE_BYTES = 1_000_000
# Default character budget for truncate_line and for grep line summaries.
DEFAULT_LINE_SUMMARY_LENGTH = 200
|
|
|
|
|
|
@dataclass(frozen=True)
class GrepMatch:
    """Immutable record describing a single line hit produced by a grep search."""

    # Path of the file that contains the matching line, as a string.
    path: str

    # Position of the line inside the file; find_grep_matches counts from 1.
    line_number: int

    # Text of the matching line; find_grep_matches stores it after truncate_line.
    line: str
|
|
|
|
|
|
def should_ignore_name(name: str) -> bool:
    """Tell whether a single file or directory name is on the ignore list.

    Args:
        name: One path segment (no separators expected).

    Returns:
        True if ``name`` matches at least one glob in ``IGNORE_PATTERNS``.
    """
    return any(fnmatch.fnmatch(name, glob) for glob in IGNORE_PATTERNS)
|
|
|
|
|
|
def should_ignore_path(path: str) -> bool:
    """Tell whether any segment of ``path`` is on the ignore list.

    Backslashes are treated as separators too, so Windows-style paths are
    split the same way as POSIX ones. Empty segments (leading, trailing or
    doubled separators) are never tested.

    Args:
        path: Path string using ``/`` and/or ``\\`` as separators.

    Returns:
        True as soon as one non-empty segment satisfies ``should_ignore_name``.
    """
    normalized = path.replace("\\", "/")
    for segment in normalized.split("/"):
        if segment and should_ignore_name(segment):
            return True
    return False
|
|
|
|
|
|
def path_matches(pattern: str, rel_path: str) -> bool:
    """Check a relative POSIX path against a glob pattern.

    The path is matched with ``PurePosixPath.match``. If that fails and the
    pattern begins with ``**/``, the pattern is tried once more with that
    prefix removed, so a ``**/`` pattern can also accept paths that have no
    leading directory component.

    Args:
        pattern: Glob pattern to test.
        rel_path: Path relative to the search root, using ``/`` separators.

    Returns:
        True when either attempt matches, otherwise False.
    """
    candidate = PurePosixPath(rel_path)
    has_recursive_prefix = pattern.startswith("**/")
    return candidate.match(pattern) or (has_recursive_prefix and candidate.match(pattern[3:]))
|
|
|
|
|
|
def truncate_line(line: str, max_chars: int = DEFAULT_LINE_SUMMARY_LENGTH) -> str:
    """Strip the line terminator and shorten ``line`` to at most ``max_chars``.

    Trailing ``\\n`` / ``\\r`` characters are removed first. A line that still
    exceeds the budget is cut and suffixed with ``...`` so that the result is
    exactly ``max_chars`` long.

    Args:
        line: Text of one line, optionally including its line ending.
        max_chars: Maximum length of the returned string.

    Returns:
        The stripped line, shortened if necessary. The result is never longer
        than ``max(max_chars, 0)`` characters.
    """
    line = line.rstrip("\n\r")
    if len(line) <= max_chars:
        return line
    if max_chars < 3:
        # The ellipsis itself does not fit. Slicing with ``max_chars - 3`` here
        # would be a negative (or from-the-end) slice and could return more
        # text than allowed, so fall back to a plain hard cut.
        return line[: max(max_chars, 0)]
    return line[: max_chars - 3] + "..."
|
|
|
|
|
|
def is_binary_file(path: Path, sample_size: int = 8192) -> bool:
    """Guess whether ``path`` holds binary data by looking for a NUL byte.

    Only the first ``sample_size`` bytes are inspected.

    Args:
        path: File to probe.
        sample_size: Number of leading bytes to read.

    Returns:
        True if the sample contains a NUL byte, or if the file cannot be
        opened or read (unreadable files are treated as binary). False
        otherwise.
    """
    try:
        with path.open("rb") as stream:
            head = stream.read(sample_size)
    except OSError:
        # Unreadable: report as binary.
        return True
    return b"\0" in head
|
|
|
|
|
|
def find_glob_matches(root: Path, pattern: str, *, include_dirs: bool = False, max_results: int = 200) -> tuple[list[str], bool]:
    """Collect paths under ``root`` whose root-relative POSIX path matches ``pattern``.

    Directories and files whose names are rejected by ``should_ignore_name``
    are skipped, and ignored directories are not descended into. Entries are
    visited in sorted name order so that the result, and in particular which
    entries survive truncation, does not depend on the order in which the
    operating system happens to list a directory.

    Args:
        root: Directory to search; it is resolved before walking.
        pattern: Glob passed to ``path_matches`` together with each relative path.
        include_dirs: When true, matching directories are reported as well,
            ahead of the files that live in the same parent directory.
        max_results: Stop as soon as this many matches have been collected.
            The cap is checked after appending, so values below 1 behave like 1.

    Returns:
        ``(matches, truncated)``. ``matches`` holds path strings built beneath
        the resolved root. ``truncated`` is true when the walk stopped because
        ``max_results`` was reached; further matches may or may not exist.

    Raises:
        FileNotFoundError: ``root`` does not exist.
        NotADirectoryError: ``root`` exists but is not a directory.
    """
    matches: list[str] = []
    root = root.resolve()

    if not root.exists():
        raise FileNotFoundError(root)
    if not root.is_dir():
        raise NotADirectoryError(root)

    for current_root, dirs, files in os.walk(root):
        # In-place assignment prunes ignored directories from the walk and
        # fixes the order in which the remaining ones are descended into.
        dirs[:] = sorted(name for name in dirs if not should_ignore_name(name))
        # root is already resolved and os.walk builds current_root beneath it,
        # so relative_to() works without an extra stat()/resolve() per directory.
        base = Path(current_root)
        rel_dir = base.relative_to(root)

        # Directories (already filtered above) come first, then the files.
        candidates = list(dirs) if include_dirs else []
        candidates.extend(name for name in sorted(files) if not should_ignore_name(name))

        for name in candidates:
            if not path_matches(pattern, (rel_dir / name).as_posix()):
                continue
            matches.append(str(base / name))
            if len(matches) >= max_results:
                return matches, True

    return matches, False
|
|
|
|
|
|
def find_grep_matches(
    root: Path,
    pattern: str,
    *,
    glob_pattern: str | None = None,
    literal: bool = False,
    case_sensitive: bool = False,
    max_results: int = 100,
    max_file_size: int = DEFAULT_MAX_FILE_SIZE_BYTES,
    line_summary_length: int = DEFAULT_LINE_SUMMARY_LENGTH,
) -> tuple[list[GrepMatch], bool]:
    """Search text files under ``root`` line by line for ``pattern``.

    Ignored names (see ``should_ignore_name``) are skipped and ignored
    directories are not descended into. Symlinked files, files resolving
    outside ``root``, files larger than ``max_file_size``, files that
    ``is_binary_file`` reports as binary, and files raising ``OSError`` are
    skipped silently. Directories and files are visited in sorted name order so
    that the result, and which hits survive truncation, does not depend on the
    order in which the operating system lists a directory.

    Args:
        root: Directory to search; it is resolved before walking.
        pattern: Regular expression, or plain text when ``literal`` is true.
        glob_pattern: Optional glob; only files whose root-relative POSIX path
            satisfies ``path_matches`` are scanned.
        literal: Escape ``pattern`` so it is matched verbatim.
        case_sensitive: Match case exactly; the default ignores case.
        max_results: Stop as soon as this many hits have been collected. The
            cap is checked after appending, so values below 1 behave like 1.
        max_file_size: Size in bytes above which a file is not scanned.
        line_summary_length: Character budget passed to ``truncate_line`` for
            each reported line. Lines longer than ten times this value are not
            searched at all.

    Returns:
        ``(matches, truncated)``. ``truncated`` is true when the search stopped
        because ``max_results`` was reached; further hits may or may not exist.

    Raises:
        FileNotFoundError: ``root`` does not exist.
        NotADirectoryError: ``root`` exists but is not a directory.
        re.error: ``pattern`` is not a valid regular expression (only possible
            when ``literal`` is false).
    """
    matches: list[GrepMatch] = []
    root = root.resolve()

    if not root.exists():
        raise FileNotFoundError(root)
    if not root.is_dir():
        raise NotADirectoryError(root)

    regex_source = re.escape(pattern) if literal else pattern
    flags = 0 if case_sensitive else re.IGNORECASE
    regex = re.compile(regex_source, flags)

    # Skip lines longer than this to prevent ReDoS on minified / no-newline files.
    max_line_chars = line_summary_length * 10

    for current_root, dirs, files in os.walk(root):
        # In-place assignment prunes ignored directories from the walk and
        # fixes the order in which the remaining ones are descended into.
        dirs[:] = sorted(name for name in dirs if not should_ignore_name(name))
        base = Path(current_root)
        rel_dir = base.relative_to(root)

        for name in sorted(files):
            if should_ignore_name(name):
                continue

            candidate_path = base / name
            rel_path = (rel_dir / name).as_posix()
            if glob_pattern is not None and not path_matches(glob_pattern, rel_path):
                continue

            try:
                # Never read through a symlink, and double-check that the
                # resolved file still lives beneath the search root.
                if candidate_path.is_symlink():
                    continue
                file_path = candidate_path.resolve()
                if not file_path.is_relative_to(root):
                    continue
                if file_path.stat().st_size > max_file_size or is_binary_file(file_path):
                    continue

                # Undecodable bytes are replaced rather than aborting the file.
                with file_path.open(encoding="utf-8", errors="replace") as handle:
                    for line_number, line in enumerate(handle, start=1):
                        if len(line) > max_line_chars:
                            continue
                        if not regex.search(line):
                            continue
                        matches.append(
                            GrepMatch(
                                path=str(file_path),
                                line_number=line_number,
                                line=truncate_line(line, line_summary_length),
                            )
                        )
                        if len(matches) >= max_results:
                            return matches, True
            except OSError:
                # Best effort: an unreadable file must not abort the search.
                # Hits already collected from it are kept.
                continue

    return matches, False
|