Initial commit: hardened DeerFlow factory
Vendored deer-flow upstream (bytedance/deer-flow) plus prompt-injection hardening: - New deerflow.security package: content_delimiter, html_cleaner, sanitizer (8 layers — invisible chars, control chars, symbols, NFC, PUA, tag chars, horizontal whitespace collapse with newline/tab preservation, length cap) - New deerflow.community.searx package: web_search, web_fetch, image_search backed by a private SearX instance, every external string sanitized and wrapped in <<<EXTERNAL_UNTRUSTED_CONTENT>>> delimiters - All native community web providers (ddg_search, tavily, exa, firecrawl, jina_ai, infoquest, image_search) replaced with hard-fail stubs that raise NativeWebToolDisabledError at import time, so a misconfigured tool.use path fails loud rather than silently falling back to unsanitized output - Native client back-doors (jina_client.py, infoquest_client.py) stubbed too - Native-tool tests quarantined under tests/_disabled_native/ (collect_ignore_glob via local conftest.py) - Sanitizer Layer 7 fix: only collapse horizontal whitespace, preserve newlines and tabs so list/table structure survives - Hardened runtime config.yaml references only the searx-backed tools - Factory overlay (backend/) kept in sync with deer-flow tree as a reference / source See HARDENING.md for the full audit trail and verification steps.
This commit is contained in:
103
deer-flow/backend/packages/harness/deerflow/skills/loader.py
Normal file
103
deer-flow/backend/packages/harness/deerflow/skills/loader.py
Normal file
@@ -0,0 +1,103 @@
|
||||
import logging
|
||||
import os
|
||||
from pathlib import Path
|
||||
|
||||
from .parser import parse_skill_file
|
||||
from .types import Skill
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
def get_skills_root_path() -> Path:
    """
    Locate the default skills directory relative to this source file.

    Returns:
        Path to the skills directory (deer-flow/skills)
    """
    # This module sits at packages/harness/deerflow/skills/loader.py inside backend/,
    # so climbing five levels from the file lands on backend/:
    # skills/ -> deerflow/ -> harness/ -> packages/ -> backend/
    location = Path(__file__).resolve()
    for _ in range(5):
        location = location.parent
    # The skills directory is a sibling of backend/, i.e. a child of backend's parent.
    return location.parent.joinpath("skills")
|
||||
|
||||
|
||||
def _is_symlink_cycle(walk_root: Path, child: Path) -> bool:
    """Return True if *child* is a symlink that leads back to *walk_root* or one of its ancestors."""
    if not child.is_symlink():
        return False
    try:
        real_root = walk_root.resolve()
        real_child = child.resolve()
    except (OSError, RuntimeError):
        # A link that cannot be resolved cannot be traversed meaningfully; skip it.
        return True
    return real_child == real_root or real_child in real_root.parents


def load_skills(skills_path: Path | None = None, use_config: bool = True, enabled_only: bool = False) -> list[Skill]:
    """
    Load all skills from the skills directory.

    Scans the "public" and "custom" sub-directories of the skills directory,
    parsing every SKILL.md file found to extract metadata. The enabled state of
    each skill is read from the extensions configuration
    (ExtensionsConfig.from_file()).

    When two skills share a name, the one found later replaces the earlier one;
    because "custom" is scanned after "public", a custom skill wins over a
    public skill of the same name.

    Args:
        skills_path: Optional custom path to skills directory.
            If not provided and use_config is True, uses path from config.
            Otherwise defaults to deer-flow/skills
        use_config: Whether to load skills path from config (default: True)
        enabled_only: If True, only return enabled skills (default: False)

    Returns:
        List of Skill objects, sorted by name. Empty if the skills directory
        does not exist.
    """
    if skills_path is None:
        if use_config:
            try:
                from deerflow.config import get_app_config

                skills_path = get_app_config().skills.get_skills_path()
            except Exception as e:
                # Best-effort: fall back to the default location, but leave a trace
                # so a broken config does not silently change which skills are loaded.
                logger.warning("Failed to resolve skills path from config, using default: %s", e)
                skills_path = get_skills_root_path()
        else:
            skills_path = get_skills_root_path()

    if not skills_path.exists():
        return []

    skills_by_name: dict[str, Skill] = {}

    # Scan public and custom directories
    for category in ("public", "custom"):
        category_path = skills_path / category
        if not category_path.is_dir():
            continue

        for current_root, dir_names, file_names in os.walk(category_path, followlinks=True):
            root_path = Path(current_root)
            # Keep traversal deterministic, skip hidden directories, and prune
            # symlinks that point back up the tree: os.walk(followlinks=True)
            # does not track visited directories and would otherwise loop.
            dir_names[:] = sorted(
                name
                for name in dir_names
                if not name.startswith(".") and not _is_symlink_cycle(root_path, root_path / name)
            )
            if "SKILL.md" not in file_names:
                continue

            skill_file = root_path / "SKILL.md"
            relative_path = skill_file.parent.relative_to(category_path)

            skill = parse_skill_file(skill_file, category=category, relative_path=relative_path)
            if skill:
                skills_by_name[skill.name] = skill

    skills = list(skills_by_name.values())

    # Load skills state configuration and update enabled status
    # NOTE: We use ExtensionsConfig.from_file() instead of get_extensions_config()
    # to always read the latest configuration from disk. This ensures that changes
    # made through the Gateway API (which runs in a separate process) are immediately
    # reflected in the LangGraph Server when loading skills.
    try:
        from deerflow.config.extensions_config import ExtensionsConfig

        extensions_config = ExtensionsConfig.from_file()
        for skill in skills:
            skill.enabled = extensions_config.is_skill_enabled(skill.name, skill.category)
    except Exception as e:
        # If config loading fails, each skill keeps the enabled value it was parsed with
        logger.warning("Failed to load extensions config: %s", e)

    # Filter by enabled status if requested
    if enabled_only:
        skills = [skill for skill in skills if skill.enabled]

    # Sort by name for consistent ordering
    skills.sort(key=lambda s: s.name)

    return skills
|
||||
Reference in New Issue
Block a user