Initial commit: hardened DeerFlow factory

Vendored deer-flow upstream (bytedance/deer-flow) plus prompt-injection hardening:

- New deerflow.security package: content_delimiter, html_cleaner, sanitizer (8 layers — invisible chars, control chars, symbols, NFC, PUA, tag chars, horizontal whitespace collapse with newline/tab preservation, length cap)
- New deerflow.community.searx package: web_search, web_fetch, image_search backed by a private SearX instance, every external string sanitized and wrapped in <<<EXTERNAL_UNTRUSTED_CONTENT>>> delimiters
- All native community web providers (ddg_search, tavily, exa, firecrawl, jina_ai, infoquest, image_search) replaced with hard-fail stubs that raise NativeWebToolDisabledError at import time, so a misconfigured tool.use path fails loud rather than silently falling back to unsanitized output
- Native client back-doors (jina_client.py, infoquest_client.py) stubbed too
- Native-tool tests quarantined under tests/_disabled_native/ (collect_ignore_glob via local conftest.py)
- Sanitizer Layer 7 fix: only collapse horizontal whitespace, preserve newlines and tabs so list/table structure survives
- Hardened runtime config.yaml references only the searx-backed tools
- Factory overlay (backend/) kept in sync with deer-flow tree as a reference / source

See HARDENING.md for the full audit trail and verification steps.
2026-04-12 14:23:57 +02:00
commit 6de0bf9f5b
889 changed files with 173052 additions and 0 deletions
--- a/deer-flow/backend/packages/harness/deerflow/skills/loader.py
+++ b/deer-flow/backend/packages/harness/deerflow/skills/loader.py
@@ -0,0 +1,103 @@
+import logging
+import os
+from pathlib import Path
+
+from .parser import parse_skill_file
+from .types import Skill
+
+logger = logging.getLogger(__name__)
+
+
+def get_skills_root_path() -> Path:
+    """
+    Locate the default skills directory relative to this source file.
+
+    Returns:
+        Path to the skills directory (deer-flow/skills), i.e. the "skills"
+        folder that sits next to the backend/ directory.
+    """
+    # Start from the resolved location of loader.py, which lives at
+    # packages/harness/deerflow/skills/loader.py under backend/.
+    location = Path(__file__).resolve()
+    # Climb five levels: skills/ -> deerflow/ -> harness/ -> packages/ -> backend/
+    for _ in range(5):
+        location = location.parent
+    # The skills directory is a sibling of backend/, so go up once more.
+    return location.parent / "skills"
+
+
+def load_skills(skills_path: Path | None = None, use_config: bool = True, enabled_only: bool = False) -> list[Skill]:
+    """
+    Load all skills from the skills directory.
+
+    Walks the "public" and "custom" category directories under the skills
+    root and parses every SKILL.md found (at any depth) into a Skill. Each
+    skill's enabled flag is then refreshed from the extensions config, which
+    is re-read from disk on every call.
+
+    Args:
+        skills_path: Optional custom path to skills directory.
+                     If not provided and use_config is True, uses path from config
+                     (falling back to deer-flow/skills if the config cannot be loaded).
+                     Otherwise defaults to deer-flow/skills
+        use_config: Whether to load skills path from config (default: True)
+        enabled_only: If True, only return enabled skills (default: False)
+
+    Returns:
+        List of Skill objects, sorted by name. Empty if the skills directory
+        does not exist. When two skills share a name, the one scanned last
+        wins, so a "custom" skill replaces a "public" skill of the same name.
+    """
+    if skills_path is None:
+        if use_config:
+            try:
+                from deerflow.config import get_app_config
+
+                config = get_app_config()
+                skills_path = config.skills.get_skills_path()
+            except Exception as e:
+                # Best-effort: a broken or missing app config must not prevent
+                # skill loading, but record why the default location is used
+                # instead of silently swallowing the failure.
+                logger.warning("Failed to load skills path from config, falling back to default: %s", e)
+                skills_path = get_skills_root_path()
+        else:
+            skills_path = get_skills_root_path()
+
+    if not skills_path.exists():
+        return []
+
+    # Keyed by skill name so that a later definition replaces an earlier one.
+    skills_by_name: dict[str, Skill] = {}
+
+    # Scan public and custom directories (in that order, so custom overrides public)
+    for category in ["public", "custom"]:
+        category_path = skills_path / category
+        if not category_path.exists() or not category_path.is_dir():
+            continue
+
+        # NOTE: followlinks=True lets symlinked skill directories be discovered, but
+        # os.walk does not track visited directories, so a symlink that points back
+        # to one of its own ancestors makes the walk recurse into it repeatedly.
+        for current_root, dir_names, file_names in os.walk(category_path, followlinks=True):
+            # Keep traversal deterministic and skip hidden directories.
+            # Assigning through the slice edits the list os.walk itself uses,
+            # which is what prunes and orders the descent.
+            dir_names[:] = sorted(name for name in dir_names if not name.startswith("."))
+            if "SKILL.md" not in file_names:
+                continue
+
+            skill_file = Path(current_root) / "SKILL.md"
+            # Location of the skill's directory relative to its category root.
+            relative_path = skill_file.parent.relative_to(category_path)
+
+            skill = parse_skill_file(skill_file, category=category, relative_path=relative_path)
+            # A falsy result means the file did not yield a usable skill; skip it.
+            if skill:
+                skills_by_name[skill.name] = skill
+
+    skills = list(skills_by_name.values())
+
+    # Load skills state configuration and update enabled status
+    # NOTE: We use ExtensionsConfig.from_file() instead of get_extensions_config()
+    # to always read the latest configuration from disk. This ensures that changes
+    # made through the Gateway API (which runs in a separate process) are immediately
+    # reflected in the LangGraph Server when loading skills.
+    try:
+        from deerflow.config.extensions_config import ExtensionsConfig
+
+        extensions_config = ExtensionsConfig.from_file()
+        for skill in skills:
+            skill.enabled = extensions_config.is_skill_enabled(skill.name, skill.category)
+    except Exception as e:
+        # If config loading fails, skills keep the enabled value they were parsed
+        # with (presumably enabled by default; confirm in parse_skill_file / Skill).
+        logger.warning("Failed to load extensions config: %s", e)
+
+    # Filter by enabled status if requested
+    if enabled_only:
+        skills = [skill for skill in skills if skill.enabled]
+
+    # Sort by name for consistent ordering
+    skills.sort(key=lambda s: s.name)
+
+    return skills