Initial commit: hardened DeerFlow factory

Vendored deer-flow upstream (bytedance/deer-flow) plus prompt-injection hardening: - New deerflow.security package: content_delimiter, html_cleaner, sanitizer (8 layers — invisible chars, control chars, symbols, NFC, PUA, tag chars, horizontal whitespace collapse with newline/tab preservation, length cap) - New deerflow.community.searx package: web_search, web_fetch, image_search backed by a private SearX instance, every external string sanitized and wrapped in <<<EXTERNAL_UNTRUSTED_CONTENT>>> delimiters - All native community web providers (ddg_search, tavily, exa, firecrawl, jina_ai, infoquest, image_search) replaced with hard-fail stubs that raise NativeWebToolDisabledError at import time, so a misconfigured tool.use path fails loud rather than silently falling back to unsanitized output - Native client back-doors (jina_client.py, infoquest_client.py) stubbed too - Native-tool tests quarantined under tests/_disabled_native/ (collect_ignore_glob via local conftest.py) - Sanitizer Layer 7 fix: only collapse horizontal whitespace, preserve newlines and tabs so list/table structure survives - Hardened runtime config.yaml references only the searx-backed tools - Factory overlay (backend/) kept in sync with deer-flow tree as a reference / source See HARDENING.md for the full audit trail and verification steps.
2026-04-12 14:23:57 +02:00
commit 6de0bf9f5b
889 changed files with 173052 additions and 0 deletions
--- a/deer-flow/backend/packages/harness/deerflow/utils/file_conversion.py
+++ b/deer-flow/backend/packages/harness/deerflow/utils/file_conversion.py
@@ -0,0 +1,309 @@
+"""File conversion utilities.
+
+Converts document files (PDF, PPT, Excel, Word) to Markdown.
+
+PDF conversion strategy (auto mode):
+  1. Try pymupdf4llm if installed — better heading detection, faster on most files.
+  2. If output is suspiciously short (< _MIN_CHARS_PER_PAGE chars/page, or < 200 chars
+     total when page count is unavailable), treat as image-based and fall back to MarkItDown.
+  3. If pymupdf4llm is not installed, use MarkItDown directly (existing behaviour).
+
+Large files (> _ASYNC_THRESHOLD_BYTES) are converted in a thread pool via
+asyncio.to_thread() to avoid blocking the event loop (fixes #1569).
+
+No FastAPI or HTTP dependencies — pure utility functions.
+"""
+
+import asyncio
+import logging
+import re
+from pathlib import Path
+
+logger = logging.getLogger(__name__)
+
+# File extensions that should be converted to markdown.
+# Every entry is lowercase and includes the leading dot.
+CONVERTIBLE_EXTENSIONS = {
+    ".pdf",
+    ".ppt",
+    ".pptx",
+    ".xls",
+    ".xlsx",
+    ".doc",
+    ".docx",
+}
+
+# Files larger than this threshold are converted in a background thread.
+# Small files complete in < 1s synchronously; spawning a thread adds unnecessary
+# scheduling overhead for them.
+_ASYNC_THRESHOLD_BYTES = 1 * 1024 * 1024  # 1 MB
+
+# If pymupdf4llm produces fewer characters *per page* than this threshold,
+# the PDF is likely image-based or encrypted — fall back to MarkItDown.
+# Rationale: normal text PDFs yield 200-2000 chars/page; image-based PDFs
+# yield close to 0. 50 chars/page gives a wide safety margin.
+# When the page count is unavailable, an absolute 200-char check is used instead
+# (see _pymupdf_output_too_sparse).
+_MIN_CHARS_PER_PAGE = 50
+
+
+def _pymupdf_output_too_sparse(text: str, file_path: Path) -> bool:
+    """Decide whether pymupdf4llm output looks like a failed (image-based) parse.
+
+    The stripped character count is compared against a per-page budget
+    (``_MIN_CHARS_PER_PAGE``) so that short and long documents are judged on
+    the same scale.  When the page count cannot be determined (pymupdf missing,
+    file unreadable, zero pages) an absolute 200-character floor is used.
+
+    Args:
+        text: Markdown produced by pymupdf4llm.
+        file_path: The PDF the text was extracted from.
+
+    Returns:
+        True if the output is suspiciously short, False otherwise.
+    """
+    stripped_len = len(text.strip())
+
+    def _page_count() -> int | None:
+        # Best-effort: any failure simply means "page count unknown".
+        try:
+            import pymupdf
+
+            handle = pymupdf.open(str(file_path))
+        except Exception:
+            return None
+        try:
+            return len(handle)
+        except Exception:
+            return None
+        finally:
+            # Closing must never mask the result computed above.
+            try:
+                handle.close()
+            except Exception:
+                pass
+
+    n_pages = _page_count()
+    if n_pages is None or n_pages <= 0:
+        # Fallback: absolute threshold when page count is unavailable
+        return stripped_len < 200
+    return (stripped_len / n_pages) < _MIN_CHARS_PER_PAGE
+
+
+def _convert_pdf_with_pymupdf4llm(file_path: Path) -> str | None:
+    """Try to turn a PDF into Markdown via the optional pymupdf4llm package.
+
+    Args:
+        file_path: Path to the PDF file.
+
+    Returns:
+        The Markdown text on success.  None when pymupdf4llm is not installed,
+        or when it raises during conversion (e.g. encrypted/corrupt PDF); the
+        failure is logged with its traceback in that case.
+    """
+    try:
+        import pymupdf4llm as converter
+    except ImportError:
+        # Optional dependency is absent — caller falls back to MarkItDown.
+        return None
+
+    try:
+        markdown = converter.to_markdown(str(file_path))
+    except Exception:
+        logger.exception("pymupdf4llm failed to convert %s; falling back to MarkItDown", file_path.name)
+        return None
+    else:
+        return markdown
+
+
+def _convert_with_markitdown(file_path: Path) -> str:
+    """Run MarkItDown on *file_path* and return the extracted Markdown text.
+
+    MarkItDown is imported lazily, inside the function, rather than at module
+    import time.
+    """
+    from markitdown import MarkItDown
+
+    conversion = MarkItDown().convert(str(file_path))
+    return conversion.text_content
+
+
+def _do_convert(file_path: Path, pdf_converter: str) -> str:
+    """Convert *file_path* to Markdown text synchronously.
+
+    Invoked either directly or through ``asyncio.to_thread``.
+
+    Args:
+        file_path: Path to the file.
+        pdf_converter: "auto" | "pymupdf4llm" | "markitdown"
+
+    Returns:
+        The Markdown text produced by the selected converter.
+    """
+    # Non-PDF files, and PDFs explicitly routed to MarkItDown, skip pymupdf4llm.
+    if file_path.suffix.lower() != ".pdf" or pdf_converter == "markitdown":
+        return _convert_with_markitdown(file_path)
+
+    candidate = _convert_pdf_with_pymupdf4llm(file_path)
+    if candidate is None:
+        # pymupdf4llm is not installed or failed on this file.
+        return _convert_with_markitdown(file_path)
+
+    if pdf_converter == "pymupdf4llm":
+        # Explicit choice — trust the output regardless of its length.
+        return candidate
+
+    # auto mode: chars-per-page separates image-based PDFs (near 0) from
+    # legitimately short documents.
+    if _pymupdf_output_too_sparse(candidate, file_path):
+        logger.warning(
+            "pymupdf4llm produced only %d chars for %s (likely image-based PDF); falling back to MarkItDown",
+            len(candidate.strip()),
+            file_path.name,
+        )
+        return _convert_with_markitdown(file_path)
+
+    return candidate
+
+
+async def convert_file_to_markdown(file_path: Path) -> Path | None:
+    """Write a Markdown rendition of a supported document next to the original.
+
+    The output path is *file_path* with its suffix replaced by ``.md``.  PDFs
+    go through the two-converter strategy described in the module docstring.
+    Files larger than ``_ASYNC_THRESHOLD_BYTES`` are converted via
+    ``asyncio.to_thread`` so the event loop is not blocked; smaller files are
+    converted inline.
+
+    Args:
+        file_path: Path to the file to convert.
+
+    Returns:
+        Path to the generated .md file, or None if any step failed (the error
+        is logged, never raised).
+    """
+    try:
+        converter_mode = _get_pdf_converter()
+        offload = file_path.stat().st_size > _ASYNC_THRESHOLD_BYTES
+
+        if not offload:
+            markdown = _do_convert(file_path, converter_mode)
+        else:
+            markdown = await asyncio.to_thread(_do_convert, file_path, converter_mode)
+
+        target = file_path.with_suffix(".md")
+        target.write_text(markdown, encoding="utf-8")
+    except Exception as e:
+        logger.error("Failed to convert %s to markdown: %s", file_path.name, e)
+        return None
+    else:
+        logger.info("Converted %s to markdown: %s (%d chars)", file_path.name, target.name, len(markdown))
+        return target
+
+
+# Regex for bold-only lines that look like section headings.
+# Targets SEC filing structural headings that pymupdf4llm renders as **bold**
+# rather than # Markdown headings (because they use same font size as body text,
+# distinguished only by bold+caps formatting).
+#
+# Pattern requires ALL of:
+#   1. Entire line is a single **...** block (no surrounding prose)
+#   2. Starts with a recognised structural keyword:
+#      - ITEM / PART / SECTION (with optional number/letter after)
+#      - SCHEDULE, EXHIBIT, APPENDIX, ANNEX, CHAPTER
+#      All-caps addresses, boilerplate ("CURRENT REPORT", "SIGNATURES",
+#      "WASHINGTON, DC 20549") do NOT start with these keywords and are excluded.
+#   3. After the keyword, only uppercase letters, digits, spaces and ". , -"
+#      are allowed, so mixed-case bold prose does not match.
+#
+# Chinese headings (第三节...) are already captured as standard # headings
+# by pymupdf4llm, so they don't need this pattern.
+_BOLD_HEADING_RE = re.compile(r"^\*\*((ITEM|PART|SECTION|SCHEDULE|EXHIBIT|APPENDIX|ANNEX|CHAPTER)\b[A-Z0-9 .,\-]*)\*\*\s*$")
+
+# Regex for split-bold headings produced by pymupdf4llm when a heading spans
+# multiple text spans in the PDF (e.g. section number and title are separate spans).
+# Matches lines like:  **1** **Introduction**  or  **3.2** **Multi-Head Attention**
+# Requirements:
+#   1. Entire line consists only of **...** blocks separated by whitespace (no prose)
+#   2. First block is a section number: one leading digit or uppercase letter
+#      followed by any run of digits and dots (e.g. "1", "3.2", "A.1")
+#   3. Second block must not be purely numeric/punctuation — excludes financial table
+#      headers like **2023** **2022** **2021** while allowing non-ASCII titles such as
+#      **1** **概述** or accented words (negative lookahead instead of [A-Za-z])
+#   4. At most two additional blocks (four total) with [^*]+ (no * inside) to keep
+#      the regex linear and avoid ReDoS on attacker-controlled content
+_SPLIT_BOLD_HEADING_RE = re.compile(r"^\*\*[\dA-Z][\d\.]*\*\*\s+\*\*(?!\d[\d\s.,\-–—/:()%]*\*\*)[^*]+\*\*(?:\s+\*\*[^*]+\*\*){0,2}\s*$")
+
+# Maximum number of outline entries injected into the agent context.
+# Keeps prompt size bounded even for very long documents.
+MAX_OUTLINE_ENTRIES = 50
+
+# Accepted values for the pdf_converter setting (all lowercase).
+_ALLOWED_PDF_CONVERTERS = {"auto", "pymupdf4llm", "markitdown"}
+
+
+def _clean_bold_title(raw: str) -> str:
+    """Normalise a title string that may contain pymupdf4llm bold artefacts.
+
+    pymupdf4llm sometimes emits adjacent bold spans as ``**A** **B**`` instead
+    of a single ``**A B**`` block.  This helper merges those fragments and then
+    strips the ``**...**`` wrapper when the whole string is one bold block.
+
+    A string that merely starts and ends with bold spans but has plain prose in
+    between (``**A** and **B**``) is NOT a single block; it is returned as-is
+    rather than being mangled into ``A** and **B``.
+
+    Examples::
+
+        "**Overview**"                       → "Overview"
+        "**UNITED STATES** **SECURITIES**"   → "UNITED STATES SECURITIES"
+        "plain text"                         → "plain text"  (unchanged)
+        "**A** and **B**"                    → "**A** and **B**"  (unchanged)
+
+    Args:
+        raw: Title text, possibly wrapped in Markdown bold markers.
+
+    Returns:
+        The cleaned title with surrounding whitespace removed.
+    """
+    # Merge adjacent bold spans: "** **" → " "
+    merged = re.sub(r"\*\*\s*\*\*", " ", raw).strip()
+    # Strip the wrapper only when the interior holds no further "**" marker;
+    # otherwise the leading and trailing markers belong to different spans.
+    m = re.fullmatch(r"\*\*(.+?)\*\*", merged, re.DOTALL)
+    if m and "**" not in m.group(1):
+        return m.group(1).strip()
+    return merged
+
+
+def _outline_heading_title(stripped: str) -> str | None:
+    """Return the heading title carried by a stripped Markdown line, or None.
+
+    The three heading styles are tried in order (standard ``#`` heading,
+    bold-only structural heading, split-bold heading); the first style whose
+    syntax matches decides the result, even if its title turns out empty.
+    """
+    # Style 1: standard Markdown heading
+    if stripped.startswith("#"):
+        return _clean_bold_title(stripped.lstrip("#").strip()) or None
+
+    # Style 2: single bold block with SEC structural keyword
+    if m := _BOLD_HEADING_RE.match(stripped):
+        return m.group(1).strip() or None
+
+    # Style 3: split-bold heading — **<num>** **<title>**
+    # Regex already enforces max 4 blocks and non-numeric second block.
+    if _SPLIT_BOLD_HEADING_RE.match(stripped):
+        return " ".join(re.findall(r"\*\*([^*]+)\*\*", stripped)) or None
+
+    return None
+
+
+def extract_outline(md_path: Path) -> list[dict]:
+    """Extract document outline (headings) from a Markdown file.
+
+    Recognises three heading styles produced by pymupdf4llm:
+
+    1. Standard Markdown headings: lines starting with one or more '#'.
+       Inline ``**...**`` wrappers and adjacent bold spans (``** **``) are
+       cleaned so the title is plain text.
+
+    2. Bold-only structural headings: ``**ITEM 1. BUSINESS**``, ``**PART II**``,
+       etc.  SEC filings use bold+caps for section headings with the same font
+       size as body text, so pymupdf4llm cannot promote them to # headings.
+
+    3. Split-bold headings: ``**1** **Introduction**``, ``**3.2** **Attention**``.
+       pymupdf4llm emits these when the section number and title text are
+       separate spans in the underlying PDF (common in academic papers).
+
+    Args:
+        md_path: Path to the .md file.
+
+    Returns:
+        List of dicts with keys: title (str), line (int, 1-based).
+        When the file contains more than MAX_OUTLINE_ENTRIES headings, only the
+        first MAX_OUTLINE_ENTRIES are returned and a sentinel entry
+        ``{"truncated": True}`` is appended as the last element so callers can
+        render a "showing first N headings" hint without re-scanning the file.
+        A file with exactly MAX_OUTLINE_ENTRIES headings gets no sentinel,
+        because nothing was dropped.
+        Returns an empty list if the file cannot be read or has no headings.
+    """
+    outline: list[dict] = []
+    try:
+        with md_path.open(encoding="utf-8") as f:
+            for lineno, line in enumerate(f, 1):
+                title = _outline_heading_title(line.strip())
+                if title is None:
+                    continue
+
+                # Only report truncation once a heading beyond the cap is
+                # actually seen; hitting the cap exactly is not truncation.
+                if len(outline) >= MAX_OUTLINE_ENTRIES:
+                    outline.append({"truncated": True})
+                    break
+
+                outline.append({"title": title, "line": lineno})
+    except Exception:
+        # Best-effort: an unreadable file yields an empty outline.
+        return []
+
+    return outline
+
+
+def _get_pdf_converter() -> str:
+    """Resolve the ``uploads.pdf_converter`` setting, defaulting to 'auto'.
+
+    The configured value is stringified, stripped and lowercased before being
+    checked against ``_ALLOWED_PDF_CONVERTERS``, so spellings such as 'AUTO' or
+    'MarkItDown' are accepted.  An unrecognised value is logged and replaced by
+    'auto'.  Any failure while loading the config (including the config module
+    being unavailable) also results in 'auto'.
+
+    Returns:
+        One of "auto", "pymupdf4llm" or "markitdown".
+    """
+    default = "auto"
+    try:
+        from deerflow.config.app_config import get_app_config
+
+        uploads_section = getattr(get_app_config(), "uploads", None)
+        if uploads_section is None:
+            return default
+
+        configured = getattr(uploads_section, "pdf_converter", "auto")
+        normalized = str(configured).strip().lower()
+        if normalized in _ALLOWED_PDF_CONVERTERS:
+            return normalized
+
+        logger.warning("Invalid pdf_converter value %r; falling back to 'auto'", normalized)
+    except Exception:
+        # Config problems must never block a conversion; use the default.
+        pass
+    return default
--- a/deer-flow/backend/packages/harness/deerflow/utils/network.py
+++ b/deer-flow/backend/packages/harness/deerflow/utils/network.py
@@ -0,0 +1,139 @@
+"""Thread-safe network utilities."""
+
+import socket
+import threading
+from contextlib import contextmanager
+
+
+class PortAllocator:
+    """Thread-safe port allocator that prevents port conflicts in concurrent environments.
+
+    This class maintains a set of reserved ports and uses a lock to ensure that
+    port allocation is atomic. Once a port is allocated, it remains reserved until
+    explicitly released.
+
+    Usage:
+        allocator = PortAllocator()
+
+        # Option 1: Manual allocation and release
+        port = allocator.allocate(start_port=8080)
+        try:
+            # Use the port...
+        finally:
+            allocator.release(port)
+
+        # Option 2: Context manager (recommended)
+        with allocator.allocate_context(start_port=8080) as port:
+            # Use the port...
+            # Port is automatically released when exiting the context
+    """
+
+    # Highest valid TCP port number. socket.bind() raises OverflowError (not
+    # OSError) for values outside 0-65535, so they are filtered out up front.
+    _MAX_PORT = 65535
+
+    def __init__(self):
+        self._lock = threading.Lock()
+        self._reserved_ports: set[int] = set()
+
+    def _is_port_available(self, port: int) -> bool:
+        """Check if a port is available for binding.
+
+        Callers are expected to hold ``self._lock``; this method reads the
+        reserved set without taking the lock itself.
+
+        Args:
+            port: The port number to check.
+
+        Returns:
+            True if the port is not reserved by this allocator, lies within the
+            valid 0-65535 range, and can currently be bound on 0.0.0.0;
+            False otherwise.
+        """
+        if port in self._reserved_ports:
+            return False
+
+        # Out-of-range values can never be bound; report them as unavailable
+        # instead of letting bind() raise OverflowError through allocate().
+        if not 0 <= port <= self._MAX_PORT:
+            return False
+
+        # Bind to 0.0.0.0 (wildcard) rather than localhost so that the check
+        # mirrors exactly what Docker does.  Docker binds to 0.0.0.0:PORT;
+        # checking only 127.0.0.1 can falsely report a port as available even
+        # when Docker already occupies it on the wildcard address.
+        with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
+            try:
+                s.bind(("0.0.0.0", port))
+                return True
+            except OSError:
+                return False
+
+    def allocate(self, start_port: int = 8080, max_range: int = 100) -> int:
+        """Allocate an available port in a thread-safe manner.
+
+        This method is thread-safe. It finds an available port, marks it as reserved,
+        and returns it. The port remains reserved until release() is called.
+
+        Args:
+            start_port: The port number to start searching from.
+            max_range: Maximum number of ports to search.
+
+        Returns:
+            An available port number.
+
+        Raises:
+            RuntimeError: If no available port is found in the specified range
+                (including when the range runs past the last valid port).
+        """
+        with self._lock:
+            for port in range(start_port, start_port + max_range):
+                if self._is_port_available(port):
+                    self._reserved_ports.add(port)
+                    return port
+
+        # The last port actually probed is start_port + max_range - 1.
+        last_port = start_port + max_range - 1
+        raise RuntimeError(f"No available port found in range {start_port}-{last_port}")
+
+    def release(self, port: int) -> None:
+        """Release a previously allocated port.
+
+        Releasing a port that is not currently reserved is a no-op.
+
+        Args:
+            port: The port number to release.
+        """
+        with self._lock:
+            self._reserved_ports.discard(port)
+
+    @contextmanager
+    def allocate_context(self, start_port: int = 8080, max_range: int = 100):
+        """Context manager for port allocation with automatic release.
+
+        Args:
+            start_port: The port number to start searching from.
+            max_range: Maximum number of ports to search.
+
+        Yields:
+            An available port number.
+
+        Raises:
+            RuntimeError: If no available port is found in the specified range.
+        """
+        port = self.allocate(start_port, max_range)
+        try:
+            yield port
+        finally:
+            self.release(port)
+
+
+# Global port allocator instance for shared use across the application.
+# Created once at import time; the module-level helpers below delegate to it
+# so that all their callers share a single reservation set.
+_global_port_allocator = PortAllocator()
+
+
+def get_free_port(start_port: int = 8080, max_range: int = 100) -> int:
+    """Reserve and return a free port using the module-wide allocator.
+
+    Because every call goes through the same global ``PortAllocator``,
+    concurrent callers never receive the same port.  The returned port stays
+    reserved until ``release_port()`` is called for it.
+
+    Args:
+        start_port: The port number to start searching from.
+        max_range: Maximum number of ports to search.
+
+    Returns:
+        An available port number.
+
+    Raises:
+        RuntimeError: If no available port is found in the specified range.
+    """
+    allocator = _global_port_allocator
+    reserved = allocator.allocate(start_port, max_range)
+    return reserved
+
+
+def release_port(port: int) -> None:
+    """Return a port obtained from ``get_free_port()`` to the global allocator.
+
+    Args:
+        port: The port number to release.
+    """
+    allocator = _global_port_allocator
+    allocator.release(port)
--- a/deer-flow/backend/packages/harness/deerflow/utils/readability.py
+++ b/deer-flow/backend/packages/harness/deerflow/utils/readability.py
@@ -0,0 +1,83 @@
+import logging
+import re
+import subprocess
+from urllib.parse import urljoin
+
+from markdownify import markdownify as md
+from readabilipy import simple_json_from_html_string
+
+logger = logging.getLogger(__name__)
+
+
+class Article:
+    """An extracted web article: a title plus its main content as HTML.
+
+    Attributes are plain instance fields:
+      - title: article title, used as the top-level Markdown heading.
+      - html_content: the article body as an HTML string.
+      - url: base URL used to resolve relative image links in to_message().
+        Defaults to "" and may be reassigned by the caller after construction.
+    """
+
+    url: str
+
+    def __init__(self, title: str, html_content: str, url: str = ""):
+        self.title = title
+        self.html_content = html_content
+        # Always define url so to_message() cannot fail with AttributeError when
+        # the caller never assigns one; urljoin("", link) returns link unchanged.
+        self.url = url
+
+    def to_markdown(self, including_title: bool = True) -> str:
+        """Render the article as Markdown.
+
+        Args:
+            including_title: When True, prefix the output with ``# <title>``.
+
+        Returns:
+            The Markdown text.  If html_content is None or blank, the body is
+            the placeholder line ``*No content available*``.
+        """
+        markdown = ""
+        if including_title:
+            markdown += f"# {self.title}\n\n"
+
+        if self.html_content is None or not str(self.html_content).strip():
+            markdown += "*No content available*\n"
+        else:
+            markdown += md(self.html_content)
+
+        return markdown
+
+    def to_message(self) -> list[dict]:
+        """Split the Markdown rendition into text and image message parts.
+
+        Returns:
+            A list of ``{"type": "text", "text": ...}`` and
+            ``{"type": "image_url", "image_url": {"url": ...}}`` dicts in
+            document order.  Image URLs are resolved against ``self.url``.
+            A single "No content available" text part is returned when nothing
+            else would be emitted.
+        """
+        image_pattern = r"!\[.*?\]\((.*?)\)"
+
+        content: list[dict] = []
+        markdown = self.to_markdown()
+
+        if not markdown or not markdown.strip():
+            return [{"type": "text", "text": "No content available"}]
+
+        # The pattern has exactly one capture group, so re.split() alternates:
+        # even indices are surrounding text, odd indices are captured image URLs.
+        parts = re.split(image_pattern, markdown)
+
+        for i, part in enumerate(parts):
+            if i % 2 == 1:
+                image_url = urljoin(self.url, part.strip())
+                content.append({"type": "image_url", "image_url": {"url": image_url}})
+            else:
+                text_part = part.strip()
+                if text_part:
+                    content.append({"type": "text", "text": text_part})
+
+        # If after processing all parts, content is still empty, provide a fallback message.
+        if not content:
+            content = [{"type": "text", "text": "No content available"}]
+
+        return content
+
+
+class ReadabilityExtractor:
+    """Builds an Article from raw HTML via readabilipy."""
+
+    def extract_article(self, html: str) -> Article:
+        """Extract the main article from an HTML document.
+
+        Extraction is first attempted with ``use_readability=True``.  If that
+        raises ``subprocess.CalledProcessError`` or ``FileNotFoundError``, a
+        warning (including any captured stderr) is logged and extraction is
+        retried with ``use_readability=False``.
+
+        Args:
+            html: The page HTML.
+
+        Returns:
+            An Article.  Missing or blank content/title are replaced with
+            placeholder strings.
+        """
+        try:
+            parsed = simple_json_from_html_string(html, use_readability=True)
+        except (subprocess.CalledProcessError, FileNotFoundError) as exc:
+            captured = getattr(exc, "stderr", None)
+            if isinstance(captured, bytes):
+                captured = captured.decode(errors="replace")
+            # Only mention stderr in the log line when there is something to show.
+            if isinstance(captured, str) and captured.strip():
+                stderr_info = f"; stderr={captured.strip()}"
+            else:
+                stderr_info = ""
+            logger.warning(
+                "Readability.js extraction failed with %s%s; falling back to pure-Python extraction",
+                type(exc).__name__,
+                stderr_info,
+                exc_info=True,
+            )
+            parsed = simple_json_from_html_string(html, use_readability=False)
+
+        body = parsed.get("content")
+        if not (body and str(body).strip()):
+            body = "No content could be extracted from this page"
+
+        heading = parsed.get("title")
+        if not (heading and str(heading).strip()):
+            heading = "Untitled"
+
+        return Article(title=heading, html_content=body)