Initial commit: hardened DeerFlow factory

Vendored deer-flow upstream (bytedance/deer-flow) plus prompt-injection
hardening:

- New deerflow.security package: content_delimiter, html_cleaner,
  sanitizer (8 layers — invisible chars, control chars, symbols, NFC,
  PUA, tag chars, horizontal whitespace collapse with newline/tab
  preservation, length cap)
- New deerflow.community.searx package: web_search, web_fetch,
  image_search backed by a private SearX instance, every external
  string sanitized and wrapped in <<<EXTERNAL_UNTRUSTED_CONTENT>>>
  delimiters
- All native community web providers (ddg_search, tavily, exa,
  firecrawl, jina_ai, infoquest, image_search) replaced with hard-fail
  stubs that raise NativeWebToolDisabledError at import time, so a
  misconfigured tool.use path fails loud rather than silently falling
  back to unsanitized output
- Native client back-doors (jina_client.py, infoquest_client.py)
  stubbed too
- Native-tool tests quarantined under tests/_disabled_native/
  (collect_ignore_glob via local conftest.py)
- Sanitizer Layer 7 fix: only collapse horizontal whitespace, preserve
  newlines and tabs so list/table structure survives
- Hardened runtime config.yaml references only the searx-backed tools
- Factory overlay (backend/) kept in sync with deer-flow tree as a
  reference / source

See HARDENING.md for the full audit trail and verification steps.
This commit is contained in:
2026-04-12 14:23:57 +02:00
commit 6de0bf9f5b
889 changed files with 173052 additions and 0 deletions

View File

@@ -0,0 +1,309 @@
"""File conversion utilities.
Converts document files (PDF, PPT, Excel, Word) to Markdown.
PDF conversion strategy (auto mode):
1. Try pymupdf4llm if installed — better heading detection, faster on most files.
2. If output is suspiciously short (< _MIN_CHARS_PER_PAGE chars/page, or < 200 chars
total when page count is unavailable), treat as image-based and fall back to MarkItDown.
3. If pymupdf4llm is not installed, use MarkItDown directly (existing behaviour).
Large files (> _ASYNC_THRESHOLD_BYTES, i.e. 1 MB) are converted in a thread pool via
asyncio.to_thread() to avoid blocking the event loop (fixes #1569).
No FastAPI or HTTP dependencies — pure utility functions.
"""
import asyncio
import logging
import re
from pathlib import Path
logger = logging.getLogger(__name__)
# Suffixes (lower-case, leading dot included) of the document formats that
# should be converted to markdown.
CONVERTIBLE_EXTENSIONS = {
    ".pdf",
    ".ppt", ".pptx",
    ".xls", ".xlsx",
    ".doc", ".docx",
}

# Size cut-off above which a conversion is pushed to a background thread.
# Small files finish in < 1s synchronously, so spawning a thread for them
# only adds scheduling overhead.
_ASYNC_THRESHOLD_BYTES = 1024 * 1024  # 1 MB

# Minimum characters *per page* that pymupdf4llm must produce before its
# output is trusted; below this the PDF is likely image-based or encrypted
# and MarkItDown is used instead.
# Rationale: normal text PDFs yield 200-2000 chars/page, image-based PDFs
# yield close to 0, so 50 chars/page leaves a wide safety margin.
# When the page count is unavailable an absolute 200-char check is used.
_MIN_CHARS_PER_PAGE = 50
def _pymupdf_output_too_sparse(text: str, file_path: Path) -> bool:
    """Tell whether pymupdf4llm output is too short to be a real text extraction.

    The decision is based on characters per page, so a short document with few
    pages and a long document with many pages are judged on the same scale.

    Args:
        text: Markdown produced by pymupdf4llm; surrounding whitespace is ignored.
        file_path: The PDF the text came from, re-opened only to count pages.

    Returns:
        True when the stripped text has fewer than ``_MIN_CHARS_PER_PAGE``
        characters per page, or fewer than 200 characters in total when the
        page count cannot be determined (or is zero).
    """
    char_count = len(text.strip())
    page_count: int | None = None
    handle = None
    try:
        import pymupdf

        handle = pymupdf.open(str(file_path))
        page_count = len(handle)
    except Exception:
        # Best effort: a missing pymupdf or an unreadable file simply means
        # the page count stays unknown and the absolute threshold applies.
        pass
    finally:
        if handle is not None:
            try:
                handle.close()
            except Exception:
                pass

    if page_count is None or page_count <= 0:
        # No usable page count: fall back to the absolute threshold.
        return char_count < 200
    return (char_count / page_count) < _MIN_CHARS_PER_PAGE
def _convert_pdf_with_pymupdf4llm(file_path: Path) -> str | None:
"""Attempt PDF conversion with pymupdf4llm.
Returns the markdown text, or None if pymupdf4llm is not installed or
if conversion fails (e.g. encrypted/corrupt PDF).
"""
try:
import pymupdf4llm
except ImportError:
return None
try:
return pymupdf4llm.to_markdown(str(file_path))
except Exception:
logger.exception("pymupdf4llm failed to convert %s; falling back to MarkItDown", file_path.name)
return None
def _convert_with_markitdown(file_path: Path) -> str:
    """Render a file as Markdown text through MarkItDown.

    Args:
        file_path: Path to the document to convert.

    Returns:
        The ``text_content`` of MarkItDown's conversion result.
    """
    # Imported lazily so the dependency is only loaded when a conversion runs.
    from markitdown import MarkItDown

    result = MarkItDown().convert(str(file_path))
    return result.text_content
def _do_convert(file_path: Path, pdf_converter: str) -> str:
    """Run the blocking conversion; called directly or via asyncio.to_thread.

    Args:
        file_path: Path to the file.
        pdf_converter: "auto" | "pymupdf4llm" | "markitdown". Only consulted
            for ``.pdf`` files.

    Returns:
        The Markdown text of the document.
    """
    use_pymupdf = file_path.suffix.lower() == ".pdf" and pdf_converter != "markitdown"
    if not use_pymupdf:
        # Non-PDF input, or MarkItDown explicitly requested.
        return _convert_with_markitdown(file_path)

    candidate = _convert_pdf_with_pymupdf4llm(file_path)
    if candidate is None:
        # pymupdf4llm is missing or failed on this file.
        return _convert_with_markitdown(file_path)

    if pdf_converter == "pymupdf4llm":
        # Explicit choice: keep the output regardless of its length.
        return candidate

    # auto mode: chars-per-page separates image-based PDFs (near 0) from
    # legitimately short documents.
    if not _pymupdf_output_too_sparse(candidate, file_path):
        return candidate

    logger.warning(
        "pymupdf4llm produced only %d chars for %s (likely image-based PDF); falling back to MarkItDown",
        len(candidate.strip()),
        file_path.name,
    )
    return _convert_with_markitdown(file_path)
async def convert_file_to_markdown(file_path: Path) -> Path | None:
    """Convert a supported document file to Markdown and write it next to the source.

    PDF files go through the two-converter strategy implemented by
    ``_do_convert``. Files larger than ``_ASYNC_THRESHOLD_BYTES`` are converted
    in a worker thread so the event loop is not blocked; smaller files are
    converted inline.

    Args:
        file_path: Path to the file to convert.

    Returns:
        Path to the generated ``.md`` file (same location, suffix replaced),
        or None if anything in the conversion or the write failed; the failure
        is logged and never raised.
    """
    try:
        converter_choice = _get_pdf_converter()
        needs_thread = file_path.stat().st_size > _ASYNC_THRESHOLD_BYTES
        if needs_thread:
            markdown = await asyncio.to_thread(_do_convert, file_path, converter_choice)
        else:
            markdown = _do_convert(file_path, converter_choice)

        target = file_path.with_suffix(".md")
        target.write_text(markdown, encoding="utf-8")
        logger.info("Converted %s to markdown: %s (%d chars)", file_path.name, target.name, len(markdown))
        return target
    except Exception as e:
        logger.error("Failed to convert %s to markdown: %s", file_path.name, e)
        return None
# Regex for bold-only lines that look like section headings.
# Targets SEC filing structural headings that pymupdf4llm renders as **bold**
# rather than # Markdown headings (because they use the same font size as body
# text, distinguished only by bold+caps formatting).
#
# Pattern requires ALL of:
# 1. Entire line is a single **...** block (no surrounding prose; trailing
#    whitespace is tolerated)
# 2. Starts with a recognised structural keyword, upper-case only (the pattern
#    is compiled without re.IGNORECASE):
#    - ITEM / PART / SECTION (with optional number/letter after)
#    - SCHEDULE, EXHIBIT, APPENDIX, ANNEX, CHAPTER
# 3. The remainder of the block contains only A-Z, 0-9, space, '.', ',' and '-'
# All-caps addresses, boilerplate ("CURRENT REPORT", "SIGNATURES",
# "WASHINGTON, DC 20549") do NOT start with these keywords and are excluded.
# Group 1 captures the heading text without the surrounding ** markers.
#
# Chinese headings (第三节...) are already captured as standard # headings
# by pymupdf4llm, so they don't need this pattern.
_BOLD_HEADING_RE = re.compile(r"^\*\*((ITEM|PART|SECTION|SCHEDULE|EXHIBIT|APPENDIX|ANNEX|CHAPTER)\b[A-Z0-9 .,\-]*)\*\*\s*$")
# Regex for split-bold headings produced by pymupdf4llm when a heading spans
# multiple text spans in the PDF (e.g. section number and title are separate spans).
# Matches lines like: **1** **Introduction** or **3.2** **Multi-Head Attention**
# Requirements:
# 1. Entire line consists only of **...** blocks separated by whitespace (no prose)
# 2. First block is a section number: one digit or upper-case letter followed
#    by digits and dots (e.g. "1", "3.2", "A.1")
# 3. Second block must not be purely numeric/punctuation — excludes financial table
#    headers like **2023** **2022** **2021** while allowing non-ASCII titles such as
#    **1** **概述** or accented words (negative lookahead instead of [A-Za-z])
# 4. At most two additional blocks (four total) with [^*]+ (no * inside) to keep
#    the regex linear and avoid ReDoS on attacker-controlled content
_SPLIT_BOLD_HEADING_RE = re.compile(r"^\*\*[\dA-Z][\d\.]*\*\*\s+\*\*(?!\d[\d\s.,\-–—/:()%]*\*\*)[^*]+\*\*(?:\s+\*\*[^*]+\*\*){0,2}\s*$")
# Maximum number of outline entries injected into the agent context.
# Keeps prompt size bounded even for very long documents.
MAX_OUTLINE_ENTRIES = 50
# Accepted (lower-case) values for the pdf_converter setting; anything else is
# rejected by _get_pdf_converter() in favour of "auto".
_ALLOWED_PDF_CONVERTERS = {"auto", "pymupdf4llm", "markitdown"}
def _clean_bold_title(raw: str) -> str:
    """Normalise a title string that may contain pymupdf4llm bold artefacts.

    pymupdf4llm sometimes emits adjacent bold spans as ``**A** **B**`` instead
    of a single ``**A B**`` block. This helper merges those fragments and then
    strips the outermost ``**...**`` wrapper when the whole string is exactly
    one bold block, so the caller gets plain text.

    A string that merely starts and ends with a bold span but has other text
    in between (``**A** and **B**``) is not one block; it is returned with its
    markers intact rather than with only the first and last marker removed,
    which would leave unbalanced ``**`` in the title.

    Examples::

        "**Overview**"                     -> "Overview"
        "**UNITED STATES** **SECURITIES**" -> "UNITED STATES SECURITIES"
        "plain text"                       -> "plain text"  (unchanged)
        "**A** and **B**"                  -> "**A** and **B**"  (unchanged)

    Args:
        raw: Title text, possibly containing ``**`` bold markers.

    Returns:
        The cleaned title with surrounding whitespace removed.
    """
    # Merge adjacent bold spans: "** **" → " "
    merged = re.sub(r"\*\*\s*\*\*", " ", raw).strip()
    # Strip the outer **...** only if the string is a single bold block; an
    # inner "**" means fullmatch paired the first opener with the last closer
    # of two separate spans.
    wrapped = re.fullmatch(r"\*\*(.+?)\*\*", merged, re.DOTALL)
    if wrapped is not None and "**" not in wrapped.group(1):
        return wrapped.group(1).strip()
    return merged
def extract_outline(md_path: Path) -> list[dict]:
"""Extract document outline (headings) from a Markdown file.
Recognises three heading styles produced by pymupdf4llm:
1. Standard Markdown headings: lines starting with one or more '#'.
Inline ``**...**`` wrappers and adjacent bold spans (``** **``) are
cleaned so the title is plain text.
2. Bold-only structural headings: ``**ITEM 1. BUSINESS**``, ``**PART II**``,
etc. SEC filings use bold+caps for section headings with the same font
size as body text, so pymupdf4llm cannot promote them to # headings.
3. Split-bold headings: ``**1** **Introduction**``, ``**3.2** **Attention**``.
pymupdf4llm emits these when the section number and title text are
separate spans in the underlying PDF (common in academic papers).
Args:
md_path: Path to the .md file.
Returns:
List of dicts with keys: title (str), line (int, 1-based).
When the outline is truncated at MAX_OUTLINE_ENTRIES, a sentinel entry
``{"truncated": True}`` is appended as the last element so callers can
render a "showing first N headings" hint without re-scanning the file.
Returns an empty list if the file cannot be read or has no headings.
"""
outline: list[dict] = []
try:
with md_path.open(encoding="utf-8") as f:
for lineno, line in enumerate(f, 1):
stripped = line.strip()
if not stripped:
continue
# Style 1: standard Markdown heading
if stripped.startswith("#"):
title = _clean_bold_title(stripped.lstrip("#").strip())
if title:
outline.append({"title": title, "line": lineno})
# Style 2: single bold block with SEC structural keyword
elif m := _BOLD_HEADING_RE.match(stripped):
title = m.group(1).strip()
if title:
outline.append({"title": title, "line": lineno})
# Style 3: split-bold heading — **<num>** **<title>**
# Regex already enforces max 4 blocks and non-numeric second block.
elif _SPLIT_BOLD_HEADING_RE.match(stripped):
title = " ".join(re.findall(r"\*\*([^*]+)\*\*", stripped))
if title:
outline.append({"title": title, "line": lineno})
if len(outline) >= MAX_OUTLINE_ENTRIES:
outline.append({"truncated": True})
break
except Exception:
return []
return outline
def _get_pdf_converter() -> str:
    """Read the pdf_converter setting from app config, defaulting to 'auto'.

    The configured value is stringified, stripped and lower-cased, then checked
    against ``_ALLOWED_PDF_CONVERTERS`` so that values like 'AUTO' or
    'MarkItDown' from config.yaml don't silently fall through to unexpected
    behaviour.

    Returns:
        One of the allowed converter names. 'auto' is returned when the config
        cannot be loaded, has no ``uploads`` section, or holds an unknown value
        (the latter is logged as a warning).
    """
    fallback = "auto"
    try:
        from deerflow.config.app_config import get_app_config

        uploads_cfg = getattr(get_app_config(), "uploads", None)
        if uploads_cfg is None:
            return fallback

        configured = str(getattr(uploads_cfg, "pdf_converter", "auto")).strip().lower()
        if configured in _ALLOWED_PDF_CONVERTERS:
            return configured

        logger.warning("Invalid pdf_converter value %r; falling back to 'auto'", configured)
        return "auto"
    except Exception:
        # Config is optional here; any failure means the default is used.
        pass
    return fallback

View File

@@ -0,0 +1,139 @@
"""Thread-safe network utilities."""
import socket
import threading
from contextlib import contextmanager
class PortAllocator:
    """Thread-safe port allocator that prevents port conflicts in concurrent environments.

    The allocator keeps a set of reserved ports guarded by a lock, so two
    threads calling :meth:`allocate` on the same instance never receive the
    same port. Once a port is allocated it remains reserved until
    :meth:`release` is called for it.

    The availability probe binds a throw-away socket and closes it again, so a
    reservation only protects against other users of this allocator; it does
    not keep the port occupied at the OS level.

    Usage:
        allocator = PortAllocator()

        # Option 1: Manual allocation and release
        port = allocator.allocate(start_port=8080)
        try:
            ...  # Use the port
        finally:
            allocator.release(port)

        # Option 2: Context manager (recommended)
        with allocator.allocate_context(start_port=8080) as port:
            ...  # Use the port
        # Port is automatically released when exiting the context
    """

    # Inclusive range of port numbers that can be handed out. Port 0 asks the
    # OS for an arbitrary port when bound, so it is never a usable answer, and
    # socket.bind() raises OverflowError (not OSError) outside 0-65535.
    _MIN_PORT = 1
    _MAX_PORT = 65535

    def __init__(self):
        self._lock = threading.Lock()
        self._reserved_ports: set[int] = set()

    def _is_port_available(self, port: int) -> bool:
        """Check if a port is available for binding.

        Reads ``_reserved_ports`` without taking the lock itself;
        :meth:`allocate` calls it with the lock already held.

        Args:
            port: The port number to check.

        Returns:
            True if the port is a valid port number, is not reserved by this
            allocator and can be bound right now; False otherwise.
        """
        if port in self._reserved_ports:
            return False
        # Out-of-range numbers can never be bound; report them as unavailable
        # instead of letting bind() raise OverflowError through allocate().
        if not self._MIN_PORT <= port <= self._MAX_PORT:
            return False
        # Bind to 0.0.0.0 (wildcard) rather than localhost so that the check
        # mirrors exactly what Docker does. Docker binds to 0.0.0.0:PORT;
        # checking only 127.0.0.1 can falsely report a port as available even
        # when Docker already occupies it on the wildcard address.
        with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
            try:
                s.bind(("0.0.0.0", port))
                return True
            except OSError:
                return False

    def allocate(self, start_port: int = 8080, max_range: int = 100) -> int:
        """Allocate an available port in a thread-safe manner.

        Scans ``start_port`` .. ``start_port + max_range - 1`` in order, marks
        the first available port as reserved and returns it. The port remains
        reserved until release() is called.

        Args:
            start_port: The port number to start searching from.
            max_range: Maximum number of ports to search.

        Returns:
            An available port number.

        Raises:
            RuntimeError: If no available port is found in the specified range
                (including when the whole range lies outside 1-65535).
        """
        with self._lock:
            for port in range(start_port, start_port + max_range):
                if self._is_port_available(port):
                    self._reserved_ports.add(port)
                    return port
        # The last port actually probed is start_port + max_range - 1.
        raise RuntimeError(f"No available port found in range {start_port}-{start_port + max_range - 1}")

    def release(self, port: int) -> None:
        """Release a previously allocated port.

        Releasing a port that is not reserved is a no-op.

        Args:
            port: The port number to release.
        """
        with self._lock:
            self._reserved_ports.discard(port)

    @contextmanager
    def allocate_context(self, start_port: int = 8080, max_range: int = 100):
        """Context manager for port allocation with automatic release.

        Args:
            start_port: The port number to start searching from.
            max_range: Maximum number of ports to search.

        Yields:
            An available port number; it is released when the ``with`` block
            exits, whether normally or through an exception.

        Raises:
            RuntimeError: If no available port is found in the specified range.
        """
        port = self.allocate(start_port, max_range)
        try:
            yield port
        finally:
            self.release(port)
# Module-level allocator shared by get_free_port() and release_port(), so that
# reservations made through those helpers are tracked in one place for the
# whole process.
_global_port_allocator = PortAllocator()
def get_free_port(start_port: int = 8080, max_range: int = 100) -> int:
    """Reserve and return a free port from the module-wide allocator.

    Because every call goes through the same ``PortAllocator`` instance,
    concurrent callers never receive the same port. The port stays reserved
    until release_port() is called for it.

    Args:
        start_port: The port number to start searching from.
        max_range: Maximum number of ports to search.

    Returns:
        An available port number.

    Raises:
        RuntimeError: If no available port is found in the specified range.
    """
    reserved = _global_port_allocator.allocate(start_port, max_range)
    return reserved
def release_port(port: int) -> None:
    """Give a port obtained from get_free_port() back to the module-wide allocator.

    Args:
        port: The port number to release.
    """
    allocator = _global_port_allocator
    allocator.release(port)

View File

@@ -0,0 +1,83 @@
import logging
import re
import subprocess
from urllib.parse import urljoin
from markdownify import markdownify as md
from readabilipy import simple_json_from_html_string
logger = logging.getLogger(__name__)
class Article:
    """An extracted web article: a title plus its HTML body.

    ``url`` is the address the article was fetched from and is used as the
    base for resolving relative image links in :meth:`to_message`. It defaults
    to an empty string, which leaves image links as written; callers may pass
    it to the constructor or assign ``article.url`` afterwards.
    """

    url: str

    def __init__(self, title: str, html_content: str, url: str = ""):
        self.title = title
        self.html_content = html_content
        # Always define url so to_message() works even when no caller set it.
        self.url = url

    def to_markdown(self, including_title: bool = True) -> str:
        """Render the article as Markdown.

        Args:
            including_title: When True, prefix the output with ``# <title>``.

        Returns:
            The Markdown text. A None or whitespace-only body is rendered as
            ``*No content available*``.
        """
        markdown = ""
        if including_title:
            markdown += f"# {self.title}\n\n"
        if self.html_content is None or not str(self.html_content).strip():
            markdown += "*No content available*\n"
        else:
            markdown += md(self.html_content)
        return markdown

    def to_message(self) -> list[dict]:
        """Split the Markdown rendering into text and image message parts.

        Returns:
            A list of ``{"type": "text", "text": ...}`` and
            ``{"type": "image_url", "image_url": {"url": ...}}`` dicts in
            document order. Image links are resolved against ``self.url``;
            images with an empty target are skipped. If nothing remains, a
            single "No content available" text part is returned.
        """
        image_pattern = r"!\[.*?\]\((.*?)\)"
        content: list[dict] = []
        markdown = self.to_markdown()
        if not markdown or not markdown.strip():
            return [{"type": "text", "text": "No content available"}]
        # One capture group: odd indices are image targets, even indices are
        # the text between images.
        parts = re.split(image_pattern, markdown)
        for i, part in enumerate(parts):
            piece = part.strip()
            if i % 2 == 1:
                if not piece:
                    # "![alt]()" has no target; urljoin would turn it into
                    # the page URL itself, which is not an image.
                    continue
                image_url = urljoin(self.url, piece)
                content.append({"type": "image_url", "image_url": {"url": image_url}})
            elif piece:
                content.append({"type": "text", "text": piece})
        # If after processing all parts, content is still empty, provide a fallback message.
        if not content:
            content = [{"type": "text", "text": "No content available"}]
        return content
class ReadabilityExtractor:
    """Turns raw HTML into an ``Article`` using readabilipy."""

    def extract_article(self, html: str) -> Article:
        """Extract the main article from an HTML document.

        Extraction is first attempted with ``use_readability=True``. If that
        raises ``subprocess.CalledProcessError`` or ``FileNotFoundError``, the
        failure is logged (with any captured stderr) and extraction is retried
        with ``use_readability=False``.

        Args:
            html: The HTML source of the page.

        Returns:
            An ``Article``; a missing or blank title becomes "Untitled" and a
            missing or blank body becomes a fixed placeholder sentence.
        """
        try:
            parsed = simple_json_from_html_string(html, use_readability=True)
        except (subprocess.CalledProcessError, FileNotFoundError) as exc:
            captured = getattr(exc, "stderr", None)
            if isinstance(captured, bytes):
                captured = captured.decode(errors="replace")
            if isinstance(captured, str) and captured.strip():
                stderr_info = f"; stderr={captured.strip()}"
            else:
                stderr_info = ""
            logger.warning(
                "Readability.js extraction failed with %s%s; falling back to pure-Python extraction",
                type(exc).__name__,
                stderr_info,
                exc_info=True,
            )
            parsed = simple_json_from_html_string(html, use_readability=False)

        body = parsed.get("content")
        if not (body and str(body).strip()):
            body = "No content could be extracted from this page"

        heading = parsed.get("title")
        if not (heading and str(heading).strip()):
            heading = "Untitled"

        return Article(title=heading, html_content=body)