Files
deerflow-factory/deer-flow/backend/tests/test_file_conversion.py
DATA 6de0bf9f5b Initial commit: hardened DeerFlow factory
Vendored deer-flow upstream (bytedance/deer-flow) plus prompt-injection
hardening:

- New deerflow.security package: content_delimiter, html_cleaner,
  sanitizer (8 layers — invisible chars, control chars, symbols, NFC,
  PUA, tag chars, horizontal whitespace collapse with newline/tab
  preservation, length cap)
- New deerflow.community.searx package: web_search, web_fetch,
  image_search backed by a private SearX instance, every external
  string sanitized and wrapped in <<<EXTERNAL_UNTRUSTED_CONTENT>>>
  delimiters
- All native community web providers (ddg_search, tavily, exa,
  firecrawl, jina_ai, infoquest, image_search) replaced with hard-fail
  stubs that raise NativeWebToolDisabledError at import time, so a
  misconfigured tool.use path fails loud rather than silently falling
  back to unsanitized output
- Native client back-doors (jina_client.py, infoquest_client.py)
  stubbed too
- Native-tool tests quarantined under tests/_disabled_native/
  (collect_ignore_glob via local conftest.py)
- Sanitizer Layer 7 fix: only collapse horizontal whitespace, preserve
  newlines and tabs so list/table structure survives
- Hardened runtime config.yaml references only the searx-backed tools
- Factory overlay (backend/) kept in sync with deer-flow tree as a
  reference / source

See HARDENING.md for the full audit trail and verification steps.
2026-04-12 14:23:57 +02:00

460 lines
18 KiB
Python
Raw Permalink Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

"""Tests for file_conversion utilities (PR1: pymupdf4llm + asyncio.to_thread; PR2: extract_outline)."""
from __future__ import annotations
import asyncio
import sys
from types import ModuleType
from unittest.mock import MagicMock, patch
from deerflow.utils.file_conversion import (
_ASYNC_THRESHOLD_BYTES,
_MIN_CHARS_PER_PAGE,
MAX_OUTLINE_ENTRIES,
_do_convert,
_pymupdf_output_too_sparse,
convert_file_to_markdown,
extract_outline,
)
def _make_pymupdf_mock(page_count: int) -> ModuleType:
"""Return a fake *pymupdf* module whose ``open()`` reports *page_count* pages."""
mock_doc = MagicMock()
mock_doc.__len__ = MagicMock(return_value=page_count)
fake_pymupdf = ModuleType("pymupdf")
fake_pymupdf.open = MagicMock(return_value=mock_doc) # type: ignore[attr-defined]
return fake_pymupdf
def _run(coro):
loop = asyncio.new_event_loop()
try:
return loop.run_until_complete(coro)
finally:
loop.close()
# ---------------------------------------------------------------------------
# _pymupdf_output_too_sparse
# ---------------------------------------------------------------------------
class TestPymupdfOutputTooSparse:
"""Check the chars-per-page sparsity heuristic."""
def test_dense_text_pdf_not_sparse(self, tmp_path):
"""Normal text PDF: many chars per page → not sparse."""
pdf = tmp_path / "dense.pdf"
pdf.write_bytes(b"%PDF-1.4 fake")
# 10 pages × 10 000 chars → 1000/page ≫ threshold
with patch.dict(sys.modules, {"pymupdf": _make_pymupdf_mock(page_count=10)}):
result = _pymupdf_output_too_sparse("x" * 10_000, pdf)
assert result is False
def test_image_based_pdf_is_sparse(self, tmp_path):
"""Image-based PDF: near-zero chars per page → sparse."""
pdf = tmp_path / "image.pdf"
pdf.write_bytes(b"%PDF-1.4 fake")
# 612 chars / 31 pages ≈ 19.7/page < _MIN_CHARS_PER_PAGE (50)
with patch.dict(sys.modules, {"pymupdf": _make_pymupdf_mock(page_count=31)}):
result = _pymupdf_output_too_sparse("x" * 612, pdf)
assert result is True
def test_fallback_when_pymupdf_unavailable(self, tmp_path):
"""When pymupdf is not installed, fall back to absolute 200-char threshold."""
pdf = tmp_path / "broken.pdf"
pdf.write_bytes(b"%PDF-1.4 fake")
# Remove pymupdf from sys.modules so the `import pymupdf` inside the
# function raises ImportError, triggering the absolute-threshold fallback.
with patch.dict(sys.modules, {"pymupdf": None}):
sparse = _pymupdf_output_too_sparse("x" * 100, pdf)
not_sparse = _pymupdf_output_too_sparse("x" * 300, pdf)
assert sparse is True
assert not_sparse is False
def test_exactly_at_threshold_is_not_sparse(self, tmp_path):
"""Chars-per-page == threshold is treated as NOT sparse (boundary inclusive)."""
pdf = tmp_path / "boundary.pdf"
pdf.write_bytes(b"%PDF-1.4 fake")
# 2 pages × _MIN_CHARS_PER_PAGE chars = exactly at threshold
with patch.dict(sys.modules, {"pymupdf": _make_pymupdf_mock(page_count=2)}):
result = _pymupdf_output_too_sparse("x" * (_MIN_CHARS_PER_PAGE * 2), pdf)
assert result is False
# ---------------------------------------------------------------------------
# _do_convert — routing logic
# ---------------------------------------------------------------------------
class TestDoConvert:
"""Verify that _do_convert routes to the right sub-converter."""
def test_non_pdf_always_uses_markitdown(self, tmp_path):
"""DOCX / XLSX / PPTX always go through MarkItDown regardless of setting."""
docx = tmp_path / "report.docx"
docx.write_bytes(b"PK fake docx")
with patch(
"deerflow.utils.file_conversion._convert_with_markitdown",
return_value="# Markdown from MarkItDown",
) as mock_md:
result = _do_convert(docx, "auto")
mock_md.assert_called_once_with(docx)
assert result == "# Markdown from MarkItDown"
def test_pdf_auto_uses_pymupdf4llm_when_dense(self, tmp_path):
"""auto mode: use pymupdf4llm output when it's dense enough."""
pdf = tmp_path / "report.pdf"
pdf.write_bytes(b"%PDF-1.4 fake")
dense_text = "# Heading\n" + "word " * 2000 # clearly dense
with (
patch(
"deerflow.utils.file_conversion._convert_pdf_with_pymupdf4llm",
return_value=dense_text,
),
patch(
"deerflow.utils.file_conversion._pymupdf_output_too_sparse",
return_value=False,
),
patch("deerflow.utils.file_conversion._convert_with_markitdown") as mock_md,
):
result = _do_convert(pdf, "auto")
mock_md.assert_not_called()
assert result == dense_text
def test_pdf_auto_falls_back_when_sparse(self, tmp_path):
"""auto mode: fall back to MarkItDown when pymupdf4llm output is sparse."""
pdf = tmp_path / "scanned.pdf"
pdf.write_bytes(b"%PDF-1.4 fake")
with (
patch(
"deerflow.utils.file_conversion._convert_pdf_with_pymupdf4llm",
return_value="x" * 612, # 19.7 chars/page for 31-page doc
),
patch(
"deerflow.utils.file_conversion._pymupdf_output_too_sparse",
return_value=True,
),
patch(
"deerflow.utils.file_conversion._convert_with_markitdown",
return_value="OCR result via MarkItDown",
) as mock_md,
):
result = _do_convert(pdf, "auto")
mock_md.assert_called_once_with(pdf)
assert result == "OCR result via MarkItDown"
def test_pdf_explicit_pymupdf4llm_skips_sparsity_check(self, tmp_path):
"""'pymupdf4llm' mode: use output as-is even if sparse."""
pdf = tmp_path / "explicit.pdf"
pdf.write_bytes(b"%PDF-1.4 fake")
sparse_text = "x" * 10 # very short
with (
patch(
"deerflow.utils.file_conversion._convert_pdf_with_pymupdf4llm",
return_value=sparse_text,
),
patch("deerflow.utils.file_conversion._convert_with_markitdown") as mock_md,
):
result = _do_convert(pdf, "pymupdf4llm")
mock_md.assert_not_called()
assert result == sparse_text
def test_pdf_explicit_markitdown_skips_pymupdf4llm(self, tmp_path):
"""'markitdown' mode: never attempt pymupdf4llm."""
pdf = tmp_path / "force_md.pdf"
pdf.write_bytes(b"%PDF-1.4 fake")
with (
patch("deerflow.utils.file_conversion._convert_pdf_with_pymupdf4llm") as mock_pymu,
patch(
"deerflow.utils.file_conversion._convert_with_markitdown",
return_value="MarkItDown result",
),
):
result = _do_convert(pdf, "markitdown")
mock_pymu.assert_not_called()
assert result == "MarkItDown result"
def test_pdf_auto_falls_back_when_pymupdf4llm_not_installed(self, tmp_path):
"""auto mode: if pymupdf4llm is not installed, use MarkItDown directly."""
pdf = tmp_path / "no_pymupdf.pdf"
pdf.write_bytes(b"%PDF-1.4 fake")
with (
patch(
"deerflow.utils.file_conversion._convert_pdf_with_pymupdf4llm",
return_value=None, # None signals not installed
),
patch(
"deerflow.utils.file_conversion._convert_with_markitdown",
return_value="MarkItDown fallback",
) as mock_md,
):
result = _do_convert(pdf, "auto")
mock_md.assert_called_once_with(pdf)
assert result == "MarkItDown fallback"
# ---------------------------------------------------------------------------
# convert_file_to_markdown — async + file writing
# ---------------------------------------------------------------------------
class TestConvertFileToMarkdown:
def test_small_file_runs_synchronously(self, tmp_path):
"""Small files (< 1 MB) are converted in the event loop thread."""
pdf = tmp_path / "small.pdf"
pdf.write_bytes(b"%PDF-1.4 " + b"x" * 100) # well under 1 MB
with (
patch("deerflow.utils.file_conversion._get_pdf_converter", return_value="auto"),
patch(
"deerflow.utils.file_conversion._do_convert",
return_value="# Small PDF",
) as mock_convert,
patch("asyncio.to_thread") as mock_thread,
):
md_path = _run(convert_file_to_markdown(pdf))
# asyncio.to_thread must NOT have been called
mock_thread.assert_not_called()
mock_convert.assert_called_once()
assert md_path == pdf.with_suffix(".md")
assert md_path.read_text() == "# Small PDF"
def test_large_file_offloaded_to_thread(self, tmp_path):
"""Large files (> 1 MB) are offloaded via asyncio.to_thread."""
pdf = tmp_path / "large.pdf"
# Write slightly more than the threshold
pdf.write_bytes(b"%PDF-1.4 " + b"x" * (_ASYNC_THRESHOLD_BYTES + 1))
async def fake_to_thread(fn, *args, **kwargs):
return fn(*args, **kwargs)
with (
patch("deerflow.utils.file_conversion._get_pdf_converter", return_value="auto"),
patch(
"deerflow.utils.file_conversion._do_convert",
return_value="# Large PDF",
),
patch("asyncio.to_thread", side_effect=fake_to_thread) as mock_thread,
):
md_path = _run(convert_file_to_markdown(pdf))
mock_thread.assert_called_once()
assert md_path == pdf.with_suffix(".md")
assert md_path.read_text() == "# Large PDF"
def test_returns_none_on_conversion_error(self, tmp_path):
"""If conversion raises, return None without propagating the exception."""
pdf = tmp_path / "broken.pdf"
pdf.write_bytes(b"%PDF-1.4 fake")
with (
patch("deerflow.utils.file_conversion._get_pdf_converter", return_value="auto"),
patch(
"deerflow.utils.file_conversion._do_convert",
side_effect=RuntimeError("conversion failed"),
),
):
result = _run(convert_file_to_markdown(pdf))
assert result is None
def test_writes_utf8_markdown_file(self, tmp_path):
"""Generated .md file is written with UTF-8 encoding."""
pdf = tmp_path / "report.pdf"
pdf.write_bytes(b"%PDF-1.4 fake")
chinese_content = "# 中文报告\n\n这是测试内容。"
with (
patch("deerflow.utils.file_conversion._get_pdf_converter", return_value="auto"),
patch(
"deerflow.utils.file_conversion._do_convert",
return_value=chinese_content,
),
):
md_path = _run(convert_file_to_markdown(pdf))
assert md_path is not None
assert md_path.read_text(encoding="utf-8") == chinese_content
# ---------------------------------------------------------------------------
# extract_outline
# ---------------------------------------------------------------------------
class TestExtractOutline:
"""Tests for extract_outline()."""
def test_empty_file_returns_empty(self, tmp_path):
"""Empty markdown file yields no outline entries."""
md = tmp_path / "empty.md"
md.write_text("", encoding="utf-8")
assert extract_outline(md) == []
def test_missing_file_returns_empty(self, tmp_path):
"""Non-existent path returns [] without raising."""
assert extract_outline(tmp_path / "nonexistent.md") == []
def test_standard_markdown_headings(self, tmp_path):
"""# / ## / ### headings are all recognised."""
md = tmp_path / "doc.md"
md.write_text(
"# Chapter One\n\nSome text.\n\n## Section 1.1\n\nMore text.\n\n### Sub 1.1.1\n",
encoding="utf-8",
)
outline = extract_outline(md)
assert len(outline) == 3
assert outline[0] == {"title": "Chapter One", "line": 1}
assert outline[1] == {"title": "Section 1.1", "line": 5}
assert outline[2] == {"title": "Sub 1.1.1", "line": 9}
def test_bold_sec_item_heading(self, tmp_path):
"""**ITEM N. TITLE** lines in SEC filings are recognised."""
md = tmp_path / "10k.md"
md.write_text(
"Cover page text.\n\n**ITEM 1. BUSINESS**\n\nBody.\n\n**ITEM 1A. RISK FACTORS**\n",
encoding="utf-8",
)
outline = extract_outline(md)
assert len(outline) == 2
assert outline[0] == {"title": "ITEM 1. BUSINESS", "line": 3}
assert outline[1] == {"title": "ITEM 1A. RISK FACTORS", "line": 7}
def test_bold_part_heading(self, tmp_path):
"""**PART I** / **PART II** headings are recognised."""
md = tmp_path / "10k.md"
md.write_text("**PART I**\n\n**PART II**\n\n**PART III**\n", encoding="utf-8")
outline = extract_outline(md)
assert len(outline) == 3
titles = [e["title"] for e in outline]
assert "PART I" in titles
assert "PART II" in titles
assert "PART III" in titles
def test_sec_cover_page_boilerplate_excluded(self, tmp_path):
"""Address lines and short cover boilerplate must NOT appear in outline."""
md = tmp_path / "8k.md"
md.write_text(
"## **UNITED STATES SECURITIES AND EXCHANGE COMMISSION**\n\n**WASHINGTON, DC 20549**\n\n**CURRENT REPORT**\n\n**SIGNATURES**\n\n**TESLA, INC.**\n\n**ITEM 2.02. RESULTS OF OPERATIONS**\n",
encoding="utf-8",
)
outline = extract_outline(md)
titles = [e["title"] for e in outline]
# Cover-page boilerplate should be excluded
assert "WASHINGTON, DC 20549" not in titles
assert "CURRENT REPORT" not in titles
assert "SIGNATURES" not in titles
assert "TESLA, INC." not in titles
# Real SEC heading must be included
assert "ITEM 2.02. RESULTS OF OPERATIONS" in titles
def test_chinese_headings_via_standard_markdown(self, tmp_path):
"""Chinese annual report headings emitted as # by pymupdf4llm are captured."""
md = tmp_path / "annual.md"
md.write_text(
"# 第一节 公司简介\n\n内容。\n\n## 第三节 管理层讨论与分析\n\n分析内容。\n",
encoding="utf-8",
)
outline = extract_outline(md)
assert len(outline) == 2
assert outline[0]["title"] == "第一节 公司简介"
assert outline[1]["title"] == "第三节 管理层讨论与分析"
def test_outline_capped_at_max_entries(self, tmp_path):
"""When truncated, result has MAX_OUTLINE_ENTRIES real entries + 1 sentinel."""
lines = [f"# Heading {i}" for i in range(MAX_OUTLINE_ENTRIES + 10)]
md = tmp_path / "long.md"
md.write_text("\n".join(lines), encoding="utf-8")
outline = extract_outline(md)
# Last entry is the truncation sentinel
assert outline[-1] == {"truncated": True}
# Visible entries are exactly MAX_OUTLINE_ENTRIES
visible = [e for e in outline if not e.get("truncated")]
assert len(visible) == MAX_OUTLINE_ENTRIES
def test_no_truncation_sentinel_when_under_limit(self, tmp_path):
"""Short documents produce no sentinel entry."""
lines = [f"# Heading {i}" for i in range(5)]
md = tmp_path / "short.md"
md.write_text("\n".join(lines), encoding="utf-8")
outline = extract_outline(md)
assert len(outline) == 5
assert not any(e.get("truncated") for e in outline)
def test_blank_lines_and_whitespace_ignored(self, tmp_path):
"""Blank lines between headings do not produce empty entries."""
md = tmp_path / "spaced.md"
md.write_text("\n\n# Title One\n\n\n\n# Title Two\n\n", encoding="utf-8")
outline = extract_outline(md)
assert len(outline) == 2
assert all(e["title"] for e in outline)
def test_inline_bold_not_confused_with_heading(self, tmp_path):
"""Mid-sentence bold text must not be mistaken for a heading."""
md = tmp_path / "prose.md"
md.write_text(
"This sentence has **bold words** inside it.\n\nAnother with **MULTIPLE CAPS** inline.\n",
encoding="utf-8",
)
outline = extract_outline(md)
assert outline == []
def test_split_bold_heading_academic_paper(self, tmp_path):
"""**<num>** **<title>** lines from academic papers are recognised (Style 3)."""
md = tmp_path / "paper.md"
md.write_text(
"## **Attention Is All You Need**\n\n**1** **Introduction**\n\nBody text.\n\n**2** **Background**\n\nMore text.\n\n**3.1** **Encoder and Decoder Stacks**\n",
encoding="utf-8",
)
outline = extract_outline(md)
titles = [e["title"] for e in outline]
assert "1 Introduction" in titles
assert "2 Background" in titles
assert "3.1 Encoder and Decoder Stacks" in titles
def test_split_bold_year_columns_excluded(self, tmp_path):
"""Financial table headers like **2023** **2022** **2021** are NOT headings."""
md = tmp_path / "annual.md"
md.write_text(
"# Financial Summary\n\n**2023** **2022** **2021**\n\nRevenue 100 90 80\n",
encoding="utf-8",
)
outline = extract_outline(md)
titles = [e["title"] for e in outline]
# Only the # heading should appear, not the year-column row
assert titles == ["Financial Summary"]
def test_adjacent_bold_spans_merged_in_markdown_heading(self, tmp_path):
"""** ** artefacts inside a # heading are merged into clean plain text."""
md = tmp_path / "sec.md"
md.write_text(
"## **UNITED STATES** **SECURITIES AND EXCHANGE COMMISSION**\n\nBody text.\n",
encoding="utf-8",
)
outline = extract_outline(md)
assert len(outline) == 1
# Title must be clean — no ** ** artefacts
assert outline[0]["title"] == "UNITED STATES SECURITIES AND EXCHANGE COMMISSION"