Vendored deer-flow upstream (bytedance/deer-flow) plus prompt-injection hardening: - New deerflow.security package: content_delimiter, html_cleaner, sanitizer (8 layers — invisible chars, control chars, symbols, NFC, PUA, tag chars, horizontal whitespace collapse with newline/tab preservation, length cap) - New deerflow.community.searx package: web_search, web_fetch, image_search backed by a private SearX instance, every external string sanitized and wrapped in <<<EXTERNAL_UNTRUSTED_CONTENT>>> delimiters - All native community web providers (ddg_search, tavily, exa, firecrawl, jina_ai, infoquest, image_search) replaced with hard-fail stubs that raise NativeWebToolDisabledError at import time, so a misconfigured tool.use path fails loud rather than silently falling back to unsanitized output - Native client back-doors (jina_client.py, infoquest_client.py) stubbed too - Native-tool tests quarantined under tests/_disabled_native/ (collect_ignore_glob via local conftest.py) - Sanitizer Layer 7 fix: only collapse horizontal whitespace, preserve newlines and tabs so list/table structure survives - Hardened runtime config.yaml references only the searx-backed tools - Factory overlay (backend/) kept in sync with deer-flow tree as a reference / source See HARDENING.md for the full audit trail and verification steps.
307 lines
10 KiB
Python
307 lines
10 KiB
Python
#!/usr/bin/env python3
|
|
"""arXiv search client for the systematic-literature-review skill.
|
|
|
|
Queries the public arXiv API (http://export.arxiv.org/api/query) and
|
|
returns structured paper metadata as JSON. No API key required.
|
|
|
|
Design notes:
|
|
|
|
- No additional dependencies required. Uses `requests` when available,
|
|
falls back to `urllib` with a requests-compatible shim (same pattern as
|
|
../../github-deep-research/scripts/github_api.py).
|
|
- Query parameters are URL-encoded via `urllib.parse.urlencode` with
|
|
`quote_via=quote_plus`. Hand-rolled `k=v` joining would break on
|
|
multi-word topics like "transformer attention".
|
|
- Atom XML is parsed with `xml.etree.ElementTree` using an explicit
|
|
namespace map. Forgetting the namespace prefix is the #1 arXiv API
|
|
parsing bug, so we bake it into NS_MAP.
|
|
- The `<id>` field in arXiv responses is a full URL like
|
|
"http://arxiv.org/abs/1706.03762v5". Callers usually want the bare
|
|
id "1706.03762", so we normalise it.
|
|
- max_results is clamped to 50 to match the skill's documented upper
|
|
bound. Larger surveys are out of scope for the MVP.
|
|
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
import argparse
|
|
import json
|
|
import sys
|
|
from typing import Any
|
|
|
|
# Namespace map for arXiv's Atom feed. arXiv extends Atom with its own
# elements (primary_category, comment, journal_ref) under the `arxiv:`
# prefix; the core entry fields live under `atom:`. Forgetting the
# namespace prefix is the #1 arXiv API parsing bug, so it is baked in here.
NS_MAP = {
    "atom": "http://www.w3.org/2005/Atom",
    "arxiv": "http://arxiv.org/schemas/atom",
}

# Public arXiv API query endpoint; no API key required.
ARXIV_ENDPOINT = "http://export.arxiv.org/api/query"
# Hard cap on results per query, matching the skill's documented upper bound.
MAX_RESULTS_UPPER_BOUND = 50
# HTTP timeout (seconds) used for every request to the arXiv API.
DEFAULT_TIMEOUT_SECONDS = 30
|
|
|
|
|
|
# --- HTTP client with requests -> urllib fallback --------------------------
# If `requests` is importable we use it directly; otherwise we install a
# minimal urllib-backed shim under the same module-level name `requests`,
# so the rest of the file is agnostic to which client is active.

try:
    import requests  # type: ignore
except ImportError:
    import urllib.error
    import urllib.parse
    import urllib.request

    class _UrllibResponse:
        """Subset of `requests.Response` that `search()` actually touches:
        status_code, text, content, raise_for_status()."""

        def __init__(self, data: bytes, status: int) -> None:
            self._data = data
            self.status_code = status
            # errors="replace" so a bad byte never crashes response decoding.
            self.text = data.decode("utf-8", errors="replace")
            self.content = data

        def raise_for_status(self) -> None:
            # requests raises HTTPError here; RuntimeError is close enough
            # for this script since callers only catch broad Exception.
            if self.status_code >= 400:
                raise RuntimeError(f"HTTP {self.status_code}")

    class _UrllibRequestsShim:
        """Minimal requests-compatible shim using urllib.

        Only supports what arxiv_search needs: GET with query params.
        Params are encoded with quote_plus so multi-word queries work.
        """

        @staticmethod
        def get(
            url: str,
            params: dict | None = None,
            timeout: int = DEFAULT_TIMEOUT_SECONDS,
        ) -> _UrllibResponse:
            if params:
                query = urllib.parse.urlencode(params, quote_via=urllib.parse.quote_plus)
                url = f"{url}?{query}"
            # Custom User-Agent: arXiv asks API clients to identify themselves.
            req = urllib.request.Request(url, headers={"User-Agent": "deerflow-slr-skill/0.1"})
            try:
                with urllib.request.urlopen(req, timeout=timeout) as resp:
                    return _UrllibResponse(resp.read(), resp.status)
            except urllib.error.HTTPError as e:
                # Mirror requests: HTTP errors become a response object, and
                # raise_for_status() decides whether to raise.
                return _UrllibResponse(e.read(), e.code)

    requests = _UrllibRequestsShim()  # type: ignore
|
|
|
|
|
|
# --- Core query + parsing --------------------------------------------------
|
|
|
|
|
|
def _build_search_query(
|
|
query: str,
|
|
category: str | None,
|
|
start_date: str | None,
|
|
end_date: str | None,
|
|
) -> str:
|
|
"""Build arXiv's `search_query` field.
|
|
|
|
arXiv uses its own query grammar: `ti:`, `abs:`, `cat:`, `all:`, with
|
|
`AND`/`OR`/`ANDNOT` combinators. We search `all:` for the user's
|
|
topic (matches title + abstract + authors) and optionally AND it
|
|
with a category filter and a submission date range.
|
|
"""
|
|
# Wrap multi-word queries in double quotes so arXiv's Lucene parser
|
|
# treats them as a phrase. Without quotes, `all:diffusion model` is
|
|
# parsed as `all:diffusion OR model`, pulling in unrelated papers
|
|
# that merely mention the word "model".
|
|
if " " in query:
|
|
parts = [f'all:"{query}"']
|
|
else:
|
|
parts = [f"all:{query}"]
|
|
if category:
|
|
parts.append(f"cat:{category}")
|
|
if start_date or end_date:
|
|
# arXiv date range format: [YYYYMMDDHHMM TO YYYYMMDDHHMM]
|
|
lo = (start_date or "19910101").replace("-", "") + "0000"
|
|
hi = (end_date or "29991231").replace("-", "") + "2359"
|
|
parts.append(f"submittedDate:[{lo} TO {hi}]")
|
|
return " AND ".join(parts)
|
|
|
|
|
|
def _normalise_arxiv_id(raw_id: str) -> str:
|
|
"""Convert a full arXiv URL to a bare id.
|
|
|
|
Handles both modern and legacy arXiv ID formats:
|
|
- Modern: "http://arxiv.org/abs/1706.03762v5" -> "1706.03762"
|
|
- Legacy: "http://arxiv.org/abs/hep-th/9901001v1" -> "hep-th/9901001"
|
|
"""
|
|
# Extract everything after /abs/ to preserve legacy archive prefix
|
|
if "/abs/" in raw_id:
|
|
tail = raw_id.split("/abs/", 1)[1]
|
|
else:
|
|
tail = raw_id.rsplit("/", 1)[-1]
|
|
# Strip version suffix: "1706.03762v5" -> "1706.03762"
|
|
if "v" in tail:
|
|
base, _, suffix = tail.rpartition("v")
|
|
if suffix.isdigit():
|
|
return base
|
|
return tail
|
|
|
|
|
|
def _parse_entry(entry: Any) -> dict:
    """Turn one Atom <entry> element into a paper dict.

    Args:
        entry: an `xml.etree.ElementTree.Element` for an Atom <entry>.

    Returns:
        dict with keys: id, title, authors, abstract, published, updated,
        categories, pdf_url, abs_url — matching the schema in SKILL.md.
    """
    # Fix: the original imported xml.etree.ElementTree as ET here and then
    # `del ET` with a comment claiming it was "needed for type hints" — it
    # was never referenced at all. Dead import and del removed.

    def _text(path: str) -> str:
        # Missing element and empty text both collapse to "".
        node = entry.find(path, NS_MAP)
        return (node.text or "").strip() if node is not None and node.text else ""

    raw_id = _text("atom:id")
    # <id> is a full URL like http://arxiv.org/abs/1706.03762v5; callers
    # want the bare id, so normalise it.
    arxiv_id = _normalise_arxiv_id(raw_id)

    authors = [
        (a.findtext("atom:name", default="", namespaces=NS_MAP) or "").strip()
        for a in entry.findall("atom:author", NS_MAP)
    ]
    authors = [a for a in authors if a]

    categories = [c.get("term", "") for c in entry.findall("atom:category", NS_MAP) if c.get("term")]

    pdf_url = ""
    abs_url = raw_id  # default: the <id> URL doubles as the abstract page
    for link in entry.findall("atom:link", NS_MAP):
        if link.get("title") == "pdf":
            pdf_url = link.get("href", "")
        elif link.get("rel") == "alternate":
            abs_url = link.get("href", abs_url)

    # Dates come as ISO 8601 (2017-06-12T17:57:34Z). Keep the date part.
    published_raw = _text("atom:published")
    updated_raw = _text("atom:updated")
    published = published_raw.split("T", 1)[0] if published_raw else ""
    updated = updated_raw.split("T", 1)[0] if updated_raw else ""

    # Abstract (<summary>) has ragged whitespace from arXiv's formatting.
    # Collapse internal whitespace to make downstream LLM consumption easier.
    abstract = " ".join(_text("atom:summary").split())

    return {
        "id": arxiv_id,
        "title": " ".join(_text("atom:title").split()),
        "authors": authors,
        "abstract": abstract,
        "published": published,
        "updated": updated,
        "categories": categories,
        "pdf_url": pdf_url,
        "abs_url": abs_url,
    }
|
|
|
|
|
|
def search(
    query: str,
    max_results: int = 20,
    category: str | None = None,
    sort_by: str = "relevance",
    start_date: str | None = None,
    end_date: str | None = None,
) -> list[dict]:
    """Query arXiv and return a list of paper dicts.

    Args:
        query: free-text topic, e.g. "transformer attention".
        max_results: number of papers to return (clamped to 50).
        category: optional arXiv category, e.g. "cs.CL".
        sort_by: "relevance", "submittedDate", or "lastUpdatedDate".
        start_date: YYYY-MM-DD or YYYYMMDD, inclusive.
        end_date: YYYY-MM-DD or YYYYMMDD, inclusive.

    Returns:
        list of dicts, each matching the schema documented in SKILL.md.
    """
    import xml.etree.ElementTree as ET

    if max_results <= 0:
        return []
    # Clamp to the skill's documented upper bound.
    capped = min(max_results, MAX_RESULTS_UPPER_BOUND)

    resp = requests.get(
        ARXIV_ENDPOINT,
        params={
            "search_query": _build_search_query(query, category, start_date, end_date),
            "start": 0,
            "max_results": capped,
            "sortBy": sort_by,
            "sortOrder": "descending",
        },
        timeout=DEFAULT_TIMEOUT_SECONDS,
    )
    resp.raise_for_status()

    # arXiv returns Atom XML, not JSON.
    feed = ET.fromstring(resp.text)
    return [_parse_entry(entry) for entry in feed.findall("atom:entry", NS_MAP)]
|
|
|
|
|
|
# --- CLI -------------------------------------------------------------------
|
|
|
|
|
|
def _build_parser() -> argparse.ArgumentParser:
    """Construct the CLI argument parser for this script."""
    # RawDescriptionHelpFormatter keeps the example block's newlines intact.
    usage_examples = (
        "Examples:\n"
        ' python arxiv_search.py "transformer attention" --max-results 10\n'
        ' python arxiv_search.py "diffusion models" --category cs.CV --sort-by submittedDate\n'
        ' python arxiv_search.py "graph neural networks" --start-date 2023-01-01\n'
    )
    p = argparse.ArgumentParser(
        description="Query the arXiv API and emit structured paper metadata as JSON.",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog=usage_examples,
    )
    p.add_argument("query", help="free-text search topic")
    p.add_argument(
        "--max-results",
        type=int,
        default=20,
        help=f"number of papers to return (default: 20, max: {MAX_RESULTS_UPPER_BOUND})",
    )
    p.add_argument(
        "--category",
        default=None,
        help="optional arXiv category filter, e.g. cs.CL, cs.CV, stat.ML",
    )
    p.add_argument(
        "--sort-by",
        default="relevance",
        choices=["relevance", "submittedDate", "lastUpdatedDate"],
        help="sort order (default: relevance)",
    )
    p.add_argument(
        "--start-date",
        default=None,
        help="earliest submission date, YYYY-MM-DD (inclusive)",
    )
    p.add_argument(
        "--end-date",
        default=None,
        help="latest submission date, YYYY-MM-DD (inclusive)",
    )
    return p
|
|
|
|
|
|
def main() -> int:
    """CLI entry point: parse args, run the search, print JSON to stdout.

    Returns:
        0 on success; 1 on any failure (message written to stderr).
    """
    args = _build_parser().parse_args()
    try:
        # argparse dest names (query, max_results, category, sort_by,
        # start_date, end_date) mirror search()'s keyword parameters
        # exactly, so the namespace can be splatted straight through.
        papers = search(**vars(args))
    except Exception as exc:
        print(f"arxiv_search.py: {exc}", file=sys.stderr)
        return 1

    json.dump(papers, sys.stdout, ensure_ascii=False, indent=2)
    sys.stdout.write("\n")
    return 0
|
|
|
|
|
|
if __name__ == "__main__":
    # Propagate main()'s return value as the process exit status.
    sys.exit(main())
|