Vendored deer-flow upstream (bytedance/deer-flow) plus prompt-injection hardening: - New deerflow.security package: content_delimiter, html_cleaner, sanitizer (8 layers — invisible chars, control chars, symbols, NFC, PUA, tag chars, horizontal whitespace collapse with newline/tab preservation, length cap) - New deerflow.community.searx package: web_search, web_fetch, image_search backed by a private SearX instance, every external string sanitized and wrapped in <<<EXTERNAL_UNTRUSTED_CONTENT>>> delimiters - All native community web providers (ddg_search, tavily, exa, firecrawl, jina_ai, infoquest, image_search) replaced with hard-fail stubs that raise NativeWebToolDisabledError at import time, so a misconfigured tool.use path fails loud rather than silently falling back to unsanitized output - Native client back-doors (jina_client.py, infoquest_client.py) stubbed too - Native-tool tests quarantined under tests/_disabled_native/ (collect_ignore_glob via local conftest.py) - Sanitizer Layer 7 fix: only collapse horizontal whitespace, preserve newlines and tabs so list/table structure survives - Hardened runtime config.yaml references only the searx-backed tools - Factory overlay (backend/) kept in sync with deer-flow tree as a reference / source See HARDENING.md for the full audit trail and verification steps.
63 lines
1.6 KiB
Python
63 lines
1.6 KiB
Python
"""HTML-to-text extraction with security-focused stripping."""
|
|
|
|
import re
from html.parser import HTMLParser
from typing import Optional
|
|
|
|
|
|
class SecureTextExtractor(HTMLParser):
    """Extract visible text while stripping potentially dangerous elements.

    Based on OpenClaw's fetch.sh implementation.
    """

    # Tags whose entire content is dropped: executable/markup payloads
    # (script, style, noscript, iframe, object, embed, form) and page
    # chrome (header, footer, nav, aside).
    DANGEROUS_TAGS = {
        'script', 'style', 'noscript',
        'header', 'footer', 'nav', 'aside',
        'iframe', 'object', 'embed', 'form',
    }

    def __init__(self):
        super().__init__()
        # Visible text fragments collected while outside dangerous tags.
        self.text = []
        # Nesting count of currently-open dangerous tags; text is kept
        # only when this is zero.
        self.skip_depth = 0

    def handle_starttag(self, tag, attrs):
        # Entering a dangerous element deepens the skip region.
        if tag in self.DANGEROUS_TAGS:
            self.skip_depth += 1

    def handle_endtag(self, tag):
        # Leaving a dangerous element; the depth guard protects against
        # stray close tags with no matching open tag.
        if self.skip_depth > 0 and tag in self.DANGEROUS_TAGS:
            self.skip_depth -= 1

    def handle_data(self, data):
        # Keep character data only when not nested in a dangerous tag.
        if not self.skip_depth:
            self.text.append(data)

    def get_text(self) -> str:
        """Return all collected fragments joined by single spaces."""
        return ' '.join(self.text)
|
|
|
|
|
|
def extract_secure_text(html: str, max_chars: Optional[int] = None) -> str:
    """Extract clean text from HTML.

    Args:
        html: Raw HTML content
        max_chars: Optional maximum length of the returned string;
            falsy values (None, 0) disable truncation

    Returns:
        Clean text without dangerous elements, whitespace-collapsed
        and truncated to ``max_chars`` with a trailing ``...`` marker
        when a limit is given.
    """
    extractor = SecureTextExtractor()
    extractor.feed(html)
    text = extractor.get_text()

    # Collapse runs of horizontal whitespace to one space; cap blank
    # lines at a single empty line so block structure survives.
    text = re.sub(r'[ \t]+', ' ', text)
    text = re.sub(r'\n{3,}', '\n\n', text)
    text = text.strip()

    if max_chars and len(text) > max_chars:
        # Reserve room for the ellipsis. Clamp at 0 so a tiny limit
        # (max_chars <= 3) can't produce a negative slice index, which
        # previously returned almost the entire string instead of
        # truncating it.
        text = text[:max(max_chars - 3, 0)] + '...'

    return text