"""HTML-to-text extraction with security-focused stripping.""" from html.parser import HTMLParser from typing import Optional class SecureTextExtractor(HTMLParser): """Extract visible text while stripping potentially dangerous elements. Based on OpenClaw's fetch.sh implementation. """ DANGEROUS_TAGS = { 'script', 'style', 'noscript', 'header', 'footer', 'nav', 'aside', 'iframe', 'object', 'embed', 'form', } def __init__(self): super().__init__() self.text = [] self.skip_depth = 0 def handle_starttag(self, tag, attrs): if tag in self.DANGEROUS_TAGS: self.skip_depth += 1 def handle_endtag(self, tag): if tag in self.DANGEROUS_TAGS and self.skip_depth > 0: self.skip_depth -= 1 def handle_data(self, data): if self.skip_depth == 0: self.text.append(data) def get_text(self) -> str: return ' '.join(self.text) def extract_secure_text(html: str, max_chars: Optional[int] = None) -> str: """Extract clean text from HTML. Args: html: Raw HTML content max_chars: Optional maximum length Returns: Clean text without dangerous elements """ extractor = SecureTextExtractor() extractor.feed(html) text = extractor.get_text() # Collapse whitespace import re text = re.sub(r'[ \t]+', ' ', text) text = re.sub(r'\n{3,}', '\n\n', text) text = text.strip() if max_chars and len(text) > max_chars: text = text[:max_chars-3] + '...' return text