"""HTML-to-text extraction with security-focused stripping."""

from html.parser import HTMLParser
from typing import Optional


class SecureTextExtractor(HTMLParser):
    """Extract visible text while stripping potentially dangerous elements.

    Text found inside any tag listed in ``DANGEROUS_TAGS`` (at any nesting
    depth) is discarded; all other character data is collected in order.

    Based on OpenClaw's fetch.sh implementation.

    Attributes:
        text: Collected text fragments, in document order.
        skip_depth: Number of currently open ``DANGEROUS_TAGS`` elements;
            character data is kept only while this is zero.
    """

    # Elements whose character data is dropped from the output.
    DANGEROUS_TAGS = {
        'script', 'style', 'noscript',
        'header', 'footer', 'nav', 'aside',
        'iframe', 'object', 'embed', 'form',
    }

    # HTML void elements have no content and normally no end tag.  Counting
    # them in ``skip_depth`` would leave the counter stuck above zero (e.g.
    # after ``<embed src=...>``) and silently drop the rest of the document.
    _VOID_TAGS = frozenset({
        'area', 'base', 'br', 'col', 'embed', 'hr', 'img', 'input',
        'link', 'meta', 'param', 'source', 'track', 'wbr',
    })

    def __init__(self):
        super().__init__()
        self.text = []
        self.skip_depth = 0

    def _opens_skipped_region(self, tag):
        """Return True if *tag* is a non-void element whose content is dropped."""
        return tag in self.DANGEROUS_TAGS and tag not in self._VOID_TAGS

    def handle_starttag(self, tag, attrs):
        """Enter a skipped region when a dangerous, non-void tag opens."""
        if self._opens_skipped_region(tag):
            self.skip_depth += 1

    def handle_endtag(self, tag):
        """Leave a skipped region when a dangerous, non-void tag closes."""
        # The ``> 0`` guard keeps a stray end tag from driving the depth
        # negative, which would otherwise never return to zero.
        if self._opens_skipped_region(tag) and self.skip_depth > 0:
            self.skip_depth -= 1

    def handle_data(self, data):
        """Record character data unless it lies inside a skipped region."""
        if self.skip_depth == 0:
            self.text.append(data)

    def get_text(self) -> str:
        """Return the collected fragments joined by single spaces."""
        return ' '.join(self.text)


def extract_secure_text(html: str, max_chars: Optional[int] = None) -> str:
    """Extract clean text from HTML.

    The markup is run through ``SecureTextExtractor``, runs of spaces/tabs
    are collapsed to a single space, runs of three or more newlines are
    collapsed to two, and the result is stripped.

    Args:
        html: Raw HTML content
        max_chars: Optional maximum length of the returned text. ``None``
            or a non-positive value disables truncation. When the text is
            cut and there is room for it, the result ends with ``'...'``
            (counted within ``max_chars``).

    Returns:
        Clean text without dangerous elements, never longer than
        ``max_chars`` when a positive limit is given.
    """
    import re

    extractor = SecureTextExtractor()
    extractor.feed(html)
    # close() forces the parser to process any buffered input; without it,
    # trailing text such as "AT&T" (unterminated '&') is held back waiting
    # for more data and never reaches handle_data.
    extractor.close()
    text = extractor.get_text()

    # Collapse whitespace
    text = re.sub(r'[ \t]+', ' ', text)
    text = re.sub(r'\n{3,}', '\n\n', text)
    text = text.strip()

    if max_chars is not None and max_chars > 0 and len(text) > max_chars:
        if max_chars >= 3:
            text = text[:max_chars - 3] + '...'
        else:
            # No room for the ellipsis: a negative slice bound here would
            # produce output longer than the limit, so cut hard instead.
            text = text[:max_chars]

    return text