# hotdog/hotdog/html.py

import html.parser
import typing
from dataclasses import dataclass, field


class _NoSuchElementError(AttributeError, IndexError):
    """Raised when attribute-style child lookup finds no matching element.

    It is an AttributeError so that hasattr(), getattr() with a default and
    the copy/pickle machinery treat a miss as an ordinary missing attribute,
    and an IndexError so that callers catching IndexError keep working.
    """


@dataclass
class HtmlNode:
    """Base class for every node of the parsed tree.

    Direct child elements can be reached as attributes: ``node.body`` returns
    the first direct child element named ``body``. Hyphens in element names
    are written as underscores (``node.my_tag`` matches ``<my-tag>``).
    """

    # Link to the enclosing node. Left out of repr(); children point back at
    # their parent, so printing it would revisit this node.
    # NOTE(review): the generated __eq__ still compares ``parent``.
    parent: "HtmlNode" = field(repr=False)
    # Child nodes in document order.
    children: typing.List["HtmlNode"]

    def __getattr__(self, name):
        """Return the first direct child element whose name matches *name*.

        Only called when normal attribute lookup fails.

        Raises:
            AttributeError (also an IndexError): no direct child element
                has that name.
        """
        # Read the field from the instance dict: ``self.children`` would
        # re-enter __getattr__ forever while the field is not set yet (for
        # example on an instance being rebuilt by copy or pickle).
        for c in self.__dict__.get("children", ()):
            if type(c) is HtmlElement and c.name.replace("-", "_") == name:
                return c
        raise _NoSuchElementError(f"No such element: {name}")


@dataclass(init=False)
class HtmlDocument(HtmlNode):
    """Top-level node of a tree; it is created empty and is its own parent."""

    def __init__(self):
        # Fields are passed in declaration order: parent first, then children.
        no_children = []
        super().__init__(self, no_children)


@dataclass
class HtmlElement(HtmlNode):
    """An element node: a tag name together with its attribute mapping."""

    # Tag name as given by whoever constructs the element.
    name: str
    # Attribute name -> attribute value.
    attrs: typing.Mapping[str, str]

    def __getitem__(self, name):
        """Return the value of attribute *name* (``element["href"]``).

        A missing attribute raises whatever the ``attrs`` mapping raises for
        an unknown key.
        """
        attributes = self.attrs
        return attributes[name]


@dataclass
class HtmlText(HtmlNode):
    """A run of character data inside the tree."""

    # The text itself.
    content: str
    # Redeclared so that it is not an __init__ argument, is hidden from
    # repr(), and starts out as a fresh empty list on every instance.
    children: typing.List[HtmlNode] = field(
        default_factory=list, init=False, repr=False
    )


class HtmlParser(html.parser.HTMLParser):
    """Build a node tree from the markup fed to the parser.

    The result is available as ``self.document`` (an HtmlDocument). Start
    tags become HtmlElement nodes, character data becomes HtmlText nodes.
    """

    # Elements that have neither content nor an end tag in HTML. They are
    # added to the tree but never become the current insertion point,
    # otherwise everything after e.g. <meta> or <br> would nest inside it.
    _VOID_ELEMENTS = frozenset(
        {
            "area",
            "base",
            "br",
            "col",
            "embed",
            "hr",
            "img",
            "input",
            "link",
            "meta",
            "source",
            "track",
            "wbr",
        }
    )

    def __init__(self):
        super().__init__()
        self.document = HtmlDocument()
        # Node that receives the next child; starts at the document root.
        self.__current = self.document

    def handle_starttag(self, tag, attrs):
        """Append a new element to the current node and descend into it.

        Void elements are appended but not descended into.
        """
        parent = self.__current
        element = HtmlElement(
            name=tag, attrs=dict(attrs), parent=parent, children=[]
        )
        parent.children.append(element)
        if tag not in self._VOID_ELEMENTS:
            self.__current = element

    def handle_endtag(self, tag):
        """Close the nearest open element named *tag*.

        Open elements nested inside it are closed implicitly. An end tag
        with no matching open element is ignored, so a stray ``</x>`` (or
        the end half of ``<br/>``) does not unwind the tree to the root.
        """
        node = self.__current
        while not is_root(node):
            if node.name == tag:
                self.__current = node.parent
                return
            node = node.parent

    def handle_data(self, data):
        """Add character data to the current node.

        Data that directly follows another text node is appended to that
        node instead of creating a new one.
        """
        cur = self.__current
        if cur.children and type(cur.children[-1]) is HtmlText:
            cur.children[-1].content += data
        else:
            cur.children.append(HtmlText(parent=cur, content=data))


def is_root(node: HtmlNode):
    """Tell whether *node* is a tree root, i.e. a node that is its own parent."""
    return node.parent is node


def root(node: HtmlNode) -> HtmlNode:
    """Follow parent links upward from *node* and return the root reached."""
    current = node
    while True:
        if is_root(current):
            return current
        current = current.parent


# Keep a handle on the builtin before the module-level ``next`` below
# shadows it.
iter_next = next


def next(node: HtmlNode) -> typing.Optional[HtmlNode]:
    """Return the sibling that follows *node*, or None.

    None is returned for a root node, for a last child, and when *node* is
    not found among its parent's children. Siblings are matched by identity.
    """
    if is_root(node):
        return None
    siblings = node.parent.children
    # Index of the first entry that is this very node, if any.
    position = iter_next(
        (index for index, sibling in enumerate(siblings) if sibling is node),
        None,
    )
    if position is None or position + 1 >= len(siblings):
        return None
    return siblings[position + 1]


def prev(node: HtmlNode) -> typing.Optional[HtmlNode]:
    """Return the sibling that precedes *node*, or None.

    None is returned for a root node, for a first child, and when *node* is
    not found among its parent's children. Siblings are matched by identity.
    """
    if is_root(node):
        return None
    previous = None
    for sibling in node.parent.children:
        if sibling is node:
            return previous
        previous = sibling
    # The node is not listed by its parent (detach() leaves ``node.parent``
    # set), so it has no previous sibling; falling through with ``previous``
    # would wrongly hand back the parent's last child.
    return None


def text(node: HtmlNode) -> str:
    """Return the text below *node*, concatenated in child order.

    Text children contribute their content; every other child contributes
    the text found beneath it.
    """
    pieces = []
    for child in node.children:
        if type(child) is HtmlText:
            pieces.append(child.content)
        else:
            pieces.append(text(child))
    return "".join(pieces)


def find(
    node: "HtmlNode",
    name=None,
    *,
    pred: typing.Optional[typing.Callable[["HtmlNode"], bool]] = None,
    class_=None,
    id=None,
) -> typing.Iterable["HtmlNode"]:
    """Yield the outermost matching nodes at or below *node*.

    A matching node is yielded and its subtree is not searched further;
    the children of a non-matching node are searched in order. This is a
    generator, so nothing (including argument checks) runs before the first
    item is requested.

    Args:
        node: Where the search starts; it may itself be a match.
        name: Match elements with this tag name.
        pred: Arbitrary predicate; cannot be combined with the other
            criteria.
        class_: Match elements whose ``class`` attribute lists this class.
        id: Match elements whose ``id`` attribute equals this value.

    When several of *name*, *class_* and *id* are given, an element must
    satisfy all of them.

    Raises:
        AssertionError: no criterion was given, or *pred* was combined with
            another criterion.
    """
    assert name or pred or class_ or id, "find() needs a criterion"
    assert not (
        pred and (name or class_ or id)
    ), "pred cannot be combined with name/class_/id"

    if pred:
        matches = pred
    else:

        def matches(n):
            # Only elements carry a name and attributes.
            if type(n) is not HtmlElement:
                return False
            if name and n.name != name:
                return False
            # ``or ""``: an attribute written without a value is None.
            if class_ and class_ not in (n.attrs.get("class") or "").split():
                return False
            if id and n.attrs.get("id") != id:
                return False
            return True

    if matches(node):
        yield node
    else:
        for child in node.children:
            yield from find(child, pred=matches)


def detach(node: HtmlNode):
    """Remove the node from its parent.

    The parent receives a new child list without *node* (matched by
    identity); ``node.parent`` itself is left unchanged. A root node is
    left as it is.
    """
    if not is_root(node):
        parent = node.parent
        remaining = []
        for child in parent.children:
            if child is not node:
                remaining.append(child)
        parent.children = remaining


def parse_html(html) -> HtmlNode:
    """Parse the markup in *html* and return the root of the resulting tree.

    Args:
        html: The complete document (or fragment) as a string.

    Returns:
        The parser's ``document`` node.
    """
    parser = HtmlParser()
    parser.feed(html)
    # feed() only processes what it can be sure is complete and buffers the
    # rest; close() forces the buffered remainder (for example trailing text
    # that ends in something looking like an unfinished character reference)
    # through the handlers so it is not lost.
    parser.close()
    return parser.document