hotdog/hotdog/html.py
ducklet c58423ae58 add (parsed) html to Message
The HTML tools aren't very sophisticated, and maybe we should use something
like BeautifulSoup instead, but it works for us™ and is much smaller.
2020-11-10 21:30:25 +01:00

148 lines
3.6 KiB
Python

import html.parser
import typing
from dataclasses import dataclass, field
class _NoSuchElementError(AttributeError, IndexError):
    """Raised when attribute-style child lookup finds no matching element.

    Deriving from ``AttributeError`` makes ``hasattr()`` and
    ``getattr(node, name, default)`` behave normally; ``IndexError`` stays a
    base class so existing ``except IndexError`` handlers keep working.
    """


@dataclass
class HtmlNode:
    """Base class for all nodes of the parsed tree.

    Direct child elements can be reached as attributes: ``node.div`` returns
    the first child ``HtmlElement`` named ``div``.  Dashes in element names
    are matched as underscores (``node.my_tag`` finds ``<my-tag>``).
    """

    # Owning node; excluded from repr() so printing a node does not walk
    # back up through its ancestors.
    parent: "HtmlNode" = field(repr=False)
    # Child nodes in document order.
    children: typing.List["HtmlNode"]

    def __getattr__(self, name):
        """Return the first direct child element called *name*.

        Only invoked when normal attribute lookup fails.

        Raises:
            _NoSuchElementError: no direct child element has that name.  It
                is both an ``AttributeError`` and an ``IndexError``.
        """
        # Read through __dict__ rather than self.children: on an instance
        # whose fields are not set yet (e.g. while copy/pickle rebuild it)
        # self.children would re-enter __getattr__ and recurse endlessly.
        children = self.__dict__.get("children", ())
        for c in children:
            if type(c) is HtmlElement and c.name.replace("-", "_") == name:
                return c
        raise _NoSuchElementError(f"No such element: {name}")
@dataclass(init=False)
class HtmlDocument(HtmlNode):
    """Root node of a tree: starts without children and is its own parent."""

    def __init__(self):
        # The document points at itself as parent; it starts out empty.
        no_children = []
        super().__init__(children=no_children, parent=self)
@dataclass
class HtmlElement(HtmlNode):
    """A tag in the tree, carrying its name and attributes.

    Subscripting an element reads one of its attributes, so ``el["href"]``
    is the same as ``el.attrs["href"]``.
    """

    # Tag name of the element.
    name: str
    # Attribute name -> attribute value.
    attrs: typing.Mapping[str, str]

    def __getitem__(self, name):
        """Return the value stored in ``attrs`` under *name*."""
        attributes = self.attrs
        return attributes[name]
@dataclass
class HtmlText(HtmlNode):
    """A run of character data in the tree."""

    # The text itself.
    content: str
    # Not a constructor argument and hidden from repr(); every text node
    # gets its own fresh, empty list.
    children: typing.List[HtmlNode] = field(
        default_factory=list, init=False, repr=False
    )
class HtmlParser(html.parser.HTMLParser):
    """Build an ``HtmlDocument`` tree from the markup fed to the parser.

    The resulting tree is available as ``self.document``.  The parser keeps a
    cursor on the element that is currently open; start tags descend into a
    new element, end tags climb back out, and character data is attached to
    the element under the cursor.

    This is a deliberately small tree builder: apart from void elements it
    does not infer implied end tags (``<p>a<p>b`` nests the second ``<p>``
    inside the first).
    """

    # Elements that can never have content or an end tag (the HTML standard's
    # void elements, plus the legacy void element "param").  The parser must
    # not descend into them, otherwise ``<br>`` would swallow every sibling
    # that follows it.
    _VOID_ELEMENTS = frozenset(
        {
            "area",
            "base",
            "br",
            "col",
            "embed",
            "hr",
            "img",
            "input",
            "link",
            "meta",
            "param",
            "source",
            "track",
            "wbr",
        }
    )

    def __init__(self):
        super().__init__()
        # HtmlDocument() already makes itself its own parent, which is the
        # marker is_root() looks for.
        self.document = HtmlDocument()
        self.__current = self.document

    def handle_starttag(self, tag, attrs):
        """Append a new element under the cursor and descend into it."""
        parent = self.__current
        element = HtmlElement(
            name=tag, attrs=dict(attrs), parent=parent, children=[]
        )
        parent.children.append(element)
        # Void elements have no content, so the cursor stays on the parent.
        if tag not in self._VOID_ELEMENTS:
            self.__current = element

    def handle_endtag(self, tag):
        """Close the nearest open element named *tag*.

        Elements opened inside it that were never closed are closed
        implicitly.  An end tag that matches no open element (a stray
        ``</span>``, or the end half of ``<br/>``) is ignored instead of
        unwinding the cursor to the document root.
        """
        node = self.__current
        while not is_root(node):
            if node.name == tag:
                self.__current = node.parent
                return
            node = node.parent

    def handle_data(self, data):
        """Attach character data to the element under the cursor.

        Consecutive chunks are merged into one ``HtmlText`` node.
        """
        cur = self.__current
        if cur.children and type(cur.children[-1]) is HtmlText:
            cur.children[-1].content += data
        else:
            cur.children.append(HtmlText(parent=cur, content=data))
def is_root(node: HtmlNode):
    """Tell whether *node* is a root, i.e. a node that is its own parent."""
    return node.parent is node
def root(node: HtmlNode) -> HtmlNode:
    """Follow ``parent`` links upwards from *node* and return the root."""
    current = node
    while True:
        if is_root(current):
            return current
        current = current.parent
# Keep a handle on the builtin next(); the function defined right below
# reuses that name at module level.
iter_next = next


def next(node: HtmlNode) -> typing.Optional[HtmlNode]:
    """Return the sibling that directly follows *node*.

    Returns ``None`` for a root node, for the last child of its parent, and
    when *node* is not found among its parent's children.  Siblings are
    compared by identity, not equality.
    """
    if is_root(node):
        return None
    siblings = iter(node.parent.children)
    # any() stops consuming right after it has seen `node`, so whatever the
    # iterator yields next is the following sibling.
    if any(sibling is node for sibling in siblings):
        return iter_next(siblings, None)
    return None
def prev(node: HtmlNode) -> typing.Optional[HtmlNode]:
    """Return the sibling that directly precedes *node*.

    Returns ``None`` for a root node and for the first child of its parent.
    Siblings are compared by identity.  If *node* is not among its parent's
    children, the parent's last child (or ``None``) is returned.
    """
    if is_root(node):
        return None
    previous = None
    for sibling in node.parent.children:
        if sibling is node:
            return previous
        previous = sibling
    return previous
def text(node: HtmlNode) -> str:
    """Return the concatenated text content of *node* in document order.

    For a text node this is its own content; for any other node it is the
    content of every text node below it.  The tree is walked with an
    explicit stack, so deeply nested documents cannot exhaust the
    interpreter's recursion limit.
    """
    parts = []
    pending = [node]
    while pending:
        current = pending.pop()
        if type(current) is HtmlText:
            parts.append(current.content)
        else:
            # Reversed so the first child is popped, and thus emitted, first.
            pending.extend(reversed(current.children))
    return "".join(parts)
def find(
    node: HtmlNode,
    name=None,
    *,
    pred: typing.Optional[typing.Callable[[HtmlNode], bool]] = None,
    class_=None,
    id=None,
) -> typing.Iterable[HtmlNode]:
    """Lazily yield the outermost nodes at or below *node* that match.

    The search runs in document order and starts with *node* itself.  Once a
    node matches it is yielded and its descendants are not searched, so
    matches nested inside another match are not reported.

    Args:
        node: Where the search starts.
        name: Match elements with this tag name.
        pred: Arbitrary predicate called with every visited node.  Cannot be
            combined with the other criteria.
        class_: Match elements whose ``class`` attribute contains this class.
        id: Match elements whose ``id`` attribute equals this value.

    When several of *name*, *class_* and *id* are given, an element has to
    satisfy all of them.

    Raises:
        ValueError: if no criterion is given, or if *pred* is combined with
            *name*, *class_* or *id*.  Raised at call time, not on first
            iteration.
    """
    if not (name or pred or class_ or id):
        raise ValueError("find() needs one of: name, pred, class_, id")
    if pred and (name or class_ or id):
        raise ValueError("pred cannot be combined with name, class_ or id")
    matcher = pred or _element_matcher(name, class_, id)
    return _iter_outermost_matches(node, matcher)


def _element_matcher(name, class_, id):
    """Build a predicate requiring every given (truthy) criterion to hold."""

    def matches(n):
        if type(n) is not HtmlElement:
            return False
        if name and n.name != name:
            return False
        # A valueless attribute (``<div class>``) is stored as None.
        if class_ and class_ not in (n.attrs.get("class") or "").split():
            return False
        if id and n.attrs.get("id") != id:
            return False
        return True

    return matches


def _iter_outermost_matches(node, pred):
    """Yield matching nodes in document order without descending into them."""
    pending = [node]
    while pending:
        current = pending.pop()
        if pred(current):
            yield current
        else:
            # Reversed so the first child is popped, and thus visited, first.
            pending.extend(reversed(current.children))
def detach(node: HtmlNode):
    """Take *node* out of its parent's list of children.

    Root nodes are left alone.  The parent receives a newly built list (the
    old list object is not mutated) and ``node.parent`` is not changed.
    """
    if not is_root(node):
        owner = node.parent
        owner.children = [
            sibling for sibling in owner.children if sibling is not node
        ]
def parse_html(html) -> HtmlNode:
    """Parse the markup string *html* and return the resulting document node.

    Note that the parameter shadows the ``html`` package inside this
    function; only the parser class defined in this module is used here.
    """
    parser = HtmlParser()
    parser.feed(html)
    # feed() may hold back trailing input it cannot classify yet (for
    # example text ending in an unterminated "&..." as in "AT&T", or a
    # truncated tag).  close() processes that buffered rest as end-of-file
    # so it is not silently dropped.
    parser.close()
    return parser.document