The HTML tools aren't very sophisticated, and maybe we should use something like BeautifulSoup instead, but it works for us™ and is much smaller.
148 lines
3.6 KiB
Python
148 lines
3.6 KiB
Python
import html.parser
|
|
import typing
|
|
from dataclasses import dataclass, field
|
|
|
|
|
|
class _NoSuchElement(AttributeError, IndexError):
    """Raised when attribute-style child lookup on a node finds nothing.

    It is an ``AttributeError`` so that ``getattr(node, name, default)``,
    ``hasattr`` and the ``copy``/``pickle`` protocols behave normally, and
    also an ``IndexError`` so existing ``except IndexError`` handlers keep
    working.
    """


@dataclass
class HtmlNode:
    """Base class for every node of the parsed tree.

    ``parent`` is the enclosing node; a root node is its own parent.
    ``children`` holds the direct child nodes in document order.

    Child elements can be reached as attributes: ``node.some_tag`` returns
    the first direct child element whose tag name, with ``-`` replaced by
    ``_``, equals ``some_tag``.
    """

    # Excluded from repr: parent and children reference each other.
    parent: "HtmlNode" = field(repr=False)
    children: typing.List["HtmlNode"]

    def __getattr__(self, name):
        """Return the first direct child element named *name*.

        Only invoked when regular attribute lookup fails.

        Raises:
            AttributeError: no direct child element matches. The raised
                exception is also an ``IndexError``.
        """
        # Read the child list through __dict__: on an instance that has no
        # ``children`` yet (copy/pickle probe half-built objects),
        # ``self.children`` would re-enter __getattr__ and recurse forever.
        for child in self.__dict__.get("children", ()):
            if (
                isinstance(child, HtmlElement)
                and child.name.replace("-", "_") == name
            ):
                return child
        raise _NoSuchElement(f"No such element: {name}")
|
|
|
|
|
|
@dataclass(init=False)
class HtmlDocument(HtmlNode):
    """Root node of a tree: it has no children yet and is its own parent."""

    def __init__(self):
        # A node whose parent is itself marks the top of the tree.
        HtmlNode.__init__(self, parent=self, children=list())
|
|
|
|
|
|
@dataclass
class HtmlElement(HtmlNode):
    """An element node: a tag name plus its attributes.

    Subscripting the element (``element["href"]``) looks the key up in
    ``attrs``.
    """

    # Tag name of the element.
    name: str
    # Attribute name -> attribute value.
    attrs: typing.Mapping[str, str]

    def __getitem__(self, name):
        """Return the value stored in ``attrs`` under *name*.

        A missing key propagates whatever the underlying mapping raises
        (``KeyError`` for a ``dict``).
        """
        attributes = self.attrs
        return attributes[name]
|
|
|
|
|
|
@dataclass
class HtmlText(HtmlNode):
    """A node holding a run of character data."""

    # The text carried by this node.
    content: str
    # Not accepted by the generated constructor and hidden from repr;
    # every text node starts with its own fresh, empty child list.
    children: typing.List["HtmlNode"] = field(
        default_factory=list, init=False, repr=False
    )
|
|
|
|
|
|
class HtmlParser(html.parser.HTMLParser):
    """Build an ``HtmlNode`` tree from the markup fed to the parser.

    The resulting tree is available as ``self.document``. Start tags open a
    new ``HtmlElement`` under the element currently open, end tags close the
    nearest open element of that name, and character data becomes
    ``HtmlText`` children.
    """

    def __init__(self):
        super().__init__()
        # HtmlDocument() already makes itself its own parent (root marker),
        # so no further parent assignment is needed here.
        self.document = HtmlDocument()
        # Innermost node that is still open; new nodes are appended to it.
        self.__current = self.document

    def handle_starttag(self, tag, attrs):
        """Open a new element under the current node and descend into it."""
        # NOTE(review): every start tag becomes the current node, so content
        # following an element that is never closed in the markup (e.g. a
        # bare ``<br>``) is nested inside it until an enclosing end tag
        # closes it. Left unchanged because callers may rely on this shape.
        parent = self.__current
        self.__current = HtmlElement(
            name=tag, attrs=dict(attrs), parent=parent, children=[]
        )
        parent.children.append(self.__current)

    def handle_endtag(self, tag):
        """Close the nearest open element named *tag*.

        Open elements nested inside it are closed implicitly. An end tag
        that matches no open element is ignored; previously it unwound the
        parser to the document root, so all following content was attached
        to the root instead of the element that was actually open.
        """
        node = self.__current
        while not is_root(node):
            if node.name == tag:
                self.__current = node.parent
                return
            node = node.parent
        # No open element matches: leave the current position untouched.

    def handle_data(self, data):
        """Append character data to the current node.

        Data that directly follows another text child is merged into it, so
        a run of text delivered in several pieces ends up as one node.
        """
        cur = self.__current
        if cur.children and type(cur.children[-1]) is HtmlText:
            cur.children[-1].content += data
        else:
            cur.children.append(HtmlText(parent=cur, content=data))
|
|
|
|
|
|
def is_root(node: HtmlNode):
    """Tell whether *node* is the top of its tree, i.e. its own parent."""
    return node.parent is node
|
|
|
|
|
|
def root(node: HtmlNode) -> HtmlNode:
    """Follow ``parent`` links upward from *node* and return the root node."""
    current = node
    while True:
        if is_root(current):
            return current
        current = current.parent
|
|
|
|
|
|
# Alias for the builtin ``next``, bound before the function below shadows
# that name at module level.
iter_next = next


def next(node: HtmlNode) -> typing.Optional[HtmlNode]:
    """Return the sibling that directly follows *node*.

    Returns ``None`` for a root node, for the last child of its parent, and
    when *node* is not found among its parent's children.
    """
    if is_root(node):
        return None
    siblings = node.parent.children
    for position, sibling in enumerate(siblings):
        if sibling is node:
            following = position + 1
            if following < len(siblings):
                return siblings[following]
            return None
    return None  # node is not listed under its parent
|
|
|
|
|
|
def prev(node: HtmlNode) -> typing.Optional[HtmlNode]:
    """Return the sibling that directly precedes *node*.

    Returns ``None`` for a root node, for the first child of its parent, and
    when *node* is not found among its parent's children (for example after
    ``detach(node)``). In that last case the parent's last child used to be
    returned by mistake.
    """
    if is_root(node):
        return None
    before = None
    for sibling in node.parent.children:
        if sibling is node:
            return before
        before = sibling
    # Reached only when node is not listed under its parent.
    return None
|
|
|
|
|
|
def text(node: HtmlNode) -> str:
    """Return the text of *node* and all its descendants, in document order.

    A text node yields its own content (previously an empty string, because
    only the children of *node* were inspected). The tree is walked with an
    explicit stack so deeply nested trees cannot exhaust the recursion
    limit.
    """
    parts = []
    pending = [node]
    while pending:
        current = pending.pop()
        if isinstance(current, HtmlText):
            parts.append(current.content)
        else:
            # Reversed so that the first child is popped first.
            pending.extend(reversed(current.children))
    return "".join(parts)
|
|
|
|
|
|
def find(
    node: "HtmlNode",
    name=None,
    *,
    pred: typing.Optional[typing.Callable[["HtmlNode"], bool]] = None,
    class_=None,
    id=None,
) -> typing.Iterable["HtmlNode"]:
    """Yield matching nodes from the subtree rooted at *node*.

    Nodes are visited in document order, starting with *node* itself. A
    matching node is yielded and its descendants are not searched, so
    matches nested inside another match are not reported.

    Args:
        node: Root of the subtree to search.
        name: Match elements with this tag name.
        pred: Arbitrary predicate; mutually exclusive with the other
            criteria.
        class_: Match elements whose ``class`` attribute contains this
            whitespace-separated token.
        id: Match elements whose ``id`` attribute equals this value.

    When several of *name*, *class_* and *id* are given, an element must
    satisfy all of them. Previously only the last one given took effect.

    Raises:
        AssertionError: on first iteration, if no criterion is given or if
            *pred* is combined with another criterion.
    """
    assert name or pred or class_ or id, "find() needs at least one criterion"
    assert not (pred and (name or class_ or id)), (
        "pred cannot be combined with name, class_ or id"
    )

    if pred:
        matches = pred
    else:

        def matches(candidate):
            # Only elements carry a tag name and attributes.
            if not isinstance(candidate, HtmlElement):
                return False
            if name and candidate.name != name:
                return False
            if class_ and class_ not in candidate.attrs.get("class", "").split():
                return False
            if id and candidate.attrs.get("id") != id:
                return False
            return True

    # Explicit stack instead of recursive generators: deep trees would
    # otherwise exceed the interpreter's recursion limit.
    pending = [node]
    while pending:
        current = pending.pop()
        if matches(current):
            yield current
        else:
            # Reversed so that the first child is popped first.
            pending.extend(reversed(current.children))
|
|
|
|
|
|
def detach(node: HtmlNode):
    """Drop *node* from its parent's list of children.

    A root node is left alone. The parent receives a new child list without
    *node*; ``node.parent`` itself is not modified.
    """
    if not is_root(node):
        owner = node.parent
        remaining = []
        for sibling in owner.children:
            if sibling is not node:
                remaining.append(sibling)
        owner.children = remaining
|
|
|
|
|
|
def parse_html(html) -> HtmlNode:
    """Parse the markup in *html* and return the root of the resulting tree.

    Args:
        html: The complete markup as a string.

    Returns:
        The ``document`` node built by ``HtmlParser``.
    """
    parser = HtmlParser()
    parser.feed(html)
    # feed() may hold back trailing input while waiting for more data (for
    # instance text ending in something that looks like an unfinished
    # character reference, such as "AT&T"). close() forces that buffered
    # remainder to be processed so it is not silently dropped.
    parser.close()
    return parser.document
|