add (parsed) html to Message

The html tools aren't very sophisticated and maybe we should use something
like BeautifulSoup instead, but it works for us™ and is much smaller.
This commit is contained in:
ducklet 2020-11-10 21:30:25 +01:00
parent d275da418f
commit c58423ae58
3 changed files with 164 additions and 4 deletions

View file

@ -140,7 +140,7 @@ class Bot:
log.info(f"#{room.display_name} <{room.user_name(event.sender)}> {event.body}") log.info(f"#{room.display_name} <{room.user_name(event.sender)}> {event.body}")
msg = Message(self, event.body, room, event) msg = Message(self, room, event)
tasks = {} tasks = {}
for h in self.message_handlers: for h in self.message_handlers:

148
hotdog/html.py Normal file
View file

@ -0,0 +1,148 @@
import html.parser
import typing
from dataclasses import dataclass, field
class NoSuchElementError(AttributeError, IndexError):
    """Raised when a node has no direct child element with the requested name.

    Derives from ``AttributeError`` so that ``hasattr()``, ``getattr()`` with a
    default, ``copy`` and ``pickle`` behave normally on nodes, and from
    ``IndexError`` so that existing ``except IndexError`` handlers keep working.
    """


@dataclass
class HtmlNode:
    """Base class for the nodes of a parsed HTML tree.

    Direct child elements can be reached by attribute access: ``node.div``
    returns the first direct child element named ``div``.  Dashes in element
    names are written as underscores, so ``node.my_tag`` matches ``<my-tag>``.

    NOTE(review): the dataclass-generated ``__eq__`` compares ``parent`` as
    well as ``children``; prefer identity (``is``) when comparing nodes.
    """

    # The containing node.  Left out of the repr.
    parent: "HtmlNode" = field(repr=False)
    # The nodes directly below this one, in document order.
    children: typing.List["HtmlNode"]

    def __getattr__(self, name):
        """Return the first direct child element called `name`.

        Only invoked for attributes that normal lookup did not find.

        Raises:
            NoSuchElementError: if there is no such child element.  It is both
                an ``AttributeError`` and an ``IndexError``.
        """
        # Read ``children`` straight from the instance dict: on a half-built
        # instance (as created by copy/unpickle) the attribute does not exist
        # yet and ``self.children`` would re-enter this method forever.
        for c in self.__dict__.get("children", ()):
            if type(c) is HtmlElement and c.name.replace("-", "_") == name:
                return c
        raise NoSuchElementError(f"No such element: {name}")
@dataclass(init=False)
class HtmlDocument(HtmlNode):
    """Top-level node of a tree: created empty and set up as its own parent."""

    def __init__(self):
        # The generated dataclass __init__ is disabled (init=False); populate
        # the inherited fields through the base-class initializer instead.
        no_children = []
        super().__init__(children=no_children, parent=self)
@dataclass
class HtmlElement(HtmlNode):
    """A tag in the tree, with its name and its attributes.

    Subscripting an element looks up an attribute: ``element["href"]``.
    """

    name: str
    attrs: typing.Mapping[str, str]

    def __getitem__(self, name):
        """Return the value of the attribute `name` by indexing ``attrs``."""
        attributes = self.attrs
        return attributes[name]
@dataclass
class HtmlText(HtmlNode):
    """A run of character data in the tree."""

    content: str
    # The inherited ``children`` field is redeclared so that it is not a
    # constructor argument, is left out of the repr, and starts out as a
    # fresh empty list for every instance.
    children: typing.List[HtmlNode] = field(
        default_factory=list, init=False, repr=False
    )
class HtmlParser(html.parser.HTMLParser):
    """Build a tree of nodes from the HTML fed to the parser.

    The result is available as ``self.document``, an ``HtmlDocument`` whose
    descendants are ``HtmlElement`` and ``HtmlText`` nodes.
    """

    # Elements that never have content or an end tag.  They must not become
    # the current node, or everything following them would end up inside.
    VOID_ELEMENTS = frozenset(
        {
            "area",
            "base",
            "br",
            "col",
            "embed",
            "hr",
            "img",
            "input",
            "link",
            "meta",
            "param",
            "source",
            "track",
            "wbr",
        }
    )

    def __init__(self):
        super().__init__()
        # HtmlDocument() already makes itself its own parent.
        self.document = HtmlDocument()
        # The innermost element that is still open; new nodes are added to it.
        self.__current = self.document

    def handle_starttag(self, tag, attrs):
        """Append a new element to the current node and, unless void, enter it."""
        parent = self.__current
        element = HtmlElement(
            name=tag, attrs=dict(attrs), parent=parent, children=[]
        )
        parent.children.append(element)
        if tag not in self.VOID_ELEMENTS:
            self.__current = element

    def handle_endtag(self, tag):
        """Close the innermost open element named `tag`.

        Open elements nested inside it are closed implicitly.  An end tag
        that matches no open element (a stray tag, or the end tag of a void
        element such as ``<br/>``, which HTMLParser reports as a start tag
        followed by an end tag) is ignored.
        """
        node = self.__current
        while not is_root(node):
            if node.name == tag:
                self.__current = node.parent
                return
            node = node.parent

    def handle_data(self, data):
        """Add text to the current node, merging it into a preceding text node."""
        cur = self.__current
        if cur.children and type(cur.children[-1]) is HtmlText:
            cur.children[-1].content += data
        else:
            cur.children.append(HtmlText(parent=cur, content=data))
def is_root(node: HtmlNode):
    """Tell whether `node` is the top of its tree, i.e. is its own parent."""
    parent = node.parent
    return parent is node
def root(node: HtmlNode) -> HtmlNode:
    """Follow the parent links upwards from `node` and return the root node."""
    current = node
    while True:
        if is_root(current):
            return current
        current = current.parent
# Keep the builtin reachable; the function below takes over the name ``next``.
iter_next = next


def next(node: HtmlNode) -> typing.Optional[HtmlNode]:
    """Return the sibling directly after `node`.

    Returns None for the root, for the last child of a parent, and when
    `node` is not found among its parent's children.
    """
    if is_root(node):
        return None
    siblings = iter(node.parent.children)
    # any() stops consuming right after the identity match, which leaves the
    # iterator positioned on the following sibling (if there is one).
    if any(sibling is node for sibling in siblings):
        return iter_next(siblings, None)
    return None
def prev(node: HtmlNode) -> typing.Optional[HtmlNode]:
    """Return the sibling directly before `node`.

    Returns None for the root, for the first child of a parent, and when
    `node` is not (or no longer, e.g. after `detach`) among its parent's
    children.
    """
    if is_root(node):
        return None
    before = None
    for c in node.parent.children:
        # Identity, not equality: structurally equal nodes are still distinct.
        if c is node:
            return before
        before = c
    # The node was not found; do not report an unrelated child as its
    # predecessor.
    return None
def text(node: HtmlNode) -> str:
    """Concatenate the content of every text node below `node`, in order."""
    pieces = []
    for child in node.children:
        if type(child) is HtmlText:
            pieces.append(child.content)
        else:
            # Any other node contributes the text of its own subtree.
            pieces.append(text(child))
    return "".join(pieces)
def find(
    node: "HtmlNode",
    name=None,
    *,
    pred: typing.Optional[typing.Callable[["HtmlNode"], bool]] = None,
    class_=None,
    id=None,
) -> typing.Iterable["HtmlNode"]:
    """Yield the outermost matching nodes at or below `node`, in document order.

    A node that matches is yielded and its subtree is not searched further.

    Either pass `pred`, an arbitrary predicate on nodes, or one or more of the
    element criteria, which must ALL hold for an element to match:

    - `name`: the element's tag name,
    - `class_`: a single class listed in the element's ``class`` attribute,
    - `id`: the value of the element's ``id`` attribute.

    This is a generator, so the argument checks run on first iteration.
    """
    assert name or pred or class_ or id, "at least one search criterion is required"
    assert not (
        pred and (name or class_ or id)
    ), "pred cannot be combined with name, class_ or id"

    if pred is None:

        def pred(n):
            if type(n) is not HtmlElement:
                return False
            if name and n.name != name:
                return False
            # A valueless attribute (``<div class>``) is stored as None.
            if class_ and class_ not in (n.attrs.get("class") or "").split():
                return False
            if id and n.attrs.get("id") != id:
                return False
            return True

    if pred(node):
        yield node
    else:
        for c in node.children:
            yield from find(c, pred=pred)
def detach(node: HtmlNode):
    """Take `node` out of its parent's list of children.

    The root is left alone.  The parent gets a new children list; the node's
    own ``parent`` attribute is not modified.
    """
    if not is_root(node):
        siblings = node.parent.children
        node.parent.children = [
            sibling for sibling in siblings if sibling is not node
        ]
def parse_html(html) -> HtmlNode:
    """Parse the string `html` and return the root document node of the tree."""
    parser = HtmlParser()
    parser.feed(html)
    # feed() may hold back trailing input that could still be incomplete
    # (e.g. text ending in something like "AT&T", or an unfinished tag);
    # close() tells the parser the input is complete so it gets processed.
    parser.close()
    return parser.document

View file

@ -6,6 +6,8 @@ from typing import *
import nio import nio
from .html import HtmlDocument, parse_html
JobCallback = Callable[["Job"], None] JobCallback = Callable[["Job"], None]
@ -65,9 +67,9 @@ class Tokens(Tuple[str]):
@dataclass @dataclass
class Message: class Message:
app: "Bot" app: "Bot"
text: str
room: nio.rooms.MatrixRoom room: nio.rooms.MatrixRoom
event: nio.events.room_events.RoomMessageText event: nio.events.room_events.RoomMessageText
text: str = None
tokens: Tokens = ( tokens: Tokens = (
None # The text split up into clean tokens, to be used for command handlers. None # The text split up into clean tokens, to be used for command handlers.
) )
@ -77,6 +79,7 @@ class Message:
is_for_me: bool = False # Wether the user addressed the bot using its name. is_for_me: bool = False # Wether the user addressed the bot using its name.
command: Optional[str] = None # The command keyword issued by the user, if any. command: Optional[str] = None # The command keyword issued by the user, if any.
args: Optional[Tokens] = None # args will always be set if command is set args: Optional[Tokens] = None # args will always be set if command is set
html: Optional[HtmlDocument] = None
@property @property
def sender_name(self) -> str: def sender_name(self) -> str:
@ -96,7 +99,16 @@ class Message:
# return self.args or self.tokens # return self.args or self.tokens
def __post_init__(self): def __post_init__(self):
self.tokens = Tokens.from_str(self.text) plain = self.event.body
html = (
parse_html(self.event.formatted_body)
if self.event.format == "org.matrix.custom.html"
else None
)
self.text = plain
self.html = html
self.tokens = Tokens.from_str(plain)
self.words = self.tokens self.words = self.tokens
""" """
@ -108,7 +120,7 @@ class Message:
""" """
first_arg = self.tokens.str(0) first_arg = self.tokens.str(0)
if self.text.startswith(self.app.config.command_prefix): if plain.startswith(self.app.config.command_prefix):
self.command = first_arg[len(self.app.config.command_prefix) :] self.command = first_arg[len(self.app.config.command_prefix) :]
self.args = self.tokens[1:] self.args = self.tokens[1:]
self.words = self.args self.words = self.args