add (parsed) html to Message
The HTML tools aren't very sophisticated, and maybe we should use something like BeautifulSoup instead, but this works for us™ and is much smaller.
This commit is contained in:
parent
d275da418f
commit
c58423ae58
3 changed files with 164 additions and 4 deletions
|
|
@ -140,7 +140,7 @@ class Bot:
|
|||
|
||||
log.info(f"#{room.display_name} <{room.user_name(event.sender)}> {event.body}")
|
||||
|
||||
msg = Message(self, event.body, room, event)
|
||||
msg = Message(self, room, event)
|
||||
|
||||
tasks = {}
|
||||
for h in self.message_handlers:
|
||||
|
|
|
|||
148
hotdog/html.py
Normal file
148
hotdog/html.py
Normal file
|
|
@ -0,0 +1,148 @@
|
|||
import html.parser
|
||||
import typing
|
||||
from dataclasses import dataclass, field
|
||||
|
||||
|
||||
class _NoSuchElement(AttributeError, IndexError):
    """Raised when attribute-style child lookup finds no matching element.

    It is an AttributeError so that ``hasattr``, ``getattr(obj, name, default)``
    and the copy/pickle machinery see a normal "missing attribute", and an
    IndexError so that callers written against the earlier behaviour (which
    raised a plain IndexError) keep working.
    """


@dataclass
class HtmlNode:
    """A node of the parsed HTML tree.

    Child elements can be reached as attributes: ``node.some_tag`` returns the
    first direct child element whose tag name, with ``-`` replaced by ``_``,
    equals ``some_tag``.
    """

    # The parent is a back-reference (a root node is its own parent), so it is
    # left out of repr and of the generated __eq__; otherwise comparing two
    # distinct trees would recurse through the parent links without end.
    parent: "HtmlNode" = field(repr=False, compare=False)
    children: typing.List["HtmlNode"]

    def __getattr__(self, name):
        """Return the first direct child element named *name*.

        Only invoked when regular attribute lookup fails.

        Raises:
            AttributeError: if no such child exists. The raised exception is
                also an IndexError for backward compatibility.
        """
        # Never treat protocol lookups (``__deepcopy__``, ``__setstate__``, ...)
        # or our own not-yet-initialised ``children`` field as a child search:
        # reading ``self.children`` below would re-enter this method forever.
        if name == "children" or (name.startswith("__") and name.endswith("__")):
            raise AttributeError(name)
        for child in self.children:
            if isinstance(child, HtmlElement) and child.name.replace("-", "_") == name:
                return child
        raise _NoSuchElement(f"No such element: {name}")
|
||||
|
||||
|
||||
@dataclass(init=False)
class HtmlDocument(HtmlNode):
    """The top-most node of a tree: created empty and set up as its own parent."""

    def __init__(self):
        # Being one's own parent is what marks a node as the root of its tree.
        HtmlNode.__init__(self, parent=self, children=[])
|
||||
|
||||
|
||||
@dataclass
class HtmlElement(HtmlNode):
    """A tag in the tree, carrying its tag name and its attributes.

    Indexing an element (``element["href"]``) looks the key up in ``attrs``
    and therefore raises KeyError when the attribute is absent.
    """

    name: str
    # NOTE(review): the parser fills this from html.parser's attribute pairs;
    # valueless attributes are presumably reported as None -- confirm.
    attrs: typing.Mapping[str, str]

    def __getitem__(self, name):
        attributes = self.attrs
        return attributes[name]
|
||||
|
||||
|
||||
@dataclass
class HtmlText(HtmlNode):
    """A run of character data inside the tree."""

    content: str
    # Not an __init__ argument and hidden from repr: every text node simply
    # starts out with its own empty child list.
    children: typing.List[HtmlNode] = field(
        default_factory=list, init=False, repr=False
    )
|
||||
|
||||
|
||||
class HtmlParser(html.parser.HTMLParser):
    """Build an HtmlDocument tree out of the HTML fed to the parser.

    The resulting tree is available as ``self.document``. Start tags open a
    new HtmlElement below the currently open one, end tags close the nearest
    matching open element, and character data becomes HtmlText nodes.
    """

    # HTML "void" elements: they cannot have content and have no end tag, so
    # they must never become the currently open element.
    _VOID_ELEMENTS = frozenset(
        {
            "area",
            "base",
            "br",
            "col",
            "embed",
            "hr",
            "img",
            "input",
            "link",
            "meta",
            "param",
            "source",
            "track",
            "wbr",
        }
    )

    def __init__(self):
        super().__init__()
        # HtmlDocument() already sets itself up as its own parent (the root).
        self.document = HtmlDocument()
        # The innermost element that is still open; new nodes are added here.
        self.__current = self.document

    def handle_starttag(self, tag, attrs):
        """Append a new element to the open element and descend into it."""
        parent = self.__current
        element = HtmlElement(name=tag, attrs=dict(attrs), parent=parent, children=[])
        parent.children.append(element)
        # A void element such as <br> is complete right away; descending into
        # it would wrongly nest everything that follows inside of it.
        if tag not in self._VOID_ELEMENTS:
            self.__current = element

    def handle_endtag(self, tag):
        """Close the nearest open element named *tag*.

        Elements opened inside of it that were never closed explicitly are
        closed along with it. An end tag that matches no open element is
        ignored instead of unwinding all the way up to the document root.
        """
        node = self.__current
        while not is_root(node):
            if node.name == tag:
                self.__current = node.parent
                return
            node = node.parent

    def handle_data(self, data):
        """Add character data to the open element, merging adjacent text."""
        cur = self.__current
        if cur.children and type(cur.children[-1]) is HtmlText:
            cur.children[-1].content += data
        else:
            cur.children.append(HtmlText(parent=cur, content=data))
|
||||
|
||||
|
||||
def is_root(node: HtmlNode):
    """Tell whether *node* is the top of its tree, i.e. it is its own parent."""
    parent = node.parent
    return parent is node
|
||||
|
||||
|
||||
def root(node: HtmlNode) -> HtmlNode:
    """Follow the parent links upwards from *node* and return the root node."""
    current = node
    while True:
        if is_root(current):
            return current
        current = current.parent
|
||||
|
||||
|
||||
# Keep the builtin reachable under another name: the function defined below
# reuses the name ``next`` at module level.
iter_next = next


def next(node: HtmlNode) -> typing.Optional[HtmlNode]:
    """Return the sibling directly following *node*.

    Yields None for a root node, for the last child of its parent, and when
    *node* is not found among its parent's children.
    """
    if is_root(node):
        return None
    siblings = iter(node.parent.children)
    # any() stops consuming right after the first identical entry, which
    # leaves the iterator positioned on the following sibling. Without a hit
    # the iterator is exhausted and the fallback below is returned.
    any(candidate is node for candidate in siblings)
    return iter_next(siblings, None)
|
||||
|
||||
|
||||
def prev(node: HtmlNode) -> typing.Optional[HtmlNode]:
    """Return the sibling directly preceding *node*.

    Returns None for a root node, for the first child of its parent, and when
    *node* is not found among its parent's children (for instance after it
    has been removed from that list).
    """
    if is_root(node):
        return None
    previous = None
    for sibling in node.parent.children:
        if sibling is node:
            return previous
        previous = sibling
    # The node is not in the list at all: there is no meaningful predecessor,
    # so do not hand out the parent's last child by accident.
    return None
|
||||
|
||||
|
||||
def text(node: HtmlNode) -> str:
    """Return all text found below *node*, concatenated in document order."""
    pieces = []
    for child in node.children:
        if type(child) is HtmlText:
            pieces.append(child.content)
        else:
            # Anything that is not a text node contributes its own subtree.
            pieces.append(text(child))
    return "".join(pieces)
|
||||
|
||||
|
||||
def find(
    node: "HtmlNode",
    name=None,
    *,
    pred: typing.Optional[typing.Callable[["HtmlNode"], bool]] = None,
    class_=None,
    id=None,
) -> typing.Iterable["HtmlNode"]:
    """Yield the matching nodes in the tree at *node*, in document order.

    *node* itself is a candidate. The search does not descend into a node
    that matched, so matches nested inside another match are not reported.

    Args:
        node: where to start searching.
        name: tag name an element must have.
        pred: arbitrary predicate; mutually exclusive with the other criteria.
        class_: a class that must be listed in the element's ``class`` attribute.
        id: value the element's ``id`` attribute must have.

    When several of *name*, *class_* and *id* are given, an element has to
    satisfy all of them.

    Raises:
        ValueError: if no criterion is given, or if *pred* is combined with
            *name*, *class_* or *id*. As this is a generator, the error
            surfaces when iteration starts.
    """
    has_criteria = bool(name or class_ or id)
    if pred is None and not has_criteria:
        raise ValueError("find() needs one of name, pred, class_ or id")
    if pred is not None and has_criteria:
        raise ValueError("find() takes either pred or name/class_/id, not both")

    if pred is None:

        def pred(n):
            if not isinstance(n, HtmlElement):
                return False
            if name and n.name != name:
                return False
            # A valueless attribute is stored as None; treat it like "".
            if class_ and class_ not in (n.attrs.get("class") or "").split():
                return False
            if id and n.attrs.get("id") != id:
                return False
            return True

    if pred(node):
        yield node
    else:
        for child in node.children:
            yield from find(child, pred=pred)
|
||||
|
||||
|
||||
def detach(node: HtmlNode):
    """Remove the node from its parent.

    A root node is left alone. The parent receives a fresh children list
    without *node*; the node's own ``parent`` reference is not touched.
    """
    if not is_root(node):
        remaining = list(
            filter(lambda sibling: sibling is not node, node.parent.children)
        )
        node.parent.children = remaining
|
||||
|
||||
|
||||
def parse_html(html) -> HtmlNode:
    """Parse the markup in *html* and return the root of the resulting tree."""
    parser = HtmlParser()
    parser.feed(html)
    # feed() may hold back trailing input that could still be continued (for
    # example text ending in something like "&T"); close() flushes it so the
    # end of the document is not silently lost.
    parser.close()
    return parser.document
|
||||
|
|
@ -6,6 +6,8 @@ from typing import *
|
|||
|
||||
import nio
|
||||
|
||||
from .html import HtmlDocument, parse_html
|
||||
|
||||
JobCallback = Callable[["Job"], None]
|
||||
|
||||
|
||||
|
|
@ -65,9 +67,9 @@ class Tokens(Tuple[str]):
|
|||
@dataclass
|
||||
class Message:
|
||||
app: "Bot"
|
||||
text: str
|
||||
room: nio.rooms.MatrixRoom
|
||||
event: nio.events.room_events.RoomMessageText
|
||||
text: str = None
|
||||
tokens: Tokens = (
|
||||
None # The text split up into clean tokens, to be used for command handlers.
|
||||
)
|
||||
|
|
@ -77,6 +79,7 @@ class Message:
|
|||
is_for_me: bool = False # Whether the user addressed the bot using its name.
|
||||
command: Optional[str] = None # The command keyword issued by the user, if any.
|
||||
args: Optional[Tokens] = None # args will always be set if command is set
|
||||
html: Optional[HtmlDocument] = None
|
||||
|
||||
@property
|
||||
def sender_name(self) -> str:
|
||||
|
|
@ -96,7 +99,16 @@ class Message:
|
|||
# return self.args or self.tokens
|
||||
|
||||
def __post_init__(self):
|
||||
self.tokens = Tokens.from_str(self.text)
|
||||
plain = self.event.body
|
||||
html = (
|
||||
parse_html(self.event.formatted_body)
|
||||
if self.event.format == "org.matrix.custom.html"
|
||||
else None
|
||||
)
|
||||
|
||||
self.text = plain
|
||||
self.html = html
|
||||
self.tokens = Tokens.from_str(plain)
|
||||
self.words = self.tokens
|
||||
|
||||
"""
|
||||
|
|
@ -108,7 +120,7 @@ class Message:
|
|||
"""
|
||||
|
||||
first_arg = self.tokens.str(0)
|
||||
if self.text.startswith(self.app.config.command_prefix):
|
||||
if plain.startswith(self.app.config.command_prefix):
|
||||
self.command = first_arg[len(self.app.config.command_prefix) :]
|
||||
self.args = self.tokens[1:]
|
||||
self.words = self.args
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue