diff --git a/hotdog/command/feed.py b/hotdog/command/feed.py
index 2e26095..e37f6c3 100644
--- a/hotdog/command/feed.py
+++ b/hotdog/command/feed.py
@@ -6,7 +6,7 @@ from html import escape
 import feeder
 import postillon
 
-from ..functions import clamp, localizedtz, reply, send_message, strip_tags
+from ..functions import capped_text, clamp, localizedtz, reply, send_message, strip_tags
 from ..models import Job, Message
 
 log = logging.getLogger(__name__)
@@ -121,11 +121,5 @@ def post_as_html(post, tzname: str, lc: str, *, max_content_len: int = 300):
     if post.content and max_content_len > 0:
         if parts:
             parts.append("—")
-        content = ""
-        for word in strip_tags(post.content).split(" "):
-            if len(content + f" {word}") > max_content_len - 3:
-                content += " […]"
-                break
-            content += f" {word}"
-        parts.append(escape(content))
+        parts.append(escape(capped_text(strip_tags(post.content), max_content_len)))
     return " ".join(parts)
diff --git a/hotdog/command/urlinfo.py b/hotdog/command/urlinfo.py
index c70de70..2d329de 100644
--- a/hotdog/command/urlinfo.py
+++ b/hotdog/command/urlinfo.py
@@ -1,6 +1,6 @@
 import codecs
 import re
-from functools import lru_cache
+from dataclasses import dataclass
 from html import escape
 from html.parser import HTMLParser
 from random import randint
@@ -9,8 +9,9 @@ from typing import *
 
 import requests
 
-from ..functions import reply
+from ..functions import ElementParser, reply
 from ..models import Message
+from .urlinfo_ import imdb  # XXX make this dynamic? (like we load plugins)
 
 HELP = """Return information about an online HTTP resource.
 !u[rl]
@@ -18,43 +19,15 @@
 
 
 def init(bot):
-    bot.on_command({"u", "url"}, handle)
+    bot.on_message(handle)
 
 
-match_url = re.compile(
+is_url = re.compile(
     # r"https?://(?:[a-zA-Z]|[0-9]|[$-_~@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+"
-    r"https?://\S+"
+    r"(https?://|www\.)\S+"
 ).fullmatch
 
 
-class TitleParser(HTMLParser):
-    """Parse the first <title> from HTML"""
-
-    # XXX check if it's the <head>'s title we're in, but beware that head can be implicit
-    def __init__(self):
-        super().__init__()
-        self.__is_title = False
-        self.__found = False
-        self.__title = ""
-
-    def handle_starttag(self, tag, attrs):
-        if tag == "title":
-            self.__is_title = True
-
-    def handle_endtag(self, tag):
-        if tag == "title":
-            self.__found = True
-            self.__is_title = False
-
-    def handle_data(self, data):
-        if self.__is_title and not self.__found:
-            self.__title += data
-
-    @property
-    def title(self) -> Optional[str]:
-        return self.__title if self.__found else None
-
-
 def get_encodings_from_content(content: str) -> List[str]:
     """Returns encodings from given content string."""
@@ -89,15 +62,12 @@ def stream_decode_response_unicode(content: Iterable[bytes]) -> Iterable[str]:
 
 
 def title(content: Iterable[str]) -> Optional[str]:
-    t = TitleParser()
-    for chunk in content:
-        t.feed(chunk)
-        if t.title is not None:
-            break
-    return t.title
+    t = ElementParser(lambda tag, attrs: tag == "title")
+    t.load_chunks(content)
+    return t.value
 
 
-def capped(content: Iterable[str], read_max: int) -> Iterable[str]:
+def capped(content: Iterable[AnyStr], read_max: int) -> Iterable[AnyStr]:
     read = 0
     for chunk in content:
         read += len(chunk)
         yield chunk
         if read >= read_max:
             break
@@ -106,9 +76,44 @@
 
 
-@lru_cache(maxsize=5)
-def load_info(url: str, cachetoken) -> Optional[Mapping[str, Any]]:
+@dataclass
+class Info:
+    """Information extracted by loading a URL.
+
+    This information can/will be cached for successive lookups of the same URL.
+    When the info object is handed to the extractor function _resp and _chunks
+    will be set. Prior to committing the info object to cache these references
+    will be removed.
+    """
+
+    code: int
+    final_url: str
+    elapsed_ms: int
+    reason: str
+    content_type: Optional[str]
+    size: Optional[int]
+    filename: Optional[str]
+    extracted: Optional[Any] = None
+
+    _resp: Optional[requests.Response] = None
+    _chunks_str: Optional[Iterable[str]] = None
+    _chunks_bytes: Optional[Iterable[bytes]] = None
+
+
+# XXX can't use lru_cache with async funcs
+# TODO: create lru_cache that supports async and timeout
+
+_load_info_cache = {}
+
+
+async def load_info(
+    url: str, extractor: Callable[[Info], Awaitable[None]], cachetoken
+) -> Optional[Info]:
     """The cachetoken is just there to bust the LRU cache after a while."""
+    cachekey = (url, cachetoken)
+    if cachekey in _load_info_cache:
+        return _load_info_cache[cachekey]
+
     try:
         r = requests.get(
             url,
@@ -119,18 +124,6 @@
     except Exception:
         return None
 
-    content_type = r.headers.get("Content-Type", "")
-    is_html = content_type.startswith("text/html") or url.lower().endswith(
-        (".html", ".htm")
-    )
-    if is_html:
-        one_kb = 2 ** 10
-        # chunks = r.iter_content(chunk_size=30 * one_kb, decode_unicode=True)
-        chunks = stream_decode_response_unicode(r.iter_content(chunk_size=30 * one_kb))
-        html_title = title(capped(chunks, read_max=200 * one_kb))
-    else:
-        html_title = None
-
     filename = None
     dispo = r.headers.get("Content-Disposition", "").split(";")
     if len(dispo) == 2 and dispo[0] == "attachment":
@@ -138,18 +131,33 @@
     if len(dispo) == 2 and dispo[0] == "filename":
         filename = dispo[1].strip()
 
-    return {
-        "code": r.status_code,
-        "url": r.url,
-        "elapsed_ms": int(r.elapsed.total_seconds() * 1_000),
-        "reason": r.reason,
-        "type": r.headers.get("Content-Type"),
-        "size": (
+    one_kb = 2 ** 10
+    chunks = capped(r.iter_content(chunk_size=30 * one_kb), read_max=200 * one_kb)
+
+    info = Info(
+        code=r.status_code,
+        final_url=r.url,
+        elapsed_ms=int(r.elapsed.total_seconds() * 1_000),
+        reason=r.reason,
+        content_type=r.headers.get("Content-Type"),
+        size=(
             int(r.headers["Content-Length"]) if "Content-Length" in r.headers else None
         ),
-        "title": html_title,
-        "filename": filename,
-    }
+        filename=filename,
+        _resp=r,
+        _chunks_str=stream_decode_response_unicode(chunks),
+        _chunks_bytes=chunks,
+    )
+
+    await extractor(info)
+
+    # Remove all references to the Response before the info is committed to cache.
+    info._resp = None
+    info._chunks_str = None
+    info._chunks_bytes = None
+
+    _load_info_cache[cachekey] = info
+    return _load_info_cache[cachekey]
 
 
 def cachetoken(quant_m=15):
@@ -168,32 +176,71 @@ def pretty_size(size: int) -> str:
     return f"{size:_.02f} {q}B"
 
 
-async def handle(message: Message):
-    url = message.args.str(0)
-    if not match_url(url):
-        return
-
-    info = load_info(url, cachetoken())
-    if not info:
-        return
-
+async def generic_handler(message: Message, url: str, info: Info):
     details = []
-    if info["type"]:
-        details.append(f"<i>Media type</i>: {escape(info['type'])}")
-    if info["size"]:
-        details.append(f"<i>Size</i>: {pretty_size(info['size'])}")
-    details.append(f"<i>Status</i>: {info['code']}")
-    if info["reason"]:
-        details[-1] += f" ({escape(info['reason'])})"
-    if info["url"] != url:
+    if info.content_type:
+        details.append(f"<i>Media type</i>: {escape(info.content_type)}")
+    if info.size:
+        details.append(f"<i>Size</i>: {pretty_size(info.size)}")
+    details.append(f"<i>Status</i>: {info.code}")
+    if info.reason:
+        details[-1] += f" ({escape(info.reason)})"
+    if info.final_url != url:
         details.append(
-            f"""<i>Redirected to</i>: <a href="{escape(info['url'])}">{escape(info['url'])}</a>"""
+            f"""<i>Redirected to</i>: <a href="{escape(info.final_url)}">{escape(info.final_url)}</a>"""
         )
-    if info["filename"] and info["filename"] != url.rsplit("/", 2)[-1]:
-        details.append(f"<i>Filename</i>: {escape(info['filename'])}")
-    details.append(f"<i>TTFB</i>: {info['elapsed_ms']:_} ms")
+    if info.filename and info.filename != url.rsplit("/", 2)[-1]:
+        details.append(f"<i>Filename</i>: {escape(info.filename)}")
+    details.append(f"<i>TTFB</i>: {info.elapsed_ms:_} ms")
 
-    text = f"<b>{escape(info['title'])}</b> — " if info["title"] else ""
+    text = (
+        f"<b>{escape(info.extracted['title'])}</b> — "
+        if info.extracted["title"]
+        else ""
+    )
     text += "; ".join(details)
 
     await reply(message, html=text, in_thread=True)
+
+
+async def generic_extractor(info: Info):
+    content_type = info._resp.headers.get("Content-Type", "")
+    is_html = content_type.startswith("text/html") or info.final_url.lower().endswith(
+        (".html", ".htm")
+    )
+    info.extracted = {"title": title(info._chunks_str) if is_html else None}
+
+
+def full_url(ref: str) -> str:
+    return f"http://{ref}" if ref.startswith("www") else ref
+
+
+async def handle(message: Message):
+    if message.command and message.command not in {"u", "url"}:
+        return
+
+    limit = 3
+    urls = [full_url(w) for w in message.words if is_url(w)][:limit]
+    if not urls:
+        return
+
+    handlers = (imdb,)
+
+    for url in urls:
+        for plugin in handlers:
+            if plugin.can_handle(url):
+                handler = plugin.handle
+                extractor = plugin.extractor
+                break
+        else:
+            # We only want the generic handler if we were called explicitly.
+            if not message.command:
+                continue
+            handler = generic_handler
+            extractor = generic_extractor
+
+        info = await load_info(url, extractor, cachetoken())
+        if not info:
+            continue
+        await handler(message, url, info)
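The dispatch loop above implies a small plugin contract for modules under urlinfo_: a truthy can_handle(url), an async extractor(info) that runs while the response and chunk iterators are still attached, and an async handle(message, url, info) that renders the cached result. A minimal sketch of such a module follows; the module name, URL pattern, and reply text are invented for illustration and are not part of this change.

# hypothetical hotdog/command/urlinfo_/example.py (illustration only)
import re

from ...functions import reply

# can_handle: returns a truthy value for URLs this plugin owns (made-up pattern)
can_handle = re.compile(r"https://example\.org/item/\d+").fullmatch


async def extractor(info) -> None:
    # Runs inside load_info() while info._resp / info._chunks_str are still set;
    # whatever gets stored on info.extracted survives the cache.
    info.extracted = {"title": "placeholder"}


async def handle(message, url, info) -> None:
    # Called by urlinfo.handle() once load_info() returns the (possibly cached) Info.
    await reply(message, f"{info.extracted['title']} — {url}")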
diff --git a/hotdog/command/urlinfo_/imdb.py b/hotdog/command/urlinfo_/imdb.py
new file mode 100644
index 0000000..c3e2aa9
--- /dev/null
+++ b/hotdog/command/urlinfo_/imdb.py
@@ -0,0 +1,180 @@
+import io
+import json
+import re
+from dataclasses import dataclass, fields
+from dataclasses import replace as clone
+from datetime import date
+from html import escape
+from html.parser import HTMLParser
+from pathlib import Path
+from typing import *
+from urllib.parse import urlparse
+
+import requests
+
+from ...functions import (
+    ElementParser,
+    capped_text,
+    escape_all,
+    pretty_duration,
+    reply,
+    send_image,
+    strip_tags,
+)
+from ...models import Message
+
+# https://www.imdb.com/title/tt13399862/
+can_handle = re.compile(r"https://www\.imdb\.com/title/(?P<id>tt\d+)/").fullmatch
+
+
+def thumbnail(url, width=182, height=268):
+    """Return a thumbnail URL for the given IMDb image URL.
+
+    The default settings are what IMDb currently uses for desktop display.
+    """
+    resize = f"UY{height}"  # there's also 'UX' to resize on width
+    offset = "2,0"  # by setting non-0 for the first value the image is fitted
+    crop = f"CR{offset},{width},{height}"
+    al = "AL"  # not sure what this is, doesn't seem to do much but they use it.
+
+    variant = "_".join((resize, crop, al))
+
+    parts = urlparse(url)
+    path = Path(parts.path)
+    # path.with_stem(f"{path.stem}_{variant}")  XXX py3.9
+    path = path.with_name(f"{path.stem}_{variant}{path.suffix}")
+    return parts._replace(path=str(path)).geturl()
+
+
+period_re = re.compile(
+    r"P((?P<year>\d+)Y)?((?P<month>\d+)M)?((?P<day>\d+)D)?T((?P<hour>\d+)H)?((?P<minute>\d+)M)?((?P<second>\d+)S)?"
+)
+
+
+def parse_period(s: str) -> int:
+    # see https://en.wikipedia.org/wiki/ISO_8601#Durations
+    seconds = {
+        "year": 365 * 86400,
+        "month": 30 * 86400,
+        "day": 86400,
+        "hour": 3600,
+        "minute": 60,
+        "second": 1,
+    }
+    if not (match := period_re.fullmatch(s)):
+        return 0
+    return sum(seconds[k] * int(v) for k, v in match.groupdict().items() if v)
+
+
+_import_image_cache = (
+    {}
+)  # XXX ideally we'd cache these forever (in some permanent storage)
+
+
+async def import_image(client, url, filename=None):
+    if url in _import_image_cache:
+        return _import_image_cache[url]
+    r = requests.get(
+        url,
+        # stream=True,
+        timeout=(3, 3),
+        headers={"user-agent": "hotdog/v1 urlinfo"},
+    )
+    size = len(r.content)
+    uploaded, keys = await client.upload(
+        # io.BufferedReader(r.raw),
+        io.BufferedReader(io.BytesIO(r.content)),
+        content_type="image/jpeg",
+        filename=filename,
+        filesize=size,
+    )
+    _import_image_cache[url] = {
+        "size": size,
+        "url": uploaded.content_uri,
+    }
+    return _import_image_cache[url]
+
+
+@dataclass
+class Extracted:
+    title: str
+    original_image: str
+    genres: List[str]
+    description: str
+    published: date
+    duration_s: int
+    rating_value: float
+    rating_count: int
+    creators: List[str]
+
+
+async def extractor(info):
+    parser = ElementParser(
+        lambda tag, attrs: (
+            tag == "script" and dict(attrs).get("type") == "application/ld+json"
+        )
+    )
+    parser.load_chunks(info._chunks_str)
+
+    if not parser.value:
+        return
+
+    ld = json.loads(parser.value)
+    assert ld["@context"] == "http://schema.org" and ld["@type"] == "Movie"
+    assert ld["aggregateRating"]["@type"] == "AggregateRating"
+
+    creators = []
+    for k in "director", "creator":
+        if k in ld:
+            t = [ld[k]] if type(ld[k]) is dict else ld[k]
+            creators += [p["name"] for p in t if p["@type"] == "Person"]
+    creators = list({k: None for k in creators})  # remove dupes
+
+    info.extracted = Extracted(
+        title=ld["name"],
+        original_image=ld["image"],
+        genres=ld["genre"],
+        description=ld["description"],
+        published=date.fromisoformat(ld["datePublished"]),
+        duration_s=parse_period(ld["duration"]),
+        rating_value=float(ld["aggregateRating"]["ratingValue"]),
+        rating_count=ld["aggregateRating"]["ratingCount"],
+        creators=creators,
+    )
+
+
+async def handle(message, url, info):
+    ex = clone(info.extracted)
+    image_title = f"Poster for {ex.title} ({ex.published:%Y})"
+    hosted_image = await import_image(
+        message.app.client,
+        thumbnail(ex.original_image),
+        filename=f"{image_title}.jpg",
+    )
+    await send_image(
+        message.app.client,
+        message.room.room_id,
+        hosted_image["url"],
+        description=image_title,
+        mimetype="image/jpeg",
+        size=hosted_image["size"],
+    )
+
+    ex.description = strip_tags(ex.description)
+    escape_all(ex)
+
+    details = [
+        f"🖋 {' ∕ '.join(ex.creators[:2])}",
+        f"⏱ {pretty_duration(ex.duration_s)}",
+        f"⭐️ {ex.rating_value:_.01f} ⁄ 10 (👤 {ex.rating_count})",
+        f"🏷 {' ∕ '.join(ex.genres)}",
+    ]
+    lines = [
+        f"<b>{ex.title}</b> (<b>{ex.published:%Y}</b>)",
+        f"{', '.join(details)}",
+        f"<i>{capped_text(ex.description, 500)}</i>",
+    ]
+
+    html = "<br>".join(lines)
+    plain = strip_tags(" — ".join(lines))
+    await reply(message, plain, html=html)
diff --git a/hotdog/command/youtube.py b/hotdog/command/youtube.py
index fa4a0fb..91bee92 100644
--- a/hotdog/command/youtube.py
+++ b/hotdog/command/youtube.py
@@ -7,7 +7,7 @@ from typing import *
 
 import youtube_dl
 
-from ..functions import reply
+from ..functions import escape_all, pretty_duration, reply
 from ..models import Message
 
 HELP = """Gibt Informationen zu Youtube-Videos aus.
@@ -45,7 +45,7 @@ async def handle(message: Message):
     youtube_id = match["id"]
 
     info = load_info(youtube_id, cachetoken())
-    info.escape_all()
+    escape_all(info)
     details = [
         f"🖋 {info.author}",
         f"⏱ {pretty_duration(info.duration_seconds)}",
@@ -59,15 +59,6 @@ async def handle(message: Message):
     await reply(message, html=text)
 
 
-def pretty_duration(seconds: int) -> str:
-    hours = seconds // 3600
-    minutes = (seconds - hours * 3600) // 60
-    seconds = seconds % 60
-    return (
-        f"{hours}h{minutes:02}m{seconds:02}s" if hours else f"{minutes}m{seconds:02}s"
-    )
-
-
 class Nolog:
     def debug(self, msg):
         pass
@@ -95,13 +86,6 @@ class Info:
     categories: List[str]
     tags: List[str]
 
-    def escape_all(self):
-        for f in fields(self):
-            if f.type is str:
-                setattr(self, f.name, escape(getattr(self, f.name)))
-            elif get_origin(f.type) is list:
-                setattr(self, f.name, [escape(x) for x in getattr(self, f.name)])
-
     @classmethod
     def from_url(cls, url):
         info = ytdl.extract_info(url, download=False)
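youtube.py now pulls escape_all and pretty_duration from functions.py (added below) instead of defining them locally. A rough sketch of what the relocated helpers should produce, assuming the package imports as hotdog; the outputs in the comments are derived from the format strings above, not from running the bot:

from hotdog.functions import capped_text, pretty_duration

print(pretty_duration(4 * 3600 + 5 * 60 + 9))  # 4h 05m 09s
print(pretty_duration(3600))                   # 1h 00m  (zero seconds dropped)
print(pretty_duration(59))                     # 00m 59s

# word-wise cap; the result keeps the leading space the accumulator produces
print(capped_text("one two three four", 14))   # " one two […]"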
diff --git a/hotdog/functions.py b/hotdog/functions.py
index dc4fc62..f2e59bf 100644
--- a/hotdog/functions.py
+++ b/hotdog/functions.py
@@ -1,9 +1,11 @@
 import locale
 import logging
 import unicodedata
+from collections import defaultdict
 from contextlib import contextmanager
+from dataclasses import dataclass, fields
 from datetime import datetime, timedelta, timezone
-from html import escape
+from html import escape as html_escape
 from html.parser import HTMLParser
 from io import StringIO
 from typing import *
@@ -18,7 +20,7 @@ tzdb = {
 
 
 def html_nametag(uid, name):
-    return f'<a href="https://matrix.to/#/{escape(uid)}">{escape(name)}</a>'
+    return f'<a href="https://matrix.to/#/{html_escape(uid)}">{html_escape(name)}</a>'
 
 
 async def reply(
@@ -143,6 +145,67 @@ async def send_message(
         log.exception(f"Unable to send message to room: {room_id}")
 
 
+async def send_image(
+    client: nio.AsyncClient,
+    room_id: str,
+    url: str,
+    description: str,
+    *,
+    width: Optional[int] = None,
+    height: Optional[int] = None,
+    size: Optional[int] = None,
+    mimetype: Optional[str] = None,
+    thumbnail_url: Optional[str] = None,
+    thumbnail_width: Optional[int] = None,
+    thumbnail_height: Optional[int] = None,
+    thumbnail_size: Optional[int] = None,
+    thumbnail_mimetype: Optional[str] = None,
+) -> nio.RoomSendResponse:
+    # https://matrix.org/docs/spec/client_server/r0.6.1#m-image
+    content = defaultdict(
+        dict,
+        {
+            "body": description,
+            "msgtype": "m.image",
+            "url": url,
+        },
+    )
+
+    # Map all image keyword args into the content dict.
+    kwds = locals()
+    kwmap = {
+        "width": "w",
+        "height": "h",
+        "size": "size",
+        "mimetype": "mimetype",
+        "thumbnail_url": "thumbnail_url",
+    }
+    for kwarg, carg in kwmap.items():
+        if kwds[kwarg] is not None:
+            content["info"][carg] = kwds[kwarg]
+
+    # Map all thumbnail keyword args into the content dict.
+    kwmap = {
+        "thumbnail_width": "w",
+        "thumbnail_height": "h",
+        "thumbnail_size": "size",
+        "thumbnail_mimetype": "mimetype",
+    }
+    thumbinfo = defaultdict(dict)
+    for kwarg, carg in kwmap.items():
+        if kwds[kwarg] is not None:
+            thumbinfo[carg] = kwds[kwarg]
+    if thumbinfo:
+        content["info"]["thumbnail_info"] = thumbinfo
+
+    return await client.room_send(
+        room_id,
+        "m.room.message",
+        content,
+        ignore_unverified_devices=True,
+    )
+
+
 @contextmanager
 def localized(lc: str, category=locale.LC_ALL):
     locale.setlocale(category, lc)
@@ -179,3 +242,81 @@ def strip_tags(html):
 
 def clamp(lower, x, upper):
     return max(lower, min(x, upper))
+
+
+def pretty_duration(seconds: int) -> str:
+    hours = seconds // 3600
+    minutes = (seconds - hours * 3600) // 60
+    seconds = seconds % 60
+
+    # full: 1h 23m 13s
+    # 0 seconds: 1h 23m
+    # 0 hours: 23m 13s
+    # 0 hours 0 seconds: 23m 00s
+
+    parts = {}
+    if hours:
+        parts["h"] = f"{hours}h"
+    parts["m"] = f"{minutes:02}m"
+    if seconds or not hours:
+        parts["s"] = f"{seconds:02}s"
+
+    return " ".join(parts.values())
+
+
+def capped_text(text: str, max_len: int, mark=" […]") -> str:
+    if len(text) <= max_len:
+        return text
+
+    capped = ""
+    for word in text.split(" "):
+        if len(capped + f" {word}") > max_len - len(mark):
+            capped += mark
+            break
+        capped += f" {word}"
+    return capped
+
+
+class ElementParser(HTMLParser):
+    """Parse HTML for the first matching element"""
+
+    def __init__(self, selector: Callable[[str, List[Tuple[str, Optional[str]]]], bool]):
+        super().__init__()
+        self.selector = selector
+        self.__active_tag = None
+        self.__done = False
+        self.__value = ""
+
+    def handle_starttag(self, tag, attrs):
+        if self.selector(tag, attrs):
+            self.__active_tag = tag
+
+    def handle_endtag(self, tag):
+        if tag == self.__active_tag:
+            self.__done = True
+            self.__active_tag = None
+
+    def handle_data(self, data):
+        if self.__active_tag and not self.__done:
+            self.__value += data
+
+    @property
+    def value(self) -> Optional[str]:
+        return self.__value if self.__done else None
+
+    def load_chunks(self, content: Iterable[str]) -> None:
+        for chunk in content:
+            self.feed(chunk)
+            if self.__done:
+                break
+
+
+def escape_all(dc: dataclass, escape: Callable[[str], str] = html_escape) -> None:
+    """Patch a dataclass to escape all strings."""
+    for f in fields(dc):
+        if f.type is str:
+            setattr(dc, f.name, escape(getattr(dc, f.name)))
+        elif get_origin(f.type) is list and get_args(f.type)[0] is str:
+            setattr(dc, f.name, [escape(x) for x in getattr(dc, f.name)])
+        elif get_origin(f.type) is dict and get_args(f.type)[1] is str:
+            setattr(dc, f.name, {k: escape(v) for k, v in getattr(dc, f.name).items()})
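For orientation, a send_image call like the one in the IMDb handler should assemble m.room.message content along the lines sketched here (field layout per the m.image spec linked in the code); the mxc content URI and size are invented:

# illustration only: the event content send_image() builds
from hotdog.functions import send_image  # assuming the package imports as hotdog


async def example(client, room_id):
    await send_image(
        client,
        room_id,
        url="mxc://example.org/abcDEF",  # invented content URI
        description="Poster for Alien (1979)",
        mimetype="image/jpeg",
        size=34_567,
    )
    # content sent above, roughly:
    # {
    #     "body": "Poster for Alien (1979)",
    #     "msgtype": "m.image",
    #     "url": "mxc://example.org/abcDEF",
    #     "info": {"mimetype": "image/jpeg", "size": 34567},
    # }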