urlinfo: allow sub-modules and add module for IMDb movies

The urlinfo plugin is now set up to look up URL information for any URL
occurring in text, not only when triggered explicitly as a command.
The youtube plugin should probably be integrated into this setup,
replacing the bot plugin with a urlinfo extension.
ducklet 2020-11-07 20:35:52 +01:00
parent 81a176eb0c
commit efc6ecbb45
5 changed files with 460 additions and 114 deletions

View file

@@ -6,7 +6,7 @@ from html import escape
import feeder
import postillon
from ..functions import clamp, localizedtz, reply, send_message, strip_tags
from ..functions import capped_text, clamp, localizedtz, reply, send_message, strip_tags
from ..models import Job, Message
log = logging.getLogger(__name__)
@@ -121,11 +121,5 @@ def post_as_html(post, tzname: str, lc: str, *, max_content_len: int = 300):
if post.content and max_content_len > 0:
if parts:
parts.append("")
content = ""
for word in strip_tags(post.content).split(" "):
if len(content + f" {word}") > max_content_len - 3:
content += " […]"
break
content += f" {word}"
parts.append(escape(content))
parts.append(escape(capped_text(strip_tags(post.content), max_content_len)))
return " ".join(parts)

View file

@@ -1,6 +1,6 @@
import codecs
import re
from functools import lru_cache
from dataclasses import dataclass
from html import escape
from html.parser import HTMLParser
from random import randint
@@ -9,8 +9,9 @@ from typing import *
import requests
from ..functions import reply
from ..functions import ElementParser, reply
from ..models import Message
from .urlinfo_ import imdb # XXX make this dynamic? (like we load plugins)
HELP = """Return information about an online HTTP resource.
!u[rl] <url>
@@ -18,43 +19,15 @@ HELP = """Return information about an online HTTP resource.
def init(bot):
bot.on_command({"u", "url"}, handle)
bot.on_message(handle)
match_url = re.compile(
is_url = re.compile(
# r"https?://(?:[a-zA-Z]|[0-9]|[$-_~@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+"
r"https?://\S+"
r"(https?://|www\.)\S+"
).fullmatch
class TitleParser(HTMLParser):
"""Parse the first <title> from HTML"""
# XXX check if it's the <head>'s title we're in, but beware that head can be implicit
def __init__(self):
super().__init__()
self.__is_title = False
self.__found = False
self.__title = ""
def handle_starttag(self, tag, attrs):
if tag == "title":
self.__is_title = True
def handle_endtag(self, tag):
if tag == "title":
self.__found = True
self.__is_title = False
def handle_data(self, data):
if self.__is_title and not self.__found:
self.__title += data
@property
def title(self) -> Optional[str]:
return self.__title if self.__found else None
def get_encodings_from_content(content: str) -> List[str]:
"""Returns encodings from given content string."""
@@ -89,15 +62,12 @@ def stream_decode_response_unicode(content: Iterable[bytes]) -> Iterable[str]:
def title(content: Iterable[str]) -> Optional[str]:
t = TitleParser()
for chunk in content:
t.feed(chunk)
if t.title is not None:
break
return t.title
t = ElementParser(lambda tag, attrs: tag == "title")
t.load_chunks(content)
return t.value
def capped(content: Iterable[str], read_max: int) -> Iterable[str]:
def capped(content: Iterable[AnyStr], read_max: int) -> Iterable[AnyStr]:
read = 0
for chunk in content:
read += len(chunk)
@@ -106,9 +76,44 @@ def capped(content: Iterable[str], read_max: int) -> Iterable[str]:
break
@lru_cache(maxsize=5)
def load_info(url: str, cachetoken) -> Optional[Mapping[str, Any]]:
@dataclass
class Info:
"""Information extracted by loading a URL.
This information can/will be cached for successive lookups of the same URL.
When the info object is handed to the extractor function _resp and _chunks
will be set. Prior to commiting the info object to cache these references
will be removed.
"""
code: int
final_url: str
elapsed_ms: int
reason: str
content_type: Optional[str]
size: Optional[int]
filename: Optional[str]
extracted: Optional[Any] = None
_resp: Optional[requests.Response] = None
_chunks_str: Optional[Iterable[str]] = None
_chunks_bytes: Optional[Iterable[bytes]] = None
# XXX can't use lru_cache with async funcs
# TODO: create lru_cache that supports async and timeout
_load_info_cache = {}
async def load_info(
url: str, extractor: Callable[[Info], Awaitable[None]], cachetoken
) -> Optional[Info]:
"""The cachetoken is just there to bust the LRU cache after a while."""
cachekey = (url, cachetoken)
if cachekey in _load_info_cache:
return _load_info_cache[cachekey]
try:
r = requests.get(
url,
@@ -119,18 +124,6 @@ def load_info(url: str, cachetoken) -> Optional[Mapping[str, Any]]:
except Exception:
return None
content_type = r.headers.get("Content-Type", "")
is_html = content_type.startswith("text/html") or url.lower().endswith(
(".html", ".htm")
)
if is_html:
one_kb = 2 ** 10
# chunks = r.iter_content(chunk_size=30 * one_kb, decode_unicode=True)
chunks = stream_decode_response_unicode(r.iter_content(chunk_size=30 * one_kb))
html_title = title(capped(chunks, read_max=200 * one_kb))
else:
html_title = None
filename = None
dispo = r.headers.get("Content-Disposition", "").split(";")
if len(dispo) == 2 and dispo[0] == "attachment":
@@ -138,18 +131,33 @@ def load_info(url: str, cachetoken) -> Optional[Mapping[str, Any]]:
if len(dispo) == 2 and dispo[0] == "filename":
filename = dispo[1].strip()
return {
"code": r.status_code,
"url": r.url,
"elapsed_ms": int(r.elapsed.total_seconds() * 1_000),
"reason": r.reason,
"type": r.headers.get("Content-Type"),
"size": (
one_kb = 2 ** 10
chunks = capped(r.iter_content(chunk_size=30 * one_kb), read_max=200 * one_kb)
info = Info(
code=r.status_code,
final_url=r.url,
elapsed_ms=int(r.elapsed.total_seconds() * 1_000),
reason=r.reason,
content_type=r.headers.get("Content-Type"),
size=(
int(r.headers["Content-Length"]) if "Content-Length" in r.headers else None
),
"title": html_title,
"filename": filename,
}
filename=filename,
_resp=r,
_chunks_str=stream_decode_response_unicode(chunks),
_chunks_bytes=chunks,
)
await extractor(info)
# Remove all references to the Response before the info is committed to the cache.
info._resp = None
info._chunks_str = None
info._chunks_bytes = None
_load_info_cache[cachekey] = info
return _load_info_cache[cachekey]
def cachetoken(quant_m=15):
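
The body of cachetoken is unchanged by this commit and not shown in the diff; per its use in load_info it only needs to yield a new value every few minutes to bust the cache. A plausible implementation, for illustration only:

from datetime import datetime

def cachetoken(quant_m=15):
    # Quantize the current time into quant_m-minute buckets.
    now = datetime.utcnow()
    return now.replace(minute=now.minute - now.minute % quant_m, second=0, microsecond=0)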
@@ -168,32 +176,71 @@ def pretty_size(size: int) -> str:
return f"{size:_.02f} {q}B"
async def handle(message: Message):
url = message.args.str(0)
if not match_url(url):
return
info = load_info(url, cachetoken())
if not info:
return
async def generic_handler(message: Message, url: str, info: Info):
details = []
if info["type"]:
details.append(f"<i>Media type</i>: {escape(info['type'])}")
if info["size"]:
details.append(f"<i>Size</i>: {pretty_size(info['size'])}")
details.append(f"<i>Status</i>: {info['code']}")
if info["reason"]:
details[-1] += f" ({escape(info['reason'])})"
if info["url"] != url:
if info.content_type:
details.append(f"<i>Media type</i>: {escape(info.content_type)}")
if info.size:
details.append(f"<i>Size</i>: {pretty_size(info.size)}")
details.append(f"<i>Status</i>: {info.code}")
if info.reason:
details[-1] += f" ({escape(info.reason)})"
if info.final_url != url:
details.append(
f"""<i>Redirected to</i>: <a href="{escape(info['url'])}">{escape(info['url'])}</a>"""
f"""<i>Redirected to</i>: <a href="{escape(info.final_url)}">{escape(info.final_url)}</a>"""
)
if info["filename"] and info["filename"] != url.rsplit("/", 2)[-1]:
details.append(f"<i>Filename</i>: {escape(info['filename'])}")
details.append(f"<i>TTFB</i>: {info['elapsed_ms']:_} ms")
if info.filename and info.filename != url.rsplit("/", 2)[-1]:
details.append(f"<i>Filename</i>: {escape(info.filename)}")
details.append(f"<i>TTFB</i>: {info.elapsed_ms:_} ms")
text = f"<b>{escape(info['title'])}</b> — " if info["title"] else ""
text = (
f"<b>{escape(info.extracted['title'])}</b> — "
if info.extracted["title"]
else ""
)
text += "; ".join(details)
await reply(message, html=text, in_thread=True)
async def generic_extractor(info: Info):
content_type = info._resp.headers.get("Content-Type", "")
is_html = content_type.startswith("text/html") or info.final_url.lower().endswith(
(".html", ".htm")
)
info.extracted = {"title": title(info._chunks_str) if is_html else None}
def full_url(ref: str) -> str:
return f"http://{ref}" if ref.startswith("www") else ref
async def handle(message: Message):
if message.command and message.command not in {"u", "url"}:
return
limit = 3
urls = [full_url(w) for w in message.words if is_url(w)][:limit]
if not urls:
return
handlers = (imdb,)
for url in urls:
for handler in handlers:
if handler.can_handle(url):
extractor = handler.extractor
handler = handler.handle
break
else:
# We only want the generic handler if we were called explicitly.
if not message.command:
continue
handler = generic_handler
extractor = generic_extractor
info = await load_info(url, extractor, cachetoken())
if not info:
continue
await handler(message, url, info)
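
The loop above fixes the contract for urlinfo sub-modules: a can_handle(url) predicate, an async extractor(info) that runs while the response and chunk iterators on the Info are still live, and an async handle(message, url, info) that renders the (possibly cached) result. A skeletal sub-module following that contract; module name and URL pattern are made up:

# plugins/urlinfo_/example.py (hypothetical)
import re

can_handle = re.compile(r"https://example\.org/\S+").fullmatch

async def extractor(info):
    # Runs once per cache entry; info._resp and info._chunks_str are still
    # set here and are stripped before the Info object is cached.
    info.extracted = {"note": "parsed from the live response"}

async def handle(message, url, info):
    # Runs on every lookup, including cache hits.
    ...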

View file

@@ -0,0 +1,180 @@
import io
import json
import re
from dataclasses import dataclass, fields
from dataclasses import replace as clone
from datetime import date
from html import escape
from html.parser import HTMLParser
from pathlib import Path
from typing import *
from urllib.parse import urlparse
import requests
from ...functions import (
ElementParser,
capped_text,
escape_all,
pretty_duration,
reply,
send_image,
strip_tags,
)
from ...models import Message
# https://www.imdb.com/title/tt13399862/
can_handle = re.compile(r"https://www\.imdb\.com/title/(?P<id>tt\d+)/").fullmatch
def thumbnail(url, width=182, height=268):
"""Return a thumbnail URL for the given IMDb image URL.
The default settings are what IMDb currently uses for desktop display.
"""
resize = f"UY{height}" # there's also 'UX' to resize on width
offset = "2,0" # by setting non-0 for the first value the image is fitted
crop = f"CR{offset},{width},{height}"
al = "AL" # not sure what this is, doesn't seem to do much but they use it.
variant = "_".join((resize, crop, al))
parts = urlparse(url)
path = Path(parts.path)
# path.with_stem(f"{path.stem}_{variant}") XXX py3.9
path = path.with_name(f"{path.stem}_{variant}{path.suffix}")
return parts._replace(path=str(path)).geturl()
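# Example with a hypothetical image URL (real IMDb image URLs have this shape):
# thumbnail("https://m.media-amazon.com/images/M/MV5Babc.jpg")
# -> "https://m.media-amazon.com/images/M/MV5Babc_UY268_CR2,0,182,268_AL.jpg"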
period_re = re.compile(
r"P((?P<year>\d+)Y)?((?P<month>\d+)M)?((?P<day>\d+)D)?T((?P<hour>\d+)H)?((?P<minute>\d+)M)?((?P<second>\d+)S)?"
)
def parse_period(s: str) -> int:
# see https://en.wikipedia.org/wiki/ISO_8601#Durations
seconds = {
"year": 365 * 86400,
"month": 30 * 86400,
"day": 86400,
"hour": 3600,
"minute": 60,
"second": 1,
}
if not (match := period_re.fullmatch(s)):
return 0
return sum(seconds[k] * int(v) for k, v in match.groupdict().items() if v)
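# Sanity check: IMDb encodes runtimes as ISO 8601 durations, e.g.
# parse_period("PT2H13M")  # -> 2 * 3600 + 13 * 60 == 7980
# parse_period("bogus")  # -> 0 (no match)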
_import_image_cache = (
{}
) # XXX ideally we'd cache these forever (in some permanent storage)
async def import_image(client, url, filename=None):
if url in _import_image_cache:
return _import_image_cache[url]
r = requests.get(
url,
# stream=True,
timeout=(3, 3),
headers={"user-agent": "hotdog/v1 urlinfo"},
)
size = len(r.content)
uploaded, keys = await client.upload(
# io.BufferedReader(r.raw),
io.BufferedReader(io.BytesIO(r.content)),
content_type="image/jpeg",
filename=filename,
filesize=size,
)
_import_image_cache[url] = {
"size": size,
"url": uploaded.content_uri,
}
return _import_image_cache[url]
@dataclass
class Extracted:
title: str
original_image: str
genres: List[str]
description: str
published: date
duration_s: int
rating_value: float
rating_count: int
creators: List[str]
async def extractor(info):
parser = ElementParser(
lambda tag, attrs: (
tag == "script" and dict(attrs).get("type") == "application/ld+json"
)
)
parser.load_chunks(info._chunks_str)
if not parser.value:
return
ld = json.loads(parser.value)
assert ld["@context"] == "http://schema.org" and ld["@type"] == "Movie"
assert ld["aggregateRating"]["@type"] == "AggregateRating"
creators = []
for k in "director", "creator":
if k in ld:
t = [ld[k]] if type(ld[k]) is dict else ld[k]
creators += [p["name"] for p in t if p["@type"] == "Person"]
creators = list({k: None for k in creators}) # remove dupes
info.extracted = Extracted(
title=ld["name"],
original_image=ld["image"],
genres=ld["genre"],
description=ld["description"],
published=date.fromisoformat(ld["datePublished"]),
duration_s=parse_period(ld["duration"]),
rating_value=float(ld["aggregateRating"]["ratingValue"]),
rating_count=ld["aggregateRating"]["ratingCount"],
creators=creators,
)
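# For reference, a trimmed example of the JSON-LD shape the extractor expects
# (all values invented):
# {
#   "@context": "http://schema.org",
#   "@type": "Movie",
#   "name": "Example Movie",
#   "image": "https://m.media-amazon.com/images/M/MV5Babc.jpg",
#   "genre": ["Drama"],
#   "description": "An invented synopsis.",
#   "datePublished": "2020-01-31",
#   "duration": "PT2H13M",
#   "aggregateRating": {"@type": "AggregateRating", "ratingValue": "7.4", "ratingCount": 1234},
#   "director": {"@type": "Person", "name": "Jane Doe"}
# }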
async def handle(message, url, info):
ex = clone(info.extracted)
image_title = f"Poster for {ex.title} ({ex.published:%Y})"
hosted_image = await import_image(
message.app.client,
thumbnail(ex.original_image),
filename=f"{image_title}.jpg",
)
await send_image(
message.app.client,
message.room.room_id,
hosted_image["url"],
description=image_title,
mimetype="image/jpeg",
size=hosted_image["size"],
)
ex.description = strip_tags(ex.description)
escape_all(ex)
details = [
f"🖋 {' '.join(ex.creators[:2])}",
f"{pretty_duration(ex.duration_s)}",
f"⭐️ {ex.rating_value:_.01f} 10 (👤 {ex.rating_count})",
f"🏷 {' '.join(ex.genres)}",
]
lines = [
f"<b>{ex.title}</b> (<b>{ex.published:%Y}</b>)",
f"{', '.join(details)}",
f"<i>{capped_text(ex.description, 500)}</i>",
]
html = "<br>".join(lines)
plain = strip_tags("".join(lines))
await reply(message, plain, html=html)

View file

@@ -7,7 +7,7 @@ from typing import *
import youtube_dl
from ..functions import reply
from ..functions import escape_all, pretty_duration, reply
from ..models import Message
HELP = """Gibt Informationen zu Youtube-Videos aus.
@@ -45,7 +45,7 @@ async def handle(message: Message):
youtube_id = match["id"]
info = load_info(youtube_id, cachetoken())
info.escape_all()
escape_all(info)
details = [
f"🖋 {info.author}",
f"{pretty_duration(info.duration_seconds)}",
@@ -59,15 +59,6 @@ async def handle(message: Message):
await reply(message, html=text)
def pretty_duration(seconds: int) -> str:
hours = seconds // 3600
minutes = (seconds - hours * 3600) // 60
seconds = seconds % 60
return (
f"{hours}h{minutes:02}m{seconds:02}s" if hours else f"{minutes}m{seconds:02}s"
)
class Nolog:
def debug(self, msg):
pass
@@ -95,13 +86,6 @@ class Info:
categories: List[str]
tags: List[str]
def escape_all(self):
for f in fields(self):
if f.type is str:
setattr(self, f.name, escape(getattr(self, f.name)))
elif get_origin(f.type) is list:
setattr(self, f.name, [escape(x) for x in getattr(self, f.name)])
@classmethod
def from_url(cls, url):
info = ytdl.extract_info(url, download=False)

View file

@@ -1,9 +1,11 @@
import locale
import logging
import unicodedata
from collections import defaultdict
from contextlib import contextmanager
from dataclasses import dataclass, fields
from datetime import datetime, timedelta, timezone
from html import escape
from html import escape as html_escape
from html.parser import HTMLParser
from io import StringIO
from typing import *
@@ -18,7 +20,7 @@ tzdb = {
def html_nametag(uid, name):
return f'<a href="https://matrix.to/#/{escape(uid)}">{escape(name)}</a>'
return f'<a href="https://matrix.to/#/{html_escape(uid)}">{html_escape(name)}</a>'
async def reply(
@@ -143,6 +145,67 @@ async def send_message(
log.exception(f"Unable to send message to room: {room_id}")
async def send_image(
client: nio.AsyncClient,
room_id: str,
url: str,
description: str,
*,
width: Optional[int] = None,
height: Optional[int] = None,
size: Optional[int] = None,
mimetype: Optional[str] = None,
thumbnail_url: Optional[str] = None,
thumbnail_width: Optional[int] = None,
thumbnail_height: Optional[int] = None,
thumbnail_size: Optional[int] = None,
thumbnail_mimetype: Optional[str] = None,
) -> nio.RoomSendResponse:
# https://matrix.org/docs/spec/client_server/r0.6.1#m-image
content = defaultdict(
dict,
{
"body": description,
"msgtype": "m.image",
"url": url,
},
)
# Map all image keyword args into the content dict.
kwds = locals()
kwmap = {
"width": "w",
"height": "h",
"size": "size",
"mimetype": "mimetype",
"thumbnail_url": "thumbnail_url",
}
for kwarg, carg in kwmap.items():
if kwds[kwarg] is not None:
content["info"][carg] = kwds[kwarg]
# Map all thumbnail keyword args into the content dict.
kwmap = {
"thumbnail_width": "w",
"thumbnail_height": "h",
"thumbnail_size": "size",
"thumbnail_mimetype": "mimetype",
}
thumbinfo = defaultdict(dict)
for kwarg, carg in kwmap.items():
if kwds[kwarg] is not None:
thumbinfo[carg] = kwds[kwarg]
if thumbinfo:
content["info"]["thumbnail_info"] = thumbinfo
return await client.room_send(
room_id,
"m.room.message",
content,
ignore_unverified_devices=True,
)
@contextmanager
def localized(lc: str, category=locale.LC_ALL):
locale.setlocale(category, lc)
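
A minimal usage sketch for the new send_image helper; the room ID and MXC content URI below are placeholders (the URI would come from a prior client.upload, as in the IMDb module above):

await send_image(
    client,
    "!room:example.org",
    "mxc://example.org/abcdef",
    description="Poster for Example Movie (2020)",
    mimetype="image/jpeg",
    size=34_567,
)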
@@ -179,3 +242,81 @@ def strip_tags(html):
def clamp(lower, x, upper):
return max(lower, min(x, upper))
def pretty_duration(seconds: int) -> str:
hours = seconds // 3600
minutes = (seconds - hours * 3600) // 60
seconds = seconds % 60
# full: 1h 23m 13s
# 0 seconds: 1h 23m
# 0 hours: 23m 13s
# 0 hours 0 seconds: 23m 00s
parts = {}
if hours:
parts["h"] = f"{hours}h"
parts["m"] = f"{minutes:02}m"
if seconds or not hours:
parts["s"] = f"{seconds:02}s"
return " ".join(parts.values())
def capped_text(text: str, max_len: int, mark=" […]") -> str:
if len(text) <= max_len:
return text
capped = ""
for word in text.split(" "):
if len(capped + f" {word}") > max_len - len(mark):
capped += mark
break
capped += f" {word}"
return capped
class ElementParser(HTMLParser):
"""Parse HTML for the first matching element"""
def __init__(self, selector: Callable[[str, Mapping[str, str]], bool]):
super().__init__()
self.selector = selector
self.__active_tag = None
self.__done = False
self.__value = ""
def handle_starttag(self, tag, attrs):
if self.selector(tag, attrs):
self.__active_tag = tag
def handle_endtag(self, tag):
if tag == self.__active_tag:
self.__done = True
self.__active_tag = None
def handle_data(self, data):
if self.__active_tag and not self.__done:
self.__value += data
@property
def value(self) -> Optional[str]:
return self.__value if self.__done else None
def load_chunks(self, content: Iterable[str]) -> None:
for chunk in content:
self.feed(chunk)
if self.__done:
break
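# Example, mirroring how title() in the urlinfo plugin uses this; a tag split
# across chunks is still found:
# p = ElementParser(lambda tag, attrs: tag == "title")
# p.load_chunks(["<html><head><tit", "le>Hello</title></head>"])
# p.value  # -> "Hello"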
def escape_all(dc: dataclass, escape: Callable[[str], str] = html_escape) -> None:
"""Patch a dataclass to escape all strings."""
for f in fields(dc):
if f.type is str:
setattr(dc, f.name, escape(getattr(dc, f.name)))
elif get_origin(f.type) is list and get_args(f.type)[0] is str:
setattr(dc, f.name, [escape(x) for x in getattr(dc, f.name)])
elif get_origin(f.type) is dict and get_args(f.type)[1] is str:
setattr(dc, f.name, {k: escape(v) for k, v in getattr(dc, f.name).items()})
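
A short usage sketch for escape_all with a hypothetical dataclass; outputs follow from the logic above:

from dataclasses import dataclass
from typing import List

@dataclass
class Post:
    title: str
    tags: List[str]

p = Post(title="<b>hi</b>", tags=["<i>"])
escape_all(p)
p.title  # -> "&lt;b&gt;hi&lt;/b&gt;"
p.tags  # -> ["&lt;i&gt;"]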