urlinfo: allow sub-modules and add module for IMDb movies

The urlinfo plugin is now set up to look up URL information for any URL
occurring in text, not only when triggered explicitly as a command.
The youtube plugin should probably be integrated into this setup,
replacing the bot plugin with a urlinfo extension.
ducklet 2020-11-07 20:35:52 +01:00
parent 81a176eb0c
commit efc6ecbb45
5 changed files with 460 additions and 114 deletions

View file

@@ -6,7 +6,7 @@ from html import escape
import feeder
import postillon
from ..functions import clamp, localizedtz, reply, send_message, strip_tags
from ..functions import capped_text, clamp, localizedtz, reply, send_message, strip_tags
from ..models import Job, Message
log = logging.getLogger(__name__)
@@ -121,11 +121,5 @@ def post_as_html(post, tzname: str, lc: str, *, max_content_len: int = 300):
if post.content and max_content_len > 0:
if parts:
parts.append("")
content = ""
for word in strip_tags(post.content).split(" "):
if len(content + f" {word}") > max_content_len - 3:
content += " […]"
break
content += f" {word}"
parts.append(escape(content))
parts.append(escape(capped_text(strip_tags(post.content), max_content_len)))
return " ".join(parts)

View file

@@ -1,6 +1,6 @@
import codecs
import re
from functools import lru_cache
from dataclasses import dataclass
from html import escape
from html.parser import HTMLParser
from random import randint
@@ -9,8 +9,9 @@ from typing import *
import requests
from ..functions import reply
from ..functions import ElementParser, reply
from ..models import Message
from .urlinfo_ import imdb # XXX make this dynamic? (like we load plugins)
HELP = """Return information about an online HTTP resource.
!u[rl] <url>
@@ -18,43 +19,15 @@ HELP = """Return information about an online HTTP resource.
def init(bot):
bot.on_command({"u", "url"}, handle)
bot.on_message(handle)
match_url = re.compile(
is_url = re.compile(
# r"https?://(?:[a-zA-Z]|[0-9]|[$-_~@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+"
r"https?://\S+"
r"(https?://|www\.)\S+"
).fullmatch
class TitleParser(HTMLParser):
"""Parse the first <title> from HTML"""
# XXX check if it's the <head>'s title we're in, but beware that head can be implicit
def __init__(self):
super().__init__()
self.__is_title = False
self.__found = False
self.__title = ""
def handle_starttag(self, tag, attrs):
if tag == "title":
self.__is_title = True
def handle_endtag(self, tag):
if tag == "title":
self.__found = True
self.__is_title = False
def handle_data(self, data):
if self.__is_title and not self.__found:
self.__title += data
@property
def title(self) -> Optional[str]:
return self.__title if self.__found else None
def get_encodings_from_content(content: str) -> List[str]:
"""Returns encodings from given content string."""
@@ -89,15 +62,12 @@ def stream_decode_response_unicode(content: Iterable[bytes]) -> Iterable[str]:
def title(content: Iterable[str]) -> Optional[str]:
t = TitleParser()
for chunk in content:
t.feed(chunk)
if t.title is not None:
break
return t.title
t = ElementParser(lambda tag, attrs: tag == "title")
t.load_chunks(content)
return t.value
def capped(content: Iterable[str], read_max: int) -> Iterable[str]:
def capped(content: Iterable[AnyStr], read_max: int) -> Iterable[AnyStr]:
read = 0
for chunk in content:
read += len(chunk)
@@ -106,9 +76,44 @@ def capped(content: Iterable[str], read_max: int) -> Iterable[str]:
break
@lru_cache(maxsize=5)
def load_info(url: str, cachetoken) -> Optional[Mapping[str, Any]]:
@dataclass
class Info:
"""Information extracted by loading a URL.
This information can/will be cached for successive lookups of the same URL.
When the info object is handed to the extractor function _resp and _chunks
will be set. Prior to commiting the info object to cache these references
will be removed.
"""
code: int
final_url: str
elapsed_ms: int
reason: str
content_type: Optional[str]
size: Optional[int]
filename: Optional[str]
extracted: Optional[Any] = None
_resp: Optional[requests.Response] = None
_chunks_str: Optional[Iterable[str]] = None
_chunks_bytes: Optional[Iterable[bytes]] = None
# XXX can't use lru_cache with async funcs
# TODO: create lru_cache that supports async and timeout
_load_info_cache = {}
async def load_info(
url: str, extractor: Callable[[Info], Awaitable[None]], cachetoken
) -> Optional[Info]:
"""The cachetoken is just there to bust the LRU cache after a while."""
cachekey = (url, cachetoken)
if cachekey in _load_info_cache:
return _load_info_cache[cachekey]
try:
r = requests.get(
url,
@@ -119,18 +124,6 @@ def load_info(url: str, cachetoken) -> Optional[Mapping[str, Any]]:
except Exception:
return None
content_type = r.headers.get("Content-Type", "")
is_html = content_type.startswith("text/html") or url.lower().endswith(
(".html", ".htm")
)
if is_html:
one_kb = 2 ** 10
# chunks = r.iter_content(chunk_size=30 * one_kb, decode_unicode=True)
chunks = stream_decode_response_unicode(r.iter_content(chunk_size=30 * one_kb))
html_title = title(capped(chunks, read_max=200 * one_kb))
else:
html_title = None
filename = None
dispo = r.headers.get("Content-Disposition", "").split(";")
if len(dispo) == 2 and dispo[0] == "attachment":
@@ -138,18 +131,33 @@ def load_info(url: str, cachetoken) -> Optional[Mapping[str, Any]]:
if len(dispo) == 2 and dispo[0] == "filename":
filename = dispo[1].strip()
return {
"code": r.status_code,
"url": r.url,
"elapsed_ms": int(r.elapsed.total_seconds() * 1_000),
"reason": r.reason,
"type": r.headers.get("Content-Type"),
"size": (
one_kb = 2 ** 10
chunks = capped(r.iter_content(chunk_size=30 * one_kb), read_max=200 * one_kb)
info = Info(
code=r.status_code,
final_url=r.url,
elapsed_ms=int(r.elapsed.total_seconds() * 1_000),
reason=r.reason,
content_type=r.headers.get("Content-Type"),
size=(
int(r.headers["Content-Length"]) if "Content-Length" in r.headers else None
),
"title": html_title,
"filename": filename,
}
filename=filename,
_resp=r,
_chunks_str=stream_decode_response_unicode(chunks),
_chunks_bytes=chunks,
)
await extractor(info)
# Remove all references to the Response before the info is committed to the cache.
info._resp = None
info._chunks_str = None
info._chunks_bytes = None
_load_info_cache[cachekey] = info
return _load_info_cache[cachekey]
def cachetoken(quant_m=15):
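
The body of cachetoken is unchanged by this commit and not shown in the diff; per its use in load_info it only needs to yield a new value every few minutes to bust the cache. A plausible implementation, for illustration only:

from datetime import datetime

def cachetoken(quant_m=15):
    # Quantize the current time into quant_m-minute buckets.
    now = datetime.utcnow()
    return now.replace(minute=now.minute - now.minute % quant_m, second=0, microsecond=0)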
@@ -168,32 +176,71 @@ def pretty_size(size: int) -> str:
return f"{size:_.02f} {q}B"
async def handle(message: Message):
url = message.args.str(0)
if not match_url(url):
return
info = load_info(url, cachetoken())
if not info:
return
async def generic_handler(message: Message, url: str, info: Info):
details = []
if info["type"]:
details.append(f"<i>Media type</i>: {escape(info['type'])}")
if info["size"]:
details.append(f"<i>Size</i>: {pretty_size(info['size'])}")
details.append(f"<i>Status</i>: {info['code']}")
if info["reason"]:
details[-1] += f" ({escape(info['reason'])})"
if info["url"] != url:
if info.content_type:
details.append(f"<i>Media type</i>: {escape(info.content_type)}")
if info.size:
details.append(f"<i>Size</i>: {pretty_size(info.size)}")
details.append(f"<i>Status</i>: {info.code}")
if info.reason:
details[-1] += f" ({escape(info.reason)})"
if info.final_url != url:
details.append(
f"""<i>Redirected to</i>: <a href="{escape(info['url'])}">{escape(info['url'])}</a>"""
f"""<i>Redirected to</i>: <a href="{escape(info.final_url)}">{escape(info.final_url)}</a>"""
)
if info["filename"] and info["filename"] != url.rsplit("/", 2)[-1]:
details.append(f"<i>Filename</i>: {escape(info['filename'])}")
details.append(f"<i>TTFB</i>: {info['elapsed_ms']:_} ms")
if info.filename and info.filename != url.rsplit("/", 2)[-1]:
details.append(f"<i>Filename</i>: {escape(info.filename)}")
details.append(f"<i>TTFB</i>: {info.elapsed_ms:_} ms")
text = f"<b>{escape(info['title'])}</b> — " if info["title"] else ""
text = (
f"<b>{escape(info.extracted['title'])}</b> — "
if info.extracted["title"]
else ""
)
text += "; ".join(details)
await reply(message, html=text, in_thread=True)
async def generic_extractor(info: Info):
content_type = info._resp.headers.get("Content-Type", "")
is_html = content_type.startswith("text/html") or info.final_url.lower().endswith(
(".html", ".htm")
)
info.extracted = {"title": title(info._chunks_str) if is_html else None}
def full_url(ref: str) -> str:
return f"http://{ref}" if ref.startswith("www") else ref
async def handle(message: Message):
if message.command and message.command not in {"u", "url"}:
return
limit = 3
urls = [full_url(w) for w in message.words if is_url(w)][:limit]
if not urls:
return
handlers = (imdb,)
for url in urls:
for handler in handlers:
if handler.can_handle(url):
extractor = handler.extractor
handler = handler.handle
break
else:
# We only want the generic handler if we were called explicitly.
if not message.command:
continue
handler = generic_handler
extractor = generic_extractor
info = await load_info(url, extractor, cachetoken())
if not info:
continue
await handler(message, url, info)
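
The loop above fixes the contract for urlinfo sub-modules: a can_handle(url) predicate, an async extractor(info) that runs while the response and chunk iterators on the Info are still live, and an async handle(message, url, info) that renders the (possibly cached) result. A skeletal sub-module following that contract; module name and URL pattern are made up:

# plugins/urlinfo_/example.py (hypothetical)
import re

can_handle = re.compile(r"https://example\.org/\S+").fullmatch

async def extractor(info):
    # Runs once per cache entry; info._resp and info._chunks_str are still
    # set here and are stripped before the Info object is cached.
    info.extracted = {"note": "parsed from the live response"}

async def handle(message, url, info):
    # Runs on every lookup, including cache hits.
    ...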

View file

@@ -0,0 +1,180 @@
import io
import json
import re
from dataclasses import dataclass, fields
from dataclasses import replace as clone
from datetime import date
from html import escape
from html.parser import HTMLParser
from pathlib import Path
from typing import *
from urllib.parse import urlparse
import requests
from ...functions import (
ElementParser,
capped_text,
escape_all,
pretty_duration,
reply,
send_image,
strip_tags,
)
from ...models import Message
# https://www.imdb.com/title/tt13399862/
can_handle = re.compile(r"https://www\.imdb\.com/title/(?P<id>tt\d+)/").fullmatch
def thumbnail(url, width=182, height=268):
"""Return a thumbnail URL for the given IMDb image URL.
The default settings are what IMDb currently uses for desktop display.
"""
resize = f"UY{height}" # there's also 'UX' to resize on width
offset = "2,0" # by setting non-0 for the first value the image is fitted
crop = f"CR{offset},{width},{height}"
al = "AL" # not sure what this is, doesn't seem to do much but they use it.
variant = "_".join((resize, crop, al))
parts = urlparse(url)
path = Path(parts.path)
# path.with_stem(f"{path.stem}_{variant}") XXX py3.9
path = path.with_name(f"{path.stem}_{variant}{path.suffix}")
return parts._replace(path=str(path)).geturl()
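# Example with a hypothetical image URL (real IMDb image URLs have this shape):
# thumbnail("https://m.media-amazon.com/images/M/MV5Babc.jpg")
# -> "https://m.media-amazon.com/images/M/MV5Babc_UY268_CR2,0,182,268_AL.jpg"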
period_re = re.compile(
r"P((?P<year>\d+)Y)?((?P<month>\d+)M)?((?P<day>\d+)D)?T((?P<hour>\d+)H)?((?P<minute>\d+)M)?((?P<second>\d+)S)?"
)
def parse_period(s: str) -> int:
# see https://en.wikipedia.org/wiki/ISO_8601#Durations
seconds = {
"year": 365 * 86400,
"month": 30 * 86400,
"day": 86400,
"hour": 3600,
"minute": 60,
"second": 1,
}
if not (match := period_re.fullmatch(s)):
return 0
return sum(seconds[k] * int(v) for k, v in match.groupdict().items() if v)
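# Sanity check: IMDb encodes runtimes as ISO 8601 durations, e.g.
# parse_period("PT2H13M")  # -> 2 * 3600 + 13 * 60 == 7980
# parse_period("bogus")  # -> 0 (no match)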
_import_image_cache = (
{}
) # XXX ideally we'd cache these forever (in some permanent storage)
async def import_image(client, url, filename=None):
if url in _import_image_cache:
return _import_image_cache[url]
r = requests.get(
url,
# stream=True,
timeout=(3, 3),
headers={"user-agent": "hotdog/v1 urlinfo"},
)
size = len(r.content)
uploaded, keys = await client.upload(
# io.BufferedReader(r.raw),
io.BufferedReader(io.BytesIO(r.content)),
content_type="image/jpeg",
filename=filename,
filesize=size,
)
_import_image_cache[url] = {
"size": size,
"url": uploaded.content_uri,
}
return _import_image_cache[url]
@dataclass
class Extracted:
title: str
original_image: str
genres: List[str]
description: str
published: date
duration_s: int
rating_value: float
rating_count: int
creators: List[str]
async def extractor(info):
parser = ElementParser(
lambda tag, attrs: (
tag == "script" and dict(attrs).get("type") == "application/ld+json"
)
)
parser.load_chunks(info._chunks_str)
if not parser.value:
return
ld = json.loads(parser.value)
assert ld["@context"] == "http://schema.org" and ld["@type"] == "Movie"
assert ld["aggregateRating"]["@type"] == "AggregateRating"
creators = []
for k in "director", "creator":
if k in ld:
t = [ld[k]] if type(ld[k]) is dict else ld[k]
creators += [p["name"] for p in t if p["@type"] == "Person"]
creators = list({k: None for k in creators}) # remove dupes
info.extracted = Extracted(
title=ld["name"],
original_image=ld["image"],
genres=ld["genre"],
description=ld["description"],
published=date.fromisoformat(ld["datePublished"]),
duration_s=parse_period(ld["duration"]),
rating_value=float(ld["aggregateRating"]["ratingValue"]),
rating_count=ld["aggregateRating"]["ratingCount"],
creators=creators,
)
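# For reference, a trimmed example of the JSON-LD shape the extractor expects
# (all values invented):
# {
#   "@context": "http://schema.org",
#   "@type": "Movie",
#   "name": "Example Movie",
#   "image": "https://m.media-amazon.com/images/M/MV5Babc.jpg",
#   "genre": ["Drama"],
#   "description": "An invented synopsis.",
#   "datePublished": "2020-01-31",
#   "duration": "PT2H13M",
#   "aggregateRating": {"@type": "AggregateRating", "ratingValue": "7.4", "ratingCount": 1234},
#   "director": {"@type": "Person", "name": "Jane Doe"}
# }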
async def handle(message, url, info):
ex = clone(info.extracted)
image_title = f"Poster for {ex.title} ({ex.published:%Y})"
hosted_image = await import_image(
message.app.client,
thumbnail(ex.original_image),
filename=f"{image_title}.jpg",
)
await send_image(
message.app.client,
message.room.room_id,
hosted_image["url"],
description=image_title,
mimetype="image/jpeg",
size=hosted_image["size"],
)
ex.description = strip_tags(ex.description)
escape_all(ex)
details = [
f"🖋 {' '.join(ex.creators[:2])}",
f"{pretty_duration(ex.duration_s)}",
f"⭐️ {ex.rating_value:_.01f} 10 (👤 {ex.rating_count})",
f"🏷 {' '.join(ex.genres)}",
]
lines = [
f"<b>{ex.title}</b> (<b>{ex.published:%Y}</b>)",
f"{', '.join(details)}",
f"<i>{capped_text(ex.description, 500)}</i>",
]
html = "<br>".join(lines)
plain = strip_tags("".join(lines))
await reply(message, plain, html=html)

View file

@@ -7,7 +7,7 @@ from typing import *
import youtube_dl
from ..functions import reply
from ..functions import escape_all, pretty_duration, reply
from ..models import Message
HELP = """Gibt Informationen zu Youtube-Videos aus.
@@ -45,7 +45,7 @@ async def handle(message: Message):
youtube_id = match["id"]
info = load_info(youtube_id, cachetoken())
info.escape_all()
escape_all(info)
details = [
f"🖋 {info.author}",
f"{pretty_duration(info.duration_seconds)}",
@@ -59,15 +59,6 @@ async def handle(message: Message):
await reply(message, html=text)
def pretty_duration(seconds: int) -> str:
hours = seconds // 3600
minutes = (seconds - hours * 3600) // 60
seconds = seconds % 60
return (
f"{hours}h{minutes:02}m{seconds:02}s" if hours else f"{minutes}m{seconds:02}s"
)
class Nolog:
def debug(self, msg):
pass
@@ -95,13 +86,6 @@ class Info:
categories: List[str]
tags: List[str]
def escape_all(self):
for f in fields(self):
if f.type is str:
setattr(self, f.name, escape(getattr(self, f.name)))
elif get_origin(f.type) is list:
setattr(self, f.name, [escape(x) for x in getattr(self, f.name)])
@classmethod
def from_url(cls, url):
info = ytdl.extract_info(url, download=False)

View file

@@ -1,9 +1,11 @@
import locale
import logging
import unicodedata
from collections import defaultdict
from contextlib import contextmanager
from dataclasses import dataclass, fields
from datetime import datetime, timedelta, timezone
from html import escape
from html import escape as html_escape
from html.parser import HTMLParser
from io import StringIO
from typing import *
@@ -18,7 +20,7 @@ tzdb = {
def html_nametag(uid, name):
return f'<a href="https://matrix.to/#/{escape(uid)}">{escape(name)}</a>'
return f'<a href="https://matrix.to/#/{html_escape(uid)}">{html_escape(name)}</a>'
async def reply(
@@ -143,6 +145,67 @@ async def send_message(
log.exception(f"Unable to send message to room: {room_id}")
async def send_image(
client: nio.AsyncClient,
room_id: str,
url: str,
description: str,
*,
width: Optional[int] = None,
height: Optional[int] = None,
size: Optional[int] = None,
mimetype: Optional[str] = None,
thumbnail_url: Optional[str] = None,
thumbnail_width: Optional[int] = None,
thumbnail_height: Optional[int] = None,
thumbnail_size: Optional[int] = None,
thumbnail_mimetype: Optional[str] = None,
) -> nio.RoomSendResponse:
# https://matrix.org/docs/spec/client_server/r0.6.1#m-image
content = defaultdict(
dict,
{
"body": description,
"msgtype": "m.image",
"url": url,
},
)
# Map all image keyword args into the content dict.
kwds = locals()
kwmap = {
"width": "w",
"height": "h",
"size": "size",
"mimetype": "mimetype",
"thumbnail_url": "thumbnail_url",
}
for kwarg, carg in kwmap.items():
if kwds[kwarg] is not None:
content["info"][carg] = kwds[kwarg]
# Map all thumbnail keyword args into the content dict.
kwmap = {
"thumbnail_width": "w",
"thumbnail_height": "h",
"thumbnail_size": "size",
"thumbnail_mimetype": "mimetype",
}
thumbinfo = defaultdict(dict)
for kwarg, carg in kwmap.items():
if kwds[kwarg] is not None:
thumbinfo[carg] = kwds[kwarg]
if thumbinfo:
content["info"]["thumbnail_info"] = thumbinfo
return await client.room_send(
room_id,
"m.room.message",
content,
ignore_unverified_devices=True,
)
@contextmanager
def localized(lc: str, category=locale.LC_ALL):
locale.setlocale(category, lc)
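
A minimal usage sketch for the new send_image helper; the room ID and MXC content URI below are placeholders (the URI would come from a prior client.upload, as in the IMDb module above):

await send_image(
    client,
    "!room:example.org",
    "mxc://example.org/abcdef",
    description="Poster for Example Movie (2020)",
    mimetype="image/jpeg",
    size=34_567,
)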
@@ -179,3 +242,81 @@ def strip_tags(html):
def clamp(lower, x, upper):
return max(lower, min(x, upper))
def pretty_duration(seconds: int) -> str:
hours = seconds // 3600
minutes = (seconds - hours * 3600) // 60
seconds = seconds % 60
# full: 1h 23m 13s
# 0 seconds: 1h 23m
# 0 hours: 23m 13s
# 0 hours 0 seconds: 23m 00s
parts = {}
if hours:
parts["h"] = f"{hours}h"
parts["m"] = f"{minutes:02}m"
if seconds or not hours:
parts["s"] = f"{seconds:02}s"
return " ".join(parts.values())
def capped_text(text: str, max_len: int, mark=" […]") -> str:
if len(text) <= max_len:
return text
capped = ""
for word in text.split(" "):
if len(capped + f" {word}") > max_len - len(mark):
capped += mark
break
capped += f" {word}"
return capped
class ElementParser(HTMLParser):
"""Parse HTML for the first matching element"""
def __init__(self, selector: Callable[[str, Mapping[str, str]], bool]):
super().__init__()
self.selector = selector
self.__active_tag = None
self.__done = False
self.__value = ""
def handle_starttag(self, tag, attrs):
if self.selector(tag, attrs):
self.__active_tag = tag
def handle_endtag(self, tag):
if tag == self.__active_tag:
self.__done = True
self.__active_tag = None
def handle_data(self, data):
if self.__active_tag and not self.__done:
self.__value += data
@property
def value(self) -> Optional[str]:
return self.__value if self.__done else None
def load_chunks(self, content: Iterable[str]) -> None:
for chunk in content:
self.feed(chunk)
if self.__done:
break
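# Example, mirroring how title() in the urlinfo plugin uses this; a tag split
# across chunks is still found:
# p = ElementParser(lambda tag, attrs: tag == "title")
# p.load_chunks(["<html><head><tit", "le>Hello</title></head>"])
# p.value  # -> "Hello"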
def escape_all(dc: dataclass, escape: Callable[[str], str] = html_escape) -> None:
"""Patch a dataclass to escape all strings."""
for f in fields(dc):
if f.type is str:
setattr(dc, f.name, escape(getattr(dc, f.name)))
elif get_origin(f.type) is list and get_args(f.type)[0] is str:
setattr(dc, f.name, [escape(x) for x in getattr(dc, f.name)])
elif get_origin(f.type) is dict and get_args(f.type)[1] is str:
setattr(dc, f.name, {k: escape(v) for k, v in getattr(dc, f.name).items()})
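
A short usage sketch for escape_all with a hypothetical dataclass; outputs follow from the logic above:

from dataclasses import dataclass
from typing import List

@dataclass
class Post:
    title: str
    tags: List[str]

p = Post(title="<b>hi</b>", tags=["<i>"])
escape_all(p)
p.title  # -> "&lt;b&gt;hi&lt;/b&gt;"
p.tags  # -> ["&lt;i&gt;"]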