hotdog/hotdog/command/urlinfo_/imdb.py

187 lines
5.1 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

import io
import json
import re
from dataclasses import dataclass
from dataclasses import replace as clone
from datetime import date
from pathlib import Path
from typing import *
from urllib.parse import urlparse
import requests
from ...functions import (
ElementParser,
capped_text,
escape_all,
pretty_duration,
reply,
send_image,
strip_tags,
)
from ...models import Message
# Matches canonical IMDb title URLs such as
# https://www.imdb.com/title/tt13399862/
# The dots of the host name are escaped so they only match a literal ".";
# unescaped they would also accept look-alike hosts like "wwwXimdb.com".
can_handle = re.compile(r"https://www\.imdb\.com/title/(?P<id>tt\d+)/").fullmatch
def thumbnail(url, width=182, height=268):
    """Return a thumbnail URL for the given IMDb image URL.

    The resize/crop instructions are appended to the stem of the last path
    segment, right before the file extension.  The default settings are what
    IMDb currently uses for desktop display.

    Args:
        url: URL of the original image.
        width: Width of the crop box.
        height: Height used for both the resize and the crop box.

    Returns:
        The URL with the variant suffix inserted into its file name; all
        other URL components are left as they are.
    """
    # URL paths always use "/" as separator, so a pure POSIX path is used
    # instead of the platform-dependent `Path`, which would render the path
    # with backslashes on Windows.
    from pathlib import PurePosixPath

    resize = f"UY{height}"  # there's also 'UX' to resize on width
    offset = "0,0"
    crop = f"CR{offset},{width},{height}"
    al = "AL"  # not sure what this is, doesn't seem to do much but they use it.
    variant = "_".join((resize, crop, al))

    parts = urlparse(url)
    path = PurePosixPath(parts.path)
    # Same as path.with_stem(f"{path.stem}_{variant}"), which needs py3.9.
    path = path.with_name(f"{path.stem}_{variant}{path.suffix}")
    return parts._replace(path=str(path)).geturl()
# ISO 8601 duration such as "PT1H30M" or "P1DT12H".  The time part (introduced
# by the "T" designator) is optional as a whole, so that date-only durations
# like "P3D" or "P2W" are recognised as well.
period_re = re.compile(
    r"P((?P<year>\d+)Y)?((?P<month>\d+)M)?((?P<week>\d+)W)?((?P<day>\d+)D)?"
    r"(T((?P<hour>\d+)H)?((?P<minute>\d+)M)?((?P<second>\d+)S)?)?"
)

# Length of each duration component in seconds.  Years and months have no
# fixed length; they are approximated as 365 and 30 days.
_PERIOD_UNIT_SECONDS = {
    "year": 365 * 86400,
    "month": 30 * 86400,
    "week": 7 * 86400,
    "day": 86400,
    "hour": 3600,
    "minute": 60,
    "second": 1,
}


def parse_period(s: str) -> int:
    """Return the length of the ISO 8601 duration *s* in seconds.

    Only integer components are supported.  A string that is not a duration
    in the supported format yields 0.

    see https://en.wikipedia.org/wiki/ISO_8601#Durations
    """
    if not (match := period_re.fullmatch(s)):
        return 0
    # Components missing from the string come back as None and are skipped.
    return sum(
        _PERIOD_UNIT_SECONDS[unit] * int(amount)
        for unit, amount in match.groupdict().items()
        if amount
    )
# Maps a source image URL to {"size": <byte count>, "url": <uploaded URI>}.
# XXX ideally we'd cache these forever (in some permanent storage)
_import_image_cache = {}


async def import_image(client, url, filename=None):
    """Download the image at *url* and re-upload it through *client*.

    Results are memoised per source URL in `_import_image_cache` for the
    lifetime of the process, so each image is fetched and uploaded only once.

    Args:
        client: Object providing an awaitable ``upload()`` method.
        url: Location of the image to download.
        filename: Optional file name passed on to the upload.

    Returns:
        A dict with the keys ``"size"`` (length of the downloaded body in
        bytes) and ``"url"`` (``content_uri`` of the upload response).

    Raises:
        requests.RequestException: If the download fails, times out, or the
            server answers with an HTTP error status.
    """
    import asyncio

    if url in _import_image_cache:
        return _import_image_cache[url]

    # `requests` is blocking: run it in the default executor so the event
    # loop is not stalled for the duration of the download.
    loop = asyncio.get_running_loop()
    r = await loop.run_in_executor(
        None,
        lambda: requests.get(
            url,
            timeout=(3, 3),
            headers={"user-agent": "hotdog/v1 urlinfo"},
        ),
    )
    # Fail on 4xx/5xx instead of uploading (and caching) an error page as if
    # it were the image.
    r.raise_for_status()

    # The whole body is read into memory (no streaming via `r.raw`), which
    # also provides the size reported as `filesize`.
    size = len(r.content)
    uploaded, _ = await client.upload(
        io.BufferedReader(io.BytesIO(r.content)),
        content_type="image/jpeg",
        filename=filename,
        filesize=size,
    )
    _import_image_cache[url] = {
        "size": size,
        "url": uploaded.content_uri,
    }
    return _import_image_cache[url]
@dataclass
class Extracted:
    """Metadata collected for a single IMDb title.

    The first six fields must always be supplied; the duration and the
    rating details are optional and default to ``None``.
    """

    # -- required --
    title: str
    original_image: str  # image URL as given by the page
    genres: List[str]
    description: str
    published: date
    creators: List[str]

    # -- optional --
    duration_s: Optional[int] = None  # running time in seconds
    rating_value: Optional[float] = None
    rating_count: Optional[int] = None
async def extractor(info):
    """Populate ``info.extracted`` from the page's schema.org JSON-LD data.

    A ``<script type="application/ld+json">`` element is looked up in
    ``info._chunks_str`` with `ElementParser`.  If the parser yields no
    value, the function returns without setting ``info.extracted``.

    Args:
        info: Object carrying the page content in ``_chunks_str``; receives
            the result in its ``extracted`` attribute.

    Raises:
        AssertionError: If the JSON-LD context or type is not one of the
            expected values.
        KeyError: If a mandatory JSON-LD property is missing.
        ValueError: If the JSON or the publication date cannot be parsed.
    """
    parser = ElementParser(
        lambda tag, attrs: (
            tag == "script" and dict(attrs).get("type") == "application/ld+json"
        )
    )
    parser.load_chunks(info._chunks_str)
    if not parser.value:
        return

    ld = json.loads(parser.value)

    # Validation uses explicit raises rather than `assert` statements, which
    # are stripped when Python runs with -O.  AssertionError is kept so that
    # callers see the same exception type as before.
    if ld["@context"] not in ("http://schema.org", "https://schema.org"):
        raise AssertionError(f"unexpected JSON-LD context: {ld['@context']!r}")
    if ld["@type"] not in ("Movie", "CreativeWork"):
        raise AssertionError(f"unexpected JSON-LD type: {ld['@type']!r}")

    creators = []
    for key in ("director", "creator"):
        if key not in ld:
            continue
        # The property holds either a single object or a list of objects.
        entries = [ld[key]] if isinstance(ld[key], dict) else ld[key]
        # Only persons are of interest; entries of other (or no) type are
        # skipped.
        creators += [p["name"] for p in entries if p.get("@type") == "Person"]
    creators = list(dict.fromkeys(creators))  # remove dupes, keep order

    # A single genre may be given as a plain string instead of a list.
    genres = ld["genre"]
    if isinstance(genres, str):
        genres = [genres]

    info.extracted = Extracted(
        title=ld["name"],
        original_image=ld["image"],
        genres=genres,
        description=ld["description"],
        published=date.fromisoformat(ld["datePublished"]),
        creators=creators,
    )

    if "duration" in ld:
        info.extracted.duration_s = parse_period(ld["duration"])

    if "aggregateRating" in ld:
        rating = ld["aggregateRating"]
        if rating["@type"] != "AggregateRating":
            raise AssertionError(f"unexpected rating type: {rating['@type']!r}")
        # Coerce both numbers so the fields match their declared types even
        # when the values are encoded as strings.
        info.extracted.rating_value = float(rating["ratingValue"])
        info.extracted.rating_count = int(rating["ratingCount"])
async def handle(message, url, info):
    """Post the poster image and a summary for the title in ``info.extracted``.

    First a thumbnail of the title's image is imported and sent to the room
    of *message*, then a reply with title, year, details and description is
    sent, as HTML plus a plain-text fallback.

    Args:
        message: The message being answered; its ``app.client`` and
            ``room.room_id`` are used for sending.
        url: The URL that triggered the handler (not used here).
        info: Object whose ``extracted`` attribute holds the `Extracted`
            data for the title.
    """
    # Work on a copy so that reassigning fields below leaves info.extracted
    # untouched.  NOTE(review): the copy is shallow, list fields are shared;
    # confirm `escape_all` does not mutate them in place.
    ex = clone(info.extracted)

    image_title = f"Poster for {ex.title} ({ex.published:%Y})"
    hosted_image = await import_image(
        message.app.client,
        thumbnail(ex.original_image),
        filename=f"{image_title}.jpg",
    )
    await send_image(
        message.app.client,
        message.room.room_id,
        hosted_image["url"],
        description=image_title,
        mimetype="image/jpeg",
        size=hosted_image["size"],
    )

    ex.description = strip_tags(ex.description)
    # Presumably escapes the fields of `ex` in place for use in the HTML
    # below (its return value is not used) - verify against its definition.
    escape_all(ex)

    details = [
        f"🖋 {' '.join(ex.creators[:2])}",
    ]
    if ex.duration_s:
        details.append(f"{pretty_duration(ex.duration_s)}")
    if ex.rating_count:
        # NOTE(review): this renders like "7.5 10"; a separator between the
        # value and the scale may be missing - confirm the intended output.
        details.append(f"⭐️ {ex.rating_value:_.01f} 10 (👤 {ex.rating_count})")
    details.append(f"🏷 {' '.join(ex.genres[:3])}")

    lines = [
        f"<b>{ex.title}</b> (<b>{ex.published:%Y}</b>)",
        ", ".join(details),
        f"<i>{capped_text(ex.description, 500)}</i>",
    ]
    html = "<br>".join(lines)
    # The plain-text fallback needs line breaks too; joining with "" would
    # run title, details and description together.
    plain = strip_tags("\n".join(lines))
    await reply(message, plain, html=html)