hotdog/hotdog/command/urlinfo_/imdb.py

import io
import json
import re
from dataclasses import dataclass
from dataclasses import replace as clone
from datetime import date
from pathlib import Path
from typing import *
from urllib.parse import urlparse

import requests

from ...functions import (
    ElementParser,
    capped_text,
    escape_all,
    pretty_duration,
    reply,
    send_image,
    strip_tags,
)
from ...models import Message

# Matches canonical IMDb title URLs, e.g. https://www.imdb.com/title/tt13399862/
# and exposes the title id as the named group "id".  The dots of the host name
# are escaped so that look-alike hosts (e.g. "wwwximdb.com") are rejected.
can_handle = re.compile(r"https://www\.imdb\.com/title/(?P<id>tt\d+)/").fullmatch


def thumbnail(url: str, width: int = 182, height: int = 268) -> str:
    """Return a thumbnail URL for the given IMDb image URL.

    The variant string ``UY<height>_CR0,0,<width>,<height>_AL`` is inserted
    after the stem of the last path component, in front of the file
    extension; every other part of the URL is kept unchanged.

    The default settings are what IMDb currently uses for desktop display.
    """
    # Function-scope import: the module only imports ``Path`` at file level.
    from pathlib import PurePosixPath

    resize = f"UY{height}"  # there's also 'UX' to resize on width
    offset = "0,0"
    crop = f"CR{offset},{width},{height}"
    al = "AL"  # not sure what this is, doesn't seem to do much but they use it.

    variant = "_".join((resize, crop, al))

    parts = urlparse(url)
    # URL paths always use "/" as separator.  PurePosixPath keeps it that way
    # on every platform, whereas the OS-dependent Path would emit backslashes
    # on Windows and break the resulting URL.
    path = PurePosixPath(parts.path)
    # XXX from py3.9 on this could be path.with_stem(f"{path.stem}_{variant}")
    path = path.with_name(f"{path.stem}_{variant}{path.suffix}")
    return parts._replace(path=str(path)).geturl()


period_re = re.compile(
    r"P((?P<year>\d+)Y)?((?P<month>\d+)M)?((?P<day>\d+)D)?T((?P<hour>\d+)H)?((?P<minute>\d+)M)?((?P<second>\d+)S)?"
)


def parse_period(s: str) -> int:
    # see https://en.wikipedia.org/wiki/ISO_8601#Durations
    seconds = {
        "year": 365 * 86400,
        "month": 30 * 86400,
        "day": 86400,
        "hour": 3600,
        "minute": 60,
        "second": 1,
    }
    if not (match := period_re.fullmatch(s)):
        return 0
    return sum(seconds[k] * int(v) for k, v in match.groupdict().items() if v)


# Maps a source image URL to the {"size": ..., "url": ...} record of its
# uploaded copy, as produced by import_image.
# XXX ideally we'd cache these forever (in some permanent storage)
_import_image_cache = {}


async def import_image(client, url, filename=None):
    """Download the image at *url*, upload it through *client*, and cache the result.

    Returns a dict with two keys: ``"size"`` (number of bytes downloaded) and
    ``"url"`` (the ``content_uri`` of the upload response).  Results are
    cached per source URL in ``_import_image_cache``, so a URL is downloaded
    and uploaded only once while its entry is present.

    The upload is always declared as ``image/jpeg``.

    Raises whatever ``requests`` raises for connection problems or timeouts,
    and ``requests.HTTPError`` when the server answers with an error status.
    """
    # Function-scope imports: neither module is imported at file level.
    import asyncio
    import functools

    if url in _import_image_cache:
        return _import_image_cache[url]

    fetch = functools.partial(
        requests.get,
        url,
        timeout=(3, 3),
        headers={"user-agent": "hotdog/v1 urlinfo"},
    )
    # requests is blocking; run it in the default executor so that the event
    # loop keeps serving other coroutines while the download is in flight.
    r = await asyncio.get_running_loop().run_in_executor(None, fetch)
    # Without this check an HTML error page would be uploaded (and cached)
    # as if it were the image.
    r.raise_for_status()

    # The body is read completely so that its size is known before uploading.
    size = len(r.content)
    uploaded, keys = await client.upload(
        io.BufferedReader(io.BytesIO(r.content)),
        content_type="image/jpeg",
        filename=filename,
        filesize=size,
    )
    _import_image_cache[url] = {
        "size": size,
        "url": uploaded.content_uri,
    }
    return _import_image_cache[url]


@dataclass
class Extracted:
    """Details of one IMDb title, as filled in by ``extractor`` from the page's JSON-LD."""

    # JSON-LD "name".
    title: str
    # JSON-LD "image"; ``handle`` derives the poster thumbnail URL from it.
    original_image: str
    # JSON-LD "genre".
    genres: List[str]
    # JSON-LD "description"; ``handle`` strips tags from it before display.
    description: str
    # JSON-LD "datePublished", parsed with ``date.fromisoformat``.
    published: date
    # Names of the "director" and "creator" entries of type "Person",
    # without duplicates.
    creators: List[str]
    # JSON-LD "duration" converted to seconds by ``parse_period``;
    # None when the document has no duration.
    duration_s: Optional[int] = None
    # "ratingValue" of the JSON-LD "aggregateRating", as float;
    # None when the document has no rating.
    rating_value: Optional[float] = None
    # "ratingCount" of the JSON-LD "aggregateRating", stored as given;
    # None when the document has no rating.
    rating_count: Optional[int] = None


async def extractor(info):
    """Fill ``info.extracted`` from the JSON-LD block embedded in the page.

    The page chunks in ``info._chunks_str`` are fed to an ``ElementParser``
    that selects ``<script type="application/ld+json">``.  When the parser
    yields no value the function returns without setting ``info.extracted``.

    Raises ``AssertionError`` when the document is not a schema.org
    ``Movie``/``CreativeWork`` (or its rating is not an ``AggregateRating``),
    and ``KeyError`` when one of the mandatory fields (name, image, genre,
    description, datePublished) is missing.
    """
    parser = ElementParser(
        lambda tag, attrs: (
            tag == "script" and dict(attrs).get("type") == "application/ld+json"
        )
    )
    parser.load_chunks(info._chunks_str)

    if not parser.value:
        return

    ld = json.loads(parser.value)
    # NOTE(review): asserts are stripped under ``python -O``.  They are kept
    # so callers keep seeing AssertionError for unexpected documents.
    assert ld["@context"] == "http://schema.org" and ld["@type"] in (
        "Movie",
        "CreativeWork",
    )

    creators = []
    for k in "director", "creator":
        # JSON-LD allows a single object where a list is expected.
        entries = ld.get(k) or []
        if isinstance(entries, dict):
            entries = [entries]
        # Keep only persons (a creator may e.g. be an organization) and skip
        # malformed entries instead of failing on them.
        creators += [
            p["name"]
            for p in entries
            if isinstance(p, dict) and p.get("@type") == "Person" and "name" in p
        ]
    creators = list(dict.fromkeys(creators))  # remove dupes, keep order

    # A single genre may be given as a bare string; wrap it so that list
    # operations downstream do not work on its characters.
    genres = ld["genre"]
    if isinstance(genres, str):
        genres = [genres]

    info.extracted = Extracted(
        title=ld["name"],
        original_image=ld["image"],
        genres=genres,
        description=ld["description"],
        published=date.fromisoformat(ld["datePublished"]),
        creators=creators,
    )

    if "duration" in ld:
        info.extracted.duration_s = parse_period(ld["duration"])

    if "aggregateRating" in ld:
        assert ld["aggregateRating"]["@type"] == "AggregateRating"
        info.extracted.rating_value = float(ld["aggregateRating"]["ratingValue"])
        info.extracted.rating_count = ld["aggregateRating"]["ratingCount"]


async def handle(message, url, info):
    """Send the poster and a formatted summary for the title in ``info.extracted``.

    Works on a copy made with ``dataclasses.replace``.  The poster thumbnail
    is imported and sent to the room first, captioned with the title as
    extracted.  Afterwards the description has its tags stripped, the copy is
    passed through ``escape_all``, and the summary is sent as a reply in both
    plain-text and HTML form.
    """
    ex = clone(info.extracted)
    client = message.app.client

    # Poster image.
    image_title = f"Poster for {ex.title} ({ex.published:%Y})"
    thumb_url = thumbnail(ex.original_image)
    hosted_image = await import_image(
        client, thumb_url, filename=f"{image_title}.jpg"
    )
    await send_image(
        client,
        message.room.room_id,
        hosted_image["url"],
        description=image_title,
        mimetype="image/jpeg",
        size=hosted_image["size"],
    )

    # Text summary; the fields are cleaned up only after the poster was sent.
    ex.description = strip_tags(ex.description)
    escape_all(ex)

    creators = " ∕ ".join(ex.creators[:2])
    details = [f"🖋 {creators}"]
    if ex.duration_s:
        details += [f"⏱ {pretty_duration(ex.duration_s)}"]
    if ex.rating_count:
        details += [f"⭐️ {ex.rating_value:_.01f} ⁄ 10 (👤 {ex.rating_count})"]
    genres = " ∕ ".join(ex.genres[:3])
    details += [f"🏷 {genres}"]

    heading = f"<b>{ex.title}</b> (<b>{ex.published:%Y}</b>)"
    summary = f"<i>{capped_text(ex.description, 500)}</i>"
    lines = [heading, ", ".join(details), summary]

    as_html = "<br>".join(lines)
    as_plain = strip_tags(" — ".join(lines))
    await reply(message, as_plain, html=as_html)