import io
import json
import re
from dataclasses import dataclass
from dataclasses import replace as clone
from datetime import date
from pathlib import Path
from typing import List, Optional
from urllib.parse import urlparse

import requests

from ...functions import (
    ElementParser,
    capped_text,
    escape_all,
    pretty_duration,
    reply,
    send_image,
    strip_tags,
)
from ...models import Message

# https://www.imdb.com/title/tt13399862/
can_handle = re.compile(r"https://www\.imdb\.com/title/(?P<id>tt\d+)/").fullmatch

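# Usage sketch (the URL is the example from the comment above):
#
#     m = can_handle("https://www.imdb.com/title/tt13399862/")
#     assert m and m["id"] == "tt13399862"
#
# Anything else (search pages, title URLs with trailing query strings) fails
# the fullmatch and returns None.
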
def thumbnail(url, width=182, height=268):
    """Return a thumbnail URL for the given IMDb image URL.

    The default settings are what IMDb currently uses for desktop display.
    """
    resize = f"UY{height}"  # there's also 'UX' to resize on width
    offset = "0,0"
    crop = f"CR{offset},{width},{height}"
    al = "AL"  # not sure what this is, doesn't seem to do much but they use it.

    variant = "_".join((resize, crop, al))

    parts = urlparse(url)
    path = Path(parts.path)
    # path.with_stem(f"{path.stem}_{variant}")  XXX py3.9
    path = path.with_name(f"{path.stem}_{variant}{path.suffix}")
    return parts._replace(path=str(path)).geturl()

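# Worked example (made-up image path; real IMDb paths look similar). The
# variant string is appended to the filename stem before the suffix:
#
#     thumbnail("https://m.media-amazon.com/images/M/MV5Bexample.jpg")
#     # -> "https://m.media-amazon.com/images/M/MV5Bexample_UY268_CR0,0,182,268_AL.jpg"
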
# The whole time part is optional so that date-only periods ("P90D") still
# match; note "M" means months before the "T" and minutes after it.
period_re = re.compile(
    r"P((?P<year>\d+)Y)?((?P<month>\d+)M)?((?P<day>\d+)D)?"
    r"(T((?P<hour>\d+)H)?((?P<minute>\d+)M)?((?P<second>\d+)S)?)?"
)

def parse_period(s: str) -> int:
    """Convert an ISO 8601 duration string (e.g. "PT2H8M") to seconds."""
    # see https://en.wikipedia.org/wiki/ISO_8601#Durations
    seconds = {
        "year": 365 * 86400,
        "month": 30 * 86400,
        "day": 86400,
        "hour": 3600,
        "minute": 60,
        "second": 1,
    }
    if not (match := period_re.fullmatch(s)):
        return 0
    return sum(seconds[k] * int(v) for k, v in match.groupdict().items() if v)

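# Sanity checks for the conversion (years/months use the rough 365-day and
# 30-day approximations defined above):
#
#     parse_period("PT2H28M")       # -> 2 * 3600 + 28 * 60 == 8880
#     parse_period("P1DT12H")       # -> 86400 + 12 * 3600 == 129600
#     parse_period("not-a-period")  # -> 0 (no match)
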
# XXX ideally we'd cache these forever (in some permanent storage)
_import_image_cache = {}

async def import_image(client, url, filename=None):
    """Download `url` and upload it via `client`, memoizing the result by URL."""
    if url in _import_image_cache:
        return _import_image_cache[url]
    # NB: requests is synchronous, so this download blocks the event loop;
    # tolerable for small posters, but an async HTTP client would be cleaner.
    r = requests.get(
        url,
        # stream=True,
        timeout=(3, 3),
        headers={"user-agent": "hotdog/v1 urlinfo"},
    )
    r.raise_for_status()  # don't upload (and cache) an error page as an image
    size = len(r.content)
    uploaded, keys = await client.upload(
        # io.BufferedReader(r.raw),
        io.BufferedReader(io.BytesIO(r.content)),
        content_type="image/jpeg",
        filename=filename,
        filesize=size,
    )
    _import_image_cache[url] = {
        "size": size,
        "url": uploaded.content_uri,
    }
    return _import_image_cache[url]

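# Usage sketch (assumes a Matrix-style client whose `upload` returns an object
# with a `content_uri`, e.g. an mxc:// URL; `poster_url` is hypothetical):
#
#     first = await import_image(client, poster_url, filename="poster.jpg")
#     again = await import_image(client, poster_url)
#     assert again is first  # second call is a pure cache hit, no download
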
@dataclass
class Extracted:
    title: str
    original_image: str
    genres: List[str]
    description: str
    published: date
    creators: List[str]
    duration_s: Optional[int] = None
    rating_value: Optional[float] = None
    rating_count: Optional[int] = None

async def extractor(info):
    parser = ElementParser(
        lambda tag, attrs: (
            tag == "script" and dict(attrs).get("type") == "application/ld+json"
        )
    )
    parser.load_chunks(info._chunks_str)

    if not parser.value:
        return

    ld = json.loads(parser.value)
    assert ld["@context"] == "http://schema.org" and ld["@type"] in (
        "Movie",
        "CreativeWork",
    )

    creators = []
    for k in "director", "creator":
        if k in ld:
            # a single person comes through as a dict, several as a list
            t = [ld[k]] if isinstance(ld[k], dict) else ld[k]
            creators += [p["name"] for p in t if p["@type"] == "Person"]
    creators = list(dict.fromkeys(creators))  # remove dupes, keep order

    info.extracted = Extracted(
        title=ld["name"],
        original_image=ld["image"],
        genres=ld["genre"],
        description=ld["description"],
        published=date.fromisoformat(ld["datePublished"]),
        creators=creators,
    )

    if "duration" in ld:
        info.extracted.duration_s = parse_period(ld["duration"])

    if "aggregateRating" in ld:
        assert ld["aggregateRating"]["@type"] == "AggregateRating"
        info.extracted.rating_value = float(ld["aggregateRating"]["ratingValue"])
        info.extracted.rating_count = ld["aggregateRating"]["ratingCount"]

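# Shape sketch of the schema.org JSON-LD `extractor` consumes (all values
# invented; keys limited to the ones actually read):
#
#     {
#         "@context": "http://schema.org",
#         "@type": "Movie",
#         "name": "Example Title",
#         "image": "https://m.media-amazon.com/images/M/MV5Bexample.jpg",
#         "genre": ["Drama"],
#         "description": "An example plot summary.",
#         "datePublished": "2021-09-24",
#         "director": {"@type": "Person", "name": "Jane Doe"},
#         "duration": "PT2H8M",
#         "aggregateRating": {
#             "@type": "AggregateRating",
#             "ratingValue": "8.3",
#             "ratingCount": 120000,
#         },
#     }
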
async def handle(message, url, info):
    ex = clone(info.extracted)
    image_title = f"Poster for {ex.title} ({ex.published:%Y})"
    hosted_image = await import_image(
        message.app.client,
        thumbnail(ex.original_image),
        filename=f"{image_title}.jpg",
    )
    await send_image(
        message.app.client,
        message.room.room_id,
        hosted_image["url"],
        description=image_title,
        mimetype="image/jpeg",
        size=hosted_image["size"],
    )

    ex.description = strip_tags(ex.description)
    escape_all(ex)

    details = [
        f"🖋 {' ∕ '.join(ex.creators[:2])}",
    ]
    if ex.duration_s:
        details.append(f"⏱ {pretty_duration(ex.duration_s)}")
    if ex.rating_count:
        details.append(f"⭐️ {ex.rating_value:.1f} ⁄ 10 (👤 {ex.rating_count})")
    details.append(f"🏷 {' ∕ '.join(ex.genres[:3])}")
    lines = [
        f"<b>{ex.title}</b> (<b>{ex.published:%Y}</b>)",
        f"{', '.join(details)}",
        f"<i>{capped_text(ex.description, 500)}</i>",
    ]

    html = "<br>".join(lines)
    plain = strip_tags(" — ".join(lines))
    await reply(message, plain, html=html)
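

# With the sketch data above, and assuming pretty_duration renders "2h 8m",
# the HTML reply would look roughly like:
#
#     <b>Example Title</b> (<b>2021</b>)<br>🖋 Jane Doe, ⏱ 2h 8m,
#     ⭐️ 8.3 ⁄ 10 (👤 120000), 🏷 Drama<br><i>An example plot summary.</i>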