hotdog/hotdog/command/urlinfo.py
ducklet efc6ecbb45 urlinfo: allow sub-modules and add module for IMDb movies
The urlinfo plugin is now set up to look up URL information for any URL
occurring in text, not only when triggered explicitly as a command.
The youtube plugin should probably be integrated into this setup,
replacing the bot plugin with a urlinfo extension.
2020-11-07 20:36:31 +01:00


import codecs
import re
from dataclasses import dataclass
from html import escape
from html.parser import HTMLParser
from random import randint
from time import time as now
from typing import *
import requests
from ..functions import ElementParser, reply
from ..models import Message
from .urlinfo_ import imdb # XXX make this dynamic? (like we load plugins)
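# As used by the dispatch in handle() below, a urlinfo sub-module (such as
# urlinfo_.imdb) is expected to expose:
#   can_handle(url: str) -> bool          -- does the module want this URL?
#   async def extractor(info: Info)       -- fill info.extracted from the open response
#   async def handle(message, url, info)  -- post the reply for an extracted Info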
HELP = """Return information about an online HTTP resource.
!u[rl] <url>
"""

def init(bot):
    bot.on_message(handle)


is_url = re.compile(
    # r"https?://(?:[a-zA-Z]|[0-9]|[$-_~@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+"
    r"(https?://|www\.)\S+"
).fullmatch
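
# e.g. is_url("https://example.com/page") and is_url("www.example.com") match;
# bare hostnames like "example.com" do not (a scheme or a leading "www." is required).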

def get_encodings_from_content(content: str) -> List[str]:
    """Returns encodings from given content string."""
    charset_re = re.compile(r'<meta.*?charset=["\']*(.+?)["\'>]', flags=re.I)
    pragma_re = re.compile(r'<meta.*?content=["\']*;?charset=(.+?)["\'>]', flags=re.I)
    xml_re = re.compile(r'^<\?xml.*?encoding=["\']*(.+?)["\'>]')
    return (
        charset_re.findall(content)
        + pragma_re.findall(content)
        + xml_re.findall(content)
    )
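
# e.g. get_encodings_from_content('<meta charset="iso-8859-1">') -> ["iso-8859-1"]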

def stream_decode_response_unicode(content: Iterable[bytes]) -> Iterable[str]:
    """Stream-decode an iterator of byte chunks, sniffing the charset from the
    first chunk and falling back to UTF-8."""
    decoder = None
    for chunk in content:
        if decoder is None:
            encodings = get_encodings_from_content(
                chunk.decode("utf-8", errors="replace")
            ) + ["utf-8"]
            try:
                decoder = codecs.getincrementaldecoder(encodings[0])(errors="replace")
            except LookupError:  # the page declared an unknown charset
                decoder = codecs.getincrementaldecoder("utf-8")(errors="replace")
        rv = decoder.decode(chunk)
        if rv:
            yield rv
    if decoder is None:  # empty response body
        return
    rv = decoder.decode(b"", final=True)
    if rv:
        yield rv

def title(content: Iterable[str]) -> Optional[str]:
    """Extract the <title> text from streamed HTML chunks."""
    t = ElementParser(lambda tag, attrs: tag == "title")
    t.load_chunks(content)
    return t.value

def capped(content: Iterable[AnyStr], read_max: int) -> Iterable[AnyStr]:
    """Yield chunks from content, stopping once roughly read_max has been consumed."""
    read = 0
    for chunk in content:
        read += len(chunk)
        yield chunk
        if read >= read_max:
            break

@dataclass
class Info:
    """Information extracted by loading a URL.

    This information can/will be cached for successive lookups of the same URL.
    When the info object is handed to the extractor function, _resp and the
    _chunks_* iterators will be set. Prior to committing the info object to the
    cache these references are removed.
    """

    code: int
    final_url: str
    elapsed_ms: int
    reason: str
    content_type: Optional[str]
    size: Optional[int]
    filename: Optional[str]
    extracted: Optional[Any] = None
    _resp: Optional[requests.Response] = None
    _chunks_str: Optional[Iterable[str]] = None
    _chunks_bytes: Optional[Iterable[bytes]] = None


# XXX can't use lru_cache with async funcs
# TODO: create lru_cache that supports async and timeout
_load_info_cache: Dict[Tuple[str, int], Info] = {}

async def load_info(
    url: str, extractor: Callable[[Info], Awaitable[None]], cachetoken: int
) -> Optional[Info]:
    """The cachetoken is just there to bust the LRU cache after a while."""
    cachekey = (url, cachetoken)
    if cachekey in _load_info_cache:
        return _load_info_cache[cachekey]
    try:
        r = requests.get(
            url,
            stream=True,
            timeout=(3, 3),
            headers={"user-agent": "hotdog/v1 urlinfo"},
        )
    except Exception:
        return None
    # Pick a filename out of e.g. "Content-Disposition: attachment; filename=report.pdf".
    filename = None
    dispo = r.headers.get("Content-Disposition", "").split(";")
    if len(dispo) == 2 and dispo[0] == "attachment":
        dispo = dispo[1].strip().split("=", 1)
        if len(dispo) == 2 and dispo[0] == "filename":
            filename = dispo[1].strip()
    one_kb = 2 ** 10
    chunks = capped(r.iter_content(chunk_size=30 * one_kb), read_max=200 * one_kb)
    info = Info(
        code=r.status_code,
        final_url=r.url,
        elapsed_ms=int(r.elapsed.total_seconds() * 1_000),
        reason=r.reason,
        content_type=r.headers.get("Content-Type"),
        size=(
            int(r.headers["Content-Length"]) if "Content-Length" in r.headers else None
        ),
        filename=filename,
        _resp=r,
        _chunks_str=stream_decode_response_unicode(chunks),
        _chunks_bytes=chunks,
    )
    await extractor(info)
    # Remove all references to the Response before the info is committed to cache.
    info._resp = None
    info._chunks_str = None
    info._chunks_bytes = None
    _load_info_cache[cachekey] = info
    return _load_info_cache[cachekey]
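
# Typical call, as done in handle() below:
#   info = await load_info(url, generic_extractor, cachetoken())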

def cachetoken(quant_m=15):
    """Return a cache token that changes every quant_m minutes."""
    return int(now() / 60 / quant_m)
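
# e.g. with quant_m=15, now() == 1_700_000_000 gives int(1_700_000_000 / 60 / 15)
# == 1_888_888; the token stays constant for 15 minutes, so cached Info objects
# are reused within that window and looked up fresh afterwards.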

def pretty_size(size: int) -> str:
    qs = "", "K", "M", "G", "T", "P"
    for q in qs:
        if size < 1024 or q == qs[-1]:
            break
        size /= 1000
    if not q:
        return f"{size} B"
    return f"{size:_.02f} {q}B"

async def generic_handler(message: Message, url: str, info: Info):
    details = []
    if info.content_type:
        details.append(f"<i>Media type</i>: {escape(info.content_type)}")
    if info.size:
        details.append(f"<i>Size</i>: {pretty_size(info.size)}")
    details.append(f"<i>Status</i>: {info.code}")
    if info.reason:
        details[-1] += f" ({escape(info.reason)})"
    if info.final_url != url:
        details.append(
            f"""<i>Redirected to</i>: """
            f"""<a href="{escape(info.final_url)}">{escape(info.final_url)}</a>"""
        )
    if info.filename and info.filename != url.rsplit("/", 2)[-1]:
        details.append(f"<i>Filename</i>: {escape(info.filename)}")
    details.append(f"<i>TTFB</i>: {info.elapsed_ms:_} ms")
    text = (
        f"<b>{escape(info.extracted['title'])}</b> — "
        if info.extracted["title"]
        else ""
    )
    text += "; ".join(details)
    await reply(message, html=text, in_thread=True)

async def generic_extractor(info: Info):
    content_type = info._resp.headers.get("Content-Type", "")
    # Use the final (post-redirect) URL for the extension check.
    is_html = content_type.startswith("text/html") or info.final_url.lower().endswith(
        (".html", ".htm")
    )
    info.extracted = {"title": title(info._chunks_str) if is_html else None}

def full_url(ref: str) -> str:
    return f"http://{ref}" if ref.startswith("www") else ref

async def handle(message: Message):
    if message.command and message.command not in {"u", "url"}:
        return
    limit = 3
    urls = [full_url(w) for w in message.words if is_url(w)][:limit]
    if not urls:
        return
    handlers = (imdb,)
    for url in urls:
        for handler in handlers:
            if handler.can_handle(url):
                # Grab the extractor before rebinding handler to the module's
                # handle function.
                extractor = handler.extractor
                handler = handler.handle
                break
        else:
            # We only want the generic handler if we were called explicitly.
            if not message.command:
                continue
            handler = generic_handler
            extractor = generic_extractor
        info = await load_info(url, extractor, cachetoken())
        if not info:
            continue
        await handler(message, url, info)