hotdog/hotdog/command/urlinfo.py
ducklet efc6ecbb45 urlinfo: allow sub-modules and add module for IMDb movies
The urlinfo plugin is now set up to look up URL information for any URL
occurring in text, not only when triggered explicitly as a command.
The youtube plugin should probably be integrated into this setup,
replacing the bot plugin with a urlinfo extension.
2020-11-07 20:36:31 +01:00


import codecs
import re
from dataclasses import dataclass
from html import escape
from html.parser import HTMLParser
from random import randint
from time import time as now
from typing import *
import requests
from ..functions import ElementParser, reply
from ..models import Message
from .urlinfo_ import imdb # XXX make this dynamic? (like we load plugins)
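# As used by the dispatch in handle() below, a urlinfo sub-module (such as
# urlinfo_.imdb) is expected to expose:
#   can_handle(url: str) -> bool          -- does the module want this URL?
#   async def extractor(info: Info)       -- fill info.extracted from the open response
#   async def handle(message, url, info)  -- post the reply for an extracted Info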
HELP = """Return information about an online HTTP resource.
!u[rl] <url>
"""

def init(bot):
    bot.on_message(handle)


is_url = re.compile(
    # r"https?://(?:[a-zA-Z]|[0-9]|[$-_~@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+"
    r"(https?://|www\.)\S+"
).fullmatch
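
# e.g. is_url("https://example.com/page") and is_url("www.example.com") match;
# bare hostnames like "example.com" do not (a scheme or a leading "www." is required).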

def get_encodings_from_content(content: str) -> List[str]:
    """Returns encodings from given content string."""
    charset_re = re.compile(r'<meta.*?charset=["\']*(.+?)["\'>]', flags=re.I)
    pragma_re = re.compile(r'<meta.*?content=["\']*;?charset=(.+?)["\'>]', flags=re.I)
    xml_re = re.compile(r'^<\?xml.*?encoding=["\']*(.+?)["\'>]')
    return (
        charset_re.findall(content)
        + pragma_re.findall(content)
        + xml_re.findall(content)
    )
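
# e.g. get_encodings_from_content('<meta charset="iso-8859-1">') -> ["iso-8859-1"]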

def stream_decode_response_unicode(content: Iterable[bytes]) -> Iterable[str]:
    """Stream-decode an iterator of byte chunks, sniffing the charset from the
    first chunk and falling back to UTF-8."""
    decoder = None
    for chunk in content:
        if decoder is None:
            encodings = get_encodings_from_content(
                chunk.decode("utf-8", errors="replace")
            ) + ["utf-8"]
            try:
                decoder = codecs.getincrementaldecoder(encodings[0])(errors="replace")
            except LookupError:  # the page declared an unknown charset
                decoder = codecs.getincrementaldecoder("utf-8")(errors="replace")
        rv = decoder.decode(chunk)
        if rv:
            yield rv
    if decoder is None:  # empty response body
        return
    rv = decoder.decode(b"", final=True)
    if rv:
        yield rv

def title(content: Iterable[str]) -> Optional[str]:
    """Extract the <title> text from streamed HTML chunks."""
    t = ElementParser(lambda tag, attrs: tag == "title")
    t.load_chunks(content)
    return t.value

def capped(content: Iterable[AnyStr], read_max: int) -> Iterable[AnyStr]:
    """Yield chunks from content, stopping once roughly read_max has been consumed."""
    read = 0
    for chunk in content:
        read += len(chunk)
        yield chunk
        if read >= read_max:
            break

@dataclass
class Info:
    """Information extracted by loading a URL.

    This information can/will be cached for successive lookups of the same URL.
    When the info object is handed to the extractor function, _resp and the
    _chunks_* iterators will be set. Prior to committing the info object to the
    cache these references are removed.
    """

    code: int
    final_url: str
    elapsed_ms: int
    reason: str
    content_type: Optional[str]
    size: Optional[int]
    filename: Optional[str]
    extracted: Optional[Any] = None
    _resp: Optional[requests.Response] = None
    _chunks_str: Optional[Iterable[str]] = None
    _chunks_bytes: Optional[Iterable[bytes]] = None


# XXX can't use lru_cache with async funcs
# TODO: create lru_cache that supports async and timeout
_load_info_cache: Dict[Tuple[str, int], Info] = {}

async def load_info(
    url: str, extractor: Callable[[Info], Awaitable[None]], cachetoken: int
) -> Optional[Info]:
    """The cachetoken is just there to bust the LRU cache after a while."""
    cachekey = (url, cachetoken)
    if cachekey in _load_info_cache:
        return _load_info_cache[cachekey]
    try:
        r = requests.get(
            url,
            stream=True,
            timeout=(3, 3),
            headers={"user-agent": "hotdog/v1 urlinfo"},
        )
    except Exception:
        return None
    # Pick a filename out of e.g. "Content-Disposition: attachment; filename=report.pdf".
    filename = None
    dispo = r.headers.get("Content-Disposition", "").split(";")
    if len(dispo) == 2 and dispo[0] == "attachment":
        dispo = dispo[1].strip().split("=", 1)
        if len(dispo) == 2 and dispo[0] == "filename":
            filename = dispo[1].strip()
    one_kb = 2 ** 10
    chunks = capped(r.iter_content(chunk_size=30 * one_kb), read_max=200 * one_kb)
    info = Info(
        code=r.status_code,
        final_url=r.url,
        elapsed_ms=int(r.elapsed.total_seconds() * 1_000),
        reason=r.reason,
        content_type=r.headers.get("Content-Type"),
        size=(
            int(r.headers["Content-Length"]) if "Content-Length" in r.headers else None
        ),
        filename=filename,
        _resp=r,
        _chunks_str=stream_decode_response_unicode(chunks),
        _chunks_bytes=chunks,
    )
    await extractor(info)
    # Remove all references to the Response before the info is committed to cache.
    info._resp = None
    info._chunks_str = None
    info._chunks_bytes = None
    _load_info_cache[cachekey] = info
    return _load_info_cache[cachekey]
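
# Typical call, as done in handle() below:
#   info = await load_info(url, generic_extractor, cachetoken())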

def cachetoken(quant_m=15):
    """Return a cache token that changes every quant_m minutes."""
    return int(now() / 60 / quant_m)
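
# e.g. with quant_m=15, now() == 1_700_000_000 gives int(1_700_000_000 / 60 / 15)
# == 1_888_888; the token stays constant for 15 minutes, so cached Info objects
# are reused within that window and looked up fresh afterwards.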

def pretty_size(size: int) -> str:
    qs = "", "K", "M", "G", "T", "P"
    for q in qs:
        if size < 1024 or q == qs[-1]:
            break
        size /= 1000
    if not q:
        return f"{size} B"
    return f"{size:_.02f} {q}B"

async def generic_handler(message: Message, url: str, info: Info):
    details = []
    if info.content_type:
        details.append(f"<i>Media type</i>: {escape(info.content_type)}")
    if info.size:
        details.append(f"<i>Size</i>: {pretty_size(info.size)}")
    details.append(f"<i>Status</i>: {info.code}")
    if info.reason:
        details[-1] += f" ({escape(info.reason)})"
    if info.final_url != url:
        details.append(
            f"""<i>Redirected to</i>: """
            f"""<a href="{escape(info.final_url)}">{escape(info.final_url)}</a>"""
        )
    if info.filename and info.filename != url.rsplit("/", 2)[-1]:
        details.append(f"<i>Filename</i>: {escape(info.filename)}")
    details.append(f"<i>TTFB</i>: {info.elapsed_ms:_} ms")
    text = (
        f"<b>{escape(info.extracted['title'])}</b> — "
        if info.extracted["title"]
        else ""
    )
    text += "; ".join(details)
    await reply(message, html=text, in_thread=True)

async def generic_extractor(info: Info):
    content_type = info._resp.headers.get("Content-Type", "")
    # Use the final (post-redirect) URL for the extension check.
    is_html = content_type.startswith("text/html") or info.final_url.lower().endswith(
        (".html", ".htm")
    )
    info.extracted = {"title": title(info._chunks_str) if is_html else None}

def full_url(ref: str) -> str:
    return f"http://{ref}" if ref.startswith("www") else ref

async def handle(message: Message):
    if message.command and message.command not in {"u", "url"}:
        return
    limit = 3
    urls = [full_url(w) for w in message.words if is_url(w)][:limit]
    if not urls:
        return
    handlers = (imdb,)
    for url in urls:
        for handler in handlers:
            if handler.can_handle(url):
                # Grab the extractor before rebinding handler to the module's
                # handle function.
                extractor = handler.extractor
                handler = handler.handle
                break
        else:
            # We only want the generic handler if we were called explicitly.
            if not message.command:
                continue
            handler = generic_handler
            extractor = generic_extractor
        info = await load_info(url, extractor, cachetoken())
        if not info:
            continue
        await handler(message, url, info)