The urlinfo plugin now looks up URL information for any URL occurring in message text, not only when it is triggered explicitly as a command. The youtube plugin should probably be folded into this setup as well, replacing that standalone bot plugin with a urlinfo extension.
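For reference, handle() at the bottom of the module treats a urlinfo extension as a module exposing can_handle(url), an async extractor(info) that fills info.extracted while the response and chunk iterators are still attached, and an async handle(message, url, info) that formats the reply. A youtube extension could follow the same shape. The sketch below is illustrative only; the module path, the oEmbed lookup, and the relative imports are assumptions, not existing code.

# urlinfo_/youtube.py -- hypothetical sketch, not part of the existing code.
# The interface mirrors how handle() below drives an extension: can_handle(url),
# an async extractor(info) that fills info.extracted, and an async
# handle(message, url, info) that sends the reply.
import re
from html import escape

import requests

from ...functions import reply  # assumed relative path, as for urlinfo_/imdb
from ...models import Message

_YOUTUBE = re.compile(r"https?://(?:www\.)?(?:youtube\.com/watch\?|youtu\.be/)", re.I)


def can_handle(url: str) -> bool:
    return bool(_YOUTUBE.match(url))


async def extractor(info) -> None:
    # Query YouTube's public oEmbed endpoint for the title instead of scraping
    # the streamed HTML chunks (a real version might also pull duration etc.).
    try:
        r = requests.get(
            "https://www.youtube.com/oembed",
            params={"url": info.final_url, "format": "json"},
            timeout=(3, 3),
        )
        info.extracted = {"title": r.json().get("title")}
    except Exception:
        info.extracted = {"title": None}


async def handle(message: Message, url: str, info) -> None:
    title = (info.extracted or {}).get("title")
    text = f"<b>{escape(title)}</b>" if title else f"<i>Status</i>: {info.code}"
    await reply(message, html=text, in_thread=True)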
import codecs
import re
from dataclasses import dataclass
from html import escape
from html.parser import HTMLParser
from random import randint
from time import time as now
from typing import Any, AnyStr, Callable, Iterable, List, Optional

import requests

from ..functions import ElementParser, reply
from ..models import Message
from .urlinfo_ import imdb  # XXX make this dynamic? (like we load plugins)

HELP = """Return information about an online HTTP resource.
!u[rl] <url>
"""


def init(bot):
    bot.on_message(handle)


is_url = re.compile(
    # r"https?://(?:[a-zA-Z]|[0-9]|[$-_~@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+"
    r"(https?://|www\.)\S+"
).fullmatch


def get_encodings_from_content(content: str) -> List[str]:
    """Returns encodings from given content string."""

    charset_re = re.compile(r'<meta.*?charset=["\']*(.+?)["\'>]', flags=re.I)
    pragma_re = re.compile(r'<meta.*?content=["\']*;?charset=(.+?)["\'>]', flags=re.I)
    xml_re = re.compile(r'^<\?xml.*?encoding=["\']*(.+?)["\'>]')

    return (
        charset_re.findall(content)
        + pragma_re.findall(content)
        + xml_re.findall(content)
    )


def stream_decode_response_unicode(content: Iterable[bytes]) -> Iterable[str]:
"""Stream decodes a iterator."""
|
|
|
|
decoder = None
|
|
for chunk in content:
|
|
if decoder is None:
|
|
encodings = get_encodings_from_content(
|
|
chunk.decode("utf-8", errors="replace")
|
|
) + ["utf-8"]
|
|
decoder = codecs.getincrementaldecoder(encodings[0])(errors="replace")
|
|
|
|
rv = decoder.decode(chunk)
|
|
if rv:
|
|
yield rv
|
|
rv = decoder.decode(b"", final=True)
|
|
if rv:
|
|
yield rv
|
|
|
|
|
|
def title(content: Iterable[str]) -> Optional[str]:
|
|
    """Extract the <title> text from streamed HTML chunks."""
    t = ElementParser(lambda tag, attrs: tag == "title")
    t.load_chunks(content)
    return t.value


def capped(content: Iterable[AnyStr], read_max: int) -> Iterable[AnyStr]:
    """Pass chunks through until at least read_max bytes/chars have been yielded."""
    read = 0
    for chunk in content:
        read += len(chunk)
        yield chunk
        if read >= read_max:
            break


@dataclass
class Info:
"""Information extracted by loading a URL.
|
|
|
|
This information can/will be cached for successive lookups of the same URL.
|
|
When the info object is handed to the extractor function _resp and _chunks
|
|
will be set. Prior to commiting the info object to cache these references
|
|
will be removed.
|
|
"""
|
|
|
|
code: int
|
|
    final_url: str
    elapsed_ms: int
    reason: str
    content_type: Optional[str]
    size: Optional[int]
    filename: Optional[str]
    extracted: Optional[Any] = None

    _resp: Optional[requests.Response] = None
    _chunks_str: Optional[Iterable[str]] = None
    _chunks_bytes: Optional[Iterable[bytes]] = None


# XXX can't use lru_cache with async funcs
# TODO: create lru_cache that supports async and timeout

_load_info_cache = {}


async def load_info(
    url: str, extractor: Callable[[Info], None], cachetoken
) -> Optional[Info]:
    """The cachetoken is just there to bust the LRU cache after a while."""
    cachekey = (url, cachetoken)
    if cachekey in _load_info_cache:
        return _load_info_cache[cachekey]

    try:
        r = requests.get(
            url,
            stream=True,
            timeout=(3, 3),
            headers={"user-agent": "hotdog/v1 urlinfo"},
        )
    except Exception:
        return None

    filename = None
    dispo = r.headers.get("Content-Disposition", "").split(";")
    if len(dispo) == 2 and dispo[0] == "attachment":
        dispo = dispo[1].strip().split("=", 2)
        if len(dispo) == 2 and dispo[0] == "filename":
            filename = dispo[1].strip()

    one_kb = 2 ** 10
    chunks = capped(r.iter_content(chunk_size=30 * one_kb), read_max=200 * one_kb)

    info = Info(
        code=r.status_code,
        final_url=r.url,
        elapsed_ms=int(r.elapsed.total_seconds() * 1_000),
        reason=r.reason,
        content_type=r.headers.get("Content-Type"),
        size=(
            int(r.headers["Content-Length"]) if "Content-Length" in r.headers else None
        ),
        filename=filename,
        _resp=r,
        _chunks_str=stream_decode_response_unicode(chunks),
        _chunks_bytes=chunks,
    )

    await extractor(info)

    # Remove all references to the Response before the info is committed to cache.
    info._resp = None
    info._chunks_str = None
    info._chunks_bytes = None

    _load_info_cache[cachekey] = info
    return _load_info_cache[cachekey]


def cachetoken(quant_m=15):
"""Return a cache token with the given time frame"""
|
|
return int(now() / 60 / quant_m)
|
|
|
|
|
|
def pretty_size(size: int) -> str:
|
|
qs = "", "K", "M", "G", "T", "P"
|
|
for q in qs:
|
|
if size < 1024 or q == qs[-1]:
|
|
break
|
|
size /= 1000
|
|
if not q:
|
|
return f"{size} B"
|
|
return f"{size:_.02f} {q}B"
|
|
|
|
|
|
async def generic_handler(message: Message, url: str, info: Info):
|
|
    details = []
    if info.content_type:
        details.append(f"<i>Media type</i>: {escape(info.content_type)}")
    if info.size:
        details.append(f"<i>Size</i>: {pretty_size(info.size)}")
    details.append(f"<i>Status</i>: {info.code}")
    if info.reason:
        details[-1] += f" ({escape(info.reason)})"
    if info.final_url != url:
        details.append(
            f"""<i>Redirected to</i>: <a href="{escape(info.final_url)}">{escape(info.final_url)}</a>"""
        )
    if info.filename and info.filename != url.rsplit("/", 2)[-1]:
        details.append(f"<i>Filename</i>: {escape(info.filename)}")
    details.append(f"<i>TTFB</i>: {info.elapsed_ms:_} ms")

    text = (
f"<b>{escape(info.extracted['title'])}</b> — "
|
|
if info.extracted["title"]
|
|
else ""
|
|
)
|
|
text += "; ".join(details)
|
|
|
|
await reply(message, html=text, in_thread=True)
|
|
|
|
|
|
async def generic_extractor(info: Info):
|
|
content_type = info._resp.headers.get("Content-Type", "")
|
|
is_html = content_type.startswith("text/html") or url.lower().endswith(
|
|
(".html", ".htm")
|
|
)
|
|
info.extracted = {"title": title(info._chunks_str) if is_html else None}
|
|
|
|
|
|
def full_url(ref: str) -> str:
|
|
return f"http://{ref}" if ref.startswith("www") else ref
|
|
|
|
|
|
async def handle(message: Message):
|
|
if message.command and message.command not in {"u", "url"}:
|
|
return
|
|
|
|
limit = 3
|
|
urls = [full_url(w) for w in message.words if is_url(w)][:limit]
|
|
if not urls:
|
|
return
|
|
|
|
handlers = (imdb,)
|
|
|
|
for url in urls:
|
|
for handler in handlers:
|
|
if handler.can_handle(url):
|
|
handler = handler.handle
|
|
extractor = handler.extractor
|
|
break
|
|
else:
|
|
if not (
|
|
message.command
|
|
): # We only want the generic handler if we were called explicitly
|
|
continue
|
|
handler = generic_handler
|
|
extractor = generic_extractor
|
|
|
|
info = await load_info(url, extractor, cachetoken())
|
|
if not info:
|
|
continue
|
|
await handler(message, url, info)
|
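On the XXX/TODO about caching: lru_cache does not work with coroutine functions and has no expiry, which is why load_info uses a plain dict keyed by a time-quantized cachetoken. A minimal sketch of an async-aware TTL cache decorator that could eventually replace _load_info_cache and cachetoken() might look like the following; the decorator name and eviction policy are made up for illustration and are not part of the plugin.

import functools
from time import time as now


def async_ttl_cache(ttl_s: float = 15 * 60, maxsize: int = 128):
    """Cache coroutine results by their (hashable) positional args for ttl_s seconds."""

    def decorator(fn):
        cache = {}  # args -> (expires_at, value)

        @functools.wraps(fn)
        async def wrapper(*args):
            hit = cache.get(args)
            if hit is not None and hit[0] > now():
                return hit[1]
            value = await fn(*args)
            if len(cache) >= maxsize:
                # Evict the entry closest to expiry; good enough for a small cache.
                cache.pop(min(cache, key=lambda k: cache[k][0]))
            cache[args] = (now() + ttl_s, value)
            return value

        return wrapper

    return decorator

With something along these lines, load_info could drop its cachetoken parameter and be decorated directly.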