From a8785ef961993ece46f4d1e1cccf8e2526152d97 Mon Sep 17 00:00:00 2001
From: ducklet
Date: Tue, 10 Nov 2020 21:41:43 +0100
Subject: [PATCH] urlinfo: split off generic handler into a separate module

The new generic handler also supports basic JSON-LD (schema.org) parsing.
---
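Notes (a minimal usage sketch, not part of the commit message): the JSON-LD
payload below is invented for illustration; parse_ldjson() and LinkedData are
the helpers this patch adds in urlinfo_/generic.py.

    from hotdog.command.urlinfo_.generic import parse_ldjson

    # A hand-written schema.org NewsArticle, the shape parse_ldjson()
    # expects to find inside a <script type="application/ld+json"> element.
    sample = """{
        "@context": "https://schema.org",
        "@type": "NewsArticle",
        "headline": "Example headline",
        "datePublished": "2020-11-10T20:41:43+00:00",
        "author": {"@type": "Person", "name": "A. Writer"},
        "genre": "news"
    }"""

    ld = next(parse_ldjson(sample))
    assert ld.title == "Example headline"
    assert ld.creators == ["A. Writer"]
    assert ld.genres == ["news"]
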
 hotdog/command/urlinfo.py          |  89 ++++----------
 hotdog/command/urlinfo_/generic.py | 180 +++++++++++++++++++++++++++++
 hotdog/functions.py                |  10 +-
 3 files changed, 205 insertions(+), 74 deletions(-)
 create mode 100644 hotdog/command/urlinfo_/generic.py

diff --git a/hotdog/command/urlinfo.py b/hotdog/command/urlinfo.py
index 7d7e01b..82c2e62 100644
--- a/hotdog/command/urlinfo.py
+++ b/hotdog/command/urlinfo.py
@@ -1,15 +1,15 @@
 import codecs
 import re
 from dataclasses import dataclass
-from html import escape
 from time import time as now
 from typing import *
 
 import requests
 
-from ..functions import ElementParser, reply
+from ..functions import react
+from ..html import find
 from ..models import Message
-from .urlinfo_ import imdb  # XXX make this dynamic? (like we load plugins)
+from .urlinfo_ import generic, imdb  # XXX make this dynamic? (like we load plugins)
 
 HELP = """Return information about an online HTTP resource.
 !u[rl]
@@ -59,12 +59,6 @@ def stream_decode_response_unicode(content: Iterable[bytes]) -> Iterable[str]:
         yield rv
 
 
-def title(content: Iterable[str]) -> Optional[str]:
-    t = ElementParser(lambda tag, attrs: tag == "title")
-    t.load_chunks(content)
-    return t.value
-
-
 def capped(content: Iterable[AnyStr], read_max: int) -> Iterable[AnyStr]:
     read = 0
     for chunk in content:
@@ -163,84 +157,41 @@ def cachetoken(quant_m=15):
     return int(now() / 60 / quant_m)
 
 
-def pretty_size(size: int) -> str:
-    qs = "", "K", "M", "G", "T", "P"
-    for q in qs:
-        if size < 1024 or q == qs[-1]:
-            break
-        size /= 1000
-    if not q:
-        return f"{size} B"
-    return f"{size:_.02f} {q}B"
-
-
-async def generic_handler(message: Message, url: str, info: Info):
-    details = []
-    if info.content_type:
-        details.append(f"Media type: {escape(info.content_type)}")
-    if info.size:
-        details.append(f"Size: {pretty_size(info.size)}")
-    details.append(f"Status: {info.code}")
-    if info.reason:
-        details[-1] += f" ({escape(info.reason)})"
-    if info.final_url != url:
-        details.append(
-            f"""Redirected to: {escape(info.final_url)}"""
-        )
-    if info.filename and info.filename != url.rsplit("/", 2)[-1]:
-        details.append(f"Filename: {escape(info.filename)}")
-    details.append(f"TTFB: {info.elapsed_ms:_} ms")
-
-    text = (
-        f"{escape(info.extracted['title'])} — "
-        if info.extracted["title"]
-        else ""
-    )
-    text += "; ".join(details)
-
-    await reply(message, html=text, in_thread=True)
-
-
-async def generic_extractor(info: Info):
-    content_type = info._resp.headers.get("Content-Type", "")
-    is_html = content_type.startswith("text/html") or url.lower().endswith(
-        (".html", ".htm")
-    )
-    info.extracted = {"title": title(info._chunks_str) if is_html else None}
-
-
 def full_url(ref: str) -> str:
     return f"http://{ref}" if ref.startswith("www") else ref
 
 
-class GenericHandler:
-    extractor = generic_extractor
-    handle = generic_handler
-
-
 async def handle(message: Message):
     if message.command and message.command not in {"u", "url"}:
         return
 
     limit = 3
-    urls = [full_url(w) for w in message.words if is_url(w)][:limit]
+    handlers = (imdb,)
+
+    urls = {full_url(w) for w in message.words if is_url(w)}
+    if message.html:
+        urls |= {n["href"] for n in find(message.html, "a") if is_url(n["href"])}
+
     if not urls:
         return
 
-    handlers = (imdb,)
-
-    for url in urls:
+    for url in list(urls)[:limit]:
         for handler in handlers:
             if handler.can_handle(url):
                 break
         else:
             # We only want the generic handler if we were called explicitly
-            handler = GenericHandler if message.command else None
+            handler = generic if message.command or len(message.words) == 1 else None
 
         if handler is None:
             continue
 
-        info = await load_info(url, handler.extractor, cachetoken())
-        if not info:
-            continue
-        await handler.handle(message, url, info)
+        await react(message, "⚡️")
+        try:
+            info = await load_info(url, handler.extractor, cachetoken())
+            if not info:
+                continue
+            await handler.handle(message, url, info)
+        except:
+            await react(message, "🐛")
+            raise
diff --git a/hotdog/command/urlinfo_/generic.py b/hotdog/command/urlinfo_/generic.py
new file mode 100644
index 0000000..5989f91
--- /dev/null
+++ b/hotdog/command/urlinfo_/generic.py
@@ -0,0 +1,180 @@
+import json
+import logging
+from dataclasses import dataclass
+from datetime import datetime
+from typing import *
+
+from ...functions import (
+    ElementParser,
+    capped_text,
+    escape_all,
+    localizedtz,
+    reply,
+    strip_tags,
+)
+from ...models import Message
+
+log = logging.getLogger(__name__)
+
+
+def generic_details(info, url: str):
+    escape_all(info)
+    details = []
+    if info.content_type:
+        details.append(f"Media type: {info.content_type}")
+    if info.size:
+        details.append(f"Size: {pretty_size(info.size)}")
+    details.append(f"Status: {info.code}")
+    if info.reason:
+        details[-1] += f" ({info.reason})"
+    if info.final_url != url:
+        details.append(
+            f"""Redirected to: {info.final_url}"""
+        )
+    if info.filename and info.filename != url.rsplit("/", 2)[-1]:
+        details.append(f"Filename: {info.filename}")
+    details.append(f"TTFB: {info.elapsed_ms:_} ms")
+
+    text = ""
+    if info.extracted.title:
+        text += f"{info.extracted.title} — "
+    text += "; ".join(details)
+    return text
+
+
+def ld_details(ex, tz, lc):
+    details = []
+    if ex.creators:
+        details.append(f"🖋 {' ∕ '.join(ex.creators[:2])}")
+    if ex.genres:
+        details.append(f"🏷 {' ∕ '.join(ex.genres[:3])}")
+
+    lines = []
+    if ex.title:
+        lines.append(f"{ex.title}")
+    if ex.published:
+        lines[
+            -1
+        ] += f" ({localizedtz(ex.published, '%x %X', tzname=tz, lc=lc)})"
+    if details:
+        lines.append(f"{', '.join(details)}")
+    if ex.description:
+        lines.append(f"{capped_text(ex.description, 500)}")
+
+    html = "<br>".join(lines)
+    plain = strip_tags(" — ".join(lines))
+    return html, plain
+
+
+async def handle(message: Message, url, info):
+    roomconf = message.app.config.l6n[message.room.room_id]
+    plain = html = None
+    if info.extracted.ld:
+        html, plain = ld_details(
+            info.extracted.ld, tz=roomconf["timezone"], lc=roomconf["locale"]
+        )
+    else:
+        html = generic_details(info, url)
+
+    await reply(message, plain, html=html, in_thread=True)
+
+
+@dataclass
+class Extracted:
+    ld: Optional["LinkedData"] = None
+    title: Optional[str] = None
+
+
+async def extractor(info):
+    content_type = info._resp.headers.get("Content-Type", "")
+    is_html = content_type.startswith("text/html") or info.final_url.lower().endswith(
+        (".html", ".htm")
+    )
+    info.extracted = Extracted()
+    if is_html:
+        parsed = parse_html(info._chunks_str)
+        info.extracted.title = parsed["title"]
+        info.extracted.ld = (
+            next(parse_ldjson(parsed["ldjson"])) if parsed["ldjson"] else None
+        )
+
+
+def parse_html(content: Iterable[str]) -> Mapping[str, Optional[str]]:
+    parsers = {
+        "ldjson": ElementParser(
+            lambda tag, attrs: (
+                tag == "script" and dict(attrs).get("type") == "application/ld+json"
+            )
+        ),
+        "title": ElementParser(lambda tag, attrs: tag == "title"),
+    }
+
+    for chunk in content:
+        for p in parsers.values():
+            if not p.done:
+                p.feed(chunk)
+        if all(p.done for p in parsers.values()):
+            break
+
+    return {k: p.value for k, p in parsers.items()}
+
+
+def pretty_size(size: int) -> str:
+    qs = "", "K", "M", "G", "T", "P"
+    for q in qs:
+        if size < 1024 or q == qs[-1]:
+            break
+        size /= 1000
+    if not q:
+        return f"{size} B"
+    return f"{size:_.02f} {q}B"
+
+
+def uniq(col: Collection[Hashable]) -> Collection[Hashable]:
+    return type(col)({k: None for k in col})
+
+
+def aslist(o: Any):
+    if o is None:
+        return []
+    return o if type(o) is list else [o]
+
+
+@dataclass
+class LinkedData:
+    title: Optional[str]
+    image: Optional[str]
+    genres: List[str]
+    description: Optional[str]
+    published: Optional[datetime]
+    creators: List[str]
+
+    @classmethod
+    def from_json(cls, o: Mapping[str, Any]):
+        # https://schema.org/Movie
+        # https://schema.org/NewsArticle
+        creators = []
+        for k in "director", "creator", "author", "producer", "contributor":
+            if k in o:
+                creators += [p["name"] for p in aslist(o[k]) if p["@type"] == "Person"]
+        return cls(
+            title=o.get("headline") or o.get("name"),
+            published=(
+                datetime.fromisoformat(o["datePublished"])
+                if "datePublished" in o
+                else None
+            ),
+            image=o.get("image"),
+            description=o.get("description"),
+            genres=uniq(aslist(o.get("genre"))),
+            creators=uniq(creators),
+        )
+
+
+def parse_ldjson(ldjson: str) -> Iterable[LinkedData]:
+    ld: Union[dict, list] = json.loads(ldjson)
+    for o in aslist(ld):
+        if o.get("@context") != "https://schema.org":
+            log.debug("Unknown context in Linked Data.")
+        else:
+            yield LinkedData.from_json(o)
".join(lines) + plain = strip_tags(" — ".join(lines)) + return html, plain + + +async def handle(message: Message, url, info): + roomconf = message.app.config.l6n[message.room.room_id] + plain = html = None + if info.extracted.ld: + html, plain = ld_details( + info.extracted.ld, tz=roomconf["timezone"], lc=roomconf["locale"] + ) + else: + html = generic_details(info, url) + + await reply(message, plain, html=html, in_thread=True) + + +@dataclass +class Extracted: + ld: Optional["LinkedData"] = None + title: Optional[str] = None + + +async def extractor(info): + content_type = info._resp.headers.get("Content-Type", "") + is_html = content_type.startswith("text/html") or info.final_url.lower().endswith( + (".html", ".htm") + ) + info.extracted = Extracted() + if is_html: + parsed = parse_html(info._chunks_str) + info.extracted.title = parsed["title"] + info.extracted.ld = ( + next(parse_ldjson(parsed["ldjson"])) if parsed["ldjson"] else None + ) + + +def parse_html(content: Iterable[str]) -> Mapping[str, Optional[str]]: + parsers = { + "ldjson": ElementParser( + lambda tag, attrs: ( + tag == "script" and dict(attrs).get("type") == "application/ld+json" + ) + ), + "title": ElementParser(lambda tag, attrs: tag == "title"), + } + + for chunk in content: + for p in parsers.values(): + if not p.done: + p.feed(chunk) + if all(p.done for p in parsers.values()): + break + + return {k: p.value for k, p in parsers.items()} + + +def pretty_size(size: int) -> str: + qs = "", "K", "M", "G", "T", "P" + for q in qs: + if size < 1024 or q == qs[-1]: + break + size /= 1000 + if not q: + return f"{size} B" + return f"{size:_.02f} {q}B" + + +def uniq(col: Collection[Hashable]) -> Collection[Hashable]: + return type(col)({k: None for k in col}) + + +def aslist(o: Any): + if o is None: + return [] + return o if type(o) is list else [o] + + +@dataclass +class LinkedData: + title: Optional[str] + image: Optional[str] + genres: List[str] + description: Optional[str] + published: Optional[datetime] + creators: List[str] + + @classmethod + def from_json(cls, o: Mapping[str, Any]): + # https://schema.org/Movie + # https://schema.org/NewsArticle + creators = [] + for k in "director", "creator", "author", "producer", "contributor": + if k in o: + creators += [p["name"] for p in aslist(o[k]) if p["@type"] == "Person"] + return cls( + title=o.get("headline") or o.get("name"), + published=( + datetime.fromisoformat(o["datePublished"]) + if "datePublished" in o + else None + ), + image=o.get("image"), + description=o.get("description"), + genres=uniq(aslist(o.get("genre"))), + creators=uniq(creators), + ) + + +def parse_ldjson(ldjson: str) -> Iterable[LinkedData]: + ld: Union[dict, list] = json.loads(ldjson) + for o in aslist(ld): + if o.get("@context") != "https://schema.org": + log.debug("Unknown context in Linked Data.") + else: + yield LinkedData.from_json(o) diff --git a/hotdog/functions.py b/hotdog/functions.py index 179b235..a20ac29 100644 --- a/hotdog/functions.py +++ b/hotdog/functions.py @@ -284,7 +284,7 @@ class ElementParser(HTMLParser): super().__init__() self.selector = selector self.__active_tag = None - self.__done = False + self.done = False self.__value = "" def handle_starttag(self, tag, attrs): @@ -293,21 +293,21 @@ class ElementParser(HTMLParser): def handle_endtag(self, tag): if tag == self.__active_tag: - self.__done = True + self.done = True self.__active_tag = None def handle_data(self, data): - if self.__active_tag and not self.__done: + if self.__active_tag and not self.done: self.__value += data 
     @property
     def value(self) -> Optional[str]:
-        return self.__value if self.__done else None
+        return self.__value if self.done else None
 
     def load_chunks(self, content: Iterable[str]) -> None:
         for chunk in content:
             self.feed(chunk)
-            if self.__done:
+            if self.done:
                 break
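
Note on the resulting shape (a sketch, not part of the patch): after this
split, a urlinfo_ handler is any module exposing extractor(info) and
handle(message, url, info); modules listed in the handlers tuple additionally
need can_handle(url). The generic module omits can_handle because handle()
only ever picks it as the fallback. Below is a hypothetical handler in the
imdb/generic shape; everything besides those three entry points is invented.

    # hotdog/command/urlinfo_/example.py (hypothetical)
    from ...functions import reply
    from ...models import Message


    def can_handle(url: str) -> bool:
        # Claim URLs for a single host; anything else falls through to
        # the generic handler.
        return "example.com" in url


    async def extractor(info):
        # Called by load_info(); stash whatever handle() needs onto
        # info.extracted.
        info.extracted = None


    async def handle(message: Message, url: str, info):
        await reply(message, f"Example resource: {url}", in_thread=True)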