urlinfo:generic: improve JSON-LD handling

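Collect every JSON-LD script tag on a page instead of stopping at the first one (for now only the first usable entry is actually used), and accept `@context` values that use http://schema.org as well as https://schema.org.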
2020-11-14 14:37:09 +01:00 · 2020-11-14 14:37:09 +01:00 · acd1f949d2
commit acd1f949d2
parent c798f8d776
1 changed files with 19 additions and 14 deletions
--- a/hotdog/command/urlinfo_/generic.py
+++ b/hotdog/command/urlinfo_/generic.py
@@ -73,7 +73,7 @@ async def handle(message: Message, url, load_info):

    roomconf = message.app.config.l6n[message.room.room_id]
    plain = html = None
-    if info.extracted.ld:
+    if info.extracted.ld and info.extracted.ld.description:
        html, plain = ld_details(
            info.extracted.ld, tz=roomconf["timezone"], lc=roomconf["locale"]
        )
@@ -98,9 +98,9 @@ async def extractor(info):
    if is_html:
        parsed = parse_html(info._chunks_str)
        info.extracted.title = parsed["title"]
-        info.extracted.ld = (
-            next(parse_ldjson(parsed["ldjson"])) if parsed["ldjson"] else None
-        )
+        info.extracted.ld = next(
+            parse_ldjson(parsed["ldjson"]), None
+        )  # XXX We'll just go with the first entry for now, would be better to look at the LD type and pick & choose.


 def parse_html(content: Iterable[str]) -> Mapping[str, Optional[str]]:
@@ -108,7 +108,8 @@ def parse_html(content: Iterable[str]) -> Mapping[str, Optional[str]]:
        "ldjson": ElementParser(
            lambda tag, attrs: (
                tag == "script" and dict(attrs).get("type") == "application/ld+json"
-            )
+            ),
+            stop_after_one=False,
        ),
        "title": ElementParser(lambda tag, attrs: tag == "title"),
    }
@@ -120,7 +121,10 @@ def parse_html(content: Iterable[str]) -> Mapping[str, Optional[str]]:
        if all(p.done for p in parsers.values()):
            break

-    return {k: p.value for k, p in parsers.items()}
+    return {
+        "ldjson": parsers["ldjson"].values,
+        "title": parsers["title"].value,
+    }


 def pretty_size(size: int) -> str:
@@ -168,17 +172,18 @@ class LinkedData:
                if "datePublished" in o
                else None
            ),
-            image=o.get("image"),
+            image=o.get("image") or o.get("logo"),
            description=o.get("description"),
            genres=uniq(aslist(o.get("genre"))),
            creators=uniq(creators),
        )


-def parse_ldjson(ldjson: str) -> Iterable[LinkedData]:
+def parse_ldjson(ldjsons: Iterable[str]) -> Iterable[LinkedData]:
+    for ldjson in ldjsons:
        ld: Union[dict, list] = json.loads(ldjson)
        for o in aslist(ld):
-        if o.get("@context") != "https://schema.org":
+            if o.get("@context") not in ("http://schema.org", "https://schema.org"):
                log.debug("Unknown context in Linked Data.")
            else:
                yield LinkedData.from_json(o)