From acd1f949d29710bb668de33c183905ccaf0e2a39 Mon Sep 17 00:00:00 2001 From: ducklet Date: Sat, 14 Nov 2020 14:37:09 +0100 Subject: [PATCH] urlinfo:generic: improve JSON-LD handling Support (or at least acknowledge) multiple JSON-LD script tags per page, and accept @context values that use http://schema.org instead of https://schema.org. --- hotdog/command/urlinfo_/generic.py | 33 +++++++++++++++++------------- 1 file changed, 19 insertions(+), 14 deletions(-) diff --git a/hotdog/command/urlinfo_/generic.py b/hotdog/command/urlinfo_/generic.py index 62ceb27..358ff36 100644 --- a/hotdog/command/urlinfo_/generic.py +++ b/hotdog/command/urlinfo_/generic.py @@ -73,7 +73,7 @@ async def handle(message: Message, url, load_info): roomconf = message.app.config.l6n[message.room.room_id] plain = html = None - if info.extracted.ld: + if info.extracted.ld and info.extracted.ld.description: html, plain = ld_details( info.extracted.ld, tz=roomconf["timezone"], lc=roomconf["locale"] ) @@ -98,9 +98,9 @@ async def extractor(info): if is_html: parsed = parse_html(info._chunks_str) info.extracted.title = parsed["title"] - info.extracted.ld = ( - next(parse_ldjson(parsed["ldjson"])) if parsed["ldjson"] else None - ) + info.extracted.ld = next( + parse_ldjson(parsed["ldjson"]), None + ) # XXX We'll just go with the first entry for now, would be better to look at the LD type and pick & choose.
def parse_html(content: Iterable[str]) -> Mapping[str, Optional[str]]: @@ -108,7 +108,8 @@ def parse_html(content: Iterable[str]) -> Mapping[str, Optional[str]]: "ldjson": ElementParser( lambda tag, attrs: ( tag == "script" and dict(attrs).get("type") == "application/ld+json" - ) + ), + stop_after_one=False, ), "title": ElementParser(lambda tag, attrs: tag == "title"), } @@ -120,7 +121,10 @@ def parse_html(content: Iterable[str]) -> Mapping[str, Optional[str]]: if all(p.done for p in parsers.values()): break - return {k: p.value for k, p in parsers.items()} + return { + "ldjson": parsers["ldjson"].values, + "title": parsers["title"].value, + } def pretty_size(size: int) -> str: @@ -168,17 +172,18 @@ class LinkedData: if "datePublished" in o else None ), - image=o.get("image"), + image=o.get("image") or o.get("logo"), description=o.get("description"), genres=uniq(aslist(o.get("genre"))), creators=uniq(creators), ) -def parse_ldjson(ldjson: str) -> Iterable[LinkedData]: - ld: Union[dict, list] = json.loads(ldjson) - for o in aslist(ld): - if o.get("@context") != "https://schema.org": - log.debug("Unknown context in Linked Data.") - else: - yield LinkedData.from_json(o) +def parse_ldjson(ldjsons: Iterable[str]) -> Iterable[LinkedData]: + for ldjson in ldjsons: + ld: Union[dict, list] = json.loads(ldjson) + for o in aslist(ld): + if o.get("@context") not in ("http://schema.org", "https://schema.org"): + log.debug("Unknown context in Linked Data.") + else: + yield LinkedData.from_json(o)