urlinfo:generic: improve JSON-LD handling

Support (or at least acknowledge) multiple JSON-LD script tags per page,
and accept `@context` values that use HTTP instead of HTTPS.
This commit is contained in:
ducklet 2020-11-14 14:37:09 +01:00
parent c798f8d776
commit acd1f949d2

View file

@ -73,7 +73,7 @@ async def handle(message: Message, url, load_info):
roomconf = message.app.config.l6n[message.room.room_id] roomconf = message.app.config.l6n[message.room.room_id]
plain = html = None plain = html = None
if info.extracted.ld: if info.extracted.ld and info.extracted.ld.description:
html, plain = ld_details( html, plain = ld_details(
info.extracted.ld, tz=roomconf["timezone"], lc=roomconf["locale"] info.extracted.ld, tz=roomconf["timezone"], lc=roomconf["locale"]
) )
@ -98,9 +98,9 @@ async def extractor(info):
if is_html: if is_html:
parsed = parse_html(info._chunks_str) parsed = parse_html(info._chunks_str)
info.extracted.title = parsed["title"] info.extracted.title = parsed["title"]
info.extracted.ld = ( info.extracted.ld = next(
next(parse_ldjson(parsed["ldjson"])) if parsed["ldjson"] else None parse_ldjson(parsed["ldjson"]), None
) ) # XXX We'll just go with the first entry for now, would be better to look at the LD type and pick & choose.
def parse_html(content: Iterable[str]) -> Mapping[str, Optional[str]]: def parse_html(content: Iterable[str]) -> Mapping[str, Optional[str]]:
@ -108,7 +108,8 @@ def parse_html(content: Iterable[str]) -> Mapping[str, Optional[str]]:
"ldjson": ElementParser( "ldjson": ElementParser(
lambda tag, attrs: ( lambda tag, attrs: (
tag == "script" and dict(attrs).get("type") == "application/ld+json" tag == "script" and dict(attrs).get("type") == "application/ld+json"
) ),
stop_after_one=False,
), ),
"title": ElementParser(lambda tag, attrs: tag == "title"), "title": ElementParser(lambda tag, attrs: tag == "title"),
} }
@ -120,7 +121,10 @@ def parse_html(content: Iterable[str]) -> Mapping[str, Optional[str]]:
if all(p.done for p in parsers.values()): if all(p.done for p in parsers.values()):
break break
return {k: p.value for k, p in parsers.items()} return {
"ldjson": parsers["ldjson"].values,
"title": parsers["title"].value,
}
def pretty_size(size: int) -> str: def pretty_size(size: int) -> str:
@ -168,17 +172,18 @@ class LinkedData:
if "datePublished" in o if "datePublished" in o
else None else None
), ),
image=o.get("image"), image=o.get("image") or o.get("logo"),
description=o.get("description"), description=o.get("description"),
genres=uniq(aslist(o.get("genre"))), genres=uniq(aslist(o.get("genre"))),
creators=uniq(creators), creators=uniq(creators),
) )
def parse_ldjson(ldjson: str) -> Iterable[LinkedData]: def parse_ldjson(ldjsons: Iterable[str]) -> Iterable[LinkedData]:
ld: Union[dict, list] = json.loads(ldjson) for ldjson in ldjsons:
for o in aslist(ld): ld: Union[dict, list] = json.loads(ldjson)
if o.get("@context") != "https://schema.org": for o in aslist(ld):
log.debug("Unknown context in Linked Data.") if o.get("@context") not in ("http://schema.org", "https://schema.org"):
else: log.debug("Unknown context in Linked Data.")
yield LinkedData.from_json(o) else:
yield LinkedData.from_json(o)