urlinfo:generic: improve JSON-LD handling

Support (or at least acknowledge) multiple JSON-LD script tags per page,
and accept `@context` values that use http://schema.org instead of https://schema.org.
This commit is contained in:
ducklet 2020-11-14 14:37:09 +01:00
parent c798f8d776
commit acd1f949d2

View file

@ -73,7 +73,7 @@ async def handle(message: Message, url, load_info):
roomconf = message.app.config.l6n[message.room.room_id]
plain = html = None
if info.extracted.ld:
if info.extracted.ld and info.extracted.ld.description:
html, plain = ld_details(
info.extracted.ld, tz=roomconf["timezone"], lc=roomconf["locale"]
)
@ -98,9 +98,9 @@ async def extractor(info):
if is_html:
parsed = parse_html(info._chunks_str)
info.extracted.title = parsed["title"]
info.extracted.ld = (
next(parse_ldjson(parsed["ldjson"])) if parsed["ldjson"] else None
)
info.extracted.ld = next(
parse_ldjson(parsed["ldjson"]), None
) # XXX We'll just go with the first entry for now, would be better to look at the LD type and pick & choose.
def parse_html(content: Iterable[str]) -> Mapping[str, Optional[str]]:
@ -108,7 +108,8 @@ def parse_html(content: Iterable[str]) -> Mapping[str, Optional[str]]:
"ldjson": ElementParser(
lambda tag, attrs: (
tag == "script" and dict(attrs).get("type") == "application/ld+json"
)
),
stop_after_one=False,
),
"title": ElementParser(lambda tag, attrs: tag == "title"),
}
@ -120,7 +121,10 @@ def parse_html(content: Iterable[str]) -> Mapping[str, Optional[str]]:
if all(p.done for p in parsers.values()):
break
return {k: p.value for k, p in parsers.items()}
return {
"ldjson": parsers["ldjson"].values,
"title": parsers["title"].value,
}
def pretty_size(size: int) -> str:
@ -168,17 +172,18 @@ class LinkedData:
if "datePublished" in o
else None
),
image=o.get("image"),
image=o.get("image") or o.get("logo"),
description=o.get("description"),
genres=uniq(aslist(o.get("genre"))),
creators=uniq(creators),
)
def parse_ldjson(ldjson: str) -> Iterable[LinkedData]:
def parse_ldjson(ldjsons: Iterable[str]) -> Iterable[LinkedData]:
for ldjson in ldjsons:
ld: Union[dict, list] = json.loads(ldjson)
for o in aslist(ld):
if o.get("@context") != "https://schema.org":
if o.get("@context") not in ("http://schema.org", "https://schema.org"):
log.debug("Unknown context in Linked Data.")
else:
yield LinkedData.from_json(o)