urlinfo:generic: improve JSON-LD handling
Support (or at least acknowledge) multiple JSON-LD script tags per page (only the first parsed entry is used for now), and accept `@context` values that use `http://schema.org` as well as `https://schema.org`.
This commit is contained in:
parent
c798f8d776
commit
acd1f949d2
1 changed file with 19 additions and 14 deletions
|
|
@ -73,7 +73,7 @@ async def handle(message: Message, url, load_info):
|
|||
|
||||
roomconf = message.app.config.l6n[message.room.room_id]
|
||||
plain = html = None
|
||||
if info.extracted.ld:
|
||||
if info.extracted.ld and info.extracted.ld.description:
|
||||
html, plain = ld_details(
|
||||
info.extracted.ld, tz=roomconf["timezone"], lc=roomconf["locale"]
|
||||
)
|
||||
|
|
@ -98,9 +98,9 @@ async def extractor(info):
|
|||
if is_html:
|
||||
parsed = parse_html(info._chunks_str)
|
||||
info.extracted.title = parsed["title"]
|
||||
info.extracted.ld = (
|
||||
next(parse_ldjson(parsed["ldjson"])) if parsed["ldjson"] else None
|
||||
)
|
||||
info.extracted.ld = next(
|
||||
parse_ldjson(parsed["ldjson"]), None
|
||||
) # XXX We'll just go with the first entry for now, would be better to look at the LD type and pick & choose.
|
||||
|
||||
|
||||
def parse_html(content: Iterable[str]) -> Mapping[str, Optional[str]]:
|
||||
|
|
@ -108,7 +108,8 @@ def parse_html(content: Iterable[str]) -> Mapping[str, Optional[str]]:
|
|||
"ldjson": ElementParser(
|
||||
lambda tag, attrs: (
|
||||
tag == "script" and dict(attrs).get("type") == "application/ld+json"
|
||||
)
|
||||
),
|
||||
stop_after_one=False,
|
||||
),
|
||||
"title": ElementParser(lambda tag, attrs: tag == "title"),
|
||||
}
|
||||
|
|
@ -120,7 +121,10 @@ def parse_html(content: Iterable[str]) -> Mapping[str, Optional[str]]:
|
|||
if all(p.done for p in parsers.values()):
|
||||
break
|
||||
|
||||
return {k: p.value for k, p in parsers.items()}
|
||||
return {
|
||||
"ldjson": parsers["ldjson"].values,
|
||||
"title": parsers["title"].value,
|
||||
}
|
||||
|
||||
|
||||
def pretty_size(size: int) -> str:
|
||||
|
|
@ -168,17 +172,18 @@ class LinkedData:
|
|||
if "datePublished" in o
|
||||
else None
|
||||
),
|
||||
image=o.get("image"),
|
||||
image=o.get("image") or o.get("logo"),
|
||||
description=o.get("description"),
|
||||
genres=uniq(aslist(o.get("genre"))),
|
||||
creators=uniq(creators),
|
||||
)
|
||||
|
||||
|
||||
def parse_ldjson(ldjson: str) -> Iterable[LinkedData]:
|
||||
def parse_ldjson(ldjsons: Iterable[str]) -> Iterable[LinkedData]:
|
||||
for ldjson in ldjsons:
|
||||
ld: Union[dict, list] = json.loads(ldjson)
|
||||
for o in aslist(ld):
|
||||
if o.get("@context") != "https://schema.org":
|
||||
if o.get("@context") not in ("http://schema.org", "https://schema.org"):
|
||||
log.debug("Unknown context in Linked Data.")
|
||||
else:
|
||||
yield LinkedData.from_json(o)
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue