hotdog/hotdog/command/wikipedia.py

import asyncio
import logging
from dataclasses import dataclass
from datetime import datetime
from functools import lru_cache
from html import escape
from time import time as now
from typing import Collection, Iterable
import requests
from ..functions import localizedtz, react, reply, strip_tags
from ..models import Message
log = logging.getLogger(__name__)
HELP = """Look up articles on Wikipedia.
!w[p|ikipedia] [lang (ISO 639)] <search terms ...>
"""


def init(bot):
    bot.shared[__name__] = {"last": 0}  # time of the last API call, used by handle() for rate limiting
    bot.on_command({"w", "wp", "wikipedia"}, handle)
api_url = "https://{lang}.wikipedia.org/w/api.php"
# see https://de.wikipedia.org/wiki/Liste_der_Wikipedias_nach_Sprachen
langs = {
    "aa", "ab", "ace", "af", "ak", "als", "am", "an", "ang", "ar", "arc", "as",
    "ast", "av", "ay", "az", "ba", "bar", "bat-smg", "bcl", "be", "bg", "bh",
    "bi", "bjn", "bm", "bn", "bo", "bpy", "br", "bs", "bug", "bxr", "ca",
    "cbk-zam", "cdo", "ce", "ceb", "ch", "cho", "chr", "chy", "ckb", "co", "cr",
    "crh", "cs", "csb", "cu", "cv", "cy", "da", "de", "diq", "dsb", "dv", "dz",
    "ee", "el", "eml", "en", "eo", "es", "et", "eu", "ext", "fa", "ff", "fi",
    "fiu-vro", "fj", "fo", "fr", "frp", "frr", "fur", "fy", "ga", "gag", "gan",
    "gd", "gl", "glk", "gn", "got", "gu", "gv", "ha", "hak", "haw", "he", "hi",
    "hif", "ho", "hr", "hsb", "ht", "hu", "hy", "hz", "ia", "id", "ie", "ig",
    "ii", "ik", "ilo", "io", "is", "it", "iu", "ja", "jbo", "jv", "ka", "kaa",
    "kab", "kbd", "kg", "kj", "kk", "kl", "km", "kn", "ko", "koi", "kr", "krc",
    "ks", "ksh", "ku", "kv", "kw", "ky", "la", "lad", "lb", "lbe", "lez", "lg",
    "li", "lij", "lmo", "ln", "lo", "lt", "ltg", "lv", "map-bms", "mdf", "mg",
    "mh", "mhr", "mi", "mk", "ml", "mn", "mr", "mrj", "ms", "mt", "mus", "mwl",
    "my", "myv", "mzn", "na", "nah", "nap", "nds", "nds-nl", "ne", "new", "ng",
    "nl", "nn", "no", "nov", "nrm", "nso", "nv", "oc", "om", "or", "os", "pa",
    "pag", "pam", "pap", "pcd", "pdc", "pfl", "pi", "pih", "pl", "pms", "pnb",
    "pnt", "ps", "pt", "qu", "rm", "rmy", "rn", "ro", "roa-rup", "roa-tara",
    "ru", "rue", "rw", "sa", "sc", "scn", "sco", "sd", "se", "sg", "sh", "si",
    "simple", "sk", "sl", "sm", "sn", "so", "sq", "sr", "srn", "ss", "st",
    "stq", "su", "sv", "sw", "szl", "ta", "te", "tet", "tg", "th", "ti", "tk",
    "tl", "tn", "to", "tpi", "tr", "ts", "tt", "tum", "tw", "ty", "udm", "ug",
    "uk", "ur", "uz", "ve", "vec", "vep", "vi", "vls", "vo", "wa", "war", "wo",
    "wuu", "xal", "xh", "xmf", "yi", "yo", "za", "zea", "zh", "zh-classical",
    "zh-min-nan", "zh-yue", "zu",
}
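# A first argument that is not in this set is treated as part of the search terms;
# handle() then falls back to the language taken from the room's configured locale.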


def searchparams(terms: Iterable[str]):
    # see https://www.mediawiki.org/wiki/API:Search
    return {
        "action": "query",
        "list": "search",
        "srsearch": " ".join(terms),
        "format": "json",
        "srlimit": 3,
    }
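# Illustrative example (made-up search terms): searchparams(["matrix", "bot"]) returns
# {"action": "query", "list": "search", "srsearch": "matrix bot",
#  "format": "json", "srlimit": 3}.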


def resolveparams(ids: Iterable[int]):
    return {
        "action": "query",
        "prop": "info",
        "pageids": "|".join(map(str, ids)),
        "inprop": "url",
        "format": "json",
    }
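# Illustrative example (made-up page ids): resolveparams([123, 456]) sets
# "pageids" to "123|456", the pipe-separated form the MediaWiki API expects.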


@dataclass(frozen=True)
class Hit:
    # ns: int
    pageid: int
    size: int
    snippet: str
    timestamp: datetime
    title: str
    wordcount: int

    @classmethod
    def from_json(cls, data):
        return cls(
            data["pageid"],
            data["size"],
            data["snippet"],
            fromjsonformat(data["timestamp"]),
            data["title"],
            data["wordcount"],
        )


def fromjsonformat(s: str) -> datetime:
    # The API returns ISO 8601 timestamps with a trailing "Z"; fromisoformat()
    # only accepts that suffix from Python 3.11 on, so rewrite it as "+00:00".
    if s.endswith("Z"):
        s = s[:-1] + "+00:00"
    return datetime.fromisoformat(s)


def load_api_json(session, lang, params):
    r = session.get(
        api_url.format(lang=lang),
        params=params,
        timeout=(3, 3),  # (connect, read) timeouts in seconds
        headers={"User-Agent": "hotdog/v1 wikipedia"},
    )
    r.raise_for_status()
    return r.json()


def search(session, lang, terms):
    data = load_api_json(session, lang, searchparams(terms))
    return {
        "total": data["query"]["searchinfo"]["totalhits"],
        "hits": [Hit.from_json(d) for d in data["query"]["search"]],
    }


def resolve_urls(session, lang, ids: Collection[int]):
    if not ids:
        return {}
    data = load_api_json(session, lang, resolveparams(ids))
    return {
        int(pid): p.get("canonicalurl") for pid, p in data["query"]["pages"].items()
    }


@lru_cache(maxsize=10)
def search_and_resolve(lang, terms):
    session = requests.Session()
    result = search(session, lang, terms)
    result["urls"] = resolve_urls(session, lang, [hit.pageid for hit in result["hits"]])
    return result
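# Usage sketch (hypothetical values): repeated identical queries reuse the cached
# result instead of hitting the API again, e.g.
#     search_and_resolve("en", ("matrix", "bot"))["hits"]
# Note that lru_cache requires `terms` to be hashable, e.g. a tuple.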


def snippet(s: str, max_content_len: int = 300):
    content = ""
    for word in s.split():
        if not word:
            continue
        if len(content + f" {word}") > max_content_len - 3:
            content += " […]"
            break
        content += f" {word}"
    return content
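# Behaviour note: words are appended until the text would exceed roughly
# max_content_len characters, then " […]" is appended; the result keeps the
# single leading space produced by the f" {word}" concatenation.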


async def handle(message: Message):
    bot = message.app
    roomconf = bot.config.l6n[message.room.room_id]
    if message.args.get(0) in langs:
        lang = message.args[0]
        args = message.args[1:]
    else:
        lang, *_ = roomconf["locale"].split("_", 1)
        args = message.args
    if not args:
        return
    await react(message, "⚡️")
    # XXX no need to wait if the result is already cached - can we check that?
    # Guard against API flooding...
    timeout = 10
    while 0 < (waitfor := timeout - (now() - bot.shared[__name__]["last"])):
        log.debug(f"Waiting for {waitfor}s before next API call.")
        await asyncio.sleep(waitfor)
    bot.shared[__name__]["last"] = now()
    r = search_and_resolve(lang, args)
    if not r["hits"]:
        await react(message, "")
        return
    lines = []
    if r["total"] > 3:
        lines.append(f"Found <b>{r['total']}</b> matching articles.")
    for hit in r["hits"][:3]:
        last_updated = localizedtz(
            hit.timestamp, "%x %X", tzname=roomconf["timezone"], lc=roomconf["locale"]
        )
        teaser = snippet(escape(strip_tags(hit.snippet)))
        lines.append(
            f'<b><a href="{escape(r["urls"][hit.pageid])}">{escape(hit.title)}</a></b>: '
            f"<i>{teaser}</i> (<i>{last_updated}</i>)"
        )
    await asyncio.gather(
        reply(message, html="<br/>\n".join(lines), in_thread=True),
        react(message, ""),
    )