457 lines
7.2 KiB
Python
457 lines
7.2 KiB
Python
import asyncio
|
|
import logging
|
|
from dataclasses import dataclass
|
|
from datetime import datetime
|
|
from functools import lru_cache
|
|
from html import escape
|
|
from time import time as now
|
|
from typing import *
|
|
|
|
import requests
|
|
|
|
from ..functions import localizedtz, react, reply, strip_tags
|
|
from ..models import Message
|
|
|
|
log = logging.getLogger(__name__)
|
|
|
|
HELP = """Look up articles on Wikipedia.
|
|
!w[p|ikipedia] [lang (ISO 639)] <search terms ...>
|
|
"""
|
|
|
|
|
|
def init(bot):
    """Set up this plugin: per-module rate-limit state and command hooks."""
    # "last" holds the epoch time of the most recent API call (see handle()).
    state = {"last": 0}
    bot.shared[__name__] = state
    bot.on_command({"w", "wp", "wikipedia"}, handle)
|
|
|
|
|
|
api_url = "https://{lang}.wikipedia.org/w/api.php"
|
|
|
|
# see https://de.wikipedia.org/wiki/Liste_der_Wikipedias_nach_Sprachen
|
|
langs = {
|
|
"aa",
|
|
"ab",
|
|
"ace",
|
|
"af",
|
|
"ak",
|
|
"als",
|
|
"am",
|
|
"an",
|
|
"ang",
|
|
"ar",
|
|
"arc",
|
|
"as",
|
|
"ast",
|
|
"av",
|
|
"ay",
|
|
"az",
|
|
"ba",
|
|
"bar",
|
|
"bat-smg",
|
|
"bcl",
|
|
"be",
|
|
"bg",
|
|
"bh",
|
|
"bi",
|
|
"bjn",
|
|
"bm",
|
|
"bn",
|
|
"bo",
|
|
"bpy",
|
|
"br",
|
|
"bs",
|
|
"bug",
|
|
"bxr",
|
|
"ca",
|
|
"cbk-zam",
|
|
"cdo",
|
|
"ce",
|
|
"ceb",
|
|
"ch",
|
|
"cho",
|
|
"chr",
|
|
"chy",
|
|
"ckb",
|
|
"co",
|
|
"cr",
|
|
"crh",
|
|
"cs",
|
|
"csb",
|
|
"cu",
|
|
"cv",
|
|
"cy",
|
|
"da",
|
|
"de",
|
|
"diq",
|
|
"dsb",
|
|
"dv",
|
|
"dz",
|
|
"ee",
|
|
"el",
|
|
"eml",
|
|
"en",
|
|
"eo",
|
|
"es",
|
|
"et",
|
|
"eu",
|
|
"ext",
|
|
"fa",
|
|
"ff",
|
|
"fi",
|
|
"fiu-vro",
|
|
"fj",
|
|
"fo",
|
|
"fr",
|
|
"frp",
|
|
"frr",
|
|
"fur",
|
|
"fy",
|
|
"ga",
|
|
"gag",
|
|
"gan",
|
|
"gd",
|
|
"gl",
|
|
"glk",
|
|
"gn",
|
|
"got",
|
|
"gu",
|
|
"gv",
|
|
"ha",
|
|
"hak",
|
|
"haw",
|
|
"he",
|
|
"hi",
|
|
"hif",
|
|
"ho",
|
|
"hr",
|
|
"hsb",
|
|
"ht",
|
|
"hu",
|
|
"hy",
|
|
"hz",
|
|
"ia",
|
|
"id",
|
|
"ie",
|
|
"ig",
|
|
"ii",
|
|
"ik",
|
|
"ilo",
|
|
"io",
|
|
"is",
|
|
"it",
|
|
"iu",
|
|
"ja",
|
|
"jbo",
|
|
"jv",
|
|
"ka",
|
|
"kaa",
|
|
"kab",
|
|
"kbd",
|
|
"kg",
|
|
"kj",
|
|
"kk",
|
|
"kl",
|
|
"km",
|
|
"kn",
|
|
"ko",
|
|
"koi",
|
|
"kr[",
|
|
"krc",
|
|
"ks",
|
|
"ksh",
|
|
"ku",
|
|
"kv",
|
|
"kw",
|
|
"ky",
|
|
"la",
|
|
"lad",
|
|
"lb",
|
|
"lbe",
|
|
"lez",
|
|
"lg",
|
|
"li",
|
|
"lij",
|
|
"lmo",
|
|
"ln",
|
|
"lo",
|
|
"lt",
|
|
"ltg",
|
|
"lv",
|
|
"map-bms",
|
|
"mdf",
|
|
"mg",
|
|
"mh",
|
|
"mhr",
|
|
"mi",
|
|
"mk",
|
|
"ml",
|
|
"mn",
|
|
"mr",
|
|
"mrj",
|
|
"ms",
|
|
"mt",
|
|
"mus",
|
|
"mwl",
|
|
"my",
|
|
"myv",
|
|
"mzn",
|
|
"na",
|
|
"nah",
|
|
"nap",
|
|
"nds",
|
|
"nds-nl",
|
|
"ne",
|
|
"new",
|
|
"ng",
|
|
"nl",
|
|
"nn",
|
|
"no",
|
|
"nov",
|
|
"nrm",
|
|
"nso",
|
|
"nv",
|
|
"oc",
|
|
"om",
|
|
"or",
|
|
"os",
|
|
"pa",
|
|
"pag",
|
|
"pam",
|
|
"pap",
|
|
"pcd",
|
|
"pdc",
|
|
"pfl",
|
|
"pi",
|
|
"pih",
|
|
"pl",
|
|
"pms",
|
|
"pnb",
|
|
"pnt",
|
|
"ps",
|
|
"pt",
|
|
"qu",
|
|
"rm",
|
|
"rmy",
|
|
"rn",
|
|
"ro",
|
|
"roa-rup",
|
|
"roa-tara",
|
|
"ru",
|
|
"rue",
|
|
"rw",
|
|
"sa",
|
|
"sc",
|
|
"scn",
|
|
"sco",
|
|
"sd",
|
|
"se",
|
|
"sg",
|
|
"sh",
|
|
"si",
|
|
"simple",
|
|
"sk",
|
|
"sl",
|
|
"sm",
|
|
"sn",
|
|
"so",
|
|
"sq",
|
|
"sr",
|
|
"srn",
|
|
"ss",
|
|
"st",
|
|
"stq",
|
|
"su",
|
|
"sv",
|
|
"sw",
|
|
"szl",
|
|
"ta",
|
|
"te",
|
|
"tet",
|
|
"tg",
|
|
"th",
|
|
"ti",
|
|
"tk",
|
|
"tl",
|
|
"tn",
|
|
"to",
|
|
"tpi",
|
|
"tr",
|
|
"ts",
|
|
"tt",
|
|
"tum",
|
|
"tw",
|
|
"ty",
|
|
"udm",
|
|
"ug",
|
|
"uk",
|
|
"ur",
|
|
"uz",
|
|
"ve",
|
|
"vec",
|
|
"vep",
|
|
"vi",
|
|
"vls",
|
|
"vo",
|
|
"wa",
|
|
"war",
|
|
"wo",
|
|
"wuu",
|
|
"xal",
|
|
"xh",
|
|
"xmf",
|
|
"yi",
|
|
"yo",
|
|
"za",
|
|
"zea",
|
|
"zh",
|
|
"zh-classical",
|
|
"zh-min-nan",
|
|
"zh-yue",
|
|
"zu",
|
|
"zu",
|
|
}
|
|
|
|
|
|
def searchparams(terms: Iterable[str]) -> dict:
    """Build MediaWiki API query parameters for a full-text search.

    see https://www.mediawiki.org/wiki/API:Search
    """
    query = " ".join(terms)
    return dict(
        action="query",
        list="search",
        srsearch=query,
        format="json",
        srlimit=3,  # only the top three hits are ever shown
    )
|
|
|
|
|
|
def resolveparams(ids: Iterable[int]) -> dict:
    """Build MediaWiki API parameters resolving page ids to canonical URLs."""
    # The API expects pipe-separated page ids in a single parameter.
    joined = "|".join(str(i) for i in ids)
    return dict(
        action="query",
        prop="info",
        pageids=joined,
        inprop="url",
        format="json",
    )
|
|
|
|
|
|
@dataclass(frozen=True)
class Hit:
    """A single search result from the MediaWiki search API."""

    # NOTE: the API also returns an "ns" field; it is deliberately not stored.
    pageid: int
    size: int
    snippet: str
    timestamp: datetime
    title: str
    wordcount: int

    @classmethod
    def from_json(cls, data) -> "Hit":
        """Build a Hit from one entry of ``query.search`` in an API response."""
        return cls(
            pageid=data["pageid"],
            size=data["size"],
            snippet=data["snippet"],
            timestamp=fromjsonformat(data["timestamp"]),
            title=data["title"],
            wordcount=data["wordcount"],
        )
|
|
|
|
|
|
def fromjsonformat(s: str) -> datetime:
    """Parse a MediaWiki ISO-8601 timestamp; a trailing 'Z' means UTC."""
    # datetime.fromisoformat (before 3.11) rejects the 'Z' suffix, so
    # rewrite it as the equivalent explicit UTC offset first.
    normalized = f"{s[:-1]}+00:00" if s.endswith("Z") else s
    return datetime.fromisoformat(normalized)
|
|
|
|
|
|
def load_api_json(session, lang, params):
    """GET the Wikipedia API for *lang* and return the decoded JSON body.

    Raises requests.HTTPError on non-2xx responses.
    """
    url = api_url.format(lang=lang)
    response = session.get(
        url,
        params=params,
        timeout=(3, 3),  # (connect, read) seconds
        headers={"User-Agent": "hotdog/v1 wikipedia"},
    )
    response.raise_for_status()
    return response.json()
|
|
|
|
|
|
def search(session, lang, terms):
    """Run a full-text search; return the total hit count and up to 3 Hits."""
    payload = load_api_json(session, lang, searchparams(terms))
    query = payload["query"]
    hits = list(map(Hit.from_json, query["search"]))
    return {"total": query["searchinfo"]["totalhits"], "hits": hits}
|
|
|
|
|
|
def resolve_urls(session, lang, ids: Collection[int]):
    """Map each page id in *ids* to its canonical article URL.

    Returns an empty dict without touching the network when *ids* is empty.
    """
    if not ids:
        return {}
    data = load_api_json(session, lang, resolveparams(ids))
    urls = {}
    # The API keys pages by stringified id; normalize back to int.
    for pid, page in data["query"]["pages"].items():
        urls[int(pid)] = page.get("canonicalurl")
    return urls
|
|
|
|
|
|
@lru_cache(maxsize=10)
def search_and_resolve(lang, terms):
    """Search Wikipedia and attach canonical URLs; memoized on (lang, terms).

    NOTE(review): *terms* must be hashable (e.g. a tuple, not a list) for the
    lru_cache to accept it — confirm what callers pass.
    """
    session = requests.Session()
    found = search(session, lang, terms)
    page_ids = [hit.pageid for hit in found["hits"]]
    found["urls"] = resolve_urls(session, lang, page_ids)
    return found
|
|
|
|
|
|
def snippet(s: str, max_content_len: int = 300):
    """Truncate *s* on a word boundary to at most *max_content_len* chars.

    Words are re-joined with single spaces; the result keeps this function's
    historical leading-space format (every word, including the first, is
    preceded by one space).  When text is cut short, " […]" marks the spot.

    Fixes: removed the dead `if not word` branch (str.split() never yields
    empty strings) and reserved the marker's true length (4 chars, not 3) so
    the result can no longer exceed max_content_len by one character.
    """
    marker = " […]"
    content = ""
    for word in s.split():
        # Stop early, leaving room for the truncation marker.
        if len(content) + 1 + len(word) > max_content_len - len(marker):
            content += marker
            break
        content += f" {word}"
    return content
|
|
|
|
|
|
async def handle(message: Message):
    """Handle !w/!wp/!wikipedia: search Wikipedia and reply with the top hits.

    An optional first argument selects the wiki language (ISO 639 code, must
    be in `langs`); otherwise the room locale's language part is used.
    """
    bot = message.app
    # Per-room localization config (locale + timezone) for timestamp display.
    roomconf = bot.config.l6n[message.room.room_id]
    if message.args.get(0) in langs:
        # Explicit language prefix, e.g. "!w de berlin".
        lang = message.args[0]
        args = message.args[1:]
    else:
        # Fall back to the room locale, e.g. "de_DE" -> "de".
        lang, *_ = roomconf["locale"].split("_", 1)
        args = message.args

    if not args:
        return

    # Acknowledge receipt immediately; the API calls below may take a while.
    await react(message, "⚡️")

    # XXX no need to wait if the result is already cached - can we check that?

    # Guard against API flooding...
    # Sleep until at least `timeout` seconds have passed since the last call.
    # NOTE(review): read-then-write of shared["last"] is not atomic across
    # concurrently running handlers — confirm whether that matters here.
    timeout = 10
    while 0 < (waitfor := timeout - (now() - bot.shared[__name__]["last"])):
        log.debug(f"Waiting for {waitfor}s before next API call.")
        await asyncio.sleep(waitfor)
    bot.shared[__name__]["last"] = now()

    # NOTE(review): search_and_resolve is lru_cached, so `args` must be
    # hashable — assumes message.args slices to a tuple; confirm.
    r = search_and_resolve(lang, args)

    if not r["hits"]:
        # Nothing found: signal failure with a reaction instead of a reply.
        await react(message, "❌")
        return

    lines = []
    if r["total"] > 3:
        lines.append(f"Found <b>{r['total']}</b> matching articles.")
    for hit in r["hits"][:3]:
        # Localize the article's last-modified timestamp to the room's
        # timezone and locale.
        last_updated = localizedtz(
            hit.timestamp, "%x %X", tzname=roomconf["timezone"], lc=roomconf["locale"]
        )
        # API snippets contain HTML highlighting; strip it, then re-escape
        # everything for the outgoing HTML reply.
        teaser = snippet(escape(strip_tags(f"… {hit.snippet} …")))
        lines.append(
            f'<b><a href="{escape(r["urls"][hit.pageid])}">{escape(hit.title)}</a></b>: <i>{teaser}</i> (<i>{last_updated}</i>)'
        )

    # Send the reply and the success reaction concurrently.
    await asyncio.gather(
        reply(message, html="<br/>\n".join(lines), in_thread=True),
        react(message, "✅"),
    )
|