add new admin routes to trigger database updates
The two routes allow reloading the whole movie database from IMDb and reloading all users' IMDb ratings. These functions (or very similar ones) were already available via the CLI, but now they can also be triggered remotely by an admin.
parent 1805282d41
commit 294e311aff
5 changed files with 196 additions and 14 deletions
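For illustration, a sketch of how an admin might call the two new endpoints once the service is deployed; only the paths and the POST method come from this commit, while the host, credentials, and the use of basic auth are placeholders:

    import requests  # hypothetical admin-side client, not part of this commit

    BASE = "http://localhost:8000"  # placeholder host and port
    auth = ("admin", "secret")      # placeholder credentials

    # Kicks off a full IMDb reload in the background; returns 202 immediately.
    print(requests.post(f"{BASE}/movies/_reload_imdb", auth=auth).json())

    # Re-fetches every user's IMDb ratings; responds once the refresh is done.
    print(requests.post(f"{BASE}/users/_reload_ratings", auth=auth).json())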
@@ -4,12 +4,13 @@ from pathlib import Path
 import toml
 
 cachedir = (
-    Path(cachedir) if (cachedir := os.getenv("UNWIND_CACHEDIR", ".cache")) else None
+    Path(cachedir) if (cachedir := os.getenv("UNWIND_CACHEDIR", "./.cache")) else None
 )
+datadir = Path(os.getenv("UNWIND_DATA") or "./data")
 debug = os.getenv("DEBUG") == "1"
 loglevel = os.getenv("UNWIND_LOGLEVEL") or ("DEBUG" if debug else "INFO")
-storage_path = os.getenv("UNWIND_STORAGE", "./data/db.sqlite")
-config_path = os.getenv("UNWIND_CONFIG", "./data/config.toml")
+storage_path = os.getenv("UNWIND_STORAGE", datadir / "db.sqlite")
+config_path = os.getenv("UNWIND_CONFIG", datadir / "config.toml")
 
 _config = toml.load(config_path)
 
@@ -37,7 +37,7 @@ log = logging.getLogger(__name__)
 async def refresh_user_ratings_from_imdb(stop_on_dupe=True):
 
     with session() as s:
-        s.headers["Accept-Language"] = "en-GB, en;q=0.5"
+        s.headers["Accept-Language"] = "en-US, en;q=0.5"
 
         for user in await db.get_all(User):
 
@@ -192,6 +192,7 @@ async def load_ratings(user_id):
         ratings, next_url = await parse_page(next_url)
 
         for i, rating in enumerate(ratings):
+            assert rating.user and rating.movie
 
             if i == 0:
                 # All rating objects share the same user.
@@ -6,7 +6,7 @@ from datetime import datetime, timezone
 from pathlib import Path
 from typing import Optional, cast
 
-from . import db
+from . import config, db, request
 from .db import add_or_update_many_movies
 from .imdb import score_from_imdb_rating
 from .models import Movie
@@ -179,11 +179,11 @@ def read_basics(path):
         yield m
 
 
-async def import_from_file(basics_path: Path, ratings_path: Path):
-    log.info("Loading scores ... 💾")
+async def import_from_file(*, basics_path: Path, ratings_path: Path):
+    log.info("💾 Loading scores ...")
     scores = read_ratings_as_scoremap(ratings_path)
 
-    log.info("Importing movies ... 💾")
+    log.info("💾 Importing movies ...")
     total = count_lines(basics_path)
     assert total != 0
     perc = 0.0
@@ -196,7 +196,7 @@ async def import_from_file(basics_path: Path, ratings_path: Path):
    for i, m in enumerate(read_basics(basics_path)):
 
        if i / total > perc:
-           log.info("Imported %s%%", round(perc * 100, 1))
+           log.info("⏳ Imported %s%%", round(perc * 100, 1))
            perc += perc_step
 
        if m.media_type not in {
@@ -222,3 +222,33 @@ async def import_from_file(basics_path: Path, ratings_path: Path):
     if chunk:
         await add_or_update_many_movies(chunk)
         chunk = []
+
+
+async def load_from_web():
+    """Refresh the full IMDb movie database.
+
+    The latest dumps are first downloaded and then imported into the database.
+    Beware that this can take a very long time (about half an hour), as the
+    amount of data is quite large.
+
+    See https://www.imdb.com/interfaces/ and https://datasets.imdbws.com/ for
+    more information on the IMDb database dumps.
+    """
+    basics_url = "https://datasets.imdbws.com/title.basics.tsv.gz"
+    ratings_url = "https://datasets.imdbws.com/title.ratings.tsv.gz"
+    ratings_file = config.datadir / "imdb/title.ratings.tsv.gz"
+    basics_file = config.datadir / "imdb/title.basics.tsv.gz"
+
+    ratings_mtime = ratings_file.stat().st_mtime if ratings_file.exists() else None
+    basics_mtime = basics_file.stat().st_mtime if basics_file.exists() else None
+
+    with request.session():
+        request.download(ratings_url, ratings_file, only_if_newer=True)
+        request.download(basics_url, basics_file, only_if_newer=True)
+
+    is_changed = (
+        ratings_mtime != ratings_file.stat().st_mtime
+        or basics_mtime != basics_file.stat().st_mtime
+    )
+    if is_changed:
+        await import_from_file(basics_path=basics_file, ratings_path=ratings_file)
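A minimal sketch of invoking the new refresh coroutine outside the web app, for example from a maintenance shell; the `unwind` package name is an assumption inferred from the UNWIND_* environment variables above:

    import asyncio

    from unwind import imdb_import  # assumed module path

    # Downloads the IMDb dumps (skipped if unchanged) and re-imports them.
    asyncio.run(imdb_import.load_from_web())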
@@ -1,5 +1,8 @@
+import email.utils
 import json
 import logging
+import os
+import tempfile
 from collections import deque
 from contextlib import contextmanager
 from dataclasses import dataclass
@@ -8,10 +11,11 @@ from hashlib import md5
 from pathlib import Path
 from random import random
 from time import sleep, time
-from typing import Callable, Optional
+from typing import Callable, Optional, Union
 
 import bs4
 import requests
+from requests.status_codes import codes
 from urllib3.util.retry import Retry
 
 from . import config
@@ -44,6 +48,13 @@ _shared_session = None
 
 @contextmanager
 def session():
+    """Return the shared request session.
+
+    The session is shared by all request functions and provides cookie
+    persistence and connection pooling.
+    Opening the session before making a request allows you to set headers
+    or change the retry behavior.
+    """
     global _shared_session
 
     if _shared_session:
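As the new docstring suggests, a caller can open the session first to prime headers before issuing requests through it. A minimal sketch under the assumption that the module imports as `unwind.request`:

    from unwind import request  # assumed module path

    with request.session() as s:
        # Headers set here apply to every request made through the shared session.
        s.headers["Accept-Language"] = "en-US, en;q=0.5"
        resp = request.http_get(s, "https://www.imdb.com/")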
@@ -86,7 +97,7 @@ def throttle(
        if len(calls) == calls.maxlen:
            wait_until = calls.popleft() + per_seconds + jitter()
            timeout = wait_until - time()
-           log.debug(f"waiting {timeout:.2} seconds ... ⏳")
+           log.debug(f"⏳ waiting {timeout:.2} seconds ...")
            sleep(timeout)
 
        # call
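The throttle seen above rate-limits by keeping recent call timestamps in a bounded deque: once the deque is full, the next call sleeps until the oldest timestamp leaves the window. A self-contained sketch of that pattern, with names invented here and the jitter term omitted:

    from collections import deque
    from time import sleep, time

    def make_throttle(max_calls: int, per_seconds: float):
        """Return a callable that blocks to allow at most max_calls per per_seconds."""
        calls: deque = deque(maxlen=max_calls)

        def wait():
            if len(calls) == calls.maxlen:
                # The oldest call leaves the window at popleft() + per_seconds.
                timeout = calls.popleft() + per_seconds - time()
                if timeout > 0:
                    sleep(timeout)
            calls.append(time())

        return wait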
@@ -148,7 +159,7 @@ def http_get(s: requests.Session, url: str, *args, **kwds) -> requests.Response:
     if cachefile:
         if cachefile.exists():
             log.debug(
-                f"loading {req.url} ({req.headers!r}) from cache {cachefile} ... 💾"
+                f"💾 loading {req.url} ({req.headers!a}) from cache {cachefile} ..."
             )
             with cachefile.open() as fp:
                 resp = CachedResponse(**json.load(fp))
@@ -158,7 +169,7 @@ def http_get(s: requests.Session, url: str, *args, **kwds) -> requests.Response:
             )
             return resp
 
-    log.debug(f"loading {req.url} ({req.headers!r}) ... ⚡️")
+    log.debug(f"⚡️ loading {req.url} ({req.headers!a}) ...")
     resp = s.send(req, allow_redirects=False, stream=True)
     resp.raise_for_status()
 
@@ -182,8 +193,130 @@ def http_get(s: requests.Session, url: str, *args, **kwds) -> requests.Response:
 
 
 def soup_from_url(url):
+    """Return a BeautifulSoup instance from the contents for the given URL."""
     with session() as s:
         r = http_get(s, url)
 
     soup = bs4.BeautifulSoup(r.text, "html5lib")
     return soup
+
+
+def last_modified_from_response(resp):
+    if last_mod := resp.headers.get("Last-Modified"):
+        try:
+            return email.utils.parsedate_to_datetime(last_mod).timestamp()
+        except Exception:
+            log.exception("🐛 Received invalid value for Last-Modified: %s", last_mod)
+
+
+def last_modified_from_file(path: Path):
+    return path.stat().st_mtime
+
+
+def download(
+    url: str,
+    file_path: Union[Path, str] = None,
+    *,
+    replace_existing: bool = None,
+    only_if_newer: bool = False,
+    timeout: float = None,
+    verify_ssl: bool = True,
+    chunk_callback=None,
+    response_callback=None,
+):
+    """Download a file.
+
+    Existing files will not be overwritten unless `replace_existing` is set.
+    Setting `only_if_newer` will check if the remote file is newer than the
+    local file, otherwise the download will be aborted.
+    """
+    if replace_existing is None:
+        replace_existing = only_if_newer
+
+    file_exists = None
+    if file_path is not None:
+        file_path = Path(file_path)
+
+        file_exists = file_path.exists() and file_path.stat().st_size
+        if file_exists and not replace_existing:
+            raise FileExistsError(23, "Would replace existing file", str(file_path))
+
+    with session() as s:
+
+        headers = {}
+        if file_exists and only_if_newer:
+            assert file_path
+            file_lastmod = last_modified_from_file(file_path)
+            headers["If-Modified-Since"] = email.utils.formatdate(
+                file_lastmod, usegmt=True
+            )
+
+        req = s.prepare_request(requests.Request("GET", url, headers=headers))
+
+        log.debug("⚡️ loading %s (%s) ...", req.url, req.headers)
+        resp = s.send(
+            req, allow_redirects=True, stream=True, timeout=timeout, verify=verify_ssl
+        )
+
+        if response_callback is not None:
+            try:
+                response_callback(resp)
+            except Exception:
+                log.exception("🐛 Error in response callback.")
+
+        log.debug("☕️ Response status: %s; headers: %s", resp.status_code, resp.headers)
+
+        resp.raise_for_status()
+
+        if resp.status_code == codes.not_modified:
+            log.debug("✋ Remote file has not changed, skipping download.")
+            return
+
+        if file_path is None:
+            return resp.content
+
+        assert replace_existing or not file_exists
+
+        resp_lastmod = last_modified_from_response(resp)
+
+        # Check Last-Modified in case the server ignored If-Modified-Since.
+        # XXX also check Content-Length?
+        if file_exists and only_if_newer and resp_lastmod is not None:
+            assert file_lastmod
+
+            if resp_lastmod <= file_lastmod:
+                log.debug("✋ Local file is newer, skipping download.")
+                resp.close()
+                return
+
+        # Create intermediate directories if necessary.
+        download_dir = file_path.parent
+        download_dir.mkdir(parents=True, exist_ok=True)
+
+        # Write content to temp file.
+        tempdir = download_dir
+        tempfd, tempfile_path = tempfile.mkstemp(
+            dir=tempdir, prefix=f".download-{file_path.name}."
+        )
+        one_mb = 2 ** 20
+        chunk_size = 8 * one_mb
+        try:
+            log.debug("💾 Writing to temp file %s ...", tempfile_path)
+            for chunk in resp.iter_content(chunk_size=chunk_size, decode_unicode=False):
+                os.write(tempfd, chunk)
+                if chunk_callback:
+                    try:
+                        chunk_callback(chunk)
+                    except Exception:
+                        log.exception("🐛 Error in chunk callback.")
+        finally:
+            os.close(tempfd)
+
+        # Move downloaded file to destination.
+        if file_exists:
+            log.debug("💾 Replacing existing file: %s", file_path)
+        Path(tempfile_path).replace(file_path)
+
+        # Fix file attributes.
+        if resp_lastmod is not None:
+            os.utime(file_path, (resp_lastmod, resp_lastmod))
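Hypothetical usage of the new download() helper, mirroring how load_from_web calls it; the URL comes from this commit, while the local target path is illustrative:

    from pathlib import Path

    from unwind import request  # assumed module path

    with request.session():
        request.download(
            "https://datasets.imdbws.com/title.ratings.tsv.gz",
            Path("data/imdb/title.ratings.tsv.gz"),
            only_if_newer=True,  # sends If-Modified-Since; a 304 skips the download
        )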
@@ -12,13 +12,14 @@ from starlette.authentication import (
     SimpleUser,
     requires,
 )
+from starlette.background import BackgroundTask
 from starlette.exceptions import HTTPException
 from starlette.middleware import Middleware
 from starlette.middleware.authentication import AuthenticationMiddleware
 from starlette.responses import JSONResponse
 from starlette.routing import Mount, Route
 
-from . import config, db, imdb
+from . import config, db, imdb, imdb_import
 from .db import close_connection_pool, find_ratings, open_connection_pool
 from .middleware.responsetime import ResponseTimeMiddleware
 from .models import Group, Movie, User, asplain
@@ -192,6 +193,13 @@ async def add_movie(request):
     pass
 
 
+@route("/movies/_reload_imdb", methods=["POST"])
+@requires(["authenticated", "admin"])
+async def load_imdb_movies(request):
+    task = BackgroundTask(imdb_import.load_from_web)
+    return JSONResponse({"status": "Import started."}, background=task, status_code=202)
+
+
 @route("/users")
 @requires(["authenticated", "admin"])
 async def list_users(request):
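The movie reload responds with 202 Accepted and hands the import to a Starlette BackgroundTask, which runs only after the response has been sent; a minimal standalone sketch of that pattern, with the route path and job invented here:

    from starlette.applications import Starlette
    from starlette.background import BackgroundTask
    from starlette.responses import JSONResponse
    from starlette.routing import Route

    async def long_job():
        ...  # stands in for imdb_import.load_from_web

    async def trigger(request):
        # The client gets the 202 immediately; long_job executes afterwards.
        return JSONResponse(
            {"status": "Import started."},
            status_code=202,
            background=BackgroundTask(long_job),
        )

    app = Starlette(routes=[Route("/trigger", trigger, methods=["POST"])])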
@@ -217,6 +225,15 @@ async def set_rating_for_user(request):
     request.path_params["user_id"]
 
 
+@route("/users/_reload_ratings", methods=["POST"])
+@requires(["authenticated", "admin"])
+async def load_imdb_user_ratings(request):
+
+    ratings = [rating async for rating in imdb.refresh_user_ratings_from_imdb()]
+
+    return JSONResponse({"new_ratings": [asplain(r) for r in ratings]})
+
+
 @route("/groups", methods=["POST"])
 @requires(["authenticated", "admin"])
 async def add_group(request):