add imdb full import mode

ducklet 2021-06-21 18:54:03 +02:00
parent b5cb22822e
commit 7dd10f8bc3
17 changed files with 721 additions and 109 deletions

poetry.lock (generated)

@@ -20,6 +20,28 @@ python-versions = ">=3.6"
[package.extras]
tests = ["pytest", "pytest-asyncio", "mypy (>=0.800)"]
[[package]]
name = "atomicwrites"
version = "1.4.0"
description = "Atomic file writes."
category = "main"
optional = false
python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*"
[[package]]
name = "attrs"
version = "21.2.0"
description = "Classes Without Boilerplate"
category = "main"
optional = false
python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*"
[package.extras]
dev = ["coverage[toml] (>=5.0.2)", "hypothesis", "pympler", "pytest (>=4.3.0)", "six", "mypy", "pytest-mypy-plugins", "zope.interface", "furo", "sphinx", "sphinx-notfound-page", "pre-commit"]
docs = ["furo", "sphinx", "zope.interface", "sphinx-notfound-page"]
tests = ["coverage[toml] (>=5.0.2)", "hypothesis", "pympler", "pytest (>=4.3.0)", "six", "mypy", "pytest-mypy-plugins", "zope.interface"]
tests_no_zope = ["coverage[toml] (>=5.0.2)", "hypothesis", "pympler", "pytest (>=4.3.0)", "six", "mypy", "pytest-mypy-plugins"]
[[package]]
name = "beautifulsoup4"
version = "4.9.3"

@@ -122,6 +144,73 @@ category = "main"
optional = false
python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*"
[[package]]
name = "iniconfig"
version = "1.1.1"
description = "iniconfig: brain-dead simple config-ini parsing"
category = "main"
optional = false
python-versions = "*"
[[package]]
name = "packaging"
version = "20.9"
description = "Core utilities for Python packages"
category = "main"
optional = false
python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*"
[package.dependencies]
pyparsing = ">=2.0.2"
[[package]]
name = "pluggy"
version = "0.13.1"
description = "plugin and hook calling mechanisms for python"
category = "main"
optional = false
python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*"
[package.extras]
dev = ["pre-commit", "tox"]
[[package]]
name = "py"
version = "1.10.0"
description = "library with cross-python path, ini-parsing, io, code, log facilities"
category = "main"
optional = false
python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*"
[[package]]
name = "pyparsing"
version = "2.4.7"
description = "Python parsing module"
category = "main"
optional = false
python-versions = ">=2.6, !=3.0.*, !=3.1.*, !=3.2.*"
[[package]]
name = "pytest"
version = "6.2.4"
description = "pytest: simple powerful testing with Python"
category = "main"
optional = false
python-versions = ">=3.6"
[package.dependencies]
atomicwrites = {version = ">=1.0", markers = "sys_platform == \"win32\""}
attrs = ">=19.2.0"
colorama = {version = "*", markers = "sys_platform == \"win32\""}
iniconfig = "*"
packaging = "*"
pluggy = ">=0.12,<1.0.0a1"
py = ">=1.8.2"
toml = "*"
[package.extras]
testing = ["argcomplete", "hypothesis (>=3.56)", "mock", "nose", "requests", "xmlschema"]
[[package]]
name = "requests"
version = "2.25.1"

@@ -251,7 +340,7 @@ python-versions = "*"
[metadata]
lock-version = "1.1"
python-versions = "^3.9"
-content-hash = "28c14ec611e61db259fa6aa160df99308f7452874f69377a634d07cd379603c8"
+content-hash = "2aed53c1d20035335cf0c0e6439f65369519cd8ea47ae2e6043e49767106ffb0"

[metadata.files]
aiosqlite = [

@@ -262,6 +351,14 @@ asgiref = [
    {file = "asgiref-3.3.4-py3-none-any.whl", hash = "sha256:92906c611ce6c967347bbfea733f13d6313901d54dcca88195eaeb52b2a8e8ee"},
    {file = "asgiref-3.3.4.tar.gz", hash = "sha256:d1216dfbdfb63826470995d31caed36225dcaf34f182e0fa257a4dd9e86f1b78"},
]
atomicwrites = [
    {file = "atomicwrites-1.4.0-py2.py3-none-any.whl", hash = "sha256:6d1784dea7c0c8d4a5172b6c620f40b6e4cbfdf96d783691f2e1302a7b88e197"},
    {file = "atomicwrites-1.4.0.tar.gz", hash = "sha256:ae70396ad1a434f9c7046fd2dd196fc04b12f9e91ffb859164193be8b6168a7a"},
]
attrs = [
    {file = "attrs-21.2.0-py2.py3-none-any.whl", hash = "sha256:149e90d6d8ac20db7a955ad60cf0e6881a3f20d37096140088356da6c716b0b1"},
    {file = "attrs-21.2.0.tar.gz", hash = "sha256:ef6aaac3ca6cd92904cdd0d83f629a15f18053ec84e6432106f7a4d04ae4f5fb"},
]
beautifulsoup4 = [
    {file = "beautifulsoup4-4.9.3-py2-none-any.whl", hash = "sha256:4c98143716ef1cb40bf7f39a8e3eec8f8b009509e74904ba3a7b315431577e35"},
    {file = "beautifulsoup4-4.9.3-py3-none-any.whl", hash = "sha256:fff47e031e34ec82bf17e00da8f592fe7de69aeea38be00523c04623c04fb666"},

@@ -299,6 +396,30 @@ idna = [
    {file = "idna-2.10-py2.py3-none-any.whl", hash = "sha256:b97d804b1e9b523befed77c48dacec60e6dcb0b5391d57af6a65a312a90648c0"},
    {file = "idna-2.10.tar.gz", hash = "sha256:b307872f855b18632ce0c21c5e45be78c0ea7ae4c15c828c20788b26921eb3f6"},
]
iniconfig = [
    {file = "iniconfig-1.1.1-py2.py3-none-any.whl", hash = "sha256:011e24c64b7f47f6ebd835bb12a743f2fbe9a26d4cecaa7f53bc4f35ee9da8b3"},
    {file = "iniconfig-1.1.1.tar.gz", hash = "sha256:bc3af051d7d14b2ee5ef9969666def0cd1a000e121eaea580d4a313df4b37f32"},
]
packaging = [
    {file = "packaging-20.9-py2.py3-none-any.whl", hash = "sha256:67714da7f7bc052e064859c05c595155bd1ee9f69f76557e21f051443c20947a"},
    {file = "packaging-20.9.tar.gz", hash = "sha256:5b327ac1320dc863dca72f4514ecc086f31186744b84a230374cc1fd776feae5"},
]
pluggy = [
    {file = "pluggy-0.13.1-py2.py3-none-any.whl", hash = "sha256:966c145cd83c96502c3c3868f50408687b38434af77734af1e9ca461a4081d2d"},
    {file = "pluggy-0.13.1.tar.gz", hash = "sha256:15b2acde666561e1298d71b523007ed7364de07029219b604cf808bfa1c765b0"},
]
py = [
    {file = "py-1.10.0-py2.py3-none-any.whl", hash = "sha256:3b80836aa6d1feeaa108e046da6423ab8f6ceda6468545ae8d02d9d58d18818a"},
    {file = "py-1.10.0.tar.gz", hash = "sha256:21b81bda15b66ef5e1a777a21c4dcd9c20ad3efd0b3f817e7a809035269e1bd3"},
]
pyparsing = [
    {file = "pyparsing-2.4.7-py2.py3-none-any.whl", hash = "sha256:ef9d7589ef3c200abe66653d3f1ab1033c3c419ae9b9bdb1240a85b024efc88b"},
    {file = "pyparsing-2.4.7.tar.gz", hash = "sha256:c203ec8783bf771a155b207279b9bccb8dea02d8f0c9e5f8ead507bc3246ecc1"},
]
pytest = [
    {file = "pytest-6.2.4-py3-none-any.whl", hash = "sha256:91ef2131a9bd6be8f76f1f08eac5c5317221d6ad1e143ae03894b862e8976890"},
    {file = "pytest-6.2.4.tar.gz", hash = "sha256:50bcad0a0b9c5a72c8e4e7c9855a3ad496ca6a881a3641b4260605450772c54b"},
]
requests = [
    {file = "requests-2.25.1-py2.py3-none-any.whl", hash = "sha256:c210084e36a42ae6b9219e00e48287def368a26d03a048ddad7bfee44f75871e"},
    {file = "requests-2.25.1.tar.gz", hash = "sha256:27973dd4a904a4f13b263a19c866c13b92a39ed1c964655f025f3f8d3d75b804"},

pyproject.toml

@@ -15,6 +15,7 @@ ulid-py = "^1.1.0"
databases = {extras = ["sqlite"], version = "^0.4.3"}
toml = "^0.10.2"
uvicorn = "^0.14.0"
pytest = "^6.2.4"
[tool.poetry.dev-dependencies]

run

@@ -12,4 +12,4 @@ shift
# export DEBUG=1
# export UNWIND_LOGLEVEL=DEBUG
-exec scripts/"$task" "$@"
+exec "$RUN_DIR"/scripts/"$task" "$@"


@@ -1,5 +1,7 @@
#!/bin/sh -eu
cd "$RUN_DIR"
[ -z "${DEBUG:-}" ] || set -x
exec python -m unwind "$@"


@@ -1,5 +1,7 @@
#!/bin/sh -eu
cd "$RUN_DIR"
[ -z "${DEBUG:-}" ] || set -x
exec uvicorn unwind:web_app --reload

scripts/load_imdb_dumps (new executable file)

@@ -0,0 +1,18 @@
#!/bin/sh -eu
datadir="$RUN_DIR"/data
[ -z "${DEBUG:-}" ] || set -x
# See
# - https://www.imdb.com/interfaces/
# - https://datasets.imdbws.com/
wget -N \
    --no-directories \
    --directory-prefix "$datadir" \
    https://datasets.imdbws.com/title.basics.tsv.gz \
    https://datasets.imdbws.com/title.ratings.tsv.gz

"$RUN_BIN" app import-imdb-dataset \
    --basics "$datadir"/title.basics.tsv.gz \
    --ratings "$datadir"/title.ratings.tsv.gz


@@ -1,5 +1,7 @@
#!/bin/sh -eu
cd "$RUN_DIR"
[ -z "${DEBUG:-}" ] || set -x
exec uvicorn --host 0.0.0.0 unwind:web_app

scripts/tests (new executable file)

@@ -0,0 +1,7 @@
#!/bin/sh -eu
cd "$RUN_DIR"
[ -z "${DEBUG:-}" ] || set -x
exec python -m pytest "$@"

tests/test_imdb.py (new file)

@@ -0,0 +1,19 @@
import pytest

from unwind.imdb import imdb_rating_from_score, score_from_imdb_rating


@pytest.mark.parametrize("rating", (x / 10 for x in range(10, 101)))
def test_rating_conversion(rating):
    assert rating == imdb_rating_from_score(score_from_imdb_rating(rating))


@pytest.mark.parametrize("score", range(0, 101))
def test_score_conversion(score):
    # Because our score covers 101 discrete values and IMDb's rating only 91
    # discrete values, the mapping is non-injective, i.e. 10 values can't be
    # mapped uniquely.
    non_injective = set(range(5, 100, 10))
    if score in non_injective:
        pytest.skip(f"Score cannot be mapped back correctly: {score}")
    assert score == score_from_imdb_rating(imdb_rating_from_score(score))
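For reference, the conversion pair under test is added to unwind/imdb.py later in this commit. A short worked example of the arithmetic, with inlined copies of the two functions and values picked to show one of the ten collisions the skip above avoids:

def imdb_rating_from_score(score: int) -> float:  # inlined from unwind/imdb.py
    return round(score * 9 / 100 + 1, 1)

def score_from_imdb_rating(rating: float) -> int:  # inlined from unwind/imdb.py
    return round(100 * (rating - 1) / 9)

assert score_from_imdb_rating(7.3) == 70  # (7.3 - 1) / 9 * 100 = 70.0
assert imdb_rating_from_score(70) == 7.3  # 70 * 9 / 100 + 1 = 7.3
assert score_from_imdb_rating(imdb_rating_from_score(55)) == 56  # collision: 55 and 56 both map to 6.0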

unwind/__main__.py

@@ -1,15 +1,18 @@
import argparse
import asyncio
import logging
from pathlib import Path

from . import config
from .db import close_connection_pool, open_connection_pool
from .imdb import load_imdb
from .imdb_import import import_from_file
from .request import session

log = logging.getLogger(__name__)


-async def run_import():
+async def run_load_user_ratings_from_imdb():
    await open_connection_pool()
    with session() as s:

@@ -22,6 +25,60 @@ async def run_import():
    await close_connection_pool()
async def run_import_imdb_dataset(basics_path: Path, ratings_path: Path):
    await open_connection_pool()
    await import_from_file(basics_path=basics_path, ratings_path=ratings_path)
    await close_connection_pool()


def getargs():
    parser = argparse.ArgumentParser()
    commands = parser.add_subparsers(required=True)

    parser_import_imdb_dataset = commands.add_parser(
        "import-imdb-dataset",
        help="Import IMDb datasets.",
        description="""
            Import IMDb datasets.
            New datasets available from https://www.imdb.com/interfaces/.
        """,
    )
    parser_import_imdb_dataset.add_argument(
        dest="mode",
        action="store_const",
        const="import-imdb-dataset",
    )
    parser_import_imdb_dataset.add_argument(
        "--basics", metavar="basics_file.tsv.gz", type=Path, required=True
    )
    parser_import_imdb_dataset.add_argument(
        "--ratings", metavar="ratings_file.tsv.gz", type=Path, required=True
    )

    parser_load_user_ratings_from_imdb = commands.add_parser(
        "load-user-ratings-from-imdb",
        help="Load user ratings from imdb.com.",
        description="""
            Refresh user ratings for all registered users live from IMDb's website.
        """,
    )
    parser_load_user_ratings_from_imdb.add_argument(
        dest="mode",
        action="store_const",
        const="load-user-ratings-from-imdb",
    )

    try:
        args = parser.parse_args()
    except TypeError:
        parser.print_usage()
        raise
    return args
def main():
    logging.basicConfig(
        format="%(asctime)s.%(msecs)03d [%(name)s:%(process)d] %(levelname)s: %(message)s",

@@ -30,7 +87,15 @@ def main():
    )
    log.debug(f"Log level: {config.loglevel}")

-    asyncio.run(run_import())
+    try:
+        args = getargs()
+    except:
+        return
+    if args.mode == "load-user-ratings-from-imdb":
+        asyncio.run(run_load_user_ratings_from_imdb())
+    elif args.mode == "import-imdb-dataset":
+        asyncio.run(run_import_imdb_dataset(args.basics, args.ratings))


main()
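The positional store_const arguments above are what make args.mode come out of parse_args(); a standalone sketch of that pattern (subcommand name reused from the commit, everything else minimal):

import argparse

parser = argparse.ArgumentParser()
commands = parser.add_subparsers(required=True)
sub = commands.add_parser("import-imdb-dataset")
# A positional store_const consumes zero tokens and just records which
# subcommand ran; each subparser stores its own constant into args.mode.
sub.add_argument(dest="mode", action="store_const", const="import-imdb-dataset")

args = parser.parse_args(["import-imdb-dataset"])
assert args.mode == "import-imdb-dataset"
# With an empty argv, required subparsers can raise TypeError on some Python
# versions (bpo-29298), which is why getargs() above wraps parse_args().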

unwind/db.py

@@ -6,7 +6,7 @@ from typing import Optional, Type, TypeVar

from databases import Database

from . import config
-from .models import Movie, Rating, User, asplain, fromplain, utcnow
+from .models import Movie, Rating, User, asplain, fromplain, optional_fields

log = logging.getLogger(__name__)

@@ -69,10 +69,14 @@ ModelType = TypeVar("ModelType")

async def get(model: Type[ModelType], **kwds) -> Optional[ModelType]:
    values = {k: v for k, v in kwds.items() if v is not None}
    if not values:
        return
    fields_ = ", ".join(f.name for f in fields(model))
-    cond = " AND ".join(f"{k}=:{k}" for k in kwds)
+    cond = " AND ".join(f"{k}=:{k}" for k, v in values.items())
    query = f"SELECT {fields_} FROM {model._table} WHERE {cond}"
-    row = await shared_connection().fetch_one(query=query, values=kwds)
+    row = await shared_connection().fetch_one(query=query, values=values)
    return fromplain(model, row) if row else None
@@ -95,16 +99,28 @@ async def add_or_update_user(user: User):

async def add_or_update_movie(movie: Movie):
    """Add or update a Movie in the database.

    This is an upsert operation, but it will also update the Movie you pass
    in, making its `id` match the DB movie's `id` and setting any optional
    values that are unset on your Movie but present in the database. It's a
    bidirectional sync.
    """
    db_movie = await get(Movie, imdb_id=movie.imdb_id)
    if not db_movie:
        await add(movie)
    else:
        movie.id = db_movie.id
-        movie.updated = db_movie.updated
-        if movie != db_movie:
-            movie.updated = utcnow()
-            await update(movie)
+        # We want to keep any existing value in the DB for all optional fields.
+        for f in optional_fields(movie):
+            if getattr(movie, f.name) is None:
+                setattr(movie, f.name, getattr(db_movie, f.name))
+        if movie.updated <= db_movie.updated:
+            return
+        await update(movie)
async def add_or_update_rating(rating: Rating) -> bool: async def add_or_update_rating(rating: Rating) -> bool:
@@ -147,38 +163,48 @@ async def find_ratings(
        values["escape"] = "#"
        escaped_title = sql_escape(title, char=values["escape"])
        values["pattern"] = "%" + "%".join(escaped_title.split()) + "%"
-        conditions.append("movies.title LIKE :pattern ESCAPE :escape")
+        values["opattern"] = values["pattern"]
+        values["oescape"] = values["escape"]
+        conditions.append(
+            f"""
+            (
+                {Movie._table}.title LIKE :pattern ESCAPE :escape
+                OR {Movie._table}.original_title LIKE :opattern ESCAPE :oescape
+            )
+            """
+        )
    if media_type:
        values["media_type"] = media_type
-        conditions.append("movies.media_type=:media_type")
+        conditions.append(f"{Movie._table}.media_type=:media_type")
    if ignore_tv_episodes:
-        conditions.append("movies.media_type!='TV Episode'")
+        conditions.append(f"{Movie._table}.media_type!='TV Episode'")

    query = f"""
        WITH newest_movies
        AS (
-            SELECT DISTINCT ratings.movie_id
-            FROM ratings
-            LEFT JOIN movies ON movies.id=ratings.movie_id
+            SELECT DISTINCT {Rating._table}.movie_id
+            FROM {Rating._table}
+            LEFT JOIN {Movie._table} ON {Movie._table}.id={Rating._table}.movie_id
            {('WHERE ' + ' AND '.join(conditions)) if conditions else ''}
-            ORDER BY length(movies.title) ASC, ratings.rating_date DESC
+            ORDER BY length({Movie._table}.title) ASC, {Rating._table}.rating_date DESC
            LIMIT :limit_rows
        )
        SELECT
-            users.name AS user_name,
-            ratings.score AS user_score,
-            movies.score AS imdb_score,
-            movies.imdb_id AS movie_imdb_id,
-            movies.media_type AS media_type,
-            movies.title AS movie_title,
-            movies.release_year AS release_year
+            {User._table}.name AS user_name,
+            {Rating._table}.score AS user_score,
+            {Movie._table}.score AS imdb_score,
+            {Movie._table}.imdb_id AS movie_imdb_id,
+            {Movie._table}.media_type AS media_type,
+            {Movie._table}.title AS canonical_title,
+            {Movie._table}.original_title AS original_title,
+            {Movie._table}.release_year AS release_year
        FROM newest_movies
-        LEFT JOIN ratings ON ratings.movie_id=newest_movies.movie_id
-        LEFT JOIN users ON users.id=ratings.user_id
-        LEFT JOIN movies ON movies.id=ratings.movie_id
+        LEFT JOIN {Rating._table} ON {Rating._table}.movie_id=newest_movies.movie_id
+        LEFT JOIN {User._table} ON {User._table}.id={Rating._table}.user_id
+        LEFT JOIN {Movie._table} ON {Movie._table}.id={Rating._table}.movie_id
    """
    rows = await shared_connection().fetch_all(query=query, values=values)
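The docstring on add_or_update_movie describes a bidirectional sync; a minimal standalone sketch of just the merge rules (SimpleMovie and merge are hypothetical stand-ins, not part of the codebase):

from dataclasses import dataclass
from typing import Optional

@dataclass
class SimpleMovie:
    imdb_id: str
    title: str
    score: Optional[int] = None      # optional: keep the DB value if unset
    runtime: Optional[int] = None    # optional: keep the DB value if unset
    updated: int = 0                 # stands in for the `updated` timestamp

def merge(incoming: SimpleMovie, db_movie: SimpleMovie) -> bool:
    """Backfill unset optional fields, then decide whether to write."""
    for name in ("score", "runtime"):
        if getattr(incoming, name) is None:
            setattr(incoming, name, getattr(db_movie, name))
    return incoming.updated > db_movie.updated  # only write newer data

old = SimpleMovie("tt0111161", "The Shawshank Redemption", score=93, runtime=142, updated=1)
new = SimpleMovie("tt0111161", "The Shawshank Redemption", score=94, updated=2)
assert merge(new, old) is True
assert new.runtime == 142  # the runtime survived the update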

unwind/imdb.py

@@ -6,8 +6,8 @@ from typing import Optional
from urllib.parse import urljoin

from .db import add_or_update_movie, add_or_update_rating, add_or_update_user
-from .models import Movie, Rating, User, asplain, fromplain
-from .request import soup_from_url
+from .models import Movie, Rating, User
+from .request import cache_path, soup_from_url

log = logging.getLogger(__name__)
@@ -38,6 +38,26 @@ def imdb_url(user_id):
    return f"https://www.imdb.com/user/{user_id}/ratings"


def imdb_rating_from_score(score: int) -> float:
    """Return the IMDb rating for an Unwind Movie score."""
    assert 0 <= score <= 100
    rating = round(score * 9 / 100 + 1, 1)
    assert 1.0 <= rating <= 10.0
    return rating


def score_from_imdb_rating(rating: float) -> int:
    """Return the Unwind Movie score for an IMDb rating."""
    # Scale IMDb's 10-point rating to our score range of [0, 100].
    # There's a pitfall here! You might think this is simply IMDb's rating
    # times 10, *but* the lowest possible rating on IMDb is actually 1, so
    # only the range [1.0, 10.0] has to be spread over our 101 score values.
    assert 1.0 <= rating <= 10.0
    score = round(100 * (rating - 1) / 9)
    assert 0 <= score <= 100
    return score
find_name = re.compile(r"(?P<name>.*)'s Ratings").fullmatch
find_rating_date = re.compile(r"Rated on (?P<date>\d{2} \w+ \d{4})").fullmatch
find_runtime = re.compile(r"((?P<h>\d+) hr)? ?((?P<m>\d+) min)?").fullmatch

@@ -50,67 +70,88 @@ find_year = re.compile(
find_movie_id = re.compile(r"/title/(?P<id>tt\d+)/").search
def movie_and_rating_from_item(item) -> tuple[Movie, Rating]:
    movie = Movie(
        title=item.h3.a.string.strip(),
        genres=set(s.strip() for s in item.find("span", "genre").string.split(",")),
    )
    episode_br = item.h3.br
    if episode_br:
        episode_a = episode_br.find_next("a")
        if not episode_a:
            raise ValueError("Unknown document structure.")
        movie.media_type = "TV Episode"
        movie.title += " / " + episode_a.string.strip()
        if match := find_year(episode_br.find_next("span", "lister-item-year").string):
            movie.release_year = int(match["year"])
        if match := find_movie_id(episode_a["href"]):
            movie.imdb_id = match["id"]
    if (tag := item.find("span", "runtime")) and (match := find_runtime(tag.string)):
        movie.runtime = int(match["h"] or 0) * 60 + int(match["m"] or 0)
    if not episode_br:
        if match := find_year(item.h3.find("span", "lister-item-year").string):
            if media_type := match["type"]:
                movie.media_type = media_type.strip()
            movie.release_year = int(match["year"])
        if match := find_movie_id(item.h3.a["href"]):
            movie.imdb_id = match["id"]
    if not movie.media_type:
        movie.media_type = "Movie"

    rating = Rating()
    ratings_item = item.find("div", "ipl-rating-widget")
    if match := find_rating_date(ratings_item.find_next("p", "text-muted").string):
        rating.rating_date = datetime.strptime(match["date"], "%d %b %Y")
    if match := ratings_item.find("div", "ipl-rating-star--other-user"):
        if rating_item := match.find("span", "ipl-rating-star__rating"):
            rating.score = score_from_imdb_rating(float(rating_item.string))
    if match := ratings_item.find("div", "ipl-rating-star small"):
        if rating_item := match.find("span", "ipl-rating-star__rating"):
            movie.score = score_from_imdb_rating(float(rating_item.string))
    return movie, rating


ForgedRequest = namedtuple("ForgedRequest", "url headers")
async def parse_page(url, stop_on_dupe=True) -> Optional[str]:
    soup = soup_from_url(url)
-    user = User(imdb_id=soup.find("meta", property="pageId")["content"], name="")
-    if match := find_name(soup.h1.string):
+    meta = soup.find("meta", property="pageId")
+    headline = soup.h1
+    assert meta is not None and headline is not None
+    user = User(imdb_id=meta["content"], name="")
+    if match := find_name(headline.string):
        user.name = match["name"]
    await add_or_update_user(user)

    items = soup.find_all("div", "lister-item-content")
    for i, item in enumerate(items):
-        movie = Movie(
-            title=item.h3.a.string.strip(),
-            genres=set(s.strip() for s in item.find("span", "genre").string.split(",")),
-        )
-        episode_br = item.h3.br
-        if episode_br:
-            episode_a = episode_br.find_next("a")
-            if not episode_a:
-                log.error("Unknown document structure.")
-                continue
-            movie.media_type = "TV Episode"
-            movie.title += " / " + episode_a.string.strip()
-            if match := find_year(
-                episode_br.find_next("span", "lister-item-year").string
-            ):
-                movie.release_year = int(match["year"])
-            if match := find_movie_id(episode_a["href"]):
-                movie.imdb_id = match["id"]
-        rating = Rating(user_id=user.id)
-        if (tag := item.find("span", "runtime")) and (
-            match := find_runtime(tag.string)
-        ):
-            movie.runtime = int(match["h"] or 0) * 60 + int(match["m"] or 0)
-        if not episode_br:
-            if match := find_year(item.h3.find("span", "lister-item-year").string):
-                if media_type := match["type"]:
-                    movie.media_type = media_type.strip()
-                movie.release_year = int(match["year"])
-            if match := find_movie_id(item.h3.a["href"]):
-                movie.imdb_id = match["id"]
-        ratings_item = item.find("div", "ipl-rating-widget")
-        if match := find_rating_date(ratings_item.find_next("p", "text-muted").string):
-            rating.rating_date = datetime.strptime(match["date"], "%d %b %Y")
-        for rating_item in ratings_item.find_all("span", "ipl-rating-star__rating")[:2]:
-            if "ipl-rating-star--other-user" in rating_item.parent["class"]:
-                rating.score = int(float(rating_item.string) * 10)
-            else:
-                movie.score = int(float(rating_item.string) * 10)
-        if not movie.media_type:
-            movie.media_type = "Movie"
+        try:
+            movie, rating = movie_and_rating_from_item(item)
+        except Exception as err:
+            log.error(
+                "Error in %s item #%s (%s): %s: %s",
+                url,
+                i,
+                cache_path(ForgedRequest(url, headers={})),
+                " ".join(item.h3.stripped_strings),
+                err,
+            )
+            continue
        await add_or_update_movie(movie)
+        rating.user_id = user.id
        rating.movie_id = movie.id  # needs to be set _after_ movie has been updated
        is_updated = await add_or_update_rating(rating)

@@ -118,9 +159,9 @@ async def parse_page(url, stop_on_dupe=True) -> Optional[str]:
        log.info("Import stopped after %s items. Caught up to known state. ✋", i)
        return None

-    next_url = urljoin(
-        url, soup.find("div", "footer").find(string=re.compile(r"Next")).parent["href"]
-    )
+    footer = soup.find("div", "footer")
+    assert footer is not None
+    next_url = urljoin(url, footer.find(string=re.compile(r"Next")).parent["href"])
    return next_url if url != next_url else None
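The ForgedRequest namedtuple above exists only so the error log can name the cache file without building a real PreparedRequest; cache_path (added to unwind/request.py below) reads nothing but .url, since headers are commented out of the signature. A sketch of why that duck-typing works, with a made-up cachedir:

from collections import namedtuple
from hashlib import md5
from pathlib import Path

ForgedRequest = namedtuple("ForgedRequest", "url headers")

def cache_path(req, cachedir=Path("/tmp/unwind-cache")):  # cachedir invented
    sig = repr(req.url)  # headers deliberately left out of the signature
    return cachedir / md5(sig.encode()).hexdigest()

req = ForgedRequest("https://www.imdb.com/user/ur0000000/ratings", headers={})
print(cache_path(req))  # the path http_get would have cached this URL under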

unwind/imdb_import.py (new file)

@@ -0,0 +1,206 @@
import csv
import gzip
import logging
from dataclasses import dataclass, fields
from datetime import datetime, timezone
from pathlib import Path
from typing import Optional, get_origin

from . import db
from .db import add_or_update_movie
from .imdb import score_from_imdb_rating
from .models import Movie, optional_type

log = logging.getLogger(__name__)

# See
# - https://www.imdb.com/interfaces/
# - https://datasets.imdbws.com/
@dataclass
class BasicRow:
    tconst: str
    titleType: str
    primaryTitle: str
    originalTitle: str
    isAdult: bool
    startYear: Optional[int]
    endYear: Optional[int]
    runtimeMinutes: Optional[int]
    genres: Optional[set[str]]

    @classmethod
    def from_row(cls, row):
        vals = []
        for f, r in zip(fields(cls), row):
            ttype = f.type
            is_opt = False
            if (otype := optional_type(ttype)) is not None:
                ttype = otype
                is_opt = True
            if (otype := get_origin(ttype)) is not None:
                ttype = otype
            if r == r"\N":
                if is_opt:
                    vals.append(None)
                else:
                    raise ValueError(f"Unexpected null value for field: {f.name}")
            elif f.name == "genres":
                vals.append(set(r.split(",")))
            elif f.name == "isAdult":
                assert r in "01"
                vals.append(r == "1")
            else:
                vals.append(ttype(r))
        inst = cls(*vals)
        assert inst.titleType in title_types
        return inst

    def as_movie(self):
        assert self.startYear is not None
        return Movie(
            title=self.primaryTitle,
            original_title=self.originalTitle,
            release_year=self.startYear,
            media_type=title_types[self.titleType],
            imdb_id=self.tconst,
            score=None,
            runtime=self.runtimeMinutes,
            genres=self.genres or set(),
        )
@dataclass
class RatingRow:
    tconst: str
    averageRating: float
    numVotes: int

    @classmethod
    def from_row(cls, row):
        inst = cls(*(f.type(r) for f, r in zip(fields(cls), row)))
        assert inst.tconst != r"\N"
        return inst

    def as_movie(self):
        return Movie(
            imdb_id=self.tconst,
            score=score_from_imdb_rating(self.averageRating),
        )


title_types = {
    "movie": "Movie",
    "radioEpisode": "Radio Episode",
    "radioSeries": "Radio Series",
    "short": "Short",
    "tvEpisode": "TV Episode",
    "tvMiniSeries": "TV Mini Series",
    "tvMovie": "TV Movie",
    "tvSeries": "TV Series",
    "tvShort": "TV Short",
    "tvSpecial": "TV Special",
    "video": "Video",
    "videoGame": "Video Game",
}
def gz_mtime(path) -> datetime:
    """Return the timestamp of the compressed file."""
    g = gzip.GzipFile(path, "rb")
    g.peek(1)  # start reading the file to fill the timestamp field
    assert g.mtime is not None
    # Interpret the raw epoch seconds as UTC directly; converting to local
    # time first and then stamping UTC on would shift the timestamp.
    return datetime.fromtimestamp(g.mtime, tz=timezone.utc)


def count_lines(path) -> int:
    i = 0
    with gzip.open(path, "rt") as f:
        for i, _ in enumerate(f, start=1):
            pass
    return i


def read_imdb_tsv(path, row_type):
    with gzip.open(path, "rt", newline="") as f:
        rows = csv.reader(f, delimiter="\t", quoting=csv.QUOTE_NONE)
        # skip header line
        rows = iter(rows)
        header = next(rows)
        try:
            assert tuple(f.name for f in fields(row_type)) == tuple(header)
        except AssertionError:
            log.error("Unexpected header line: %s", header)
            raise
        for i, row in enumerate(rows, start=1):
            try:
                yield row_type.from_row(row)
            except Exception as err:
                log.error("Error in line %s: %s", i, row, exc_info=err)
                raise


def read_ratings(path):
    mtime = gz_mtime(path)
    rows = read_imdb_tsv(path, RatingRow)
    for row in rows:
        m = row.as_movie()
        m.updated = mtime
        yield m


def read_basics(path):
    mtime = gz_mtime(path)
    rows = read_imdb_tsv(path, BasicRow)
    for row in rows:
        if row.startYear is None:
            log.debug("Skipping movie, missing year: %s", row)
            continue
        m = row.as_movie()
        m.updated = mtime
        yield m
async def import_from_file(basics_path: Path, ratings_path: Path):
    log.info("Loading scores ... 💾")
    scores = {m.imdb_id: m.score for m in read_ratings(ratings_path)}

    log.info("Importing movies ... 💾")
    total = count_lines(basics_path)
    assert total != 0
    perc = 0.0
    perc_step = 0.001
    async with db.shared_connection().transaction():
        for i, m in enumerate(read_basics(basics_path)):
            if i / total > perc:
                log.info("Imported %s%%", round(perc * 100, 1))
                perc += perc_step
            if m.media_type not in {
                "Movie",
                "Short",
                "TV Mini Series",
                "TV Movie",
                "TV Series",
                "TV Short",
                "TV Special",
                "Video",
            }:
                log.debug("Skipping movie, unwanted media type: %s", m.media_type)
                continue
            m.score = scores.get(m.imdb_id)
            await add_or_update_movie(m)
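read_imdb_tsv only needs a gzipped, tab-separated file whose header matches the row type's field names; a self-contained smoke test for the reader above (the temp path and the two sample rows are invented):

import gzip
import tempfile
from pathlib import Path

sample = (
    "tconst\taverageRating\tnumVotes\n"
    "tt0000001\t5.7\t1986\n"
    "tt0000002\t5.8\t265\n"
)
path = Path(tempfile.mkdtemp()) / "title.ratings.tsv.gz"
with gzip.open(path, "wt", newline="") as f:
    f.write(sample)

# Header validation and per-row conversion both happen inside the generator.
rows = list(read_imdb_tsv(path, RatingRow))
assert rows[0].averageRating == 5.7 and rows[1].numVotes == 265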


@@ -1,31 +1,32 @@
PRAGMA foreign_keys = ON;;

CREATE TABLE IF NOT EXISTS users (
-    id TEXT NOT NULL PRIMARY KEY,
+    id TEXT PRIMARY KEY NOT NULL,
    imdb_id TEXT NOT NULL UNIQUE,
    name TEXT NOT NULL
);;

CREATE TABLE IF NOT EXISTS movies (
-    id TEXT NOT NULL PRIMARY KEY,
+    id TEXT PRIMARY KEY NOT NULL,
    title TEXT NOT NULL,
-    release_year NUMBER NOT NULL,
+    original_title TEXT,
+    release_year INTEGER NOT NULL,
    media_type TEXT NOT NULL,
    imdb_id TEXT NOT NULL UNIQUE,
-    score NUMBER NOT NULL,
-    runtime NUMBER,
+    score INTEGER,
+    runtime INTEGER,
    genres TEXT NOT NULL,
    updated TEXT NOT NULL
);;

CREATE TABLE IF NOT EXISTS ratings (
-    id TEXT NOT NULL PRIMARY KEY,
+    id TEXT PRIMARY KEY NOT NULL,
    movie_id TEXT NOT NULL,
    user_id TEXT NOT NULL,
-    score NUMBER NOT NULL,
+    score INTEGER NOT NULL,
    rating_date TEXT NOT NULL,
-    favorite NUMBER,
-    finished NUMBER,
+    favorite BOOL,
+    finished BOOL,
    FOREIGN KEY(movie_id) REFERENCES movies(id),
    FOREIGN KEY(user_id) REFERENCES users(id)
);;
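The ;; separators suggest the schema is split into single statements before execution rather than run through executescript(); a sketch under that assumption, which also shows that movies.score is now nullable:

import sqlite3

schema = open("schema.sql").read()  # hypothetical path to the file above
con = sqlite3.connect(":memory:")
for statement in schema.split(";;"):  # assumed split convention, not confirmed
    if statement.strip():
        con.execute(statement)

# movies.score lost its NOT NULL, so dataset rows without a rating insert fine.
# The id is really a ULID in the app; 'movie-0001' is just a stand-in here.
con.execute(
    "INSERT INTO movies (id, title, release_year, media_type, imdb_id, genres, updated)"
    " VALUES ('movie-0001', 'Example', 2021, 'Movie', 'tt9999999', '[]', '2021-06-21')"
)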

unwind/models.py

@@ -1,5 +1,5 @@
import json
-from dataclasses import asdict, dataclass, field, fields, is_dataclass
+from dataclasses import asdict, dataclass, field, fields
from datetime import datetime, timezone
from typing import Any, ClassVar, Optional, Type, Union, get_args, get_origin

@@ -25,6 +25,12 @@ def optional_type(tp: Type):
    return args[0]


def optional_fields(o):
    for f in fields(o):
        if is_optional(f.type):
            yield f
def asplain(o) -> dict[str, Any]:
    validate(o)

@@ -56,9 +62,6 @@ def asplain(o) -> dict[str, Any]:

def fromplain(cls, d: dict[str, Any]):
-    # if not is_dataclass(cls):
-    #     raise TypeError(f'Not a dataclass: {type(cls)}')
    dd = {}
    for f in fields(cls):
@@ -107,11 +110,14 @@ class Movie:
    _table: ClassVar[str] = "movies"

    id: ULID = field(default_factory=ULID)
-    title: str = None  # canonical title
+    title: str = None  # canonical title (usually English)
+    original_title: Optional[str] = None  # original title (usually transcribed to Latin script)
    release_year: int = None  # canonical release date
-    media_type: Optional[str] = None
+    media_type: str = None
    imdb_id: str = None
-    score: int = None  # range: [0,100]
+    score: Optional[int] = None  # range: [0,100]
    runtime: Optional[int] = None  # minutes
    genres: set[str] = None
    updated: datetime = field(default_factory=utcnow)
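optional_fields is what add_or_update_movie iterates to backfill unset values; a sketch of the Optional-detection it relies on (is_optional is reconstructed as the obvious counterpart of optional_type, and MiniMovie is a trimmed, hypothetical stand-in for Movie):

from dataclasses import dataclass, fields
from typing import Optional, Union, get_args, get_origin

def is_optional(tp) -> bool:
    # Optional[X] is Union[X, None], so check for a Union containing NoneType.
    return get_origin(tp) is Union and type(None) in get_args(tp)

def optional_fields(o):
    for f in fields(o):
        if is_optional(f.type):
            yield f

@dataclass
class MiniMovie:  # hypothetical stand-in for the Movie model above
    title: str = None
    original_title: Optional[str] = None
    score: Optional[int] = None
    runtime: Optional[int] = None

assert [f.name for f in optional_fields(MiniMovie())] == [
    "original_title", "score", "runtime"
]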

unwind/request.py

@@ -5,9 +5,10 @@ from contextlib import contextmanager
from dataclasses import dataclass
from functools import wraps
from hashlib import md5
from pathlib import Path
from random import random
from time import sleep, time
-from typing import Callable
+from typing import Callable, Optional

import bs4
import requests

@@ -130,16 +131,19 @@ class RedirectError(RuntimeError):
        super().__init__(f"Redirected: {from_url} -> {to_url}")


def cache_path(req) -> Optional[Path]:
    if not config.cachedir:
        return
    sig = repr(req.url)  # + repr(sorted(req.headers.items()))
    return config.cachedir / md5(sig.encode()).hexdigest()
@throttle(1, 1, random)
def http_get(s: requests.Session, url: str, *args, **kwds) -> requests.Response:
    req = s.prepare_request(requests.Request("GET", url, *args, **kwds))
-    if config.debug and config.cachedir:
-        sig = repr(req.url)  # + repr(sorted(req.headers.items()))
-        cachefile = config.cachedir / md5(sig.encode()).hexdigest()
-    else:
-        cachefile = None
+    cachefile = cache_path(req) if config.debug else None
    if cachefile:
        if cachefile.exists():


@@ -1,11 +1,42 @@
-from collections import defaultdict
+import base64
+import binascii

from starlette.applications import Starlette
from starlette.authentication import (
    AuthCredentials,
    AuthenticationBackend,
    AuthenticationError,
    SimpleUser,
    UnauthenticatedUser,
    requires,
)
from starlette.middleware import Middleware
from starlette.middleware.authentication import AuthenticationMiddleware
from starlette.responses import JSONResponse
-from starlette.routing import Route
+from starlette.routing import Mount, Route

-from . import config
+from . import config, db
from .db import close_connection_pool, find_ratings, open_connection_pool
from .models import Movie, asplain
class BasicAuthBackend(AuthenticationBackend):
    async def authenticate(self, request):
        if "Authorization" not in request.headers:
            return
        auth = request.headers["Authorization"]
        try:
            scheme, credentials = auth.split()
            if scheme.lower() != "basic":
                return
            decoded = base64.b64decode(credentials).decode("ascii")
        except (ValueError, UnicodeDecodeError, binascii.Error) as exc:
            raise AuthenticationError("Invalid basic auth credentials") from exc
        username, _, password = decoded.partition(":")
        # TODO: You'd want to verify the username and password here.
        return AuthCredentials(["authenticated"]), SimpleUser(username)
def imdb_url(imdb_id: str):

@@ -29,7 +60,8 @@ async def ratings(request):
        mov = aggr.setdefault(
            r["movie_imdb_id"],
            {
-                "title": r["movie_title"],
+                "canonical_title": r["canonical_title"],
+                "original_title": r["original_title"],
                "year": r["release_year"],
                "link": imdb_url(r["movie_imdb_id"]),
                "user_scores": [],

@@ -44,10 +76,69 @@ async def ratings(request):
    return JSONResponse(resp)
not_found = JSONResponse({"error": "Not Found"}, status_code=404)


async def get_movies(request):
    imdb_id = request.query_params.get("imdb_id")
    movie = await db.get(Movie, imdb_id=imdb_id)
    resp = [asplain(movie)] if movie else []
    return JSONResponse(resp)


@requires(["authenticated", "admin"])
async def add_movie(request):
    pass


@requires(["authenticated", "admin"])
async def add_user(request):
    pass


async def ratings_for_user(request):
    request.path_params["user_id"]


@requires("authenticated")
async def set_rating_for_user(request):
    request.path_params["user_id"]


@requires(["authenticated", "admin"])
async def add_group(request):
    pass


@requires(["authenticated", "admin"])
async def add_user_to_group(request):
    request.path_params["group_id"]


async def get_ratings_for_group(request):
    request.path_params["group_id"]
app = Starlette(
    on_startup=[open_connection_pool],
    on_shutdown=[close_connection_pool],
    routes=[
-        Route("/ratings", ratings),
+        Mount(
+            "/api/v1",
+            routes=[
+                Route("/ratings", ratings),  # XXX legacy, remove.
+                Route("/movies", get_movies),
+                Route("/movies", add_movie, methods=["POST"]),
+                Route("/users", add_user, methods=["POST"]),
+                Route("/users/{user_id}/ratings", ratings_for_user),
+                Route("/users/{user_id}/ratings", set_rating_for_user, methods=["PUT"]),
+                Route("/groups", add_group, methods=["POST"]),
+                Route("/groups/{group_id}/users", add_user_to_group, methods=["POST"]),
+                Route("/groups/{group_id}/ratings", get_ratings_for_group),
+            ],
+        ),
    ],
+    middleware=[Middleware(AuthenticationMiddleware, backend=BasicAuthBackend())],
)
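With the AuthenticationMiddleware in place, protected handlers expect a standard Basic Authorization header; a client-side sketch against the new /api/v1 mount (host, port, user, and password are invented, and the write handlers above are still stubs):

import requests

base = "http://localhost:8000/api/v1"  # assumed uvicorn default port

# Unauthenticated read; get_movies() above already answers this one.
resp = requests.get(f"{base}/movies", params={"imdb_id": "tt0111161"})
print(resp.json())  # [] until the movie exists in the DB

# requests' auth= builds the "Authorization: Basic <base64>" header that
# BasicAuthBackend.authenticate() decodes; note the backend does not verify
# the password yet (see the TODO), and set_rating_for_user is still a stub.
resp = requests.put(
    f"{base}/users/someuser/ratings",
    auth=("someuser", "any-password-for-now"),
    json={"imdb_id": "tt0111161", "score": 95},
)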