From 7dd10f8bc343db3e1bb9cfea475744d8abdf35f4 Mon Sep 17 00:00:00 2001
From: ducklet
Date: Mon, 21 Jun 2021 18:54:03 +0200
Subject: [PATCH] add imdb full import mode

---
 poetry.lock             | 123 +++++++++++++++++++++++-
 pyproject.toml          |   1 +
 run                     |   2 +-
 scripts/app             |   2 +
 scripts/dev             |   2 +
 scripts/load_imdb_dumps |  18 ++++
 scripts/server          |   2 +
 scripts/tests           |   7 ++
 tests/test_imdb.py      |  19 ++++
 unwind/__main__.py      |  69 +++++++++++++-
 unwind/db.py            |  74 ++++++++++-----
 unwind/imdb.py          | 149 ++++++++++++++++++-----------
 unwind/imdb_import.py   | 206 ++++++++++++++++++++++++++++++++++++++++
 unwind/init.sql         |  19 ++--
 unwind/models.py        |  20 ++--
 unwind/request.py       |  16 ++--
 unwind/web.py           | 101 +++++++++++++++++++-
 17 files changed, 721 insertions(+), 109 deletions(-)
 create mode 100755 scripts/load_imdb_dumps
 create mode 100755 scripts/tests
 create mode 100644 tests/test_imdb.py
 create mode 100644 unwind/imdb_import.py

diff --git a/poetry.lock b/poetry.lock
index 31651e7..3891013 100644
--- a/poetry.lock
+++ b/poetry.lock
@@ -20,6 +20,28 @@ python-versions = ">=3.6"
 [package.extras]
 tests = ["pytest", "pytest-asyncio", "mypy (>=0.800)"]
 
+[[package]]
+name = "atomicwrites"
+version = "1.4.0"
+description = "Atomic file writes."
+category = "main"
+optional = false
+python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*"
+
+[[package]]
+name = "attrs"
+version = "21.2.0"
+description = "Classes Without Boilerplate"
+category = "main"
+optional = false
+python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*"
+
+[package.extras]
+dev = ["coverage[toml] (>=5.0.2)", "hypothesis", "pympler", "pytest (>=4.3.0)", "six", "mypy", "pytest-mypy-plugins", "zope.interface", "furo", "sphinx", "sphinx-notfound-page", "pre-commit"]
+docs = ["furo", "sphinx", "zope.interface", "sphinx-notfound-page"]
+tests = ["coverage[toml] (>=5.0.2)", "hypothesis", "pympler", "pytest (>=4.3.0)", "six", "mypy", "pytest-mypy-plugins", "zope.interface"]
+tests_no_zope = ["coverage[toml] (>=5.0.2)", "hypothesis", "pympler", "pytest (>=4.3.0)", "six", "mypy", "pytest-mypy-plugins"]
+
 [[package]]
 name = "beautifulsoup4"
 version = "4.9.3"
@@ -122,6 +144,73 @@ category = "main"
 optional = false
 python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*"
 
+[[package]]
+name = "iniconfig"
+version = "1.1.1"
+description = "iniconfig: brain-dead simple config-ini parsing"
+category = "main"
+optional = false
+python-versions = "*"
+
+[[package]]
+name = "packaging"
+version = "20.9"
+description = "Core utilities for Python packages"
+category = "main"
+optional = false
+python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*"
+
+[package.dependencies]
+pyparsing = ">=2.0.2"
+
+[[package]]
+name = "pluggy"
+version = "0.13.1"
+description = "plugin and hook calling mechanisms for python"
+category = "main"
+optional = false
+python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*"
+
+[package.extras]
+dev = ["pre-commit", "tox"]
+
+[[package]]
+name = "py"
+version = "1.10.0"
+description = "library with cross-python path, ini-parsing, io, code, log facilities"
+category = "main"
+optional = false
+python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*"
+
+[[package]]
+name = "pyparsing"
+version = "2.4.7"
+description = "Python parsing module"
+category = "main"
+optional = false
+python-versions = ">=2.6, !=3.0.*, !=3.1.*, !=3.2.*"
+
+[[package]]
+name = "pytest"
+version = "6.2.4"
+description = "pytest: simple powerful testing with Python"
+category = "main"
+optional = false
+python-versions = ">=3.6"
+
+[package.dependencies]
+atomicwrites = {version = ">=1.0", markers = "sys_platform == \"win32\""}
+attrs = ">=19.2.0"
+colorama = {version = "*", markers = "sys_platform == \"win32\""}
+iniconfig = "*"
+packaging = "*"
+pluggy = ">=0.12,<1.0.0a1"
+py = ">=1.8.2"
+toml = "*"
+
+[package.extras]
+testing = ["argcomplete", "hypothesis (>=3.56)", "mock", "nose", "requests", "xmlschema"]
+
 [[package]]
 name = "requests"
 version = "2.25.1"
@@ -251,7 +340,7 @@ python-versions = "*"
 [metadata]
 lock-version = "1.1"
 python-versions = "^3.9"
-content-hash = "28c14ec611e61db259fa6aa160df99308f7452874f69377a634d07cd379603c8"
+content-hash = "2aed53c1d20035335cf0c0e6439f65369519cd8ea47ae2e6043e49767106ffb0"
 
 [metadata.files]
 aiosqlite = [
@@ -262,6 +351,14 @@ asgiref = [
     {file = "asgiref-3.3.4-py3-none-any.whl", hash = "sha256:92906c611ce6c967347bbfea733f13d6313901d54dcca88195eaeb52b2a8e8ee"},
     {file = "asgiref-3.3.4.tar.gz", hash = "sha256:d1216dfbdfb63826470995d31caed36225dcaf34f182e0fa257a4dd9e86f1b78"},
 ]
+atomicwrites = [
+    {file = "atomicwrites-1.4.0-py2.py3-none-any.whl", hash = "sha256:6d1784dea7c0c8d4a5172b6c620f40b6e4cbfdf96d783691f2e1302a7b88e197"},
+    {file = "atomicwrites-1.4.0.tar.gz", hash = "sha256:ae70396ad1a434f9c7046fd2dd196fc04b12f9e91ffb859164193be8b6168a7a"},
+]
+attrs = [
+    {file = "attrs-21.2.0-py2.py3-none-any.whl", hash = "sha256:149e90d6d8ac20db7a955ad60cf0e6881a3f20d37096140088356da6c716b0b1"},
+    {file = "attrs-21.2.0.tar.gz", hash = "sha256:ef6aaac3ca6cd92904cdd0d83f629a15f18053ec84e6432106f7a4d04ae4f5fb"},
+]
 beautifulsoup4 = [
     {file = "beautifulsoup4-4.9.3-py2-none-any.whl", hash = "sha256:4c98143716ef1cb40bf7f39a8e3eec8f8b009509e74904ba3a7b315431577e35"},
     {file = "beautifulsoup4-4.9.3-py3-none-any.whl", hash = "sha256:fff47e031e34ec82bf17e00da8f592fe7de69aeea38be00523c04623c04fb666"},
@@ -299,6 +396,30 @@ idna = [
     {file = "idna-2.10-py2.py3-none-any.whl", hash = "sha256:b97d804b1e9b523befed77c48dacec60e6dcb0b5391d57af6a65a312a90648c0"},
     {file = "idna-2.10.tar.gz", hash = "sha256:b307872f855b18632ce0c21c5e45be78c0ea7ae4c15c828c20788b26921eb3f6"},
 ]
+iniconfig = [
+    {file = "iniconfig-1.1.1-py2.py3-none-any.whl", hash = "sha256:011e24c64b7f47f6ebd835bb12a743f2fbe9a26d4cecaa7f53bc4f35ee9da8b3"},
+    {file = "iniconfig-1.1.1.tar.gz", hash = "sha256:bc3af051d7d14b2ee5ef9969666def0cd1a000e121eaea580d4a313df4b37f32"},
+]
+packaging = [
+    {file = "packaging-20.9-py2.py3-none-any.whl", hash = "sha256:67714da7f7bc052e064859c05c595155bd1ee9f69f76557e21f051443c20947a"},
+    {file = "packaging-20.9.tar.gz", hash = "sha256:5b327ac1320dc863dca72f4514ecc086f31186744b84a230374cc1fd776feae5"},
+]
+pluggy = [
+    {file = "pluggy-0.13.1-py2.py3-none-any.whl", hash = "sha256:966c145cd83c96502c3c3868f50408687b38434af77734af1e9ca461a4081d2d"},
+    {file = "pluggy-0.13.1.tar.gz", hash = "sha256:15b2acde666561e1298d71b523007ed7364de07029219b604cf808bfa1c765b0"},
+]
+py = [
+    {file = "py-1.10.0-py2.py3-none-any.whl", hash = "sha256:3b80836aa6d1feeaa108e046da6423ab8f6ceda6468545ae8d02d9d58d18818a"},
+    {file = "py-1.10.0.tar.gz", hash = "sha256:21b81bda15b66ef5e1a777a21c4dcd9c20ad3efd0b3f817e7a809035269e1bd3"},
+]
+pyparsing = [
+    {file = "pyparsing-2.4.7-py2.py3-none-any.whl", hash = "sha256:ef9d7589ef3c200abe66653d3f1ab1033c3c419ae9b9bdb1240a85b024efc88b"},
+    {file = "pyparsing-2.4.7.tar.gz", hash = "sha256:c203ec8783bf771a155b207279b9bccb8dea02d8f0c9e5f8ead507bc3246ecc1"},
+]
+pytest = [
+    {file = "pytest-6.2.4-py3-none-any.whl", hash = "sha256:91ef2131a9bd6be8f76f1f08eac5c5317221d6ad1e143ae03894b862e8976890"},
"sha256:91ef2131a9bd6be8f76f1f08eac5c5317221d6ad1e143ae03894b862e8976890"}, + {file = "pytest-6.2.4.tar.gz", hash = "sha256:50bcad0a0b9c5a72c8e4e7c9855a3ad496ca6a881a3641b4260605450772c54b"}, +] requests = [ {file = "requests-2.25.1-py2.py3-none-any.whl", hash = "sha256:c210084e36a42ae6b9219e00e48287def368a26d03a048ddad7bfee44f75871e"}, {file = "requests-2.25.1.tar.gz", hash = "sha256:27973dd4a904a4f13b263a19c866c13b92a39ed1c964655f025f3f8d3d75b804"}, diff --git a/pyproject.toml b/pyproject.toml index 6067f29..9d5a057 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -15,6 +15,7 @@ ulid-py = "^1.1.0" databases = {extras = ["sqlite"], version = "^0.4.3"} toml = "^0.10.2" uvicorn = "^0.14.0" +pytest = "^6.2.4" [tool.poetry.dev-dependencies] diff --git a/run b/run index 69c6c59..b163649 100755 --- a/run +++ b/run @@ -12,4 +12,4 @@ shift # export DEBUG=1 # export UNWIND_LOGLEVEL=DEBUG -exec scripts/"$task" "$@" +exec "$RUN_DIR"/scripts/"$task" "$@" diff --git a/scripts/app b/scripts/app index 2677f0f..2b214f6 100755 --- a/scripts/app +++ b/scripts/app @@ -1,5 +1,7 @@ #!/bin/sh -eu +cd "$RUN_DIR" + [ -z "${DEBUG:-}" ] || set -x exec python -m unwind "$@" diff --git a/scripts/dev b/scripts/dev index 5102be9..4c2319f 100755 --- a/scripts/dev +++ b/scripts/dev @@ -1,5 +1,7 @@ #!/bin/sh -eu +cd "$RUN_DIR" + [ -z "${DEBUG:-}" ] || set -x exec uvicorn unwind:web_app --reload diff --git a/scripts/load_imdb_dumps b/scripts/load_imdb_dumps new file mode 100755 index 0000000..5f4c43f --- /dev/null +++ b/scripts/load_imdb_dumps @@ -0,0 +1,18 @@ +#!/bin/sh -eu + +datadir="$RUN_DIR"/data + +[ -z "${DEBUG:-}" ] || set -x + +# See +# - https://www.imdb.com/interfaces/ +# - https://datasets.imdbws.com/ + +wget -N \ + --no-directories \ + --directory-prefix "$datadir" \ + https://datasets.imdbws.com/title.basics.tsv.gz \ + https://datasets.imdbws.com/title.ratings.tsv.gz +"$RUN_BIN" app import-imdb-dataset \ + --basics "$datadir"/title.basics.tsv.gz \ + --ratings "$datadir"/title.ratings.tsv.gz diff --git a/scripts/server b/scripts/server index 74580e8..a184989 100755 --- a/scripts/server +++ b/scripts/server @@ -1,5 +1,7 @@ #!/bin/sh -eu +cd "$RUN_DIR" + [ -z "${DEBUG:-}" ] || set -x exec uvicorn --host 0.0.0.0 unwind:web_app diff --git a/scripts/tests b/scripts/tests new file mode 100755 index 0000000..b4bb899 --- /dev/null +++ b/scripts/tests @@ -0,0 +1,7 @@ +#!/bin/sh -eu + +cd "$RUN_DIR" + +[ -z "${DEBUG:-}" ] || set -x + +exec python -m pytest "$@" diff --git a/tests/test_imdb.py b/tests/test_imdb.py new file mode 100644 index 0000000..13a03fd --- /dev/null +++ b/tests/test_imdb.py @@ -0,0 +1,19 @@ +import pytest +from unwind.imdb import imdb_rating_from_score, score_from_imdb_rating + + +@pytest.mark.parametrize("rating", (x / 10 for x in range(10, 101))) +def test_rating_conversion(rating): + assert rating == imdb_rating_from_score(score_from_imdb_rating(rating)) + + +@pytest.mark.parametrize("score", range(0, 101)) +def test_score_conversion(score): + # Because our score covers 101 discrete values and IMDb's rating only 91 + # discrete values, the mapping is non-injective, i.e. 10 values can't be + # mapped uniquely. 
+    non_injective = set(range(5, 100, 10))
+    if score in non_injective:
+        pytest.skip(f"Score cannot be mapped back correctly: {score}")
+
+    assert score == score_from_imdb_rating(imdb_rating_from_score(score))
diff --git a/unwind/__main__.py b/unwind/__main__.py
index 0541b4a..bcc7dd5 100644
--- a/unwind/__main__.py
+++ b/unwind/__main__.py
@@ -1,15 +1,18 @@
+import argparse
 import asyncio
 import logging
+from pathlib import Path
 
 from . import config
 from .db import close_connection_pool, open_connection_pool
 from .imdb import load_imdb
+from .imdb_import import import_from_file
 from .request import session
 
 log = logging.getLogger(__name__)
 
 
-async def run_import():
+async def run_load_user_ratings_from_imdb():
     await open_connection_pool()
 
     with session() as s:
@@ -22,6 +25,60 @@ async def run_import():
     await close_connection_pool()
 
 
+async def run_import_imdb_dataset(basics_path: Path, ratings_path: Path):
+    await open_connection_pool()
+
+    await import_from_file(basics_path=basics_path, ratings_path=ratings_path)
+
+    await close_connection_pool()
+
+
+def getargs():
+    parser = argparse.ArgumentParser()
+    commands = parser.add_subparsers(required=True)
+
+    parser_import_imdb_dataset = commands.add_parser(
+        "import-imdb-dataset",
+        help="Import IMDb datasets.",
+        description="""
+        Import IMDb datasets.
+        New datasets available from https://www.imdb.com/interfaces/.
+        """,
+    )
+    parser_import_imdb_dataset.add_argument(
+        dest="mode",
+        action="store_const",
+        const="import-imdb-dataset",
+    )
+    parser_import_imdb_dataset.add_argument(
+        "--basics", metavar="basics_file.tsv.gz", type=Path, required=True
+    )
+    parser_import_imdb_dataset.add_argument(
+        "--ratings", metavar="ratings_file.tsv.gz", type=Path, required=True
+    )
+
+    parser_load_user_ratings_from_imdb = commands.add_parser(
+        "load-user-ratings-from-imdb",
+        help="Load user ratings from imdb.com.",
+        description="""
+        Refresh user ratings for all registered users live from IMDb's website.
+        """,
+    )
+    parser_load_user_ratings_from_imdb.add_argument(
+        dest="mode",
+        action="store_const",
+        const="load-user-ratings-from-imdb",
+    )
+
+    try:
+        args = parser.parse_args()
+    except TypeError:
+        parser.print_usage()
+        raise
+
+    return args
+
+
 def main():
     logging.basicConfig(
         format="%(asctime)s.%(msecs)03d [%(name)s:%(process)d] %(levelname)s: %(message)s",
@@ -30,7 +87,15 @@ def main():
     )
     log.debug(f"Log level: {config.loglevel}")
 
-    asyncio.run(run_import())
+    try:
+        args = getargs()
+    except (TypeError, SystemExit):  # bad or missing CLI arguments
+        return
+
+    if args.mode == "load-user-ratings-from-imdb":
+        asyncio.run(run_load_user_ratings_from_imdb())
+    elif args.mode == "import-imdb-dataset":
+        asyncio.run(run_import_imdb_dataset(args.basics, args.ratings))
 
 
 main()
diff --git a/unwind/db.py b/unwind/db.py
index 03145ca..e5eb29c 100644
--- a/unwind/db.py
+++ b/unwind/db.py
@@ -6,7 +6,7 @@ from typing import Optional, Type, TypeVar
 from databases import Database
 
 from . import config
-from .models import Movie, Rating, User, asplain, fromplain, utcnow
+from .models import Movie, Rating, User, asplain, fromplain, optional_fields
 
 log = logging.getLogger(__name__)
 
@@ -69,10 +69,14 @@ ModelType = TypeVar("ModelType")
 
 
 async def get(model: Type[ModelType], **kwds) -> Optional[ModelType]:
+    values = {k: v for k, v in kwds.items() if v is not None}
+    if not values:
+        return
+
     fields_ = ", ".join(f.name for f in fields(model))
-    cond = " AND ".join(f"{k}=:{k}" for k in kwds)
+    cond = " AND ".join(f"{k}=:{k}" for k in values)
     query = f"SELECT {fields_} FROM {model._table} WHERE {cond}"
-    row = await shared_connection().fetch_one(query=query, values=kwds)
+    row = await shared_connection().fetch_one(query=query, values=values)
     return fromplain(model, row) if row else None
 
@@ -95,16 +99,28 @@ async def add_or_update_user(user: User):
 
 
 async def add_or_update_movie(movie: Movie):
+    """Add or update a Movie in the database.
+
+    This is an upsert operation, but it will also update the Movie you pass
+    into the function to make its `id` match the DB's movie's `id`, and also
+    set all optional values on your Movie that might be unset but exist in the
+    database. It's a bidirectional sync.
+    """
     db_movie = await get(Movie, imdb_id=movie.imdb_id)
 
     if not db_movie:
         await add(movie)
     else:
        movie.id = db_movie.id
-        movie.updated = db_movie.updated
 
-        if movie != db_movie:
-            movie.updated = utcnow()
-            await update(movie)
+        # We want to keep any existing value in the DB for all optional fields.
+        for f in optional_fields(movie):
+            if getattr(movie, f.name) is None:
+                setattr(movie, f.name, getattr(db_movie, f.name))
+
+        if movie.updated <= db_movie.updated:
+            return
+
+        await update(movie)
 
 
 async def add_or_update_rating(rating: Rating) -> bool:
@@ -147,38 +163,48 @@ async def find_ratings(
         values["escape"] = "#"
         escaped_title = sql_escape(title, char=values["escape"])
         values["pattern"] = "%" + "%".join(escaped_title.split()) + "%"
-        conditions.append("movies.title LIKE :pattern ESCAPE :escape")
+        values["opattern"] = values["pattern"]
+        values["oescape"] = values["escape"]
+        conditions.append(
+            f"""
+            (
+                {Movie._table}.title LIKE :pattern ESCAPE :escape
+                OR {Movie._table}.original_title LIKE :opattern ESCAPE :oescape
+            )
+            """
+        )
 
     if media_type:
         values["media_type"] = media_type
-        conditions.append("movies.media_type=:media_type")
+        conditions.append(f"{Movie._table}.media_type=:media_type")
 
     if ignore_tv_episodes:
-        conditions.append("movies.media_type!='TV Episode'")
+        conditions.append(f"{Movie._table}.media_type!='TV Episode'")
 
     query = f"""
     WITH newest_movies AS (
-        SELECT DISTINCT ratings.movie_id
-        FROM ratings
-        LEFT JOIN movies ON movies.id=ratings.movie_id
+        SELECT DISTINCT {Rating._table}.movie_id
+        FROM {Rating._table}
+        LEFT JOIN {Movie._table} ON {Movie._table}.id={Rating._table}.movie_id
         {('WHERE ' + ' AND '.join(conditions)) if conditions else ''}
-        ORDER BY length(movies.title) ASC, ratings.rating_date DESC
+        ORDER BY length({Movie._table}.title) ASC, {Rating._table}.rating_date DESC
         LIMIT :limit_rows
     )
     SELECT
-        users.name AS user_name,
-        ratings.score AS user_score,
-        movies.score AS imdb_score,
-        movies.imdb_id AS movie_imdb_id,
-        movies.media_type AS media_type,
-        movies.title AS movie_title,
-        movies.release_year AS release_year
+        {User._table}.name AS user_name,
+        {Rating._table}.score AS user_score,
+        {Movie._table}.score AS imdb_score,
+        {Movie._table}.imdb_id AS movie_imdb_id,
+        {Movie._table}.media_type AS media_type,
+        {Movie._table}.title AS canonical_title,
+        {Movie._table}.original_title AS original_title,
+        {Movie._table}.release_year AS release_year
     FROM newest_movies
-    LEFT JOIN ratings ON ratings.movie_id=newest_movies.movie_id
-    LEFT JOIN users ON users.id=ratings.user_id
-    LEFT JOIN movies ON movies.id=ratings.movie_id
+    LEFT JOIN {Rating._table} ON {Rating._table}.movie_id=newest_movies.movie_id
+    LEFT JOIN {User._table} ON {User._table}.id={Rating._table}.user_id
+    LEFT JOIN {Movie._table} ON {Movie._table}.id={Rating._table}.movie_id
     """
 
     rows = await shared_connection().fetch_all(query=query, values=values)
diff --git a/unwind/imdb.py b/unwind/imdb.py
index 17e792d..95e6469 100644
--- a/unwind/imdb.py
+++ b/unwind/imdb.py
@@ -6,8 +6,8 @@ from typing import Optional
 from urllib.parse import urljoin
 
 from .db import add_or_update_movie, add_or_update_rating, add_or_update_user
-from .models import Movie, Rating, User, asplain, fromplain
-from .request import soup_from_url
+from .models import Movie, Rating, User
+from .request import cache_path, soup_from_url
 
 log = logging.getLogger(__name__)
 
@@ -38,6 +38,26 @@ def imdb_url(user_id):
     return f"https://www.imdb.com/user/{user_id}/ratings"
 
 
+def imdb_rating_from_score(score: int) -> float:
+    """Return the IMDb rating from an Unwind Movie score."""
+    assert 0 <= score <= 100
+    rating = round(score * 9 / 100 + 1, 1)
+    assert 1.0 <= rating <= 10.0
+    return rating
+
+
+def score_from_imdb_rating(rating: float) -> int:
+    """Return the Unwind Movie score for an IMDb rating."""
+    # Scale IMDb's 10 point rating to our score of [0, 100].
+    # There's a pitfall here!
+    # You might think this would be simply IMDb's rating times 10, *but*
+    # the lowest possible rating on IMDb is actually 1.
+    assert 1.0 <= rating <= 10.0
+    score = round(100 * (rating - 1) / 9)
+    assert 0 <= score <= 100
+    return score
+
+
 find_name = re.compile(r"(?P<name>.*)'s Ratings").fullmatch
 find_rating_date = re.compile(r"Rated on (?P<date>\d{2} \w+ \d{4})").fullmatch
 find_runtime = re.compile(r"((?P<h>\d+) hr)? ?((?P<m>\d+) min)?").fullmatch
?((?P\d+) min)?").fullmatch @@ -50,67 +70,88 @@ find_year = re.compile( find_movie_id = re.compile(r"/title/(?Ptt\d+)/").search +def movie_and_rating_from_item(item) -> tuple[Movie, Rating]: + + movie = Movie( + title=item.h3.a.string.strip(), + genres=set(s.strip() for s in item.find("span", "genre").string.split(",")), + ) + + episode_br = item.h3.br + if episode_br: + episode_a = episode_br.find_next("a") + if not episode_a: + raise ValueError("Unknown document structure.") + + movie.media_type = "TV Episode" + movie.title += " / " + episode_a.string.strip() + if match := find_year(episode_br.find_next("span", "lister-item-year").string): + movie.release_year = int(match["year"]) + if match := find_movie_id(episode_a["href"]): + movie.imdb_id = match["id"] + + if (tag := item.find("span", "runtime")) and (match := find_runtime(tag.string)): + movie.runtime = int(match["h"] or 0) * 60 + int(match["m"] or 0) + + if not episode_br: + if match := find_year(item.h3.find("span", "lister-item-year").string): + if media_type := match["type"]: + movie.media_type = media_type.strip() + movie.release_year = int(match["year"]) + if match := find_movie_id(item.h3.a["href"]): + movie.imdb_id = match["id"] + + if not movie.media_type: + movie.media_type = "Movie" + + rating = Rating() + + ratings_item = item.find("div", "ipl-rating-widget") + if match := find_rating_date(ratings_item.find_next("p", "text-muted").string): + rating.rating_date = datetime.strptime(match["date"], "%d %b %Y") + if match := ratings_item.find("div", "ipl-rating-star--other-user"): + if rating_item := match.find("span", "ipl-rating-star__rating"): + rating.score = score_from_imdb_rating(float(rating_item.string)) + if match := ratings_item.find("div", "ipl-rating-star small"): + if rating_item := match.find("span", "ipl-rating-star__rating"): + movie.score = score_from_imdb_rating(float(rating_item.string)) + + return movie, rating + + +ForgedRequest = namedtuple("ForgedRequest", "url headers") + + async def parse_page(url, stop_on_dupe=True) -> Optional[str]: soup = soup_from_url(url) - user = User(imdb_id=soup.find("meta", property="pageId")["content"], name="") - if match := find_name(soup.h1.string): + meta = soup.find("meta", property="pageId") + headline = soup.h1 + assert meta is not None and headline is not None + user = User(imdb_id=meta["content"], name="") + if match := find_name(headline.string): user.name = match["name"] await add_or_update_user(user) items = soup.find_all("div", "lister-item-content") for i, item in enumerate(items): - movie = Movie( - title=item.h3.a.string.strip(), - genres=set(s.strip() for s in item.find("span", "genre").string.split(",")), - ) - - episode_br = item.h3.br - if episode_br: - episode_a = episode_br.find_next("a") - if not episode_a: - log.error("Unknown document structure.") - continue - - movie.media_type = "TV Episode" - movie.title += " / " + episode_a.string.strip() - if match := find_year( - episode_br.find_next("span", "lister-item-year").string - ): - movie.release_year = int(match["year"]) - if match := find_movie_id(episode_a["href"]): - movie.imdb_id = match["id"] - - rating = Rating(user_id=user.id) - - if (tag := item.find("span", "runtime")) and ( - match := find_runtime(tag.string) - ): - movie.runtime = int(match["h"] or 0) * 60 + int(match["m"] or 0) - - if not episode_br: - if match := find_year(item.h3.find("span", "lister-item-year").string): - if media_type := match["type"]: - movie.media_type = media_type.strip() - movie.release_year = int(match["year"]) - 
if match := find_movie_id(item.h3.a["href"]): - movie.imdb_id = match["id"] - - ratings_item = item.find("div", "ipl-rating-widget") - if match := find_rating_date(ratings_item.find_next("p", "text-muted").string): - rating.rating_date = datetime.strptime(match["date"], "%d %b %Y") - for rating_item in ratings_item.find_all("span", "ipl-rating-star__rating")[:2]: - if "ipl-rating-star--other-user" in rating_item.parent["class"]: - rating.score = int(float(rating_item.string) * 10) - else: - movie.score = int(float(rating_item.string) * 10) - - if not movie.media_type: - movie.media_type = "Movie" + try: + movie, rating = movie_and_rating_from_item(item) + except Exception as err: + log.error( + "Error in %s item #%s (%s): %s: %s", + url, + i, + cache_path(ForgedRequest(url, headers={})), + " ".join(item.h3.stripped_strings), + err, + ) + continue await add_or_update_movie(movie) + rating.user_id = user.id rating.movie_id = movie.id # needs to be set _after_ movie has been updated is_updated = await add_or_update_rating(rating) @@ -118,9 +159,9 @@ async def parse_page(url, stop_on_dupe=True) -> Optional[str]: log.info("Import stopped after %s items. Caught up to known state. ✋", i) return None - next_url = urljoin( - url, soup.find("div", "footer").find(string=re.compile(r"Next")).parent["href"] - ) + footer = soup.find("div", "footer") + assert footer is not None + next_url = urljoin(url, footer.find(string=re.compile(r"Next")).parent["href"]) return next_url if url != next_url else None diff --git a/unwind/imdb_import.py b/unwind/imdb_import.py new file mode 100644 index 0000000..cf065ac --- /dev/null +++ b/unwind/imdb_import.py @@ -0,0 +1,206 @@ +import csv +import gzip +import logging +from dataclasses import dataclass, fields +from datetime import datetime, timezone +from pathlib import Path +from typing import Optional, get_origin + +from . 
+from .db import add_or_update_movie
+from .imdb import score_from_imdb_rating
+from .models import Movie, optional_type
+
+log = logging.getLogger(__name__)
+
+
+# See
+# - https://www.imdb.com/interfaces/
+# - https://datasets.imdbws.com/
+
+
+@dataclass
+class BasicRow:
+    tconst: str
+    titleType: str
+    primaryTitle: str
+    originalTitle: str
+    isAdult: bool
+    startYear: Optional[int]
+    endYear: Optional[int]
+    runtimeMinutes: Optional[int]
+    genres: Optional[set[str]]
+
+    @classmethod
+    def from_row(cls, row):
+        vals = []
+        for f, r in zip(fields(cls), row):
+            ttype = f.type
+            is_opt = False
+
+            if (otype := optional_type(ttype)) is not None:
+                ttype = otype
+                is_opt = True
+            if (otype := get_origin(ttype)) is not None:
+                ttype = otype
+
+            if r == r"\N":
+                if is_opt:
+                    vals.append(None)
+                else:
+                    raise ValueError(f"Unexpected null value for field: {f.name}")
+            elif f.name == "genres":
+                vals.append(set(r.split(",")))
+            elif f.name == "isAdult":
+                assert r in "01"
+                vals.append(r == "1")
+            else:
+                vals.append(ttype(r))
+
+        inst = cls(*vals)
+        assert inst.titleType in title_types
+        return inst
+
+    def as_movie(self):
+        assert self.startYear is not None
+        return Movie(
+            title=self.primaryTitle,
+            original_title=self.originalTitle,
+            release_year=self.startYear,
+            media_type=title_types[self.titleType],
+            imdb_id=self.tconst,
+            score=None,
+            runtime=self.runtimeMinutes,
+            genres=self.genres or set(),
+        )
+
+
+@dataclass
+class RatingRow:
+    tconst: str
+    averageRating: float
+    numVotes: int
+
+    @classmethod
+    def from_row(cls, row):
+        inst = cls(*(f.type(r) for f, r in zip(fields(cls), row)))
+        assert inst.tconst != r"\N"
+        return inst
+
+    def as_movie(self):
+        return Movie(
+            imdb_id=self.tconst,
+            score=score_from_imdb_rating(self.averageRating),
+        )
+
+
+title_types = {
+    "movie": "Movie",
+    "radioEpisode": "Radio Episode",
+    "radioSeries": "Radio Series",
+    "short": "Short",
+    "tvEpisode": "TV Episode",
+    "tvMiniSeries": "TV Mini Series",
+    "tvMovie": "TV Movie",
+    "tvSeries": "TV Series",
+    "tvShort": "TV Short",
+    "tvSpecial": "TV Special",
+    "video": "Video",
+    "videoGame": "Video Game",
+}
+
+
+def gz_mtime(path) -> datetime:
+    """Return the timestamp of the compressed file."""
+    g = gzip.GzipFile(path, "rb")
+    g.peek(1)  # start reading the file to fill the timestamp field
+    assert g.mtime is not None
+    return datetime.fromtimestamp(g.mtime, tz=timezone.utc)  # gzip mtime is Unix epoch
+
+
+def count_lines(path) -> int:
+    i = 0
+    with gzip.open(path, "rt") as f:
+        for i, _ in enumerate(f, start=1):
+            pass
+    return i
+
+
+def read_imdb_tsv(path, row_type):
+    with gzip.open(path, "rt", newline="") as f:
+        rows = csv.reader(f, delimiter="\t", quoting=csv.QUOTE_NONE)
+
+        # skip header line
+        rows = iter(rows)
+        header = next(rows)
+        try:
+            assert tuple(f.name for f in fields(row_type)) == tuple(header)
+        except AssertionError:
+            log.error("Unexpected header line: %s", header)
+            raise
+
+        for i, row in enumerate(rows, start=1):
+            try:
+                yield row_type.from_row(row)
+            except Exception as err:
+                log.error("Error in line %s: %s", i, row, exc_info=err)
+                raise
+
+
+def read_ratings(path):
+    mtime = gz_mtime(path)
+    rows = read_imdb_tsv(path, RatingRow)
+
+    for row in rows:
+        m = row.as_movie()
+        m.updated = mtime
+        yield m
+
+
+def read_basics(path):
+    mtime = gz_mtime(path)
+    rows = read_imdb_tsv(path, BasicRow)
+
+    for row in rows:
+        if row.startYear is None:
+            log.debug("Skipping movie, missing year: %s", row)
+            continue
+
+        m = row.as_movie()
+        m.updated = mtime
+        yield m
+
+
+async def import_from_file(basics_path: Path, ratings_path: Path):
+    log.info("Loading scores ... 💾")
+    scores = {m.imdb_id: m.score for m in read_ratings(ratings_path)}
+
+    log.info("Importing movies ... 💾")
+    total = count_lines(basics_path)
+    assert total != 0
+    perc = 0.0
+    perc_step = 0.001
+
+    async with db.shared_connection().transaction():
+
+        for i, m in enumerate(read_basics(basics_path)):
+
+            if i / total > perc:
+                log.info("Imported %s%%", round(perc * 100, 1))
+                perc += perc_step
+
+            if m.media_type not in {
+                "Movie",
+                "Short",
+                "TV Mini Series",
+                "TV Movie",
+                "TV Series",
+                "TV Short",
+                "TV Special",
+                "Video",
+            }:
+                log.debug("Skipping movie, unwanted media type: %s", m.media_type)
+                continue
+
+            m.score = scores.get(m.imdb_id)
+            await add_or_update_movie(m)
diff --git a/unwind/init.sql b/unwind/init.sql
index d0bd446..eea5530 100644
--- a/unwind/init.sql
+++ b/unwind/init.sql
@@ -1,31 +1,32 @@
 PRAGMA foreign_keys = ON;;
 
 CREATE TABLE IF NOT EXISTS users (
-    id TEXT NOT NULL PRIMARY KEY,
+    id TEXT PRIMARY KEY NOT NULL,
     imdb_id TEXT NOT NULL UNIQUE,
     name TEXT NOT NULL
 );;
 
 CREATE TABLE IF NOT EXISTS movies (
-    id TEXT NOT NULL PRIMARY KEY,
+    id TEXT PRIMARY KEY NOT NULL,
     title TEXT NOT NULL,
-    release_year NUMBER NOT NULL,
+    original_title TEXT,
+    release_year INTEGER NOT NULL,
     media_type TEXT NOT NULL,
     imdb_id TEXT NOT NULL UNIQUE,
-    score NUMBER NOT NULL,
-    runtime NUMBER,
+    score INTEGER,
+    runtime INTEGER,
     genres TEXT NOT NULL,
     updated TEXT NOT NULL
 );;
 
 CREATE TABLE IF NOT EXISTS ratings (
-    id TEXT NOT NULL PRIMARY KEY,
+    id TEXT PRIMARY KEY NOT NULL,
     movie_id TEXT NOT NULL,
     user_id TEXT NOT NULL,
-    score NUMBER NOT NULL,
+    score INTEGER NOT NULL,
     rating_date TEXT NOT NULL,
-    favorite NUMBER,
-    finished NUMBER,
+    favorite BOOL,
+    finished BOOL,
     FOREIGN KEY(movie_id) REFERENCES movies(id),
     FOREIGN KEY(user_id) REFERENCES users(id)
 );;
diff --git a/unwind/models.py b/unwind/models.py
index f44cfe4..05fb5ae 100644
--- a/unwind/models.py
+++ b/unwind/models.py
@@ -1,5 +1,5 @@
 import json
-from dataclasses import asdict, dataclass, field, fields, is_dataclass
+from dataclasses import asdict, dataclass, field, fields
 from datetime import datetime, timezone
 from typing import Any, ClassVar, Optional, Type, Union, get_args, get_origin
 
@@ -25,6 +25,12 @@ def optional_type(tp: Type):
     return args[0]
 
 
+def optional_fields(o):
+    for f in fields(o):
+        if is_optional(f.type):
+            yield f
+
+
 def asplain(o) -> dict[str, Any]:
     validate(o)
 
@@ -56,9 +62,6 @@ def asplain(o) -> dict[str, Any]:
 
 
 def fromplain(cls, d: dict[str, Any]):
-    # if not is_dataclass(cls):
-    #     raise TypeError(f'Not a dataclass: {type(cls)}')
-
     dd = {}
 
     for f in fields(cls):
@@ -107,11 +110,14 @@ class Movie:
     _table: ClassVar[str] = "movies"
 
     id: ULID = field(default_factory=ULID)
-    title: str = None  # canonical title
+    title: str = None  # canonical title (usually English)
+    original_title: Optional[
+        str
+    ] = None  # original title (usually transcribed to Latin script)
     release_year: int = None  # canonical release date
-    media_type: Optional[str] = None
+    media_type: str = None
     imdb_id: str = None
-    score: int = None  # range: [0,100]
+    score: Optional[int] = None  # range: [0,100]
     runtime: Optional[int] = None  # minutes
     genres: set[str] = None
     updated: datetime = field(default_factory=utcnow)
diff --git a/unwind/request.py b/unwind/request.py
index b517f58..e46672c 100644
--- a/unwind/request.py
+++ b/unwind/request.py
@@ -5,9 +5,10 @@ from contextlib import contextmanager
 from dataclasses import dataclass
 from functools import wraps
 from hashlib import md5
+from pathlib import Path
 from random import random
 from time import sleep, time
-from typing import Callable
+from typing import Callable, Optional
 
 import bs4
 import requests
@@ -130,16 +131,19 @@ class RedirectError(RuntimeError):
         super().__init__(f"Redirected: {from_url} -> {to_url}")
 
 
+def cache_path(req) -> Optional[Path]:
+    if not config.cachedir:
+        return
+    sig = repr(req.url)  # + repr(sorted(req.headers.items()))
+    return config.cachedir / md5(sig.encode()).hexdigest()
+
+
 @throttle(1, 1, random)
 def http_get(s: requests.Session, url: str, *args, **kwds) -> requests.Response:
     req = s.prepare_request(requests.Request("GET", url, *args, **kwds))
 
-    if config.debug and config.cachedir:
-        sig = repr(req.url)  # + repr(sorted(req.headers.items()))
-        cachefile = config.cachedir / md5(sig.encode()).hexdigest()
-    else:
-        cachefile = None
+    cachefile = cache_path(req) if config.debug else None
 
     if cachefile:
         if cachefile.exists():
diff --git a/unwind/web.py b/unwind/web.py
index 474d169..4cbe6db 100644
--- a/unwind/web.py
+++ b/unwind/web.py
@@ -1,11 +1,42 @@
-from collections import defaultdict
+import base64
+import binascii
 
 from starlette.applications import Starlette
+from starlette.authentication import (
+    AuthCredentials,
+    AuthenticationBackend,
+    AuthenticationError,
+    SimpleUser,
+    UnauthenticatedUser,
+    requires,
+)
+from starlette.middleware import Middleware
+from starlette.middleware.authentication import AuthenticationMiddleware
 from starlette.responses import JSONResponse
-from starlette.routing import Route
+from starlette.routing import Mount, Route
 
-from . import config
+from . import config, db
 from .db import close_connection_pool, find_ratings, open_connection_pool
+from .models import Movie, asplain
+
+
+class BasicAuthBackend(AuthenticationBackend):
+    async def authenticate(self, request):
+        if "Authorization" not in request.headers:
+            return
+
+        auth = request.headers["Authorization"]
+        try:
+            scheme, credentials = auth.split()
+            if scheme.lower() != "basic":
+                return
+            decoded = base64.b64decode(credentials).decode("ascii")
+        except (ValueError, UnicodeDecodeError, binascii.Error) as exc:
+            raise AuthenticationError("Invalid basic auth credentials") from exc
+
+        username, _, password = decoded.partition(":")
+        # TODO: You'd want to verify the username and password here.
+        return AuthCredentials(["authenticated"]), SimpleUser(username)
 
 
 def imdb_url(imdb_id: str):
@@ -29,7 +60,8 @@ async def ratings(request):
         mov = aggr.setdefault(
             r["movie_imdb_id"],
             {
-                "title": r["movie_title"],
+                "canonical_title": r["canonical_title"],
+                "original_title": r["original_title"],
                 "year": r["release_year"],
                 "link": imdb_url(r["movie_imdb_id"]),
                 "user_scores": [],
@@ -44,10 +76,69 @@ async def ratings(request):
     return JSONResponse(resp)
 
 
+not_found = JSONResponse({"error": "Not Found"}, status_code=404)
+
+
+async def get_movies(request):
+    imdb_id = request.query_params.get("imdb_id")
+
+    movie = await db.get(Movie, imdb_id=imdb_id)
+
+    resp = [asplain(movie)] if movie else []
+    return JSONResponse(resp)
+
+
+@requires(["authenticated", "admin"])
+async def add_movie(request):
+    pass
+
+
+@requires(["authenticated", "admin"])
+async def add_user(request):
+    pass
+
+
+async def ratings_for_user(request):
+    request.path_params["user_id"]
+
+
+@requires("authenticated")
+async def set_rating_for_user(request):
+    request.path_params["user_id"]
+
+
+@requires(["authenticated", "admin"])
+async def add_group(request):
+    pass
+
+
+@requires(["authenticated", "admin"])
+async def add_user_to_group(request):
+    request.path_params["group_id"]
+
+
+async def get_ratings_for_group(request):
+    request.path_params["group_id"]
+
+
 app = Starlette(
     on_startup=[open_connection_pool],
     on_shutdown=[close_connection_pool],
     routes=[
-        Route("/ratings", ratings),
+        Mount(
+            "/api/v1",
+            routes=[
+                Route("/ratings", ratings),  # XXX legacy, remove.
+                Route("/movies", get_movies),
+                Route("/movies", add_movie, methods=["POST"]),
+                Route("/users", add_user, methods=["POST"]),
+                Route("/users/{user_id}/ratings", ratings_for_user),
+                Route("/users/{user_id}/ratings", set_rating_for_user, methods=["PUT"]),
+                Route("/groups", add_group, methods=["POST"]),
+                Route("/groups/{group_id}/users", add_user_to_group, methods=["POST"]),
+                Route("/groups/{group_id}/ratings", get_ratings_for_group),
+            ],
+        ),
     ],
    middleware=[Middleware(AuthenticationMiddleware, backend=BasicAuthBackend())],
 )
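
---
Usage sketch: how this import mode is meant to be driven, mirroring
scripts/load_imdb_dumps above. It assumes the repo's `run` wrapper is invoked
from the checkout root and exports RUN_DIR and RUN_BIN for the scripts, as
they expect; the data paths, port, and imdb_id below are illustrative
examples, not part of the patch.

    # fetch both IMDb dumps and run the full import in one step
    ./run load_imdb_dumps

    # or point the importer at dumps that were downloaded already
    ./run app import-imdb-dataset \
        --basics data/title.basics.tsv.gz \
        --ratings data/title.ratings.tsv.gz

    # imported movies are then served from the new /api/v1 mount
    # (e.g. started via `./run dev`, on uvicorn's default port)
    curl 'http://localhost:8000/api/v1/movies?imdb_id=tt0133093'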