From 7dd10f8bc343db3e1bb9cfea475744d8abdf35f4 Mon Sep 17 00:00:00 2001
From: ducklet
Date: Mon, 21 Jun 2021 18:54:03 +0200
Subject: [PATCH] add imdb full import mode

---
 poetry.lock             | 123 +++++++++++++++++++++++-
 pyproject.toml          |   1 +
 run                     |   2 +-
 scripts/app             |   2 +
 scripts/dev             |   2 +
 scripts/load_imdb_dumps |  18 ++++
 scripts/server          |   2 +
 scripts/tests           |   7 ++
 tests/test_imdb.py      |  19 ++++
 unwind/__main__.py      |  69 +++++++++++++-
 unwind/db.py            |  74 ++++++++++-----
 unwind/imdb.py          | 149 ++++++++++++++++++-----------
 unwind/imdb_import.py   | 206 ++++++++++++++++++++++++++++++++++++++++
 unwind/init.sql         |  19 ++--
 unwind/models.py        |  20 ++--
 unwind/request.py       |  16 ++--
 unwind/web.py           | 101 +++++++++++++++++++-
 17 files changed, 721 insertions(+), 109 deletions(-)
 create mode 100755 scripts/load_imdb_dumps
 create mode 100755 scripts/tests
 create mode 100644 tests/test_imdb.py
 create mode 100644 unwind/imdb_import.py

diff --git a/poetry.lock b/poetry.lock
index 31651e7..3891013 100644
--- a/poetry.lock
+++ b/poetry.lock
@@ -20,6 +20,28 @@ python-versions = ">=3.6"
 [package.extras]
 tests = ["pytest", "pytest-asyncio", "mypy (>=0.800)"]
 
+[[package]]
+name = "atomicwrites"
+version = "1.4.0"
+description = "Atomic file writes."
+category = "main"
+optional = false
+python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*"
+
+[[package]]
+name = "attrs"
+version = "21.2.0"
+description = "Classes Without Boilerplate"
+category = "main"
+optional = false
+python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*"
+
+[package.extras]
+dev = ["coverage[toml] (>=5.0.2)", "hypothesis", "pympler", "pytest (>=4.3.0)", "six", "mypy", "pytest-mypy-plugins", "zope.interface", "furo", "sphinx", "sphinx-notfound-page", "pre-commit"]
+docs = ["furo", "sphinx", "zope.interface", "sphinx-notfound-page"]
+tests = ["coverage[toml] (>=5.0.2)", "hypothesis", "pympler", "pytest (>=4.3.0)", "six", "mypy", "pytest-mypy-plugins", "zope.interface"]
+tests_no_zope = ["coverage[toml] (>=5.0.2)", "hypothesis", "pympler", "pytest (>=4.3.0)", "six", "mypy", "pytest-mypy-plugins"]
+
 [[package]]
 name = "beautifulsoup4"
 version = "4.9.3"
@@ -122,6 +144,73 @@ category = "main"
 optional = false
 python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*"
 
+[[package]]
+name = "iniconfig"
+version = "1.1.1"
+description = "iniconfig: brain-dead simple config-ini parsing"
+category = "main"
+optional = false
+python-versions = "*"
+
+[[package]]
+name = "packaging"
+version = "20.9"
+description = "Core utilities for Python packages"
+category = "main"
+optional = false
+python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*"
+
+[package.dependencies]
+pyparsing = ">=2.0.2"
+
+[[package]]
+name = "pluggy"
+version = "0.13.1"
+description = "plugin and hook calling mechanisms for python"
+category = "main"
+optional = false
+python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*"
+
+[package.extras]
+dev = ["pre-commit", "tox"]
+
+[[package]]
+name = "py"
+version = "1.10.0"
+description = "library with cross-python path, ini-parsing, io, code, log facilities"
+category = "main"
+optional = false
+python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*"
+
+[[package]]
+name = "pyparsing"
+version = "2.4.7"
+description = "Python parsing module"
+category = "main"
+optional = false
+python-versions = ">=2.6, !=3.0.*, !=3.1.*, !=3.2.*"
+
+[[package]]
+name = "pytest"
+version = "6.2.4"
+description = "pytest: simple powerful testing with Python"
+category = "main"
+optional = false
+python-versions = ">=3.6"
+
+[package.dependencies]
+atomicwrites = {version = ">=1.0", markers = "sys_platform == \"win32\""}
+attrs = ">=19.2.0"
+colorama = {version = "*", markers = "sys_platform == \"win32\""}
+iniconfig = "*"
+packaging = "*"
+pluggy = ">=0.12,<1.0.0a1"
+py = ">=1.8.2"
+toml = "*"
+
+[package.extras]
+testing = ["argcomplete", "hypothesis (>=3.56)", "mock", "nose", "requests", "xmlschema"]
+
 [[package]]
 name = "requests"
 version = "2.25.1"
@@ -251,7 +340,7 @@ python-versions = "*"
 [metadata]
 lock-version = "1.1"
 python-versions = "^3.9"
-content-hash = "28c14ec611e61db259fa6aa160df99308f7452874f69377a634d07cd379603c8"
+content-hash = "2aed53c1d20035335cf0c0e6439f65369519cd8ea47ae2e6043e49767106ffb0"
 
 [metadata.files]
 aiosqlite = [
@@ -262,6 +351,14 @@ asgiref = [
     {file = "asgiref-3.3.4-py3-none-any.whl", hash = "sha256:92906c611ce6c967347bbfea733f13d6313901d54dcca88195eaeb52b2a8e8ee"},
     {file = "asgiref-3.3.4.tar.gz", hash = "sha256:d1216dfbdfb63826470995d31caed36225dcaf34f182e0fa257a4dd9e86f1b78"},
 ]
+atomicwrites = [
+    {file = "atomicwrites-1.4.0-py2.py3-none-any.whl", hash = "sha256:6d1784dea7c0c8d4a5172b6c620f40b6e4cbfdf96d783691f2e1302a7b88e197"},
+    {file = "atomicwrites-1.4.0.tar.gz", hash = "sha256:ae70396ad1a434f9c7046fd2dd196fc04b12f9e91ffb859164193be8b6168a7a"},
+]
+attrs = [
+    {file = "attrs-21.2.0-py2.py3-none-any.whl", hash = "sha256:149e90d6d8ac20db7a955ad60cf0e6881a3f20d37096140088356da6c716b0b1"},
+    {file = "attrs-21.2.0.tar.gz", hash = "sha256:ef6aaac3ca6cd92904cdd0d83f629a15f18053ec84e6432106f7a4d04ae4f5fb"},
+]
 beautifulsoup4 = [
     {file = "beautifulsoup4-4.9.3-py2-none-any.whl", hash = "sha256:4c98143716ef1cb40bf7f39a8e3eec8f8b009509e74904ba3a7b315431577e35"},
     {file = "beautifulsoup4-4.9.3-py3-none-any.whl", hash = "sha256:fff47e031e34ec82bf17e00da8f592fe7de69aeea38be00523c04623c04fb666"},
@@ -299,6 +396,30 @@ idna = [
     {file = "idna-2.10-py2.py3-none-any.whl", hash = "sha256:b97d804b1e9b523befed77c48dacec60e6dcb0b5391d57af6a65a312a90648c0"},
     {file = "idna-2.10.tar.gz", hash = "sha256:b307872f855b18632ce0c21c5e45be78c0ea7ae4c15c828c20788b26921eb3f6"},
 ]
+iniconfig = [
+    {file = "iniconfig-1.1.1-py2.py3-none-any.whl", hash = "sha256:011e24c64b7f47f6ebd835bb12a743f2fbe9a26d4cecaa7f53bc4f35ee9da8b3"},
+    {file = "iniconfig-1.1.1.tar.gz", hash = "sha256:bc3af051d7d14b2ee5ef9969666def0cd1a000e121eaea580d4a313df4b37f32"},
+]
+packaging = [
+    {file = "packaging-20.9-py2.py3-none-any.whl", hash = "sha256:67714da7f7bc052e064859c05c595155bd1ee9f69f76557e21f051443c20947a"},
+    {file = "packaging-20.9.tar.gz", hash = "sha256:5b327ac1320dc863dca72f4514ecc086f31186744b84a230374cc1fd776feae5"},
+]
+pluggy = [
+    {file = "pluggy-0.13.1-py2.py3-none-any.whl", hash = "sha256:966c145cd83c96502c3c3868f50408687b38434af77734af1e9ca461a4081d2d"},
+    {file = "pluggy-0.13.1.tar.gz", hash = "sha256:15b2acde666561e1298d71b523007ed7364de07029219b604cf808bfa1c765b0"},
+]
+py = [
+    {file = "py-1.10.0-py2.py3-none-any.whl", hash = "sha256:3b80836aa6d1feeaa108e046da6423ab8f6ceda6468545ae8d02d9d58d18818a"},
+    {file = "py-1.10.0.tar.gz", hash = "sha256:21b81bda15b66ef5e1a777a21c4dcd9c20ad3efd0b3f817e7a809035269e1bd3"},
+]
+pyparsing = [
+    {file = "pyparsing-2.4.7-py2.py3-none-any.whl", hash = "sha256:ef9d7589ef3c200abe66653d3f1ab1033c3c419ae9b9bdb1240a85b024efc88b"},
+    {file = "pyparsing-2.4.7.tar.gz", hash = "sha256:c203ec8783bf771a155b207279b9bccb8dea02d8f0c9e5f8ead507bc3246ecc1"},
+]
+pytest = [
+    {file = "pytest-6.2.4-py3-none-any.whl", hash = "sha256:91ef2131a9bd6be8f76f1f08eac5c5317221d6ad1e143ae03894b862e8976890"},
"sha256:91ef2131a9bd6be8f76f1f08eac5c5317221d6ad1e143ae03894b862e8976890"}, + {file = "pytest-6.2.4.tar.gz", hash = "sha256:50bcad0a0b9c5a72c8e4e7c9855a3ad496ca6a881a3641b4260605450772c54b"}, +] requests = [ {file = "requests-2.25.1-py2.py3-none-any.whl", hash = "sha256:c210084e36a42ae6b9219e00e48287def368a26d03a048ddad7bfee44f75871e"}, {file = "requests-2.25.1.tar.gz", hash = "sha256:27973dd4a904a4f13b263a19c866c13b92a39ed1c964655f025f3f8d3d75b804"}, diff --git a/pyproject.toml b/pyproject.toml index 6067f29..9d5a057 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -15,6 +15,7 @@ ulid-py = "^1.1.0" databases = {extras = ["sqlite"], version = "^0.4.3"} toml = "^0.10.2" uvicorn = "^0.14.0" +pytest = "^6.2.4" [tool.poetry.dev-dependencies] diff --git a/run b/run index 69c6c59..b163649 100755 --- a/run +++ b/run @@ -12,4 +12,4 @@ shift # export DEBUG=1 # export UNWIND_LOGLEVEL=DEBUG -exec scripts/"$task" "$@" +exec "$RUN_DIR"/scripts/"$task" "$@" diff --git a/scripts/app b/scripts/app index 2677f0f..2b214f6 100755 --- a/scripts/app +++ b/scripts/app @@ -1,5 +1,7 @@ #!/bin/sh -eu +cd "$RUN_DIR" + [ -z "${DEBUG:-}" ] || set -x exec python -m unwind "$@" diff --git a/scripts/dev b/scripts/dev index 5102be9..4c2319f 100755 --- a/scripts/dev +++ b/scripts/dev @@ -1,5 +1,7 @@ #!/bin/sh -eu +cd "$RUN_DIR" + [ -z "${DEBUG:-}" ] || set -x exec uvicorn unwind:web_app --reload diff --git a/scripts/load_imdb_dumps b/scripts/load_imdb_dumps new file mode 100755 index 0000000..5f4c43f --- /dev/null +++ b/scripts/load_imdb_dumps @@ -0,0 +1,18 @@ +#!/bin/sh -eu + +datadir="$RUN_DIR"/data + +[ -z "${DEBUG:-}" ] || set -x + +# See +# - https://www.imdb.com/interfaces/ +# - https://datasets.imdbws.com/ + +wget -N \ + --no-directories \ + --directory-prefix "$datadir" \ + https://datasets.imdbws.com/title.basics.tsv.gz \ + https://datasets.imdbws.com/title.ratings.tsv.gz +"$RUN_BIN" app import-imdb-dataset \ + --basics "$datadir"/title.basics.tsv.gz \ + --ratings "$datadir"/title.ratings.tsv.gz diff --git a/scripts/server b/scripts/server index 74580e8..a184989 100755 --- a/scripts/server +++ b/scripts/server @@ -1,5 +1,7 @@ #!/bin/sh -eu +cd "$RUN_DIR" + [ -z "${DEBUG:-}" ] || set -x exec uvicorn --host 0.0.0.0 unwind:web_app diff --git a/scripts/tests b/scripts/tests new file mode 100755 index 0000000..b4bb899 --- /dev/null +++ b/scripts/tests @@ -0,0 +1,7 @@ +#!/bin/sh -eu + +cd "$RUN_DIR" + +[ -z "${DEBUG:-}" ] || set -x + +exec python -m pytest "$@" diff --git a/tests/test_imdb.py b/tests/test_imdb.py new file mode 100644 index 0000000..13a03fd --- /dev/null +++ b/tests/test_imdb.py @@ -0,0 +1,19 @@ +import pytest +from unwind.imdb import imdb_rating_from_score, score_from_imdb_rating + + +@pytest.mark.parametrize("rating", (x / 10 for x in range(10, 101))) +def test_rating_conversion(rating): + assert rating == imdb_rating_from_score(score_from_imdb_rating(rating)) + + +@pytest.mark.parametrize("score", range(0, 101)) +def test_score_conversion(score): + # Because our score covers 101 discrete values and IMDb's rating only 91 + # discrete values, the mapping is non-injective, i.e. 10 values can't be + # mapped uniquely. 
+    non_injective = set(range(5, 100, 10))
+    if score in non_injective:
+        pytest.skip(f"Score cannot be mapped back correctly: {score}")
+
+    assert score == score_from_imdb_rating(imdb_rating_from_score(score))
diff --git a/unwind/__main__.py b/unwind/__main__.py
index 0541b4a..bcc7dd5 100644
--- a/unwind/__main__.py
+++ b/unwind/__main__.py
@@ -1,15 +1,18 @@
+import argparse
 import asyncio
 import logging
+from pathlib import Path
 
 from . import config
 from .db import close_connection_pool, open_connection_pool
 from .imdb import load_imdb
+from .imdb_import import import_from_file
 from .request import session
 
 log = logging.getLogger(__name__)
 
 
-async def run_import():
+async def run_load_user_ratings_from_imdb():
     await open_connection_pool()
 
     with session() as s:
@@ -22,6 +25,60 @@ async def run_import():
     await close_connection_pool()
 
 
+async def run_import_imdb_dataset(basics_path: Path, ratings_path: Path):
+    await open_connection_pool()
+
+    await import_from_file(basics_path=basics_path, ratings_path=ratings_path)
+
+    await close_connection_pool()
+
+
+def getargs():
+    parser = argparse.ArgumentParser()
+    commands = parser.add_subparsers(required=True)
+
+    parser_import_imdb_dataset = commands.add_parser(
+        "import-imdb-dataset",
+        help="Import IMDb datasets.",
+        description="""
+        Import IMDb datasets.
+        New datasets available from https://www.imdb.com/interfaces/.
+        """,
+    )
+    parser_import_imdb_dataset.add_argument(
+        dest="mode",
+        action="store_const",
+        const="import-imdb-dataset",
+    )
+    parser_import_imdb_dataset.add_argument(
+        "--basics", metavar="basics_file.tsv.gz", type=Path, required=True
+    )
+    parser_import_imdb_dataset.add_argument(
+        "--ratings", metavar="ratings_file.tsv.gz", type=Path, required=True
+    )
+
+    parser_load_user_ratings_from_imdb = commands.add_parser(
+        "load-user-ratings-from-imdb",
+        help="Load user ratings from imdb.com.",
+        description="""
+        Refresh user ratings for all registered users live from IMDb's website.
+        """,
+    )
+    parser_load_user_ratings_from_imdb.add_argument(
+        dest="mode",
+        action="store_const",
+        const="load-user-ratings-from-imdb",
+    )
+
+    try:
+        args = parser.parse_args()
+    except TypeError:
+        parser.print_usage()
+        raise
+
+    return args
+
+
 def main():
     logging.basicConfig(
         format="%(asctime)s.%(msecs)03d [%(name)s:%(process)d] %(levelname)s: %(message)s",
@@ -30,7 +87,15 @@ def main():
     )
     log.debug(f"Log level: {config.loglevel}")
 
-    asyncio.run(run_import())
+    try:
+        args = getargs()
+    except (TypeError, SystemExit):  # bad or missing CLI arguments
+        return
+
+    if args.mode == "load-user-ratings-from-imdb":
+        asyncio.run(run_load_user_ratings_from_imdb())
+    elif args.mode == "import-imdb-dataset":
+        asyncio.run(run_import_imdb_dataset(args.basics, args.ratings))
 
 
 main()
diff --git a/unwind/db.py b/unwind/db.py
index 03145ca..e5eb29c 100644
--- a/unwind/db.py
+++ b/unwind/db.py
@@ -6,7 +6,7 @@ from typing import Optional, Type, TypeVar
 from databases import Database
 
 from . import config
-from .models import Movie, Rating, User, asplain, fromplain, utcnow
+from .models import Movie, Rating, User, asplain, fromplain, optional_fields
 
 log = logging.getLogger(__name__)
 
@@ -69,10 +69,14 @@ ModelType = TypeVar("ModelType")
 
 
 async def get(model: Type[ModelType], **kwds) -> Optional[ModelType]:
+    values = {k: v for k, v in kwds.items() if v is not None}
+    if not values:
+        return
+
     fields_ = ", ".join(f.name for f in fields(model))
-    cond = " AND ".join(f"{k}=:{k}" for k in kwds)
+    cond = " AND ".join(f"{k}=:{k}" for k in values)
     query = f"SELECT {fields_} FROM {model._table} WHERE {cond}"
-    row = await shared_connection().fetch_one(query=query, values=kwds)
+    row = await shared_connection().fetch_one(query=query, values=values)
     return fromplain(model, row) if row else None
 
@@ -95,16 +99,28 @@ async def add_or_update_user(user: User):
 
 
 async def add_or_update_movie(movie: Movie):
+    """Add or update a Movie in the database.
+
+    This is an upsert operation, but it will also update the Movie you pass
+    into the function to make its `id` match the DB's movie's `id`, and also
+    set all optional values on your Movie that might be unset but exist in the
+    database. It's a bidirectional sync.
+    """
     db_movie = await get(Movie, imdb_id=movie.imdb_id)
 
     if not db_movie:
         await add(movie)
     else:
        movie.id = db_movie.id
-        movie.updated = db_movie.updated
 
-        if movie != db_movie:
-            movie.updated = utcnow()
-            await update(movie)
+        # We want to keep any existing value in the DB for all optional fields.
+        for f in optional_fields(movie):
+            if getattr(movie, f.name) is None:
+                setattr(movie, f.name, getattr(db_movie, f.name))
+
+        if movie.updated <= db_movie.updated:
+            return
+
+        await update(movie)
 
 
 async def add_or_update_rating(rating: Rating) -> bool:
@@ -147,38 +163,48 @@ async def find_ratings(
         values["escape"] = "#"
         escaped_title = sql_escape(title, char=values["escape"])
         values["pattern"] = "%" + "%".join(escaped_title.split()) + "%"
-        conditions.append("movies.title LIKE :pattern ESCAPE :escape")
+        values["opattern"] = values["pattern"]
+        values["oescape"] = values["escape"]
+        conditions.append(
+            f"""
+            (
+                {Movie._table}.title LIKE :pattern ESCAPE :escape
+                OR {Movie._table}.original_title LIKE :opattern ESCAPE :oescape
+            )
+            """
+        )
 
     if media_type:
         values["media_type"] = media_type
-        conditions.append("movies.media_type=:media_type")
+        conditions.append(f"{Movie._table}.media_type=:media_type")
 
     if ignore_tv_episodes:
-        conditions.append("movies.media_type!='TV Episode'")
+        conditions.append(f"{Movie._table}.media_type!='TV Episode'")
 
     query = f"""
     WITH newest_movies AS (
-        SELECT DISTINCT ratings.movie_id
-        FROM ratings
-        LEFT JOIN movies ON movies.id=ratings.movie_id
+        SELECT DISTINCT {Rating._table}.movie_id
+        FROM {Rating._table}
+        LEFT JOIN {Movie._table} ON {Movie._table}.id={Rating._table}.movie_id
         {('WHERE ' + ' AND '.join(conditions)) if conditions else ''}
-        ORDER BY length(movies.title) ASC, ratings.rating_date DESC
+        ORDER BY length({Movie._table}.title) ASC, {Rating._table}.rating_date DESC
         LIMIT :limit_rows
     )
     SELECT
-        users.name AS user_name,
-        ratings.score AS user_score,
-        movies.score AS imdb_score,
-        movies.imdb_id AS movie_imdb_id,
-        movies.media_type AS media_type,
-        movies.title AS movie_title,
-        movies.release_year AS release_year
+        {User._table}.name AS user_name,
+        {Rating._table}.score AS user_score,
+        {Movie._table}.score AS imdb_score,
+        {Movie._table}.imdb_id AS movie_imdb_id,
+        {Movie._table}.media_type AS media_type,
+        {Movie._table}.title AS canonical_title,
+        {Movie._table}.original_title AS original_title,
+        {Movie._table}.release_year AS release_year
     FROM newest_movies
-    LEFT JOIN ratings ON ratings.movie_id=newest_movies.movie_id
-    LEFT JOIN users ON users.id=ratings.user_id
-    LEFT JOIN movies ON movies.id=ratings.movie_id
+    LEFT JOIN {Rating._table} ON {Rating._table}.movie_id=newest_movies.movie_id
+    LEFT JOIN {User._table} ON {User._table}.id={Rating._table}.user_id
+    LEFT JOIN {Movie._table} ON {Movie._table}.id={Rating._table}.movie_id
     """
 
     rows = await shared_connection().fetch_all(query=query, values=values)
diff --git a/unwind/imdb.py b/unwind/imdb.py
index 17e792d..95e6469 100644
--- a/unwind/imdb.py
+++ b/unwind/imdb.py
@@ -6,8 +6,8 @@ from typing import Optional
 from urllib.parse import urljoin
 
 from .db import add_or_update_movie, add_or_update_rating, add_or_update_user
-from .models import Movie, Rating, User, asplain, fromplain
-from .request import soup_from_url
+from .models import Movie, Rating, User
+from .request import cache_path, soup_from_url
 
 log = logging.getLogger(__name__)
 
@@ -38,6 +38,26 @@ def imdb_url(user_id):
     return f"https://www.imdb.com/user/{user_id}/ratings"
 
 
+def imdb_rating_from_score(score: int) -> float:
+    """Return the IMDb rating from an Unwind Movie score."""
+    assert 0 <= score <= 100
+    rating = round(score * 9 / 100 + 1, 1)
+    assert 1.0 <= rating <= 10.0
+    return rating
+
+
+def score_from_imdb_rating(rating: float) -> int:
+    """Return the Unwind Movie score for an IMDb rating."""
+    # Scale IMDb's 10 point rating to our score of [0, 100].
+    # There's a pitfall here!
+    # You might think this would be simply IMDb's rating times 10, *but*
+    # the lowest possible rating on IMDb is actually 1.
+    assert 1.0 <= rating <= 10.0
+    score = round(100 * (rating - 1) / 9)
+    assert 0 <= score <= 100
+    return score
+
+
 find_name = re.compile(r"(?P<name>.*)'s Ratings").fullmatch
 find_rating_date = re.compile(r"Rated on (?P<date>\d{2} \w+ \d{4})").fullmatch
 find_runtime = re.compile(r"((?P<h>\d+) hr)? ?((?P<m>\d+) min)?").fullmatch
?((?P\d+) min)?").fullmatch @@ -50,67 +70,88 @@ find_year = re.compile( find_movie_id = re.compile(r"/title/(?Ptt\d+)/").search +def movie_and_rating_from_item(item) -> tuple[Movie, Rating]: + + movie = Movie( + title=item.h3.a.string.strip(), + genres=set(s.strip() for s in item.find("span", "genre").string.split(",")), + ) + + episode_br = item.h3.br + if episode_br: + episode_a = episode_br.find_next("a") + if not episode_a: + raise ValueError("Unknown document structure.") + + movie.media_type = "TV Episode" + movie.title += " / " + episode_a.string.strip() + if match := find_year(episode_br.find_next("span", "lister-item-year").string): + movie.release_year = int(match["year"]) + if match := find_movie_id(episode_a["href"]): + movie.imdb_id = match["id"] + + if (tag := item.find("span", "runtime")) and (match := find_runtime(tag.string)): + movie.runtime = int(match["h"] or 0) * 60 + int(match["m"] or 0) + + if not episode_br: + if match := find_year(item.h3.find("span", "lister-item-year").string): + if media_type := match["type"]: + movie.media_type = media_type.strip() + movie.release_year = int(match["year"]) + if match := find_movie_id(item.h3.a["href"]): + movie.imdb_id = match["id"] + + if not movie.media_type: + movie.media_type = "Movie" + + rating = Rating() + + ratings_item = item.find("div", "ipl-rating-widget") + if match := find_rating_date(ratings_item.find_next("p", "text-muted").string): + rating.rating_date = datetime.strptime(match["date"], "%d %b %Y") + if match := ratings_item.find("div", "ipl-rating-star--other-user"): + if rating_item := match.find("span", "ipl-rating-star__rating"): + rating.score = score_from_imdb_rating(float(rating_item.string)) + if match := ratings_item.find("div", "ipl-rating-star small"): + if rating_item := match.find("span", "ipl-rating-star__rating"): + movie.score = score_from_imdb_rating(float(rating_item.string)) + + return movie, rating + + +ForgedRequest = namedtuple("ForgedRequest", "url headers") + + async def parse_page(url, stop_on_dupe=True) -> Optional[str]: soup = soup_from_url(url) - user = User(imdb_id=soup.find("meta", property="pageId")["content"], name="") - if match := find_name(soup.h1.string): + meta = soup.find("meta", property="pageId") + headline = soup.h1 + assert meta is not None and headline is not None + user = User(imdb_id=meta["content"], name="") + if match := find_name(headline.string): user.name = match["name"] await add_or_update_user(user) items = soup.find_all("div", "lister-item-content") for i, item in enumerate(items): - movie = Movie( - title=item.h3.a.string.strip(), - genres=set(s.strip() for s in item.find("span", "genre").string.split(",")), - ) - - episode_br = item.h3.br - if episode_br: - episode_a = episode_br.find_next("a") - if not episode_a: - log.error("Unknown document structure.") - continue - - movie.media_type = "TV Episode" - movie.title += " / " + episode_a.string.strip() - if match := find_year( - episode_br.find_next("span", "lister-item-year").string - ): - movie.release_year = int(match["year"]) - if match := find_movie_id(episode_a["href"]): - movie.imdb_id = match["id"] - - rating = Rating(user_id=user.id) - - if (tag := item.find("span", "runtime")) and ( - match := find_runtime(tag.string) - ): - movie.runtime = int(match["h"] or 0) * 60 + int(match["m"] or 0) - - if not episode_br: - if match := find_year(item.h3.find("span", "lister-item-year").string): - if media_type := match["type"]: - movie.media_type = media_type.strip() - movie.release_year = int(match["year"]) - 
if match := find_movie_id(item.h3.a["href"]): - movie.imdb_id = match["id"] - - ratings_item = item.find("div", "ipl-rating-widget") - if match := find_rating_date(ratings_item.find_next("p", "text-muted").string): - rating.rating_date = datetime.strptime(match["date"], "%d %b %Y") - for rating_item in ratings_item.find_all("span", "ipl-rating-star__rating")[:2]: - if "ipl-rating-star--other-user" in rating_item.parent["class"]: - rating.score = int(float(rating_item.string) * 10) - else: - movie.score = int(float(rating_item.string) * 10) - - if not movie.media_type: - movie.media_type = "Movie" + try: + movie, rating = movie_and_rating_from_item(item) + except Exception as err: + log.error( + "Error in %s item #%s (%s): %s: %s", + url, + i, + cache_path(ForgedRequest(url, headers={})), + " ".join(item.h3.stripped_strings), + err, + ) + continue await add_or_update_movie(movie) + rating.user_id = user.id rating.movie_id = movie.id # needs to be set _after_ movie has been updated is_updated = await add_or_update_rating(rating) @@ -118,9 +159,9 @@ async def parse_page(url, stop_on_dupe=True) -> Optional[str]: log.info("Import stopped after %s items. Caught up to known state. ✋", i) return None - next_url = urljoin( - url, soup.find("div", "footer").find(string=re.compile(r"Next")).parent["href"] - ) + footer = soup.find("div", "footer") + assert footer is not None + next_url = urljoin(url, footer.find(string=re.compile(r"Next")).parent["href"]) return next_url if url != next_url else None diff --git a/unwind/imdb_import.py b/unwind/imdb_import.py new file mode 100644 index 0000000..cf065ac --- /dev/null +++ b/unwind/imdb_import.py @@ -0,0 +1,206 @@ +import csv +import gzip +import logging +from dataclasses import dataclass, fields +from datetime import datetime, timezone +from pathlib import Path +from typing import Optional, get_origin + +from . 
+from .db import add_or_update_movie
+from .imdb import score_from_imdb_rating
+from .models import Movie, optional_type
+
+log = logging.getLogger(__name__)
+
+
+# See
+# - https://www.imdb.com/interfaces/
+# - https://datasets.imdbws.com/
+
+
+@dataclass
+class BasicRow:
+    tconst: str
+    titleType: str
+    primaryTitle: str
+    originalTitle: str
+    isAdult: bool
+    startYear: Optional[int]
+    endYear: Optional[int]
+    runtimeMinutes: Optional[int]
+    genres: Optional[set[str]]
+
+    @classmethod
+    def from_row(cls, row):
+        vals = []
+        for f, r in zip(fields(cls), row):
+            ttype = f.type
+            is_opt = False
+
+            if (otype := optional_type(ttype)) is not None:
+                ttype = otype
+                is_opt = True
+            if (otype := get_origin(ttype)) is not None:
+                ttype = otype
+
+            if r == r"\N":
+                if is_opt:
+                    vals.append(None)
+                else:
+                    raise ValueError(f"Unexpected null value for field: {f.name}")
+            elif f.name == "genres":
+                vals.append(set(r.split(",")))
+            elif f.name == "isAdult":
+                assert r in "01"
+                vals.append(r == "1")
+            else:
+                vals.append(ttype(r))
+
+        inst = cls(*vals)
+        assert inst.titleType in title_types
+        return inst
+
+    def as_movie(self):
+        assert self.startYear is not None
+        return Movie(
+            title=self.primaryTitle,
+            original_title=self.originalTitle,
+            release_year=self.startYear,
+            media_type=title_types[self.titleType],
+            imdb_id=self.tconst,
+            score=None,
+            runtime=self.runtimeMinutes,
+            genres=self.genres or set(),
+        )
+
+
+@dataclass
+class RatingRow:
+    tconst: str
+    averageRating: float
+    numVotes: int
+
+    @classmethod
+    def from_row(cls, row):
+        inst = cls(*(f.type(r) for f, r in zip(fields(cls), row)))
+        assert inst.tconst != r"\N"
+        return inst
+
+    def as_movie(self):
+        return Movie(
+            imdb_id=self.tconst,
+            score=score_from_imdb_rating(self.averageRating),
+        )
+
+
+title_types = {
+    "movie": "Movie",
+    "radioEpisode": "Radio Episode",
+    "radioSeries": "Radio Series",
+    "short": "Short",
+    "tvEpisode": "TV Episode",
+    "tvMiniSeries": "TV Mini Series",
+    "tvMovie": "TV Movie",
+    "tvSeries": "TV Series",
+    "tvShort": "TV Short",
+    "tvSpecial": "TV Special",
+    "video": "Video",
+    "videoGame": "Video Game",
+}
+
+
+def gz_mtime(path) -> datetime:
+    """Return the timestamp of the compressed file."""
+    g = gzip.GzipFile(path, "rb")
+    g.peek(1)  # start reading the file to fill the timestamp field
+    assert g.mtime is not None
+    return datetime.fromtimestamp(g.mtime, tz=timezone.utc)  # gzip mtime is Unix epoch
+
+
+def count_lines(path) -> int:
+    i = 0
+    with gzip.open(path, "rt") as f:
+        for i, _ in enumerate(f, start=1):
+            pass
+    return i
+
+
+def read_imdb_tsv(path, row_type):
+    with gzip.open(path, "rt", newline="") as f:
+        rows = csv.reader(f, delimiter="\t", quoting=csv.QUOTE_NONE)
+
+        # skip header line
+        rows = iter(rows)
+        header = next(rows)
+        try:
+            assert tuple(f.name for f in fields(row_type)) == tuple(header)
+        except AssertionError:
+            log.error("Unexpected header line: %s", header)
+            raise
+
+        for i, row in enumerate(rows, start=1):
+            try:
+                yield row_type.from_row(row)
+            except Exception as err:
+                log.error("Error in line %s: %s", i, row, exc_info=err)
+                raise
+
+
+def read_ratings(path):
+    mtime = gz_mtime(path)
+    rows = read_imdb_tsv(path, RatingRow)
+
+    for row in rows:
+        m = row.as_movie()
+        m.updated = mtime
+        yield m
+
+
+def read_basics(path):
+    mtime = gz_mtime(path)
+    rows = read_imdb_tsv(path, BasicRow)
+
+    for row in rows:
+        if row.startYear is None:
+            log.debug("Skipping movie, missing year: %s", row)
+            continue
+
+        m = row.as_movie()
+        m.updated = mtime
+        yield m
+
+
+async def import_from_file(basics_path: Path, ratings_path: Path):
+    log.info("Loading scores ... 💾")
+    scores = {m.imdb_id: m.score for m in read_ratings(ratings_path)}
+
+    log.info("Importing movies ... 💾")
+    total = count_lines(basics_path)
+    assert total != 0
+    perc = 0.0
+    perc_step = 0.001
+
+    async with db.shared_connection().transaction():
+
+        for i, m in enumerate(read_basics(basics_path)):
+
+            if i / total > perc:
+                log.info("Imported %s%%", round(perc * 100, 1))
+                perc += perc_step
+
+            if m.media_type not in {
+                "Movie",
+                "Short",
+                "TV Mini Series",
+                "TV Movie",
+                "TV Series",
+                "TV Short",
+                "TV Special",
+                "Video",
+            }:
+                log.debug("Skipping movie, unwanted media type: %s", m.media_type)
+                continue
+
+            m.score = scores.get(m.imdb_id)
+            await add_or_update_movie(m)
diff --git a/unwind/init.sql b/unwind/init.sql
index d0bd446..eea5530 100644
--- a/unwind/init.sql
+++ b/unwind/init.sql
@@ -1,31 +1,32 @@
 PRAGMA foreign_keys = ON;;
 
 CREATE TABLE IF NOT EXISTS users (
-    id TEXT NOT NULL PRIMARY KEY,
+    id TEXT PRIMARY KEY NOT NULL,
     imdb_id TEXT NOT NULL UNIQUE,
     name TEXT NOT NULL
 );;
 
 CREATE TABLE IF NOT EXISTS movies (
-    id TEXT NOT NULL PRIMARY KEY,
+    id TEXT PRIMARY KEY NOT NULL,
     title TEXT NOT NULL,
-    release_year NUMBER NOT NULL,
+    original_title TEXT,
+    release_year INTEGER NOT NULL,
     media_type TEXT NOT NULL,
     imdb_id TEXT NOT NULL UNIQUE,
-    score NUMBER NOT NULL,
-    runtime NUMBER,
+    score INTEGER,
+    runtime INTEGER,
     genres TEXT NOT NULL,
     updated TEXT NOT NULL
 );;
 
 CREATE TABLE IF NOT EXISTS ratings (
-    id TEXT NOT NULL PRIMARY KEY,
+    id TEXT PRIMARY KEY NOT NULL,
     movie_id TEXT NOT NULL,
     user_id TEXT NOT NULL,
-    score NUMBER NOT NULL,
+    score INTEGER NOT NULL,
     rating_date TEXT NOT NULL,
-    favorite NUMBER,
-    finished NUMBER,
+    favorite BOOL,
+    finished BOOL,
     FOREIGN KEY(movie_id) REFERENCES movies(id),
     FOREIGN KEY(user_id) REFERENCES users(id)
 );;
diff --git a/unwind/models.py b/unwind/models.py
index f44cfe4..05fb5ae 100644
--- a/unwind/models.py
+++ b/unwind/models.py
@@ -1,5 +1,5 @@
 import json
-from dataclasses import asdict, dataclass, field, fields, is_dataclass
+from dataclasses import asdict, dataclass, field, fields
 from datetime import datetime, timezone
 from typing import Any, ClassVar, Optional, Type, Union, get_args, get_origin
 
@@ -25,6 +25,12 @@ def optional_type(tp: Type):
     return args[0]
 
 
+def optional_fields(o):
+    for f in fields(o):
+        if is_optional(f.type):
+            yield f
+
+
 def asplain(o) -> dict[str, Any]:
     validate(o)
 
@@ -56,9 +62,6 @@ def asplain(o) -> dict[str, Any]:
 
 
 def fromplain(cls, d: dict[str, Any]):
-    # if not is_dataclass(cls):
-    #     raise TypeError(f'Not a dataclass: {type(cls)}')
-
     dd = {}
 
     for f in fields(cls):
@@ -107,11 +110,14 @@ class Movie:
     _table: ClassVar[str] = "movies"
 
     id: ULID = field(default_factory=ULID)
-    title: str = None  # canonical title
+    title: str = None  # canonical title (usually English)
+    original_title: Optional[
+        str
+    ] = None  # original title (usually transcribed to Latin script)
     release_year: int = None  # canonical release date
-    media_type: Optional[str] = None
+    media_type: str = None
     imdb_id: str = None
-    score: int = None  # range: [0,100]
+    score: Optional[int] = None  # range: [0,100]
     runtime: Optional[int] = None  # minutes
     genres: set[str] = None
     updated: datetime = field(default_factory=utcnow)
diff --git a/unwind/request.py b/unwind/request.py
index b517f58..e46672c 100644
--- a/unwind/request.py
+++ b/unwind/request.py
@@ -5,9 +5,10 @@ from contextlib import contextmanager
 from dataclasses import dataclass
 from functools import wraps
 from hashlib import md5
+from pathlib import Path
 from random import random
 from time import sleep, time
-from typing import Callable
+from typing import Callable, Optional
 
 import bs4
 import requests
@@ -130,16 +131,19 @@ class RedirectError(RuntimeError):
         super().__init__(f"Redirected: {from_url} -> {to_url}")
 
 
+def cache_path(req) -> Optional[Path]:
+    if not config.cachedir:
+        return
+    sig = repr(req.url)  # + repr(sorted(req.headers.items()))
+    return config.cachedir / md5(sig.encode()).hexdigest()
+
+
 @throttle(1, 1, random)
 def http_get(s: requests.Session, url: str, *args, **kwds) -> requests.Response:
     req = s.prepare_request(requests.Request("GET", url, *args, **kwds))
 
-    if config.debug and config.cachedir:
-        sig = repr(req.url)  # + repr(sorted(req.headers.items()))
-        cachefile = config.cachedir / md5(sig.encode()).hexdigest()
-    else:
-        cachefile = None
+    cachefile = cache_path(req) if config.debug else None
 
     if cachefile:
         if cachefile.exists():
diff --git a/unwind/web.py b/unwind/web.py
index 474d169..4cbe6db 100644
--- a/unwind/web.py
+++ b/unwind/web.py
@@ -1,11 +1,42 @@
-from collections import defaultdict
+import base64
+import binascii
 
 from starlette.applications import Starlette
+from starlette.authentication import (
+    AuthCredentials,
+    AuthenticationBackend,
+    AuthenticationError,
+    SimpleUser,
+    UnauthenticatedUser,
+    requires,
+)
+from starlette.middleware import Middleware
+from starlette.middleware.authentication import AuthenticationMiddleware
 from starlette.responses import JSONResponse
-from starlette.routing import Route
+from starlette.routing import Mount, Route
 
-from . import config
+from . import config, db
 from .db import close_connection_pool, find_ratings, open_connection_pool
+from .models import Movie, asplain
+
+
+class BasicAuthBackend(AuthenticationBackend):
+    async def authenticate(self, request):
+        if "Authorization" not in request.headers:
+            return
+
+        auth = request.headers["Authorization"]
+        try:
+            scheme, credentials = auth.split()
+            if scheme.lower() != "basic":
+                return
+            decoded = base64.b64decode(credentials).decode("ascii")
+        except (ValueError, UnicodeDecodeError, binascii.Error) as exc:
+            raise AuthenticationError("Invalid basic auth credentials") from exc
+
+        username, _, password = decoded.partition(":")
+        # TODO: You'd want to verify the username and password here.
+        return AuthCredentials(["authenticated"]), SimpleUser(username)
 
 
 def imdb_url(imdb_id: str):
@@ -29,7 +60,8 @@ async def ratings(request):
         mov = aggr.setdefault(
             r["movie_imdb_id"],
             {
-                "title": r["movie_title"],
+                "canonical_title": r["canonical_title"],
+                "original_title": r["original_title"],
                 "year": r["release_year"],
                 "link": imdb_url(r["movie_imdb_id"]),
                 "user_scores": [],
@@ -44,10 +76,69 @@ async def ratings(request):
     return JSONResponse(resp)
 
 
+not_found = JSONResponse({"error": "Not Found"}, status_code=404)
+
+
+async def get_movies(request):
+    imdb_id = request.query_params.get("imdb_id")
+
+    movie = await db.get(Movie, imdb_id=imdb_id)
+
+    resp = [asplain(movie)] if movie else []
+    return JSONResponse(resp)
+
+
+@requires(["authenticated", "admin"])
+async def add_movie(request):
+    pass
+
+
+@requires(["authenticated", "admin"])
+async def add_user(request):
+    pass
+
+
+async def ratings_for_user(request):
+    request.path_params["user_id"]
+
+
+@requires("authenticated")
+async def set_rating_for_user(request):
+    request.path_params["user_id"]
+
+
+@requires(["authenticated", "admin"])
+async def add_group(request):
+    pass
+
+
+@requires(["authenticated", "admin"])
+async def add_user_to_group(request):
+    request.path_params["group_id"]
+
+
+async def get_ratings_for_group(request):
+    request.path_params["group_id"]
+
+
 app = Starlette(
     on_startup=[open_connection_pool],
     on_shutdown=[close_connection_pool],
     routes=[
-        Route("/ratings", ratings),
+        Mount(
+            "/api/v1",
+            routes=[
+                Route("/ratings", ratings),  # XXX legacy, remove.
+                Route("/movies", get_movies),
+                Route("/movies", add_movie, methods=["POST"]),
+                Route("/users", add_user, methods=["POST"]),
+                Route("/users/{user_id}/ratings", ratings_for_user),
+                Route("/users/{user_id}/ratings", set_rating_for_user, methods=["PUT"]),
+                Route("/groups", add_group, methods=["POST"]),
+                Route("/groups/{group_id}/users", add_user_to_group, methods=["POST"]),
+                Route("/groups/{group_id}/ratings", get_ratings_for_group),
+            ],
+        ),
     ],
    middleware=[Middleware(AuthenticationMiddleware, backend=BasicAuthBackend())],
 )
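
---
Usage sketch: how this import mode is meant to be driven, mirroring
scripts/load_imdb_dumps above. It assumes the repo's `run` wrapper is invoked
from the checkout root and exports RUN_DIR and RUN_BIN for the scripts, as
they expect; the data paths, port, and imdb_id below are illustrative
examples, not part of the patch.

    # fetch both IMDb dumps and run the full import in one step
    ./run load_imdb_dumps

    # or point the importer at dumps that were downloaded already
    ./run app import-imdb-dataset \
        --basics data/title.basics.tsv.gz \
        --ratings data/title.ratings.tsv.gz

    # imported movies are then served from the new /api/v1 mount
    # (e.g. started via `./run dev`, on uvicorn's default port)
    curl 'http://localhost:8000/api/v1/movies?imdb_id=tt0133093'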