add imdb full import mode

ducklet 2021-06-21 18:54:03 +02:00
parent b5cb22822e
commit 7dd10f8bc3
17 changed files with 721 additions and 109 deletions

poetry.lock (generated)

@@ -20,6 +20,28 @@ python-versions = ">=3.6"
[package.extras]
tests = ["pytest", "pytest-asyncio", "mypy (>=0.800)"]
[[package]]
name = "atomicwrites"
version = "1.4.0"
description = "Atomic file writes."
category = "main"
optional = false
python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*"
[[package]]
name = "attrs"
version = "21.2.0"
description = "Classes Without Boilerplate"
category = "main"
optional = false
python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*"
[package.extras]
dev = ["coverage[toml] (>=5.0.2)", "hypothesis", "pympler", "pytest (>=4.3.0)", "six", "mypy", "pytest-mypy-plugins", "zope.interface", "furo", "sphinx", "sphinx-notfound-page", "pre-commit"]
docs = ["furo", "sphinx", "zope.interface", "sphinx-notfound-page"]
tests = ["coverage[toml] (>=5.0.2)", "hypothesis", "pympler", "pytest (>=4.3.0)", "six", "mypy", "pytest-mypy-plugins", "zope.interface"]
tests_no_zope = ["coverage[toml] (>=5.0.2)", "hypothesis", "pympler", "pytest (>=4.3.0)", "six", "mypy", "pytest-mypy-plugins"]
[[package]]
name = "beautifulsoup4"
version = "4.9.3"

@@ -122,6 +144,73 @@ category = "main"
optional = false
python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*"
[[package]]
name = "iniconfig"
version = "1.1.1"
description = "iniconfig: brain-dead simple config-ini parsing"
category = "main"
optional = false
python-versions = "*"
[[package]]
name = "packaging"
version = "20.9"
description = "Core utilities for Python packages"
category = "main"
optional = false
python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*"
[package.dependencies]
pyparsing = ">=2.0.2"
[[package]]
name = "pluggy"
version = "0.13.1"
description = "plugin and hook calling mechanisms for python"
category = "main"
optional = false
python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*"
[package.extras]
dev = ["pre-commit", "tox"]
[[package]]
name = "py"
version = "1.10.0"
description = "library with cross-python path, ini-parsing, io, code, log facilities"
category = "main"
optional = false
python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*"
[[package]]
name = "pyparsing"
version = "2.4.7"
description = "Python parsing module"
category = "main"
optional = false
python-versions = ">=2.6, !=3.0.*, !=3.1.*, !=3.2.*"
[[package]]
name = "pytest"
version = "6.2.4"
description = "pytest: simple powerful testing with Python"
category = "main"
optional = false
python-versions = ">=3.6"
[package.dependencies]
atomicwrites = {version = ">=1.0", markers = "sys_platform == \"win32\""}
attrs = ">=19.2.0"
colorama = {version = "*", markers = "sys_platform == \"win32\""}
iniconfig = "*"
packaging = "*"
pluggy = ">=0.12,<1.0.0a1"
py = ">=1.8.2"
toml = "*"
[package.extras]
testing = ["argcomplete", "hypothesis (>=3.56)", "mock", "nose", "requests", "xmlschema"]
[[package]]
name = "requests"
version = "2.25.1"

@@ -251,7 +340,7 @@ python-versions = "*"
[metadata]
lock-version = "1.1"
python-versions = "^3.9"
-content-hash = "28c14ec611e61db259fa6aa160df99308f7452874f69377a634d07cd379603c8"
+content-hash = "2aed53c1d20035335cf0c0e6439f65369519cd8ea47ae2e6043e49767106ffb0"

[metadata.files]
aiosqlite = [

@@ -262,6 +351,14 @@ asgiref = [
    {file = "asgiref-3.3.4-py3-none-any.whl", hash = "sha256:92906c611ce6c967347bbfea733f13d6313901d54dcca88195eaeb52b2a8e8ee"},
    {file = "asgiref-3.3.4.tar.gz", hash = "sha256:d1216dfbdfb63826470995d31caed36225dcaf34f182e0fa257a4dd9e86f1b78"},
]
atomicwrites = [
    {file = "atomicwrites-1.4.0-py2.py3-none-any.whl", hash = "sha256:6d1784dea7c0c8d4a5172b6c620f40b6e4cbfdf96d783691f2e1302a7b88e197"},
    {file = "atomicwrites-1.4.0.tar.gz", hash = "sha256:ae70396ad1a434f9c7046fd2dd196fc04b12f9e91ffb859164193be8b6168a7a"},
]
attrs = [
    {file = "attrs-21.2.0-py2.py3-none-any.whl", hash = "sha256:149e90d6d8ac20db7a955ad60cf0e6881a3f20d37096140088356da6c716b0b1"},
    {file = "attrs-21.2.0.tar.gz", hash = "sha256:ef6aaac3ca6cd92904cdd0d83f629a15f18053ec84e6432106f7a4d04ae4f5fb"},
]
beautifulsoup4 = [
    {file = "beautifulsoup4-4.9.3-py2-none-any.whl", hash = "sha256:4c98143716ef1cb40bf7f39a8e3eec8f8b009509e74904ba3a7b315431577e35"},
    {file = "beautifulsoup4-4.9.3-py3-none-any.whl", hash = "sha256:fff47e031e34ec82bf17e00da8f592fe7de69aeea38be00523c04623c04fb666"},

@@ -299,6 +396,30 @@ idna = [
    {file = "idna-2.10-py2.py3-none-any.whl", hash = "sha256:b97d804b1e9b523befed77c48dacec60e6dcb0b5391d57af6a65a312a90648c0"},
    {file = "idna-2.10.tar.gz", hash = "sha256:b307872f855b18632ce0c21c5e45be78c0ea7ae4c15c828c20788b26921eb3f6"},
]
iniconfig = [
    {file = "iniconfig-1.1.1-py2.py3-none-any.whl", hash = "sha256:011e24c64b7f47f6ebd835bb12a743f2fbe9a26d4cecaa7f53bc4f35ee9da8b3"},
    {file = "iniconfig-1.1.1.tar.gz", hash = "sha256:bc3af051d7d14b2ee5ef9969666def0cd1a000e121eaea580d4a313df4b37f32"},
]
packaging = [
    {file = "packaging-20.9-py2.py3-none-any.whl", hash = "sha256:67714da7f7bc052e064859c05c595155bd1ee9f69f76557e21f051443c20947a"},
    {file = "packaging-20.9.tar.gz", hash = "sha256:5b327ac1320dc863dca72f4514ecc086f31186744b84a230374cc1fd776feae5"},
]
pluggy = [
    {file = "pluggy-0.13.1-py2.py3-none-any.whl", hash = "sha256:966c145cd83c96502c3c3868f50408687b38434af77734af1e9ca461a4081d2d"},
    {file = "pluggy-0.13.1.tar.gz", hash = "sha256:15b2acde666561e1298d71b523007ed7364de07029219b604cf808bfa1c765b0"},
]
py = [
    {file = "py-1.10.0-py2.py3-none-any.whl", hash = "sha256:3b80836aa6d1feeaa108e046da6423ab8f6ceda6468545ae8d02d9d58d18818a"},
    {file = "py-1.10.0.tar.gz", hash = "sha256:21b81bda15b66ef5e1a777a21c4dcd9c20ad3efd0b3f817e7a809035269e1bd3"},
]
pyparsing = [
    {file = "pyparsing-2.4.7-py2.py3-none-any.whl", hash = "sha256:ef9d7589ef3c200abe66653d3f1ab1033c3c419ae9b9bdb1240a85b024efc88b"},
    {file = "pyparsing-2.4.7.tar.gz", hash = "sha256:c203ec8783bf771a155b207279b9bccb8dea02d8f0c9e5f8ead507bc3246ecc1"},
]
pytest = [
    {file = "pytest-6.2.4-py3-none-any.whl", hash = "sha256:91ef2131a9bd6be8f76f1f08eac5c5317221d6ad1e143ae03894b862e8976890"},
    {file = "pytest-6.2.4.tar.gz", hash = "sha256:50bcad0a0b9c5a72c8e4e7c9855a3ad496ca6a881a3641b4260605450772c54b"},
]
requests = [
    {file = "requests-2.25.1-py2.py3-none-any.whl", hash = "sha256:c210084e36a42ae6b9219e00e48287def368a26d03a048ddad7bfee44f75871e"},
    {file = "requests-2.25.1.tar.gz", hash = "sha256:27973dd4a904a4f13b263a19c866c13b92a39ed1c964655f025f3f8d3d75b804"},

pyproject.toml

@@ -15,6 +15,7 @@ ulid-py = "^1.1.0"
databases = {extras = ["sqlite"], version = "^0.4.3"}
toml = "^0.10.2"
uvicorn = "^0.14.0"
pytest = "^6.2.4"
[tool.poetry.dev-dependencies]

run

@@ -12,4 +12,4 @@ shift
# export DEBUG=1
# export UNWIND_LOGLEVEL=DEBUG
-exec scripts/"$task" "$@"
+exec "$RUN_DIR"/scripts/"$task" "$@"


@@ -1,5 +1,7 @@
#!/bin/sh -eu
cd "$RUN_DIR"
[ -z "${DEBUG:-}" ] || set -x
exec python -m unwind "$@"


@@ -1,5 +1,7 @@
#!/bin/sh -eu
cd "$RUN_DIR"
[ -z "${DEBUG:-}" ] || set -x
exec uvicorn unwind:web_app --reload

scripts/load_imdb_dumps (new executable file)

@@ -0,0 +1,18 @@
#!/bin/sh -eu
datadir="$RUN_DIR"/data
[ -z "${DEBUG:-}" ] || set -x
# See
# - https://www.imdb.com/interfaces/
# - https://datasets.imdbws.com/
wget -N \
    --no-directories \
    --directory-prefix "$datadir" \
    https://datasets.imdbws.com/title.basics.tsv.gz \
    https://datasets.imdbws.com/title.ratings.tsv.gz

"$RUN_BIN" app import-imdb-dataset \
    --basics "$datadir"/title.basics.tsv.gz \
    --ratings "$datadir"/title.ratings.tsv.gz


@@ -1,5 +1,7 @@
#!/bin/sh -eu
cd "$RUN_DIR"
[ -z "${DEBUG:-}" ] || set -x
exec uvicorn --host 0.0.0.0 unwind:web_app

scripts/tests (new executable file)

@@ -0,0 +1,7 @@
#!/bin/sh -eu
cd "$RUN_DIR"
[ -z "${DEBUG:-}" ] || set -x
exec python -m pytest "$@"

tests/test_imdb.py (new file)

@@ -0,0 +1,19 @@
import pytest

from unwind.imdb import imdb_rating_from_score, score_from_imdb_rating


@pytest.mark.parametrize("rating", (x / 10 for x in range(10, 101)))
def test_rating_conversion(rating):
    assert rating == imdb_rating_from_score(score_from_imdb_rating(rating))


@pytest.mark.parametrize("score", range(0, 101))
def test_score_conversion(score):
    # Because our score covers 101 discrete values and IMDb's rating only 91
    # discrete values, the mapping is non-injective, i.e. 10 values can't be
    # mapped uniquely.
    non_injective = set(range(5, 100, 10))
    if score in non_injective:
        pytest.skip(f"Score cannot be mapped back correctly: {score}")
    assert score == score_from_imdb_rating(imdb_rating_from_score(score))
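For reference, the conversion pair under test is added to unwind/imdb.py later in this commit. A short worked example of the arithmetic, with inlined copies of the two functions and values picked to show one of the ten collisions the skip above avoids:

def imdb_rating_from_score(score: int) -> float:  # inlined from unwind/imdb.py
    return round(score * 9 / 100 + 1, 1)

def score_from_imdb_rating(rating: float) -> int:  # inlined from unwind/imdb.py
    return round(100 * (rating - 1) / 9)

assert score_from_imdb_rating(7.3) == 70  # (7.3 - 1) / 9 * 100 = 70.0
assert imdb_rating_from_score(70) == 7.3  # 70 * 9 / 100 + 1 = 7.3
assert score_from_imdb_rating(imdb_rating_from_score(55)) == 56  # collision: 55 and 56 both map to 6.0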

unwind/__main__.py

@@ -1,15 +1,18 @@
import argparse
import asyncio
import logging
from pathlib import Path

from . import config
from .db import close_connection_pool, open_connection_pool
from .imdb import load_imdb
from .imdb_import import import_from_file
from .request import session

log = logging.getLogger(__name__)


-async def run_import():
+async def run_load_user_ratings_from_imdb():
    await open_connection_pool()
    with session() as s:

@@ -22,6 +25,60 @@ async def run_import():
    await close_connection_pool()
async def run_import_imdb_dataset(basics_path: Path, ratings_path: Path):
    await open_connection_pool()
    await import_from_file(basics_path=basics_path, ratings_path=ratings_path)
    await close_connection_pool()


def getargs():
    parser = argparse.ArgumentParser()
    commands = parser.add_subparsers(required=True)

    parser_import_imdb_dataset = commands.add_parser(
        "import-imdb-dataset",
        help="Import IMDb datasets.",
        description="""
            Import IMDb datasets.
            New datasets available from https://www.imdb.com/interfaces/.
        """,
    )
    parser_import_imdb_dataset.add_argument(
        dest="mode",
        action="store_const",
        const="import-imdb-dataset",
    )
    parser_import_imdb_dataset.add_argument(
        "--basics", metavar="basics_file.tsv.gz", type=Path, required=True
    )
    parser_import_imdb_dataset.add_argument(
        "--ratings", metavar="ratings_file.tsv.gz", type=Path, required=True
    )

    parser_load_user_ratings_from_imdb = commands.add_parser(
        "load-user-ratings-from-imdb",
        help="Load user ratings from imdb.com.",
        description="""
            Refresh user ratings for all registered users live from IMDb's website.
        """,
    )
    parser_load_user_ratings_from_imdb.add_argument(
        dest="mode",
        action="store_const",
        const="load-user-ratings-from-imdb",
    )

    try:
        args = parser.parse_args()
    except TypeError:
        parser.print_usage()
        raise
    return args
def main():
    logging.basicConfig(
        format="%(asctime)s.%(msecs)03d [%(name)s:%(process)d] %(levelname)s: %(message)s",

@@ -30,7 +87,15 @@ def main():
    )
    log.debug(f"Log level: {config.loglevel}")

-    asyncio.run(run_import())
+    try:
+        args = getargs()
+    except:
+        return
+    if args.mode == "load-user-ratings-from-imdb":
+        asyncio.run(run_load_user_ratings_from_imdb())
+    elif args.mode == "import-imdb-dataset":
+        asyncio.run(run_import_imdb_dataset(args.basics, args.ratings))


main()
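The positional store_const arguments above are what make args.mode come out of parse_args(); a standalone sketch of that pattern (subcommand name reused from the commit, everything else minimal):

import argparse

parser = argparse.ArgumentParser()
commands = parser.add_subparsers(required=True)
sub = commands.add_parser("import-imdb-dataset")
# A positional store_const consumes zero tokens and just records which
# subcommand ran; each subparser stores its own constant into args.mode.
sub.add_argument(dest="mode", action="store_const", const="import-imdb-dataset")

args = parser.parse_args(["import-imdb-dataset"])
assert args.mode == "import-imdb-dataset"
# With an empty argv, required subparsers can raise TypeError on some Python
# versions (bpo-29298), which is why getargs() above wraps parse_args().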

unwind/db.py

@@ -6,7 +6,7 @@ from typing import Optional, Type, TypeVar

from databases import Database

from . import config
-from .models import Movie, Rating, User, asplain, fromplain, utcnow
+from .models import Movie, Rating, User, asplain, fromplain, optional_fields

log = logging.getLogger(__name__)

@@ -69,10 +69,14 @@ ModelType = TypeVar("ModelType")

async def get(model: Type[ModelType], **kwds) -> Optional[ModelType]:
    values = {k: v for k, v in kwds.items() if v is not None}
    if not values:
        return
    fields_ = ", ".join(f.name for f in fields(model))
-    cond = " AND ".join(f"{k}=:{k}" for k in kwds)
+    cond = " AND ".join(f"{k}=:{k}" for k, v in values.items())
    query = f"SELECT {fields_} FROM {model._table} WHERE {cond}"
-    row = await shared_connection().fetch_one(query=query, values=kwds)
+    row = await shared_connection().fetch_one(query=query, values=values)
    return fromplain(model, row) if row else None
@@ -95,16 +99,28 @@ async def add_or_update_user(user: User):

async def add_or_update_movie(movie: Movie):
    """Add or update a Movie in the database.

    This is an upsert operation, but it will also update the Movie you pass
    in, making its `id` match the DB movie's `id` and setting any optional
    values that are unset on your Movie but present in the database. It's a
    bidirectional sync.
    """
    db_movie = await get(Movie, imdb_id=movie.imdb_id)
    if not db_movie:
        await add(movie)
    else:
        movie.id = db_movie.id
-        movie.updated = db_movie.updated
-        if movie != db_movie:
-            movie.updated = utcnow()
-            await update(movie)
+        # We want to keep any existing value in the DB for all optional fields.
+        for f in optional_fields(movie):
+            if getattr(movie, f.name) is None:
+                setattr(movie, f.name, getattr(db_movie, f.name))
+        if movie.updated <= db_movie.updated:
+            return
+        await update(movie)
async def add_or_update_rating(rating: Rating) -> bool: async def add_or_update_rating(rating: Rating) -> bool:
@@ -147,38 +163,48 @@ async def find_ratings(
        values["escape"] = "#"
        escaped_title = sql_escape(title, char=values["escape"])
        values["pattern"] = "%" + "%".join(escaped_title.split()) + "%"
-        conditions.append("movies.title LIKE :pattern ESCAPE :escape")
+        values["opattern"] = values["pattern"]
+        values["oescape"] = values["escape"]
+        conditions.append(
+            f"""
+            (
+                {Movie._table}.title LIKE :pattern ESCAPE :escape
+                OR {Movie._table}.original_title LIKE :opattern ESCAPE :oescape
+            )
+            """
+        )
    if media_type:
        values["media_type"] = media_type
-        conditions.append("movies.media_type=:media_type")
+        conditions.append(f"{Movie._table}.media_type=:media_type")
    if ignore_tv_episodes:
-        conditions.append("movies.media_type!='TV Episode'")
+        conditions.append(f"{Movie._table}.media_type!='TV Episode'")

    query = f"""
        WITH newest_movies
        AS (
-            SELECT DISTINCT ratings.movie_id
-            FROM ratings
-            LEFT JOIN movies ON movies.id=ratings.movie_id
+            SELECT DISTINCT {Rating._table}.movie_id
+            FROM {Rating._table}
+            LEFT JOIN {Movie._table} ON {Movie._table}.id={Rating._table}.movie_id
            {('WHERE ' + ' AND '.join(conditions)) if conditions else ''}
-            ORDER BY length(movies.title) ASC, ratings.rating_date DESC
+            ORDER BY length({Movie._table}.title) ASC, {Rating._table}.rating_date DESC
            LIMIT :limit_rows
        )
        SELECT
-            users.name AS user_name,
-            ratings.score AS user_score,
-            movies.score AS imdb_score,
-            movies.imdb_id AS movie_imdb_id,
-            movies.media_type AS media_type,
-            movies.title AS movie_title,
-            movies.release_year AS release_year
+            {User._table}.name AS user_name,
+            {Rating._table}.score AS user_score,
+            {Movie._table}.score AS imdb_score,
+            {Movie._table}.imdb_id AS movie_imdb_id,
+            {Movie._table}.media_type AS media_type,
+            {Movie._table}.title AS canonical_title,
+            {Movie._table}.original_title AS original_title,
+            {Movie._table}.release_year AS release_year
        FROM newest_movies
-        LEFT JOIN ratings ON ratings.movie_id=newest_movies.movie_id
-        LEFT JOIN users ON users.id=ratings.user_id
-        LEFT JOIN movies ON movies.id=ratings.movie_id
+        LEFT JOIN {Rating._table} ON {Rating._table}.movie_id=newest_movies.movie_id
+        LEFT JOIN {User._table} ON {User._table}.id={Rating._table}.user_id
+        LEFT JOIN {Movie._table} ON {Movie._table}.id={Rating._table}.movie_id
    """
    rows = await shared_connection().fetch_all(query=query, values=values)
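The docstring on add_or_update_movie describes a bidirectional sync; a minimal standalone sketch of just the merge rules (SimpleMovie and merge are hypothetical stand-ins, not part of the codebase):

from dataclasses import dataclass
from typing import Optional

@dataclass
class SimpleMovie:
    imdb_id: str
    title: str
    score: Optional[int] = None      # optional: keep the DB value if unset
    runtime: Optional[int] = None    # optional: keep the DB value if unset
    updated: int = 0                 # stands in for the `updated` timestamp

def merge(incoming: SimpleMovie, db_movie: SimpleMovie) -> bool:
    """Backfill unset optional fields, then decide whether to write."""
    for name in ("score", "runtime"):
        if getattr(incoming, name) is None:
            setattr(incoming, name, getattr(db_movie, name))
    return incoming.updated > db_movie.updated  # only write newer data

old = SimpleMovie("tt0111161", "The Shawshank Redemption", score=93, runtime=142, updated=1)
new = SimpleMovie("tt0111161", "The Shawshank Redemption", score=94, updated=2)
assert merge(new, old) is True
assert new.runtime == 142  # the runtime survived the update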

unwind/imdb.py

@@ -6,8 +6,8 @@ from typing import Optional
from urllib.parse import urljoin

from .db import add_or_update_movie, add_or_update_rating, add_or_update_user
-from .models import Movie, Rating, User, asplain, fromplain
-from .request import soup_from_url
+from .models import Movie, Rating, User
+from .request import cache_path, soup_from_url

log = logging.getLogger(__name__)
@@ -38,6 +38,26 @@ def imdb_url(user_id):
    return f"https://www.imdb.com/user/{user_id}/ratings"


def imdb_rating_from_score(score: int) -> float:
    """Return the IMDb rating for an Unwind Movie score."""
    assert 0 <= score <= 100
    rating = round(score * 9 / 100 + 1, 1)
    assert 1.0 <= rating <= 10.0
    return rating


def score_from_imdb_rating(rating: float) -> int:
    """Return the Unwind Movie score for an IMDb rating."""
    # Scale IMDb's 10-point rating to our score range of [0, 100].
    # There's a pitfall here! You might think this is simply IMDb's rating
    # times 10, *but* the lowest possible rating on IMDb is actually 1, so
    # only the range [1.0, 10.0] has to be spread over our 101 score values.
    assert 1.0 <= rating <= 10.0
    score = round(100 * (rating - 1) / 9)
    assert 0 <= score <= 100
    return score
find_name = re.compile(r"(?P<name>.*)'s Ratings").fullmatch
find_rating_date = re.compile(r"Rated on (?P<date>\d{2} \w+ \d{4})").fullmatch
find_runtime = re.compile(r"((?P<h>\d+) hr)? ?((?P<m>\d+) min)?").fullmatch

@@ -50,67 +70,88 @@ find_year = re.compile(
find_movie_id = re.compile(r"/title/(?P<id>tt\d+)/").search
def movie_and_rating_from_item(item) -> tuple[Movie, Rating]:
    movie = Movie(
        title=item.h3.a.string.strip(),
        genres=set(s.strip() for s in item.find("span", "genre").string.split(",")),
    )
    episode_br = item.h3.br
    if episode_br:
        episode_a = episode_br.find_next("a")
        if not episode_a:
            raise ValueError("Unknown document structure.")
        movie.media_type = "TV Episode"
        movie.title += " / " + episode_a.string.strip()
        if match := find_year(episode_br.find_next("span", "lister-item-year").string):
            movie.release_year = int(match["year"])
        if match := find_movie_id(episode_a["href"]):
            movie.imdb_id = match["id"]
    if (tag := item.find("span", "runtime")) and (match := find_runtime(tag.string)):
        movie.runtime = int(match["h"] or 0) * 60 + int(match["m"] or 0)
    if not episode_br:
        if match := find_year(item.h3.find("span", "lister-item-year").string):
            if media_type := match["type"]:
                movie.media_type = media_type.strip()
            movie.release_year = int(match["year"])
        if match := find_movie_id(item.h3.a["href"]):
            movie.imdb_id = match["id"]
    if not movie.media_type:
        movie.media_type = "Movie"

    rating = Rating()
    ratings_item = item.find("div", "ipl-rating-widget")
    if match := find_rating_date(ratings_item.find_next("p", "text-muted").string):
        rating.rating_date = datetime.strptime(match["date"], "%d %b %Y")
    if match := ratings_item.find("div", "ipl-rating-star--other-user"):
        if rating_item := match.find("span", "ipl-rating-star__rating"):
            rating.score = score_from_imdb_rating(float(rating_item.string))
    if match := ratings_item.find("div", "ipl-rating-star small"):
        if rating_item := match.find("span", "ipl-rating-star__rating"):
            movie.score = score_from_imdb_rating(float(rating_item.string))
    return movie, rating


ForgedRequest = namedtuple("ForgedRequest", "url headers")
async def parse_page(url, stop_on_dupe=True) -> Optional[str]:
    soup = soup_from_url(url)
-    user = User(imdb_id=soup.find("meta", property="pageId")["content"], name="")
-    if match := find_name(soup.h1.string):
+    meta = soup.find("meta", property="pageId")
+    headline = soup.h1
+    assert meta is not None and headline is not None
+    user = User(imdb_id=meta["content"], name="")
+    if match := find_name(headline.string):
        user.name = match["name"]
    await add_or_update_user(user)

    items = soup.find_all("div", "lister-item-content")
    for i, item in enumerate(items):
-        movie = Movie(
-            title=item.h3.a.string.strip(),
-            genres=set(s.strip() for s in item.find("span", "genre").string.split(",")),
-        )
-        episode_br = item.h3.br
-        if episode_br:
-            episode_a = episode_br.find_next("a")
-            if not episode_a:
-                log.error("Unknown document structure.")
-                continue
-            movie.media_type = "TV Episode"
-            movie.title += " / " + episode_a.string.strip()
-            if match := find_year(
-                episode_br.find_next("span", "lister-item-year").string
-            ):
-                movie.release_year = int(match["year"])
-            if match := find_movie_id(episode_a["href"]):
-                movie.imdb_id = match["id"]
-        rating = Rating(user_id=user.id)
-        if (tag := item.find("span", "runtime")) and (
-            match := find_runtime(tag.string)
-        ):
-            movie.runtime = int(match["h"] or 0) * 60 + int(match["m"] or 0)
-        if not episode_br:
-            if match := find_year(item.h3.find("span", "lister-item-year").string):
-                if media_type := match["type"]:
-                    movie.media_type = media_type.strip()
-                movie.release_year = int(match["year"])
-            if match := find_movie_id(item.h3.a["href"]):
-                movie.imdb_id = match["id"]
-        ratings_item = item.find("div", "ipl-rating-widget")
-        if match := find_rating_date(ratings_item.find_next("p", "text-muted").string):
-            rating.rating_date = datetime.strptime(match["date"], "%d %b %Y")
-        for rating_item in ratings_item.find_all("span", "ipl-rating-star__rating")[:2]:
-            if "ipl-rating-star--other-user" in rating_item.parent["class"]:
-                rating.score = int(float(rating_item.string) * 10)
-            else:
-                movie.score = int(float(rating_item.string) * 10)
-        if not movie.media_type:
-            movie.media_type = "Movie"
+        try:
+            movie, rating = movie_and_rating_from_item(item)
+        except Exception as err:
+            log.error(
+                "Error in %s item #%s (%s): %s: %s",
+                url,
+                i,
+                cache_path(ForgedRequest(url, headers={})),
+                " ".join(item.h3.stripped_strings),
+                err,
+            )
+            continue
        await add_or_update_movie(movie)
+        rating.user_id = user.id
        rating.movie_id = movie.id  # needs to be set _after_ movie has been updated
        is_updated = await add_or_update_rating(rating)

@@ -118,9 +159,9 @@ async def parse_page(url, stop_on_dupe=True) -> Optional[str]:
        log.info("Import stopped after %s items. Caught up to known state. ✋", i)
        return None

-    next_url = urljoin(
-        url, soup.find("div", "footer").find(string=re.compile(r"Next")).parent["href"]
-    )
+    footer = soup.find("div", "footer")
+    assert footer is not None
+    next_url = urljoin(url, footer.find(string=re.compile(r"Next")).parent["href"])
    return next_url if url != next_url else None
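The ForgedRequest namedtuple above exists only so the error log can name the cache file without building a real PreparedRequest; cache_path (added to unwind/request.py below) reads nothing but .url, since headers are commented out of the signature. A sketch of why that duck-typing works, with a made-up cachedir:

from collections import namedtuple
from hashlib import md5
from pathlib import Path

ForgedRequest = namedtuple("ForgedRequest", "url headers")

def cache_path(req, cachedir=Path("/tmp/unwind-cache")):  # cachedir invented
    sig = repr(req.url)  # headers deliberately left out of the signature
    return cachedir / md5(sig.encode()).hexdigest()

req = ForgedRequest("https://www.imdb.com/user/ur0000000/ratings", headers={})
print(cache_path(req))  # the path http_get would have cached this URL under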

unwind/imdb_import.py (new file)

@@ -0,0 +1,206 @@
import csv
import gzip
import logging
from dataclasses import dataclass, fields
from datetime import datetime, timezone
from pathlib import Path
from typing import Optional, get_origin

from . import db
from .db import add_or_update_movie
from .imdb import score_from_imdb_rating
from .models import Movie, optional_type

log = logging.getLogger(__name__)

# See
# - https://www.imdb.com/interfaces/
# - https://datasets.imdbws.com/
@dataclass
class BasicRow:
    tconst: str
    titleType: str
    primaryTitle: str
    originalTitle: str
    isAdult: bool
    startYear: Optional[int]
    endYear: Optional[int]
    runtimeMinutes: Optional[int]
    genres: Optional[set[str]]

    @classmethod
    def from_row(cls, row):
        vals = []
        for f, r in zip(fields(cls), row):
            ttype = f.type
            is_opt = False
            if (otype := optional_type(ttype)) is not None:
                ttype = otype
                is_opt = True
            if (otype := get_origin(ttype)) is not None:
                ttype = otype
            if r == r"\N":
                if is_opt:
                    vals.append(None)
                else:
                    raise ValueError(f"Unexpected null value for field: {f.name}")
            elif f.name == "genres":
                vals.append(set(r.split(",")))
            elif f.name == "isAdult":
                assert r in "01"
                vals.append(r == "1")
            else:
                vals.append(ttype(r))
        inst = cls(*vals)
        assert inst.titleType in title_types
        return inst

    def as_movie(self):
        assert self.startYear is not None
        return Movie(
            title=self.primaryTitle,
            original_title=self.originalTitle,
            release_year=self.startYear,
            media_type=title_types[self.titleType],
            imdb_id=self.tconst,
            score=None,
            runtime=self.runtimeMinutes,
            genres=self.genres or set(),
        )
@dataclass
class RatingRow:
    tconst: str
    averageRating: float
    numVotes: int

    @classmethod
    def from_row(cls, row):
        inst = cls(*(f.type(r) for f, r in zip(fields(cls), row)))
        assert inst.tconst != r"\N"
        return inst

    def as_movie(self):
        return Movie(
            imdb_id=self.tconst,
            score=score_from_imdb_rating(self.averageRating),
        )


title_types = {
    "movie": "Movie",
    "radioEpisode": "Radio Episode",
    "radioSeries": "Radio Series",
    "short": "Short",
    "tvEpisode": "TV Episode",
    "tvMiniSeries": "TV Mini Series",
    "tvMovie": "TV Movie",
    "tvSeries": "TV Series",
    "tvShort": "TV Short",
    "tvSpecial": "TV Special",
    "video": "Video",
    "videoGame": "Video Game",
}
def gz_mtime(path) -> datetime:
    """Return the timestamp of the compressed file."""
    g = gzip.GzipFile(path, "rb")
    g.peek(1)  # start reading the file to fill the timestamp field
    assert g.mtime is not None
    # Interpret the raw epoch seconds as UTC directly; converting to local
    # time first and then stamping UTC on would shift the timestamp.
    return datetime.fromtimestamp(g.mtime, tz=timezone.utc)


def count_lines(path) -> int:
    i = 0
    with gzip.open(path, "rt") as f:
        for i, _ in enumerate(f, start=1):
            pass
    return i


def read_imdb_tsv(path, row_type):
    with gzip.open(path, "rt", newline="") as f:
        rows = csv.reader(f, delimiter="\t", quoting=csv.QUOTE_NONE)
        # skip header line
        rows = iter(rows)
        header = next(rows)
        try:
            assert tuple(f.name for f in fields(row_type)) == tuple(header)
        except AssertionError:
            log.error("Unexpected header line: %s", header)
            raise
        for i, row in enumerate(rows, start=1):
            try:
                yield row_type.from_row(row)
            except Exception as err:
                log.error("Error in line %s: %s", i, row, exc_info=err)
                raise


def read_ratings(path):
    mtime = gz_mtime(path)
    rows = read_imdb_tsv(path, RatingRow)
    for row in rows:
        m = row.as_movie()
        m.updated = mtime
        yield m


def read_basics(path):
    mtime = gz_mtime(path)
    rows = read_imdb_tsv(path, BasicRow)
    for row in rows:
        if row.startYear is None:
            log.debug("Skipping movie, missing year: %s", row)
            continue
        m = row.as_movie()
        m.updated = mtime
        yield m
async def import_from_file(basics_path: Path, ratings_path: Path):
    log.info("Loading scores ... 💾")
    scores = {m.imdb_id: m.score for m in read_ratings(ratings_path)}

    log.info("Importing movies ... 💾")
    total = count_lines(basics_path)
    assert total != 0
    perc = 0.0
    perc_step = 0.001
    async with db.shared_connection().transaction():
        for i, m in enumerate(read_basics(basics_path)):
            if i / total > perc:
                log.info("Imported %s%%", round(perc * 100, 1))
                perc += perc_step
            if m.media_type not in {
                "Movie",
                "Short",
                "TV Mini Series",
                "TV Movie",
                "TV Series",
                "TV Short",
                "TV Special",
                "Video",
            }:
                log.debug("Skipping movie, unwanted media type: %s", m.media_type)
                continue
            m.score = scores.get(m.imdb_id)
            await add_or_update_movie(m)
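read_imdb_tsv only needs a gzipped, tab-separated file whose header matches the row type's field names; a self-contained smoke test for the reader above (the temp path and the two sample rows are invented):

import gzip
import tempfile
from pathlib import Path

sample = (
    "tconst\taverageRating\tnumVotes\n"
    "tt0000001\t5.7\t1986\n"
    "tt0000002\t5.8\t265\n"
)
path = Path(tempfile.mkdtemp()) / "title.ratings.tsv.gz"
with gzip.open(path, "wt", newline="") as f:
    f.write(sample)

# Header validation and per-row conversion both happen inside the generator.
rows = list(read_imdb_tsv(path, RatingRow))
assert rows[0].averageRating == 5.7 and rows[1].numVotes == 265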


@@ -1,31 +1,32 @@
PRAGMA foreign_keys = ON;;

CREATE TABLE IF NOT EXISTS users (
-    id TEXT NOT NULL PRIMARY KEY,
+    id TEXT PRIMARY KEY NOT NULL,
    imdb_id TEXT NOT NULL UNIQUE,
    name TEXT NOT NULL
);;

CREATE TABLE IF NOT EXISTS movies (
-    id TEXT NOT NULL PRIMARY KEY,
+    id TEXT PRIMARY KEY NOT NULL,
    title TEXT NOT NULL,
-    release_year NUMBER NOT NULL,
+    original_title TEXT,
+    release_year INTEGER NOT NULL,
    media_type TEXT NOT NULL,
    imdb_id TEXT NOT NULL UNIQUE,
-    score NUMBER NOT NULL,
-    runtime NUMBER,
+    score INTEGER,
+    runtime INTEGER,
    genres TEXT NOT NULL,
    updated TEXT NOT NULL
);;

CREATE TABLE IF NOT EXISTS ratings (
-    id TEXT NOT NULL PRIMARY KEY,
+    id TEXT PRIMARY KEY NOT NULL,
    movie_id TEXT NOT NULL,
    user_id TEXT NOT NULL,
-    score NUMBER NOT NULL,
+    score INTEGER NOT NULL,
    rating_date TEXT NOT NULL,
-    favorite NUMBER,
-    finished NUMBER,
+    favorite BOOL,
+    finished BOOL,
    FOREIGN KEY(movie_id) REFERENCES movies(id),
    FOREIGN KEY(user_id) REFERENCES users(id)
);;
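The ;; separators suggest the schema is split into single statements before execution rather than run through executescript(); a sketch under that assumption, which also shows that movies.score is now nullable:

import sqlite3

schema = open("schema.sql").read()  # hypothetical path to the file above
con = sqlite3.connect(":memory:")
for statement in schema.split(";;"):  # assumed split convention, not confirmed
    if statement.strip():
        con.execute(statement)

# movies.score lost its NOT NULL, so dataset rows without a rating insert fine.
# The id is really a ULID in the app; 'movie-0001' is just a stand-in here.
con.execute(
    "INSERT INTO movies (id, title, release_year, media_type, imdb_id, genres, updated)"
    " VALUES ('movie-0001', 'Example', 2021, 'Movie', 'tt9999999', '[]', '2021-06-21')"
)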

unwind/models.py

@@ -1,5 +1,5 @@
import json
-from dataclasses import asdict, dataclass, field, fields, is_dataclass
+from dataclasses import asdict, dataclass, field, fields
from datetime import datetime, timezone
from typing import Any, ClassVar, Optional, Type, Union, get_args, get_origin

@@ -25,6 +25,12 @@ def optional_type(tp: Type):
    return args[0]


def optional_fields(o):
    for f in fields(o):
        if is_optional(f.type):
            yield f
def asplain(o) -> dict[str, Any]:
    validate(o)

@@ -56,9 +62,6 @@ def asplain(o) -> dict[str, Any]:

def fromplain(cls, d: dict[str, Any]):
-    # if not is_dataclass(cls):
-    #     raise TypeError(f'Not a dataclass: {type(cls)}')
    dd = {}
    for f in fields(cls):
@@ -107,11 +110,14 @@ class Movie:
    _table: ClassVar[str] = "movies"

    id: ULID = field(default_factory=ULID)
-    title: str = None  # canonical title
+    title: str = None  # canonical title (usually English)
+    original_title: Optional[str] = None  # original title (usually transcribed to Latin script)
    release_year: int = None  # canonical release date
-    media_type: Optional[str] = None
+    media_type: str = None
    imdb_id: str = None
-    score: int = None  # range: [0,100]
+    score: Optional[int] = None  # range: [0,100]
    runtime: Optional[int] = None  # minutes
    genres: set[str] = None
    updated: datetime = field(default_factory=utcnow)
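optional_fields is what add_or_update_movie iterates to backfill unset values; a sketch of the Optional-detection it relies on (is_optional is reconstructed as the obvious counterpart of optional_type, and MiniMovie is a trimmed, hypothetical stand-in for Movie):

from dataclasses import dataclass, fields
from typing import Optional, Union, get_args, get_origin

def is_optional(tp) -> bool:
    # Optional[X] is Union[X, None], so check for a Union containing NoneType.
    return get_origin(tp) is Union and type(None) in get_args(tp)

def optional_fields(o):
    for f in fields(o):
        if is_optional(f.type):
            yield f

@dataclass
class MiniMovie:  # hypothetical stand-in for the Movie model above
    title: str = None
    original_title: Optional[str] = None
    score: Optional[int] = None
    runtime: Optional[int] = None

assert [f.name for f in optional_fields(MiniMovie())] == [
    "original_title", "score", "runtime"
]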

unwind/request.py

@@ -5,9 +5,10 @@ from contextlib import contextmanager
from dataclasses import dataclass
from functools import wraps
from hashlib import md5
from pathlib import Path
from random import random
from time import sleep, time
-from typing import Callable
+from typing import Callable, Optional

import bs4
import requests

@@ -130,16 +131,19 @@ class RedirectError(RuntimeError):
        super().__init__(f"Redirected: {from_url} -> {to_url}")


def cache_path(req) -> Optional[Path]:
    if not config.cachedir:
        return
    sig = repr(req.url)  # + repr(sorted(req.headers.items()))
    return config.cachedir / md5(sig.encode()).hexdigest()
@throttle(1, 1, random)
def http_get(s: requests.Session, url: str, *args, **kwds) -> requests.Response:
    req = s.prepare_request(requests.Request("GET", url, *args, **kwds))
-    if config.debug and config.cachedir:
-        sig = repr(req.url)  # + repr(sorted(req.headers.items()))
-        cachefile = config.cachedir / md5(sig.encode()).hexdigest()
-    else:
-        cachefile = None
+    cachefile = cache_path(req) if config.debug else None
    if cachefile:
        if cachefile.exists():


@@ -1,11 +1,42 @@
-from collections import defaultdict
+import base64
+import binascii

from starlette.applications import Starlette
from starlette.authentication import (
    AuthCredentials,
    AuthenticationBackend,
    AuthenticationError,
    SimpleUser,
    UnauthenticatedUser,
    requires,
)
from starlette.middleware import Middleware
from starlette.middleware.authentication import AuthenticationMiddleware
from starlette.responses import JSONResponse
-from starlette.routing import Route
+from starlette.routing import Mount, Route

-from . import config
+from . import config, db
from .db import close_connection_pool, find_ratings, open_connection_pool
from .models import Movie, asplain
class BasicAuthBackend(AuthenticationBackend):
    async def authenticate(self, request):
        if "Authorization" not in request.headers:
            return
        auth = request.headers["Authorization"]
        try:
            scheme, credentials = auth.split()
            if scheme.lower() != "basic":
                return
            decoded = base64.b64decode(credentials).decode("ascii")
        except (ValueError, UnicodeDecodeError, binascii.Error) as exc:
            raise AuthenticationError("Invalid basic auth credentials") from exc
        username, _, password = decoded.partition(":")
        # TODO: You'd want to verify the username and password here.
        return AuthCredentials(["authenticated"]), SimpleUser(username)
def imdb_url(imdb_id: str):

@@ -29,7 +60,8 @@ async def ratings(request):
        mov = aggr.setdefault(
            r["movie_imdb_id"],
            {
-                "title": r["movie_title"],
+                "canonical_title": r["canonical_title"],
+                "original_title": r["original_title"],
                "year": r["release_year"],
                "link": imdb_url(r["movie_imdb_id"]),
                "user_scores": [],

@@ -44,10 +76,69 @@ async def ratings(request):
    return JSONResponse(resp)
not_found = JSONResponse({"error": "Not Found"}, status_code=404)


async def get_movies(request):
    imdb_id = request.query_params.get("imdb_id")
    movie = await db.get(Movie, imdb_id=imdb_id)
    resp = [asplain(movie)] if movie else []
    return JSONResponse(resp)


@requires(["authenticated", "admin"])
async def add_movie(request):
    pass


@requires(["authenticated", "admin"])
async def add_user(request):
    pass


async def ratings_for_user(request):
    request.path_params["user_id"]


@requires("authenticated")
async def set_rating_for_user(request):
    request.path_params["user_id"]


@requires(["authenticated", "admin"])
async def add_group(request):
    pass


@requires(["authenticated", "admin"])
async def add_user_to_group(request):
    request.path_params["group_id"]


async def get_ratings_for_group(request):
    request.path_params["group_id"]
app = Starlette(
    on_startup=[open_connection_pool],
    on_shutdown=[close_connection_pool],
    routes=[
-        Route("/ratings", ratings),
+        Mount(
+            "/api/v1",
+            routes=[
+                Route("/ratings", ratings),  # XXX legacy, remove.
+                Route("/movies", get_movies),
+                Route("/movies", add_movie, methods=["POST"]),
+                Route("/users", add_user, methods=["POST"]),
+                Route("/users/{user_id}/ratings", ratings_for_user),
+                Route("/users/{user_id}/ratings", set_rating_for_user, methods=["PUT"]),
+                Route("/groups", add_group, methods=["POST"]),
+                Route("/groups/{group_id}/users", add_user_to_group, methods=["POST"]),
+                Route("/groups/{group_id}/ratings", get_ratings_for_group),
+            ],
+        ),
    ],
+    middleware=[Middleware(AuthenticationMiddleware, backend=BasicAuthBackend())],
)
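With the AuthenticationMiddleware in place, protected handlers expect a standard Basic Authorization header; a client-side sketch against the new /api/v1 mount (host, port, user, and password are invented, and the write handlers above are still stubs):

import requests

base = "http://localhost:8000/api/v1"  # assumed uvicorn default port

# Unauthenticated read; get_movies() above already answers this one.
resp = requests.get(f"{base}/movies", params={"imdb_id": "tt0111161"})
print(resp.json())  # [] until the movie exists in the DB

# requests' auth= builds the "Authorization: Basic <base64>" header that
# BasicAuthBackend.authenticate() decodes; note the backend does not verify
# the password yet (see the TODO), and set_rating_for_user is still a stub.
resp = requests.put(
    f"{base}/users/someuser/ratings",
    auth=("someuser", "any-password-for-now"),
    json={"imdb_id": "tt0111161", "score": 95},
)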