add imdb full import mode
parent b5cb22822e
commit 7dd10f8bc3

17 changed files with 721 additions and 109 deletions
poetry.lock (generated, 123 lines changed)

@@ -20,6 +20,28 @@ python-versions = ">=3.6"
 [package.extras]
 tests = ["pytest", "pytest-asyncio", "mypy (>=0.800)"]
 
+[[package]]
+name = "atomicwrites"
+version = "1.4.0"
+description = "Atomic file writes."
+category = "main"
+optional = false
+python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*"
+
+[[package]]
+name = "attrs"
+version = "21.2.0"
+description = "Classes Without Boilerplate"
+category = "main"
+optional = false
+python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*"
+
+[package.extras]
+dev = ["coverage[toml] (>=5.0.2)", "hypothesis", "pympler", "pytest (>=4.3.0)", "six", "mypy", "pytest-mypy-plugins", "zope.interface", "furo", "sphinx", "sphinx-notfound-page", "pre-commit"]
+docs = ["furo", "sphinx", "zope.interface", "sphinx-notfound-page"]
+tests = ["coverage[toml] (>=5.0.2)", "hypothesis", "pympler", "pytest (>=4.3.0)", "six", "mypy", "pytest-mypy-plugins", "zope.interface"]
+tests_no_zope = ["coverage[toml] (>=5.0.2)", "hypothesis", "pympler", "pytest (>=4.3.0)", "six", "mypy", "pytest-mypy-plugins"]
+
 [[package]]
 name = "beautifulsoup4"
 version = "4.9.3"

@@ -122,6 +144,73 @@ category = "main"
 optional = false
 python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*"
 
+[[package]]
+name = "iniconfig"
+version = "1.1.1"
+description = "iniconfig: brain-dead simple config-ini parsing"
+category = "main"
+optional = false
+python-versions = "*"
+
+[[package]]
+name = "packaging"
+version = "20.9"
+description = "Core utilities for Python packages"
+category = "main"
+optional = false
+python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*"
+
+[package.dependencies]
+pyparsing = ">=2.0.2"
+
+[[package]]
+name = "pluggy"
+version = "0.13.1"
+description = "plugin and hook calling mechanisms for python"
+category = "main"
+optional = false
+python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*"
+
+[package.extras]
+dev = ["pre-commit", "tox"]
+
+[[package]]
+name = "py"
+version = "1.10.0"
+description = "library with cross-python path, ini-parsing, io, code, log facilities"
+category = "main"
+optional = false
+python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*"
+
+[[package]]
+name = "pyparsing"
+version = "2.4.7"
+description = "Python parsing module"
+category = "main"
+optional = false
+python-versions = ">=2.6, !=3.0.*, !=3.1.*, !=3.2.*"
+
+[[package]]
+name = "pytest"
+version = "6.2.4"
+description = "pytest: simple powerful testing with Python"
+category = "main"
+optional = false
+python-versions = ">=3.6"
+
+[package.dependencies]
+atomicwrites = {version = ">=1.0", markers = "sys_platform == \"win32\""}
+attrs = ">=19.2.0"
+colorama = {version = "*", markers = "sys_platform == \"win32\""}
+iniconfig = "*"
+packaging = "*"
+pluggy = ">=0.12,<1.0.0a1"
+py = ">=1.8.2"
+toml = "*"
+
+[package.extras]
+testing = ["argcomplete", "hypothesis (>=3.56)", "mock", "nose", "requests", "xmlschema"]
+
 [[package]]
 name = "requests"
 version = "2.25.1"

@@ -251,7 +340,7 @@ python-versions = "*"
 [metadata]
 lock-version = "1.1"
 python-versions = "^3.9"
-content-hash = "28c14ec611e61db259fa6aa160df99308f7452874f69377a634d07cd379603c8"
+content-hash = "2aed53c1d20035335cf0c0e6439f65369519cd8ea47ae2e6043e49767106ffb0"
 
 [metadata.files]
 aiosqlite = [

@@ -262,6 +351,14 @@ asgiref = [
     {file = "asgiref-3.3.4-py3-none-any.whl", hash = "sha256:92906c611ce6c967347bbfea733f13d6313901d54dcca88195eaeb52b2a8e8ee"},
     {file = "asgiref-3.3.4.tar.gz", hash = "sha256:d1216dfbdfb63826470995d31caed36225dcaf34f182e0fa257a4dd9e86f1b78"},
 ]
+atomicwrites = [
+    {file = "atomicwrites-1.4.0-py2.py3-none-any.whl", hash = "sha256:6d1784dea7c0c8d4a5172b6c620f40b6e4cbfdf96d783691f2e1302a7b88e197"},
+    {file = "atomicwrites-1.4.0.tar.gz", hash = "sha256:ae70396ad1a434f9c7046fd2dd196fc04b12f9e91ffb859164193be8b6168a7a"},
+]
+attrs = [
+    {file = "attrs-21.2.0-py2.py3-none-any.whl", hash = "sha256:149e90d6d8ac20db7a955ad60cf0e6881a3f20d37096140088356da6c716b0b1"},
+    {file = "attrs-21.2.0.tar.gz", hash = "sha256:ef6aaac3ca6cd92904cdd0d83f629a15f18053ec84e6432106f7a4d04ae4f5fb"},
+]
 beautifulsoup4 = [
     {file = "beautifulsoup4-4.9.3-py2-none-any.whl", hash = "sha256:4c98143716ef1cb40bf7f39a8e3eec8f8b009509e74904ba3a7b315431577e35"},
     {file = "beautifulsoup4-4.9.3-py3-none-any.whl", hash = "sha256:fff47e031e34ec82bf17e00da8f592fe7de69aeea38be00523c04623c04fb666"},

@@ -299,6 +396,30 @@ idna = [
     {file = "idna-2.10-py2.py3-none-any.whl", hash = "sha256:b97d804b1e9b523befed77c48dacec60e6dcb0b5391d57af6a65a312a90648c0"},
     {file = "idna-2.10.tar.gz", hash = "sha256:b307872f855b18632ce0c21c5e45be78c0ea7ae4c15c828c20788b26921eb3f6"},
 ]
+iniconfig = [
+    {file = "iniconfig-1.1.1-py2.py3-none-any.whl", hash = "sha256:011e24c64b7f47f6ebd835bb12a743f2fbe9a26d4cecaa7f53bc4f35ee9da8b3"},
+    {file = "iniconfig-1.1.1.tar.gz", hash = "sha256:bc3af051d7d14b2ee5ef9969666def0cd1a000e121eaea580d4a313df4b37f32"},
+]
+packaging = [
+    {file = "packaging-20.9-py2.py3-none-any.whl", hash = "sha256:67714da7f7bc052e064859c05c595155bd1ee9f69f76557e21f051443c20947a"},
+    {file = "packaging-20.9.tar.gz", hash = "sha256:5b327ac1320dc863dca72f4514ecc086f31186744b84a230374cc1fd776feae5"},
+]
+pluggy = [
+    {file = "pluggy-0.13.1-py2.py3-none-any.whl", hash = "sha256:966c145cd83c96502c3c3868f50408687b38434af77734af1e9ca461a4081d2d"},
+    {file = "pluggy-0.13.1.tar.gz", hash = "sha256:15b2acde666561e1298d71b523007ed7364de07029219b604cf808bfa1c765b0"},
+]
+py = [
+    {file = "py-1.10.0-py2.py3-none-any.whl", hash = "sha256:3b80836aa6d1feeaa108e046da6423ab8f6ceda6468545ae8d02d9d58d18818a"},
+    {file = "py-1.10.0.tar.gz", hash = "sha256:21b81bda15b66ef5e1a777a21c4dcd9c20ad3efd0b3f817e7a809035269e1bd3"},
+]
+pyparsing = [
+    {file = "pyparsing-2.4.7-py2.py3-none-any.whl", hash = "sha256:ef9d7589ef3c200abe66653d3f1ab1033c3c419ae9b9bdb1240a85b024efc88b"},
+    {file = "pyparsing-2.4.7.tar.gz", hash = "sha256:c203ec8783bf771a155b207279b9bccb8dea02d8f0c9e5f8ead507bc3246ecc1"},
+]
+pytest = [
+    {file = "pytest-6.2.4-py3-none-any.whl", hash = "sha256:91ef2131a9bd6be8f76f1f08eac5c5317221d6ad1e143ae03894b862e8976890"},
+    {file = "pytest-6.2.4.tar.gz", hash = "sha256:50bcad0a0b9c5a72c8e4e7c9855a3ad496ca6a881a3641b4260605450772c54b"},
+]
 requests = [
     {file = "requests-2.25.1-py2.py3-none-any.whl", hash = "sha256:c210084e36a42ae6b9219e00e48287def368a26d03a048ddad7bfee44f75871e"},
     {file = "requests-2.25.1.tar.gz", hash = "sha256:27973dd4a904a4f13b263a19c866c13b92a39ed1c964655f025f3f8d3d75b804"},
pyproject.toml

@@ -15,6 +15,7 @@ ulid-py = "^1.1.0"
 databases = {extras = ["sqlite"], version = "^0.4.3"}
 toml = "^0.10.2"
 uvicorn = "^0.14.0"
+pytest = "^6.2.4"
 
 [tool.poetry.dev-dependencies]
run (2 lines changed)

@@ -12,4 +12,4 @@ shift
 # export DEBUG=1
 # export UNWIND_LOGLEVEL=DEBUG
 
-exec scripts/"$task" "$@"
+exec "$RUN_DIR"/scripts/"$task" "$@"
@@ -1,5 +1,7 @@
 #!/bin/sh -eu
 
+cd "$RUN_DIR"
+
 [ -z "${DEBUG:-}" ] || set -x
 
 exec python -m unwind "$@"
@@ -1,5 +1,7 @@
 #!/bin/sh -eu
 
+cd "$RUN_DIR"
+
 [ -z "${DEBUG:-}" ] || set -x
 
 exec uvicorn unwind:web_app --reload
scripts/load_imdb_dumps (new executable file, 18 lines)

@@ -0,0 +1,18 @@
+#!/bin/sh -eu
+
+datadir="$RUN_DIR"/data
+
+[ -z "${DEBUG:-}" ] || set -x
+
+# See
+# - https://www.imdb.com/interfaces/
+# - https://datasets.imdbws.com/
+
+wget -N \
+    --no-directories \
+    --directory-prefix "$datadir" \
+    https://datasets.imdbws.com/title.basics.tsv.gz \
+    https://datasets.imdbws.com/title.ratings.tsv.gz
+"$RUN_BIN" app import-imdb-dataset \
+    --basics "$datadir"/title.basics.tsv.gz \
+    --ratings "$datadir"/title.ratings.tsv.gz
@@ -1,5 +1,7 @@
 #!/bin/sh -eu
 
+cd "$RUN_DIR"
+
 [ -z "${DEBUG:-}" ] || set -x
 
 exec uvicorn --host 0.0.0.0 unwind:web_app
scripts/tests (new executable file, 7 lines)

@@ -0,0 +1,7 @@
+#!/bin/sh -eu
+
+cd "$RUN_DIR"
+
+[ -z "${DEBUG:-}" ] || set -x
+
+exec python -m pytest "$@"
tests/test_imdb.py (new file, 19 lines)

@@ -0,0 +1,19 @@
+import pytest
+from unwind.imdb import imdb_rating_from_score, score_from_imdb_rating
+
+
+@pytest.mark.parametrize("rating", (x / 10 for x in range(10, 101)))
+def test_rating_conversion(rating):
+    assert rating == imdb_rating_from_score(score_from_imdb_rating(rating))
+
+
+@pytest.mark.parametrize("score", range(0, 101))
+def test_score_conversion(score):
+    # Because our score covers 101 discrete values and IMDb's rating only 91
+    # discrete values, the mapping is non-injective, i.e. 10 values can't be
+    # mapped uniquely.
+    non_injective = set(range(5, 100, 10))
+    if score in non_injective:
+        pytest.skip(f"Score cannot be mapped back correctly: {score}")
+
+    assert score == score_from_imdb_rating(imdb_rating_from_score(score))
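Why exactly ten scores are skipped: IMDb ratings carry one decimal, so [1.0, 10.0] holds 91 representable values against 101 integer scores in [0, 100]. A score ending in 5 lands exactly halfway between two representable ratings (45 maps to 45 * 9 / 100 + 1 = 5.05), and whichever neighbour round() picks comes back one off: 5.0 gives 44, 5.1 gives 46. A standalone sketch of that property, using only the two functions under test:

    # Sketch: every score in range(5, 100, 10) round-trips to one of its
    # immediate neighbours, never to itself (the exact neighbour depends
    # on how float rounding resolves the halfway case).
    from unwind.imdb import imdb_rating_from_score, score_from_imdb_rating

    for score in range(5, 100, 10):
        back = score_from_imdb_rating(imdb_rating_from_score(score))
        assert back in (score - 1, score + 1)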
unwind/__main__.py

@@ -1,15 +1,18 @@
+import argparse
 import asyncio
 import logging
+from pathlib import Path
 
 from . import config
 from .db import close_connection_pool, open_connection_pool
 from .imdb import load_imdb
+from .imdb_import import import_from_file
 from .request import session
 
 log = logging.getLogger(__name__)
 
 
-async def run_import():
+async def run_load_user_ratings_from_imdb():
     await open_connection_pool()
 
     with session() as s:

@@ -22,6 +25,60 @@ async def run_import():
     await close_connection_pool()
 
 
+async def run_import_imdb_dataset(basics_path: Path, ratings_path: Path):
+    await open_connection_pool()
+
+    await import_from_file(basics_path=basics_path, ratings_path=ratings_path)
+
+    await close_connection_pool()
+
+
+def getargs():
+    parser = argparse.ArgumentParser()
+    commands = parser.add_subparsers(required=True)
+
+    parser_import_imdb_dataset = commands.add_parser(
+        "import-imdb-dataset",
+        help="Import IMDb datasets.",
+        description="""
+        Import IMDb datasets.
+        New datasets available from https://www.imdb.com/interfaces/.
+        """,
+    )
+    parser_import_imdb_dataset.add_argument(
+        dest="mode",
+        action="store_const",
+        const="import-imdb-dataset",
+    )
+    parser_import_imdb_dataset.add_argument(
+        "--basics", metavar="basics_file.tsv.gz", type=Path, required=True
+    )
+    parser_import_imdb_dataset.add_argument(
+        "--ratings", metavar="ratings_file.tsv.gz", type=Path, required=True
+    )
+
+    parser_load_user_ratings_from_imdb = commands.add_parser(
+        "load-user-ratings-from-imdb",
+        help="Load user ratings from imdb.com.",
+        description="""
+        Refresh user ratings for all registered users live from IMDb's website.
+        """,
+    )
+    parser_load_user_ratings_from_imdb.add_argument(
+        dest="mode",
+        action="store_const",
+        const="load-user-ratings-from-imdb",
+    )
+
+    try:
+        args = parser.parse_args()
+    except TypeError:
+        parser.print_usage()
+        raise
+
+    return args
+
+
 def main():
     logging.basicConfig(
         format="%(asctime)s.%(msecs)03d [%(name)s:%(process)d] %(levelname)s: %(message)s",

@@ -30,7 +87,15 @@ def main():
     )
     log.debug(f"Log level: {config.loglevel}")
 
-    asyncio.run(run_import())
+    try:
+        args = getargs()
+    except:
+        return
+
+    if args.mode == "load-user-ratings-from-imdb":
+        asyncio.run(run_load_user_ratings_from_imdb())
+    elif args.mode == "import-imdb-dataset":
+        asyncio.run(run_import_imdb_dataset(args.basics, args.ratings))
 
 
 main()
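A note on getargs(): each subparser registers a nameless positional with action="store_const", which consumes no argv items and simply stamps its const into args.mode; the try/except TypeError around parse_args() likely works around the confusing TypeError argparse raises in some Python versions when a required subparser group has no dest and no command is given. The stock idiom for recording the chosen command is set_defaults(); a minimal sketch, not the code this commit ships:

    import argparse

    parser = argparse.ArgumentParser()
    commands = parser.add_subparsers(required=True)

    # set_defaults() stamps the namespace whenever this subparser matches,
    # equivalent to the zero-argument store_const positional in getargs().
    sub = commands.add_parser("import-imdb-dataset")
    sub.set_defaults(mode="import-imdb-dataset")

    args = parser.parse_args(["import-imdb-dataset"])
    assert args.mode == "import-imdb-dataset"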
unwind/db.py (74 lines changed)

@@ -6,7 +6,7 @@ from typing import Optional, Type, TypeVar
 from databases import Database
 
 from . import config
-from .models import Movie, Rating, User, asplain, fromplain, utcnow
+from .models import Movie, Rating, User, asplain, fromplain, optional_fields
 
 log = logging.getLogger(__name__)
 

@@ -69,10 +69,14 @@ ModelType = TypeVar("ModelType")
 
 
 async def get(model: Type[ModelType], **kwds) -> Optional[ModelType]:
+    values = {k: v for k, v in kwds.items() if v is not None}
+    if not values:
+        return
+
     fields_ = ", ".join(f.name for f in fields(model))
-    cond = " AND ".join(f"{k}=:{k}" for k in kwds)
+    cond = " AND ".join(f"{k}=:{k}" for k, v in values.items())
     query = f"SELECT {fields_} FROM {model._table} WHERE {cond}"
-    row = await shared_connection().fetch_one(query=query, values=kwds)
+    row = await shared_connection().fetch_one(query=query, values=values)
     return fromplain(model, row) if row else None
 

@@ -95,16 +99,28 @@ async def add_or_update_user(user: User):
 
 
 async def add_or_update_movie(movie: Movie):
+    """Add or update a Movie in the database.
+
+    This is an upsert operation, but it will also update the Movie you pass
+    into the function to make its `id` match the DB's movie's `id`, and also
+    set all optional values on your Movie that might be unset but exist in the
+    database. It's a bidirectional sync.
+    """
     db_movie = await get(Movie, imdb_id=movie.imdb_id)
     if not db_movie:
        await add(movie)
     else:
         movie.id = db_movie.id
-        movie.updated = db_movie.updated
 
-        if movie != db_movie:
-            movie.updated = utcnow()
-            await update(movie)
+        # We want to keep any existing value in the DB for all optional fields.
+        for f in optional_fields(movie):
+            if getattr(movie, f.name) is None:
+                setattr(movie, f.name, getattr(db_movie, f.name))
+
+        if movie.updated <= db_movie.updated:
+            return
+
+        await update(movie)
 
 
 async def add_or_update_rating(rating: Rating) -> bool:

@@ -147,38 +163,48 @@ async def find_ratings(
         values["escape"] = "#"
         escaped_title = sql_escape(title, char=values["escape"])
         values["pattern"] = "%" + "%".join(escaped_title.split()) + "%"
-        conditions.append("movies.title LIKE :pattern ESCAPE :escape")
+        values["opattern"] = values["pattern"]
+        values["oescape"] = values["escape"]
+        conditions.append(
+            f"""
+            (
+                {Movie._table}.title LIKE :pattern ESCAPE :escape
+                OR {Movie._table}.original_title LIKE :opattern ESCAPE :oescape
+            )
+            """
+        )
 
     if media_type:
         values["media_type"] = media_type
-        conditions.append("movies.media_type=:media_type")
+        conditions.append(f"{Movie._table}.media_type=:media_type")
 
     if ignore_tv_episodes:
-        conditions.append("movies.media_type!='TV Episode'")
+        conditions.append(f"{Movie._table}.media_type!='TV Episode'")
 
     query = f"""
         WITH newest_movies
         AS (
-            SELECT DISTINCT ratings.movie_id
-            FROM ratings
-            LEFT JOIN movies ON movies.id=ratings.movie_id
+            SELECT DISTINCT {Rating._table}.movie_id
+            FROM {Rating._table}
+            LEFT JOIN {Movie._table} ON {Movie._table}.id={Rating._table}.movie_id
             {('WHERE ' + ' AND '.join(conditions)) if conditions else ''}
-            ORDER BY length(movies.title) ASC, ratings.rating_date DESC
+            ORDER BY length({Movie._table}.title) ASC, {Rating._table}.rating_date DESC
            LIMIT :limit_rows
         )
 
         SELECT
-            users.name AS user_name,
-            ratings.score AS user_score,
-            movies.score AS imdb_score,
-            movies.imdb_id AS movie_imdb_id,
-            movies.media_type AS media_type,
-            movies.title AS movie_title,
-            movies.release_year AS release_year
+            {User._table}.name AS user_name,
+            {Rating._table}.score AS user_score,
+            {Movie._table}.score AS imdb_score,
+            {Movie._table}.imdb_id AS movie_imdb_id,
+            {Movie._table}.media_type AS media_type,
+            {Movie._table}.title AS canonical_title,
+            {Movie._table}.original_title AS original_title,
+            {Movie._table}.release_year AS release_year
         FROM newest_movies
-        LEFT JOIN ratings ON ratings.movie_id=newest_movies.movie_id
-        LEFT JOIN users ON users.id=ratings.user_id
-        LEFT JOIN movies ON movies.id=ratings.movie_id
+        LEFT JOIN {Rating._table} ON {Rating._table}.movie_id=newest_movies.movie_id
+        LEFT JOIN {User._table} ON {User._table}.id={Rating._table}.user_id
+        LEFT JOIN {Movie._table} ON {Movie._table}.id={Rating._table}.movie_id
     """
 
     rows = await shared_connection().fetch_all(query=query, values=values)
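The reworked add_or_update_movie() is what lets the bulk dataset import and the live ratings scraper feed the same movies table: optional fields that are None on the incoming Movie inherit the stored value, and the row is only rewritten when the incoming updated timestamp is newer. The merge step in isolation (a standalone restatement of the loop above; incoming and stored are hypothetical Movie instances):

    # e.g. a dataset row carries runtime but no score, while the stored row
    # (from an earlier scrape) carries score but no runtime; after the loop
    # `incoming` has both and can safely overwrite the stored row.
    for f in optional_fields(incoming):
        if getattr(incoming, f.name) is None:
            setattr(incoming, f.name, getattr(stored, f.name))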
unwind/imdb.py (149 lines changed)

@@ -6,8 +6,8 @@ from typing import Optional
 from urllib.parse import urljoin
 
 from .db import add_or_update_movie, add_or_update_rating, add_or_update_user
-from .models import Movie, Rating, User, asplain, fromplain
-from .request import soup_from_url
+from .models import Movie, Rating, User
+from .request import cache_path, soup_from_url
 
 log = logging.getLogger(__name__)
 

@@ -38,6 +38,26 @@ def imdb_url(user_id):
     return f"https://www.imdb.com/user/{user_id}/ratings"
 
 
+def imdb_rating_from_score(score: int) -> float:
+    """Return the IMDb rating from an Unwind Movie score."""
+    assert 0 <= score <= 100
+    rating = round(score * 9 / 100 + 1, 1)
+    assert 1.0 <= rating <= 10.0
+    return rating
+
+
+def score_from_imdb_rating(rating: float) -> int:
+    """Return the Unwind Movie score for an IMDb rating."""
+    # Scale IMDb's 10 point rating to our score of [0, 100].
+    # There's a pitfall here!
+    # You might think this would be simply IMDb's rating times 10, *but*
+    # the lowest possible rating on IMDb is actually 1.
+    assert 1.0 <= rating <= 10.0
+    score = round(100 * (rating - 1) / 9)
+    assert 0 <= score <= 100
+    return score
+
+
 find_name = re.compile(r"(?P<name>.*)'s Ratings").fullmatch
 find_rating_date = re.compile(r"Rated on (?P<date>\d{2} \w+ \d{4})").fullmatch
 find_runtime = re.compile(r"((?P<h>\d+) hr)? ?((?P<m>\d+) min)?").fullmatch

@@ -50,67 +70,88 @@ find_year = re.compile(
 find_movie_id = re.compile(r"/title/(?P<id>tt\d+)/").search
 
 
+def movie_and_rating_from_item(item) -> tuple[Movie, Rating]:
+    movie = Movie(
+        title=item.h3.a.string.strip(),
+        genres=set(s.strip() for s in item.find("span", "genre").string.split(",")),
+    )
+
+    episode_br = item.h3.br
+    if episode_br:
+        episode_a = episode_br.find_next("a")
+        if not episode_a:
+            raise ValueError("Unknown document structure.")
+
+        movie.media_type = "TV Episode"
+        movie.title += " / " + episode_a.string.strip()
+        if match := find_year(episode_br.find_next("span", "lister-item-year").string):
+            movie.release_year = int(match["year"])
+        if match := find_movie_id(episode_a["href"]):
+            movie.imdb_id = match["id"]
+
+    if (tag := item.find("span", "runtime")) and (match := find_runtime(tag.string)):
+        movie.runtime = int(match["h"] or 0) * 60 + int(match["m"] or 0)
+
+    if not episode_br:
+        if match := find_year(item.h3.find("span", "lister-item-year").string):
+            if media_type := match["type"]:
+                movie.media_type = media_type.strip()
+            movie.release_year = int(match["year"])
+        if match := find_movie_id(item.h3.a["href"]):
+            movie.imdb_id = match["id"]
+
+    if not movie.media_type:
+        movie.media_type = "Movie"
+
+    rating = Rating()
+
+    ratings_item = item.find("div", "ipl-rating-widget")
+    if match := find_rating_date(ratings_item.find_next("p", "text-muted").string):
+        rating.rating_date = datetime.strptime(match["date"], "%d %b %Y")
+    if match := ratings_item.find("div", "ipl-rating-star--other-user"):
+        if rating_item := match.find("span", "ipl-rating-star__rating"):
+            rating.score = score_from_imdb_rating(float(rating_item.string))
+    if match := ratings_item.find("div", "ipl-rating-star small"):
+        if rating_item := match.find("span", "ipl-rating-star__rating"):
+            movie.score = score_from_imdb_rating(float(rating_item.string))
+
+    return movie, rating
+
+
+ForgedRequest = namedtuple("ForgedRequest", "url headers")
+
+
 async def parse_page(url, stop_on_dupe=True) -> Optional[str]:
     soup = soup_from_url(url)
 
-    user = User(imdb_id=soup.find("meta", property="pageId")["content"], name="")
-    if match := find_name(soup.h1.string):
+    meta = soup.find("meta", property="pageId")
+    headline = soup.h1
+    assert meta is not None and headline is not None
+    user = User(imdb_id=meta["content"], name="")
+    if match := find_name(headline.string):
         user.name = match["name"]
     await add_or_update_user(user)
 
     items = soup.find_all("div", "lister-item-content")
     for i, item in enumerate(items):
-        movie = Movie(
-            title=item.h3.a.string.strip(),
-            genres=set(s.strip() for s in item.find("span", "genre").string.split(",")),
-        )
-
-        episode_br = item.h3.br
-        if episode_br:
-            episode_a = episode_br.find_next("a")
-            if not episode_a:
-                log.error("Unknown document structure.")
-                continue
-
-            movie.media_type = "TV Episode"
-            movie.title += " / " + episode_a.string.strip()
-            if match := find_year(
-                episode_br.find_next("span", "lister-item-year").string
-            ):
-                movie.release_year = int(match["year"])
-            if match := find_movie_id(episode_a["href"]):
-                movie.imdb_id = match["id"]
-
-        rating = Rating(user_id=user.id)
-
-        if (tag := item.find("span", "runtime")) and (
-            match := find_runtime(tag.string)
-        ):
-            movie.runtime = int(match["h"] or 0) * 60 + int(match["m"] or 0)
-
-        if not episode_br:
-            if match := find_year(item.h3.find("span", "lister-item-year").string):
-                if media_type := match["type"]:
-                    movie.media_type = media_type.strip()
-                movie.release_year = int(match["year"])
-            if match := find_movie_id(item.h3.a["href"]):
-                movie.imdb_id = match["id"]
-
-        ratings_item = item.find("div", "ipl-rating-widget")
-        if match := find_rating_date(ratings_item.find_next("p", "text-muted").string):
-            rating.rating_date = datetime.strptime(match["date"], "%d %b %Y")
-        for rating_item in ratings_item.find_all("span", "ipl-rating-star__rating")[:2]:
-            if "ipl-rating-star--other-user" in rating_item.parent["class"]:
-                rating.score = int(float(rating_item.string) * 10)
-            else:
-                movie.score = int(float(rating_item.string) * 10)
-
-        if not movie.media_type:
-            movie.media_type = "Movie"
+        try:
+            movie, rating = movie_and_rating_from_item(item)
+        except Exception as err:
+            log.error(
+                "Error in %s item #%s (%s): %s: %s",
+                url,
+                i,
+                cache_path(ForgedRequest(url, headers={})),
+                " ".join(item.h3.stripped_strings),
+                err,
+            )
+            continue
 
         await add_or_update_movie(movie)
+
+        rating.user_id = user.id
         rating.movie_id = movie.id  # needs to be set _after_ movie has been updated
         is_updated = await add_or_update_rating(rating)

@@ -118,9 +159,9 @@ async def parse_page(url, stop_on_dupe=True) -> Optional[str]:
         log.info("Import stopped after %s items. Caught up to known state. ✋", i)
         return None
 
-    next_url = urljoin(
-        url, soup.find("div", "footer").find(string=re.compile(r"Next")).parent["href"]
-    )
+    footer = soup.find("div", "footer")
+    assert footer is not None
+    next_url = urljoin(url, footer.find(string=re.compile(r"Next")).parent["href"])
 
     return next_url if url != next_url else None
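For reference, the two new helpers are inverse affine maps between IMDb's [1.0, 10.0] scale and the internal [0, 100] score: score = round(100 * (rating - 1) / 9) and rating = round(score * 9 / 100 + 1, 1); the shift by 1 is the pitfall the comment calls out, since IMDb's scale starts at 1, not 0. A spot check with values worked out from the formulas:

    from unwind.imdb import imdb_rating_from_score, score_from_imdb_rating

    # 7.5 -> round(100 * 6.5 / 9) == round(72.2) == 72
    assert score_from_imdb_rating(7.5) == 72
    # 72 -> round(72 * 9 / 100 + 1, 1) == round(7.48, 1) == 7.5
    assert imdb_rating_from_score(72) == 7.5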
unwind/imdb_import.py (new file, 206 lines)

@@ -0,0 +1,206 @@
+import csv
+import gzip
+import logging
+from dataclasses import dataclass, fields
+from datetime import datetime, timezone
+from pathlib import Path
+from typing import Optional, get_origin
+
+from . import db
+from .db import add_or_update_movie
+from .imdb import score_from_imdb_rating
+from .models import Movie, optional_type
+
+log = logging.getLogger(__name__)
+
+
+# See
+# - https://www.imdb.com/interfaces/
+# - https://datasets.imdbws.com/
+
+
+@dataclass
+class BasicRow:
+    tconst: str
+    titleType: str
+    primaryTitle: str
+    originalTitle: str
+    isAdult: bool
+    startYear: Optional[int]
+    endYear: Optional[int]
+    runtimeMinutes: Optional[int]
+    genres: Optional[set[str]]
+
+    @classmethod
+    def from_row(cls, row):
+        vals = []
+        for f, r in zip(fields(cls), row):
+            ttype = f.type
+            is_opt = False
+
+            if (otype := optional_type(ttype)) is not None:
+                ttype = otype
+                is_opt = True
+            if (otype := get_origin(ttype)) is not None:
+                ttype = otype
+
+            if r == r"\N":
+                if is_opt:
+                    vals.append(None)
+                else:
+                    raise ValueError(f"Unexpected null value for field: {f.name}")
+            elif f.name == "genres":
+                vals.append(set(r.split(",")))
+            elif f.name == "isAdult":
+                assert r in "01"
+                vals.append(r == "1")
+            else:
+                vals.append(ttype(r))
+
+        inst = cls(*vals)
+        assert inst.titleType in title_types
+        return inst
+
+    def as_movie(self):
+        assert self.startYear is not None
+        return Movie(
+            title=self.primaryTitle,
+            original_title=self.originalTitle,
+            release_year=self.startYear,
+            media_type=title_types[self.titleType],
+            imdb_id=self.tconst,
+            score=None,
+            runtime=self.runtimeMinutes,
+            genres=self.genres or set(),
+        )
+
+
+@dataclass
+class RatingRow:
+    tconst: str
+    averageRating: float
+    numVotes: int
+
+    @classmethod
+    def from_row(cls, row):
+        inst = cls(*(f.type(r) for f, r in zip(fields(cls), row)))
+        assert inst.tconst != r"\N"
+        return inst
+
+    def as_movie(self):
+        return Movie(
+            imdb_id=self.tconst,
+            score=score_from_imdb_rating(self.averageRating),
+        )
+
+
+title_types = {
+    "movie": "Movie",
+    "radioEpisode": "Radio Episode",
+    "radioSeries": "Radio Series",
+    "short": "Short",
+    "tvEpisode": "TV Episode",
+    "tvMiniSeries": "TV Mini Series",
+    "tvMovie": "TV Movie",
+    "tvSeries": "TV Series",
+    "tvShort": "TV Short",
+    "tvSpecial": "TV Special",
+    "video": "Video",
+    "videoGame": "Video Game",
+}
+
+
+def gz_mtime(path) -> datetime:
+    """Return the timestamp of the compressed file."""
+    g = gzip.GzipFile(path, "rb")
+    g.peek(1)  # start reading the file to fill the timestamp field
+    assert g.mtime is not None
+    return datetime.fromtimestamp(g.mtime).replace(tzinfo=timezone.utc)
+
+
+def count_lines(path) -> int:
+    i = 0
+    with gzip.open(path, "rt") as f:
+        for i, _ in enumerate(f, start=1):
+            pass
+    return i
+
+
+def read_imdb_tsv(path, row_type):
+    with gzip.open(path, "rt", newline="") as f:
+        rows = csv.reader(f, delimiter="\t", quoting=csv.QUOTE_NONE)
+
+        # skip header line
+        rows = iter(rows)
+        header = next(rows)
+        try:
+            assert tuple(f.name for f in fields(row_type)) == tuple(header)
+        except AssertionError:
+            log.error("Unexpected header line: %s", header)
+            raise
+
+        for i, row in enumerate(rows, start=1):
+            try:
+                yield row_type.from_row(row)
+            except Exception as err:
+                log.error("Error in line %s: %s", i, row, exc_info=err)
+                raise
+
+
+def read_ratings(path):
+    mtime = gz_mtime(path)
+    rows = read_imdb_tsv(path, RatingRow)
+
+    for row in rows:
+        m = row.as_movie()
+        m.updated = mtime
+        yield m
+
+
+def read_basics(path):
+    mtime = gz_mtime(path)
+    rows = read_imdb_tsv(path, BasicRow)
+
+    for row in rows:
+        if row.startYear is None:
+            log.debug("Skipping movie, missing year: %s", row)
+            continue
+
+        m = row.as_movie()
+        m.updated = mtime
+        yield m
+
+
+async def import_from_file(basics_path: Path, ratings_path: Path):
+    log.info("Loading scores ... 💾")
+    scores = {m.imdb_id: m.score for m in read_ratings(ratings_path)}
+
+    log.info("Importing movies ... 💾")
+    total = count_lines(basics_path)
+    assert total != 0
+    perc = 0.0
+    perc_step = 0.001
+
+    async with db.shared_connection().transaction():
+        for i, m in enumerate(read_basics(basics_path)):
+            if i / total > perc:
+                log.info("Imported %s%%", round(perc * 100, 1))
+                perc += perc_step
+
+            if m.media_type not in {
+                "Movie",
+                "Short",
+                "TV Mini Series",
+                "TV Movie",
+                "TV Series",
+                "TV Short",
+                "TV Special",
+                "Video",
+            }:
+                log.debug("Skipping movie, unwanted media type: %s", m.media_type)
+                continue
+
+            m.score = scores.get(m.imdb_id)
+            await add_or_update_movie(m)
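The IMDb dumps are plain tab-separated text where \N marks a missing value, which is why from_row special-cases r"\N" per field before coercing the cell with the field's annotated type. A sketch of what one title.basics line turns into (row values made up for illustration):

    # Hypothetical input line (tab-separated, \N marks a null):
    #   tt0000001  short  Carmencita  Carmencita  0  1894  \N  1  Documentary,Short
    row = ["tt0000001", "short", "Carmencita", "Carmencita",
           "0", "1894", r"\N", "1", "Documentary,Short"]
    basic = BasicRow.from_row(row)
    # -> startYear=1894, endYear=None, runtimeMinutes=1,
    #    genres={"Documentary", "Short"}; as_movie() then maps
    #    titleType "short" to media_type "Short".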
@@ -1,31 +1,32 @@
 PRAGMA foreign_keys = ON;;
 
 CREATE TABLE IF NOT EXISTS users (
-    id TEXT NOT NULL PRIMARY KEY,
+    id TEXT PRIMARY KEY NOT NULL,
     imdb_id TEXT NOT NULL UNIQUE,
     name TEXT NOT NULL
 );;
 
 CREATE TABLE IF NOT EXISTS movies (
-    id TEXT NOT NULL PRIMARY KEY,
+    id TEXT PRIMARY KEY NOT NULL,
     title TEXT NOT NULL,
-    release_year NUMBER NOT NULL,
+    original_title TEXT,
+    release_year INTEGER NOT NULL,
     media_type TEXT NOT NULL,
     imdb_id TEXT NOT NULL UNIQUE,
-    score NUMBER NOT NULL,
-    runtime NUMBER,
+    score INTEGER,
+    runtime INTEGER,
     genres TEXT NOT NULL,
     updated TEXT NOT NULL
 );;
 
 CREATE TABLE IF NOT EXISTS ratings (
-    id TEXT NOT NULL PRIMARY KEY,
+    id TEXT PRIMARY KEY NOT NULL,
     movie_id TEXT NOT NULL,
     user_id TEXT NOT NULL,
-    score NUMBER NOT NULL,
+    score INTEGER NOT NULL,
     rating_date TEXT NOT NULL,
-    favorite NUMBER,
-    finished NUMBER,
+    favorite BOOL,
+    finished BOOL,
     FOREIGN KEY(movie_id) REFERENCES movies(id),
     FOREIGN KEY(user_id) REFERENCES users(id)
 );;
unwind/models.py

@@ -1,5 +1,5 @@
 import json
-from dataclasses import asdict, dataclass, field, fields, is_dataclass
+from dataclasses import asdict, dataclass, field, fields
 from datetime import datetime, timezone
 from typing import Any, ClassVar, Optional, Type, Union, get_args, get_origin
 

@@ -25,6 +25,12 @@ def optional_type(tp: Type):
     return args[0]
 
 
+def optional_fields(o):
+    for f in fields(o):
+        if is_optional(f.type):
+            yield f
+
+
 def asplain(o) -> dict[str, Any]:
     validate(o)

@@ -56,9 +62,6 @@ def asplain(o) -> dict[str, Any]:
 
 
 def fromplain(cls, d: dict[str, Any]):
-    # if not is_dataclass(cls):
-    #     raise TypeError(f'Not a dataclass: {type(cls)}')
-
     dd = {}
     for f in fields(cls):
 

@@ -107,11 +110,14 @@ class Movie:
     _table: ClassVar[str] = "movies"
 
     id: ULID = field(default_factory=ULID)
-    title: str = None  # canonical title
+    title: str = None  # canonical title (usually English)
+    original_title: Optional[
+        str
+    ] = None  # original title (usually transscribed to latin script)
     release_year: int = None  # canonical release date
-    media_type: Optional[str] = None
+    media_type: str = None
     imdb_id: str = None
-    score: int = None  # range: [0,100]
+    score: Optional[int] = None  # range: [0,100]
     runtime: Optional[int] = None  # minutes
     genres: set[str] = None
     updated: datetime = field(default_factory=utcnow)
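optional_fields() is the helper that add_or_update_movie's merge loop iterates; it yields every dataclass field whose annotation is Optional[...]. A quick sketch (assuming is_optional(), which isn't shown in this hunk, recognizes Optional annotations as its name suggests):

    from unwind.models import Movie, optional_fields

    # With the annotations above, the merge-relevant optional fields of
    # Movie are original_title, score, and runtime; media_type moved the
    # other way (Optional[str] -> str) and is now always filled in.
    print({f.name for f in optional_fields(Movie)})
    # -> {"original_title", "score", "runtime"}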
unwind/request.py

@@ -5,9 +5,10 @@ from contextlib import contextmanager
 from dataclasses import dataclass
 from functools import wraps
 from hashlib import md5
+from pathlib import Path
 from random import random
 from time import sleep, time
-from typing import Callable
+from typing import Callable, Optional
 
 import bs4
 import requests

@@ -130,16 +131,19 @@ class RedirectError(RuntimeError):
         super().__init__(f"Redirected: {from_url} -> {to_url}")
 
 
+def cache_path(req) -> Optional[Path]:
+    if not config.cachedir:
+        return
+    sig = repr(req.url)  # + repr(sorted(req.headers.items()))
+    return config.cachedir / md5(sig.encode()).hexdigest()
+
+
 @throttle(1, 1, random)
 def http_get(s: requests.Session, url: str, *args, **kwds) -> requests.Response:
     req = s.prepare_request(requests.Request("GET", url, *args, **kwds))
 
-    if config.debug and config.cachedir:
-        sig = repr(req.url)  # + repr(sorted(req.headers.items()))
-        cachefile = config.cachedir / md5(sig.encode()).hexdigest()
-    else:
-        cachefile = None
+    cachefile = cache_path(req) if config.debug else None
 
     if cachefile:
         if cachefile.exists():
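Factoring cache_path() out of http_get() is what makes the new error message in imdb.py possible: parse_page() can name the cached file for a URL without preparing a real request, hence the ForgedRequest namedtuple that only carries .url and .headers. The file name is just the md5 hex digest of the repr'd URL inside config.cachedir; a quick sketch (hypothetical URL):

    from hashlib import md5

    url = "https://www.imdb.com/user/ur00000000/ratings"  # hypothetical
    print(md5(repr(url).encode()).hexdigest())  # the cache file's name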
unwind/web.py (101 lines changed)

@@ -1,11 +1,42 @@
-from collections import defaultdict
+import base64
+import binascii
 
 from starlette.applications import Starlette
+from starlette.authentication import (
+    AuthCredentials,
+    AuthenticationBackend,
+    AuthenticationError,
+    SimpleUser,
+    UnauthenticatedUser,
+    requires,
+)
+from starlette.middleware import Middleware
+from starlette.middleware.authentication import AuthenticationMiddleware
 from starlette.responses import JSONResponse
-from starlette.routing import Route
+from starlette.routing import Mount, Route
 
-from . import config
+from . import config, db
 from .db import close_connection_pool, find_ratings, open_connection_pool
+from .models import Movie, asplain
+
+
+class BasicAuthBackend(AuthenticationBackend):
+    async def authenticate(self, request):
+        if "Authorization" not in request.headers:
+            return
+
+        auth = request.headers["Authorization"]
+        try:
+            scheme, credentials = auth.split()
+            if scheme.lower() != "basic":
+                return
+            decoded = base64.b64decode(credentials).decode("ascii")
+        except (ValueError, UnicodeDecodeError, binascii.Error) as exc:
+            raise AuthenticationError("Invalid basic auth credentials")
+
+        username, _, password = decoded.partition(":")
+        # TODO: You'd want to verify the username and password here.
+        return AuthCredentials(["authenticated"]), SimpleUser(username)
 
 
 def imdb_url(imdb_id: str):

@@ -29,7 +60,8 @@ async def ratings(request):
         mov = aggr.setdefault(
             r["movie_imdb_id"],
             {
-                "title": r["movie_title"],
+                "canonical_title": r["canonical_title"],
+                "original_title": r["original_title"],
                 "year": r["release_year"],
                 "link": imdb_url(r["movie_imdb_id"]),
                 "user_scores": [],

@@ -44,10 +76,69 @@ async def ratings(request):
     return JSONResponse(resp)
 
 
+not_found = JSONResponse({"error": "Not Found"}, status_code=404)
+
+
+async def get_movies(request):
+    imdb_id = request.query_params.get("imdb_id")
+
+    movie = await db.get(Movie, imdb_id=imdb_id)
+
+    resp = [asplain(movie)] if movie else []
+    return JSONResponse(resp)
+
+
+@requires(["authenticated", "admin"])
+async def add_movie(request):
+    pass
+
+
+@requires(["authenticated", "admin"])
+async def add_user(request):
+    pass
+
+
+async def ratings_for_user(request):
+    request.path_params["user_id"]
+
+
+@requires("authenticated")
+async def set_rating_for_user(request):
+    request.path_params["user_id"]
+
+
+@requires(["authenticated", "admin"])
+async def add_group(request):
+    pass
+
+
+@requires(["authenticated", "admin"])
+async def add_user_to_group(request):
+    request.path_params["group_id"]
+
+
+async def get_ratings_for_group(request):
+    request.path_params["group_id"]
+
+
 app = Starlette(
     on_startup=[open_connection_pool],
     on_shutdown=[close_connection_pool],
     routes=[
-        Route("/ratings", ratings),
+        Mount(
+            "/api/v1",
+            routes=[
+                Route("/ratings", ratings),  # XXX legacy, remove.
+                Route("/movies", get_movies),
+                Route("/movies", add_movie, methods=["POST"]),
+                Route("/users", add_user, methods=["POST"]),
+                Route("/users/{user_id}/ratings", ratings_for_user),
+                Route("/users/{user_id}/ratings", set_rating_for_user, methods=["PUT"]),
+                Route("/groups", add_group, methods=["POST"]),
+                Route("/groups/{group_id}/users", add_user_to_group, methods=["POST"]),
+                Route("/groups/{group_id}/ratings", get_ratings_for_group),
+            ],
+        ),
     ],
+    middleware=[Middleware(AuthenticationMiddleware, backend=BasicAuthBackend())],
 )
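With AuthenticationMiddleware in place every request runs through BasicAuthBackend.authenticate(), and the @requires(...) decorators check the resulting scopes; note that nothing grants "admin" yet, and the TODO means credentials aren't actually verified so far. A client-side sketch of the header the backend parses (hypothetical credentials and port):

    import base64

    import requests

    # "Authorization: Basic <base64(user:password)>"
    token = base64.b64encode(b"alice:secret").decode("ascii")
    resp = requests.get(
        "http://localhost:8000/api/v1/movies?imdb_id=tt0000001",
        headers={"Authorization": f"Basic {token}"},
    )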