refactor loading of imdb ratings to yield loaded ratings

This commit is contained in:
ducklet 2021-07-10 01:43:24 +02:00
parent 1ad7a79d33
commit 1805282d41
3 changed files with 60 additions and 34 deletions

View file

@@ -4,11 +4,9 @@ import logging
from pathlib import Path from pathlib import Path
from . import config from . import config
from .db import close_connection_pool, get_all, open_connection_pool from .db import close_connection_pool, open_connection_pool
from .imdb import load_imdb from .imdb import refresh_user_ratings_from_imdb
from .imdb_import import import_from_file from .imdb_import import import_from_file
from .models import User
from .request import session
log = logging.getLogger(__name__) log = logging.getLogger(__name__)
@@ -16,12 +14,11 @@ log = logging.getLogger(__name__)
async def run_load_user_ratings_from_imdb(): async def run_load_user_ratings_from_imdb():
await open_connection_pool() await open_connection_pool()
with session() as s: i = 0
s.headers["Accept-Language"] = "en-GB, en;q=0.5" async for rating in refresh_user_ratings_from_imdb():
i += 1
for user in await get_all(User): log.info("✨ Imported %s new ratings.", i)
log.info("Loading data for %s ... ⚡️", user.name)
await load_imdb(user.imdb_id)
await close_connection_pool() await close_connection_pool()

View file

@@ -2,12 +2,12 @@ import logging
import re import re
from collections import namedtuple from collections import namedtuple
from datetime import datetime from datetime import datetime
from typing import Optional from typing import Optional, Tuple
from urllib.parse import urljoin from urllib.parse import urljoin
from .db import add_or_update_movie, add_or_update_rating, add_or_update_user from . import db
from .models import Movie, Rating, User from .models import Movie, Rating, User
from .request import cache_path, soup_from_url from .request import cache_path, session, soup_from_url
log = logging.getLogger(__name__) log = logging.getLogger(__name__)
@@ -34,10 +34,32 @@ log = logging.getLogger(__name__)
# p.text-muted.text ("Rated on 06 May 2021") # p.text-muted.text ("Rated on 06 May 2021")
def imdb_url(user_id): async def refresh_user_ratings_from_imdb(stop_on_dupe=True):
with session() as s:
s.headers["Accept-Language"] = "en-GB, en;q=0.5"
for user in await db.get_all(User):
log.info("⚡️ Loading data for %s ...", user.name)
async for rating, is_updated in load_ratings(user.imdb_id):
assert rating.user == user
if stop_on_dupe and not is_updated:
break
yield rating
def user_ratings_url(user_id):
return f"https://www.imdb.com/user/{user_id}/ratings" return f"https://www.imdb.com/user/{user_id}/ratings"
def movie_url(imdb_id: str):
return f"https://www.imdb.com/title/{imdb_id}/"
def imdb_rating_from_score(score: int) -> float: def imdb_rating_from_score(score: int) -> float:
"""Return the IMDb rating from an Unwind Movie score.""" """Return the IMDb rating from an Unwind Movie score."""
assert 0 <= score <= 100 assert 0 <= score <= 100
@@ -122,7 +144,9 @@ def movie_and_rating_from_item(item) -> tuple[Movie, Rating]:
ForgedRequest = namedtuple("ForgedRequest", "url headers") ForgedRequest = namedtuple("ForgedRequest", "url headers")
async def parse_page(url, stop_on_dupe=True) -> Optional[str]: async def parse_page(url) -> Tuple[list[Rating], Optional[str]]:
ratings = []
soup = soup_from_url(url) soup = soup_from_url(url)
meta = soup.find("meta", property="pageId") meta = soup.find("meta", property="pageId")
@@ -131,7 +155,6 @@ async def parse_page(url, stop_on_dupe=True) -> Optional[str]:
user = User(imdb_id=meta["content"], name="") user = User(imdb_id=meta["content"], name="")
if match := find_name(headline.string): if match := find_name(headline.string):
user.name = match["name"] user.name = match["name"]
await add_or_update_user(user)
items = soup.find_all("div", "lister-item-content") items = soup.find_all("div", "lister-item-content")
for i, item in enumerate(items): for i, item in enumerate(items):
@@ -149,25 +172,35 @@ async def parse_page(url, stop_on_dupe=True) -> Optional[str]:
) )
continue continue
await add_or_update_movie(movie) rating.user = user
rating.movie = movie
rating.user_id = user.id ratings.append(rating)
rating.movie_id = movie.id # needs to be set _after_ movie has been updated
is_updated = await add_or_update_rating(rating)
if stop_on_dupe and not is_updated:
log.info("Import stopped after %s items. Caught up to known state. ✋", i)
return None
footer = soup.find("div", "footer") footer = soup.find("div", "footer")
assert footer is not None assert footer is not None
next_url = urljoin(url, footer.find(string=re.compile(r"Next")).parent["href"]) next_url = urljoin(url, footer.find(string=re.compile(r"Next")).parent["href"])
return next_url if url != next_url else None return (ratings, next_url if url != next_url else None)
async def load_imdb(user_id): async def load_ratings(user_id):
next_url = imdb_url(user_id) next_url = user_ratings_url(user_id)
while next_url := await parse_page(next_url): while next_url:
pass
ratings, next_url = await parse_page(next_url)
for i, rating in enumerate(ratings):
if i == 0:
# All rating objects share the same user.
await db.add_or_update_user(rating.user)
rating.user_id = rating.user.id
await db.add_or_update_movie(rating.movie)
rating.movie_id = rating.movie.id
is_updated = await db.add_or_update_rating(rating)
yield rating, is_updated

View file

@@ -18,7 +18,7 @@ from starlette.middleware.authentication import AuthenticationMiddleware
from starlette.responses import JSONResponse from starlette.responses import JSONResponse
from starlette.routing import Mount, Route from starlette.routing import Mount, Route
from . import config, db from . import config, db, imdb
from .db import close_connection_pool, find_ratings, open_connection_pool from .db import close_connection_pool, find_ratings, open_connection_pool
from .middleware.responsetime import ResponseTimeMiddleware from .middleware.responsetime import ResponseTimeMiddleware
from .models import Group, Movie, User, asplain from .models import Group, Movie, User, asplain
@@ -64,10 +64,6 @@ class BearerAuthBackend(AuthenticationBackend):
return AuthCredentials(["authenticated", *roles]), user return AuthCredentials(["authenticated", *roles]), user
def imdb_url(imdb_id: str):
return f"https://www.imdb.com/title/{imdb_id}/"
def truthy(s: str): def truthy(s: str):
return bool(s) and s.lower() in {"1", "yes", "true"} return bool(s) and s.lower() in {"1", "yes", "true"}
@@ -153,7 +149,7 @@ async def get_ratings_for_group(request):
"canonical_title": r["canonical_title"], "canonical_title": r["canonical_title"],
"original_title": r["original_title"], "original_title": r["original_title"],
"year": r["release_year"], "year": r["release_year"],
"link": imdb_url(r["movie_imdb_id"]), "link": imdb.movie_url(r["movie_imdb_id"]),
"user_scores": [], "user_scores": [],
"imdb_score": r["imdb_score"], "imdb_score": r["imdb_score"],
"media_type": r["media_type"], "media_type": r["media_type"],