From 1805282d41134a940fc03f6f28a6b9b7dcf9bcd5 Mon Sep 17 00:00:00 2001
From: ducklet
Date: Sat, 10 Jul 2021 01:43:24 +0200
Subject: [PATCH] refactor loading of imdb ratings to yield loaded ratings

---
 unwind/__main__.py | 15 ++++------
 unwind/imdb.py     | 71 +++++++++++++++++++++++++++++++++-------------
 unwind/web.py      |  8 ++----
 3 files changed, 60 insertions(+), 34 deletions(-)

diff --git a/unwind/__main__.py b/unwind/__main__.py
index 24d530e..983ede7 100644
--- a/unwind/__main__.py
+++ b/unwind/__main__.py
@@ -4,11 +4,9 @@ import logging
 from pathlib import Path
 
 from . import config
-from .db import close_connection_pool, get_all, open_connection_pool
-from .imdb import load_imdb
+from .db import close_connection_pool, open_connection_pool
+from .imdb import refresh_user_ratings_from_imdb
 from .imdb_import import import_from_file
-from .models import User
-from .request import session
 
 log = logging.getLogger(__name__)
 
@@ -16,12 +14,11 @@ log = logging.getLogger(__name__)
 
 async def run_load_user_ratings_from_imdb():
     await open_connection_pool()
 
-    with session() as s:
-        s.headers["Accept-Language"] = "en-GB, en;q=0.5"
+    i = 0
+    async for rating in refresh_user_ratings_from_imdb():
+        i += 1
 
-        for user in await get_all(User):
-            log.info("Loading data for %s ... ⚡️", user.name)
-            await load_imdb(user.imdb_id)
+    log.info("✨ Imported %s new ratings.", i)
 
     await close_connection_pool()
diff --git a/unwind/imdb.py b/unwind/imdb.py
index 95e6469..72d06c9 100644
--- a/unwind/imdb.py
+++ b/unwind/imdb.py
@@ -2,12 +2,12 @@ import logging
 import re
 from collections import namedtuple
 from datetime import datetime
-from typing import Optional
+from typing import Optional, Tuple
 from urllib.parse import urljoin
 
-from .db import add_or_update_movie, add_or_update_rating, add_or_update_user
+from . import db
 from .models import Movie, Rating, User
-from .request import cache_path, soup_from_url
+from .request import cache_path, session, soup_from_url
 
 log = logging.getLogger(__name__)
 
@@ -34,10 +34,32 @@
 # p.text-muted.text ("Rated on 06 May 2021")
 
 
-def imdb_url(user_id):
+async def refresh_user_ratings_from_imdb(stop_on_dupe=True):
+
+    with session() as s:
+        s.headers["Accept-Language"] = "en-GB, en;q=0.5"
+
+        for user in await db.get_all(User):
+
+            log.info("⚡️ Loading data for %s ...", user.name)
+
+            async for rating, is_updated in load_ratings(user.imdb_id):
+                assert rating.user == user
+
+                if stop_on_dupe and not is_updated:
+                    break
+
+                yield rating
+
+
+def user_ratings_url(user_id):
     return f"https://www.imdb.com/user/{user_id}/ratings"
 
 
+def movie_url(imdb_id: str):
+    return f"https://www.imdb.com/title/{imdb_id}/"
+
+
 def imdb_rating_from_score(score: int) -> float:
     """Return the IMDb rating from an Unwind Movie score."""
     assert 0 <= score <= 100
@@ -122,7 +144,9 @@
 ForgedRequest = namedtuple("ForgedRequest", "url headers")
 
 
-async def parse_page(url, stop_on_dupe=True) -> Optional[str]:
+async def parse_page(url) -> Tuple[list[Rating], Optional[str]]:
+    ratings = []
+
     soup = soup_from_url(url)
 
     meta = soup.find("meta", property="pageId")
@@ -131,7 +155,6 @@
     user = User(imdb_id=meta["content"], name="")
     if match := find_name(headline.string):
         user.name = match["name"]
-    await add_or_update_user(user)
 
     items = soup.find_all("div", "lister-item-content")
     for i, item in enumerate(items):
@@ -149,25 +172,35 @@
             )
             continue
 
-        await add_or_update_movie(movie)
+        rating.user = user
+        rating.movie = movie
 
-        rating.user_id = user.id
-        rating.movie_id = movie.id  # needs to be set _after_ movie has been updated
-        is_updated = await add_or_update_rating(rating)
-
-        if stop_on_dupe and not is_updated:
-            log.info("Import stopped after %s items. Caught up to known state. ✋", i)
-            return None
+        ratings.append(rating)
 
     footer = soup.find("div", "footer")
     assert footer is not None
     next_url = urljoin(url, footer.find(string=re.compile(r"Next")).parent["href"])
 
-    return next_url if url != next_url else None
+    return (ratings, next_url if url != next_url else None)
 
 
-async def load_imdb(user_id):
-    next_url = imdb_url(user_id)
+async def load_ratings(user_id):
+    next_url = user_ratings_url(user_id)
 
-    while next_url := await parse_page(next_url):
-        pass
+    while next_url:
+
+        ratings, next_url = await parse_page(next_url)
+
+        for i, rating in enumerate(ratings):
+
+            if i == 0:
+                # All rating objects share the same user; upsert it only once.
+                await db.add_or_update_user(rating.user)
+            rating.user_id = rating.user.id
+
+            await db.add_or_update_movie(rating.movie)
+            rating.movie_id = rating.movie.id
+
+            is_updated = await db.add_or_update_rating(rating)
+
+            yield rating, is_updated
diff --git a/unwind/web.py b/unwind/web.py
index 67370dd..128dbf8 100644
--- a/unwind/web.py
+++ b/unwind/web.py
@@ -18,7 +18,7 @@
 from starlette.middleware.authentication import AuthenticationMiddleware
 from starlette.responses import JSONResponse
 from starlette.routing import Mount, Route
-from . import config, db
+from . import config, db, imdb
 from .db import close_connection_pool, find_ratings, open_connection_pool
 from .middleware.responsetime import ResponseTimeMiddleware
 from .models import Group, Movie, User, asplain
@@ -64,10 +64,6 @@ class BearerAuthBackend(AuthenticationBackend):
         return AuthCredentials(["authenticated", *roles]), user
 
 
-def imdb_url(imdb_id: str):
-    return f"https://www.imdb.com/title/{imdb_id}/"
-
-
 def truthy(s: str):
     return bool(s) and s.lower() in {"1", "yes", "true"}
 
@@ -153,7 +149,7 @@ async def get_ratings_for_group(request):
             "canonical_title": r["canonical_title"],
             "original_title": r["original_title"],
             "year": r["release_year"],
-            "link": imdb_url(r["movie_imdb_id"]),
+            "link": imdb.movie_url(r["movie_imdb_id"]),
             "user_scores": [],
             "imdb_score": r["imdb_score"],
             "media_type": r["media_type"],
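
Note on the new control flow: parse_page() now only parses a single page and
returns (ratings, next_url); load_ratings() persists each rating and yields
(rating, is_updated); refresh_user_ratings_from_imdb() walks all users and
yields only new ratings. Below is a minimal, self-contained sketch of that
generator chain (toy data stands in for the real scraper and db layer; only
the names mirror the patch):

    import asyncio

    async def load_ratings(user_id):
        # Stand-in for the paged scraper: yields (rating, is_updated).
        for n, is_updated in [(1, True), (2, True), (3, False)]:
            yield f"{user_id}-r{n}", is_updated

    async def refresh_user_ratings(users, stop_on_dupe=True):
        # Outer generator: drains each user's inner generator and stops
        # early for that user once a known (non-updated) rating appears.
        for user in users:
            async for rating, is_updated in load_ratings(user):
                if stop_on_dupe and not is_updated:
                    break
                yield rating

    async def main():
        count = 0
        async for rating in refresh_user_ratings(["ur0000001", "ur0000002"]):
            count += 1
        print(f"Imported {count} new ratings.")  # Imported 4 new ratings.

    asyncio.run(main())

Keeping persistence inside load_ratings() and the dupe check in the outer
generator is what lets __main__ count imports with a bare async for loop, as
in the run_load_user_ratings_from_imdb hunk above.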