refactor loading of imdb ratings to yield loaded ratings

ducklet 2021-07-10 01:43:24 +02:00
parent 1ad7a79d33
commit 1805282d41
3 changed files with 60 additions and 34 deletions


@@ -2,12 +2,12 @@ import logging
 import re
 from collections import namedtuple
 from datetime import datetime
-from typing import Optional
+from typing import Optional, Tuple
 from urllib.parse import urljoin

-from .db import add_or_update_movie, add_or_update_rating, add_or_update_user
+from . import db
 from .models import Movie, Rating, User
-from .request import cache_path, soup_from_url
+from .request import cache_path, session, soup_from_url

 log = logging.getLogger(__name__)
@@ -34,10 +34,32 @@ log = logging.getLogger(__name__)
 # p.text-muted.text ("Rated on 06 May 2021")

-def imdb_url(user_id):
+async def refresh_user_ratings_from_imdb(stop_on_dupe=True):
+    with session() as s:
+        s.headers["Accept-Language"] = "en-GB, en;q=0.5"
+
+        for user in await db.get_all(User):
+            log.info("⚡️ Loading data for %s ...", user.name)
+            async for rating, is_updated in load_ratings(user.imdb_id):
+                assert rating.user == user
+                if stop_on_dupe and not is_updated:
+                    break
+                yield rating
+
+
+def user_ratings_url(user_id):
     return f"https://www.imdb.com/user/{user_id}/ratings"


+def movie_url(imdb_id: str):
+    return f"https://www.imdb.com/title/{imdb_id}/"
+
+
 def imdb_rating_from_score(score: int) -> float:
     """Return the IMDb rating from an Unwind Movie score."""
     assert 0 <= score <= 100
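
The new refresh_user_ratings_from_imdb is an async generator, so a caller drives it with `async for`. A minimal usage sketch, not part of this commit; the import path below is hypothetical:

# Usage sketch (not from the commit): drive the async generator from an
# asyncio entry point. The module path `unwind.imdb` is an assumption.
import asyncio

from unwind.imdb import refresh_user_ratings_from_imdb  # hypothetical path


async def main():
    # With stop_on_dupe=True, iteration for a user stops at the first
    # rating whose add_or_update reported no change (is_updated is False).
    async for rating in refresh_user_ratings_from_imdb(stop_on_dupe=True):
        print(rating.movie_id, rating.user_id)


asyncio.run(main())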
@@ -122,7 +144,9 @@ def movie_and_rating_from_item(item) -> tuple[Movie, Rating]:
 ForgedRequest = namedtuple("ForgedRequest", "url headers")


-async def parse_page(url, stop_on_dupe=True) -> Optional[str]:
+async def parse_page(url) -> Tuple[list[Rating], Optional[str]]:
+    ratings = []
+
     soup = soup_from_url(url)
     meta = soup.find("meta", property="pageId")
@@ -131,7 +155,6 @@ async def parse_page(url, stop_on_dupe=True) -> Optional[str]:
     user = User(imdb_id=meta["content"], name="")
     if match := find_name(headline.string):
         user.name = match["name"]
-    await add_or_update_user(user)

     items = soup.find_all("div", "lister-item-content")
     for i, item in enumerate(items):
@@ -149,25 +172,35 @@ async def parse_page(url, stop_on_dupe=True) -> Optional[str]:
             )
             continue

-        await add_or_update_movie(movie)
+        rating.user = user
+        rating.movie = movie

-        rating.user_id = user.id
-        rating.movie_id = movie.id  # needs to be set _after_ movie has been updated
-
-        is_updated = await add_or_update_rating(rating)
-        if stop_on_dupe and not is_updated:
-            log.info("Import stopped after %s items. Caught up to known state. ✋", i)
-            return None
+        ratings.append(rating)

     footer = soup.find("div", "footer")
     assert footer is not None
     next_url = urljoin(url, footer.find(string=re.compile(r"Next")).parent["href"])
-    return next_url if url != next_url else None
+    return (ratings, next_url if url != next_url else None)


-async def load_imdb(user_id):
-    next_url = imdb_url(user_id)
+async def load_ratings(user_id):
+    next_url = user_ratings_url(user_id)

-    while next_url := await parse_page(next_url):
-        pass
+    while next_url:
+        ratings, next_url = await parse_page(next_url)
+        for i, rating in enumerate(ratings):
+            if i == 0:
+                # All rating objects share the same user.
+                await db.add_or_update_user(rating.user)
+            rating.user_id = rating.user.id
+
+            await db.add_or_update_movie(rating.movie)
+            rating.movie_id = rating.movie.id
+
+            is_updated = await db.add_or_update_rating(rating)
+            yield rating, is_updated
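
After this change the db.add_or_update_* calls live in load_ratings rather than parse_page, so parse_page becomes a pure scraper returning (ratings, next_url). A self-contained sketch of that pagination-plus-yield pattern, with toy data standing in for the real IMDb pages:

# Toy illustration of the (items, next_url) pattern used above; PAGES and the
# names below are invented for the demo, not taken from the project.
import asyncio
from typing import Optional, Tuple

PAGES = {
    "page-1": (["rating-a", "rating-b"], "page-2"),
    "page-2": (["rating-c"], None),  # last page: no next URL
}


async def parse_page(url: str) -> Tuple[list, Optional[str]]:
    # Pure "scrape": return the page's items and the next page's URL.
    items, next_url = PAGES[url]
    return items, next_url


async def load_items(start_url: str):
    next_url = start_url
    while next_url:
        items, next_url = await parse_page(next_url)
        for item in items:
            # The real code persists here (db.add_or_update_*) before yielding.
            yield item


async def main():
    async for item in load_items("page-1"):
        print(item)  # rating-a, rating-b, rating-c


asyncio.run(main())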