diff --git a/tests/fixtures/ratings-ur655321.html.bz2 b/tests/fixtures/ratings-ur655321.html.bz2 new file mode 100644 index 0000000..db71d44 Binary files /dev/null and b/tests/fixtures/ratings-ur655321.html.bz2 differ diff --git a/tests/test_imdb.py b/tests/test_imdb.py index 4f949d6..99ad4a0 100644 --- a/tests/test_imdb.py +++ b/tests/test_imdb.py @@ -65,3 +65,20 @@ async def test_load_top_250(monkeypatch): movie_ids = await imdb.load_top_250() assert len(movie_ids) == 250 assert all(id_.startswith("tt") for id_ in movie_ids) + + +@pytest.mark.asyncio +async def test_load_ratings_page(monkeypatch): + with bz2.open(fixturesdir / "ratings-ur655321.html.bz2", "rb") as f: + html = f.read() + soup = bs4.BeautifulSoup(html, "html5lib") + + monkeypatch.setattr(imdb, "asoup_from_url", AsyncMock(return_value=soup)) + + page = await imdb.load_ratings_page("fakeurl") + assert len(page.ratings) == 100 + assert page.imdb_user_id is not None + assert page.imdb_user_id == "ur655321" + assert page.imdb_user_name == "AlexUltra" + assert page.next_page_url is not None + assert page.next_page_url.startswith("/user/ur655321/ratings?") diff --git a/unwind/imdb.py b/unwind/imdb.py index 6646d78..542751f 100644 --- a/unwind/imdb.py +++ b/unwind/imdb.py @@ -2,6 +2,7 @@ import json import logging import re from collections import namedtuple +from dataclasses import dataclass, field from datetime import datetime from typing import AsyncIterable from urllib.parse import urljoin @@ -48,7 +49,7 @@ async def refresh_user_ratings_from_imdb(stop_on_dupe: bool = True): log.info("⚡️ Loading data for %s ...", user.name) try: - async for rating, is_updated in load_ratings(user.imdb_id): + async for rating, is_updated in load_and_store_ratings(user.imdb_id): assert rating.user is not None and rating.user.id == user.id if stop_on_dupe and not is_updated: @@ -154,27 +155,35 @@ def movie_and_rating_from_item(item: bs4.Tag) -> tuple[Movie, Rating]: ForgedRequest = namedtuple("ForgedRequest", "url 
headers") +MovieId = str # ttXXXXXXXX +UserId = str # urXXXXXXXX -async def parse_page(url: str) -> tuple[list[Rating], str | None]: - ratings = [] + +@dataclass +class RatingsPage: + ratings: list[Rating] = field(default_factory=list) + next_page_url: str | None = None + imdb_user_id: UserId | None = None + imdb_user_name: str | None = None + + +async def load_ratings_page(url: str) -> RatingsPage: + page = RatingsPage() soup = await asoup_from_url(url) if (meta := soup.find("meta", property="pageId")) is None: raise RuntimeError("No pageId found.") assert isinstance(meta, bs4.Tag) - imdb_id = meta["content"] - assert isinstance(imdb_id, str) - async with db.new_connection() as conn: - user = await db.get(conn, User, imdb_id=imdb_id) or User( - imdb_id=imdb_id, name="", secret="" - ) + if isinstance(page_id := meta["content"], list): + page_id = page_id[0] + page.imdb_user_id = page_id if (headline := soup.h1) is None: raise RuntimeError("No headline found.") assert isinstance(headline.string, str) if match := find_name(headline.string): - user.name = match["name"] + page.imdb_user_name = match["name"] items = soup.find_all("div", "lister-item-content") for i, item in enumerate(items): @@ -191,48 +200,62 @@ async def parse_page(url: str) -> tuple[list[Rating], str | None]: ) continue - rating.user = user rating.movie = movie - ratings.append(rating) + page.ratings.append(rating) - next_url = None if (footer := soup.find("div", "footer")) is None: raise RuntimeError("No footer found.") assert isinstance(footer, bs4.Tag) - if (next_link := footer.find("a", string="Next")) is not None: + if (next_link := footer.find("a", string=re.compile("Next"))) is not None: assert isinstance(next_link, bs4.Tag) next_href = next_link["href"] assert isinstance(next_href, str) - next_url = urljoin(url, next_href) + page.next_page_url = urljoin(url, next_href) - return (ratings, next_url if url != next_url else None) + return page -async def load_ratings(user_id: str) -> 
AsyncIterable[tuple[Rating, bool]]: +async def load_and_store_ratings( + user_id: UserId, +) -> AsyncIterable[tuple[Rating, bool]]: + async with db.new_connection() as conn: + user = await db.get(conn, User, imdb_id=user_id) or User( + imdb_id=user_id, name="", secret="" + ) + + is_first = True + async for rating in load_ratings(user_id): + assert rating.movie + + rating.user = user + + async with db.transaction() as conn: + if is_first: + is_first = False + # All rating objects share the same user. + await db.add_or_update_user(conn, rating.user) + rating.user_id = rating.user.id + + await db.add_or_update_movie(conn, rating.movie) + rating.movie_id = rating.movie.id + + is_updated = await db.add_or_update_rating(conn, rating) + + yield rating, is_updated + + +async def load_ratings(user_id: UserId) -> AsyncIterable[Rating]: next_url = user_ratings_url(user_id) while next_url: - ratings, next_url = await parse_page(next_url) - - for i, rating in enumerate(ratings): - assert rating.user and rating.movie - - async with db.transaction() as conn: - if i == 0: - # All rating objects share the same user. 
- await db.add_or_update_user(conn, rating.user) - rating.user_id = rating.user.id - - await db.add_or_update_movie(conn, rating.movie) - rating.movie_id = rating.movie.id - - is_updated = await db.add_or_update_rating(conn, rating) - - yield rating, is_updated + ratings_page = await load_ratings_page(next_url) + next_url = ratings_page.next_page_url if ratings_page.next_page_url != next_url else None + for rating in ratings_page.ratings: + yield rating -async def _ids_from_list_html(url: str) -> AsyncIterable[str]: +async def _ids_from_list_html(url: str) -> AsyncIterable[MovieId]: """Return all IMDb movie IDs (`tt*`) from the given URL.""" # document.querySelectorAll('li.ipc-metadata-list-summary-item a.ipc-title-link-wrapper') # .href: '/title/tt1213644/?ref_=chtbtm_t_1' @@ -245,7 +268,7 @@ async def _ids_from_list_html(url: str) -> AsyncIterable[str]: yield match_["id"] -async def load_most_popular_100() -> list[str]: +async def load_most_popular_100() -> list[MovieId]: """Return the IMDb's top 100 most popular movies. IMDb Charts: Most Popular Movies @@ -258,7 +281,7 @@ async def load_most_popular_100() -> list[str]: return ids -async def load_bottom_100() -> list[str]: +async def load_bottom_100() -> list[MovieId]: """Return the IMDb's bottom 100 lowest rated movies. IMDb Charts: Lowest Rated Movies @@ -271,7 +294,7 @@ async def load_bottom_100() -> list[str]: return ids -async def load_top_250() -> list[str]: +async def load_top_250() -> list[MovieId]: """Return the IMDb's top 250 highest rated movies. IMDb Charts: IMDb Top 250 Movies