fix: find next rating page

2024-05-10 00:13:32 +02:00 · 2024-05-10 00:13:32 +02:00 · d385860ca9
commit d385860ca9
parent 738799cc74
3 changed files with 78 additions and 38 deletions
--- a/tests/fixtures/ratings-ur655321.html.bz2
+++ b/tests/fixtures/ratings-ur655321.html.bz2
--- a/tests/test_imdb.py
+++ b/tests/test_imdb.py
@ -65,3 +65,20 @@ async def test_load_top_250(monkeypatch):
    movie_ids = await imdb.load_top_250()
    assert len(movie_ids) == 250
    assert all(id_.startswith("tt") for id_ in movie_ids)
+
+
+@pytest.mark.asyncio
+async def test_load_ratings_page(monkeypatch):
+    """Parse a stored ratings page fixture and check the extracted fields."""
+    with bz2.open(fixturesdir / "ratings-ur655321.html.bz2", "rb") as f:
+        html = f.read()
+    soup = bs4.BeautifulSoup(html, "html5lib")
+
+    # Hand the pre-parsed fixture to the loader in place of `asoup_from_url`.
+    monkeypatch.setattr(imdb, "asoup_from_url", AsyncMock(return_value=soup))
+
+    # With the fetch mocked, the URL only serves as the base for resolving the
+    # next-page link.
+    page = await imdb.load_ratings_page("fakeurl")
+    assert len(page.ratings) == 100
+    assert page.imdb_user_id is not None
+    assert page.imdb_user_id == "ur655321"
+    assert page.imdb_user_name == "AlexUltra"
+    assert page.next_page_url is not None
+    assert page.next_page_url.startswith("/user/ur655321/ratings?")
--- a/unwind/imdb.py
+++ b/unwind/imdb.py
@ -2,6 +2,7 @@ import json
 import logging
 import re
 from collections import namedtuple
+from dataclasses import dataclass, field
 from datetime import datetime
 from typing import AsyncIterable
 from urllib.parse import urljoin
@ -48,7 +49,7 @@ async def refresh_user_ratings_from_imdb(stop_on_dupe: bool = True):
            log.info("⚡️ Loading data for %s ...", user.name)

            try:
-                async for rating, is_updated in load_ratings(user.imdb_id):
+                async for rating, is_updated in load_and_store_ratings(user.imdb_id):
                    assert rating.user is not None and rating.user.id == user.id

                    if stop_on_dupe and not is_updated:
@ -154,27 +155,35 @@ def movie_and_rating_from_item(item: bs4.Tag) -> tuple[Movie, Rating]:

 ForgedRequest = namedtuple("ForgedRequest", "url headers")

+MovieId = str  # IMDb movie ID of the form ttXXXXXXXX, e.g. "tt1213644"
+UserId = str  # IMDb user ID of the form urXXXXXXXX, e.g. "ur655321"

-async def parse_page(url: str) -> tuple[list[Rating], str | None]:
-    ratings = []
+
+@dataclass
+class RatingsPage:
+    """Data extracted from a single IMDb user ratings page."""
+
+    # Ratings parsed from the list items found on the page.
+    ratings: list[Rating] = field(default_factory=list)
+    # Link to the following page, resolved against the URL of this page;
+    # None when the page footer has no "Next" link.
+    next_page_url: str | None = None
+    # Content of the page's `pageId` meta tag.
+    imdb_user_id: UserId | None = None
+    # Name matched in the page headline; None when the headline did not match.
+    imdb_user_name: str | None = None
+
+
+async def load_ratings_page(url: str) -> RatingsPage:
+    page = RatingsPage()

    soup = await asoup_from_url(url)

    if (meta := soup.find("meta", property="pageId")) is None:
        raise RuntimeError("No pageId found.")
    assert isinstance(meta, bs4.Tag)
-    imdb_id = meta["content"]
-    assert isinstance(imdb_id, str)
-    async with db.new_connection() as conn:
-        user = await db.get(conn, User, imdb_id=imdb_id) or User(
-            imdb_id=imdb_id, name="", secret=""
-        )
+    if isinstance(page_id := meta["content"], list):
+        page_id = page_id[0]
+    page.imdb_user_id = page_id

    if (headline := soup.h1) is None:
        raise RuntimeError("No headline found.")
    assert isinstance(headline.string, str)
    if match := find_name(headline.string):
-        user.name = match["name"]
+        page.imdb_user_name = match["name"]

    items = soup.find_all("div", "lister-item-content")
    for i, item in enumerate(items):
@ -191,48 +200,62 @@ async def parse_page(url: str) -> tuple[list[Rating], str | None]:
            )
            continue

-        rating.user = user
        rating.movie = movie

-        ratings.append(rating)
+        page.ratings.append(rating)

-    next_url = None
    if (footer := soup.find("div", "footer")) is None:
        raise RuntimeError("No footer found.")
    assert isinstance(footer, bs4.Tag)
-    if (next_link := footer.find("a", string="Next")) is not None:
+    if (next_link := footer.find("a", string=re.compile("Next"))) is not None:
        assert isinstance(next_link, bs4.Tag)
        next_href = next_link["href"]
        assert isinstance(next_href, str)
-        next_url = urljoin(url, next_href)
+        page.next_page_url = urljoin(url, next_href)

-    return (ratings, next_url if url != next_url else None)
+    return page


-async def load_ratings(user_id: str) -> AsyncIterable[tuple[Rating, bool]]:
+async def load_and_store_ratings(
+    user_id: UserId,
+) -> AsyncIterable[tuple[Rating, bool]]:
+    """Load all ratings of an IMDb user and store them in the database.
+
+    Every rating is stored together with its movie, each in its own
+    transaction. The user is stored once, together with the first rating.
+
+    Yields `(rating, is_updated)` pairs, where `is_updated` is the value
+    returned by `db.add_or_update_rating` for that rating.
+    """
+    async with db.new_connection() as conn:
+        # Reuse the stored user if there is one, otherwise start a new record.
+        # NOTE(review): the user name parsed by `load_ratings_page` is not
+        # applied here, so a new user keeps an empty name - confirm intended.
+        user = await db.get(conn, User, imdb_id=user_id) or User(
+            imdb_id=user_id, name="", secret=""
+        )
+
+    is_first = True
+    async for rating in load_ratings(user_id):
+        assert rating.movie
+
+        rating.user = user
+
+        async with db.transaction() as conn:
+            if is_first:
+                is_first = False
+                # All rating objects share the same user.
+                await db.add_or_update_user(conn, rating.user)
+            rating.user_id = rating.user.id
+
+            await db.add_or_update_movie(conn, rating.movie)
+            rating.movie_id = rating.movie.id
+
+            is_updated = await db.add_or_update_rating(conn, rating)
+
+        yield rating, is_updated
+
+
+async def load_ratings(user_id: UserId) -> AsyncIterable[Rating]:
+    """Yield the ratings of an IMDb user, following the page links.
+
+    Pages are loaded one at a time, starting at `user_ratings_url(user_id)`,
+    until a page has no next-page link.
+    """
    next_url = user_ratings_url(user_id)

    while next_url:
-        ratings, next_url = await parse_page(next_url)
-
-        for i, rating in enumerate(ratings):
-            assert rating.user and rating.movie
-
-            async with db.transaction() as conn:
-                if i == 0:
-                    # All rating objects share the same user.
-                    await db.add_or_update_user(conn, rating.user)
-                rating.user_id = rating.user.id
-
-                await db.add_or_update_movie(conn, rating.movie)
-                rating.movie_id = rating.movie.id
-
-                is_updated = await db.add_or_update_rating(conn, rating)
-
-            yield rating, is_updated
+        current_url = next_url
+        ratings_page = await load_ratings_page(current_url)
+        next_url = ratings_page.next_page_url
+        if next_url == current_url:
+            # A page that links to itself would be loaded forever; treat it
+            # as the last page, as `parse_page` did.
+            next_url = None
+        for rating in ratings_page.ratings:
+            yield rating


-async def _ids_from_list_html(url: str) -> AsyncIterable[str]:
+async def _ids_from_list_html(url: str) -> AsyncIterable[MovieId]:
    """Return all IMDb movie IDs (`tt*`) from the given URL."""
    # document.querySelectorAll('li.ipc-metadata-list-summary-item a.ipc-title-link-wrapper')
    # .href: '/title/tt1213644/?ref_=chtbtm_t_1'
@ -245,7 +268,7 @@ async def _ids_from_list_html(url: str) -> AsyncIterable[str]:
                    yield match_["id"]


-async def load_most_popular_100() -> list[str]:
+async def load_most_popular_100() -> list[MovieId]:
    """Return the IMDb's top 100 most popular movies.

    IMDb Charts: Most Popular Movies
@ -258,7 +281,7 @@ async def load_most_popular_100() -> list[str]:
    return ids


-async def load_bottom_100() -> list[str]:
+async def load_bottom_100() -> list[MovieId]:
    """Return the IMDb's bottom 100 lowest rated movies.

    IMDb Charts: Lowest Rated Movies
@ -271,7 +294,7 @@ async def load_bottom_100() -> list[str]:
    return ids


-async def load_top_250() -> list[str]:
+async def load_top_250() -> list[MovieId]:
    """Return the IMDb's top 250 highest rated movies.

    IMDb Charts: IMDb Top 250 Movies