fix: find next rating page

ducklet 2024-05-10 00:13:32 +02:00
parent 738799cc74
commit d385860ca9
3 changed files with 78 additions and 38 deletions
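Context for the fix: bs4's `string="Next"` only matches a link whose text is exactly "Next", so the next-page link was never found and pagination stopped after the first page. A minimal sketch of the difference; the "Next »" label is an assumption about IMDb's markup, which isn't shown here:

import re

import bs4

html = '<div class="footer"><a href="?x=1">Next &#187;</a></div>'
footer = bs4.BeautifulSoup(html, "html5lib").find("div", "footer")
assert isinstance(footer, bs4.Tag)

# Exact string match fails against "Next »" ...
assert footer.find("a", string="Next") is None
# ... while the regex matches any link text containing "Next".
assert footer.find("a", string=re.compile("Next")) is not None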

BIN  tests/fixtures/ratings-ur655321.html.bz2 vendored (new file; binary content not shown)

@@ -65,3 +65,20 @@ async def test_load_top_250(monkeypatch):
     movie_ids = await imdb.load_top_250()
     assert len(movie_ids) == 250
     assert all(id_.startswith("tt") for id_ in movie_ids)
+
+
+@pytest.mark.asyncio
+async def test_load_ratings_page(monkeypatch):
+    with bz2.open(fixturesdir / "ratings-ur655321.html.bz2", "rb") as f:
+        html = f.read()
+    soup = bs4.BeautifulSoup(html, "html5lib")
+    monkeypatch.setattr(imdb, "asoup_from_url", AsyncMock(return_value=soup))
+
+    page = await imdb.load_ratings_page("fakeurl")
+
+    assert len(page.ratings) == 100
+    assert page.imdb_user_id is not None
+    assert page.imdb_user_id == "ur655321"
+    assert page.imdb_user_name == "AlexUltra"
+    assert page.next_page_url is not None
+    assert page.next_page_url.startswith("/user/ur655321/ratings?")
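The new fixture is a bz2-compressed snapshot of a real ratings page. A hypothetical way to (re)generate it with the standard library; the project's actual capture method isn't shown, and IMDb may reject requests without browser-like headers:

import bz2
import urllib.request

def save_ratings_fixture(user_id: str, path: str) -> None:
    # User-Agent is a guess at what IMDb requires.
    req = urllib.request.Request(
        f"https://www.imdb.com/user/{user_id}/ratings",
        headers={"User-Agent": "Mozilla/5.0"},
    )
    with urllib.request.urlopen(req) as resp:
        html = resp.read()
    with bz2.open(path, "wb") as f:
        f.write(html)

save_ratings_fixture("ur655321", "tests/fixtures/ratings-ur655321.html.bz2")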


@@ -2,6 +2,7 @@ import json
 import logging
 import re
 from collections import namedtuple
+from dataclasses import dataclass, field
 from datetime import datetime
 from typing import AsyncIterable
 from urllib.parse import urljoin
@@ -48,7 +49,7 @@ async def refresh_user_ratings_from_imdb(stop_on_dupe: bool = True):
         log.info("⚡️ Loading data for %s ...", user.name)
         try:
-            async for rating, is_updated in load_ratings(user.imdb_id):
+            async for rating, is_updated in load_and_store_ratings(user.imdb_id):
                 assert rating.user is not None and rating.user.id == user.id
                 if stop_on_dupe and not is_updated:
@@ -154,27 +155,35 @@ def movie_and_rating_from_item(item: bs4.Tag) -> tuple[Movie, Rating]:
 ForgedRequest = namedtuple("ForgedRequest", "url headers")
 
+MovieId = str  # ttXXXXXXXX
+UserId = str  # urXXXXXXXX
 
-async def parse_page(url: str) -> tuple[list[Rating], str | None]:
-    ratings = []
+
+@dataclass
+class RatingsPage:
+    ratings: list[Rating] = field(default_factory=list)
+    next_page_url: str | None = None
+    imdb_user_id: UserId | None = None
+    imdb_user_name: str | None = None
+
+
+async def load_ratings_page(url: str) -> RatingsPage:
+    page = RatingsPage()
     soup = await asoup_from_url(url)
     if (meta := soup.find("meta", property="pageId")) is None:
         raise RuntimeError("No pageId found.")
     assert isinstance(meta, bs4.Tag)
-    imdb_id = meta["content"]
-    assert isinstance(imdb_id, str)
-    async with db.new_connection() as conn:
-        user = await db.get(conn, User, imdb_id=imdb_id) or User(
-            imdb_id=imdb_id, name="", secret=""
-        )
+    if isinstance(page_id := meta["content"], list):
+        page_id = page_id[0]
+    page.imdb_user_id = page_id
     if (headline := soup.h1) is None:
         raise RuntimeError("No headline found.")
     assert isinstance(headline.string, str)
     if match := find_name(headline.string):
-        user.name = match["name"]
+        page.imdb_user_name = match["name"]
 
     items = soup.find_all("div", "lister-item-content")
     for i, item in enumerate(items):
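Note the `field(default_factory=list)` on `ratings`: dataclasses reject a bare mutable default precisely because it would be shared across instances. A standalone sketch of standard dataclass behavior, not project code:

from dataclasses import dataclass, field

@dataclass
class Page:
    # A bare `items: list[str] = []` raises ValueError at class
    # definition time; default_factory builds a fresh list per instance.
    items: list[str] = field(default_factory=list)

a, b = Page(), Page()
a.items.append("tt0111161")
assert b.items == []  # each instance owns its own list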
@@ -191,35 +200,39 @@ async def parse_page(url: str) -> tuple[list[Rating], str | None]:
             )
             continue
 
-        rating.user = user
         rating.movie = movie
-        ratings.append(rating)
+        page.ratings.append(rating)
 
-    next_url = None
     if (footer := soup.find("div", "footer")) is None:
         raise RuntimeError("No footer found.")
     assert isinstance(footer, bs4.Tag)
-    if (next_link := footer.find("a", string="Next")) is not None:
+    if (next_link := footer.find("a", string=re.compile("Next"))) is not None:
         assert isinstance(next_link, bs4.Tag)
         next_href = next_link["href"]
         assert isinstance(next_href, str)
-        next_url = urljoin(url, next_href)
-    return (ratings, next_url if url != next_url else None)
+        page.next_page_url = urljoin(url, next_href)
+    return page
 
 
-async def load_ratings(user_id: str) -> AsyncIterable[tuple[Rating, bool]]:
-    next_url = user_ratings_url(user_id)
+async def load_and_store_ratings(
+    user_id: UserId,
+) -> AsyncIterable[tuple[Rating, bool]]:
+    async with db.new_connection() as conn:
+        user = await db.get(conn, User, imdb_id=user_id) or User(
+            imdb_id=user_id, name="", secret=""
+        )
 
-    while next_url:
-        ratings, next_url = await parse_page(next_url)
-
-        for i, rating in enumerate(ratings):
-            assert rating.user and rating.movie
-            async with db.transaction() as conn:
-                if i == 0:
+    is_first = True
+    async for rating in load_ratings(user_id):
+        assert rating.movie
+        rating.user = user
+        async with db.transaction() as conn:
+            if is_first:
+                is_first = False
                 # All rating objects share the same user.
                 await db.add_or_update_user(conn, rating.user)
                 rating.user_id = rating.user.id
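`urljoin` resolves the relative href from the `Next` link against the page URL that was fetched. A sketch with illustrative values; the query parameter below is hypothetical, not IMDb's real one:

from urllib.parse import urljoin

base = "https://www.imdb.com/user/ur655321/ratings"
href = "/user/ur655321/ratings?page=2"  # hypothetical href from the Next link
assert urljoin(base, href) == "https://www.imdb.com/user/ur655321/ratings?page=2"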
@@ -232,7 +245,17 @@ async def load_ratings(user_id: str) -> AsyncIterable[tuple[Rating, bool]]:
             yield rating, is_updated
 
 
-async def _ids_from_list_html(url: str) -> AsyncIterable[str]:
+async def load_ratings(user_id: UserId) -> AsyncIterable[Rating]:
+    next_url = user_ratings_url(user_id)
+
+    while next_url:
+        ratings_page = await load_ratings_page(next_url)
+        next_url = ratings_page.next_page_url
+        for rating in ratings_page.ratings:
+            yield rating
+
+
+async def _ids_from_list_html(url: str) -> AsyncIterable[MovieId]:
     """Return all IMDb movie IDs (`tt*`) from the given URL."""
     # document.querySelectorAll('li.ipc-metadata-list-summary-item a.ipc-title-link-wrapper')
     # .href: '/title/tt1213644/?ref_=chtbtm_t_1'
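With this split, `load_ratings` is a pure scraping generator with no database access, while `load_and_store_ratings` layers persistence on top. A usage sketch, assuming the module imports as `imdb` as in the tests:

import asyncio

import imdb

async def main() -> None:
    # Ratings stream page by page; no user or DB state is attached here.
    async for rating in imdb.load_ratings("ur655321"):
        print(rating.movie)

asyncio.run(main())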
@@ -245,7 +268,7 @@ async def _ids_from_list_html(url: str) -> AsyncIterable[str]:
         yield match_["id"]
 
 
-async def load_most_popular_100() -> list[str]:
+async def load_most_popular_100() -> list[MovieId]:
     """Return the IMDb's top 100 most popular movies.
 
     IMDb Charts: Most Popular Movies
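`MovieId` and `UserId` are plain aliases of `str`: they document intent in signatures like the one above but aren't enforced. If stricter checking were ever wanted, `typing.NewType` would be the standard alternative; a sketch, not project code:

from typing import NewType

StrictMovieId = NewType("StrictMovieId", str)

def describe(movie_id: StrictMovieId) -> str:
    return f"IMDb title {movie_id}"

describe(StrictMovieId("tt0111161"))  # ok
# describe("tt0111161")  # a type checker flags this; runtime accepts it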
@@ -258,7 +281,7 @@ async def load_most_popular_100() -> list[str]:
     return ids
 
 
-async def load_bottom_100() -> list[str]:
+async def load_bottom_100() -> list[MovieId]:
     """Return the IMDb's bottom 100 lowest rated movies.
 
     IMDb Charts: Lowest Rated Movies
@@ -271,7 +294,7 @@ async def load_bottom_100() -> list[str]:
     return ids
 
 
-async def load_top_250() -> list[str]:
+async def load_top_250() -> list[MovieId]:
     """Return the IMDb's top 250 highest rated movies.
 
     IMDb Charts: IMDb Top 250 Movies