diff --git a/tests/fixtures/ratings-ur655321-20240720.gql.json.bz2 b/tests/fixtures/ratings-ur655321-20240720.gql.json.bz2 new file mode 100644 index 0000000..b56b820 Binary files /dev/null and b/tests/fixtures/ratings-ur655321-20240720.gql.json.bz2 differ diff --git a/tests/fixtures/ratings-ur655321-20240720.html.bz2 b/tests/fixtures/ratings-ur655321-20240720.html.bz2 new file mode 100644 index 0000000..01a633a Binary files /dev/null and b/tests/fixtures/ratings-ur655321-20240720.html.bz2 differ diff --git a/tests/test_imdb.py b/tests/test_imdb.py index d4a5db5..aa3f55a 100644 --- a/tests/test_imdb.py +++ b/tests/test_imdb.py @@ -134,7 +134,7 @@ async def test_load_ratings_page_20240510(monkeypatch): if rating.movie.imdb_id == item["imdb_id"]: rating_dict = {key: getattr(rating.movie, key) for key in item.keys()} return rating_dict - raise AssertionError() + raise AssertionError(f"{item['imdb_id']} not found in page.ratings") a_movie = { "title": "Kung Fu Panda 4", @@ -142,8 +142,9 @@ "media_type": "Movie", "imdb_id": "tt21692408", "imdb_score": 59, - "imdb_votes": 36000, + "imdb_votes": 36069, "runtime": 94, + "genres": {"Action", "Adventure", "Animation"}, } assert a_movie == movie(a_movie) @@ -153,7 +154,8 @@ "media_type": "TV Series", "imdb_id": "tt8888540", "imdb_score": 64, - "imdb_votes": 6000, + "imdb_votes": 6044, + "genres": {"Drama"}, } assert a_running_tvseries == movie(a_running_tvseries) @@ -163,29 +165,94 @@ "media_type": "TV Series", "imdb_id": "tt0072500", "imdb_score": 87, - "imdb_votes": 100000, + "imdb_votes": 100261, + "genres": {"Comedy"}, } assert a_finished_tvseries == movie(a_finished_tvseries) a_tvepisode = { "title": "Columbo / No Time to Die", - "original_title": None, + "original_title": "Columbo / No Time to Die", "release_year": 1992, "media_type": "TV Episode", 
"imdb_id": "tt0103987", "imdb_score": 59, - "imdb_votes": 2100, + "imdb_votes": 2122, "runtime": 98, + "genres": {"Crime", "Drama", "Mystery"}, } assert a_tvepisode == movie(a_tvepisode) a_videogame = { "title": "Alan Wake", - "original_title": None, + "original_title": "Alan Wake", "release_year": 2010, "media_type": "Video Game", "imdb_id": "tt0466662", - "imdb_score": 82, - "imdb_votes": 7300, + # The data from __NEXT_DATA__ is wrong, the actual values should be: + # "imdb_score": 82, + # "imdb_votes": 7300, + # "genres": {"Action", "Adventure", "Horror"}, + "imdb_score": 67, # Wrong value, but correctly parsed from __NEXT_DATA__ + "imdb_votes": 11655, # Wrong value, but correctly parsed from __NEXT_DATA__ + "genres": {"Comedy", "Crime", "Drama"}, # Wrong value } assert a_videogame == movie(a_videogame) + + +@pytest.mark.asyncio +async def test_load_ratings_page_20240720(monkeypatch): + with bz2.open(fixturesdir / "ratings-ur655321-20240720.html.bz2", "rb") as f: + html = f.read() + soup = bs4.BeautifulSoup(html, "html5lib") + monkeypatch.setattr(imdb, "asoup_from_url", AsyncMock(return_value=soup)) + + with bz2.open(fixturesdir / "ratings-ur655321-20240720.gql.json.bz2", "rb") as f: + jsonstr = f.read() + async with imdb.asession() as s: + monkeypatch.setattr(s, "post", AsyncMock(return_value=_mock_response(jsonstr))) + page = await imdb._load_ratings_page("fakeurl", "ur655321") + assert len(page.ratings) == 100 + assert page.imdb_user_id is not None + assert page.imdb_user_id == "ur655321" + assert page.imdb_user_name == "AlexUltra" + assert page.next_page_url is None, "not supported for new ratings page" + + def movie(item: dict): + for rating in page.ratings: + assert rating.movie + if rating.movie.imdb_id == item["imdb_id"]: + rating_dict = {key: getattr(rating.movie, key) for key in item.keys()} + return rating_dict + raise AssertionError(f"{item["imdb_id"]} not found in page.ratings") + + a_movie = { + "title": "Kung Fu Panda 4", + "release_year": 2024, + 
"media_type": "Movie", + "imdb_id": "tt21692408", + "imdb_score": 59, + "imdb_votes": 48018, + "runtime": 94, + } + assert a_movie == movie(a_movie) + + a_running_tvseries = { + "title": "Palm Royale", + "release_year": 2024, + "media_type": "TV Series", + "imdb_id": "tt8888540", + "imdb_score": 63, + "imdb_votes": 9458, + } + assert a_running_tvseries == movie(a_running_tvseries) + + a_finished_tvseries = { + "title": "Fawlty Towers", + "release_year": 1975, + "media_type": "TV Series", + "imdb_id": "tt0072500", + "imdb_score": 87, + "imdb_votes": 100860, + } + assert a_finished_tvseries == movie(a_finished_tvseries) diff --git a/unwind/imdb.py b/unwind/imdb.py index 24e3311..3d844a5 100644 --- a/unwind/imdb.py +++ b/unwind/imdb.py @@ -4,7 +4,7 @@ import re from collections import namedtuple from dataclasses import dataclass, field from datetime import datetime -from typing import AsyncIterable, Iterable +from typing import Any, AsyncIterable, Iterable from urllib.parse import urljoin import bs4 @@ -116,9 +116,8 @@ find_vote_count = re.compile(r"\((?P\d+(\.\d+)?K?)\)").fullmatch def _first_string(tag: bs4.Tag) -> str | None: - for child in tag.children: - if isinstance(child, str): - return child + for s in tag.strings: + return s def _tv_episode_title(series_name: str, episode_name: str) -> str: @@ -244,7 +243,8 @@ async def _load_ratings_page(url: str, user_id: ImdbUserId) -> _RatingsPage: """Dispatch to handlers for different ratings page versions.""" soup = await asoup_from_url(url) - + if soup.find("script", id="__NEXT_DATA__", type="application/json") is not None: + return await _load_ratings_page_202407(user_id, url, soup) if soup.find("meta", property="imdb:pageConst") is not None: return await _load_ratings_page_2024(user_id, url, soup) elif soup.find("meta", property="pageId") is not None: @@ -253,6 +253,90 @@ async def _load_ratings_page(url: str, user_id: ImdbUserId) -> _RatingsPage: raise RuntimeError("Unknown ratings page version.") +def 
_get_or_None(d: dict[str, Any], keys: list[str]) -> Any | None: + for k in keys: + try: + d = d[k] + except KeyError: + return None + if d is None: + break + return d + + +def _parse_movies_from_nextdata_202407(nextdata: dict) -> Iterable[Movie]: + nextratings = nextdata["props"]["pageProps"]["mainColumnData"][ + "advancedTitleSearch" + ]["edges"] + for ratingdata in nextratings: + ratingdata = ratingdata["node"]["title"] + # endYear=ratingdata["releaseYear"]["endYear"] + imdb_rating = _get_or_None(ratingdata, ["ratingsSummary", "aggregateRating"]) + runtime_s = _get_or_None(ratingdata, ["runtime", "seconds"]) + + movie = Movie( + title=ratingdata["titleText"]["text"], + original_title=_get_or_None(ratingdata, ["originalTitleText", "text"]), + release_year=ratingdata["releaseYear"]["year"], + media_type=ratingdata["titleType"]["text"], + imdb_id=ratingdata["id"], + imdb_score=( + None if imdb_rating is None else score_from_imdb_rating(imdb_rating) + ), + imdb_votes=_get_or_None(ratingdata, ["ratingsSummary", "voteCount"]), + runtime=None if runtime_s is None else int(runtime_s / 60), + genres={ + genre["genre"]["text"] for genre in ratingdata["titleGenres"]["genres"] + }, + ) + + if movie.media_type == "TV Episode": + seriesdata = ratingdata["series"]["series"] + series_original_title = seriesdata["originalTitleText"]["text"] + series_title = seriesdata["titleText"]["text"] + # series_id = seriesdata["releaseYear"]["id"] + # series_year = seriesdata["releaseYear"]["year"] + # series_endyear = seriesdata["releaseYear"]["endYear"] + movie.title = _tv_episode_title(series_title, movie.title) + movie.original_title = _tv_episode_title( + series_original_title, movie.original_title + ) + + yield movie + + +async def _load_ratings_page_202407( + user_id: ImdbUserId, url: str, soup: bs4.BeautifulSoup +) -> _RatingsPage: + """Handle the ratings page from July 2024.""" + if ( + nextjson := soup.find("script", id="__NEXT_DATA__", type="application/json") + ) is None: + raise 
RuntimeError("No __NEXT_DATA__ BLOB found.") + + nextdata = json.loads(nextjson.string.strip()) + userdata = nextdata["props"]["pageProps"]["aboveTheFoldData"] + + page = _RatingsPage( + imdb_user_id=userdata["authorId"], + imdb_user_name=userdata["authorName"], + ) + + movies = _parse_movies_from_nextdata_202407(nextdata) + movies_dict = {m.imdb_id: m for m in movies} + async for rating in _load_user_movie_ratings(user_id, list(movies_dict.keys())): + movie = movies_dict[rating.movie_id] + rating = Rating( + movie=movie, + score=score_from_imdb_rating(rating.imdb_rating), + rating_date=rating.rating_date, + ) + + page.ratings.append(rating) + + return page + + async def _load_ratings_page_2024( user_id: ImdbUserId, url: str, soup: bs4.BeautifulSoup ) -> _RatingsPage: @@ -559,3 +643,55 @@ async def _load_user_movie_ratings( except KeyError as err: log.error("Unexpected data structure.", exc_info=err) raise + + +async def _load_user_ratings_202407( + user_id: ImdbUserId, movie_ids: list[ImdbMovieId] +) -> AsyncIterable[_UserMovieRating]: + """ + + This is a new API that showed up in July 2024. + It's used on a user's ratings page to load their ratings. 
+ """ + raise NotImplementedError() + + qgl_api_url = "https://api.graphql.imdb.com/" + headers = { + "accept": "application/graphql+json, application/json", + "content-type": "application/json", + "origin": "https://www.imdb.com", + } + query = { + "operationName": "RatingsPage", + "variables": json.dumps( + { + # "after": ..., + "filter": { + "explicitContentConstraint": { + "explicitContentFilter": "INCLUDE_ADULT" + }, + "singleUserRatingConstraint": { + "filterType": "INCLUDE", + "userId": user_id, + }, + }, + "first": 100, + "locale": "en-US", + "sort": {"sortBy": "SINGLE_USER_RATING_DATE", "sortOrder": "ASC"}, + } + ), + "extensions": json.dumps( + { + "persistedQuery": { + "sha256Hash": "ae30a55f169252b5f0208d686f41aaff231d7f70bb75c257732c80234d71dbe9", + "version": 1, + } + } + ), + } + async with asession() as s: + r = await s.get(qgl_api_url, headers=headers, query=query, timeout=10) + r.raise_for_status() + data = r.json() + + # ...