fix: support new user ratings page markup

We use __NEXT_DATA__ from the page to find the user's latest rated
movies.
We found that in at least one case (a Video Game) the movie details
were wrong. Normally this shouldn't be a problem, because we already
know all the movies and keep the values we already have. Apart from
that, the data from __NEXT_DATA__ seems more accurate and complete.
This commit is contained in:
ducklet 2024-07-21 14:46:45 +02:00
parent d7530e6bb0
commit 380d6ff186
4 changed files with 217 additions and 14 deletions

Binary file not shown.

Binary file not shown.

View file

@ -134,7 +134,7 @@ async def test_load_ratings_page_20240510(monkeypatch):
if rating.movie.imdb_id == item["imdb_id"]:
rating_dict = {key: getattr(rating.movie, key) for key in item.keys()}
return rating_dict
raise AssertionError()
raise AssertionError(f"{item["imdb_id"]} not found in page.ratings")
a_movie = {
"title": "Kung Fu Panda 4",
@ -142,8 +142,9 @@ async def test_load_ratings_page_20240510(monkeypatch):
"media_type": "Movie",
"imdb_id": "tt21692408",
"imdb_score": 59,
"imdb_votes": 36000,
"imdb_votes": 36069,
"runtime": 94,
"genres": {"Action", "Adventure", "Animation"},
}
assert a_movie == movie(a_movie)
@ -153,7 +154,8 @@ async def test_load_ratings_page_20240510(monkeypatch):
"media_type": "TV Series",
"imdb_id": "tt8888540",
"imdb_score": 64,
"imdb_votes": 6000,
"imdb_votes": 6044,
"genres": {"Drama"},
}
assert a_running_tvseries == movie(a_running_tvseries)
@ -163,29 +165,94 @@ async def test_load_ratings_page_20240510(monkeypatch):
"media_type": "TV Series",
"imdb_id": "tt0072500",
"imdb_score": 87,
"imdb_votes": 100000,
"imdb_votes": 100261,
"genres": {"Comedy"},
}
assert a_finished_tvseries == movie(a_finished_tvseries)
a_tvepisode = {
"title": "Columbo / No Time to Die",
"original_title": None,
"original_title": "Columbo / No Time to Die",
"release_year": 1992,
"media_type": "TV Episode",
"imdb_id": "tt0103987",
"imdb_score": 59,
"imdb_votes": 2100,
"imdb_votes": 2122,
"runtime": 98,
"genres": {"Crime", "Drama", "Mystery"},
}
assert a_tvepisode == movie(a_tvepisode)
a_videogame = {
"title": "Alan Wake",
"original_title": None,
"original_title": "Alan Wake",
"release_year": 2010,
"media_type": "Video Game",
"imdb_id": "tt0466662",
"imdb_score": 82,
"imdb_votes": 7300,
# The data from __NEXT_DATA__ is wrong, the actual values should be:
# "imdb_score": 82,
# "imdb_votes": 7300,
# "genres": {"Action", "Adventure", "Horror"},
"imdb_score": 67, # Wrong value, but correctly parsed from __NEXT_DATA__
"imdb_votes": 11655, # Wrong value, but correctly parsed from __NEXT_DATA__
"genres": {"Comedy", "Crime", "Drama"}, # Wrong value
}
assert a_videogame == movie(a_videogame)
@pytest.mark.asyncio
async def test_load_ratings_page_20240720(monkeypatch):
    """Parse the July 2024 ratings page markup (__NEXT_DATA__ based).

    Checks the extracted user info and the parsed details of a movie, a
    running TV series, and a finished TV series against the fixture data.
    """
    with bz2.open(fixturesdir / "ratings-ur655321-20240720.html.bz2", "rb") as f:
        html = f.read()
    soup = bs4.BeautifulSoup(html, "html5lib")
    monkeypatch.setattr(imdb, "asoup_from_url", AsyncMock(return_value=soup))
    with bz2.open(fixturesdir / "ratings-ur655321-20240720.gql.json.bz2", "rb") as f:
        jsonstr = f.read()
    async with imdb.asession() as s:
        # The GraphQL POST is mocked so no network traffic happens.
        monkeypatch.setattr(s, "post", AsyncMock(return_value=_mock_response(jsonstr)))
        page = await imdb._load_ratings_page("fakeurl", "ur655321")
    assert len(page.ratings) == 100
    assert page.imdb_user_id is not None
    assert page.imdb_user_id == "ur655321"
    assert page.imdb_user_name == "AlexUltra"
    assert page.next_page_url is None, "not supported for new ratings page"

    def movie(item: dict):
        # Return only the fields named in `item` for the rating whose movie
        # has the matching imdb_id, so each expectation dict compares just
        # the keys it cares about.
        for rating in page.ratings:
            assert rating.movie
            if rating.movie.imdb_id == item["imdb_id"]:
                rating_dict = {key: getattr(rating.movie, key) for key in item.keys()}
                return rating_dict
        # Fixed: nested double quotes inside a double-quoted f-string are a
        # SyntaxError before Python 3.12 (PEP 701); use single quotes inside.
        raise AssertionError(f"{item['imdb_id']} not found in page.ratings")

    a_movie = {
        "title": "Kung Fu Panda 4",
        "release_year": 2024,
        "media_type": "Movie",
        "imdb_id": "tt21692408",
        "imdb_score": 59,
        "imdb_votes": 48018,
        "runtime": 94,
    }
    assert a_movie == movie(a_movie)
    a_running_tvseries = {
        "title": "Palm Royale",
        "release_year": 2024,
        "media_type": "TV Series",
        "imdb_id": "tt8888540",
        "imdb_score": 63,
        "imdb_votes": 9458,
    }
    assert a_running_tvseries == movie(a_running_tvseries)
    a_finished_tvseries = {
        "title": "Fawlty Towers",
        "release_year": 1975,
        "media_type": "TV Series",
        "imdb_id": "tt0072500",
        "imdb_score": 87,
        "imdb_votes": 100860,
    }
    assert a_finished_tvseries == movie(a_finished_tvseries)

View file

@ -4,7 +4,7 @@ import re
from collections import namedtuple
from dataclasses import dataclass, field
from datetime import datetime
from typing import AsyncIterable, Iterable
from typing import Any, AsyncIterable, Iterable
from urllib.parse import urljoin
import bs4
@ -116,9 +116,8 @@ find_vote_count = re.compile(r"\((?P<count>\d+(\.\d+)?K?)\)").fullmatch
def _first_string(tag: bs4.Tag) -> str | None:
    """Return the first string found among *tag*'s descendants, or None."""
    return next(iter(tag.strings), None)
def _tv_episode_title(series_name: str, episode_name: str) -> str:
@ -244,7 +243,8 @@ async def _load_ratings_page(url: str, user_id: ImdbUserId) -> _RatingsPage:
"""Dispatch to handlers for different ratings page versions."""
soup = await asoup_from_url(url)
if soup.find("script", id="__NEXT_DATA__", type="application/json") is not None:
return await _load_ratings_page_202407(user_id, url, soup)
if soup.find("meta", property="imdb:pageConst") is not None:
return await _load_ratings_page_2024(user_id, url, soup)
elif soup.find("meta", property="pageId") is not None:
@ -253,6 +253,90 @@ async def _load_ratings_page(url: str, user_id: ImdbUserId) -> _RatingsPage:
raise RuntimeError("Unknown ratings page version.")
def _get_or_None(d: dict[str, Any], keys: list[str]) -> Any | None:
for k in keys:
try:
d = d[k]
except KeyError:
return None
if d is None:
break
return d
def _parse_movies_from_nextdata_202407(nextdata: dict) -> Iterable[Movie]:
    """Yield Movie objects parsed from the page's __NEXT_DATA__ JSON blob.

    *nextdata* is the decoded __NEXT_DATA__ dict from the July 2024 ratings
    page; the title entries live under props.pageProps.mainColumnData.
    NOTE(review): per the commit message, the details here can occasionally
    be wrong (seen for a Video Game) — callers keep already-known values.
    """
    nextratings = nextdata["props"]["pageProps"]["mainColumnData"][
        "advancedTitleSearch"
    ]["edges"]
    for ratingdata in nextratings:
        ratingdata = ratingdata["node"]["title"]
        # endYear=ratingdata["releaseYear"]["endYear"]
        # Optional fields: fall back to None instead of raising KeyError.
        imdb_rating = _get_or_None(ratingdata, ["ratingsSummary", "aggregateRating"])
        runtime_s = _get_or_None(ratingdata, ["runtime", "seconds"])
        movie = Movie(
            title=ratingdata["titleText"]["text"],
            original_title=_get_or_None(ratingdata, ["originalTitleText", "text"]),
            release_year=ratingdata["releaseYear"]["year"],
            media_type=ratingdata["titleType"]["text"],
            imdb_id=ratingdata["id"],
            imdb_score=(
                None if imdb_rating is None else score_from_imdb_rating(imdb_rating)
            ),
            imdb_votes=_get_or_None(ratingdata, ["ratingsSummary", "voteCount"]),
            # IMDb reports runtime in seconds; the Movie model uses minutes.
            runtime=None if runtime_s is None else int(runtime_s / 60),
            genres={
                genre["genre"]["text"] for genre in ratingdata["titleGenres"]["genres"]
            },
        )
        if movie.media_type == "TV Episode":
            # Episodes carry only their own title; prefix it with the series
            # title (e.g. "Columbo / No Time to Die").
            seriesdata = ratingdata["series"]["series"]
            series_original_title = seriesdata["originalTitleText"]["text"]
            series_title = seriesdata["titleText"]["text"]
            # series_id = seriesdata["releaseYear"]["id"]
            # series_year = seriesdata["releaseYear"]["year"]
            # series_endyear = seriesdata["releaseYear"]["endYear"]
            movie.title = _tv_episode_title(series_title, movie.title)
            movie.original_title = _tv_episode_title(
                series_original_title, movie.original_title
            )
        yield movie
async def _load_ratings_page_202407(
    user_id: ImdbUserId, url: str, soup: bs4.BeautifulSoup
) -> _RatingsPage:
    """Handle the ratings page from July 2024.

    Parses user info and movie details from the page's __NEXT_DATA__ JSON
    blob, then fetches the user's per-movie ratings and combines both into
    a _RatingsPage.

    Raises:
        RuntimeError: if the page has no __NEXT_DATA__ script tag.
    """
    if (
        nextjson := soup.find("script", id="__NEXT_DATA__", type="application/json")
    ) is None:
        raise RuntimeError("No __NEXT_DATA__ BLOB found.")
    nextdata = json.loads(nextjson.string.strip())
    userdata = nextdata["props"]["pageProps"]["aboveTheFoldData"]
    page = _RatingsPage(
        imdb_user_id=userdata["authorId"],
        imdb_user_name=userdata["authorName"],
    )
    # Index parsed movies by IMDb id so each fetched rating can be joined
    # back to its movie details.
    movies = _parse_movies_from_nextdata_202407(nextdata)
    movies_dict = {m.imdb_id: m for m in movies}
    async for rating in _load_user_movie_ratings(user_id, list(movies_dict.keys())):
        movie = movies_dict[rating.movie_id]
        # Rebind `rating` from the raw per-movie rating to the domain Rating.
        rating = Rating(
            movie=movie,
            score=score_from_imdb_rating(rating.imdb_rating),
            rating_date=rating.rating_date,
        )
        page.ratings.append(rating)
    return page
async def _load_ratings_page_2024(
user_id: ImdbUserId, url: str, soup: bs4.BeautifulSoup
) -> _RatingsPage:
@ -559,3 +643,55 @@ async def _load_user_movie_ratings(
except KeyError as err:
log.error("Unexpected data structure.", exc_info=err)
raise
async def _load_user_ratings_202407(
user_id: ImdbUserId, movie_ids: list[ImdbMovieId]
) -> AsyncIterable[_UserMovieRating]:
"""
This is a new API that showed up in July 2024.
It's used on a user's ratings page to load their ratings.
"""
raise NotImplementedError()
qgl_api_url = "https://api.graphql.imdb.com/"
headers = {
"accept": "application/graphql+json, application/json",
"content-type": "application/json",
"origin": "https://www.imdb.com",
}
query = {
"operationName": "RatingsPage",
"variables": json.dumps(
{
# "after": ...,
"filter": {
"explicitContentConstraint": {
"explicitContentFilter": "INCLUDE_ADULT"
},
"singleUserRatingConstraint": {
"filterType": "INCLUDE",
"userId": user_id,
},
},
"first": 100,
"locale": "en-US",
"sort": {"sortBy": "SINGLE_USER_RATING_DATE", "sortOrder": "ASC"},
}
),
"extensions": json.dumps(
{
"persistedQuery": {
"sha256Hash": "ae30a55f169252b5f0208d686f41aaff231d7f70bb75c257732c80234d71dbe9",
"version": 1,
}
}
),
}
async with asession() as s:
r = await s.get(qgl_api_url, headers=headers, query=query, timeout=10)
r.raise_for_status()
data = r.json()
# ...