fix: support new user ratings page markup
We use the __NEXT_DATA__ JSON blob embedded in the page to find the user's latest rated movies. We found that, at least in one case (a Video Game), the movie details in __NEXT_DATA__ were wrong. Normally this shouldn't be a problem, though, because we already know all the movies and we keep the values we already have. Otherwise, the data from __NEXT_DATA__ seems more accurate and complete.
This commit is contained in:
parent
d7530e6bb0
commit
380d6ff186
4 changed files with 217 additions and 14 deletions
BIN
tests/fixtures/ratings-ur655321-20240720.gql.json.bz2
vendored
Normal file
BIN
tests/fixtures/ratings-ur655321-20240720.gql.json.bz2
vendored
Normal file
Binary file not shown.
BIN
tests/fixtures/ratings-ur655321-20240720.html.bz2
vendored
Normal file
BIN
tests/fixtures/ratings-ur655321-20240720.html.bz2
vendored
Normal file
Binary file not shown.
|
|
@ -134,7 +134,7 @@ async def test_load_ratings_page_20240510(monkeypatch):
|
||||||
if rating.movie.imdb_id == item["imdb_id"]:
|
if rating.movie.imdb_id == item["imdb_id"]:
|
||||||
rating_dict = {key: getattr(rating.movie, key) for key in item.keys()}
|
rating_dict = {key: getattr(rating.movie, key) for key in item.keys()}
|
||||||
return rating_dict
|
return rating_dict
|
||||||
raise AssertionError()
|
raise AssertionError(f"{item["imdb_id"]} not found in page.ratings")
|
||||||
|
|
||||||
a_movie = {
|
a_movie = {
|
||||||
"title": "Kung Fu Panda 4",
|
"title": "Kung Fu Panda 4",
|
||||||
|
|
@ -142,8 +142,9 @@ async def test_load_ratings_page_20240510(monkeypatch):
|
||||||
"media_type": "Movie",
|
"media_type": "Movie",
|
||||||
"imdb_id": "tt21692408",
|
"imdb_id": "tt21692408",
|
||||||
"imdb_score": 59,
|
"imdb_score": 59,
|
||||||
"imdb_votes": 36000,
|
"imdb_votes": 36069,
|
||||||
"runtime": 94,
|
"runtime": 94,
|
||||||
|
"genres": {"Action", "Adventure", "Animation"},
|
||||||
}
|
}
|
||||||
assert a_movie == movie(a_movie)
|
assert a_movie == movie(a_movie)
|
||||||
|
|
||||||
|
|
@ -153,7 +154,8 @@ async def test_load_ratings_page_20240510(monkeypatch):
|
||||||
"media_type": "TV Series",
|
"media_type": "TV Series",
|
||||||
"imdb_id": "tt8888540",
|
"imdb_id": "tt8888540",
|
||||||
"imdb_score": 64,
|
"imdb_score": 64,
|
||||||
"imdb_votes": 6000,
|
"imdb_votes": 6044,
|
||||||
|
"genres": {"Drama"},
|
||||||
}
|
}
|
||||||
assert a_running_tvseries == movie(a_running_tvseries)
|
assert a_running_tvseries == movie(a_running_tvseries)
|
||||||
|
|
||||||
|
|
@ -163,29 +165,94 @@ async def test_load_ratings_page_20240510(monkeypatch):
|
||||||
"media_type": "TV Series",
|
"media_type": "TV Series",
|
||||||
"imdb_id": "tt0072500",
|
"imdb_id": "tt0072500",
|
||||||
"imdb_score": 87,
|
"imdb_score": 87,
|
||||||
"imdb_votes": 100000,
|
"imdb_votes": 100261,
|
||||||
|
"genres": {"Comedy"},
|
||||||
}
|
}
|
||||||
assert a_finished_tvseries == movie(a_finished_tvseries)
|
assert a_finished_tvseries == movie(a_finished_tvseries)
|
||||||
|
|
||||||
a_tvepisode = {
|
a_tvepisode = {
|
||||||
"title": "Columbo / No Time to Die",
|
"title": "Columbo / No Time to Die",
|
||||||
"original_title": None,
|
"original_title": "Columbo / No Time to Die",
|
||||||
"release_year": 1992,
|
"release_year": 1992,
|
||||||
"media_type": "TV Episode",
|
"media_type": "TV Episode",
|
||||||
"imdb_id": "tt0103987",
|
"imdb_id": "tt0103987",
|
||||||
"imdb_score": 59,
|
"imdb_score": 59,
|
||||||
"imdb_votes": 2100,
|
"imdb_votes": 2122,
|
||||||
"runtime": 98,
|
"runtime": 98,
|
||||||
|
"genres": {"Crime", "Drama", "Mystery"},
|
||||||
}
|
}
|
||||||
assert a_tvepisode == movie(a_tvepisode)
|
assert a_tvepisode == movie(a_tvepisode)
|
||||||
|
|
||||||
a_videogame = {
|
a_videogame = {
|
||||||
"title": "Alan Wake",
|
"title": "Alan Wake",
|
||||||
"original_title": None,
|
"original_title": "Alan Wake",
|
||||||
"release_year": 2010,
|
"release_year": 2010,
|
||||||
"media_type": "Video Game",
|
"media_type": "Video Game",
|
||||||
"imdb_id": "tt0466662",
|
"imdb_id": "tt0466662",
|
||||||
"imdb_score": 82,
|
# The data from __NEXT_DATA__ is wrong, the actual values should be:
|
||||||
"imdb_votes": 7300,
|
# "imdb_score": 82,
|
||||||
|
# "imdb_votes": 7300,
|
||||||
|
# "genres": {"Action", "Adventure", "Horror"},
|
||||||
|
"imdb_score": 67, # Wrong value, but correctly parsed from __NEXT_DATA__
|
||||||
|
"imdb_votes": 11655, # Wrong value, but correctly parsed from __NEXT_DATA__
|
||||||
|
"genres": {"Comedy", "Crime", "Drama"}, # Wrong value
|
||||||
}
|
}
|
||||||
assert a_videogame == movie(a_videogame)
|
assert a_videogame == movie(a_videogame)
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.asyncio
async def test_load_ratings_page_20240720(monkeypatch):
    """Ratings page markup from July 2024.

    Movie details are parsed from the page's __NEXT_DATA__ blob; the user's
    own scores come from a mocked GraphQL (POST) response fixture.
    """
    with bz2.open(fixturesdir / "ratings-ur655321-20240720.html.bz2", "rb") as f:
        html = f.read()
    soup = bs4.BeautifulSoup(html, "html5lib")
    monkeypatch.setattr(imdb, "asoup_from_url", AsyncMock(return_value=soup))

    with bz2.open(fixturesdir / "ratings-ur655321-20240720.gql.json.bz2", "rb") as f:
        jsonstr = f.read()
    async with imdb.asession() as s:
        monkeypatch.setattr(s, "post", AsyncMock(return_value=_mock_response(jsonstr)))
        page = await imdb._load_ratings_page("fakeurl", "ur655321")

    assert len(page.ratings) == 100
    assert page.imdb_user_id is not None
    assert page.imdb_user_id == "ur655321"
    assert page.imdb_user_name == "AlexUltra"
    assert page.next_page_url is None, "not supported for new ratings page"

    def movie(item: dict):
        # Return the parsed movie restricted to item's keys, so each expected
        # dict below only pins the fields it cares about.
        for rating in page.ratings:
            assert rating.movie
            if rating.movie.imdb_id == item["imdb_id"]:
                rating_dict = {key: getattr(rating.movie, key) for key in item.keys()}
                return rating_dict
        # Use single quotes for the key inside the f-string: reusing double
        # quotes is a SyntaxError before Python 3.12 (PEP 701).
        raise AssertionError(f"{item['imdb_id']} not found in page.ratings")

    a_movie = {
        "title": "Kung Fu Panda 4",
        "release_year": 2024,
        "media_type": "Movie",
        "imdb_id": "tt21692408",
        "imdb_score": 59,
        "imdb_votes": 48018,
        "runtime": 94,
    }
    assert a_movie == movie(a_movie)

    a_running_tvseries = {
        "title": "Palm Royale",
        "release_year": 2024,
        "media_type": "TV Series",
        "imdb_id": "tt8888540",
        "imdb_score": 63,
        "imdb_votes": 9458,
    }
    assert a_running_tvseries == movie(a_running_tvseries)

    a_finished_tvseries = {
        "title": "Fawlty Towers",
        "release_year": 1975,
        "media_type": "TV Series",
        "imdb_id": "tt0072500",
        "imdb_score": 87,
        "imdb_votes": 100860,
    }
    assert a_finished_tvseries == movie(a_finished_tvseries)
|
||||||
|
|
|
||||||
146
unwind/imdb.py
146
unwind/imdb.py
|
|
@ -4,7 +4,7 @@ import re
|
||||||
from collections import namedtuple
|
from collections import namedtuple
|
||||||
from dataclasses import dataclass, field
|
from dataclasses import dataclass, field
|
||||||
from datetime import datetime
|
from datetime import datetime
|
||||||
from typing import AsyncIterable, Iterable
|
from typing import Any, AsyncIterable, Iterable
|
||||||
from urllib.parse import urljoin
|
from urllib.parse import urljoin
|
||||||
|
|
||||||
import bs4
|
import bs4
|
||||||
|
|
@ -116,9 +116,8 @@ find_vote_count = re.compile(r"\((?P<count>\d+(\.\d+)?K?)\)").fullmatch
|
||||||
|
|
||||||
|
|
||||||
def _first_string(tag: bs4.Tag) -> str | None:
    """Return the first string contained anywhere under *tag*, or None.

    Uses bs4's ``Tag.strings`` generator, which walks all descendants (the
    pre-July-2024 version only inspected direct children).
    """
    return next(iter(tag.strings), None)
|
|
||||||
|
|
||||||
|
|
||||||
def _tv_episode_title(series_name: str, episode_name: str) -> str:
|
def _tv_episode_title(series_name: str, episode_name: str) -> str:
|
||||||
|
|
@ -244,7 +243,8 @@ async def _load_ratings_page(url: str, user_id: ImdbUserId) -> _RatingsPage:
|
||||||
"""Dispatch to handlers for different ratings page versions."""
|
"""Dispatch to handlers for different ratings page versions."""
|
||||||
|
|
||||||
soup = await asoup_from_url(url)
|
soup = await asoup_from_url(url)
|
||||||
|
if soup.find("script", id="__NEXT_DATA__", type="application/json") is not None:
|
||||||
|
return await _load_ratings_page_202407(user_id, url, soup)
|
||||||
if soup.find("meta", property="imdb:pageConst") is not None:
|
if soup.find("meta", property="imdb:pageConst") is not None:
|
||||||
return await _load_ratings_page_2024(user_id, url, soup)
|
return await _load_ratings_page_2024(user_id, url, soup)
|
||||||
elif soup.find("meta", property="pageId") is not None:
|
elif soup.find("meta", property="pageId") is not None:
|
||||||
|
|
@ -253,6 +253,90 @@ async def _load_ratings_page(url: str, user_id: ImdbUserId) -> _RatingsPage:
|
||||||
raise RuntimeError("Unknown ratings page version.")
|
raise RuntimeError("Unknown ratings page version.")
|
||||||
|
|
||||||
|
|
||||||
|
def _get_or_None(d: dict[str, Any], keys: list[str]) -> Any | None:
|
||||||
|
for k in keys:
|
||||||
|
try:
|
||||||
|
d = d[k]
|
||||||
|
except KeyError:
|
||||||
|
return None
|
||||||
|
if d is None:
|
||||||
|
break
|
||||||
|
return d
|
||||||
|
|
||||||
|
|
||||||
|
def _parse_movies_from_nextdata_202407(nextdata: dict) -> Iterable[Movie]:
    """Yield one Movie per rated title found in the page's __NEXT_DATA__ blob.

    Optional fields (aggregate rating, runtime, vote count, original title)
    are read defensively via _get_or_None and become None when absent.
    """
    edges = nextdata["props"]["pageProps"]["mainColumnData"][
        "advancedTitleSearch"
    ]["edges"]
    for edge in edges:
        title_data = edge["node"]["title"]
        imdb_rating = _get_or_None(title_data, ["ratingsSummary", "aggregateRating"])
        runtime_s = _get_or_None(title_data, ["runtime", "seconds"])

        movie = Movie(
            title=title_data["titleText"]["text"],
            original_title=_get_or_None(title_data, ["originalTitleText", "text"]),
            release_year=title_data["releaseYear"]["year"],
            media_type=title_data["titleType"]["text"],
            imdb_id=title_data["id"],
            imdb_score=(
                None if imdb_rating is None else score_from_imdb_rating(imdb_rating)
            ),
            imdb_votes=_get_or_None(title_data, ["ratingsSummary", "voteCount"]),
            # IMDb reports runtime in seconds; Movie stores minutes.
            runtime=None if runtime_s is None else int(runtime_s / 60),
            genres={
                genre["genre"]["text"] for genre in title_data["titleGenres"]["genres"]
            },
        )

        if movie.media_type == "TV Episode":
            # Episodes are prefixed with their series title for display.
            series = title_data["series"]["series"]
            movie.title = _tv_episode_title(series["titleText"]["text"], movie.title)
            # NOTE(review): movie.original_title may be None when the title
            # lacks originalTitleText — confirm _tv_episode_title copes.
            movie.original_title = _tv_episode_title(
                series["originalTitleText"]["text"], movie.original_title
            )

        yield movie
|
||||||
|
|
||||||
|
|
||||||
|
async def _load_ratings_page_202407(
    user_id: ImdbUserId, url: str, soup: bs4.BeautifulSoup
) -> _RatingsPage:
    """Handle the ratings page from July 2024."""
    nextjson = soup.find("script", id="__NEXT_DATA__", type="application/json")
    if nextjson is None:
        raise RuntimeError("No __NEXT_DATA__ BLOB found.")

    nextdata = json.loads(nextjson.string.strip())
    userdata = nextdata["props"]["pageProps"]["aboveTheFoldData"]

    page = _RatingsPage(
        imdb_user_id=userdata["authorId"],
        imdb_user_name=userdata["authorName"],
    )

    # Movies come from __NEXT_DATA__; the user's scores arrive separately
    # via the GraphQL ratings endpoint, joined here by imdb_id.
    movies_by_id = {m.imdb_id: m for m in _parse_movies_from_nextdata_202407(nextdata)}
    async for user_rating in _load_user_movie_ratings(user_id, list(movies_by_id)):
        page.ratings.append(
            Rating(
                movie=movies_by_id[user_rating.movie_id],
                score=score_from_imdb_rating(user_rating.imdb_rating),
                rating_date=user_rating.rating_date,
            )
        )

    return page
|
||||||
|
|
||||||
|
|
||||||
async def _load_ratings_page_2024(
|
async def _load_ratings_page_2024(
|
||||||
user_id: ImdbUserId, url: str, soup: bs4.BeautifulSoup
|
user_id: ImdbUserId, url: str, soup: bs4.BeautifulSoup
|
||||||
) -> _RatingsPage:
|
) -> _RatingsPage:
|
||||||
|
|
@ -559,3 +643,55 @@ async def _load_user_movie_ratings(
|
||||||
except KeyError as err:
|
except KeyError as err:
|
||||||
log.error("Unexpected data structure.", exc_info=err)
|
log.error("Unexpected data structure.", exc_info=err)
|
||||||
raise
|
raise
|
||||||
|
|
||||||
|
|
||||||
|
async def _load_user_ratings_202407(
    user_id: ImdbUserId, movie_ids: list[ImdbMovieId]
) -> AsyncIterable[_UserMovieRating]:
    """
    This is a new API that showed up in July 2024.
    It's used on a user's ratings page to load their ratings.
    """
    raise NotImplementedError()

    # Everything below is unreachable scaffolding for the eventual
    # implementation, kept on purpose until the endpoint is wired up.
    # NOTE(review): the test fixture mocks session.post, but this draft
    # issues a GET — confirm the HTTP verb before enabling.
    gql_api_url = "https://api.graphql.imdb.com/"
    headers = {
        "accept": "application/graphql+json, application/json",
        "content-type": "application/json",
        "origin": "https://www.imdb.com",
    }
    query = {
        "operationName": "RatingsPage",
        "variables": json.dumps(
            {
                # "after": ...,
                "filter": {
                    "explicitContentConstraint": {
                        "explicitContentFilter": "INCLUDE_ADULT"
                    },
                    "singleUserRatingConstraint": {
                        "filterType": "INCLUDE",
                        "userId": user_id,
                    },
                },
                "first": 100,
                "locale": "en-US",
                "sort": {"sortBy": "SINGLE_USER_RATING_DATE", "sortOrder": "ASC"},
            }
        ),
        "extensions": json.dumps(
            {
                "persistedQuery": {
                    "sha256Hash": "ae30a55f169252b5f0208d686f41aaff231d7f70bb75c257732c80234d71dbe9",
                    "version": 1,
                }
            }
        ),
    }
    async with asession() as s:
        r = await s.get(gql_api_url, headers=headers, query=query, timeout=10)
        r.raise_for_status()
        data = r.json()

    # ...
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue