diff --git a/tests/fixtures/bottom_100-20240714.html.bz2 b/tests/fixtures/bottom_100-20240714.html.bz2
new file mode 100644
index 0000000..37ae7f5
Binary files /dev/null and b/tests/fixtures/bottom_100-20240714.html.bz2 differ
diff --git a/tests/fixtures/most_popular_100-20240714.html.bz2 b/tests/fixtures/most_popular_100-20240714.html.bz2
new file mode 100644
index 0000000..163c644
Binary files /dev/null and b/tests/fixtures/most_popular_100-20240714.html.bz2 differ
diff --git a/tests/test_imdb.py b/tests/test_imdb.py
index 7f4efcf..d4a5db5 100644
--- a/tests/test_imdb.py
+++ b/tests/test_imdb.py
@@ -30,29 +30,43 @@ def test_score_conversion(score: int):
     assert score == score_from_imdb_rating(imdb_rating_from_score(score))
 
 
+@pytest.mark.parametrize(
+    "fixture",
+    (
+        ("most_popular_100.html.bz2"),
+        ("most_popular_100-20240714.html.bz2"),
+    ),
+)
 @pytest.mark.asyncio
-async def test_load_most_popular_100(monkeypatch):
-    with bz2.open(fixturesdir / "most_popular_100.html.bz2", "rb") as f:
+async def test_load_most_popular_100(monkeypatch, fixture: str):
+    with bz2.open(fixturesdir / fixture, "rb") as f:
         html = f.read()
     soup = bs4.BeautifulSoup(html, "html5lib")
     monkeypatch.setattr(imdb, "asoup_from_url", AsyncMock(return_value=soup))
 
     movie_ids = await imdb.load_most_popular_100()
-    assert len(movie_ids) == 100
+    assert len(set(movie_ids)) == 100
     assert all(id_.startswith("tt") for id_ in movie_ids)
 
 
+@pytest.mark.parametrize(
+    "fixture",
+    (
+        ("bottom_100.html.bz2"),
+        ("bottom_100-20240714.html.bz2"),
+    ),
+)
 @pytest.mark.asyncio
-async def test_load_bottom_100(monkeypatch):
-    with bz2.open(fixturesdir / "bottom_100.html.bz2", "rb") as f:
+async def test_load_bottom_100(monkeypatch, fixture: str):
+    with bz2.open(fixturesdir / fixture, "rb") as f:
         html = f.read()
     soup = bs4.BeautifulSoup(html, "html5lib")
     monkeypatch.setattr(imdb, "asoup_from_url", AsyncMock(return_value=soup))
 
     movie_ids = await imdb.load_bottom_100()
-    assert len(movie_ids) == 100
+    assert len(set(movie_ids)) == 100
     assert all(id_.startswith("tt") for id_ in movie_ids)
diff --git a/unwind/imdb.py b/unwind/imdb.py
index 1685ad3..24e3311 100644
--- a/unwind/imdb.py
+++ b/unwind/imdb.py
@@ -4,7 +4,7 @@ import re
 from collections import namedtuple
 from dataclasses import dataclass, field
 from datetime import datetime
-from typing import AsyncIterable
+from typing import AsyncIterable, Iterable
 from urllib.parse import urljoin
 
 import bs4
@@ -106,7 +106,9 @@ find_year = re.compile(
     r"(\([IVX]+\) )?\((?P<year>\d{4})(–( |\d{4})| (?P<kind>[^)]+))?\)"  # noqa: RUF001
 ).fullmatch
 # find_year_2: e.g. "2024", "1971–2003", "2024–"  # noqa: RUF003
-find_year_2 = re.compile(r"(?P<year>\d{4})(–(?P<end>\d{4})?)?").fullmatch  # noqa: RUF001
+find_year_2 = re.compile(
+    r"(?P<year>\d{4})(–(?P<end>\d{4})?)?"  # noqa: RUF001
+).fullmatch
 find_movie_id = re.compile(r"/title/(?P<id>tt\d+)/").search
 find_movie_name = re.compile(r"\d+\. (?P<name>.+)").fullmatch
 # find_vote_count: e.g. "(5.9K)", "(1K)", "(8)"
@@ -398,17 +400,33 @@ async def load_ratings(user_id: ImdbUserId) -> AsyncIterable[Rating]:
         yield rating
 
 
-async def _ids_from_list_html(url: str) -> AsyncIterable[ImdbMovieId]:
-    """Return all IMDb movie IDs (`tt*`) from the given URL."""
+def _ids_from_list_html(soup: bs4.BeautifulSoup) -> Iterable[ImdbMovieId]:
+    """Return all IMDb movie IDs (`tt*`) from the given soup."""
     # document.querySelectorAll('li.ipc-metadata-list-summary-item a.ipc-title-link-wrapper')
     # .href: '/title/tt1213644/?ref_=chtbtm_t_1'
     # .text(): '1. Disaster Movie'
-    soup = await asoup_from_url(url)
     for item in soup.find_all("li", "ipc-metadata-list-summary-item"):
         if (link := item.find("a", "ipc-title-link-wrapper")) is not None:
             if (href := link.get("href")) is not None:
                 if match_ := find_movie_id(href):
-                    yield match_["id"]
+                    yield ImdbMovieId(match_["id"])
+
+
+def _items_from_ldjson(soup: bs4.BeautifulSoup) -> Iterable[dict]:
+    """Return all items from the LD+JSON block in the given soup."""
+    if (item := soup.find("script", type="application/ld+json")) is None:
+        raise RuntimeError("Could not find LD+JSON data.")
+    data = json.loads(item.string.strip())
+    if data["@type"] != "ItemList":
+        raise RuntimeError(f"Expected ItemList, got {data['@type']!a}.")
+    for item in data["itemListElement"]:
+        yield item["item"]
+
+
+def _ids_from_ldjson(soup: bs4.BeautifulSoup) -> Iterable[ImdbMovieId]:
+    for item in _items_from_ldjson(soup):
+        if match_ := find_movie_id(item["url"]):
+            yield ImdbMovieId(match_["id"])
 
 
 async def load_most_popular_100() -> list[ImdbMovieId]:
@@ -418,7 +436,11 @@ async def load_most_popular_100() -> list[ImdbMovieId]:
     As determined by IMDb users
     """
     url = "https://www.imdb.com/chart/moviemeter/"
-    ids = [tid async for tid in _ids_from_list_html(url)]
+    soup = await asoup_from_url(url)
+    try:
+        ids = list(_ids_from_ldjson(soup))
+    except RuntimeError:
+        ids = list(_ids_from_list_html(soup))
     if len(ids) != 100:
         raise RuntimeError(f"Expected exactly 100 items, got {len(ids)}")
     return ids
@@ -431,7 +453,11 @@ async def load_bottom_100() -> list[ImdbMovieId]:
     Bottom 100 as voted by IMDb users
     """
     url = "https://www.imdb.com/chart/bottom/"
-    ids = [tid async for tid in _ids_from_list_html(url)]
+    soup = await asoup_from_url(url)
+    try:
+        ids = list(_ids_from_ldjson(soup))
+    except RuntimeError:
+        ids = list(_ids_from_list_html(soup))
     if len(ids) != 100:
         raise RuntimeError(f"Expected exactly 100 items, got {len(ids)}")
     return ids