fix: support new "most popular 100" & "bottom 100" HTML
The previous version rendered all 100 movies into the HTML. The new version renders only the top 25 into the HTML, but the whole list is now available as LD+JSON data. Since we can easily support both, we don't (yet) remove the old parser.
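For reference, a minimal, hypothetical sketch of the LD+JSON payload the new parser expects and how the IDs come out of it. Only the fields the parser actually reads ("@type", "itemListElement", each item's "url") are shown; the real IMDb block carries more data, and the example title is the one already referenced in the code comments below.

# Minimal sketch, assuming an ItemList payload of this shape (not the exact IMDb block).
import json
import re

import bs4

html = """
<html><head>
<script type="application/ld+json">
{"@type": "ItemList",
 "itemListElement": [
   {"item": {"url": "https://www.imdb.com/title/tt1213644/", "name": "Disaster Movie"}}
 ]}
</script>
</head><body></body></html>
"""

find_movie_id = re.compile(r"/title/(?P<id>tt\d+)/").search  # same pattern as in the module

soup = bs4.BeautifulSoup(html, "html5lib")
script = soup.find("script", type="application/ld+json")
data = json.loads(script.string.strip())
assert data["@type"] == "ItemList"
ids = [find_movie_id(e["item"]["url"])["id"] for e in data["itemListElement"]]
print(ids)  # ['tt1213644']

The chart loaders below try this LD+JSON path first and fall back to the old list-HTML parser, so both page variants keep working.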
parent aaaf66c715
commit d7530e6bb0

4 changed files with 54 additions and 14 deletions
BIN tests/fixtures/bottom_100-20240714.html.bz2 (vendored, new file) Binary file not shown.
BIN tests/fixtures/most_popular_100-20240714.html.bz2 (vendored, new file) Binary file not shown.
@@ -30,29 +30,43 @@ def test_score_conversion(score: int):
     assert score == score_from_imdb_rating(imdb_rating_from_score(score))
 
 
+@pytest.mark.parametrize(
+    "fixture",
+    (
+        ("most_popular_100.html.bz2"),
+        ("most_popular_100-20240714.html.bz2"),
+    ),
+)
 @pytest.mark.asyncio
-async def test_load_most_popular_100(monkeypatch):
-    with bz2.open(fixturesdir / "most_popular_100.html.bz2", "rb") as f:
+async def test_load_most_popular_100(monkeypatch, fixture: str):
+    with bz2.open(fixturesdir / fixture, "rb") as f:
         html = f.read()
     soup = bs4.BeautifulSoup(html, "html5lib")
 
     monkeypatch.setattr(imdb, "asoup_from_url", AsyncMock(return_value=soup))
 
     movie_ids = await imdb.load_most_popular_100()
-    assert len(movie_ids) == 100
+    assert len(set(movie_ids)) == 100
     assert all(id_.startswith("tt") for id_ in movie_ids)
 
 
+@pytest.mark.parametrize(
+    "fixture",
+    (
+        ("bottom_100.html.bz2"),
+        ("bottom_100-20240714.html.bz2"),
+    ),
+)
 @pytest.mark.asyncio
-async def test_load_bottom_100(monkeypatch):
-    with bz2.open(fixturesdir / "bottom_100.html.bz2", "rb") as f:
+async def test_load_bottom_100(monkeypatch, fixture: str):
+    with bz2.open(fixturesdir / fixture, "rb") as f:
         html = f.read()
     soup = bs4.BeautifulSoup(html, "html5lib")
 
     monkeypatch.setattr(imdb, "asoup_from_url", AsyncMock(return_value=soup))
 
     movie_ids = await imdb.load_bottom_100()
-    assert len(movie_ids) == 100
+    assert len(set(movie_ids)) == 100
     assert all(id_.startswith("tt") for id_ in movie_ids)
 
 
@@ -4,7 +4,7 @@ import re
 from collections import namedtuple
 from dataclasses import dataclass, field
 from datetime import datetime
-from typing import AsyncIterable
+from typing import AsyncIterable, Iterable
 from urllib.parse import urljoin
 
 import bs4

@@ -106,7 +106,9 @@ find_year = re.compile(
     r"(\([IVX]+\) )?\((?P<year>\d{4})(–( |\d{4})| (?P<type>[^)]+))?\)"  # noqa: RUF001
 ).fullmatch
 # find_year_2: e.g. "2024", "1971–2003", "2024–"  # noqa: RUF003
-find_year_2 = re.compile(r"(?P<year>\d{4})(–(?P<end_year>\d{4})?)?").fullmatch  # noqa: RUF001
+find_year_2 = re.compile(
+    r"(?P<year>\d{4})(–(?P<end_year>\d{4})?)?"  # noqa: RUF001
+).fullmatch
 find_movie_id = re.compile(r"/title/(?P<id>tt\d+)/").search
 find_movie_name = re.compile(r"\d+\. (?P<name>.+)").fullmatch
 # find_vote_count: e.g. "(5.9K)", "(1K)", "(8)"

@@ -398,17 +400,33 @@ async def load_ratings(user_id: ImdbUserId) -> AsyncIterable[Rating]:
         yield rating
 
 
-async def _ids_from_list_html(url: str) -> AsyncIterable[ImdbMovieId]:
-    """Return all IMDb movie IDs (`tt*`) from the given URL."""
+def _ids_from_list_html(soup: bs4.BeautifulSoup) -> Iterable[ImdbMovieId]:
+    """Return all IMDb movie IDs (`tt*`) from the given soup."""
     # document.querySelectorAll('li.ipc-metadata-list-summary-item a.ipc-title-link-wrapper')
     # .href: '/title/tt1213644/?ref_=chtbtm_t_1'
     # .text(): '1. Disaster Movie'
-    soup = await asoup_from_url(url)
     for item in soup.find_all("li", "ipc-metadata-list-summary-item"):
         if (link := item.find("a", "ipc-title-link-wrapper")) is not None:
             if (href := link.get("href")) is not None:
                 if match_ := find_movie_id(href):
-                    yield match_["id"]
+                    yield ImdbMovieId(match_["id"])
+
+
+def _items_from_ldjson(soup: bs4.BeautifulSoup) -> Iterable[dict]:
+    """Return all items from the LD+JSON block in the given soup."""
+    if (item := soup.find("script", type="application/ld+json")) is None:
+        raise RuntimeError("Could not find LD+JSON data.")
+    data = json.loads(item.string.strip())
+    if data["@type"] != "ItemList":
+        raise RuntimeError(f"Expected ItemList, got {data['@type']!a}.")
+    for item in data["itemListElement"]:
+        yield item["item"]
+
+
+def _ids_from_ldjson(soup: bs4.BeautifulSoup) -> Iterable[ImdbMovieId]:
+    for item in _items_from_ldjson(soup):
+        if match_ := find_movie_id(item["url"]):
+            yield ImdbMovieId(match_["id"])
 
 
 async def load_most_popular_100() -> list[ImdbMovieId]:

@@ -418,7 +436,11 @@ async def load_most_popular_100() -> list[ImdbMovieId]:
     As determined by IMDb users
     """
     url = "https://www.imdb.com/chart/moviemeter/"
-    ids = [tid async for tid in _ids_from_list_html(url)]
+    soup = await asoup_from_url(url)
+    try:
+        ids = list(_ids_from_ldjson(soup))
+    except RuntimeError:
+        ids = list(_ids_from_list_html(soup))
     if len(ids) != 100:
         raise RuntimeError(f"Expected exactly 100 items, got {len(ids)}")
     return ids

@@ -431,7 +453,11 @@ async def load_bottom_100() -> list[ImdbMovieId]:
     Bottom 100 as voted by IMDb users
     """
     url = "https://www.imdb.com/chart/bottom/"
-    ids = [tid async for tid in _ids_from_list_html(url)]
+    soup = await asoup_from_url(url)
+    try:
+        ids = list(_ids_from_ldjson(soup))
+    except RuntimeError:
+        ids = list(_ids_from_list_html(soup))
     if len(ids) != 100:
         raise RuntimeError(f"Expected exactly 100 items, got {len(ids)}")
     return ids