feat: add functions to retrieve IMDb chart lists

These charts are:
- the top 250 highest-rated movies
- the top 100 most popular movies
- the bottom 100 lowest-rated movies
ducklet 2024-05-10 00:12:25 +02:00
parent 4fbdb26d9c
commit 2bf5607183
7 changed files with 176 additions and 10 deletions
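For context, the three new loaders can be exercised roughly like this (a sketch, not part of the commit; the `asyncio.run` wrapper is assumed, and network access to imdb.com is required):

import asyncio

from unwind import imdb


async def main() -> None:
    top250 = await imdb.load_top_250()            # GraphQL endpoint, 250 IDs
    popular = await imdb.load_most_popular_100()  # scraped chart page
    bottom = await imdb.load_bottom_100()         # scraped chart page
    print(top250[:3], popular[:3], bottom[:3])


asyncio.run(main())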

BIN
tests/fixtures/bottom_100.html.bz2 vendored Normal file

Binary file not shown.

BIN
tests/fixtures/most_popular_100.html.bz2 vendored Normal file

Binary file not shown.

BIN
tests/fixtures/top250.gql.json.bz2 vendored Normal file

Binary file not shown.

View file

@@ -1,7 +1,16 @@
+import bz2
from pathlib import Path
+from unittest.mock import AsyncMock

+import bs4
import pytest

+from unwind import imdb
from unwind.imdb import imdb_rating_from_score, score_from_imdb_rating

testsdir = Path(__file__).parent
fixturesdir = testsdir / "fixtures"


@pytest.mark.parametrize("rating", (x / 10 for x in range(10, 101)))
def test_rating_conversion(rating: float):
@@ -18,3 +27,41 @@ def test_score_conversion(score: int):
        pytest.skip(f"Score cannot be mapped back correctly: {score}")
    assert score == score_from_imdb_rating(imdb_rating_from_score(score))
@pytest.mark.asyncio
async def test_load_most_popular_100(monkeypatch):
    with bz2.open(fixturesdir / "most_popular_100.html.bz2", "rb") as f:
        html = f.read()
    soup = bs4.BeautifulSoup(html, "html5lib")
    monkeypatch.setattr(imdb, "asoup_from_url", AsyncMock(return_value=soup))
    movie_ids = await imdb.load_most_popular_100()
    assert len(movie_ids) == 100
    assert all(id_.startswith("tt") for id_ in movie_ids)


@pytest.mark.asyncio
async def test_load_bottom_100(monkeypatch):
    with bz2.open(fixturesdir / "bottom_100.html.bz2", "rb") as f:
        html = f.read()
    soup = bs4.BeautifulSoup(html, "html5lib")
    monkeypatch.setattr(imdb, "asoup_from_url", AsyncMock(return_value=soup))
    movie_ids = await imdb.load_bottom_100()
    assert len(movie_ids) == 100
    assert all(id_.startswith("tt") for id_ in movie_ids)


@pytest.mark.asyncio
async def test_load_top_250(monkeypatch):
    with bz2.open(fixturesdir / "top250.gql.json.bz2", "rb") as f:
        jsonstr = f.read()
    monkeypatch.setattr(imdb, "adownload", AsyncMock(return_value=jsonstr))
    movie_ids = await imdb.load_top_250()
    assert len(movie_ids) == 250
    assert all(id_.startswith("tt") for id_ in movie_ids)
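The three fixtures above are bz2-compressed snapshots of the live chart pages and of the GraphQL response. A minimal sketch for refreshing an HTML fixture, assuming httpx (already used by unwind.request) and a hypothetical save_fixture helper:

import bz2
from pathlib import Path

import httpx


def save_fixture(url: str, path: Path) -> None:
    # Hypothetical helper: fetch a chart page and store it bz2-compressed.
    html = httpx.get(url, follow_redirects=True).text
    with bz2.open(path, "wt", encoding="utf-8") as f:
        f.write(html)


save_fixture("https://www.imdb.com/chart/bottom/", Path("tests/fixtures/bottom_100.html.bz2"))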

View file

@@ -1,14 +1,16 @@
+import json
import logging
import re
from collections import namedtuple
from datetime import datetime
+from typing import AsyncIterable
from urllib.parse import urljoin

import bs4

from . import db
from .models import Movie, Rating, User
-from .request import asession, asoup_from_url, cache_path
+from .request import adownload, asession, asoup_from_url, cache_path

log = logging.getLogger(__name__)
@@ -207,7 +209,7 @@ async def parse_page(url: str) -> tuple[list[Rating], str | None]:
    return (ratings, next_url if url != next_url else None)


-async def load_ratings(user_id: str):
+async def load_ratings(user_id: str) -> AsyncIterable[tuple[Rating, bool]]:
    next_url = user_ratings_url(user_id)
    while next_url:
@@ -228,3 +230,74 @@ async def load_ratings(user_id: str):
        is_updated = await db.add_or_update_rating(conn, rating)
        yield rating, is_updated

async def _ids_from_list_html(url: str) -> AsyncIterable[str]:
    """Yield all IMDb movie IDs (`tt*`) found on the given chart page."""
    # document.querySelectorAll('li.ipc-metadata-list-summary-item a.ipc-title-link-wrapper')
    #   .href: '/title/tt1213644/?ref_=chtbtm_t_1'
    #   .text(): '1. Disaster Movie'
    soup = await asoup_from_url(url)
    for item in soup.find_all("li", class_="ipc-metadata-list-summary-item"):
        if (link := item.find("a", class_="ipc-title-link-wrapper")) is not None:
            if (href := link.get("href")) is not None:
                if match_ := find_movie_id(href):
                    yield match_["id"]


async def load_most_popular_100() -> list[str]:
    """Return the IDs of IMDb's 100 most popular movies.

    IMDb Charts: Most Popular Movies
    As determined by IMDb users
    """
    url = "https://www.imdb.com/chart/moviemeter/"
    ids = [tid async for tid in _ids_from_list_html(url)]
    if len(ids) != 100:
        raise RuntimeError(f"Expected exactly 100 items, got {len(ids)}")
    return ids


async def load_bottom_100() -> list[str]:
    """Return the IDs of IMDb's bottom 100 lowest-rated movies.

    IMDb Charts: Lowest Rated Movies
    Bottom 100 as voted by IMDb users
    """
    url = "https://www.imdb.com/chart/bottom/"
    ids = [tid async for tid in _ids_from_list_html(url)]
    if len(ids) != 100:
        raise RuntimeError(f"Expected exactly 100 items, got {len(ids)}")
    return ids


async def load_top_250() -> list[str]:
    """Return the IDs of IMDb's top 250 highest-rated movies.

    IMDb Charts: IMDb Top 250 Movies
    As rated by regular IMDb voters.
    """
    # Called from page https://www.imdb.com/chart/top/
    gql_api_url = "https://caching.graphql.imdb.com/"
    query = {
        "operationName": "Top250MoviesPagination",
        "variables": r'{"first":250,"locale":"en-US"}',
        "extensions": r'{"persistedQuery":{"sha256Hash":"26114ee01d97e04f65d6c8c7212ae8b7888fa57ceed105450d1fce09df749b2d","version":1}}',
    }
    headers = {"content-type": "application/json"}
    jsonstr = await adownload(gql_api_url, query=query, headers=headers)
    data = json.loads(jsonstr)
    try:
        imdb_title_ids = [
            edge["node"]["id"] for edge in data["data"]["chartTitles"]["edges"]
        ]
        has_next_page = data["data"]["chartTitles"]["pageInfo"]["hasNextPage"]
        has_previous_page = data["data"]["chartTitles"]["pageInfo"]["hasPreviousPage"]
    except KeyError as err:
        log.error("Unexpected data structure.", exc_info=err)
        raise
    if len(imdb_title_ids) != 250 or has_next_page or has_previous_page:
        raise RuntimeError(f"Expected exactly 250 items, got {len(imdb_title_ids)}")
    return imdb_title_ids
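For reference, the parsing in load_top_250 expects a response shaped roughly like this (an abridged sketch; the field names come from the code above, everything else is assumed):

# Abridged, assumed shape of the caching.graphql.imdb.com response:
response = {
    "data": {
        "chartTitles": {
            "edges": [
                {"node": {"id": "tt0111161"}},  # one edge per chart entry
                # ... 249 more edges ...
            ],
            "pageInfo": {"hasNextPage": False, "hasPreviousPage": False},
        }
    }
}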

View file

@@ -17,7 +17,7 @@ log = logging.getLogger(__name__)
T = TypeVar("T")

# See
-# - https://www.imdb.com/interfaces/
+# - https://developer.imdb.com/non-commercial-datasets/
# - https://datasets.imdbws.com/
@@ -254,13 +254,22 @@ async def download_datasets(*, basics_path: Path, ratings_path: Path) -> None:
    See https://www.imdb.com/interfaces/ and https://datasets.imdbws.com/ for
    more information on the IMDb database dumps.
    """
-    basics_url = "https://datasets.imdbws.com/title.basics.tsv.gz"
-    ratings_url = "https://datasets.imdbws.com/title.ratings.tsv.gz"
+    # name_basics_url = "https://datasets.imdbws.com/name.basics.tsv.gz"
+    # title_akas_url = "https://datasets.imdbws.com/title.akas.tsv.gz"
+    title_basics_url = "https://datasets.imdbws.com/title.basics.tsv.gz"
+    # title_crew_url = "https://datasets.imdbws.com/title.crew.tsv.gz"
+    # title_episode_url = "https://datasets.imdbws.com/title.episode.tsv.gz"
+    # title_principals_url = "https://datasets.imdbws.com/title.principals.tsv.gz"
+    title_ratings_url = "https://datasets.imdbws.com/title.ratings.tsv.gz"
    async with request.asession():
        await asyncio.gather(
-            request.adownload(ratings_url, to_path=ratings_path, only_if_newer=True),
-            request.adownload(basics_url, to_path=basics_path, only_if_newer=True),
+            request.adownload(
+                title_ratings_url, to_path=ratings_path, only_if_newer=True
+            ),
+            request.adownload(
+                title_basics_url, to_path=basics_path, only_if_newer=True
+            ),
        )

View file

@@ -11,7 +11,7 @@ from hashlib import md5
from pathlib import Path
from random import random
from time import sleep, time
-from typing import Any, Callable, ParamSpec, TypeVar, cast
+from typing import Any, Callable, ParamSpec, TypeVar, cast, overload
import bs4
import httpx
@@ -201,10 +201,44 @@ def _last_modified_from_file(path: Path) -> float:
    return path.stat().st_mtime


@overload
async def adownload(
    url: str,
    *,
    to_path: Path | str,
    query: dict[str, str] | None = None,
    headers: dict[str, str] | None = None,
    replace_existing: bool | None = None,
    only_if_newer: bool = False,
    timeout: float | None = None,
    chunk_callback: Callable[[bytes], Any] | None = None,
    response_callback: Callable[[_Response_T], Any] | None = None,
) -> None:
    ...


@overload
async def adownload(
    url: str,
    *,
    to_path: None = None,
    query: dict[str, str] | None = None,
    headers: dict[str, str] | None = None,
    replace_existing: bool | None = None,
    only_if_newer: bool = False,
    timeout: float | None = None,
    chunk_callback: Callable[[bytes], Any] | None = None,
    response_callback: Callable[[_Response_T], Any] | None = None,
) -> bytes:
    ...


async def adownload(
    url: str,
    *,
    to_path: Path | str | None = None,
    query: dict[str, str] | None = None,
    headers: dict[str, str] | None = None,
    replace_existing: bool | None = None,
    only_if_newer: bool = False,
    timeout: float | None = None,
@@ -231,7 +265,8 @@ async def adownload(
        raise FileExistsError(23, "Would replace existing file", str(to_path))

    async with asession() as s:
-        headers = {}
+        if headers is None:
+            headers = {}
        if file_exists and only_if_newer:
            assert to_path
            file_lastmod = _last_modified_from_file(to_path)
@@ -239,7 +274,9 @@ async def adownload(
                file_lastmod, usegmt=True
            )
-        req = s.build_request(method="GET", url=url, headers=headers, timeout=timeout)
+        req = s.build_request(
+            method="GET", url=url, params=query, headers=headers, timeout=timeout
+        )
        log.debug("⚡️ Loading %s (%a) ...", req.url, dict(req.headers))
        resp = await s.send(req, follow_redirects=True, stream=True)
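Taken together, the overloads let the return type follow to_path: a path means "write to disk, return None", no path means "return the body as bytes". A usage sketch (URLs and file names are placeholders):

from pathlib import Path

from unwind.request import adownload


async def demo() -> None:
    # 1) to_path given: the file is written, nothing is returned.
    await adownload(
        "https://datasets.imdbws.com/title.ratings.tsv.gz",
        to_path=Path("title.ratings.tsv.gz"),
        only_if_newer=True,  # send If-Modified-Since based on the local file
    )
    # 2) no to_path: the response body comes back as bytes.
    raw: bytes = await adownload("https://www.imdb.com/chart/top/")
    assert isinstance(raw, bytes)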