diff --git a/tests/fixtures/bottom_100.html.bz2 b/tests/fixtures/bottom_100.html.bz2
new file mode 100644
index 0000000..57cf96e
Binary files /dev/null and b/tests/fixtures/bottom_100.html.bz2 differ
diff --git a/tests/fixtures/most_popular_100.html.bz2 b/tests/fixtures/most_popular_100.html.bz2
new file mode 100644
index 0000000..13fdf12
Binary files /dev/null and b/tests/fixtures/most_popular_100.html.bz2 differ
diff --git a/tests/fixtures/top250.gql.json.bz2 b/tests/fixtures/top250.gql.json.bz2
new file mode 100644
index 0000000..d4d8bfc
Binary files /dev/null and b/tests/fixtures/top250.gql.json.bz2 differ
diff --git a/tests/test_imdb.py b/tests/test_imdb.py
index 00467ce..4f949d6 100644
--- a/tests/test_imdb.py
+++ b/tests/test_imdb.py
@@ -1,7 +1,16 @@
+import bz2
+from pathlib import Path
+from unittest.mock import AsyncMock
+
+import bs4
 import pytest
 
+from unwind import imdb
 from unwind.imdb import imdb_rating_from_score, score_from_imdb_rating
 
+testsdir = Path(__file__).parent
+fixturesdir = testsdir / "fixtures"
+
 
 @pytest.mark.parametrize("rating", (x / 10 for x in range(10, 101)))
 def test_rating_conversion(rating: float):
@@ -18,3 +27,41 @@ def test_score_conversion(score: int):
         pytest.skip(f"Score cannot be mapped back correctly: {score}")
 
     assert score == score_from_imdb_rating(imdb_rating_from_score(score))
+
+
+@pytest.mark.asyncio
+async def test_load_most_popular_100(monkeypatch):
+    with bz2.open(fixturesdir / "most_popular_100.html.bz2", "rb") as f:
+        html = f.read()
+    soup = bs4.BeautifulSoup(html, "html5lib")
+
+    monkeypatch.setattr(imdb, "asoup_from_url", AsyncMock(return_value=soup))
+
+    movie_ids = await imdb.load_most_popular_100()
+    assert len(movie_ids) == 100
+    assert all(id_.startswith("tt") for id_ in movie_ids)
+
+
+@pytest.mark.asyncio
+async def test_load_bottom_100(monkeypatch):
+    with bz2.open(fixturesdir / "bottom_100.html.bz2", "rb") as f:
+        html = f.read()
+    soup = bs4.BeautifulSoup(html, "html5lib")
+
+    monkeypatch.setattr(imdb, "asoup_from_url", AsyncMock(return_value=soup))
+
+    movie_ids = await imdb.load_bottom_100()
+    assert len(movie_ids) == 100
+    assert all(id_.startswith("tt") for id_ in movie_ids)
+
+
+@pytest.mark.asyncio
+async def test_load_top_250(monkeypatch):
+    with bz2.open(fixturesdir / "top250.gql.json.bz2", "rb") as f:
+        jsonstr = f.read()
+
+    monkeypatch.setattr(imdb, "adownload", AsyncMock(return_value=jsonstr))
+
+    movie_ids = await imdb.load_top_250()
+    assert len(movie_ids) == 250
+    assert all(id_.startswith("tt") for id_ in movie_ids)
diff --git a/unwind/imdb.py b/unwind/imdb.py
index 631a088..6646d78 100644
--- a/unwind/imdb.py
+++ b/unwind/imdb.py
@@ -1,14 +1,16 @@
+import json
 import logging
 import re
 from collections import namedtuple
 from datetime import datetime
+from typing import AsyncIterable
 from urllib.parse import urljoin
 
 import bs4
 
 from . import db
 from .models import Movie, Rating, User
-from .request import asession, asoup_from_url, cache_path
+from .request import adownload, asession, asoup_from_url, cache_path
 
 log = logging.getLogger(__name__)
 
@@ -207,7 +209,7 @@ async def parse_page(url: str) -> tuple[list[Rating], str | None]:
     return (ratings, next_url if url != next_url else None)
 
 
-async def load_ratings(user_id: str):
+async def load_ratings(user_id: str) -> AsyncIterable[tuple[Rating, bool]]:
     next_url = user_ratings_url(user_id)
 
     while next_url:
@@ -228,3 +230,74 @@ async def load_ratings(user_id: str):
             is_updated = await db.add_or_update_rating(conn, rating)
 
             yield rating, is_updated
+
+
+async def _ids_from_list_html(url: str) -> AsyncIterable[str]:
+    """Yield all IMDb movie IDs (`tt*`) found on the page at the given URL."""
+    # document.querySelectorAll('li.ipc-metadata-list-summary-item a.ipc-title-link-wrapper')
+    #   .href: '/title/tt1213644/?ref_=chtbtm_t_1'
+    #   .text(): '1. Disaster Movie'
+    soup = await asoup_from_url(url)
+    for item in soup.find_all("li", class_="ipc-metadata-list-summary-item"):
+        if (link := item.find("a", class_="ipc-title-link-wrapper")) is not None:
+            if (href := link.get("href")) is not None:
+                if match_ := find_movie_id(href):
+                    yield match_["id"]
+
+
+async def load_most_popular_100() -> list[str]:
+    """Return IMDb's 100 most popular movies.
+
+    IMDb Charts: Most Popular Movies
+    As determined by IMDb users
+    """
+    url = "https://www.imdb.com/chart/moviemeter/"
+    ids = [tid async for tid in _ids_from_list_html(url)]
+    if len(ids) != 100:
+        raise RuntimeError(f"Expected exactly 100 items, got {len(ids)}")
+    return ids
+
+
+async def load_bottom_100() -> list[str]:
+    """Return IMDb's bottom 100 lowest-rated movies.
+
+    IMDb Charts: Lowest Rated Movies
+    Bottom 100 as voted by IMDb users
+    """
+    url = "https://www.imdb.com/chart/bottom/"
+    ids = [tid async for tid in _ids_from_list_html(url)]
+    if len(ids) != 100:
+        raise RuntimeError(f"Expected exactly 100 items, got {len(ids)}")
+    return ids
+
+
+async def load_top_250() -> list[str]:
+    """Return IMDb's top 250 highest-rated movies.
+
+    IMDb Charts: IMDb Top 250 Movies
+    As rated by regular IMDb voters.
+    """
+    # Called from page https://www.imdb.com/chart/top/
+    gql_api_url = "https://caching.graphql.imdb.com/"
+    query = {
+        "operationName": "Top250MoviesPagination",
+        "variables": r'{"first":250,"locale":"en-US"}',
+        "extensions": r'{"persistedQuery":{"sha256Hash":"26114ee01d97e04f65d6c8c7212ae8b7888fa57ceed105450d1fce09df749b2d","version":1}}',
+    }
+    headers = {"content-type": "application/json"}
+    jsonstr = await adownload(gql_api_url, query=query, headers=headers)
+    data = json.loads(jsonstr)
+    try:
+        imdb_title_ids = [
+            edge["node"]["id"] for edge in data["data"]["chartTitles"]["edges"]
+        ]
+        has_next_page = data["data"]["chartTitles"]["pageInfo"]["hasNextPage"]
+        has_previous_page = data["data"]["chartTitles"]["pageInfo"]["hasPreviousPage"]
+    except KeyError as err:
+        log.error("Unexpected data structure.", exc_info=err)
+        raise
+
+    if len(imdb_title_ids) != 250 or has_next_page or has_previous_page:
+        raise RuntimeError(f"Expected exactly 250 items, got {len(imdb_title_ids)}")
+
+    return imdb_title_ids
diff --git a/unwind/imdb_import.py b/unwind/imdb_import.py
index dad419e..5644917 100644
--- a/unwind/imdb_import.py
+++ b/unwind/imdb_import.py
@@ -17,7 +17,7 @@ log = logging.getLogger(__name__)
 T = TypeVar("T")
 
 # See
-# - https://www.imdb.com/interfaces/
+# - https://developer.imdb.com/non-commercial-datasets/
 # - https://datasets.imdbws.com/
 
 
@@ -254,13 +254,22 @@ async def download_datasets(*, basics_path: Path, ratings_path: Path) -> None:
     See https://www.imdb.com/interfaces/ and https://datasets.imdbws.com/ for
     more information on the IMDb database dumps.
     """
-    basics_url = "https://datasets.imdbws.com/title.basics.tsv.gz"
-    ratings_url = "https://datasets.imdbws.com/title.ratings.tsv.gz"
+    # name_basics_url = "https://datasets.imdbws.com/name.basics.tsv.gz"
+    # title_akas_url = "https://datasets.imdbws.com/title.akas.tsv.gz"
+    title_basics_url = "https://datasets.imdbws.com/title.basics.tsv.gz"
+    # title_crew_url = "https://datasets.imdbws.com/title.crew.tsv.gz"
+    # title_episode_url = "https://datasets.imdbws.com/title.episode.tsv.gz"
+    # title_principals_url = "https://datasets.imdbws.com/title.principals.tsv.gz"
+    title_ratings_url = "https://datasets.imdbws.com/title.ratings.tsv.gz"
 
     async with request.asession():
         await asyncio.gather(
-            request.adownload(ratings_url, to_path=ratings_path, only_if_newer=True),
-            request.adownload(basics_url, to_path=basics_path, only_if_newer=True),
+            request.adownload(
+                title_ratings_url, to_path=ratings_path, only_if_newer=True
+            ),
+            request.adownload(
+                title_basics_url, to_path=basics_path, only_if_newer=True
+            ),
         )
 
 
diff --git a/unwind/request.py b/unwind/request.py
index b4a41d4..2079200 100644
--- a/unwind/request.py
+++ b/unwind/request.py
@@ -11,7 +11,7 @@ from hashlib import md5
 from pathlib import Path
 from random import random
 from time import sleep, time
-from typing import Any, Callable, ParamSpec, TypeVar, cast
+from typing import Any, Callable, ParamSpec, TypeVar, cast, overload
 
 import bs4
 import httpx
@@ -201,10 +201,49 @@ def _last_modified_from_file(path: Path) -> float:
     return path.stat().st_mtime
 
 
+@overload
+async def adownload(
+    url: str,
+    *,
+    to_path: Path | str,
+    query: dict[str, str] | None = None,
+    headers: dict[str, str] | None = None,
+    replace_existing: bool | None = None,
+    only_if_newer: bool = False,
+    timeout: float | None = None,
+    chunk_callback: Callable[[bytes], Any] | None = None,
+    response_callback: Callable[[_Response_T], Any] | None = None,
+) -> None:
+    ...
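+
+# The paired @overload declarations let type checkers infer adownload's
+# return type from `to_path`: with a destination path the payload is written
+# to disk and None is returned; without one the response body is returned
+# as bytes.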
+
+
+@overload
+async def adownload(
+    url: str,
+    *,
+    to_path: None = None,
+    query: dict[str, str] | None = None,
+    headers: dict[str, str] | None = None,
+    replace_existing: bool | None = None,
+    only_if_newer: bool = False,
+    timeout: float | None = None,
+    chunk_callback: Callable[[bytes], Any] | None = None,
+    response_callback: Callable[[_Response_T], Any] | None = None,
+) -> bytes:
+    ...
+
+
 async def adownload(
     url: str,
     *,
     to_path: Path | str | None = None,
+    query: dict[str, str] | None = None,
+    headers: dict[str, str] | None = None,
     replace_existing: bool | None = None,
     only_if_newer: bool = False,
     timeout: float | None = None,
@@ -231,7 +270,8 @@ async def adownload(
         raise FileExistsError(23, "Would replace existing file", str(to_path))
 
     async with asession() as s:
-        headers = {}
+        if headers is None:
+            headers = {}
         if file_exists and only_if_newer:
             assert to_path
             file_lastmod = _last_modified_from_file(to_path)
@@ -239,7 +279,9 @@
                 file_lastmod, usegmt=True
             )
 
-        req = s.build_request(method="GET", url=url, headers=headers, timeout=timeout)
+        req = s.build_request(
+            method="GET", url=url, params=query, headers=headers, timeout=timeout
+        )
 
         log.debug("⚡️ Loading %s (%a) ...", req.url, dict(req.headers))
         resp = await s.send(req, follow_redirects=True, stream=True)
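
Reviewer note, not part of the patch: below is a minimal sketch of how the three new chart loaders could be driven together. It assumes only what the diff shows, namely that `unwind` is importable and that each loader handles its own HTTP session (as `load_top_250` does via `adownload`). The `main` entry point and the set arithmetic are illustrative, not code from this change.

```python
import asyncio

from unwind import imdb


async def main() -> None:
    # Each loader raises RuntimeError unless the chart yields exactly the
    # expected number of "tt*" title IDs, so the lengths here are reliable.
    top_250 = await imdb.load_top_250()
    most_popular = await imdb.load_most_popular_100()
    bottom = await imdb.load_bottom_100()

    # Charts can overlap (a Top 250 title may also be trending), so take a
    # set union rather than concatenating, to count distinct title IDs.
    distinct = set(top_250) | set(most_popular) | set(bottom)
    print(f"{len(distinct)} distinct IMDb title IDs across the three charts")


if __name__ == "__main__":
    asyncio.run(main())
```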