feat: add functions to retrieve IMDb chart lists
These charts are:

- the top 250 highest rated movies
- the top 100 most popular movies
- the bottom 100 lowest rated movies
commit 2bf5607183 (parent 4fbdb26d9c)

7 changed files with 176 additions and 10 deletions
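Taken together, the new loaders give a small public API. A minimal usage sketch, assuming the functions can be awaited directly the way the tests below exercise them (real use needs network access and IMDb's current markup):

```python
# Sketch only: fetch all three charts concurrently with the new API.
import asyncio

from unwind import imdb


async def main() -> None:
    top, popular, bottom = await asyncio.gather(
        imdb.load_top_250(),
        imdb.load_most_popular_100(),
        imdb.load_bottom_100(),
    )
    # Each result is a list of IMDb title IDs such as "tt0111161".
    print(len(top), len(popular), len(bottom))  # expected: 250 100 100


asyncio.run(main())
```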
BIN  tests/fixtures/bottom_100.html.bz2       (vendored, new file; binary file not shown)
BIN  tests/fixtures/most_popular_100.html.bz2 (vendored, new file; binary file not shown)
BIN  tests/fixtures/top250.gql.json.bz2       (vendored, new file; binary file not shown)
@@ -1,7 +1,16 @@
+import bz2
+from pathlib import Path
+from unittest.mock import AsyncMock
+
+import bs4
 import pytest
 
+from unwind import imdb
 from unwind.imdb import imdb_rating_from_score, score_from_imdb_rating
 
+testsdir = Path(__file__).parent
+fixturesdir = testsdir / "fixtures"
+
 
 @pytest.mark.parametrize("rating", (x / 10 for x in range(10, 101)))
 def test_rating_conversion(rating: float):
@@ -18,3 +27,41 @@ def test_score_conversion(score: int):
         pytest.skip(f"Score cannot be mapped back correctly: {score}")
 
     assert score == score_from_imdb_rating(imdb_rating_from_score(score))
+
+
+@pytest.mark.asyncio
+async def test_load_most_popular_100(monkeypatch):
+    with bz2.open(fixturesdir / "most_popular_100.html.bz2", "rb") as f:
+        html = f.read()
+    soup = bs4.BeautifulSoup(html, "html5lib")
+
+    monkeypatch.setattr(imdb, "asoup_from_url", AsyncMock(return_value=soup))
+
+    movie_ids = await imdb.load_most_popular_100()
+    assert len(movie_ids) == 100
+    assert all(id_.startswith("tt") for id_ in movie_ids)
+
+
+@pytest.mark.asyncio
+async def test_load_bottom_100(monkeypatch):
+    with bz2.open(fixturesdir / "bottom_100.html.bz2", "rb") as f:
+        html = f.read()
+    soup = bs4.BeautifulSoup(html, "html5lib")
+
+    monkeypatch.setattr(imdb, "asoup_from_url", AsyncMock(return_value=soup))
+
+    movie_ids = await imdb.load_bottom_100()
+    assert len(movie_ids) == 100
+    assert all(id_.startswith("tt") for id_ in movie_ids)
+
+
+@pytest.mark.asyncio
+async def test_load_top_250(monkeypatch):
+    with bz2.open(fixturesdir / "top250.gql.json.bz2", "rb") as f:
+        jsonstr = f.read()
+
+    monkeypatch.setattr(imdb, "adownload", AsyncMock(return_value=jsonstr))
+
+    movie_ids = await imdb.load_top_250()
+    assert len(movie_ids) == 250
+    assert all(id_.startswith("tt") for id_ in movie_ids)
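The tests replay bz2-compressed snapshots of the live pages instead of hitting the network. The commit does not show how the fixtures were captured; one plausible way to regenerate them is sketched below (hypothetical; IMDb may reject requests without a browser-like User-Agent):

```python
# Hypothetical fixture regeneration -- not part of this commit.
import bz2

import httpx

resp = httpx.get("https://www.imdb.com/chart/bottom/", follow_redirects=True)
resp.raise_for_status()
with bz2.open("tests/fixtures/bottom_100.html.bz2", "wb") as f:
    f.write(resp.content)
```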
@@ -1,14 +1,16 @@
+import json
 import logging
 import re
 from collections import namedtuple
 from datetime import datetime
+from typing import AsyncIterable
 from urllib.parse import urljoin
 
 import bs4
 
 from . import db
 from .models import Movie, Rating, User
-from .request import asession, asoup_from_url, cache_path
+from .request import adownload, asession, asoup_from_url, cache_path
 
 log = logging.getLogger(__name__)
 
@@ -207,7 +209,7 @@ async def parse_page(url: str) -> tuple[list[Rating], str | None]:
     return (ratings, next_url if url != next_url else None)
 
 
-async def load_ratings(user_id: str):
+async def load_ratings(user_id: str) -> AsyncIterable[tuple[Rating, bool]]:
     next_url = user_ratings_url(user_id)
 
     while next_url:
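The new annotation documents that load_ratings is an async generator of (rating, was_updated) pairs. A consumption sketch (the user id is made up):

```python
# Sketch: consuming the annotated async generator (user id is made up).
from unwind.imdb import load_ratings


async def sync_user() -> None:
    async for rating, is_updated in load_ratings("ur0000000"):
        if is_updated:
            print("updated:", rating)
```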
@@ -228,3 +230,74 @@ async def load_ratings(user_id: str):
             is_updated = await db.add_or_update_rating(conn, rating)
 
             yield rating, is_updated
+
+
+async def _ids_from_list_html(url: str) -> AsyncIterable[str]:
+    """Return all IMDb movie IDs (`tt*`) from the given URL."""
+    # document.querySelectorAll('li.ipc-metadata-list-summary-item a.ipc-title-link-wrapper')
+    # .href: '/title/tt1213644/?ref_=chtbtm_t_1'
+    # .text(): '1. Disaster Movie'
+    soup = await asoup_from_url(url)
+    for item in soup.find_all("li", class_="ipc-metadata-list-summary-item"):
+        if (link := item.find("a", class_="ipc-title-link-wrapper")) is not None:
+            if (href := link.get("href")) is not None:
+                if match_ := find_movie_id(href):
+                    yield match_["id"]
+
+
+async def load_most_popular_100() -> list[str]:
+    """Return IMDb's top 100 most popular movies.
+
+    IMDb Charts: Most Popular Movies
+    As determined by IMDb users
+    """
+    url = "https://www.imdb.com/chart/moviemeter/"
+    ids = [tid async for tid in _ids_from_list_html(url)]
+    if len(ids) != 100:
+        raise RuntimeError(f"Expected exactly 100 items, got {len(ids)}")
+    return ids
+
+
+async def load_bottom_100() -> list[str]:
+    """Return IMDb's bottom 100 lowest rated movies.
+
+    IMDb Charts: Lowest Rated Movies
+    Bottom 100 as voted by IMDb users
+    """
+    url = "https://www.imdb.com/chart/bottom/"
+    ids = [tid async for tid in _ids_from_list_html(url)]
+    if len(ids) != 100:
+        raise RuntimeError(f"Expected exactly 100 items, got {len(ids)}")
+    return ids
+
+
+async def load_top_250() -> list[str]:
+    """Return IMDb's top 250 highest rated movies.
+
+    IMDb Charts: IMDb Top 250 Movies
+    As rated by regular IMDb voters.
+    """
+    # Called from page https://www.imdb.com/chart/top/
+    gql_api_url = "https://caching.graphql.imdb.com/"
+    query = {
+        "operationName": "Top250MoviesPagination",
+        "variables": r'{"first":250,"locale":"en-US"}',
+        "extensions": r'{"persistedQuery":{"sha256Hash":"26114ee01d97e04f65d6c8c7212ae8b7888fa57ceed105450d1fce09df749b2d","version":1}}',
+    }
+    headers = {"content-type": "application/json"}
+    jsonstr = await adownload(gql_api_url, query=query, headers=headers)
+    data = json.loads(jsonstr)
+    try:
+        imdb_title_ids = [
+            edge["node"]["id"] for edge in data["data"]["chartTitles"]["edges"]
+        ]
+        has_next_page = data["data"]["chartTitles"]["pageInfo"]["hasNextPage"]
+        has_previous_page = data["data"]["chartTitles"]["pageInfo"]["hasPreviousPage"]
+    except KeyError as err:
+        log.error("Unexpected data structure.", exc_info=err)
+        raise
+
+    if len(imdb_title_ids) != 250 or has_next_page or has_previous_page:
+        raise RuntimeError(f"Expected exactly 250 items, got {len(imdb_title_ids)}")
+
+    return imdb_title_ids
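For reference, an abridged reconstruction of the JSON payload load_top_250 expects, with the field names taken from the parsing code above (not a captured response; the title IDs are illustrative):

```python
# Abridged reconstruction of the expected GraphQL payload; the same
# extraction expression as in load_top_250 pulls out the title IDs.
data = {
    "data": {
        "chartTitles": {
            "edges": [
                {"node": {"id": "tt0111161"}},
                {"node": {"id": "tt0068646"}},
            ],
            "pageInfo": {"hasNextPage": False, "hasPreviousPage": False},
        }
    }
}

ids = [edge["node"]["id"] for edge in data["data"]["chartTitles"]["edges"]]
assert ids == ["tt0111161", "tt0068646"]
```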
@@ -17,7 +17,7 @@ log = logging.getLogger(__name__)
 T = TypeVar("T")
 
 # See
-# - https://www.imdb.com/interfaces/
+# - https://developer.imdb.com/non-commercial-datasets/
 # - https://datasets.imdbws.com/
 
 
@@ -254,13 +254,22 @@ async def download_datasets(*, basics_path: Path, ratings_path: Path) -> None:
     See https://www.imdb.com/interfaces/ and https://datasets.imdbws.com/ for
     more information on the IMDb database dumps.
     """
-    basics_url = "https://datasets.imdbws.com/title.basics.tsv.gz"
-    ratings_url = "https://datasets.imdbws.com/title.ratings.tsv.gz"
+    # name_basics_url = "https://datasets.imdbws.com/name.basics.tsv.gz"
+    # title_akas_url = "https://datasets.imdbws.com/title.akas.tsv.gz"
+    title_basics_url = "https://datasets.imdbws.com/title.basics.tsv.gz"
+    # title_crew_url = "https://datasets.imdbws.com/title.crew.tsv.gz"
+    # title_episode_url = "https://datasets.imdbws.com/title.episode.tsv.gz"
+    # title_principals_url = "https://datasets.imdbws.com/title.principals.tsv.gz"
+    title_ratings_url = "https://datasets.imdbws.com/title.ratings.tsv.gz"
 
     async with request.asession():
         await asyncio.gather(
-            request.adownload(ratings_url, to_path=ratings_path, only_if_newer=True),
-            request.adownload(basics_url, to_path=basics_path, only_if_newer=True),
+            request.adownload(
+                title_ratings_url, to_path=ratings_path, only_if_newer=True
+            ),
+            request.adownload(
+                title_basics_url, to_path=basics_path, only_if_newer=True
+            ),
         )
 
 
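The downloaded dumps are gzipped TSV files. A standard-library reading sketch (column names as documented for title.ratings.tsv.gz):

```python
# Sketch: iterate over the ratings dump downloaded above.
import csv
import gzip

with gzip.open("title.ratings.tsv.gz", "rt", encoding="utf-8") as f:
    for row in csv.DictReader(f, delimiter="\t"):
        # Columns: tconst, averageRating, numVotes
        if int(row["numVotes"]) > 1_000_000:
            print(row["tconst"], row["averageRating"])
```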
@@ -11,7 +11,7 @@ from hashlib import md5
 from pathlib import Path
 from random import random
 from time import sleep, time
-from typing import Any, Callable, ParamSpec, TypeVar, cast
+from typing import Any, Callable, ParamSpec, TypeVar, cast, overload
 
 import bs4
 import httpx
@@ -201,10 +201,44 @@ def _last_modified_from_file(path: Path) -> float:
     return path.stat().st_mtime
 
 
+@overload
+async def adownload(
+    url: str,
+    *,
+    to_path: Path | str,
+    query: dict[str, str] | None = None,
+    headers: dict[str, str] | None = None,
+    replace_existing: bool | None = None,
+    only_if_newer: bool = False,
+    timeout: float | None = None,
+    chunk_callback: Callable[[bytes], Any] | None = None,
+    response_callback: Callable[[_Response_T], Any] | None = None,
+) -> None:
+    ...
+
+
+@overload
+async def adownload(
+    url: str,
+    *,
+    to_path: None = None,
+    query: dict[str, str] | None = None,
+    headers: dict[str, str] | None = None,
+    replace_existing: bool | None = None,
+    only_if_newer: bool = False,
+    timeout: float | None = None,
+    chunk_callback: Callable[[bytes], Any] | None = None,
+    response_callback: Callable[[_Response_T], Any] | None = None,
+) -> bytes:
+    ...
+
+
 async def adownload(
     url: str,
     *,
     to_path: Path | str | None = None,
+    query: dict[str, str] | None = None,
     headers: dict[str, str] | None = None,
     replace_existing: bool | None = None,
     only_if_newer: bool = False,
     timeout: float | None = None,
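The two @overload signatures let a type checker pick the return type from the call site: with to_path, the body goes to disk and None comes back; without it, the body is returned as bytes. A sketch of what a checker now infers (the URL is illustrative):

```python
# Sketch: return types a type checker infers from the overloads.
from unwind.request import adownload


async def demo() -> None:
    body = await adownload("https://example.org/data.json")
    # reveal_type(body) -> bytes

    result = await adownload("https://example.org/data.json", to_path="data.json")
    # reveal_type(result) -> None
```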
@@ -231,6 +265,7 @@ async def adownload(
         raise FileExistsError(23, "Would replace existing file", str(to_path))
 
     async with asession() as s:
+        if headers is None:
+            headers = {}
         if file_exists and only_if_newer:
             assert to_path
@@ -239,7 +274,9 @@ async def adownload(
                 file_lastmod, usegmt=True
             )
 
-        req = s.build_request(method="GET", url=url, headers=headers, timeout=timeout)
+        req = s.build_request(
+            method="GET", url=url, params=query, headers=headers, timeout=timeout
+        )
 
         log.debug("⚡️ Loading %s (%a) ...", req.url, dict(req.headers))
         resp = await s.send(req, follow_redirects=True, stream=True)
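Forwarding the query dict as params= leaves the URL encoding to httpx. A standalone illustration (the URL is illustrative):

```python
# Sketch: httpx encodes params= into the request's query string.
import httpx

client = httpx.Client()
req = client.build_request("GET", "https://example.org/", params={"q": "a b"})
print(req.url)  # e.g. https://example.org/?q=a+b
```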
|
|||
Loading…
Add table
Add a link
Reference in a new issue