feat: add functions to retrieve IMDb chart lists
These charts are:
- the top 250 highest rated movies
- the top 100 most popular movies
- the bottom 100 lowest rated movies
parent 4fbdb26d9c
commit 2bf5607183
7 changed files with 176 additions and 10 deletions
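For reference, a minimal usage sketch of the new chart API (the direct calls mirror the new tests; whether a surrounding request session is required is not shown by this diff):

import asyncio

from unwind import imdb


async def main() -> None:
    # Each loader returns a list of IMDb title IDs such as "tt0111161" and
    # raises RuntimeError when the chart does not have the expected size.
    top_250 = await imdb.load_top_250()
    popular_100 = await imdb.load_most_popular_100()
    bottom_100 = await imdb.load_bottom_100()
    print(len(top_250), len(popular_100), len(bottom_100))


asyncio.run(main())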
BIN tests/fixtures/bottom_100.html.bz2 (vendored, new file; binary not shown)
BIN tests/fixtures/most_popular_100.html.bz2 (vendored, new file; binary not shown)
BIN tests/fixtures/top250.gql.json.bz2 (vendored, new file; binary not shown)
@@ -1,7 +1,16 @@
+import bz2
+from pathlib import Path
+from unittest.mock import AsyncMock
+
+import bs4
 import pytest
 
+from unwind import imdb
 from unwind.imdb import imdb_rating_from_score, score_from_imdb_rating
 
+testsdir = Path(__file__).parent
+fixturesdir = testsdir / "fixtures"
+
 
 @pytest.mark.parametrize("rating", (x / 10 for x in range(10, 101)))
 def test_rating_conversion(rating: float):
@@ -18,3 +27,41 @@ def test_score_conversion(score: int):
         pytest.skip(f"Score cannot be mapped back correctly: {score}")
 
     assert score == score_from_imdb_rating(imdb_rating_from_score(score))
+
+
+@pytest.mark.asyncio
+async def test_load_most_popular_100(monkeypatch):
+    with bz2.open(fixturesdir / "most_popular_100.html.bz2", "rb") as f:
+        html = f.read()
+    soup = bs4.BeautifulSoup(html, "html5lib")
+
+    monkeypatch.setattr(imdb, "asoup_from_url", AsyncMock(return_value=soup))
+
+    movie_ids = await imdb.load_most_popular_100()
+    assert len(movie_ids) == 100
+    assert all(id_.startswith("tt") for id_ in movie_ids)
+
+
+@pytest.mark.asyncio
+async def test_load_bottom_100(monkeypatch):
+    with bz2.open(fixturesdir / "bottom_100.html.bz2", "rb") as f:
+        html = f.read()
+    soup = bs4.BeautifulSoup(html, "html5lib")
+
+    monkeypatch.setattr(imdb, "asoup_from_url", AsyncMock(return_value=soup))
+
+    movie_ids = await imdb.load_bottom_100()
+    assert len(movie_ids) == 100
+    assert all(id_.startswith("tt") for id_ in movie_ids)
+
+
+@pytest.mark.asyncio
+async def test_load_top_250(monkeypatch):
+    with bz2.open(fixturesdir / "top250.gql.json.bz2", "rb") as f:
+        jsonstr = f.read()
+
+    monkeypatch.setattr(imdb, "adownload", AsyncMock(return_value=jsonstr))
+
+    movie_ids = await imdb.load_top_250()
+    assert len(movie_ids) == 250
+    assert all(id_.startswith("tt") for id_ in movie_ids)
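The fixtures above are bz2-compressed snapshots of live IMDb pages. A hypothetical helper for (re)generating them; the httpx fetch and the helper itself are assumptions for illustration, not part of this commit:

import bz2
from pathlib import Path

import httpx


def save_fixture(url: str, name: str) -> None:
    # Download a chart page and store it bz2-compressed under tests/fixtures/.
    html = httpx.get(url, follow_redirects=True).content
    (Path("tests/fixtures") / name).write_bytes(bz2.compress(html))


# save_fixture("https://www.imdb.com/chart/bottom/", "bottom_100.html.bz2")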
@@ -1,14 +1,16 @@
+import json
 import logging
 import re
 from collections import namedtuple
 from datetime import datetime
+from typing import AsyncIterable
 from urllib.parse import urljoin
 
 import bs4
 
 from . import db
 from .models import Movie, Rating, User
-from .request import asession, asoup_from_url, cache_path
+from .request import adownload, asession, asoup_from_url, cache_path
 
 log = logging.getLogger(__name__)
 
@@ -207,7 +209,7 @@ async def parse_page(url: str) -> tuple[list[Rating], str | None]:
     return (ratings, next_url if url != next_url else None)
 
 
-async def load_ratings(user_id: str):
+async def load_ratings(user_id: str) -> AsyncIterable[tuple[Rating, bool]]:
     next_url = user_ratings_url(user_id)
 
     while next_url:
@@ -228,3 +230,74 @@ async def load_ratings(user_id: str):
             is_updated = await db.add_or_update_rating(conn, rating)
 
             yield rating, is_updated
+
+
+async def _ids_from_list_html(url: str) -> AsyncIterable[str]:
+    """Yield all IMDb movie IDs (`tt*`) found at the given URL."""
+    # document.querySelectorAll('li.ipc-metadata-list-summary-item a.ipc-title-link-wrapper')
+    # .href: '/title/tt1213644/?ref_=chtbtm_t_1'
+    # .text(): '1. Disaster Movie'
+    soup = await asoup_from_url(url)
+    for item in soup.find_all("li", class_="ipc-metadata-list-summary-item"):
+        if (link := item.find("a", class_="ipc-title-link-wrapper")) is not None:
+            if (href := link.get("href")) is not None:
+                if match_ := find_movie_id(href):
+                    yield match_["id"]
+
+
+async def load_most_popular_100() -> list[str]:
+    """Return IMDb's top 100 most popular movies.
+
+    IMDb Charts: Most Popular Movies
+    As determined by IMDb users
+    """
+    url = "https://www.imdb.com/chart/moviemeter/"
+    ids = [tid async for tid in _ids_from_list_html(url)]
+    if len(ids) != 100:
+        raise RuntimeError(f"Expected exactly 100 items, got {len(ids)}")
+    return ids
+
+
+async def load_bottom_100() -> list[str]:
+    """Return IMDb's bottom 100 lowest rated movies.
+
+    IMDb Charts: Lowest Rated Movies
+    Bottom 100 as voted by IMDb users
+    """
+    url = "https://www.imdb.com/chart/bottom/"
+    ids = [tid async for tid in _ids_from_list_html(url)]
+    if len(ids) != 100:
+        raise RuntimeError(f"Expected exactly 100 items, got {len(ids)}")
+    return ids
+
+
+async def load_top_250() -> list[str]:
+    """Return IMDb's top 250 highest rated movies.
+
+    IMDb Charts: IMDb Top 250 Movies
+    As rated by regular IMDb voters.
+    """
+    # Called from page https://www.imdb.com/chart/top/
+    gql_api_url = "https://caching.graphql.imdb.com/"
+    query = {
+        "operationName": "Top250MoviesPagination",
+        "variables": r'{"first":250,"locale":"en-US"}',
+        "extensions": r'{"persistedQuery":{"sha256Hash":"26114ee01d97e04f65d6c8c7212ae8b7888fa57ceed105450d1fce09df749b2d","version":1}}',
+    }
+    headers = {"content-type": "application/json"}
+    jsonstr = await adownload(gql_api_url, query=query, headers=headers)
+    data = json.loads(jsonstr)
+    try:
+        imdb_title_ids = [
+            edge["node"]["id"] for edge in data["data"]["chartTitles"]["edges"]
+        ]
+        has_next_page = data["data"]["chartTitles"]["pageInfo"]["hasNextPage"]
+        has_previous_page = data["data"]["chartTitles"]["pageInfo"]["hasPreviousPage"]
+    except KeyError as err:
+        log.error("Unexpected data structure.", exc_info=err)
+        raise
+
+    if len(imdb_title_ids) != 250 or has_next_page or has_previous_page:
+        raise RuntimeError(f"Expected exactly 250 items, got {len(imdb_title_ids)}")
+
+    return imdb_title_ids
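For orientation, the minimal response shape that load_top_250() relies on, written as a Python literal (values are illustrative; only the keys accessed above matter):

payload = {
    "data": {
        "chartTitles": {
            "edges": [
                {"node": {"id": "tt0111161"}},  # one edge per ranked title
                # ... 249 more edges ...
            ],
            "pageInfo": {"hasNextPage": False, "hasPreviousPage": False},
        }
    }
}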
@@ -17,7 +17,7 @@ log = logging.getLogger(__name__)
 T = TypeVar("T")
 
 # See
-# - https://www.imdb.com/interfaces/
+# - https://developer.imdb.com/non-commercial-datasets/
 # - https://datasets.imdbws.com/
 
 
@@ -254,13 +254,22 @@ async def download_datasets(*, basics_path: Path, ratings_path: Path) -> None:
     See https://www.imdb.com/interfaces/ and https://datasets.imdbws.com/ for
     more information on the IMDb database dumps.
     """
-    basics_url = "https://datasets.imdbws.com/title.basics.tsv.gz"
-    ratings_url = "https://datasets.imdbws.com/title.ratings.tsv.gz"
+    # name_basics_url = "https://datasets.imdbws.com/name.basics.tsv.gz"
+    # title_akas_url = "https://datasets.imdbws.com/title.akas.tsv.gz"
+    title_basics_url = "https://datasets.imdbws.com/title.basics.tsv.gz"
+    # title_crew_url = "https://datasets.imdbws.com/title.crew.tsv.gz"
+    # title_episode_url = "https://datasets.imdbws.com/title.episode.tsv.gz"
+    # title_principals_url = "https://datasets.imdbws.com/title.principals.tsv.gz"
+    title_ratings_url = "https://datasets.imdbws.com/title.ratings.tsv.gz"
 
     async with request.asession():
         await asyncio.gather(
-            request.adownload(ratings_url, to_path=ratings_path, only_if_newer=True),
-            request.adownload(basics_url, to_path=basics_path, only_if_newer=True),
+            request.adownload(
+                title_ratings_url, to_path=ratings_path, only_if_newer=True
+            ),
+            request.adownload(
+                title_basics_url, to_path=basics_path, only_if_newer=True
+            ),
         )
 
 
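A hypothetical call site for download_datasets() (the target paths are placeholders, and the function is assumed to be imported from its defining module):

import asyncio
from pathlib import Path

asyncio.run(
    download_datasets(
        basics_path=Path("data/title.basics.tsv.gz"),
        ratings_path=Path("data/title.ratings.tsv.gz"),
    )
)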
@@ -11,7 +11,7 @@ from hashlib import md5
 from pathlib import Path
 from random import random
 from time import sleep, time
-from typing import Any, Callable, ParamSpec, TypeVar, cast
+from typing import Any, Callable, ParamSpec, TypeVar, cast, overload
 
 import bs4
 import httpx
@@ -201,10 +201,44 @@ def _last_modified_from_file(path: Path) -> float:
     return path.stat().st_mtime
 
 
+@overload
+async def adownload(
+    url: str,
+    *,
+    to_path: Path | str,
+    query: dict[str, str] | None = None,
+    headers: dict[str, str] | None = None,
+    replace_existing: bool | None = None,
+    only_if_newer: bool = False,
+    timeout: float | None = None,
+    chunk_callback: Callable[[bytes], Any] | None = None,
+    response_callback: Callable[[_Response_T], Any] | None = None,
+) -> None:
+    ...
+
+
+@overload
+async def adownload(
+    url: str,
+    *,
+    to_path: None = None,
+    query: dict[str, str] | None = None,
+    headers: dict[str, str] | None = None,
+    replace_existing: bool | None = None,
+    only_if_newer: bool = False,
+    timeout: float | None = None,
+    chunk_callback: Callable[[bytes], Any] | None = None,
+    response_callback: Callable[[_Response_T], Any] | None = None,
+) -> bytes:
+    ...
+
+
 async def adownload(
     url: str,
     *,
     to_path: Path | str | None = None,
+    query: dict[str, str] | None = None,
+    headers: dict[str, str] | None = None,
     replace_existing: bool | None = None,
     only_if_newer: bool = False,
     timeout: float | None = None,
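With the two overloads above, type checkers can narrow adownload's return type at the call site; a minimal sketch (URLs are placeholders, adownload assumed in scope):

from pathlib import Path


async def demo() -> None:
    # No to_path: the response body is returned as bytes.
    body: bytes = await adownload("https://example.com/index.html")
    # With to_path: the body is written to disk and None is returned.
    await adownload("https://example.com/data.gz", to_path=Path("data.gz"))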
@@ -231,6 +265,7 @@ async def adownload(
         raise FileExistsError(23, "Would replace existing file", str(to_path))
 
     async with asession() as s:
-        headers = {}
+        if headers is None:
+            headers = {}
         if file_exists and only_if_newer:
             assert to_path
@@ -239,7 +274,9 @@ async def adownload(
                 file_lastmod, usegmt=True
             )
 
-        req = s.build_request(method="GET", url=url, headers=headers, timeout=timeout)
+        req = s.build_request(
+            method="GET", url=url, params=query, headers=headers, timeout=timeout
+        )
 
         log.debug("⚡️ Loading %s (%a) ...", req.url, dict(req.headers))
         resp = await s.send(req, follow_redirects=True, stream=True)