diff --git a/tests/fixtures/bottom_100.html.bz2 b/tests/fixtures/bottom_100.html.bz2
new file mode 100644
index 0000000..57cf96e
Binary files /dev/null and b/tests/fixtures/bottom_100.html.bz2 differ
diff --git a/tests/fixtures/most_popular_100.html.bz2 b/tests/fixtures/most_popular_100.html.bz2
new file mode 100644
index 0000000..13fdf12
Binary files /dev/null and b/tests/fixtures/most_popular_100.html.bz2 differ
diff --git a/tests/fixtures/top250.gql.json.bz2 b/tests/fixtures/top250.gql.json.bz2
new file mode 100644
index 0000000..d4d8bfc
Binary files /dev/null and b/tests/fixtures/top250.gql.json.bz2 differ
diff --git a/tests/test_imdb.py b/tests/test_imdb.py
index 00467ce..4f949d6 100644
--- a/tests/test_imdb.py
+++ b/tests/test_imdb.py
@@ -1,7 +1,16 @@
+import bz2
+from pathlib import Path
+from unittest.mock import AsyncMock
+
+import bs4
import pytest
+from unwind import imdb
from unwind.imdb import imdb_rating_from_score, score_from_imdb_rating
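+# The fixtures are bz2-compressed captures of IMDb responses; each test
+# decompresses them in memory.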
+testsdir = Path(__file__).parent
+fixturesdir = testsdir / "fixtures"
+
@pytest.mark.parametrize("rating", (x / 10 for x in range(10, 101)))
def test_rating_conversion(rating: float):
@@ -18,3 +27,41 @@ def test_score_conversion(score: int):
pytest.skip(f"Score cannot be mapped back correctly: {score}")
assert score == score_from_imdb_rating(imdb_rating_from_score(score))
+
+
+@pytest.mark.asyncio
+async def test_load_most_popular_100(monkeypatch):
+ with bz2.open(fixturesdir / "most_popular_100.html.bz2", "rb") as f:
+ html = f.read()
+ soup = bs4.BeautifulSoup(html, "html5lib")
+
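+    # Patch the soup fetch so the test parses the stored chart page offline.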
+ monkeypatch.setattr(imdb, "asoup_from_url", AsyncMock(return_value=soup))
+
+ movie_ids = await imdb.load_most_popular_100()
+ assert len(movie_ids) == 100
+ assert all(id_.startswith("tt") for id_ in movie_ids)
+
+
+@pytest.mark.asyncio
+async def test_load_bottom_100(monkeypatch):
+ with bz2.open(fixturesdir / "bottom_100.html.bz2", "rb") as f:
+ html = f.read()
+ soup = bs4.BeautifulSoup(html, "html5lib")
+
+ monkeypatch.setattr(imdb, "asoup_from_url", AsyncMock(return_value=soup))
+
+ movie_ids = await imdb.load_bottom_100()
+ assert len(movie_ids) == 100
+ assert all(id_.startswith("tt") for id_ in movie_ids)
+
+
+@pytest.mark.asyncio
+async def test_load_top_250(monkeypatch):
+ with bz2.open(fixturesdir / "top250.gql.json.bz2", "rb") as f:
+ jsonstr = f.read()
+
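+    # Patch the raw download so the stored GraphQL response is used offline.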
+ monkeypatch.setattr(imdb, "adownload", AsyncMock(return_value=jsonstr))
+
+ movie_ids = await imdb.load_top_250()
+ assert len(movie_ids) == 250
+ assert all(id_.startswith("tt") for id_ in movie_ids)
diff --git a/unwind/imdb.py b/unwind/imdb.py
index 631a088..6646d78 100644
--- a/unwind/imdb.py
+++ b/unwind/imdb.py
@@ -1,14 +1,16 @@
+import json
import logging
import re
from collections import namedtuple
from datetime import datetime
+from typing import AsyncIterable
from urllib.parse import urljoin
import bs4
from . import db
from .models import Movie, Rating, User
-from .request import asession, asoup_from_url, cache_path
+from .request import adownload, asession, asoup_from_url, cache_path
log = logging.getLogger(__name__)
@@ -207,7 +209,7 @@ async def parse_page(url: str) -> tuple[list[Rating], str | None]:
return (ratings, next_url if url != next_url else None)
-async def load_ratings(user_id: str):
+async def load_ratings(user_id: str) -> AsyncIterable[tuple[Rating, bool]]:
next_url = user_ratings_url(user_id)
while next_url:
@@ -228,3 +230,74 @@ async def load_ratings(user_id: str):
is_updated = await db.add_or_update_rating(conn, rating)
yield rating, is_updated
+
+
+async def _ids_from_list_html(url: str) -> AsyncIterable[str]:
+ """Return all IMDb movie IDs (`tt*`) from the given URL."""
+ # document.querySelectorAll('li.ipc-metadata-list-summary-item a.ipc-title-link-wrapper')
+ # .href: '/title/tt1213644/?ref_=chtbtm_t_1'
+ # .text(): '1. Disaster Movie'
+ soup = await asoup_from_url(url)
+ for item in soup.find_all("li", class_="ipc-metadata-list-summary-item"):
+ if (link := item.find("a", class_="ipc-title-link-wrapper")) is not None:
+ if (href := link.get("href")) is not None:
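+                # find_movie_id matches "/title/tt.../" hrefs and exposes the
+                # title ID (e.g. "tt1213644") via its "id" group.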
+ if match_ := find_movie_id(href):
+ yield match_["id"]
+
+
+async def load_most_popular_100() -> list[str]:
+ """Return the IMDb's top 100 most popular movies.
+
+ IMDb Charts: Most Popular Movies
+ As determined by IMDb users
+ """
+ url = "https://www.imdb.com/chart/moviemeter/"
+ ids = [tid async for tid in _ids_from_list_html(url)]
+ if len(ids) != 100:
+ raise RuntimeError(f"Expected exactly 100 items, got {len(ids)}")
+ return ids
+
+
+async def load_bottom_100() -> list[str]:
+ """Return the IMDb's bottom 100 lowest rated movies.
+
+ IMDb Charts: Lowest Rated Movies
+ Bottom 100 as voted by IMDb users
+ """
+ url = "https://www.imdb.com/chart/bottom/"
+ ids = [tid async for tid in _ids_from_list_html(url)]
+ if len(ids) != 100:
+ raise RuntimeError(f"Expected exactly 100 items, got {len(ids)}")
+ return ids
+
+
+async def load_top_250() -> list[str]:
+ """Return the IMDb's top 250 highest rated movies.
+
+ IMDb Charts: IMDb Top 250 Movies
+ As rated by regular IMDb voters.
+ """
+ # Called from page https://www.imdb.com/chart/top/
+    gql_api_url = "https://caching.graphql.imdb.com/"
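+    # The endpoint uses GraphQL persisted queries: the sha256Hash identifies a
+    # query document stored server-side, so only the variables travel with the
+    # GET request.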
+ query = {
+ "operationName": "Top250MoviesPagination",
+ "variables": r'{"first":250,"locale":"en-US"}',
+ "extensions": r'{"persistedQuery":{"sha256Hash":"26114ee01d97e04f65d6c8c7212ae8b7888fa57ceed105450d1fce09df749b2d","version":1}}',
+ }
+ headers = {"content-type": "application/json"}
+    jsonstr = await adownload(gql_api_url, query=query, headers=headers)
+ data = json.loads(jsonstr)
+ try:
+ imdb_title_ids = [
+ edge["node"]["id"] for edge in data["data"]["chartTitles"]["edges"]
+ ]
+ has_next_page = data["data"]["chartTitles"]["pageInfo"]["hasNextPage"]
+ has_previous_page = data["data"]["chartTitles"]["pageInfo"]["hasPreviousPage"]
+ except KeyError as err:
+ log.error("Unexpected data structure.", exc_info=err)
+ raise
+
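+    # The persisted query requests all 250 titles at once ("first": 250), so
+    # the response should not be paginated.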
+    if len(imdb_title_ids) != 250 or has_next_page or has_previous_page:
+        raise RuntimeError(
+            f"Expected a single page of exactly 250 items, got {len(imdb_title_ids)}"
+        )
+
+ return imdb_title_ids
diff --git a/unwind/imdb_import.py b/unwind/imdb_import.py
index dad419e..5644917 100644
--- a/unwind/imdb_import.py
+++ b/unwind/imdb_import.py
@@ -17,7 +17,7 @@ log = logging.getLogger(__name__)
T = TypeVar("T")
# See
-# - https://www.imdb.com/interfaces/
+# - https://developer.imdb.com/non-commercial-datasets/
# - https://datasets.imdbws.com/
@@ -254,13 +254,22 @@ async def download_datasets(*, basics_path: Path, ratings_path: Path) -> None:
-    See https://www.imdb.com/interfaces/ and https://datasets.imdbws.com/ for
-    more information on the IMDb database dumps.
+    See https://developer.imdb.com/non-commercial-datasets/ and
+    https://datasets.imdbws.com/ for more information on the IMDb datasets.
"""
- basics_url = "https://datasets.imdbws.com/title.basics.tsv.gz"
- ratings_url = "https://datasets.imdbws.com/title.ratings.tsv.gz"
+ # name_basics_url = "https://datasets.imdbws.com/name.basics.tsv.gz"
+ # title_akas_url = "https://datasets.imdbws.com/title.akas.tsv.gz"
+ title_basics_url = "https://datasets.imdbws.com/title.basics.tsv.gz"
+ # title_crew_url = "https://datasets.imdbws.com/title.crew.tsv.gz"
+ # title_episode_url = "https://datasets.imdbws.com/title.episode.tsv.gz"
+ # title_principals_url = "https://datasets.imdbws.com/title.principals.tsv.gz"
+ title_ratings_url = "https://datasets.imdbws.com/title.ratings.tsv.gz"
async with request.asession():
await asyncio.gather(
- request.adownload(ratings_url, to_path=ratings_path, only_if_newer=True),
- request.adownload(basics_url, to_path=basics_path, only_if_newer=True),
+ request.adownload(
+ title_ratings_url, to_path=ratings_path, only_if_newer=True
+ ),
+ request.adownload(
+ title_basics_url, to_path=basics_path, only_if_newer=True
+ ),
)
diff --git a/unwind/request.py b/unwind/request.py
index b4a41d4..2079200 100644
--- a/unwind/request.py
+++ b/unwind/request.py
@@ -11,7 +11,7 @@ from hashlib import md5
from pathlib import Path
from random import random
from time import sleep, time
-from typing import Any, Callable, ParamSpec, TypeVar, cast
+from typing import Any, Callable, ParamSpec, TypeVar, cast, overload
import bs4
import httpx
@@ -201,10 +201,44 @@ def _last_modified_from_file(path: Path) -> float:
return path.stat().st_mtime
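+# The overloads below give adownload a precise return type: with to_path the
+# body is streamed to a file and None is returned; without it the response
+# body is returned as bytes.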
+@overload
+async def adownload(
+ url: str,
+ *,
+ to_path: Path | str,
+ query: dict[str, str] | None = None,
+ headers: dict[str, str] | None = None,
+ replace_existing: bool | None = None,
+ only_if_newer: bool = False,
+ timeout: float | None = None,
+ chunk_callback: Callable[[bytes], Any] | None = None,
+ response_callback: Callable[[_Response_T], Any] | None = None,
+) -> None:
+ ...
+
+
+@overload
+async def adownload(
+ url: str,
+ *,
+ to_path: None = None,
+ query: dict[str, str] | None = None,
+ headers: dict[str, str] | None = None,
+ replace_existing: bool | None = None,
+ only_if_newer: bool = False,
+ timeout: float | None = None,
+ chunk_callback: Callable[[bytes], Any] | None = None,
+ response_callback: Callable[[_Response_T], Any] | None = None,
+) -> bytes:
+ ...
+
+
async def adownload(
url: str,
*,
to_path: Path | str | None = None,
+ query: dict[str, str] | None = None,
+ headers: dict[str, str] | None = None,
replace_existing: bool | None = None,
only_if_newer: bool = False,
timeout: float | None = None,
@@ -231,7 +265,8 @@ async def adownload(
raise FileExistsError(23, "Would replace existing file", str(to_path))
async with asession() as s:
- headers = {}
+ if headers is None:
+ headers = {}
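+        # Note: the If-Modified-Since entry below is added to the caller's
+        # dict in place when headers was provided.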
if file_exists and only_if_newer:
assert to_path
file_lastmod = _last_modified_from_file(to_path)
@@ -239,7 +274,9 @@ async def adownload(
file_lastmod, usegmt=True
)
- req = s.build_request(method="GET", url=url, headers=headers, timeout=timeout)
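+    # params lets httpx URL-encode the query mapping into the request URL.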
+ req = s.build_request(
+ method="GET", url=url, params=query, headers=headers, timeout=timeout
+ )
log.debug("⚡️ Loading %s (%a) ...", req.url, dict(req.headers))
resp = await s.send(req, follow_redirects=True, stream=True)