feat: add functions to retrieve IMDb chart lists
These charts are:

- the top 250 highest rated movies
- the top 100 most popular movies
- the bottom 100 lowest rated movies
commit 2bf5607183 (parent 4fbdb26d9c)

7 changed files with 176 additions and 10 deletions
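Taken together, the new loaders give a small public API. A minimal usage sketch, assuming the functions can be awaited directly the way the tests below exercise them (real use needs network access and IMDb's current markup):

```python
# Sketch only: fetch all three charts concurrently with the new API.
import asyncio

from unwind import imdb


async def main() -> None:
    top, popular, bottom = await asyncio.gather(
        imdb.load_top_250(),
        imdb.load_most_popular_100(),
        imdb.load_bottom_100(),
    )
    # Each result is a list of IMDb title IDs such as "tt0111161".
    print(len(top), len(popular), len(bottom))  # expected: 250 100 100


asyncio.run(main())
```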
BIN  tests/fixtures/bottom_100.html.bz2       (vendored, new file; binary file not shown)
BIN  tests/fixtures/most_popular_100.html.bz2 (vendored, new file; binary file not shown)
BIN  tests/fixtures/top250.gql.json.bz2       (vendored, new file; binary file not shown)
@@ -1,7 +1,16 @@
+import bz2
+from pathlib import Path
+from unittest.mock import AsyncMock
+
+import bs4
 import pytest
 
+from unwind import imdb
 from unwind.imdb import imdb_rating_from_score, score_from_imdb_rating
 
+testsdir = Path(__file__).parent
+fixturesdir = testsdir / "fixtures"
+
 
 @pytest.mark.parametrize("rating", (x / 10 for x in range(10, 101)))
 def test_rating_conversion(rating: float):
@@ -18,3 +27,41 @@ def test_score_conversion(score: int):
         pytest.skip(f"Score cannot be mapped back correctly: {score}")
 
     assert score == score_from_imdb_rating(imdb_rating_from_score(score))
+
+
+@pytest.mark.asyncio
+async def test_load_most_popular_100(monkeypatch):
+    with bz2.open(fixturesdir / "most_popular_100.html.bz2", "rb") as f:
+        html = f.read()
+    soup = bs4.BeautifulSoup(html, "html5lib")
+
+    monkeypatch.setattr(imdb, "asoup_from_url", AsyncMock(return_value=soup))
+
+    movie_ids = await imdb.load_most_popular_100()
+    assert len(movie_ids) == 100
+    assert all(id_.startswith("tt") for id_ in movie_ids)
+
+
+@pytest.mark.asyncio
+async def test_load_bottom_100(monkeypatch):
+    with bz2.open(fixturesdir / "bottom_100.html.bz2", "rb") as f:
+        html = f.read()
+    soup = bs4.BeautifulSoup(html, "html5lib")
+
+    monkeypatch.setattr(imdb, "asoup_from_url", AsyncMock(return_value=soup))
+
+    movie_ids = await imdb.load_bottom_100()
+    assert len(movie_ids) == 100
+    assert all(id_.startswith("tt") for id_ in movie_ids)
+
+
+@pytest.mark.asyncio
+async def test_load_top_250(monkeypatch):
+    with bz2.open(fixturesdir / "top250.gql.json.bz2", "rb") as f:
+        jsonstr = f.read()
+
+    monkeypatch.setattr(imdb, "adownload", AsyncMock(return_value=jsonstr))
+
+    movie_ids = await imdb.load_top_250()
+    assert len(movie_ids) == 250
+    assert all(id_.startswith("tt") for id_ in movie_ids)
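The tests replay bz2-compressed snapshots of the live pages instead of hitting the network. The commit does not show how the fixtures were captured; one plausible way to regenerate them is sketched below (hypothetical; IMDb may reject requests without a browser-like User-Agent):

```python
# Hypothetical fixture regeneration -- not part of this commit.
import bz2

import httpx

resp = httpx.get("https://www.imdb.com/chart/bottom/", follow_redirects=True)
resp.raise_for_status()
with bz2.open("tests/fixtures/bottom_100.html.bz2", "wb") as f:
    f.write(resp.content)
```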
@@ -1,14 +1,16 @@
+import json
 import logging
 import re
 from collections import namedtuple
 from datetime import datetime
+from typing import AsyncIterable
 from urllib.parse import urljoin
 
 import bs4
 
 from . import db
 from .models import Movie, Rating, User
-from .request import asession, asoup_from_url, cache_path
+from .request import adownload, asession, asoup_from_url, cache_path
 
 log = logging.getLogger(__name__)
 
@@ -207,7 +209,7 @@ async def parse_page(url: str) -> tuple[list[Rating], str | None]:
     return (ratings, next_url if url != next_url else None)
 
 
-async def load_ratings(user_id: str):
+async def load_ratings(user_id: str) -> AsyncIterable[tuple[Rating, bool]]:
     next_url = user_ratings_url(user_id)
 
     while next_url:
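The new annotation documents that load_ratings is an async generator of (rating, was_updated) pairs. A consumption sketch (the user id is made up):

```python
# Sketch: consuming the annotated async generator (user id is made up).
from unwind.imdb import load_ratings


async def sync_user() -> None:
    async for rating, is_updated in load_ratings("ur0000000"):
        if is_updated:
            print("updated:", rating)
```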
@@ -228,3 +230,74 @@ async def load_ratings(user_id: str):
             is_updated = await db.add_or_update_rating(conn, rating)
 
             yield rating, is_updated
+
+
+async def _ids_from_list_html(url: str) -> AsyncIterable[str]:
+    """Return all IMDb movie IDs (`tt*`) from the given URL."""
+    # document.querySelectorAll('li.ipc-metadata-list-summary-item a.ipc-title-link-wrapper')
+    # .href: '/title/tt1213644/?ref_=chtbtm_t_1'
+    # .text(): '1. Disaster Movie'
+    soup = await asoup_from_url(url)
+    for item in soup.find_all("li", class_="ipc-metadata-list-summary-item"):
+        if (link := item.find("a", class_="ipc-title-link-wrapper")) is not None:
+            if (href := link.get("href")) is not None:
+                if match_ := find_movie_id(href):
+                    yield match_["id"]
+
+
+async def load_most_popular_100() -> list[str]:
+    """Return IMDb's top 100 most popular movies.
+
+    IMDb Charts: Most Popular Movies
+    As determined by IMDb users
+    """
+    url = "https://www.imdb.com/chart/moviemeter/"
+    ids = [tid async for tid in _ids_from_list_html(url)]
+    if len(ids) != 100:
+        raise RuntimeError(f"Expected exactly 100 items, got {len(ids)}")
+    return ids
+
+
+async def load_bottom_100() -> list[str]:
+    """Return IMDb's bottom 100 lowest rated movies.
+
+    IMDb Charts: Lowest Rated Movies
+    Bottom 100 as voted by IMDb users
+    """
+    url = "https://www.imdb.com/chart/bottom/"
+    ids = [tid async for tid in _ids_from_list_html(url)]
+    if len(ids) != 100:
+        raise RuntimeError(f"Expected exactly 100 items, got {len(ids)}")
+    return ids
+
+
+async def load_top_250() -> list[str]:
+    """Return IMDb's top 250 highest rated movies.
+
+    IMDb Charts: IMDb Top 250 Movies
+    As rated by regular IMDb voters.
+    """
+    # Called from page https://www.imdb.com/chart/top/
+    gql_api_url = "https://caching.graphql.imdb.com/"
+    query = {
+        "operationName": "Top250MoviesPagination",
+        "variables": r'{"first":250,"locale":"en-US"}',
+        "extensions": r'{"persistedQuery":{"sha256Hash":"26114ee01d97e04f65d6c8c7212ae8b7888fa57ceed105450d1fce09df749b2d","version":1}}',
+    }
+    headers = {"content-type": "application/json"}
+    jsonstr = await adownload(gql_api_url, query=query, headers=headers)
+    data = json.loads(jsonstr)
+    try:
+        imdb_title_ids = [
+            edge["node"]["id"] for edge in data["data"]["chartTitles"]["edges"]
+        ]
+        has_next_page = data["data"]["chartTitles"]["pageInfo"]["hasNextPage"]
+        has_previous_page = data["data"]["chartTitles"]["pageInfo"]["hasPreviousPage"]
+    except KeyError as err:
+        log.error("Unexpected data structure.", exc_info=err)
+        raise
+
+    if len(imdb_title_ids) != 250 or has_next_page or has_previous_page:
+        raise RuntimeError(f"Expected exactly 250 items, got {len(imdb_title_ids)}")
+
+    return imdb_title_ids
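For reference, an abridged reconstruction of the JSON payload load_top_250 expects, with the field names taken from the parsing code above (not a captured response; the title IDs are illustrative):

```python
# Abridged reconstruction of the expected GraphQL payload; the same
# extraction expression as in load_top_250 pulls out the title IDs.
data = {
    "data": {
        "chartTitles": {
            "edges": [
                {"node": {"id": "tt0111161"}},
                {"node": {"id": "tt0068646"}},
            ],
            "pageInfo": {"hasNextPage": False, "hasPreviousPage": False},
        }
    }
}

ids = [edge["node"]["id"] for edge in data["data"]["chartTitles"]["edges"]]
assert ids == ["tt0111161", "tt0068646"]
```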
@@ -17,7 +17,7 @@ log = logging.getLogger(__name__)
 T = TypeVar("T")
 
 # See
-# - https://www.imdb.com/interfaces/
+# - https://developer.imdb.com/non-commercial-datasets/
 # - https://datasets.imdbws.com/
 
 
@@ -254,13 +254,22 @@ async def download_datasets(*, basics_path: Path, ratings_path: Path) -> None:
     See https://www.imdb.com/interfaces/ and https://datasets.imdbws.com/ for
     more information on the IMDb database dumps.
     """
-    basics_url = "https://datasets.imdbws.com/title.basics.tsv.gz"
-    ratings_url = "https://datasets.imdbws.com/title.ratings.tsv.gz"
+    # name_basics_url = "https://datasets.imdbws.com/name.basics.tsv.gz"
+    # title_akas_url = "https://datasets.imdbws.com/title.akas.tsv.gz"
+    title_basics_url = "https://datasets.imdbws.com/title.basics.tsv.gz"
+    # title_crew_url = "https://datasets.imdbws.com/title.crew.tsv.gz"
+    # title_episode_url = "https://datasets.imdbws.com/title.episode.tsv.gz"
+    # title_principals_url = "https://datasets.imdbws.com/title.principals.tsv.gz"
+    title_ratings_url = "https://datasets.imdbws.com/title.ratings.tsv.gz"
 
     async with request.asession():
         await asyncio.gather(
-            request.adownload(ratings_url, to_path=ratings_path, only_if_newer=True),
-            request.adownload(basics_url, to_path=basics_path, only_if_newer=True),
+            request.adownload(
+                title_ratings_url, to_path=ratings_path, only_if_newer=True
+            ),
+            request.adownload(
+                title_basics_url, to_path=basics_path, only_if_newer=True
+            ),
         )
 
 
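The downloaded dumps are gzipped TSV files. A standard-library reading sketch (column names as documented for title.ratings.tsv.gz):

```python
# Sketch: iterate over the ratings dump downloaded above.
import csv
import gzip

with gzip.open("title.ratings.tsv.gz", "rt", encoding="utf-8") as f:
    for row in csv.DictReader(f, delimiter="\t"):
        # Columns: tconst, averageRating, numVotes
        if int(row["numVotes"]) > 1_000_000:
            print(row["tconst"], row["averageRating"])
```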
@@ -11,7 +11,7 @@ from hashlib import md5
 from pathlib import Path
 from random import random
 from time import sleep, time
-from typing import Any, Callable, ParamSpec, TypeVar, cast
+from typing import Any, Callable, ParamSpec, TypeVar, cast, overload
 
 import bs4
 import httpx
@@ -201,10 +201,44 @@ def _last_modified_from_file(path: Path) -> float:
     return path.stat().st_mtime
 
 
+@overload
+async def adownload(
+    url: str,
+    *,
+    to_path: Path | str,
+    query: dict[str, str] | None = None,
+    headers: dict[str, str] | None = None,
+    replace_existing: bool | None = None,
+    only_if_newer: bool = False,
+    timeout: float | None = None,
+    chunk_callback: Callable[[bytes], Any] | None = None,
+    response_callback: Callable[[_Response_T], Any] | None = None,
+) -> None:
+    ...
+
+
+@overload
+async def adownload(
+    url: str,
+    *,
+    to_path: None = None,
+    query: dict[str, str] | None = None,
+    headers: dict[str, str] | None = None,
+    replace_existing: bool | None = None,
+    only_if_newer: bool = False,
+    timeout: float | None = None,
+    chunk_callback: Callable[[bytes], Any] | None = None,
+    response_callback: Callable[[_Response_T], Any] | None = None,
+) -> bytes:
+    ...
+
+
 async def adownload(
     url: str,
     *,
     to_path: Path | str | None = None,
+    query: dict[str, str] | None = None,
     headers: dict[str, str] | None = None,
     replace_existing: bool | None = None,
     only_if_newer: bool = False,
     timeout: float | None = None,
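The two @overload signatures let a type checker pick the return type from the call site: with to_path, the body goes to disk and None comes back; without it, the body is returned as bytes. A sketch of what a checker now infers (the URL is illustrative):

```python
# Sketch: return types a type checker infers from the overloads.
from unwind.request import adownload


async def demo() -> None:
    body = await adownload("https://example.org/data.json")
    # reveal_type(body) -> bytes

    result = await adownload("https://example.org/data.json", to_path="data.json")
    # reveal_type(result) -> None
```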
@@ -231,6 +265,7 @@ async def adownload(
         raise FileExistsError(23, "Would replace existing file", str(to_path))
 
     async with asession() as s:
+        if headers is None:
+            headers = {}
         if file_exists and only_if_newer:
             assert to_path
@@ -239,7 +274,9 @@ async def adownload(
                 file_lastmod, usegmt=True
             )
 
-        req = s.build_request(method="GET", url=url, headers=headers, timeout=timeout)
+        req = s.build_request(
+            method="GET", url=url, params=query, headers=headers, timeout=timeout
+        )
 
         log.debug("⚡️ Loading %s (%a) ...", req.url, dict(req.headers))
         resp = await s.send(req, follow_redirects=True, stream=True)
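Forwarding the query dict as params= leaves the URL encoding to httpx. A standalone illustration (the URL is illustrative):

```python
# Sketch: httpx encodes params= into the request's query string.
import httpx

client = httpx.Client()
req = client.build_request("GET", "https://example.org/", params={"q": "a b"})
print(req.url)  # e.g. https://example.org/?q=a+b
```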
|
|||
Loading…
Add table
Add a link
Reference in a new issue