feat: add functions to retrieve IMDb chart lists
These charts are:
- the top 250 highest rated movies
- the top 100 most popular movies
- the bottom 100 lowest rated movies
parent 4fbdb26d9c
commit 2bf5607183
7 changed files with 176 additions and 10 deletions
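For reference, a minimal usage sketch of the new chart API (the direct calls mirror the new tests; whether a surrounding request session is required is not shown by this diff):

import asyncio

from unwind import imdb


async def main() -> None:
    # Each loader returns a list of IMDb title IDs such as "tt0111161" and
    # raises RuntimeError when the chart does not have the expected size.
    top_250 = await imdb.load_top_250()
    popular_100 = await imdb.load_most_popular_100()
    bottom_100 = await imdb.load_bottom_100()
    print(len(top_250), len(popular_100), len(bottom_100))


asyncio.run(main())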
BIN tests/fixtures/bottom_100.html.bz2 (vendored, new file; binary not shown)
BIN tests/fixtures/most_popular_100.html.bz2 (vendored, new file; binary not shown)
BIN tests/fixtures/top250.gql.json.bz2 (vendored, new file; binary not shown)
@@ -1,7 +1,16 @@
+import bz2
+from pathlib import Path
+from unittest.mock import AsyncMock
+
+import bs4
 import pytest
 
+from unwind import imdb
 from unwind.imdb import imdb_rating_from_score, score_from_imdb_rating
 
+testsdir = Path(__file__).parent
+fixturesdir = testsdir / "fixtures"
+
 
 @pytest.mark.parametrize("rating", (x / 10 for x in range(10, 101)))
 def test_rating_conversion(rating: float):
@@ -18,3 +27,41 @@ def test_score_conversion(score: int):
         pytest.skip(f"Score cannot be mapped back correctly: {score}")
 
     assert score == score_from_imdb_rating(imdb_rating_from_score(score))
+
+
+@pytest.mark.asyncio
+async def test_load_most_popular_100(monkeypatch):
+    with bz2.open(fixturesdir / "most_popular_100.html.bz2", "rb") as f:
+        html = f.read()
+    soup = bs4.BeautifulSoup(html, "html5lib")
+
+    monkeypatch.setattr(imdb, "asoup_from_url", AsyncMock(return_value=soup))
+
+    movie_ids = await imdb.load_most_popular_100()
+    assert len(movie_ids) == 100
+    assert all(id_.startswith("tt") for id_ in movie_ids)
+
+
+@pytest.mark.asyncio
+async def test_load_bottom_100(monkeypatch):
+    with bz2.open(fixturesdir / "bottom_100.html.bz2", "rb") as f:
+        html = f.read()
+    soup = bs4.BeautifulSoup(html, "html5lib")
+
+    monkeypatch.setattr(imdb, "asoup_from_url", AsyncMock(return_value=soup))
+
+    movie_ids = await imdb.load_bottom_100()
+    assert len(movie_ids) == 100
+    assert all(id_.startswith("tt") for id_ in movie_ids)
+
+
+@pytest.mark.asyncio
+async def test_load_top_250(monkeypatch):
+    with bz2.open(fixturesdir / "top250.gql.json.bz2", "rb") as f:
+        jsonstr = f.read()
+
+    monkeypatch.setattr(imdb, "adownload", AsyncMock(return_value=jsonstr))
+
+    movie_ids = await imdb.load_top_250()
+    assert len(movie_ids) == 250
+    assert all(id_.startswith("tt") for id_ in movie_ids)
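The fixtures above are bz2-compressed snapshots of live IMDb pages. A hypothetical helper for (re)generating them; the httpx fetch and the helper itself are assumptions for illustration, not part of this commit:

import bz2
from pathlib import Path

import httpx


def save_fixture(url: str, name: str) -> None:
    # Download a chart page and store it bz2-compressed under tests/fixtures/.
    html = httpx.get(url, follow_redirects=True).content
    (Path("tests/fixtures") / name).write_bytes(bz2.compress(html))


# save_fixture("https://www.imdb.com/chart/bottom/", "bottom_100.html.bz2")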
@@ -1,14 +1,16 @@
+import json
 import logging
 import re
 from collections import namedtuple
 from datetime import datetime
+from typing import AsyncIterable
 from urllib.parse import urljoin
 
 import bs4
 
 from . import db
 from .models import Movie, Rating, User
-from .request import asession, asoup_from_url, cache_path
+from .request import adownload, asession, asoup_from_url, cache_path
 
 log = logging.getLogger(__name__)
 
@@ -207,7 +209,7 @@ async def parse_page(url: str) -> tuple[list[Rating], str | None]:
     return (ratings, next_url if url != next_url else None)
 
 
-async def load_ratings(user_id: str):
+async def load_ratings(user_id: str) -> AsyncIterable[tuple[Rating, bool]]:
     next_url = user_ratings_url(user_id)
 
     while next_url:
@@ -228,3 +230,74 @@ async def load_ratings(user_id: str):
             is_updated = await db.add_or_update_rating(conn, rating)
 
             yield rating, is_updated
+
+
+async def _ids_from_list_html(url: str) -> AsyncIterable[str]:
+    """Yield all IMDb movie IDs (`tt*`) found at the given URL."""
+    # document.querySelectorAll('li.ipc-metadata-list-summary-item a.ipc-title-link-wrapper')
+    # .href: '/title/tt1213644/?ref_=chtbtm_t_1'
+    # .text(): '1. Disaster Movie'
+    soup = await asoup_from_url(url)
+    for item in soup.find_all("li", class_="ipc-metadata-list-summary-item"):
+        if (link := item.find("a", class_="ipc-title-link-wrapper")) is not None:
+            if (href := link.get("href")) is not None:
+                if match_ := find_movie_id(href):
+                    yield match_["id"]
+
+
+async def load_most_popular_100() -> list[str]:
+    """Return IMDb's top 100 most popular movies.
+
+    IMDb Charts: Most Popular Movies
+    As determined by IMDb users
+    """
+    url = "https://www.imdb.com/chart/moviemeter/"
+    ids = [tid async for tid in _ids_from_list_html(url)]
+    if len(ids) != 100:
+        raise RuntimeError(f"Expected exactly 100 items, got {len(ids)}")
+    return ids
+
+
+async def load_bottom_100() -> list[str]:
+    """Return IMDb's bottom 100 lowest rated movies.
+
+    IMDb Charts: Lowest Rated Movies
+    Bottom 100 as voted by IMDb users
+    """
+    url = "https://www.imdb.com/chart/bottom/"
+    ids = [tid async for tid in _ids_from_list_html(url)]
+    if len(ids) != 100:
+        raise RuntimeError(f"Expected exactly 100 items, got {len(ids)}")
+    return ids
+
+
+async def load_top_250() -> list[str]:
+    """Return IMDb's top 250 highest rated movies.
+
+    IMDb Charts: IMDb Top 250 Movies
+    As rated by regular IMDb voters.
+    """
+    # Called from page https://www.imdb.com/chart/top/
+    gql_api_url = "https://caching.graphql.imdb.com/"
+    query = {
+        "operationName": "Top250MoviesPagination",
+        "variables": r'{"first":250,"locale":"en-US"}',
+        "extensions": r'{"persistedQuery":{"sha256Hash":"26114ee01d97e04f65d6c8c7212ae8b7888fa57ceed105450d1fce09df749b2d","version":1}}',
+    }
+    headers = {"content-type": "application/json"}
+    jsonstr = await adownload(gql_api_url, query=query, headers=headers)
+    data = json.loads(jsonstr)
+    try:
+        imdb_title_ids = [
+            edge["node"]["id"] for edge in data["data"]["chartTitles"]["edges"]
+        ]
+        has_next_page = data["data"]["chartTitles"]["pageInfo"]["hasNextPage"]
+        has_previous_page = data["data"]["chartTitles"]["pageInfo"]["hasPreviousPage"]
+    except KeyError as err:
+        log.error("Unexpected data structure.", exc_info=err)
+        raise
+
+    if len(imdb_title_ids) != 250 or has_next_page or has_previous_page:
+        raise RuntimeError(f"Expected exactly 250 items, got {len(imdb_title_ids)}")
+
+    return imdb_title_ids
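For orientation, the minimal response shape that load_top_250() relies on, written as a Python literal (values are illustrative; only the keys accessed above matter):

payload = {
    "data": {
        "chartTitles": {
            "edges": [
                {"node": {"id": "tt0111161"}},  # one edge per ranked title
                # ... 249 more edges ...
            ],
            "pageInfo": {"hasNextPage": False, "hasPreviousPage": False},
        }
    }
}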
@@ -17,7 +17,7 @@ log = logging.getLogger(__name__)
 T = TypeVar("T")
 
 # See
-# - https://www.imdb.com/interfaces/
+# - https://developer.imdb.com/non-commercial-datasets/
 # - https://datasets.imdbws.com/
 
 
@@ -254,13 +254,22 @@ async def download_datasets(*, basics_path: Path, ratings_path: Path) -> None:
     See https://www.imdb.com/interfaces/ and https://datasets.imdbws.com/ for
     more information on the IMDb database dumps.
     """
-    basics_url = "https://datasets.imdbws.com/title.basics.tsv.gz"
-    ratings_url = "https://datasets.imdbws.com/title.ratings.tsv.gz"
+    # name_basics_url = "https://datasets.imdbws.com/name.basics.tsv.gz"
+    # title_akas_url = "https://datasets.imdbws.com/title.akas.tsv.gz"
+    title_basics_url = "https://datasets.imdbws.com/title.basics.tsv.gz"
+    # title_crew_url = "https://datasets.imdbws.com/title.crew.tsv.gz"
+    # title_episode_url = "https://datasets.imdbws.com/title.episode.tsv.gz"
+    # title_principals_url = "https://datasets.imdbws.com/title.principals.tsv.gz"
+    title_ratings_url = "https://datasets.imdbws.com/title.ratings.tsv.gz"
 
     async with request.asession():
         await asyncio.gather(
-            request.adownload(ratings_url, to_path=ratings_path, only_if_newer=True),
-            request.adownload(basics_url, to_path=basics_path, only_if_newer=True),
+            request.adownload(
+                title_ratings_url, to_path=ratings_path, only_if_newer=True
+            ),
+            request.adownload(
+                title_basics_url, to_path=basics_path, only_if_newer=True
+            ),
         )
 
 
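A hypothetical call site for download_datasets() (the target paths are placeholders, and the function is assumed to be imported from its defining module):

import asyncio
from pathlib import Path

asyncio.run(
    download_datasets(
        basics_path=Path("data/title.basics.tsv.gz"),
        ratings_path=Path("data/title.ratings.tsv.gz"),
    )
)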
@@ -11,7 +11,7 @@ from hashlib import md5
 from pathlib import Path
 from random import random
 from time import sleep, time
-from typing import Any, Callable, ParamSpec, TypeVar, cast
+from typing import Any, Callable, ParamSpec, TypeVar, cast, overload
 
 import bs4
 import httpx
@@ -201,10 +201,44 @@ def _last_modified_from_file(path: Path) -> float:
     return path.stat().st_mtime
 
 
+@overload
+async def adownload(
+    url: str,
+    *,
+    to_path: Path | str,
+    query: dict[str, str] | None = None,
+    headers: dict[str, str] | None = None,
+    replace_existing: bool | None = None,
+    only_if_newer: bool = False,
+    timeout: float | None = None,
+    chunk_callback: Callable[[bytes], Any] | None = None,
+    response_callback: Callable[[_Response_T], Any] | None = None,
+) -> None:
+    ...
+
+
+@overload
+async def adownload(
+    url: str,
+    *,
+    to_path: None = None,
+    query: dict[str, str] | None = None,
+    headers: dict[str, str] | None = None,
+    replace_existing: bool | None = None,
+    only_if_newer: bool = False,
+    timeout: float | None = None,
+    chunk_callback: Callable[[bytes], Any] | None = None,
+    response_callback: Callable[[_Response_T], Any] | None = None,
+) -> bytes:
+    ...
+
+
 async def adownload(
     url: str,
     *,
     to_path: Path | str | None = None,
+    query: dict[str, str] | None = None,
+    headers: dict[str, str] | None = None,
     replace_existing: bool | None = None,
     only_if_newer: bool = False,
     timeout: float | None = None,
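With the two overloads above, type checkers can narrow adownload's return type at the call site; a minimal sketch (URLs are placeholders, adownload assumed in scope):

from pathlib import Path


async def demo() -> None:
    # No to_path: the response body is returned as bytes.
    body: bytes = await adownload("https://example.com/index.html")
    # With to_path: the body is written to disk and None is returned.
    await adownload("https://example.com/data.gz", to_path=Path("data.gz"))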
@@ -231,6 +265,7 @@ async def adownload(
         raise FileExistsError(23, "Would replace existing file", str(to_path))
 
     async with asession() as s:
-        headers = {}
+        if headers is None:
+            headers = {}
         if file_exists and only_if_newer:
             assert to_path
@@ -239,7 +274,9 @@ async def adownload(
                 file_lastmod, usegmt=True
             )
 
-        req = s.build_request(method="GET", url=url, headers=headers, timeout=timeout)
+        req = s.build_request(
+            method="GET", url=url, params=query, headers=headers, timeout=timeout
+        )
 
         log.debug("⚡️ Loading %s (%a) ...", req.url, dict(req.headers))
         resp = await s.send(req, follow_redirects=True, stream=True)