fix: find next rating page
parent 738799cc74
commit d385860ca9

3 changed files with 78 additions and 38 deletions
BIN  tests/fixtures/ratings-ur655321.html.bz2 (vendored, new file)
Binary file not shown.
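Note: a fixture like this one can be regenerated by bz2-compressing a locally
saved copy of the ratings page; a minimal sketch, with illustrative paths:

    import bz2
    from pathlib import Path

    # Page saved from a browser; compress it into the test fixture location.
    html = Path("ratings-ur655321.html").read_bytes()
    Path("tests/fixtures/ratings-ur655321.html.bz2").write_bytes(bz2.compress(html))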
@@ -65,3 +65,20 @@ async def test_load_top_250(monkeypatch):
     movie_ids = await imdb.load_top_250()
     assert len(movie_ids) == 250
     assert all(id_.startswith("tt") for id_ in movie_ids)
+
+
+@pytest.mark.asyncio
+async def test_load_ratings_page(monkeypatch):
+    with bz2.open(fixturesdir / "ratings-ur655321.html.bz2", "rb") as f:
+        html = f.read()
+    soup = bs4.BeautifulSoup(html, "html5lib")
+
+    monkeypatch.setattr(imdb, "asoup_from_url", AsyncMock(return_value=soup))
+
+    page = await imdb.load_ratings_page("fakeurl")
+    assert len(page.ratings) == 100
+    assert page.imdb_user_id is not None
+    assert page.imdb_user_id == "ur655321"
+    assert page.imdb_user_name == "AlexUltra"
+    assert page.next_page_url is not None
+    assert page.next_page_url.startswith("/user/ur655321/ratings?")
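Note: the final assertion works on a *relative* next_page_url because the page
was loaded under the placeholder base "fakeurl": urljoin() has no scheme or
host to resolve against, so the href from the fixture passes through
unchanged. A minimal demonstration (the query string is illustrative):

    from urllib.parse import urljoin

    # With no scheme/host in the base, an absolute-path href is returned as-is.
    print(urljoin("fakeurl", "/user/ur655321/ratings?sort=date"))
    # -> /user/ur655321/ratings?sort=date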
@@ -2,6 +2,7 @@ import json
 import logging
 import re
 from collections import namedtuple
+from dataclasses import dataclass, field
 from datetime import datetime
 from typing import AsyncIterable
 from urllib.parse import urljoin
@@ -48,7 +49,7 @@ async def refresh_user_ratings_from_imdb(stop_on_dupe: bool = True):
     log.info("⚡️ Loading data for %s ...", user.name)
 
     try:
-        async for rating, is_updated in load_ratings(user.imdb_id):
+        async for rating, is_updated in load_and_store_ratings(user.imdb_id):
             assert rating.user is not None and rating.user.id == user.id
 
             if stop_on_dupe and not is_updated:
@@ -154,27 +155,35 @@ def movie_and_rating_from_item(item: bs4.Tag) -> tuple[Movie, Rating]:
 
 ForgedRequest = namedtuple("ForgedRequest", "url headers")
 
+MovieId = str  # ttXXXXXXXX
+UserId = str  # urXXXXXXXX
+
 
-async def parse_page(url: str) -> tuple[list[Rating], str | None]:
-    ratings = []
+@dataclass
+class RatingsPage:
+    ratings: list[Rating] = field(default_factory=list)
+    next_page_url: str | None = None
+    imdb_user_id: UserId | None = None
+    imdb_user_name: str | None = None
+
+
+async def load_ratings_page(url: str) -> RatingsPage:
+    page = RatingsPage()
 
     soup = await asoup_from_url(url)
 
     if (meta := soup.find("meta", property="pageId")) is None:
         raise RuntimeError("No pageId found.")
     assert isinstance(meta, bs4.Tag)
-    imdb_id = meta["content"]
-    assert isinstance(imdb_id, str)
-    async with db.new_connection() as conn:
-        user = await db.get(conn, User, imdb_id=imdb_id) or User(
-            imdb_id=imdb_id, name="", secret=""
-        )
+    if isinstance(page_id := meta["content"], list):
+        page_id = page_id[0]
+    page.imdb_user_id = page_id
 
     if (headline := soup.h1) is None:
         raise RuntimeError("No headline found.")
     assert isinstance(headline.string, str)
     if match := find_name(headline.string):
-        user.name = match["name"]
+        page.imdb_user_name = match["name"]
 
     items = soup.find_all("div", "lister-item-content")
     for i, item in enumerate(items):
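Note: parse_page() used to return a (ratings, next_url) tuple and wrote to the
database as a side effect; load_ratings_page() now returns a plain RatingsPage
value with no I/O beyond the fetch. A minimal sketch of how it might be
consumed (URL illustrative, assuming the module is importable as imdb):

    import asyncio

    import imdb

    async def main() -> None:
        page = await imdb.load_ratings_page("https://www.imdb.com/user/ur655321/ratings")
        print(page.imdb_user_id, page.imdb_user_name, len(page.ratings))
        # next_page_url is absolute here, since urljoin() resolves the
        # footer href against the real page URL.
        if page.next_page_url:
            page = await imdb.load_ratings_page(page.next_page_url)

    asyncio.run(main())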
@@ -191,48 +200,62 @@ async def parse_page(url: str) -> tuple[list[Rating], str | None]:
             )
             continue
 
-        rating.user = user
         rating.movie = movie
 
-        ratings.append(rating)
+        page.ratings.append(rating)
 
-    next_url = None
     if (footer := soup.find("div", "footer")) is None:
         raise RuntimeError("No footer found.")
     assert isinstance(footer, bs4.Tag)
-    if (next_link := footer.find("a", string="Next")) is not None:
+    if (next_link := footer.find("a", string=re.compile("Next"))) is not None:
         assert isinstance(next_link, bs4.Tag)
         next_href = next_link["href"]
         assert isinstance(next_href, str)
-        next_url = urljoin(url, next_href)
+        page.next_page_url = urljoin(url, next_href)
 
-    return (ratings, next_url if url != next_url else None)
+    return page
 
 
-async def load_ratings(user_id: str) -> AsyncIterable[tuple[Rating, bool]]:
+async def load_and_store_ratings(
+    user_id: UserId,
+) -> AsyncIterable[tuple[Rating, bool]]:
+    async with db.new_connection() as conn:
+        user = await db.get(conn, User, imdb_id=user_id) or User(
+            imdb_id=user_id, name="", secret=""
+        )
+
+    is_first = True
+    async for rating in load_ratings(user_id):
+        assert rating.movie
+
+        rating.user = user
+
+        async with db.transaction() as conn:
+            if is_first:
+                is_first = False
+                # All rating objects share the same user.
+                await db.add_or_update_user(conn, rating.user)
+            rating.user_id = rating.user.id
+
+            await db.add_or_update_movie(conn, rating.movie)
+            rating.movie_id = rating.movie.id
+
+            is_updated = await db.add_or_update_rating(conn, rating)
+
+        yield rating, is_updated
+
+
+async def load_ratings(user_id: UserId) -> AsyncIterable[Rating]:
     next_url = user_ratings_url(user_id)
 
     while next_url:
-        ratings, next_url = await parse_page(next_url)
-        for i, rating in enumerate(ratings):
-            assert rating.user and rating.movie
-
-            async with db.transaction() as conn:
-                if i == 0:
-                    # All rating objects share the same user.
-                    await db.add_or_update_user(conn, rating.user)
-                rating.user_id = rating.user.id
-
-                await db.add_or_update_movie(conn, rating.movie)
-                rating.movie_id = rating.movie.id
-
-                is_updated = await db.add_or_update_rating(conn, rating)
-
-            yield rating, is_updated
+        ratings_page = await load_ratings_page(next_url)
+        next_url = ratings_page.next_page_url
+        for rating in ratings_page.ratings:
+            yield rating
 
 
-async def _ids_from_list_html(url: str) -> AsyncIterable[str]:
+async def _ids_from_list_html(url: str) -> AsyncIterable[MovieId]:
     """Return all IMDb movie IDs (`tt*`) from the given URL."""
     # document.querySelectorAll('li.ipc-metadata-list-summary-item a.ipc-title-link-wrapper')
     # .href: '/title/tt1213644/?ref_=chtbtm_t_1'
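Note: the fix the commit title refers to is string=re.compile("Next").
BeautifulSoup compares a plain string argument against the tag's text for
exact equality, while a compiled pattern is matched with re.search(), so the
lookup still succeeds when the link label is not exactly "Next" (e.g.
"Next »" or text padded with whitespace). A minimal demonstration with
illustrative markup:

    import re

    import bs4

    footer = bs4.BeautifulSoup(
        '<div class="footer"><a href="?page=2">Next &#187;</a></div>', "html5lib"
    )
    print(footer.find("a", string="Next"))              # None -- text is "Next »"
    print(footer.find("a", string=re.compile("Next")))  # finds the link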
@@ -245,7 +268,7 @@ async def _ids_from_list_html(url: str) -> AsyncIterable[str]:
         yield match_["id"]
 
 
-async def load_most_popular_100() -> list[str]:
+async def load_most_popular_100() -> list[MovieId]:
     """Return the IMDb's top 100 most popular movies.
 
     IMDb Charts: Most Popular Movies
@@ -258,7 +281,7 @@ async def load_most_popular_100() -> list[str]:
     return ids
 
 
-async def load_bottom_100() -> list[str]:
+async def load_bottom_100() -> list[MovieId]:
     """Return the IMDb's bottom 100 lowest rated movies.
 
     IMDb Charts: Lowest Rated Movies
@@ -271,7 +294,7 @@ async def load_bottom_100() -> list[str]:
     return ids
 
 
-async def load_top_250() -> list[str]:
+async def load_top_250() -> list[MovieId]:
     """Return the IMDb's top 250 highest rated movies.
 
     IMDb Charts: IMDb Top 250 Movies
|
||||||
Loading…
Add table
Add a link
Reference in a new issue