From 8d20cc040ebd07801bf84ddb2606afe37434b029 Mon Sep 17 00:00:00 2001 From: ducklet Date: Wed, 21 Jul 2021 20:04:57 +0200 Subject: [PATCH] add IMDb vote count to movies --- unwind/db.py | 7 ++++--- unwind/imdb.py | 2 +- unwind/imdb_import.py | 13 ++++++------ unwind/models.py | 3 ++- unwind/sql/20210720-213416.sql | 36 ++++++++++++++++++++++++++++++++++ unwind/web.py | 1 + 6 files changed, 51 insertions(+), 11 deletions(-) create mode 100644 unwind/sql/20210720-213416.sql diff --git a/unwind/db.py b/unwind/db.py index 7ab6346..f93db6e 100644 --- a/unwind/db.py +++ b/unwind/db.py @@ -379,7 +379,7 @@ async def find_ratings( FROM {Rating._table} LEFT JOIN {Movie._table} ON {Movie._table}.id={Rating._table}.movie_id WHERE {user_condition}{(' AND ' + ' AND '.join(conditions)) if conditions else ''} - ORDER BY length({Movie._table}.title) ASC, {Rating._table}.rating_date DESC, {Movie._table}.score DESC + ORDER BY length({Movie._table}.title) ASC, {Rating._table}.rating_date DESC, {Movie._table}.imdb_score DESC LIMIT :limit_rows )""" ] @@ -393,7 +393,7 @@ async def find_ratings( FROM {Movie._table} WHERE id NOT IN newest_movies {('AND ' + ' AND '.join(conditions)) if conditions else ''} - ORDER BY length(title) ASC, score DESC, release_year DESC + ORDER BY length(title) ASC, imdb_score DESC, release_year DESC LIMIT :limit_rows )""", f"""{source_table} AS ( @@ -412,7 +412,8 @@ async def find_ratings( SELECT {Rating._table}.score AS user_score, {Rating._table}.user_id AS user_id, - {Movie._table}.score AS imdb_score, + {Movie._table}.imdb_score, + {Movie._table}.imdb_votes, {Movie._table}.imdb_id AS movie_imdb_id, {Movie._table}.media_type AS media_type, {Movie._table}.title AS canonical_title, diff --git a/unwind/imdb.py b/unwind/imdb.py index 35368e5..a110ece 100644 --- a/unwind/imdb.py +++ b/unwind/imdb.py @@ -136,7 +136,7 @@ def movie_and_rating_from_item(item) -> tuple[Movie, Rating]: rating.score = score_from_imdb_rating(float(rating_item.string)) if match := ratings_item.find("div", "ipl-rating-star small"): if rating_item := match.find("span", "ipl-rating-star__rating"): - movie.score = score_from_imdb_rating(float(rating_item.string)) + movie.imdb_score = score_from_imdb_rating(float(rating_item.string)) return movie, rating diff --git a/unwind/imdb_import.py b/unwind/imdb_import.py index a34e931..94b6830 100644 --- a/unwind/imdb_import.py +++ b/unwind/imdb_import.py @@ -56,7 +56,7 @@ class BasicRow: release_year=self.startYear, media_type=title_types[self.titleType], imdb_id=self.tconst, - score=None, + imdb_score=None, runtime=self.runtimeMinutes, genres=self.genres or set(), updated=None, # optimization: skip default factory @@ -79,7 +79,8 @@ class RatingRow: def as_movie(self): return Movie( imdb_id=self.tconst, - score=score_from_imdb_rating(self.averageRating), + imdb_score=score_from_imdb_rating(self.averageRating), + imdb_votes=self.numVotes, updated=None, # optimization: skip default factory id=None, # optimization: skip default factory ) @@ -158,11 +159,11 @@ def read_ratings(path): yield m -def read_ratings_as_scoremap(path): +def read_ratings_as_mapping(path): """Optimized function to quickly load all ratings.""" rows = read_imdb_tsv(path, RatingRow, unpack=False) rows = cast(list[list[str]], rows) - return {r[0]: round(100 * (float(r[1]) - 1) / 9) for r in rows} + return {r[0]: (round(100 * (float(r[1]) - 1) / 9), int(r[2])) for r in rows} def read_basics(path): @@ -181,7 +182,7 @@ def read_basics(path): async def import_from_file(*, basics_path: Path, ratings_path: Path): log.info("💾 Loading scores ...") - scores = read_ratings_as_scoremap(ratings_path) + ratings = read_ratings_as_mapping(ratings_path) log.info("💾 Importing movies ...") total = count_lines(basics_path) @@ -212,7 +213,7 @@ async def import_from_file(*, basics_path: Path, ratings_path: Path): log.debug("Skipping movie, unwanted media type: %s", m.media_type) continue - m.score = scores.get(m.imdb_id) + m.imdb_score, m.imdb_votes = ratings.get(m.imdb_id, [None, None]) chunk.append(m) if len(chunk) > 1000: diff --git a/unwind/models.py b/unwind/models.py index 2789583..b3aae26 100644 --- a/unwind/models.py +++ b/unwind/models.py @@ -152,7 +152,8 @@ class Movie: release_year: int = None # canonical release date media_type: str = None imdb_id: str = None - score: Optional[int] = None # range: [0,100] + imdb_score: Optional[int] = None # range: [0,100] + imdb_votes: Optional[int] = None runtime: Optional[int] = None # minutes genres: set[str] = None updated: datetime = field(default_factory=utcnow) diff --git a/unwind/sql/20210720-213416.sql b/unwind/sql/20210720-213416.sql new file mode 100644 index 0000000..286e094 --- /dev/null +++ b/unwind/sql/20210720-213416.sql @@ -0,0 +1,36 @@ +-- add IMDb vote count + +CREATE TABLE _migrate_movies ( + id TEXT PRIMARY KEY NOT NULL, + title TEXT NOT NULL, + original_title TEXT, + release_year INTEGER NOT NULL, + media_type TEXT NOT NULL, + imdb_id TEXT NOT NULL UNIQUE, + imdb_score INTEGER, + imdb_votes INTEGER, + runtime INTEGER, + genres TEXT NOT NULL, + updated TEXT NOT NULL +);; + +INSERT INTO _migrate_movies +SELECT + id, + title, + original_title, + release_year, + media_type, + imdb_id, + score AS imdb_score, + NULL AS imdb_votes, + runtime, + genres, + updated +FROM movies +WHERE true;; + +DROP TABLE movies;; + +ALTER TABLE _migrate_movies +RENAME TO movies;; diff --git a/unwind/web.py b/unwind/web.py index ec72e68..367dc68 100644 --- a/unwind/web.py +++ b/unwind/web.py @@ -154,6 +154,7 @@ async def get_ratings_for_group(request): "link": imdb.movie_url(r["movie_imdb_id"]), "user_scores": [], "imdb_score": r["imdb_score"], + "imdb_votes": r["imdb_votes"], "media_type": r["media_type"], }, )