add IMDb vote count to movies

This commit is contained in:
ducklet 2021-07-21 20:04:57 +02:00
parent af25d9c5a2
commit 8d20cc040e
6 changed files with 51 additions and 11 deletions

View file

@ -379,7 +379,7 @@ async def find_ratings(
FROM {Rating._table} FROM {Rating._table}
LEFT JOIN {Movie._table} ON {Movie._table}.id={Rating._table}.movie_id LEFT JOIN {Movie._table} ON {Movie._table}.id={Rating._table}.movie_id
WHERE {user_condition}{(' AND ' + ' AND '.join(conditions)) if conditions else ''} WHERE {user_condition}{(' AND ' + ' AND '.join(conditions)) if conditions else ''}
ORDER BY length({Movie._table}.title) ASC, {Rating._table}.rating_date DESC, {Movie._table}.score DESC ORDER BY length({Movie._table}.title) ASC, {Rating._table}.rating_date DESC, {Movie._table}.imdb_score DESC
LIMIT :limit_rows LIMIT :limit_rows
)""" )"""
] ]
@ -393,7 +393,7 @@ async def find_ratings(
FROM {Movie._table} FROM {Movie._table}
WHERE id NOT IN newest_movies WHERE id NOT IN newest_movies
{('AND ' + ' AND '.join(conditions)) if conditions else ''} {('AND ' + ' AND '.join(conditions)) if conditions else ''}
ORDER BY length(title) ASC, score DESC, release_year DESC ORDER BY length(title) ASC, imdb_score DESC, release_year DESC
LIMIT :limit_rows LIMIT :limit_rows
)""", )""",
f"""{source_table} AS ( f"""{source_table} AS (
@ -412,7 +412,8 @@ async def find_ratings(
SELECT SELECT
{Rating._table}.score AS user_score, {Rating._table}.score AS user_score,
{Rating._table}.user_id AS user_id, {Rating._table}.user_id AS user_id,
{Movie._table}.score AS imdb_score, {Movie._table}.imdb_score,
{Movie._table}.imdb_votes,
{Movie._table}.imdb_id AS movie_imdb_id, {Movie._table}.imdb_id AS movie_imdb_id,
{Movie._table}.media_type AS media_type, {Movie._table}.media_type AS media_type,
{Movie._table}.title AS canonical_title, {Movie._table}.title AS canonical_title,

View file

@ -136,7 +136,7 @@ def movie_and_rating_from_item(item) -> tuple[Movie, Rating]:
rating.score = score_from_imdb_rating(float(rating_item.string)) rating.score = score_from_imdb_rating(float(rating_item.string))
if match := ratings_item.find("div", "ipl-rating-star small"): if match := ratings_item.find("div", "ipl-rating-star small"):
if rating_item := match.find("span", "ipl-rating-star__rating"): if rating_item := match.find("span", "ipl-rating-star__rating"):
movie.score = score_from_imdb_rating(float(rating_item.string)) movie.imdb_score = score_from_imdb_rating(float(rating_item.string))
return movie, rating return movie, rating

View file

@ -56,7 +56,7 @@ class BasicRow:
release_year=self.startYear, release_year=self.startYear,
media_type=title_types[self.titleType], media_type=title_types[self.titleType],
imdb_id=self.tconst, imdb_id=self.tconst,
score=None, imdb_score=None,
runtime=self.runtimeMinutes, runtime=self.runtimeMinutes,
genres=self.genres or set(), genres=self.genres or set(),
updated=None, # optimization: skip default factory updated=None, # optimization: skip default factory
@ -79,7 +79,8 @@ class RatingRow:
def as_movie(self): def as_movie(self):
return Movie( return Movie(
imdb_id=self.tconst, imdb_id=self.tconst,
score=score_from_imdb_rating(self.averageRating), imdb_score=score_from_imdb_rating(self.averageRating),
imdb_votes=self.numVotes,
updated=None, # optimization: skip default factory updated=None, # optimization: skip default factory
id=None, # optimization: skip default factory id=None, # optimization: skip default factory
) )
@ -158,11 +159,11 @@ def read_ratings(path):
yield m yield m
def read_ratings_as_scoremap(path): def read_ratings_as_mapping(path):
"""Optimized function to quickly load all ratings.""" """Optimized function to quickly load all ratings."""
rows = read_imdb_tsv(path, RatingRow, unpack=False) rows = read_imdb_tsv(path, RatingRow, unpack=False)
rows = cast(list[list[str]], rows) rows = cast(list[list[str]], rows)
return {r[0]: round(100 * (float(r[1]) - 1) / 9) for r in rows} return {r[0]: (round(100 * (float(r[1]) - 1) / 9), int(r[2])) for r in rows}
def read_basics(path): def read_basics(path):
@ -181,7 +182,7 @@ def read_basics(path):
async def import_from_file(*, basics_path: Path, ratings_path: Path): async def import_from_file(*, basics_path: Path, ratings_path: Path):
log.info("💾 Loading scores ...") log.info("💾 Loading scores ...")
scores = read_ratings_as_scoremap(ratings_path) ratings = read_ratings_as_mapping(ratings_path)
log.info("💾 Importing movies ...") log.info("💾 Importing movies ...")
total = count_lines(basics_path) total = count_lines(basics_path)
@ -212,7 +213,7 @@ async def import_from_file(*, basics_path: Path, ratings_path: Path):
log.debug("Skipping movie, unwanted media type: %s", m.media_type) log.debug("Skipping movie, unwanted media type: %s", m.media_type)
continue continue
m.score = scores.get(m.imdb_id) m.imdb_score, m.imdb_votes = ratings.get(m.imdb_id, [None, None])
chunk.append(m) chunk.append(m)
if len(chunk) > 1000: if len(chunk) > 1000:

View file

@ -152,7 +152,8 @@ class Movie:
release_year: int = None # canonical release date release_year: int = None # canonical release date
media_type: str = None media_type: str = None
imdb_id: str = None imdb_id: str = None
score: Optional[int] = None # range: [0,100] imdb_score: Optional[int] = None # range: [0,100]
imdb_votes: Optional[int] = None
runtime: Optional[int] = None # minutes runtime: Optional[int] = None # minutes
genres: set[str] = None genres: set[str] = None
updated: datetime = field(default_factory=utcnow) updated: datetime = field(default_factory=utcnow)

View file

@ -0,0 +1,36 @@
-- add IMDb vote count
--
-- Rebuilds the movies table in four steps: create a replacement table with
-- the new layout, copy every existing row into it, drop the old table, and
-- rename the replacement to movies.
-- Compared to the old table, the column score is carried over under the
-- name imdb_score, and a new nullable column imdb_votes is added.
-- NOTE(review): every statement ends with a doubled semicolon, presumably
-- the statement separator expected by the migration runner - confirm.

-- Step 1: replacement table. The column order here matters, because the
-- INSERT below has no column list and therefore fills columns by position.
CREATE TABLE _migrate_movies (
id TEXT PRIMARY KEY NOT NULL,
title TEXT NOT NULL,
original_title TEXT,
release_year INTEGER NOT NULL,
media_type TEXT NOT NULL,
imdb_id TEXT NOT NULL UNIQUE,
-- the two IMDb columns are nullable
imdb_score INTEGER,
imdb_votes INTEGER,
runtime INTEGER,
genres TEXT NOT NULL,
updated TEXT NOT NULL
);;
-- Step 2: copy all rows. The SELECT list is in the same order as the
-- columns declared above.
INSERT INTO _migrate_movies
SELECT
id,
title,
original_title,
release_year,
media_type,
imdb_id,
-- old column score becomes imdb_score
score AS imdb_score,
-- no vote counts exist in the old table, so existing rows get NULL
NULL AS imdb_votes,
runtime,
genres,
updated
FROM movies
-- always-true condition: no rows are filtered out
WHERE true;;
-- Step 3: remove the old table now that its rows have been copied.
DROP TABLE movies;;
-- Step 4: give the replacement table the original name.
ALTER TABLE _migrate_movies
RENAME TO movies;;

View file

@ -154,6 +154,7 @@ async def get_ratings_for_group(request):
"link": imdb.movie_url(r["movie_imdb_id"]), "link": imdb.movie_url(r["movie_imdb_id"]),
"user_scores": [], "user_scores": [],
"imdb_score": r["imdb_score"], "imdb_score": r["imdb_score"],
"imdb_votes": r["imdb_votes"],
"media_type": r["media_type"], "media_type": r["media_type"],
}, },
) )