From 8d20cc040ebd07801bf84ddb2606afe37434b029 Mon Sep 17 00:00:00 2001
From: ducklet <ducklet@noreply.code.dumpr.org>
Date: Wed, 21 Jul 2021 20:04:57 +0200
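Subject: [PATCH] add IMDb vote count to movies

Rename the movie `score` field/column to `imdb_score` and add a nullable
`imdb_votes` field/column. The vote count is read from the IMDb ratings
file during import and returned by `get_ratings_for_group`. Includes the
SQL migration that rebuilds the `movies` table with the new columns.
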
---
 unwind/db.py                   |  7 ++++---
 unwind/imdb.py                 |  2 +-
 unwind/imdb_import.py          | 13 ++++++------
 unwind/models.py               |  3 ++-
 unwind/sql/20210720-213416.sql | 36 ++++++++++++++++++++++++++++++++++
 unwind/web.py                  |  1 +
 6 files changed, 51 insertions(+), 11 deletions(-)
 create mode 100644 unwind/sql/20210720-213416.sql

diff --git a/unwind/db.py b/unwind/db.py
index 7ab6346..f93db6e 100644
--- a/unwind/db.py
+++ b/unwind/db.py
@@ -379,7 +379,7 @@ async def find_ratings(
             FROM {Rating._table}
             LEFT JOIN {Movie._table} ON {Movie._table}.id={Rating._table}.movie_id
             WHERE {user_condition}{(' AND ' + ' AND '.join(conditions)) if conditions else ''}
-            ORDER BY length({Movie._table}.title) ASC, {Rating._table}.rating_date DESC, {Movie._table}.score DESC
+            ORDER BY length({Movie._table}.title) ASC, {Rating._table}.rating_date DESC, {Movie._table}.imdb_score DESC
             LIMIT :limit_rows
         )"""
     ]
@@ -393,7 +393,7 @@ async def find_ratings(
                     FROM {Movie._table}
                     WHERE id NOT IN newest_movies
                     {('AND ' + ' AND '.join(conditions)) if conditions else ''}
-                    ORDER BY length(title) ASC, score DESC, release_year DESC
+                    ORDER BY length(title) ASC, imdb_score DESC, release_year DESC
                     LIMIT :limit_rows
                 )""",
                 f"""{source_table} AS (
@@ -412,7 +412,8 @@ async def find_ratings(
         SELECT 
             {Rating._table}.score AS user_score,
             {Rating._table}.user_id AS user_id,
-            {Movie._table}.score AS imdb_score,
+            {Movie._table}.imdb_score AS imdb_score,
+            {Movie._table}.imdb_votes AS imdb_votes,
             {Movie._table}.imdb_id AS movie_imdb_id,
             {Movie._table}.media_type AS media_type,
             {Movie._table}.title AS canonical_title,
diff --git a/unwind/imdb.py b/unwind/imdb.py
index 35368e5..a110ece 100644
--- a/unwind/imdb.py
+++ b/unwind/imdb.py
@@ -136,7 +136,7 @@ def movie_and_rating_from_item(item) -> tuple[Movie, Rating]:
             rating.score = score_from_imdb_rating(float(rating_item.string))
     if match := ratings_item.find("div", "ipl-rating-star small"):
         if rating_item := match.find("span", "ipl-rating-star__rating"):
-            movie.score = score_from_imdb_rating(float(rating_item.string))
+            movie.imdb_score = score_from_imdb_rating(float(rating_item.string))
 
     return movie, rating
 
diff --git a/unwind/imdb_import.py b/unwind/imdb_import.py
index a34e931..94b6830 100644
--- a/unwind/imdb_import.py
+++ b/unwind/imdb_import.py
@@ -56,7 +56,7 @@ class BasicRow:
             release_year=self.startYear,
             media_type=title_types[self.titleType],
             imdb_id=self.tconst,
-            score=None,
+            imdb_score=None,
             runtime=self.runtimeMinutes,
             genres=self.genres or set(),
             updated=None,  # optimization: skip default factory
@@ -79,7 +79,8 @@ class RatingRow:
     def as_movie(self):
         return Movie(
             imdb_id=self.tconst,
-            score=score_from_imdb_rating(self.averageRating),
+            imdb_score=score_from_imdb_rating(self.averageRating),
+            imdb_votes=self.numVotes,
             updated=None,  # optimization: skip default factory
             id=None,  # optimization: skip default factory
         )
@@ -158,11 +159,11 @@ def read_ratings(path):
         yield m
 
 
-def read_ratings_as_scoremap(path):
+def read_ratings_as_mapping(path):
     """Optimized function to quickly load all ratings."""
     rows = read_imdb_tsv(path, RatingRow, unpack=False)
     rows = cast(list[list[str]], rows)
-    return {r[0]: round(100 * (float(r[1]) - 1) / 9) for r in rows}
+    return {r[0]: (round(100 * (float(r[1]) - 1) / 9), int(r[2])) for r in rows}
 
 
 def read_basics(path):
@@ -181,7 +182,7 @@ def read_basics(path):
 
 async def import_from_file(*, basics_path: Path, ratings_path: Path):
     log.info("💾 Loading scores ...")
-    scores = read_ratings_as_scoremap(ratings_path)
+    ratings = read_ratings_as_mapping(ratings_path)
 
     log.info("💾 Importing movies ...")
     total = count_lines(basics_path)
@@ -212,7 +213,7 @@ async def import_from_file(*, basics_path: Path, ratings_path: Path):
             log.debug("Skipping movie, unwanted media type: %s", m.media_type)
             continue
 
-        m.score = scores.get(m.imdb_id)
+        m.imdb_score, m.imdb_votes = ratings.get(m.imdb_id, (None, None))
         chunk.append(m)
 
         if len(chunk) > 1000:
diff --git a/unwind/models.py b/unwind/models.py
index 2789583..b3aae26 100644
--- a/unwind/models.py
+++ b/unwind/models.py
@@ -152,7 +152,8 @@ class Movie:
     release_year: int = None  # canonical release date
     media_type: str = None
     imdb_id: str = None
-    score: Optional[int] = None  # range: [0,100]
+    imdb_score: Optional[int] = None  # range: [0,100]
+    imdb_votes: Optional[int] = None
     runtime: Optional[int] = None  # minutes
     genres: set[str] = None
     updated: datetime = field(default_factory=utcnow)
diff --git a/unwind/sql/20210720-213416.sql b/unwind/sql/20210720-213416.sql
new file mode 100644
index 0000000..286e094
--- /dev/null
+++ b/unwind/sql/20210720-213416.sql
@@ -0,0 +1,36 @@
+-- add IMDb vote count: rebuild movies (create/copy/drop/rename) with score renamed to imdb_score and a new nullable imdb_votes (NULL for existing rows)
+-- NOTE(review): DROP TABLE also removes any indexes/triggers defined on movies; confirm none need to be recreated after the rename
+CREATE TABLE _migrate_movies (
+    id TEXT PRIMARY KEY NOT NULL,
+    title TEXT NOT NULL,
+    original_title TEXT,
+    release_year INTEGER NOT NULL,
+    media_type TEXT NOT NULL,
+    imdb_id TEXT NOT NULL UNIQUE,
+    imdb_score INTEGER,
+    imdb_votes INTEGER,
+    runtime INTEGER,
+    genres TEXT NOT NULL,
+    updated TEXT NOT NULL
+);;
+
+INSERT INTO _migrate_movies
+SELECT
+    id,
+    title,
+    original_title,
+    release_year,
+    media_type,
+    imdb_id,
+    score AS imdb_score,
+    NULL AS imdb_votes,
+    runtime,
+    genres,
+    updated
+FROM movies
+WHERE true;;
+
+DROP TABLE movies;;
+
+ALTER TABLE _migrate_movies
+RENAME TO movies;;
diff --git a/unwind/web.py b/unwind/web.py
index ec72e68..367dc68 100644
--- a/unwind/web.py
+++ b/unwind/web.py
@@ -154,6 +154,7 @@ async def get_ratings_for_group(request):
                 "link": imdb.movie_url(r["movie_imdb_id"]),
                 "user_scores": [],
                 "imdb_score": r["imdb_score"],
+                "imdb_votes": r["imdb_votes"],
                 "media_type": r["media_type"],
             },
         )