import csv
import gzip
import logging
from dataclasses import dataclass, fields
from datetime import datetime, timezone
from pathlib import Path
from typing import Optional, get_origin

from . import db
from .db import add_or_update_movie
from .imdb import score_from_imdb_rating
from .models import Movie, optional_type

log = logging.getLogger(__name__)


# See
# - https://www.imdb.com/interfaces/
# - https://datasets.imdbws.com/


@dataclass
class BasicRow:
    """One data row of a TSV file read via `read_basics`.

    The field names and their order must equal the TSV header line;
    `read_imdb_tsv` asserts this before yielding any rows.
    """

    tconst: str
    titleType: str
    primaryTitle: str
    originalTitle: str
    isAdult: bool
    startYear: Optional[int]
    endYear: Optional[int]
    runtimeMinutes: Optional[int]
    genres: Optional[set[str]]

    @classmethod
    def from_row(cls, row):
        """Build an instance from a sequence of raw string cells.

        Cells are matched to the dataclass fields by position. A cell equal
        to the literal two characters backslash + "N" is treated as null.

        Raises:
            ValueError: if a null cell is found for a non-optional field (or
                a cell cannot be converted by the field's type).
            AssertionError: if `isAdult` is neither "0" nor "1", or if
                `titleType` is not a key of `title_types`.
        """
        vals = []
        for f, r in zip(fields(cls), row):
            ttype = f.type
            is_opt = False

            # NOTE(review): `optional_type` comes from `.models`; presumably
            # it returns the wrapped type of an Optional[...] annotation and
            # None otherwise -- confirm against its definition.
            if (otype := optional_type(ttype)) is not None:
                ttype = otype
                is_opt = True
            # Reduce a parameterized generic (e.g. set[str]) to its plain
            # origin class so that it can be called as a converter.
            if (otype := get_origin(ttype)) is not None:
                ttype = otype

            if r == r"\N":
                if is_opt:
                    vals.append(None)
                else:
                    raise ValueError(f"Unexpected null value for field: {f.name}")
            elif f.name == "genres":
                vals.append(set(r.split(",")))
            elif f.name == "isAdult":
                # Compare against the two valid values explicitly; a substring
                # test against "01" would also accept "" and "01".
                assert r in ("0", "1")
                vals.append(r == "1")
            else:
                vals.append(ttype(r))

        inst = cls(*vals)
        assert inst.titleType in title_types
        return inst

    def as_movie(self):
        """Convert this row into a `Movie` without a score.

        `startYear` must not be None (asserted); missing genres become an
        empty set.
        """
        assert self.startYear is not None
        return Movie(
            title=self.primaryTitle,
            original_title=self.originalTitle,
            release_year=self.startYear,
            media_type=title_types[self.titleType],
            imdb_id=self.tconst,
            score=None,
            runtime=self.runtimeMinutes,
            genres=self.genres or set(),
        )


@dataclass
class RatingRow:
    """One data row of a TSV file read via `read_ratings`.

    The field names and their order must equal the TSV header line;
    `read_imdb_tsv` asserts this before yielding any rows.
    """

    tconst: str
    averageRating: float
    numVotes: int

    @classmethod
    def from_row(cls, row):
        """Build an instance by applying each field's type to its cell.

        Cells are matched to the dataclass fields by position. Asserts that
        `tconst` is not the null marker (backslash + "N").
        """
        inst = cls(*(f.type(r) for f, r in zip(fields(cls), row)))
        assert inst.tconst != r"\N"
        return inst

    def as_movie(self):
        """Return a `Movie` carrying only the IMDb id and the converted score."""
        return Movie(
            imdb_id=self.tconst,
            score=score_from_imdb_rating(self.averageRating),
        )


# Maps the `titleType` value of a row to the `media_type` stored on a Movie.
title_types = {
    "movie": "Movie",
    "radioEpisode": "Radio Episode",
    "radioSeries": "Radio Series",
    "short": "Short",
    "tvEpisode": "TV Episode",
    "tvMiniSeries": "TV Mini Series",
    "tvMovie": "TV Movie",
    "tvSeries": "TV Series",
    "tvShort": "TV Short",
    "tvSpecial": "TV Special",
    "video": "Video",
    "videoGame": "Video Game",
}


def gz_mtime(path) -> datetime:
    """Return the timestamp of the compressed file.

    The value is taken from the gzip header (not from the file system) and
    returned as a timezone-aware datetime in UTC.
    """
    # Use a context manager so the underlying file handle is always closed.
    with gzip.GzipFile(path, "rb") as g:
        g.peek(1)  # start reading the file to fill the timestamp field
        mtime = g.mtime
    assert mtime is not None
    # The header stores seconds since the epoch, so convert directly to UTC.
    # Converting to naive local time first and then stamping UTC onto it
    # would be off by the host's UTC offset.
    return datetime.fromtimestamp(mtime, tz=timezone.utc)


def count_lines(path) -> int:
    """Return the number of text lines in the gzip file (0 if it is empty)."""
    i = 0
    with gzip.open(path, "rt") as f:
        for i, _ in enumerate(f, start=1):
            pass
    return i


def read_imdb_tsv(path, row_type):
    """Yield `row_type` instances for every data row of a gzipped TSV file.

    The first line is the header; it must equal the field names of
    `row_type` or an AssertionError is raised (after logging the header).
    Any exception raised by `row_type.from_row` is logged together with the
    row counter and the raw row, then re-raised.
    """
    with gzip.open(path, "rt", newline="") as f:
        rows = csv.reader(f, delimiter="\t", quoting=csv.QUOTE_NONE)

        # skip header line
        rows = iter(rows)
        header = next(rows)
        try:
            assert tuple(f.name for f in fields(row_type)) == tuple(header)
        except AssertionError:
            log.error("Unexpected header line: %s", header)
            raise

        # `i` counts data rows starting at 1; the header is not counted.
        for i, row in enumerate(rows, start=1):
            try:
                yield row_type.from_row(row)
            except Exception as err:
                log.error("Error in line %s: %s", i, row, exc_info=err)
                raise


def read_ratings(path):
    """Yield a `Movie` per rating row, with `updated` set to the gzip timestamp."""
    mtime = gz_mtime(path)
    rows = read_imdb_tsv(path, RatingRow)

    for row in rows:
        m = row.as_movie()
        m.updated = mtime
        yield m


def read_basics(path):
    """Yield a `Movie` per basics row, with `updated` set to the gzip timestamp.

    Rows without a start year are skipped (logged at debug level).
    """
    mtime = gz_mtime(path)
    rows = read_imdb_tsv(path, BasicRow)

    for row in rows:
        if row.startYear is None:
            log.debug("Skipping movie, missing year: %s", row)
            continue

        m = row.as_movie()
        m.updated = mtime
        yield m


async def import_from_file(basics_path: Path, ratings_path: Path):
    """Import movies from the basics file, enriched with scores from the ratings file.

    All scores are loaded into memory first (keyed by IMDb id). The basics
    file is then streamed and every movie with a wanted media type is passed
    to `add_or_update_movie` inside a single transaction on the shared
    connection. Progress is logged in steps of 0.1 %.
    """
    log.info("Loading scores ... 💾")
    scores = {m.imdb_id: m.score for m in read_ratings(ratings_path)}

    log.info("Importing movies ... 💾")
    # The total includes the header line and rows that are skipped later on,
    # so the logged percentage is only an approximation.
    total = count_lines(basics_path)
    assert total != 0
    perc = 0.0
    perc_step = 0.001

    # Media types to import; everything else is skipped.
    wanted_media_types = {
        "Movie",
        "Short",
        "TV Mini Series",
        "TV Movie",
        "TV Series",
        "TV Short",
        "TV Special",
        "Video",
    }

    async with db.shared_connection().transaction():

        for i, m in enumerate(read_basics(basics_path)):

            if i / total > perc:
                log.info("Imported %s%%", round(perc * 100, 1))
                perc += perc_step

            if m.media_type not in wanted_media_types:
                log.debug("Skipping movie, unwanted media type: %s", m.media_type)
                continue

            m.score = scores.get(m.imdb_id)
            await add_or_update_movie(m)