# unwind/unwind/imdb_import.py

import csv
import gzip
import logging
from dataclasses import dataclass, fields
from datetime import datetime, timezone
from pathlib import Path
from typing import Optional, get_origin

from . import db
from .db import add_or_update_movie
from .imdb import score_from_imdb_rating
from .models import Movie, optional_type

log = logging.getLogger(__name__)


# See
# - https://www.imdb.com/interfaces/
# - https://datasets.imdbws.com/


@dataclass
class BasicRow:
    """One row of the IMDb "basics" TSV file.

    Field names and their order mirror the TSV header; ``from_row`` pairs
    cells with fields purely by position.
    """

    tconst: str
    titleType: str
    primaryTitle: str
    originalTitle: str
    isAdult: bool
    startYear: Optional[int]
    endYear: Optional[int]
    runtimeMinutes: Optional[int]
    genres: Optional[set[str]]

    @classmethod
    def from_row(cls, row):
        """Build an instance from one TSV row given as a sequence of strings.

        The null marker (a backslash followed by ``N``) is turned into
        ``None`` and is only accepted for ``Optional`` fields. ``genres`` is
        split on commas into a set, ``isAdult`` must be exactly ``"0"`` or
        ``"1"``, and every other cell is converted by calling the field's
        annotated type.

        Raises:
            ValueError: If a non-optional field is null, ``isAdult`` is not
                ``"0"`` or ``"1"``, or ``titleType`` is not a key of
                ``title_types``.
        """
        vals = []
        for f, r in zip(fields(cls), row):
            ttype = f.type
            is_opt = False

            # A non-None result is used as the converter type and marks the
            # field as nullable.
            if (otype := optional_type(ttype)) is not None:
                ttype = otype
                is_opt = True
            # Reduce parametrized generics such as set[str] to their origin.
            if (otype := get_origin(ttype)) is not None:
                ttype = otype

            if r == r"\N":
                if not is_opt:
                    raise ValueError(f"Unexpected null value for field: {f.name}")
                vals.append(None)
            elif f.name == "genres":
                vals.append(set(r.split(",")))
            elif f.name == "isAdult":
                # Compare against the two allowed tokens; a substring test
                # (`r in "01"`) would also accept "" and "01".
                if r not in ("0", "1"):
                    raise ValueError(f"Unexpected value for field {f.name}: {r!r}")
                vals.append(r == "1")
            else:
                vals.append(ttype(r))

        inst = cls(*vals)
        # Explicit check instead of assert so it still runs under `python -O`;
        # as_movie() relies on the key being present.
        if inst.titleType not in title_types:
            raise ValueError(f"Unknown title type: {inst.titleType!r}")
        return inst

    def as_movie(self):
        """Convert this row into a ``Movie`` without a score.

        ``startYear`` must not be ``None``; empty genres become an empty set.
        """
        assert self.startYear is not None
        return Movie(
            title=self.primaryTitle,
            original_title=self.originalTitle,
            release_year=self.startYear,
            media_type=title_types[self.titleType],
            imdb_id=self.tconst,
            score=None,
            runtime=self.runtimeMinutes,
            genres=self.genres or set(),
        )


@dataclass
class RatingRow:
    """One row of the IMDb ratings TSV: title id, average rating, vote count."""

    tconst: str
    averageRating: float
    numVotes: int

    @classmethod
    def from_row(cls, row):
        """Create a RatingRow from raw string cells.

        Each cell is converted by calling the annotated type of the field at
        the same position.
        """
        converted = [field.type(cell) for field, cell in zip(fields(cls), row)]
        rating = cls(*converted)
        assert rating.tconst != r"\N"
        return rating

    def as_movie(self):
        """Return a ``Movie`` carrying only the IMDb id and the converted score."""
        score = score_from_imdb_rating(self.averageRating)
        return Movie(imdb_id=self.tconst, score=score)


# Maps the `titleType` values accepted by BasicRow.from_row to the names
# stored as `Movie.media_type` (see BasicRow.as_movie).
title_types: dict[str, str] = {
    "movie": "Movie",
    "radioEpisode": "Radio Episode",
    "radioSeries": "Radio Series",
    "short": "Short",
    "tvEpisode": "TV Episode",
    "tvMiniSeries": "TV Mini Series",
    "tvMovie": "TV Movie",
    "tvSeries": "TV Series",
    "tvShort": "TV Short",
    "tvSpecial": "TV Special",
    "video": "Video",
    "videoGame": "Video Game",
}


def gz_mtime(path) -> datetime:
    """Return the timestamp of the compressed file.

    The value is taken from the gzip header (not the file system) and is
    returned as a timezone-aware datetime in UTC.
    """
    with gzip.GzipFile(path, "rb") as g:
        g.peek(1)  # start reading the file to fill the timestamp field
        mtime = g.mtime
    assert mtime is not None
    # Convert directly in UTC. A naive fromtimestamp() yields local wall-clock
    # time, so attaching UTC afterwards would shift the result by the local
    # UTC offset.
    return datetime.fromtimestamp(mtime, tz=timezone.utc)


def count_lines(path) -> int:
    """Return the number of text lines in the gzip-compressed file at *path*.

    Every line is counted, including a header line if the file has one. An
    empty file yields 0.
    """
    # Decode explicitly as UTF-8 so the count does not depend on (or fail
    # because of) the platform's default locale encoding.
    with gzip.open(path, "rt", encoding="utf-8") as f:
        return sum(1 for _ in f)


def read_imdb_tsv(path, row_type):
    """Yield ``row_type`` instances parsed from a gzipped, tab-separated file.

    Args:
        path: Location of the gzip-compressed TSV file.
        row_type: Dataclass whose field names equal the header line, in
            order, and which provides a ``from_row(row)`` classmethod.

    Raises:
        ValueError: If the file is empty or its header does not match the
            field names of ``row_type``.
        Exception: Anything ``row_type.from_row`` raises for a row; the error
            is logged together with the line number and re-raised.
    """
    with gzip.open(path, "rt", encoding="utf-8", newline="") as f:
        # QUOTE_NONE: quote characters in the data are ordinary text.
        reader = csv.reader(f, delimiter="\t", quoting=csv.QUOTE_NONE)

        expected = tuple(field.name for field in fields(row_type))
        # next() with a default: a bare next() on an empty file would leak
        # StopIteration, which a generator turns into an opaque RuntimeError.
        header = next(reader, None)
        if header is None or tuple(header) != expected:
            log.error("Unexpected header line: %s", header)
            raise ValueError(f"Unexpected header line: {header!r}")

        for row in reader:
            # Keep only the parsing inside the try so that exceptions thrown
            # into the generator at the yield are not reported as row errors.
            try:
                item = row_type.from_row(row)
            except Exception as err:
                # line_num counts lines read from the file, header included.
                log.error(
                    "Error in line %s: %s", reader.line_num, row, exc_info=err
                )
                raise
            yield item


def read_ratings(path):
    """Yield a ``Movie`` for every row of the ratings file at *path*.

    Each movie's ``updated`` attribute is set to the value returned by
    ``gz_mtime`` for the file.
    """
    file_time = gz_mtime(path)
    for rating in read_imdb_tsv(path, RatingRow):
        movie = rating.as_movie()
        movie.updated = file_time
        yield movie


def read_basics(path):
    """Yield a ``Movie`` for every row of the basics file that has a start year.

    Rows without a start year are logged at debug level and skipped. Each
    yielded movie's ``updated`` attribute is set to the value returned by
    ``gz_mtime`` for the file.
    """
    file_time = gz_mtime(path)
    for basic in read_imdb_tsv(path, BasicRow):
        if basic.startYear is not None:
            movie = basic.as_movie()
            movie.updated = file_time
            yield movie
        else:
            log.debug("Skipping movie, missing year: %s", basic)


async def import_from_file(basics_path: Path, ratings_path: Path):
    """Import movies from the basics file, attaching scores from the ratings file.

    All scores are first loaded into a dict keyed by IMDb id. The basics file
    is then streamed inside ``db.shared_connection().transaction()``: movies
    with an unwanted media type are skipped, all others get their score (or
    ``None``) and are passed to ``add_or_update_movie``.
    """
    wanted_media_types = {
        "Movie",
        "Short",
        "TV Mini Series",
        "TV Movie",
        "TV Series",
        "TV Short",
        "TV Special",
        "Video",
    }
    progress_step = 0.001  # log progress in steps of 0.1%

    log.info("Loading scores ... 💾")
    scores = {}
    for rated in read_ratings(ratings_path):
        scores[rated.imdb_id] = rated.score

    log.info("Importing movies ... 💾")
    total = count_lines(basics_path)
    assert total != 0
    next_mark = 0.0

    async with db.shared_connection().transaction():
        for index, movie in enumerate(read_basics(basics_path)):
            # Emit a progress message each time the next threshold is passed.
            if index / total > next_mark:
                log.info("Imported %s%%", round(next_mark * 100, 1))
                next_mark += progress_step

            if movie.media_type in wanted_media_types:
                movie.score = scores.get(movie.imdb_id)
                await add_or_update_movie(movie)
            else:
                log.debug(
                    "Skipping movie, unwanted media type: %s", movie.media_type
                )