unwind/unwind/imdb_import.py

206 lines
5.3 KiB
Python

import csv
import gzip
import logging
from dataclasses import dataclass, fields
from datetime import datetime, timezone
from pathlib import Path
from typing import Optional, get_origin
from . import db
from .db import add_or_update_movie
from .imdb import score_from_imdb_rating
from .models import Movie, optional_type
# Module-level logger, named after this module.
log = logging.getLogger(__name__)

# Background on the IMDb data files handled by this module; see
# - https://www.imdb.com/interfaces/
# - https://datasets.imdbws.com/
@dataclass
class BasicRow:
    """One data row of the IMDb "basics" TSV file (see `read_basics`).

    The field names and their order mirror the column header of the file;
    `read_imdb_tsv` compares them against the header line.
    """

    tconst: str
    titleType: str
    primaryTitle: str
    originalTitle: str
    isAdult: bool
    startYear: Optional[int]
    endYear: Optional[int]
    runtimeMinutes: Optional[int]
    genres: Optional[set[str]]

    @classmethod
    def from_row(cls, row):
        r"""Build a `BasicRow` from the raw string columns of one TSV line.

        Columns are matched to the dataclass fields by position and converted
        to the annotated field type. The literal ``\N`` marks a missing
        value: it becomes ``None`` for optional fields and is an error for
        all other fields.

        Raises:
            ValueError: if a non-optional column contains ``\N`` (or a value
                cannot be converted by the field's type).
            AssertionError: if `isAdult` is neither "0" nor "1", or if the
                title type is not a key of `title_types`.
        """
        vals = []
        for f, r in zip(fields(cls), row):
            ttype = f.type
            is_opt = False

            # A non-None result marks the field as optional and replaces the
            # annotation with the returned type (presumably the wrapped type
            # of Optional[...] -- see models.optional_type).
            if (otype := optional_type(ttype)) is not None:
                ttype = otype
                is_opt = True
            # Reduce a parametrized generic such as set[str] to its runtime
            # class so it can be called as a converter.
            if (otype := get_origin(ttype)) is not None:
                ttype = otype

            if r == r"\N":
                if is_opt:
                    vals.append(None)
                else:
                    raise ValueError(f"Unexpected null value for field: {f.name}")
            elif f.name == "genres":
                # Genres are stored as one comma-separated column.
                vals.append(set(r.split(",")))
            elif f.name == "isAdult":
                # Compare against the two exact values. The former
                # `r in "01"` was a substring test and therefore also
                # accepted "" and "01", silently mapping them to False.
                assert r in ("0", "1"), f"Unexpected isAdult value: {r!r}"
                vals.append(r == "1")
            else:
                vals.append(ttype(r))

        inst = cls(*vals)
        assert inst.titleType in title_types
        return inst

    def as_movie(self):
        """Convert this row into a `Movie` without a score.

        The row must have a start year. `titleType` is translated through
        `title_types`, and missing genres become an empty set.
        """
        assert self.startYear is not None
        return Movie(
            title=self.primaryTitle,
            original_title=self.originalTitle,
            release_year=self.startYear,
            media_type=title_types[self.titleType],
            imdb_id=self.tconst,
            score=None,
            runtime=self.runtimeMinutes,
            genres=self.genres or set(),
        )
@dataclass
class RatingRow:
    """One data row of the IMDb ratings TSV file (see `read_ratings`)."""

    tconst: str
    averageRating: float
    numVotes: int

    @classmethod
    def from_row(cls, row):
        r"""Build a `RatingRow` from the raw string columns of one TSV line.

        Each column is converted by calling the annotated type of the field
        at the same position. The title id must not be the null marker
        ``\N``.
        """
        converted = [spec.type(raw) for spec, raw in zip(fields(cls), row)]
        rating = cls(*converted)
        assert rating.tconst != r"\N"
        return rating

    def as_movie(self):
        """Return a `Movie` carrying only the IMDb id and the derived score."""
        score = score_from_imdb_rating(self.averageRating)
        return Movie(imdb_id=self.tconst, score=score)
# Maps the raw `titleType` column value of the IMDb data to the media type
# label stored on a Movie. `BasicRow.from_row` rejects title types that are
# not a key of this mapping.
title_types = {
    # cinema
    "movie": "Movie",
    # radio
    "radioEpisode": "Radio Episode",
    "radioSeries": "Radio Series",
    "short": "Short",
    # television
    "tvEpisode": "TV Episode",
    "tvMiniSeries": "TV Mini Series",
    "tvMovie": "TV Movie",
    "tvSeries": "TV Series",
    "tvShort": "TV Short",
    "tvSpecial": "TV Special",
    # other
    "video": "Video",
    "videoGame": "Video Game",
}
def gz_mtime(path) -> datetime:
    """Return the timestamp of the compressed file.

    The value is the modification time recorded in the gzip header (not the
    file system's mtime) and is returned as a timezone-aware UTC datetime.

    Raises:
        AssertionError: if no timestamp could be read from the header, e.g.
            because the file is empty.
    """
    # The context manager makes sure the file handle is closed again.
    with gzip.GzipFile(path, "rb") as g:
        g.peek(1)  # start reading the file to fill the timestamp field
        mtime = g.mtime
    assert mtime is not None
    # Convert directly to UTC. `fromtimestamp(ts)` without `tz` yields *local*
    # time, and re-labelling that as UTC shifts the result by the local UTC
    # offset on every machine that does not run in UTC.
    return datetime.fromtimestamp(mtime, tz=timezone.utc)
def count_lines(path) -> int:
    """Return the number of text lines in the gzip-compressed file at `path`.

    The file is decompressed and streamed line by line; an empty file
    yields 0.
    """
    with gzip.open(path, "rt") as stream:
        return sum(1 for _ in stream)
def read_imdb_tsv(path, row_type):
    """Yield one `row_type` instance per data row of a gzipped TSV file.

    Args:
        path: location of the gzip-compressed, tab-separated file.
        row_type: a dataclass with a `from_row(row)` classmethod. Its field
            names, in order, must equal the file's header line.

    Raises:
        AssertionError: if the header line is missing or does not match the
            fields of `row_type`.
        Exception: whatever `row_type.from_row` raises for a bad row; the
            offending line is logged before the error is re-raised.
    """
    expected = tuple(f.name for f in fields(row_type))

    with gzip.open(path, "rt", newline="") as f:
        # QUOTE_NONE: quote characters in the data are ordinary text.
        rows = csv.reader(f, delimiter="\t", quoting=csv.QUOTE_NONE)

        # Consume and validate the header line. The check is an explicit
        # `if` so it also runs under `python -O`, and an empty file is
        # reported like any other bad header instead of leaking a
        # StopIteration out of the generator.
        header = next(rows, None)
        if header is None or tuple(header) != expected:
            log.error("Unexpected header line: %s", header)
            raise AssertionError(f"Unexpected header line: {header}")

        # The header is line 1 of the file, so the data starts at line 2.
        for i, row in enumerate(rows, start=2):
            try:
                parsed = row_type.from_row(row)
            except Exception as err:
                log.error("Error in line %s: %s", i, row, exc_info=err)
                raise
            # Yield outside the try block: only parsing errors are logged.
            yield parsed
def read_ratings(path):
    """Yield a movie for every row of the ratings file at `path`.

    Each movie's `updated` attribute is set to the timestamp of the
    compressed file, as returned by `gz_mtime`.
    """
    file_time = gz_mtime(path)
    for rating in read_imdb_tsv(path, RatingRow):
        movie = rating.as_movie()
        movie.updated = file_time
        yield movie
def read_basics(path):
    """Yield a movie for every usable row of the basics file at `path`.

    Rows without a start year are skipped (and logged at debug level). Each
    movie's `updated` attribute is set to the timestamp of the compressed
    file, as returned by `gz_mtime`.
    """
    file_time = gz_mtime(path)
    for basic in read_imdb_tsv(path, BasicRow):
        if basic.startYear is not None:
            movie = basic.as_movie()
            movie.updated = file_time
            yield movie
        else:
            log.debug("Skipping movie, missing year: %s", basic)
async def import_from_file(basics_path: Path, ratings_path: Path):
    """Import the movies of an IMDb basics file, with scores, into the database.

    All scores from `ratings_path` are first loaded into memory, keyed by
    IMDb id. The basics file is then streamed inside a single database
    transaction; every movie with a wanted media type gets its score
    attached (None when the ratings file has no entry) and is passed to
    `add_or_update_movie`. Progress is logged in steps of 0.1 percentage
    points, measured against the line count of the basics file.
    """
    # Media types that are imported; everything else is skipped.
    wanted_media_types = {
        "Movie",
        "Short",
        "TV Mini Series",
        "TV Movie",
        "TV Series",
        "TV Short",
        "TV Special",
        "Video",
    }

    log.info("Loading scores ... 💾")
    scores = {}
    for rated in read_ratings(ratings_path):
        scores[rated.imdb_id] = rated.score

    log.info("Importing movies ... 💾")
    total = count_lines(basics_path)
    assert total != 0

    perc_step = 0.001
    perc = 0.0

    async with db.shared_connection().transaction():
        for i, m in enumerate(read_basics(basics_path)):
            # Report each time the processed fraction passes the next step.
            if i / total > perc:
                log.info("Imported %s%%", round(perc * 100, 1))
                perc += perc_step

            if m.media_type in wanted_media_types:
                m.score = scores.get(m.imdb_id)
                await add_or_update_movie(m)
            else:
                log.debug("Skipping movie, unwanted media type: %s", m.media_type)