# Import helpers for the IMDb TSV datasets (title.basics / title.ratings).
import csv
|
|
import gzip
|
|
import logging
|
|
from dataclasses import dataclass, fields
|
|
from datetime import datetime, timezone
|
|
from pathlib import Path
|
|
from typing import Optional, get_origin
|
|
|
|
from . import db
|
|
from .db import add_or_update_movie
|
|
from .imdb import score_from_imdb_rating
|
|
from .models import Movie, optional_type
|
|
|
|
log = logging.getLogger(__name__)
|
|
|
|
|
|
# See
|
|
# - https://www.imdb.com/interfaces/
|
|
# - https://datasets.imdbws.com/
|
|
|
|
|
|
@dataclass
class BasicRow:
    """One record from IMDb's title.basics.tsv dataset.

    Field names deliberately match the TSV column headers exactly;
    `read_imdb_tsv` compares them against the file's header line.
    """

    tconst: str  # IMDb title id, e.g. "tt0000001"
    titleType: str  # must be a key of `title_types`
    primaryTitle: str
    originalTitle: str
    isAdult: bool
    startYear: Optional[int]
    endYear: Optional[int]
    runtimeMinutes: Optional[int]
    genres: Optional[set[str]]  # comma-separated in the raw file

    @classmethod
    def from_row(cls, row):
        """Build a BasicRow from one raw TSV row (a list of strings).

        IMDb encodes missing values as the literal string ``\\N``; those
        are accepted only for Optional fields.

        Raises:
            ValueError: a non-Optional field contains ``\\N``.
            AssertionError: malformed isAdult flag or unknown titleType.
        """
        vals = []
        for f, r in zip(fields(cls), row):
            ttype = f.type
            is_opt = False

            # Unwrap Optional[X] -> X, remembering that null is allowed.
            if (otype := optional_type(ttype)) is not None:
                ttype = otype
                is_opt = True
            # Unwrap parameterized generics (set[str] -> set) so the bare
            # type can be used as a constructor below.
            if (otype := get_origin(ttype)) is not None:
                ttype = otype

            if r == r"\N":
                if is_opt:
                    vals.append(None)
                else:
                    raise ValueError(f"Unexpected null value for field: {f.name}")
            elif f.name == "genres":
                vals.append(set(r.split(",")))
            elif f.name == "isAdult":
                # Membership test against a tuple, not the string "01":
                # `r in "01"` is a *substring* check and would accept "".
                assert r in ("0", "1")
                vals.append(r == "1")
            else:
                vals.append(ttype(r))

        inst = cls(*vals)
        assert inst.titleType in title_types
        return inst

    def as_movie(self):
        """Convert to a Movie; the caller must ensure startYear is set."""
        assert self.startYear is not None
        return Movie(
            title=self.primaryTitle,
            original_title=self.originalTitle,
            release_year=self.startYear,
            media_type=title_types[self.titleType],
            imdb_id=self.tconst,
            score=None,
            runtime=self.runtimeMinutes,
            genres=self.genres or set(),
        )
|
|
|
|
|
|
@dataclass
class RatingRow:
    """One record from IMDb's title.ratings.tsv dataset."""

    tconst: str  # IMDb title id
    averageRating: float
    numVotes: int

    @classmethod
    def from_row(cls, row):
        """Parse one raw TSV row, coercing each column to its field type."""
        converted = [field.type(value) for field, value in zip(fields(cls), row)]
        inst = cls(*converted)
        assert inst.tconst != r"\N"
        return inst

    def as_movie(self):
        """Return a minimal Movie carrying only the id and the score."""
        return Movie(
            imdb_id=self.tconst,
            score=score_from_imdb_rating(self.averageRating),
        )
|
|
|
|
|
|
# Mapping from IMDb's raw `titleType` column values to the human-readable
# media-type names stored in `Movie.media_type`.  `BasicRow.from_row`
# asserts that every row's titleType is a key of this dict.
title_types = {
    "movie": "Movie",
    "radioEpisode": "Radio Episode",
    "radioSeries": "Radio Series",
    "short": "Short",
    "tvEpisode": "TV Episode",
    "tvMiniSeries": "TV Mini Series",
    "tvMovie": "TV Movie",
    "tvSeries": "TV Series",
    "tvShort": "TV Short",
    "tvSpecial": "TV Special",
    "video": "Video",
    "videoGame": "Video Game",
}
|
|
|
|
|
|
def gz_mtime(path) -> datetime:
    """Return the timestamp stored in the gzip header, as aware UTC.

    The gzip MTIME header field is seconds since the Unix epoch.

    Raises:
        AssertionError: the file carries no timestamp (mtime field is 0).
    """
    # Context manager: the original leaked the open file handle.
    with gzip.GzipFile(path, "rb") as g:
        g.peek(1)  # start reading the file to fill the timestamp field
        assert g.mtime is not None
        # Pass tz= so the epoch seconds are converted directly to UTC.
        # The previous fromtimestamp(...).replace(tzinfo=utc) converted to
        # *local* wall-clock time first and then mislabelled it as UTC,
        # which is wrong on any host whose local timezone is not UTC.
        return datetime.fromtimestamp(g.mtime, tz=timezone.utc)
|
|
|
|
|
|
def count_lines(path) -> int:
    """Count the lines in a gzip-compressed text file (0 when empty)."""
    with gzip.open(path, "rt") as fh:
        return sum(1 for _ in fh)
|
|
|
|
|
|
def read_imdb_tsv(path, row_type):
    """Yield `row_type` instances parsed from a gzipped IMDb TSV file.

    The header line is verified against `row_type`'s dataclass field
    names; a mismatch — or any row that fails to parse — is logged and
    the exception re-raised.
    """
    with gzip.open(path, "rt", newline="") as fh:
        reader = csv.reader(fh, delimiter="\t", quoting=csv.QUOTE_NONE)

        # First line is the column header; consume and validate it.
        header = next(iter(reader))
        expected = tuple(f.name for f in fields(row_type))
        try:
            assert expected == tuple(header)
        except AssertionError:
            log.error("Unexpected header line: %s", header)
            raise

        for lineno, raw in enumerate(reader, start=1):
            try:
                yield row_type.from_row(raw)
            except Exception as err:
                log.error("Error in line %s: %s", lineno, raw, exc_info=err)
                raise
|
|
|
|
|
|
def read_ratings(path):
    """Yield Movie objects (id + score only) from title.ratings.tsv.gz.

    Every yielded movie's `updated` field is stamped with the archive's
    gzip-header timestamp.
    """
    stamp = gz_mtime(path)
    for rating in read_imdb_tsv(path, RatingRow):
        movie = rating.as_movie()
        movie.updated = stamp
        yield movie
|
|
|
|
|
|
def read_basics(path):
    """Yield Movie objects from title.basics.tsv.gz.

    Rows lacking a startYear cannot become movies and are skipped with a
    debug log.  Each movie's `updated` field is stamped with the
    archive's gzip-header timestamp.
    """
    stamp = gz_mtime(path)
    for basic in read_imdb_tsv(path, BasicRow):
        if basic.startYear is None:
            log.debug("Skipping movie, missing year: %s", basic)
            continue

        movie = basic.as_movie()
        movie.updated = stamp
        yield movie
|
|
|
|
|
|
# Media types accepted by `import_from_file`; episodes, radio titles and
# video games are deliberately excluded.
_WANTED_MEDIA_TYPES = frozenset({
    "Movie",
    "Short",
    "TV Mini Series",
    "TV Movie",
    "TV Series",
    "TV Short",
    "TV Special",
    "Video",
})


async def import_from_file(basics_path: Path, ratings_path: Path):
    """Import IMDb titles and ratings into the database.

    Reads the ratings dump first to build an imdb_id -> score lookup,
    then streams the basics dump, skipping unwanted media types, and
    upserts each movie inside a single transaction.  Progress is logged
    roughly every 0.1% (the denominator is the raw line count, which
    includes skipped rows, so percentages are approximate).
    """
    log.info("Loading scores ... 💾")
    scores = {m.imdb_id: m.score for m in read_ratings(ratings_path)}

    log.info("Importing movies ... 💾")
    total = count_lines(basics_path)
    assert total != 0
    perc = 0.0
    perc_step = 0.001

    async with db.shared_connection().transaction():

        for i, m in enumerate(read_basics(basics_path)):

            if i / total > perc:
                log.info("Imported %s%%", round(perc * 100, 1))
                perc += perc_step

            # Membership set hoisted to module level — the original
            # rebuilt this set literal on every iteration.
            if m.media_type not in _WANTED_MEDIA_TYPES:
                log.debug("Skipping movie, unwanted media type: %s", m.media_type)
                continue

            m.score = scores.get(m.imdb_id)
            await add_or_update_movie(m)