optimize runtime

- Postpone setting an ID for a Movie until it's added to the database.
- Use larger chunks to count the lines of an input file.
- Skip creating temporary objects for the score-map.
ducklet 2021-07-04 21:31:27 +02:00
parent 9f6baa99b0
commit 2e73b335c9
2 changed files with 44 additions and 31 deletions


@@ -60,6 +60,13 @@ async def init_db(db):
 async def add(item):
+    # Support late initializing of `id` (used for optimization).
+    if hasattr(item, "id") and getattr(item, "id") is None:
+        for f in fields(item):
+            if f.name == "id":
+                item.id = f.default_factory()
+                break
+
     values = asplain(item)
     keys = ", ".join(f"{k}" for k in values)
     placeholders = ", ".join(f":{k}" for k in values)


@@ -4,12 +4,12 @@ import logging
 from dataclasses import dataclass, fields
 from datetime import datetime, timezone
 from pathlib import Path
-from typing import Optional, get_origin
+from typing import Optional, cast
 
 from . import db
 from .db import add_or_update_movie
 from .imdb import score_from_imdb_rating
-from .models import Movie, optional_type
+from .models import Movie
 
 log = logging.getLogger(__name__)
@@ -33,31 +33,18 @@ class BasicRow:
     @classmethod
     def from_row(cls, row):
-        vals = []
-        for f, r in zip(fields(cls), row):
-            ttype = f.type
-            is_opt = False
-            if (otype := optional_type(ttype)) is not None:
-                ttype = otype
-                is_opt = True
-            if (otype := get_origin(ttype)) is not None:
-                ttype = otype
-            if r == r"\N":
-                if is_opt:
-                    vals.append(None)
-                else:
-                    raise ValueError(f"Unexpected null value for field: {f.name}")
-            elif f.name == "genres":
-                vals.append(set(r.split(",")))
-            elif f.name == "isAdult":
-                assert r in "01"
-                vals.append(r == "1")
-            else:
-                vals.append(ttype(r))
-        inst = cls(*vals)
+        assert row[4] in "01"  # isAdult
+        inst = cls(
+            tconst=row[0],
+            titleType=row[1],
+            primaryTitle=row[2],
+            originalTitle=row[3],
+            isAdult=row[4] == "1",
+            startYear=None if row[5] == r"\N" else int(row[5]),
+            endYear=None if row[6] == r"\N" else int(row[6]),
+            runtimeMinutes=None if row[7] == r"\N" else int(row[7]),
+            genres=None if row[8] == r"\N" else set(row[8].split(",")),
+        )
         assert inst.titleType in title_types
         return inst
@@ -72,6 +59,8 @@ class BasicRow:
             score=None,
             runtime=self.runtimeMinutes,
             genres=self.genres or set(),
+            updated=None,  # optimization: skip default factory
+            id=None,  # optimization: skip default factory
         )
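These two keyword arguments only exist so that constructing a `Movie` per row does not invoke its default factories; `add()` in the first file restores `id` lazily. The saving per call is small but multiplies across the whole dump. A throwaway measurement, assuming the factory is something like `uuid4` (the real factories are not shown in this diff):

    import timeit
    from uuid import uuid4

    # cost of the factory calls that each constructed object now skips
    print(timeit.timeit(uuid4, number=1_000_000), "s per 1M uuid4() calls")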
@@ -122,13 +111,19 @@ def gz_mtime(path) -> datetime:
 def count_lines(path) -> int:
     i = 0
+    one_mb = 2 ** 20
+    buf_size = 8 * one_mb  # 8 MiB seems to give a good read/process performance.
     with gzip.open(path, "rt") as f:
-        for i, _ in enumerate(f, start=1):
-            pass
+        while buf := f.read(buf_size):
+            i += buf.count("\n")
     return i
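The chunked reader replaces one Python-level iteration step per line with a few large `str.count` calls. One caveat worth noting: `buf.count("\n")` counts newline characters, so it matches the old `enumerate` count only when the file ends with a trailing newline. A sanity-check sketch comparing both strategies:

    import gzip

    def count_by_iteration(path):  # old: one iterator step per line
        with gzip.open(path, "rt") as f:
            return sum(1 for _ in f)

    def count_by_chunks(path, buf_size=8 * 2**20):  # new: few big reads
        n = 0
        with gzip.open(path, "rt") as f:
            while buf := f.read(buf_size):
                n += buf.count("\n")
        return n

    # e.g. assert count_by_iteration(path) == count_by_chunks(path)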
-def read_imdb_tsv(path, row_type):
+def read_imdb_tsv(path, row_type, *, unpack=True):
     with gzip.open(path, "rt", newline="") as f:
         rows = csv.reader(f, delimiter="\t", quoting=csv.QUOTE_NONE)
@@ -141,6 +136,10 @@ def read_imdb_tsv(path, row_type):
             log.error("Unexpected header line: %s", header)
             raise
+        if unpack is False:
+            yield from rows
+            return
+
         for i, row in enumerate(rows, start=1):
             try:
                 yield row_type.from_row(row)
@@ -159,6 +158,13 @@ def read_ratings(path):
         yield m
+
+
+def read_ratings_as_scoremap(path):
+    """Optimized function to quickly load all ratings."""
+    rows = read_imdb_tsv(path, RatingRow, unpack=False)
+    rows = cast(list[list[str]], rows)
+    return {r[0]: round(100 * (float(r[1]) - 1) / 9) for r in rows}
 
 
 def read_basics(path):
     mtime = gz_mtime(path)
     rows = read_imdb_tsv(path, BasicRow)
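The dict comprehension in `read_ratings_as_scoremap` inlines the rating-to-score conversion (presumably what `score_from_imdb_rating` computes) so that none of the per-row objects `read_ratings` would build are created: IMDb's averageRating runs from 1.0 to 10.0 and is mapped linearly onto 0-100. Spot-checking the formula:

    # same mapping as the comprehension above
    def score(raw: str) -> int:
        return round(100 * (float(raw) - 1) / 9)

    assert score("1.0") == 0     # lowest possible rating
    assert score("10.0") == 100  # highest possible rating
    assert score("7.3") == 70    # 100 * 6.3 / 9 == 70.0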
@ -175,7 +181,7 @@ def read_basics(path):
async def import_from_file(basics_path: Path, ratings_path: Path): async def import_from_file(basics_path: Path, ratings_path: Path):
log.info("Loading scores ... 💾") log.info("Loading scores ... 💾")
scores = {m.imdb_id: m.score for m in read_ratings(ratings_path)} scores = read_ratings_as_scoremap(ratings_path)
log.info("Importing movies ... 💾") log.info("Importing movies ... 💾")
total = count_lines(basics_path) total = count_lines(basics_path)