From 2e73b335c939f9067ce22cfc7ada704923688223 Mon Sep 17 00:00:00 2001
From: ducklet
Date: Sun, 4 Jul 2021 21:31:27 +0200
Subject: [PATCH] optimize runtime

- Postpone setting an ID for a Movie until it's added to the database.
- Use larger chunks to count the lines of an input file.
- Skip creating temporary objects for the score-map.
---
 unwind/db.py          |  7 +++++
 unwind/imdb_import.py | 68 +++++++++++++++++++++++--------------------
 2 files changed, 44 insertions(+), 31 deletions(-)

diff --git a/unwind/db.py b/unwind/db.py
index 23d3b71..f2b519b 100644
--- a/unwind/db.py
+++ b/unwind/db.py
@@ -60,6 +60,13 @@ async def init_db(db):


 async def add(item):
+    # Support late initializing of `id` (used for optimization).
+    if hasattr(item, "id") and getattr(item, "id") is None:
+        for f in fields(item):
+            if f.name == "id":
+                item.id = f.default_factory()
+                break
+
     values = asplain(item)
     keys = ", ".join(f"{k}" for k in values)
     placeholders = ", ".join(f":{k}" for k in values)
diff --git a/unwind/imdb_import.py b/unwind/imdb_import.py
index 56d91fb..3482774 100644
--- a/unwind/imdb_import.py
+++ b/unwind/imdb_import.py
@@ -4,12 +4,12 @@ import logging
 from dataclasses import dataclass, fields
 from datetime import datetime, timezone
 from pathlib import Path
-from typing import Optional, get_origin
+from typing import Optional, cast

 from . import db
 from .db import add_or_update_movie
 from .imdb import score_from_imdb_rating
-from .models import Movie, optional_type
+from .models import Movie

 log = logging.getLogger(__name__)

@@ -33,31 +33,18 @@ class BasicRow:

     @classmethod
     def from_row(cls, row):
-        vals = []
-        for f, r in zip(fields(cls), row):
-            ttype = f.type
-            is_opt = False
-
-            if (otype := optional_type(ttype)) is not None:
-                ttype = otype
-                is_opt = True
-            if (otype := get_origin(ttype)) is not None:
-                ttype = otype
-
-            if r == r"\N":
-                if is_opt:
-                    vals.append(None)
-                else:
-                    raise ValueError(f"Unexpected null value for field: {f.name}")
-            elif f.name == "genres":
-                vals.append(set(r.split(",")))
-            elif f.name == "isAdult":
-                assert r in "01"
-                vals.append(r == "1")
-            else:
-                vals.append(ttype(r))
-
-        inst = cls(*vals)
+        assert row[4] in "01"  # isAdult
+        inst = cls(
+            tconst=row[0],
+            titleType=row[1],
+            primaryTitle=row[2],
+            originalTitle=row[3],
+            isAdult=row[4] == "1",
+            startYear=None if row[5] == r"\N" else int(row[5]),
+            endYear=None if row[6] == r"\N" else int(row[6]),
+            runtimeMinutes=None if row[7] == r"\N" else int(row[7]),
+            genres=None if row[8] == r"\N" else set(row[8].split(",")),
+        )
         assert inst.titleType in title_types
         return inst

@@ -72,6 +59,8 @@ class BasicRow:
             score=None,
             runtime=self.runtimeMinutes,
             genres=self.genres or set(),
+            updated=None,  # optimization: skip default factory
+            id=None,  # optimization: skip default factory
         )


@@ -122,13 +111,19 @@ def gz_mtime(path) -> datetime:

 def count_lines(path) -> int:
     i = 0
+
+    one_mb = 2 ** 20
+    buf_size = 8 * one_mb  # 8 MiB seems to give a good read/process performance.
+
     with gzip.open(path, "rt") as f:
-        for i, _ in enumerate(f, start=1):
-            pass
+
+        while buf := f.read(buf_size):
+            i += buf.count("\n")
+
     return i


-def read_imdb_tsv(path, row_type):
+def read_imdb_tsv(path, row_type, *, unpack=True):
     with gzip.open(path, "rt", newline="") as f:
         rows = csv.reader(f, delimiter="\t", quoting=csv.QUOTE_NONE)

@@ -141,6 +136,10 @@ def read_imdb_tsv(path, row_type):
             log.error("Unexpected header line: %s", header)
             raise

+        if unpack is False:
+            yield from rows
+            return
+
         for i, row in enumerate(rows, start=1):
             try:
                 yield row_type.from_row(row)
@@ -159,6 +158,13 @@ def read_ratings(path):
         yield m


+def read_ratings_as_scoremap(path):
+    """Optimized function to quickly load all ratings."""
+    rows = read_imdb_tsv(path, RatingRow, unpack=False)
+    rows = cast(list[list[str]], rows)
+    return {r[0]: round(100 * (float(r[1]) - 1) / 9) for r in rows}
+
+
 def read_basics(path):
     mtime = gz_mtime(path)
     rows = read_imdb_tsv(path, BasicRow)
@@ -175,7 +181,7 @@ async def import_from_file(basics_path: Path, ratings_path: Path):


     log.info("Loading scores ... 💾")
-    scores = {m.imdb_id: m.score for m in read_ratings(ratings_path)}
+    scores = read_ratings_as_scoremap(ratings_path)

     log.info("Importing movies ... 💾")
     total = count_lines(basics_path)
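
Note (outside the patch): a standalone sketch for checking the chunked line counting that count_lines() switches to above. The helper names, the timing harness, and the sample file name are illustrative only; any large gzipped TSV such as an IMDb dataset dump will do, and both variants report the same total whenever the file ends with a trailing newline.

import gzip
import time


def count_lines_by_iteration(path) -> int:
    # Previous approach: one Python-level loop step per line of the file.
    i = 0
    with gzip.open(path, "rt") as f:
        for i, _ in enumerate(f, start=1):
            pass
    return i


def count_lines_by_chunks(path, buf_size=8 * 2 ** 20) -> int:
    # Patched approach: read large text chunks and let str.count() tally the
    # newlines in C, avoiding the per-line iterator overhead.
    i = 0
    with gzip.open(path, "rt") as f:
        while buf := f.read(buf_size):
            i += buf.count("\n")
    return i


if __name__ == "__main__":
    path = "title.basics.tsv.gz"  # hypothetical sample input
    for fn in (count_lines_by_iteration, count_lines_by_chunks):
        start = time.perf_counter()
        n = fn(path)
        print(f"{fn.__name__}: {n} lines in {time.perf_counter() - start:.2f}s")

The same reasoning drives read_ratings_as_scoremap(): it maps tconst straight to round(100 * (rating - 1) / 9), presumably the same 1-10 to 0-100 rescaling as score_from_imdb_rating(), instead of materialising one temporary object per rating row via read_ratings(), which is what the commit message means by skipping temporary objects for the score-map.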