optimize runtime
- Postpone setting an ID for a Movie until it's added to the database.
- Use larger chunks to count the lines of an input file.
- Skip creating temporary objects for the score-map.
This commit is contained in:
parent
9f6baa99b0
commit
2e73b335c9
2 changed files with 44 additions and 31 deletions
|
|
@ -60,6 +60,13 @@ async def init_db(db):
|
||||||
|
|
||||||
|
|
||||||
async def add(item):
|
async def add(item):
|
||||||
|
# Support late initializing of `id` (used for optimization).
|
||||||
|
if hasattr(item, "id") and getattr(item, "id") is None:
|
||||||
|
for f in fields(item):
|
||||||
|
if f.name == "id":
|
||||||
|
item.id = f.default_factory()
|
||||||
|
break
|
||||||
|
|
||||||
values = asplain(item)
|
values = asplain(item)
|
||||||
keys = ", ".join(f"{k}" for k in values)
|
keys = ", ".join(f"{k}" for k in values)
|
||||||
placeholders = ", ".join(f":{k}" for k in values)
|
placeholders = ", ".join(f":{k}" for k in values)
|
||||||
|
|
|
||||||
|
|
@ -4,12 +4,12 @@ import logging
|
||||||
from dataclasses import dataclass, fields
|
from dataclasses import dataclass, fields
|
||||||
from datetime import datetime, timezone
|
from datetime import datetime, timezone
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from typing import Optional, get_origin
|
from typing import Optional, cast
|
||||||
|
|
||||||
from . import db
|
from . import db
|
||||||
from .db import add_or_update_movie
|
from .db import add_or_update_movie
|
||||||
from .imdb import score_from_imdb_rating
|
from .imdb import score_from_imdb_rating
|
||||||
from .models import Movie, optional_type
|
from .models import Movie
|
||||||
|
|
||||||
log = logging.getLogger(__name__)
|
log = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
@ -33,31 +33,18 @@ class BasicRow:
|
||||||
|
|
||||||
@classmethod
|
@classmethod
|
||||||
def from_row(cls, row):
|
def from_row(cls, row):
|
||||||
vals = []
|
assert row[4] in "01" # isAdult
|
||||||
for f, r in zip(fields(cls), row):
|
inst = cls(
|
||||||
ttype = f.type
|
tconst=row[0],
|
||||||
is_opt = False
|
titleType=row[1],
|
||||||
|
primaryTitle=row[2],
|
||||||
if (otype := optional_type(ttype)) is not None:
|
originalTitle=row[3],
|
||||||
ttype = otype
|
isAdult=row[4] == "1",
|
||||||
is_opt = True
|
startYear=None if row[5] == r"\N" else int(row[5]),
|
||||||
if (otype := get_origin(ttype)) is not None:
|
endYear=None if row[6] == r"\N" else int(row[6]),
|
||||||
ttype = otype
|
runtimeMinutes=None if row[7] == r"\N" else int(row[7]),
|
||||||
|
genres=None if row[8] == r"\N" else set(row[8].split(",")),
|
||||||
if r == r"\N":
|
)
|
||||||
if is_opt:
|
|
||||||
vals.append(None)
|
|
||||||
else:
|
|
||||||
raise ValueError(f"Unexpected null value for field: {f.name}")
|
|
||||||
elif f.name == "genres":
|
|
||||||
vals.append(set(r.split(",")))
|
|
||||||
elif f.name == "isAdult":
|
|
||||||
assert r in "01"
|
|
||||||
vals.append(r == "1")
|
|
||||||
else:
|
|
||||||
vals.append(ttype(r))
|
|
||||||
|
|
||||||
inst = cls(*vals)
|
|
||||||
assert inst.titleType in title_types
|
assert inst.titleType in title_types
|
||||||
return inst
|
return inst
|
||||||
|
|
||||||
|
|
@ -72,6 +59,8 @@ class BasicRow:
|
||||||
score=None,
|
score=None,
|
||||||
runtime=self.runtimeMinutes,
|
runtime=self.runtimeMinutes,
|
||||||
genres=self.genres or set(),
|
genres=self.genres or set(),
|
||||||
|
updated=None, # optimization: skip default factory
|
||||||
|
id=None, # optimization: skip default factory
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
|
|
@ -122,13 +111,19 @@ def gz_mtime(path) -> datetime:
|
||||||
|
|
||||||
def count_lines(path) -> int:
|
def count_lines(path) -> int:
|
||||||
i = 0
|
i = 0
|
||||||
|
|
||||||
|
one_mb = 2 ** 20
|
||||||
|
buf_size = 8 * one_mb # 8 MiB seems to give a good read/process performance.
|
||||||
|
|
||||||
with gzip.open(path, "rt") as f:
|
with gzip.open(path, "rt") as f:
|
||||||
for i, _ in enumerate(f, start=1):
|
|
||||||
pass
|
while buf := f.read(buf_size):
|
||||||
|
i += buf.count("\n")
|
||||||
|
|
||||||
return i
|
return i
|
||||||
|
|
||||||
|
|
||||||
def read_imdb_tsv(path, row_type):
|
def read_imdb_tsv(path, row_type, *, unpack=True):
|
||||||
with gzip.open(path, "rt", newline="") as f:
|
with gzip.open(path, "rt", newline="") as f:
|
||||||
rows = csv.reader(f, delimiter="\t", quoting=csv.QUOTE_NONE)
|
rows = csv.reader(f, delimiter="\t", quoting=csv.QUOTE_NONE)
|
||||||
|
|
||||||
|
|
@ -141,6 +136,10 @@ def read_imdb_tsv(path, row_type):
|
||||||
log.error("Unexpected header line: %s", header)
|
log.error("Unexpected header line: %s", header)
|
||||||
raise
|
raise
|
||||||
|
|
||||||
|
if unpack is False:
|
||||||
|
yield from rows
|
||||||
|
return
|
||||||
|
|
||||||
for i, row in enumerate(rows, start=1):
|
for i, row in enumerate(rows, start=1):
|
||||||
try:
|
try:
|
||||||
yield row_type.from_row(row)
|
yield row_type.from_row(row)
|
||||||
|
|
@ -159,6 +158,13 @@ def read_ratings(path):
|
||||||
yield m
|
yield m
|
||||||
|
|
||||||
|
|
||||||
|
def read_ratings_as_scoremap(path):
|
||||||
|
"""Optimized function to quickly load all ratings."""
|
||||||
|
rows = read_imdb_tsv(path, RatingRow, unpack=False)
|
||||||
|
rows = cast(list[list[str]], rows)
|
||||||
|
return {r[0]: round(100 * (float(r[1]) - 1) / 9) for r in rows}
|
||||||
|
|
||||||
|
|
||||||
def read_basics(path):
|
def read_basics(path):
|
||||||
mtime = gz_mtime(path)
|
mtime = gz_mtime(path)
|
||||||
rows = read_imdb_tsv(path, BasicRow)
|
rows = read_imdb_tsv(path, BasicRow)
|
||||||
|
|
@ -175,7 +181,7 @@ def read_basics(path):
|
||||||
|
|
||||||
async def import_from_file(basics_path: Path, ratings_path: Path):
|
async def import_from_file(basics_path: Path, ratings_path: Path):
|
||||||
log.info("Loading scores ... 💾")
|
log.info("Loading scores ... 💾")
|
||||||
scores = {m.imdb_id: m.score for m in read_ratings(ratings_path)}
|
scores = read_ratings_as_scoremap(ratings_path)
|
||||||
|
|
||||||
log.info("Importing movies ... 💾")
|
log.info("Importing movies ... 💾")
|
||||||
total = count_lines(basics_path)
|
total = count_lines(basics_path)
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue