From 7da3a094f1f2acd08c1f11a3b0132f6d04b6a5a2 Mon Sep 17 00:00:00 2001 From: ducklet Date: Sat, 4 Feb 2023 01:03:12 +0100 Subject: [PATCH] add "download-imdb-dataset" command to CLI --- unwind/__main__.py | 27 ++++++++++++++++++++++++++- unwind/imdb_import.py | 25 +++++++++++++++++++------ 2 files changed, 45 insertions(+), 7 deletions(-) diff --git a/unwind/__main__.py b/unwind/__main__.py index 983ede7..e1689a9 100644 --- a/unwind/__main__.py +++ b/unwind/__main__.py @@ -6,7 +6,7 @@ from pathlib import Path from . import config from .db import close_connection_pool, open_connection_pool from .imdb import refresh_user_ratings_from_imdb -from .imdb_import import import_from_file +from .imdb_import import download_datasets, import_from_file log = logging.getLogger(__name__) @@ -31,6 +31,10 @@ async def run_import_imdb_dataset(basics_path: Path, ratings_path: Path): await close_connection_pool() +async def run_download_imdb_dataset(basics_path: Path, ratings_path: Path): + await download_datasets(basics_path=basics_path, ratings_path=ratings_path) + + def getargs(): parser = argparse.ArgumentParser() commands = parser.add_subparsers(required=True) @@ -55,6 +59,25 @@ def getargs(): "--ratings", metavar="ratings_file.tsv.gz", type=Path, required=True ) + parser_download_imdb_dataset = commands.add_parser( + "download-imdb-dataset", + help="Download IMDb datasets.", + description=""" + Download IMDb datasets. + """, + ) + parser_download_imdb_dataset.add_argument( + dest="mode", + action="store_const", + const="download-imdb-dataset", + ) + parser_download_imdb_dataset.add_argument( + "--basics", metavar="basics_file.tsv.gz", type=Path, required=True + ) + parser_download_imdb_dataset.add_argument( + "--ratings", metavar="ratings_file.tsv.gz", type=Path, required=True + ) + parser_load_user_ratings_from_imdb = commands.add_parser( "load-user-ratings-from-imdb", help="Load user ratings from imdb.com.", @@ -94,6 +117,8 @@ def main(): asyncio.run(run_load_user_ratings_from_imdb()) elif args.mode == "import-imdb-dataset": asyncio.run(run_import_imdb_dataset(args.basics, args.ratings)) + elif args.mode == "download-imdb-dataset": + asyncio.run(run_download_imdb_dataset(args.basics, args.ratings)) main() diff --git a/unwind/imdb_import.py b/unwind/imdb_import.py index 3991a78..7e55b62 100644 --- a/unwind/imdb_import.py +++ b/unwind/imdb_import.py @@ -1,3 +1,4 @@ +import asyncio import csv import gzip import logging @@ -236,7 +237,23 @@ async def import_from_file(*, basics_path: Path, ratings_path: Path): await db.set_import_progress(100) -async def load_from_web(*, force: bool = False): +async def download_datasets(*, basics_path: Path, ratings_path: Path) -> None: + """Download IMDb movie database dumps. + + See https://www.imdb.com/interfaces/ and https://datasets.imdbws.com/ for + more information on the IMDb database dumps. + """ + basics_url = "https://datasets.imdbws.com/title.basics.tsv.gz" + ratings_url = "https://datasets.imdbws.com/title.ratings.tsv.gz" + + async with request.asession(): + await asyncio.gather( + request.adownload(ratings_url, to_path=ratings_path, only_if_newer=True), + request.adownload(basics_url, to_path=basics_path, only_if_newer=True), + ) + + +async def load_from_web(*, force: bool = False) -> None: """Refresh the full IMDb movie database. The latest dumps are first downloaded and then imported into the database. @@ -249,17 +266,13 @@ async def load_from_web(*, force: bool = False): await db.set_import_progress(0) try: - basics_url = "https://datasets.imdbws.com/title.basics.tsv.gz" - ratings_url = "https://datasets.imdbws.com/title.ratings.tsv.gz" ratings_file = config.datadir / "imdb/title.ratings.tsv.gz" basics_file = config.datadir / "imdb/title.basics.tsv.gz" ratings_mtime = ratings_file.stat().st_mtime if ratings_file.exists() else None bastics_mtime = basics_file.stat().st_mtime if basics_file.exists() else None - with request.session(): - request.download(ratings_url, ratings_file, only_if_newer=True) - request.download(basics_url, basics_file, only_if_newer=True) + await download_datasets(basics_path=basics_file, ratings_path=ratings_file) is_changed = ( ratings_mtime != ratings_file.stat().st_mtime