add "download-imdb-dataset" command to CLI

This commit is contained in:
ducklet 2023-02-04 01:03:12 +01:00
parent 0563d49dbc
commit 7da3a094f1
2 changed files with 45 additions and 7 deletions

View file

@ -6,7 +6,7 @@ from pathlib import Path
from . import config
from .db import close_connection_pool, open_connection_pool
from .imdb import refresh_user_ratings_from_imdb
from .imdb_import import import_from_file
from .imdb_import import download_datasets, import_from_file
log = logging.getLogger(__name__)
@ -31,6 +31,10 @@ async def run_import_imdb_dataset(basics_path: Path, ratings_path: Path):
await close_connection_pool()
async def run_download_imdb_dataset(basics_path: Path, ratings_path: Path):
    """Run the "download-imdb-dataset" CLI command.

    Both target paths are forwarded unchanged to ``download_datasets``,
    which performs the actual work.
    """
    await download_datasets(ratings_path=ratings_path, basics_path=basics_path)
def getargs():
parser = argparse.ArgumentParser()
commands = parser.add_subparsers(required=True)
@ -55,6 +59,25 @@ def getargs():
"--ratings", metavar="ratings_file.tsv.gz", type=Path, required=True
)
parser_download_imdb_dataset = commands.add_parser(
"download-imdb-dataset",
help="Download IMDb datasets.",
description="""
Download IMDb datasets.
""",
)
parser_download_imdb_dataset.add_argument(
dest="mode",
action="store_const",
const="download-imdb-dataset",
)
parser_download_imdb_dataset.add_argument(
"--basics", metavar="basics_file.tsv.gz", type=Path, required=True
)
parser_download_imdb_dataset.add_argument(
"--ratings", metavar="ratings_file.tsv.gz", type=Path, required=True
)
parser_load_user_ratings_from_imdb = commands.add_parser(
"load-user-ratings-from-imdb",
help="Load user ratings from imdb.com.",
@ -94,6 +117,8 @@ def main():
asyncio.run(run_load_user_ratings_from_imdb())
elif args.mode == "import-imdb-dataset":
asyncio.run(run_import_imdb_dataset(args.basics, args.ratings))
elif args.mode == "download-imdb-dataset":
asyncio.run(run_download_imdb_dataset(args.basics, args.ratings))
main()

View file

@ -1,3 +1,4 @@
import asyncio
import csv
import gzip
import logging
@ -236,7 +237,23 @@ async def import_from_file(*, basics_path: Path, ratings_path: Path):
await db.set_import_progress(100)
async def load_from_web(*, force: bool = False):
async def download_datasets(*, basics_path: Path, ratings_path: Path) -> None:
    """Fetch the IMDb title dumps and store them at the given paths.

    The "title.ratings" dump is written to ``ratings_path`` and the
    "title.basics" dump to ``basics_path``. Both downloads are started
    inside one ``request.asession()`` context and awaited together with
    ``asyncio.gather``.

    See https://www.imdb.com/interfaces/ and https://datasets.imdbws.com/ for
    more information on the IMDb database dumps.
    """
    # (source URL, destination) pairs; ratings first, matching the order in
    # which the downloads are handed to ``asyncio.gather``.
    downloads = (
        ("https://datasets.imdbws.com/title.ratings.tsv.gz", ratings_path),
        ("https://datasets.imdbws.com/title.basics.tsv.gz", basics_path),
    )

    async with request.asession():
        # NOTE(review): ``only_if_newer=True`` presumably skips files that are
        # already up to date -- confirm against ``request.adownload``.
        pending = [
            request.adownload(url, to_path=destination, only_if_newer=True)
            for url, destination in downloads
        ]
        await asyncio.gather(*pending)
async def load_from_web(*, force: bool = False) -> None:
"""Refresh the full IMDb movie database.
The latest dumps are first downloaded and then imported into the database.
@ -249,17 +266,13 @@ async def load_from_web(*, force: bool = False):
await db.set_import_progress(0)
try:
basics_url = "https://datasets.imdbws.com/title.basics.tsv.gz"
ratings_url = "https://datasets.imdbws.com/title.ratings.tsv.gz"
ratings_file = config.datadir / "imdb/title.ratings.tsv.gz"
basics_file = config.datadir / "imdb/title.basics.tsv.gz"
ratings_mtime = ratings_file.stat().st_mtime if ratings_file.exists() else None
bastics_mtime = basics_file.stat().st_mtime if basics_file.exists() else None
with request.session():
request.download(ratings_url, ratings_file, only_if_newer=True)
request.download(basics_url, basics_file, only_if_newer=True)
await download_datasets(basics_path=basics_file, ratings_path=ratings_file)
is_changed = (
ratings_mtime != ratings_file.stat().st_mtime