add "download-imdb-dataset" command to CLI

This commit is contained in:
ducklet 2023-02-04 01:03:12 +01:00
parent 0563d49dbc
commit 7da3a094f1
2 changed files with 45 additions and 7 deletions

View file

@ -6,7 +6,7 @@ from pathlib import Path
from . import config from . import config
from .db import close_connection_pool, open_connection_pool from .db import close_connection_pool, open_connection_pool
from .imdb import refresh_user_ratings_from_imdb from .imdb import refresh_user_ratings_from_imdb
from .imdb_import import import_from_file from .imdb_import import download_datasets, import_from_file
log = logging.getLogger(__name__) log = logging.getLogger(__name__)
@ -31,6 +31,10 @@ async def run_import_imdb_dataset(basics_path: Path, ratings_path: Path):
await close_connection_pool() await close_connection_pool()
async def run_download_imdb_dataset(basics_path: Path, ratings_path: Path):
    """Download the IMDb dataset dumps to the given file paths.

    Coroutine behind the "download-imdb-dataset" CLI command; it forwards
    both target paths unchanged to `download_datasets`.
    """
    await download_datasets(ratings_path=ratings_path, basics_path=basics_path)
def getargs(): def getargs():
parser = argparse.ArgumentParser() parser = argparse.ArgumentParser()
commands = parser.add_subparsers(required=True) commands = parser.add_subparsers(required=True)
@ -55,6 +59,25 @@ def getargs():
"--ratings", metavar="ratings_file.tsv.gz", type=Path, required=True "--ratings", metavar="ratings_file.tsv.gz", type=Path, required=True
) )
parser_download_imdb_dataset = commands.add_parser(
"download-imdb-dataset",
help="Download IMDb datasets.",
description="""
Download IMDb datasets.
""",
)
parser_download_imdb_dataset.add_argument(
dest="mode",
action="store_const",
const="download-imdb-dataset",
)
parser_download_imdb_dataset.add_argument(
"--basics", metavar="basics_file.tsv.gz", type=Path, required=True
)
parser_download_imdb_dataset.add_argument(
"--ratings", metavar="ratings_file.tsv.gz", type=Path, required=True
)
parser_load_user_ratings_from_imdb = commands.add_parser( parser_load_user_ratings_from_imdb = commands.add_parser(
"load-user-ratings-from-imdb", "load-user-ratings-from-imdb",
help="Load user ratings from imdb.com.", help="Load user ratings from imdb.com.",
@ -94,6 +117,8 @@ def main():
asyncio.run(run_load_user_ratings_from_imdb()) asyncio.run(run_load_user_ratings_from_imdb())
elif args.mode == "import-imdb-dataset": elif args.mode == "import-imdb-dataset":
asyncio.run(run_import_imdb_dataset(args.basics, args.ratings)) asyncio.run(run_import_imdb_dataset(args.basics, args.ratings))
elif args.mode == "download-imdb-dataset":
asyncio.run(run_download_imdb_dataset(args.basics, args.ratings))
main() main()

View file

@ -1,3 +1,4 @@
import asyncio
import csv import csv
import gzip import gzip
import logging import logging
@ -236,7 +237,23 @@ async def import_from_file(*, basics_path: Path, ratings_path: Path):
await db.set_import_progress(100) await db.set_import_progress(100)
async def download_datasets(*, basics_path: Path, ratings_path: Path) -> None:
    """Fetch the IMDb title dumps and store them at the given paths.

    The ratings and the basics dump are requested inside one
    `request.asession()` block and awaited together; every download is
    passed `only_if_newer=True`.

    Background on the IMDb database dumps is available at
    https://www.imdb.com/interfaces/ and https://datasets.imdbws.com/.
    """
    # (source URL, destination path) pairs, listed in scheduling order.
    downloads = (
        ("https://datasets.imdbws.com/title.ratings.tsv.gz", ratings_path),
        ("https://datasets.imdbws.com/title.basics.tsv.gz", basics_path),
    )

    async with request.asession():
        await asyncio.gather(
            *(
                request.adownload(url, to_path=target, only_if_newer=True)
                for url, target in downloads
            )
        )
async def load_from_web(*, force: bool = False) -> None:
"""Refresh the full IMDb movie database. """Refresh the full IMDb movie database.
The latest dumps are first downloaded and then imported into the database. The latest dumps are first downloaded and then imported into the database.
@ -249,17 +266,13 @@ async def load_from_web(*, force: bool = False):
await db.set_import_progress(0) await db.set_import_progress(0)
try: try:
basics_url = "https://datasets.imdbws.com/title.basics.tsv.gz"
ratings_url = "https://datasets.imdbws.com/title.ratings.tsv.gz"
ratings_file = config.datadir / "imdb/title.ratings.tsv.gz" ratings_file = config.datadir / "imdb/title.ratings.tsv.gz"
basics_file = config.datadir / "imdb/title.basics.tsv.gz" basics_file = config.datadir / "imdb/title.basics.tsv.gz"
ratings_mtime = ratings_file.stat().st_mtime if ratings_file.exists() else None ratings_mtime = ratings_file.stat().st_mtime if ratings_file.exists() else None
bastics_mtime = basics_file.stat().st_mtime if basics_file.exists() else None bastics_mtime = basics_file.stat().st_mtime if basics_file.exists() else None
with request.session(): await download_datasets(basics_path=basics_file, ratings_path=ratings_file)
request.download(ratings_url, ratings_file, only_if_newer=True)
request.download(basics_url, basics_file, only_if_newer=True)
is_changed = ( is_changed = (
ratings_mtime != ratings_file.stat().st_mtime ratings_mtime != ratings_file.stat().st_mtime