add "download-imdb-dataset" command to CLI
This commit is contained in:
parent
0563d49dbc
commit
7da3a094f1
2 changed files with 45 additions and 7 deletions
|
|
@ -6,7 +6,7 @@ from pathlib import Path
|
|||
from . import config
|
||||
from .db import close_connection_pool, open_connection_pool
|
||||
from .imdb import refresh_user_ratings_from_imdb
|
||||
from .imdb_import import import_from_file
|
||||
from .imdb_import import download_datasets, import_from_file
|
||||
|
||||
log = logging.getLogger(__name__)
|
||||
|
||||
|
|
@ -31,6 +31,10 @@ async def run_import_imdb_dataset(basics_path: Path, ratings_path: Path):
|
|||
await close_connection_pool()
|
||||
|
||||
|
||||
async def run_download_imdb_dataset(basics_path: Path, ratings_path: Path):
    """Run the "download-imdb-dataset" CLI command.

    Forwards both destination paths, as keyword arguments, to
    ``download_datasets`` and waits for it to finish.

    Args:
        basics_path: Destination path handed on for the basics dump.
        ratings_path: Destination path handed on for the ratings dump.
    """
    destinations = {"basics_path": basics_path, "ratings_path": ratings_path}
    await download_datasets(**destinations)
|
||||
|
||||
|
||||
def getargs():
|
||||
parser = argparse.ArgumentParser()
|
||||
commands = parser.add_subparsers(required=True)
|
||||
|
|
@ -55,6 +59,25 @@ def getargs():
|
|||
"--ratings", metavar="ratings_file.tsv.gz", type=Path, required=True
|
||||
)
|
||||
|
||||
parser_download_imdb_dataset = commands.add_parser(
|
||||
"download-imdb-dataset",
|
||||
help="Download IMDb datasets.",
|
||||
description="""
|
||||
Download IMDb datasets.
|
||||
""",
|
||||
)
|
||||
parser_download_imdb_dataset.add_argument(
|
||||
dest="mode",
|
||||
action="store_const",
|
||||
const="download-imdb-dataset",
|
||||
)
|
||||
parser_download_imdb_dataset.add_argument(
|
||||
"--basics", metavar="basics_file.tsv.gz", type=Path, required=True
|
||||
)
|
||||
parser_download_imdb_dataset.add_argument(
|
||||
"--ratings", metavar="ratings_file.tsv.gz", type=Path, required=True
|
||||
)
|
||||
|
||||
parser_load_user_ratings_from_imdb = commands.add_parser(
|
||||
"load-user-ratings-from-imdb",
|
||||
help="Load user ratings from imdb.com.",
|
||||
|
|
@ -94,6 +117,8 @@ def main():
|
|||
asyncio.run(run_load_user_ratings_from_imdb())
|
||||
elif args.mode == "import-imdb-dataset":
|
||||
asyncio.run(run_import_imdb_dataset(args.basics, args.ratings))
|
||||
elif args.mode == "download-imdb-dataset":
|
||||
asyncio.run(run_download_imdb_dataset(args.basics, args.ratings))
|
||||
|
||||
|
||||
main()
|
||||
|
|
|
|||
|
|
@ -1,3 +1,4 @@
|
|||
import asyncio
|
||||
import csv
|
||||
import gzip
|
||||
import logging
|
||||
|
|
@ -236,7 +237,23 @@ async def import_from_file(*, basics_path: Path, ratings_path: Path):
|
|||
await db.set_import_progress(100)
|
||||
|
||||
|
||||
async def load_from_web(*, force: bool = False):
|
||||
async def download_datasets(*, basics_path: Path, ratings_path: Path) -> None:
    """Fetch the IMDb title dumps and store them at the given paths.

    Both downloads are started inside one shared ``request.asession()``
    context and awaited together via ``asyncio.gather``. Each download is
    requested with ``only_if_newer=True``.
    NOTE(review): presumably this skips files that have not changed
    upstream; confirm against the ``request`` helper.

    See https://www.imdb.com/interfaces/ and https://datasets.imdbws.com/ for
    more information on the IMDb database dumps.

    Args:
        basics_path: Where to write the ``title.basics`` dump.
        ratings_path: Where to write the ``title.ratings`` dump.
    """
    # (source URL, destination) pairs; ratings is scheduled before basics.
    targets = (
        ("https://datasets.imdbws.com/title.ratings.tsv.gz", ratings_path),
        ("https://datasets.imdbws.com/title.basics.tsv.gz", basics_path),
    )

    async with request.asession():
        pending = [
            request.adownload(url, to_path=destination, only_if_newer=True)
            for url, destination in targets
        ]
        await asyncio.gather(*pending)
|
||||
|
||||
|
||||
async def load_from_web(*, force: bool = False) -> None:
|
||||
"""Refresh the full IMDb movie database.
|
||||
|
||||
The latest dumps are first downloaded and then imported into the database.
|
||||
|
|
@ -249,17 +266,13 @@ async def load_from_web(*, force: bool = False):
|
|||
await db.set_import_progress(0)
|
||||
|
||||
try:
|
||||
basics_url = "https://datasets.imdbws.com/title.basics.tsv.gz"
|
||||
ratings_url = "https://datasets.imdbws.com/title.ratings.tsv.gz"
|
||||
ratings_file = config.datadir / "imdb/title.ratings.tsv.gz"
|
||||
basics_file = config.datadir / "imdb/title.basics.tsv.gz"
|
||||
|
||||
ratings_mtime = ratings_file.stat().st_mtime if ratings_file.exists() else None
|
||||
bastics_mtime = basics_file.stat().st_mtime if basics_file.exists() else None
|
||||
|
||||
with request.session():
|
||||
request.download(ratings_url, ratings_file, only_if_newer=True)
|
||||
request.download(basics_url, basics_file, only_if_newer=True)
|
||||
await download_datasets(basics_path=basics_file, ratings_path=ratings_file)
|
||||
|
||||
is_changed = (
|
||||
ratings_mtime != ratings_file.stat().st_mtime
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue