From 954562881a40c3f5095f0ce0b270c2e01511f5e4 Mon Sep 17 00:00:00 2001
From: ducklet
Date: Sun, 21 Aug 2022 14:37:25 +0200
Subject: [PATCH] add `rclone` importer

---
 metadex/__main__.py | 60 ++++++++++++++++++++++++++++++---
 metadex/db.py       |  1 +
 metadex/metadex.py  | 81 +++++++++++++++++++++++++++++++++++++++++++++
 metadex/utils.py    |  1 -
 4 files changed, 137 insertions(+), 6 deletions(-)

diff --git a/metadex/__main__.py b/metadex/__main__.py
index 38091b6..91bb944 100644
--- a/metadex/__main__.py
+++ b/metadex/__main__.py
@@ -63,7 +63,7 @@ def getargs():
 
     # Command: scan
 
-    with command_parser("scan", help="scan a local file system") as subparser:
+    with command_parser("scan", help="Import from a local file system.") as subparser:
 
         subparser.add_argument(
             "basedir",
@@ -89,7 +89,7 @@ def getargs():
 
     with command_parser(
         "ingest-ls",
-        help="ingest extra data",
+        help="Import from `ls -lR`.",
         description="When ingesting data from an external source, the hostname will not be set automatically.",
     ) as subparser:
 
@@ -113,7 +113,9 @@ def getargs():
 
     # Command: ingest-db
 
-    with command_parser("ingest-db") as subparser:
+    with command_parser(
+        "ingest-db", help="Import from a metadex.sqlite file."
+    ) as subparser:
 
         subparser.add_argument(
             "infile",
@@ -128,9 +130,35 @@ def getargs():
             help="map a source host:path to any other destination while importing",
         )
 
+    # Command: ingest-rclone-json
+
+    with command_parser(
+        "ingest-rclone-json", help="Import from `rclone lsjson`."
+    ) as subparser:
+
+        subparser.add_argument(
+            "infile",
+            nargs="?",
+            type=argparse.FileType(),
+            default=sys.stdin,
+            help="output from `rclone lsjson`",
+        )
+        subparser.add_argument(
+            "--remote-base",
+            nargs=1,
+            required=True,
+            type=str,
+            help="base path on the remote; prepended to every path in the listing",
+        )
+        subparser.add_argument(
+            "--remove-missing",
+            action="store_true",
+            help="Remove files not listed in the infile.",
+        )
+
     # Command: rm
 
-    with command_parser("rm") as subparser:
+    with command_parser("rm", help="Remove files from the index.") as subparser:
         subparser.add_argument(
             "files",
             type=str,
@@ -146,7 +174,7 @@ def getargs():
 
     # Command: ls
 
-    with command_parser("ls") as subparser:
+    with command_parser("ls", help="Search indexed files.") as subparser:
         subparser.add_argument(
             "file",
             type=str,
@@ -175,6 +203,9 @@ def getargs():
         args.infile = utils.abspath(args.infile)
     elif args.mode == "ingest-ls":
         config.hostname = None
+    elif args.mode == "ingest-rclone-json":
+        config.hostname = None
+        args.remote_base = args.remote_base[0]
     elif args.mode is None:
         parser.print_help()
         parser.exit(1, "Error: No command selected.")
@@ -182,6 +213,25 @@ def getargs():
     return args
 
 
+@command("ingest-rclone-json")
+def cmd_ingest_rclone_json(args):
+    """Import the file listing produced by `rclone lsjson` into the index."""
+    metadex.init(args.db)
+
+    log.info("Ingesting rclone JSON file %a ...", args.infile.name)
+    context = metadex.ingest_rclone_json(
+        args.infile,
+        ignore_file=args.ignore_from,
+        remote_base=args.remote_base,
+        remove_missing=args.remove_missing,
+    )
+
+    metadex.close()
+
+    msg = f"Checked {context.seen} files, {context.added} new, {context.changed} changed, {context.ignored} ignored, {context.removed} removed"
+    print(msg.ljust(metadex._terminal_width))
+
+
 @command("ingest-ls")
 def cmd_ingest_ls(args):
     metadex.init(args.db)
diff --git a/metadex/db.py b/metadex/db.py
index 90c6416..56e7915 100644
--- a/metadex/db.py
+++ b/metadex/db.py
@@ -373,6 +373,7 @@ def upsert_if_changed(conn: Connection, new_data: dict):
         return "unchanged"
 
     log.info("File changed: %a:%a", new_data["hostname"], new_data["location"])
+    log.debug("New data: %a Previous: %a", new_data, row._mapping)
 
     # changelog = []
     # for f in ("stat_bytes", "stat_modified", "stat_type"):
diff --git a/metadex/metadex.py b/metadex/metadex.py
index e5595e6..86a1649 100644
--- a/metadex/metadex.py
+++ b/metadex/metadex.py
@@ -1,3 +1,4 @@
+import json
 import logging
 import os
 import re
@@ -5,6 +6,7 @@ import sys
 import time
 from collections import deque
 from dataclasses import dataclass
+from datetime import datetime, timezone
 from pathlib import Path
 from shutil import get_terminal_size
 from typing import Iterable, Literal, TextIO
@@ -356,6 +358,85 @@ def ingest_db_file(
     return context
 
 
+def _naive_fromisoformat(string, /):
+    """Parse an ISO 8601 timestamp into a naive datetime in UTC.
+
+    `rclone lsjson` reports up to nanosecond precision, which
+    `datetime.fromisoformat` rejects before Python 3.11, so the fractional
+    part is normalized to exactly six digits first.
+    """
+    if string.endswith("Z"):
+        string = string[:-1] + "+00:00"
+    string = re.sub(
+        r"\.(\d+)", lambda m: "." + m[1][:6].ljust(6, "0"), string, count=1
+    )
+    return (
+        datetime.fromisoformat(string).astimezone(tz=timezone.utc).replace(tzinfo=None)
+    )
+
+
+def _parse_rclone_json(file: TextIO, *, remote_base: str) -> Iterable[dict]:
+    """Yield one record dict per entry of an `rclone lsjson` listing.
+
+    Each entry's `Path` is joined onto `/<remote_base>`.  Entries whose path
+    is `..` are skipped, and a reported size of -1 is stored as 0.
+    """
+    remote_path = Path("/") / remote_base
+    for item in json.load(file):
+        # {"Path":"/foo/bar","Name":"bar","Size":-1,"MimeType":"inode/directory","ModTime":"2022-08-11T22:44:35+02:00","IsDir":true},
+        if item["Path"] == "..":
+            continue
+        d = dict(
+            location=str(remote_path / item["Path"]),
+            hostname=config.hostname,
+            stat_bytes=size if (size := item["Size"]) != -1 else 0,
+            stat_modified=_naive_fromisoformat(item["ModTime"]),
+            stat_type="d" if item["IsDir"] else "f",
+        )
+        yield d
+
+
+def ingest_rclone_json(
+    file: TextIO,
+    *,
+    ignore_file: Path,
+    remote_base: str,
+    remove_missing: bool = False,
+) -> _LogContext:
+    """Upsert the entries of an `rclone lsjson` listing into the database.
+
+    Entries matched by the ignore file are counted and skipped.  Returns the
+    counters collected while processing the listing.
+    """
+    if remove_missing:
+        # Make the no-op visible instead of silently ignoring the flag.
+        log.warning("remove_missing is not implemented for rclone imports yet.")
+
+    is_ignored = ignore.parse(ignore_file)
+
+    context = _LogContext()
+
+    with db.transaction() as conn:
+
+        for d in _parse_rclone_json(file, remote_base=remote_base):
+
+            context.seen += 1
+
+            _log_context(d["location"], context)
+
+            if is_ignored(d["location"]):
+                log.debug("Skipping ignored entry: %a:%a", d["hostname"], d["location"])
+                context.ignored += 1
+                continue
+
+            if (action := db.upsert_if_changed(conn, d)) == "added":
+                context.added += 1
+            elif action == "changed":
+                context.changed += 1
+
+    return context
+
+
 def ingest_ls(
     file: TextIO,
     *,
diff --git a/metadex/utils.py b/metadex/utils.py
index 8988061..9c16d89 100644
--- a/metadex/utils.py
+++ b/metadex/utils.py
@@ -1,7 +1,6 @@
 import os
 from pathlib import Path
 
-
 _size_quantifiers = "BKMGTP"
 _size_map: "dict[str, int]" = {
     _size_quantifiers[i]: 2 ** (10 * i) for i in range(len(_size_quantifiers))