add rclone importer

This commit is contained in:
ducklet 2022-08-21 14:37:25 +02:00
parent af1236bc7e
commit 954562881a
4 changed files with 111 additions and 6 deletions

View file

@ -63,7 +63,7 @@ def getargs():
# Command: scan
with command_parser("scan", help="scan a local file system") as subparser:
with command_parser("scan", help="Import from a local file system.") as subparser:
subparser.add_argument(
"basedir",
@ -89,7 +89,7 @@ def getargs():
with command_parser(
"ingest-ls",
help="ingest extra data",
help="Import from `ls -lR`.",
description="When ingesting data from an external source, the hostname will not be set automatically.",
) as subparser:
@ -113,7 +113,9 @@ def getargs():
# Command: ingest-db
with command_parser("ingest-db") as subparser:
with command_parser(
"ingest-db", help="Import from a metadex.sqlite file."
) as subparser:
subparser.add_argument(
"infile",
@ -128,9 +130,33 @@ def getargs():
help="map a source host:path to any other destination while importing",
)
# Command: ingest-rclone-json
with command_parser(
    "ingest-rclone-json", help="Import from `rclone lsjson`."
) as subparser:
    # The listing to import; falls back to stdin when no file is given.
    subparser.add_argument(
        "infile",
        nargs="?",
        type=argparse.FileType(),
        default=sys.stdin,
        help="output from `rclone lsjson`",
    )
    # nargs=1 makes argparse store a one-element list; getargs() replaces
    # it with its single element after parsing.
    subparser.add_argument(
        "--remote-base",
        nargs=1,
        required=True,
        type=str,
        help="base path of the listed remote; it is prepended to every path from the infile",
    )
    subparser.add_argument(
        "--remove-missing",
        action="store_true",
        help="Remove files not listed in the infile.",
    )
# Command: rm
with command_parser("rm") as subparser:
with command_parser("rm", help="Remove files from the index.") as subparser:
subparser.add_argument(
"files",
type=str,
@ -146,7 +172,7 @@ def getargs():
# Command: ls
with command_parser("ls") as subparser:
with command_parser("ls", help="Search indexed files.") as subparser:
subparser.add_argument(
"file",
type=str,
@ -175,6 +201,9 @@ def getargs():
args.infile = utils.abspath(args.infile)
elif args.mode == "ingest-ls":
config.hostname = None
elif args.mode == "ingest-rclone-json":
config.hostname = None
args.remote_base = args.remote_base[0]
elif args.mode is None:
parser.print_help()
parser.exit(1, "Error: No command selected.")
@ -182,6 +211,24 @@ def getargs():
return args
@command("ingest-rclone-json")
def cmd_ingest_rclone_json(args):
    """Run the `ingest-rclone-json` command.

    Initializes metadex with ``args.db``, ingests the `rclone lsjson`
    listing from ``args.infile`` and prints a one-line summary of the
    counters collected during the import.

    Reads ``args.db``, ``args.infile``, ``args.ignore_from``,
    ``args.remote_base`` and ``args.remove_missing``.
    """
    metadex.init(args.db)

    log.info("Ingesting rclone JSON file %a ...", args.infile.name)

    try:
        context = metadex.ingest_rclone_json(
            args.infile,
            ignore_file=args.ignore_from,
            remote_base=args.remote_base,
            remove_missing=args.remove_missing,
        )
    finally:
        # Always release metadex, even when the import fails part-way
        # (e.g. on malformed JSON in the infile).
        metadex.close()

    msg = f"Checked {context.seen} files, {context.added} new, {context.changed} changed, {context.ignored} ignored, {context.removed} removed"
    # Pad to the terminal width; presumably to overwrite a previous
    # progress line -- confirm against metadex's progress output.
    print(msg.ljust(metadex._terminal_width))
@command("ingest-ls")
def cmd_ingest_ls(args):
metadex.init(args.db)

View file

@ -373,6 +373,7 @@ def upsert_if_changed(conn: Connection, new_data: dict):
return "unchanged"
log.info("File changed: %a:%a", new_data["hostname"], new_data["location"])
log.debug("New data: %a Previous: %a", new_data, row._mapping)
# changelog = []
# for f in ("stat_bytes", "stat_modified", "stat_type"):

View file

@ -1,3 +1,4 @@
import json
import logging
import os
import re
@ -5,6 +6,7 @@ import sys
import time
from collections import deque
from dataclasses import dataclass
from datetime import datetime, timezone
from pathlib import Path
from shutil import get_terminal_size
from typing import Iterable, Literal, TextIO
@ -356,6 +358,62 @@ def ingest_db_file(
return context
def _naive_fromisoformat(string, /):
    """Parse an ISO 8601 / RFC 3339 timestamp into a naive UTC datetime.

    A trailing ``Z`` is accepted as a UTC designator. Fractional seconds
    of any length are accepted: digits beyond microsecond precision are
    truncated and shorter fractions are zero-padded. This matters because
    `rclone lsjson` emits up to nanosecond precision, while
    ``datetime.fromisoformat`` before Python 3.11 only accepts exactly
    3 or 6 fractional digits.

    The value is converted to UTC and returned with ``tzinfo`` removed.
    A string without any UTC offset is interpreted as local time by
    ``astimezone``.

    Raises:
        ValueError: if the string is not a parseable timestamp.
    """
    if string.endswith("Z"):
        string = string[:-1] + "+00:00"

    # Normalize the fraction that follows the seconds field to exactly six
    # digits; the UTC offset never contains a dot, so one match suffices.
    string = re.sub(
        r"(?<=:\d\d)\.(\d+)",
        lambda match: "." + match[1][:6].ljust(6, "0"),
        string,
        count=1,
    )

    return (
        datetime.fromisoformat(string).astimezone(tz=timezone.utc).replace(tzinfo=None)
    )
def _parse_rclone_json(file: TextIO, *, remote_base: str) -> Iterable[dict]:
    """Yield one file record per entry of an `rclone lsjson` listing.

    The whole JSON document is loaded from *file* up front. Each entry's
    ``Path`` is joined onto ``/`` + *remote_base* to form the record's
    ``location``; entries whose ``Path`` is ``..`` are skipped.

    Every yielded dict has the keys ``location``, ``hostname``,
    ``stat_bytes``, ``stat_modified`` and ``stat_type``.
    """
    base = Path("/") / remote_base

    # Example entry:
    # {"Path":"/foo/bar","Name":"bar","Size":-1,"MimeType":"inode/directory","ModTime":"2022-08-11T22:44:35+02:00","IsDir":true},
    for entry in json.load(file):
        path = entry["Path"]
        if path == "..":
            continue

        location = str(base / path)
        size = entry["Size"]

        yield {
            "location": location,
            "hostname": config.hostname,
            # A size of -1 (as in the directory example above) is stored as 0.
            "stat_bytes": 0 if size == -1 else size,
            "stat_modified": _naive_fromisoformat(entry["ModTime"]),
            "stat_type": "d" if entry["IsDir"] else "f",
        }
def ingest_rclone_json(
    file: TextIO,
    *,
    ignore_file: Path,
    remote_base: str,
    remove_missing: bool = False,
) -> _LogContext:
    """Import the entries of an `rclone lsjson` listing into the database.

    Args:
        file: open text file containing the JSON output of `rclone lsjson`.
        ignore_file: file with ignore rules, parsed via ``ignore.parse``;
            entries whose location matches are counted as ignored and
            not written.
        remote_base: base path that is prepended to every listed path.
        remove_missing: requested removal of entries that are not part of
            the listing. NOTE: not implemented for this importer; a
            warning is logged and no entry is removed.

    Returns:
        The ``_LogContext`` holding the ``seen``, ``ignored``, ``added``
        and ``changed`` counters of this run.
    """
    if remove_missing:
        # Make the gap visible instead of silently ignoring the request.
        log.warning(
            "remove_missing is not supported when ingesting rclone JSON; "
            "no entries will be removed."
        )

    is_ignored = ignore.parse(ignore_file)

    context = _LogContext()

    # All upserts of one listing share a single transaction.
    with db.transaction() as conn:
        for d in _parse_rclone_json(file, remote_base=remote_base):
            context.seen += 1

            _log_context(d["location"], context)

            if is_ignored(d["location"]):
                log.debug("Skipping ignored entry: %a:%a", d["hostname"], d["location"])
                context.ignored += 1
                continue

            # upsert_if_changed reports what it did; unchanged entries
            # are not counted separately.
            if (action := db.upsert_if_changed(conn, d)) == "added":
                context.added += 1
            elif action == "changed":
                context.changed += 1

    return context
def ingest_ls(
file: TextIO,
*,

View file

@ -1,7 +1,6 @@
import os
from pathlib import Path
_size_quantifiers = "BKMGTP"
_size_map: "dict[str, int]" = {
_size_quantifiers[i]: 2 ** (10 * i) for i in range(len(_size_quantifiers))