add rclone importer

This commit is contained in:
ducklet 2022-08-21 14:37:25 +02:00
parent af1236bc7e
commit 954562881a
4 changed files with 111 additions and 6 deletions

View file

@ -63,7 +63,7 @@ def getargs():
# Command: scan # Command: scan
with command_parser("scan", help="scan a local file system") as subparser: with command_parser("scan", help="Import from a local file system.") as subparser:
subparser.add_argument( subparser.add_argument(
"basedir", "basedir",
@ -89,7 +89,7 @@ def getargs():
with command_parser( with command_parser(
"ingest-ls", "ingest-ls",
help="ingest extra data", help="Import from `ls -lR`.",
description="When ingesting data from an external source, the hostname will not be set automatically.", description="When ingesting data from an external source, the hostname will not be set automatically.",
) as subparser: ) as subparser:
@ -113,7 +113,9 @@ def getargs():
# Command: ingest-db # Command: ingest-db
with command_parser("ingest-db") as subparser: with command_parser(
"ingest-db", help="Import from a metadex.sqlite file."
) as subparser:
subparser.add_argument( subparser.add_argument(
"infile", "infile",
@ -128,9 +130,33 @@ def getargs():
help="map a source host:path to any other destination while importing", help="map a source host:path to any other destination while importing",
) )
# Command: ingest-rclone-json
with command_parser(
    "ingest-rclone-json", help="Import from `rclone lsjson`."
) as subparser:
    # The listing is read from the given file, or from stdin when omitted.
    subparser.add_argument(
        "infile",
        nargs="?",
        type=argparse.FileType(),
        default=sys.stdin,
        help="output from `rclone lsjson`",
    )
    # nargs=1 stores a one-element list; getargs() unwraps it after parsing.
    subparser.add_argument(
        "--remote-base",
        nargs=1,
        required=True,
        type=str,
        help="base path on the remote; prepended to every path listed in the infile",
    )
    subparser.add_argument(
        "--remove-missing",
        action="store_true",
        help="Remove files not listed in the infile.",
    )
# Command: rm # Command: rm
with command_parser("rm") as subparser: with command_parser("rm", help="Remove files from the index.") as subparser:
subparser.add_argument( subparser.add_argument(
"files", "files",
type=str, type=str,
@ -146,7 +172,7 @@ def getargs():
# Command: ls # Command: ls
with command_parser("ls") as subparser: with command_parser("ls", help="Search indexed files.") as subparser:
subparser.add_argument( subparser.add_argument(
"file", "file",
type=str, type=str,
@ -175,6 +201,9 @@ def getargs():
args.infile = utils.abspath(args.infile) args.infile = utils.abspath(args.infile)
elif args.mode == "ingest-ls": elif args.mode == "ingest-ls":
config.hostname = None config.hostname = None
elif args.mode == "ingest-rclone-json":
config.hostname = None
args.remote_base = args.remote_base[0]
elif args.mode is None: elif args.mode is None:
parser.print_help() parser.print_help()
parser.exit(1, "Error: No command selected.") parser.exit(1, "Error: No command selected.")
@ -182,6 +211,24 @@ def getargs():
return args return args
@command("ingest-rclone-json")
def cmd_ingest_rclone_json(args):
    """Run the `ingest-rclone-json` command and print a one-line summary.

    Reads ``args.db``, ``args.infile``, ``args.ignore_from``,
    ``args.remote_base`` and ``args.remove_missing`` from the parsed
    command line arguments.
    """
    metadex.init(args.db)

    try:
        log.info("Ingesting rclone JSON file %a ...", args.infile.name)

        context = metadex.ingest_rclone_json(
            args.infile,
            ignore_file=args.ignore_from,
            remote_base=args.remote_base,
            remove_missing=args.remove_missing,
        )
    finally:
        # Always release what init() set up, even when the ingest fails.
        metadex.close()

    msg = f"Checked {context.seen} files, {context.added} new, {context.changed} changed, {context.ignored} ignored, {context.removed} removed"
    # Pad the summary to the terminal width used by metadex.
    print(msg.ljust(metadex._terminal_width))
@command("ingest-ls") @command("ingest-ls")
def cmd_ingest_ls(args): def cmd_ingest_ls(args):
metadex.init(args.db) metadex.init(args.db)

View file

@ -373,6 +373,7 @@ def upsert_if_changed(conn: Connection, new_data: dict):
return "unchanged" return "unchanged"
log.info("File changed: %a:%a", new_data["hostname"], new_data["location"]) log.info("File changed: %a:%a", new_data["hostname"], new_data["location"])
log.debug("New data: %a Previous: %a", new_data, row._mapping)
# changelog = [] # changelog = []
# for f in ("stat_bytes", "stat_modified", "stat_type"): # for f in ("stat_bytes", "stat_modified", "stat_type"):

View file

@ -1,3 +1,4 @@
import json
import logging import logging
import os import os
import re import re
@ -5,6 +6,7 @@ import sys
import time import time
from collections import deque from collections import deque
from dataclasses import dataclass from dataclasses import dataclass
from datetime import datetime, timezone
from pathlib import Path from pathlib import Path
from shutil import get_terminal_size from shutil import get_terminal_size
from typing import Iterable, Literal, TextIO from typing import Iterable, Literal, TextIO
@ -356,6 +358,62 @@ def ingest_db_file(
return context return context
def _naive_fromisoformat(string, /):
if string.endswith("Z"):
string = string[:-1] + "+00:00"
return (
datetime.fromisoformat(string).astimezone(tz=timezone.utc).replace(tzinfo=None)
)
def _parse_rclone_json(file: TextIO, *, remote_base: str) -> Iterable[dict]:
    """Yield one record dict per entry of an `rclone lsjson` listing.

    The whole JSON document is loaded from *file* first. Each yielded dict
    has the keys ``location``, ``hostname``, ``stat_bytes``,
    ``stat_modified`` and ``stat_type``. Entries whose ``Path`` is ``..``
    are skipped.

    ``location`` is the entry's ``Path`` joined onto ``/<remote_base>``.
    NOTE(review): with pathlib join semantics an absolute ``Path`` would
    replace the base entirely -- confirm the listing uses relative paths.
    """
    base = Path("/") / remote_base

    entries = json.load(file)
    for entry in entries:
        # Example entry:
        # {"Path":"/foo/bar","Name":"bar","Size":-1,"MimeType":"inode/directory","ModTime":"2022-08-11T22:44:35+02:00","IsDir":true},
        if entry["Path"] == "..":
            continue

        location = str(base / entry["Path"])
        hostname = config.hostname

        # A size of -1 (as in the directory example above) is stored as 0.
        raw_size = entry["Size"]
        size = 0 if raw_size == -1 else raw_size

        modified = _naive_fromisoformat(entry["ModTime"])
        kind = "d" if entry["IsDir"] else "f"

        yield {
            "location": location,
            "hostname": hostname,
            "stat_bytes": size,
            "stat_modified": modified,
            "stat_type": kind,
        }
def ingest_rclone_json(
    file: TextIO,
    *,
    ignore_file: Path,
    remote_base: str,
    remove_missing: bool = False,
) -> _LogContext:
    """Import the entries of an `rclone lsjson` listing into the database.

    Every entry parsed from *file* is counted as seen. Entries matched by
    the rules parsed from *ignore_file* are counted as ignored and skipped;
    all others are upserted inside a single database transaction and
    counted as added or changed according to the upsert result.

    *remote_base* is passed on to the parser, which uses it as the base
    for every listed path.

    *remove_missing* is accepted for interface parity but is not
    implemented for this importer: nothing is removed, and a warning is
    logged when it is requested.

    Returns the context object holding the counters.
    """
    if remove_missing:
        # Do not silently ignore the request: the caller would otherwise
        # assume that entries missing from the listing were removed.
        log.warning(
            "remove_missing is not supported for rclone JSON input; "
            "no entries will be removed."
        )

    is_ignored = ignore.parse(ignore_file)

    context = _LogContext()

    with db.transaction() as conn:

        for d in _parse_rclone_json(file, remote_base=remote_base):
            context.seen += 1

            _log_context(d["location"], context)

            if is_ignored(d["location"]):
                log.debug("Skipping ignored entry: %a:%a", d["hostname"], d["location"])
                context.ignored += 1
                continue

            if (action := db.upsert_if_changed(conn, d)) == "added":
                context.added += 1
            elif action == "changed":
                context.changed += 1

    return context
def ingest_ls( def ingest_ls(
file: TextIO, file: TextIO,
*, *,

View file

@ -1,7 +1,6 @@
import os import os
from pathlib import Path from pathlib import Path
_size_quantifiers = "BKMGTP" _size_quantifiers = "BKMGTP"
_size_map: "dict[str, int]" = { _size_map: "dict[str, int]" = {
_size_quantifiers[i]: 2 ** (10 * i) for i in range(len(_size_quantifiers)) _size_quantifiers[i]: 2 ** (10 * i) for i in range(len(_size_quantifiers))