wip

commit 01a96c14d4
18 changed files with 2108 additions and 0 deletions
metadex/__init__.py (new file, 0 lines)
metadex/__main__.py (new file, 319 lines)
@@ -0,0 +1,319 @@
import argparse
import logging
import os
import stat
import sys
from pathlib import Path

from . import config, metadex, utils

log = logging.getLogger(__name__)


def getargs():
    parser = argparse.ArgumentParser()
    parser.set_defaults(mode=None)

    parser.add_argument(
        "--dry-run",
        "-n",
        action="store_true",
        help="don't update the DB, only print what would change",
    )
    parser.add_argument(
        "--quick",
        action="store_true",
        default=not config.db_allow_slow,
        help="skip all DB integrity & optimization steps",
    )
    parser.add_argument(
        "--hostname",
        help="override the hostname to use as path prefix",
    )
    parser.add_argument(
        "--ignore-from",
        type=Path,
        default=config.default_ignore,
        help="load list of ignore rules from the given file",
    )
    parser.add_argument("--db", type=Path, default=config.default_db)
    parser.add_argument("--verbose", "-v", action="store_true", default=False)

    subparsers = parser.add_subparsers(title="commands")

    # Command: scan

    parser_scan = subparsers.add_parser("scan", help="scan a local file system")
    parser_scan.set_defaults(mode="scan")

    parser_scan.add_argument(
        "basedir",
        type=Path,
        nargs="+",
        help="index all files from this dir",
    )
    parser_scan.add_argument(
        "--no-remove-missing",
        dest="remove_missing",
        action="store_false",
        help="do not remove files from the database which cannot be found in the file system",
    )
    parser_scan.add_argument(
        "--map-mount",
        nargs="*",
        default=[],
        type=str,
        help="map a source host:path to any other destination during scanning for files\nExample: src=/mnt/foo,dest=foo:",
    )

    # Command: ingest-ls

    parser_ingest_ls = subparsers.add_parser(
        "ingest-ls",
        help="ingest extra data",
        description="When ingesting data from an external source, the hostname will not be set automatically.",
    )
    parser_ingest_ls.set_defaults(mode="ingest-ls")

    parser_ingest_ls.add_argument(
        "infile",
        nargs="?",
        type=argparse.FileType(),
        default=sys.stdin,
        help="output from `ls -lR`",
    )
    parser_ingest_ls.add_argument(
        "--remove-missing",
        action="store_true",
        help="remove files not listed in the infile",
    )
    parser_ingest_ls.add_argument(
        "--ref-year",
        type=int,
        help="the year when `ls -l` was run, to resolve relative dates",
    )

    # Command: ingest-db

    parser_ingest_db = subparsers.add_parser("ingest-db")
    parser_ingest_db.set_defaults(mode="ingest-db")

    parser_ingest_db.add_argument(
        "infile",
        type=Path,
        help="a Metadex SQLite DB",
    )
    parser_ingest_db.add_argument(
        "--map-mount",
        nargs="*",
        default=[],
        type=str,
        help="map a source host:path to any other destination while importing",
    )

    # Command: rm

    parser_rm = subparsers.add_parser("rm")
    parser_rm.set_defaults(mode="rm")
    parser_rm.add_argument(
        "files",
        type=str,
        nargs="+",
        help="files to remove",
    )
    parser_rm.add_argument(
        "-r",
        action="store_true",
        dest="include_subfiles",
        help="include sub-files",
    )

    # Command: ls

    parser_ls = subparsers.add_parser("ls")
    parser_ls.set_defaults(mode="ls")
    parser_ls.add_argument(
        "file",
        type=str,
        nargs="*",
        help="look up a file",
    )
    parser_ls.add_argument(
        "--type",
        "-t",
        choices="dfl",
        help="filter searches to (d)irectories, plain (f)iles, or sym(l)inks",
    )
    parser_ls.add_argument("--format", type=str, default="{date}\t{size}\t{path}")
    parser_ls.add_argument(
        "--match", choices=("regex", "glob", "fuzzy"), default="glob"
    )

    # Parse args.

    args = parser.parse_args()

    if args.mode == "scan":
        args.basedir = [utils.abspath(p) for p in args.basedir]
    elif args.mode == "ingest-db":
        args.infile = utils.abspath(args.infile)
    elif args.mode == "ingest-ls":
        config.hostname = None
    elif args.mode is None:
        parser.print_help()
        parser.exit(1, "Error: No command selected.")

    return args


def cmd_ingest_ls(args):
    metadex.init(args.db)

    log.info("Ingesting ls file %a ...", args.infile.name)
    metadex.ingest_ls(
        args.infile,
        ignore_file=args.ignore_from,
        ref_year=args.ref_year,
        remove_missing=args.remove_missing,
    )

    metadex.close()


def cmd_ingest_db(args):
    metadex.init(args.db)

    log.info("Ingesting Metadex DB file %a ...", str(args.infile))
    context = metadex.ingest_db_file(
        args.infile, ignore_file=args.ignore_from, map_pathspecs=args.map_mount
    )

    msg = f"Checked {context.seen} files, {context.added} new, {context.changed} changed, {context.ignored} ignored, {context.removed} removed"
    print(msg.ljust(metadex._terminal_width))

    metadex.close()


def cmd_scan(args):
    metadex.init(args.db)

    for basedir in args.basedir:
        log.info("Scanning %a ...", str(basedir))
        context = metadex.scan(
            basedir,
            ignore_file=args.ignore_from,
            remove_missing=args.remove_missing,
            map_pathspecs=args.map_mount,
        )

        msg = f"{basedir}: Checked {context.seen} files, {context.added} new, {context.changed} changed, {context.ignored} ignored, {context.removed} removed"
        print(msg.ljust(metadex._terminal_width))

    metadex.close()


def cmd_rm(args):
    metadex.init(args.db)

    for path in args.files:
        metadex.rm(path, include_children=args.include_subfiles)

    metadex.close()


def cmd_ls(args):
    metadex.init(args.db)

    args.file = [f for f in args.file if f]

    if not args.file:

        # List all known hosts.
        for host in sorted(metadex.hosts(), key=str.casefold):
            print(f"{host}:")

    else:

        for pathspec in args.file:

            for file in metadex.ls(pathspec, type=args.type, match=args.match):
                date = file.stat_modified.isoformat(sep=" ", timespec="seconds")
                size = utils.size_for_display(
                    file.stat_bytes, precision=1, format="compact"
                )
                path = f"{file.hostname}:{file.location}"
                if file.stat_type == "d" and not path.endswith("/"):  # relevant for `/`
                    path += "/"

                fargs = {"date": date, "size": size, "path": path, "file": file}
                try:
                    out = args.format.format(**fargs)
                except (KeyError, AttributeError) as err:
                    keys = sorted(fargs.keys())
                    log.exception(
                        "Keys available to formatting: %s",
                        ", ".join(keys),
                        exc_info=err,
                    )
                    return 5
                print(out)


def is_stdout_piped():
    s = os.fstat(sys.stdout.fileno())
    return stat.S_ISFIFO(s.st_mode)


def main():
    logging.basicConfig(
        format="%(asctime)s.%(msecs)03d [%(name)s:%(process)d] %(levelname)s: %(message)s",
        datefmt="%H:%M:%S",
        level=config.loglevel,
    )
    logging.getLogger("sqlalchemy.engine").setLevel(
        "INFO" if config.debug else "WARNING"
    )
    log.debug(f"Log level: {config.loglevel}")

    args = getargs()
    # print(args)

    if args.verbose and config.loglevel == "WARNING":
        config.loglevel = "INFO"
        logging.getLogger().setLevel(config.loglevel)
    config.db_allow_slow = not args.quick
    config.dryrun = args.dry_run
    config.is_stdout_piped = is_stdout_piped()

    if args.hostname:
        config.hostname = args.hostname

    if config.hostname:
        log.info("Using hostname: %a", config.hostname)
    else:
        log.error("Hostname is not set.")
        log.info(
            "If the hostname cannot be found automatically, try setting it using --hostname."
        )
        return 2

    if config.dryrun:
        log.info("--- DRY RUN ---")

    if args.mode == "scan":
        return cmd_scan(args)
    elif args.mode == "ingest-ls":
        return cmd_ingest_ls(args)
    elif args.mode == "ingest-db":
        return cmd_ingest_db(args)
    elif args.mode == "rm":
        return cmd_rm(args)
    elif args.mode == "ls":
        # Since this is a read-only operation we can change some config params.
        config.db_allow_slow = False
        config.dryrun = True
        return cmd_ls(args)


if __name__ == "__main__":
    sys.exit(main())
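One detail of `cmd_ls` worth noting: the `--format` template is rendered with `str.format`, so besides the plain `date`, `size`, and `path` keys, the whole record is reachable through attribute lookups on `file`. A minimal sketch (the values below are invented, not output of the tool):

# Sketch: `str.format` field syntax allows attribute access, which is what
# makes e.g. `--format "{date}\t{file.stat_type}\t{path}"` work in `cmd_ls`.
template = "{date}\t{size}\t{path}\t{file.stat_type}"

class FakeFile:  # stand-in for a metadex.models.File record
    stat_type = "f"

print(template.format(
    date="2024-01-01 00:00:00", size="1.2K", path="somehost:/tmp/x", file=FakeFile()
))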
metadex/config.py (new file, 11 lines)
@@ -0,0 +1,11 @@
import os
from pathlib import Path

debug = os.getenv("DEBUG") == "1"
loglevel = os.getenv("METADEX_LOGLEVEL") or ("DEBUG" if debug else "WARNING")
dryrun = False
hostname = os.uname().nodename  # or socket.gethostname()
default_db = Path("metadex.sqlite")
default_ignore = Path("metadex.ignore")
db_allow_slow = True
is_stdout_piped = False
metadex/db.py (new file, 540 lines)
@@ -0,0 +1,540 @@
import logging
from contextlib import contextmanager
from datetime import datetime
from functools import lru_cache
from pathlib import Path
from random import randint
from typing import Iterable, overload

from sqlalchemy import (
    Column,
    DateTime,
    Enum,
    Integer,
    MetaData,
    String,
    Table,
    UniqueConstraint,
    create_engine,
)
from sqlalchemy.engine.base import Connection, Engine
from sqlalchemy.engine.row import Row
from sqlalchemy.exc import IntegrityError
from sqlalchemy.sql import and_, or_, select, text
from sqlalchemy.sql.schema import ForeignKey

from . import config

log = logging.getLogger(__name__)

metadata = MetaData()

metadex = Table(
    "metadex",
    metadata,
    Column("id", Integer, primary_key=True),
    Column(
        "parent_id",
        ForeignKey("metadex.id"),
        nullable=True,
        index=True,
        comment="Points to the entry with the parent location.",
    ),
    Column(
        "added", DateTime, nullable=False
    ),  # switch to Integer for smaller size maybe?
    Column(
        "updated", DateTime, nullable=False
    ),  # switch to Integer for smaller size maybe?
    Column("location", String, nullable=False, index=True),
    Column("hostname", String, nullable=False, index=True),
    Column("stat_bytes", Integer, nullable=False),
    # Column("stat_changed", DateTime, nullable=False),  # switch to Integer for smaller size maybe?
    Column(
        "stat_modified", DateTime, nullable=False
    ),  # switch to Integer for smaller size maybe?
    Column("stat_type", Enum("d", "f", "l", "-"), nullable=False),
    UniqueConstraint("location", "hostname"),
)

engine: Engine = None  # type:ignore


def check_integrity(conn: Connection):

    stmt = text("PRAGMA integrity_check")
    state = conn.execute(stmt).scalar()

    if state is None:
        raise IntegrityError(stmt, None, None)

    log.info("Database file integrity: %s", state)


def check_parent_ids(conn: Connection):
    log.info("Checking parent file associations ... press Ctrl-C to skip!")

    try:
        reassign_parent_ids(conn)
    except KeyboardInterrupt:
        log.warning("Aborted parent ID rebuild.")


def optimize(conn: Connection, *, vacuum: bool = False):
    log.info("Optimizing database ...")

    conn.execute(text("PRAGMA analysis_limit=400"))
    conn.execute(text("PRAGMA optimize"))

    if vacuum:
        log.info("Running vacuum on database ... press Ctrl-C to skip!")
        try:
            conn.execute(text("VACUUM"))
        except KeyboardInterrupt:
            log.warning("Aborted DB cleanup.")


def autoconf(conn: Connection):
    log.info("Configuring database ...")

    conn.execute(text("PRAGMA journal_mode=WAL"))
    conn.execute(text("PRAGMA synchronous=NORMAL"))


class Db:
    engine: "Engine | None" = None
    is_dirty: bool = False

    def __init__(self, path: Path):
        self.open(path)

    def __del__(self):
        self.close()

    def open(self, path: Path):
        log.info("Using database: %a", str(path))

        if self.engine:
            raise RuntimeError("DB already initialized.")

        prefix = "sqlite+pysqlite:///"
        self.engine = create_engine(f"{prefix}{path}", future=True)
        metadata.create_all(self.engine)

        if config.db_allow_slow:
            with self.transaction() as conn:
                autoconf(conn)
                check_integrity(conn)

    def close(self):
        if self.engine is None:
            return

        if self.is_dirty:
            chance = 10  # Set the chance for long running actions to happen to 1 in X.
            do_slow = config.db_allow_slow and randint(1, chance) == 1

            with self.transaction() as conn:
                # if do_slow:
                #     check_parent_ids(conn)
                optimize(conn, vacuum=do_slow)

        self.engine = None

    @contextmanager
    def transaction(
        self, *, rollback_on_error: bool = False, force_rollback: bool = False
    ):
        if self.engine is None:
            raise RuntimeError("DB was closed.")

        connect = (
            self.engine.connect
            if (force_rollback or config.dryrun)
            else self.engine.begin
        )
        err = None

        with connect() as conn:
            try:
                yield conn
            except BaseException as e:
                if force_rollback or rollback_on_error:
                    raise e
                # Allow the connection to run its ordinary clean up, i.e. to flush
                # the data written to it so far to disk.
                err = e

        if err:
            raise err

        if not force_rollback:
            self.is_dirty = True


def init(path: Path = Path(":memory:")):
    global engine

    log.info("Using database: %a", str(path))

    if engine:
        raise RuntimeError("DB already initialized.")

    prefix = "sqlite+pysqlite:///"
    engine = create_engine(f"{prefix}{path}", future=True)
    metadata.create_all(engine)

    if config.db_allow_slow:
        with transaction() as conn:
            autoconf(conn)
            check_integrity(conn)


def close():
    global engine

    chance = 10  # Set the chance for long running actions to happen to 1 in X.
    do_slow = config.db_allow_slow and randint(1, chance) == 1

    with transaction() as conn:
        # if do_slow:
        #     check_parent_ids(conn)
        optimize(conn, vacuum=do_slow)

    engine = None  # type: ignore


def iter_all(conn: Connection) -> Iterable[Row]:
    return conn.execute(select(metadex))


def get_file(conn: Connection, *, location: str, hostname: str):
    stmt = select(metadex).where(
        and_(
            metadex.c.location == location,
            metadex.c.hostname == hostname,
        )
    )
    return conn.execute(stmt).one_or_none()


def get_files(conn: Connection, *, parent_id: int):
    stmt = select(metadex).where(
        metadex.c.parent_id == parent_id,
    )
    return conn.execute(stmt).all()


_escape_char = "#"


def escape(s: str) -> str:
    return (
        s.replace(_escape_char, 2 * _escape_char)
        .replace("%", _escape_char + "%")
        .replace("_", _escape_char + "_")
    )


def search(
    conn: Connection,
    *,
    contains: "str | None" = None,
    startswith: "str | None" = None,
    endswith: "str | None" = None,
    like: "str | None" = None,
    regex: "str | None" = None,
    type: "str | None" = None,
    hostname: "str | None" = None,
    hostname_like: "str | None" = None,
    hostname_regex: "str | None" = None,
) -> "Iterable[Row]":

    stmt = select(metadex)

    if type:
        stmt = stmt.where(metadex.c.stat_type == type)

    if hostname:
        stmt = stmt.where(metadex.c.hostname == hostname)
    if hostname_like:
        stmt = stmt.where(metadex.c.hostname.ilike(hostname_like, escape=_escape_char))
    if hostname_regex:
        stmt = stmt.where(metadex.c.hostname.regexp_match(hostname_regex))

    if contains:
        stmt = stmt.where(
            metadex.c.location.contains(contains, autoescape=True),
        )
    if endswith:
        stmt = stmt.where(
            metadex.c.location.endswith(endswith, autoescape=True),
        )
    if startswith:
        stmt = stmt.where(
            metadex.c.location.startswith(startswith, autoescape=True),
        )
    if like:
        stmt = stmt.where(
            metadex.c.location.ilike(like, escape=_escape_char),
        )
    if regex:
        # It's important the "regex" filter comes last, because the order actually matters for SQLAlchemy/SQLite.
        # Running this filter last allows all the _quick_ filters to apply first, leaving fewer rows for the expensive REGEXP statement.
        stmt = stmt.where(
            metadex.c.location.regexp_match(regex),
        )

    return conn.execute(stmt)


def all_hostnames(conn: Connection) -> Iterable[str]:
    stmt = select(metadex.c.hostname).distinct()
    return conn.execute(stmt).scalars().all()


def _fake_entry(path: Path, *, hostname=None, now, parent_id) -> dict:
    return dict(
        parent_id=parent_id,
        added=now,
        updated=now,
        location=str(path),
        hostname=hostname if hostname is not None else config.hostname,
        stat_bytes=0,
        stat_modified=datetime.fromtimestamp(0),
        stat_type="d",
    )


def _add_parents(conn: Connection, *, location: str, hostname: str):
    p_id: "int | None" = None
    for p in reversed(Path(location).parents):
        log.warning("Forging parent: %a:%a", hostname, str(p))
        d = _fake_entry(p, hostname=hostname, now=datetime.now(), parent_id=p_id)
        d = get_or_add(conn, d)
        p_id = d["id"]
        # r = conn.execute(
        #     metadex.insert(),
        #     [d],
        # )
        # p_id = r.inserted_primary_key.id

    return p_id


def get_or_add(conn: Connection, new_data: dict):
    row = get_file(conn, location=new_data["location"], hostname=new_data["hostname"])
    if row:
        return row

    log.info(
        "File added: %a:%a (size: %i)",
        new_data["hostname"],
        new_data["location"],
        new_data["stat_bytes"],
    )

    if "id" in new_data:
        del new_data["id"]

    new_data["parent_id"] = _parent_id(
        conn, location=new_data["location"], hostname=new_data["hostname"]
    )
    if new_data["parent_id"] is None:
        new_data["parent_id"] = _add_parents(
            conn, location=new_data["location"], hostname=new_data["hostname"]
        )

    now = datetime.now()
    if "added" not in new_data:
        new_data["added"] = now
    if "updated" not in new_data:
        new_data["updated"] = now

    r = conn.execute(metadex.insert(), [new_data])
    new_data["id"] = r.inserted_primary_key.id
    return new_data


def upsert_if_changed(conn: Connection, new_data: dict):
    row = get_or_add(conn, new_data)
    is_from_db = isinstance(row, Row)
    if not is_from_db:
        return "added"

    is_changed = (
        new_data["stat_bytes"] != row["stat_bytes"]
        # or new_data["stat_changed"] != row["stat_changed"]  # Ignore ctime, mtime is enough
        or new_data["stat_modified"] != row["stat_modified"]
        or new_data["stat_type"] != row["stat_type"]
    )

    if not is_changed:
        return "unchanged"

    log.info("File changed: %a:%a", new_data["hostname"], new_data["location"])

    # changelog = []
    # for f in ("stat_bytes", "stat_modified", "stat_type"):
    #     if new_data[f] != row[f]:
    #         changelog.append(f"{f[5:]}: {row[f]!a} -> {new_data[f]!a}")
    # log.info("File changes: %s", ", ".join(changelog))

    if "id" in new_data:
        del new_data["id"]

    new_data["parent_id"] = _parent_id(conn, metadex_id=row["id"])
    # del new_data["added"]
    new_data["updated"] = datetime.now()
    stmt = metadex.update().where(
        and_(
            metadex.c.location == new_data["location"],
            metadex.c.hostname == new_data["hostname"],
        )
    )
    conn.execute(stmt, [new_data])
    return "changed"


def remove_all(conn: Connection, location: str, *, hostname=None) -> int:
    """Remove the entry with the given path and all its descendants."""

    # We're using text comparison here to catch removed descendants even if
    # an intermediate directory is missing, e.g. we have indexed /foo and
    # /foo/bar/boo but not /foo/bar; this can happen through ignore rules and
    # users adding those paths explicitly.
    # We could also choose to ignore these edge cases and create orphans instead,
    # or change our parent-id mechanism to support skipping intermediates, both of
    # which might be valid decisions for the sake of optimization. For now we choose
    # simple correctness. Let's see how bad the performance can get.

    if hostname is None:
        hostname = config.hostname

    selector = and_(
        metadex.c.hostname == hostname,
        or_(
            metadex.c.location == location,
            metadex.c.location.startswith(location + "/", autoescape=True),
        ),
    )
    stmt = select(metadex.c.location).where(selector)

    cur = conn.execute(stmt)
    for (loc,) in cur:
        log.warning("Purging file from DB: %a:%a", hostname, loc)

    stmt = metadex.delete().where(selector)
    return conn.execute(stmt).rowcount


@contextmanager
def transaction(rollback_on_error: bool = False):
    connect = engine.connect if config.dryrun else engine.begin
    err = None

    with connect() as conn:
        try:
            yield conn
        except BaseException as e:
            if rollback_on_error:
                raise e
            # Allow the connection to run its ordinary clean up, i.e. to flush
            # the data written to it so far to disk.
            err = e

    if err:
        raise err


def files_in_dir(conn: Connection, location: str, *, hostname=None) -> Iterable[str]:
    """Return all file names for the given dir."""
    if hostname is None:
        hostname = config.hostname

    query = select(metadex.c.id).where(
        and_(metadex.c.hostname == hostname, metadex.c.location == location)
    )
    dir_id = conn.execute(query).scalar()
    if dir_id is None:
        return

    query = select(metadex.c.location).where(metadex.c.parent_id == dir_id)
    for (loc,) in conn.execute(query):
        yield Path(loc).name


MetadexId = int


@overload
def _parent_id(conn: Connection, *, metadex_id: MetadexId) -> "MetadexId | None":
    ...


@overload
def _parent_id(conn: Connection, *, location: str, hostname: str) -> "MetadexId | None":
    ...


@lru_cache(maxsize=2048)
def _parent_id(
    conn: Connection,
    *,
    metadex_id: "MetadexId | None" = None,
    location: "str | None" = None,
    hostname: "str | None" = None,
) -> "MetadexId | None":
    if location is None:
        stmt = select(metadex.c.location, metadex.c.hostname).where(
            metadex.c.id == metadex_id
        )
        row = conn.execute(stmt).first()
        if not row:
            raise RuntimeError(
                f"Metadex ID referenced but missing from DB: {metadex_id!a}"
            )
        location, hostname = row

    assert location
    parent_loc = str(Path(location).parent)
    stmt = select(metadex.c.id).where(
        and_(metadex.c.location == parent_loc, metadex.c.hostname == hostname)
    )
    val = conn.execute(stmt).scalar()
    if not val:
        log.warning(
            "No parent found: %a",
            {"metadex_id": metadex_id, "location": location, "hostname": hostname},
        )
    return val


def reassign_parent_ids(conn: Connection):
    stmt = select(
        metadex.c.id, metadex.c.parent_id, metadex.c.location, metadex.c.hostname
    )
    for (m_id, p_id_old, loc, host) in conn.execute(stmt):

        parent_loc = str(Path(loc).parent)
        if parent_loc == loc:
            p_id = None
        else:
            stmt = select(metadex.c.id).where(
                and_(metadex.c.location == parent_loc, metadex.c.hostname == host)
            )
            p_id = conn.execute(stmt).scalar()
            if not p_id:
                log.warning(
                    "No parent found: %a",
                    {"metadex_id": m_id, "loc": loc, "host": host},
                )
                p_id = _add_parents(conn, location=loc, hostname=host)
        if p_id != p_id_old:
            log.warning(
                "Parent changed: %a",
                {"metadex_id": m_id, "loc": loc, "host": host, "parent_id": p_id},
            )

            stmt = metadex.update().where(metadex.c.id == m_id)
            conn.execute(stmt, {"parent_id": p_id})
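A quick sketch of what the `escape` helper above produces (sample string invented): it doubles the escape character first, so a pre-existing `#` can't be misread as an escape prefix once `%` and `_` get prefixed.

from metadex.db import escape

# "#" is doubled first, then the LIKE wildcards "%" and "_" are prefixed.
assert escape("100%_done#1") == "100#%#_done##1"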
metadex/ignore.py (new file, 35 lines)
@@ -0,0 +1,35 @@
import re
from functools import partial
from pathlib import Path
from typing import Match

_regex_glob_map = {
    "**": r".*",
    "*": r"[^/]*",
    "?": r"[^/]",
}
_regex_glob_map = {re.escape(k): v for k, v in _regex_glob_map.items()}


def _regex_from_glob(match: Match[str]) -> str:
    return _regex_glob_map[match.group()]


_replace_globs_re = re.compile("|".join(re.escape(k) for k in _regex_glob_map))
_replace_globs = partial(_replace_globs_re.sub, _regex_from_glob)


def parse(path: Path):
    rules = []
    for line in path.open():
        line = line.rstrip()
        if not line or line.startswith("# "):
            continue

        rule = _replace_globs(re.escape(line))
        if not rule.startswith("/"):
            rule = r".*/" + rule
        rules.append(rule)

    regex = "|".join(rules)
    return re.compile(regex).fullmatch
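To make the glob-to-regex translation in `parse` concrete, a small sketch (the rule string is a made-up example): a rule without a leading `/` gets anchored at any directory level via the `.*/` prefix.

import re
from metadex.ignore import _replace_globs

rule = _replace_globs(re.escape("*.pyc"))
assert rule == r"[^/]*\.pyc"

matcher = re.compile(r".*/" + rule).fullmatch
assert matcher("/home/x/a.pyc")
assert not matcher("/home/x/a.pyc/sub")  # `*` does not cross `/` boundaries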
metadex/ls_parser.py (new file, 202 lines)
@@ -0,0 +1,202 @@
import argparse
import json
import logging
import re
import sys
from dataclasses import asdict, dataclass, fields
from datetime import datetime
from pathlib import Path
from typing import Any, Iterable, TextIO

from . import utils

log = logging.getLogger(__name__)

# file mode entry type:
# - Regular file.
# b Block special file.
# c Character special file.
# d Directory.
# l Symbolic link.
# p FIFO.
# s Socket.
# w Whiteout.

# ls_re = re.compile("drwxrwsr-x 555 somuser somegrp 555 Dec 25 20:06 .")
# ls_re = re.compile("drwxr-xr-x 11 501 20 352 1649098510 .")
ls_re = re.compile(
    r"(?P<mode>[-bcdlpsw][-rwSsx]{6}[-rwSsxTt]{3})[@+]?\s+(?P<links>\d+)\s+(?P<owner>\S+)\s+(?P<group>\S+)\s+(?P<size>[.\d]+["
    + utils._size_quantifiers
    + r"]?)\s+(?P<date>(\d+|(Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec) [ 1-3]\d ( \d{4}|\d\d:\d\d)))\s(?P<name>.*)"
)


@dataclass
class File:
    mode: str
    owner: str
    group: str
    size_bytes: int
    date: datetime
    path: Path

    @property
    def is_dir(self):
        return self.mode.startswith("d")

    @property
    def is_symlink(self):
        return self.mode.startswith("l")


def asplain(o: object) -> "dict[str, Any]":
    d = asdict(o)
    for f in fields(o):
        if f.type is datetime:
            d[f.name] = d[f.name].isoformat()
        elif f.type is Path:
            d[f.name] = d[f.name].as_posix()
    return d


def parse_date(date: str, ref_year: "int | None" = None) -> datetime:
    try:
        return (
            datetime.fromtimestamp(float(date))
            if date.isdigit()
            else datetime.strptime(date, "%b %d %Y")
        )
    except ValueError:
        pass

    if ref_year is None:
        log.error("A reference year is required for relative timestamps: %a", date)
        raise ValueError("Missing ref_year.")

    # We need to include the year in the string for parsing with strptime to
    # fully support leap years; otherwise, without a year, it might complain that
    # "Feb 29" is out of range.
    return datetime.strptime(f"{date} {ref_year}", "%b %d %H:%M %Y")


@dataclass
class ChangeDir:
    from_: "Path | None"
    to: "Path | None"


def parse_file(
    file: TextIO, *, ref_year: "int | None" = None
) -> Iterable["File | ChangeDir"]:
    lines = (line.rstrip() for line in file)
    yield from parse_lines(lines, ref_year=ref_year)


def parse_lines(
    lines: Iterable[str], *, ref_year: "int | None" = None
) -> Iterable["File | ChangeDir"]:

    workdir = Path("/")
    dirname: "Path | None" = None

    for i, line in enumerate(lines, start=1):

        if not line:
            # empty line, reset context
            if dirname is not None:
                yield ChangeDir(from_=dirname, to=None)
            dirname = None
            continue

        if dirname is None:
            if not line.endswith(":"):
                log.error("Path is missing from context, instead got: %a", line)
                raise ValueError(f"Unexpected input in line #{i}")

            if not line.startswith("/"):
                log.error("Only absolute paths are supported: %a", line)
                raise ValueError(f"Unexpected input in line #{i}")

            dirname = workdir / line[:-1]
            yield ChangeDir(from_=None, to=dirname)

        elif line.startswith("total "):
            pass

        elif match := ls_re.fullmatch(line):
            name = match["name"]

            # Support `ls` output where dirs are marked with a `/` suffix.
            if name.endswith("/"):
                name = name[:-1]

            if name in (".", ".."):
                continue

            if match["mode"].startswith("l"):
                markers = name.count("->")
                if markers == 1:
                    name = name.split(" -> ")[0]
                elif markers >= 2:
                    raise RuntimeError(f"Symlink has an ambiguous name: {name!a}")
                else:
                    log.warning("Symlink is missing a target: %a", name)

            try:
                size = utils.parse_size(match["size"])
            except ValueError as err:
                log.error("Error parsing size value: %a", match["size"], exc_info=err)
                raise ValueError(f"Unexpected input in line #{i}") from err

            try:
                date = parse_date(match["date"], ref_year)
            except ValueError as err:
                log.error("Error parsing date value: %a", match["date"], exc_info=err)
                raise ValueError(f"Unexpected input in line #{i}") from err

            yield File(
                mode=match["mode"],
                owner=match["owner"],
                group=match["group"],
                size_bytes=size,
                date=date,
                path=dirname / name,
            )

        else:
            log.error("Line not matched by parser: %a", line)
            raise ValueError(f"Unexpected input in line #{i}")

    if dirname is not None:
        yield ChangeDir(from_=dirname, to=None)


def get_args(argv: "list[str]"):
    parser = argparse.ArgumentParser()
    # parser.add_argument("--workdir", help="The directory from where 'ls -l' was run")
    parser.add_argument("--ref-year", type=int, help="The year when 'ls -l' was run")
    # parser.add_argument(
    #     "--json", action="store_true", default=False, help="Output as JSON"
    # )
    parser.add_argument(
        "infile",
        nargs="?",
        type=argparse.FileType(),
        default=sys.stdin,
    )

    args = parser.parse_args(argv[1:])
    return args


def main(argv: "list[str]"):
    args = get_args(argv)
    # workdir = Path(args.workdir or ".")
    ref_year = args.ref_year or datetime.now().year

    for f in parse_file(args.infile, ref_year=ref_year):
        print(json.dumps(asplain(f)))


if __name__ == "__main__":
    main(sys.argv)
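For reference, two invented `ls -lR` lines that `ls_re` accepts: one with a relative date (which is why `ref_year` exists) and one with an explicit year.

from metadex.ls_parser import ls_re

m = ls_re.fullmatch("drwxrwsr-x  5 somuser somegrp 4.0K Dec 25 20:06 photos")
assert m and m["size"] == "4.0K" and m["name"] == "photos"

m = ls_re.fullmatch("-rw-r--r--  1 somuser somegrp 12345 Jan  2  2020 notes.txt")
assert m and m["date"] == "Jan  2  2020"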
metadex/metadex.py (new file, 618 lines)
@@ -0,0 +1,618 @@
import logging
import os
import re
import sys
import time
from collections import deque
from dataclasses import dataclass
from pathlib import Path
from shutil import get_terminal_size
from typing import Iterable, Literal, TextIO

from . import config, db, ignore, ls_parser, models

log = logging.getLogger(__name__)

init = db.init
close = db.close


def scan(
    path: Path,
    *,
    ignore_file: Path,
    remove_missing: bool = False,
    map_pathspecs: "list[str]" = [],
) -> "_LogContext":
    f = _scan_remove_missing if remove_missing else _scan_add_only
    return f(path, ignore_file=ignore_file, map_pathspecs=map_pathspecs)


# Opportunistically compensate for wide chars on the terminal.
_terminal_width = int(get_terminal_size().columns * 0.9)
_last_log = 0


def _log_ephemeral(msg: str, *, debounce_ms: "int | None" = 200):
    global _last_log

    if debounce_ms is not None:
        now = time.monotonic()
        if _last_log + (debounce_ms / 1000) > now:
            return
        _last_log = now

    msg = msg.encode(errors="replace").decode()
    if len(msg) > _terminal_width:
        msg = msg[: _terminal_width - 3] + "..."
    sys.stderr.write(msg.ljust(_terminal_width) + "\r")


@dataclass
class _LogContext:
    seen: int = 0
    ignored: int = 0
    added: int = 0
    changed: int = 0
    removed: int = 0


def _log_context(path, context: _LogContext):
    if config.is_stdout_piped:
        return

    _log_ephemeral(
        f"{context.seen} a:{context.added} c:{context.changed} i:{context.ignored} r:{context.removed} {path}"
    )


def _scan_add_only(path: Path, *, ignore_file: Path, map_pathspecs: "list[str]"):
    is_ignored = ignore.parse(ignore_file)

    maps = _parse_pathspec_mapping(map_pathspecs)

    context = _LogContext()

    with db.transaction() as conn:

        context.seen += 1

        d = models.File.dict_from_entry(path)
        _apply_mapping(maps, d)

        if is_ignored(d["location"]):
            log.warning(
                "Skipping ignored basedir: %a:%a",
                d["hostname"],
                d["location"],
            )
            return context

        if (action := db.upsert_if_changed(conn, d)) == "added":
            context.added += 1
        elif action == "changed":
            context.changed += 1

        dirs: deque[Path] = deque()
        if d["stat_type"] == "d":
            dirs.append(path)

        while dirs:

            cwd = dirs.popleft()
            try:
                scan = os.scandir(cwd)
            except Exception as err:
                log.error(err)
                continue

            subdirs: deque[Path] = deque()
            with scan as files:
                for f in files:

                    context.seen += 1

                    _log_context(f.path, context)

                    d = models.File.dict_from_entry(f)
                    _apply_mapping(maps, d)

                    if is_ignored(d["location"]):
                        log.debug(
                            "Skipping ignored entry: %a:%a",
                            d["hostname"],
                            d["location"],
                        )
                        context.ignored += 1
                        continue

                    if (action := db.upsert_if_changed(conn, d)) == "added":
                        context.added += 1
                        append = subdirs.append
                    elif action == "changed":
                        context.changed += 1
                        append = subdirs.append
                    else:
                        append = subdirs.appendleft

                    if f.is_dir(follow_symlinks=False):
                        append(Path(f.path))

            # `subdirs` sorts all changed dirs to the right, which means when we
            # extend `dirs` using `extendleft` it'll put them all left-most.
            # Or put more simply: new stuff on the left, old on the right.
            dirs.extendleft(subdirs)

    return context


def _scan_remove_missing(path: Path, *, ignore_file: Path, map_pathspecs: "list[str]"):
    """Like `scan` but also search for missing files."""
    is_ignored = ignore.parse(ignore_file)

    maps = _parse_pathspec_mapping(map_pathspecs)

    context = _LogContext()

    with db.transaction() as conn:

        context.seen += 1

        d = models.File.dict_from_entry(path)
        _apply_mapping(maps, d)

        if is_ignored(d["location"]):
            log.warning(
                "Skipping ignored basedir: %a:%a",
                d["hostname"],
                d["location"],
            )
            return context

        if (action := db.upsert_if_changed(conn, d)) == "added":
            context.added += 1
        elif action == "changed":
            context.changed += 1

        dirs: deque[Path] = deque()
        if d["stat_type"] == "d":
            dirs.append(path)

        while dirs:

            cwd = dirs.popleft()
            try:
                scan = os.scandir(cwd)
            except Exception as err:
                log.error(err)
                continue

            expected = {name for name in db.files_in_dir(conn, str(cwd))}

            subdirs: deque[Path] = deque()
            with scan as files:
                for f in files:

                    context.seen += 1

                    _log_context(f.path, context)

                    d = models.File.dict_from_entry(f)
                    _apply_mapping(maps, d)

                    if is_ignored(d["location"]):
                        log.debug(
                            "Skipping ignored entry: %a:%a",
                            d["hostname"],
                            d["location"],
                        )
                        context.ignored += 1
                        continue

                    if (action := db.upsert_if_changed(conn, d)) == "added":
                        context.added += 1
                        append = subdirs.append
                    elif action == "changed":
                        context.changed += 1
                        append = subdirs.append
                    else:
                        append = subdirs.appendleft

                    if f.is_dir(follow_symlinks=False):
                        append(Path(f.path))

                    expected.discard(f.name)

            # `subdirs` sorts all changed dirs to the right, which means when we
            # extend `dirs` using `extendleft` it'll put them all left-most.
            # Or put more simply: new stuff on the left, old on the right.
            dirs.extendleft(subdirs)

            for name in expected:
                f = str(cwd / name)
                if is_ignored(f):
                    continue

                log.info("File removed: %a", f)

                context.removed += db.remove_all(conn, f)

    return context


_pathspec_re = re.compile(r"((?P<host>[^:/]*):)?(?P<path>.*)")
_src_dest_re = re.compile(r"src=(?P<src>.*),dest=(?P<dest>.*)")


def _parse_pathspec(pathspec: str):
    match = _pathspec_re.fullmatch(pathspec)
    assert match
    host: "str | None" = match["host"]
    path: str = match["path"] or "/"
    return host, path


def _clean_dirname(loc: str, *, force_absolute=True):
    if force_absolute and not loc.startswith("/"):
        loc = "/" + loc
    if not loc.endswith("/"):
        loc += "/"
    return loc
    # if loc != "/" and loc.endswith("/"):
    #     return loc[:-1]
    # return loc


def _parse_pathspec_mapping(map_pathspecs: "list[str]"):
    Hostname = str
    Location = str
    maps: dict[Hostname, dict[Location, tuple[Hostname, Location]]] = {}
    for pathspec_mapping in map_pathspecs:
        match = _src_dest_re.fullmatch(pathspec_mapping)
        if not match:
            log.error("Invalid mapping: %a", pathspec_mapping)
            raise ValueError("Could not parse mapping.")

        src_host, src_path = _parse_pathspec(match["src"])
        if not src_host:
            src_host = config.hostname
            log.warning("Using default hostname for mapping source: %a", src_host)
            # log.error("Hostname is required when mapping paths: %a", match["src"])
            # raise ValueError("Missing hostname.")
        src_path = _clean_dirname(src_path)
        if src_host not in maps:
            maps[src_host] = {}

        dest_host, dest_path = _parse_pathspec(match["dest"])
        if not dest_host:
            dest_host = config.hostname
            log.warning("Using default hostname for mapping dest: %a", dest_host)
            # log.error("Hostname is required when mapping paths: %a", match["dest"])
            # raise ValueError("Missing hostname.")
        dest_path = _clean_dirname(dest_path)
        maps[src_host][src_path] = dest_host, dest_path
        log.info("Mapping %a:%a -> %a:%a", src_host, src_path, dest_host, dest_path)

    return maps


def _apply_mapping(maps: dict, d: dict):
    hostname = d["hostname"]
    location = (
        d["location"]
        if d["stat_type"] != "d"
        else _clean_dirname(d["location"], force_absolute=False)
    )
    if hostname in maps:
        for src_loc, (dest_host, dest_loc) in maps[hostname].items():
            if location.startswith(src_loc):
                d["hostname"] = dest_host
                d["location"] = dest_loc + d["location"][len(src_loc) :]
                log.debug(
                    "Mapping %a -> %a",
                    f"{hostname}:{location}",
                    f'{d["hostname"]}:{d["location"]}',
                )
                break


def ingest_db_file(
    db_file: Path,
    *,
    ignore_file: Path,
    map_pathspecs: "list[str]" = [],
    select_pathspecs: "list[str]" = [],
) -> _LogContext:
    is_ignored = ignore.parse(ignore_file)

    maps = _parse_pathspec_mapping(map_pathspecs)

    context = _LogContext()

    other_db = db.Db(db_file)
    with db.transaction() as conn, other_db.transaction(
        force_rollback=True
    ) as other_conn:

        for row in db.iter_all(other_conn):

            context.seen += 1

            _log_context(row["location"], context)

            d = dict(row)
            _apply_mapping(maps, d)

            if is_ignored(d["location"]):
                log.debug("Skipping ignored entry: %a:%a", d["hostname"], d["location"])
                context.ignored += 1
                continue

            if (action := db.upsert_if_changed(conn, d)) == "added":
                context.added += 1
            elif action == "changed":
                context.changed += 1

    return context


def ingest_ls(
    file: TextIO,
    *,
    ignore_file: Path,
    ref_year: "int | None",
    remove_missing: bool = False,
) -> _LogContext:
    f = _ingest_ls_remove_missing if remove_missing else _ingest_ls_add_only
    return f(file, ignore_file=ignore_file, ref_year=ref_year)


def _ingest_ls_add_only(file: TextIO, *, ignore_file: Path, ref_year: "int | None"):
    is_ignored = ignore.parse(ignore_file)

    context = _LogContext()

    with db.transaction() as conn:

        for f in ls_parser.parse_file(file, ref_year=ref_year):

            if isinstance(f, ls_parser.ChangeDir):
                continue

            context.seen += 1

            _log_context(f.path, context)

            d = _dict_from_lsfile(f)
            # _apply_mapping(maps, d)

            if is_ignored(d["location"]):
                log.debug("Skipping ignored entry: %a:%a", d["hostname"], d["location"])
                context.ignored += 1
                continue

            if (action := db.upsert_if_changed(conn, d)) == "added":
                context.added += 1
            elif action == "changed":
                context.changed += 1

    return context


def _dict_from_lsfile(f: ls_parser.File) -> dict:
    mode = f.mode[0]
    if mode == "-":
        mode = "f"
    elif mode not in "dl":
        mode = "-"

    return dict(
        location=str(f.path),
        hostname=config.hostname,
        stat_bytes=f.size_bytes,
        stat_modified=f.date,
        stat_type=mode,
    )


def _ingest_ls_remove_missing(
    file: TextIO, *, ignore_file: Path, ref_year: "int | None"
):
    is_ignored = ignore.parse(ignore_file)

    expected: set[str] = set()

    context = _LogContext()

    with db.transaction() as conn:

        for f in ls_parser.parse_file(file, ref_year=ref_year):

            if isinstance(f, ls_parser.ChangeDir):

                if f.to is not None:
                    expected = {name for name in db.files_in_dir(conn, str(f.to))}

                elif f.from_:
                    # remove missing
                    for name in expected:
                        loc = str(f.from_ / name)
                        if is_ignored(loc):
                            log.info("Ignoring file (for removal): %a", loc)
                            continue

                        log.info("File removed: %a", loc)

                        context.removed += db.remove_all(conn, loc)

                continue

            context.seen += 1

            _log_context(f.path, context)

            d = _dict_from_lsfile(f)
            # _apply_mapping(maps, d)

            if is_ignored(d["location"]):
                log.debug("Skipping ignored entry: %a:%a", d["hostname"], d["location"])
                context.ignored += 1
                continue

            if (action := db.upsert_if_changed(conn, d)) == "added":
                context.added += 1
            elif action == "changed":
                context.changed += 1

            expected.discard(f.path.name)

    return context


def _ls_files(
    *,
    host: "str | None",
    path: str,
    type: "models.StatType | None" = None,
    match: Literal["regex", "glob", "fuzzy"] = "glob",
) -> Iterable[models.File]:
    def map_replace(mapping: dict, string: str):
        pattern = "|".join(re.escape(k) for k in mapping.keys())
        return re.sub(pattern, lambda m: mapping[m[0]], string)

    def liketerm_from_glob(glob: str) -> str:
        s = db.escape(glob)
        s = map_replace({"*": "%", "?": "_"}, s)
        return s

    def regex_from_glob(glob: str) -> str:
        s = re.escape(glob)
        s = map_replace({r"\*\*": ".*", r"\*": "[^/]*", r"\?": "[^/]"}, s)
        return s

    with db.transaction() as conn:
        if match == "regex":

            for f in db.search(
                conn, type=type, hostname_regex=host, regex=f"(?i){path}"
            ):
                yield models.File(**f)  # type: ignore

        elif match == "glob":

            filters = {"type": type}
            if host and _uses_glob(host):
                filters["hostname_like"] = liketerm_from_glob(host)
            else:
                filters["hostname"] = host

            if not _uses_glob(path):
                rterm = re.escape(path)
                lterm = path  # no `db.escape`, `endswith` does autoescape
                result = db.search(
                    conn,
                    endswith=lterm,
                    regex=f"(?i)(^|/){rterm}$",  # ensure a full name match
                    **filters,
                )

            else:
                rterm = regex_from_glob(path)
                lterm = liketerm_from_glob(path)
                result = db.search(
                    conn,
                    regex=f"(?i)(^|/){rterm}$",
                    like=f"%{lterm}",  # helps to drastically speed up the regex match
                    **filters,
                )

            for f in result:
                yield models.File(**f)  # type: ignore

        elif match == "fuzzy":

            term = "%".join(db.escape(p) for p in path.split("/"))

            for f in db.search(conn, like=f"%{term}%", type=type, hostname=host):
                yield models.File(**f)  # type: ignore


def _ls_dir_contents(*, host: str, path: str) -> Iterable[models.File]:

    with db.transaction() as conn:

        row = db.get_file(conn, location=path, hostname=host)

        if not row:
            log.warning("No match: %a:%a", host, path)
            return

        if row["stat_type"] != "d":
            yield models.File(**row)  # type: ignore
            return

        for f in db.get_files(conn, parent_id=row["id"]):
            yield models.File(**f)  # type: ignore


def _uses_glob(string: str) -> bool:
    return "*" in string or "?" in string


def ls(
    pathspec: str,
    *,
    type: "models.StatType | None" = None,
    match: Literal["regex", "glob", "fuzzy"] = "glob",
) -> Iterable[models.File]:
    host, path = _parse_pathspec(pathspec)

    if host == "":
        host = config.hostname  # allow ":foo" as shortcut for local search

    log.info("Using path spec: %a:%a", host, path)

    if path != "/" and path.endswith("/"):
        # In our DB no path except root (`/`) ends with `/`.
        path = path.rstrip("/")

    if host and path.startswith("/") and not _uses_glob(host + path):
        yield from _ls_dir_contents(host=host, path=path)

    else:
        yield from _ls_files(host=host, path=path, type=type, match=match)


def rm(pathspec: str, *, include_children: bool = False):
    """Remove the given path and all its descendants."""
    host, path = _parse_pathspec(pathspec)

    if not host or not path.startswith("/"):
        log.error(
            "A full absolute path including hostname is required when removing files: %a",
            pathspec,
        )
        raise ValueError("Incomplete path specification.")

    if path != "/" and path.endswith("/"):
        path = path[:-1]

    with db.transaction() as conn:

        row = db.get_file(conn, hostname=host, location=path)

        if not row:
            log.error("No matching file found: %a", pathspec)
            raise ValueError("Path not found.")

        children = db.get_files(conn, parent_id=row["id"])
        if children and not include_children:
            log.error("File has children: %a", pathspec)
            raise RuntimeError("Path has children.")

        db.remove_all(conn, location=path, hostname=host)


def hosts() -> "set[str]":
    with db.transaction() as conn:
        return set(db.all_hostnames(conn))
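A sketch of how the `--map-mount` spec from `__main__.py` flows through `_parse_pathspec` (hostnames and paths invented): `src=/mnt/foo,dest=foo:` relocates everything indexed under the local mount `/mnt/foo` to the root of host `foo`, so `/mnt/foo/a/b` is stored as `foo:/a/b`.

from metadex.metadex import _parse_pathspec, _src_dest_re

match = _src_dest_re.fullmatch("src=/mnt/foo,dest=foo:")
assert match
# No host on the source side, so `_parse_pathspec_mapping` falls back to
# config.hostname with a warning; dest "foo:" means host "foo", path "/".
assert _parse_pathspec(match["src"]) == (None, "/mnt/foo")
assert _parse_pathspec(match["dest"]) == ("foo", "/")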
metadex/models.py (new file, 84 lines)
@@ -0,0 +1,84 @@
import os
from dataclasses import asdict, dataclass
from datetime import datetime
from os import DirEntry
from pathlib import Path
from stat import S_IFDIR, S_IFLNK, S_IFMT, S_IFREG
from typing import Literal

from . import config

_modes = {S_IFDIR: "d", S_IFREG: "f", S_IFLNK: "l"}

asdict = asdict

StatType = Literal["d", "f", "l", "-"]


@dataclass
class File:
    id: int
    parent_id: int
    added: datetime
    updated: datetime
    location: str
    hostname: str  # XXX should better use a fingerprint/unique-id per host (e.g. `/etc/metadex.hostid`; for disks put it on their /)
    stat_bytes: int
    # stat_changed: datetime  # XXX remove? The `ctime` changes not only for content changes but also for file attr changes, which we don't track anyway.
    stat_modified: datetime
    stat_type: StatType

    @classmethod
    def from_direntry(cls, entry: DirEntry):
        now = datetime.now()
        pstat = entry.stat(follow_symlinks=False)
        return cls(
            id=None,  # type: ignore  # not yet persisted; assigned by the DB layer
            parent_id=None,  # type: ignore
            added=now,
            updated=now,
            location=entry.path,
            hostname=config.hostname,
            stat_bytes=pstat.st_size,
            # stat_changed=datetime.fromtimestamp(pstat.st_ctime),
            stat_modified=datetime.fromtimestamp(pstat.st_mtime),
            stat_type=_modes.get(S_IFMT(pstat.st_mode), "-"),  # type: ignore
        )

    @classmethod
    def from_path(cls, path: Path):
        now = datetime.now()
        pstat = os.stat(path, follow_symlinks=False)
        return cls(
            id=None,  # type: ignore  # not yet persisted; assigned by the DB layer
            parent_id=None,  # type: ignore
            added=now,
            updated=now,
            location=os.path.abspath(path),
            hostname=config.hostname,
            stat_bytes=pstat.st_size,
            # stat_changed=datetime.fromtimestamp(pstat.st_ctime),
            stat_modified=datetime.fromtimestamp(pstat.st_mtime),
            stat_type=_modes.get(S_IFMT(pstat.st_mode), "-"),  # type: ignore
        )

    @staticmethod
    def dict_from_entry(entry: "DirEntry | Path") -> dict:
        """Return the File's data structure as dict.

        This can be useful to skip calling `asdict`, which can be quite slow.
        """
        # now = datetime.now()

        if isinstance(entry, Path):
            location = os.path.abspath(entry)
            pstat = os.stat(entry, follow_symlinks=False)
        else:
            location = entry.path.encode(errors="replace").decode()
            pstat = entry.stat(follow_symlinks=False)

        return dict(
            # added=now,
            # updated=now,
            location=location,
            hostname=config.hostname,
            stat_bytes=pstat.st_size,
            stat_modified=datetime.fromtimestamp(pstat.st_mtime),
            stat_type=_modes.get(S_IFMT(pstat.st_mode), "-"),
        )
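A short sketch of the `_modes` lookup used above: `S_IFMT` masks the file-type bits out of `st_mode`, and anything that isn't a dir, regular file, or symlink falls back to `-`.

import os
from stat import S_IFDIR, S_IFLNK, S_IFMT, S_IFREG

_modes = {S_IFDIR: "d", S_IFREG: "f", S_IFLNK: "l"}

st = os.stat(".", follow_symlinks=False)
assert _modes.get(S_IFMT(st.st_mode), "-") == "d"  # "." is a directory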
metadex/utils.py (new file, 51 lines)
@@ -0,0 +1,51 @@
import os
from pathlib import Path


_size_quantifiers = "BKMGTP"
_size_map: "dict[str, int]" = {
    _size_quantifiers[i]: 2 ** (10 * i) for i in range(len(_size_quantifiers))
}


def size_for_display(byte_count: int, precision: int = 2, format="short") -> str:
    for qtf in reversed(_size_quantifiers):
        qty = byte_count / _size_map[qtf]
        if qty > 1:
            break

    size = f"{qty:.{precision}f}"
    if format == "compact":
        size = size.replace("." + "0" * precision, "")  # silly hack to remove trailing zeros
        return f"{size:>{4+precision}}{qtf}"

    tpl = "{{:.{precision}f}} {{}}".format(precision=precision)
    if format == "short":
        pass
    elif format == "long" and qtf != "B":
        tpl += "iB"
    return tpl.format(qty, qtf)


def parse_size(size: str) -> int:
    """Return the given size converted to byte count.

    Supported are

    - plain byte count, e.g. "12345"
    - short format, e.g. "123.45K"

    Not supported: Kb = kBit, KB = kByte, KB = 10**3 B, KiB = 2**10 B.
    """
    if size.isdigit():
        return int(size)

    d, q = float(size[:-1]), size[-1]
    return int(d * _size_map[q])


def abspath(path: Path) -> Path:
    """Normalize & make the given path absolute while maintaining symlinks.

    Similar to Path.resolve(strict=False), but doesn't resolve symlinks."""
    return Path(os.path.abspath(path))
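A couple of invented values run through the size helpers above; both use binary multiples (K = 2**10, M = 2**20, and so on).

from metadex.utils import parse_size, size_for_display

assert parse_size("12345") == 12345
assert parse_size("1.5K") == 1536  # 1.5 * 2**10
assert size_for_display(1536, precision=1) == "1.5 K"
assert size_for_display(1536, precision=1, format="long") == "1.5 KiB"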