ducklet 2022-08-14 20:41:58 +02:00
commit 01a96c14d4
18 changed files with 2108 additions and 0 deletions

3
.gitignore vendored Normal file

@ -0,0 +1,3 @@
*.local
metadex.sqlite
pyrightconfig.json

20
README.md Normal file

@ -0,0 +1,20 @@
# Metadex
Metadex is a database for file metadata. It offers a simple and powerful CLI to scan your file system and find indexed files.
It supports path mapping, fast fuzzy searching, and multiple hosts.
Limited support exists for importing metadata from external file listings, for example the output of `ls -R`.
It is mostly useful for accessing information about files that are stored remotely or offline; for example, it can keep track of the contents of backup DVDs or other cold storage.
Other use cases might also emerge.
All files are indexed with their last modified timestamp and file size.
Since searches are fast and flexible, the database can also be used to query local file information, generate statistics, or build graphs such as a used-space map.
## Example usage
```sh
metadex scan ~ # Add all files from your home directory to the index.
metadex ls '*.xml' # List all .xml files.
metadex ls '.config/**.json' # List all .json config files.
metadex ls .git --type d # List all .git directories.
```
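The ingest and path-mapping features mentioned above follow the same pattern. A rough sketch (the host names `dvd-42` and `foo` and all paths are placeholders):
```sh
ls -lR /mnt/dvd | metadex --hostname dvd-42 ingest-ls --ref-year 2022  # Ingest an external listing; a hostname must be given.
metadex ls 'dvd-42:*.iso'  # Search only on that host.
metadex scan /mnt/foo --map-mount 'src=/mnt/foo,dest=foo:'  # Record the mount point under the host "foo" instead.
```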

6
metadex.ignore Normal file

@ -0,0 +1,6 @@
*.git
.venv
.DS_Store
__pycache__
node_modules
vendor

0
metadex/__init__.py Normal file

319
metadex/__main__.py Normal file

@ -0,0 +1,319 @@
import argparse
import logging
import os
import stat
import sys
from pathlib import Path
from . import config, metadex, utils
log = logging.getLogger(__name__)
def getargs():
parser = argparse.ArgumentParser()
parser.set_defaults(mode=None)
parser.add_argument(
"--dry-run",
"-n",
action="store_true",
help="don't update the DB, only print what would change",
)
parser.add_argument(
"--quick",
action="store_true",
default=not config.db_allow_slow,
help="skip all DB integrity & optimization steps",
)
parser.add_argument(
"--hostname",
help="overwrite the hostname to use as path prefix",
)
parser.add_argument(
"--ignore-from",
type=Path,
default=config.default_ignore,
help="load list of ignore rules from the given file",
)
parser.add_argument("--db", type=Path, default=config.default_db)
parser.add_argument("--verbose", "-v", action="store_true", default=False)
subparsers = parser.add_subparsers(title="commands")
# Command: scan
parser_scan = subparsers.add_parser("scan", help="scan a local file system")
parser_scan.set_defaults(mode="scan")
parser_scan.add_argument(
"basedir",
type=Path,
nargs="+",
help="index all files from this dir",
)
parser_scan.add_argument(
"--no-remove-missing",
dest="remove_missing",
action="store_false",
help="do not remove files from the database which cannot be found in the file system",
)
parser_scan.add_argument(
"--map-mount",
nargs="*",
default=[],
type=str,
help="map a source host:path to any other destination during scanning for files\nExample: src=/mnt/foo,dest=foo:",
)
# Command: ingest-ls
parser_ingest_ls = subparsers.add_parser(
"ingest-ls",
help="ingest extra data",
description="When ingesting data from an external source, the hostname will not be set automatically.",
)
parser_ingest_ls.set_defaults(mode="ingest-ls")
parser_ingest_ls.add_argument(
"infile",
nargs="?",
type=argparse.FileType(),
default=sys.stdin,
help="output from `ls -lR`",
)
parser_ingest_ls.add_argument(
"--remove-missing",
action="store_true",
help="Remove files not listed in the infile.",
)
parser_ingest_ls.add_argument(
"--ref-year",
type=int,
help="The year when 'ls -l' was run, to resolve relative dates.",
)
# Command: ingest-db
parser_ingest_db = subparsers.add_parser("ingest-db")
parser_ingest_db.set_defaults(mode="ingest-db")
parser_ingest_db.add_argument(
"infile",
type=Path,
help="a Metadex SQLite DB",
)
parser_ingest_db.add_argument(
"--map-mount",
nargs="*",
default=[],
type=str,
help="map a source host:path to any other destination while importing",
)
# Command: rm
parser_rm = subparsers.add_parser("rm")
parser_rm.set_defaults(mode="rm")
parser_rm.add_argument(
"files",
type=str,
nargs="+",
help="files to remove",
)
parser_rm.add_argument(
"-r",
action="store_true",
dest="include_subfiles",
help="include sub-files",
)
# Command: ls
parser_ls = subparsers.add_parser("ls")
parser_ls.set_defaults(mode="ls")
parser_ls.add_argument(
"file",
type=str,
nargs="*",
help="look up a file",
)
parser_ls.add_argument(
"--type",
"-t",
choices="dfl",
help="Filter searches to (d)irectories, plain (f)iles, or sym(l)inks.",
)
parser_ls.add_argument("--format", type=str, default="{date}\t{size}\t{path}")
parser_ls.add_argument(
"--match", choices=("regex", "glob", "fuzzy"), default="glob"
)
# Parse args.
args = parser.parse_args()
if args.mode == "scan":
args.basedir = [utils.abspath(p) for p in args.basedir]
elif args.mode == "ingest-db":
args.infile = utils.abspath(args.infile)
elif args.mode == "ingest-ls":
config.hostname = None
elif args.mode is None:
parser.print_help()
parser.exit(1, "Error: No command selected.\n")
return args
def cmd_ingest_ls(args):
metadex.init(args.db)
log.info("Ingesting ls file %a ...", args.infile.name)
metadex.ingest_ls(
args.infile,
ignore_file=args.ignore_from,
ref_year=args.ref_year,
remove_missing=args.remove_missing,
)
metadex.close()
def cmd_ingest_db(args):
metadex.init(args.db)
log.info("Ingesting Metadex DB file %a ...", str(args.infile))
context = metadex.ingest_db_file(
args.infile, ignore_file=args.ignore_from, map_pathspecs=args.map_mount
)
msg = f"Checked {context.seen} files, {context.added} new, {context.changed} changed, {context.ignored} ignored, {context.removed} removed"
print(msg.ljust(metadex._terminal_width))
metadex.close()
def cmd_scan(args):
metadex.init(args.db)
for basedir in args.basedir:
log.info("Scanning %a ...", str(basedir))
context = metadex.scan(
basedir,
ignore_file=args.ignore_from,
remove_missing=args.remove_missing,
map_pathspecs=args.map_mount,
)
msg = f"{basedir}: Checked {context.seen} files, {context.added} new, {context.changed} changed, {context.ignored} ignored, {context.removed} removed"
print(msg.ljust(metadex._terminal_width))
metadex.close()
def cmd_rm(args):
metadex.init(args.db)
for path in args.files:
metadex.rm(path, include_children=args.include_subfiles)
metadex.close()
def cmd_ls(args):
metadex.init(args.db)
args.file = [f for f in args.file if f]
if not args.file:
# List all known hosts.
for host in sorted(metadex.hosts(), key=str.casefold):
print(f"{host}:")
else:
for pathspec in args.file:
for file in metadex.ls(pathspec, type=args.type, match=args.match):
date = file.stat_modified.isoformat(sep=" ", timespec="seconds")
size = utils.size_for_display(
file.stat_bytes, precision=1, format="compact"
)
path = f"{file.hostname}:{file.location}"
if file.stat_type == "d" and not path.endswith("/"): # relevant for `/`
path += "/"
fargs = {"date": date, "size": size, "path": path, "file": file}
try:
out = args.format.format(**fargs)
except (KeyError, AttributeError) as err:
keys = sorted(fargs.keys())
log.exception(
"Keys available to formatting: %s",
", ".join(keys),
exc_info=err,
)
return 5
print(out)
def is_stdout_piped():
s = os.fstat(sys.stdout.fileno())
return stat.S_ISFIFO(s.st_mode)
def main():
logging.basicConfig(
format="%(asctime)s.%(msecs)03d [%(name)s:%(process)d] %(levelname)s: %(message)s",
datefmt="%H:%M:%S",
level=config.loglevel,
)
logging.getLogger("sqlalchemy.engine").setLevel(
"INFO" if config.debug else "WARNING"
)
log.debug(f"Log level: {config.loglevel}")
args = getargs()
# print(args)
if args.verbose and config.loglevel == "WARNING":
config.loglevel = "INFO"
logging.getLogger().setLevel(config.loglevel)
config.db_allow_slow = not args.quick
config.dryrun = args.dry_run
config.is_stdout_piped = is_stdout_piped()
if args.hostname:
config.hostname = args.hostname
if config.hostname:
log.info("Using hostname: %a", config.hostname)
else:
log.error("Hostname is not set.")
log.info(
"If the hostname cannot be found automatically, try setting it using --hostname."
)
return 2
if config.dryrun:
log.info(f"--- DRY RUN ---")
if args.mode == "scan":
return cmd_scan(args)
elif args.mode == "ingest-ls":
return cmd_ingest_ls(args)
elif args.mode == "ingest-db":
return cmd_ingest_db(args)
elif args.mode == "rm":
return cmd_rm(args)
elif args.mode == "ls":
# Since this is a read-only operation we can change some config params.
config.db_allow_slow = False
config.dryrun = True
return cmd_ls(args)
if __name__ == "__main__":
sys.exit(main())

11
metadex/config.py Normal file

@ -0,0 +1,11 @@
import os
from pathlib import Path
debug = os.getenv("DEBUG") == "1"
loglevel = os.getenv("METADEX_LOGLEVEL") or ("DEBUG" if debug else "WARNING")
dryrun = False
hostname = os.uname().nodename # or socket.gethostname()
default_db = Path("metadex.sqlite")
default_ignore = Path("metadex.ignore")
db_allow_slow = True
is_stdout_piped = False

540
metadex/db.py Normal file

@ -0,0 +1,540 @@
import logging
from contextlib import contextmanager
from datetime import datetime
from functools import lru_cache
from pathlib import Path
from random import randint
from typing import Iterable, overload
from sqlalchemy import (
Column,
DateTime,
Enum,
Integer,
MetaData,
String,
Table,
UniqueConstraint,
create_engine,
)
from sqlalchemy.engine.base import Connection, Engine
from sqlalchemy.engine.row import Row
from sqlalchemy.exc import IntegrityError
from sqlalchemy.sql import and_, or_, select, text
from sqlalchemy.sql.schema import ForeignKey
from . import config
log = logging.getLogger(__name__)
metadata = MetaData()
metadex = Table(
"metadex",
metadata,
Column("id", Integer, primary_key=True),
Column(
"parent_id",
ForeignKey("metadex.id"),
nullable=True,
index=True,
comment="Points to the entry with the parent location.",
),
Column(
"added", DateTime, nullable=False
), # switch to Integer for smaller size maybe?
Column(
"updated", DateTime, nullable=False
), # switch to Integer for smaller size maybe?
Column("location", String, nullable=False, index=True),
Column("hostname", String, nullable=False, index=True),
Column("stat_bytes", Integer, nullable=False),
# Column("stat_changed", DateTime, nullable=False), # switch to Integer for smaller size maybe?
Column(
"stat_modified", DateTime, nullable=False
), # switch to Integer for smaller size maybe?
Column("stat_type", Enum("d", "f", "l", "-"), nullable=False),
UniqueConstraint("location", "hostname"),
)
engine: Engine = None # type:ignore
def check_integrity(conn: Connection):
stmt = text("PRAGMA integrity_check")
state = conn.execute(stmt).scalar()
if state is None:
raise IntegrityError(stmt, None, None)
log.info("Database file integrity: %s", state)
def check_parent_ids(conn: Connection):
log.info("Checking parent file associations ... press Ctrl-C to skip!")
try:
reassign_parent_ids(conn)
except KeyboardInterrupt:
log.warning("Aborted parent ID rebuild.")
def optimize(conn: Connection, *, vacuum: bool = False):
log.info("Optimizing database ...")
conn.execute(text("PRAGMA analysis_limit=400"))
conn.execute(text("PRAGMA optimize"))
if vacuum:
log.info("Running vacuum on database ... press Ctrl-C to skip!")
try:
conn.execute(text("VACUUM"))
except KeyboardInterrupt:
log.warning("Aborted DB cleanup.")
def autoconf(conn: Connection):
log.info("Configuring database ...")
conn.execute(text("PRAGMA journal_mode=WAL"))
conn.execute(text("PRAGMA synchronous=NORMAL"))
class Db:
engine: "Engine | None" = None
is_dirty: bool = False
def __init__(self, path: Path):
self.open(path)
def __del__(self):
self.close()
def open(self, path: Path):
log.info("Using database: %a", str(path))
if self.engine:
raise RuntimeError("DB already initialized.")
prefix = "sqlite+pysqlite:///"
self.engine = create_engine(f"{prefix}{path}", future=True)
metadata.create_all(self.engine)  # Create tables on this instance's engine (not the module-level one).
if config.db_allow_slow:
with self.transaction() as conn:
autoconf(conn)
check_integrity(conn)
def close(self):
if self.engine is None:
return
if self.is_dirty:
chance = 10 # Set the chance for long running actions to happen to 1 in X.
do_slow = config.db_allow_slow and randint(1, chance) == 1
with self.transaction() as conn:
# if do_slow:
# check_parent_ids(conn)
optimize(conn, vacuum=do_slow)
self.engine = None
@contextmanager
def transaction(
self, *, rollback_on_error: bool = False, force_rollback: bool = False
):
if self.engine is None:
raise RuntimeError("DB was closed.")
connect = (
self.engine.connect
if (force_rollback or config.dryrun)
else self.engine.begin
)
err = None
with connect() as conn:
try:
yield conn
except BaseException as e:
if force_rollback or rollback_on_error:
raise e
# Allow the connection to run its ordinary clean up, i.e. to flush
# the data written to it so far to disk.
err = e
if err:
raise err
if not force_rollback:
self.is_dirty = True
def init(path: Path = Path(":memory:")):
global engine
log.info("Using database: %a", str(path))
if engine:
raise RuntimeError("DB already initialized.")
prefix = "sqlite+pysqlite:///"
engine = create_engine(f"{prefix}{path}", future=True)
metadata.create_all(engine)
if config.db_allow_slow:
with transaction() as conn:
autoconf(conn)
check_integrity(conn)
def close():
global engine
chance = 10 # Set the chance for long running actions to happen to 1 in X.
do_slow = config.db_allow_slow and randint(1, chance) == 1
with transaction() as conn:
# if do_slow:
# check_parent_ids(conn)
optimize(conn, vacuum=do_slow)
engine = None # type: ignore
def iter_all(conn: Connection) -> Iterable[Row]:
return conn.execute(select(metadex))
def get_file(conn: Connection, *, location: str, hostname: str):
stmt = select(metadex).where(
and_(
metadex.c.location == location,
metadex.c.hostname == hostname,
)
)
return conn.execute(stmt).one_or_none()
def get_files(conn: Connection, *, parent_id: int):
stmt = select(metadex).where(
metadex.c.parent_id == parent_id,
)
return conn.execute(stmt).all()
_escape_char = "#"
def escape(s: str) -> str:
return (
s.replace(_escape_char, 2 * _escape_char)
.replace("%", _escape_char + "%")
.replace("_", _escape_char + "_")
)
def search(
conn: Connection,
*,
contains: "str | None" = None,
startswith: "str | None" = None,
endswith: "str | None" = None,
like: "str | None" = None,
regex: "str | None" = None,
type: "str | None" = None,
hostname: "str | None" = None,
hostname_like: "str | None" = None,
hostname_regex: "str | None" = None,
) -> "Iterable[Row]":
stmt = select(metadex)
if type:
stmt = stmt.where(metadex.c.stat_type == type)
if hostname:
stmt = stmt.where(metadex.c.hostname == hostname)
if hostname_like:
stmt = stmt.where(metadex.c.hostname.ilike(hostname_like, escape=_escape_char))
if hostname_regex:
stmt = stmt.where(metadex.c.hostname.regexp_match(hostname_regex))
if contains:
stmt = stmt.where(
metadex.c.location.contains(contains, autoescape=True),
)
if endswith:
stmt = stmt.where(
metadex.c.location.endswith(endswith, autoescape=True),
)
if startswith:
stmt = stmt.where(
metadex.c.location.startswith(startswith, autoescape=True),
)
if like:
stmt = stmt.where(
metadex.c.location.ilike(like, escape=_escape_char),
)
if regex:
# It's important that the "regex" filter comes last, because the order actually matters for SQLAlchemy/SQLite.
# Running this filter last allows all the _quick_ filters to apply first, leaving fewer rows for the expensive REGEXP statement.
stmt = stmt.where(
metadex.c.location.regexp_match(regex),
)
return conn.execute(stmt)
def all_hostnames(conn: Connection) -> Iterable[str]:
stmt = select(metadex.c.hostname).distinct()
return conn.execute(stmt).scalars().all()
def _fake_entry(path: Path, *, hostname=None, now, parent_id) -> dict:
return dict(
parent_id=parent_id,
added=now,
updated=now,
location=str(path),
hostname=hostname if hostname is not None else config.hostname,
stat_bytes=0,
stat_modified=datetime.fromtimestamp(0),
stat_type="d",
)
def _add_parents(conn: Connection, *, location: str, hostname: str):
p_id: "int | None" = None
for p in reversed(Path(location).parents):
log.warning("Forging parent: %a:%a", hostname, str(p))
d = _fake_entry(p, hostname=hostname, now=datetime.now(), parent_id=p_id)
d = get_or_add(conn, d)
p_id = d["id"]
# r = conn.execute(
# metadex.insert(),
# [d],
# )
# p_id = r.inserted_primary_key.id
return p_id
def get_or_add(conn: Connection, new_data: dict):
row = get_file(conn, location=new_data["location"], hostname=new_data["hostname"])
if row:
return row
log.info(
"File added: %a:%a (size: %i)",
new_data["hostname"],
new_data["location"],
new_data["stat_bytes"],
)
if "id" in new_data:
del new_data["id"]
new_data["parent_id"] = _parent_id(
conn, location=new_data["location"], hostname=new_data["hostname"]
)
if new_data["parent_id"] is None:
new_data["parent_id"] = _add_parents(
conn, location=new_data["location"], hostname=new_data["hostname"]
)
now = datetime.now()
if "added" not in new_data:
new_data["added"] = now
if "updated" not in new_data:
new_data["updated"] = now
r = conn.execute(metadex.insert(), [new_data])
new_data["id"] = r.inserted_primary_key.id
return new_data
def upsert_if_changed(conn: Connection, new_data: dict):
row = get_or_add(conn, new_data)
is_from_db = isinstance(row, Row)
if not is_from_db:
return "added"
is_changed = (
new_data["stat_bytes"] != row["stat_bytes"]
# or new_data["stat_changed"] != row["stat_changed"] # Ignore ctime, mtime is enough
or new_data["stat_modified"] != row["stat_modified"]
or new_data["stat_type"] != row["stat_type"]
)
if not is_changed:
return "unchanged"
log.info("File changed: %a:%a", new_data["hostname"], new_data["location"])
# changelog = []
# for f in ("stat_bytes", "stat_modified", "stat_type"):
# if new_data[f] != row[f]:
# changelog.append(f"{f[5:]}: {row[f]!a} -> {new_data[f]!a}")
# log.info("File changes: %s", ", ".join(changelog))
if "id" in new_data:
del new_data["id"]
new_data["parent_id"] = _parent_id(conn, metadex_id=row["id"])
# del new_data["added"]
new_data["updated"] = datetime.now()
stmt = metadex.update(
and_(
metadex.c.location == new_data["location"],
metadex.c.hostname == new_data["hostname"],
)
)
conn.execute(stmt, [new_data])
return "changed"
def remove_all(conn: Connection, location: str, *, hostname=None) -> int:
"""Remove the entry with the given path and all its descendants."""
# We're using text comparison here to catch removed descendants even if
# an intermediate directory is missing, e.g. when /foo and /foo/bar/boo are
# indexed but /foo/bar is not; this can happen through ignore rules and
# users adding those paths explicitly.
# We could also choose to ignore these edge cases and create orphans instead,
# or change our parent-id-mechanism to support skipping intermediates, both of
# which might be valid decisions for sake of optimization. For now we choose
# simple correctness. Let's see how bad the performance can get.
if hostname is None:
hostname = config.hostname
selector = and_(
metadex.c.hostname == hostname,
or_(
metadex.c.location == location,
metadex.c.location.startswith(location + "/", autoescape=True),
),
)
stmt = select(metadex.c.location).where(selector)
cur = conn.execute(stmt)
for (loc,) in cur:
log.warning("Purging file from DB: %a:%a", hostname, loc)
stmt = metadex.delete(selector)
return conn.execute(stmt).rowcount
@contextmanager
def transaction(rollback_on_error: bool = False):
connect = engine.connect if config.dryrun else engine.begin
err = None
with connect() as conn:
try:
yield conn
except BaseException as e:
if rollback_on_error:
raise e
# Allow the connection to run its ordinary clean up, i.e. to flush
# the data written to it so far to disk.
err = e
if err:
raise err
def files_in_dir(conn: Connection, location: str, *, hostname=None) -> Iterable[str]:
"""Return all file names for the given dir."""
if hostname is None:
hostname = config.hostname
query = select(metadex.c.id).where(
and_(metadex.c.hostname == hostname, metadex.c.location == location)
)
dir_id = conn.execute(query).scalar()
if dir_id is None:
return
query = select(metadex.c.location).where(metadex.c.parent_id == dir_id)
for (loc,) in conn.execute(query):
yield Path(loc).name
MetadexId = int
@overload
def _parent_id(conn: Connection, *, metadex_id: MetadexId) -> "MetadexId | None":
...
@overload
def _parent_id(conn: Connection, *, location: str, hostname: str) -> "MetadexId | None":
...
@lru_cache(maxsize=2048)
def _parent_id(
conn: Connection,
*,
metadex_id: "MetadexId | None" = None,
location: "str | None" = None,
hostname: "str | None" = None,
) -> "MetadexId | None":
if location is None:
stmt = select(metadex.c.location, metadex.c.hostname).where(
metadex.c.id == metadex_id
)
row = conn.execute(stmt).first()
if not row:
raise RuntimeError(
f"Metadex ID referenced but missing from DB: {metadex_id!a}"
)
location, hostname = row
assert location
parent_loc = str(Path(location).parent)
stmt = select(metadex.c.id).where(
and_(metadex.c.location == parent_loc, metadex.c.hostname == hostname)
)
val = conn.execute(stmt).scalar()
if not val:
log.warning(
"No parent found: %a",
{"metadex_id": metadex_id, "location": location, "hostname": hostname},
)
return val
def reassign_parent_ids(conn: Connection):
stmt = select(
metadex.c.id, metadex.c.parent_id, metadex.c.location, metadex.c.hostname
)
for (m_id, p_id_old, loc, host) in conn.execute(stmt):
parent_loc = str(Path(loc).parent)
if parent_loc == loc:
p_id = None
else:
stmt = select(metadex.c.id).where(
and_(metadex.c.location == parent_loc, metadex.c.hostname == host)
)
p_id = conn.execute(stmt).scalar()
if not p_id:
log.warning(
"No parent found: %a",
{"metadex_id": m_id, "loc": loc, "host": host},
)
p_id = _add_parents(conn, location=loc, hostname=host)
if p_id != p_id_old:
log.warning(
"Parent changed: %a",
{"metadex_id": m_id, "loc": loc, "host": host, "parent_id": p_id},
)
stmt = metadex.update().where(metadex.c.id == m_id)
conn.execute(stmt, {"parent_id": p_id})

35
metadex/ignore.py Normal file

@ -0,0 +1,35 @@
import re
from functools import partial
from pathlib import Path
from typing import Match
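# Translate gitignore-style glob tokens into regex fragments:
# `**` may cross directory boundaries, `*` and `?` may not.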
_regex_glob_map = {
"**": r".*",
"*": r"[^/]*",
"?": r"[^/]",
}
_regex_glob_map = {re.escape(k): v for k, v in _regex_glob_map.items()}
def _regex_from_glob(match: Match[str]) -> str:
return _regex_glob_map[match.group()]
_replace_globs_re = re.compile("|".join(re.escape(k) for k in _regex_glob_map))
_replace_globs = partial(_replace_globs_re.sub, _regex_from_glob)
def parse(path: Path):
rules = []
for line in path.open():
line = line.rstrip()
if not line or line.startswith("# "):
continue
rule = _replace_globs(re.escape(line))
if not rule.startswith("/"):
rule = r".*/" + rule
rules.append(rule)
regex = "|".join(rules)
return re.compile(regex).fullmatch

202
metadex/ls_parser.py Normal file

@ -0,0 +1,202 @@
import argparse
import json
import logging
import re
import sys
from dataclasses import asdict, dataclass, fields
from datetime import datetime
from pathlib import Path
from typing import Any, Iterable, TextIO
from . import utils
log = logging.getLogger(__name__)
# file mode entry type:
# - Regular file.
# b Block special file.
# c Character special file.
# d Directory.
# l Symbolic link.
# p FIFO.
# s Socket.
# w Whiteout.
# ls_re = re.compile("drwxrwsr-x 555 somuser somegrp 555 Dec 25 20:06 .")
# ls_re = re.compile("drwxr-xr-x 11 501 20 352 1649098510 .")
ls_re = re.compile(
r"(?P<mode>[-bcdlpsw][-rwSsx]{6}[-rwSsxTt]{3})[@+]?\s+(?P<links>\d+)\s+(?P<owner>\S+)\s+(?P<group>\S+)\s+(?P<size>[.\d]+["
+ utils._size_quantifiers
+ r"]?)\s+(?P<date>(\d+|(Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec) [ 1-3]\d ( \d{4}|\d\d:\d\d)))\s(?P<name>.*)"
)
@dataclass
class File:
mode: str
owner: str
group: str
size_bytes: int
date: datetime
path: Path
@property
def is_dir(self):
return self.mode.startswith("d")
@property
def is_symlink(self):
return self.mode.startswith("l")
def asplain(o: object) -> "dict[str, Any]":
d = asdict(o)
for f in fields(o):
if f.type is datetime:
d[f.name] = d[f.name].isoformat()
elif f.type is Path:
d[f.name] = d[f.name].as_posix()
return d
def parse_date(date: str, ref_year: "int | None" = None) -> datetime:
try:
return (
datetime.fromtimestamp(float(date))
if date.isdigit()
else datetime.strptime(date, "%b %d %Y")
)
except ValueError:
pass
if ref_year is None:
log.error("A reference year is required for relative timestamps: %a", date)
raise ValueError("Missing ref_year.")
# We need to include the year in the string for parsing with strptime to
# fully support leap years; otherwise without a year it might complain that
# "Feb 29" is out of range.
return datetime.strptime(f"{date} {ref_year}", "%b %d %H:%M %Y")
@dataclass
class ChangeDir:
from_: "Path | None"
to: "Path | None"
def parse_file(
file: TextIO, *, ref_year: "int | None" = None
) -> Iterable["File | ChangeDir"]:
lines = (line.rstrip() for line in file)
yield from parse_lines(lines, ref_year=ref_year)
def parse_lines(
lines: Iterable[str], *, ref_year: "int | None" = None
) -> Iterable["File | ChangeDir"]:
workdir = Path("/")
dirname: "Path | None" = None
for i, line in enumerate(lines, start=1):
if not line:
# empty line, reset context
if dirname is not None:
yield ChangeDir(from_=dirname, to=None)
dirname = None
continue
if dirname is None:
if not line.endswith(":"):
log.error("Path is missing from context, instead got: %a", line)
raise ValueError(f"Unexpected input in line #{i}")
if not line.startswith("/"):
log.error("Only absolute paths are supported: %a", line)
raise ValueError(f"Unexpected input in line #{i}")
dirname = workdir / line[:-1]
yield ChangeDir(from_=None, to=dirname)
elif line.startswith("total "):
pass
elif match := ls_re.fullmatch(line):
name = match["name"]
# Support `ls` output where dirs are marked with a `/` suffix.
if name.endswith("/"):
name = name[:-1]
if name in (".", ".."):
continue
if match["mode"].startswith("l"):
markers = name.count("->")
if markers == 1:
name = name.split(" -> ")[0]
elif markers >= 2:
raise RuntimeError(f"Symlink has an ambiguous name: {name!a}")
else:
log.warning("Symlink is missing a target: %a", name)
try:
size = utils.parse_size(match["size"])
except ValueError as err:
log.error("Error parsing size value: %a", match["size"], exc_info=err)
raise ValueError(f"Unexpected input in line #{i}") from err
try:
date = parse_date(match["date"], ref_year)
except ValueError as err:
log.error("Error parsing date value: %a", match["date"], exc_info=err)
raise ValueError(f"Unexpected input in line #{i}") from err
yield File(
mode=match["mode"],
owner=match["owner"],
group=match["group"],
size_bytes=size,
date=date,
path=dirname / name,
)
else:
log.error("Line not matched by parser: %a", line)
raise ValueError(f"Unexpected input in line #{i}")
if dirname is not None:
yield ChangeDir(from_=dirname, to=None)
def get_args(argv: "list[str]"):
parser = argparse.ArgumentParser()
# parser.add_argument("--workdir", help="The directory from where 'ls -l' was run")
parser.add_argument("--ref-year", type=int, help="The year when 'ls -l' was run")
# parser.add_argument(
# "--json", action="store_true", default=False, help="Output as JSON"
# )
parser.add_argument(
"infile",
nargs="?",
type=argparse.FileType(),
default=sys.stdin,
)
args = parser.parse_args(argv[1:])
return args
def main(argv: "list[str]"):
args = get_args(argv)
# workdir = Path(args.workdir or ".")
ref_year = args.ref_year or datetime.now().year
for f in parse_file(args.infile, ref_year=ref_year):
print(json.dumps(asplain(f)))
if __name__ == "__main__":
main(sys.argv)

618
metadex/metadex.py Normal file

@ -0,0 +1,618 @@
import logging
import os
import re
import sys
import time
from collections import deque
from dataclasses import dataclass
from pathlib import Path
from shutil import get_terminal_size
from typing import Iterable, Literal, TextIO
from . import config, db, ignore, ls_parser, models
log = logging.getLogger(__name__)
init = db.init
close = db.close
def scan(
path: Path,
*,
ignore_file: Path,
remove_missing: bool = False,
map_pathspecs: "list[str]" = [],
) -> "_LogContext":
f = _scan_remove_missing if remove_missing else _scan_add_only
return f(path, ignore_file=ignore_file, map_pathspecs=map_pathspecs)
# Opportunistically compensate for wide chars on the terminal.
_terminal_width = int(get_terminal_size().columns * 0.9)
_last_log = 0
def _log_ephemeral(msg: str, *, debounce_ms: "int | None" = 200):
global _last_log
if debounce_ms is not None:
now = time.monotonic()
if _last_log + (debounce_ms / 1000) > now:
return
_last_log = now
msg = msg.encode(errors="replace").decode()
if len(msg) > _terminal_width:
msg = msg[: _terminal_width - 3] + "..."
sys.stderr.write(msg.ljust(_terminal_width) + "\r")
@dataclass
class _LogContext:
seen: int = 0
ignored: int = 0
added: int = 0
changed: int = 0
removed: int = 0
def _log_context(path, context: _LogContext):
if config.is_stdout_piped:
return
_log_ephemeral(
f"{context.seen} a:{context.added} c:{context.changed} i:{context.ignored} r:{context.removed} {path}"
)
def _scan_add_only(path: Path, *, ignore_file: Path, map_pathspecs: "list[str]"):
is_ignored = ignore.parse(ignore_file)
maps = _parse_pathspec_mapping(map_pathspecs)
context = _LogContext()
with db.transaction() as conn:
context.seen += 1
d = models.File.dict_from_entry(path)
_apply_mapping(maps, d)
if is_ignored(d["location"]):
log.warning(
"Skipping ignored basedir: %a:%a",
d["hostname"],
d["location"],
)
return context
if (action := db.upsert_if_changed(conn, d)) == "added":
context.added += 1
elif action == "changed":
context.changed += 1
dirs: deque[Path] = deque()
if d["stat_type"] == "d":
dirs.append(path)
while dirs:
cwd = dirs.popleft()
try:
scan = os.scandir(cwd)
except Exception as err:
log.error(err)
continue
subdirs: deque[Path] = deque()
with scan as files:
for f in files:
context.seen += 1
_log_context(f.path, context)
d = models.File.dict_from_entry(f)
_apply_mapping(maps, d)
if is_ignored(d["location"]):
log.debug(
"Skipping ignored entry: %a:%a",
d["hostname"],
d["location"],
)
context.ignored += 1
continue
if (action := db.upsert_if_changed(conn, d)) == "added":
context.added += 1
append = subdirs.append
elif action == "changed":
context.changed += 1
append = subdirs.append
else:
append = subdirs.appendleft
if f.is_dir(follow_symlinks=False):
append(Path(f.path))
# `subdirs` sorts all changed dirs to the right, which means when we
# extend `dirs` using `extendleft` it'll put them all left-most.
# Or put more simply: new stuff on the left, old on the right.
dirs.extendleft(subdirs)
return context
def _scan_remove_missing(path: Path, *, ignore_file: Path, map_pathspecs: "list[str]"):
"""Like `scan` but also search for missing files."""
is_ignored = ignore.parse(ignore_file)
maps = _parse_pathspec_mapping(map_pathspecs)
context = _LogContext()
with db.transaction() as conn:
context.seen += 1
d = models.File.dict_from_entry(path)
_apply_mapping(maps, d)
if is_ignored(d["location"]):
log.warning(
"Skipping ignored basedir: %a:%a",
d["hostname"],
d["location"],
)
return context
if (action := db.upsert_if_changed(conn, d)) == "added":
context.added += 1
elif action == "changed":
context.changed += 1
dirs: deque[Path] = deque()
if d["stat_type"] == "d":
dirs.append(path)
while dirs:
cwd = dirs.popleft()
try:
scan = os.scandir(cwd)
except Exception as err:
log.error(err)
continue
expected = {name for name in db.files_in_dir(conn, str(cwd))}
subdirs: deque[Path] = deque()
with scan as files:
for f in files:
context.seen += 1
_log_context(f.path, context)
d = models.File.dict_from_entry(f)
_apply_mapping(maps, d)
if is_ignored(d["location"]):
log.debug(
"Skipping ignored entry: %a:%a",
d["hostname"],
d["location"],
)
context.ignored += 1
continue
if (action := db.upsert_if_changed(conn, d)) == "added":
context.added += 1
append = subdirs.append
elif action == "changed":
context.changed += 1
append = subdirs.append
else:
append = subdirs.appendleft
if f.is_dir(follow_symlinks=False):
append(Path(f.path))
expected.discard(f.name)
# `subdirs` sorts all changed dirs to the right, which means when we
# extend `dirs` using `extendleft` it'll put them all left-most.
# Or put more simply: new stuff on the left, old on the right.
dirs.extendleft(subdirs)
for name in expected:
f = str(cwd / name)
if is_ignored(f):
continue
log.info("File removed: %a", f)
db.remove_all(conn, f)
return context
_pathspec_re = re.compile(r"((?P<host>[^:/]*):)?(?P<path>.*)")
_src_dest_re = re.compile(r"src=(?P<src>.*),dest=(?P<dest>.*)")
def _parse_pathspec(pathspec: str):
match = _pathspec_re.fullmatch(pathspec)
assert match
host: "str | None" = match["host"]
path: str = match["path"] or "/"
return host, path
def _clean_dirname(loc: str, *, force_absolute=True):
if force_absolute and not loc.startswith("/"):
loc = "/" + loc
if not loc.endswith("/"):
loc += "/"
return loc
# if loc != "/" and loc.endswith("/"):
# return loc[:-1]
# return loc
def _parse_pathspec_mapping(map_pathspecs: "list[str]"):
Hostname = str
Location = str
maps: dict[Hostname, dict[Location, tuple[Hostname, Location]]] = {}
for pathspec_mapping in map_pathspecs:
match = _src_dest_re.fullmatch(pathspec_mapping)
if not match:
log.error("Invalid mapping: %a", pathspec_mapping)
raise ValueError("Could not parse mapping.")
src_host, src_path = _parse_pathspec(match["src"])
if not src_host:
src_host = config.hostname
log.warning("Using default hostname for mapping source: %a", src_host)
# log.error("Hostname is required when mapping paths: %a", match["src"])
# raise ValueError("Missing hostname.")
src_path = _clean_dirname(src_path)
if src_host not in maps:
maps[src_host] = {}
dest_host, dest_path = _parse_pathspec(match["dest"])
if not dest_host:
dest_host = config.hostname
log.warning("Using default hostname for mapping dest: %a", dest_host)
# log.error("Hostname is required when mapping paths: %a", match["dest"])
# raise ValueError("Missing hostname.")
dest_path = _clean_dirname(dest_path)
maps[src_host][src_path] = dest_host, dest_path
log.info("Mapping %a:%a -> %a:%a", src_host, src_path, dest_host, dest_path)
return maps
def _apply_mapping(maps: dict, d: dict):
hostname = d["hostname"]
location = (
d["location"]
if d["stat_type"] != "d"
else _clean_dirname(d["location"], force_absolute=False)
)
if hostname in maps:
for src_loc, (dest_host, dest_loc) in maps[hostname].items():
if location.startswith(src_loc):
d["hostname"] = dest_host
d["location"] = dest_loc + d["location"][len(src_loc) :]
log.debug(
"Mapping %a -> %a",
f"{hostname}:{location}",
f'{d["hostname"]}:{d["location"]}',
)
break
def ingest_db_file(
db_file: Path,
*,
ignore_file: Path,
map_pathspecs: "list[str]" = [],
select_pathspecs: "list[str]" = [],
) -> _LogContext:
is_ignored = ignore.parse(ignore_file)
maps = _parse_pathspec_mapping(map_pathspecs)
context = _LogContext()
other_db = db.Db(db_file)
with db.transaction() as conn, other_db.transaction(
force_rollback=True
) as other_conn:
for row in db.iter_all(other_conn):
context.seen += 1
_log_context(row["location"], context)
d = dict(row)
_apply_mapping(maps, d)
if is_ignored(d["location"]):
log.debug("Skipping ignored entry: %a:%a", d["hostname"], d["location"])
context.ignored += 1
continue
if (action := db.upsert_if_changed(conn, d)) == "added":
context.added += 1
elif action == "changed":
context.changed += 1
return context
def ingest_ls(
file: TextIO,
*,
ignore_file: Path,
ref_year: "int | None",
remove_missing: bool = False,
) -> _LogContext:
f = _ingest_ls_remove_missing if remove_missing else _ingest_ls_add_only
return f(file, ignore_file=ignore_file, ref_year=ref_year)
def _ingest_ls_add_only(file: TextIO, *, ignore_file: Path, ref_year: "int | None"):
is_ignored = ignore.parse(ignore_file)
context = _LogContext()
with db.transaction() as conn:
for f in ls_parser.parse_file(file, ref_year=ref_year):
if isinstance(f, ls_parser.ChangeDir):
continue
context.seen += 1
_log_context(f.path, context)
d = _dict_from_lsfile(f)
# _apply_mapping(maps, d)
if is_ignored(d["location"]):
log.debug("Skipping ignored entry: %a:%a", d["hostname"], d["location"])
context.ignored += 1
continue
if (action := db.upsert_if_changed(conn, d)) == "added":
context.added += 1
elif action == "changed":
context.changed += 1
return context
def _dict_from_lsfile(f: ls_parser.File) -> dict:
mode = f.mode[0]
if mode == "-":
mode = "f"
elif mode not in "dl":
mode = "-"
return dict(
location=str(f.path),
hostname=config.hostname,
stat_bytes=f.size_bytes,
stat_modified=f.date,
stat_type=mode,
)
def _ingest_ls_remove_missing(
file: TextIO, *, ignore_file: Path, ref_year: "int | None"
):
is_ignored = ignore.parse(ignore_file)
expected: set[str] = set()
context = _LogContext()
with db.transaction() as conn:
for f in ls_parser.parse_file(file, ref_year=ref_year):
if isinstance(f, ls_parser.ChangeDir):
if f.to is not None:
expected = {name for name in db.files_in_dir(conn, str(f.to))}
elif f.from_:
# remove missing
for name in expected:
loc = str(f.from_ / name)
if is_ignored(loc):
log.info("Ignoring file (for removal): %a", loc)
continue
log.info("File removed: %a", loc)
context.removed += db.remove_all(conn, loc)
continue
context.seen += 1
_log_context(f.path, context)
d = _dict_from_lsfile(f)
# _apply_mapping(maps, d)
if is_ignored(d["location"]):
log.debug("Skipping ignored entry: %a:%a", d["hostname"], d["location"])
context.ignored += 1
continue
if (action := db.upsert_if_changed(conn, d)) == "added":
context.added += 1
elif action == "changed":
context.changed += 1
expected.discard(f.path.name)
return context
def _ls_files(
*,
host: "str | None",
path: str,
type: "models.StatType | None" = None,
match: Literal["regex", "glob", "fuzzy"] = "glob",
) -> Iterable[models.File]:
def map_replace(mapping: dict, string: str):
pattern = "|".join(re.escape(k) for k in mapping.keys())
return re.sub(pattern, lambda m: mapping[m[0]], string)
def liketerm_from_glob(glob: str) -> str:
s = db.escape(glob)
s = map_replace({"*": "%", "?": "_"}, s)
return s
def regex_from_glob(glob: str) -> str:
s = re.escape(glob)
s = map_replace({r"\*\*": ".*", r"\*": "[^/]*", r"\?": "[^/]"}, s)
return s
with db.transaction() as conn:
if match == "regex":
for f in db.search(
conn, type=type, hostname_regex=host, regex=f"(?i){path}"
):
yield models.File(**f) # type: ignore
elif match == "glob":
filters = {"type": type}
if host and _uses_glob(host):
filters["hostname_like"] = liketerm_from_glob(host)
else:
filters["hostname"] = host
if not _uses_glob(path):
rterm = re.escape(path)
lterm = path # no `db.escape`, `endswith` does autoescape
result = db.search(
conn,
endswith=lterm,
regex=f"(?i)(^|/){rterm}$", # ensure a full name match
**filters,
)
else:
rterm = regex_from_glob(path)
lterm = liketerm_from_glob(path)
result = db.search(
conn,
regex=f"(?i)(^|/){rterm}$",
like=f"%{lterm}", # helps to drastically speed up the regex match
**filters,
)
for f in result:
yield models.File(**f) # type: ignore
elif match == "fuzzy":
term = "%".join(db.escape(p) for p in path.split("/"))
for f in db.search(conn, like=f"%{term}%", type=type, hostname=host):
yield models.File(**f) # type: ignore
def _ls_dir_contents(*, host: str, path: str) -> Iterable[models.File]:
with db.transaction() as conn:
row = db.get_file(conn, location=path, hostname=host)
if not row:
log.warning("No match: %a:%a", host, path)
return
if row["stat_type"] != "d":
yield models.File(**row) # type: ignore
return
for f in db.get_files(conn, parent_id=row["id"]):
yield models.File(**f) # type: ignore
def _uses_glob(string: str) -> bool:
return "*" in string or "?" in string
def ls(
pathspec: str,
*,
type: "models.StatType | None" = None,
match: Literal["regex", "glob", "fuzzy"] = "glob",
) -> Iterable[models.File]:
host, path = _parse_pathspec(pathspec)
if host == "":
host = config.hostname # allow ":foo" as shortcut for local search
log.info("Using path spec: %a:%a", host, path)
if path != "/" and path.endswith("/"):
# In our DB no path except root (`/`) ends with `/`.
path = path.rstrip("/")
if host and path.startswith("/") and not _uses_glob(host + path):
yield from _ls_dir_contents(host=host, path=path)
else:
yield from _ls_files(host=host, path=path, type=type, match=match)
def rm(pathspec: str, *, include_children: bool = False):
"""Remove the given path and all its descendants."""
host, path = _parse_pathspec(pathspec)
if not host or not path.startswith("/"):
log.error(
"A full absolute path including hostname is required when removing files: %a",
pathspec,
)
raise ValueError("Incomplete path specification.")
if path != "/" and path.endswith("/"):
path = path[:-1]
with db.transaction() as conn:
row = db.get_file(conn, hostname=host, location=path)
if not row:
log.error("No matching file found: %a", pathspec)
raise ValueError("Path not found.")
children = db.get_files(conn, parent_id=row["id"])
if children and not include_children:
log.error("File has children: %a", pathspec)
raise RuntimeError("Path has children.")
db.remove_all(conn, location=path, hostname=host)
def hosts() -> "set[str]":
with db.transaction() as conn:
return set(db.all_hostnames(conn))

84
metadex/models.py Normal file

@ -0,0 +1,84 @@
import os
from dataclasses import asdict, dataclass
from datetime import datetime
from os import DirEntry
from pathlib import Path
from stat import S_IFDIR, S_IFLNK, S_IFMT, S_IFREG
from typing import Literal
from . import config
_modes = {S_IFDIR: "d", S_IFREG: "f", S_IFLNK: "l"}
asdict = asdict
StatType = Literal["d", "f", "l", "-"]
@dataclass
class File:
id: "int | None"
parent_id: "int | None"  # None for a host's root entry
added: datetime
updated: datetime
location: str
hostname: str # XXX should better use a fingerprint/unique-id per host (e.g. `/etc/metadex.hostid`, for disks put it on their /)
stat_bytes: int
# stat_changed: datetime # XXX remove? The `ctime` changes not only for content changes but also file attr changes, which we don't track anyway.
stat_modified: datetime
stat_type: StatType
@classmethod
def from_direntry(cls, entry: DirEntry):
now = datetime.now()
pstat = entry.stat(follow_symlinks=False)
return cls(
id=None,  # not yet assigned by the DB
parent_id=None,
added=now,
updated=now,
location=entry.path,
hostname=config.hostname,
stat_bytes=pstat.st_size,
# stat_changed=datetime.fromtimestamp(pstat.st_ctime),
stat_modified=datetime.fromtimestamp(pstat.st_mtime),
stat_type=_modes.get(S_IFMT(pstat.st_mode), "-"), # type: ignore
)
@classmethod
def from_path(cls, path: Path):
now = datetime.now()
pstat = os.stat(path, follow_symlinks=False)
return cls(
id=None,  # not yet assigned by the DB
parent_id=None,
added=now,
updated=now,
location=os.path.abspath(path),
hostname=config.hostname,
stat_bytes=pstat.st_size,
# stat_changed=datetime.fromtimestamp(pstat.st_ctime),
stat_modified=datetime.fromtimestamp(pstat.st_mtime),
stat_type=_modes.get(S_IFMT(pstat.st_mode), "-"), # type: ignore
)
@staticmethod
def dict_from_entry(entry: "DirEntry | Path") -> dict:
"""Return the File's data structure as dict.
This can be useful to skip calling `asdict`, which can be quite slow.
"""
# now = datetime.now()
if isinstance(entry, Path):
location = os.path.abspath(entry)
pstat = os.stat(entry, follow_symlinks=False)
else:
location = entry.path.encode(errors="replace").decode()
pstat = entry.stat(follow_symlinks=False)
return dict(
# added=now,
# updated=now,
location=location,
hostname=config.hostname,
stat_bytes=pstat.st_size,
stat_modified=datetime.fromtimestamp(pstat.st_mtime),
stat_type=_modes.get(S_IFMT(pstat.st_mode), "-"),
)

51
metadex/utils.py Normal file

@ -0,0 +1,51 @@
import os
from pathlib import Path
_size_quantifiers = "BKMGTP"
_size_map: "dict[str, int]" = {
_size_quantifiers[i]: 2 ** (10 * i) for i in range(len(_size_quantifiers))
}
def size_for_display(byte_count: int, precision: int = 2, format="short") -> str:
for qtf in reversed(_size_quantifiers):
qty = byte_count / _size_map[qtf]
if qty > 1:
break
size = f"{qty:.{precision}f}"
if format == "compact":
size = size.replace("." + "0" * precision, "")  # silly hack to remove an all-zero fraction
return f"{size:>{4+precision}}{qtf}"
tpl = "{{:.{precision}f}} {{}}".format(precision=precision)
if format == "short":
pass
elif format == "long" and qtf != "B":
tpl += "iB"
return tpl.format(qty, qtf)
def parse_size(size: str) -> int:
"""Return the given size converted to byte count.
Supported formats:
- plain byte count, e.g. "12345"
- short format, e.g. "123.45K"
Not supported: unit variants such as Kb (kilobit) or KB as 10**3 bytes; quantifiers are always interpreted as powers of 1024 (KiB-style).
"""
if size.isdigit():
return int(size)
d, q = float(size[:-1]), size[-1]
return int(d * _size_map[q])
def abspath(path: Path) -> Path:
"""Normalize & make the given path absolute while maintaining symlinks.
Similar to Path.resolve(strict=False), but doesn't resolve symlinks."""
return Path(os.path.abspath(path))

139
poetry.lock generated Normal file

@ -0,0 +1,139 @@
[[package]]
name = "greenlet"
version = "1.1.2"
description = "Lightweight in-process concurrent programming"
category = "main"
optional = false
python-versions = ">=2.7,!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*,!=3.4.*"
[package.extras]
docs = ["sphinx"]
[[package]]
name = "sqlalchemy"
version = "1.4.39"
description = "Database Abstraction Library"
category = "main"
optional = false
python-versions = "!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*,!=3.4.*,!=3.5.*,>=2.7"
[package.dependencies]
greenlet = {version = "!=0.4.17", markers = "python_version >= \"3\" and (platform_machine == \"aarch64\" or platform_machine == \"ppc64le\" or platform_machine == \"x86_64\" or platform_machine == \"amd64\" or platform_machine == \"AMD64\" or platform_machine == \"win32\" or platform_machine == \"WIN32\")"}
[package.extras]
aiomysql = ["greenlet (!=0.4.17)", "aiomysql"]
aiosqlite = ["typing_extensions (!=3.10.0.1)", "greenlet (!=0.4.17)", "aiosqlite"]
asyncio = ["greenlet (!=0.4.17)"]
asyncmy = ["greenlet (!=0.4.17)", "asyncmy (>=0.2.3,!=0.2.4)"]
mariadb_connector = ["mariadb (>=1.0.1)"]
mssql = ["pyodbc"]
mssql_pymssql = ["pymssql"]
mssql_pyodbc = ["pyodbc"]
mypy = ["sqlalchemy2-stubs", "mypy (>=0.910)"]
mysql = ["mysqlclient (>=1.4.0,<2)", "mysqlclient (>=1.4.0)"]
mysql_connector = ["mysql-connector-python"]
oracle = ["cx_oracle (>=7,<8)", "cx_oracle (>=7)"]
postgresql = ["psycopg2 (>=2.7)"]
postgresql_asyncpg = ["greenlet (!=0.4.17)", "asyncpg"]
postgresql_pg8000 = ["pg8000 (>=1.16.6,!=1.29.0)"]
postgresql_psycopg2binary = ["psycopg2-binary"]
postgresql_psycopg2cffi = ["psycopg2cffi"]
pymysql = ["pymysql (<1)", "pymysql"]
sqlcipher = ["sqlcipher3-binary"]
[metadata]
lock-version = "1.1"
python-versions = "^3.8"
content-hash = "01d83cdef20caa2f18db197ca0498033c4995040150a36de92a3958efb0e9fb3"
[metadata.files]
greenlet = [
{file = "greenlet-1.1.2-cp27-cp27m-macosx_10_14_x86_64.whl", hash = "sha256:58df5c2a0e293bf665a51f8a100d3e9956febfbf1d9aaf8c0677cf70218910c6"},
{file = "greenlet-1.1.2-cp27-cp27m-manylinux1_x86_64.whl", hash = "sha256:aec52725173bd3a7b56fe91bc56eccb26fbdff1386ef123abb63c84c5b43b63a"},
{file = "greenlet-1.1.2-cp27-cp27m-manylinux2010_x86_64.whl", hash = "sha256:833e1551925ed51e6b44c800e71e77dacd7e49181fdc9ac9a0bf3714d515785d"},
{file = "greenlet-1.1.2-cp27-cp27m-win32.whl", hash = "sha256:aa5b467f15e78b82257319aebc78dd2915e4c1436c3c0d1ad6f53e47ba6e2713"},
{file = "greenlet-1.1.2-cp27-cp27m-win_amd64.whl", hash = "sha256:40b951f601af999a8bf2ce8c71e8aaa4e8c6f78ff8afae7b808aae2dc50d4c40"},
{file = "greenlet-1.1.2-cp27-cp27mu-manylinux1_x86_64.whl", hash = "sha256:95e69877983ea39b7303570fa6760f81a3eec23d0e3ab2021b7144b94d06202d"},
{file = "greenlet-1.1.2-cp27-cp27mu-manylinux2010_x86_64.whl", hash = "sha256:356b3576ad078c89a6107caa9c50cc14e98e3a6c4874a37c3e0273e4baf33de8"},
{file = "greenlet-1.1.2-cp310-cp310-macosx_10_14_x86_64.whl", hash = "sha256:8639cadfda96737427330a094476d4c7a56ac03de7265622fcf4cfe57c8ae18d"},
{file = "greenlet-1.1.2-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:97e5306482182170ade15c4b0d8386ded995a07d7cc2ca8f27958d34d6736497"},
{file = "greenlet-1.1.2-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:e6a36bb9474218c7a5b27ae476035497a6990e21d04c279884eb10d9b290f1b1"},
{file = "greenlet-1.1.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:abb7a75ed8b968f3061327c433a0fbd17b729947b400747c334a9c29a9af6c58"},
{file = "greenlet-1.1.2-cp310-cp310-win_amd64.whl", hash = "sha256:14d4f3cd4e8b524ae9b8aa567858beed70c392fdec26dbdb0a8a418392e71708"},
{file = "greenlet-1.1.2-cp35-cp35m-macosx_10_14_x86_64.whl", hash = "sha256:17ff94e7a83aa8671a25bf5b59326ec26da379ace2ebc4411d690d80a7fbcf23"},
{file = "greenlet-1.1.2-cp35-cp35m-manylinux1_x86_64.whl", hash = "sha256:9f3cba480d3deb69f6ee2c1825060177a22c7826431458c697df88e6aeb3caee"},
{file = "greenlet-1.1.2-cp35-cp35m-manylinux2010_x86_64.whl", hash = "sha256:fa877ca7f6b48054f847b61d6fa7bed5cebb663ebc55e018fda12db09dcc664c"},
{file = "greenlet-1.1.2-cp35-cp35m-win32.whl", hash = "sha256:7cbd7574ce8e138bda9df4efc6bf2ab8572c9aff640d8ecfece1b006b68da963"},
{file = "greenlet-1.1.2-cp35-cp35m-win_amd64.whl", hash = "sha256:903bbd302a2378f984aef528f76d4c9b1748f318fe1294961c072bdc7f2ffa3e"},
{file = "greenlet-1.1.2-cp36-cp36m-macosx_10_14_x86_64.whl", hash = "sha256:049fe7579230e44daef03a259faa24511d10ebfa44f69411d99e6a184fe68073"},
{file = "greenlet-1.1.2-cp36-cp36m-manylinux1_x86_64.whl", hash = "sha256:dd0b1e9e891f69e7675ba5c92e28b90eaa045f6ab134ffe70b52e948aa175b3c"},
{file = "greenlet-1.1.2-cp36-cp36m-manylinux2010_x86_64.whl", hash = "sha256:7418b6bfc7fe3331541b84bb2141c9baf1ec7132a7ecd9f375912eca810e714e"},
{file = "greenlet-1.1.2-cp36-cp36m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f9d29ca8a77117315101425ec7ec2a47a22ccf59f5593378fc4077ac5b754fce"},
{file = "greenlet-1.1.2-cp36-cp36m-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:21915eb821a6b3d9d8eefdaf57d6c345b970ad722f856cd71739493ce003ad08"},
{file = "greenlet-1.1.2-cp36-cp36m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:eff9d20417ff9dcb0d25e2defc2574d10b491bf2e693b4e491914738b7908168"},
{file = "greenlet-1.1.2-cp36-cp36m-win32.whl", hash = "sha256:32ca72bbc673adbcfecb935bb3fb1b74e663d10a4b241aaa2f5a75fe1d1f90aa"},
{file = "greenlet-1.1.2-cp36-cp36m-win_amd64.whl", hash = "sha256:f0214eb2a23b85528310dad848ad2ac58e735612929c8072f6093f3585fd342d"},
{file = "greenlet-1.1.2-cp37-cp37m-macosx_10_14_x86_64.whl", hash = "sha256:b92e29e58bef6d9cfd340c72b04d74c4b4e9f70c9fa7c78b674d1fec18896dc4"},
{file = "greenlet-1.1.2-cp37-cp37m-manylinux1_x86_64.whl", hash = "sha256:fdcec0b8399108577ec290f55551d926d9a1fa6cad45882093a7a07ac5ec147b"},
{file = "greenlet-1.1.2-cp37-cp37m-manylinux2010_x86_64.whl", hash = "sha256:93f81b134a165cc17123626ab8da2e30c0455441d4ab5576eed73a64c025b25c"},
{file = "greenlet-1.1.2-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1e12bdc622676ce47ae9abbf455c189e442afdde8818d9da983085df6312e7a1"},
{file = "greenlet-1.1.2-cp37-cp37m-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:8c790abda465726cfb8bb08bd4ca9a5d0a7bd77c7ac1ca1b839ad823b948ea28"},
{file = "greenlet-1.1.2-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f276df9830dba7a333544bd41070e8175762a7ac20350786b322b714b0e654f5"},
{file = "greenlet-1.1.2-cp37-cp37m-win32.whl", hash = "sha256:64e6175c2e53195278d7388c454e0b30997573f3f4bd63697f88d855f7a6a1fc"},
{file = "greenlet-1.1.2-cp37-cp37m-win_amd64.whl", hash = "sha256:b11548073a2213d950c3f671aa88e6f83cda6e2fb97a8b6317b1b5b33d850e06"},
{file = "greenlet-1.1.2-cp38-cp38-macosx_10_14_x86_64.whl", hash = "sha256:9633b3034d3d901f0a46b7939f8c4d64427dfba6bbc5a36b1a67364cf148a1b0"},
{file = "greenlet-1.1.2-cp38-cp38-manylinux1_x86_64.whl", hash = "sha256:eb6ea6da4c787111adf40f697b4e58732ee0942b5d3bd8f435277643329ba627"},
{file = "greenlet-1.1.2-cp38-cp38-manylinux2010_x86_64.whl", hash = "sha256:f3acda1924472472ddd60c29e5b9db0cec629fbe3c5c5accb74d6d6d14773478"},
{file = "greenlet-1.1.2-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e859fcb4cbe93504ea18008d1df98dee4f7766db66c435e4882ab35cf70cac43"},
{file = "greenlet-1.1.2-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:00e44c8afdbe5467e4f7b5851be223be68adb4272f44696ee71fe46b7036a711"},
{file = "greenlet-1.1.2-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ec8c433b3ab0419100bd45b47c9c8551248a5aee30ca5e9d399a0b57ac04651b"},
{file = "greenlet-1.1.2-cp38-cp38-win32.whl", hash = "sha256:288c6a76705dc54fba69fbcb59904ae4ad768b4c768839b8ca5fdadec6dd8cfd"},
{file = "greenlet-1.1.2-cp38-cp38-win_amd64.whl", hash = "sha256:8d2f1fb53a421b410751887eb4ff21386d119ef9cde3797bf5e7ed49fb51a3b3"},
{file = "greenlet-1.1.2-cp39-cp39-macosx_10_14_x86_64.whl", hash = "sha256:166eac03e48784a6a6e0e5f041cfebb1ab400b394db188c48b3a84737f505b67"},
{file = "greenlet-1.1.2-cp39-cp39-manylinux1_x86_64.whl", hash = "sha256:572e1787d1460da79590bf44304abbc0a2da944ea64ec549188fa84d89bba7ab"},
{file = "greenlet-1.1.2-cp39-cp39-manylinux2010_x86_64.whl", hash = "sha256:be5f425ff1f5f4b3c1e33ad64ab994eed12fc284a6ea71c5243fd564502ecbe5"},
{file = "greenlet-1.1.2-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:b1692f7d6bc45e3200844be0dba153612103db241691088626a33ff1f24a0d88"},
{file = "greenlet-1.1.2-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:7227b47e73dedaa513cdebb98469705ef0d66eb5a1250144468e9c3097d6b59b"},
{file = "greenlet-1.1.2-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:7ff61ff178250f9bb3cd89752df0f1dd0e27316a8bd1465351652b1b4a4cdfd3"},
{file = "greenlet-1.1.2-cp39-cp39-win32.whl", hash = "sha256:f70a9e237bb792c7cc7e44c531fd48f5897961701cdaa06cf22fc14965c496cf"},
{file = "greenlet-1.1.2-cp39-cp39-win_amd64.whl", hash = "sha256:013d61294b6cd8fe3242932c1c5e36e5d1db2c8afb58606c5a67efce62c1f5fd"},
{file = "greenlet-1.1.2.tar.gz", hash = "sha256:e30f5ea4ae2346e62cedde8794a56858a67b878dd79f7df76a0767e356b1744a"},
]
sqlalchemy = [
{file = "SQLAlchemy-1.4.39-cp27-cp27m-macosx_10_14_x86_64.whl", hash = "sha256:4770eb3ba69ec5fa41c681a75e53e0e342ac24c1f9220d883458b5596888e43a"},
{file = "SQLAlchemy-1.4.39-cp27-cp27m-manylinux_2_5_x86_64.manylinux1_x86_64.whl", hash = "sha256:752ef2e8dbaa3c5d419f322e3632f00ba6b1c3230f65bc97c2ff5c5c6c08f441"},
{file = "SQLAlchemy-1.4.39-cp27-cp27m-win32.whl", hash = "sha256:b30e70f1594ee3c8902978fd71900d7312453922827c4ce0012fa6a8278d6df4"},
{file = "SQLAlchemy-1.4.39-cp27-cp27m-win_amd64.whl", hash = "sha256:864d4f89f054819cb95e93100b7d251e4d114d1c60bc7576db07b046432af280"},
{file = "SQLAlchemy-1.4.39-cp27-cp27mu-manylinux_2_5_x86_64.manylinux1_x86_64.whl", hash = "sha256:8f901be74f00a13bf375241a778455ee864c2c21c79154aad196b7a994e1144f"},
{file = "SQLAlchemy-1.4.39-cp310-cp310-macosx_10_15_universal2.whl", hash = "sha256:1745987ada1890b0e7978abdb22c133eca2e89ab98dc17939042240063e1ef21"},
{file = "SQLAlchemy-1.4.39-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:ede13a472caa85a13abe5095e71676af985d7690eaa8461aeac5c74f6600b6c0"},
{file = "SQLAlchemy-1.4.39-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:7f13644b15665f7322f9e0635129e0ef2098409484df67fcd225d954c5861559"},
{file = "SQLAlchemy-1.4.39-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:26146c59576dfe9c546c9f45397a7c7c4a90c25679492ff610a7500afc7d03a6"},
{file = "SQLAlchemy-1.4.39-cp310-cp310-win32.whl", hash = "sha256:91d2b89bb0c302f89e753bea008936acfa4e18c156fb264fe41eb6bbb2bbcdeb"},
{file = "SQLAlchemy-1.4.39-cp310-cp310-win_amd64.whl", hash = "sha256:50e7569637e2e02253295527ff34666706dbb2bc5f6c61a5a7f44b9610c9bb09"},
{file = "SQLAlchemy-1.4.39-cp36-cp36m-macosx_10_14_x86_64.whl", hash = "sha256:107df519eb33d7f8e0d0d052128af2f25066c1a0f6b648fd1a9612ab66800b86"},
{file = "SQLAlchemy-1.4.39-cp36-cp36m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f24d4d6ec301688c59b0c4bb1c1c94c5d0bff4ecad33bb8f5d9efdfb8d8bc925"},
{file = "SQLAlchemy-1.4.39-cp36-cp36m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:7b2785dd2a0c044a36836857ac27310dc7a99166253551ee8f5408930958cc60"},
{file = "SQLAlchemy-1.4.39-cp36-cp36m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e6e2c8581c6620136b9530137954a8376efffd57fe19802182c7561b0ab48b48"},
{file = "SQLAlchemy-1.4.39-cp36-cp36m-win32.whl", hash = "sha256:fbc076f79d830ae4c9d49926180a1140b49fa675d0f0d555b44c9a15b29f4c80"},
{file = "SQLAlchemy-1.4.39-cp36-cp36m-win_amd64.whl", hash = "sha256:0ec54460475f0c42512895c99c63d90dd2d9cbd0c13491a184182e85074b04c5"},
{file = "SQLAlchemy-1.4.39-cp37-cp37m-macosx_10_15_x86_64.whl", hash = "sha256:6f95706da857e6e79b54c33c1214f5467aab10600aa508ddd1239d5df271986e"},
{file = "SQLAlchemy-1.4.39-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:621f050e72cc7dfd9ad4594ff0abeaad954d6e4a2891545e8f1a53dcdfbef445"},
{file = "SQLAlchemy-1.4.39-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:05a05771617bfa723ba4cef58d5b25ac028b0d68f28f403edebed5b8243b3a87"},
{file = "SQLAlchemy-1.4.39-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:20bf65bcce65c538e68d5df27402b39341fabeecf01de7e0e72b9d9836c13c52"},
{file = "SQLAlchemy-1.4.39-cp37-cp37m-win32.whl", hash = "sha256:f2a42acc01568b9701665e85562bbff78ec3e21981c7d51d56717c22e5d3d58b"},
{file = "SQLAlchemy-1.4.39-cp37-cp37m-win_amd64.whl", hash = "sha256:6d81de54e45f1d756785405c9d06cd17918c2eecc2d4262dc2d276ca612c2f61"},
{file = "SQLAlchemy-1.4.39-cp38-cp38-macosx_10_15_x86_64.whl", hash = "sha256:5c2d19bfb33262bf987ef0062345efd0f54c4189c2d95159c72995457bf4a359"},
{file = "SQLAlchemy-1.4.39-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:14ea8ff2d33c48f8e6c3c472111d893b9e356284d1482102da9678195e5a8eac"},
{file = "SQLAlchemy-1.4.39-cp38-cp38-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:ec3985c883d6d217cf2013028afc6e3c82b8907192ba6195d6e49885bfc4b19d"},
{file = "SQLAlchemy-1.4.39-cp38-cp38-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:1962dfee37b7fb17d3d4889bf84c4ea08b1c36707194c578f61e6e06d12ab90f"},
{file = "SQLAlchemy-1.4.39-cp38-cp38-win32.whl", hash = "sha256:047ef5ccd8860f6147b8ac6c45a4bc573d4e030267b45d9a1c47b55962ff0e6f"},
{file = "SQLAlchemy-1.4.39-cp38-cp38-win_amd64.whl", hash = "sha256:b71be98ef6e180217d1797185c75507060a57ab9cd835653e0112db16a710f0d"},
{file = "SQLAlchemy-1.4.39-cp39-cp39-macosx_10_15_x86_64.whl", hash = "sha256:365b75938049ae31cf2176efd3d598213ddb9eb883fbc82086efa019a5f649df"},
{file = "SQLAlchemy-1.4.39-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e7a7667d928ba6ee361a3176e1bef6847c1062b37726b33505cc84136f657e0d"},
{file = "SQLAlchemy-1.4.39-cp39-cp39-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:c6d00cb9da8d0cbfaba18cad046e94b06de6d4d0ffd9d4095a3ad1838af22528"},
{file = "SQLAlchemy-1.4.39-cp39-cp39-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:b0538b66f959771c56ff996d828081908a6a52a47c5548faed4a3d0a027a5368"},
{file = "SQLAlchemy-1.4.39-cp39-cp39-win32.whl", hash = "sha256:d1f665e50592caf4cad3caed3ed86f93227bffe0680218ccbb293bd5a6734ca8"},
{file = "SQLAlchemy-1.4.39-cp39-cp39-win_amd64.whl", hash = "sha256:8b773c9974c272aae0fa7e95b576d98d17ee65f69d8644f9b6ffc90ee96b4d19"},
{file = "SQLAlchemy-1.4.39.tar.gz", hash = "sha256:8194896038753b46b08a0b0ae89a5d80c897fb601dd51e243ed5720f1f155d27"},
]

15
pyproject.toml Normal file

@ -0,0 +1,15 @@
[tool.poetry]
name = "metadex"
version = "0.1.0"
description = ""
authors = ["ducklet <ducklet@noreply.code.dumpr.org>"]
[tool.poetry.dependencies]
python = "^3.8"
SQLAlchemy = "^1.4.35"
[tool.poetry.dev-dependencies]
[build-system]
requires = ["poetry-core>=1.0.0"]
build-backend = "poetry.core.masonry.api"

1
pyrightconfig.json Normal file

@ -0,0 +1 @@
{"pythonPlatform":"Linux", "pythonVersion":"3.8"}

16
run Executable file

@ -0,0 +1,16 @@
#!/bin/sh -euf
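# Convenience wrapper: run metadex from the project venv with the repo-local
# DB and ignore file, e.g. `./run scan ~` or `./run ls '*.iso'`.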
here=$(dirname "$(realpath "$0")")
# cd "$here"
python_bin="$here"/.venv/bin/python
[ -z "${DEBUG:-}" ] || set -x
# time python -m cProfile -s tottime -m metadex -n scan ~ >profile-scan.txt
PYTHONPATH="$here" \
"$python_bin" -m metadex \
--db "$here/metadex.sqlite" \
--ignore-from "$here/metadex.ignore" \
"$@"

25
scripts/lint Executable file

@ -0,0 +1,25 @@
#!/bin/sh -eu
if [ "${1:-}" = '--fix' ]; then
autoflake \
--remove-duplicate-keys \
--remove-unused-variables \
--remove-all-unused-imports \
--ignore-init-module-imports \
--recursive \
--in-place \
.
isort --profile black .
black .
else
autoflake \
--remove-duplicate-keys \
--remove-unused-variables \
--remove-all-unused-imports \
--ignore-init-module-imports \
--recursive \
--check \
.
isort --profile black --check .
black --check .
fi

23
scripts/ls Executable file

@ -0,0 +1,23 @@
#!/bin/sh -eu
# Create an `ingest-ls`-compatible file listing.
#
# Compatible with current versions of GNU and macOS `ls`.
#
# $ scripts/ls -R /some/base/path \
# | python -m metadex --hostname somehost ingest-ls --remove-missing
# (ingest-ls cannot detect a hostname automatically, so one must be given; "somehost" is a placeholder)
_ls() {
if command ls -d --time-style='+%s' . >/dev/null 2>&1; then
# echo 'GNU'
command ls --time-style='+%s' "$@"
elif command ls -d -D '%s' . >/dev/null 2>&1; then
# echo 'macOS'
command ls -D '%s' "$@"
else
# echo 'unknown'
command ls "$@"
fi
}
_ls -lnAU "$@"