ducklet 2022-08-14 20:41:58 +02:00
commit 01a96c14d4
18 changed files with 2108 additions and 0 deletions

3
.gitignore vendored Normal file

@ -0,0 +1,3 @@
*.local
metadex.sqlite
pyrightconfig.json

20
README.md Normal file

@ -0,0 +1,20 @@
# Metadex
Metadex is a database for file metadata. It offers a simple and powerful CLI to scan your file system and find indexed files.
It supports path mapping, fast fuzzy searching, and multiple hosts.
Limited support exists for importing metadata from external file listings, for example the output of `ls -R`.
It is mostly useful for accessing information about files that are stored remotely or offline; for example, it can keep track of the contents of backup DVDs or other cold storage.
Other use cases might also emerge.
All files are indexed with their last modified timestamp and file size.
Since searches are fast and flexible, the database can also be used to query local file information, generate statistics, or build graphs such as a used-space map.
## Example usage
```sh
metadex scan ~ # Add all files from your home directory to the index.
metadex ls '*.xml' # List all .xml files.
metadex ls '.config/**.json' # List all .json config files.
metadex ls .git --type d # List all .git directories.
```
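The ingest and path-mapping features mentioned above follow the same pattern. A rough sketch (the host names `dvd-42` and `foo` and all paths are placeholders):
```sh
ls -lR /mnt/dvd | metadex --hostname dvd-42 ingest-ls --ref-year 2022  # Ingest an external listing; a hostname must be given.
metadex ls 'dvd-42:*.iso'  # Search only on that host.
metadex scan /mnt/foo --map-mount 'src=/mnt/foo,dest=foo:'  # Record the mount point under the host "foo" instead.
```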

6
metadex.ignore Normal file

@ -0,0 +1,6 @@
*.git
.venv
.DS_Store
__pycache__
node_modules
vendor

0
metadex/__init__.py Normal file

319
metadex/__main__.py Normal file

@ -0,0 +1,319 @@
import argparse
import logging
import os
import stat
import sys
from pathlib import Path
from . import config, metadex, utils
log = logging.getLogger(__name__)
def getargs():
parser = argparse.ArgumentParser()
parser.set_defaults(mode=None)
parser.add_argument(
"--dry-run",
"-n",
action="store_true",
help="don't update the DB, only print what would change",
)
parser.add_argument(
"--quick",
action="store_true",
default=not config.db_allow_slow,
help="skip all DB integrity & optimization steps",
)
parser.add_argument(
"--hostname",
help="overwrite the hostname to use as path prefix",
)
parser.add_argument(
"--ignore-from",
type=Path,
default=config.default_ignore,
help="load list of ignore rules from the given file",
)
parser.add_argument("--db", type=Path, default=config.default_db)
parser.add_argument("--verbose", "-v", action="store_true", default=False)
subparsers = parser.add_subparsers(title="commands")
# Command: scan
parser_scan = subparsers.add_parser("scan", help="scan a local file system")
parser_scan.set_defaults(mode="scan")
parser_scan.add_argument(
"basedir",
type=Path,
nargs="+",
help="index all files from this dir",
)
parser_scan.add_argument(
"--no-remove-missing",
dest="remove_missing",
action="store_false",
help="do not remove files from the database which cannot be found in the file system",
)
parser_scan.add_argument(
"--map-mount",
nargs="*",
default=[],
type=str,
help="map a source host:path to any other destination during scanning for files\nExample: src=/mnt/foo,dest=foo:",
)
# Command: ingest-ls
parser_ingest_ls = subparsers.add_parser(
"ingest-ls",
help="ingest extra data",
description="When ingesting data from an external source, the hostname will not be set automatically.",
)
parser_ingest_ls.set_defaults(mode="ingest-ls")
parser_ingest_ls.add_argument(
"infile",
nargs="?",
type=argparse.FileType(),
default=sys.stdin,
help="output from `ls -lR`",
)
parser_ingest_ls.add_argument(
"--remove-missing",
action="store_true",
help="Remove files not listed in the infile.",
)
parser_ingest_ls.add_argument(
"--ref-year",
type=int,
help="The year when 'ls -l' was run, to resolve relative dates.",
)
# Command: ingest-db
parser_ingest_db = subparsers.add_parser("ingest-db")
parser_ingest_db.set_defaults(mode="ingest-db")
parser_ingest_db.add_argument(
"infile",
type=Path,
help="a Metadex SQLite DB",
)
parser_ingest_db.add_argument(
"--map-mount",
nargs="*",
default=[],
type=str,
help="map a source host:path to any other destination while importing",
)
# Command: rm
parser_rm = subparsers.add_parser("rm")
parser_rm.set_defaults(mode="rm")
parser_rm.add_argument(
"files",
type=str,
nargs="+",
help="files to remove",
)
parser_rm.add_argument(
"-r",
action="store_true",
dest="include_subfiles",
help="include sub-files",
)
# Command: ls
parser_ls = subparsers.add_parser("ls")
parser_ls.set_defaults(mode="ls")
parser_ls.add_argument(
"file",
type=str,
nargs="*",
help="look up a file",
)
parser_ls.add_argument(
"--type",
"-t",
choices="dfl",
help="Filter searches to (d)irectories, plain (f)iles, or sym(l)inks.",
)
parser_ls.add_argument("--format", type=str, default="{date}\t{size}\t{path}")
parser_ls.add_argument(
"--match", choices=("regex", "glob", "fuzzy"), default="glob"
)
# Parse args.
args = parser.parse_args()
if args.mode == "scan":
args.basedir = [utils.abspath(p) for p in args.basedir]
elif args.mode == "ingest-db":
args.infile = utils.abspath(args.infile)
elif args.mode == "ingest-ls":
config.hostname = None
elif args.mode is None:
parser.print_help()
parser.exit(1, "Error: No command selected.\n")
return args
def cmd_ingest_ls(args):
metadex.init(args.db)
log.info("Ingesting ls file %a ...", args.infile.name)
metadex.ingest_ls(
args.infile,
ignore_file=args.ignore_from,
ref_year=args.ref_year,
remove_missing=args.remove_missing,
)
metadex.close()
def cmd_ingest_db(args):
metadex.init(args.db)
log.info("Ingesting Metadex DB file %a ...", str(args.infile))
context = metadex.ingest_db_file(
args.infile, ignore_file=args.ignore_from, map_pathspecs=args.map_mount
)
msg = f"Checked {context.seen} files, {context.added} new, {context.changed} changed, {context.ignored} ignored, {context.removed} removed"
print(msg.ljust(metadex._terminal_width))
metadex.close()
def cmd_scan(args):
metadex.init(args.db)
for basedir in args.basedir:
log.info("Scanning %a ...", str(basedir))
context = metadex.scan(
basedir,
ignore_file=args.ignore_from,
remove_missing=args.remove_missing,
map_pathspecs=args.map_mount,
)
msg = f"{basedir}: Checked {context.seen} files, {context.added} new, {context.changed} changed, {context.ignored} ignored, {context.removed} removed"
print(msg.ljust(metadex._terminal_width))
metadex.close()
def cmd_rm(args):
metadex.init(args.db)
for path in args.files:
metadex.rm(path, include_children=args.include_subfiles)
metadex.close()
def cmd_ls(args):
metadex.init(args.db)
args.file = [f for f in args.file if f]
if not args.file:
# List all known hosts.
for host in sorted(metadex.hosts(), key=str.casefold):
print(f"{host}:")
else:
for pathspec in args.file:
for file in metadex.ls(pathspec, type=args.type, match=args.match):
date = file.stat_modified.isoformat(sep=" ", timespec="seconds")
size = utils.size_for_display(
file.stat_bytes, precision=1, format="compact"
)
path = f"{file.hostname}:{file.location}"
if file.stat_type == "d" and not path.endswith("/"): # relevant for `/`
path += "/"
fargs = {"date": date, "size": size, "path": path, "file": file}
try:
out = args.format.format(**fargs)
except (KeyError, AttributeError) as err:
keys = sorted(fargs.keys())
log.exception(
"Keys available to formatting: %s",
", ".join(keys),
exc_info=err,
)
return 5
print(out)
def is_stdout_piped():
s = os.fstat(sys.stdout.fileno())
return stat.S_ISFIFO(s.st_mode)
def main():
logging.basicConfig(
format="%(asctime)s.%(msecs)03d [%(name)s:%(process)d] %(levelname)s: %(message)s",
datefmt="%H:%M:%S",
level=config.loglevel,
)
logging.getLogger("sqlalchemy.engine").setLevel(
"INFO" if config.debug else "WARNING"
)
log.debug(f"Log level: {config.loglevel}")
args = getargs()
# print(args)
if args.verbose and config.loglevel == "WARNING":
config.loglevel = "INFO"
logging.getLogger().setLevel(config.loglevel)
config.db_allow_slow = not args.quick
config.dryrun = args.dry_run
config.is_stdout_piped = is_stdout_piped()
if args.hostname:
config.hostname = args.hostname
if config.hostname:
log.info("Using hostname: %a", config.hostname)
else:
log.error("Hostname is not set.")
log.info(
"If the hostname cannot be found automatically, try setting it using --hostname."
)
return 2
if config.dryrun:
log.info(f"--- DRY RUN ---")
if args.mode == "scan":
return cmd_scan(args)
elif args.mode == "ingest-ls":
return cmd_ingest_ls(args)
elif args.mode == "ingest-db":
return cmd_ingest_db(args)
elif args.mode == "rm":
return cmd_rm(args)
elif args.mode == "ls":
# Since this is a read-only operation we can change some config params.
config.db_allow_slow = False
config.dryrun = True
return cmd_ls(args)
if __name__ == "__main__":
sys.exit(main())

11
metadex/config.py Normal file

@ -0,0 +1,11 @@
import os
from pathlib import Path
debug = os.getenv("DEBUG") == "1"
loglevel = os.getenv("METADEX_LOGLEVEL") or ("DEBUG" if debug else "WARNING")
dryrun = False
hostname = os.uname().nodename # or socket.gethostname()
default_db = Path("metadex.sqlite")
default_ignore = Path("metadex.ignore")
db_allow_slow = True
is_stdout_piped = False

540
metadex/db.py Normal file

@ -0,0 +1,540 @@
import logging
from contextlib import contextmanager
from datetime import datetime
from functools import lru_cache
from pathlib import Path
from random import randint
from typing import Iterable, overload
from sqlalchemy import (
Column,
DateTime,
Enum,
Integer,
MetaData,
String,
Table,
UniqueConstraint,
create_engine,
)
from sqlalchemy.engine.base import Connection, Engine
from sqlalchemy.engine.row import Row
from sqlalchemy.exc import IntegrityError
from sqlalchemy.sql import and_, or_, select, text
from sqlalchemy.sql.schema import ForeignKey
from . import config
log = logging.getLogger(__name__)
metadata = MetaData()
metadex = Table(
"metadex",
metadata,
Column("id", Integer, primary_key=True),
Column(
"parent_id",
ForeignKey("metadex.id"),
nullable=True,
index=True,
comment="Points to the entry with the parent location.",
),
Column(
"added", DateTime, nullable=False
), # switch to Integer for smaller size maybe?
Column(
"updated", DateTime, nullable=False
), # switch to Integer for smaller size maybe?
Column("location", String, nullable=False, index=True),
Column("hostname", String, nullable=False, index=True),
Column("stat_bytes", Integer, nullable=False),
# Column("stat_changed", DateTime, nullable=False), # switch to Integer for smaller size maybe?
Column(
"stat_modified", DateTime, nullable=False
), # switch to Integer for smaller size maybe?
Column("stat_type", Enum("d", "f", "l", "-"), nullable=False),
UniqueConstraint("location", "hostname"),
)
engine: Engine = None # type:ignore
def check_integrity(conn: Connection):
stmt = text("PRAGMA integrity_check")
state = conn.execute(stmt).scalar()
if state is None:
raise IntegrityError(stmt, None, None)
log.info("Database file integrity: %s", state)
def check_parent_ids(conn: Connection):
log.info("Checking parent file associations ... press Ctrl-C to skip!")
try:
reassign_parent_ids(conn)
except KeyboardInterrupt:
log.warning("Aborted parent ID rebuild.")
def optimize(conn: Connection, *, vacuum: bool = False):
log.info("Optimizing database ...")
conn.execute(text("PRAGMA analysis_limit=400"))
conn.execute(text("PRAGMA optimize"))
if vacuum:
log.info("Running vacuum on database ... press Ctrl-C to skip!")
try:
conn.execute(text("VACUUM"))
except KeyboardInterrupt:
log.warning("Aborted DB cleanup.")
def autoconf(conn: Connection):
log.info("Configuring database ...")
conn.execute(text("PRAGMA journal_mode=WAL"))
conn.execute(text("PRAGMA synchronous=NORMAL"))
class Db:
engine: "Engine | None" = None
is_dirty: bool = False
def __init__(self, path: Path):
self.open(path)
def __del__(self):
self.close()
def open(self, path: Path):
log.info("Using database: %a", str(path))
if self.engine:
raise RuntimeError("DB already initialized.")
prefix = "sqlite+pysqlite:///"
self.engine = create_engine(f"{prefix}{path}", future=True)
metadata.create_all(self.engine)  # Create tables on this instance's engine (not the module-level one).
if config.db_allow_slow:
with self.transaction() as conn:
autoconf(conn)
check_integrity(conn)
def close(self):
if self.engine is None:
return
if self.is_dirty:
chance = 10 # Set the chance for long running actions to happen to 1 in X.
do_slow = config.db_allow_slow and randint(1, chance) == 1
with self.transaction() as conn:
# if do_slow:
# check_parent_ids(conn)
optimize(conn, vacuum=do_slow)
self.engine = None
@contextmanager
def transaction(
self, *, rollback_on_error: bool = False, force_rollback: bool = False
):
if self.engine is None:
raise RuntimeError("DB was closed.")
connect = (
self.engine.connect
if (force_rollback or config.dryrun)
else self.engine.begin
)
err = None
with connect() as conn:
try:
yield conn
except BaseException as e:
if force_rollback or rollback_on_error:
raise e
# Allow the connection to run its ordinary clean up, i.e. to flush
# the data written to it so far to disk.
err = e
if err:
raise err
if not force_rollback:
self.is_dirty = True
def init(path: Path = Path(":memory:")):
global engine
log.info("Using database: %a", str(path))
if engine:
raise RuntimeError("DB already initialized.")
prefix = "sqlite+pysqlite:///"
engine = create_engine(f"{prefix}{path}", future=True)
metadata.create_all(engine)
if config.db_allow_slow:
with transaction() as conn:
autoconf(conn)
check_integrity(conn)
def close():
global engine
chance = 10 # Set the chance for long running actions to happen to 1 in X.
do_slow = config.db_allow_slow and randint(1, chance) == 1
with transaction() as conn:
# if do_slow:
# check_parent_ids(conn)
optimize(conn, vacuum=do_slow)
engine = None # type: ignore
def iter_all(conn: Connection) -> Iterable[Row]:
return conn.execute(select(metadex))
def get_file(conn: Connection, *, location: str, hostname: str):
stmt = select(metadex).where(
and_(
metadex.c.location == location,
metadex.c.hostname == hostname,
)
)
return conn.execute(stmt).one_or_none()
def get_files(conn: Connection, *, parent_id: int):
stmt = select(metadex).where(
metadex.c.parent_id == parent_id,
)
return conn.execute(stmt).all()
_escape_char = "#"
def escape(s: str) -> str:
return (
s.replace(_escape_char, 2 * _escape_char)
.replace("%", _escape_char + "%")
.replace("_", _escape_char + "_")
)
def search(
conn: Connection,
*,
contains: "str | None" = None,
startswith: "str | None" = None,
endswith: "str | None" = None,
like: "str | None" = None,
regex: "str | None" = None,
type: "str | None" = None,
hostname: "str | None" = None,
hostname_like: "str | None" = None,
hostname_regex: "str | None" = None,
) -> "Iterable[Row]":
stmt = select(metadex)
if type:
stmt = stmt.where(metadex.c.stat_type == type)
if hostname:
stmt = stmt.where(metadex.c.hostname == hostname)
if hostname_like:
stmt = stmt.where(metadex.c.hostname.ilike(hostname_like, escape=_escape_char))
if hostname_regex:
stmt = stmt.where(metadex.c.hostname.regexp_match(hostname_regex))
if contains:
stmt = stmt.where(
metadex.c.location.contains(contains, autoescape=True),
)
if endswith:
stmt = stmt.where(
metadex.c.location.endswith(endswith, autoescape=True),
)
if startswith:
stmt = stmt.where(
metadex.c.location.startswith(startswith, autoescape=True),
)
if like:
stmt = stmt.where(
metadex.c.location.ilike(like, escape=_escape_char),
)
if regex:
# It's important that the "regex" filter comes last, because the order actually matters for SQLAlchemy/SQLite.
# Running this filter last allows all the _quick_ filters to apply first, leaving fewer rows for the expensive REGEXP statement.
stmt = stmt.where(
metadex.c.location.regexp_match(regex),
)
return conn.execute(stmt)
def all_hostnames(conn: Connection) -> Iterable[str]:
stmt = select(metadex.c.hostname).distinct()
return conn.execute(stmt).scalars().all()
def _fake_entry(path: Path, *, hostname=None, now, parent_id) -> dict:
return dict(
parent_id=parent_id,
added=now,
updated=now,
location=str(path),
hostname=hostname if hostname is not None else config.hostname,
stat_bytes=0,
stat_modified=datetime.fromtimestamp(0),
stat_type="d",
)
def _add_parents(conn: Connection, *, location: str, hostname: str):
p_id: "int | None" = None
for p in reversed(Path(location).parents):
log.warning("Forging parent: %a:%a", hostname, str(p))
d = _fake_entry(p, hostname=hostname, now=datetime.now(), parent_id=p_id)
d = get_or_add(conn, d)
p_id = d["id"]
# r = conn.execute(
# metadex.insert(),
# [d],
# )
# p_id = r.inserted_primary_key.id
return p_id
def get_or_add(conn: Connection, new_data: dict):
row = get_file(conn, location=new_data["location"], hostname=new_data["hostname"])
if row:
return row
log.info(
"File added: %a:%a (size: %i)",
new_data["hostname"],
new_data["location"],
new_data["stat_bytes"],
)
if "id" in new_data:
del new_data["id"]
new_data["parent_id"] = _parent_id(
conn, location=new_data["location"], hostname=new_data["hostname"]
)
if new_data["parent_id"] is None:
new_data["parent_id"] = _add_parents(
conn, location=new_data["location"], hostname=new_data["hostname"]
)
now = datetime.now()
if "added" not in new_data:
new_data["added"] = now
if "updated" not in new_data:
new_data["updated"] = now
r = conn.execute(metadex.insert(), [new_data])
new_data["id"] = r.inserted_primary_key.id
return new_data
def upsert_if_changed(conn: Connection, new_data: dict):
row = get_or_add(conn, new_data)
is_from_db = isinstance(row, Row)
if not is_from_db:
return "added"
is_changed = (
new_data["stat_bytes"] != row["stat_bytes"]
# or new_data["stat_changed"] != row["stat_changed"] # Ignore ctime, mtime is enough
or new_data["stat_modified"] != row["stat_modified"]
or new_data["stat_type"] != row["stat_type"]
)
if not is_changed:
return "unchanged"
log.info("File changed: %a:%a", new_data["hostname"], new_data["location"])
# changelog = []
# for f in ("stat_bytes", "stat_modified", "stat_type"):
# if new_data[f] != row[f]:
# changelog.append(f"{f[5:]}: {row[f]!a} -> {new_data[f]!a}")
# log.info("File changes: %s", ", ".join(changelog))
if "id" in new_data:
del new_data["id"]
new_data["parent_id"] = _parent_id(conn, metadex_id=row["id"])
# del new_data["added"]
new_data["updated"] = datetime.now()
stmt = metadex.update(
and_(
metadex.c.location == new_data["location"],
metadex.c.hostname == new_data["hostname"],
)
)
conn.execute(stmt, [new_data])
return "changed"
def remove_all(conn: Connection, location: str, *, hostname=None) -> int:
"""Remove the entry with the given path and all its descendants."""
# We're using text comparison here to catch removed descendants even if
# an intermediate directory is missing, e.g. when /foo and /foo/bar/boo are
# indexed but /foo/bar is not; this can happen through ignore rules and
# users adding those paths explicitly.
# We could also choose to ignore these edge cases and create orphans instead,
# or change our parent-id-mechanism to support skipping intermediates, both of
# which might be valid decisions for sake of optimization. For now we choose
# simple correctness. Let's see how bad the performance can get.
if hostname is None:
hostname = config.hostname
selector = and_(
metadex.c.hostname == hostname,
or_(
metadex.c.location == location,
metadex.c.location.startswith(location + "/", autoescape=True),
),
)
stmt = select(metadex.c.location).where(selector)
cur = conn.execute(stmt)
for (loc,) in cur:
log.warning("Purging file from DB: %a:%a", hostname, loc)
stmt = metadex.delete(selector)
return conn.execute(stmt).rowcount
@contextmanager
def transaction(rollback_on_error: bool = False):
connect = engine.connect if config.dryrun else engine.begin
err = None
with connect() as conn:
try:
yield conn
except BaseException as e:
if rollback_on_error:
raise e
# Allow the connection to run its ordinary clean up, i.e. to flush
# the data written to it so far to disk.
err = e
if err:
raise err
def files_in_dir(conn: Connection, location: str, *, hostname=None) -> Iterable[str]:
"""Return all file names for the given dir."""
if hostname is None:
hostname = config.hostname
query = select(metadex.c.id).where(
and_(metadex.c.hostname == hostname, metadex.c.location == location)
)
dir_id = conn.execute(query).scalar()
if dir_id is None:
return
query = select(metadex.c.location).where(metadex.c.parent_id == dir_id)
for (loc,) in conn.execute(query):
yield Path(loc).name
MetadexId = int
@overload
def _parent_id(conn: Connection, *, metadex_id: MetadexId) -> "MetadexId | None":
...
@overload
def _parent_id(conn: Connection, *, location: str, hostname: str) -> "MetadexId | None":
...
@lru_cache(maxsize=2048)
def _parent_id(
conn: Connection,
*,
metadex_id: "MetadexId | None" = None,
location: "str | None" = None,
hostname: "str | None" = None,
) -> "MetadexId | None":
if location is None:
stmt = select(metadex.c.location, metadex.c.hostname).where(
metadex.c.id == metadex_id
)
row = conn.execute(stmt).first()
if not row:
raise RuntimeError(
f"Metadex ID referenced but missing from DB: {metadex_id!a}"
)
location, hostname = row
assert location
parent_loc = str(Path(location).parent)
stmt = select(metadex.c.id).where(
and_(metadex.c.location == parent_loc, metadex.c.hostname == hostname)
)
val = conn.execute(stmt).scalar()
if not val:
log.warning(
"No parent found: %a",
{"metadex_id": metadex_id, "location": location, "hostname": hostname},
)
return val
def reassign_parent_ids(conn: Connection):
stmt = select(
metadex.c.id, metadex.c.parent_id, metadex.c.location, metadex.c.hostname
)
for (m_id, p_id_old, loc, host) in conn.execute(stmt):
parent_loc = str(Path(loc).parent)
if parent_loc == loc:
p_id = None
else:
stmt = select(metadex.c.id).where(
and_(metadex.c.location == parent_loc, metadex.c.hostname == host)
)
p_id = conn.execute(stmt).scalar()
if not p_id:
log.warning(
"No parent found: %a",
{"metadex_id": m_id, "loc": loc, "host": host},
)
p_id = _add_parents(conn, location=loc, hostname=host)
if p_id != p_id_old:
log.warning(
"Parent changed: %a",
{"metadex_id": m_id, "loc": loc, "host": host, "parent_id": p_id},
)
stmt = metadex.update().where(metadex.c.id == m_id)
conn.execute(stmt, {"parent_id": p_id})

35
metadex/ignore.py Normal file

@ -0,0 +1,35 @@
import re
from functools import partial
from pathlib import Path
from typing import Match
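# Translate gitignore-style glob tokens into regex fragments:
# `**` may cross directory boundaries, `*` and `?` may not.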
_regex_glob_map = {
"**": r".*",
"*": r"[^/]*",
"?": r"[^/]",
}
_regex_glob_map = {re.escape(k): v for k, v in _regex_glob_map.items()}
def _regex_from_glob(match: Match[str]) -> str:
return _regex_glob_map[match.group()]
_replace_globs_re = re.compile("|".join(re.escape(k) for k in _regex_glob_map))
_replace_globs = partial(_replace_globs_re.sub, _regex_from_glob)
def parse(path: Path):
rules = []
for line in path.open():
line = line.rstrip()
if not line or line.startswith("# "):
continue
rule = _replace_globs(re.escape(line))
if not rule.startswith("/"):
rule = r".*/" + rule
rules.append(rule)
regex = "|".join(rules)
return re.compile(regex).fullmatch

202
metadex/ls_parser.py Normal file

@ -0,0 +1,202 @@
import argparse
import json
import logging
import re
import sys
from dataclasses import asdict, dataclass, fields
from datetime import datetime
from pathlib import Path
from typing import Any, Iterable, TextIO
from . import utils
log = logging.getLogger(__name__)
# file mode entry type:
# - Regular file.
# b Block special file.
# c Character special file.
# d Directory.
# l Symbolic link.
# p FIFO.
# s Socket.
# w Whiteout.
# ls_re = re.compile("drwxrwsr-x 555 somuser somegrp 555 Dec 25 20:06 .")
# ls_re = re.compile("drwxr-xr-x 11 501 20 352 1649098510 .")
ls_re = re.compile(
r"(?P<mode>[-bcdlpsw][-rwSsx]{6}[-rwSsxTt]{3})[@+]?\s+(?P<links>\d+)\s+(?P<owner>\S+)\s+(?P<group>\S+)\s+(?P<size>[.\d]+["
+ utils._size_quantifiers
+ r"]?)\s+(?P<date>(\d+|(Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec) [ 1-3]\d ( \d{4}|\d\d:\d\d)))\s(?P<name>.*)"
)
@dataclass
class File:
mode: str
owner: str
group: str
size_bytes: int
date: datetime
path: Path
@property
def is_dir(self):
return self.mode.startswith("d")
@property
def is_symlink(self):
return self.mode.startswith("l")
def asplain(o: object) -> "dict[str, Any]":
d = asdict(o)
for f in fields(o):
if f.type is datetime:
d[f.name] = d[f.name].isoformat()
elif f.type is Path:
d[f.name] = d[f.name].as_posix()
return d
def parse_date(date: str, ref_year: "int | None" = None) -> datetime:
try:
return (
datetime.fromtimestamp(float(date))
if date.isdigit()
else datetime.strptime(date, "%b %d %Y")
)
except ValueError:
pass
if ref_year is None:
log.error("A reference year is required for relative timestamps: %a", date)
raise ValueError("Missing ref_year.")
# We need to include the year in the string for parsing with strptime to
# fully support leap years; otherwise without a year it might complain that
# "Feb 29" is out of range.
return datetime.strptime(f"{date} {ref_year}", "%b %d %H:%M %Y")
@dataclass
class ChangeDir:
from_: "Path | None"
to: "Path | None"
def parse_file(
file: TextIO, *, ref_year: "int | None" = None
) -> Iterable["File | ChangeDir"]:
lines = (line.rstrip() for line in file)
yield from parse_lines(lines, ref_year=ref_year)
def parse_lines(
lines: Iterable[str], *, ref_year: "int | None" = None
) -> Iterable["File | ChangeDir"]:
workdir = Path("/")
dirname: "Path | None" = None
for i, line in enumerate(lines, start=1):
if not line:
# empty line, reset context
if dirname is not None:
yield ChangeDir(from_=dirname, to=None)
dirname = None
continue
if dirname is None:
if not line.endswith(":"):
log.error("Path is missing from context, instead got: %a", line)
raise ValueError(f"Unexpected input in line #{i}")
if not line.startswith("/"):
log.error("Only absolute paths are supported: %a", line)
raise ValueError(f"Unexpected input in line #{i}")
dirname = workdir / line[:-1]
yield ChangeDir(from_=None, to=dirname)
elif line.startswith("total "):
pass
elif match := ls_re.fullmatch(line):
name = match["name"]
# Support `ls` output where dirs are marked with a `/` suffix.
if name.endswith("/"):
name = name[:-1]
if name in (".", ".."):
continue
if match["mode"].startswith("l"):
markers = name.count("->")
if markers == 1:
name = name.split(" -> ")[0]
elif markers >= 2:
raise RuntimeError(f"Symlink has an ambiguous name: {name!a}")
else:
log.warning("Symlink is missing a target: %a", name)
try:
size = utils.parse_size(match["size"])
except ValueError as err:
log.error("Error parsing size value: %a", match["size"], exc_info=err)
raise ValueError(f"Unexpected input in line #{i}") from err
try:
date = parse_date(match["date"], ref_year)
except ValueError as err:
log.error("Error parsing date value: %a", match["date"], exc_info=err)
raise ValueError(f"Unexpected input in line #{i}") from err
yield File(
mode=match["mode"],
owner=match["owner"],
group=match["group"],
size_bytes=size,
date=date,
path=dirname / name,
)
else:
log.error("Line not matched by parser: %a", line)
raise ValueError(f"Unexpected input in line #{i}")
if dirname is not None:
yield ChangeDir(from_=dirname, to=None)
def get_args(argv: "list[str]"):
parser = argparse.ArgumentParser()
# parser.add_argument("--workdir", help="The directory from where 'ls -l' was run")
parser.add_argument("--ref-year", type=int, help="The year when 'ls -l' was run")
# parser.add_argument(
# "--json", action="store_true", default=False, help="Output as JSON"
# )
parser.add_argument(
"infile",
nargs="?",
type=argparse.FileType(),
default=sys.stdin,
)
args = parser.parse_args(argv[1:])
return args
def main(argv: "list[str]"):
args = get_args(argv)
# workdir = Path(args.workdir or ".")
ref_year = args.ref_year or datetime.now().year
for f in parse_file(args.infile, ref_year=ref_year):
print(json.dumps(asplain(f)))
if __name__ == "__main__":
main(sys.argv)

618
metadex/metadex.py Normal file

@ -0,0 +1,618 @@
import logging
import os
import re
import sys
import time
from collections import deque
from dataclasses import dataclass
from pathlib import Path
from shutil import get_terminal_size
from typing import Iterable, Literal, TextIO
from . import config, db, ignore, ls_parser, models
log = logging.getLogger(__name__)
init = db.init
close = db.close
def scan(
path: Path,
*,
ignore_file: Path,
remove_missing: bool = False,
map_pathspecs: "list[str]" = [],
) -> "_LogContext":
f = _scan_remove_missing if remove_missing else _scan_add_only
return f(path, ignore_file=ignore_file, map_pathspecs=map_pathspecs)
# Opportunistically compensate for wide chars on the terminal.
_terminal_width = int(get_terminal_size().columns * 0.9)
_last_log = 0
def _log_ephemeral(msg: str, *, debounce_ms: "int | None" = 200):
global _last_log
if debounce_ms is not None:
now = time.monotonic()
if _last_log + (debounce_ms / 1000) > now:
return
_last_log = now
msg = msg.encode(errors="replace").decode()
if len(msg) > _terminal_width:
msg = msg[: _terminal_width - 3] + "..."
sys.stderr.write(msg.ljust(_terminal_width) + "\r")
@dataclass
class _LogContext:
seen: int = 0
ignored: int = 0
added: int = 0
changed: int = 0
removed: int = 0
def _log_context(path, context: _LogContext):
if config.is_stdout_piped:
return
_log_ephemeral(
f"{context.seen} a:{context.added} c:{context.changed} i:{context.ignored} r:{context.removed} {path}"
)
def _scan_add_only(path: Path, *, ignore_file: Path, map_pathspecs: "list[str]"):
is_ignored = ignore.parse(ignore_file)
maps = _parse_pathspec_mapping(map_pathspecs)
context = _LogContext()
with db.transaction() as conn:
context.seen += 1
d = models.File.dict_from_entry(path)
_apply_mapping(maps, d)
if is_ignored(d["location"]):
log.warning(
"Skipping ignored basedir: %a:%a",
d["hostname"],
d["location"],
)
return context
if (action := db.upsert_if_changed(conn, d)) == "added":
context.added += 1
elif action == "changed":
context.changed += 1
dirs: deque[Path] = deque()
if d["stat_type"] == "d":
dirs.append(path)
while dirs:
cwd = dirs.popleft()
try:
scan = os.scandir(cwd)
except Exception as err:
log.error(err)
continue
subdirs: deque[Path] = deque()
with scan as files:
for f in files:
context.seen += 1
_log_context(f.path, context)
d = models.File.dict_from_entry(f)
_apply_mapping(maps, d)
if is_ignored(d["location"]):
log.debug(
"Skipping ignored entry: %a:%a",
d["hostname"],
d["location"],
)
context.ignored += 1
continue
if (action := db.upsert_if_changed(conn, d)) == "added":
context.added += 1
append = subdirs.append
elif action == "changed":
context.changed += 1
append = subdirs.append
else:
append = subdirs.appendleft
if f.is_dir(follow_symlinks=False):
append(Path(f.path))
# `subdirs` sorts all changed dirs to the right, which means when we
# extend `dirs` using `extendleft` it'll put them all left-most.
# Or put more simply: new stuff on the left, old on the right.
dirs.extendleft(subdirs)
return context
def _scan_remove_missing(path: Path, *, ignore_file: Path, map_pathspecs: "list[str]"):
"""Like `scan` but also search for missing files."""
is_ignored = ignore.parse(ignore_file)
maps = _parse_pathspec_mapping(map_pathspecs)
context = _LogContext()
with db.transaction() as conn:
context.seen += 1
d = models.File.dict_from_entry(path)
_apply_mapping(maps, d)
if is_ignored(d["location"]):
log.warning(
"Skipping ignored basedir: %a:%a",
d["hostname"],
d["location"],
)
return context
if (action := db.upsert_if_changed(conn, d)) == "added":
context.added += 1
elif action == "changed":
context.changed += 1
dirs: deque[Path] = deque()
if d["stat_type"] == "d":
dirs.append(path)
while dirs:
cwd = dirs.popleft()
try:
scan = os.scandir(cwd)
except Exception as err:
log.error(err)
continue
expected = {name for name in db.files_in_dir(conn, str(cwd))}
subdirs: deque[Path] = deque()
with scan as files:
for f in files:
context.seen += 1
_log_context(f.path, context)
d = models.File.dict_from_entry(f)
_apply_mapping(maps, d)
if is_ignored(d["location"]):
log.debug(
"Skipping ignored entry: %a:%a",
d["hostname"],
d["location"],
)
context.ignored += 1
continue
if (action := db.upsert_if_changed(conn, d)) == "added":
context.added += 1
append = subdirs.append
elif action == "changed":
context.changed += 1
append = subdirs.append
else:
append = subdirs.appendleft
if f.is_dir(follow_symlinks=False):
append(Path(f.path))
expected.discard(f.name)
# `subdirs` sorts all changed dirs to the right, which means when we
# extend `dirs` using `extendleft` it'll put them all left-most.
# Or put more simply: new stuff on the left, old on the right.
dirs.extendleft(subdirs)
for name in expected:
f = str(cwd / name)
if is_ignored(f):
continue
log.info("File removed: %a", f)
db.remove_all(conn, f)
return context
_pathspec_re = re.compile(r"((?P<host>[^:/]*):)?(?P<path>.*)")
_src_dest_re = re.compile(r"src=(?P<src>.*),dest=(?P<dest>.*)")
def _parse_pathspec(pathspec: str):
match = _pathspec_re.fullmatch(pathspec)
assert match
host: "str | None" = match["host"]
path: str = match["path"] or "/"
return host, path
def _clean_dirname(loc: str, *, force_absolute=True):
if force_absolute and not loc.startswith("/"):
loc = "/" + loc
if not loc.endswith("/"):
loc += "/"
return loc
# if loc != "/" and loc.endswith("/"):
# return loc[:-1]
# return loc
def _parse_pathspec_mapping(map_pathspecs: "list[str]"):
Hostname = str
Location = str
maps: dict[Hostname, dict[Location, tuple[Hostname, Location]]] = {}
for pathspec_mapping in map_pathspecs:
match = _src_dest_re.fullmatch(pathspec_mapping)
if not match:
log.error("Invalid mapping: %a", pathspec_mapping)
raise ValueError("Could not parse mapping.")
src_host, src_path = _parse_pathspec(match["src"])
if not src_host:
src_host = config.hostname
log.warning("Using default hostname for mapping source: %a", src_host)
# log.error("Hostname is required when mapping paths: %a", match["src"])
# raise ValueError("Missing hostname.")
src_path = _clean_dirname(src_path)
if src_host not in maps:
maps[src_host] = {}
dest_host, dest_path = _parse_pathspec(match["dest"])
if not dest_host:
dest_host = config.hostname
log.warning("Using default hostname for mapping dest: %a", dest_host)
# log.error("Hostname is required when mapping paths: %a", match["dest"])
# raise ValueError("Missing hostname.")
dest_path = _clean_dirname(dest_path)
maps[src_host][src_path] = dest_host, dest_path
log.info("Mapping %a:%a -> %a:%a", src_host, src_path, dest_host, dest_path)
return maps
def _apply_mapping(maps: dict, d: dict):
hostname = d["hostname"]
location = (
d["location"]
if d["stat_type"] != "d"
else _clean_dirname(d["location"], force_absolute=False)
)
if hostname in maps:
for src_loc, (dest_host, dest_loc) in maps[hostname].items():
if location.startswith(src_loc):
d["hostname"] = dest_host
d["location"] = dest_loc + d["location"][len(src_loc) :]
log.debug(
"Mapping %a -> %a",
f"{hostname}:{location}",
f'{d["hostname"]}:{d["location"]}',
)
break
def ingest_db_file(
db_file: Path,
*,
ignore_file: Path,
map_pathspecs: "list[str]" = [],
select_pathspecs: "list[str]" = [],
) -> _LogContext:
is_ignored = ignore.parse(ignore_file)
maps = _parse_pathspec_mapping(map_pathspecs)
context = _LogContext()
other_db = db.Db(db_file)
with db.transaction() as conn, other_db.transaction(
force_rollback=True
) as other_conn:
for row in db.iter_all(other_conn):
context.seen += 1
_log_context(row["location"], context)
d = dict(row)
_apply_mapping(maps, d)
if is_ignored(d["location"]):
log.debug("Skipping ignored entry: %a:%a", d["hostname"], d["location"])
context.ignored += 1
continue
if (action := db.upsert_if_changed(conn, d)) == "added":
context.added += 1
elif action == "changed":
context.changed += 1
return context
def ingest_ls(
file: TextIO,
*,
ignore_file: Path,
ref_year: "int | None",
remove_missing: bool = False,
) -> _LogContext:
f = _ingest_ls_remove_missing if remove_missing else _ingest_ls_add_only
return f(file, ignore_file=ignore_file, ref_year=ref_year)
def _ingest_ls_add_only(file: TextIO, *, ignore_file: Path, ref_year: "int | None"):
is_ignored = ignore.parse(ignore_file)
context = _LogContext()
with db.transaction() as conn:
for f in ls_parser.parse_file(file, ref_year=ref_year):
if isinstance(f, ls_parser.ChangeDir):
continue
context.seen += 1
_log_context(f.path, context)
d = _dict_from_lsfile(f)
# _apply_mapping(maps, d)
if is_ignored(d["location"]):
log.debug("Skipping ignored entry: %a:%a", d["hostname"], d["location"])
context.ignored += 1
continue
if (action := db.upsert_if_changed(conn, d)) == "added":
context.added += 1
elif action == "changed":
context.changed += 1
return context
def _dict_from_lsfile(f: ls_parser.File) -> dict:
mode = f.mode[0]
if mode == "-":
mode = "f"
elif mode not in "dl":
mode = "-"
return dict(
location=str(f.path),
hostname=config.hostname,
stat_bytes=f.size_bytes,
stat_modified=f.date,
stat_type=mode,
)
def _ingest_ls_remove_missing(
file: TextIO, *, ignore_file: Path, ref_year: "int | None"
):
is_ignored = ignore.parse(ignore_file)
expected: set[str] = set()
context = _LogContext()
with db.transaction() as conn:
for f in ls_parser.parse_file(file, ref_year=ref_year):
if isinstance(f, ls_parser.ChangeDir):
if f.to is not None:
expected = {name for name in db.files_in_dir(conn, str(f.to))}
elif f.from_:
# remove missing
for name in expected:
loc = str(f.from_ / name)
if is_ignored(loc):
log.info("Ignoring file (for removal): %a", loc)
continue
log.info("File removed: %a", loc)
context.removed += db.remove_all(conn, loc)
continue
context.seen += 1
_log_context(f.path, context)
d = _dict_from_lsfile(f)
# _apply_mapping(maps, d)
if is_ignored(d["location"]):
log.debug("Skipping ignored entry: %a:%a", d["hostname"], d["location"])
context.ignored += 1
continue
if (action := db.upsert_if_changed(conn, d)) == "added":
context.added += 1
elif action == "changed":
context.changed += 1
expected.discard(f.path.name)
return context
def _ls_files(
*,
host: "str | None",
path: str,
type: "models.StatType | None" = None,
match: Literal["regex", "glob", "fuzzy"] = "glob",
) -> Iterable[models.File]:
def map_replace(mapping: dict, string: str):
pattern = "|".join(re.escape(k) for k in mapping.keys())
return re.sub(pattern, lambda m: mapping[m[0]], string)
def liketerm_from_glob(glob: str) -> str:
s = db.escape(glob)
s = map_replace({"*": "%", "?": "_"}, s)
return s
def regex_from_glob(glob: str) -> str:
s = re.escape(glob)
s = map_replace({r"\*\*": ".*", r"\*": "[^/]*", r"\?": "[^/]"}, s)
return s
with db.transaction() as conn:
if match == "regex":
for f in db.search(
conn, type=type, hostname_regex=host, regex=f"(?i){path}"
):
yield models.File(**f) # type: ignore
elif match == "glob":
filters = {"type": type}
if host and _uses_glob(host):
filters["hostname_like"] = liketerm_from_glob(host)
else:
filters["hostname"] = host
if not _uses_glob(path):
rterm = re.escape(path)
lterm = path # no `db.escape`, `endswith` does autoescape
result = db.search(
conn,
endswith=lterm,
regex=f"(?i)(^|/){rterm}$", # ensure a full name match
**filters,
)
else:
rterm = regex_from_glob(path)
lterm = liketerm_from_glob(path)
result = db.search(
conn,
regex=f"(?i)(^|/){rterm}$",
like=f"%{lterm}", # helps to drastically speed up the regex match
**filters,
)
for f in result:
yield models.File(**f) # type: ignore
elif match == "fuzzy":
term = "%".join(db.escape(p) for p in path.split("/"))
for f in db.search(conn, like=f"%{term}%", type=type, hostname=host):
yield models.File(**f) # type: ignore
def _ls_dir_contents(*, host: str, path: str) -> Iterable[models.File]:
with db.transaction() as conn:
row = db.get_file(conn, location=path, hostname=host)
if not row:
log.warning("No match: %a:%a", host, path)
return
if row["stat_type"] != "d":
yield models.File(**row) # type: ignore
return
for f in db.get_files(conn, parent_id=row["id"]):
yield models.File(**f) # type: ignore
def _uses_glob(string: str) -> bool:
return "*" in string or "?" in string
def ls(
pathspec: str,
*,
type: "models.StatType | None" = None,
match: Literal["regex", "glob", "fuzzy"] = "glob",
) -> Iterable[models.File]:
host, path = _parse_pathspec(pathspec)
if host == "":
host = config.hostname # allow ":foo" as shortcut for local search
log.info("Using path spec: %a:%a", host, path)
if path != "/" and path.endswith("/"):
# In our DB no path except root (`/`) ends with `/`.
path = path.rstrip("/")
if host and path.startswith("/") and not _uses_glob(host + path):
yield from _ls_dir_contents(host=host, path=path)
else:
yield from _ls_files(host=host, path=path, type=type, match=match)
def rm(pathspec: str, *, include_children: bool = False):
"""Remove the given path and all its descendants."""
host, path = _parse_pathspec(pathspec)
if not host or not path.startswith("/"):
log.error(
"A full absolute path including hostname is required when removing files: %a",
pathspec,
)
raise ValueError("Incomplete path specification.")
if path != "/" and path.endswith("/"):
path = path[:-1]
with db.transaction() as conn:
row = db.get_file(conn, hostname=host, location=path)
if not row:
log.error("No matching file found: %a", pathspec)
raise ValueError("Path not found.")
children = db.get_files(conn, parent_id=row["id"])
if children and not include_children:
log.error("File has children: %a", pathspec)
raise RuntimeError("Path has children.")
db.remove_all(conn, location=path, hostname=host)
def hosts() -> "set[str]":
with db.transaction() as conn:
return set(db.all_hostnames(conn))

84
metadex/models.py Normal file

@ -0,0 +1,84 @@
import os
from dataclasses import asdict, dataclass
from datetime import datetime
from os import DirEntry
from pathlib import Path
from stat import S_IFDIR, S_IFLNK, S_IFMT, S_IFREG
from typing import Literal
from . import config
_modes = {S_IFDIR: "d", S_IFREG: "f", S_IFLNK: "l"}
asdict = asdict
StatType = Literal["d", "f", "l", "-"]
@dataclass
class File:
id: "int | None"
parent_id: "int | None"  # None for a host's root entry
added: datetime
updated: datetime
location: str
hostname: str # XXX should better use a fingerprint/unique-id per host (e.g. `/etc/metadex.hostid`, for disks put it on their /)
stat_bytes: int
# stat_changed: datetime # XXX remove? The `ctime` changes not only for content changes but also file attr changes, which we don't track anyway.
stat_modified: datetime
stat_type: StatType
@classmethod
def from_direntry(cls, entry: DirEntry):
now = datetime.now()
pstat = entry.stat(follow_symlinks=False)
return cls(
id=None,  # not yet assigned by the DB
parent_id=None,
added=now,
updated=now,
location=entry.path,
hostname=config.hostname,
stat_bytes=pstat.st_size,
# stat_changed=datetime.fromtimestamp(pstat.st_ctime),
stat_modified=datetime.fromtimestamp(pstat.st_mtime),
stat_type=_modes.get(S_IFMT(pstat.st_mode), "-"), # type: ignore
)
@classmethod
def from_path(cls, path: Path):
now = datetime.now()
pstat = os.stat(path, follow_symlinks=False)
return cls(
id=None,  # not yet assigned by the DB
parent_id=None,
added=now,
updated=now,
location=os.path.abspath(path),
hostname=config.hostname,
stat_bytes=pstat.st_size,
# stat_changed=datetime.fromtimestamp(pstat.st_ctime),
stat_modified=datetime.fromtimestamp(pstat.st_mtime),
stat_type=_modes.get(S_IFMT(pstat.st_mode), "-"), # type: ignore
)
@staticmethod
def dict_from_entry(entry: "DirEntry | Path") -> dict:
"""Return the File's data structure as dict.
This can be useful to skip calling `asdict`, which can be quite slow.
"""
# now = datetime.now()
if isinstance(entry, Path):
location = os.path.abspath(entry)
pstat = os.stat(entry, follow_symlinks=False)
else:
location = entry.path.encode(errors="replace").decode()
pstat = entry.stat(follow_symlinks=False)
return dict(
# added=now,
# updated=now,
location=location,
hostname=config.hostname,
stat_bytes=pstat.st_size,
stat_modified=datetime.fromtimestamp(pstat.st_mtime),
stat_type=_modes.get(S_IFMT(pstat.st_mode), "-"),
)

51
metadex/utils.py Normal file

@ -0,0 +1,51 @@
import os
from pathlib import Path
_size_quantifiers = "BKMGTP"
_size_map: "dict[str, int]" = {
_size_quantifiers[i]: 2 ** (10 * i) for i in range(len(_size_quantifiers))
}
def size_for_display(byte_count: int, precision: int = 2, format="short") -> str:
for qtf in reversed(_size_quantifiers):
qty = byte_count / _size_map[qtf]
if qty > 1:
break
size = f"{qty:.{precision}f}"
if format == "compact":
size = size.replace("." + "0" * precision, "")  # silly hack to remove an all-zero fraction
return f"{size:>{4+precision}}{qtf}"
tpl = "{{:.{precision}f}} {{}}".format(precision=precision)
if format == "short":
pass
elif format == "long" and qtf != "B":
tpl += "iB"
return tpl.format(qty, qtf)
def parse_size(size: str) -> int:
"""Return the given size converted to byte count.
Supported formats:
- plain byte count, e.g. "12345"
- short format, e.g. "123.45K"
Not supported: unit variants such as Kb (kilobit) or KB as 10**3 bytes; quantifiers are always interpreted as powers of 1024 (KiB-style).
"""
if size.isdigit():
return int(size)
d, q = float(size[:-1]), size[-1]
return int(d * _size_map[q])
def abspath(path: Path) -> Path:
"""Normalize & make the given path absolute while maintaining symlinks.
Similar to Path.resolve(strict=False), but doesn't resolve symlinks."""
return Path(os.path.abspath(path))

139
poetry.lock generated Normal file

@ -0,0 +1,139 @@
[[package]]
name = "greenlet"
version = "1.1.2"
description = "Lightweight in-process concurrent programming"
category = "main"
optional = false
python-versions = ">=2.7,!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*,!=3.4.*"
[package.extras]
docs = ["sphinx"]
[[package]]
name = "sqlalchemy"
version = "1.4.39"
description = "Database Abstraction Library"
category = "main"
optional = false
python-versions = "!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*,!=3.4.*,!=3.5.*,>=2.7"
[package.dependencies]
greenlet = {version = "!=0.4.17", markers = "python_version >= \"3\" and (platform_machine == \"aarch64\" or platform_machine == \"ppc64le\" or platform_machine == \"x86_64\" or platform_machine == \"amd64\" or platform_machine == \"AMD64\" or platform_machine == \"win32\" or platform_machine == \"WIN32\")"}
[package.extras]
aiomysql = ["greenlet (!=0.4.17)", "aiomysql"]
aiosqlite = ["typing_extensions (!=3.10.0.1)", "greenlet (!=0.4.17)", "aiosqlite"]
asyncio = ["greenlet (!=0.4.17)"]
asyncmy = ["greenlet (!=0.4.17)", "asyncmy (>=0.2.3,!=0.2.4)"]
mariadb_connector = ["mariadb (>=1.0.1)"]
mssql = ["pyodbc"]
mssql_pymssql = ["pymssql"]
mssql_pyodbc = ["pyodbc"]
mypy = ["sqlalchemy2-stubs", "mypy (>=0.910)"]
mysql = ["mysqlclient (>=1.4.0,<2)", "mysqlclient (>=1.4.0)"]
mysql_connector = ["mysql-connector-python"]
oracle = ["cx_oracle (>=7,<8)", "cx_oracle (>=7)"]
postgresql = ["psycopg2 (>=2.7)"]
postgresql_asyncpg = ["greenlet (!=0.4.17)", "asyncpg"]
postgresql_pg8000 = ["pg8000 (>=1.16.6,!=1.29.0)"]
postgresql_psycopg2binary = ["psycopg2-binary"]
postgresql_psycopg2cffi = ["psycopg2cffi"]
pymysql = ["pymysql (<1)", "pymysql"]
sqlcipher = ["sqlcipher3-binary"]
[metadata]
lock-version = "1.1"
python-versions = "^3.8"
content-hash = "01d83cdef20caa2f18db197ca0498033c4995040150a36de92a3958efb0e9fb3"
[metadata.files]
greenlet = [
{file = "greenlet-1.1.2-cp27-cp27m-macosx_10_14_x86_64.whl", hash = "sha256:58df5c2a0e293bf665a51f8a100d3e9956febfbf1d9aaf8c0677cf70218910c6"},
{file = "greenlet-1.1.2-cp27-cp27m-manylinux1_x86_64.whl", hash = "sha256:aec52725173bd3a7b56fe91bc56eccb26fbdff1386ef123abb63c84c5b43b63a"},
{file = "greenlet-1.1.2-cp27-cp27m-manylinux2010_x86_64.whl", hash = "sha256:833e1551925ed51e6b44c800e71e77dacd7e49181fdc9ac9a0bf3714d515785d"},
{file = "greenlet-1.1.2-cp27-cp27m-win32.whl", hash = "sha256:aa5b467f15e78b82257319aebc78dd2915e4c1436c3c0d1ad6f53e47ba6e2713"},
{file = "greenlet-1.1.2-cp27-cp27m-win_amd64.whl", hash = "sha256:40b951f601af999a8bf2ce8c71e8aaa4e8c6f78ff8afae7b808aae2dc50d4c40"},
{file = "greenlet-1.1.2-cp27-cp27mu-manylinux1_x86_64.whl", hash = "sha256:95e69877983ea39b7303570fa6760f81a3eec23d0e3ab2021b7144b94d06202d"},
{file = "greenlet-1.1.2-cp27-cp27mu-manylinux2010_x86_64.whl", hash = "sha256:356b3576ad078c89a6107caa9c50cc14e98e3a6c4874a37c3e0273e4baf33de8"},
{file = "greenlet-1.1.2-cp310-cp310-macosx_10_14_x86_64.whl", hash = "sha256:8639cadfda96737427330a094476d4c7a56ac03de7265622fcf4cfe57c8ae18d"},
{file = "greenlet-1.1.2-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:97e5306482182170ade15c4b0d8386ded995a07d7cc2ca8f27958d34d6736497"},
{file = "greenlet-1.1.2-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:e6a36bb9474218c7a5b27ae476035497a6990e21d04c279884eb10d9b290f1b1"},
{file = "greenlet-1.1.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:abb7a75ed8b968f3061327c433a0fbd17b729947b400747c334a9c29a9af6c58"},
{file = "greenlet-1.1.2-cp310-cp310-win_amd64.whl", hash = "sha256:14d4f3cd4e8b524ae9b8aa567858beed70c392fdec26dbdb0a8a418392e71708"},
{file = "greenlet-1.1.2-cp35-cp35m-macosx_10_14_x86_64.whl", hash = "sha256:17ff94e7a83aa8671a25bf5b59326ec26da379ace2ebc4411d690d80a7fbcf23"},
{file = "greenlet-1.1.2-cp35-cp35m-manylinux1_x86_64.whl", hash = "sha256:9f3cba480d3deb69f6ee2c1825060177a22c7826431458c697df88e6aeb3caee"},
{file = "greenlet-1.1.2-cp35-cp35m-manylinux2010_x86_64.whl", hash = "sha256:fa877ca7f6b48054f847b61d6fa7bed5cebb663ebc55e018fda12db09dcc664c"},
{file = "greenlet-1.1.2-cp35-cp35m-win32.whl", hash = "sha256:7cbd7574ce8e138bda9df4efc6bf2ab8572c9aff640d8ecfece1b006b68da963"},
{file = "greenlet-1.1.2-cp35-cp35m-win_amd64.whl", hash = "sha256:903bbd302a2378f984aef528f76d4c9b1748f318fe1294961c072bdc7f2ffa3e"},
{file = "greenlet-1.1.2-cp36-cp36m-macosx_10_14_x86_64.whl", hash = "sha256:049fe7579230e44daef03a259faa24511d10ebfa44f69411d99e6a184fe68073"},
{file = "greenlet-1.1.2-cp36-cp36m-manylinux1_x86_64.whl", hash = "sha256:dd0b1e9e891f69e7675ba5c92e28b90eaa045f6ab134ffe70b52e948aa175b3c"},
{file = "greenlet-1.1.2-cp36-cp36m-manylinux2010_x86_64.whl", hash = "sha256:7418b6bfc7fe3331541b84bb2141c9baf1ec7132a7ecd9f375912eca810e714e"},
{file = "greenlet-1.1.2-cp36-cp36m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f9d29ca8a77117315101425ec7ec2a47a22ccf59f5593378fc4077ac5b754fce"},
{file = "greenlet-1.1.2-cp36-cp36m-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:21915eb821a6b3d9d8eefdaf57d6c345b970ad722f856cd71739493ce003ad08"},
{file = "greenlet-1.1.2-cp36-cp36m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:eff9d20417ff9dcb0d25e2defc2574d10b491bf2e693b4e491914738b7908168"},
{file = "greenlet-1.1.2-cp36-cp36m-win32.whl", hash = "sha256:32ca72bbc673adbcfecb935bb3fb1b74e663d10a4b241aaa2f5a75fe1d1f90aa"},
{file = "greenlet-1.1.2-cp36-cp36m-win_amd64.whl", hash = "sha256:f0214eb2a23b85528310dad848ad2ac58e735612929c8072f6093f3585fd342d"},
{file = "greenlet-1.1.2-cp37-cp37m-macosx_10_14_x86_64.whl", hash = "sha256:b92e29e58bef6d9cfd340c72b04d74c4b4e9f70c9fa7c78b674d1fec18896dc4"},
{file = "greenlet-1.1.2-cp37-cp37m-manylinux1_x86_64.whl", hash = "sha256:fdcec0b8399108577ec290f55551d926d9a1fa6cad45882093a7a07ac5ec147b"},
{file = "greenlet-1.1.2-cp37-cp37m-manylinux2010_x86_64.whl", hash = "sha256:93f81b134a165cc17123626ab8da2e30c0455441d4ab5576eed73a64c025b25c"},
{file = "greenlet-1.1.2-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1e12bdc622676ce47ae9abbf455c189e442afdde8818d9da983085df6312e7a1"},
{file = "greenlet-1.1.2-cp37-cp37m-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:8c790abda465726cfb8bb08bd4ca9a5d0a7bd77c7ac1ca1b839ad823b948ea28"},
{file = "greenlet-1.1.2-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f276df9830dba7a333544bd41070e8175762a7ac20350786b322b714b0e654f5"},
{file = "greenlet-1.1.2-cp37-cp37m-win32.whl", hash = "sha256:64e6175c2e53195278d7388c454e0b30997573f3f4bd63697f88d855f7a6a1fc"},
{file = "greenlet-1.1.2-cp37-cp37m-win_amd64.whl", hash = "sha256:b11548073a2213d950c3f671aa88e6f83cda6e2fb97a8b6317b1b5b33d850e06"},
{file = "greenlet-1.1.2-cp38-cp38-macosx_10_14_x86_64.whl", hash = "sha256:9633b3034d3d901f0a46b7939f8c4d64427dfba6bbc5a36b1a67364cf148a1b0"},
{file = "greenlet-1.1.2-cp38-cp38-manylinux1_x86_64.whl", hash = "sha256:eb6ea6da4c787111adf40f697b4e58732ee0942b5d3bd8f435277643329ba627"},
{file = "greenlet-1.1.2-cp38-cp38-manylinux2010_x86_64.whl", hash = "sha256:f3acda1924472472ddd60c29e5b9db0cec629fbe3c5c5accb74d6d6d14773478"},
{file = "greenlet-1.1.2-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e859fcb4cbe93504ea18008d1df98dee4f7766db66c435e4882ab35cf70cac43"},
{file = "greenlet-1.1.2-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:00e44c8afdbe5467e4f7b5851be223be68adb4272f44696ee71fe46b7036a711"},
{file = "greenlet-1.1.2-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ec8c433b3ab0419100bd45b47c9c8551248a5aee30ca5e9d399a0b57ac04651b"},
{file = "greenlet-1.1.2-cp38-cp38-win32.whl", hash = "sha256:288c6a76705dc54fba69fbcb59904ae4ad768b4c768839b8ca5fdadec6dd8cfd"},
{file = "greenlet-1.1.2-cp38-cp38-win_amd64.whl", hash = "sha256:8d2f1fb53a421b410751887eb4ff21386d119ef9cde3797bf5e7ed49fb51a3b3"},
{file = "greenlet-1.1.2-cp39-cp39-macosx_10_14_x86_64.whl", hash = "sha256:166eac03e48784a6a6e0e5f041cfebb1ab400b394db188c48b3a84737f505b67"},
{file = "greenlet-1.1.2-cp39-cp39-manylinux1_x86_64.whl", hash = "sha256:572e1787d1460da79590bf44304abbc0a2da944ea64ec549188fa84d89bba7ab"},
{file = "greenlet-1.1.2-cp39-cp39-manylinux2010_x86_64.whl", hash = "sha256:be5f425ff1f5f4b3c1e33ad64ab994eed12fc284a6ea71c5243fd564502ecbe5"},
{file = "greenlet-1.1.2-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:b1692f7d6bc45e3200844be0dba153612103db241691088626a33ff1f24a0d88"},
{file = "greenlet-1.1.2-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:7227b47e73dedaa513cdebb98469705ef0d66eb5a1250144468e9c3097d6b59b"},
{file = "greenlet-1.1.2-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:7ff61ff178250f9bb3cd89752df0f1dd0e27316a8bd1465351652b1b4a4cdfd3"},
{file = "greenlet-1.1.2-cp39-cp39-win32.whl", hash = "sha256:f70a9e237bb792c7cc7e44c531fd48f5897961701cdaa06cf22fc14965c496cf"},
{file = "greenlet-1.1.2-cp39-cp39-win_amd64.whl", hash = "sha256:013d61294b6cd8fe3242932c1c5e36e5d1db2c8afb58606c5a67efce62c1f5fd"},
{file = "greenlet-1.1.2.tar.gz", hash = "sha256:e30f5ea4ae2346e62cedde8794a56858a67b878dd79f7df76a0767e356b1744a"},
]
sqlalchemy = [
{file = "SQLAlchemy-1.4.39-cp27-cp27m-macosx_10_14_x86_64.whl", hash = "sha256:4770eb3ba69ec5fa41c681a75e53e0e342ac24c1f9220d883458b5596888e43a"},
{file = "SQLAlchemy-1.4.39-cp27-cp27m-manylinux_2_5_x86_64.manylinux1_x86_64.whl", hash = "sha256:752ef2e8dbaa3c5d419f322e3632f00ba6b1c3230f65bc97c2ff5c5c6c08f441"},
{file = "SQLAlchemy-1.4.39-cp27-cp27m-win32.whl", hash = "sha256:b30e70f1594ee3c8902978fd71900d7312453922827c4ce0012fa6a8278d6df4"},
{file = "SQLAlchemy-1.4.39-cp27-cp27m-win_amd64.whl", hash = "sha256:864d4f89f054819cb95e93100b7d251e4d114d1c60bc7576db07b046432af280"},
{file = "SQLAlchemy-1.4.39-cp27-cp27mu-manylinux_2_5_x86_64.manylinux1_x86_64.whl", hash = "sha256:8f901be74f00a13bf375241a778455ee864c2c21c79154aad196b7a994e1144f"},
{file = "SQLAlchemy-1.4.39-cp310-cp310-macosx_10_15_universal2.whl", hash = "sha256:1745987ada1890b0e7978abdb22c133eca2e89ab98dc17939042240063e1ef21"},
{file = "SQLAlchemy-1.4.39-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:ede13a472caa85a13abe5095e71676af985d7690eaa8461aeac5c74f6600b6c0"},
{file = "SQLAlchemy-1.4.39-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:7f13644b15665f7322f9e0635129e0ef2098409484df67fcd225d954c5861559"},
{file = "SQLAlchemy-1.4.39-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:26146c59576dfe9c546c9f45397a7c7c4a90c25679492ff610a7500afc7d03a6"},
{file = "SQLAlchemy-1.4.39-cp310-cp310-win32.whl", hash = "sha256:91d2b89bb0c302f89e753bea008936acfa4e18c156fb264fe41eb6bbb2bbcdeb"},
{file = "SQLAlchemy-1.4.39-cp310-cp310-win_amd64.whl", hash = "sha256:50e7569637e2e02253295527ff34666706dbb2bc5f6c61a5a7f44b9610c9bb09"},
{file = "SQLAlchemy-1.4.39-cp36-cp36m-macosx_10_14_x86_64.whl", hash = "sha256:107df519eb33d7f8e0d0d052128af2f25066c1a0f6b648fd1a9612ab66800b86"},
{file = "SQLAlchemy-1.4.39-cp36-cp36m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f24d4d6ec301688c59b0c4bb1c1c94c5d0bff4ecad33bb8f5d9efdfb8d8bc925"},
{file = "SQLAlchemy-1.4.39-cp36-cp36m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:7b2785dd2a0c044a36836857ac27310dc7a99166253551ee8f5408930958cc60"},
{file = "SQLAlchemy-1.4.39-cp36-cp36m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e6e2c8581c6620136b9530137954a8376efffd57fe19802182c7561b0ab48b48"},
{file = "SQLAlchemy-1.4.39-cp36-cp36m-win32.whl", hash = "sha256:fbc076f79d830ae4c9d49926180a1140b49fa675d0f0d555b44c9a15b29f4c80"},
{file = "SQLAlchemy-1.4.39-cp36-cp36m-win_amd64.whl", hash = "sha256:0ec54460475f0c42512895c99c63d90dd2d9cbd0c13491a184182e85074b04c5"},
{file = "SQLAlchemy-1.4.39-cp37-cp37m-macosx_10_15_x86_64.whl", hash = "sha256:6f95706da857e6e79b54c33c1214f5467aab10600aa508ddd1239d5df271986e"},
{file = "SQLAlchemy-1.4.39-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:621f050e72cc7dfd9ad4594ff0abeaad954d6e4a2891545e8f1a53dcdfbef445"},
{file = "SQLAlchemy-1.4.39-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:05a05771617bfa723ba4cef58d5b25ac028b0d68f28f403edebed5b8243b3a87"},
{file = "SQLAlchemy-1.4.39-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:20bf65bcce65c538e68d5df27402b39341fabeecf01de7e0e72b9d9836c13c52"},
{file = "SQLAlchemy-1.4.39-cp37-cp37m-win32.whl", hash = "sha256:f2a42acc01568b9701665e85562bbff78ec3e21981c7d51d56717c22e5d3d58b"},
{file = "SQLAlchemy-1.4.39-cp37-cp37m-win_amd64.whl", hash = "sha256:6d81de54e45f1d756785405c9d06cd17918c2eecc2d4262dc2d276ca612c2f61"},
{file = "SQLAlchemy-1.4.39-cp38-cp38-macosx_10_15_x86_64.whl", hash = "sha256:5c2d19bfb33262bf987ef0062345efd0f54c4189c2d95159c72995457bf4a359"},
{file = "SQLAlchemy-1.4.39-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:14ea8ff2d33c48f8e6c3c472111d893b9e356284d1482102da9678195e5a8eac"},
{file = "SQLAlchemy-1.4.39-cp38-cp38-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:ec3985c883d6d217cf2013028afc6e3c82b8907192ba6195d6e49885bfc4b19d"},
{file = "SQLAlchemy-1.4.39-cp38-cp38-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:1962dfee37b7fb17d3d4889bf84c4ea08b1c36707194c578f61e6e06d12ab90f"},
{file = "SQLAlchemy-1.4.39-cp38-cp38-win32.whl", hash = "sha256:047ef5ccd8860f6147b8ac6c45a4bc573d4e030267b45d9a1c47b55962ff0e6f"},
{file = "SQLAlchemy-1.4.39-cp38-cp38-win_amd64.whl", hash = "sha256:b71be98ef6e180217d1797185c75507060a57ab9cd835653e0112db16a710f0d"},
{file = "SQLAlchemy-1.4.39-cp39-cp39-macosx_10_15_x86_64.whl", hash = "sha256:365b75938049ae31cf2176efd3d598213ddb9eb883fbc82086efa019a5f649df"},
{file = "SQLAlchemy-1.4.39-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e7a7667d928ba6ee361a3176e1bef6847c1062b37726b33505cc84136f657e0d"},
{file = "SQLAlchemy-1.4.39-cp39-cp39-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:c6d00cb9da8d0cbfaba18cad046e94b06de6d4d0ffd9d4095a3ad1838af22528"},
{file = "SQLAlchemy-1.4.39-cp39-cp39-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:b0538b66f959771c56ff996d828081908a6a52a47c5548faed4a3d0a027a5368"},
{file = "SQLAlchemy-1.4.39-cp39-cp39-win32.whl", hash = "sha256:d1f665e50592caf4cad3caed3ed86f93227bffe0680218ccbb293bd5a6734ca8"},
{file = "SQLAlchemy-1.4.39-cp39-cp39-win_amd64.whl", hash = "sha256:8b773c9974c272aae0fa7e95b576d98d17ee65f69d8644f9b6ffc90ee96b4d19"},
{file = "SQLAlchemy-1.4.39.tar.gz", hash = "sha256:8194896038753b46b08a0b0ae89a5d80c897fb601dd51e243ed5720f1f155d27"},
]

15
pyproject.toml Normal file

@ -0,0 +1,15 @@
[tool.poetry]
name = "metadex"
version = "0.1.0"
description = ""
authors = ["ducklet <ducklet@noreply.code.dumpr.org>"]
[tool.poetry.dependencies]
python = "^3.8"
SQLAlchemy = "^1.4.35"
[tool.poetry.dev-dependencies]
[build-system]
requires = ["poetry-core>=1.0.0"]
build-backend = "poetry.core.masonry.api"

1
pyrightconfig.json Normal file

@ -0,0 +1 @@
{"pythonPlatform":"Linux", "pythonVersion":"3.8"}

16
run Executable file

@ -0,0 +1,16 @@
#!/bin/sh -euf
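# Convenience wrapper: run metadex from the project venv with the repo-local
# DB and ignore file, e.g. `./run scan ~` or `./run ls '*.iso'`.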
here=$(dirname "$(realpath "$0")")
# cd "$here"
python_bin="$here"/.venv/bin/python
[ -z "${DEBUG:-}" ] || set -x
# time python -m cProfile -s tottime -m metadex -n scan ~ >profile-scan.txt
PYTHONPATH="$here" \
"$python_bin" -m metadex \
--db "$here/metadex.sqlite" \
--ignore-from "$here/metadex.ignore" \
"$@"

25
scripts/lint Executable file

@ -0,0 +1,25 @@
#!/bin/sh -eu
if [ "${1:-}" = '--fix' ]; then
autoflake \
--remove-duplicate-keys \
--remove-unused-variables \
--remove-all-unused-imports \
--ignore-init-module-imports \
--recursive \
--in-place \
.
isort --profile black .
black .
else
autoflake \
--remove-duplicate-keys \
--remove-unused-variables \
--remove-all-unused-imports \
--ignore-init-module-imports \
--recursive \
--check \
.
isort --profile black --check .
black --check .
fi

23
scripts/ls Executable file

@ -0,0 +1,23 @@
#!/bin/sh -eu
# Create an `ingest-ls`-compatible file listing.
#
# Compatible with current versions of GNU and macOS `ls`.
#
# $ scripts/ls -R /some/base/path \
# | python -m metadex --hostname somehost ingest-ls --remove-missing
# (ingest-ls cannot detect a hostname automatically, so one must be given; "somehost" is a placeholder)
_ls() {
if command ls -d --time-style='+%s' . >/dev/null 2>&1; then
# echo 'GNU'
command ls --time-style='+%s' "$@"
elif command ls -d -D '%s' . >/dev/null 2>&1; then
# echo 'macOS'
command ls -D '%s' "$@"
else
# echo 'unknown'
command ls "$@"
fi
}
_ls -lnAU "$@"