From d49a241c0ced46744ea74f7261219ef316de3cff Mon Sep 17 00:00:00 2001 From: ducklet Date: Wed, 22 Feb 2023 19:57:02 +0100 Subject: [PATCH] fix handling of non-utf8 filenames Our SQLite DB requires UTF-8 strings but the filenames from the file system could use any encoding. Ideally we'd try to find the right encoding, but we're already using `str.encode(errors="replace").decode()` in other places to handle filenames that don't use UTF-8 encoding and so this is a pragmatic solution that should work for all cases, even though we lose information - all non-utf8 chars will be converted to "?". An iteration on this could use an encoding detection mechanism. --- metadex/metadex.py | 19 +++++++++++-------- 1 file changed, 11 insertions(+), 8 deletions(-) diff --git a/metadex/metadex.py b/metadex/metadex.py index 3ef5b81..78b69df 100644 --- a/metadex/metadex.py +++ b/metadex/metadex.py @@ -169,7 +169,8 @@ def _scan_remove_missing( log.error(err) continue - expected = {name for name in db.files_in_dir(conn, str(cwd))} + cwd_utf8 = str(cwd).encode(errors="replace").decode() + expected = {name for name in db.files_in_dir(conn, cwd_utf8)} subdirs: deque[Path] = deque() with scan as files: @@ -210,7 +211,7 @@ def _scan_remove_missing( dirs.extendleft(subdirs) for name in expected: - ff = str(cwd / name) + ff = str(cwd / name).encode(errors="replace").decode() if is_ignored(ff): log.info("Ignoring file (for removal): %a", ff) continue @@ -402,7 +403,7 @@ def ingest_rclone_json( else: # remove missing for name in expected: - loc = str(parent / name) + loc = str(parent / name).encode(errors="replace").decode() if is_ignored(loc): log.info("Ignoring file (for removal): %a", loc) continue @@ -413,7 +414,8 @@ def ingest_rclone_json( parent = new_parent - expected = {name for name in db.files_in_dir(conn, str(new_parent))} + parent_utf8 = str(new_parent).encode(errors="replace").decode() + expected = {name for name in db.files_in_dir(conn, parent_utf8)} context.seen += 1 @@ -435,7 
+437,7 @@ def ingest_rclone_json( if remove_missing: if parent is not None: for name in expected: - loc = str(parent / name) + loc = str(parent / name).encode(errors="replace").decode() if is_ignored(loc): log.info("Ignoring file (for removal): %a", loc) continue @@ -507,7 +509,7 @@ def _dict_from_lsfile(f: ls_parser.File) -> "dict[str, Any]": mode = "-" return dict( - location=str(f.path), + location=str(f.path).encode(errors="replace").decode(), hostname=config.hostname, stat_bytes=f.size_bytes, stat_modified=f.date, @@ -528,12 +530,13 @@ def _ingest_ls_remove_missing( for f in ls_parser.parse_file(file, ref_year=ref_year): if isinstance(f, ls_parser.ChangeDir): if f.to is not None: - expected = {name for name in db.files_in_dir(conn, str(f.to))} + fto_utf8 = str(f.to).encode(errors="replace").decode() + expected = {name for name in db.files_in_dir(conn, fto_utf8)} elif f.from_: # remove missing for name in expected: - loc = str(f.from_ / name) + loc = str(f.from_ / name).encode(errors="replace").decode() if is_ignored(loc): log.info("Ignoring file (for removal): %a", loc) continue