fix handling of non-utf8 filenames

Our SQLite DB requires UTF-8 strings but the filenames from the file
system could use any encoding.  Ideally we'd try to find the right
encoding, but we're already using `str.encode(errors="replace").decode()`
in other places to handle filenames that don't use UTF-8 encoding and so
this is a pragmatic solution that should work for all cases, even though
we lose information - all non-utf8 chars will be converted to "?".
A future iteration could use an encoding-detection mechanism instead.
This commit is contained in:
ducklet 2023-02-22 19:57:02 +01:00
parent 9720d75a61
commit d49a241c0c

View file

@ -169,7 +169,8 @@ def _scan_remove_missing(
log.error(err)
continue
expected = {name for name in db.files_in_dir(conn, str(cwd))}
cwd_utf8 = str(cwd).encode(errors="replace").decode()
expected = {name for name in db.files_in_dir(conn, cwd_utf8)}
subdirs: deque[Path] = deque()
with scan as files:
@ -210,7 +211,7 @@ def _scan_remove_missing(
dirs.extendleft(subdirs)
for name in expected:
ff = str(cwd / name)
ff = str(cwd / name).encode(errors="replace").decode()
if is_ignored(ff):
log.info("Ignoring file (for removal): %a", ff)
continue
@ -402,7 +403,7 @@ def ingest_rclone_json(
else:
# remove missing
for name in expected:
loc = str(parent / name)
loc = str(parent / name).encode(errors="replace").decode()
if is_ignored(loc):
log.info("Ignoring file (for removal): %a", loc)
continue
@ -413,7 +414,8 @@ def ingest_rclone_json(
parent = new_parent
expected = {name for name in db.files_in_dir(conn, str(new_parent))}
parent_utf8 = str(new_parent).encode(errors="replace").decode()
expected = {name for name in db.files_in_dir(conn, parent_utf8)}
context.seen += 1
@ -435,7 +437,7 @@ def ingest_rclone_json(
if remove_missing:
if parent is not None:
for name in expected:
loc = str(parent / name)
loc = str(parent / name).encode(errors="replace").decode()
if is_ignored(loc):
log.info("Ignoring file (for removal): %a", loc)
continue
@ -507,7 +509,7 @@ def _dict_from_lsfile(f: ls_parser.File) -> "dict[str, Any]":
mode = "-"
return dict(
location=str(f.path),
location=str(f.path).encode(errors="replace").decode(),
hostname=config.hostname,
stat_bytes=f.size_bytes,
stat_modified=f.date,
@ -528,12 +530,13 @@ def _ingest_ls_remove_missing(
for f in ls_parser.parse_file(file, ref_year=ref_year):
if isinstance(f, ls_parser.ChangeDir):
if f.to is not None:
expected = {name for name in db.files_in_dir(conn, str(f.to))}
fto_utf8 = str(f.to).encode(errors="replace").decode()
expected = {name for name in db.files_in_dir(conn, fto_utf8)}
elif f.from_:
# remove missing
for name in expected:
loc = str(f.from_ / name)
loc = str(f.from_ / name).encode(errors="replace").decode()
if is_ignored(loc):
log.info("Ignoring file (for removal): %a", loc)
continue