fix handling of non-utf8 filenames
Our SQLite DB requires UTF-8 strings, but the filenames from the file system could use any encoding. Ideally we'd try to find the right encoding, but we're already using `str.encode(errors="replace").decode()` in other places to handle filenames that don't use UTF-8 encoding, so this is a pragmatic solution that should work for all cases, even though we lose information: all non-UTF-8 characters will be converted to "?". A future iteration could use an encoding detection mechanism.
This commit is contained in:
parent
9720d75a61
commit
d49a241c0c
1 changed file with 11 additions and 8 deletions
|
|
@ -169,7 +169,8 @@ def _scan_remove_missing(
|
|||
log.error(err)
|
||||
continue
|
||||
|
||||
expected = {name for name in db.files_in_dir(conn, str(cwd))}
|
||||
cwd_utf8 = str(cwd).encode(errors="replace").decode()
|
||||
expected = {name for name in db.files_in_dir(conn, cwd_utf8)}
|
||||
|
||||
subdirs: deque[Path] = deque()
|
||||
with scan as files:
|
||||
|
|
@ -210,7 +211,7 @@ def _scan_remove_missing(
|
|||
dirs.extendleft(subdirs)
|
||||
|
||||
for name in expected:
|
||||
ff = str(cwd / name)
|
||||
ff = str(cwd / name).encode(errors="replace").decode()
|
||||
if is_ignored(ff):
|
||||
log.info("Ignoring file (for removal): %a", ff)
|
||||
continue
|
||||
|
|
@ -402,7 +403,7 @@ def ingest_rclone_json(
|
|||
else:
|
||||
# remove missing
|
||||
for name in expected:
|
||||
loc = str(parent / name)
|
||||
loc = str(parent / name).encode(errors="replace").decode()
|
||||
if is_ignored(loc):
|
||||
log.info("Ignoring file (for removal): %a", loc)
|
||||
continue
|
||||
|
|
@ -413,7 +414,8 @@ def ingest_rclone_json(
|
|||
|
||||
parent = new_parent
|
||||
|
||||
expected = {name for name in db.files_in_dir(conn, str(new_parent))}
|
||||
parent_utf8 = str(new_parent).encode(errors="replace").decode()
|
||||
expected = {name for name in db.files_in_dir(conn, parent_utf8)}
|
||||
|
||||
context.seen += 1
|
||||
|
||||
|
|
@ -435,7 +437,7 @@ def ingest_rclone_json(
|
|||
if remove_missing:
|
||||
if parent is not None:
|
||||
for name in expected:
|
||||
loc = str(parent / name)
|
||||
loc = str(parent / name).encode(errors="replace").decode()
|
||||
if is_ignored(loc):
|
||||
log.info("Ignoring file (for removal): %a", loc)
|
||||
continue
|
||||
|
|
@ -507,7 +509,7 @@ def _dict_from_lsfile(f: ls_parser.File) -> "dict[str, Any]":
|
|||
mode = "-"
|
||||
|
||||
return dict(
|
||||
location=str(f.path),
|
||||
location=str(f.path).encode(errors="replace").decode(),
|
||||
hostname=config.hostname,
|
||||
stat_bytes=f.size_bytes,
|
||||
stat_modified=f.date,
|
||||
|
|
@ -528,12 +530,13 @@ def _ingest_ls_remove_missing(
|
|||
for f in ls_parser.parse_file(file, ref_year=ref_year):
|
||||
if isinstance(f, ls_parser.ChangeDir):
|
||||
if f.to is not None:
|
||||
expected = {name for name in db.files_in_dir(conn, str(f.to))}
|
||||
fto_utf8 = str(f.to).encode(errors="replace").decode()
|
||||
expected = {name for name in db.files_in_dir(conn, fto_utf8)}
|
||||
|
||||
elif f.from_:
|
||||
# remove missing
|
||||
for name in expected:
|
||||
loc = str(f.from_ / name)
|
||||
loc = str(f.from_ / name).encode(errors="replace").decode()
|
||||
if is_ignored(loc):
|
||||
log.info("Ignoring file (for removal): %a", loc)
|
||||
continue
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue