fix handling of non-utf8 filenames
Our SQLite DB requires UTF-8 strings, but filenames from the file system can use any encoding. Ideally we'd try to find the right encoding, but we already use `str.encode(errors="replace").decode()` in other places to handle filenames that are not UTF-8 encoded, so this is a pragmatic solution that should work for all cases, even though we lose information: all non-UTF-8 characters are converted to "?". A future iteration could use an encoding-detection mechanism.
This commit is contained in:
parent
9720d75a61
commit
d49a241c0c
1 changed files with 11 additions and 8 deletions
|
|
@ -169,7 +169,8 @@ def _scan_remove_missing(
|
||||||
log.error(err)
|
log.error(err)
|
||||||
continue
|
continue
|
||||||
|
|
||||||
expected = {name for name in db.files_in_dir(conn, str(cwd))}
|
cwd_utf8 = str(cwd).encode(errors="replace").decode()
|
||||||
|
expected = {name for name in db.files_in_dir(conn, cwd_utf8)}
|
||||||
|
|
||||||
subdirs: deque[Path] = deque()
|
subdirs: deque[Path] = deque()
|
||||||
with scan as files:
|
with scan as files:
|
||||||
|
|
@ -210,7 +211,7 @@ def _scan_remove_missing(
|
||||||
dirs.extendleft(subdirs)
|
dirs.extendleft(subdirs)
|
||||||
|
|
||||||
for name in expected:
|
for name in expected:
|
||||||
ff = str(cwd / name)
|
ff = str(cwd / name).encode(errors="replace").decode()
|
||||||
if is_ignored(ff):
|
if is_ignored(ff):
|
||||||
log.info("Ignoring file (for removal): %a", ff)
|
log.info("Ignoring file (for removal): %a", ff)
|
||||||
continue
|
continue
|
||||||
|
|
@ -402,7 +403,7 @@ def ingest_rclone_json(
|
||||||
else:
|
else:
|
||||||
# remove missing
|
# remove missing
|
||||||
for name in expected:
|
for name in expected:
|
||||||
loc = str(parent / name)
|
loc = str(parent / name).encode(errors="replace").decode()
|
||||||
if is_ignored(loc):
|
if is_ignored(loc):
|
||||||
log.info("Ignoring file (for removal): %a", loc)
|
log.info("Ignoring file (for removal): %a", loc)
|
||||||
continue
|
continue
|
||||||
|
|
@ -413,7 +414,8 @@ def ingest_rclone_json(
|
||||||
|
|
||||||
parent = new_parent
|
parent = new_parent
|
||||||
|
|
||||||
expected = {name for name in db.files_in_dir(conn, str(new_parent))}
|
parent_utf8 = str(new_parent).encode(errors="replace").decode()
|
||||||
|
expected = {name for name in db.files_in_dir(conn, parent_utf8)}
|
||||||
|
|
||||||
context.seen += 1
|
context.seen += 1
|
||||||
|
|
||||||
|
|
@ -435,7 +437,7 @@ def ingest_rclone_json(
|
||||||
if remove_missing:
|
if remove_missing:
|
||||||
if parent is not None:
|
if parent is not None:
|
||||||
for name in expected:
|
for name in expected:
|
||||||
loc = str(parent / name)
|
loc = str(parent / name).encode(errors="replace").decode()
|
||||||
if is_ignored(loc):
|
if is_ignored(loc):
|
||||||
log.info("Ignoring file (for removal): %a", loc)
|
log.info("Ignoring file (for removal): %a", loc)
|
||||||
continue
|
continue
|
||||||
|
|
@ -507,7 +509,7 @@ def _dict_from_lsfile(f: ls_parser.File) -> "dict[str, Any]":
|
||||||
mode = "-"
|
mode = "-"
|
||||||
|
|
||||||
return dict(
|
return dict(
|
||||||
location=str(f.path),
|
location=str(f.path).encode(errors="replace").decode(),
|
||||||
hostname=config.hostname,
|
hostname=config.hostname,
|
||||||
stat_bytes=f.size_bytes,
|
stat_bytes=f.size_bytes,
|
||||||
stat_modified=f.date,
|
stat_modified=f.date,
|
||||||
|
|
@ -528,12 +530,13 @@ def _ingest_ls_remove_missing(
|
||||||
for f in ls_parser.parse_file(file, ref_year=ref_year):
|
for f in ls_parser.parse_file(file, ref_year=ref_year):
|
||||||
if isinstance(f, ls_parser.ChangeDir):
|
if isinstance(f, ls_parser.ChangeDir):
|
||||||
if f.to is not None:
|
if f.to is not None:
|
||||||
expected = {name for name in db.files_in_dir(conn, str(f.to))}
|
fto_utf8 = str(f.to).encode(errors="replace").decode()
|
||||||
|
expected = {name for name in db.files_in_dir(conn, fto_utf8)}
|
||||||
|
|
||||||
elif f.from_:
|
elif f.from_:
|
||||||
# remove missing
|
# remove missing
|
||||||
for name in expected:
|
for name in expected:
|
||||||
loc = str(f.from_ / name)
|
loc = str(f.from_ / name).encode(errors="replace").decode()
|
||||||
if is_ignored(loc):
|
if is_ignored(loc):
|
||||||
log.info("Ignoring file (for removal): %a", loc)
|
log.info("Ignoring file (for removal): %a", loc)
|
||||||
continue
|
continue
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue