mirror of
https://github.com/TagStudioDev/TagStudio.git
synced 2026-01-31 23:29:10 +00:00
feat: increase file scanning performance (#486)
* feat: increase file scanning performance
* fix: correct typo in comment
* refactor: use `continue` in place of nested `if`s
This commit is contained in:
committed by
GitHub
parent
dfa4079b23
commit
6490cc905d
@@ -737,7 +737,9 @@ class Library:
|
||||
"""Maps a full filepath to its corresponding Entry's ID."""
|
||||
self.filename_to_entry_id_map.clear()
|
||||
for entry in self.entries:
|
||||
self.filename_to_entry_id_map[(entry.path / entry.filename)] = entry.id
|
||||
self.filename_to_entry_id_map[
|
||||
(self.library_dir / entry.path / entry.filename)
|
||||
] = entry.id
|
||||
|
||||
# def _map_filenames_to_entry_ids(self):
|
||||
# """Maps the file paths of entries to their index in the library list."""
|
||||
@@ -884,59 +886,71 @@ class Library:
|
||||
|
||||
# Scans the directory for files, keeping track of:
|
||||
# - Total file count
|
||||
# - Files without library entries
|
||||
# for type in TYPES:
|
||||
start_time = time.time()
|
||||
# - Files without Library entries
|
||||
start_time_total = time.time()
|
||||
start_time_loop = time.time()
|
||||
ext_set = set(self.ext_list) # Should be slightly faster
|
||||
for f in self.library_dir.glob("**/*"):
|
||||
try:
|
||||
if (
|
||||
"$RECYCLE.BIN" not in f.parts
|
||||
and TS_FOLDER_NAME not in f.parts
|
||||
and "tagstudio_thumbs" not in f.parts
|
||||
and not f.is_dir()
|
||||
):
|
||||
if f.suffix.lower() not in self.ext_list and self.is_exclude_list:
|
||||
self.dir_file_count += 1
|
||||
file = f.relative_to(self.library_dir)
|
||||
if file not in self.filename_to_entry_id_map:
|
||||
self.files_not_in_library.append(file)
|
||||
elif f.suffix.lower() in self.ext_list and not self.is_exclude_list:
|
||||
self.dir_file_count += 1
|
||||
file = f.relative_to(self.library_dir)
|
||||
try:
|
||||
_ = self.filename_to_entry_id_map[file]
|
||||
except KeyError:
|
||||
# print(file)
|
||||
self.files_not_in_library.append(file)
|
||||
except PermissionError:
|
||||
logging.info(
|
||||
f"The File/Folder {f} cannot be accessed, because it requires higher permission!"
|
||||
)
|
||||
end_time = time.time()
|
||||
end_time_loop = time.time()
|
||||
# Yield output every 1/30 of a second
|
||||
if (end_time - start_time) > 0.034:
|
||||
if (end_time_loop - start_time_loop) > 0.034:
|
||||
yield self.dir_file_count
|
||||
start_time = time.time()
|
||||
# Sorts the files by date modified, descending.
|
||||
start_time_loop = time.time()
|
||||
try:
|
||||
# Skip this file if it should be excluded
|
||||
ext: str = f.suffix.lower()
|
||||
if (ext in ext_set and self.is_exclude_list) or (
|
||||
ext not in ext_set and not self.is_exclude_list
|
||||
):
|
||||
continue
|
||||
|
||||
# Finish if the file/path is already mapped in the Library
|
||||
if self.filename_to_entry_id_map.get(f) is not None:
|
||||
# No other checks are required.
|
||||
self.dir_file_count += 1
|
||||
continue
|
||||
|
||||
# If the file is new, check for validity
|
||||
if (
|
||||
"$RECYCLE.BIN" in f.parts
|
||||
or TS_FOLDER_NAME in f.parts
|
||||
or "tagstudio_thumbs" in f.parts
|
||||
or f.is_dir()
|
||||
):
|
||||
continue
|
||||
|
||||
# Add the validated new file to the Library
|
||||
self.dir_file_count += 1
|
||||
self.files_not_in_library.append(f)
|
||||
|
||||
except PermissionError:
|
||||
logging.info(f'[LIBRARY] Cannot access "{f}": PermissionError')
|
||||
|
||||
yield self.dir_file_count
|
||||
end_time_total = time.time()
|
||||
logging.info(
|
||||
f"[LIBRARY] Scanned directories in {(end_time_total - start_time_total):.3f} seconds"
|
||||
)
|
||||
# Sorts the files by date modified, descending
|
||||
if len(self.files_not_in_library) <= 150000:
|
||||
try:
|
||||
if platform.system() == "Windows" or platform.system() == "Darwin":
|
||||
self.files_not_in_library = sorted(
|
||||
self.files_not_in_library,
|
||||
key=lambda t: -(self.library_dir / t).stat().st_birthtime, # type: ignore[attr-defined]
|
||||
key=lambda t: -(t).stat().st_birthtime, # type: ignore[attr-defined]
|
||||
)
|
||||
else:
|
||||
self.files_not_in_library = sorted(
|
||||
self.files_not_in_library,
|
||||
key=lambda t: -(self.library_dir / t).stat().st_ctime,
|
||||
key=lambda t: -(t).stat().st_ctime,
|
||||
)
|
||||
except (FileExistsError, FileNotFoundError):
|
||||
print(
|
||||
"[LIBRARY] [ERROR] Couldn't sort files, some were moved during the scanning/sorting process."
|
||||
logging.info(
|
||||
"[LIBRARY][ERROR] Couldn't sort files, some were moved during the scanning/sorting process."
|
||||
)
|
||||
pass
|
||||
else:
|
||||
print(
|
||||
logging.info(
|
||||
"[LIBRARY][INFO] Not bothering to sort files because there's OVER 150,000! Better sorting methods will be added in the future."
|
||||
)
|
||||
|
||||
@@ -957,7 +971,7 @@ class Library:
|
||||
# Step [1/2]:
|
||||
# Remove this Entry from the Entries list.
|
||||
entry = self.get_entry(entry_id)
|
||||
path = entry.path / entry.filename
|
||||
path = self.library_dir / entry.path / entry.filename
|
||||
# logging.info(f'Removing path: {path}')
|
||||
|
||||
del self.filename_to_entry_id_map[path]
|
||||
@@ -1087,8 +1101,8 @@ class Library:
|
||||
)
|
||||
)
|
||||
for match in matches:
|
||||
file_1 = files[match[0]].relative_to(self.library_dir)
|
||||
file_2 = files[match[1]].relative_to(self.library_dir)
|
||||
file_1 = files[match[0]]
|
||||
file_2 = files[match[1]]
|
||||
|
||||
if (
|
||||
file_1 in self.filename_to_entry_id_map.keys()
|
||||
@@ -1289,8 +1303,7 @@ class Library:
|
||||
"""Adds files from the `files_not_in_library` list to the Library as Entries. Returns list of added indices."""
|
||||
new_ids: list[int] = []
|
||||
for file in self.files_not_in_library:
|
||||
path = Path(file)
|
||||
# print(os.path.split(file))
|
||||
path = Path(*file.parts[len(self.library_dir.parts) :])
|
||||
entry = Entry(
|
||||
id=self._next_entry_id, filename=path.name, path=path.parent, fields=[]
|
||||
)
|
||||
@@ -1301,8 +1314,6 @@ class Library:
|
||||
self.files_not_in_library.clear()
|
||||
return new_ids
|
||||
|
||||
self.files_not_in_library.clear()
|
||||
|
||||
def get_entry(self, entry_id: int) -> Entry:
|
||||
"""Returns an Entry object given an Entry ID."""
|
||||
return self.entries[self._entry_id_to_index_map[int(entry_id)]]
|
||||
@@ -1323,9 +1334,7 @@ class Library:
|
||||
"""Returns an Entry ID given the full filepath it points to."""
|
||||
try:
|
||||
if self.entries:
|
||||
return self.filename_to_entry_id_map[
|
||||
Path(filename).relative_to(self.library_dir)
|
||||
]
|
||||
return self.filename_to_entry_id_map[filename]
|
||||
except KeyError:
|
||||
return -1
|
||||
|
||||
|
||||
@@ -106,6 +106,7 @@ class DropImport:
|
||||
continue
|
||||
|
||||
dest_file = self.get_relative_path(file)
|
||||
full_dest_path: Path = self.driver.lib.library_dir / dest_file
|
||||
|
||||
if file in self.duplicate_files:
|
||||
duplicated_files_progress += 1
|
||||
@@ -115,14 +116,12 @@ class DropImport:
|
||||
if self.choice == 2: # rename
|
||||
new_name = self.get_renamed_duplicate_filename_in_lib(dest_file)
|
||||
dest_file = dest_file.with_name(new_name)
|
||||
self.driver.lib.files_not_in_library.append(dest_file)
|
||||
self.driver.lib.files_not_in_library.append(full_dest_path)
|
||||
else: # override is simply copying but not adding a new entry
|
||||
self.driver.lib.files_not_in_library.append(dest_file)
|
||||
self.driver.lib.files_not_in_library.append(full_dest_path)
|
||||
|
||||
(self.driver.lib.library_dir / dest_file).parent.mkdir(
|
||||
parents=True, exist_ok=True
|
||||
)
|
||||
shutil.copyfile(file, self.driver.lib.library_dir / dest_file)
|
||||
(full_dest_path).parent.mkdir(parents=True, exist_ok=True)
|
||||
shutil.copyfile(file, full_dest_path)
|
||||
|
||||
fileCount += 1
|
||||
yield [fileCount, duplicated_files_progress]
|
||||
|
||||
Reference in New Issue
Block a user