Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account-related emails.

Already on GitHub? Sign in to your account

Issue 36214035: tell() method on files opened in text mode has poor performance (#470)

Merged
merged 1 commit
Feb 15, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
50 changes: 26 additions & 24 deletions src/modules/indexer.py
Original file line number Diff line number Diff line change
Expand Up @@ -174,6 +174,11 @@ def __init__(

self.old_out_token = None

# Handle for the output main dictionary file and
# the current position within.
self.out_main_dict_handle = None
self.out_main_dict_pos = 0

@staticmethod
def __decode_fmri(pfmri):
"""Turn fmris into strings correctly while writing out
Expand Down Expand Up @@ -393,12 +398,9 @@ def _process_fmris(self, fmris):
self._progtrack.job_add_progress(self._progtrack.JOB_REBUILD_SEARCH)
return removed_paths

def _write_main_dict_line(
self, file_handle, token, fv_fmri_pos_list_list, out_dir
):
def _write_main_dict_line(self, token, fv_fmri_pos_list_list, out_dir):
"""Writes out the new main dictionary file and also adds the
token offsets to _data_token_offset. file_handle is the file
handle for the output main dictionary file. token is the token
token offsets to _data_token_offset. token is the token
to add to the file. fv_fmri_pos_list_list is a structure of
lists inside of lists several layers deep. The top layer is a
list of action types. The second layer contains the keys for
Expand All @@ -418,8 +420,7 @@ def _write_main_dict_line(
)
self.old_out_token = token

cur_location_int = file_handle.tell()
cur_location = str(cur_location_int)
cur_location = str(self.out_main_dict_pos)
self._data_token_offset.write_entity(token, cur_location)

for at, st_list in fv_fmri_pos_list_list:
Expand All @@ -438,11 +439,18 @@ def _write_main_dict_line(
for fv, p_list in fv_list:
for p_id, m_off_set in p_list:
p_id = int(p_id)
self._data_fmri_offsets.add_pair(p_id, cur_location_int)
file_handle.write(
self._data_main_dict.transform_main_dict_line(
token, fv_fmri_pos_list_list
)
self._data_fmri_offsets.add_pair(
p_id, self.out_main_dict_pos
)
data = self._data_main_dict.transform_main_dict_line(
token, fv_fmri_pos_list_list
)
self.out_main_dict_handle.write(data)
# Using tell() on file objects opened in text mode
# is very slow compared to simple counting.
# https://docs.python.org/3/library/io.html#performance
self.out_main_dict_pos += len(
data.encode(self.out_main_dict_handle.encoding)
)

@staticmethod
Expand Down Expand Up @@ -597,11 +605,12 @@ def _update_index(self, dicts, out_dir):
self._data_main_dict.write_dict_file(out_dir, self.file_version_number)
# The dictionary file's opened in append mode to avoid removing
# the version information the search storage class added.
out_main_dict_handle = open(
self.out_main_dict_handle = open(
os.path.join(out_dir, self._data_main_dict.get_file_name()),
"a",
buffering=PKG_FILE_BUFSIZ,
)
self.out_main_dict_pos = self.out_main_dict_handle.tell()

self._data_token_offset.open_out_file(out_dir, self.file_version_number)

Expand Down Expand Up @@ -641,10 +650,7 @@ def _update_index(self, dicts, out_dir):
while new_toks_available and next_new_tok < tok:
assert len(next_new_tok) > 0
self._write_main_dict_line(
out_main_dict_handle,
next_new_tok,
new_tok_info,
out_dir,
next_new_tok, new_tok_info, out_dir
)
try:
next_new_tok, new_tok_info = next(new_toks_it)
Expand All @@ -668,18 +674,14 @@ def _update_index(self, dicts, out_dir):
# associated with it, write them to the file.
if existing_entries:
assert len(tok) > 0
self._write_main_dict_line(
out_main_dict_handle, tok, existing_entries, out_dir
)
self._write_main_dict_line(tok, existing_entries, out_dir)

# For any new tokens which are alphabetically after the
# last entry in the existing file, add them to the end
# of the file.
while new_toks_available:
assert len(next_new_tok) > 0
self._write_main_dict_line(
out_main_dict_handle, next_new_tok, new_tok_info, out_dir
)
self._write_main_dict_line(next_new_tok, new_tok_info, out_dir)
try:
next_new_tok, new_tok_info = next(new_toks_it)
except StopIteration:
Expand All @@ -689,7 +691,7 @@ def _update_index(self, dicts, out_dir):
file_handle.close()
self._data_main_dict.close_file_handle()

out_main_dict_handle.close()
self.out_main_dict_handle.close()
self._data_token_offset.close_file_handle()
for fh in self.at_fh.values():
fh.close()
Expand Down
5 changes: 4 additions & 1 deletion src/modules/manifest.py
Original file line number Diff line number Diff line change
Expand Up @@ -1347,7 +1347,10 @@ def __handle_list(lst, cp):
if return_line:
arg = l
__handle_list(inds, arg)
cur_pos = file_handle.tell()
# Using tell() on file objects opened in text mode
# is very slow compared to simple counting.
# https://docs.python.org/3/library/io.html#performance
cur_pos += len(line.encode(file_handle.encoding))
line = file_handle.readline()
file_handle.close()
return action_dict
Expand Down
Loading