Skip to content

Commit

Permalink
Added some comments on the workings
Browse files Browse the repository at this point in the history
  • Loading branch information
nnansters committed Jul 1, 2024
1 parent a2520f9 commit 3aaed3d
Showing 1 changed file with 10 additions and 4 deletions.
14 changes: 10 additions & 4 deletions nannyml/drift/multivariate/domain_classifier/calculator.py
Original file line number Diff line number Diff line change
Expand Up @@ -226,8 +226,12 @@ def _fit(self, reference_data: pd.DataFrame, *args, **kwargs):
if column_name not in self.categorical_column_names:
self.categorical_column_names.append(column_name)

# get timestamp column from chunker incase the calculator is initialized with a chunker without directly
# been provided the timestamp column name
# Get timestamp column from chunker in case the calculator is initialized with a chunker without directly
# being provided the timestamp column name.
#
# The reference data will be sorted according to the timestamp column (when available) to mimic
# Chunker behavior. This means the reference data will be "aligned" with chunked reference data.
# This way we can use chunk indices on the internal reference data copy.
if self.chunker.timestamp_column_name:
if self.chunker.timestamp_column_name not in list(reference_data.columns):
raise InvalidArgumentsException(

Check warning on line 237 in nannyml/drift/multivariate/domain_classifier/calculator.py

View check run for this annotation

Codecov / codecov/patch

nannyml/drift/multivariate/domain_classifier/calculator.py#L237

Added line #L237 was not covered by tests
Expand Down Expand Up @@ -293,7 +297,6 @@ def _calculate(self, data: pd.DataFrame, *args, **kwargs) -> Result:
return self.result

def _calculate_chunk(self, chunk: Chunk):

if self._is_fitted:
chunk_X = chunk.data[self.feature_column_names]
reference_X = self._reference_X
Expand All @@ -302,7 +305,10 @@ def _calculate_chunk(self, chunk: Chunk):
X = pd.concat([reference_X, chunk_X], ignore_index=True)
y = np.concatenate([reference_y, chunk_y])
else:
# Use information from chunk indices to identify reference chunk's location
# Use information from chunk indices to identify reference chunk's location. This is possible because
# both the internal reference data copy and the chunk data were sorted by timestamp, so these
# indices align. This way we eliminate the need to combine these two data frames and drop duplicate rows,
# which is a costly operation.
X = self._reference_X
y = np.zeros(len(X))
y[chunk.start_index : chunk.end_index + 1] = 1
Expand Down

0 comments on commit 3aaed3d

Please sign in to comment.