
Commit bf0f18e

manthey, X-TRON404, and cooperlab authored
fix: redundant writes to dataframe #1105 (#1106) (#1112)
* fix: redundant writes to dataframe
* refac: removed double quotes
* refac: removed unused variables
* fix: acc to latest version of HistomicsTK
* refac: removed unused import
* fix: Area is float in ground truth
* refac: relative import
* fix: rm coordinates from rprops
* fix: ignore rprops if None
* fix: ignore rprops if None
* linting compute_fsd_features.py
* lint compute_gradient_features.py
* lint compute_haralick_features.py
* lint compute_morphometry_features.py
* lint test_feature_extraction.py
* lint: rm conditionals from compute_nuclei_features
* Revert "lint: rm conditionals from compute_nuclei_features"
  This reverts commit cd842a0.
* lint: trailing comma and newline
* lint: colon spacing in compute_haralick_features.py
* lint: colon spacing in compute_nuclei_features.py

---------

Co-authored-by: X-TRON404 <[email protected]>
Co-authored-by: Lee Cooper <[email protected]>
1 parent 8d77c73 commit bf0f18e
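
The central change in this commit replaces repeated per-row writes into a preallocated DataFrame (`fdata.iloc[i, :] = ...`, `fdata.at[i, col] = ...`) with plain Python lists that are converted to a DataFrame once after the loop. A minimal sketch of the pattern; `compute_row` and `columns` are hypothetical stand-ins, not the feature code itself:

import numpy as np
import pandas as pd

columns = ['f1', 'f2']

def compute_row(i):
    # placeholder per-object feature computation
    return [float(i), float(i) ** 2]

# before: write every row into a preallocated DataFrame;
# each .iloc assignment goes through pandas indexing machinery
fdata = pd.DataFrame(np.zeros((1000, len(columns))), columns=columns)
for i in range(1000):
    fdata.iloc[i, :] = compute_row(i)

# after: accumulate rows in a plain list, build the DataFrame once
rows = [compute_row(i) for i in range(1000)]
fdata = pd.DataFrame(rows, columns=columns)

Both versions produce the same DataFrame; the second avoids one pandas indexing call per object, which adds up over thousands of nuclei.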

File tree

6 files changed: +256, -235 lines


histomicstk/features/compute_fsd_features.py

Lines changed: 16 additions & 14 deletions
@@ -56,34 +56,36 @@ def compute_fsd_features(im_label, K=128, Fs=6, Delta=8, rprops=None):
     # create pandas data frame containing the features for each object
     numFeatures = len(feature_list)
     numLabels = len(rprops)
-    fdata = pd.DataFrame(np.zeros((numLabels, numFeatures)),
-                         columns=feature_list)
 
-    # fourier descriptors, spaced evenly over the interval 1:K/2
+    # pre-compute Interval outside the loop
     Interval = np.round(
-        np.power(
-            2, np.linspace(0, np.log2(K) - 1, Fs + 1, endpoint=True),
-        ),
+        np.power(2, np.linspace(0, np.log2(K) - 1, Fs + 1, endpoint=True)),
     ).astype(np.uint8)
 
+    # initialize an empty list to collect data
+    data_list = []
+
     for i in range(numLabels):
         # get bounds of dilated nucleus
-        min_row, max_row, min_col, max_col = \
-            _GetBounds(rprops[i].bbox, Delta, sizex, sizey)
+        min_row, max_row, min_col, max_col = _GetBounds(
+            rprops[i].bbox, Delta, sizex, sizey,
+        )
+
         # grab label mask
-        lmask = (
-            im_label[min_row:max_row, min_col:max_col] == rprops[i].label
-        ).astype(bool)
+        lmask = (im_label[min_row:max_row, min_col:max_col] == rprops[i].label).astype(bool)
         # find boundaries
         Bounds = np.argwhere(
             find_boundaries(lmask, mode='inner').astype(np.uint8) == 1,
         )
         # check length of boundaries
         if len(Bounds) < 2:
-            fdata.iloc[i, :] = 0
+            data_list.append(np.zeros(numFeatures))
         else:
-            # compute fourier descriptors
-            fdata.iloc[i, :] = _FSDs(Bounds[:, 0], Bounds[:, 1], K, Interval)
+            # compute fourier descriptors and collect data
+            data_list.append(_FSDs(Bounds[:, 0], Bounds[:, 1], K, Interval))
+
+    # create DataFrame after the loop
+    fdata = pd.DataFrame(data_list, columns=feature_list)
 
     return fdata
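
For context, a small usage sketch of the refactored function, assuming the package-level export `histomicstk.features.compute_fsd_features`; the toy label image is illustrative only:

import numpy as np
from skimage.measure import label
from histomicstk.features import compute_fsd_features

# two disjoint square "nuclei" in a toy label image
im = np.zeros((64, 64), dtype=int)
im[5:20, 5:20] = 1
im[30:50, 30:50] = 1
im_label = label(im)

fdata = compute_fsd_features(im_label, K=128, Fs=6, Delta=8)
print(fdata.shape)  # expected: one row per object, Fs shape-descriptor columns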

histomicstk/features/compute_gradient_features.py

Lines changed: 33 additions & 30 deletions
@@ -78,57 +78,60 @@ def compute_gradient_features(im_label, im_intensity,
         'Gradient.Canny.Mean',
     ]
 
-    # compute object properties if not provided
+    # Compute object properties if not provided
     if rprops is None:
         rprops = regionprops(im_label)
 
-    # create pandas data frame containing the features for each object
-    numFeatures = len(feature_list)
     numLabels = len(rprops)
-    fdata = pd.DataFrame(np.zeros((numLabels, numFeatures)),
-                         columns=feature_list)
 
     Gx, Gy = np.gradient(im_intensity)
     diffG = np.sqrt(Gx**2 + Gy**2)
     cannyG = canny(im_intensity)
 
+    # Prepare data collection
+    data = []
+
     for i in range(numLabels):
         if rprops[i] is None:
             continue
 
         # get gradients of object pixels
-        pixelGradients = np.sort(
-            diffG[rprops[i].coords[:, 0], rprops[i].coords[:, 1]],
-        )
-
-        # compute mean
-        fdata.at[i, 'Gradient.Mag.Mean'] = np.mean(pixelGradients)
-
-        # compute standard deviation
-        fdata.at[i, 'Gradient.Mag.Std'] = np.std(pixelGradients)
-
-        # compute skewness
-        fdata.at[i, 'Gradient.Mag.Skewness'] = scipy.stats.skew(pixelGradients)
-
-        # compute kurtosis
-        fdata.at[i, 'Gradient.Mag.Kurtosis'] = \
-            scipy.stats.kurtosis(pixelGradients)
+        pixelGradients = np.sort(diffG[rprops[i].coords[:, 0], rprops[i].coords[:, 1]])
 
-        # compute intensity histogram
+        # Compute intensity histogram
         hist, bins = np.histogram(pixelGradients, bins=num_hist_bins)
         prob = hist / np.sum(hist, dtype=np.float32)
 
-        # compute entropy
-        fdata.at[i, 'Gradient.Mag.HistEntropy'] = scipy.stats.entropy(prob)
-
-        # compute energy
-        fdata.at[i, 'Gradient.Mag.HistEnergy'] = np.sum(prob**2)
-
+        # Canny edges for the object
         bw_canny = cannyG[rprops[i].coords[:, 0], rprops[i].coords[:, 1]]
         canny_sum = np.sum(bw_canny).astype('float')
 
-        fdata.at[i, 'Gradient.Canny.Sum'] = canny_sum
+        # Aggregate features
+        features = [
+            np.mean(pixelGradients),  # Mean
+            np.std(pixelGradients),  # Std
+            scipy.stats.skew(pixelGradients),  # Skewness
+            scipy.stats.kurtosis(pixelGradients),  # Kurtosis
+            scipy.stats.entropy(prob),  # HistEntropy
+            np.sum(prob**2),  # HistEnergy
+            canny_sum,  # Canny.Sum
+            canny_sum / len(pixelGradients),  # Canny.Mean
+        ]
+
+        data.append(features)
+
+    # Create DataFrame
+    feature_list = [
+        'Gradient.Mag.Mean',
+        'Gradient.Mag.Std',
+        'Gradient.Mag.Skewness',
+        'Gradient.Mag.Kurtosis',
+        'Gradient.Mag.HistEntropy',
+        'Gradient.Mag.HistEnergy',
+        'Gradient.Canny.Sum',
+        'Gradient.Canny.Mean',
+    ]
 
-        fdata.at[i, 'Gradient.Canny.Mean'] = canny_sum / len(pixelGradients)
+    fdata = pd.DataFrame(data, columns=feature_list)
 
     return fdata
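
The same collect-then-construct flow, condensed into a self-contained sketch using only NumPy, SciPy, and pandas; the label values and column names here are illustrative, not the library's API:

import numpy as np
import pandas as pd
import scipy.stats

rng = np.random.default_rng(0)
im_intensity = rng.random((32, 32))
im_label = np.zeros((32, 32), dtype=int)
im_label[4:12, 4:12] = 1
im_label[18:30, 18:30] = 2

# shared per-image work stays outside the loop
Gx, Gy = np.gradient(im_intensity)
diffG = np.sqrt(Gx**2 + Gy**2)

data = []
for lab in (1, 2):
    g = np.sort(diffG[im_label == lab])
    hist, _ = np.histogram(g, bins=10)
    prob = hist / np.sum(hist, dtype=np.float32)
    data.append([
        np.mean(g), np.std(g),
        scipy.stats.skew(g), scipy.stats.kurtosis(g),
        scipy.stats.entropy(prob), np.sum(prob**2),
    ])

cols = ['Mag.Mean', 'Mag.Std', 'Mag.Skewness', 'Mag.Kurtosis',
        'Mag.HistEntropy', 'Mag.HistEnergy']
print(pd.DataFrame(data, columns=cols))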

histomicstk/features/compute_haralick_features.py

Lines changed: 67 additions & 53 deletions
@@ -242,25 +242,21 @@ def compute_haralick_features(im_label, im_intensity, offsets=None,
 
     # check for consistent shapes between 'I' and 'Label'
     if im_intensity.shape != im_label.shape:
-        msg = "Inputs 'I' and 'Label' must have same shape"
-        raise ValueError(msg)
+        err_str = 'Inputs I and Label must have same shape'
+        raise ValueError(err_str)
 
     num_dims = len(im_intensity.shape)
 
     # offsets
     if offsets is None:
-
         # set default offset value
         offsets = _default_offsets(im_intensity)
 
     else:
-
         # check sanity
         if offsets.shape[1] != num_dims:
-            msg = 'Dimension mismatch between input image and offsets'
-            raise ValueError(
-                msg,
-            )
+            err_str = 'Dimension mismatch between input image and offsets'
+            raise ValueError(err_str)
 
     num_offsets = offsets.shape[0]
 
@@ -270,18 +266,25 @@ def compute_haralick_features(im_label, im_intensity, offsets=None,
 
     # create pandas data frame containing the features for each object
     numLabels = len(rprops)
-    fdata = pd.DataFrame(np.zeros((numLabels, len(agg_feature_list))),
-                         columns=agg_feature_list)
+    fdata = pd.DataFrame(
+        np.zeros((numLabels, len(agg_feature_list))), columns=agg_feature_list,
+    )
 
     n_Minus = np.arange(num_levels)
     n_Plus = np.arange(2 * num_levels - 1)
 
     x, y = np.mgrid[0:num_levels, 0:num_levels]
     xy = x * y
-    xy_IDM = 1. / (1 + np.square(x - y))
+    xy_IDM = 1.0 / (1 + np.square(x - y))
 
     e = 0.00001  # small positive constant to avoid log 0
-    eps = np.finfo(float).eps  # small constant to avoid div / 0
+
+    num_features = len(feature_list)
+
+    # Initialize the array for aggregated features
+    aggregated_features = np.zeros(
+        (numLabels, 2 * num_features),
+    )  # Alternating mean and range
 
     for i in range(numLabels):
         if rprops[i] is None:
@@ -291,92 +294,103 @@ def compute_haralick_features(im_label, im_intensity, offsets=None,
         minr, minc, maxr, maxc = rprops[i].bbox
 
         # grab nucleus mask
-        subImage = im_intensity[minr:maxr + 1, minc:maxc + 1]
+        subImage = im_intensity[minr: maxr + 1, minc: maxc + 1].astype(np.uint8)
 
         # gets GLCM or gray-tone spatial dependence matrix
-        arrayGLCM = graycomatrixext(subImage, offsets=offsets,
-                                    num_levels=num_levels,
-                                    gray_limits=gray_limits,
-                                    symmetric=True, normed=True)
+        arrayGLCM = graycomatrixext(
+            subImage,
+            offsets=offsets,
+            num_levels=num_levels,
+            gray_limits=gray_limits,
+            symmetric=True,
+            normed=True,
+        )
 
-        # Compute haralick features for each offset
-        ldata = pd.DataFrame(np.zeros((num_offsets, len(feature_list))),
-                             columns=feature_list)
+        features_per_offset = np.zeros((num_offsets, num_features))
 
         for r in range(num_offsets):
-
             nGLCM = arrayGLCM[:, :, r]
 
             # get marginal-probabilities
-            px, py, pxPlusy, pxMinusy = _compute_marginal_glcm_probs_cython(
-                nGLCM)
+            px, py, pxPlusy, pxMinusy = _compute_marginal_glcm_probs_cython(nGLCM)
 
             # computes angular second moment
-            ldata.at[r, 'Haralick.ASM'] = np.sum(np.square(nGLCM))
+            ASM = np.sum(np.square(nGLCM))
 
             # computes contrast
-            ldata.at[r, 'Haralick.Contrast'] = \
-                np.dot(np.square(n_Minus), pxMinusy)
+            Contrast = np.dot(np.square(n_Minus), pxMinusy)
 
             # computes correlation
             # gets weighted mean and standard deviation of px and py
             meanx = np.dot(n_Minus, px)
             variance = np.dot(px, np.square(n_Minus)) - np.square(meanx)
             nGLCMr = np.ravel(nGLCM)
-
-            har_corr = (np.dot(np.ravel(xy), nGLCMr) - np.square(meanx)) /\
-                max(eps, variance)
-            ldata.at[r, 'Haralick.Correlation'] = np.clip(har_corr,
-                                                          a_min=-1, a_max=1)
+            Correlation = (np.dot(np.ravel(xy), nGLCMr) - np.square(meanx)) / variance
 
             # computes sum of squares : variance
-            ldata.at[r, 'Haralick.SumOfSquares'] = variance
+            SumOfSquares = variance
 
             # computes inverse difference moment
-            ldata.at[r, 'Haralick.IDM'] = \
-                np.dot(np.ravel(xy_IDM), nGLCMr)
+            IDM = np.dot(np.ravel(xy_IDM), nGLCMr)
 
             # computes sum average
-            ldata.at[r, 'Haralick.SumAverage'] = \
-                np.dot(n_Plus, pxPlusy)
+            SumAverage = np.dot(n_Plus, pxPlusy)
 
             # computes sum variance
             # [1] uses sum entropy, but we use sum average
-            ldata.at[r, 'Haralick.SumVariance'] = \
-                np.dot(np.square(n_Plus), pxPlusy) - \
-                np.square(ldata.at[r, 'Haralick.SumAverage'])
+            SumVariance = np.dot(np.square(n_Plus), pxPlusy) - np.square(SumAverage)
 
             # computes sum entropy
-            ldata.at[r, 'Haralick.SumEntropy'] = \
-                -np.dot(pxPlusy, np.log2(pxPlusy + e))
+            SumEntropy = -np.dot(pxPlusy, np.log2(pxPlusy + e))
 
             # computes entropy
-            ldata.at[r, 'Haralick.Entropy'] = \
-                -np.dot(nGLCMr, np.log2(nGLCMr + e))
+            Entropy = -np.dot(nGLCMr, np.log2(nGLCMr + e))
 
             # computes variance px-y
-            ldata.at[r, 'Haralick.DifferenceVariance'] = np.var(pxMinusy)
+            DifferenceVariance = np.var(pxMinusy)
 
             # computes difference entropy px-y
-            ldata.at[r, 'Haralick.DifferenceEntropy'] = \
-                -np.dot(pxMinusy, np.log2(pxMinusy + e))
+            DifferenceEntropy = -np.dot(pxMinusy, np.log2(pxMinusy + e))
 
             # computes information measures of correlation
             # gets entropies of px and py
             HX = -np.dot(px, np.log2(px + e))
             HY = -np.dot(py, np.log2(py + e))
-            HXY = ldata.at[r, 'Haralick.Entropy']
+            HXY = Entropy
             pxy_ij = np.outer(px, py)
             pxy_ijr = np.ravel(pxy_ij)
            HXY1 = -np.dot(nGLCMr, np.log2(pxy_ijr + e))
             HXY2 = -np.dot(pxy_ijr, np.log2(pxy_ijr + e))
-            ldata.at[r, 'Haralick.IMC1'] = ((HXY - HXY1) / max(HX, HY)) if max(HX, HY) else 0
+            IMC1 = (HXY - HXY1) / max(HX, HY)
 
             # computes information measures of correlation
-            ldata.at[r, 'Haralick.IMC2'] = \
-                np.sqrt(max(0, 1 - np.exp(-2.0 * (HXY2 - HXY))))
-
-        fdata.values[i, ::2] = np.mean(ldata.values, axis=0)
-        fdata.values[i, 1::2] = np.ptp(ldata.values, axis=0)
+            IMC2 = np.sqrt(1 - np.exp(-2.0 * (HXY2 - HXY)))
+
+            features_per_offset[r] = [
+                ASM,
+                Contrast,
+                Correlation,
+                SumOfSquares,
+                IDM,
+                SumAverage,
+                SumVariance,
+                SumEntropy,
+                Entropy,
+                DifferenceVariance,
+                DifferenceEntropy,
+                IMC1,
+                IMC2,
+            ]
+
+        # Calculate means and ranges across all features in a vectorized manner
+        means = np.mean(features_per_offset, axis=0)
+        ranges = np.ptp(features_per_offset, axis=0)
+
+        # Assign means and ranges to the aggregated_features array in alternating columns
+        aggregated_features[i, ::2] = means
+        aggregated_features[i, 1::2] = ranges
+
+    # Preparing DataFrame columns with alternating mean and range suffixes
+    fdata = pd.DataFrame(aggregated_features, columns=agg_feature_list)
 
     return fdata
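
The aggregation step above interleaves per-feature means (even columns) and ranges (odd columns) across GLCM offsets. A toy illustration of that interleaving; the values and column names are hypothetical, not HistomicsTK output:

import numpy as np
import pandas as pd

# per-offset feature matrix: 4 offsets x 2 features (toy values)
features_per_offset = np.array([
    [0.10, 3.0],
    [0.12, 2.5],
    [0.11, 2.8],
    [0.15, 2.2],
])

agg = np.zeros(2 * features_per_offset.shape[1])
agg[::2] = np.mean(features_per_offset, axis=0)  # even slots: means
agg[1::2] = np.ptp(features_per_offset, axis=0)  # odd slots: ranges (max - min)

cols = ['ASM.Mean', 'ASM.Range', 'Contrast.Mean', 'Contrast.Range']
print(pd.DataFrame([agg], columns=cols))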
