
Commit c10d2f8

Merge pull request #99 from lisc-tools/cdims
[MNT] - Updates to Counts
2 parents: c365a00 + 2405648

File tree: 3 files changed, +140 / -73 lines

  lisc/objects/counts.py
  lisc/tests/collect/test_counts.py
  lisc/tests/objects/test_counts.py

lisc/objects/counts.py

Lines changed: 108 additions & 43 deletions
@@ -248,6 +248,7 @@ def add_terms(self, terms, term_type='terms', directory=None, dim='A'):
         if term_type == 'terms':
             self.terms[dim].counts = np.zeros(self.terms[dim].n_terms, dtype=int)

+
     def add_labels(self, terms, directory=None, dim='A'):
         """Add labels for terms to the object.

@@ -375,6 +376,9 @@ def compute_score(self, score_type='association', dim='A', return_result=False):
         >>> plot_dendrogram(counts) # doctest:+SKIP
         """

+        # Clear any previously computed score
+        self.clear_score()
+
         if not self.has_data:
             raise ValueError('No data is available - cannot proceed.')

@@ -403,12 +407,19 @@ def compute_score(self, score_type='association', dim='A', return_result=False):
             return deepcopy(self.score)


+    def clear_score(self):
+        """Clear any previously computed score."""
+
+        self.score = np.zeros(0)
+        self.score_info = {}
+
+
     def check_top(self, dim='A'):
         """Check the terms with the most articles.

         Parameters
         ----------
-        dim : {'A', 'B'}, optional
+        dim : {'A', 'B', 'both'}, optional
             Which set of terms to check.

         Examples
@@ -421,18 +432,26 @@ def check_top(self, dim='A'):
         if not self.has_data:
             raise ValueError('No data is available - cannot proceed.')

-        max_ind = np.argmax(self.terms[dim].counts)
-        print("The most studied term is {} with {} articles.".format(
-            wrap(self.terms[dim].labels[max_ind]),
-            self.terms[dim].counts[max_ind]))
+        if dim == 'both':
+
+            self.check_top('A')
+            print('\n')
+            self.check_top('B')
+
+        else:
+
+            max_ind = np.argmax(self.terms[dim].counts)
+            print("The most studied term is {} with {} articles.".format(
+                wrap(self.terms[dim].labels[max_ind]),
+                self.terms[dim].counts[max_ind]))


     def check_counts(self, dim='A'):
         """Check how many articles were found for each term.

         Parameters
         ----------
-        dim : {'A', 'B'}
+        dim : {'A', 'B', 'both'}
             Which set of terms to check.

         Examples
@@ -445,14 +464,22 @@ def check_counts(self, dim='A'):
         if not self.has_data:
             raise ValueError('No data is available - cannot proceed.')

-        # Calculate widths for printing
-        twd = get_max_length(self.terms[dim].labels, 2)
-        nwd = get_max_length(self.terms[dim].counts)
+        if dim == 'both':

-        print("The number of documents found for each search term is:")
-        for ind, term in enumerate(self.terms[dim].labels):
-            print(" {:{twd}} - {:{nwd}.0f}".format(
-                wrap(term), self.terms[dim].counts[ind], twd=twd, nwd=nwd))
+            self.check_counts('A')
+            print('\n')
+            self.check_counts('B')
+
+        else:
+
+            # Calculate widths for printing
+            twd = get_max_length(self.terms[dim].labels, 2)
+            nwd = get_max_length(self.terms[dim].counts)
+
+            print("The number of documents found for each search term is:")
+            for ind, term in enumerate(self.terms[dim].labels):
+                print(" {:{twd}} - {:{nwd}.0f}".format(
+                    wrap(term), self.terms[dim].counts[ind], twd=twd, nwd=nwd))


     def check_data(self, data_type='counts', dim='A'):
@@ -462,7 +489,7 @@ def check_data(self, data_type='counts', dim='A'):
         ----------
         data_type : {'counts', 'score'}
             Which data type to use.
-        dim : {'A', 'B'}, optional
+        dim : {'A', 'B', 'both'}, optional
             Which set of terms to check.

         Examples
@@ -487,28 +514,36 @@ def check_data(self, data_type='counts', dim='A'):
             if self.score_info['type'] == 'similarity':
                 raise ValueError('Cannot check value counts for similarity score.')

-        # Set up which direction to act across
-        data = getattr(self, data_type)
-        data = data.T if dim == 'B' else data
-        alt = 'B' if dim == 'A' and not self.square else 'A'
+        if dim == 'both':

-        # Calculate widths for printing
-        twd1 = get_max_length(self.terms[dim].labels, 2)
-        twd2 = get_max_length(self.terms[alt].labels, 2)
-        nwd = '>10.0f' if data_type == 'counts' else '06.3f'
+            self.check_data(data_type, 'A')
+            print('\n')
+            self.check_data(data_type, 'B')
+
+        else:
+
+            # Set up which direction to act across
+            data = getattr(self, data_type)
+            data = data.T if dim == 'B' else data
+            alt = 'B' if dim == 'A' and not self.square else 'A'

-        # Loop through each term, find maximally associated term and print out
-        for term_ind, term in enumerate(self.terms[dim].labels):
+            # Calculate widths for printing
+            twd1 = get_max_length(self.terms[dim].labels, 2)
+            twd2 = get_max_length(self.terms[alt].labels, 2)
+            nwd = '>10.0f' if data_type == 'counts' else '06.3f'

-            # Find the index of the most common association for current term
-            assoc_ind = np.argmax(data[term_ind, :])
+            # Loop through each term, find maximally associated term and print out
+            for term_ind, term in enumerate(self.terms[dim].labels):

-            print("For {:{twd1}} the highest association is {:{twd2}} with {:{nwd}}".format(
-                wrap(term), wrap(self.terms[alt].labels[assoc_ind]),
-                data[term_ind, assoc_ind], twd1=twd1, twd2=twd2, nwd=nwd))
+                # Find the index of the most common association for current term
+                assoc_ind = np.argmax(data[term_ind, :])

+                print("For {:{twd1}} the highest association is {:{twd2}} with {:{nwd}}".format(
+                    wrap(term), wrap(self.terms[alt].labels[assoc_ind]),
+                    data[term_ind, assoc_ind], twd1=twd1, twd2=twd2, nwd=nwd))

-    def drop_data(self, n_articles, dim='A'):
+
+    def drop_data(self, n_articles, dim='A', value='count'):
         """Drop terms based on number of article results.

         Parameters
@@ -517,6 +552,14 @@ def drop_data(self, n_articles, dim='A'):
             Minimum number of articles required to keep each term.
         dim : {'A', 'B'}, optional
             Which set of terms to drop.
+        value : {'count', 'coocs'}
+            Which data count to drop based on:
+                'count' : drops based on the total number of articles per term
+                'coocs' : drops based on the co-occurrences, if all values are below `n_articles`
+
+        Notes
+        -----
+        This will drop any computed scores, as they may not be accurate after dropping data.

         Examples
         --------
@@ -525,11 +568,39 @@ def drop_data(self, n_articles, dim='A'):
         >>> counts.drop_data(20) # doctest: +SKIP
         """

-        # Set a flipper dictionary, to flip inds if needed
-        flip_inds = {'A' : 'B', 'B' : 'A'}
+        self.clear_score()

-        # Finds the indices of the terms with enough data to keep
-        keep_inds = np.where(self.terms[dim].counts >= n_articles)[0]
+        if dim == 'both':
+
+            self.drop_data(n_articles, 'A', value)
+            self.drop_data(n_articles, 'B', value)
+
+        else:
+
+            dim_inds = {'A' : 1, 'B' : 0}
+
+            # Get set of indices to drop & drop them from the object
+            if value == 'count':
+                drop_inds = np.where(self.terms[dim].counts < n_articles)[0]
+            elif value == 'coocs':
+                drop_inds = list(np.where(np.all(self.counts < n_articles, dim_inds[dim]))[0])
+
+            self._drop_terms(drop_inds, dim)
+
+
+    def _drop_terms(self, drop_inds, dim):
+        """Sub-function to drop terms from object.
+
+        Parameters
+        ----------
+        drop_inds : list of int
+            Indices of terms to drop.
+        dim : {'A', 'B'}
+            Which dim to drop terms from.
+        """
+
+        # Invert to indices of the terms to keep
+        keep_inds = np.delete(np.arange(self.terms[dim].n_terms), drop_inds)

         # Drop terms that do not have enough data
         self.terms[dim].terms = [self.terms[dim].terms[ind] for ind in keep_inds]
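
For reference, a standalone sketch of what the new 'coocs' drop condition in the hunk above computes, using a toy NumPy array in place of a collected co-occurrence matrix (the values and threshold here are made up for illustration):

import numpy as np

# Toy co-occurrence matrix: rows are 'A' terms, columns are 'B' terms (made-up values)
cooc = np.array([[0, 2, 1],
                 [5, 9, 3],
                 [1, 0, 2]])

n_articles = 3
dim_inds = {'A' : 1, 'B' : 0}

# For dim 'A', reduce across axis 1 (the 'B' terms): an 'A' term is dropped
# only if every one of its co-occurrence counts falls below the threshold
drop_inds = np.where(np.all(cooc < n_articles, dim_inds['A']))[0]
print(drop_inds)  # [0 2]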
@@ -539,6 +610,9 @@ def drop_data(self, n_articles, dim='A'):
         # Create an inds dictionary that defaults to all-index slice
         inds = defaultdict(lambda: np.s_[:])

+        # Set a flipper dictionary, to flip inds if needed
+        flip_inds = {'A' : 'B', 'B' : 'A'}
+
         # If square, set both dims, and do array orgs needed for fancy indexing
         if self.square:
             inds[dim] = keep_inds[:, None]
@@ -548,12 +622,3 @@ def drop_data(self, n_articles, dim='A'):

         # Drop raw count data for terms without enough data
         self.counts = self.counts[inds['A'], inds['B']]
-
-        if self.score.any():
-
-            # If score is a similarity matrix check and flip data indices as needed
-            if self.score_info['type'] == 'similarity':
-                inds[flip_inds[self.score_info['dim']]] = inds[self.score_info['dim']]
-
-            # Drop score data for terms without enough data
-            self.score = self.score[inds['A'], inds['B']]
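
Taken together, the counts.py changes above add a dim='both' option to the check_* methods, a clear_score helper that compute_score and drop_data now call, and a value option on drop_data for dropping terms by their co-occurrence values. A minimal usage sketch follows; it assumes the top-level import `from lisc import Counts`, uses hypothetical search terms, and the collection step needs network access, so it is not run here:

from lisc import Counts

counts = Counts()
counts.add_terms(['language', 'memory'], dim='A')
counts.add_terms(['brain', 'cognition'], dim='B')

counts.run_collection(db='pubmed')      # collects co-occurrence data (network call)

# The check_* methods can now report on both term dimensions at once
counts.check_top(dim='both')
counts.check_counts(dim='both')
counts.check_data(data_type='counts', dim='both')

# compute_score clears any previously computed score before recomputing
counts.compute_score('association')

# Drop terms whose co-occurrence values all fall below a threshold;
# note this also clears any computed score, which would otherwise be stale
counts.drop_data(5, dim='A', value='coocs')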

lisc/tests/collect/test_counts.py

Lines changed: 3 additions & 3 deletions
@@ -8,7 +8,7 @@
 def test_collect_counts_two(test_req):

     terms_a = ['language', 'memory']
-    excls_a = [['protein'], ['protein']]
+    excls_a = [['protein'], ['cell']]
     terms_b = ['brain']

     # Test co-occurence with two terms lists
@@ -22,7 +22,7 @@ def test_collect_counts_two(test_req):
 def test_collect_counts_one(test_req):

     terms_a = ['language', 'memory']
-    excls_a = [['protein'], ['protein']]
+    excls_a = [['protein'], ['cell']]

     # Test co-occurence with one list of terms
     cooc, counts, meta_data = collect_counts(\
@@ -34,7 +34,7 @@ def test_collect_counts_one(test_req):
 def test_collect_counts_nocooc(test_req):

     terms_a = ['language', 'memory']
-    excls_a = [['protein'], ['protein']]
+    excls_a = [['protein'], ['cell']]

     # Test coounts without co-occurence
     counts, meta_data = collect_counts(\

lisc/tests/objects/test_counts.py

Lines changed: 29 additions & 27 deletions
@@ -5,42 +5,40 @@
 ###################################################################################################
 ###################################################################################################

-## Helper test functions
+## Helper test functions for Counts object

 def check_dunders(counts):

-    if isinstance(counts, Counts1D):
+    label0 = counts.terms['A'].labels[0]
+    label1 = counts.terms['B' if counts.terms['B'].terms else 'A'].labels[0]

-        label = counts.labels[0]
-        out = counts[label0]
-
-        assert out == counts.counts[0]
-
-    if isinstance(counts, Counts):
-
-        label0 = counts.terms['A'].labels[0]
-        label1 = counts.terms['B' if counts.terms['B'].terms else 'A'].labels[0]
-
-        out = counts[label0, label1]
-        assert out == counts.counts[(0, 0)]
+    out = counts[label0, label1]
+    assert out == counts.counts[(0, 0)]

 def check_funcs(counts):

-    if isinstance(counts, Counts):
-        counts.check_data()
-        counts.check_top()
-        counts.check_counts()
+    for dim in ['A'] if counts.square else ['A', 'B', 'both']:
+        counts.check_data(dim=dim)
+        counts.check_top(dim=dim)
+        counts.check_counts(dim=dim)

 def drop_data(counts):

-    counts.drop_data(0)
+    for dim in ['A'] if counts.square else ['A', 'B', 'both']:
+        counts.drop_data(10, dim=dim)

 def compute_scores(counts):

-    for score_type in ['normalize', 'association', 'similarity']:
-        counts.compute_score(score_type)
-        assert counts.score.any()
-        assert counts.score_info['type'] == score_type
+    counts.compute_score('association')
+    assert counts.score.any()
+    assert counts.score_info['type'] == 'association'
+
+    for score_type in ['similarity', 'normalize']:
+        for dim in ['A'] if counts.square else ['A', 'B']:
+            counts.compute_score(score_type, dim=dim)
+            assert counts.score.any()
+            assert counts.score_info['type'] == score_type
+            assert counts.score_info['dim'] == dim

 ## Counts1D Object

@@ -57,8 +55,12 @@ def test_collect(test_req):

     assert counts.has_data

-    check_funcs(counts)
-    drop_data(counts)
+    assert counts[counts.labels[0]] == counts.counts[0]
+
+    counts.check_top()
+    counts.check_counts()
+
+    counts.drop_data(10)

 ## Counts Object

@@ -78,7 +80,7 @@ def test_collect_one(test_req):
     counts = Counts()

     counts.add_terms(['language', 'memory'], dim='A')
-    counts.add_terms(['protein', 'protein'], term_type='exclusions', dim='A')
+    counts.add_terms(['protein', 'cell'], term_type='exclusions', dim='A')

     counts.run_collection(db='pubmed', logging=test_req)
     assert counts.has_data
@@ -94,7 +96,7 @@ def test_collect_two(test_req):
     counts = Counts()

     counts.add_terms(['language', 'memory'], dim='A')
-    counts.add_terms(['protein', 'protein'], term_type='exclusions', dim='A')
+    counts.add_terms(['protein', 'cell'], term_type='exclusions', dim='A')
     counts.add_terms(['cognition'], dim='B')

     counts.run_collection(db='pubmed', logging=test_req)
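
The updated test helpers above lean on two behaviors that the diff also exercises. A small sketch of each, assuming `counts` is a Counts object for which run_collection has already completed (as in test_collect_two above), so it is not run standalone here:

# Label-based indexing returns the matching entry of the raw counts matrix
label_a = counts.terms['A'].labels[0]
label_b = counts.terms['B'].labels[0]
assert counts[label_a, label_b] == counts.counts[0, 0]

# Scores computed across a given dimension record both the score type and the dimension
counts.compute_score('similarity', dim='B')
assert counts.score_info['type'] == 'similarity'
assert counts.score_info['dim'] == 'B'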
