
Commit c10d2f8

Merge pull request #99 from lisc-tools/cdims
[MNT] - Updates to Counts
2 parents: c365a00 + 2405648

File tree: 3 files changed, +140 / -73 lines

  lisc/objects/counts.py
  lisc/tests/collect/test_counts.py
  lisc/tests/objects/test_counts.py

lisc/objects/counts.py

Lines changed: 108 additions & 43 deletions
@@ -248,6 +248,7 @@ def add_terms(self, terms, term_type='terms', directory=None, dim='A'):
         if term_type == 'terms':
             self.terms[dim].counts = np.zeros(self.terms[dim].n_terms, dtype=int)

+
     def add_labels(self, terms, directory=None, dim='A'):
         """Add labels for terms to the object.

@@ -375,6 +376,9 @@ def compute_score(self, score_type='association', dim='A', return_result=False):
         >>> plot_dendrogram(counts) # doctest:+SKIP
         """

+        # Clear any previously computed score
+        self.clear_score()
+
         if not self.has_data:
             raise ValueError('No data is available - cannot proceed.')

@@ -403,12 +407,19 @@ def compute_score(self, score_type='association', dim='A', return_result=False):
             return deepcopy(self.score)


+    def clear_score(self):
+        """Clear any previously computed score."""
+
+        self.score = np.zeros(0)
+        self.score_info = {}
+
+
     def check_top(self, dim='A'):
         """Check the terms with the most articles.

         Parameters
         ----------
-        dim : {'A', 'B'}, optional
+        dim : {'A', 'B', 'both'}, optional
             Which set of terms to check.

         Examples
@@ -421,18 +432,26 @@ def check_top(self, dim='A'):
         if not self.has_data:
             raise ValueError('No data is available - cannot proceed.')

-        max_ind = np.argmax(self.terms[dim].counts)
-        print("The most studied term is {} with {} articles.".format(
-            wrap(self.terms[dim].labels[max_ind]),
-            self.terms[dim].counts[max_ind]))
+        if dim == 'both':
+
+            self.check_top('A')
+            print('\n')
+            self.check_top('B')
+
+        else:
+
+            max_ind = np.argmax(self.terms[dim].counts)
+            print("The most studied term is {} with {} articles.".format(
+                wrap(self.terms[dim].labels[max_ind]),
+                self.terms[dim].counts[max_ind]))


     def check_counts(self, dim='A'):
         """Check how many articles were found for each term.

         Parameters
         ----------
-        dim : {'A', 'B'}
+        dim : {'A', 'B', 'both'}
             Which set of terms to check.

         Examples
@@ -445,14 +464,22 @@ def check_counts(self, dim='A'):
         if not self.has_data:
             raise ValueError('No data is available - cannot proceed.')

-        # Calculate widths for printing
-        twd = get_max_length(self.terms[dim].labels, 2)
-        nwd = get_max_length(self.terms[dim].counts)
+        if dim == 'both':

-        print("The number of documents found for each search term is:")
-        for ind, term in enumerate(self.terms[dim].labels):
-            print(" {:{twd}} - {:{nwd}.0f}".format(
-                wrap(term), self.terms[dim].counts[ind], twd=twd, nwd=nwd))
+            self.check_counts('A')
+            print('\n')
+            self.check_counts('B')
+
+        else:
+
+            # Calculate widths for printing
+            twd = get_max_length(self.terms[dim].labels, 2)
+            nwd = get_max_length(self.terms[dim].counts)
+
+            print("The number of documents found for each search term is:")
+            for ind, term in enumerate(self.terms[dim].labels):
+                print(" {:{twd}} - {:{nwd}.0f}".format(
+                    wrap(term), self.terms[dim].counts[ind], twd=twd, nwd=nwd))


     def check_data(self, data_type='counts', dim='A'):
@@ -462,7 +489,7 @@ def check_data(self, data_type='counts', dim='A'):
         ----------
         data_type : {'counts', 'score'}
             Which data type to use.
-        dim : {'A', 'B'}, optional
+        dim : {'A', 'B', 'both'}, optional
             Which set of terms to check.

         Examples
@@ -487,28 +514,36 @@ def check_data(self, data_type='counts', dim='A'):
             if self.score_info['type'] == 'similarity':
                 raise ValueError('Cannot check value counts for similarity score.')

-        # Set up which direction to act across
-        data = getattr(self, data_type)
-        data = data.T if dim == 'B' else data
-        alt = 'B' if dim == 'A' and not self.square else 'A'
+        if dim == 'both':

-        # Calculate widths for printing
-        twd1 = get_max_length(self.terms[dim].labels, 2)
-        twd2 = get_max_length(self.terms[alt].labels, 2)
-        nwd = '>10.0f' if data_type == 'counts' else '06.3f'
+            self.check_data(data_type, 'A')
+            print('\n')
+            self.check_data(data_type, 'B')
+
+        else:
+
+            # Set up which direction to act across
+            data = getattr(self, data_type)
+            data = data.T if dim == 'B' else data
+            alt = 'B' if dim == 'A' and not self.square else 'A'

-        # Loop through each term, find maximally associated term and print out
-        for term_ind, term in enumerate(self.terms[dim].labels):
+            # Calculate widths for printing
+            twd1 = get_max_length(self.terms[dim].labels, 2)
+            twd2 = get_max_length(self.terms[alt].labels, 2)
+            nwd = '>10.0f' if data_type == 'counts' else '06.3f'

-            # Find the index of the most common association for current term
-            assoc_ind = np.argmax(data[term_ind, :])
+            # Loop through each term, find maximally associated term and print out
+            for term_ind, term in enumerate(self.terms[dim].labels):

-            print("For {:{twd1}} the highest association is {:{twd2}} with {:{nwd}}".format(
-                wrap(term), wrap(self.terms[alt].labels[assoc_ind]),
-                data[term_ind, assoc_ind], twd1=twd1, twd2=twd2, nwd=nwd))
+                # Find the index of the most common association for current term
+                assoc_ind = np.argmax(data[term_ind, :])

+                print("For {:{twd1}} the highest association is {:{twd2}} with {:{nwd}}".format(
+                    wrap(term), wrap(self.terms[alt].labels[assoc_ind]),
+                    data[term_ind, assoc_ind], twd1=twd1, twd2=twd2, nwd=nwd))

-    def drop_data(self, n_articles, dim='A'):
+
+    def drop_data(self, n_articles, dim='A', value='count'):
         """Drop terms based on number of article results.

         Parameters
@@ -517,6 +552,14 @@ def drop_data(self, n_articles, dim='A'):
             Minimum number of articles required to keep each term.
         dim : {'A', 'B'}, optional
             Which set of terms to drop.
+        value : {'count', 'coocs'}
+            Which data count to drop based on:
+                'count' : drops based on the total number of articles per term
+                'coocs' : drops based on the co-occurrences, if all values are below `n_articles`
+
+        Notes
+        -----
+        This will drop any computed scores, as they may not be accurate after dropping data.

         Examples
         --------
@@ -525,11 +568,39 @@ def drop_data(self, n_articles, dim='A'):
         >>> counts.drop_data(20) # doctest: +SKIP
         """

-        # Set a flipper dictionary, to flip inds if needed
-        flip_inds = {'A' : 'B', 'B' : 'A'}
+        self.clear_score()

-        # Finds the indices of the terms with enough data to keep
-        keep_inds = np.where(self.terms[dim].counts >= n_articles)[0]
+        if dim == 'both':
+
+            self.drop_data(n_articles, 'A', value)
+            self.drop_data(n_articles, 'B', value)
+
+        else:
+
+            dim_inds = {'A' : 1, 'B' : 0}
+
+            # Get set of indices to drop & drop them from the object
+            if value == 'count':
+                drop_inds = np.where(self.terms[dim].counts < n_articles)[0]
+            elif value == 'coocs':
+                drop_inds = list(np.where(np.all(self.counts < n_articles, dim_inds[dim]))[0])
+
+            self._drop_terms(drop_inds, dim)
+
+
+    def _drop_terms(self, drop_inds, dim):
+        """Sub-function to drop terms from object.
+
+        Parameters
+        ----------
+        drop_inds : list of int
+            Indices of terms to drop.
+        dim : {'A', 'B'}
+            Which dim to drop terms from.
+        """
+
+        # Invert to indices of the terms to keep
+        keep_inds = np.delete(np.arange(self.terms[dim].n_terms), drop_inds)

         # Drop terms that do not have enough data
         self.terms[dim].terms = [self.terms[dim].terms[ind] for ind in keep_inds]
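
For reference, a standalone sketch of what the new 'coocs' drop condition in the hunk above computes, using a toy NumPy array in place of a collected co-occurrence matrix (the values and threshold here are made up for illustration):

import numpy as np

# Toy co-occurrence matrix: rows are 'A' terms, columns are 'B' terms (made-up values)
cooc = np.array([[0, 2, 1],
                 [5, 9, 3],
                 [1, 0, 2]])

n_articles = 3
dim_inds = {'A' : 1, 'B' : 0}

# For dim 'A', reduce across axis 1 (the 'B' terms): an 'A' term is dropped
# only if every one of its co-occurrence counts falls below the threshold
drop_inds = np.where(np.all(cooc < n_articles, dim_inds['A']))[0]
print(drop_inds)  # [0 2]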
@@ -539,6 +610,9 @@ def drop_data(self, n_articles, dim='A'):
         # Create an inds dictionary that defaults to all-index slice
         inds = defaultdict(lambda: np.s_[:])

+        # Set a flipper dictionary, to flip inds if needed
+        flip_inds = {'A' : 'B', 'B' : 'A'}
+
         # If square, set both dims, and do array orgs needed for fancy indexing
         if self.square:
             inds[dim] = keep_inds[:, None]
@@ -548,12 +622,3 @@ def drop_data(self, n_articles, dim='A'):

         # Drop raw count data for terms without enough data
         self.counts = self.counts[inds['A'], inds['B']]
-
-        if self.score.any():
-
-            # If score is a similarity matrix check and flip data indices as needed
-            if self.score_info['type'] == 'similarity':
-                inds[flip_inds[self.score_info['dim']]] = inds[self.score_info['dim']]
-
-            # Drop score data for terms without enough data
-            self.score = self.score[inds['A'], inds['B']]
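
Taken together, the counts.py changes above add a dim='both' option to the check_* methods, a clear_score helper that compute_score and drop_data now call, and a value option on drop_data for dropping terms by their co-occurrence values. A minimal usage sketch follows; it assumes the top-level import `from lisc import Counts`, uses hypothetical search terms, and the collection step needs network access, so it is not run here:

from lisc import Counts

counts = Counts()
counts.add_terms(['language', 'memory'], dim='A')
counts.add_terms(['brain', 'cognition'], dim='B')

counts.run_collection(db='pubmed')      # collects co-occurrence data (network call)

# The check_* methods can now report on both term dimensions at once
counts.check_top(dim='both')
counts.check_counts(dim='both')
counts.check_data(data_type='counts', dim='both')

# compute_score clears any previously computed score before recomputing
counts.compute_score('association')

# Drop terms whose co-occurrence values all fall below a threshold;
# note this also clears any computed score, which would otherwise be stale
counts.drop_data(5, dim='A', value='coocs')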

lisc/tests/collect/test_counts.py

Lines changed: 3 additions & 3 deletions
@@ -8,7 +8,7 @@
 def test_collect_counts_two(test_req):

     terms_a = ['language', 'memory']
-    excls_a = [['protein'], ['protein']]
+    excls_a = [['protein'], ['cell']]
     terms_b = ['brain']

     # Test co-occurence with two terms lists
@@ -22,7 +22,7 @@ def test_collect_counts_two(test_req):
 def test_collect_counts_one(test_req):

     terms_a = ['language', 'memory']
-    excls_a = [['protein'], ['protein']]
+    excls_a = [['protein'], ['cell']]

     # Test co-occurence with one list of terms
     cooc, counts, meta_data = collect_counts(\
@@ -34,7 +34,7 @@ def test_collect_counts_one(test_req):
 def test_collect_counts_nocooc(test_req):

     terms_a = ['language', 'memory']
-    excls_a = [['protein'], ['protein']]
+    excls_a = [['protein'], ['cell']]

     # Test coounts without co-occurence
     counts, meta_data = collect_counts(\

lisc/tests/objects/test_counts.py

Lines changed: 29 additions & 27 deletions
@@ -5,42 +5,40 @@
 ###################################################################################################
 ###################################################################################################

-## Helper test functions
+## Helper test functions for Counts object

 def check_dunders(counts):

-    if isinstance(counts, Counts1D):
+    label0 = counts.terms['A'].labels[0]
+    label1 = counts.terms['B' if counts.terms['B'].terms else 'A'].labels[0]

-        label = counts.labels[0]
-        out = counts[label0]
-
-        assert out == counts.counts[0]
-
-    if isinstance(counts, Counts):
-
-        label0 = counts.terms['A'].labels[0]
-        label1 = counts.terms['B' if counts.terms['B'].terms else 'A'].labels[0]
-
-        out = counts[label0, label1]
-        assert out == counts.counts[(0, 0)]
+    out = counts[label0, label1]
+    assert out == counts.counts[(0, 0)]

 def check_funcs(counts):

-    if isinstance(counts, Counts):
-        counts.check_data()
-        counts.check_top()
-        counts.check_counts()
+    for dim in ['A'] if counts.square else ['A', 'B', 'both']:
+        counts.check_data(dim=dim)
+        counts.check_top(dim=dim)
+        counts.check_counts(dim=dim)

 def drop_data(counts):

-    counts.drop_data(0)
+    for dim in ['A'] if counts.square else ['A', 'B', 'both']:
+        counts.drop_data(10, dim=dim)

 def compute_scores(counts):

-    for score_type in ['normalize', 'association', 'similarity']:
-        counts.compute_score(score_type)
-        assert counts.score.any()
-        assert counts.score_info['type'] == score_type
+    counts.compute_score('association')
+    assert counts.score.any()
+    assert counts.score_info['type'] == 'association'
+
+    for score_type in ['similarity', 'normalize']:
+        for dim in ['A'] if counts.square else ['A', 'B']:
+            counts.compute_score(score_type, dim=dim)
+            assert counts.score.any()
+            assert counts.score_info['type'] == score_type
+            assert counts.score_info['dim'] == dim

 ## Counts1D Object

@@ -57,8 +55,12 @@ def test_collect(test_req):

     assert counts.has_data

-    check_funcs(counts)
-    drop_data(counts)
+    assert counts[counts.labels[0]] == counts.counts[0]
+
+    counts.check_top()
+    counts.check_counts()
+
+    counts.drop_data(10)

 ## Counts Object

@@ -78,7 +80,7 @@ def test_collect_one(test_req):
     counts = Counts()

     counts.add_terms(['language', 'memory'], dim='A')
-    counts.add_terms(['protein', 'protein'], term_type='exclusions', dim='A')
+    counts.add_terms(['protein', 'cell'], term_type='exclusions', dim='A')

     counts.run_collection(db='pubmed', logging=test_req)
     assert counts.has_data
@@ -94,7 +96,7 @@ def test_collect_two(test_req):
     counts = Counts()

     counts.add_terms(['language', 'memory'], dim='A')
-    counts.add_terms(['protein', 'protein'], term_type='exclusions', dim='A')
+    counts.add_terms(['protein', 'cell'], term_type='exclusions', dim='A')
     counts.add_terms(['cognition'], dim='B')

     counts.run_collection(db='pubmed', logging=test_req)
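
The updated test helpers above lean on two behaviors that the diff also exercises. A small sketch of each, assuming `counts` is a Counts object for which run_collection has already completed (as in test_collect_two above), so it is not run standalone here:

# Label-based indexing returns the matching entry of the raw counts matrix
label_a = counts.terms['A'].labels[0]
label_b = counts.terms['B'].labels[0]
assert counts[label_a, label_b] == counts.counts[0, 0]

# Scores computed across a given dimension record both the score type and the dimension
counts.compute_score('similarity', dim='B')
assert counts.score_info['type'] == 'similarity'
assert counts.score_info['dim'] == 'B'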
