@@ -248,6 +248,7 @@ def add_terms(self, terms, term_type='terms', directory=None, dim='A'):
248
248
if term_type == 'terms' :
249
249
self .terms [dim ].counts = np .zeros (self .terms [dim ].n_terms , dtype = int )
250
250
251
+
251
252
def add_labels (self , terms , directory = None , dim = 'A' ):
252
253
"""Add labels for terms to the object.
253
254
@@ -375,6 +376,9 @@ def compute_score(self, score_type='association', dim='A', return_result=False):
375
376
>>> plot_dendrogram(counts) # doctest:+SKIP
376
377
"""
377
378
379
+ # Clear any previously computed score
380
+ self .clear_score ()
381
+
378
382
if not self .has_data :
379
383
raise ValueError ('No data is available - cannot proceed.' )
380
384
@@ -403,12 +407,19 @@ def compute_score(self, score_type='association', dim='A', return_result=False):
403
407
return deepcopy (self .score )
404
408
405
409
410
def clear_score(self):
    """Clear any previously computed score.

    Resets the stored score to an empty array and the stored score
    information to an empty dictionary.

    Notes
    -----
    The object is modified in place and nothing is returned.
    """

    # An empty (length-zero) array marks the score as "not computed"
    self.score = np.zeros(0)

    # Drop any metadata describing the previously computed score
    self.score_info = {}
415
+
416
+
406
417
def check_top (self , dim = 'A' ):
407
418
"""Check the terms with the most articles.
408
419
409
420
Parameters
410
421
----------
411
- dim : {'A', 'B'}, optional
422
+ dim : {'A', 'B', 'both' }, optional
412
423
Which set of terms to check.
413
424
414
425
Examples
@@ -421,18 +432,26 @@ def check_top(self, dim='A'):
421
432
if not self .has_data :
422
433
raise ValueError ('No data is available - cannot proceed.' )
423
434
424
- max_ind = np .argmax (self .terms [dim ].counts )
425
- print ("The most studied term is {} with {} articles." .format (
426
- wrap (self .terms [dim ].labels [max_ind ]),
427
- self .terms [dim ].counts [max_ind ]))
435
+ if dim == 'both' :
436
+
437
+ self .check_top ('A' )
438
+ print ('\n ' )
439
+ self .check_top ('B' )
440
+
441
+ else :
442
+
443
+ max_ind = np .argmax (self .terms [dim ].counts )
444
+ print ("The most studied term is {} with {} articles." .format (
445
+ wrap (self .terms [dim ].labels [max_ind ]),
446
+ self .terms [dim ].counts [max_ind ]))
428
447
429
448
430
449
def check_counts (self , dim = 'A' ):
431
450
"""Check how many articles were found for each term.
432
451
433
452
Parameters
434
453
----------
435
- dim : {'A', 'B'}
454
+ dim : {'A', 'B', 'both' }
436
455
Which set of terms to check.
437
456
438
457
Examples
@@ -445,14 +464,22 @@ def check_counts(self, dim='A'):
445
464
if not self .has_data :
446
465
raise ValueError ('No data is available - cannot proceed.' )
447
466
448
- # Calculate widths for printing
449
- twd = get_max_length (self .terms [dim ].labels , 2 )
450
- nwd = get_max_length (self .terms [dim ].counts )
467
+ if dim == 'both' :
451
468
452
- print ("The number of documents found for each search term is:" )
453
- for ind , term in enumerate (self .terms [dim ].labels ):
454
- print (" {:{twd}} - {:{nwd}.0f}" .format (
455
- wrap (term ), self .terms [dim ].counts [ind ], twd = twd , nwd = nwd ))
469
+ self .check_counts ('A' )
470
+ print ('\n ' )
471
+ self .check_counts ('B' )
472
+
473
+ else :
474
+
475
+ # Calculate widths for printing
476
+ twd = get_max_length (self .terms [dim ].labels , 2 )
477
+ nwd = get_max_length (self .terms [dim ].counts )
478
+
479
+ print ("The number of documents found for each search term is:" )
480
+ for ind , term in enumerate (self .terms [dim ].labels ):
481
+ print (" {:{twd}} - {:{nwd}.0f}" .format (
482
+ wrap (term ), self .terms [dim ].counts [ind ], twd = twd , nwd = nwd ))
456
483
457
484
458
485
def check_data (self , data_type = 'counts' , dim = 'A' ):
@@ -462,7 +489,7 @@ def check_data(self, data_type='counts', dim='A'):
462
489
----------
463
490
data_type : {'counts', 'score'}
464
491
Which data type to use.
465
- dim : {'A', 'B'}, optional
492
+ dim : {'A', 'B', 'both' }, optional
466
493
Which set of terms to check.
467
494
468
495
Examples
@@ -487,28 +514,36 @@ def check_data(self, data_type='counts', dim='A'):
487
514
if self .score_info ['type' ] == 'similarity' :
488
515
raise ValueError ('Cannot check value counts for similarity score.' )
489
516
490
- # Set up which direction to act across
491
- data = getattr (self , data_type )
492
- data = data .T if dim == 'B' else data
493
- alt = 'B' if dim == 'A' and not self .square else 'A'
517
+ if dim == 'both' :
494
518
495
- # Calculate widths for printing
496
- twd1 = get_max_length (self .terms [dim ].labels , 2 )
497
- twd2 = get_max_length (self .terms [alt ].labels , 2 )
498
- nwd = '>10.0f' if data_type == 'counts' else '06.3f'
519
+ self .check_data (data_type , 'A' )
520
+ print ('\n ' )
521
+ self .check_data (data_type , 'B' )
522
+
523
+ else :
524
+
525
+ # Set up which direction to act across
526
+ data = getattr (self , data_type )
527
+ data = data .T if dim == 'B' else data
528
+ alt = 'B' if dim == 'A' and not self .square else 'A'
499
529
500
- # Loop through each term, find maximally associated term and print out
501
- for term_ind , term in enumerate (self .terms [dim ].labels ):
530
+ # Calculate widths for printing
531
+ twd1 = get_max_length (self .terms [dim ].labels , 2 )
532
+ twd2 = get_max_length (self .terms [alt ].labels , 2 )
533
+ nwd = '>10.0f' if data_type == 'counts' else '06.3f'
502
534
503
- # Find the index of the most common association for current term
504
- assoc_ind = np . argmax ( data [ term_ind , :])
535
+ # Loop through each term, find maximally associated term and print out
536
+ for term_ind , term in enumerate ( self . terms [ dim ]. labels ):
505
537
506
- print ("For {:{twd1}} the highest association is {:{twd2}} with {:{nwd}}" .format (
507
- wrap (term ), wrap (self .terms [alt ].labels [assoc_ind ]),
508
- data [term_ind , assoc_ind ], twd1 = twd1 , twd2 = twd2 , nwd = nwd ))
538
+ # Find the index of the most common association for current term
539
+ assoc_ind = np .argmax (data [term_ind , :])
509
540
541
+ print ("For {:{twd1}} the highest association is {:{twd2}} with {:{nwd}}" .format (
542
+ wrap (term ), wrap (self .terms [alt ].labels [assoc_ind ]),
543
+ data [term_ind , assoc_ind ], twd1 = twd1 , twd2 = twd2 , nwd = nwd ))
510
544
511
- def drop_data (self , n_articles , dim = 'A' ):
545
+
546
+ def drop_data (self , n_articles , dim = 'A' , value = 'count' ):
512
547
"""Drop terms based on number of article results.
513
548
514
549
Parameters
@@ -517,6 +552,14 @@ def drop_data(self, n_articles, dim='A'):
517
552
Minimum number of articles required to keep each term.
518
553
dim : {'A', 'B'}, optional
519
554
Which set of terms to drop.
555
+ value : {'count', 'coocs'}
556
+ Which data count to drop based on:
557
+ 'count' : drops based on the total number of articles per term
558
+ 'coocs' : drops based on the co-occurrences, if all values are below `n_articles`
559
+
560
+ Notes
561
+ -----
562
+ This will drop any computed scores, as they may not be accurate after dropping data.
520
563
521
564
Examples
522
565
--------
@@ -525,11 +568,39 @@ def drop_data(self, n_articles, dim='A'):
525
568
>>> counts.drop_data(20) # doctest: +SKIP
526
569
"""
527
570
528
- # Set a flipper dictionary, to flip inds if needed
529
- flip_inds = {'A' : 'B' , 'B' : 'A' }
571
+ self .clear_score ()
530
572
531
- # Finds the indices of the terms with enough data to keep
532
- keep_inds = np .where (self .terms [dim ].counts >= n_articles )[0 ]
573
+ if dim == 'both' :
574
+
575
+ self .drop_data (n_articles , 'A' , value )
576
+ self .drop_data (n_articles , 'B' , value )
577
+
578
+ else :
579
+
580
+ dim_inds = {'A' : 1 , 'B' : 0 }
581
+
582
+ # Get set of indices to drop & drop them from the object
583
+ if value == 'count' :
584
+ drop_inds = np .where (self .terms [dim ].counts < n_articles )[0 ]
585
+ elif value == 'coocs' :
586
+ drop_inds = list (np .where (np .all (self .counts < n_articles , dim_inds [dim ]))[0 ])
587
+
588
+ self ._drop_terms (drop_inds , dim )
589
+
590
+
591
+ def _drop_terms (self , drop_inds , dim ):
592
+ """Sub-function to drop terms from object.
593
+
594
+ Parameters
595
+ ----------
596
+ drop_inds : list of int
597
+ Indices of terms to drop.
598
+ dim : {'A', 'B'}
599
+ Which dim to drop terms from.
600
+ """
601
+
602
+ # Invert to indices of the terms to keep
603
+ keep_inds = np .delete (np .arange (self .terms [dim ].n_terms ), drop_inds )
533
604
534
605
# Drop terms that do not have enough data
535
606
self .terms [dim ].terms = [self .terms [dim ].terms [ind ] for ind in keep_inds ]
@@ -539,6 +610,9 @@ def drop_data(self, n_articles, dim='A'):
539
610
# Create an inds dictionary that defaults to all-index slice
540
611
inds = defaultdict (lambda : np .s_ [:])
541
612
613
+ # Set a flipper dictionary, to flip inds if needed
614
+ flip_inds = {'A' : 'B' , 'B' : 'A' }
615
+
542
616
# If square, set both dims, and do array orgs needed for fancy indexing
543
617
if self .square :
544
618
inds [dim ] = keep_inds [:, None ]
@@ -548,12 +622,3 @@ def drop_data(self, n_articles, dim='A'):
548
622
549
623
# Drop raw count data for terms without enough data
550
624
self .counts = self .counts [inds ['A' ], inds ['B' ]]
551
-
552
- if self .score .any ():
553
-
554
- # If score is a similarity matrix check and flip data indices as needed
555
- if self .score_info ['type' ] == 'similarity' :
556
- inds [flip_inds [self .score_info ['dim' ]]] = inds [self .score_info ['dim' ]]
557
-
558
- # Drop score data for terms without enough data
559
- self .score = self .score [inds ['A' ], inds ['B' ]]
0 commit comments