
Commit 05edca3

Add felipe's code
1 parent a124b08 commit 05edca3

11 files changed: +1091 −0 lines

fecode/arpi_evaluator.py

Lines changed: 92 additions & 0 deletions
@@ -0,0 +1,92 @@
"""
This script evaluates a candidate clustering given a reference.
"""
from sklearn.metrics import homogeneity_completeness_v_measure
from sklearn.metrics.cluster import adjusted_rand_score
import pandas as pd

NO_CLUSTER_LABEL = -1


def evaluate_recurrent_defects(ref_df: pd.DataFrame, predictions, remove_ata_zero_section=True):
    """
    Uses sklearn's Adjusted Rand Index, homogeneity, completeness and v-measure
    to evaluate the clustering predictions.

    https://scikit-learn.org/stable/modules/generated/sklearn.metrics.adjusted_rand_score.html
    https://scikit-learn.org/stable/modules/generated/sklearn.metrics.homogeneity_score.html
    https://scikit-learn.org/stable/modules/generated/sklearn.metrics.completeness_score.html
    https://scikit-learn.org/stable/modules/generated/sklearn.metrics.v_measure_score.html

    :param ref_df: The reference dataframe.
    :param predictions: The predictions, an iterable collection of sets of defect labels belonging to
                        the same cluster, e.g.
                        [{'C-6414274-1', 'L-5245081-1'}, {'C-6414294-1', 'C-6414295-1', 'C-6414296-1'}, ...]
                        Clusters containing a single element are ignored during evaluation.
    :param remove_ata_zero_section: Remove from the reference all clusters whose ATA section is 0 (recommended).
    :return: A dict with the following keys:
             ari_score - Adjusted Rand Index, a similarity score between -1.0 and 1.0. Random labelings have
                         an ARI close to 0; 1.0 is a perfect match.
             homogeneity - A clustering result satisfies homogeneity if each predicted cluster contains only
                           data points belonging to a single reference cluster.
             completeness - A clustering result satisfies completeness if all the data points belonging to the
                            same reference cluster are found in the same predicted cluster.
             v_measure - the harmonic mean of homogeneity and completeness.
             pred_clusters - a list of predicted cluster labels, useful for debugging.
             ref_clusters - a list of reference cluster labels, useful for debugging.
             remove_ata_zero_section - copy of this function's remove_ata_zero_section argument.
    """

    filled_df = ref_df.recurrent.fillna(NO_CLUSTER_LABEL)  # when there is no recurrent id, mark as not clustered

    if remove_ata_zero_section:
        # where() keeps values where the condition holds, so keep only clusters whose section is non-zero
        filled_df.where(ref_df.section != 0, NO_CLUSTER_LABEL, inplace=True)

    # remove clusters with a single member, which are not clusters at all
    duplicate_df = filled_df.duplicated(keep=False)
    filled_df.where(duplicate_df, NO_CLUSTER_LABEL, inplace=True)
    ref_clusters = filled_df

    # convert cluster assignments from the predictions into the same order as those from the reference
    pred_clusters = convert_cluster_labels_to_seq(ref_df, predictions)

    # evaluate
    homogeneity, completeness, v_measure_score = homogeneity_completeness_v_measure(ref_clusters, pred_clusters)
    ari_score = adjusted_rand_score(ref_clusters, pred_clusters)

    return {'ari_score': ari_score, 'homogeneity': homogeneity,
            'completeness': completeness, 'v_measure': v_measure_score,
            'pred_clusters': pred_clusters, 'ref_clusters': ref_clusters,
            'remove_ata_zero_section': remove_ata_zero_section}


def convert_cluster_labels_to_seq(ref_df: pd.DataFrame, predictions):
    """Convert the predictions into a format usable by adjusted_rand_score."""

    label_to_cluster_name = {}
    for i, cluster in enumerate(predictions):
        if len(cluster) > 1:  # we only keep clusters whose size is > 1
            for label in cluster:
                label_to_cluster_name[label] = i

    result = [NO_CLUSTER_LABEL] * len(ref_df)

    for i, label in enumerate(ref_df.index):
        result[i] = label_to_cluster_name.get(label, NO_CLUSTER_LABEL)

    return result


def dump_debug_info(defect_df: pd.DataFrame, debug_info, fout):
    """
    Dumps debug info to fout for analyzing results. Lines have the format:
    defect_label predicted_cluster reference_cluster

    :param defect_df: The defect dataframe used for prediction/evaluation.
    :param debug_info: The debug info returned by evaluate_recurrent_defects.
    :param fout: The stream onto which we write the debug info.
    :return: Nothing.
    """
    print("id\tpred_label\tref_label", file=fout)
    for id, pred, ref in zip(defect_df.index, debug_info['pred_clusters'], debug_info['ref_clusters']):
        print(f"{id}\t{pred}\t{ref}", file=fout)

fecode/feacro.py

Lines changed: 137 additions & 0 deletions
@@ -0,0 +1,137 @@
"""Find candidate acronyms"""
import argparse
import sys
from operator import itemgetter

# expected to read acronym\tcontext lines, such as:
# ICS . ( INTEGRATED COOLING SYSTEM
# ICS .. ( INTEGRATED COOLING SYSTEM
# ICS 2017 ( INTEGRATED COOLING SYSTEM

# ---------------------------------------------
# command line
# ---------------------------------------------
#
def get_args():
    parser = argparse.ArgumentParser(description='search for acronyms.')

    parser.add_argument("-v", '--verbosity', type=int, help="increase output verbosity", default=0)
    parser.add_argument("-t", '--top', type=int, help="print top candidates per acronym", default=1)
    parser.add_argument("-x", '--test', action='store_true', help="only run tests", default=False)
    parser.add_argument("-m", '--min', type=int, help="min score of a resolution", default=0)

    args = parser.parse_args()
    return args

# ---------------------------------------------
# main
# ---------------------------------------------
def main():

    args = get_args()
    nblines = 0
    nbsolvings = 0
    nbsolved = 0

    if args.test:
        print(solve('SATCOM', 'SATCOM INOP'))
        print(solve("WATER", "WATER"))
        print(solve("SVDU", "BULKHEAD-MOUNTED SMART VIDEO DISPLAY UNIT"))
        print(solve("EMK", "BLABLA EQUIPMENT - EMERGENCY MEDICAL KIT NARATIVE"))
        print(solve("EMK", "SMOK"))
    else:

        for line in sys.stdin:

            nblines += 1
            t = line.rstrip().split('\t')

            if len(t) == 2:

                acro = t[0].strip()
                context = t[1].strip()

                # drop badly scored candidates, then strip the score from the survivors
                sol = list(map(itemgetter(0),
                               filter(lambda x: x[1] >= args.min,
                                      solve(acro, context, args.verbosity > 3))))

                nbsolvings += 1
                if len(sol) > 0:
                    nbsolved += 1

                if args.verbosity:
                    print(f"#acro: {acro} context: {context}", file=sys.stderr)

                # print the top candidates
                for a in sol[:args.top]:
                    print(f"{acro}\t{a}")

        print(f"#lines {nblines} #solvings {nbsolvings} #solved: {nbsolved}", file=sys.stderr)

# ---------------------------------------------
# could be made much lighter
# ---------------------------------------------
def solve(acro, context, verbose=False):

    l_acro = len(acro)
    indexes = []
    acros = []

    # currently, the score favours resolutions whose letters fall at the beginning of words,
    # as well as shorter resolutions; should be improved
    def _score():
        sc = 0
        for i in indexes:
            if i == 0 or context[i-1].isspace():
                sc += 1
        return sc * 1000 - len(_plain())

    # the resolution (string of plain words)
    def _plain():

        d = indexes[0]
        f = context.find(' ', indexes[-1])
        if verbose:
            print(f"plain: i:{indexes[-1]} d:{d} f:{f}")

        if d > 0 and not context[d-1].isspace():
            d = context.rfind(' ', 0, d-1)
            d = 0 if d == -1 else d+1
        return context[d:] if f == -1 else context[d:f]

    # nothing beats recursion
    def _solve(i_acro, i_context):
        if i_acro == l_acro:
            resolution = _plain()
            score = _score()
            acros.append((resolution, score))
            if verbose:
                print(f"info [{acro}]\t[{context}]\t{indexes}\t{score}\t{resolution}", file=sys.stderr)
                for i in indexes:
                    print('sfx: ', context[i:], file=sys.stderr)
        else:
            a = acro[i_acro]
            i = context.find(a, i_context)
            while i != -1:
                indexes.append(i)
                _solve(i_acro+1, i+1)
                indexes.pop()
                i = context.find(a, i+1)

    _solve(0, 0)

    # sort the candidates by decreasing score
    return sorted(acros, key=itemgetter(1), reverse=True)

# ---------------------------------------------
#
# ---------------------------------------------
if __name__ == '__main__':
    main()
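
A quick sketch of calling solve on the ICS example from the header comment, again assuming fecode is importable as a package; solve returns (resolution, score) tuples sorted by decreasing score:

from fecode.feacro import solve

candidates = solve("ICS", "ICS 2017 ( INTEGRATED COOLING SYSTEM")
for resolution, score in candidates[:3]:
    print(f"{score}\t{resolution}")
# the top candidate should be "INTEGRATED COOLING SYSTEM": its I, C and S
# each begin a word, which the scoring heuristic rewards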

fecode/fecluster_stats.py

Lines changed: 154 additions & 0 deletions
@@ -0,0 +1,154 @@
"""
An evolving tool for computing info from actual clusters
"""
import argparse
import os
import pandas as pd
import pickle
import sys
import traceback

from nltk.tokenize import word_tokenize
from collections import Counter

# ---------------------------------------------
# a few helpers
# ---------------------------------------------

# just for custom printing (trace purposes)
class FeCounter(Counter):
    def __str__(self):
        return " ".join(f'{k}' for k, v in self.items() if v > 1)

# ---------------------------------------------
# command-line handling
# ---------------------------------------------

def get_args():

    parser = argparse.ArgumentParser(description='analyse clusters')
    parser.add_argument("input_file", help="A pickle input file, e.g. aircan-data-split-clean.pkl.")

    parser.add_argument("-v", '--verbosity', type=int, help="increase output verbosity", default=0)
    parser.add_argument("-t", '--test', action='store_true', help="use the test split instead of the train split", default=False)
    parser.add_argument("-p", '--pickle', type=str, help="pickle the bags of words to this file", default=None)

    args = parser.parse_args()
    return args

# ---------------------------------------------
# main
# ---------------------------------------------

def main():

    # parse args
    args = get_args()

    if not os.path.exists(args.input_file):
        print(f"Invalid input file: {args.input_file}", file=sys.stderr)
        sys.exit(1)

    # read data; this loads 6 pandas DataFrames, which allow fast manipulations and (slower) iterations
    # more info on pandas here: https://pandas.pydata.org/
    try:
        with open(args.input_file, 'rb') as fin:
            [defect_df_train, defect_df_dev, defect_df_test, ata_df, mel_df, trax_df] = pickle.load(fin)
        print(f"Read # samples: {len(defect_df_train)} train, {len(defect_df_dev)} dev, {len(defect_df_test)} test.")
    except Exception:
        print("Loading the pickle failed.", file=sys.stderr)

        if pd.__version__ != '1.1.0':
            print("""You can upgrade your version of pandas with the command
'pip install 'pandas==1.1.0' --force-reinstall'.""", file=sys.stderr)

            print("""You can also recreate the pickle by following the instructions here:
https://github.com/rali-udem/arpi_air_canada#data-preparation""", file=sys.stderr)
        print()
        traceback.print_exc()
        sys.exit(1)

    # basic stats on each cluster
    check_ref_clusters(defect_df_test if args.test else defect_df_train, args.pickle)


# ---------------------------------------------------------------
# felipe's function (to ramp up on pandas, which I had never used seriously)
# incidentally produces views of clusters
# ---------------------------------------------------------------

def check_ref_clusters(defect, save):

    # ATA signature -> bow (counter)
    bows = {}

    # note: ATA signatures might be null, which might generate some noise (even bugs);
    # for now I leave it like this

    grouped_by_recurrent = defect.groupby('recurrent')
    for name, group in grouped_by_recurrent:

        if len(group) == 1:
            # ignore clusters with only one member (it does happen!)
            print(f"#WARNING: recurrent defect {name} has only one member (skipped)")
        else:

            # count the number of chapter-section signatures per cluster
            # (mind you: some clusters have numerous signatures, which defeats my understanding of TRAX)
            grouped_by_ata = group.groupby(['chapter', 'section'])
            print(f"---\n#INFO: Recurrent defect {name}, with {len(group)} member(s), and {len(grouped_by_ata)} ata-code(s)")
            if len(grouped_by_ata) > 1:
                # warn if more than one signature
                print(f"#WARNING: more than one chapter-section ({len(grouped_by_ata)})")

            # number of lines retained in the cluster
            nb = 0

            # keep track of the words in a given cluster
            c = FeCounter()

            # iterate over the signatures in the cluster
            for sname, sgroup in grouped_by_ata:
                code = f"{sname[0]}-{sname[1]}"
                print(f"+ ata-code: {code}")
                if sname[1] != 0:
                    # print the concerned lines, provided the section is not 0 and the description is filled
                    for index, row in sgroup.iterrows():
                        desc = row['defect_description']
                        if pd.notnull(desc):
                            c.update(word_tokenize(desc.lower()))
                            print(f"\t#line\t{index}\t{row['chapter']}-{row['section']}\t{row['ac']}\t{desc}")
                            nb += 1

                    if code not in bows:
                        bows[code] = Counter(c)
                    else:
                        bows[code].update(c)

            # cluster-wise logging
            print(f"#trace: {nb} safe lines for defect {name}")
            print("#bow: ", c)

    # dataset-wise logging
    print(f"#ata-signatures: {len(bows)}")
    for signature, bow in bows.items():
        b = bow.most_common(10)
        print(f"#bow({signature}) [{len(bow)}]: {b}")

    if save is not None:
        with open(save, 'wb') as outfile:
            pickle.dump(bows, outfile)
        print(f"Generated pickle: {save}")


if __name__ == '__main__':
    main()
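
A small sketch for re-reading the bag-of-words pickle written via --pickle; the bows.pkl file name below is hypothetical:

import pickle

# after e.g.: python fecluster_stats.py aircan-data-split-clean.pkl -p bows.pkl
with open('bows.pkl', 'rb') as fin:
    bows = pickle.load(fin)  # dict: ATA signature -> Counter of word frequencies

for signature, bow in sorted(bows.items()):
    print(signature, bow.most_common(5))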
