
Commit 05edca3

Add felipe's code
1 parent a124b08 commit 05edca3

11 files changed: +1091 −0 lines

fecode/arpi_evaluator.py

Lines changed: 92 additions & 0 deletions
@@ -0,0 +1,92 @@
"""
This script evaluates a candidate clustering given a reference.
"""
from sklearn.metrics import homogeneity_completeness_v_measure
from sklearn.metrics.cluster import adjusted_rand_score
import pandas as pd

NO_CLUSTER_LABEL = -1


def evaluate_recurrent_defects(ref_df: pd.DataFrame, predictions, remove_ata_zero_section=True):
    """
    Uses sklearn's Adjusted Rand Index, homogeneity, completeness and v-measure
    to evaluate the clustering predictions.

    https://scikit-learn.org/stable/modules/generated/sklearn.metrics.adjusted_rand_score.html
    https://scikit-learn.org/stable/modules/generated/sklearn.metrics.homogeneity_score.html
    https://scikit-learn.org/stable/modules/generated/sklearn.metrics.completeness_score.html
    https://scikit-learn.org/stable/modules/generated/sklearn.metrics.v_measure_score.html

    :param ref_df: The reference dataframe.
    :param predictions: The predictions, an iterable collection of sets of defect labels belonging to
                        the same cluster, e.g.
                        [{'C-6414274-1', 'L-5245081-1'}, {'C-6414294-1', 'C-6414295-1', 'C-6414296-1'}, ...]
                        Clusters containing a single element are ignored during evaluation.
    :param remove_ata_zero_section: Remove from the reference all clusters whose ATA section is 0 (recommended).
    :return: A dict with the following keys:
             ari_score - Adjusted Rand Index, a similarity score between -1.0 and 1.0. Random labelings have
                         an ARI close to 0; 1.0 is a perfect match.
             homogeneity - A clustering result satisfies homogeneity if each predicted cluster contains only
                           data points belonging to a single reference cluster.
             completeness - A clustering result satisfies completeness if all the data points belonging to the
                            same reference cluster are found in the same predicted cluster.
             v_measure - the harmonic mean of homogeneity and completeness.
             pred_clusters - a list of predicted cluster labels, useful for debugging.
             ref_clusters - a list of reference cluster labels, useful for debugging.
             remove_ata_zero_section - copy of this function's remove_ata_zero_section argument.
    """

    filled_df = ref_df.recurrent.fillna(NO_CLUSTER_LABEL)  # when there is no recurrent id, mark as not clustered

    if remove_ata_zero_section:
        # where() keeps values where the condition holds, so keep only clusters whose section is non-zero
        filled_df.where(ref_df.section != 0, NO_CLUSTER_LABEL, inplace=True)

    # remove clusters with a single member, which are not clusters at all
    duplicate_df = filled_df.duplicated(keep=False)
    filled_df.where(duplicate_df, NO_CLUSTER_LABEL, inplace=True)
    ref_clusters = filled_df

    # convert cluster assignments from the predictions into the same order as those from the reference
    pred_clusters = convert_cluster_labels_to_seq(ref_df, predictions)

    # evaluate
    homogeneity, completeness, v_measure_score = homogeneity_completeness_v_measure(ref_clusters, pred_clusters)
    ari_score = adjusted_rand_score(ref_clusters, pred_clusters)

    return {'ari_score': ari_score, 'homogeneity': homogeneity,
            'completeness': completeness, 'v_measure': v_measure_score,
            'pred_clusters': pred_clusters, 'ref_clusters': ref_clusters,
            'remove_ata_zero_section': remove_ata_zero_section}


def convert_cluster_labels_to_seq(ref_df: pd.DataFrame, predictions):
    """Convert the predictions into a format usable by adjusted_rand_score."""

    label_to_cluster_name = {}
    for i, cluster in enumerate(predictions):
        if len(cluster) > 1:  # we only keep clusters whose size is > 1
            for label in cluster:
                label_to_cluster_name[label] = i

    result = [NO_CLUSTER_LABEL] * len(ref_df)

    for i, label in enumerate(ref_df.index):
        result[i] = label_to_cluster_name.get(label, NO_CLUSTER_LABEL)

    return result


def dump_debug_info(defect_df: pd.DataFrame, debug_info, fout):
    """
    Dumps debug info to fout for analyzing results. Lines have the format:
    defect_label predicted_cluster reference_cluster

    :param defect_df: The defect dataframe used for prediction/evaluation.
    :param debug_info: The debug info returned by evaluate_recurrent_defects.
    :param fout: The stream onto which we write the debug info.
    :return: Nothing.
    """
    print("id\tpred_label\tref_label", file=fout)
    for id, pred, ref in zip(defect_df.index, debug_info['pred_clusters'], debug_info['ref_clusters']):
        print(f"{id}\t{pred}\t{ref}", file=fout)

fecode/feacro.py

Lines changed: 137 additions & 0 deletions
@@ -0,0 +1,137 @@
"""Find candidate acronyms"""
import argparse
import sys
from operator import itemgetter

# expected to read acronym\tcontext lines, such as:
# ICS . ( INTEGRATED COOLING SYSTEM
# ICS .. ( INTEGRATED COOLING SYSTEM
# ICS 2017 ( INTEGRATED COOLING SYSTEM

# ---------------------------------------------
# command line
# ---------------------------------------------
#
def get_args():
    parser = argparse.ArgumentParser(description='search for acronyms.')

    parser.add_argument("-v", '--verbosity', type=int, help="increase output verbosity", default=0)
    parser.add_argument("-t", '--top', type=int, help="print top candidates per acronym", default=1)
    parser.add_argument("-x", '--test', action='store_true', help="only run tests", default=False)
    parser.add_argument("-m", '--min', type=int, help="min score of a resolution", default=0)

    args = parser.parse_args()
    return args

# ---------------------------------------------
# main
# ---------------------------------------------
def main():

    args = get_args()
    nblines = 0
    nbsolvings = 0
    nbsolved = 0

    if args.test:
        print(solve('SATCOM', 'SATCOM INOP'))
        print(solve("WATER", "WATER"))
        print(solve("SVDU", "BULKHEAD-MOUNTED SMART VIDEO DISPLAY UNIT"))
        print(solve("EMK", "BLABLA EQUIPMENT - EMERGENCY MEDICAL KIT NARATIVE"))
        print(solve("EMK", "SMOK"))
    else:

        for line in sys.stdin:

            nblines += 1
            t = line.rstrip().split('\t')

            if len(t) == 2:

                acro = t[0].strip()
                context = t[1].strip()

                # drop badly scored candidates, then strip the score from the survivors
                sol = list(map(itemgetter(0),
                               filter(lambda x: x[1] >= args.min,
                                      solve(acro, context, args.verbosity > 3))))

                nbsolvings += 1
                if len(sol) > 0:
                    nbsolved += 1

                if args.verbosity:
                    print(f"#acro: {acro} context: {context}", file=sys.stderr)

                # print the top candidates
                for a in sol[:args.top]:
                    print(f"{acro}\t{a}")

        print(f"#lines {nblines} #solvings {nbsolvings} #solved: {nbsolved}", file=sys.stderr)

# ---------------------------------------------
# could be made much lighter
# ---------------------------------------------
def solve(acro, context, verbose=False):

    l_acro = len(acro)
    indexes = []
    acros = []

    # currently, the score favours resolutions whose letters fall at the beginning of words,
    # as well as shorter resolutions; should be improved
    def _score():
        sc = 0
        for i in indexes:
            if i == 0 or context[i-1].isspace():
                sc += 1
        return sc * 1000 - len(_plain())

    # the resolution (string of plain words)
    def _plain():

        d = indexes[0]
        f = context.find(' ', indexes[-1])
        if verbose:
            print(f"plain: i:{indexes[-1]} d:{d} f:{f}")

        if d > 0 and not context[d-1].isspace():
            d = context.rfind(' ', 0, d-1)
            d = 0 if d == -1 else d+1
        return context[d:] if f == -1 else context[d:f]

    # nothing beats recursion
    def _solve(i_acro, i_context):
        if i_acro == l_acro:
            resolution = _plain()
            score = _score()
            acros.append((resolution, score))
            if verbose:
                print(f"info [{acro}]\t[{context}]\t{indexes}\t{score}\t{resolution}", file=sys.stderr)
                for i in indexes:
                    print('sfx: ', context[i:], file=sys.stderr)
        else:
            a = acro[i_acro]
            i = context.find(a, i_context)
            while i != -1:
                indexes.append(i)
                _solve(i_acro+1, i+1)
                indexes.pop()
                i = context.find(a, i+1)

    _solve(0, 0)

    # sort the candidates by decreasing score
    return sorted(acros, key=itemgetter(1), reverse=True)

# ---------------------------------------------
#
# ---------------------------------------------
if __name__ == '__main__':
    main()
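
A quick sketch of calling solve on the ICS example from the header comment, again assuming fecode is importable as a package; solve returns (resolution, score) tuples sorted by decreasing score:

from fecode.feacro import solve

candidates = solve("ICS", "ICS 2017 ( INTEGRATED COOLING SYSTEM")
for resolution, score in candidates[:3]:
    print(f"{score}\t{resolution}")
# the top candidate should be "INTEGRATED COOLING SYSTEM": its I, C and S
# each begin a word, which the scoring heuristic rewards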

fecode/fecluster_stats.py

Lines changed: 154 additions & 0 deletions
@@ -0,0 +1,154 @@
"""
An evolving tool for computing info from actual clusters
"""
import argparse
import os
import pandas as pd
import pickle
import sys
import traceback

from nltk.tokenize import word_tokenize
from collections import Counter

# ---------------------------------------------
# a few helpers
# ---------------------------------------------

# just for custom printing (trace purposes)
class FeCounter(Counter):
    def __str__(self):
        return " ".join(f'{k}' for k, v in self.items() if v > 1)

# ---------------------------------------------
# command-line handling
# ---------------------------------------------

def get_args():

    parser = argparse.ArgumentParser(description='analyse clusters')
    parser.add_argument("input_file", help="A pickle input file, e.g. aircan-data-split-clean.pkl.")

    parser.add_argument("-v", '--verbosity', type=int, help="increase output verbosity", default=0)
    parser.add_argument("-t", '--test', action='store_true', help="use the test split instead of the train split", default=False)
    parser.add_argument("-p", '--pickle', type=str, help="pickle the bags of words to this file", default=None)

    args = parser.parse_args()
    return args

# ---------------------------------------------
# main
# ---------------------------------------------

def main():

    # parse args
    args = get_args()

    if not os.path.exists(args.input_file):
        print(f"Invalid input file: {args.input_file}", file=sys.stderr)
        sys.exit(1)

    # read data; this loads 6 pandas DataFrames, which allow fast manipulations and (slower) iterations
    # more info on pandas here: https://pandas.pydata.org/
    try:
        with open(args.input_file, 'rb') as fin:
            [defect_df_train, defect_df_dev, defect_df_test, ata_df, mel_df, trax_df] = pickle.load(fin)
        print(f"Read # samples: {len(defect_df_train)} train, {len(defect_df_dev)} dev, {len(defect_df_test)} test.")
    except Exception:
        print("Loading the pickle failed.", file=sys.stderr)

        if pd.__version__ != '1.1.0':
            print("""You can upgrade your version of pandas with the command
'pip install 'pandas==1.1.0' --force-reinstall'.""", file=sys.stderr)

            print("""You can also recreate the pickle by following the instructions here:
https://github.com/rali-udem/arpi_air_canada#data-preparation""", file=sys.stderr)
        print()
        traceback.print_exc()
        sys.exit(1)

    # basic stats on each cluster
    check_ref_clusters(defect_df_test if args.test else defect_df_train, args.pickle)


# ---------------------------------------------------------------
# felipe's function (to ramp up on pandas, which I had never used seriously)
# incidentally produces views of clusters
# ---------------------------------------------------------------

def check_ref_clusters(defect, save):

    # ATA signature -> bow (counter)
    bows = {}

    # note: ATA signatures might be null, which might generate some noise (even bugs);
    # for now I leave it like this

    grouped_by_recurrent = defect.groupby('recurrent')
    for name, group in grouped_by_recurrent:

        if len(group) == 1:
            # ignore clusters with only one member (it does happen!)
            print(f"#WARNING: recurrent defect {name} has only one member (skipped)")
        else:

            # count the number of chapter-section signatures per cluster
            # (mind you: some clusters have numerous signatures, which defeats my understanding of TRAX)
            grouped_by_ata = group.groupby(['chapter', 'section'])
            print(f"---\n#INFO: Recurrent defect {name}, with {len(group)} member(s), and {len(grouped_by_ata)} ata-code(s)")
            if len(grouped_by_ata) > 1:
                # warn if more than one signature
                print(f"#WARNING: more than one chapter-section ({len(grouped_by_ata)})")

            # number of lines retained in the cluster
            nb = 0

            # keep track of the words in a given cluster
            c = FeCounter()

            # iterate over the signatures in the cluster
            for sname, sgroup in grouped_by_ata:
                code = f"{sname[0]}-{sname[1]}"
                print(f"+ ata-code: {code}")
                if sname[1] != 0:
                    # print the concerned lines, provided the section is not 0 and the description is filled
                    for index, row in sgroup.iterrows():
                        desc = row['defect_description']
                        if pd.notnull(desc):
                            c.update(word_tokenize(desc.lower()))
                            print(f"\t#line\t{index}\t{row['chapter']}-{row['section']}\t{row['ac']}\t{desc}")
                            nb += 1

                    if code not in bows:
                        bows[code] = Counter(c)
                    else:
                        bows[code].update(c)

            # cluster-wise logging
            print(f"#trace: {nb} safe lines for defect {name}")
            print("#bow: ", c)

    # dataset-wise logging
    print(f"#ata-signatures: {len(bows)}")
    for signature, bow in bows.items():
        b = bow.most_common(10)
        print(f"#bow({signature}) [{len(bow)}]: {b}")

    if save is not None:
        with open(save, 'wb') as outfile:
            pickle.dump(bows, outfile)
        print(f"Generated pickle: {save}")


if __name__ == '__main__':
    main()
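
A small sketch for re-reading the bag-of-words pickle written via --pickle; the bows.pkl file name below is hypothetical:

import pickle

# after e.g.: python fecluster_stats.py aircan-data-split-clean.pkl -p bows.pkl
with open('bows.pkl', 'rb') as fin:
    bows = pickle.load(fin)  # dict: ATA signature -> Counter of word frequencies

for signature, bow in sorted(bows.items()):
    print(signature, bow.most_common(5))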
