From d41668db5462fd696fc227d541e936aee544de19 Mon Sep 17 00:00:00 2001 From: "diego.esteves" Date: Wed, 6 May 2020 17:47:37 +0100 Subject: [PATCH] #51 - text features dict - feat extraction - bash update (spacy models) --- scripts/05_feature_extraction.py | 4 +- scripts/picklep2to3.py | 39 +++++++++++++++++++ .../text_classification/bow_tfidf.py | 15 +++---- 3 files changed, 49 insertions(+), 9 deletions(-) create mode 100644 scripts/picklep2to3.py diff --git a/scripts/05_feature_extraction.py b/scripts/05_feature_extraction.py index 0f848aa..ee4b914 100644 --- a/scripts/05_feature_extraction.py +++ b/scripts/05_feature_extraction.py @@ -9,8 +9,8 @@ config = HorusConfig() - EXTRACT_LEXICAL = True - EXTRACT_TEXT = False + EXTRACT_LEXICAL = False + EXTRACT_TEXT = True EXTRACT_IMAGE = False _extractor_lexical = None diff --git a/scripts/picklep2to3.py b/scripts/picklep2to3.py new file mode 100644 index 0000000..260004c --- /dev/null +++ b/scripts/picklep2to3.py @@ -0,0 +1,39 @@ +# kimchi.py +# For converting Python 2 pickles to Python 3 + +import os +import dill +import pickle +import argparse + + +def convert(old_pkl): + """ + Convert a Python 2 pickle to Python 3 + """ + # Make a name for the new pickle + new_pkl = os.path.splitext(os.path.basename(old_pkl))[0]+"_p3.pkl" + + # Convert Python 2 "ObjectType" to Python 3 object + dill._dill._reverse_typemap["ObjectType"] = object + + # Open the pickle using latin1 encoding + with open(old_pkl, "rb") as f: + loaded = pickle.load(f, encoding="latin1") + + # Re-save as Python 3 pickle + with open(new_pkl, "wb") as outfile: + pickle.dump(loaded, outfile) + + +if __name__ == "__main__": + #parser = argparse.ArgumentParser( + # description="Convert a Python 2 pickle to Python 3" + #) + + #parser.add_argument("infile", help="Python 2 pickle filename") + + #args = parser.parse_args() + + #convert(args.infile) + convert('/Volumes/dne5ssd/horus/resources/models/tfidf-ml/text_classification_LinearSVC.pkl') \ No newline at end of file diff --git a/src/algorithms/text_classification/bow_tfidf.py b/src/algorithms/text_classification/bow_tfidf.py index 42d7731..6dd6334 100644 --- a/src/algorithms/text_classification/bow_tfidf.py +++ b/src/algorithms/text_classification/bow_tfidf.py @@ -1,28 +1,29 @@ - from sklearn.externals import joblib from sklearn.feature_extraction.text import TfidfTransformer -class BowTfidf(): + +class BowTfidf(object): def __init__(self, config): try: self.config = config self.config.logger.debug('loading TF-IDF') + # TODO: retrain all models (Python2to3 issue on joblib) and remove the encoding parameter self.text_checking_model_1 = joblib.load(config.models_1_text) self.text_checking_model_2 = joblib.load(config.models_2_text) self.text_checking_model_3 = joblib.load(config.models_3_text) self.text_checking_model_4 = joblib.load(config.models_4_text) self.text_checking_model_5 = joblib.load(config.models_5_text) - #self.tfidf_transformer = TfidfTransformer() + # self.tfidf_transformer = TfidfTransformer() except Exception as e: raise e def detect_text_klass(self, text): try: predictions = [self.text_checking_model_1.predict(text)[0], - self.text_checking_model_2.predict(text)[0], - self.text_checking_model_3.predict(text)[0], - self.text_checking_model_4.predict(text)[0], - self.text_checking_model_5.predict(text)[0]] + self.text_checking_model_2.predict(text)[0], + self.text_checking_model_3.predict(text)[0], + self.text_checking_model_4.predict(text)[0], + self.text_checking_model_5.predict(text)[0]] return predictions except Exception as e: