Skip to content

Commit

Permalink
#51
Browse files Browse the repository at this point in the history
- text features dict
- feat extraction
- bash update (spacy models)
  • Loading branch information
diegoesteves committed May 6, 2020
1 parent 9e3db92 commit d41668d
Show file tree
Hide file tree
Showing 3 changed files with 49 additions and 9 deletions.
4 changes: 2 additions & 2 deletions scripts/05_feature_extraction.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,8 +9,8 @@

# Configuration object; HorusConfig is defined outside this excerpt.
config = HorusConfig()

# Module-level switches, presumably selecting which feature families
# (lexical / text / image) the script extracts -- verify against the code
# that reads them, which is not visible here.
# NOTE(review): EXTRACT_LEXICAL and EXTRACT_TEXT are each assigned twice.
# This looks like the removed and added lines of a diff shown together --
# confirm. Read top to bottom, the later assignment wins:
# EXTRACT_LEXICAL = False, EXTRACT_TEXT = True.
EXTRACT_LEXICAL = True
EXTRACT_TEXT = False
EXTRACT_LEXICAL = False
EXTRACT_TEXT = True
EXTRACT_IMAGE = False

# Initialised to None; where it is populated is not visible in this excerpt.
_extractor_lexical = None
Expand Down
39 changes: 39 additions & 0 deletions scripts/picklep2to3.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
# picklep2to3.py (this header previously named the script "kimchi.py")
# Converts a Python 2 pickle into a Python 3 pickle.

import os
import dill
import pickle
import argparse


def convert(old_pkl):
    """Convert a Python 2 pickle to a Python 3 pickle.

    The input is unpickled with ``encoding="latin1"`` and dumped again with
    the running interpreter's default pickle protocol. The output file is
    named ``<input stem>_p3.pkl``; because only the base name of ``old_pkl``
    is kept, it is written to the current working directory, not next to
    the input file.

    Args:
        old_pkl: Path to the Python 2 pickle file.

    Returns:
        The path of the newly written pickle, relative to the current
        working directory.

    Warning:
        Unpickling can execute arbitrary code. Only convert pickles that
        come from a trusted source.
    """
    # Build the output name from the input's base name (directory dropped).
    new_pkl = os.path.splitext(os.path.basename(old_pkl))[0] + "_p3.pkl"

    # Register the Python 2 "ObjectType" name as Python 3's ``object`` in
    # dill's reverse type map (a private dill attribute).
    dill._dill._reverse_typemap["ObjectType"] = object

    # Open the pickle using latin1 encoding
    with open(old_pkl, "rb") as f:
        loaded = pickle.load(f, encoding="latin1")

    # Re-save as Python 3 pickle
    with open(new_pkl, "wb") as outfile:
        pickle.dump(loaded, outfile)

    # Let callers know where the converted file was written.
    return new_pkl


if __name__ == "__main__":
    # Path that was hard-coded while the CLI was commented out. It is kept
    # as the default so running the script with no arguments behaves exactly
    # as before.
    default_infile = '/Volumes/dne5ssd/horus/resources/models/tfidf-ml/text_classification_LinearSVC.pkl'

    parser = argparse.ArgumentParser(
        description="Convert a Python 2 pickle to Python 3"
    )
    # Optional positional: pass a filename to convert a different pickle.
    parser.add_argument(
        "infile",
        nargs="?",
        default=default_infile,
        help="Python 2 pickle filename (default: %(default)s)",
    )
    args = parser.parse_args()

    convert(args.infile)
15 changes: 8 additions & 7 deletions src/algorithms/text_classification/bow_tfidf.py
Original file line number Diff line number Diff line change
@@ -1,28 +1,29 @@

from sklearn.externals import joblib
from sklearn.feature_extraction.text import TfidfTransformer

class BowTfidf():

class BowTfidf(object):
def __init__(self, config):
    """Load the five serialized text-classification models.

    Args:
        config: Configuration object. This method uses ``config.logger``
            and passes ``config.models_1_text`` through
            ``config.models_5_text`` to ``joblib.load``.

    Raises:
        Exception: Anything raised by the logger or by ``joblib.load``
            propagates to the caller unchanged. The previous
            ``try/except Exception as e: raise e`` wrapper re-raised the
            same exception, so it was removed.
    """
    self.config = config
    self.config.logger.debug('loading TF-IDF')
    # TODO: retrain all models (Python2to3 issue on joblib) and remove the encoding parameter
    # NOTE(review): no encoding argument is passed in the calls below --
    # confirm whether the TODO above is still accurate.
    self.text_checking_model_1 = joblib.load(config.models_1_text)
    self.text_checking_model_2 = joblib.load(config.models_2_text)
    self.text_checking_model_3 = joblib.load(config.models_3_text)
    self.text_checking_model_4 = joblib.load(config.models_4_text)
    self.text_checking_model_5 = joblib.load(config.models_5_text)

def detect_text_klass(self, text):
try:
predictions = [self.text_checking_model_1.predict(text)[0],
self.text_checking_model_2.predict(text)[0],
self.text_checking_model_3.predict(text)[0],
self.text_checking_model_4.predict(text)[0],
self.text_checking_model_5.predict(text)[0]]
self.text_checking_model_2.predict(text)[0],
self.text_checking_model_3.predict(text)[0],
self.text_checking_model_4.predict(text)[0],
self.text_checking_model_5.predict(text)[0]]

return predictions
except Exception as e:
Expand Down

0 comments on commit d41668d

Please sign in to comment.