|
"""Train and evaluate a Multinomial Naive Bayes text classifier.

Reads ``naivetext.csv`` as two header-less columns (message text and a
``pos``/``neg`` label), splits it into train and test sets, vectorises the
messages with a bag-of-words ``CountVectorizer``, fits ``MultinomialNB`` and
prints accuracy, the confusion matrix, precision and recall.
"""
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn import metrics

# ``names=`` supplies the column names, so the first CSV row is read as data.
msg = pd.read_csv('naivetext.csv', names=['message', 'label'])
print('The dimensions of the dataset', msg.shape)

# Encode the textual sentiment as a binary target: pos -> 1, neg -> 0.
msg['labelnum'] = msg.label.map({'pos': 1, 'neg': 0})

# Series.map leaves NaN for any label outside the mapping; fail here with a
# clear message rather than letting the classifier reject NaN targets later.
unmapped = msg['labelnum'].isna()
if unmapped.any():
    raise ValueError(
        "naivetext.csv contains labels other than 'pos'/'neg': %r"
        % sorted(msg.loc[unmapped, 'label'].astype(str).unique())
    )

X = msg.message
y = msg.labelnum

# No random_state is passed, so the split (and the scores) vary run to run.
xtrain, xtest, ytrain, ytest = train_test_split(X, y)
print ('\n the total number of Training Data :', ytrain.shape)
print ('\n the total number of Test Data :', ytest.shape)

# Learn the vocabulary from the training messages only, then reuse it to
# encode the test messages.
cv = CountVectorizer()
xtrain_dtm = cv.fit_transform(xtrain)
xtest_dtm = cv.transform(xtest)

# get_feature_names() was removed in scikit-learn 1.2; prefer the replacement
# and fall back only on releases that predate get_feature_names_out().
if hasattr(cv, 'get_feature_names_out'):
    feature_names = cv.get_feature_names_out().tolist()
else:
    feature_names = cv.get_feature_names()

print('\n The words or Tokens in the text documents \n')
print(feature_names)

# Dense, labelled view of the training document-term matrix (kept for
# inspection; it is not used by the model below).
df = pd.DataFrame(xtrain_dtm.toarray(), columns=feature_names)

clf = MultinomialNB().fit(xtrain_dtm, ytrain)
predicted = clf.predict(xtest_dtm)

print('\n Accuracy of the classifier is', metrics.accuracy_score(ytest, predicted))
print('\n Confusion matrix')
print(metrics.confusion_matrix(ytest, predicted))
print('\n The value of Precision', metrics.precision_score(ytest, predicted))
print('\n The value of Recall', metrics.recall_score(ytest, predicted))
0 commit comments