Skip to content

Commit c7db3fc

Browse files
committed
add tfidf.py
2 parents 26b0657 + 5fc9b88 commit c7db3fc

File tree

4,776 files changed

+18264
-1928901
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

4,776 files changed

+18264
-1928901
lines changed

.gitignore

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
FFP/save_model.sav

FFP/README.md

100644100755
File mode changed.

FFP/bar.py

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,8 @@
1+
import matplotlib.pyplot as plt
2+
3+
y = [0.99, 0.93]
4+
# One label per bar. Without the comma the two adjacent string literals are
# concatenated into the single label "TrainTest", which no longer matches
# the two values in y.
x = ["Train", "Test"]
5+
6+
plt.title("FFP")
7+
plt.bar(x, y)
8+
plt.show()

FFP/load.py

100644100755
Lines changed: 18 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -1,30 +1,29 @@
11
import pickle
22
from sklearn.metrics import accuracy_score
3-
from sklearn.metrics import classification_report,confusion_matrix
4-
from sklearn.svm import SVC
5-
import sys
6-
sys.path.append('module')
7-
import ffp
3+
from sklearn.metrics import classification_report
4+
from FFP.module import ffp
85

9-
def load(train_path,test_path,k):
10-
features=ffp.parsing(train_path,k)
11-
train=ffp.matrix(train_path,features,k)
126

13-
train_data=train[:,0:len(features)-1]
14-
train_label=train[:,len(features)-1]
7+
def load(train_path, test_path, k):
8+
features = ffp.parsing(train_path, k)
9+
train = ffp.matrix(train_path, features, k)
1510

16-
test=ffp.matrix(test_path,features,k)
11+
train_data = train[:, 0:len(features)-1]
12+
train_label = train[:, len(features)-1]
1713

18-
test_data=test[:,0:len(features)-1]
19-
test_label=test[:,len(features)-1]
20-
21-
loaded_model = pickle.load(open("save_model.sav", 'rb'))
14+
test = ffp.matrix(test_path, features, k)
2215

23-
pred=loaded_model.predict(test_data)
16+
test_data = test[:, 0:len(features)-1]
17+
test_label = test[:, len(features)-1]
2418

25-
print("Test k :"+str(k)+" = "+str(accuracy_score(test_label,pred)))
19+
loaded_model = pickle.load(open("save_model.sav", 'rb'))
20+
21+
pred = loaded_model.predict(test_data)
22+
23+
print("Test k :"+str(k)+" = "+str(accuracy_score(test_label,pred)))
24+
25+
print(classification_report(test_label, pred, target_names=['class 0', 'class 2']))
26+
return accuracy_score(test_label, pred)
2627

27-
print(classification_report(test_label, pred, target_names=['class 0','class 2']))
28-
return accuracy_score(test_label,pred)
2928

3029
load("public/train.txt","public/test.txt",2)

FFP/main.py

100644100755
Lines changed: 71 additions & 80 deletions
Original file line numberDiff line numberDiff line change
@@ -1,98 +1,89 @@
1-
import numpy as np
2-
import pydotplus
3-
import collections
41
from sklearn.svm import SVC
52
from sklearn.metrics import accuracy_score
6-
from sklearn.metrics import classification_report,confusion_matrix
3+
from sklearn.metrics import classification_report
74
from sklearn.model_selection import GridSearchCV
8-
import sys
9-
sys.path.append('module')
10-
import ffp
11-
import matplotlib as mpl
12-
import matplotlib.pylab as plt
13-
import multiprocessing as mp
14-
import initial
15-
import frequency
5+
from FFP.module import ffp, initial, frequency
166
import pickle
177

18-
def preprocess(path,p):
19-
count=0
20-
21-
indexs=frequency.convert_index(path,"public/first.txt",p)
22-
23-
with open("public/first.txt",'r') as f:
24-
read=f.read()
25-
for index in indexs:
26-
read=read.replace(index,initial.initial(index))
27-
28-
with open("public/second.txt",'w') as f:
29-
for line in read.split('\n'):
30-
if line != '\n':
31-
f.write(line+'\n')
32-
33-
with open("public/second.txt",'r') as In:
34-
with open("public/third.txt",'w') as Out:
35-
for line in iter(lambda: In.readline(),''):
36-
if line[0]=='2' or line[0]=='0' :
37-
print(line)
38-
count+=1
39-
Out.write(line)
40-
41-
train_num = int(count*0.8)
42-
test_num = count-train_num
43-
44-
with open("public/third.txt",'r') as In:
45-
with open("public/train.txt",'w') as train:
46-
for i in range(train_num):
47-
train.write(In.readline())
48-
49-
with open("public/test.txt",'w') as test:
50-
for i in range(test_num):
51-
test.write(In.readline())
52-
53-
54-
def work(train_path,test_path,k):
55-
56-
features=ffp.parsing(train_path,k)
57-
train=ffp.matrix(train_path,features,k)
58-
59-
train_data=train[:,0:len(features)-1]
60-
train_label=train[:,len(features)-1]
61-
62-
test=ffp.matrix(test_path,features,k)
63-
64-
test_data=test[:,0:len(features)-1]
65-
test_label=test[:,len(features)-1]
66-
gamma_range=[0.01,0.1,1.0,10.0]
67-
68-
parameter_grid=[
69-
{'gamma':gamma_range,'kernel':['rbf']},
8+
9+
def preprocess(path, p):
    """Derive the train/test files under ``public/`` from a raw input file.

    Steps, in order:

    1. ``frequency.convert_index(path, "public/first.txt", p)`` is called and
       its return value is used as the collection of index strings to
       replace. (Presumably it also produces ``public/first.txt``; confirm
       against the ``frequency`` module.)
    2. Every returned index occurring in ``public/first.txt`` is replaced by
       ``initial.initial(index)``; the non-blank lines of the result are
       written to ``public/second.txt``.
    3. Lines of ``public/second.txt`` whose first character is ``'2'`` or
       ``'0'`` are printed and copied to ``public/third.txt``.
    4. The first 80% (rounded down) of those lines go to
       ``public/train.txt`` and the remainder to ``public/test.txt``.

    Args:
        path: Path of the raw input file, forwarded to
            ``frequency.convert_index``.
        p: Parameter forwarded unchanged to ``frequency.convert_index``.

    Returns:
        None. All results are written to files under ``public/``.
    """
    indexes = frequency.convert_index(path, "public/first.txt", p)

    with open("public/first.txt", 'r') as f:
        read = f.read()
    for index in indexes:
        read = read.replace(index, initial.initial(index))

    with open("public/second.txt", 'w') as f:
        for line in read.split('\n'):
            # split('\n') strips the newlines, so a blank line shows up as
            # '' (never '\n'); test for emptiness to actually skip it.
            if line:
                f.write(line + '\n')

    count = 0
    with open("public/second.txt", 'r') as In:
        with open("public/third.txt", 'w') as Out:
            for line in In:
                # Keep only the records labelled '2' or '0'.
                if line[0] == '2' or line[0] == '0':
                    print(line)
                    count += 1
                    Out.write(line)

    # 80/20 split in file order; the test set takes whatever is left over.
    train_num = int(count*0.8)
    test_num = count - train_num

    with open("public/third.txt", 'r') as In:
        with open("public/train.txt", 'w') as train:
            for _ in range(train_num):
                train.write(In.readline())

        # Continue reading from where the training split stopped.
        with open("public/test.txt", 'w') as test:
            for _ in range(test_num):
                test.write(In.readline())
43+
44+
45+
def work(train_path, test_path, k):
46+
47+
features = ffp.parsing(train_path, k)
48+
train = ffp.matrix(train_path, features, k)
49+
50+
train_data = train[:, 0:len(features)-1]
51+
train_label = train[:, len(features)-1]
52+
53+
test=ffp.matrix(test_path, features, k)
54+
55+
test_data = test[:, 0:len(features)-1]
56+
test_label = test[:, len(features)-1]
57+
gamma_range = [0.01, 0.1, 1.0, 10.0]
58+
59+
parameter_grid = [
60+
{'gamma': gamma_range, 'kernel': ['rbf']},
7061
]
71-
grid=GridSearchCV(SVC(),parameter_grid,scoring='accuracy',cv=5)
72-
grid.fit(train_data,train_label)
73-
print('best params:',grid.best_params_)
62+
grid=GridSearchCV(SVC(), parameter_grid, scoring='accuracy', cv=5)
63+
grid.fit(train_data, train_label)
64+
print('best params:', grid.best_params_)
7465

7566
# Refit an SVC with the best parameters found by the grid search, then evaluate it
76-
clf=SVC(**grid.best_params_)
77-
clf=clf.fit(train_data,train_label)
67+
clf = SVC(**grid.best_params_)
68+
clf = clf.fit(train_data, train_label)
7869

79-
filename = 'save_model.sav'
80-
pickle.dump(clf, open(filename, 'wb'))
70+
filename = 'save_model.sav'
71+
pickle.dump(clf, open(filename, 'wb'))
8172

82-
pred=clf.predict(train_data)
83-
print("Train k :"+str(k)+" = "+str(accuracy_score(train_label,pred)))
73+
pred = clf.predict(train_data)
74+
print("Train k :"+str(k)+" = "+str(accuracy_score(train_label, pred)))
8475

85-
print(classification_report(train_label, pred, target_names=['class 0','class 1']))
76+
print(classification_report(train_label, pred, target_names=['class 0', 'class 1']))
8677

87-
pred=clf.predict(test_data)
88-
print("Test k :"+str(k)+" = "+str(accuracy_score(test_label,pred)))
78+
pred=clf.predict(test_data)
79+
print("Test k :"+str(k)+" = "+str(accuracy_score(test_label, pred)))
8980

90-
print(classification_report(test_label, pred, target_names=['class 0','class 1']))
91-
return accuracy_score(test_label,pred)
81+
print(classification_report(test_label, pred, target_names=['class 0', 'class 1']))
82+
return accuracy_score(test_label, pred)
9283

93-
preprocess("simple.txt",1)
84+
preprocess("simple.txt", 1)
9485

95-
work("public/train.txt","public/test.txt",2)
86+
work("public/train.txt", "public/test.txt", 2)
9687

9788
"""
9889
p = mp.Pool(3)

FFP/module/__pycache__/ffp.cpython-36.pyc

100644100755
File mode changed.

FFP/module/__pycache__/frequency.cpython-36.pyc

100644100755
File mode changed.

FFP/module/__pycache__/initial.cpython-36.pyc

100644100755
File mode changed.

FFP/module/ffp.py

100644100755
Lines changed: 41 additions & 46 deletions
Original file line numberDiff line numberDiff line change
@@ -14,58 +14,53 @@
1414
1515
"""
1616
import numpy as np
17-
import pandas as pd
18-
from pandas import ExcelWriter
19-
from pandas import ExcelFile
2017

21-
22-
def matrix(path,features,k):
23-
24-
num_lines=0
18+
def matrix(path, features, k):
19+
num_lines = 0
2520

2621
# Count the lines in the file so the feature matrix can be allocated up front
27-
with open(path, 'r') as f:
28-
for line in f:
29-
num_lines += 1
22+
with open(path, 'r') as f:
23+
for line in f:
24+
num_lines += 1
3025

31-
array = np.zeros([num_lines,len(features)],dtype='i')
32-
count=0
33-
with open(path,'r') as file_in:
34-
for read in iter(lambda: file_in.readline(),''):
35-
for num in range(1,int(len(read)/k)):
36-
feature=read[num:num+k]
37-
try:
38-
array[count][features.index(feature)]+=1
39-
except Exception:
40-
1+1
41-
if read[0]=='2':
42-
array[count][-1]=2
43-
else:
44-
array[count][-1]=0
45-
count+=1
26+
array = np.zeros([num_lines, len(features)], dtype='i')
27+
count=0
28+
with open(path, 'r') as file_in:
29+
for read in iter(lambda: file_in.readline(), ''):
30+
for num in range(1, int(len(read)/k)):
31+
feature=read[num:num+k]
32+
try:
33+
array[count][features.index(feature)]+=1
34+
except Exception:
35+
1+1
36+
if read[0] == '2':
37+
array[count][-1] = 2
38+
else:
39+
array[count][-1] = 0
40+
count += 1
4641

47-
return array
42+
return array
4843

49-
def parsing(path,k):
44+
def parsing(path, k):
    """Collect the distinct k-character features found in a text file.

    Each line is scanned with a window of width ``k`` at offsets
    ``1 .. len(line) // k - 1``. The line length includes its trailing
    newline, and offset 0 is skipped. Features are returned in order of
    first appearance, followed by the literal column name ``"score"``.

    Args:
        path: Path of the text file to scan.
        k: Width of each feature window, in characters.

    Returns:
        A list of the unique feature strings in first-seen order, with
        ``"score"`` appended as the last element.
    """
    features = []
    # Membership is checked against a set so each lookup is O(1); the list
    # alone preserves the first-seen order that callers index into.
    seen = set()
    with open(path, 'r') as file_in:
        for line in file_in:
            for num in range(1, len(line) // k):
                feature = line[num:num + k]
                if feature not in seen:
                    seen.add(feature)
                    features.append(feature)
    features.append("score")
    return features
5054

51-
features=[]
52-
with open(path,'r') as file_in:
53-
for read in iter(lambda: file_in.readline(),''):
54-
for num in range(1,int(len(read)/k)):
55-
feature=read[num:num+k]
56-
if not feature in features:
57-
features.append(feature)
58-
features.append("score")
59-
return features
6055

6156
def main():
62-
k_range=list(range(2,4))
63-
p_range=list(range(0.5,1,1.5))
64-
for i in k_range:
65-
features=parsing("test.txt",i)
66-
result = matrix("test.txt",features,i)
67-
final=np.shape(result)
68-
'''
69-
print(np.shape(result))
70-
np.savetxt('foo.csv',result,delim
71-
'''
57+
k_range = list(range(2, 4))
58+
p_range = list(range(0.5, 1, 1.5))
59+
for i in k_range:
60+
features = parsing("test.txt", i)
61+
result = matrix("test.txt", features, i)
62+
final = np.shape(result)
63+
'''
64+
print(np.shape(result))
65+
np.savetxt('foo.csv',result,delim
66+
'''

FFP/module/ffp.pyc

100644100755
File mode changed.

0 commit comments

Comments
 (0)