This repository was archived by the owner on Mar 28, 2018. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathPreProcessor.py
61 lines (42 loc) · 1.66 KB
/
PreProcessor.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
# Author Alvaro Esperanca
import numpy as np
import csv
class PreProcessor(object):
    """Build bag-of-words document-term matrices from tab-separated files.

    The vocabulary is learned by ``loadTrainingSet`` and then reused by
    ``loadTestSet`` so both matrices share the same column layout.

    Attributes:
        featureList: vocabulary tokens in first-seen order; position i is
            the token counted in column i of the matrices.
        featureIndexMap: token -> column index, rebuilt from ``featureList``
            every time ``loadTrainingSet`` runs.
    """

    def __init__(self):
        self.featureList = list()
        self.featureIndexMap = dict()

    def createDTM(self, documentSet):
        """Return the document-term count matrix for ``documentSet``.

        Each document is split on whitespace and every token that has a
        column in ``featureIndexMap`` is counted; unknown tokens are ignored.

        Args:
            documentSet: sequence of document strings.

        Returns:
            Integer ``numpy`` array of shape
            ``(len(documentSet), len(self.featureList))``. The shape is
            two-dimensional even when ``documentSet`` is empty.
        """
        dtm = np.zeros((len(documentSet), len(self.featureList)), dtype=int)
        # Dict lookup instead of scanning featureList for every token.
        columnOf = self.featureIndexMap
        for row, document in enumerate(documentSet):
            for feature in document.split():
                column = columnOf.get(feature)
                if column is not None:
                    dtm[row, column] += 1
        return dtm

    def loadTrainingSet(self, trainSetFilepath):
        """Read the training file, extend the vocabulary and vectorize it.

        The file is parsed as tab-separated values with a header row; the
        ``Text`` column holds the document and ``isClean`` its label.

        Args:
            trainSetFilepath: path of the tab-separated training file.

        Returns:
            Tuple ``(X, y)``: ``X`` is the document-term matrix of the
            training documents, ``y`` is a float array with one label per
            document.

        Raises:
            KeyError: if a row lacks the ``Text`` or ``isClean`` column.
            ValueError: if an ``isClean`` value cannot be parsed as a float.
        """
        docList = list()
        labelList = list()
        # ``with`` guarantees the file is closed even if a row fails to parse.
        with open(trainSetFilepath, "r") as trainSetFile:
            documentReader = csv.DictReader(trainSetFile, delimiter='\t')
            for document in documentReader:
                docList.append(document["Text"])
                labelList.append(float(document["isClean"]))

        # A set mirrors featureList so the "already known?" test is O(1)
        # while featureList keeps its first-seen ordering.
        known = set(self.featureList)
        for document in docList:
            for feature in document.split():
                if feature not in known:
                    known.add(feature)
                    self.featureList.append(feature)

        # Rebuild the token -> column mapping from the (possibly grown) list.
        for index, feature in enumerate(self.featureList):
            self.featureIndexMap[feature] = index

        X = self.createDTM(docList)
        y = np.array(labelList)
        return X, y

    def loadTestSet(self, testSetFilePath):
        """Read the test file and vectorize it with the current vocabulary.

        The file is parsed as tab-separated values with a header row; only
        the ``Text`` column is read.

        Args:
            testSetFilePath: path of the tab-separated test file.

        Returns:
            Document-term matrix of the test documents, using the columns
            defined by the current ``featureList``.

        Raises:
            KeyError: if a row lacks the ``Text`` column.
        """
        # The file handle is closed on exit from the ``with`` block.
        with open(testSetFilePath, "r") as testSetFile:
            documentReader = csv.DictReader(testSetFile, delimiter='\t')
            docList = [document["Text"] for document in documentReader]
        return self.createDTM(docList)