-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathproblem_unittests.py
153 lines (115 loc) · 5.97 KB
/
problem_unittests.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
from unittest.mock import MagicMock, patch
import sklearn.naive_bayes
import numpy as np
import pandas as pd
import re
# test csv file
TEST_CSV = 'data/test_info.csv'
class AssertTest(object):
'''Defines general test behavior.'''
def __init__(self, params):
self.assert_param_message = '\n'.join([str(k) + ': ' + str(v) + '' for k, v in params.items()])
def test(self, assert_condition, assert_message):
assert assert_condition, assert_message + '\n\nUnit Test Function Parameters\n' + self.assert_param_message
def _print_success_message():
print('Tests Passed!')
# test clean_dataframe
def test_numerical_df(numerical_dataframe):
# test result
transformed_df = numerical_dataframe(TEST_CSV)
# Check type is a DataFrame
assert isinstance(transformed_df, pd.DataFrame), 'Returned type is {}.'.format(type(transformed_df))
# check columns
column_names = list(transformed_df)
assert 'File' in column_names, 'No File column, found.'
assert 'Task' in column_names, 'No Task column, found.'
assert 'Category' in column_names, 'No Category column, found.'
assert 'Class' in column_names, 'No Class column, found.'
# check conversion values
assert transformed_df.loc[0, 'Category'] == 1, '`heavy` plagiarism mapping test, failed.'
assert transformed_df.loc[2, 'Category'] == 0, '`non` plagiarism mapping test, failed.'
assert transformed_df.loc[30, 'Category'] == 3, '`cut` plagiarism mapping test, failed.'
assert transformed_df.loc[5, 'Category'] == 2, '`light` plagiarism mapping test, failed.'
assert transformed_df.loc[37, 'Category'] == -1, 'original file mapping test, failed; should have a Category = -1.'
assert transformed_df.loc[41, 'Category'] == -1, 'original file mapping test, failed; should have a Category = -1.'
_print_success_message()
def test_containment(complete_df, containment_fn):
# check basic format and value
# for n = 1 and just the fifth file
test_val = containment_fn(complete_df, 1, 'g0pA_taske.txt')
assert isinstance(test_val, float), 'Returned type is {}.'.format(type(test_val))
assert test_val<=1.0, 'It appears that the value is not normalized; expected a value <=1, got: '+str(test_val)
# known vals for first few files
filenames = ['g0pA_taska.txt', 'g0pA_taskb.txt', 'g0pA_taskc.txt', 'g0pA_taskd.txt']
ngram_1 = [0.39814814814814814, 1.0, 0.86936936936936937, 0.5935828877005348]
ngram_3 = [0.0093457943925233638, 0.96410256410256412, 0.61363636363636365, 0.15675675675675677]
# results for comparison
results_1gram = []
results_3gram = []
for i in range(4):
val_1 = containment_fn(complete_df, 1, filenames[i])
val_3 = containment_fn(complete_df, 3, filenames[i])
results_1gram.append(val_1)
results_3gram.append(val_3)
# check correct results
assert all(np.isclose(results_1gram, ngram_1, rtol=1e-04)), \
'n=1 calculations are incorrect. Double check the intersection calculation.'
# check correct results
assert all(np.isclose(results_3gram, ngram_3, rtol=1e-04)), \
'n=3 calculations are incorrect.'
_print_success_message()
def test_lcs(df, lcs_word):
test_index = 10 # file 10
# get answer file text
answer_text = df.loc[test_index, 'Text']
# get text for orig file
# find the associated task type (one character, a-e)
task = df.loc[test_index, 'Task']
# we know that source texts have Class = -1
orig_rows = df[(df['Class'] == -1)]
orig_row = orig_rows[(orig_rows['Task'] == task)]
source_text = orig_row['Text'].values[0]
# calculate LCS
test_val = lcs_word(answer_text, source_text)
# check type
assert isinstance(test_val, float), 'Returned type is {}.'.format(type(test_val))
assert test_val<=1.0, 'It appears that the value is not normalized; expected a value <=1, got: '+str(test_val)
# known vals for first few files
lcs_vals = [0.1917808219178082, 0.8207547169811321, 0.8464912280701754, 0.3160621761658031, 0.24257425742574257]
# results for comparison
results = []
for i in range(5):
# get answer and source text
answer_text = df.loc[i, 'Text']
task = df.loc[i, 'Task']
# we know that source texts have Class = -1
orig_rows = df[(df['Class'] == -1)]
orig_row = orig_rows[(orig_rows['Task'] == task)]
source_text = orig_row['Text'].values[0]
# calc lcs
val = lcs_word(answer_text, source_text)
results.append(val)
# check correct results
assert all(np.isclose(results, lcs_vals, rtol=1e-05)), 'LCS calculations are incorrect.'
_print_success_message()
def test_data_split(train_x, train_y, test_x, test_y):
# check types
assert isinstance(train_x, np.ndarray),\
'train_x is not an array, instead got type: {}'.format(type(train_x))
assert isinstance(train_y, np.ndarray),\
'train_y is not an array, instead got type: {}'.format(type(train_y))
assert isinstance(test_x, np.ndarray),\
'test_x is not an array, instead got type: {}'.format(type(test_x))
assert isinstance(test_y, np.ndarray),\
'test_y is not an array, instead got type: {}'.format(type(test_y))
# should hold all 95 submission files
assert len(train_x) + len(test_x) == 95, \
'Unexpected amount of train + test data. Expecting 95 answer text files, got ' +str(len(train_x) + len(test_x))
assert len(test_x) > 1, \
'Unexpected amount of test data. There should be multiple test files.'
# check shape
assert train_x.shape[1]==2, \
'train_x should have as many columns as selected features, got: {}'.format(train_x.shape[1])
assert len(train_y.shape)==1, \
'train_y should be a 1D array, got shape: {}'.format(train_y.shape)
_print_success_message()