Skip to content

Commit

Permalink
Merge pull request #83 from EducationalTestingService/feature/handle-warnings
Browse files Browse the repository at this point in the history

Improve warning handling and other minor changes
  • Loading branch information
desilinguist authored Feb 24, 2017
2 parents 0e3b855 + 9be5dd5 commit b19efc0
Show file tree
Hide file tree
Showing 6 changed files with 107 additions and 85 deletions.
10 changes: 8 additions & 2 deletions rsmtool/analysis.py
Original file line number Diff line number Diff line change
Expand Up @@ -422,7 +422,11 @@ def metrics_helper(human_scores, system_scores):
# any cases where either of the scores are NaNs.
df = pd.DataFrame({'human': human_scores,
'system': system_scores}).dropna(how='any')
correlations = pearsonr(df['human'], df['system'])[0]

if len(df['human'].unique()) == 1 or len(df['system'].unique()) == 1:
correlations = np.nan
else:
correlations = pearsonr(df['human'], df['system'])[0]

# compute the min/max/mean/std. dev. for the system and human scores
min_system_score = np.min(system_scores)
Expand All @@ -441,7 +445,9 @@ def metrics_helper(human_scores, system_scores):
# by Williamson et al (2012)
numerator = mean_system_score - mean_human_score
denominator = np.sqrt((system_score_sd**2 + human_score_sd**2) / 2)
SMD = numerator/denominator

# if the denominator is zero, then return NaN as the SMD
SMD = np.nan if denominator == 0 else numerator/denominator

# compute r2 and MSE
r2 = r2_score(human_scores, system_scores)
Expand Down
15 changes: 9 additions & 6 deletions rsmtool/input.py
Original file line number Diff line number Diff line change
Expand Up @@ -73,12 +73,15 @@ def read_data_file(filename, converters=None):
raise ValueError("RSMTool only supports files in .csv, .tsv or .xls/.xlsx format. "
"The file should have the extension which matches its format.")

try:
df = do_read(filename)
except pd.parser.CParserError:
raise pd.parser.CParserError('Cannot read {}. Please check that it is '
'not corrupt or in an incompatible format. '
'(Try running dos2unix?)'.format(filename))
# ignore warnings about mixed data types for large files
with warnings.catch_warnings():
warnings.filterwarnings('ignore', category=pd.io.common.DtypeWarning)
try:
df = do_read(filename)
except pd.parser.CParserError:
raise pd.parser.CParserError('Cannot read {}. Please check that it is '
'not corrupt or in an incompatible format. '
'(Try running dos2unix?)'.format(filename))
return df


Expand Down
151 changes: 77 additions & 74 deletions rsmtool/notebooks/header.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -203,86 +203,89 @@
"source": [
"# Read in the training and testing features, both raw and pre-processed\n",
"# Make sure that the `spkitemid` column is read as a string\n",
"# We filter DtypeWarnings that pop up mostly in very large files\n",
"\n",
"if exists(train_file_location):\n",
" df_train_orig = read_data_file(train_file_location)\n",
"with warnings.catch_warnings():\n",
" warnings.filterwarnings('ignore', category=pd.io.common.DtypeWarning)\n",
" if exists(train_file_location):\n",
" df_train_orig = read_data_file(train_file_location)\n",
"\n",
"train_file = join(output_dir, '{}_train_features.csv'.format(experiment_id))\n",
"if exists(train_file):\n",
" df_train = pd.read_csv(train_file, converters={'spkitemid': str})\n",
" \n",
"train_metadata_file = join(output_dir, '{}_train_metadata.csv'.format(experiment_id)) \n",
"if exists(train_metadata_file):\n",
" df_train_metadata = pd.read_csv(train_metadata_file, converters={'spkitemid': str})\n",
" train_file = join(output_dir, '{}_train_features.csv'.format(experiment_id))\n",
" if exists(train_file):\n",
" df_train = pd.read_csv(train_file, converters={'spkitemid': str})\n",
"\n",
"train_other_columns_file = join(output_dir, '{}_train_other_columns.csv'.format(experiment_id))\n",
"if exists(train_other_columns_file):\n",
" df_train_other_columns = pd.read_csv(train_other_columns_file, converters={'spkitemid': str})\n",
" train_metadata_file = join(output_dir, '{}_train_metadata.csv'.format(experiment_id)) \n",
" if exists(train_metadata_file):\n",
" df_train_metadata = pd.read_csv(train_metadata_file, converters={'spkitemid': str})\n",
"\n",
"train_length_file = join(output_dir, '{}_train_response_lengths.csv'.format(experiment_id))\n",
"if exists(train_length_file):\n",
" df_train_length = pd.read_csv(train_length_file, converters={'spkitemid': str})\n",
" \n",
"train_excluded_file = join(output_dir, '{}_train_excluded_responses.csv'.format(experiment_id))\n",
"if exists(train_excluded_file):\n",
" df_train_excluded = pd.read_csv(train_excluded_file, converters={'spkitemid': str})\n",
" \n",
"train_responses_with_excluded_flags_file = join(output_dir, '{}_train_responses_with_excluded_flags.csv'.format(experiment_id))\n",
"if exists(train_responses_with_excluded_flags_file):\n",
" df_train_responses_with_excluded_flags = pd.read_csv(train_responses_with_excluded_flags_file, converters={'spkitemid': str})\n",
" \n",
"train_preproc_file = join(output_dir, '{}_train_preprocessed_features.csv'.format(experiment_id)) \n",
"if exists(train_preproc_file):\n",
" df_train_preproc = pd.read_csv(train_preproc_file, converters={'spkitemid': str})\n",
" \n",
"if exists(test_file_location):\n",
" df_test_orig = read_data_file(test_file_location)\n",
" train_other_columns_file = join(output_dir, '{}_train_other_columns.csv'.format(experiment_id))\n",
" if exists(train_other_columns_file):\n",
" df_train_other_columns = pd.read_csv(train_other_columns_file, converters={'spkitemid': str})\n",
"\n",
"test_file = join(output_dir, '{}_test_features.csv'.format(experiment_id))\n",
"if exists(test_file):\n",
" df_test = pd.read_csv(test_file, converters={'spkitemid': str})\n",
" train_length_file = join(output_dir, '{}_train_response_lengths.csv'.format(experiment_id))\n",
" if exists(train_length_file):\n",
" df_train_length = pd.read_csv(train_length_file, converters={'spkitemid': str})\n",
"\n",
"test_metadata_file = join(output_dir, '{}_test_metadata.csv'.format(experiment_id)) \n",
"if exists(test_metadata_file):\n",
" df_test_metadata = pd.read_csv(test_metadata_file, converters={'spkitemid': str})\n",
" \n",
"test_other_columns_file = join(output_dir, '{}_test_other_columns.csv'.format(experiment_id))\n",
"if exists(test_other_columns_file):\n",
" df_test_other_columns = pd.read_csv(test_other_columns_file, converters={'spkitemid': str})\n",
"\n",
"test_human_scores_file = join(output_dir, '{}_test_human_scores.csv'.format(experiment_id))\n",
"if exists(test_human_scores_file):\n",
" df_test_human_scores = pd.read_csv(test_human_scores_file, converters={'spkitemid': str})\n",
" \n",
"test_excluded_file = join(output_dir, '{}_test_excluded_responses.csv'.format(experiment_id))\n",
"if exists(test_excluded_file):\n",
" df_test_excluded = pd.read_csv(test_excluded_file, converters={'spkitemid': str})\n",
" \n",
"test_responses_with_excluded_flags_file = join(output_dir, '{}_test_responses_with_excluded_flags.csv'.format(experiment_id))\n",
"if exists(test_responses_with_excluded_flags_file):\n",
" df_test_responses_with_excluded_flags = pd.read_csv(test_responses_with_excluded_flags_file, converters={'spkitemid': str})\n",
"\n",
"test_preproc_file = join(output_dir, '{}_test_preprocessed_features.csv'.format(experiment_id))\n",
"if exists(test_preproc_file):\n",
" df_test_preproc = pd.read_csv(test_preproc_file, converters={'spkitemid': str})\n",
"\n",
"pred_preproc_file = join(output_dir, '{}_pred_processed.csv'.format(experiment_id))\n",
"if exists(pred_preproc_file):\n",
" df_pred_preproc = pd.read_csv(pred_preproc_file, converters={'spkitemid': str})\n",
"\n",
"feature_file = join(output_dir, '{}_feature.csv'.format(experiment_id))\n",
"if exists(feature_file):\n",
" df_features = pd.read_csv(feature_file, converters={'spkitemid': str})\n",
" features_used = [c for c in df_features.feature.values]\n",
" \n",
"betas_file = join(output_dir, '{}_betas.csv'.format(experiment_id))\n",
"if exists(betas_file):\n",
" df_betas = pd.read_csv(betas_file)\n",
" \n",
"if exists(feature_subset_file):\n",
" df_feature_subset_specs = read_data_file(feature_subset_file)\n",
"else:\n",
" df_feature_subset_specs = None\n",
" train_excluded_file = join(output_dir, '{}_train_excluded_responses.csv'.format(experiment_id))\n",
" if exists(train_excluded_file):\n",
" df_train_excluded = pd.read_csv(train_excluded_file, converters={'spkitemid': str})\n",
"\n",
" train_responses_with_excluded_flags_file = join(output_dir, '{}_train_responses_with_excluded_flags.csv'.format(experiment_id))\n",
" if exists(train_responses_with_excluded_flags_file):\n",
" df_train_responses_with_excluded_flags = pd.read_csv(train_responses_with_excluded_flags_file, converters={'spkitemid': str})\n",
"\n",
" train_preproc_file = join(output_dir, '{}_train_preprocessed_features.csv'.format(experiment_id)) \n",
" if exists(train_preproc_file):\n",
" df_train_preproc = pd.read_csv(train_preproc_file, converters={'spkitemid': str})\n",
"\n",
" if exists(test_file_location):\n",
" df_test_orig = read_data_file(test_file_location)\n",
"\n",
" test_file = join(output_dir, '{}_test_features.csv'.format(experiment_id))\n",
" if exists(test_file):\n",
" df_test = pd.read_csv(test_file, converters={'spkitemid': str})\n",
"\n",
" test_metadata_file = join(output_dir, '{}_test_metadata.csv'.format(experiment_id)) \n",
" if exists(test_metadata_file):\n",
" df_test_metadata = pd.read_csv(test_metadata_file, converters={'spkitemid': str})\n",
"\n",
" test_other_columns_file = join(output_dir, '{}_test_other_columns.csv'.format(experiment_id))\n",
" if exists(test_other_columns_file):\n",
" df_test_other_columns = pd.read_csv(test_other_columns_file, converters={'spkitemid': str})\n",
"\n",
" test_human_scores_file = join(output_dir, '{}_test_human_scores.csv'.format(experiment_id))\n",
" if exists(test_human_scores_file):\n",
" df_test_human_scores = pd.read_csv(test_human_scores_file, converters={'spkitemid': str})\n",
"\n",
" test_excluded_file = join(output_dir, '{}_test_excluded_responses.csv'.format(experiment_id))\n",
" if exists(test_excluded_file):\n",
" df_test_excluded = pd.read_csv(test_excluded_file, converters={'spkitemid': str})\n",
"\n",
" test_responses_with_excluded_flags_file = join(output_dir, '{}_test_responses_with_excluded_flags.csv'.format(experiment_id))\n",
" if exists(test_responses_with_excluded_flags_file):\n",
" df_test_responses_with_excluded_flags = pd.read_csv(test_responses_with_excluded_flags_file, converters={'spkitemid': str})\n",
"\n",
" test_preproc_file = join(output_dir, '{}_test_preprocessed_features.csv'.format(experiment_id))\n",
" if exists(test_preproc_file):\n",
" df_test_preproc = pd.read_csv(test_preproc_file, converters={'spkitemid': str})\n",
"\n",
" pred_preproc_file = join(output_dir, '{}_pred_processed.csv'.format(experiment_id))\n",
" if exists(pred_preproc_file):\n",
" df_pred_preproc = pd.read_csv(pred_preproc_file, converters={'spkitemid': str})\n",
"\n",
" feature_file = join(output_dir, '{}_feature.csv'.format(experiment_id))\n",
" if exists(feature_file):\n",
" df_features = pd.read_csv(feature_file, converters={'spkitemid': str})\n",
" features_used = [c for c in df_features.feature.values]\n",
"\n",
" betas_file = join(output_dir, '{}_betas.csv'.format(experiment_id))\n",
" if exists(betas_file):\n",
" df_betas = pd.read_csv(betas_file)\n",
"\n",
" if exists(feature_subset_file):\n",
" df_feature_subset_specs = read_data_file(feature_subset_file)\n",
" else:\n",
" df_feature_subset_specs = None\n",
"\n"
]
}
Expand Down
2 changes: 1 addition & 1 deletion rsmtool/notebooks/skll_model.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -83,7 +83,7 @@
" coef = coef.toarray()[0]\n",
"\n",
" # inverse transform to get indices for before feature selection\n",
" coef = learner.feat_selector.inverse_transform(coef)[0]\n",
" coef = learner.feat_selector.inverse_transform(coef.reshape(1, -1))[0]\n",
" for feat, idx in iteritems(learner.feat_vectorizer.vocabulary_):\n",
" if coef[idx]:\n",
" weights[feat] = coef[idx]\n",
Expand Down
10 changes: 8 additions & 2 deletions rsmtool/test_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@
from rsmtool.rsmsummarize import run_summary

html_error_regexp = re.compile(r'Traceback \(most recent call last\)')
html_warning_regexp = re.compile(r'<div class=".*?output_stderr.*?>')
section_regexp = re.compile(r'<h2>(.*?)</h2>')


Expand Down Expand Up @@ -188,12 +189,17 @@ def check_report(html_file):
Path the HTML report file on disk.
"""
report_errors = 0
report_warnings = 0
with open(html_file, 'r') as htmlf:
for line in htmlf:
m = html_error_regexp.search(line)
if m:
m_error = html_error_regexp.search(line)
if m_error:
report_errors += 1
m_warning = html_warning_regexp.search(line)
if m_warning:
report_warnings += 1
assert_equal(report_errors, 0)
assert_equal(report_warnings, 0)


def check_scaled_coefficients(source, experiment_id):
Expand Down
4 changes: 4 additions & 0 deletions rsmtool/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -413,6 +413,10 @@ def color_highlighter(num, low=0, high=1, prec=3, absolute=False):


def compute_subgroup_plot_params(group_names, num_plots):
"""
Computing subgroup plot and figure parameters based on number of
subgroups and number of plots to be generated.
"""
wrapped_group_names = ['\n'.join(wrap(str(gn), 20)) for gn in group_names]
plot_height = 4 if wrapped_group_names == group_names else 6
num_groups = len(group_names)
Expand Down

0 comments on commit b19efc0

Please sign in to comment.