Skip to content

Commit

Permalink
Merge pull request #83 from EducationalTestingService/feature/handle-warnings
Browse files Browse the repository at this point in the history

Improve warning handling and other minor changes
  • Loading branch information
desilinguist authored Feb 24, 2017
2 parents 0e3b855 + 9be5dd5 commit b19efc0
Show file tree
Hide file tree
Showing 6 changed files with 107 additions and 85 deletions.
10 changes: 8 additions & 2 deletions rsmtool/analysis.py
Original file line number Diff line number Diff line change
Expand Up @@ -422,7 +422,11 @@ def metrics_helper(human_scores, system_scores):
# any cases where either of the scores are NaNs.
df = pd.DataFrame({'human': human_scores,
'system': system_scores}).dropna(how='any')
correlations = pearsonr(df['human'], df['system'])[0]

if len(df['human'].unique()) == 1 or len(df['system'].unique()) == 1:
correlations = np.nan
else:
correlations = pearsonr(df['human'], df['system'])[0]

# compute the min/max/mean/std. dev. for the system and human scores
min_system_score = np.min(system_scores)
Expand All @@ -441,7 +445,9 @@ def metrics_helper(human_scores, system_scores):
# by Williamson et al (2012)
numerator = mean_system_score - mean_human_score
denominator = np.sqrt((system_score_sd**2 + human_score_sd**2) / 2)
SMD = numerator/denominator

# if the denominator is zero, then return NaN as the SMD
SMD = np.nan if denominator == 0 else numerator/denominator

# compute r2 and MSE
r2 = r2_score(human_scores, system_scores)
Expand Down
15 changes: 9 additions & 6 deletions rsmtool/input.py
Original file line number Diff line number Diff line change
Expand Up @@ -73,12 +73,15 @@ def read_data_file(filename, converters=None):
raise ValueError("RSMTool only supports files in .csv, .tsv or .xls/.xlsx format. "
"The file should have the extension which matches its format.")

try:
df = do_read(filename)
except pd.parser.CParserError:
raise pd.parser.CParserError('Cannot read {}. Please check that it is '
'not corrupt or in an incompatible format. '
'(Try running dos2unix?)'.format(filename))
# ignore warnings about mixed data types for large files
with warnings.catch_warnings():
warnings.filterwarnings('ignore', category=pd.io.common.DtypeWarning)
try:
df = do_read(filename)
except pd.parser.CParserError:
raise pd.parser.CParserError('Cannot read {}. Please check that it is '
'not corrupt or in an incompatible format. '
'(Try running dos2unix?)'.format(filename))
return df


Expand Down
151 changes: 77 additions & 74 deletions rsmtool/notebooks/header.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -203,86 +203,89 @@
"source": [
"# Read in the training and testing features, both raw and pre-processed\n",
"# Make sure that the `spkitemid` column is read as a string\n",
"# We filter DtypeWarnings that pop up mostly in very large files\n",
"\n",
"if exists(train_file_location):\n",
" df_train_orig = read_data_file(train_file_location)\n",
"with warnings.catch_warnings():\n",
" warnings.filterwarnings('ignore', category=pd.io.common.DtypeWarning)\n",
" if exists(train_file_location):\n",
" df_train_orig = read_data_file(train_file_location)\n",
"\n",
"train_file = join(output_dir, '{}_train_features.csv'.format(experiment_id))\n",
"if exists(train_file):\n",
" df_train = pd.read_csv(train_file, converters={'spkitemid': str})\n",
" \n",
"train_metadata_file = join(output_dir, '{}_train_metadata.csv'.format(experiment_id)) \n",
"if exists(train_metadata_file):\n",
" df_train_metadata = pd.read_csv(train_metadata_file, converters={'spkitemid': str})\n",
" train_file = join(output_dir, '{}_train_features.csv'.format(experiment_id))\n",
" if exists(train_file):\n",
" df_train = pd.read_csv(train_file, converters={'spkitemid': str})\n",
"\n",
"train_other_columns_file = join(output_dir, '{}_train_other_columns.csv'.format(experiment_id))\n",
"if exists(train_other_columns_file):\n",
" df_train_other_columns = pd.read_csv(train_other_columns_file, converters={'spkitemid': str})\n",
" train_metadata_file = join(output_dir, '{}_train_metadata.csv'.format(experiment_id)) \n",
" if exists(train_metadata_file):\n",
" df_train_metadata = pd.read_csv(train_metadata_file, converters={'spkitemid': str})\n",
"\n",
"train_length_file = join(output_dir, '{}_train_response_lengths.csv'.format(experiment_id))\n",
"if exists(train_length_file):\n",
" df_train_length = pd.read_csv(train_length_file, converters={'spkitemid': str})\n",
" \n",
"train_excluded_file = join(output_dir, '{}_train_excluded_responses.csv'.format(experiment_id))\n",
"if exists(train_excluded_file):\n",
" df_train_excluded = pd.read_csv(train_excluded_file, converters={'spkitemid': str})\n",
" \n",
"train_responses_with_excluded_flags_file = join(output_dir, '{}_train_responses_with_excluded_flags.csv'.format(experiment_id))\n",
"if exists(train_responses_with_excluded_flags_file):\n",
" df_train_responses_with_excluded_flags = pd.read_csv(train_responses_with_excluded_flags_file, converters={'spkitemid': str})\n",
" \n",
"train_preproc_file = join(output_dir, '{}_train_preprocessed_features.csv'.format(experiment_id)) \n",
"if exists(train_preproc_file):\n",
" df_train_preproc = pd.read_csv(train_preproc_file, converters={'spkitemid': str})\n",
" \n",
"if exists(test_file_location):\n",
" df_test_orig = read_data_file(test_file_location)\n",
" train_other_columns_file = join(output_dir, '{}_train_other_columns.csv'.format(experiment_id))\n",
" if exists(train_other_columns_file):\n",
" df_train_other_columns = pd.read_csv(train_other_columns_file, converters={'spkitemid': str})\n",
"\n",
"test_file = join(output_dir, '{}_test_features.csv'.format(experiment_id))\n",
"if exists(test_file):\n",
" df_test = pd.read_csv(test_file, converters={'spkitemid': str})\n",
" train_length_file = join(output_dir, '{}_train_response_lengths.csv'.format(experiment_id))\n",
" if exists(train_length_file):\n",
" df_train_length = pd.read_csv(train_length_file, converters={'spkitemid': str})\n",
"\n",
"test_metadata_file = join(output_dir, '{}_test_metadata.csv'.format(experiment_id)) \n",
"if exists(test_metadata_file):\n",
" df_test_metadata = pd.read_csv(test_metadata_file, converters={'spkitemid': str})\n",
" \n",
"test_other_columns_file = join(output_dir, '{}_test_other_columns.csv'.format(experiment_id))\n",
"if exists(test_other_columns_file):\n",
" df_test_other_columns = pd.read_csv(test_other_columns_file, converters={'spkitemid': str})\n",
"\n",
"test_human_scores_file = join(output_dir, '{}_test_human_scores.csv'.format(experiment_id))\n",
"if exists(test_human_scores_file):\n",
" df_test_human_scores = pd.read_csv(test_human_scores_file, converters={'spkitemid': str})\n",
" \n",
"test_excluded_file = join(output_dir, '{}_test_excluded_responses.csv'.format(experiment_id))\n",
"if exists(test_excluded_file):\n",
" df_test_excluded = pd.read_csv(test_excluded_file, converters={'spkitemid': str})\n",
" \n",
"test_responses_with_excluded_flags_file = join(output_dir, '{}_test_responses_with_excluded_flags.csv'.format(experiment_id))\n",
"if exists(test_responses_with_excluded_flags_file):\n",
" df_test_responses_with_excluded_flags = pd.read_csv(test_responses_with_excluded_flags_file, converters={'spkitemid': str})\n",
"\n",
"test_preproc_file = join(output_dir, '{}_test_preprocessed_features.csv'.format(experiment_id))\n",
"if exists(test_preproc_file):\n",
" df_test_preproc = pd.read_csv(test_preproc_file, converters={'spkitemid': str})\n",
"\n",
"pred_preproc_file = join(output_dir, '{}_pred_processed.csv'.format(experiment_id))\n",
"if exists(pred_preproc_file):\n",
" df_pred_preproc = pd.read_csv(pred_preproc_file, converters={'spkitemid': str})\n",
"\n",
"feature_file = join(output_dir, '{}_feature.csv'.format(experiment_id))\n",
"if exists(feature_file):\n",
" df_features = pd.read_csv(feature_file, converters={'spkitemid': str})\n",
" features_used = [c for c in df_features.feature.values]\n",
" \n",
"betas_file = join(output_dir, '{}_betas.csv'.format(experiment_id))\n",
"if exists(betas_file):\n",
" df_betas = pd.read_csv(betas_file)\n",
" \n",
"if exists(feature_subset_file):\n",
" df_feature_subset_specs = read_data_file(feature_subset_file)\n",
"else:\n",
" df_feature_subset_specs = None\n",
" train_excluded_file = join(output_dir, '{}_train_excluded_responses.csv'.format(experiment_id))\n",
" if exists(train_excluded_file):\n",
" df_train_excluded = pd.read_csv(train_excluded_file, converters={'spkitemid': str})\n",
"\n",
" train_responses_with_excluded_flags_file = join(output_dir, '{}_train_responses_with_excluded_flags.csv'.format(experiment_id))\n",
" if exists(train_responses_with_excluded_flags_file):\n",
" df_train_responses_with_excluded_flags = pd.read_csv(train_responses_with_excluded_flags_file, converters={'spkitemid': str})\n",
"\n",
" train_preproc_file = join(output_dir, '{}_train_preprocessed_features.csv'.format(experiment_id)) \n",
" if exists(train_preproc_file):\n",
" df_train_preproc = pd.read_csv(train_preproc_file, converters={'spkitemid': str})\n",
"\n",
" if exists(test_file_location):\n",
" df_test_orig = read_data_file(test_file_location)\n",
"\n",
" test_file = join(output_dir, '{}_test_features.csv'.format(experiment_id))\n",
" if exists(test_file):\n",
" df_test = pd.read_csv(test_file, converters={'spkitemid': str})\n",
"\n",
" test_metadata_file = join(output_dir, '{}_test_metadata.csv'.format(experiment_id)) \n",
" if exists(test_metadata_file):\n",
" df_test_metadata = pd.read_csv(test_metadata_file, converters={'spkitemid': str})\n",
"\n",
" test_other_columns_file = join(output_dir, '{}_test_other_columns.csv'.format(experiment_id))\n",
" if exists(test_other_columns_file):\n",
" df_test_other_columns = pd.read_csv(test_other_columns_file, converters={'spkitemid': str})\n",
"\n",
" test_human_scores_file = join(output_dir, '{}_test_human_scores.csv'.format(experiment_id))\n",
" if exists(test_human_scores_file):\n",
" df_test_human_scores = pd.read_csv(test_human_scores_file, converters={'spkitemid': str})\n",
"\n",
" test_excluded_file = join(output_dir, '{}_test_excluded_responses.csv'.format(experiment_id))\n",
" if exists(test_excluded_file):\n",
" df_test_excluded = pd.read_csv(test_excluded_file, converters={'spkitemid': str})\n",
"\n",
" test_responses_with_excluded_flags_file = join(output_dir, '{}_test_responses_with_excluded_flags.csv'.format(experiment_id))\n",
" if exists(test_responses_with_excluded_flags_file):\n",
" df_test_responses_with_excluded_flags = pd.read_csv(test_responses_with_excluded_flags_file, converters={'spkitemid': str})\n",
"\n",
" test_preproc_file = join(output_dir, '{}_test_preprocessed_features.csv'.format(experiment_id))\n",
" if exists(test_preproc_file):\n",
" df_test_preproc = pd.read_csv(test_preproc_file, converters={'spkitemid': str})\n",
"\n",
" pred_preproc_file = join(output_dir, '{}_pred_processed.csv'.format(experiment_id))\n",
" if exists(pred_preproc_file):\n",
" df_pred_preproc = pd.read_csv(pred_preproc_file, converters={'spkitemid': str})\n",
"\n",
" feature_file = join(output_dir, '{}_feature.csv'.format(experiment_id))\n",
" if exists(feature_file):\n",
" df_features = pd.read_csv(feature_file, converters={'spkitemid': str})\n",
" features_used = [c for c in df_features.feature.values]\n",
"\n",
" betas_file = join(output_dir, '{}_betas.csv'.format(experiment_id))\n",
" if exists(betas_file):\n",
" df_betas = pd.read_csv(betas_file)\n",
"\n",
" if exists(feature_subset_file):\n",
" df_feature_subset_specs = read_data_file(feature_subset_file)\n",
" else:\n",
" df_feature_subset_specs = None\n",
"\n"
]
}
Expand Down
2 changes: 1 addition & 1 deletion rsmtool/notebooks/skll_model.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -83,7 +83,7 @@
" coef = coef.toarray()[0]\n",
"\n",
" # inverse transform to get indices for before feature selection\n",
" coef = learner.feat_selector.inverse_transform(coef)[0]\n",
" coef = learner.feat_selector.inverse_transform(coef.reshape(1, -1))[0]\n",
" for feat, idx in iteritems(learner.feat_vectorizer.vocabulary_):\n",
" if coef[idx]:\n",
" weights[feat] = coef[idx]\n",
Expand Down
10 changes: 8 additions & 2 deletions rsmtool/test_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@
from rsmtool.rsmsummarize import run_summary

html_error_regexp = re.compile(r'Traceback \(most recent call last\)')
html_warning_regexp = re.compile(r'<div class=".*?output_stderr.*?>')
section_regexp = re.compile(r'<h2>(.*?)</h2>')


Expand Down Expand Up @@ -188,12 +189,17 @@ def check_report(html_file):
Path the HTML report file on disk.
"""
report_errors = 0
report_warnings = 0
with open(html_file, 'r') as htmlf:
for line in htmlf:
m = html_error_regexp.search(line)
if m:
m_error = html_error_regexp.search(line)
if m_error:
report_errors += 1
m_warning = html_warning_regexp.search(line)
if m_warning:
report_warnings += 1
assert_equal(report_errors, 0)
assert_equal(report_warnings, 0)


def check_scaled_coefficients(source, experiment_id):
Expand Down
4 changes: 4 additions & 0 deletions rsmtool/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -413,6 +413,10 @@ def color_highlighter(num, low=0, high=1, prec=3, absolute=False):


def compute_subgroup_plot_params(group_names, num_plots):
"""
Computing subgroup plot and figure parameters based on number of
subgroups and number of plots to be generated.
"""
wrapped_group_names = ['\n'.join(wrap(str(gn), 20)) for gn in group_names]
plot_height = 4 if wrapped_group_names == group_names else 6
num_groups = len(group_names)
Expand Down

0 comments on commit b19efc0

Please sign in to comment.