diff --git a/delphi/polismath/benchmarks/__init__.py b/delphi/polismath/benchmarks/__init__.py new file mode 100644 index 0000000000..d6408e0378 --- /dev/null +++ b/delphi/polismath/benchmarks/__init__.py @@ -0,0 +1 @@ +"""Benchmark scripts for polismath performance testing.""" diff --git a/delphi/polismath/benchmarks/bench_repness.py b/delphi/polismath/benchmarks/bench_repness.py new file mode 100755 index 0000000000..ba2be3fd89 --- /dev/null +++ b/delphi/polismath/benchmarks/bench_repness.py @@ -0,0 +1,187 @@ +#!/usr/bin/env python3 +""" +Benchmark script for repness (representativeness) computation performance. + +Usage: + cd delphi + ../.venv/bin/python -m polismath.benchmarks.bench_repness [--runs N] + ../.venv/bin/python -m polismath.benchmarks.bench_repness --profile + +Example: + ../.venv/bin/python -m polismath.benchmarks.bench_repness real_data/.local/r7wehfsmutrwndviddnii-bg2050/2025-11-25-1909-r7wehfsmutrwndviddnii-votes.csv --runs 3 + ../.venv/bin/python -m polismath.benchmarks.bench_repness real_data/.local/r7wehfsmutrwndviddnii-bg2050/2025-11-25-1909-r7wehfsmutrwndviddnii-votes.csv --profile +""" +# TODO(datasets): Once PR https://github.com/compdemocracy/polis/pull/2312 is merged, +# use the datasets package with include_local=True instead of requiring a path argument. + +import time +from pathlib import Path + +import click + +from polismath.benchmarks.benchmark_utils import ( + load_votes_from_csv, + extract_dataset_name, + votes_csv_argument, + runs_option, +) +from polismath.conversation import Conversation +from polismath.pca_kmeans_rep.repness import ( + conv_repness, + comment_stats, + add_comparative_stats, + finalize_cmt_stats, + select_rep_comments, + compute_group_comment_stats_df, + select_rep_comments_df, + select_consensus_comments_df, + prop_test_vectorized, + two_prop_test_vectorized +) + + +profile_option = click.option( + '--profile', '-p', + is_flag=True, + help='Run with line profiler on conv_repness', +) + + +def setup_conversation(votes_csv: Path) -> tuple[Conversation, str, int, float]: + """ + Load votes and setup conversation with PCA and clusters. + + Args: + votes_csv: Path to votes CSV file + + Returns: + Tuple of (conversation, dataset_name, n_votes, setup_time) + """ + dataset_name = extract_dataset_name(votes_csv) + + print(f"Loading votes from '{votes_csv}'...") + votes_dict = load_votes_from_csv(votes_csv) + n_votes = len(votes_dict['votes']) + print(f"Loaded {n_votes:,} votes") + print() + + print("Setting up conversation with votes and clusters...") + setup_start = time.perf_counter() + conv = Conversation(dataset_name) + conv = conv.update_votes(votes_dict, recompute=False) + conv._compute_pca() + conv._compute_clusters() + setup_time = time.perf_counter() - setup_start + + print(f"Setup completed in {setup_time:.2f}s") + print(f" Matrix shape: {conv.raw_rating_mat.shape}") + print(f" Number of groups: {len(conv.group_clusters)}") + print() + + return conv, dataset_name, n_votes, setup_time + + +def benchmark_repness(votes_csv: Path, runs: int = 3) -> dict: + """ + Benchmark repness computation on a dataset. 
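+    Sets up the conversation (votes, PCA, clustering) once via setup_conversation(),
+    then times repeated calls to the repness computation and reports summary statistics.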
+ + Args: + votes_csv: Path to votes CSV file + runs: Number of runs to average + + Returns: + Dictionary with benchmark results + """ + conv, dataset_name, n_votes, setup_time = setup_conversation(votes_csv) + + # Benchmark repness computation + print(f"Benchmarking repness computation ({runs} runs)...") + times = [] + for i in range(runs): + start = time.perf_counter() + conv._compute_repness() + elapsed = time.perf_counter() - start + times.append(elapsed) + n_rep_comments = sum(len(v) for v in conv.repness.get('group_repness', {}).values()) + print(f" Run {i+1}: {elapsed:.3f}s ({n_rep_comments} representative comments)") + + avg = sum(times) / len(times) + min_time = min(times) + max_time = max(times) + + print() + print("=" * 50) + print(f"Dataset: {dataset_name}") + print(f"Votes: {n_votes:,}") + print(f"Matrix shape: {conv.raw_rating_mat.shape}") + print(f"Groups: {len(conv.group_clusters)}") + print(f"Average repness time: {avg:.3f}s") + print(f"Min/Max: {min_time:.3f}s / {max_time:.3f}s") + + # Calculate comments per second + n_comments = conv.raw_rating_mat.shape[1] + n_participants = conv.raw_rating_mat.shape[0] + n_groups = len(conv.group_clusters) + + # Repness complexity is roughly O(groups * comments * participants) + operations = n_groups * n_comments * n_participants + print(f"Throughput: {operations/avg:,.0f} ops/sec (groups × comments × participants)") + + return { + 'dataset': dataset_name, + 'n_votes': n_votes, + 'shape': conv.raw_rating_mat.shape, + 'n_groups': n_groups, + 'times': times, + 'avg': avg, + 'min': min_time, + 'max': max_time, + 'setup_time': setup_time, + } + + +def profile_repness(votes_csv: Path) -> None: + """ + Run line profiler on conv_repness. + + Args: + votes_csv: Path to votes CSV file + """ + from line_profiler import LineProfiler + conv, _, _, _ = setup_conversation(votes_csv) + + # Setup line profiler + profiler = LineProfiler() + profiler.add_function(conv_repness) + profiler.add_function(compute_group_comment_stats_df) + profiler.add_function(select_rep_comments_df) + profiler.add_function(select_consensus_comments_df) + profiler.add_function(prop_test_vectorized) + profiler.add_function(two_prop_test_vectorized) + + # Run profiled + print("Running conv_repness with line profiler...") + profiler.runcall(conv_repness, conv.rating_mat, conv.group_clusters) + + # Print results + print() + print("=" * 70) + print("LINE PROFILE RESULTS") + print("=" * 70) + profiler.print_stats() + + +@click.command() +@votes_csv_argument +@runs_option +@profile_option +def main(votes_csv: Path, runs: int, profile: bool): + """Benchmark repness computation performance.""" + if profile: + profile_repness(votes_csv) + else: + benchmark_repness(votes_csv, runs) + + +if __name__ == '__main__': + main() diff --git a/delphi/polismath/benchmarks/bench_update_votes.py b/delphi/polismath/benchmarks/bench_update_votes.py new file mode 100755 index 0000000000..2595623433 --- /dev/null +++ b/delphi/polismath/benchmarks/bench_update_votes.py @@ -0,0 +1,91 @@ +#!/usr/bin/env python3 +""" +Benchmark script for update_votes performance. 
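+Each run builds a fresh Conversation and times Conversation.update_votes()
+(with recompute=False), i.e. the cost of building the wide-form vote matrix.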
+ +Usage: + cd delphi + ../.venv/bin/python -m polismath.benchmarks.bench_update_votes [--runs N] + +Example: + ../.venv/bin/python -m polismath.benchmarks.bench_update_votes real_data/.local/r7wehfsmutrwndviddnii-bg2050/2025-11-25-1909-r7wehfsmutrwndviddnii-votes.csv --runs 3 +""" +# TODO(datasets): Once PR https://github.com/compdemocracy/polis/pull/2312 is merged, +# use the datasets package with include_local=True instead of requiring a path argument. + +import time +from pathlib import Path + +import click + +from polismath.benchmarks.benchmark_utils import ( + load_votes_from_csv, + extract_dataset_name, + votes_csv_argument, + runs_option, +) + + +def benchmark_update_votes(votes_csv: Path, runs: int = 3) -> dict: + """ + Benchmark update_votes on a dataset. + + Args: + votes_csv: Path to votes CSV file + runs: Number of runs to average + + Returns: + Dictionary with benchmark results + """ + from polismath.conversation import Conversation + + dataset_name = extract_dataset_name(votes_csv) + + print(f"Loading votes from '{votes_csv}'...") + votes_dict = load_votes_from_csv(votes_csv) + n_votes = len(votes_dict['votes']) + print(f"Loaded {n_votes:,} votes") + print() + + times = [] + for i in range(runs): + conv = Conversation(dataset_name) + start = time.perf_counter() + conv = conv.update_votes(votes_dict, recompute=False) + elapsed = time.perf_counter() - start + times.append(elapsed) + print(f" Run {i+1}: {elapsed:.2f}s") + + avg = sum(times) / len(times) + min_time = min(times) + max_time = max(times) + + print() + print(f"Dataset: {dataset_name}") + print(f"Votes: {n_votes:,}") + print(f"Matrix shape: {conv.raw_rating_mat.shape}") + print(f"Average time: {avg:.2f}s") + print(f"Min/Max: {min_time:.2f}s / {max_time:.2f}s") + print(f"Throughput: {n_votes/avg:,.0f} votes/sec") + + return { + 'dataset': dataset_name, + 'n_votes': n_votes, + 'shape': conv.raw_rating_mat.shape, + 'times': times, + 'avg': avg, + 'min': min_time, + 'max': max_time, + 'throughput': n_votes / avg, + } + + +@click.command() +@votes_csv_argument +@runs_option +def main(votes_csv: Path, runs: int): + """Benchmark update_votes performance.""" + benchmark_update_votes(votes_csv, runs) + + +if __name__ == '__main__': + main() diff --git a/delphi/polismath/benchmarks/benchmark_utils.py b/delphi/polismath/benchmarks/benchmark_utils.py new file mode 100644 index 0000000000..23cb57fc5f --- /dev/null +++ b/delphi/polismath/benchmarks/benchmark_utils.py @@ -0,0 +1,110 @@ +""" +Shared utilities for benchmark scripts. +""" + +import time +from pathlib import Path +from typing import Callable, Dict, Any + +import click +import pandas as pd + + +def load_votes_from_csv(votes_csv: Path) -> dict: + """ + Load votes from a CSV file into the format expected by Conversation.update_votes(). 
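+    Renames the voter-id and comment-id columns to pid and tid, and attaches a
+    'created' timestamp to each vote (taken from the timestamp column when present,
+    otherwise a fixed value for reproducibility).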
+ + Args: + votes_csv: Path to votes CSV file with columns: voter-id, comment-id, vote, timestamp + + Returns: + Dictionary with 'votes' list and 'lastVoteTimestamp' + """ + df = pd.read_csv(votes_csv) + + # Fixed timestamp for reproducibility + fixed_timestamp = 1700000000000 + + # Use vectorized pandas operations instead of iterrows() for efficiency + df = df.rename(columns={ + 'voter-id': 'pid', + 'comment-id': 'tid', + }) + if 'timestamp' in df.columns: + df['created'] = df['timestamp'].astype(int) + else: + df['created'] = fixed_timestamp + + votes_list = df[['pid', 'tid', 'vote', 'created']].to_dict('records') + + return { + 'votes': votes_list, + 'lastVoteTimestamp': fixed_timestamp + } + + +def extract_dataset_name(votes_path: Path) -> str: + """ + Extract dataset name from path. + + Args: + votes_path: Path to votes CSV file + + Returns: + Dataset name (e.g., "r7wehfsmutrwndviddnii-bg2050" -> "bg2050") + """ + parent_name = votes_path.parent.name + if '-' in parent_name: + return parent_name.split('-', 1)[1] + return parent_name + + +def run_benchmark( + func: Callable[[], Any], + runs: int, + description: str = "operation" +) -> Dict[str, Any]: + """ + Run a benchmark function multiple times and collect timing statistics. + + Args: + func: Function to benchmark (called with no arguments) + runs: Number of runs + description: Description for printing + + Returns: + Dictionary with timing statistics and results from last run + """ + times = [] + result = None + for i in range(runs): + start = time.perf_counter() + result = func() + elapsed = time.perf_counter() - start + times.append(elapsed) + print(f" Run {i+1}: {elapsed:.3f}s") + + avg = sum(times) / len(times) + min_time = min(times) + max_time = max(times) + + return { + 'times': times, + 'avg': avg, + 'min': min_time, + 'max': max_time, + 'result': result, + } + + +# Common click options for benchmark scripts +votes_csv_argument = click.argument( + 'votes_csv', + type=click.Path(exists=True, path_type=Path), +) + +runs_option = click.option( + '--runs', '-n', + default=3, + help='Number of benchmark runs (default: 3)', +) diff --git a/delphi/polismath/conversation/conversation.py b/delphi/polismath/conversation/conversation.py index 42c37ca74d..c2ea6c9a8e 100644 --- a/delphi/polismath/conversation/conversation.py +++ b/delphi/polismath/conversation/conversation.py @@ -19,7 +19,6 @@ from polismath.pca_kmeans_rep.clusters import cluster_dataframe from polismath.pca_kmeans_rep.repness import conv_repness, participant_stats from polismath.pca_kmeans_rep.corr import compute_correlation -from polismath.utils.general import agree, disagree, pass_vote # Configure logging @@ -231,25 +230,39 @@ def update_votes(self, logger.info(f"[{time.time() - start_time:.2f}s] Found {len(new_rows)} new rows and {len(new_cols)} new columns") - # Apply all updates in a single batch operation for better performance - # Honestly, we should probably keep the matrix of votes in long-form, - # and only convert to wide-form when requested. - - logger.info(f"[{time.time() - start_time:.2f}s] Applying {len(vote_updates)} votes as batch update...") + # Apply all updates using vectorized pivot_table approach. + # This is much faster than row-by-row iteration because pandas/numpy + # can use optimized C code for the reshape operation. + + logger.info(f"[{time.time() - start_time:.2f}s] Applying {len(updates_df)} votes as batch update...") batch_start = time.time() - # For backward compatibility, sort the rows and columns by label. 
- result.raw_rating_mat = result.raw_rating_mat.reindex(index=all_rows, columns=all_cols, fill_value=np.nan) - # NOTE: we cannot use .loc(rows, cols) = values with rows,cols,and values being Series - # for example `result.raw_rating_mat.loc[updates_df['row'], updates_df['col']] = updates_df['value'].values` - # because pandas then tries to assign to the Cartesian product of rows and cols, and it gets very messy - # and is definitely *not* what we intended. - # We could convert to integer indices with get_loc, then use .value to use numpy assignment (which does not - # do any cartesian product), but a/ it's less legible, b/ there is *no* guarantee at all that .value is always - # a view and not a copy, so we might end up modifying a copy of the data frame. - # Therefore, for simplicity and readability, sticking to an ugly for loop. - # If you have a better idea, let me know at julien@cornebise.com, I would love to know :) - for idx, row_data in updates_df.iterrows(): - result.raw_rating_mat.at[row_data['row'], row_data['col']] = row_data['value'] + + # Build a wide-form matrix from the long-form updates using pivot_table. + # aggfunc='last' keeps the last vote if any duplicates remain after dedup. + update_matrix = updates_df.pivot_table( + index='row', + columns='col', + values='value', + aggfunc='last' + ) + + # Expand the existing matrix to include any new rows/columns. + # fill_value=np.nan ensures new cells start as "no vote". + result.raw_rating_mat = result.raw_rating_mat.reindex( + index=all_rows, columns=all_cols, fill_value=np.nan + ) + + # Align the update matrix to the same shape (new cells become NaN). + update_matrix = update_matrix.reindex(index=all_rows, columns=all_cols) + + # Merge: where update_matrix has a value, use it; otherwise keep original. + # DataFrame.where(cond, other) keeps self where cond is True, uses other where False. + # So: keep raw_rating_mat where update_matrix is NaN, else use update_matrix. + result.raw_rating_mat = result.raw_rating_mat.where( + update_matrix.isna(), # condition: True where update has no value + update_matrix # other: use update value where condition is False + ) + logger.info(f"[{time.time() - start_time:.2f}s] Batch update completed in {time.time() - batch_start:.2f}s") # Update last updated timestamp @@ -292,12 +305,10 @@ def _apply_moderation(self) -> None: def _compute_vote_stats(self) -> None: """ - Compute statistics on votes. + Compute statistics on votes using vectorized operations. """ - # Make sure pandas is imported import numpy as np - import pandas as pd - + # Initialize stats self.vote_stats = { 'n_votes': 0, @@ -307,84 +318,65 @@ def _compute_vote_stats(self) -> None: 'comment_stats': {}, 'participant_stats': {} } - - # Get matrix values and ensure they are numeric + try: - # Make a clean copy that's definitely numeric + # Get clean numeric matrix clean_mat = self._get_clean_matrix() - # TODO: we can probably count without needing to convert to numpy array values = clean_mat.to_numpy() - # Count votes safely + # Create boolean masks once for the entire matrix. + # These are 2D arrays of the same shape as values. 
+ non_null_mask = ~np.isnan(values) + agree_mask = np.abs(values - 1.0) < 0.001 # Close to 1 + disagree_mask = np.abs(values + 1.0) < 0.001 # Close to -1 + + # Global stats: sum over entire matrix try: - # Create masks, handling non-numeric data - non_null_mask = ~np.isnan(values) - agree_mask = np.abs(values - 1.0) < 0.001 # Close to 1 - disagree_mask = np.abs(values + 1.0) < 0.001 # Close to -1 - self.vote_stats['n_votes'] = int(np.sum(non_null_mask)) self.vote_stats['n_agree'] = int(np.sum(agree_mask)) self.vote_stats['n_disagree'] = int(np.sum(disagree_mask)) - self.vote_stats['n_pass'] = int(np.sum(np.isnan(values))) + self.vote_stats['n_pass'] = int(np.sum(~non_null_mask)) except Exception as e: - logger.error(f"Error counting votes: {e}") - # Set defaults if counting fails - self.vote_stats['n_votes'] = 0 - self.vote_stats['n_agree'] = 0 - self.vote_stats['n_disagree'] = 0 - self.vote_stats['n_pass'] = 0 - - # Compute comment stats - for i, cid in enumerate(clean_mat.columns): - if i >= values.shape[1]: - continue - - try: - col = values[:, i] - n_votes = np.sum(~np.isnan(col)) - n_agree = np.sum(np.abs(col - 1.0) < 0.001) - n_disagree = np.sum(np.abs(col + 1.0) < 0.001) - - self.vote_stats['comment_stats'][cid] = { - 'n_votes': int(n_votes), - 'n_agree': int(n_agree), - 'n_disagree': int(n_disagree), - 'agree_ratio': float(n_agree / max(n_votes, 1)) - } - except Exception as e: - logger.error(f"Error computing stats for comment {cid}: {e}") + logger.error(f"Error counting global votes: {e}") + + # Per-comment stats: sum along axis=0 (columns). + # axis=0 sums over rows, giving one value per column (comment). + try: + comment_n_votes = np.sum(non_null_mask, axis=0) + comment_n_agree = np.sum(agree_mask, axis=0) + comment_n_disagree = np.sum(disagree_mask, axis=0) + # Avoid division by zero: use np.maximum to ensure denominator >= 1 + comment_agree_ratio = comment_n_agree / np.maximum(comment_n_votes, 1) + + # Build comment_stats dict from the arrays. + for i, cid in enumerate(clean_mat.columns): self.vote_stats['comment_stats'][cid] = { - 'n_votes': 0, - 'n_agree': 0, - 'n_disagree': 0, - 'agree_ratio': 0.0 - } - - # Compute participant stats - for i, pid in enumerate(clean_mat.index): - if i >= values.shape[0]: - continue - - try: - row = values[i, :] - n_votes = np.sum(~np.isnan(row)) - n_agree = np.sum(np.abs(row - 1.0) < 0.001) - n_disagree = np.sum(np.abs(row + 1.0) < 0.001) - - self.vote_stats['participant_stats'][pid] = { - 'n_votes': int(n_votes), - 'n_agree': int(n_agree), - 'n_disagree': int(n_disagree), - 'agree_ratio': float(n_agree / max(n_votes, 1)) + 'n_votes': int(comment_n_votes[i]), + 'n_agree': int(comment_n_agree[i]), + 'n_disagree': int(comment_n_disagree[i]), + 'agree_ratio': float(comment_agree_ratio[i]) } - except Exception as e: - logger.error(f"Error computing stats for participant {pid}: {e}") + except Exception as e: + logger.error(f"Error computing comment stats: {e}") + + # Per-participant stats: sum along axis=1 (rows). + # axis=1 sums over columns, giving one value per row (participant). + try: + ptpt_n_votes = np.sum(non_null_mask, axis=1) + ptpt_n_agree = np.sum(agree_mask, axis=1) + ptpt_n_disagree = np.sum(disagree_mask, axis=1) + ptpt_agree_ratio = ptpt_n_agree / np.maximum(ptpt_n_votes, 1) + + # Build participant_stats dict from the arrays. 
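+                # enumerate() gives the positional index into the numpy arrays,
+                # while pid keeps the dict keyed by participant ID.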
+ for i, pid in enumerate(clean_mat.index): self.vote_stats['participant_stats'][pid] = { - 'n_votes': 0, - 'n_agree': 0, - 'n_disagree': 0, - 'agree_ratio': 0.0 + 'n_votes': int(ptpt_n_votes[i]), + 'n_agree': int(ptpt_n_agree[i]), + 'n_disagree': int(ptpt_n_disagree[i]), + 'agree_ratio': float(ptpt_agree_ratio[i]) } + except Exception as e: + logger.error(f"Error computing participant stats: {e}") except Exception as e: logger.error(f"Error in vote stats computation: {e}") # Initialize with empty stats if computation fails @@ -447,14 +439,18 @@ def update_moderation(self, def _compute_pca(self, n_components: int = 2) -> None: """ Compute PCA on the vote matrix. - + Args: n_components: Number of principal components """ + import time + start_time = time.time() + logger.info(f"Starting PCA computation (matrix shape: {self.rating_mat.shape})...") + # Make sure pandas and numpy are imported import numpy as np import pandas as pd - + # Check if we have enough data if self.rating_mat.shape[0] < 2 or self.rating_mat.shape[1] < 2: # Not enough data for PCA, create minimal results @@ -464,32 +460,35 @@ def _compute_pca(self, n_components: int = 2) -> None: 'comps': np.zeros((min(n_components, 2), cols)) } self.proj = {pid: np.zeros(2) for pid in self.rating_mat.index} + logger.info(f"PCA computation completed in {time.time() - start_time:.2f}s (insufficient data)") return try: # Make a clean copy of the rating matrix clean_matrix = self._get_clean_matrix() - + pca_results, proj_dict = pca_project_dataframe(clean_matrix, n_components) - + # Store results self.pca = pca_results self.proj = proj_dict - + logger.info(f"PCA computation completed in {time.time() - start_time:.2f}s") + except Exception as e: # If PCA fails, create minimal results logger.error(f"Error in PCA computation: {e}") # Make sure we have numpy and pandas import numpy as np import pandas as pd - + cols = self.rating_mat.shape[1] self.pca = { 'center': np.zeros(cols), 'comps': np.zeros((min(n_components, 2), cols)) } self.proj = {pid: np.zeros(2) for pid in self.rating_mat.index} - + logger.info(f"PCA computation completed in {time.time() - start_time:.2f}s (with errors)") + def _get_clean_matrix(self) -> pd.DataFrame: """ Get a clean copy of the rating matrix with proper numeric values. @@ -527,55 +526,64 @@ def _compute_clusters(self) -> None: """ Compute participant clusters using auto-determination of optimal k. 
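+        Clusters the 2-D PCA projections with cluster_dataframe(k=None), letting
+        the built-in determine_k heuristic choose the number of groups.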
""" + import time + start_time = time.time() + logger.info(f"Starting clustering computation ({len(self.proj)} participants)...") + # Make sure numpy and pandas are imported import numpy as np import pandas as pd - + # Check if we have projections if not self.proj: self.base_clusters = [] self.group_clusters = [] self.subgroup_clusters = {} + logger.info(f"Clustering completed in {time.time() - start_time:.2f}s (no projections)") return - + # Prepare data for clustering ptpt_ids = list(self.proj.keys()) proj_values = np.array([self.proj[pid] for pid in ptpt_ids]) - + # Create projection matrix proj_matrix = pd.DataFrame( data=proj_values, index=ptpt_ids, columns=['x', 'y'] ) - + # Use auto-determination of k based on data size # The determine_k function will handle this appropriately # Let the clustering function auto-determine the appropriate number of clusters # Pass k=None to use the built-in determine_k function base_clusters = cluster_dataframe(proj_matrix, k=None) - + # Convert base clusters to group clusters # Group clusters are high-level groups based on base clusters group_clusters = base_clusters - + # Store results self.base_clusters = base_clusters self.group_clusters = group_clusters - + # Compute subgroup clusters if needed self.subgroup_clusters = {} - - # TODO: Implement subgroup clustering if needed - + + logger.info(f"Clustering completed in {time.time() - start_time:.2f}s ({len(group_clusters)} groups)") + def _compute_repness(self) -> None: """ Compute comment representativeness. """ + import time + start_time = time.time() + logger.info(f"Starting representativeness computation ({len(self.group_clusters)} groups, {self.rating_mat.shape[1]} comments)...") + # Make sure numpy and pandas are imported import numpy as np import pandas as pd - + # Check if we have groups if not self.group_clusters: self.repness = { @@ -583,11 +591,13 @@ def _compute_repness(self) -> None: 'group_repness': {}, 'consensus_comments': [] } + logger.info(f"Representativeness completed in {time.time() - start_time:.2f}s (no groups)") return - + # Compute representativeness self.repness = conv_repness(self.rating_mat, self.group_clusters) - + logger.info(f"Representativeness completed in {time.time() - start_time:.2f}s") + def _compute_participant_info_optimized(self, vote_matrix: pd.DataFrame, group_clusters: List[Dict[str, Any]]) -> Dict[str, Any]: """ Optimized version of the participant info computation. diff --git a/delphi/polismath/pca_kmeans_rep/repness.py b/delphi/polismath/pca_kmeans_rep/repness.py index bef5c00e4d..abbd83515c 100644 --- a/delphi/polismath/pca_kmeans_rep/repness.py +++ b/delphi/polismath/pca_kmeans_rep/repness.py @@ -12,13 +12,27 @@ import math from scipy import stats -from polismath.utils.general import agree, disagree, pass_vote +from polismath.utils.general import AGREE, DISAGREE # Statistical constants Z_90 = 1.645 # Z-score for 90% confidence Z_95 = 1.96 # Z-score for 95% confidence -PSEUDO_COUNT = 1.5 # Pseudocount for Bayesian smoothing + +# Pseudocount for Bayesian smoothing (Laplace smoothing / additive smoothing) +# +# Why use pseudocounts? 
+# - Prevents extreme probabilities (0 or 1) when sample sizes are small +# - With PSEUDO_COUNT = 1.5, we effectively add 0.75 "virtual" agrees and +# 0.75 "virtual" disagrees to each comment's vote count +# - This pulls probabilities toward 0.5 (the prior), with the effect diminishing +# as sample size grows +# - Formula: p_agree = (n_agree + PSEUDO_COUNT/2) / (n_votes + PSEUDO_COUNT) +# +# Example: With 3 agrees out of 4 votes: +# - Raw probability: 3/4 = 0.75 +# - Smoothed (PSEUDO_COUNT=1.5): (3 + 0.75) / (4 + 1.5) = 3.75/5.5 ≈ 0.68 +PSEUDO_COUNT = 1.5 def z_score_sig_90(z: float) -> bool: @@ -113,11 +127,11 @@ def comment_stats(votes: np.ndarray, group_members: List[int]) -> Dict[str, Any] Dictionary of statistics """ # Filter votes to only include group members - group_votes = [votes[i] for i in group_members if i < len(votes)] - + group_votes = votes[group_members] + # Count agrees, disagrees, and total votes - n_agree = sum(1 for v in group_votes if agree(v)) - n_disagree = sum(1 for v in group_votes if disagree(v)) + n_agree = np.sum(group_votes == AGREE) + n_disagree = np.sum(group_votes == DISAGREE) n_votes = n_agree + n_disagree # Calculate probabilities with pseudocounts (Bayesian smoothing) @@ -399,10 +413,10 @@ def calculate_kl_divergence(p: np.ndarray, q: np.ndarray) -> float: def select_consensus_comments(all_stats: List[Dict[str, Any]]) -> List[Dict[str, Any]]: """ Select comments with broad consensus. - + Args: all_stats: List of comment statistics for all groups - + Returns: List of consensus comments """ @@ -413,18 +427,18 @@ def select_consensus_comments(all_stats: List[Dict[str, Any]]) -> List[Dict[str, if cid not in by_comment: by_comment[cid] = [] by_comment[cid].append(stat) - + # Comments that have stats for all groups consensus_candidates = [] - + for cid, stats in by_comment.items(): # Check if all groups mostly agree all_agree = all(s['pa'] > 0.6 for s in stats) - + if all_agree: # Calculate average agreement avg_agree = sum(s['pa'] for s in stats) / len(stats) - + # Add as consensus candidate consensus_candidates.append({ 'comment_id': cid, @@ -432,159 +446,494 @@ def select_consensus_comments(all_stats: List[Dict[str, Any]]) -> List[Dict[str, 'repful': 'consensus', 'stats': stats }) - + # Sort by average agreement consensus_candidates.sort(key=lambda x: x['avg_agree'], reverse=True) - + # Take top 2 return consensus_candidates[:2] -def conv_repness(vote_matrix: pd.DataFrame, group_clusters: List[Dict[str, Any]]) -> Dict[str, Any]: +# ============================================================================= +# Vectorized DataFrame-native functions for multi-group operations +# ============================================================================= + +def prop_test_vectorized(p: pd.Series, n: pd.Series, p0: float = 0.5) -> pd.Series: """ - Calculate representativeness for all comments and groups. - + Vectorized one-proportion z-test. + Args: - vote_matrix: pd.DataFrame of votes + p: Series of observed proportions + n: Series of number of observations + p0: Expected proportion under null hypothesis (default: 0.5) + + Returns: + Series of z-scores + """ + se = np.sqrt(p0 * (1 - p0) / n) + z = (p - p0) / se + # Handle edge cases: n=0, p0=0, p0=1 all result in 0 + z = z.fillna(0.0) + z = z.replace([np.inf, -np.inf], 0.0) + return z + + +def two_prop_test_vectorized(p1: pd.Series, n1: pd.Series, + p2: pd.Series, n2: pd.Series) -> pd.Series: + """ + Vectorized two-proportion z-test. 
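+    Uses the pooled-probability standard error, i.e.
+    z = (p1 - p2) / sqrt(p_pooled * (1 - p_pooled) * (1/n1 + 1/n2)).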
+ + Args: + p1: Series of first proportions + n1: Series of number of observations for first proportion + p2: Series of second proportions + n2: Series of number of observations for second proportion + + Returns: + Series of z-scores + """ + # Pooled probability + p_pooled = (p1 * n1 + p2 * n2) / (n1 + n2) + + # Standard error + se = np.sqrt(p_pooled * (1 - p_pooled) * (1/n1 + 1/n2)) + + # Z-score calculation + z = (p1 - p2) / se + + # Handle edge cases + z = z.fillna(0.0) + z = z.replace([np.inf, -np.inf], 0.0) + return z + + +def compute_group_comment_stats_df(votes_long: pd.DataFrame, + group_clusters: List[Dict[str, Any]]) -> pd.DataFrame: + """ + Compute vote counts and probabilities for all (group, comment) pairs. + + This is the vectorized version of comment_stats() that operates on all + groups and comments simultaneously. + + Args: + votes_long: Long-format DataFrame with columns: + - 'participant': participant ID + - 'comment': comment ID + - 'vote': vote value (AGREE, DISAGREE, PASS, or NaN) group_clusters: List of group clusters - + Returns: - Dictionary with representativeness data for each group + DataFrame indexed by (group_id, comment) with columns: + - na: number of agrees + - nd: number of disagrees + - ns: number of votes (agrees + disagrees) + - pa: probability of agree (with pseudocount smoothing) + - pd: probability of disagree (with pseudocount smoothing) + - pat: proportion test z-score for agree + - pdt: proportion test z-score for disagree + - ra: representativeness ratio for agree (group vs other) + - rd: representativeness ratio for disagree (group vs other) + - rat: representativeness test z-score for agree + - rdt: representativeness test z-score for disagree + - agree_metric: metric for agree representativeness + - disagree_metric: metric for disagree representativeness + - repful: 'agree' or 'disagree' based on which is more representative + """ + # Build participant -> group mapping + ptpt_to_group = {} + for group in group_clusters: + for member in group['members']: + ptpt_to_group[member] = group['id'] + + # Drop NaN votes (unvoted) first - this applies to all participants + votes_only = votes_long.dropna(subset=['vote']) + + if votes_only.empty: + # Return empty DataFrame with correct schema + return pd.DataFrame(columns=['na', 'nd', 'ns', 'pa', 'pd', 'pat', 'pdt']) + + # Compute total counts per comment BEFORE filtering to group members + # This matches the old behavior where "other" included ALL participants + # not in the current group (even those not in any cluster) + total_counts = votes_only.groupby('comment').agg( + total_agree=('vote', lambda x: (x == AGREE).sum()), + total_disagree=('vote', lambda x: (x == DISAGREE).sum()), + ) + total_counts['total_votes'] = total_counts['total_agree'] + total_counts['total_disagree'] + + # Now add group column and filter to only group members + votes_with_group = votes_only.copy() + votes_with_group['group_id'] = votes_with_group['participant'].map(ptpt_to_group) + + # Keep only votes from participants in some group (for group-specific counts) + votes_in_groups = votes_with_group.dropna(subset=['group_id']) + + if votes_in_groups.empty: + # Return empty DataFrame with correct schema + return pd.DataFrame(columns=['na', 'nd', 'ns', 'pa', 'pd', 'pat', 'pdt']) + + # Get all unique comments that have at least one vote (from anyone) + all_comments = total_counts.index.tolist() + + # Get all group IDs + all_group_ids = [group['id'] for group in group_clusters] + + # Compute vote counts per (group, comment) for votes 
from group members + group_counts = votes_in_groups.groupby(['group_id', 'comment']).agg( + na=('vote', lambda x: (x == AGREE).sum()), + nd=('vote', lambda x: (x == DISAGREE).sum()), + ) + group_counts['ns'] = group_counts['na'] + group_counts['nd'] + + # Create full index with all (group, comment) combinations to match old behavior + # Old implementation: for each group, iterate over ALL comments (that have any votes) + full_index = pd.MultiIndex.from_product( + [all_group_ids, all_comments], + names=['group_id', 'comment'] + ) + + # Reindex to include all combinations, filling missing with 0 + group_counts = group_counts.reindex(full_index, fill_value=0) + + # Join total counts to group counts + stats_df = group_counts.join(total_counts, on='comment') + + # Compute "other" counts (everyone not in this group) + stats_df['other_agree'] = stats_df['total_agree'] - stats_df['na'] + stats_df['other_disagree'] = stats_df['total_disagree'] - stats_df['nd'] + stats_df['other_votes'] = stats_df['total_votes'] - stats_df['ns'] + + # Compute probabilities with pseudocounts (Bayesian smoothing) + # For group + stats_df['pa'] = (stats_df['na'] + PSEUDO_COUNT/2) / (stats_df['ns'] + PSEUDO_COUNT) + stats_df['pd'] = (stats_df['nd'] + PSEUDO_COUNT/2) / (stats_df['ns'] + PSEUDO_COUNT) + + # Handle ns == 0 case: default to uninformative prior (0.5) + zero_mask = stats_df['ns'] == 0 + stats_df.loc[zero_mask, 'pa'] = 0.5 + stats_df.loc[zero_mask, 'pd'] = 0.5 + + # For "other" group + stats_df['other_pa'] = (stats_df['other_agree'] + PSEUDO_COUNT/2) / (stats_df['other_votes'] + PSEUDO_COUNT) + stats_df['other_pd'] = (stats_df['other_disagree'] + PSEUDO_COUNT/2) / (stats_df['other_votes'] + PSEUDO_COUNT) + + other_zero_mask = stats_df['other_votes'] == 0 + stats_df.loc[other_zero_mask, 'other_pa'] = 0.5 + stats_df.loc[other_zero_mask, 'other_pd'] = 0.5 + + # Compute proportion tests (group vs 0.5) + stats_df['pat'] = prop_test_vectorized(stats_df['pa'], stats_df['ns'], 0.5) + stats_df['pdt'] = prop_test_vectorized(stats_df['pd'], stats_df['ns'], 0.5) + + # Compute representativeness ratios (group vs other) + stats_df['ra'] = stats_df['pa'] / stats_df['other_pa'] + stats_df['rd'] = stats_df['pd'] / stats_df['other_pd'] + + # Handle division by zero (other_pa or other_pd == 0) + stats_df['ra'] = stats_df['ra'].replace([np.inf, -np.inf], 1.0).fillna(1.0) + stats_df['rd'] = stats_df['rd'].replace([np.inf, -np.inf], 1.0).fillna(1.0) + + # Compute representativeness tests (two-proportion z-test: group vs other) + stats_df['rat'] = two_prop_test_vectorized( + stats_df['pa'], stats_df['ns'], + stats_df['other_pa'], stats_df['other_votes'] + ) + stats_df['rdt'] = two_prop_test_vectorized( + stats_df['pd'], stats_df['ns'], + stats_df['other_pd'], stats_df['other_votes'] + ) + + # Compute metrics + # agree_metric = pa * (|pat| + |rat|) + # disagree_metric = (1 - pd) * (|pdt| + |rdt|) + stats_df['agree_metric'] = stats_df['pa'] * (stats_df['pat'].abs() + stats_df['rat'].abs()) + stats_df['disagree_metric'] = (1 - stats_df['pd']) * (stats_df['pdt'].abs() + stats_df['rdt'].abs()) + + # Determine repful ('agree' or 'disagree') + # Logic: if pa > 0.5 and ra > 1.0 -> 'agree' + # elif pd > 0.5 and rd > 1.0 -> 'disagree' + # else: use higher metric + conditions = [ + (stats_df['pa'] > 0.5) & (stats_df['ra'] > 1.0), + (stats_df['pd'] > 0.5) & (stats_df['rd'] > 1.0), + ] + choices = ['agree', 'disagree'] + stats_df['repful'] = np.select(conditions, choices, + default=np.where(stats_df['agree_metric'] >= 
stats_df['disagree_metric'], + 'agree', 'disagree')) + + return stats_df + + +def select_rep_comments_df(stats_df: pd.DataFrame, + agree_count: int = 3, + disagree_count: int = 2) -> pd.DataFrame: + """ + Select representative comments for a single group from a DataFrame. + + DataFrame-native version of select_rep_comments(). + + Args: + stats_df: DataFrame with comment statistics for ONE group + agree_count: Number of agreement comments to select + disagree_count: Number of disagreement comments to select + + Returns: + DataFrame of selected representative comments + """ + if stats_df.empty: + return stats_df + + total_wanted = agree_count + disagree_count + + # Best agree: pa > pd and passes significance tests + agree_candidates = stats_df[stats_df['pa'] > stats_df['pd']].copy() + if not agree_candidates.empty: + # Check significance: |pat| >= Z_90 and |rat| >= Z_90 + passing_agree = agree_candidates[ + (agree_candidates['pat'].abs() >= Z_90) & + (agree_candidates['rat'].abs() >= Z_90) & + (agree_candidates['pa'] >= 0.5) + ] + if not passing_agree.empty: + agree_candidates = passing_agree + + # Best disagree: pd > pa and passes significance tests + disagree_candidates = stats_df[stats_df['pd'] > stats_df['pa']].copy() + if not disagree_candidates.empty: + passing_disagree = disagree_candidates[ + (disagree_candidates['pdt'].abs() >= Z_90) & + (disagree_candidates['rdt'].abs() >= Z_90) & + (disagree_candidates['pd'] >= 0.5) + ] + if not passing_disagree.empty: + disagree_candidates = passing_disagree + + # Sort candidates by metric + if not agree_candidates.empty: + agree_candidates = agree_candidates.sort_values('agree_metric', ascending=False) + if not disagree_candidates.empty: + disagree_candidates = disagree_candidates.sort_values('disagree_metric', ascending=False) + + # Select top N from each category + selected_parts = [] + + if not agree_candidates.empty: + top_agree = agree_candidates.head(agree_count).copy() + top_agree['repful'] = 'agree' + selected_parts.append(top_agree) + + if not disagree_candidates.empty: + top_disagree = disagree_candidates.head(disagree_count).copy() + top_disagree['repful'] = 'disagree' + selected_parts.append(top_disagree) + + if selected_parts: + selected = pd.concat(selected_parts, ignore_index=False) + else: + selected = pd.DataFrame() + + # If we couldn't find enough, try to fill from available candidates + # This matches the exact behavior of the old select_rep_comments() function: + # - First fallback adds agree_comments[agree_count:min(len, total_wanted)] regardless of + # whether we exceed total_wanted (up to disagree_count more agrees) + # - Second fallback only runs if STILL < total_wanted + if len(selected) < total_wanted: + # Try to add more agree comments + # Old code: range(agree_count, min(len(agree_comments), agree_count + disagree_count)) + if not agree_candidates.empty and len(agree_candidates) > agree_count: + extra_limit = min(len(agree_candidates), total_wanted) + extra_agrees = agree_candidates.iloc[agree_count:extra_limit].copy() + extra_agrees['repful'] = 'agree' + selected = pd.concat([selected, extra_agrees], ignore_index=False) + + # Try to add more disagree comments (only if still not enough) + # Old code: range(disagree_count, min(len(disagree_comments), agree_count + disagree_count)) + if len(selected) < total_wanted and not disagree_candidates.empty and len(disagree_candidates) > disagree_count: + extra_limit = min(len(disagree_candidates), total_wanted) + extra_disagrees = 
disagree_candidates.iloc[disagree_count:extra_limit].copy() + extra_disagrees['repful'] = 'disagree' + selected = pd.concat([selected, extra_disagrees], ignore_index=False) + + # Fallback: if still empty, take first row + if selected.empty and not stats_df.empty: + selected = stats_df.head(1).copy() + selected['repful'] = selected['repful'].iloc[0] if 'repful' in selected.columns else 'agree' + + return selected + + +def select_consensus_comments_df(stats_df: pd.DataFrame, + n_groups: int) -> List[Dict[str, Any]]: + """ + Select consensus comments from DataFrame. + + Args: + stats_df: DataFrame with all (group, comment) statistics + n_groups: Number of groups + + Returns: + List of consensus comment dicts + """ + if stats_df.empty: + return [] + + # Group by comment and check if all groups have high agreement + stats_reset = stats_df.reset_index() + comment_stats = stats_reset.groupby('comment').agg( + min_pa=('pa', 'min'), + avg_pa=('pa', 'mean'), + group_count=('group_id', 'count') + ) + + # Filter to comments where all groups agree (pa > 0.6 for all) + # and present in all groups + consensus = comment_stats[ + (comment_stats['min_pa'] > 0.6) & + (comment_stats['group_count'] == n_groups) + ].copy() + + if consensus.empty: + return [] + + # Sort by average agreement and take top 2 + consensus = consensus.nlargest(2, 'avg_pa') + + # Convert to list of dicts using _stats_row_to_dict for legacy format + result = [] + for comment_id in consensus.index: + comment_rows = stats_reset[stats_reset['comment'] == comment_id] + # Convert each row to legacy dict format + stats_list = [_stats_row_to_dict(row) for _, row in comment_rows.iterrows()] + result.append({ + 'comment_id': comment_id, + 'avg_agree': consensus.loc[comment_id, 'avg_pa'], + 'repful': 'consensus', + 'stats': stats_list + }) + + return result + + +def _stats_row_to_dict(row: pd.Series) -> Dict[str, Any]: + """Convert a stats DataFrame row to the legacy dict format.""" + return { + 'comment_id': row['comment'], + 'group_id': row['group_id'], + 'na': int(row['na']), + 'nd': int(row['nd']), + 'ns': int(row['ns']), + 'pa': row['pa'], + 'pd': row['pd'], + 'pat': row['pat'], + 'pdt': row['pdt'], + 'ra': row['ra'], + 'rd': row['rd'], + 'rat': row['rat'], + 'rdt': row['rdt'], + 'agree_metric': row['agree_metric'], + 'disagree_metric': row['disagree_metric'], + 'repful': row['repful'], + } + + +def conv_repness(vote_matrix_df: pd.DataFrame, group_clusters: List[Dict[str, Any]]) -> Dict[str, Any]: + """ + Calculate representativeness for all comments and groups. + + Uses a vectorized long-format DataFrame approach for efficiency. 
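+    The wide vote matrix is melted to long form once, per-(group, comment)
+    statistics are computed in a single vectorized pass, and representative and
+    consensus comments are then selected from slices of that DataFrame.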
+ + Args: + vote_matrix_df: pd.DataFrame of matrix of votes (participants × comments) + Values should be AGREE (1), DISAGREE (-1), PASS (0), or NaN (unvoted) + group_clusters: List of group clusters, each with 'id' and 'members' + + Returns: + Dictionary with representativeness data for each group: + - comment_ids: list of comment IDs + - group_repness: dict mapping group_id -> list of representative comments + - consensus_comments: list of consensus comments + - comment_repness: list of all comment repness data """ - # Extract and clean the matrix values - matrix_values = vote_matrix.to_numpy(copy = True) - - # Ensure the matrix contains numeric values - if not np.issubdtype(matrix_values.dtype, np.number): - # Convert to numeric matrix with proper NaN handling - numeric_matrix = np.zeros(matrix_values.shape, dtype=float) - for i in range(matrix_values.shape[0]): - for j in range(matrix_values.shape[1]): - val = matrix_values[i, j] - if pd.isna(val) or val is None: - numeric_matrix[i, j] = np.nan - else: - try: - numeric_matrix[i, j] = float(val) - except (ValueError, TypeError): - numeric_matrix[i, j] = np.nan - matrix_values = numeric_matrix - - # Replace NaNs with None for the algorithm - matrix_values = np.where(np.isnan(matrix_values), None, matrix_values) - # Create empty-result structure in case we need to return early empty_result = { - 'comment_ids': vote_matrix.columns.tolist(), + 'comment_ids': vote_matrix_df.columns.tolist(), 'group_repness': {group['id']: [] for group in group_clusters}, 'consensus_comments': [], - 'comment_repness': [] # Add a list for all comment repness data + 'comment_repness': [] } - + # Check if we have enough data - if matrix_values.shape[0] < 2 or matrix_values.shape[1] < 2: + if vote_matrix_df.shape[0] < 2 or vote_matrix_df.shape[1] < 2: return empty_result - - # Result will hold repness data for each group + + # Convert wide-format to long-format DataFrame + # Wide: participants × comments (values = votes) + # Long: participant | comment | vote + votes_long = vote_matrix_df.melt( + ignore_index=False, + var_name='comment', + value_name='vote' + ).reset_index(names='participant') + + # Ensure vote column is numeric (handle object dtype with None values) + votes_long['vote'] = pd.to_numeric(votes_long['vote'], errors='coerce') + + # Compute all stats using vectorized function + stats_df = compute_group_comment_stats_df(votes_long, group_clusters) + + if stats_df.empty: + return empty_result + + # Reset index for easier manipulation + stats_df_reset = stats_df.reset_index() + + # Build comment_repness list (vectorized) + stats_df_reset['repness'] = np.where( + stats_df_reset['repful'] == 'agree', + stats_df_reset['agree_metric'], + stats_df_reset['disagree_metric'] + ) + comment_repness = stats_df_reset[['comment', 'group_id', 'repness', 'pa', 'pd']].copy() + comment_repness.columns = ['tid', 'gid', 'repness', 'pa', 'pd'] + + # Build result structure result = { - 'comment_ids': vote_matrix.columns.tolist(), + 'comment_ids': vote_matrix_df.columns.tolist(), 'group_repness': {}, - 'comment_repness': [] # Add a list for all comment repness data + 'comment_repness': comment_repness.to_dict('records') } - - # For each group, calculate representativeness - all_stats = [] - + + # Select representative comments per group (DataFrame operations) for group in group_clusters: group_id = group['id'] - - # Convert member IDs to indices with error handling - group_members = [] - for m in group['members']: - try: - if m in vote_matrix.index: - idx = 
vote_matrix.index.get_loc(m) - # Question: why would idx ever *not* be within matrix_values.shape ? - if 0 <= idx < matrix_values.shape[0]: - group_members.append(idx) - except (ValueError, TypeError) as e: - print(f"Error finding member {m} in matrix: {e}") - - if not group_members: - # Skip empty groups + group_stats = stats_df_reset[stats_df_reset['group_id'] == group_id] + + if group_stats.empty: result['group_repness'][group_id] = [] continue - - # Calculate other members (all participants not in this group) - all_indices = list(range(matrix_values.shape[0])) - other_members = [i for i in all_indices if i not in group_members] - - # Stats for each comment - group_stats = [] - - for c_idx, comment_id in enumerate(vote_matrix.columns): - if c_idx >= matrix_values.shape[1]: - continue - - comment_votes = matrix_values[:, c_idx] - - # Skip comments with no votes - if not any(v is not None for v in comment_votes): - continue - - try: - # Calculate stats for this group - stats = comment_stats(comment_votes, group_members) - - # Calculate stats for other groups - other_stats = comment_stats(comment_votes, other_members) - - # Add comparative stats - stats = add_comparative_stats(stats, other_stats) - - # Finalize stats - stats = finalize_cmt_stats(stats) - - # Add metadata - stats['comment_id'] = comment_id - stats['group_id'] = group_id - - group_stats.append(stats) - all_stats.append(stats) - - # Also add to the comment_repness list - repness = { - 'tid': comment_id, - 'gid': group_id, - 'repness': stats.get('agree_metric', 0) if stats.get('repful') == 'agree' else stats.get('disagree_metric', 0), - 'pa': stats.get('pa', 0), - 'pd': stats.get('pd', 0) - } - result['comment_repness'].append(repness) - except Exception as e: - print(f"Error calculating stats for comment {comment_id} in group {group_id}: {e}") - continue - + try: - # Select representative comments for this group - rep_comments = select_rep_comments(group_stats) - - # Store in result + rep_df = select_rep_comments_df(group_stats) + # Convert to list of dicts only at the end + rep_comments = [_stats_row_to_dict(row) for _, row in rep_df.iterrows()] result['group_repness'][group_id] = rep_comments except Exception as e: print(f"Error selecting representative comments for group {group_id}: {e}") result['group_repness'][group_id] = [] - + # Add consensus comments if there are multiple groups try: - if len(group_clusters) > 1 and all_stats: - result['consensus_comments'] = select_consensus_comments(all_stats) + if len(group_clusters) > 1: + result['consensus_comments'] = select_consensus_comments_df( + stats_df, len(group_clusters) + ) else: result['consensus_comments'] = [] except Exception as e: print(f"Error selecting consensus comments: {e}") result['consensus_comments'] = [] - + return result diff --git a/delphi/polismath/utils/general.py b/delphi/polismath/utils/general.py index 6467833ffc..df3f2a9d30 100644 --- a/delphi/polismath/utils/general.py +++ b/delphi/polismath/utils/general.py @@ -12,6 +12,9 @@ T = TypeVar('T') U = TypeVar('U') +AGREE = 1 +DISAGREE = -1 +PASS = 0 def xor(a: bool, b: bool) -> bool: """ @@ -41,45 +44,6 @@ def round_to(n: float, digits: int = 0) -> float: return round(n, digits) -def agree(vote: Optional[float]) -> bool: - """ - Check if a vote is an agreement. - - Args: - vote: Vote value (1 for agree, -1 for disagree, None for pass) - - Returns: - True if the vote is an agreement - """ - return vote == 1 - - -def disagree(vote: Optional[float]) -> bool: - """ - Check if a vote is a disagreement. 
- - Args: - vote: Vote value (1 for agree, -1 for disagree, None for pass) - - Returns: - True if the vote is a disagreement - """ - return vote == -1 - - -def pass_vote(vote: Optional[float]) -> bool: - """ - Check if a vote is a pass. - - Args: - vote: Vote value (1 for agree, -1 for disagree, None for pass) - - Returns: - True if the vote is a pass (None) - """ - return vote is None - - def zip_collections(*colls: Iterable[T]) -> List[Tuple[T, ...]]: """ Zip multiple collections together. diff --git a/delphi/pyproject.toml b/delphi/pyproject.toml index d96def3b4a..1d949c729b 100644 --- a/delphi/pyproject.toml +++ b/delphi/pyproject.toml @@ -76,6 +76,8 @@ dev = [ # Type stubs "types-requests", "types-psycopg2", + # Profiling + "line_profiler>=4.0.0", ] notebook = [ diff --git a/delphi/tests/test_old_format_repness.py b/delphi/tests/test_old_format_repness.py new file mode 100644 index 0000000000..6679172a76 --- /dev/null +++ b/delphi/tests/test_old_format_repness.py @@ -0,0 +1,541 @@ +""" +Tests for the representativeness module's backwards-compatible interface. + +These tests verify the single-group, single-comment "old format" API +that wraps the new DataFrame-native implementation. +""" + +import numpy as np +import pandas as pd +import sys +import os + +# Add the parent directory to the path to import the module +sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..'))) + +from polismath.pca_kmeans_rep.repness import ( + z_score_sig_90, z_score_sig_95, prop_test, two_prop_test, + comment_stats, add_comparative_stats, repness_metric, finalize_cmt_stats, + passes_by_test, best_agree, best_disagree, select_rep_comments, + select_consensus_comments, conv_repness, + participant_stats +) + + +class TestStatisticalFunctions: + """Tests for the statistical utility functions.""" + + def test_z_score_significance(self): + """Test z-score significance checks.""" + # 90% confidence + assert z_score_sig_90(1.645) + assert z_score_sig_90(2.0) + assert z_score_sig_90(-1.645) + assert not z_score_sig_90(1.0) + + # 95% confidence + assert z_score_sig_95(1.96) + assert z_score_sig_95(2.5) + assert z_score_sig_95(-1.96) + assert not z_score_sig_95(1.5) + + def test_prop_test(self): + """Test one-proportion z-test.""" + # Test cases + assert np.isclose(prop_test(0.7, 100, 0.5), 4.0, atol=0.1) + assert np.isclose(prop_test(0.2, 50, 0.3), -1.6, atol=0.1) + + # Edge cases + assert prop_test(0.5, 0, 0.5) == 0.0 + assert prop_test(0.7, 100, 0.0) == 0.0 + assert prop_test(0.7, 100, 1.0) == 0.0 + + def test_two_prop_test(self): + """Test two-proportion z-test.""" + # Test cases + assert np.isclose(two_prop_test(0.7, 100, 0.5, 100), 2.9, atol=0.1) + assert np.isclose(two_prop_test(0.2, 50, 0.3, 50), -1.2, atol=0.1) + + # Edge cases + assert two_prop_test(0.5, 0, 0.5, 100) == 0.0 + assert two_prop_test(0.5, 100, 0.5, 0) == 0.0 + + +class TestCommentStats: + """Tests for comment statistics functions (old single-array interface).""" + + def test_comment_stats(self): + """Test basic comment statistics calculation.""" + # Create test votes: 3 agrees, 1 disagree, 1 pass + votes = np.array([1, 1, 1, -1, None]) + group_members = [0, 1, 2, 3, 4] + + stats = comment_stats(votes, group_members) + + assert stats['na'] == 3 + assert stats['nd'] == 1 + assert stats['ns'] == 4 + + # Check probabilities (with pseudocounts) + n_agree = 3 + n_disagree = 1 + n_votes = 4 + p_agree = (n_agree + 1.5/2) / (n_votes + 1.5) + p_disagree = (n_disagree + 1.5/2) / (n_votes + 1.5) + + assert np.isclose(stats['pa'], 
p_agree) + assert np.isclose(stats['pd'], p_disagree) + + # Test with no votes + empty_votes = np.array([None, None]) + empty_stats = comment_stats(empty_votes, [0, 1]) + + assert empty_stats['na'] == 0 + assert empty_stats['nd'] == 0 + assert empty_stats['ns'] == 0 + assert np.isclose(empty_stats['pa'], 0.5) + assert np.isclose(empty_stats['pd'], 0.5) + + def test_add_comparative_stats(self): + """Test adding comparative statistics.""" + # Group stats: 80% agree + group_stats = { + 'na': 8, + 'nd': 2, + 'ns': 10, + 'pa': 0.8, + 'pd': 0.2, + 'pat': 3.0, + 'pdt': -3.0 + } + + # Other group stats: 40% agree + other_stats = { + 'na': 4, + 'nd': 6, + 'ns': 10, + 'pa': 0.4, + 'pd': 0.6, + 'pat': -1.0, + 'pdt': 1.0 + } + + result = add_comparative_stats(group_stats, other_stats) + + # Check representativeness ratios + assert np.isclose(result['ra'], 0.8 / 0.4) + assert np.isclose(result['rd'], 0.2 / 0.6) + + # Test edge case with zero probability + other_stats_zero = { + 'na': 0, + 'nd': 10, + 'ns': 10, + 'pa': 0.0, + 'pd': 1.0, + 'pat': -5.0, + 'pdt': 5.0 + } + + result_zero = add_comparative_stats(group_stats, other_stats_zero) + assert np.isclose(result_zero['ra'], 1.0) # Should default to 1.0 + + def test_repness_metric(self): + """Test representativeness metric calculation.""" + stats = { + 'pa': 0.8, + 'pd': 0.2, + 'pat': 3.0, + 'pdt': -3.0, + 'ra': 2.0, + 'rd': 0.33, + 'rat': 2.5, + 'rdt': -2.5 + } + + # Calculate agree metric + agree_metric = repness_metric(stats, 'a') + expected_agree = 0.8 * (abs(3.0) + abs(2.5)) + assert np.isclose(agree_metric, expected_agree) + + # Calculate disagree metric + disagree_metric = repness_metric(stats, 'd') + expected_disagree = (1 - 0.2) * (abs(-3.0) + abs(-2.5)) + assert np.isclose(disagree_metric, expected_disagree) + + def test_finalize_cmt_stats(self): + """Test finalizing comment statistics.""" + # Stats where agree is more representative + agree_stats = { + 'pa': 0.8, + 'pd': 0.2, + 'pat': 3.0, + 'pdt': -3.0, + 'ra': 2.0, + 'rd': 0.33, + 'rat': 2.5, + 'rdt': -2.5 + } + + finalized_agree = finalize_cmt_stats(agree_stats) + + assert 'agree_metric' in finalized_agree + assert 'disagree_metric' in finalized_agree + assert finalized_agree['repful'] == 'agree' + + # Stats where disagree is more representative + disagree_stats = { + 'pa': 0.2, + 'pd': 0.8, + 'pat': -3.0, + 'pdt': 3.0, + 'ra': 0.33, + 'rd': 2.0, + 'rat': -2.5, + 'rdt': 2.5 + } + + finalized_disagree = finalize_cmt_stats(disagree_stats) + assert finalized_disagree['repful'] == 'disagree' + + +class TestSelectionFunctions: + """Tests for representative comment selection functions.""" + + def test_passes_by_test(self): + """Test checking if comments pass significance tests.""" + # Create stats that pass significance tests + passing_stats = { + 'pa': 0.8, + 'pd': 0.2, + 'pat': 3.0, + 'pdt': -3.0, + 'ra': 2.0, + 'rd': 0.33, + 'rat': 3.0, + 'rdt': -3.0 + } + + assert passes_by_test(passing_stats, 'agree') + assert not passes_by_test(passing_stats, 'disagree') + + # Create stats that don't pass (not significant) + failing_stats = { + 'pa': 0.8, + 'pd': 0.2, + 'pat': 1.0, # Below 90% threshold + 'pdt': -1.0, + 'ra': 2.0, + 'rd': 0.33, + 'rat': 1.0, # Below 90% threshold + 'rdt': -1.0 + } + + assert not passes_by_test(failing_stats, 'agree') + + def test_best_agree(self): + """Test filtering for best agreement comments.""" + # Create a mix of stats + stats = [ + { # Passes tests, high agreement + 'comment_id': 'c1', + 'pa': 0.8, 'pd': 0.2, + 'pat': 3.0, 'pdt': -3.0, + 'rat': 3.0, 'rdt': -3.0 + }, 
+ { # Doesn't pass tests + 'comment_id': 'c2', + 'pa': 0.6, 'pd': 0.4, + 'pat': 1.0, 'pdt': -1.0, + 'rat': 1.0, 'rdt': -1.0 + }, + { # Not agreement (more disagree) + 'comment_id': 'c3', + 'pa': 0.3, 'pd': 0.7, + 'pat': -2.0, 'pdt': 2.0, + 'rat': -2.0, 'rdt': 2.0 + }, + { # Passes tests, moderate agreement + 'comment_id': 'c4', + 'pa': 0.7, 'pd': 0.3, + 'pat': 2.5, 'pdt': -2.5, + 'rat': 2.5, 'rdt': -2.5 + } + ] + + best = best_agree(stats) + + # Should return 2 comments that pass tests + assert len(best) == 2 + comment_ids = [s['comment_id'] for s in best] + assert 'c1' in comment_ids + assert 'c4' in comment_ids + assert 'c3' not in comment_ids + + def test_best_disagree(self): + """Test filtering for best disagreement comments.""" + # Create a mix of stats + stats = [ + { # Not disagreement (more agree) + 'comment_id': 'c1', + 'pa': 0.8, 'pd': 0.2, + 'pat': 3.0, 'pdt': -3.0, + 'rat': 3.0, 'rdt': -3.0 + }, + { # Disagreement but doesn't pass tests + 'comment_id': 'c2', + 'pa': 0.4, 'pd': 0.6, + 'pat': -1.0, 'pdt': 1.0, + 'rat': -1.0, 'rdt': 1.0 + }, + { # Passes tests, high disagreement + 'comment_id': 'c3', + 'pa': 0.2, 'pd': 0.8, + 'pat': -3.0, 'pdt': 3.0, + 'rat': -3.0, 'rdt': 3.0 + } + ] + + best = best_disagree(stats) + + # Should return 1 comment that passes tests + assert len(best) == 1 + assert best[0]['comment_id'] == 'c3' + + def test_select_rep_comments(self): + """Test selecting representative comments.""" + # Create a mix of stats + stats = [ + { # Strong agree + 'comment_id': 'c1', + 'pa': 0.9, 'pd': 0.1, + 'pat': 4.0, 'pdt': -4.0, + 'rat': 4.0, 'rdt': -4.0, + 'agree_metric': 7.2, + 'disagree_metric': 0.9 + }, + { # Moderate agree + 'comment_id': 'c2', + 'pa': 0.7, 'pd': 0.3, + 'pat': 2.0, 'pdt': -2.0, + 'rat': 2.0, 'rdt': -2.0, + 'agree_metric': 2.8, + 'disagree_metric': 1.2 + }, + { # Weak agree + 'comment_id': 'c3', + 'pa': 0.6, 'pd': 0.4, + 'pat': 1.0, 'pdt': -1.0, + 'rat': 1.0, 'rdt': -1.0, + 'agree_metric': 1.2, + 'disagree_metric': 0.8 + }, + { # Strong disagree + 'comment_id': 'c4', + 'pa': 0.1, 'pd': 0.9, + 'pat': -4.0, 'pdt': 4.0, + 'rat': -4.0, 'rdt': 4.0, + 'agree_metric': 0.8, + 'disagree_metric': 7.2 + }, + { # Moderate disagree + 'comment_id': 'c5', + 'pa': 0.3, 'pd': 0.7, + 'pat': -2.0, 'pdt': 2.0, + 'rat': -2.0, 'rdt': 2.0, + 'agree_metric': 1.2, + 'disagree_metric': 2.8 + } + ] + + # Set 'repful' for all stats to match the implementation + for stat in stats: + if stat.get('agree_metric', 0) >= stat.get('disagree_metric', 0): + stat['repful'] = 'agree' + else: + stat['repful'] = 'disagree' + + # Select with default counts + selected = select_rep_comments(stats) + + # Check that we get some representative comments + assert len(selected) > 0 + + # Verify that comments are properly marked + agree_comments = [s for s in selected if s['repful'] == 'agree'] + disagree_comments = [s for s in selected if s['repful'] == 'disagree'] + + # Make sure we have both types of comments if available + assert len(agree_comments) > 0 + assert len(disagree_comments) > 0 + + # Check that the order is by metrics + if len(agree_comments) >= 2: + assert agree_comments[0]['agree_metric'] >= agree_comments[1]['agree_metric'] + + if len(disagree_comments) >= 2: + assert disagree_comments[0]['disagree_metric'] >= disagree_comments[1]['disagree_metric'] + + # Test with different counts + selected_custom = select_rep_comments(stats, agree_count=2, disagree_count=1) + + assert len(selected_custom) == 3 + agree_count = sum(1 for s in selected_custom if s['repful'] == 'agree') + 
+        disagree_count = sum(1 for s in selected_custom if s['repful'] == 'disagree')
+
+        assert agree_count == 2
+        assert disagree_count == 1
+
+        # Test with empty stats
+        assert select_rep_comments([]) == []
+
+
+class TestConsensusAndGroupRepness:
+    """Tests for consensus and group representativeness functions."""
+
+    def test_select_consensus_comments(self):
+        """Test selecting consensus comments."""
+        # Create stats for groups
+        group1_stats = [
+            {
+                'comment_id': 'c1',
+                'group_id': 1,
+                'pa': 0.8, 'pd': 0.2
+            },
+            {
+                'comment_id': 'c2',
+                'group_id': 1,
+                'pa': 0.7, 'pd': 0.3
+            }
+        ]
+
+        group2_stats = [
+            {
+                'comment_id': 'c1',
+                'group_id': 2,
+                'pa': 0.85, 'pd': 0.15
+            },
+            {
+                'comment_id': 'c2',
+                'group_id': 2,
+                'pa': 0.6, 'pd': 0.4
+            },
+            {
+                'comment_id': 'c3',
+                'group_id': 2,
+                'pa': 0.9, 'pd': 0.1
+            }
+        ]
+
+        # Combine stats
+        all_stats = group1_stats + group2_stats
+
+        consensus = select_consensus_comments(all_stats)
+
+        # Comments with high agreement across all groups should be consensus
+        assert len(consensus) > 0
+
+        # Verify comment IDs in consensus list - both c1 and c2 have high agreement
+        consensus_ids = [c['comment_id'] for c in consensus]
+
+        # At least one of these should be in the consensus
+        assert 'c1' in consensus_ids or 'c2' in consensus_ids
+
+        # NOTE: The implementation sorts by average agreement across groups;
+        # c3 has the highest average agreement (0.9) but appears in only one group,
+        # so it may legitimately be selected as well. We only assert on the labels below.
+
+        # Check all consensus comments have the correct label
+        for comment in consensus:
+            assert comment['repful'] == 'consensus'
+
+
+class TestIntegration:
+    """Integration tests for the representativeness module."""
+
+    def test_conv_repness(self):
+        """Test the main representativeness calculation function."""
+        # Create a test vote matrix
+        vote_data = np.array([
+            [1, 1, -1, None],   # Participant 1
+            [1, 1, -1, 1],      # Participant 2
+            [-1, -1, 1, -1],    # Participant 3
+            [-1, -1, 1, 1]      # Participant 4
+        ])
+
+        row_names = ['p1', 'p2', 'p3', 'p4']
+        col_names = ['c1', 'c2', 'c3', 'c4']
+
+        vote_matrix = pd.DataFrame(vote_data, index=row_names, columns=col_names)
+
+        # Create group clusters
+        group_clusters = [
+            {'id': 1, 'members': ['p1', 'p2']},  # Group 1: mostly agrees with c1, c2
+            {'id': 2, 'members': ['p3', 'p4']}   # Group 2: mostly agrees with c3
+        ]
+
+        # Calculate representativeness
+        repness_result = conv_repness(vote_matrix, group_clusters)
+
+        # Check result structure
+        assert 'comment_ids' in repness_result
+        assert 'group_repness' in repness_result
+        assert 'consensus_comments' in repness_result
+
+        # Check group repness
+        assert 1 in repness_result['group_repness']
+        assert 2 in repness_result['group_repness']
+
+        # Group 1 should identify c1/c2 as representative
+        group1_rep_ids = [s['comment_id'] for s in repness_result['group_repness'][1]]
+        assert 'c1' in group1_rep_ids or 'c2' in group1_rep_ids
+
+        # Group 2 should identify c3 as representative
+        group2_rep_ids = [s['comment_id'] for s in repness_result['group_repness'][2]]
+        assert 'c3' in group2_rep_ids
+
+    def test_participant_stats(self):
+        """Test participant statistics calculation."""
+        # Create a test vote matrix
+        vote_data = np.array([
+            [1, 1, -1, None],   # Participant 1
+            [1, 1, -1, 1],      # Participant 2
+            [-1, -1, 1, -1],    # Participant 3
+            [-1, -1, 1, 1]      # Participant 4
+        ])
+
+        row_names = ['p1', 'p2', 'p3', 'p4']
+        col_names = ['c1', 'c2', 'c3', 'c4']
+
+        vote_matrix = pd.DataFrame(vote_data, index=row_names, columns=col_names)
+
+        # Create group clusters
+        group_clusters = [
+            {'id': 1, 'members': ['p1', 'p2']},
+            {'id': 2, 'members': ['p3', 'p4']}
+        ]
+
+        # Calculate participant stats
+        ptpt_stats = participant_stats(vote_matrix, group_clusters)
+
+        # Check result structure
+        assert 'participant_ids' in ptpt_stats
+        assert 'stats' in ptpt_stats
+
+        # Check participant stats
+        for ptpt_id in row_names:
+            assert ptpt_id in ptpt_stats['stats']
+            stats = ptpt_stats['stats'][ptpt_id]
+
+            assert 'n_agree' in stats
+            assert 'n_disagree' in stats
+            assert 'n_votes' in stats
+            assert 'group' in stats
+            assert 'group_correlations' in stats
+
+        # Check specific stats
+        p1_stats = ptpt_stats['stats']['p1']
+        assert p1_stats['n_agree'] == 2
+        assert p1_stats['n_disagree'] == 1
+        assert p1_stats['group'] == 1
diff --git a/delphi/tests/test_repness_unit.py b/delphi/tests/test_repness_unit.py
index 4dd111b525..69da767264 100644
--- a/delphi/tests/test_repness_unit.py
+++ b/delphi/tests/test_repness_unit.py
@@ -17,7 +17,9 @@
     comment_stats, add_comparative_stats, repness_metric, finalize_cmt_stats,
     passes_by_test, best_agree, best_disagree, select_rep_comments,
     calculate_kl_divergence, select_consensus_comments, conv_repness,
-    participant_stats
+    participant_stats,
+    # DataFrame-native vectorized functions
+    prop_test_vectorized, two_prop_test_vectorized, compute_group_comment_stats_df
 )
@@ -537,4 +539,174 @@ def test_participant_stats(self):
         p1_stats = ptpt_stats['stats']['p1']
         assert p1_stats['n_agree'] == 2
         assert p1_stats['n_disagree'] == 1
-        assert p1_stats['group'] == 1
\ No newline at end of file
+        assert p1_stats['group'] == 1
+
+
+class TestVectorizedFunctions:
+    """Tests for DataFrame-native vectorized functions."""
+
+    def test_prop_test_vectorized(self):
+        """Test vectorized one-proportion z-test."""
+        p = pd.Series([0.7, 0.2, 0.5])
+        n = pd.Series([100, 50, 100])
+
+        result = prop_test_vectorized(p, n, 0.5)
+
+        # Compare with scalar version
+        assert np.isclose(result.iloc[0], prop_test(0.7, 100, 0.5), atol=0.01)
+        assert np.isclose(result.iloc[1], prop_test(0.2, 50, 0.5), atol=0.01)
+        assert np.isclose(result.iloc[2], prop_test(0.5, 100, 0.5), atol=0.01)
+
+    def test_prop_test_vectorized_edge_cases(self):
+        """Test vectorized prop test handles edge cases."""
+        p = pd.Series([0.5, 0.7])
+        n = pd.Series([0, 100])  # n=0 should return 0
+
+        result = prop_test_vectorized(p, n, 0.5)
+
+        assert result.iloc[0] == 0.0  # n=0 case
+        assert not np.isnan(result.iloc[1])  # normal case
+
+    def test_two_prop_test_vectorized(self):
+        """Test vectorized two-proportion z-test."""
+        p1 = pd.Series([0.7, 0.2])
+        n1 = pd.Series([100, 50])
+        p2 = pd.Series([0.5, 0.3])
+        n2 = pd.Series([100, 50])
+
+        result = two_prop_test_vectorized(p1, n1, p2, n2)
+
+        # Compare with scalar version
+        assert np.isclose(result.iloc[0], two_prop_test(0.7, 100, 0.5, 100), atol=0.01)
+        assert np.isclose(result.iloc[1], two_prop_test(0.2, 50, 0.3, 50), atol=0.01)
+
+    def test_two_prop_test_vectorized_edge_cases(self):
+        """Test vectorized two-prop test handles edge cases."""
+        p1 = pd.Series([0.5, 0.7])
+        n1 = pd.Series([0, 100])  # n1=0 should return 0
+        p2 = pd.Series([0.5, 0.5])
+        n2 = pd.Series([100, 0])  # n2=0 should return 0
+
+        result = two_prop_test_vectorized(p1, n1, p2, n2)
+
+        assert result.iloc[0] == 0.0  # n1=0 case
+        assert result.iloc[1] == 0.0  # n2=0 case
+
+    def test_compute_group_comment_stats_df(self):
+        """Test vectorized computation of group/comment statistics."""
+        # Create test data in long format
+        votes_long = pd.DataFrame({
+            'participant': ['p1', 'p1', 'p2', 'p2', 'p3', 'p3', 'p4', 'p4'],
+            'comment': ['c1', 'c2', 'c1', 'c2', 'c1', 'c2', 'c1', 'c2'],
+            'vote': [1, 1, 1, 1, -1, -1, -1, -1]  # AGREE=1, DISAGREE=-1
+        })
+
+        group_clusters = [
+            {'id': 1, 'members': ['p1', 'p2']},  # Group 1: agrees with c1, c2
+            {'id': 2, 'members': ['p3', 'p4']}   # Group 2: disagrees with c1, c2
+        ]
+
+        stats_df = compute_group_comment_stats_df(votes_long, group_clusters)
+
+        # Check that we have stats for all group/comment combinations
+        assert len(stats_df) == 4  # 2 groups x 2 comments
+
+        # Check group 1, comment c1 stats
+        g1_c1 = stats_df.loc[(1, 'c1')]
+        assert g1_c1['na'] == 2  # 2 agrees
+        assert g1_c1['nd'] == 0  # 0 disagrees
+        assert g1_c1['ns'] == 2  # 2 total votes
+
+        # Check group 2, comment c1 stats
+        g2_c1 = stats_df.loc[(2, 'c1')]
+        assert g2_c1['na'] == 0  # 0 agrees
+        assert g2_c1['nd'] == 2  # 2 disagrees
+        assert g2_c1['ns'] == 2  # 2 total votes
+
+        # Check that probabilities are computed
+        assert 'pa' in stats_df.columns
+        assert 'pd' in stats_df.columns
+        assert 'pat' in stats_df.columns
+        assert 'pdt' in stats_df.columns
+
+        # Check that comparative stats are computed
+        assert 'ra' in stats_df.columns
+        assert 'rd' in stats_df.columns
+        assert 'rat' in stats_df.columns
+        assert 'rdt' in stats_df.columns
+
+        # Check that metrics are computed
+        assert 'agree_metric' in stats_df.columns
+        assert 'disagree_metric' in stats_df.columns
+        assert 'repful' in stats_df.columns
+
+    def test_compute_group_comment_stats_df_with_nan_votes(self):
+        """Test that NaN votes are properly excluded."""
+        # Create test data with some NaN votes
+        votes_long = pd.DataFrame({
+            'participant': ['p1', 'p1', 'p2', 'p2', 'p3', 'p3'],
+            'comment': ['c1', 'c2', 'c1', 'c2', 'c1', 'c2'],
+            'vote': [1, np.nan, 1, 1, -1, -1]  # p1 didn't vote on c2
+        })
+
+        group_clusters = [
+            {'id': 1, 'members': ['p1', 'p2']},
+            {'id': 2, 'members': ['p3']}
+        ]
+
+        stats_df = compute_group_comment_stats_df(votes_long, group_clusters)
+
+        # Group 1, c2 should have only 1 vote (from p2, since p1's NaN is dropped)
+        g1_c2 = stats_df.loc[(1, 'c2')]
+        assert g1_c2['ns'] == 1  # Only p2's vote counted
+
+    def test_compute_group_comment_stats_df_empty(self):
+        """Test handling of empty input."""
+        votes_long = pd.DataFrame(columns=['participant', 'comment', 'vote'])
+        group_clusters = [{'id': 1, 'members': ['p1']}]
+
+        stats_df = compute_group_comment_stats_df(votes_long, group_clusters)
+
+        assert stats_df.empty
+
+    def test_compute_group_comment_stats_matches_scalar(self):
+        """Test that vectorized results match scalar function results."""
+        # Create test data
+        vote_data = np.array([
+            [1, 1, -1],   # p1
+            [1, -1, 1],   # p2
+            [-1, 1, -1],  # p3
+            [-1, -1, 1]   # p4
+        ], dtype=float)
+
+        row_names = ['p1', 'p2', 'p3', 'p4']
+        col_names = ['c1', 'c2', 'c3']
+        vote_matrix = pd.DataFrame(vote_data, index=row_names, columns=col_names)
+
+        group_clusters = [
+            {'id': 1, 'members': ['p1', 'p2']},
+            {'id': 2, 'members': ['p3', 'p4']}
+        ]
+
+        # Get vectorized results
+        votes_long = vote_matrix.melt(
+            ignore_index=False,
+            var_name='comment',
+            value_name='vote'
+        ).reset_index(names='participant')
+        stats_df = compute_group_comment_stats_df(votes_long, group_clusters)
+
+        # Compare with scalar results from conv_repness
+        repness_result = conv_repness(vote_matrix, group_clusters)
+
+        # Check that we have the same number of comment_repness entries
+        assert len(repness_result['comment_repness']) == len(stats_df)
+
+        # Check a few specific values
+        for entry in repness_result['comment_repness']:
+            gid = entry['gid']
+            tid = entry['tid']
+            df_row = stats_df.loc[(gid, tid)]
+
+            assert np.isclose(entry['pa'], df_row['pa'], atol=1e-10)
+            assert np.isclose(entry['pd'], df_row['pd'], atol=1e-10)
\ No newline at end of file
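
Reviewer note: the vectorized tests above pin down a contract (element-wise z statistics that fall back to 0.0 when a group has no votes) rather than a specific formula. A minimal sketch of a one-proportion variant consistent with those assertions, assuming a standard z statistic against p0; the actual prop_test_vectorized in polismath.pca_kmeans_rep.repness may differ (for example, by using pseudocount smoothing):

import numpy as np
import pandas as pd

def prop_test_vectorized_sketch(p: pd.Series, n: pd.Series, p0: float = 0.5) -> pd.Series:
    # Hypothetical sketch, not the project's implementation: element-wise
    # z = (p - p0) / sqrt(p0 * (1 - p0) / n), with 0.0 wherever n == 0.
    n = n.astype(float)
    safe_n = n.replace(0.0, np.nan)  # avoid division by zero; filled below
    z = (p - p0) / np.sqrt(p0 * (1.0 - p0) / safe_n)
    return z.fillna(0.0)  # n == 0 rows become 0.0, matching the edge-case tests

# Example: prop_test_vectorized_sketch(pd.Series([0.7, 0.5]), pd.Series([100, 0])) -> [4.0, 0.0]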