Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[aggr-] allow ranking rows by key column #2417

Open
wants to merge 3 commits into
base: develop
Choose a base branch
from
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
46 changes: 44 additions & 2 deletions visidata/aggregators.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,8 +3,9 @@
import functools
import collections
import statistics
import itertools

from visidata import Progress, Sheet, Column, ColumnsSheet, VisiData
from visidata import Progress, Sheet, Column, ColumnsSheet, VisiData, SettableColumn
from visidata import vd, anytype, vlen, asyncthread, wrapply, AttrDict, date

vd.help_aggregators = '''# Choose Aggregators
Expand Down Expand Up @@ -142,11 +143,39 @@ def __init__(self, pct, helpstr=''):
def aggregate(self, col, rows):
    'Return the result of _percentile over the sorted values of *col* for *rows*, at fraction self.pct/100.'
    ordered_values = sorted(col.getValues(rows))
    fraction = self.pct/100  # self.pct is divided by 100 before being handed to _percentile
    return _percentile(ordered_values, fraction, key=float)

class RankAggregator(Aggregator):
    '''Aggregator whose result is the list returned by rank() for the given rows.

    No __init__ is defined here: a pass-through constructor is unnecessary,
    since the inherited Aggregator constructor is used by default.
    '''

    def aggregate(self, col, rows):
        '''Return rank(col.sheet, rows).

        *col* is only used to reach its sheet; *rows* are the rows to rank.
        Reports an error via vd.error() if the sheet has no key columns.
        '''
        # NOTE(review): the project owner asked whether key columns are really
        # required, or whether row number could be used when there are none,
        # and whether removing this check "just works" -- still an open question.
        if not col.sheet.keyCols:
            vd.error('ranking requires one or more key columns')
            return None  # presumably only reached if vd.error() returns rather than raising -- TODO confirm
        return rank(col.sheet, rows)

def quantiles(q, helpstr):
    'Return a list of q-1 PercentileAggregators at percentiles round(100*i/q) for i in 1..q-1, all sharing *helpstr*.'
    aggregators = []
    for step in range(1, q):
        pct = round(100*step/q)
        aggregators.append(PercentileAggregator(pct, helpstr))
    return aggregators


def rank(sheet, rows):
    '''Return a list of 1-based ranks, one per row in *rows*, in the same order as *rows*.

    Rows are ordered by sheet.rowkey(row).  Rows with equal keys share a rank,
    and each successive distinct key gets the next integer (dense ranking).
    Returns an empty list when *rows* is empty.
    '''
    # pair every key with its original position, so ranks can be written back in input order
    keyed = [(sheet.rowkey(row), i) for i, row in enumerate(rows)]
    nrows = len(keyed)
    ranks = [None] * nrows

    # One Progress covers all three passes (sort, group, place).  Per review:
    # using separate Progress objects in serial resets the progress meter, so
    # a single one with a total of 3 steps per row is used instead.  The total
    # is based on the rows actually given, which may be fewer than sheet.nRows.
    with Progress(gerund='ranking', total=3*nrows) as prog:
        def _counted(val):
            'Identity function that ticks the progress meter once per call.'
            prog.addProgress(1)
            return val

        # pass 1: sort by (rowkey, original index); the key function is called once per element
        keyed.sort(key=_counted)

        # pass 2: group adjacent equal rowkeys; each group is one rank
        groups = itertools.groupby(keyed, key=lambda pair: _counted(pair[0]))

        # pass 3: place each rank at its row's original position (no second sort needed)
        for rank_num, (_, grp) in enumerate(groups, 1):
            for _, i in grp:
                ranks[i] = rank_num
                prog.addProgress(1)

    return ranks

# Register aggregators by name; arguments are (name, function, help string), with an optional result type.
vd.aggregator('min', min, 'minimum value')
vd.aggregator('max', max, 'maximum value')
vd.aggregator('avg', mean, 'arithmetic mean of values', type=float)
Expand All @@ -158,6 +187,7 @@ def quantiles(q, helpstr):
vd.aggregator('count', lambda values: sum(1 for v in values), 'number of values', type=int)
vd.aggregator('list', list, 'list of values', type=anytype)
vd.aggregator('stdev', statistics.stdev, 'standard deviation of values', type=float)
# 'rank' is stored directly in vd.aggregators as a RankAggregator instance, rather than registered through vd.aggregator()
vd.aggregators['rank'] = RankAggregator('rank', anytype, helpstr='list of ranks after grouping by key columns')

# quantiles() returns a list, so each of these entries holds several PercentileAggregators
vd.aggregators['q3'] = quantiles(3, 'tertiles (33/66th pctile)')
vd.aggregators['q4'] = quantiles(4, 'quartiles (25/50/75th pctile)')
Expand Down Expand Up @@ -231,7 +261,8 @@ def memo_aggregate(col, agg_choices, rows):
for agg in aggs:
aggval = agg.aggregate(col, rows)
typedval = wrapply(agg.type or col.type, aggval)
dispval = col.format(typedval)
# limit width to limit formatting time when typedval is a long list
dispval = col.format(typedval, width=1000)
k = col.name+'_'+agg.name
vd.status(f'{k}={dispval}')
vd.memory[k] = typedval
Expand Down Expand Up @@ -276,10 +307,21 @@ def _fmt_aggr_summary(match, row, trigger_key):
vd.warning(f'aggregator does not exist: {aggr}')
return aggrs

@Sheet.api
def addcol_list_aggr(sheet, col, list_agg):
    '''Add an int-typed column at the cursor, filled from the aggregator named *list_agg*.

    The aggregator is looked up in vd.aggregators and applied to *col* over
    all rows of col.sheet; its result is unpacked as the values for the new
    column.  Fails via vd.fail() if *sheet* has no key columns.
    '''
    if not sheet.keyCols:
        vd.fail('ranking requires one or more key columns')

    new_col = SettableColumn(name=f'{sheet.name}_{list_agg}', type=int)
    aggregator = vd.aggregators[list_agg]
    per_row_values = aggregator.aggregate(col, col.sheet.rows)

    # the column is added to the sheet before its values are set
    sheet.addColumnAtCursor(new_col)
    new_col.setValues(sheet.rows, *per_row_values)

# Aggregator commands.  The first argument of 'addcol-rank' is an empty string where the others have '+', 'z+' and 'g+'.
Sheet.addCommand('+', 'aggregate-col', 'addAggregators([cursorCol], chooseAggregators())', 'Add aggregator to current column')
Sheet.addCommand('z+', 'memo-aggregate', 'cursorCol.memo_aggregate(chooseAggregators(), selectedRows or rows)', 'memo result of aggregator over values in selected rows for current column')
ColumnsSheet.addCommand('g+', 'aggregate-cols', 'addAggregators(selectedRows or source[0].nonKeyVisibleCols, chooseAggregators())', 'add aggregators to selected source columns')
Sheet.addCommand('', 'addcol-rank', 'addcol_list_aggr(cursorCol, "rank")', 'create new column ranking rows by their key columns')
Copy link
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is this specific to "rank"? What happens if we apply a different list aggregator? Should this take an input? (the default value could be "rank" to make it the easiest option).

Also what happens with non-list aggregators? This could be an instant SUM(curcol) GROUP BY keycols which I think would be a major hit!


# Each line ends with a command name registered via addCommand ('aggregate-col', 'addcol-rank');
# the preceding '>'-separated parts are presumably the menu path -- TODO confirm
vd.addMenuItems('''
Column > Add aggregator > aggregate-col
Column > Add column > rank > addcol-rank
''')
Loading