-
-
Notifications
You must be signed in to change notification settings - Fork 274
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[aggr-] allow ranking rows by key column #2417
base: develop
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -3,8 +3,9 @@ | |
import functools | ||
import collections | ||
import statistics | ||
import itertools | ||
|
||
from visidata import Progress, Sheet, Column, ColumnsSheet, VisiData | ||
from visidata import Progress, Sheet, Column, ColumnsSheet, VisiData, SettableColumn | ||
from visidata import vd, anytype, vlen, asyncthread, wrapply, AttrDict, date | ||
|
||
vd.help_aggregators = '''# Choose Aggregators | ||
|
@@ -142,11 +143,39 @@ def __init__(self, pct, helpstr=''): | |
def aggregate(self, col, rows): | ||
return _percentile(sorted(col.getValues(rows)), self.pct/100, key=float) | ||
|
||
class RankAggregator(Aggregator):
    '''Aggregator whose result is a list of ranks (one per row) rather than a single value.

    Ranks are computed by the module-level rank() from the key columns of the
    sheet that owns the aggregated column.  No __init__ is defined here: the
    inherited Aggregator constructor is used unchanged.
    '''

    def aggregate(self, col, rows):
        '''Return the list of ranks for *rows*, ranked by the key columns of col.sheet.

        Reports an error through vd.error() when the sheet has no key columns.
        '''
        if not col.sheet.keyCols:
            # NOTE(review): open question from code review -- could the row number be
            # used as the ranking when there are no key columns, so this check
            # could be dropped?  Not verified here.
            vd.error('ranking requires one or more key columns')
            # presumably vd.error() raises, which would make this line unreachable -- TODO confirm
            return None
        return rank(col.sheet, rows)
|
||
def quantiles(q, helpstr):
    '''Return a list of q-1 PercentileAggregators, one for each cut point i/q (i = 1 .. q-1).

    Each cut point is expressed as a percentage rounded to the nearest integer,
    and every aggregator is given the same *helpstr*.
    '''
    aggs = []
    for step in range(1, q):
        pct = round(100*step/q)
        aggs.append(PercentileAggregator(pct, helpstr))
    return aggs
|
||
|
||
def rank(sheet, rows):
    '''Return a list of ranks, one per row of *rows*, in the same order as *rows*.

    Rows are compared by sheet.rowkey(row).  Ranks start at 1; rows whose keys
    compare equal share a rank, and each distinct key gets the next consecutive
    rank in ascending key order.
    '''
    keys = [sheet.rowkey(r) for r in rows]
    nkeys = len(keys)
    ranks = [0] * nkeys

    # A single Progress covers the whole job: separate meters run one after
    # another would each restart the displayed progress.  The total is one tick
    # per key evaluated by the sort plus one tick per row assigned a rank, and
    # is based on the rows actually given rather than on the whole sheet.
    with Progress(gerund='ranking', total=2*nkeys) as prog:
        def _key_at(i):
            prog.addProgress(1)
            return keys[i]

        # sorted() is stable, so rows with equal keys keep their input order
        order = sorted(range(nkeys), key=_key_at)

        current = 0
        group_key = None
        for i in order:
            prog.addProgress(1)
            # start a new rank whenever the key differs from the current group's key
            if current == 0 or not (keys[i] == group_key):
                current += 1
                group_key = keys[i]
            ranks[i] = current  # write straight into the row's original position

    return ranks
|
||
vd.aggregator('min', min, 'minimum value') | ||
vd.aggregator('max', max, 'maximum value') | ||
vd.aggregator('avg', mean, 'arithmetic mean of values', type=float) | ||
|
@@ -158,6 +187,7 @@ def quantiles(q, helpstr): | |
vd.aggregator('count', lambda values: sum(1 for v in values), 'number of values', type=int) | ||
vd.aggregator('list', list, 'list of values', type=anytype) | ||
vd.aggregator('stdev', statistics.stdev, 'standard deviation of values', type=float) | ||
vd.aggregators['rank'] = RankAggregator('rank', anytype, helpstr='list of ranks after grouping by key columns') | ||
|
||
vd.aggregators['q3'] = quantiles(3, 'tertiles (33/66th pctile)') | ||
vd.aggregators['q4'] = quantiles(4, 'quartiles (25/50/75th pctile)') | ||
|
@@ -231,7 +261,8 @@ def memo_aggregate(col, agg_choices, rows): | |
for agg in aggs: | ||
aggval = agg.aggregate(col, rows) | ||
typedval = wrapply(agg.type or col.type, aggval) | ||
dispval = col.format(typedval) | ||
# limit width to limit formatting time when typedval is a long list | ||
dispval = col.format(typedval, width=1000) | ||
k = col.name+'_'+agg.name | ||
vd.status(f'{k}={dispval}') | ||
vd.memory[k] = typedval | ||
|
@@ -276,10 +307,21 @@ def _fmt_aggr_summary(match, row, trigger_key): | |
vd.warning(f'aggregator does not exist: {aggr}') | ||
return aggrs | ||
|
||
@Sheet.api
def addcol_list_aggr(sheet, col, list_agg):
    '''Add a new int-typed column at the cursor, filled from the aggregator named *list_agg*.

    The aggregator is looked up in vd.aggregators and applied to *col* over all
    rows of col.sheet; its result is unpacked as the values for the rows of
    *sheet*.  The new column is named "<sheet name>_<list_agg>".
    Calls vd.fail() when *sheet* has no key columns.
    '''
    # NOTE(review): the message below is specific to ranking although list_agg is a parameter
    if not sheet.keyCols:
        vd.fail('ranking requires one or more key columns')

    newcol = SettableColumn(name=f'{sheet.name}_{list_agg}', type=int)

    # compute every value before the column is attached to the sheet
    aggregator = vd.aggregators[list_agg]
    values = aggregator.aggregate(col, col.sheet.rows)

    sheet.addColumnAtCursor(newcol)
    # assumes the aggregator returned an iterable of per-row values -- TODO confirm for non-list aggregators
    newcol.setValues(sheet.rows, *values)
|
||
Sheet.addCommand('+', 'aggregate-col', 'addAggregators([cursorCol], chooseAggregators())', 'Add aggregator to current column') | ||
Sheet.addCommand('z+', 'memo-aggregate', 'cursorCol.memo_aggregate(chooseAggregators(), selectedRows or rows)', 'memo result of aggregator over values in selected rows for current column') | ||
ColumnsSheet.addCommand('g+', 'aggregate-cols', 'addAggregators(selectedRows or source[0].nonKeyVisibleCols, chooseAggregators())', 'add aggregators to selected source columns') | ||
Sheet.addCommand('', 'addcol-rank', 'addcol_list_aggr(cursorCol, "rank")', 'create new column ranking rows by their key columns') | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Is this specific to "rank"? What happens if we apply a different list aggregator? Should this take an input? (The default value could be "rank" to make it the easiest option.) Also, what happens with non-list aggregators? This could be an instant |
||
|
||
vd.addMenuItems(''' | ||
Column > Add aggregator > aggregate-col | ||
Column > Add column > rank > addcol-rank | ||
''') |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
A pass-through `__init__` is unnecessary and will happen by default.