1
Fork 0
mirror of https://github.com/RGBCube/nu_scripts synced 2025-07-31 14:17:45 +00:00

[stdlib-candidate] add aggregate (#991)

# Description

Add `aggregate`, a command that operates on the output of `group-by
--to-table` and runs aggregate operations on it, for quick inspection of data.

# Related
- https://github.com/nushell/nushell/pull/14316#issuecomment-2511395679
- https://github.com/nushell/nushell/issues/2607
- https://github.com/nushell/nushell/pull/14337

# Examples

```nushell
open ~/Downloads/movies.csv
  | group-by Lead_Studio Genre --to-table
  | aggregate Worldwide_Gross
  # | first 4
  # | to md
```


|Lead_Studio|Genre|count|Worldwide_Gross_min|Worldwide_Gross_avg|Worldwide_Gross_max|Worldwide_Gross_sum|
|-|-|-|-|-|-|-|
|The Weinstein Company|Comedy|1|19.62|19.62|19.62|19.62|
|The Weinstein Company|Drama|1|8.26|8.26|8.26|8.26|
|Independent|Comedy|7|14.31|57.01|205.3|399.07|
|Independent|Romance|7|0.03|149.82142857142858|702.17|1048.75|

---

```nushell
open ~/Downloads/movies.csv
  | group-by Lead_Studio Genre --to-table
  | aggregate Worldwide_Gross --ops {avg: {math avg}, std: {math stddev}}
  # | first 4
  # | to md
```

|Lead_Studio|Genre|count|Worldwide_Gross_avg|Worldwide_Gross_std|
|-|-|-|-|-|
|The Weinstein Company|Comedy|1|19.62|0|
|The Weinstein Company|Drama|1|8.26|0|
|Independent|Comedy|7|57.01|66.1709932134704|
|Independent|Romance|7|149.82142857142858|229.79475832816996|

---

```nushell
open ~/Downloads/movies.csv
  | group-by Lead_Studio Genre --to-table
  | aggregate Worldwide_Gross Audience_score_% --ops {avg: {math avg}}
  # | first 4
  # | to md
```

|Lead_Studio|Genre|count|Worldwide_Gross_avg|Audience_score_%_avg|
|-|-|-|-|-|
|The Weinstein Company|Comedy|1|19.62|52|
|The Weinstein Company|Drama|1|8.26|84|
|Independent|Comedy|7|57.01|60.142857142857146|
|Independent|Romance|7|149.82142857142858|59.857142857142854|
This commit is contained in:
Bahex 2024-12-31 22:07:35 +03:00 committed by GitHub
parent a83a40dff0
commit 8db6af6376
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
2 changed files with 321 additions and 0 deletions

View file

@ -0,0 +1,126 @@
# Operations applied by `aggregate` when no `--ops` record is supplied.
#
# Returns a record that maps an operation name to a closure; each closure
# reads its values from the pipeline input.
def aggregate-default-ops [] {
    {
        min: {|| math min }
        avg: {|| math avg }
        max: {|| math max }
        sum: {|| math sum }
    }
}
# Build the name of an output column: the members of `col` joined with ".",
# followed by "_" and the operation name.
def aggregate-col-name [col: cell-path, op_name: string]: [nothing -> string] {
    let dotted = $col | split cell-path | get value | str join "."
    $"($dotted)_($op_name)"
}
# Fetch `col` from the input table, replacing any failure of `get` with a
# "Cannot find column" error labelled with the caller-provided span.
#
# - `col`: cell path to read from the input.
# - `opts.span`: span attached to the error label.
# - `opts.items`: when true, the reported path is prefixed with `items`.
def get-item-with-error [
    col: cell-path,
    opts: record<span: record<start: int, end: int>, items: bool>
]: [table -> any] {
    # `try` has to stay the first command so that `get` receives the pipeline input.
    try {
        get $col
    } catch {
        let reported_path = if not $opts.items {
            $col
        } else {
            $col | split cell-path | prepend {value: items, optional: false} | into cell-path
        }

        error make {
            msg: $"Cannot find column '($reported_path)'",
            label: {
                span: $opts.span,
                text: "value originates here"
            },
        }
    }
}
# Raise the "input must be a table" error, labelled with `span` and carrying
# a hint about the `--to-table` flag of `group-by`.
def "error not-a-table" [span: record<start: int, end:int>] {
    error make {
        msg: "input must be a table",
        help: "Are you using `group-by`? Make sure to use its `--to-table` flag.",
        label: {
            span: $span,
            text: "from here"
        }
    }
}
# Run aggregate operations on output of `group-by --to-table`.
#
# Every requested column of every group is piped through each operation in
# `--ops`, and each result is stored in a column named `<column>_<operation>`.
# A `count` column holding the number of items in the group is always added.
# Input whose first row has no `items` column is treated as one single group.
#
# # Example
#
# - group files by type and extension, and get stats about their sizes
# ```nushell
# >_ ls | group-by type { get name | path parse | get extension } --to-table | aggregate size
# ```
#
# - group data by multiple columns, and run custom aggregate operations
# ```nushell
# >_ open movies.csv
# | group-by Lead_Studio Genre --to-table
# | aggregate Worldwide_Gross Profitability --ops {avg: {math avg}, std: {math stddev}}
# ```
#
# - run aggregate operations without grouping the input
# ```nushell
# >_ open movies.csv | aggregate Year
# ```
export def main [
--ops: record, # default = {min: {math min}, avg: {math avg}, max: {math max}, sum: {math sum}}
...columns: cell-path, # columns to perform aggregations on
]: [
table -> table<count: int>
] {
# Keep the pipeline input, and its metadata so that errors can be labelled with its span.
let IN = $in
let md = metadata $in
# Reject input that has no first element, or whose first element is not a record.
let first = try { $IN | first } catch { error not-a-table $md.span }
if not (($first | describe) starts-with record) {
error not-a-table $md.span
}
# The input counts as grouped when its first row has an `items` column;
# otherwise the whole input is wrapped as the `items` of a single group.
let grouped = "items" in $first
let IN = if $grouped {
$IN
} else {
[{items: $IN}]
}
# Fall back to the default operations when `--ops` is not given.
let agg_ops = $ops | default (aggregate-default-ops)
# Replace the `items` of each group with a record of aggregate results.
let results = $IN
| update items {|group|
let column_results = $columns
| each {|col| # col: cell-path
let column = $group.items | get-item-with-error $col {span: $md.span, items: $grouped}
$agg_ops | items {|op_name, op| # op_name: string, op: closure
$column | do $op | wrap (aggregate-col-name $col $op_name)
}
| reduce {|it| merge $it}
}
# Manually propagate errors: a result that describes as `error` is returned as-is.
for r in $column_results {
if ($r | describe) == error {
return $r
}
}
# Merge the per-column records into one and add the size of the group.
$column_results
| reduce --fold {} {|it| merge $it}
| insert count ($group.items | length)
| roll right # put count as the first column
}
# Manually propagate errors: an `items` value that describes as `error` is returned as-is.
for r in $results {
if ($r.items | describe) == error {
return $r.items
}
}
# Flatten the `items` records into the output rows.
$results | flatten items
}

View file

@ -0,0 +1,195 @@
use std/assert
use ../std-rfc/aggregate
# Fixture: table of movies used as input by the tests in this file.
#
# NOTE: the Genre column contains misspelled and differently-cased values
# (`Romence`, `Comdy`, `romance`, `comedy`) and a few rows appear twice.
# The expected tables in the tests list those genres as separate groups, so
# do not "fix" this data without updating the expectations as well.
const movies = [
[ Film, Genre, Lead_Studio, Audience_score_%, Profitability, Rotten_Tomatoes_%, Worldwide_Gross, Year ];
[ "Youth in Revolt", Comedy, "The Weinstein Company", 52, 1.09, 68, 19.62, 2010 ],
[ "You Will Meet a Tall Dark Stranger", Comedy, Independent, 35, 1.211818182, 43, 26.66, 2010 ],
[ "When in Rome", Comedy, Disney, 44, 0, 15, 43.04, 2010 ],
[ "What Happens in Vegas", Comedy, Fox, 72, 6.267647029, 28, 219.37, 2008 ],
[ "Water For Elephants", Drama, "20th Century Fox", 72, 3.081421053, 60, 117.09, 2011 ],
[ WALL-E, Animation, Disney, 89, 2.896019067, 96, 521.28, 2008 ],
[ Waitress, Romance, Independent, 67, 11.0897415, 89, 22.18, 2007 ],
[ "Waiting For Forever", Romance, Independent, 53, 0.005, 6, 0.03, 2011 ],
[ "Valentine's Day", Comedy, "Warner Bros.", 54, 4.184038462, 17, 217.57, 2010 ],
[ "Tyler Perry's Why Did I get Married", Romance, Independent, 47, 3.7241924, 46, 55.86, 2007 ],
[ "Twilight: Breaking Dawn", Romance, Independent, 68, 6.383363636, 26, 702.17, 2011 ],
[ Twilight, Romance, Summit, 82, 10.18002703, 49, 376.66, 2008 ],
[ "The Ugly Truth", Comedy, Independent, 68, 5.402631579, 14, 205.3, 2009 ],
[ "The Twilight Saga: New Moon", Drama, Summit, 78, 14.1964, 27, 709.82, 2009 ],
[ "The Time Traveler's Wife", Drama, Paramount, 65, 2.598205128, 38, 101.33, 2009 ],
[ "The Proposal", Comedy, Disney, 74, 7.8675, 43, 314.7, 2009 ],
[ "The Invention of Lying", Comedy, "Warner Bros.", 47, 1.751351351, 56, 32.4, 2009 ],
[ "The Heartbreak Kid", Comedy, Paramount, 41, 2.129444167, 30, 127.77, 2007 ],
[ "The Duchess", Drama, Paramount, 68, 3.207850222, 60, 43.31, 2008 ],
[ "The Curious Case of Benjamin Button", Fantasy, "Warner Bros.", 81, 1.78394375, 73, 285.43, 2008 ],
[ "The Back-up Plan", Comedy, CBS, 47, 2.202571429, 20, 77.09, 2010 ],
[ Tangled, Animation, Disney, 88, 1.365692308, 89, 355.01, 2010 ],
[ "Something Borrowed", Romance, Independent, 48, 1.719514286, 15, 60.18, 2011 ],
[ "She's Out of My League", Comedy, Paramount, 60, 2.4405, 57, 48.81, 2010 ],
[ "Sex and the City Two", Comedy, "Warner Bros.", 49, 2.8835, 15, 288.35, 2010 ],
[ "Sex and the City 2", Comedy, "Warner Bros.", 49, 2.8835, 15, 288.35, 2010 ],
[ "Sex and the City", Comedy, "Warner Bros.", 81, 7.221795791, 49, 415.25, 2008 ],
[ "Remember Me", Drama, Summit, 70, 3.49125, 28, 55.86, 2010 ],
[ "Rachel Getting Married", Drama, Independent, 61, 1.384166667, 85, 16.61, 2008 ],
[ Penelope, Comedy, Summit, 74, 1.382799733, 52, 20.74, 2008 ],
[ "P.S. I Love You", Romance, Independent, 82, 5.103116833, 21, 153.09, 2007 ],
[ "Over Her Dead Body", Comedy, "New Line", 47, 2.071, 15, 20.71, 2008 ],
[ "Our Family Wedding", Comedy, Independent, 49, 0, 14, 21.37, 2010 ],
[ "One Day", Romance, Independent, 54, 3.682733333, 37, 55.24, 2011 ],
[ "Not Easily Broken", Drama, Independent, 66, 2.14, 34, 10.7, 2009 ],
[ "No Reservations", Comedy, "Warner Bros.", 64, 3.307180357, 39, 92.6, 2007 ],
[ "Nick and Norah's Infinite Playlist", Comedy, Sony, 67, 3.3527293, 73, 33.53, 2008 ],
[ "New Year's Eve", Romance, "Warner Bros.", 48, 2.536428571, 8, 142.04, 2011 ],
[ "My Week with Marilyn", Drama, "The Weinstein Company", 84, 0.8258, 83, 8.26, 2011 ],
[ "Music and Lyrics", Romance, "Warner Bros.", 70, 3.64741055, 63, 145.9, 2007 ],
[ "Monte Carlo", Romance, "20th Century Fox", 50, 1.9832, 38, 39.66, 2011 ],
[ "Miss Pettigrew Lives for a Day", Comedy, Independent, 70, 0.2528949, 78, 15.17, 2008 ],
[ "Midnight in Paris", Romence, Sony, 84, 8.744705882, 93, 148.66, 2011 ],
[ "Marley and Me", Comedy, Fox, 77, 3.746781818, 63, 206.07, 2008 ],
[ "Mamma Mia!", Comedy, Universal, 76, 9.234453864, 53, 609.47, 2008 ],
[ "Mamma Mia!", Comedy, Universal, 76, 9.234453864, 53, 609.47, 2008 ],
[ "Made of Honor", Comdy, Sony, 61, 2.64906835, 13, 105.96, 2008 ],
[ "Love Happens", Drama, Universal, 40, 2.004444444, 18, 36.08, 2009 ],
[ "Love & Other Drugs", Comedy, Fox, 55, 1.817666667, 48, 54.53, 2010 ],
[ "Life as We Know It", Comedy, Independent, 62, 2.530526316, 28, 96.16, 2010 ],
[ "License to Wed", Comedy, "Warner Bros.", 55, 1.9802064, 8, 69.31, 2007 ],
[ "Letters to Juliet", Comedy, Summit, 62, 2.639333333, 40, 79.18, 2010 ],
[ "Leap Year", Comedy, Universal, 49, 1.715263158, 21, 32.59, 2010 ],
[ "Knocked Up", Comedy, Universal, 83, 6.636401848, 91, 219, 2007 ],
[ Killers, Action, Lionsgate, 45, 1.245333333, 11, 93.4, 2010 ],
[ "Just Wright", Comedy, Fox, 58, 1.797416667, 45, 21.57, 2010 ],
[ "Jane Eyre", Romance, Universal, 77, 0, 85, 30.15, 2011 ],
[ "It's Complicated", Comedy, Universal, 63, 2.642352941, 56, 224.6, 2009 ],
[ "I Love You Phillip Morris", Comedy, Independent, 57, 1.34, 71, 20.1, 2010 ],
[ "High School Musical 3: Senior Year", Comedy, Disney, 76, 22.91313646, 65, 252.04, 2008 ],
[ "He's Just Not That Into You", Comedy, "Warner Bros.", 60, 7.1536, 42, 178.84, 2009 ],
[ "Good Luck Chuck", Comedy, Lionsgate, 61, 2.36768512, 3, 59.19, 2007 ],
[ "Going the Distance", Comedy, "Warner Bros.", 56, 1.3140625, 53, 42.05, 2010 ],
[ "Gnomeo and Juliet", Animation, Disney, 52, 5.387972222, 56, 193.97, 2011 ],
[ "Gnomeo and Juliet", Animation, Disney, 52, 5.387972222, 56, 193.97, 2011 ],
[ "Ghosts of Girlfriends Past", Comedy, "Warner Bros.", 47, 2.0444, 27, 102.22, 2009 ],
[ "Four Christmases", Comedy, "Warner Bros.", 52, 2.022925, 26, 161.83, 2008 ],
[ Fireproof, Drama, Independent, 51, 66.934, 40, 33.47, 2008 ],
[ Enchanted, Comedy, Disney, 80, 4.005737082, 93, 340.49, 2007 ],
[ "Dear John", Drama, Sony, 66, 4.5988, 29, 114.97, 2010 ],
[ Beginners, Comedy, Independent, 80, 4.471875, 84, 14.31, 2011 ],
[ "Across the Universe", romance, Independent, 84, 0.652603178, 54, 29.37, 2007 ],
[ "A Serious Man", Drama, Universal, 64, 4.382857143, 89, 30.68, 2009 ],
[ "A Dangerous Method", Drama, Independent, 89, 0.44864475, 79, 8.97, 2011 ],
[ "27 Dresses", Comedy, Fox, 71, 5.3436218, 40, 160.31, 2008 ],
[ "(500) Days of Summer", comedy, Fox, 81, 8.096, 87, 60.72, 2009 ]
]
#[test]
def count_movies_by_Lead_Studio [] {
    # No columns are requested, so the only aggregate expected per group is `count`.
    let actual = $movies | group-by Lead_Studio --to-table | aggregate

    # The same table could be derived from the grouped input with:
    #   insert count {get items | length} | select Lead_Studio count
    let expected = [
        [ Lead_Studio, count ];
        [ "The Weinstein Company", 2 ],
        [ Independent, 19 ],
        [ Disney, 8 ],
        [ Fox, 6 ],
        [ "20th Century Fox", 2 ],
        [ "Warner Bros.", 14 ],
        [ Summit, 5 ],
        [ Paramount, 4 ],
        [ CBS, 1 ],
        [ "New Line", 1 ],
        [ Sony, 4 ],
        [ Universal, 8 ],
        [ Lionsgate, 2 ]
    ]

    assert equal $actual $expected
}
#[test]
def average_gross_by_Genre [] {
    # A single custom operation (`avg`) on one column, grouped by Genre.
    # The averages are rounded to 2 digits of precision to keep floating point
    # results consistent between platforms.
    #
    # The unrounded values could be derived from the grouped input with:
    #   insert Worldwide_Gross_avg {get items.Worldwide_Gross | math avg} | select Genre Worldwide_Gross_avg
    let actual = $movies
        | group-by Genre --to-table
        | aggregate --ops {avg: {math avg}} Worldwide_Gross
        | select Genre Worldwide_Gross_avg
        | update Worldwide_Gross_avg {math round --precision 2}

    let expected = [
        [ Genre, Worldwide_Gross_avg ];
        [ Comedy, 148.33 ],
        [ Drama, 99.01 ],
        [ Animation, 316.06 ],
        [ Romance, 148.60 ],
        [ Fantasy, 285.43 ],
        [ Romence, 148.66 ],
        [ Comdy, 105.96 ],
        [ Action, 93.40 ],
        [ romance, 29.37 ],
        [ comedy, 60.72 ]
    ]

    assert equal $actual $expected
}
#[test]
def aggregate_default_ops [] {
    # Without `--ops`, the default min/avg/max/sum operations are expected.
    # Results are rounded to 2 digits of precision to keep floating point
    # operations consistent between platforms.
    let stat_columns = [Worldwide_Gross_min, Worldwide_Gross_avg, Worldwide_Gross_max, Worldwide_Gross_sum]
    let actual = $movies
        | group-by Genre --to-table
        | aggregate Worldwide_Gross
        | update cells -c $stat_columns { math round --precision 2 }

    let expected = [
        [Genre    , count, Worldwide_Gross_min, Worldwide_Gross_avg, Worldwide_Gross_max, Worldwide_Gross_sum];
        [Comedy   , 41, 14.31, 148.33, 609.47, 6081.73],
        [Drama    , 13, 8.26, 99.01, 709.82, 1287.15],
        [Animation, 4, 193.97, 316.06, 521.28, 1264.23],
        [Romance  , 12, 0.03, 148.60, 702.17, 1783.16],
        [Fantasy  , 1, 285.43, 285.43, 285.43, 285.43],
        [Romence  , 1, 148.66, 148.66, 148.66, 148.66],
        [Comdy    , 1, 105.96, 105.96, 105.96, 105.96],
        [Action   , 1, 93.40, 93.40, 93.40, 93.40],
        [romance  , 1, 29.37, 29.37, 29.37, 29.37],
        [comedy   , 1, 60.72, 60.72, 60.72, 60.72],
    ]

    assert equal $actual $expected
}
#[test]
def throw_error_on_non-table_input [] {
    # `group-by` is called without `--to-table` here, so `aggregate` is
    # expected to fail; the error message is captured for comparison.
    let message = try {
        $movies | group-by Genre | aggregate Worldwide_Gross
    } catch {|err|
        $err.msg
    }

    assert equal $message "input must be a table"
}
#[test]
def throw_error_on_non-existing_column [] {
    let by_genre = $movies | group-by Genre --to-table

    # Capture the error as structured data so the nested message can be inspected.
    let caught = try {
        $by_genre | aggregate --ops {avg: {math avg}} NotInTheDataSet
    } catch {|err|
        $err.json | from json
    }

    assert equal $caught.inner.0.msg "Cannot find column '$.items.NotInTheDataSet'"
}
#[test]
def aggregate_stats_without_grouping [] {
    # The input is passed to `aggregate` without `group-by`, so a single row of
    # statistics is expected.
    let stat_columns = [Year_min Year_avg Year_max Year_sum]
    let actual = $movies
        | aggregate Year
        | update cells -c $stat_columns {math round -p 2}

    let expected = [
        [ count, Year_min, Year_avg, Year_max, Year_sum ];
        [ 76, 2007, 2009.09, 2011, 152691 ]
    ]

    assert equal $actual $expected
}