Grouping and Summarizing Data

Author

IND215

Published

September 22, 2025

The Power of Group-Wise Operations

One of the most powerful features of dplyr is the ability to perform operations on groups of data. The combination of group_by() and summarise() allows you to calculate statistics for each group in your data, implementing the split-apply-combine strategy that’s fundamental to data analysis.

library(tidyverse)

# Load the sample dataset from Module 5 index
set.seed(123)
sales_raw <- tibble(
  order_id = paste0("ORD-", str_pad(1:1000, 4, pad = "0")),
  customer_id = sample(paste0("CUST-", 1:200), 1000, replace = TRUE),
  product_id = sample(paste0("PROD-", 1:50), 1000, replace = TRUE),
  order_date = sample(seq(ymd("2023-01-01"), ymd("2024-12-31"), by = "day"), 1000, replace = TRUE),
  quantity = sample(1:10, 1000, replace = TRUE),
  unit_price = round(runif(1000, 5, 200), 2),
  discount_pct = sample(c(0, 0.05, 0.10, 0.15, 0.20), 1000, replace = TRUE, prob = c(0.6, 0.2, 0.1, 0.05, 0.05)),
  sales_rep = sample(c("Alice Johnson", "Bob Smith", "Charlie Brown", "Diana Prince", "Eve Wilson"), 1000, replace = TRUE),
  region = sample(c("North", "South", "East", "West"), 1000, replace = TRUE),
  customer_type = sample(c("Individual", "Small Business", "Enterprise"), 1000, replace = TRUE, prob = c(0.5, 0.3, 0.2))
) %>%
  mutate(
    sales_rep = ifelse(row_number() %in% sample(1:1000, 50), NA, sales_rep),
    discount_pct = ifelse(row_number() %in% sample(1:1000, 30), NA, discount_pct),
    gross_amount = quantity * unit_price,
    discount_amount = gross_amount * coalesce(discount_pct, 0),
    net_amount = gross_amount - discount_amount,
    unit_price = ifelse(row_number() %in% sample(1:1000, 5), unit_price * 100, unit_price),
    quantity = ifelse(row_number() %in% sample(1:1000, 3), 0, quantity)
  )

cat("Working with", nrow(sales_raw), "sales transactions\n")

Working with 1000 sales transactions

Understanding group_by()

The group_by() function doesn’t change the data itself - it adds metadata that tells dplyr to perform subsequent operations by group.

# Group by a single variable
sales_by_region <- sales_raw %>%
  group_by(region)

# The data looks the same, but it's now grouped
cat("Grouped data has", n_groups(sales_by_region), "groups\n")

Grouped data has 4 groups

cat("Group sizes:\n")

Group sizes:

count(sales_by_region, region)

# A tibble: 4 × 2
# Groups:   region [4]
  region     n
  <chr>  <int>
1 East     251
2 North    246
3 South    234
4 West     269

# Group by multiple variables
sales_by_region_type <- sales_raw %>%
  group_by(region, customer_type)

cat("\nGrouping by region and customer type creates", n_groups(sales_by_region_type), "groups\n")


Grouping by region and customer type creates 12 groups

Basic Summarization

summarise() (or summarize()) reduces grouped data to summary statistics:

# Calculate summary statistics by region
regional_summary <- sales_raw %>%
  group_by(region) %>%
  summarise(
    total_sales = sum(net_amount),
    avg_sale = mean(net_amount),
    median_sale = median(net_amount),
    total_orders = n(),
    unique_customers = n_distinct(customer_id),
    .groups = "drop"  # Good practice to specify group handling
  )

print(regional_summary)

# A tibble: 4 × 6
  region total_sales avg_sale median_sale total_orders unique_customers
  <chr>        <dbl>    <dbl>       <dbl>        <int>            <int>
1 East       134961.     538.        402.          251              140
2 North      121848.     495.        354.          246              138
3 South      119161.     509.        387.          234              133
4 West       142025.     528.        374.          269              152

# Multiple grouping variables
detailed_summary <- sales_raw %>%
  filter(quantity > 0) %>%  # Exclude invalid orders
  group_by(region, customer_type) %>%
  summarise(
    revenue = sum(net_amount),
    orders = n(),
    avg_order_value = mean(net_amount),
    avg_discount = mean(discount_pct, na.rm = TRUE) * 100,
    .groups = "drop"
  ) %>%
  arrange(desc(revenue))

cat("\nTop 5 segments by revenue:\n")


Top 5 segments by revenue:

print(head(detailed_summary, 5))

# A tibble: 5 × 6
  region customer_type  revenue orders avg_order_value avg_discount
  <chr>  <chr>            <dbl>  <int>           <dbl>        <dbl>
1 West   Individual      78443.    133            590.         3.98
2 East   Individual      72121.    130            555.         3.29
3 South  Individual      60550.    121            500.         3.87
4 North  Individual      52854.    112            472.         2.84
5 North  Small Business  44267.     81            547.         4.03

Common Summary Functions

dplyr provides many useful functions for summarization:

# Comprehensive summary statistics
sales_raw %>%
  filter(!is.na(sales_rep), quantity > 0) %>%
  group_by(sales_rep) %>%
  summarise(
    # Count functions
    total_orders = n(),
    distinct_customers = n_distinct(customer_id),
    distinct_products = n_distinct(product_id),
    
    # Central tendency
    mean_sale = mean(net_amount),
    median_sale = median(net_amount),
    
    # Spread
    sd_sale = sd(net_amount),
    min_sale = min(net_amount),
    max_sale = max(net_amount),
    range = max_sale - min_sale,
    
    # Position
    q25 = quantile(net_amount, 0.25),
    q75 = quantile(net_amount, 0.75),
    iqr = IQR(net_amount),
    
    # Totals
    total_revenue = sum(net_amount),
    total_units = sum(quantity),
    
    .groups = "drop"
  ) %>%
  arrange(desc(total_revenue))

# A tibble: 5 × 15
  sales_rep     total_orders distinct_customers distinct_products mean_sale
  <chr>                <int>              <int>             <int>     <dbl>
1 Alice Johnson          198                130                49      535.
2 Diana Prince           200                127                49      516.
3 Bob Smith              178                123                50      538.
4 Eve Wilson             184                116                47      511.
5 Charlie Brown          187                128                49      489.
# ℹ 10 more variables: median_sale <dbl>, sd_sale <dbl>, min_sale <dbl>,
#   max_sale <dbl>, range <dbl>, q25 <dbl>, q75 <dbl>, iqr <dbl>,
#   total_revenue <dbl>, total_units <dbl>

Grouped Mutations

You can also use mutate() with grouped data to add group-wise calculations:

# Add group-level statistics to each row
sales_with_stats <- sales_raw %>%
  filter(quantity > 0) %>%
  group_by(region) %>%
  mutate(
    regional_avg = mean(net_amount),
    pct_of_regional_avg = net_amount / regional_avg * 100,
    regional_rank = row_number(desc(net_amount)),
    regional_percentile = percent_rank(net_amount)
  ) %>%
  ungroup()

# Show top performers in each region
sales_with_stats %>%
  filter(regional_rank <= 3) %>%
  select(order_id, region, net_amount, regional_avg, pct_of_regional_avg, regional_rank) %>%
  arrange(region, regional_rank)

# A tibble: 12 × 6
   order_id region net_amount regional_avg pct_of_regional_avg regional_rank
   <chr>    <chr>       <dbl>        <dbl>               <dbl>         <int>
 1 ORD-0646 East        1916.         538.                356.             1
 2 ORD-0694 East        1909.         538.                355.             2
 3 ORD-0804 East        1815.         538.                337.             3
 4 ORD-0757 North       1975          495.                399.             1
 5 ORD-0143 North       1885.         495.                380.             2
 6 ORD-0172 North       1724.         495.                348.             3
 7 ORD-0957 South       1941.         509.                382.             1
 8 ORD-0652 South       1733.         509.                341.             2
 9 ORD-0482 South       1688.         509.                332.             3
10 ORD-0358 West        1903          529.                360.             1
11 ORD-0599 West        1867.         529.                353.             2
12 ORD-0268 West        1767.         529.                334.             3

Window Functions

Window functions are particularly useful with grouped data:

# Various window functions
monthly_sales <- sales_raw %>%
  filter(quantity > 0) %>%
  mutate(
    year_month = floor_date(order_date, "month")
  ) %>%
  group_by(year_month, region) %>%
  summarise(
    total_sales = sum(net_amount),
    .groups = "drop"
  ) %>%
  group_by(region) %>%
  arrange(year_month) %>%
  mutate(
    # Cumulative functions
    cumulative_sales = cumsum(total_sales),
    
    # Lag and lead
    prev_month_sales = lag(total_sales, 1),
    next_month_sales = lead(total_sales, 1),
    
    # Month-over-month growth
    mom_growth = (total_sales - prev_month_sales) / prev_month_sales * 100,
    
    # Rolling calculations (3-month average)
    rolling_avg_3m = (lag(total_sales, 2) + lag(total_sales, 1) + total_sales) / 3
  ) %>%
  ungroup()

# Show recent trends
monthly_sales %>%
  filter(year_month >= "2024-01-01") %>%
  select(year_month, region, total_sales, mom_growth, rolling_avg_3m) %>%
  arrange(region, year_month)

# A tibble: 48 × 5
   year_month region total_sales mom_growth rolling_avg_3m
   <date>     <chr>        <dbl>      <dbl>          <dbl>
 1 2024-01-01 East         2746.      -29.3          3425.
 2 2024-02-01 East         4833.       76.0          3821.
 3 2024-03-01 East         2834.      -41.4          3471.
 4 2024-04-01 East         8790.      210.           5486.
 5 2024-05-01 East         6045.      -31.2          5890.
 6 2024-06-01 East         7244.       19.8          7360.
 7 2024-07-01 East         3070.      -57.6          5453.
 8 2024-08-01 East         6230.      103.           5514.
 9 2024-09-01 East         3299.      -47.0          4200.
10 2024-10-01 East         2170.      -34.2          3899.
# ℹ 38 more rows

Multiple Summary Statistics at Once

You can calculate multiple statistics for multiple variables efficiently:

# Summary statistics for multiple numeric variables
sales_raw %>%
  filter(quantity > 0, unit_price <= 500) %>%
  group_by(customer_type) %>%
  summarise(
    across(
      c(quantity, gross_amount, discount_pct, net_amount),
      list(
        mean = ~mean(.x, na.rm = TRUE),
        median = ~median(.x, na.rm = TRUE),
        sd = ~sd(.x, na.rm = TRUE)
      ),
      .names = "{.col}_{.fn}"
    ),
    n_orders = n(),
    .groups = "drop"
  ) %>%
  mutate(
    across(where(is.numeric), ~round(.x, 2))
  )

# A tibble: 3 × 14
  customer_type  quantity_mean quantity_median quantity_sd gross_amount_mean
  <chr>                  <dbl>           <dbl>       <dbl>             <dbl>
1 Enterprise              5.1                5        2.95              498.
2 Individual              5.36               5        2.91              549.
3 Small Business          5.33               5        3.04              542.
# ℹ 9 more variables: gross_amount_median <dbl>, gross_amount_sd <dbl>,
#   discount_pct_mean <dbl>, discount_pct_median <dbl>, discount_pct_sd <dbl>,
#   net_amount_mean <dbl>, net_amount_median <dbl>, net_amount_sd <dbl>,
#   n_orders <dbl>

Handling Missing Values in Groups

Proper handling of missing values is crucial in grouped operations:

# Compare different approaches to missing values
missing_comparison <- sales_raw %>%
  group_by(region) %>%
  summarise(
    # Different ways to handle NAs
    total_orders = n(),
    orders_with_rep = sum(!is.na(sales_rep)),
    orders_missing_rep = sum(is.na(sales_rep)),
    
    # Average discount - different approaches
    avg_discount_na_rm = mean(discount_pct, na.rm = TRUE) * 100,
    avg_discount_zeros = mean(coalesce(discount_pct, 0)) * 100,
    
    # Count of discounted orders
    discounted_orders = sum(discount_pct > 0, na.rm = TRUE),
    
    .groups = "drop"
  )

print(missing_comparison)

# A tibble: 4 × 7
  region total_orders orders_with_rep orders_missing_rep avg_discount_na_rm
  <chr>         <int>           <int>              <int>              <dbl>
1 East            251             237                 14               3.08
2 North           246             236                 10               3.45
3 South           234             221                 13               4.13
4 West            269             256                 13               4.10
# ℹ 2 more variables: avg_discount_zeros <dbl>, discounted_orders <int>

Advanced Grouping Techniques

Dynamic Grouping

# Function to analyze sales by any grouping variable
analyze_by_group <- function(data, group_var) {
  data %>%
    group_by({{ group_var }}) %>%
    summarise(
      revenue = sum(net_amount),
      orders = n(),
      avg_order = mean(net_amount),
      customers = n_distinct(customer_id),
      .groups = "drop"
    ) %>%
    mutate(
      revenue_pct = revenue / sum(revenue) * 100
    ) %>%
    arrange(desc(revenue))
}

# Apply to different grouping variables
cat("Analysis by sales rep:\n")

Analysis by sales rep:

sales_raw %>%
  filter(!is.na(sales_rep)) %>%
  analyze_by_group(sales_rep)

# A tibble: 5 × 6
  sales_rep     revenue orders avg_order customers revenue_pct
  <chr>           <dbl>  <int>     <dbl>     <int>       <dbl>
1 Alice Johnson 105879.    198      535.       130        21.5
2 Diana Prince  103229.    200      516.       127        21.0
3 Bob Smith      95691.    178      538.       123        19.5
4 Eve Wilson     94591.    185      511.       116        19.2
5 Charlie Brown  92240.    189      488.       129        18.8

cat("\nAnalysis by customer type:\n")


Analysis by customer type:

sales_raw %>%
  analyze_by_group(customer_type)

# A tibble: 3 × 6
  customer_type  revenue orders avg_order customers revenue_pct
  <chr>            <dbl>  <int>     <dbl>     <int>       <dbl>
1 Individual     265047.    498      532.       190        51.2
2 Small Business 159730.    308      519.       160        30.8
3 Enterprise      93219.    194      481.       123        18.0

Conditional Grouping

# Group by calculated categories
sales_categorized <- sales_raw %>%
  filter(quantity > 0) %>%
  mutate(
    order_size = case_when(
      net_amount < 100 ~ "Small",
      net_amount < 500 ~ "Medium",
      net_amount < 1000 ~ "Large",
      TRUE ~ "Enterprise"
    ),
    discount_category = case_when(
      is.na(discount_pct) | discount_pct == 0 ~ "No discount",
      discount_pct <= 0.05 ~ "Low discount",
      discount_pct <= 0.10 ~ "Medium discount",
      TRUE ~ "High discount"
    )
  ) %>%
  group_by(order_size, discount_category) %>%
  summarise(
    orders = n(),
    revenue = sum(net_amount),
    avg_margin = mean((net_amount - gross_amount * 0.6) / net_amount) * 100,  # Assuming 60% cost
    .groups = "drop"
  ) %>%
  arrange(order_size, desc(revenue))

print(sales_categorized)

# A tibble: 16 × 5
   order_size discount_category orders revenue avg_margin
   <chr>      <chr>              <int>   <dbl>      <dbl>
 1 Enterprise No discount           97 129674.       40  
 2 Enterprise Low discount          27  37766.       36.8
 3 Enterprise Medium discount       13  16490.       33.3
 4 Enterprise High discount          9  11244.       27.0
 5 Large      No discount          161 115914.       40  
 6 Large      Low discount          46  32329.       36.8
 7 Large      Medium discount       26  19280.       33.3
 8 Large      High discount         26  19067.       27.2
 9 Medium     No discount          288  77926.       40  
10 Medium     Low discount          95  26367.       36.8
11 Medium     Medium discount       53  13551.       33.3
12 Medium     High discount         38  10259.       27.1
13 Small      No discount           66   3723.       40  
14 Small      Low discount          25   1460.       36.8
15 Small      High discount         18    956.       27.9
16 Small      Medium discount        9    660.       33.3

Performance Tips

When working with large grouped datasets:

# Efficient grouping - calculate once, use multiple times
efficient_summary <- sales_raw %>%
  filter(quantity > 0) %>%
  group_by(region, customer_type) %>%
  summarise(
    n = n(),
    revenue = sum(net_amount),
    .groups = "drop"
  )

# Add multiple calculated fields after summarizing
efficient_summary <- efficient_summary %>%
  mutate(
    avg_order = revenue / n,
    revenue_share = revenue / sum(revenue) * 100,
    cumulative_share = cumsum(revenue_share)
  ) %>%
  arrange(desc(revenue))

cat("Most efficient to summarize first, then calculate derived metrics\n")

Most efficient to summarize first, then calculate derived metrics

print(head(efficient_summary, 10))

# A tibble: 10 × 7
   region customer_type      n revenue avg_order revenue_share cumulative_share
   <chr>  <chr>          <int>   <dbl>     <dbl>         <dbl>            <dbl>
 1 West   Individual       133  78443.      590.         15.2             92.8 
 2 East   Individual       130  72121.      555.         14.0             18.8 
 3 South  Individual       121  60550.      500.         11.7             64.8 
 4 North  Individual       112  52854.      472.         10.2             41.0 
 5 North  Small Business    81  44267.      547.          8.57            49.6 
 6 South  Small Business    74  40350.      545.          7.81            72.6 
 7 East   Small Business    73  37881.      519.          7.33            26.1 
 8 West   Small Business    79  36981.      468.          7.16           100   
 9 West   Enterprise        56  26349.      471.          5.10            77.7 
10 East   Enterprise        48  24959.      520.          4.83             4.83

Common Patterns and Use Cases

Top N per Group

# Find top 3 customers in each region
top_customers <- sales_raw %>%
  filter(quantity > 0) %>%
  group_by(region, customer_id) %>%
  summarise(
    total_spent = sum(net_amount),
    order_count = n(),
    .groups = "drop"
  ) %>%
  group_by(region) %>%
  slice_max(total_spent, n = 3) %>%
  ungroup()

cat("Top 3 customers by region:\n")

Top 3 customers by region:

print(top_customers)

# A tibble: 12 × 4
   region customer_id total_spent order_count
   <chr>  <chr>             <dbl>       <int>
 1 East   CUST-135          3715.           4
 2 East   CUST-128          3493.           3
 3 East   CUST-100          2769.           3
 4 North  CUST-81           2952.           2
 5 North  CUST-87           2834.           4
 6 North  CUST-83           2754.           3
 7 South  CUST-161          3119.           2
 8 South  CUST-75           2704.           3
 9 South  CUST-99           2586.           4
10 West   CUST-108          3905.           6
11 West   CUST-94           3353.           5
12 West   CUST-76           3006.           5

Year-over-Year Comparisons

# Year-over-year analysis
yoy_analysis <- sales_raw %>%
  filter(quantity > 0) %>%
  mutate(
    year = year(order_date),
    month = month(order_date)
  ) %>%
  group_by(year, month, region) %>%
  summarise(
    revenue = sum(net_amount),
    .groups = "drop"
  ) %>%
  pivot_wider(
    names_from = year,
    values_from = revenue,
    names_prefix = "year_"
  ) %>%
  mutate(
    yoy_change = (year_2024 - year_2023) / year_2023 * 100
  ) %>%
  filter(!is.na(yoy_change))

cat("Year-over-year revenue change by region and month:\n")

Year-over-year revenue change by region and month:

print(yoy_analysis)

# A tibble: 48 × 5
   month region year_2023 year_2024 yoy_change
   <dbl> <chr>      <dbl>     <dbl>      <dbl>
 1     1 East       4459.     2746.     -38.4 
 2     1 North      7519.     5179.     -31.1 
 3     1 South      8642.     4138.     -52.1 
 4     1 West       6852.     3932.     -42.6 
 5     2 East       5761.     4833.     -16.1 
 6     2 North      2469.     2530.       2.45
 7     2 South      3934.     4271.       8.57
 8     2 West       6360.     4214.     -33.7 
 9     3 East       8300.     2834.     -65.8 
10     3 North      5916.     2370.     -59.9 
# ℹ 38 more rows

Practice Exercises

Customer Segmentation: Group customers by their total purchase value and analyze their behavior
Time-Based Analysis: Create weekly summaries showing trends and patterns
Product Performance: Identify best and worst performing products by region
Sales Team Metrics: Create a comprehensive dashboard for sales rep performance

Summary

Grouping and summarizing are essential skills for data analysis. Key takeaways:

group_by() creates groups for subsequent operations
summarise() reduces groups to single rows with summary statistics
Use .groups = "drop" to avoid unexpected behavior
Grouped mutate() adds group-level calculations to each row
Window functions provide powerful tools for within-group calculations
Always consider how to handle missing values in grouped operations
Think about performance when working with large datasets

Next, we’ll explore how to combine multiple datasets with Joining Data.