library(tidyverse)
# Set seed for reproducibility
set.seed(123)
Data Tidying Principles with tidyr
Introduction to tidyr
tidyr helps you create tidy data, where:
- Every column is a variable
- Every row is an observation
- Every cell is a single value
Most real-world data violates these principles. tidyr provides tools to reshape data into tidy format, making it easier to work with dplyr and ggplot2.
What is Tidy Data?
The Tidy Data Principles
Let’s see the difference between messy and tidy data:
# Messy data: Multiple variables in column names
messy_data <- tibble(
country = c("USA", "UK", "Japan"),
`2020_population` = c(331, 67, 126),
`2020_gdp` = c(21433, 2708, 5065),
`2021_population` = c(332, 68, 125),
`2021_gdp` = c(22996, 3131, 4941)
)
print(messy_data)
# A tibble: 3 × 5
country `2020_population` `2020_gdp` `2021_population` `2021_gdp`
<chr> <dbl> <dbl> <dbl> <dbl>
1 USA 331 21433 332 22996
2 UK 67 2708 68 3131
3 Japan 126 5065 125 4941
# Tidy version: Each variable in its own column
tidy_data <- tibble(
country = rep(c("USA", "UK", "Japan"), each = 2),
year = rep(c(2020, 2021), 3),
population = c(331, 332, 67, 68, 126, 125),
gdp = c(21433, 22996, 2708, 3131, 5065, 4941)
)
print(tidy_data)
# A tibble: 6 × 4
country year population gdp
<chr> <dbl> <dbl> <dbl>
1 USA 2020 331 21433
2 USA 2021 332 22996
3 UK 2020 67 2708
4 UK 2021 68 3131
5 Japan 2020 126 5065
6 Japan 2021 125 4941
Pivoting: Reshaping Data
pivot_longer(): From Wide to Long
pivot_longer() makes datasets longer by increasing the number of rows and decreasing the number of columns:
# Wide format data
wide_sales <- tibble(
product = c("Widget A", "Widget B", "Widget C"),
jan = c(100, 150, 120),
feb = c(110, 140, 130),
mar = c(120, 160, 140),
apr = c(130, 155, 135)
)
print(wide_sales)
# A tibble: 3 × 5
product jan feb mar apr
<chr> <dbl> <dbl> <dbl> <dbl>
1 Widget A 100 110 120 130
2 Widget B 150 140 160 155
3 Widget C 120 130 140 135
# Convert to long format
long_sales <- wide_sales %>%
pivot_longer(
cols = jan:apr, # Columns to pivot
names_to = "month", # Name for new column from column names
values_to = "sales" # Name for new column from values
)
print(long_sales)
# A tibble: 12 × 3
product month sales
<chr> <chr> <dbl>
1 Widget A jan 100
2 Widget A feb 110
3 Widget A mar 120
4 Widget A apr 130
5 Widget B jan 150
6 Widget B feb 140
7 Widget B mar 160
8 Widget B apr 155
9 Widget C jan 120
10 Widget C feb 130
11 Widget C mar 140
12 Widget C apr 135
Advanced pivot_longer()
# Data with multiple variables in column names
complex_wide <- tibble(
id = 1:3,
name = c("Alice", "Bob", "Charlie"),
math_2022 = c(85, 78, 92),
math_2023 = c(88, 82, 90),
science_2022 = c(90, 85, 88),
science_2023 = c(92, 87, 91)
)
print(complex_wide)
# A tibble: 3 × 6
id name math_2022 math_2023 science_2022 science_2023
<int> <chr> <dbl> <dbl> <dbl> <dbl>
1 1 Alice 85 88 90 92
2 2 Bob 78 82 85 87
3 3 Charlie 92 90 88 91
# Pivot with name separation
complex_long <- complex_wide %>%
pivot_longer(
cols = -c(id, name), # Everything except id and name
names_to = c("subject", "year"), # Split column names into two variables
names_sep = "_", # Separator in column names
values_to = "score"
) %>%
mutate(year = as.integer(year)) # Convert year to integer
print(complex_long)
# A tibble: 12 × 5
id name subject year score
<int> <chr> <chr> <int> <dbl>
1 1 Alice math 2022 85
2 1 Alice math 2023 88
3 1 Alice science 2022 90
4 1 Alice science 2023 92
5 2 Bob math 2022 78
6 2 Bob math 2023 82
7 2 Bob science 2022 85
8 2 Bob science 2023 87
9 3 Charlie math 2022 92
10 3 Charlie math 2023 90
11 3 Charlie science 2022 88
12 3 Charlie science 2023 91
pivot_wider(): From Long to Wide
pivot_wider() makes datasets wider by increasing the number of columns and decreasing the number of rows:
# Start with long format
long_data <- tibble(
date = rep(c("2024-01-01", "2024-01-02"), each = 3),
metric = rep(c("views", "clicks", "conversions"), 2),
value = c(1000, 50, 5, 1200, 65, 8)
)
print(long_data)
# A tibble: 6 × 3
date metric value
<chr> <chr> <dbl>
1 2024-01-01 views 1000
2 2024-01-01 clicks 50
3 2024-01-01 conversions 5
4 2024-01-02 views 1200
5 2024-01-02 clicks 65
6 2024-01-02 conversions 8
# Convert to wide format
wide_data <- long_data %>%
pivot_wider(
names_from = metric, # Column to get new column names from
values_from = value # Column to get values from
)
print(wide_data)
# A tibble: 2 × 4
date views clicks conversions
<chr> <dbl> <dbl> <dbl>
1 2024-01-01 1000 50 5
2 2024-01-02 1200 65 8
Advanced pivot_wider()
# Data with multiple value columns
multi_long <- tibble(
year = rep(c(2022, 2023), each = 6),
quarter = rep(c("Q1", "Q2"), 6),
region = rep(c("North", "South", "East"), 4),
sales = round(runif(12, 100, 500)),
costs = round(runif(12, 50, 200))
)
print(multi_long)
# A tibble: 12 × 5
year quarter region sales costs
<dbl> <chr> <chr> <dbl> <dbl>
1 2022 Q1 North 215 152
2 2022 Q2 South 415 136
3 2022 Q1 East 264 65
4 2022 Q2 North 453 185
5 2022 Q1 South 476 87
6 2022 Q2 East 118 56
7 2023 Q1 North 311 99
8 2023 Q2 South 457 193
9 2023 Q1 East 321 183
10 2023 Q2 North 283 154
11 2023 Q1 South 483 146
12 2023 Q2 East 281 199
# Pivot with multiple value columns
multi_wide <- multi_long %>%
pivot_wider(
names_from = c(quarter, region), # Multiple columns for names
names_sep = "_", # Separator for new column names
values_from = c(sales, costs) # Multiple value columns
)
print(multi_wide)
# A tibble: 2 × 13
year sales_Q1_North sales_Q2_South sales_Q1_East sales_Q2_North
<dbl> <dbl> <dbl> <dbl> <dbl>
1 2022 215 415 264 453
2 2023 311 457 321 283
# ℹ 8 more variables: sales_Q1_South <dbl>, sales_Q2_East <dbl>,
# costs_Q1_North <dbl>, costs_Q2_South <dbl>, costs_Q1_East <dbl>,
# costs_Q2_North <dbl>, costs_Q1_South <dbl>, costs_Q2_East <dbl>
Separating and Uniting Columns
separate(): Split One Column into Multiple
# Data with combined information
combined_data <- tibble(
employee = c("John_Smith", "Jane_Doe", "Bob_Johnson"),
dept_role = c("Sales_Manager", "IT_Developer", "HR_Director"),
date_time = c("2024-01-15_09:30", "2024-01-15_10:00", "2024-01-15_14:30")
)
print(combined_data)
# A tibble: 3 × 3
employee dept_role date_time
<chr> <chr> <chr>
1 John_Smith Sales_Manager 2024-01-15_09:30
2 Jane_Doe IT_Developer 2024-01-15_10:00
3 Bob_Johnson HR_Director 2024-01-15_14:30
# Separate into multiple columns
separated_data <- combined_data %>%
separate(employee, into = c("first", "last"), sep = "_") %>%
separate(dept_role, into = c("department", "role"), sep = "_") %>%
separate(date_time, into = c("date", "time"), sep = "_")
print(separated_data)
# A tibble: 3 × 6
first last department role date time
<chr> <chr> <chr> <chr> <chr> <chr>
1 John Smith Sales Manager 2024-01-15 09:30
2 Jane Doe IT Developer 2024-01-15 10:00
3 Bob Johnson HR Director 2024-01-15 14:30
separate_rows(): Split and Expand Rows
# Data with multiple values in cells
multi_value_data <- tibble(
product = c("Widget A", "Widget B", "Widget C"),
colors = c("red,blue", "green,yellow,orange", "black,white"),
sizes = c("S,M,L", "M,L", "S,M,L,XL")
)
print(multi_value_data)
# A tibble: 3 × 3
product colors sizes
<chr> <chr> <chr>
1 Widget A red,blue S,M,L
2 Widget B green,yellow,orange M,L
3 Widget C black,white S,M,L,XL
# Separate into multiple rows. Note: chaining two separate_rows() calls
# produces every color-size combination for each product.
expanded_data <- multi_value_data %>%
separate_rows(colors, sep = ",") %>%
separate_rows(sizes, sep = ",")
print(expanded_data)
# A tibble: 20 × 3
product colors sizes
<chr> <chr> <chr>
1 Widget A red S
2 Widget A red M
3 Widget A red L
4 Widget A blue S
5 Widget A blue M
6 Widget A blue L
7 Widget B green M
8 Widget B green L
9 Widget B yellow M
10 Widget B yellow L
11 Widget B orange M
12 Widget B orange L
13 Widget C black S
14 Widget C black M
15 Widget C black L
16 Widget C black XL
17 Widget C white S
18 Widget C white M
19 Widget C white L
20 Widget C white XL
unite(): Combine Multiple Columns
# Data with separate columns
separate_data <- tibble(
year = c(2024, 2024, 2024),
month = c(1, 2, 3),
day = c(15, 20, 25),
city = c("New York", "Los Angeles", "Chicago"),
state = c("NY", "CA", "IL")
)
print(separate_data)
# A tibble: 3 × 5
year month day city state
<dbl> <dbl> <dbl> <chr> <chr>
1 2024 1 15 New York NY
2 2024 2 20 Los Angeles CA
3 2024 3 25 Chicago IL
# Unite columns
united_data <- separate_data %>%
unite(date, year, month, day, sep = "-") %>%
unite(location, city, state, sep = ", ")
print(united_data)
# A tibble: 3 × 2
date location
<chr> <chr>
1 2024-1-15 New York, NY
2 2024-2-20 Los Angeles, CA
3 2024-3-25 Chicago, IL
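Note that unite() always produces a character column, so the date above is plain text. If a true Date is needed, one option (a minimal sketch, using lubridate, which is installed with the tidyverse) is to parse it afterwards:
# Sketch: parse the united string into a Date (illustration only)
united_data %>%
mutate(date = lubridate::ymd(date)) # "2024-1-15" becomes a Date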
Handling Missing Values
complete(): Fill in Missing Combinations
# Data with missing combinations
incomplete_data <- tibble(
year = c(2022, 2022, 2023),
quarter = c("Q1", "Q3", "Q2"),
sales = c(100, 150, 120)
)
print(incomplete_data)
# A tibble: 3 × 3
year quarter sales
<dbl> <chr> <dbl>
1 2022 Q1 100
2 2022 Q3 150
3 2023 Q2 120
# Complete all combinations
complete_data <- incomplete_data %>%
complete(year, quarter, fill = list(sales = 0))
print(complete_data)
# A tibble: 6 × 3
year quarter sales
<dbl> <chr> <dbl>
1 2022 Q1 100
2 2022 Q2 0
3 2022 Q3 150
4 2023 Q1 0
5 2023 Q2 120
6 2023 Q3 0
# Complete with sequences
sales_data <- tibble(
date = as.Date(c("2024-01-01", "2024-01-03", "2024-01-07")),
sales = c(100, 150, 200)
)
complete_sales <- sales_data %>%
complete(
date = seq.Date(min(date), max(date), by = "day"),
fill = list(sales = 0)
)
print(complete_sales)
# A tibble: 7 × 2
date sales
<date> <dbl>
1 2024-01-01 100
2 2024-01-02 0
3 2024-01-03 150
4 2024-01-04 0
5 2024-01-05 0
6 2024-01-06 0
7 2024-01-07 200
drop_na(): Remove Rows with Missing Values
# Data with missing values
data_with_na <- tibble(
id = 1:6,
x = c(1, 2, NA, 4, NA, 6),
y = c(NA, 2, 3, 4, 5, 6),
z = c(1, 2, 3, NA, 5, 6)
)
print(data_with_na)
# A tibble: 6 × 4
id x y z
<int> <dbl> <dbl> <dbl>
1 1 1 NA 1
2 2 2 2 2
3 3 NA 3 3
4 4 4 4 NA
5 5 NA 5 5
6 6 6 6 6
# Drop rows with any NA
data_with_na %>% drop_na()
# A tibble: 2 × 4
id x y z
<int> <dbl> <dbl> <dbl>
1 2 2 2 2
2 6 6 6 6
# Drop rows with NA in specific columns
data_with_na %>% drop_na(x)
# A tibble: 4 × 4
id x y z
<int> <dbl> <dbl> <dbl>
1 1 1 NA 1
2 2 2 2 2
3 4 4 4 NA
4 6 6 6 6
# Drop rows with NA in multiple specific columns
data_with_na %>% drop_na(x, y)
# A tibble: 3 × 4
id x y z
<int> <dbl> <dbl> <dbl>
1 2 2 2 2
2 4 4 4 NA
3 6 6 6 6
fill(): Fill Missing Values
# Time series with gaps
time_series <- tibble(
date = seq.Date(from = as.Date("2024-01-01"),
to = as.Date("2024-01-10"),
by = "day"),
value = c(10, NA, NA, 15, NA, 20, NA, NA, 25, 30)
)
print(time_series)
# A tibble: 10 × 2
date value
<date> <dbl>
1 2024-01-01 10
2 2024-01-02 NA
3 2024-01-03 NA
4 2024-01-04 15
5 2024-01-05 NA
6 2024-01-06 20
7 2024-01-07 NA
8 2024-01-08 NA
9 2024-01-09 25
10 2024-01-10 30
# Fill down (carry forward)
time_series %>%
fill(value, .direction = "down")
# A tibble: 10 × 2
date value
<date> <dbl>
1 2024-01-01 10
2 2024-01-02 10
3 2024-01-03 10
4 2024-01-04 15
5 2024-01-05 15
6 2024-01-06 20
7 2024-01-07 20
8 2024-01-08 20
9 2024-01-09 25
10 2024-01-10 30
# Fill up (carry backward)
time_series %>%
fill(value, .direction = "up")
# A tibble: 10 × 2
date value
<date> <dbl>
1 2024-01-01 10
2 2024-01-02 15
3 2024-01-03 15
4 2024-01-04 15
5 2024-01-05 20
6 2024-01-06 20
7 2024-01-07 25
8 2024-01-08 25
9 2024-01-09 25
10 2024-01-10 30
# Fill both directions
time_series %>%
fill(value, .direction = "downup")
# A tibble: 10 × 2
date value
<date> <dbl>
1 2024-01-01 10
2 2024-01-02 10
3 2024-01-03 10
4 2024-01-04 15
5 2024-01-05 15
6 2024-01-06 20
7 2024-01-07 20
8 2024-01-08 20
9 2024-01-09 25
10 2024-01-10 30
replace_na(): Replace NA with Specific Values
# Data with NAs
na_data <- tibble(
product = c("A", "B", "C", "D"),
price = c(10, NA, 15, 20),
discount = c(0.1, 0.2, NA, NA),
category = c("Food", NA, "Electronics", NA)
)
print(na_data)
# A tibble: 4 × 4
product price discount category
<chr> <dbl> <dbl> <chr>
1 A 10 0.1 Food
2 B NA 0.2 <NA>
3 C 15 NA Electronics
4 D 20 NA <NA>
# Replace NAs with specific values
na_data %>%
replace_na(list(
price = 0,
discount = 0,
category = "Other"
))
# A tibble: 4 × 4
product price discount category
<chr> <dbl> <dbl> <chr>
1 A 10 0.1 Food
2 B 0 0.2 Other
3 C 15 0 Electronics
4 D 20 0 Other
Nesting and Unnesting
nest(): Create List-Columns
# Grouped data
student_scores <- tibble(
student = rep(c("Alice", "Bob", "Charlie"), each = 4),
subject = rep(c("Math", "Science", "English", "History"), 3),
score = c(85, 90, 88, 92, # Alice
78, 85, 82, 88, # Bob
92, 88, 90, 95) # Charlie
)
print(student_scores)
# A tibble: 12 × 3
student subject score
<chr> <chr> <dbl>
1 Alice Math 85
2 Alice Science 90
3 Alice English 88
4 Alice History 92
5 Bob Math 78
6 Bob Science 85
7 Bob English 82
8 Bob History 88
9 Charlie Math 92
10 Charlie Science 88
11 Charlie English 90
12 Charlie History 95
# Nest by student
nested_scores <- student_scores %>%
nest(data = c(subject, score))
print(nested_scores)
# A tibble: 3 × 2
student data
<chr> <list>
1 Alice <tibble [4 × 2]>
2 Bob <tibble [4 × 2]>
3 Charlie <tibble [4 × 2]>
# Access nested data
nested_scores$data[[1]]
# A tibble: 4 × 2
subject score
<chr> <dbl>
1 Math 85
2 Science 90
3 English 88
4 History 92
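Because each element of the data column is itself a tibble, you can compute per-group results without leaving the data frame. A minimal sketch (not part of the original example) using purrr's map functions, which load with the tidyverse:
# Sketch: summarize each nested tibble (illustration only)
nested_scores %>%
mutate(
mean_score = map_dbl(data, ~ mean(.x$score)), # average score per student
n_subjects = map_int(data, nrow) # subjects recorded per student
)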
unnest(): Expand List-Columns
# Start with nested data
nested_data <- tibble(
group = c("A", "B"),
data = list(
tibble(x = 1:3, y = 4:6),
tibble(x = 7:9, y = 10:12)
)
)
print(nested_data)
# A tibble: 2 × 2
group data
<chr> <list>
1 A <tibble [3 × 2]>
2 B <tibble [3 × 2]>
# Unnest the data
unnested_data <- nested_data %>%
unnest(data)
print(unnested_data)
# A tibble: 6 × 3
group x y
<chr> <int> <int>
1 A 1 4
2 A 2 5
3 A 3 6
4 B 7 10
5 B 8 11
6 B 9 12
Real-World Examples
Example 1: Survey Data Cleaning
# Messy survey data
survey_raw <- tibble(
respondent_id = 1:5,
q1_2023 = c(5, 4, 3, 5, 4),
q2_2023 = c(4, 4, 3, 4, 5),
q1_2024 = c(5, 5, 4, 5, 4),
q2_2024 = c(5, 4, 4, 5, 5),
demographics = c("25-34,Male,Urban",
"35-44,Female,Rural",
"18-24,Male,Urban",
"45-54,Female,Suburban",
"25-34,Male,Rural")
)
# Clean and tidy the data
survey_clean <- survey_raw %>%
# Separate demographics
separate(demographics, into = c("age_group", "gender", "location"),
sep = ",") %>%
# Pivot questions longer
pivot_longer(
cols = starts_with("q"),
names_to = c("question", "year"),
names_sep = "_",
values_to = "rating"
) %>%
# Convert year to integer
mutate(year = as.integer(year))
print(survey_clean)
# A tibble: 20 × 7
respondent_id age_group gender location question year rating
<int> <chr> <chr> <chr> <chr> <int> <dbl>
1 1 25-34 Male Urban q1 2023 5
2 1 25-34 Male Urban q2 2023 4
3 1 25-34 Male Urban q1 2024 5
4 1 25-34 Male Urban q2 2024 5
5 2 35-44 Female Rural q1 2023 4
6 2 35-44 Female Rural q2 2023 4
7 2 35-44 Female Rural q1 2024 5
8 2 35-44 Female Rural q2 2024 4
9 3 18-24 Male Urban q1 2023 3
10 3 18-24 Male Urban q2 2023 3
11 3 18-24 Male Urban q1 2024 4
12 3 18-24 Male Urban q2 2024 4
13 4 45-54 Female Suburban q1 2023 5
14 4 45-54 Female Suburban q2 2023 4
15 4 45-54 Female Suburban q1 2024 5
16 4 45-54 Female Suburban q2 2024 5
17 5 25-34 Male Rural q1 2023 4
18 5 25-34 Male Rural q2 2023 5
19 5 25-34 Male Rural q1 2024 4
20 5 25-34 Male Rural q2 2024 5
# Analyze by demographics
survey_summary <- survey_clean %>%
group_by(year, gender, question) %>%
summarize(
avg_rating = mean(rating),
n = n(),
.groups = "drop"
) %>%
pivot_wider(
names_from = question,
values_from = avg_rating
)
print(survey_summary)
# A tibble: 4 × 5
year gender n q1 q2
<int> <chr> <int> <dbl> <dbl>
1 2023 Female 2 4.5 4
2 2023 Male 3 4 4
3 2024 Female 2 5 4.5
4 2024 Male 3 4.33 4.67
Example 2: Financial Data Transformation
# Quarterly financial data in wide format
financial_wide <- tibble(
company = c("TechCorp", "RetailCo", "FinanceInc"),
revenue_q1_2023 = c(100, 80, 120),
revenue_q2_2023 = c(110, 85, 125),
revenue_q3_2023 = c(120, 90, 130),
revenue_q4_2023 = c(130, 95, 140),
costs_q1_2023 = c(60, 50, 70),
costs_q2_2023 = c(65, 52, 72),
costs_q3_2023 = c(68, 54, 75),
costs_q4_2023 = c(70, 56, 78)
)
# Transform to tidy format
financial_tidy <- financial_wide %>%
pivot_longer(
cols = -company,
names_to = c("metric", "quarter", "year"),
names_sep = "_",
values_to = "value"
) %>%
mutate(
year = as.integer(year),
quarter = str_to_upper(quarter)
) %>%
pivot_wider(
names_from = metric,
values_from = value
) %>%
mutate(
profit = revenue - costs,
margin = profit / revenue * 100
)
print(financial_tidy)
# A tibble: 12 × 7
company quarter year revenue costs profit margin
<chr> <chr> <int> <dbl> <dbl> <dbl> <dbl>
1 TechCorp Q1 2023 100 60 40 40
2 TechCorp Q2 2023 110 65 45 40.9
3 TechCorp Q3 2023 120 68 52 43.3
4 TechCorp Q4 2023 130 70 60 46.2
5 RetailCo Q1 2023 80 50 30 37.5
6 RetailCo Q2 2023 85 52 33 38.8
7 RetailCo Q3 2023 90 54 36 40
8 RetailCo Q4 2023 95 56 39 41.1
9 FinanceInc Q1 2023 120 70 50 41.7
10 FinanceInc Q2 2023 125 72 53 42.4
11 FinanceInc Q3 2023 130 75 55 42.3
12 FinanceInc Q4 2023 140 78 62 44.3
# Quarterly summary
quarterly_summary <- financial_tidy %>%
group_by(quarter) %>%
summarize(
avg_revenue = mean(revenue),
avg_costs = mean(costs),
avg_margin = mean(margin),
.groups = "drop"
) %>%
arrange(quarter)
print(quarterly_summary)
# A tibble: 4 × 4
quarter avg_revenue avg_costs avg_margin
<chr> <dbl> <dbl> <dbl>
1 Q1 100 60 39.7
2 Q2 107. 63 40.7
3 Q3 113. 65.7 41.9
4 Q4 122. 68 43.8
Example 3: Time Series Data Preparation
# Irregular time series data
irregular_ts <- tibble(
date = as.Date(c("2024-01-01", "2024-01-03", "2024-01-04",
"2024-01-08", "2024-01-10")),
sensor_a = c(10, 12, NA, 15, 18),
sensor_b = c(20, NA, 23, 24, 25),
sensor_c = c(30, 32, 33, NA, 36)
)
# Complete and clean the time series
regular_ts <- irregular_ts %>%
# Complete all dates
complete(
date = seq.Date(min(date), max(date), by = "day")
) %>%
# Fill missing values
fill(everything(), .direction = "downup") %>%
# Pivot to long format for analysis
pivot_longer(
cols = starts_with("sensor"),
names_to = "sensor",
values_to = "reading"
) %>%
# Add derived features
mutate(
day_of_week = weekdays(date),
is_weekend = day_of_week %in% c("Saturday", "Sunday")
)
print(regular_ts)
# A tibble: 30 × 5
date sensor reading day_of_week is_weekend
<date> <chr> <dbl> <chr> <lgl>
1 2024-01-01 sensor_a 10 Monday FALSE
2 2024-01-01 sensor_b 20 Monday FALSE
3 2024-01-01 sensor_c 30 Monday FALSE
4 2024-01-02 sensor_a 10 Tuesday FALSE
5 2024-01-02 sensor_b 20 Tuesday FALSE
6 2024-01-02 sensor_c 30 Tuesday FALSE
7 2024-01-03 sensor_a 12 Wednesday FALSE
8 2024-01-03 sensor_b 20 Wednesday FALSE
9 2024-01-03 sensor_c 32 Wednesday FALSE
10 2024-01-04 sensor_a 12 Thursday FALSE
# ℹ 20 more rows
# Daily summary across sensors
daily_summary <- regular_ts %>%
group_by(date, is_weekend) %>%
summarize(
mean_reading = mean(reading, na.rm = TRUE),
sd_reading = sd(reading, na.rm = TRUE),
.groups = "drop"
)
print(daily_summary)
# A tibble: 10 × 4
date is_weekend mean_reading sd_reading
<date> <lgl> <dbl> <dbl>
1 2024-01-01 FALSE 20 10
2 2024-01-02 FALSE 20 10
3 2024-01-03 FALSE 21.3 10.1
4 2024-01-04 FALSE 22.7 10.5
5 2024-01-05 FALSE 22.7 10.5
6 2024-01-06 TRUE 22.7 10.5
7 2024-01-07 TRUE 22.7 10.5
8 2024-01-08 FALSE 24 9
9 2024-01-09 FALSE 24 9
10 2024-01-10 FALSE 26.3 9.07
Best Practices
1. Start with Tidy Data in Mind
# Design your data collection to be tidy from the start
# Small example values so the comparison below actually runs
dates <- as.Date(c("2024-01-01", "2024-01-02"))
products <- c("Widget A", "Widget B")
# Good: Each observation is a row
good_design <- tibble(
date = rep(dates, each = length(products)),
product = rep(products, times = length(dates)),
metric = c(10, 12, 11, 13)
)
# Avoid: Wide format with dates as columns
avoid_design <- tibble(
product = products,
`2024-01-01` = c(10, 12),
`2024-01-02` = c(11, 13)
)
2. Use Meaningful Column Names
# Good: Clear, descriptive names
good_names <- tibble(
measurement_date = Sys.Date(),
temperature_celsius = 25,
humidity_percent = 60
)
# Better: Consistent naming convention
better_names <- tibble(
date_measured = Sys.Date(),
temp_c = 25,
humid_pct = 60
)
3. Document Your Transformations
# Create sample data for documentation example
raw_data <- tibble(
product_code = c("A-2023", "B-2023", "C-2024"),
jan = c(100, NA, 150),
feb = c(110, 120, NA),
mar = c(120, 130, 170)
)
# Document each step
documented_pipeline <- raw_data %>%
# Step 1: Convert wide sales data to long format
pivot_longer(
cols = jan:mar,
names_to = "month",
values_to = "sales"
) %>%
# Step 2: Separate year from product code
separate(
product_code, into = c("category", "year"),
sep = "-"
) %>%
# Step 3: Fill missing sales with 0
replace_na(list(sales = 0))
print(documented_pipeline)
# A tibble: 9 × 4
category year month sales
<chr> <chr> <chr> <dbl>
1 A 2023 jan 100
2 A 2023 feb 110
3 A 2023 mar 120
4 B 2023 jan 0
5 B 2023 feb 120
6 B 2023 mar 130
7 C 2024 jan 150
8 C 2024 feb 0
9 C 2024 mar 170
Common Patterns
Pattern 1: Wide to Long to Summary
# Common analysis pattern
wide_data <- tibble(
group = c("A", "B"),
day1 = c(10, 20),
day2 = c(15, 22),
day3 = c(12, 25)
)
summary <- wide_data %>%
pivot_longer(cols = starts_with("day"),
names_to = "day",
values_to = "value") %>%
group_by(group) %>%
summarize(
mean = mean(value),
sd = sd(value),
.groups = "drop"
)
print(summary)
# A tibble: 2 × 3
group mean sd
<chr> <dbl> <dbl>
1 A 12.3 2.52
2 B 22.3 2.52
Pattern 2: Separate, Clean, Unite
# Clean messy identifiers
messy_ids <- tibble(
raw_id = c("USA_2024_Q1", "CAN_2024_Q2", "MEX_2024_Q1"),
value = c(100, 150, 120)
)
clean_ids <- messy_ids %>%
separate(raw_id, into = c("country", "year", "quarter"), sep = "_") %>%
mutate(
year = as.integer(year),
quarter = factor(quarter, levels = c("Q1", "Q2", "Q3", "Q4"))
) %>%
unite(period, year, quarter, sep = "-", remove = FALSE)
print(clean_ids)
# A tibble: 3 × 5
country period year quarter value
<chr> <chr> <int> <fct> <dbl>
1 USA 2024-Q1 2024 Q1 100
2 CAN 2024-Q2 2024 Q2 150
3 MEX 2024-Q1 2024 Q1 120
Exercises
Exercise 1: Basic Pivoting
Create a dataset of monthly temperatures for 3 cities across 4 months. Start in wide format, then:
1. Pivot to long format
2. Calculate the average temperature per city
3. Pivot back to wide format with cities as columns
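One possible starting point (the city names and temperatures below are made up for illustration):
# Starter data for Exercise 1 (made-up values)
temps_wide <- tibble(
city = c("Oslo", "Madrid", "Tokyo"),
jan = c(-4, 10, 5),
feb = c(-2, 12, 6),
mar = c(2, 15, 10),
apr = c(7, 18, 15)
)
# Hint: pivot_longer(cols = jan:apr, ...), then group_by(city) + summarize(),
# then pivot_wider(names_from = city) for the final step.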
Exercise 2: Data Cleaning Pipeline
Given messy survey data with:
- Combined name fields (“FirstName LastName”)
- Multiple responses in single cells (“option1;option2;option3”)
- Missing value combinations
Clean the data to create a tidy dataset ready for analysis.
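One possible practice input (all values invented):
# Starter data for Exercise 2 (made-up values)
survey_messy <- tibble(
respondent = c("Ana Lopez", "Ben King", "Cara Diaz"),
choices = c("option1;option3", "option2", "option1;option2;option3"),
score = c(4, NA, 5)
)
# Hint: separate() the name, separate_rows() the choices with sep = ";",
# and handle missing scores with drop_na() or replace_na().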
Exercise 3: Complex Reshaping
Create a sales dataset with:
- Multiple products
- Multiple regions
- Quarterly data for 2 years
Practice pivoting between different representations for different types of analysis.
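One possible skeleton to start from (all values invented):
# Starter data for Exercise 3 (made-up values)
sales_practice <- tibble(
product = rep(c("A", "B"), each = 8),
region = rep(rep(c("North", "South"), each = 4), times = 2),
year = rep(rep(c(2023, 2024), each = 2), times = 4),
quarter = rep(c("Q1", "Q2"), times = 8),
sales = round(runif(16, 100, 500))
)
# Try pivot_wider(names_from = c(year, quarter)) for a report-style layout,
# then pivot_longer() to return to the tidy form for plotting.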
Summary
tidyr provides essential tools for data tidying:
- pivot_longer() / pivot_wider(): Reshape between wide and long formats
- separate() / unite(): Split and combine columns
- complete(): Fill in missing combinations
- drop_na() / fill() / replace_na(): Handle missing values
- nest() / unnest(): Create and expand list-columns
These functions, combined with dplyr, form the foundation of data manipulation in the tidyverse. Tidy data makes subsequent analysis much easier and more intuitive.
The key insight: spend time getting your data tidy, and the rest of your analysis will be much simpler!