Separating and Uniting: Column Manipulation Mastery
Author
IND215
Published
September 22, 2025
The Art of Column Surgery 🔧
Real-world data often comes with information packed into single columns that needs to be split, or scattered across multiple columns that should be combined. The separate() and unite() functions in tidyr are your surgical tools for precise column manipulation.
Understanding the Problem
library(tidyverse)library(lubridate)# Common messy data scenarioscat("TYPICAL MESSY DATA PATTERNS:\n\n")
TYPICAL MESSY DATA PATTERNS:
# Example 1: Multiple values in one columncontact_data <-tibble(employee_id =1:5,full_name =c("Sarah Johnson", "Michael Chen", "Emily Rodriguez-Smith","David O'Connor", "Anna-Marie Dubois"),contact_info =c("sarah.j@company.com | 555-0123","m.chen@company.com | 555-0156 | ext:234","emily.rs@company.com","d.oconnor@company.com | 555-0189 | LinkedIn: davidoc","anna.dubois@company.com | 555-0167 | Skype: annadubois"))cat("Problem 1: Multiple contact methods in single column\n")
Problem 1: Multiple contact methods in single column
# Example 2: Date components scattered across columnsdate_components <-tibble(transaction_id =1:4,year =c(2023, 2023, 2024, 2024),month =c("Jan", "Dec", "Mar", "Jul"),day =c(15, 28, 8, 22),amount =c(1250.50, 899.99, 2100.00, 567.80))cat("\nProblem 2: Date components in separate columns\n")
Problem 2: Date components in separate columns
print(date_components)
# A tibble: 4 × 5
transaction_id year month day amount
<int> <dbl> <chr> <dbl> <dbl>
1 1 2023 Jan 15 1250.
2 2 2023 Dec 28 900.
3 3 2024 Mar 8 2100
4 4 2024 Jul 22 568.
# Example 3: Product codes with embedded informationproduct_data <-tibble(sku =c("ELC-LTOP-15-BLK-256", "HME-CHAI-STD-BRN-001", "OFF-DESK-ADJ-WHT-042"),description =c("Electronics Laptop 15inch Black 256GB","Home Chair Standard Brown","Office Desk Adjustable White"),price =c(899.99, 249.99, 399.99))cat("\nProblem 3: Structured codes that need parsing\n")
Problem 3: Structured codes that need parsing
print(product_data)
# A tibble: 3 × 3
sku description price
<chr> <chr> <dbl>
1 ELC-LTOP-15-BLK-256 Electronics Laptop 15inch Black 256GB 900.
2 HME-CHAI-STD-BRN-001 Home Chair Standard Brown 250.
3 OFF-DESK-ADJ-WHT-042 Office Desk Adjustable White 400.
separate(): Splitting One Column Into Many
Basic Separation with Delimiters
# Basic separation using delimiterscontact_separated <- contact_data %>%separate(col = contact_info,into =c("email", "phone", "other"),sep =" \\| ", # Regex pattern for " | "fill ="right"# Fill missing values on the right with NA )cat("Basic separation result:\n")
Basic separation result:
print(contact_separated)
# A tibble: 5 × 5
employee_id full_name email phone other
<int> <chr> <chr> <chr> <chr>
1 1 Sarah Johnson sarah.j@company.com 555-0123 <NA>
2 2 Michael Chen m.chen@company.com 555-0156 ext:234
3 3 Emily Rodriguez-Smith emily.rs@company.com <NA> <NA>
4 4 David O'Connor d.oconnor@company.com 555-0189 LinkedIn: …
5 5 Anna-Marie Dubois anna.dubois@company.com 555-0167 Skype: ann…
# Handle names with various formatsnames_separated <- contact_data %>%separate(col = full_name,into =c("first_name", "last_name"),sep =" ",extra ="merge"# If more than 2 parts, merge extras into last column )cat("\nName separation (handling middle names/hyphenated names):\n")
Name separation (handling middle names/hyphenated names):
# A tibble: 5 × 3
domain_type has_plus_addressing n
<chr> <lgl> <int>
1 Commercial FALSE 2
2 Educational FALSE 1
3 Organization FALSE 1
4 Other FALSE 1
5 Other TRUE 1
Handling Messy Real-World Data
# Realistic messy customer datamessy_customers <-tibble(record_id =1:8,customer_info =c("John Smith, 42, Manager, New York, NY, 10001","Sarah Johnson; 35; Developer; San Francisco; CA; 94102","Mike Chen | 28 | Designer | Austin TX 78701","Emily Rodriguez-Davis, 39, Director, Chicago, IL, 60601","David Kim; 45; Analyst; Seattle WA 98101","Anna Petrova, 33, Consultant, Boston, MA, 02101","Carlos Mendez | 41 | Manager | Miami FL 33101","Jennifer Wu, 37, Engineer, Portland, OR, 97201" ))cat("Messy customer data with mixed delimiters:\n")
Messy customer data with mixed delimiters:
print(messy_customers)
# A tibble: 8 × 2
record_id customer_info
<int> <chr>
1 1 John Smith, 42, Manager, New York, NY, 10001
2 2 Sarah Johnson; 35; Developer; San Francisco; CA; 94102
3 3 Mike Chen | 28 | Designer | Austin TX 78701
4 4 Emily Rodriguez-Davis, 39, Director, Chicago, IL, 60601
5 5 David Kim; 45; Analyst; Seattle WA 98101
6 6 Anna Petrova, 33, Consultant, Boston, MA, 02101
7 7 Carlos Mendez | 41 | Manager | Miami FL 33101
8 8 Jennifer Wu, 37, Engineer, Portland, OR, 97201
# Clean step by stepcustomers_cleaned <- messy_customers %>%# First, standardize delimitersmutate(customer_info =str_replace_all(customer_info, "[;|]", ","),customer_info =str_replace_all(customer_info, ", +", ",") ) %>%# Separate into main componentsseparate(col = customer_info,into =c("name", "age", "title", "location"),sep =",",extra ="merge" ) %>%# Parse location furtherseparate(col = location,into =c("city", "state", "zip"),sep =",",extra ="merge",fill ="right" ) %>%# Clean up the datamutate(across(everything(), str_trim), # Remove extra whitespaceage =as.numeric(age),# Handle state abbreviations with/without zipstate_zip =ifelse(is.na(zip), state, paste(state, zip)),state =str_extract(state_zip, "^[A-Z]{2}"),zip =str_extract(state_zip, "\\d{5}$") ) %>%select(-state_zip)cat("\nCleaned customer data:\n")
Cleaned customer data:
print(customers_cleaned)
# A tibble: 8 × 7
record_id name age title city state zip
<chr> <chr> <dbl> <chr> <chr> <chr> <chr>
1 1 John Smith 42 Manager New York NY 10001
2 2 Sarah Johnson 35 Developer San Francisco CA 94102
3 3 Mike Chen 28 Designer Austin TX 78701 <NA> <NA>
4 4 Emily Rodriguez-Davis 39 Director Chicago IL 60601
5 5 David Kim 45 Analyst Seattle WA 98101 <NA> <NA>
6 6 Anna Petrova 33 Consultant Boston MA 02101
7 7 Carlos Mendez 41 Manager Miami FL 33101 <NA> <NA>
8 8 Jennifer Wu 37 Engineer Portland OR 97201
# Validation and summarycat("\nData quality check:\n")
# A tibble: 6 × 4
dept_location employee_id full_name display_name
<chr> <chr> <chr> <chr>
1 IT_NYC IT-NYC-1001 John Smith Smith, John
2 HR_LA HR-LA-2001 Sarah Johnson Johnson, Sarah
3 IT_CHI IT-CHI-1002 Mike Chen Chen, Mike
4 Finance_NYC Finance-NYC-3001 Emily Rodriguez Rodriguez, Emily
5 HR_CHI HR-CHI-2002 David Kim Kim, David
6 IT_LA IT-LA-1003 Anna Petrova Petrova, Anna
# A tibble: 6 × 6
department location employee_num first_name last_name user_code
<chr> <chr> <dbl> <chr> <chr> <chr>
1 IT NYC 1001 John Smith ITNYJS
2 HR LA 2001 Sarah Johnson HRLASJ
3 IT CHI 1002 Mike Chen ITCHMC
4 Finance NYC 3001 Emily Rodriguez FINNYER
5 HR CHI 2002 David Kim HRCHDK
6 IT LA 1003 Anna Petrova ITLAAP
# Complex sales data requiring multiple operationscomplex_sales <-tibble(record =1:6,sales_rep_info =c("Alice Johnson (North) - alice.j@company.com","Bob Smith (South) - bob.smith@company.com","Carol Davis (East) - c.davis@company.com","David Wilson (West) - d.wilson@company.com","Emma Brown (North) - emma.b@company.com","Frank Miller (South) - f.miller@company.com"),transaction_details =c("Product:Widget A|Qty:100|Price:$25.99|Date:2023-01-15","Product:Widget B|Qty:75|Price:$45.50|Date:2023-01-18","Product:Widget C|Qty:150|Price:$15.75|Date:2023-01-22","Product:Widget A|Qty:200|Price:$25.99|Date:2023-01-25","Product:Widget B|Qty:125|Price:$45.50|Date:2023-01-28","Product:Widget C|Qty:80|Price:$15.75|Date:2023-02-01"))cat("Complex sales data to clean:\n")
Complex sales data to clean:
print(complex_sales)
# A tibble: 6 × 3
record sales_rep_info transaction_details
<int> <chr> <chr>
1 1 Alice Johnson (North) - alice.j@company.com Product:Widget A|Qty:100|P…
2 2 Bob Smith (South) - bob.smith@company.com Product:Widget B|Qty:75|Pr…
3 3 Carol Davis (East) - c.davis@company.com Product:Widget C|Qty:150|P…
4 4 David Wilson (West) - d.wilson@company.com Product:Widget A|Qty:200|P…
5 5 Emma Brown (North) - emma.b@company.com Product:Widget B|Qty:125|P…
6 6 Frank Miller (South) - f.miller@company.com Product:Widget C|Qty:80|Pr…
# A tibble: 6 × 7
rep_name region month_year transactions total_quantity total_revenue
<chr> <chr> <chr> <int> <dbl> <dbl>
1 Emma Brown North 2023-01 1 125 5688.
2 David Wilson West 2023-01 1 200 5198
3 Bob Smith South 2023-01 1 75 3412.
4 Alice Johnson North 2023-01 1 100 2599
5 Carol Davis East 2023-01 1 150 2362.
6 Frank Miller South 2023-02 1 80 1260
# ℹ 1 more variable: avg_transaction <dbl>
Dynamic Column Operations
# Survey data with varying response structuressurvey_responses <-tibble(respondent_id =1:5,demographics =c("Age:25|Gender:F|Education:Bachelor","Age:34|Gender:M|Education:Master|Income:75000","Age:42|Gender:F|Education:PhD|Income:95000|Years_Experience:15","Age:28|Gender:M|Education:Bachelor|Income:55000","Age:51|Gender:F|Education:Master|Income:85000|Years_Experience:20"),responses =c("Q1:5|Q2:4|Q3:3","Q1:4|Q2:5|Q3:4|Q4:3","Q1:3|Q2:3|Q3:5|Q4:4|Q5:5","Q1:5|Q2:4|Q3:4","Q1:4|Q2:3|Q3:3|Q4:5|Q5:4"))cat("Variable structure survey data:\n")
# Function to parse key-value pairsparse_key_value <-function(data, col_name) { data %>%separate_rows(!!sym(col_name), sep ="\\|") %>%separate(!!sym(col_name), into =c("key", "value"), sep =":") %>%pivot_wider(names_from = key, values_from = value)}# Parse demographicsdemographics_parsed <- survey_responses %>%select(respondent_id, demographics) %>%parse_key_value("demographics") %>%mutate(Age =as.numeric(Age),Income =as.numeric(Income),Years_Experience =as.numeric(Years_Experience) )cat("\nParsed demographics:\n")
Parsed demographics:
print(demographics_parsed)
# A tibble: 5 × 6
respondent_id Age Gender Education Income Years_Experience
<int> <dbl> <chr> <chr> <dbl> <dbl>
1 1 25 F Bachelor NA NA
2 2 34 M Master 75000 NA
3 3 42 F PhD 95000 15
4 4 28 M Bachelor 55000 NA
5 5 51 F Master 85000 20
cat(" - Set fill and extra parameters appropriately\n")
- Set fill and extra parameters appropriately
cat(" - Validate results after each step\n\n")
- Validate results after each step
cat("4. TESTING APPROACH:\n")
4. TESTING APPROACH:
test_data <-tibble(id =1:3,messy_col =c("A|B|C", "X|Y", "P|Q|R|S|T"))# Test with small sample firsttest_result <-robust_separation(test_data, "messy_col", c("col1", "col2", "col3"), "\\|")cat(" - Always test with small samples first\n")
- Always test with small samples first
cat(" - Check edge cases (missing values, extra delimiters)\n")
- Check edge cases (missing values, extra delimiters)
cat(" - Verify column counts and types\n")
- Verify column counts and types
print(test_result)
# A tibble: 3 × 4
id col1 col2 col3
<int> <chr> <chr> <chr>
1 1 A B C
2 2 X Y <NA>
3 3 P Q R|S|T
cat("\n5. REGEX BEST PRACTICES:\n")
5. REGEX BEST PRACTICES:
cat(" - Escape special characters: \\\\| for literal pipe\n")
- Escape special characters: \\| for literal pipe
cat(" - Use anchors: ^ for start, $ for end\n")
- Use anchors: ^ for start, $ for end
cat(" - Test patterns: str_detect() before separate()\n")
- Test patterns: str_detect() before separate()
cat(" - Consider str_split() for complex patterns\n")
address_data <-tibble(id =1:3,full_address =c("123 Main St, Apt 4B, New York, NY 10001","456 Oak Avenue, San Francisco, CA 94102","789 Pine Road, Unit 12, Chicago, IL 60601"))addresses_parsed <- address_data %>%separate(full_address, into =c("street", "city_state_zip"), sep =", (?=[^,]*,[^,]*$)") %>%separate(city_state_zip, into =c("city", "state_zip"), sep =", ") %>%separate(state_zip, into =c("state", "zip"), sep =" ")print(addresses_parsed)
# A tibble: 3 × 5
id street city state zip
<int> <chr> <chr> <chr> <chr>
1 1 123 Main St, Apt 4B New York NY 10001
2 2 456 Oak Avenue San Francisco CA 94102
3 3 789 Pine Road, Unit 12 Chicago IL 60601
cat("\n2. NAME STANDARDIZATION:\n")
2. NAME STANDARDIZATION:
name_variations <-tibble(id =1:4,name_input =c("Dr. John Smith Jr.", "Ms. Sarah Johnson-Brown","Michael Chen", "Prof. Emily Rodriguez PhD"))names_standardized <- name_variations %>%mutate(# Remove titles and suffixes for core nameclean_name =str_remove_all(name_input, "\\b(Dr\\.|Ms\\.|Prof\\.|Jr\\.|PhD)\\b"),clean_name =str_trim(clean_name) ) %>%separate(clean_name, into =c("first", "last"), sep =" ", extra ="merge")print(names_standardized)
# A tibble: 4 × 4
id name_input first last
<int> <chr> <chr> <chr>
1 1 Dr. John Smith Jr. Dr. John Smith Jr.
2 2 Ms. Sarah Johnson-Brown Ms. Sarah Johnson-Brown
3 3 Michael Chen Michael Chen
4 4 Prof. Emily Rodriguez PhD Prof. Emily Rodriguez