# Absolute paths (full path from root)
# Windows: "C:/Users/YourName/Documents/data.csv"
# Mac/Linux: "/home/username/documents/data.csv"
# Relative paths (relative to current working directory)
# "data.csv" - file in current directory
# "data/sales.csv" - file in 'data' subdirectory
# "../data.csv" - file in parent directory
Working with Files in R
Introduction to File Operations
Working with files is a fundamental part of data analysis. In this section, you’ll learn how to:
- Read data from various file formats
- Write data to files for sharing and storage
- Navigate file systems and organize projects
- Handle different data formats (CSV, Excel, text files)
- Work with file paths across different operating systems
R provides powerful tools for file operations that integrate seamlessly with data analysis workflows.
Understanding File Paths
Absolute vs Relative Paths
Working Directory Operations
# Check current working directory
current_dir <- getwd()
print(current_dir)
[1] "/home/simon/githubRepos/intro-to-R-IND215/website/modules/module-2"
# List files in current directory
files_here <- list.files()
print(files_here)
[1] "control-structures.qmd" "data-structures.qmd"
[3] "data-types.qmd" "functions.html"
[5] "functions.qmd" "index.html"
[7] "index.qmd" "vectors.html"
[9] "vectors.qmd" "working-with-files.qmd"
[11] "working-with-files.rmarkdown"
# List files with full paths
files_full <- list.files(full.names = TRUE)
head(files_full)
[1] "./control-structures.qmd" "./data-structures.qmd"
[3] "./data-types.qmd" "./functions.html"
[5] "./functions.qmd" "./index.html"
# List files with specific pattern
csv_files <- list.files(pattern = "\\.csv$")
print(csv_files)
character(0)
# Check if file exists
file.exists("data.csv")
[1] FALSE
file.exists("nonexistent.csv")
[1] FALSE
Creating Directories and File Paths
# Create directory (if it doesn't exist)
if (!dir.exists("output")) {
dir.create("output")
cat("Created 'output' directory\n")
}
Created 'output' directory
# Create nested directories
if (!dir.exists("data/raw")) {
dir.create("data/raw", recursive = TRUE)
cat("Created 'data/raw' directory structure\n")
}
Created 'data/raw' directory structure
# Build file paths safely (works across operating systems)
data_path <- file.path("data", "raw", "sales_data.csv")
print(data_path)
[1] "data/raw/sales_data.csv"
# Get file information
if (file.exists(data_path)) {
  file_info <- file.info(data_path)
  print(file_info)
}
Reading Data Files
Reading CSV Files
CSV (Comma-Separated Values) files are the most common format for data exchange:
# Create sample CSV data for demonstration
sample_data <- data.frame(
  name = c("Alice", "Bob", "Charlie", "Diana"),
  age = c(25, 30, 35, 28),
  salary = c(50000, 60000, 55000, 52000),
  department = c("Sales", "IT", "Marketing", "HR")
)
# Write sample data to demonstrate reading
write.csv(sample_data, "sample_employees.csv", row.names = FALSE)
# Read CSV file
employees <- read.csv("sample_employees.csv")
print(employees)
name age salary department
1 Alice 25 50000 Sales
2 Bob 30 60000 IT
3 Charlie 35 55000 Marketing
4 Diana 28 52000 HR
# Check structure
str(employees)
'data.frame': 4 obs. of 4 variables:
$ name : chr "Alice" "Bob" "Charlie" "Diana"
$ age : int 25 30 35 28
$ salary : int 50000 60000 55000 52000
$ department: chr "Sales" "IT" "Marketing" "HR"
# Read CSV with custom settings
employees_custom <- read.csv("sample_employees.csv",
                             stringsAsFactors = FALSE, # Keep strings as characters
                             header = TRUE,            # First row contains column names
                             sep = ",")                # Comma separator
print(employees_custom)
name age salary department
1 Alice 25 50000 Sales
2 Bob 30 60000 IT
3 Charlie 35 55000 Marketing
4 Diana 28 52000 HR
Handling Different Separators
# Create tab-separated file
write.table(sample_data, "sample_employees.tsv",
sep = "\t", row.names = FALSE, quote = FALSE)
# Read tab-separated file
employees_tsv <- read.table("sample_employees.tsv",
                            header = TRUE, sep = "\t",
                            stringsAsFactors = FALSE)
print(employees_tsv)
name age salary department
1 Alice 25 50000 Sales
2 Bob 30 60000 IT
3 Charlie 35 55000 Marketing
4 Diana 28 52000 HR
# Create semicolon-separated file (common in European locales)
write.table(sample_data, "sample_employees_semi.csv",
sep = ";", row.names = FALSE, quote = FALSE)
# Read semicolon-separated file
employees_semi <- read.table("sample_employees_semi.csv",
                             header = TRUE, sep = ";",
                             stringsAsFactors = FALSE)
print(employees_semi)
name age salary department
1 Alice 25 50000 Sales
2 Bob 30 60000 IT
3 Charlie 35 55000 Marketing
4 Diana 28 52000 HR
Reading Text Files
# Create a simple text file
text_content <- c(
  "This is line 1",
  "This is line 2",
  "This is line 3 with more text",
  "Final line"
)
writeLines(text_content, "sample_text.txt")
# Read entire file as character vector
file_lines <- readLines("sample_text.txt")
print(file_lines)
[1] "This is line 1" "This is line 2"
[3] "This is line 3 with more text" "Final line"
# Read specific number of lines
first_two_lines <- readLines("sample_text.txt", n = 2)
print(first_two_lines)
[1] "This is line 1" "This is line 2"
# Read file as single string
file_content <- paste(readLines("sample_text.txt"), collapse = "\n")
cat(file_content)
This is line 1
This is line 2
This is line 3 with more text
Final line
Reading Excel Files
# Note: This requires the readxl package
# install.packages("readxl")
library(readxl)
# Read Excel file (first sheet by default)
excel_data <- read_excel("data.xlsx")

# Read specific sheet
excel_data_sheet2 <- read_excel("data.xlsx", sheet = "Sheet2")

# Read specific sheet by number
excel_data_sheet1 <- read_excel("data.xlsx", sheet = 1)

# Read with specific range
excel_range <- read_excel("data.xlsx", range = "A1:D10")

# Skip rows and specify column types
excel_custom <- read_excel("data.xlsx",
                           skip = 2, # Skip first 2 rows
                           col_types = c("text", "numeric", "date"))
Writing Data Files
Writing CSV Files
# Create analysis results
analysis_results <- data.frame(
  metric = c("Total Sales", "Average Sale", "Max Sale", "Min Sale"),
  value = c(125000, 2500, 15000, 250),
  period = rep("Q1 2024", 4)
)
# Write to CSV
write.csv(analysis_results, "analysis_results.csv", row.names = FALSE)
# Write with custom settings
write.csv(analysis_results, "analysis_results_custom.csv",
row.names = FALSE, # Don't include row numbers
quote = FALSE, # Don't quote text fields
na = "") # How to represent missing values
# Verify the file was written
if (file.exists("analysis_results.csv")) {
cat("File written successfully!\n")
cat("File size:", file.info("analysis_results.csv")$size, "bytes\n")
}
File written successfully!
File size: 139 bytes
Writing Other Formats
# Write tab-separated file
write.table(analysis_results, "analysis_results.tsv",
sep = "\t", row.names = FALSE, quote = FALSE)
# Write space-separated file
write.table(analysis_results, "analysis_results.txt",
sep = " ", row.names = FALSE, quote = FALSE)
# Write with custom separator
write.table(analysis_results, "analysis_results_pipe.txt",
sep = "|", row.names = FALSE, quote = FALSE)
# Write R data file (preserves data types perfectly)
save(analysis_results, file = "analysis_results.RData")
# Write individual object to RDS file (more efficient)
saveRDS(analysis_results, "analysis_results.rds")
Writing Text Files
# Create a report
report_lines <- c(
  "Data Analysis Report",
  "====================",
  "",
  paste("Generated on:", Sys.Date()),
  paste("Total records analyzed:", nrow(sample_data)),
  "",
  "Summary Statistics:",
  paste("Average age:", round(mean(sample_data$age), 1)),
  paste("Average salary: $", format(mean(sample_data$salary), big.mark = ",")),
  "",
  "End of report"
)
# Write report to text file
writeLines(report_lines, "analysis_report.txt")
# Write using cat() for more control
cat("Custom Report\n",
"=============\n\n",
"This report was generated using cat().\n",
"Current time:", format(Sys.time()), "\n",
file = "custom_report.txt")
# Append to existing file
cat("\nAdditional information added later.\n",
file = "custom_report.txt", append = TRUE)
Project Organization
Creating a Standard Project Structure
# Function to create standard project directories
# Creates `project_name/` with a conventional data-science layout
# (data/raw, data/processed, scripts, output, docs) and a README.md
# describing the structure. Existing directories are left untouched.
create_project_structure <- function(project_name) {

  # Create main project directory
  if (!dir.exists(project_name)) {
    dir.create(project_name)
  }

  # Create subdirectories (recursive = TRUE builds nested paths in one call)
  subdirs <- c("data/raw", "data/processed", "scripts", "output", "docs")

  for (subdir in subdirs) {
    full_path <- file.path(project_name, subdir)
    if (!dir.exists(full_path)) {
      dir.create(full_path, recursive = TRUE)
      cat("Created:", full_path, "\n")
    }
  }

  # Create README file documenting the layout
  readme_content <- c(
    paste("#", project_name),
    "",
    "## Project Structure",
    "- `data/raw/` - Original, unmodified data",
    "- `data/processed/` - Cleaned and processed data",
    "- `scripts/` - R scripts for analysis",
    "- `output/` - Results, plots, reports",
    "- `docs/` - Documentation",
    "",
    paste("Created:", Sys.Date())
  )

  writeLines(readme_content, file.path(project_name, "README.md"))
  cat("Created README.md\n")

  cat("Project structure created successfully!\n")
}
# Create a sample project
create_project_structure("my_analysis_project")
Created: my_analysis_project/data/raw
Created: my_analysis_project/data/processed
Created: my_analysis_project/scripts
Created: my_analysis_project/output
Created: my_analysis_project/docs
Created README.md
Project structure created successfully!
# List the created structure
if (dir.exists("my_analysis_project")) {
cat("\nProject structure:\n")
<- list.files("my_analysis_project", recursive = TRUE, include.dirs = TRUE)
files for (file in files) {
cat(" ", file, "\n")
} }
Project structure:
data
data/processed
data/raw
docs
output
README.md
scripts
File Naming Conventions
# Good file naming practices
# Creates demonstration script files inside a local "scripts" directory
# (relative to the working directory) using date-stamped and
# sequence-numbered names, then prints a naming-convention checklist.
create_analysis_files <- function() {

  # Use ISO date format for chronological ordering
  today <- format(Sys.Date(), "%Y%m%d")

  # Descriptive names with underscores
  good_names <- c(
    paste0(today, "_sales_analysis.R"),
    paste0(today, "_customer_data_cleaning.R"),
    "01_data_import.R",
    "02_data_cleaning.R",
    "03_exploratory_analysis.R",
    "04_modeling.R",
    "05_report_generation.R"
  )

  # Ensure the target directory exists once, before the loop
  # (the original re-checked this on every iteration)
  if (!dir.exists("scripts")) {
    dir.create("scripts")
  }

  # Create empty files to demonstrate naming
  for (name in good_names) {
    file_path <- file.path("scripts", name)
    if (!file.exists(file_path)) {
      file.create(file_path)
      cat("Created:", name, "\n")
    }
  }

  cat("\nFile naming best practices:\n")
  cat("✓ Use dates in YYYYMMDD format\n")
  cat("✓ Use descriptive names\n")
  cat("✓ Use underscores instead of spaces\n")
  cat("✓ Use numbers for sequential scripts\n")
  cat("✓ Avoid special characters\n")
}
create_analysis_files()
Created: 20250922_sales_analysis.R
Created: 20250922_customer_data_cleaning.R
Created: 01_data_import.R
Created: 02_data_cleaning.R
Created: 03_exploratory_analysis.R
Created: 04_modeling.R
Created: 05_report_generation.R
File naming best practices:
✓ Use dates in YYYYMMDD format
✓ Use descriptive names
✓ Use underscores instead of spaces
✓ Use numbers for sequential scripts
✓ Avoid special characters
Advanced File Operations
Reading Large Files Efficiently
# Function to read large files in chunks
# Reads a CSV file `chunk_size` data rows at a time and reassembles the
# chunks into a single data frame. Useful when a file is too large to
# parse in one pass but individual chunks fit in memory.
#
# filename   - path to a CSV file with a header row
# chunk_size - number of data rows to read per chunk (default 1000)
# Returns the combined data frame.
read_large_csv_chunked <- function(filename, chunk_size = 1000) {

  # First, count total rows (subtract 1 for the header line)
  total_rows <- length(readLines(filename)) - 1
  cat("Total rows to process:", total_rows, "\n")

  # Read file in chunks
  all_data <- list()
  connection <- file(filename, "r")
  # Guarantee the connection is closed even if a read/parse step errors;
  # the original leaked the connection on error
  on.exit(close(connection), add = TRUE)

  # Read header first so every chunk can be parsed with column names
  header <- readLines(connection, n = 1)

  chunk_num <- 1
  while (TRUE) {
    # Read chunk
    lines <- readLines(connection, n = chunk_size)
    if (length(lines) == 0) break  # End of file

    # Re-attach the header and parse the chunk as CSV text
    chunk_text <- c(header, lines)
    chunk_data <- read.csv(text = paste(chunk_text, collapse = "\n"))

    all_data[[chunk_num]] <- chunk_data
    cat("Processed chunk", chunk_num, "with", nrow(chunk_data), "rows\n")
    chunk_num <- chunk_num + 1
  }

  # Combine all chunks into one data frame
  final_data <- do.call(rbind, all_data)
  return(final_data)
}
# For demonstration, create a larger file
large_sample <- data.frame(
  id = 1:500,
  name = paste("Person", 1:500),
  value = runif(500, 1, 100),
  category = sample(c("A", "B", "C"), 500, replace = TRUE)
)
write.csv(large_sample, "large_sample.csv", row.names = FALSE)
# Read it back in chunks
# chunked_data <- read_large_csv_chunked("large_sample.csv", chunk_size = 100)
Working with Compressed Files
# Create sample data
compression_demo <- data.frame(
  x = 1:1000,
  y = rnorm(1000),
  text = paste("Text entry", 1:1000)
)
# Write regular CSV
write.csv(compression_demo, "uncompressed.csv", row.names = FALSE)
# Write compressed CSV (gzip)
gz_connection <- gzfile("compressed.csv.gz", "w")
write.csv(compression_demo, gz_connection, row.names = FALSE)
close(gz_connection)
# Read compressed file (read.csv decompresses .gz transparently)
compressed_data <- read.csv("compressed.csv.gz")
# Compare file sizes
uncompressed_size <- file.info("uncompressed.csv")$size
compressed_size <- file.info("compressed.csv.gz")$size
cat("Uncompressed size:", uncompressed_size, "bytes\n")
Uncompressed size: 38967 bytes
cat("Compressed size:", compressed_size, "bytes\n")
Compressed size: 14512 bytes
cat("Compression ratio:", round(compressed_size / uncompressed_size * 100, 1), "%\n")
Compression ratio: 37.2 %
File Manipulation
# Copy files
if (file.exists("sample_employees.csv")) {
file.copy("sample_employees.csv", "backup_employees.csv")
cat("File copied successfully\n")
}
File copied successfully
# Rename files
if (file.exists("backup_employees.csv")) {
file.rename("backup_employees.csv", "employees_backup.csv")
cat("File renamed successfully\n")
}
File renamed successfully
# Get file information
file_info <- file.info("sample_employees.csv")
cat("File size:", file_info$size, "bytes\n")
File size: 133 bytes
cat("Created:", format(file_info$ctime), "\n")
Created: 2025-09-22 01:02:39
cat("Modified:", format(file_info$mtime), "\n")
Modified: 2025-09-22 01:02:39
# Remove files (be careful!)
temp_files <- c("temp1.txt", "temp2.txt")
for (temp_file in temp_files) {
  cat("Creating temporary file:", temp_file, "\n")
  writeLines("Temporary content", temp_file)
}
Creating temporary file: temp1.txt
Creating temporary file: temp2.txt
# List temporary files
existing_temp <- temp_files[file.exists(temp_files)]
cat("Temporary files found:", paste(existing_temp, collapse = ", "), "\n")
Temporary files found: temp1.txt, temp2.txt
# Remove temporary files
for (temp_file in existing_temp) {
file.remove(temp_file)
cat("Removed:", temp_file, "\n")
}
Removed: temp1.txt
Removed: temp2.txt
Data Import/Export Best Practices
Robust Data Reading Function
# Robust function for reading CSV files
# Validates that `filename` exists, reports its size, reads it with
# stringsAsFactors = FALSE, and prints basic data-quality diagnostics
# (row/column counts, per-column missing-value counts).
# Extra arguments in `...` are forwarded to read.csv().
# Returns the data frame, or stops with an informative error.
read_csv_safe <- function(filename, ...) {

  # Check if file exists
  if (!file.exists(filename)) {
    stop("File does not exist: ", filename)
  }

  # Get file information
  file_info <- file.info(filename)
  cat("Reading file:", filename, "\n")
  cat("File size:", file_info$size, "bytes\n")

  # Try to read the file
  tryCatch({
    data <- read.csv(filename, stringsAsFactors = FALSE, ...)
    cat("Successfully read", nrow(data), "rows and", ncol(data), "columns\n")

    # Basic data quality checks
    if (nrow(data) == 0) {
      warning("File contains no data rows")
    }

    # Check for missing values; vapply guarantees an integer vector
    # regardless of the data's shape (sapply can change return type)
    missing_counts <- vapply(data, function(x) sum(is.na(x)), integer(1))
    if (any(missing_counts > 0)) {
      cat("Missing values found:\n")
      for (col in names(missing_counts)[missing_counts > 0]) {
        cat(" ", col, ":", missing_counts[col], "\n")
      }
    }

    return(data)
  }, error = function(e) {
    stop("Error reading file: ", e$message)
  })
}
# Test the robust reading function
safe_data <- read_csv_safe("sample_employees.csv")
Reading file: sample_employees.csv
File size: 133 bytes
Successfully read 4 rows and 4 columns
str(safe_data)
'data.frame': 4 obs. of 4 variables:
$ name : chr "Alice" "Bob" "Charlie" "Diana"
$ age : int 25 30 35 28
$ salary : int 50000 60000 55000 52000
$ department: chr "Sales" "IT" "Marketing" "HR"
Data Export with Metadata
# Function to export data with metadata
# Writes `data` to `filename` as CSV and writes a companion
# "<name>_metadata.txt" file describing the export: dimensions, column
# names and types, export date, and R version.
# Returns the metadata list invisibly (so the call pipes/prints cleanly).
export_with_metadata <- function(data, filename, description = "") {

  # Create metadata
  metadata <- list(
    filename = filename,
    description = description,
    export_date = Sys.time(),
    rows = nrow(data),
    columns = ncol(data),
    column_names = names(data),
    # class() can return more than one value per column (e.g. POSIXct
    # columns are c("POSIXct", "POSIXt")); keep only the first so each
    # column maps to exactly one type string in the report below
    column_types = vapply(data, function(x) class(x)[1], character(1)),
    r_version = R.version.string
  )

  # Write data file
  write.csv(data, filename, row.names = FALSE)

  # Write metadata file alongside the data file
  metadata_filename <- sub("\\.csv$", "_metadata.txt", filename)

  metadata_text <- c(
    "Data Export Metadata",
    "===================",
    "",
    paste("File:", metadata$filename),
    paste("Description:", metadata$description),
    paste("Export Date:", metadata$export_date),
    paste("R Version:", metadata$r_version),
    "",
    "Data Summary:",
    paste(" Rows:", metadata$rows),
    paste(" Columns:", metadata$columns),
    "",
    "Column Information:",
    paste(" Names:", paste(metadata$column_names, collapse = ", ")),
    "",
    "Column Types:"
  )

  # One "name : type" line per column
  for (i in seq_along(metadata$column_types)) {
    metadata_text <- c(metadata_text,
      paste(" ", metadata$column_names[i], ":", metadata$column_types[i]))
  }

  writeLines(metadata_text, metadata_filename)

  cat("Data exported to:", filename, "\n")
  cat("Metadata exported to:", metadata_filename, "\n")

  return(invisible(metadata))
}
# Test the export function
export_with_metadata(sample_data, "employees_with_metadata.csv",
"Sample employee data for training purposes")
Data exported to: employees_with_metadata.csv
Metadata exported to: employees_with_metadata_metadata.txt
Practical Examples
Example 1: Data Pipeline
# Complete data processing pipeline
# Reads raw sales data from `input_file`, removes duplicate rows and rows
# with missing `sales`, adds a High/Low `sales_category` field (relative
# to the mean), and writes the cleaned data plus a text summary into
# `output_dir`. Steps involving the `sales` column are skipped when the
# input has no such column. Returns the cleaned data frame.
process_sales_data <- function(input_file, output_dir = "output") {

  cat("=== Sales Data Processing Pipeline ===\n")

  # Create output directory
  if (!dir.exists(output_dir)) {
    dir.create(output_dir)
  }

  # Step 1: Read raw data
  cat("Step 1: Reading raw data...\n")
  tryCatch({
    # tryCatch evaluates this block in the function's frame, so raw_data
    # stays visible after the call
    raw_data <- read.csv(input_file, stringsAsFactors = FALSE)
    cat(" ✓ Read", nrow(raw_data), "records\n")
  }, error = function(e) {
    stop("Failed to read input file: ", e$message)
  })

  # Step 2: Data cleaning
  cat("Step 2: Cleaning data...\n")

  # Remove duplicates
  initial_rows <- nrow(raw_data)
  clean_data <- unique(raw_data)
  duplicates_removed <- initial_rows - nrow(clean_data)
  if (duplicates_removed > 0) {
    cat(" ✓ Removed", duplicates_removed, "duplicate records\n")
  }

  # Handle missing values (example: remove rows with missing sales)
  if ("sales" %in% names(clean_data)) {
    clean_data <- clean_data[!is.na(clean_data$sales), ]
    cat(" ✓ Removed records with missing sales values\n")
  }

  # Step 3: Add calculated fields
  cat("Step 3: Adding calculated fields...\n")
  if ("sales" %in% names(clean_data)) {
    clean_data$sales_category <- ifelse(clean_data$sales > mean(clean_data$sales),
                                        "High", "Low")
    cat(" ✓ Added sales category field\n")
  }

  # Step 4: Generate summary
  cat("Step 4: Generating summary...\n")
  summary_stats <- list(
    total_records = nrow(clean_data),
    date_processed = Sys.time()
  )

  if ("sales" %in% names(clean_data)) {
    summary_stats$total_sales <- sum(clean_data$sales)
    summary_stats$avg_sales <- mean(clean_data$sales)
    summary_stats$max_sales <- max(clean_data$sales)
    summary_stats$min_sales <- min(clean_data$sales)
  }

  # Step 5: Export results
  cat("Step 5: Exporting results...\n")

  # Export cleaned data
  output_file <- file.path(output_dir, "cleaned_sales_data.csv")
  write.csv(clean_data, output_file, row.names = FALSE)
  cat(" ✓ Exported cleaned data to", output_file, "\n")

  # Export summary
  summary_file <- file.path(output_dir, "processing_summary.txt")
  summary_text <- c(
    "Sales Data Processing Summary",
    "============================",
    "",
    paste("Processing Date:", summary_stats$date_processed),
    paste("Total Records:", summary_stats$total_records),
    ""
  )

  if ("sales" %in% names(clean_data)) {
    summary_text <- c(summary_text,
      paste("Total Sales: $", format(summary_stats$total_sales, big.mark = ",")),
      paste("Average Sales: $", round(summary_stats$avg_sales, 2)),
      paste("Max Sales: $", format(summary_stats$max_sales, big.mark = ",")),
      paste("Min Sales: $", format(summary_stats$min_sales, big.mark = ",")))
  }

  writeLines(summary_text, summary_file)
  cat(" ✓ Exported summary to", summary_file, "\n")

  cat("=== Pipeline completed successfully! ===\n")
  return(clean_data)
}
# Create sample sales data for testing
sample_sales <- data.frame(
  date = seq(as.Date("2024-01-01"), by = "day", length.out = 100),
  sales = round(runif(100, 1000, 5000), 2),
  region = sample(c("North", "South", "East", "West"), 100, replace = TRUE)
)
# Add some duplicates and missing values for demonstration
sample_sales <- rbind(sample_sales, sample_sales[1:5, ]) # Add duplicates
sample_sales$sales[c(10, 25, 50)] <- NA # Add missing values

write.csv(sample_sales, "sample_sales_raw.csv", row.names = FALSE)
# Run the pipeline
processed_data <- process_sales_data("sample_sales_raw.csv")
=== Sales Data Processing Pipeline ===
Step 1: Reading raw data...
✓ Read 105 records
Step 2: Cleaning data...
✓ Removed 5 duplicate records
✓ Removed records with missing sales values
Step 3: Adding calculated fields...
✓ Added sales category field
Step 4: Generating summary...
Step 5: Exporting results...
✓ Exported cleaned data to output/cleaned_sales_data.csv
✓ Exported summary to output/processing_summary.txt
=== Pipeline completed successfully! ===
Example 2: Automated Reporting
# Function to generate automated reports
# Builds a plain-text profile of `data`: a header with timestamp and
# R version, an overview (dimensions, column names), and a per-column
# analysis (type, missing count, numeric summaries or categorical
# unique/top values). Writes the report to `output_file` and returns
# the report lines as a character vector.
generate_data_report <- function(data, output_file = "data_report.txt") {

  # Start report
  report_lines <- c(
    "AUTOMATED DATA REPORT",
    paste(rep("=", 40), collapse = ""),
    "",
    paste("Generated:", format(Sys.time(), "%Y-%m-%d %H:%M:%S")),
    paste("R Version:", R.version.string),
    ""
  )

  # Data overview
  report_lines <- c(report_lines,
    "DATA OVERVIEW",
    paste(rep("-", 20), collapse = ""),
    paste("Rows:", nrow(data)),
    paste("Columns:", ncol(data)),
    paste("Column Names:", paste(names(data), collapse = ", ")),
    ""
  )

  # Column analysis
  report_lines <- c(report_lines, "COLUMN ANALYSIS", paste(rep("-", 20), collapse = ""))

  for (col_name in names(data)) {
    col_data <- data[[col_name]]

    report_lines <- c(report_lines, paste("Column:", col_name))
    # class() can return several values (e.g. POSIXct); report the first
    report_lines <- c(report_lines, paste(" Type:", class(col_data)[1]))
    report_lines <- c(report_lines, paste(" Missing:", sum(is.na(col_data))))

    if (is.numeric(col_data)) {
      report_lines <- c(report_lines,
        paste(" Min:", round(min(col_data, na.rm = TRUE), 3)),
        paste(" Max:", round(max(col_data, na.rm = TRUE), 3)),
        paste(" Mean:", round(mean(col_data, na.rm = TRUE), 3)),
        paste(" Std Dev:", round(sd(col_data, na.rm = TRUE), 3))
      )
    } else if (is.character(col_data) || is.factor(col_data)) {
      unique_vals <- length(unique(col_data[!is.na(col_data)]))
      report_lines <- c(report_lines, paste(" Unique Values:", unique_vals))

      # Only list values for low-cardinality columns, most frequent first
      if (unique_vals <= 10) {
        top_values <- names(sort(table(col_data), decreasing = TRUE))[1:min(5, unique_vals)]
        report_lines <- c(report_lines, paste(" Top Values:", paste(top_values, collapse = ", ")))
      }
    }

    report_lines <- c(report_lines, "")
  }

  # Write report
  writeLines(report_lines, output_file)
  cat("Report generated:", output_file, "\n")

  # Return report content for display
  return(report_lines)
}
# Generate report for our sample data
report_content <- generate_data_report(sample_data, "employee_data_report.txt")
Report generated: employee_data_report.txt
# Display first part of report
cat(paste(report_content[1:20], collapse = "\n"))
AUTOMATED DATA REPORT
========================================
Generated: 2025-09-22 01:02:39
R Version: R version 4.3.2 (2023-10-31)
DATA OVERVIEW
--------------------
Rows: 4
Columns: 4
Column Names: name, age, salary, department
COLUMN ANALYSIS
--------------------
Column: name
Type: character
Missing: 0
Unique Values: 4
Top Values: Alice, Bob, Charlie, Diana
Common Pitfalls and Solutions
1. Encoding Issues
# Handle different text encodings
# Attempts to read a CSV file, falling back through a list of common
# encodings until one parses successfully. Returns the data frame from
# the first successful read, or stops if every encoding fails.
read_with_encoding <- function(filename, encoding = "UTF-8") {

  # Try the requested encoding first, then common fallbacks; unique()
  # avoids re-trying the same encoding twice (the original attempted
  # the default "UTF-8" twice in a row)
  encodings_to_try <- unique(c(encoding, "UTF-8", "latin1", "CP1252"))

  for (enc in encodings_to_try) {
    tryCatch({
      cat("Trying encoding:", enc, "\n")
      data <- read.csv(filename, fileEncoding = enc, stringsAsFactors = FALSE)
      cat("Success with encoding:", enc, "\n")
      # return() inside tryCatch's expression exits the whole function
      return(data)
    }, error = function(e) {
      cat("Failed with", enc, ":", e$message, "\n")
    })
  }

  stop("Could not read file with any of the attempted encodings")
}
2. Large File Handling
# Tips for handling large files
# Prints a numbered checklist of strategies for working with large data
# files (sampling, explicit column types, faster readers, chunking,
# compression, binary formats). Output only; returns nothing useful.
large_file_tips <- function() {
  tips <- c(
    "Tips for working with large files:",
    "1. Use read.csv with nrows parameter to test structure:",
    " sample <- read.csv('large_file.csv', nrows = 100)",
    "",
    "2. Specify column types to save memory:",
    " col_classes <- c('character', 'numeric', 'Date')",
    " data <- read.csv('file.csv', colClasses = col_classes)",
    "",
    "3. Consider using data.table or readr for better performance:",
    " library(data.table)",
    " data <- fread('large_file.csv')",
    "",
    "4. Process in chunks if memory is limited",
    "5. Use compression for storage (gzip, bzip2)",
    "6. Consider binary formats like RDS for R-specific data"
  )
  # writeLines to stdout emits one "\n"-terminated line per element,
  # producing byte-identical output to the original cat() sequence
  writeLines(tips)
}
large_file_tips()
Tips for working with large files:
1. Use read.csv with nrows parameter to test structure:
sample <- read.csv('large_file.csv', nrows = 100)
2. Specify column types to save memory:
col_classes <- c('character', 'numeric', 'Date')
data <- read.csv('file.csv', colClasses = col_classes)
3. Consider using data.table or readr for better performance:
library(data.table)
data <- fread('large_file.csv')
4. Process in chunks if memory is limited
5. Use compression for storage (gzip, bzip2)
6. Consider binary formats like RDS for R-specific data
Exercises
Exercise 1: File Organization
Create a function that: 1. Takes a directory path as input 2. Creates a standard data science project structure 3. Adds a README file with project description 4. Creates sample R scripts with appropriate names
Exercise 2: Data Import Pipeline
Write a robust data import function that: 1. Checks if the file exists 2. Determines the file type automatically 3. Reads the data with appropriate functions 4. Performs basic data quality checks 5. Returns a list with data and quality report
Exercise 3: Batch Processing
Create a function that: 1. Processes multiple CSV files in a directory 2. Combines them into a single dataset 3. Handles files with different structures gracefully 4. Exports the combined data with a processing log
Exercise 4: Report Generation
Build an automated reporting system that: 1. Takes a dataset as input 2. Generates summary statistics 3. Creates visualizations (if plotting libraries available) 4. Exports everything to a formatted text report 5. Includes metadata about the analysis
Summary
Working with files effectively is crucial for data analysis workflows:
Key Concepts:
- File paths: Understanding absolute vs relative paths and cross-platform compatibility
- Reading data: CSV, text, Excel, and other formats with appropriate functions
- Writing data: Exporting results in various formats with proper encoding
- Project organization: Creating consistent, logical file structures
- Error handling: Robust file operations with proper validation
Best Practices:
- Use consistent naming conventions for files and directories
- Validate inputs before processing files
- Handle errors gracefully with informative messages
- Document data sources and processing steps
- Use compression for large files to save space
- Create backups of important data
Common Functions:
- Reading:
read.csv()
,readLines()
,read.table()
- Writing:
write.csv()
,writeLines()
,save()
,saveRDS()
- File operations:
file.exists()
,list.files()
,file.copy()
,file.remove()
- Directory operations:
getwd()
,setwd()
,dir.create()
,file.path()
Advanced Features:
- Chunk processing for large files
- Compression support with gzip connections
- Encoding handling for international text
- Metadata preservation for reproducible analyses
Proper file handling skills will serve you well throughout your data analysis career. They ensure your work is reproducible, shareable, and maintainable!
This completes our exploration of R fundamentals. You now have the foundation to tackle real-world data analysis projects!