Data Analysis using R and DPLYR
Enrico
2024-05-14
Load packages
library(ggplot2)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
Load the dataset and get the column names
# Load the msleep dataset into the workspace and print it.
data("msleep")
print(msleep)
## # A tibble: 83 × 11
## name genus vore order conservation sleep_total sleep_rem sleep_cycle awake
## <chr> <chr> <chr> <chr> <chr> <dbl> <dbl> <dbl> <dbl>
## 1 Cheet… Acin… carni Carn… lc 12.1 NA NA 11.9
## 2 Owl m… Aotus omni Prim… <NA> 17 1.8 NA 7
## 3 Mount… Aplo… herbi Rode… nt 14.4 2.4 NA 9.6
## 4 Great… Blar… omni Sori… lc 14.9 2.3 0.133 9.1
## 5 Cow Bos herbi Arti… domesticated 4 0.7 0.667 20
## 6 Three… Brad… herbi Pilo… <NA> 14.4 2.2 0.767 9.6
## 7 North… Call… carni Carn… vu 8.7 1.4 0.383 15.3
## 8 Vespe… Calo… <NA> Rode… <NA> 7 NA NA 17
## 9 Dog Canis carni Carn… domesticated 10.1 2.9 0.333 13.9
## 10 Roe d… Capr… herbi Arti… lc 3 NA NA 21
## # ℹ 73 more rows
## # ℹ 2 more variables: brainwt <dbl>, bodywt <dbl>
# List the column names of msleep.
colnames(msleep)
## [1] "name" "genus" "vore" "order" "conservation"
## [6] "sleep_total" "sleep_rem" "sleep_cycle" "awake" "brainwt"
## [11] "bodywt"
# Open the help page of the dataset to see what each column means.
# (`?name` looks up the unrelated base R topic on names and symbols.)
?msleep
Select the columns I need to work with, filter the rows, and add a column containing the mean of sleep_total (ignoring NA values)
# Keep columns by position (1 = name, 6 = sleep_total, 7 = sleep_rem,
# 10 = brainwt, 11 = bodywt), keep the animals that sleep less than 5 hours,
# then add the mean of sleep_total over the remaining rows as column `avg`.
msleep %>%
  select(c(1, 6, 7, 10, 11)) %>%
  filter(sleep_total < 5) %>%
  # Spell out TRUE: `T` is an ordinary variable that can be reassigned.
  mutate(avg = mean(sleep_total, na.rm = TRUE))
## # A tibble: 11 × 6
## name sleep_total sleep_rem brainwt bodywt avg
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 Cow 4 0.7 0.423 600 3.32
## 2 Roe deer 3 NA 0.0982 14.8 3.32
## 3 Asian elephant 3.9 NA 4.60 2547 3.32
## 4 Horse 2.9 0.6 0.655 521 3.32
## 5 Donkey 3.1 0.4 0.419 187 3.32
## 6 Giraffe 1.9 0.4 NA 900. 3.32
## 7 Pilot whale 2.7 0.1 NA 800 3.32
## 8 African elephant 3.3 NA 5.71 6654 3.32
## 9 Sheep 3.8 0.6 0.175 55.5 3.32
## 10 Caspian seal 3.5 0.4 NA 86 3.32
## 11 Brazilian tapir 4.4 1 0.169 208. 3.32
The same as above, but selecting the columns by name and using summarize() to compute the average of sleep_total
# Pick the columns by name, keep the animals that sleep less than 5 hours,
# and collapse the result to one row holding the mean of sleep_total.
msleep %>%
  select(name, sleep_total, sleep_rem, bodywt, brainwt) %>%
  filter(sleep_total < 5) %>%
  summarise(
    avg_sleep_total = mean(sleep_total, na.rm = TRUE)
  )
## # A tibble: 1 × 1
## avg_sleep_total
## <dbl>
## 1 3.32
Calculate each animal's difference from the mean total sleep and sort the results
# Difference between each animal's total sleep and the mean over all animals,
# sorted in ascending order of that difference.
msleep %>%
  mutate(
    sleep_over_avg = sleep_total - mean(sleep_total)
  ) %>%
  select(name, sleep_total, sleep_over_avg) %>%
  # For descending order use: arrange(desc(sleep_over_avg))
  arrange(sleep_over_avg)
## # A tibble: 83 × 3
## name sleep_total sleep_over_avg
## <chr> <dbl> <dbl>
## 1 Giraffe 1.9 -8.53
## 2 Pilot whale 2.7 -7.73
## 3 Horse 2.9 -7.53
## 4 Roe deer 3 -7.43
## 5 Donkey 3.1 -7.33
## 6 African elephant 3.3 -7.13
## 7 Caspian seal 3.5 -6.93
## 8 Sheep 3.8 -6.63
## 9 Asian elephant 3.9 -6.53
## 10 Cow 4 -6.43
## # ℹ 73 more rows
It calculates the row-wise mean of sleep_total and sleep_rem, adding a new column avg_test. rowwise ensures the calculations are applied to each row independently
# Row-wise mean of sleep_total and sleep_rem, stored in avg_test.
# na.rm = TRUE must be an argument of mean(), not of c(): inside c() it is
# coerced to the number 1 and averaged in as a third value, and NAs are
# not removed at all.
msleep %>%
  rowwise() %>%
  mutate(avg_test = mean(c(sleep_total, sleep_rem), na.rm = TRUE)) %>%
  select(sleep_total, sleep_rem, avg_test)
## # A tibble: 83 × 3
## # Rowwise:
## sleep_total sleep_rem avg_test
## <dbl> <dbl> <dbl>
## 1 12.1 NA 12.1
## 2 17 1.8 9.4
## 3 14.4 2.4 8.4
## 4 14.9 2.3 8.6
## 5 4 0.7 2.35
## 6 14.4 2.2 8.3
## 7 8.7 1.4 5.05
## 8 7 NA 7
## 9 10.1 2.9 6.5
## 10 3 NA 3
## # ℹ 73 more rows
Conditionally adds the string ‘YEEHA’ to rows where the brain weight is less than 0.05, and assigns NA to all other rows.
This code creates a new column, brain_wt_under_1, containing either ‘YEEHA’ or NA depending on the brain weight.
# Flag animals whose brainwt is below 0.05 with "YEEHA"; every other row
# (including rows where brainwt is NA) gets NA.
# NOTE(review): the column is called brain_wt_under_1 but the threshold is
# 0.05 -- confirm which one is intended.
msleep %>%
  mutate(
    brain_wt_under_1 = if_else(brainwt < 0.05, "YEEHA", NA_character_)
  )
## # A tibble: 83 × 12
## name genus vore order conservation sleep_total sleep_rem sleep_cycle awake
## <chr> <chr> <chr> <chr> <chr> <dbl> <dbl> <dbl> <dbl>
## 1 Cheet… Acin… carni Carn… lc 12.1 NA NA 11.9
## 2 Owl m… Aotus omni Prim… <NA> 17 1.8 NA 7
## 3 Mount… Aplo… herbi Rode… nt 14.4 2.4 NA 9.6
## 4 Great… Blar… omni Sori… lc 14.9 2.3 0.133 9.1
## 5 Cow Bos herbi Arti… domesticated 4 0.7 0.667 20
## 6 Three… Brad… herbi Pilo… <NA> 14.4 2.2 0.767 9.6
## 7 North… Call… carni Carn… vu 8.7 1.4 0.383 15.3
## 8 Vespe… Calo… <NA> Rode… <NA> 7 NA NA 17
## 9 Dog Canis carni Carn… domesticated 10.1 2.9 0.333 13.9
## 10 Roe d… Capr… herbi Arti… lc 3 NA NA 21
## # ℹ 73 more rows
## # ℹ 3 more variables: brainwt <dbl>, bodywt <dbl>, brain_wt_under_1 <chr>
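Applying tolower only to character columns:
If you only want to convert text columns to lowercase, apply tolower() conditionally on is.character, for example with mutate_if(). Note that is.character does not match factor columns, so factor and numeric columns are left unchanged.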
# Convert every character column to lowercase; other columns are untouched.
# across(where(...)) is the current dplyr replacement for the superseded
# mutate_if() and produces the same result here.
msleep %>%
  mutate(across(where(is.character), tolower))
## # A tibble: 83 × 11
## name genus vore order conservation sleep_total sleep_rem sleep_cycle awake
## <chr> <chr> <chr> <chr> <chr> <dbl> <dbl> <dbl> <dbl>
## 1 cheet… acin… carni carn… lc 12.1 NA NA 11.9
## 2 owl m… aotus omni prim… <NA> 17 1.8 NA 7
## 3 mount… aplo… herbi rode… nt 14.4 2.4 NA 9.6
## 4 great… blar… omni sori… lc 14.9 2.3 0.133 9.1
## 5 cow bos herbi arti… domesticated 4 0.7 0.667 20
## 6 three… brad… herbi pilo… <NA> 14.4 2.2 0.767 9.6
## 7 north… call… carni carn… vu 8.7 1.4 0.383 15.3
## 8 vespe… calo… <NA> rode… <NA> 7 NA NA 17
## 9 dog canis carni carn… domesticated 10.1 2.9 0.333 13.9
## 10 roe d… capr… herbi arti… lc 3 NA NA 21
## # ℹ 73 more rows
## # ℹ 2 more variables: brainwt <dbl>, bodywt <dbl>
# Load necessary libraries
library(ggplot2)
library(dplyr)
# Hand-written example data: ten animals and their total hours of sleep.
# The two vectors are parallel: the i-th sleep value belongs to the i-th name.
msleep_data <- data.frame(
  name = c(
    "Cheetah", "Elephant", "Horse", "Rabbit", "Tiger",
    "Kangaroo", "Bat", "Lion", "Giraffe", "Monkey"
  ),
  sleep_total = c(
    12.1, 8.2, 16.0, 5.0, 10.3,
    14.9, 19.9, 13.5, 4.6, 9.0
  )
)
# Bar chart with one bar per animal; bar height is the sleep_total value.
# geom_col() draws the values as given, like geom_bar(stat = "identity").
ggplot(msleep_data, aes(x = name, y = sleep_total)) +
  geom_col(fill = "skyblue", color = "black") +
  labs(
    title = "Total Sleep by Animal (10 Animals)",
    x = "Animal",
    y = "Total Sleep (hours)"
  ) +
  # Tilt the animal names so they do not overlap.
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
# Brain weight against body weight, one point per animal, colored by vore.
ggplot(msleep, aes(x = bodywt, y = brainwt, color = vore)) +
  geom_point(size = 3) +
  # Both axes use a log10 scale because the weights vary drastically.
  scale_x_log10() +
  scale_y_log10() +
  labs(
    title = "Brain Weight vs Body Weight (Colored by Diet Type)",
    x = "Body Weight (kg)",
    y = "Brain Weight (kg)"
  ) +
  theme_minimal()
## Warning: Removed 27 rows containing missing values or values outside the scale range
## (`geom_point()`).
More body weight, more brain weight: if the points in the scatter plot show a general upward trend, it suggests that larger animals tend to have larger brains.
Exceptions: look for outliers (e.g., small animals with large brains or large animals with small brains), which could indicate unique evolutionary traits or adaptations. However, this plot alone does not prove a strict biological rule.
Average sleep for different types of diets (vore)
# One row per vore value with the mean of sleep_total for that group.
# NA values of sleep_total are ignored when averaging.
avg_sleep_by_diet <- msleep %>%
  group_by(vore) %>%
  summarise(
    avg_sleep = mean(sleep_total, na.rm = TRUE)
  )
# Bar chart of avg_sleep per vore group, filled by group with black outlines.
# geom_col() draws the values as given, like geom_bar(stat = "identity").
ggplot(avg_sleep_by_diet, aes(x = vore, y = avg_sleep, fill = vore)) +
  geom_col(color = "black") +
  labs(
    title = "Average Total Sleep by Diet Type",
    x = "Diet Type (Vore)",
    y = "Average Total Sleep (hours)"
  ) +
  theme_minimal()
Scatter Plot Code for Total Sleep vs REM Sleep
# REM sleep against total sleep: blue points plus a red linear fit.
ggplot(msleep, aes(x = sleep_total, y = sleep_rem)) +
  geom_point(color = "blue", size = 3) +
  # Straight-line fit (method = "lm"), drawn without the confidence band.
  geom_smooth(method = "lm", se = FALSE, color = "red") +
  labs(
    title = "Total Sleep vs REM Sleep",
    x = "Total Sleep (hours)",
    y = "REM Sleep (hours)"
  ) +
  theme_minimal()
## `geom_smooth()` using formula = 'y ~ x'
## Warning: Removed 22 rows containing non-finite outside the scale range
## (`stat_smooth()`).
## Warning: Removed 22 rows containing missing values or values outside the scale range
## (`geom_point()`).