Data Analysis using R, DPLYR, and GGPLOT2

Data Analysis using R and DPLYR

Load package

library(ggplot2)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

Load the dataset and get the column names

# Load the msleep dataset into the workspace, then display it
data("msleep")
print(msleep)
## # A tibble: 83 × 11
##    name   genus vore  order conservation sleep_total sleep_rem sleep_cycle awake
##    <chr>  <chr> <chr> <chr> <chr>              <dbl>     <dbl>       <dbl> <dbl>
##  1 Cheet… Acin… carni Carn… lc                  12.1      NA        NA      11.9
##  2 Owl m… Aotus omni  Prim… <NA>                17         1.8      NA       7  
##  3 Mount… Aplo… herbi Rode… nt                  14.4       2.4      NA       9.6
##  4 Great… Blar… omni  Sori… lc                  14.9       2.3       0.133   9.1
##  5 Cow    Bos   herbi Arti… domesticated         4         0.7       0.667  20  
##  6 Three… Brad… herbi Pilo… <NA>                14.4       2.2       0.767   9.6
##  7 North… Call… carni Carn… vu                   8.7       1.4       0.383  15.3
##  8 Vespe… Calo… <NA>  Rode… <NA>                 7        NA        NA      17  
##  9 Dog    Canis carni Carn… domesticated        10.1       2.9       0.333  13.9
## 10 Roe d… Capr… herbi Arti… lc                   3        NA        NA      21  
## # ℹ 73 more rows
## # ℹ 2 more variables: brainwt <dbl>, bodywt <dbl>
# List the column names of the dataset
colnames(msleep)
##  [1] "name"         "genus"        "vore"         "order"        "conservation"
##  [6] "sleep_total"  "sleep_rem"    "sleep_cycle"  "awake"        "brainwt"     
## [11] "bodywt"
# Open the help page of the dataset (describes each column)
?msleep

Select the columns I need to work with (by position), keep the rows where total sleep is under 5 hours, and add a column holding the mean of sleep_total (ignoring NA values)

# Pick columns by position (1 = name, 6 = sleep_total, 7 = sleep_rem,
# 10 = brainwt, 11 = bodywt), keep animals that sleep less than 5 hours,
# then add `avg`: the mean sleep_total of the remaining rows, repeated on
# every row.
msleep %>%
  select(c(1, 6, 7, 10, 11)) %>%
  filter(sleep_total < 5) %>%
  mutate(avg = mean(sleep_total, na.rm = TRUE))
## # A tibble: 11 × 6
##    name             sleep_total sleep_rem brainwt bodywt   avg
##    <chr>                  <dbl>     <dbl>   <dbl>  <dbl> <dbl>
##  1 Cow                      4         0.7  0.423   600    3.32
##  2 Roe deer                 3        NA    0.0982   14.8  3.32
##  3 Asian elephant           3.9      NA    4.60   2547    3.32
##  4 Horse                    2.9       0.6  0.655   521    3.32
##  5 Donkey                   3.1       0.4  0.419   187    3.32
##  6 Giraffe                  1.9       0.4 NA       900.   3.32
##  7 Pilot whale              2.7       0.1 NA       800    3.32
##  8 African elephant         3.3      NA    5.71   6654    3.32
##  9 Sheep                    3.8       0.6  0.175    55.5  3.32
## 10 Caspian seal             3.5       0.4 NA        86    3.32
## 11 Brazilian tapir          4.4       1    0.169   208.   3.32

The same as above, but selecting the columns by name and using summarize() to compute the average total sleep

# Keep the animals that sleep fewer than 5 hours and collapse them into a
# single row holding their mean total sleep.
msleep %>%
  filter(sleep_total < 5) %>%
  select(name, sleep_total, sleep_rem, bodywt, brainwt) %>%
  summarise(avg_sleep_total = mean(sleep_total, na.rm = TRUE))
## # A tibble: 1 × 1
##   avg_sleep_total
##             <dbl>
## 1            3.32

Calculate each animal's difference from the average total sleep and sort the results

# Difference between each animal's total sleep and the overall mean, sorted
# from the shortest to the longest sleepers.
# For the reverse order use arrange(desc(sleep_over_avg)).
msleep %>%
  mutate(sleep_over_avg = sleep_total - mean(sleep_total, na.rm = TRUE)) %>%
  select(name, sleep_total, sleep_over_avg) %>%
  arrange(sleep_over_avg)
## # A tibble: 83 × 3
##    name             sleep_total sleep_over_avg
##    <chr>                  <dbl>          <dbl>
##  1 Giraffe                  1.9          -8.53
##  2 Pilot whale              2.7          -7.73
##  3 Horse                    2.9          -7.53
##  4 Roe deer                 3            -7.43
##  5 Donkey                   3.1          -7.33
##  6 African elephant         3.3          -7.13
##  7 Caspian seal             3.5          -6.93
##  8 Sheep                    3.8          -6.63
##  9 Asian elephant           3.9          -6.53
## 10 Cow                      4            -6.43
## # ℹ 73 more rows

It calculates the row-wise mean of sleep_total and sleep_rem, adding a new column avg_test. rowwise ensures the calculations are applied to each row independently

# Per-row mean of sleep_total and sleep_rem.
# na.rm must be an argument of mean(), not of c(): inside c() the TRUE is
# coerced to 1 and averaged in as a third value, and NAs are never dropped.
msleep %>%
  rowwise() %>%
  mutate(avg_test = mean(c(sleep_total, sleep_rem), na.rm = TRUE)) %>%
  select(sleep_total, sleep_rem, avg_test)
## # A tibble: 83 × 3
## # Rowwise: 
##    sleep_total sleep_rem avg_test
##          <dbl>     <dbl>    <dbl>
##  1        12.1      NA      12.1 
##  2        17         1.8     9.4 
##  3        14.4       2.4     8.4 
##  4        14.9       2.3     8.6 
##  5         4         0.7     2.35
##  6        14.4       2.2     8.3 
##  7         8.7       1.4     5.05
##  8         7        NA       7   
##  9        10.1       2.9     6.5 
## 10         3        NA       3   
## # ℹ 73 more rows

Conditionally adds the string ‘YEEHA’ to rows where the brain weight is less than 0.05, and assigns NA for other rows.

This code creates a new column, brain_wt_under_1, containing either ‘YEEHA’ or NA depending on the brain weight.

# Flag animals whose brain weight is below 0.05 with "YEEHA"; every other row
# (including rows with a missing brainwt) gets NA.
# NOTE(review): the column is named brain_wt_under_1 but the cutoff is 0.05 —
# confirm which one is intended.
msleep %>%
  mutate(
    brain_wt_under_1 = if_else(brainwt < 0.05, "YEEHA", NA_character_)
  )
## # A tibble: 83 × 12
##    name   genus vore  order conservation sleep_total sleep_rem sleep_cycle awake
##    <chr>  <chr> <chr> <chr> <chr>              <dbl>     <dbl>       <dbl> <dbl>
##  1 Cheet… Acin… carni Carn… lc                  12.1      NA        NA      11.9
##  2 Owl m… Aotus omni  Prim… <NA>                17         1.8      NA       7  
##  3 Mount… Aplo… herbi Rode… nt                  14.4       2.4      NA       9.6
##  4 Great… Blar… omni  Sori… lc                  14.9       2.3       0.133   9.1
##  5 Cow    Bos   herbi Arti… domesticated         4         0.7       0.667  20  
##  6 Three… Brad… herbi Pilo… <NA>                14.4       2.2       0.767   9.6
##  7 North… Call… carni Carn… vu                   8.7       1.4       0.383  15.3
##  8 Vespe… Calo… <NA>  Rode… <NA>                 7        NA        NA      17  
##  9 Dog    Canis carni Carn… domesticated        10.1       2.9       0.333  13.9
## 10 Roe d… Capr… herbi Arti… lc                   3        NA        NA      21  
## # ℹ 73 more rows
## # ℹ 3 more variables: brainwt <dbl>, bodywt <dbl>, brain_wt_under_1 <chr>

Applying tolower only to character columns:

To convert only text columns to lowercase, apply tolower() to just the columns for which is.character() is TRUE, for example with mutate_if(). Note that is.character() does not match factor columns.

# Lower-case every character column and leave the other columns untouched.
# across(where(...)) is the current replacement for the superseded mutate_if().
msleep %>%
  mutate(across(where(is.character), tolower))
## # A tibble: 83 × 11
##    name   genus vore  order conservation sleep_total sleep_rem sleep_cycle awake
##    <chr>  <chr> <chr> <chr> <chr>              <dbl>     <dbl>       <dbl> <dbl>
##  1 cheet… acin… carni carn… lc                  12.1      NA        NA      11.9
##  2 owl m… aotus omni  prim… <NA>                17         1.8      NA       7  
##  3 mount… aplo… herbi rode… nt                  14.4       2.4      NA       9.6
##  4 great… blar… omni  sori… lc                  14.9       2.3       0.133   9.1
##  5 cow    bos   herbi arti… domesticated         4         0.7       0.667  20  
##  6 three… brad… herbi pilo… <NA>                14.4       2.2       0.767   9.6
##  7 north… call… carni carn… vu                   8.7       1.4       0.383  15.3
##  8 vespe… calo… <NA>  rode… <NA>                 7        NA        NA      17  
##  9 dog    canis carni carn… domesticated        10.1       2.9       0.333  13.9
## 10 roe d… capr… herbi arti… lc                   3        NA        NA      21  
## # ℹ 73 more rows
## # ℹ 2 more variables: brainwt <dbl>, bodywt <dbl>
# Load necessary libraries
library(ggplot2)
library(dplyr)

# Simulated total sleep hours for 10 animals, stored as one row per animal.
# Built from a named vector inside local() so no helper variable is left
# behind in the workspace.
msleep_data <- local({
  sleep_hours <- c(
    Cheetah  = 12.1,
    Elephant = 8.2,
    Horse    = 16.0,
    Rabbit   = 5.0,
    Tiger    = 10.3,
    Kangaroo = 14.9,
    Bat      = 19.9,
    Lion     = 13.5,
    Giraffe  = 4.6,
    Monkey   = 9.0
  )
  # unname() keeps the default 1..n row names instead of the animal names
  data.frame(name = names(sleep_hours), sleep_total = unname(sleep_hours))
})

# Bar chart: one bar per animal, bar height = total hours of sleep.
# The x-axis labels are rotated 45 degrees.
ggplot(msleep_data, aes(x = name, y = sleep_total)) +
  geom_col(fill = "skyblue", color = "black") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  labs(
    title = "Total Sleep by Animal (10 Animals)",
    x = "Animal",
    y = "Total Sleep (hours)"
  )

# Brain weight against body weight, one colour per diet type (vore).
# Both axes are log10-scaled because the weights vary drastically.
ggplot(msleep) +
  geom_point(aes(x = bodywt, y = brainwt, color = vore), size = 3) +
  scale_x_log10() +
  scale_y_log10() +
  labs(
    title = "Brain Weight vs Body Weight (Colored by Diet Type)",
    x = "Body Weight (kg)",
    y = "Brain Weight (kg)"
  ) +
  theme_minimal()
## Warning: Removed 27 rows containing missing values or values outside the scale range
## (`geom_point()`).

More body weight, more brain weight: if the points in the scatter plot show a generally upward trend, it suggests that larger animals tend to have larger brains.

Exceptions: look for outliers (e.g., small animals with large brains or large animals with small brains), which could indicate unique evolutionary traits or adaptations. However, this plot alone does not prove a strict biological rule.

Average sleep for different diet types (vore)

# Mean total sleep per diet type (vore); missing sleep values are ignored
avg_sleep_by_diet <- msleep %>%
  group_by(vore) %>%
  summarise(avg_sleep = mean(sleep_total, na.rm = TRUE))

# One black-outlined bar per diet type, filled by diet type
ggplot(avg_sleep_by_diet, aes(x = vore, y = avg_sleep, fill = vore)) +
  geom_col(color = "black") +
  theme_minimal() +
  labs(
    title = "Average Total Sleep by Diet Type",
    x = "Diet Type (Vore)",
    y = "Average Total Sleep (hours)"
  )

Scatter Plot Code for Total Sleep vs REM Sleep

# Total sleep against REM sleep: blue points with a red straight-line fit
ggplot(msleep, aes(x = sleep_total, y = sleep_rem)) +
  geom_point(color = "blue", size = 3) +
  # Linear-model fit drawn without its confidence band
  geom_smooth(method = "lm", se = FALSE, color = "red") +
  theme_minimal() +
  labs(
    title = "Total Sleep vs REM Sleep",
    x = "Total Sleep (hours)",
    y = "REM Sleep (hours)"
  )
## `geom_smooth()` using formula = 'y ~ x'
## Warning: Removed 22 rows containing non-finite outside the scale range
## (`stat_smooth()`).
## Warning: Removed 22 rows containing missing values or values outside the scale range
## (`geom_point()`).