Cleaning Starwars

Cleaning_Starwars
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.1     ✔ tibble    3.2.1
## ✔ lubridate 1.9.3     ✔ tidyr     1.3.1
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(dplyr)
1 Check the data set (or use head(starwars) or str(starwars))
# Open the starwars data set in R's data viewer (intended for interactive use).
View(starwars)

2 Check the variable types, the number of columns and the number of rows

glimpse(starwars)
## Rows: 87
## Columns: 14
## $ name       <chr> "Luke Skywalker", "C-3PO", "R2-D2", "Darth Vader", "Leia Or…
## $ height     <int> 172, 167, 96, 202, 150, 178, 165, 97, 183, 182, 188, 180, 2…
## $ mass       <dbl> 77.0, 75.0, 32.0, 136.0, 49.0, 120.0, 75.0, 32.0, 84.0, 77.…
## $ hair_color <chr> "blond", NA, NA, "none", "brown", "brown, grey", "brown", N…
## $ skin_color <chr> "fair", "gold", "white, blue", "white", "light", "light", "…
## $ eye_color  <chr> "blue", "yellow", "red", "yellow", "brown", "blue", "blue",…
## $ birth_year <dbl> 19.0, 112.0, 33.0, 41.9, 19.0, 52.0, 47.0, NA, 24.0, 57.0, …
## $ sex        <chr> "male", "none", "none", "male", "female", "male", "female",…
## $ gender     <chr> "masculine", "masculine", "masculine", "masculine", "femini…
## $ homeworld  <chr> "Tatooine", "Tatooine", "Naboo", "Tatooine", "Alderaan", "T…
## $ species    <chr> "Human", "Droid", "Droid", "Human", "Human", "Human", "Huma…
## $ films      <list> <"A New Hope", "The Empire Strikes Back", "Return of the J…
## $ vehicles   <list> <"Snowspeeder", "Imperial Speeder Bike">, <>, <>, <>, "Imp…
## $ starships  <list> <"X-wing", "Imperial shuttle">, <>, <>, "TIE Advanced x1",…

Types of variables

`<chr>` = characters; `<int>` = integer = whole numbers; `<dbl>` = double = numbers with decimals; `<fct>` = factors, e.g. size: small, smaller, smallest, or small, medium and large = ordered categories

After checking the variables we decided to change ‘gender’, which is currently a character column, into a factor

Check again which class the column gender currently has

class(starwars$gender)
## [1] "character"

Now check the unique values in the column gender

unique(starwars$gender)
## [1] "masculine" "feminine"  NA

Now change the column from character to factor and check that it has changed

##### If you instead assign to a new name, e.g. `starwars$gender1 <- as_factor(starwars$gender)`, you will get a new column called gender1

# Convert the gender column to a factor in place, then confirm its class
starwars <- starwars %>%
  mutate(gender = as_factor(gender))
class(starwars[["gender"]])
## [1] "factor"

Check the levels of the column and count the NAs in the column

levels(starwars$gender)
## [1] "masculine" "feminine"
sum(is.na(starwars$gender))
## [1] 4

Set the order of the factor levels, in this case masculine and feminine

# Spell out the level order explicitly: masculine first, then feminine
starwars$gender <- factor(starwars$gender, levels = c("masculine", "feminine"))
Check the levels after the change
levels(starwars$gender)
## [1] "masculine" "feminine"
Select variables
# Names of the columns kept by the selection: name, height and every column ending in "color"
names(select(starwars, name, height, ends_with("color")))
## [1] "name"       "height"     "hair_color" "skin_color" "eye_color"
  # or 
# Print the selected columns themselves instead of only their names
select(starwars, name, height, ends_with("color"))
## # A tibble: 87 × 5
##    name               height hair_color    skin_color  eye_color
##    <chr>               <int> <chr>         <chr>       <chr>    
##  1 Luke Skywalker        172 blond         fair        blue     
##  2 C-3PO                 167 <NA>          gold        yellow   
##  3 R2-D2                  96 <NA>          white, blue red      
##  4 Darth Vader           202 none          white       yellow   
##  5 Leia Organa           150 brown         light       brown    
##  6 Owen Lars             178 brown, grey   light       blue     
##  7 Beru Whitesun Lars    165 brown         light       blue     
##  8 R5-D4                  97 <NA>          white, red  red      
##  9 Biggs Darklighter     183 black         light       brown    
## 10 Obi-Wan Kenobi        182 auburn, white fair        blue-gray
## # ℹ 77 more rows

Look for the unique values in the column hair_color

unique(starwars$hair_color)
##  [1] "blond"         NA              "none"          "brown"        
##  [5] "brown, grey"   "black"         "auburn, white" "auburn, grey" 
##  [9] "white"         "grey"          "auburn"        "blonde"
Use select and filter on the hair_color column. `%in%` means either blond or brown: the row has to contain one of the two values in order to be TRUE, and the height has to be less than 180
# Keep characters whose hair_color is exactly "blond" or "brown" and whose height is below 180.
# Conditions separated by commas inside filter() must all be TRUE for a row to be kept.
starwars %>%
  select(name, height, ends_with("color")) %>%
  filter(
    hair_color %in% c("blond", "brown"),
    height < 180
  )
## # A tibble: 8 × 5
##   name                  height hair_color skin_color eye_color
##   <chr>                  <int> <chr>      <chr>      <chr>    
## 1 Luke Skywalker           172 blond      fair       blue     
## 2 Leia Organa              150 brown      light      brown    
## 3 Beru Whitesun Lars       165 brown      light      blue     
## 4 Wedge Antilles           170 brown      fair       hazel    
## 5 Wicket Systri Warrick     88 brown      brown      brown    
## 6 Finis Valorum            170 blond      fair       blue     
## 7 Cordé                    157 brown      light      brown    
## 8 Dormé                    165 brown      light      brown

Missing data (NA). First option: we can remove all the missing data in order to calculate the mean. NOT RECOMMENDED IF YOU DON'T UNDERSTAND WHY THE VALUES ARE NA

mean(starwars$height,na.rm = TRUE)
## [1] 174.6049
# Alternatively, drop every row that has an NA in any of the selected columns
starwars %>%
  select(name, gender, hair_color, height) %>%
  na.omit()
## # A tibble: 72 × 4
##    name               gender    hair_color    height
##    <chr>              <fct>     <chr>          <int>
##  1 Luke Skywalker     masculine blond            172
##  2 Darth Vader        masculine none             202
##  3 Leia Organa        feminine  brown            150
##  4 Owen Lars          masculine brown, grey      178
##  5 Beru Whitesun Lars feminine  brown            165
##  6 Biggs Darklighter  masculine black            183
##  7 Obi-Wan Kenobi     masculine auburn, white    182
##  8 Anakin Skywalker   masculine blond            188
##  9 Wilhuff Tarkin     masculine auburn, grey     180
## 10 Chewbacca          masculine brown            228
## # ℹ 62 more rows
Second option: we can try to really understand where the missing data (NA) is.
filter(!complete.cases(.)): complete.cases = rows without any NA; the dot refers to the dataset; with the exclamation mark, !complete.cases means the opposite, i.e. give me all the rows with missing values
# Show only the rows that have at least one NA among the selected columns
select(starwars, name, height, gender, hair_color) %>%
  filter(!complete.cases(.))
## # A tibble: 15 × 4
##    name                  height gender    hair_color
##    <chr>                  <int> <fct>     <chr>     
##  1 C-3PO                    167 masculine <NA>      
##  2 R2-D2                     96 masculine <NA>      
##  3 R5-D4                     97 masculine <NA>      
##  4 Greedo                   173 masculine <NA>      
##  5 Jabba Desilijic Tiure    175 masculine <NA>      
##  6 Jek Tono Porkins         180 <NA>      brown     
##  7 Arvel Crynyd              NA masculine brown     
##  8 Gregar Typho             185 <NA>      black     
##  9 Cordé                    157 <NA>      brown     
## 10 Sly Moore                178 <NA>      none      
## 11 Finn                      NA masculine black     
## 12 Rey                       NA feminine  brown     
## 13 Poe Dameron               NA masculine brown     
## 14 BB8                       NA masculine none      
## 15 Captain Phasma            NA feminine  none
After examining the NAs in the dataset we want to drop the NAs for height but not for hair color, because some characters in the films have no hair
# Drop the rows where height is NA.
# Note: the pipeline still keeps only the incomplete rows (filter(!complete.cases(.))),
# so the result lists rows with a missing gender or hair_color but a known height.
starwars %>% 
  select(name, height,gender,hair_color) %>% 
  filter(!complete.cases(.)) %>% 
  drop_na(height)
## # A tibble: 9 × 4
##   name                  height gender    hair_color
##   <chr>                  <int> <fct>     <chr>     
## 1 C-3PO                    167 masculine <NA>      
## 2 R2-D2                     96 masculine <NA>      
## 3 R5-D4                     97 masculine <NA>      
## 4 Greedo                   173 masculine <NA>      
## 5 Jabba Desilijic Tiure    175 masculine <NA>      
## 6 Jek Tono Porkins         180 <NA>      brown     
## 7 Gregar Typho             185 <NA>      black     
## 8 Cordé                    157 <NA>      brown     
## 9 Sly Moore                178 <NA>      none

Transform the missing hair_color of some characters into 'none', because they do not have hair

# Replace a missing hair_color with "none"; all other columns are left untouched
starwars %>%
  select(name, height, gender, hair_color) %>%
  replace_na(list(hair_color = "none"))
## # A tibble: 87 × 4
##    name               height gender    hair_color   
##    <chr>               <int> <fct>     <chr>        
##  1 Luke Skywalker        172 masculine blond        
##  2 C-3PO                 167 masculine none         
##  3 R2-D2                  96 masculine none         
##  4 Darth Vader           202 masculine none         
##  5 Leia Organa           150 feminine  brown        
##  6 Owen Lars             178 masculine brown, grey  
##  7 Beru Whitesun Lars    165 feminine  brown        
##  8 R5-D4                  97 masculine none         
##  9 Biggs Darklighter     183 masculine black        
## 10 Obi-Wan Kenobi        182 masculine auburn, white
## # ℹ 77 more rows

Duplicates

# Toy data with one repeated row (Peter, 22) to demonstrate duplicate handling
names <- c("Peter", "John", "Andrew", "Peter")
age <- c(22, 33, 44, 22)

friends <- data.frame(names = names, age = age)
# duplicated() is base R: it flags each row that repeats an earlier row
duplicated(friends)
## [1] FALSE FALSE FALSE  TRUE
glimpse(friends)
## Rows: 4
## Columns: 2
## $ names <chr> "Peter", "John", "Andrew", "Peter"
## $ age   <dbl> 22, 33, 44, 22
# Base R way: keep only the rows that duplicated() does not flag

friends[!duplicated(friends), ]
##    names age
## 1  Peter  22
## 2   John  33
## 3 Andrew  44

#### Same with the tidyverse

friends %>% distinct()
##    names age
## 1  Peter  22
## 2   John  33
## 3 Andrew  44

Recoding variables

starwars %>% select(name,gender)
## # A tibble: 87 × 2
##    name               gender   
##    <chr>              <fct>    
##  1 Luke Skywalker     masculine
##  2 C-3PO              masculine
##  3 R2-D2              masculine
##  4 Darth Vader        masculine
##  5 Leia Organa        feminine 
##  6 Owen Lars          masculine
##  7 Beru Whitesun Lars feminine 
##  8 R5-D4              masculine
##  9 Biggs Darklighter  masculine
## 10 Obi-Wan Kenobi     masculine
## # ℹ 77 more rows
# Add a numeric code for gender: masculine -> 1, feminine -> 2
starwars %>%
  select(name, gender) %>%
  mutate(
    gender_value = recode(gender, masculine = 1, feminine = 2)
  )
## # A tibble: 87 × 3
##    name               gender    gender_value
##    <chr>              <fct>            <dbl>
##  1 Luke Skywalker     masculine            1
##  2 C-3PO              masculine            1
##  3 R2-D2              masculine            1
##  4 Darth Vader        masculine            1
##  5 Leia Organa        feminine             2
##  6 Owen Lars          masculine            1
##  7 Beru Whitesun Lars feminine             2
##  8 R5-D4              masculine            1
##  9 Biggs Darklighter  masculine            1
## 10 Obi-Wan Kenobi     masculine            1
## # ℹ 77 more rows