7.6 A few more useful dplyr functions

7.6.1 Mutating joins

The following functions allows one to join / merge 2 tibbles into 1 using columns that contain common keys.

Table 7.4: mutating joins functions
name what.it.does
inner_join includes all rows in x and y (intersection)
left_join includes all rows in x
right_join includes all rows in y
full_join includes all rows in x or y (union)

Let’s create 2 small tibbles:

tibX <- tibble(ID=LETTERS[1:4],
               year=c("2020", "2021", "2021", "2020")
)

tibY <- tibble(ID=LETTERS[3:5],
               month=c("January", "October", "July")
               )

We will join tibX and tibY using the ID column, and keep only rows that contain a matching ID with inner_join:

inner_join(x=tibX,
           y=tibY,
           by="ID")
## # A tibble: 2 x 3
##   ID    year  month  
##   <chr> <chr> <chr>  
## 1 C     2021  January
## 2 D     2020  October

Keep all rows from tibX regardless on whether they have a match in tibY with left_join:

left_join(x=tibX,
           y=tibY,
           by="ID")
## # A tibble: 4 x 3
##   ID    year  month  
##   <chr> <chr> <chr>  
## 1 A     2020  <NA>   
## 2 B     2021  <NA>   
## 3 C     2021  January
## 4 D     2020  October

Keep all rows from tibY regardless on whether they have a match in tibX with right_join:

right_join(x=tibX,
           y=tibY,
           by="ID")
## # A tibble: 3 x 3
##   ID    year  month  
##   <chr> <chr> <chr>  
## 1 C     2021  January
## 2 D     2020  October
## 3 E     <NA>  July

Keep all rows from both tibbles with full_join:

full_join(x=tibX,
           y=tibY,
           by="ID")
## # A tibble: 5 x 3
##   ID    year  month  
##   <chr> <chr> <chr>  
## 1 A     2020  <NA>   
## 2 B     2021  <NA>   
## 3 C     2021  January
## 4 D     2020  October
## 5 E     <NA>  July

Note that columns do NOT need to be named the same way!
Let’s consider the new tibble tibZ:

tibZ <- tibble(id=LETTERS[3:5],
               month=c("May", "June", "April")
               )

We can join it with tibX by giving the “by” parameter a named vector that contains 1 element:

full_join(x=tibX,
           y=tibZ,
           by=c("ID" = "id")
)
## # A tibble: 5 x 3
##   ID    year  month
##   <chr> <chr> <chr>
## 1 A     2020  <NA> 
## 2 B     2021  <NA> 
## 3 C     2021  May  
## 4 D     2020  June 
## 5 E     <NA>  April

HANDS-ON

Join the 2 following tibbles (keep all rows from the mynames tibble):

mynames <- tibble(name=c("Einstein", "Newton", "Curie", "Mendel", "Franklin"),
                  birth_year=c(1879, 1643, 1867, 1822, 1920))
  
myemails <- tibble(full_name=c("Albert Einstein", "Isaac Newton", "Marie Curie", "Rosalind Franklin"),
                               email_address=c("aeinstein283@coolmail.com", "isaac.newton.scientist@coolmail.com", "mariecurie007@coolmail.com", "rosalind2_franklin@coolmail.com"))
Answer
# need to separate a column first!
myemails %>% separate(col=full_name,
                      into=c("first_name", "last_name"),
                      sep=" ") %>%
            right_join(y=mynames,
                       by=c("last_name" = "name"))

7.6.2 Extract or remove rows with slice:

  • Extract rows:
# Fetch the first 2 rows (index 1 and 2)
slice(presidential2, 
      1:2)
  • Remove rows:
# Remove 1rst and 4th rows
slice(presidential2, 
      -c(1,4))

The slice helpers can be useful:

Table 7.5: slice helpers
name usage
slice_min select rows with lowest values of a variable
slice_max select rows with highest values of a variable
slice_head select the first rows
slice_tail select the last rows
slice_sample randomly select rows

Extract the row that has the maximum height from in the starwars dataset with slice_max:

# by default, only 1 row is extracted
slice_max(starwars, 
          order_by=height)
## # A tibble: 1 x 14
##   name   height  mass hair_color skin_color eye_color birth_year sex   gender homeworld species
##   <chr>   <int> <dbl> <chr>      <chr>      <chr>          <dbl> <chr> <chr>  <chr>     <chr>  
## 1 Yarae…    264    NA none       white      yellow            NA male  mascu… Quermia   Quermi…
## # … with 3 more variables: films <list>, vehicles <list>, starships <list>
# set parameter "n" if you want to extract the "n" top rows
slice_max(starwars, 
          order_by=height,
          n=3)
## # A tibble: 3 x 14
##   name   height  mass hair_color skin_color eye_color birth_year sex   gender homeworld species
##   <chr>   <int> <dbl> <chr>      <chr>      <chr>          <dbl> <chr> <chr>  <chr>     <chr>  
## 1 Yarae…    264    NA none       white      yellow            NA male  mascu… Quermia   Quermi…
## 2 Tarff…    234   136 brown      brown      blue              NA male  mascu… Kashyyyk  Wookiee
## 3 Lama …    229    88 none       grey       black             NA male  mascu… Kamino    Kamino…
## # … with 3 more variables: films <list>, vehicles <list>, starships <list>

Same for the minimum with slice_min:

slice_min(starwars, 
          order_by=mass,
          n=2)
## # A tibble: 2 x 14
##   name   height  mass hair_color skin_color eye_color birth_year sex   gender homeworld species
##   <chr>   <int> <dbl> <chr>      <chr>      <chr>          <dbl> <chr> <chr>  <chr>     <chr>  
## 1 Ratts…     79    15 none       grey, blue unknown           NA male  mascu… Aleen Mi… Aleena 
## 2 Yoda       66    17 white      green      brown            896 male  mascu… <NA>      Yoda's…
## # … with 3 more variables: films <list>, vehicles <list>, starships <list>

Extract the first or last row with slice_head and slice_tail, respectively

# first row
slice_head(starwars)
## # A tibble: 1 x 14
##   name   height  mass hair_color skin_color eye_color birth_year sex   gender homeworld species
##   <chr>   <int> <dbl> <chr>      <chr>      <chr>          <dbl> <chr> <chr>  <chr>     <chr>  
## 1 Luke …    172    77 blond      fair       blue              19 male  mascu… Tatooine  Human  
## # … with 3 more variables: films <list>, vehicles <list>, starships <list>
# last row
slice_tail(starwars)
## # A tibble: 1 x 14
##   name   height  mass hair_color skin_color eye_color birth_year sex   gender homeworld species
##   <chr>   <int> <dbl> <chr>      <chr>      <chr>          <dbl> <chr> <chr>  <chr>     <chr>  
## 1 Padmé…    165    45 brown      light      brown             46 fema… femin… Naboo     Human  
## # … with 3 more variables: films <list>, vehicles <list>, starships <list>

You can extract the “n” first or last rows, or you can extract a certain proportions of rows to select with “prop”:

# first 5 rows
slice_head(starwars, 
           n=5)

# first 10% of the rows
slice_head(starwars, 
           prop=0.1)

# last 7 rows
slice_tail(starwars, 
           n=7)

# last 25% of the rows
slice_tail(starwars, 
           prop=0.25)

Select a random sample of rows with slice_random:

# 1 random row
slice_sample(starwars)
## # A tibble: 1 x 14
##   name   height  mass hair_color skin_color eye_color birth_year sex   gender homeworld species
##   <chr>   <int> <dbl> <chr>      <chr>      <chr>          <dbl> <chr> <chr>  <chr>     <chr>  
## 1 Plo K…    188    80 none       orange     black             22 male  mascu… Dorin     Kel Dor
## # … with 3 more variables: films <list>, vehicles <list>, starships <list>
# 4 random rows
slice_sample(starwars, 
             n=4)
## # A tibble: 4 x 14
##   name   height  mass hair_color skin_color eye_color birth_year sex   gender homeworld species
##   <chr>   <int> <dbl> <chr>      <chr>      <chr>          <dbl> <chr> <chr>  <chr>     <chr>  
## 1 Taun …    213    NA none       grey       black             NA fema… femin… Kamino    Kamino…
## 2 Gasga…    122    NA none       white, bl… black             NA male  mascu… Troiken   Xexto  
## 3 Grega…    185    85 black      dark       brown             NA male  mascu… Naboo     Human  
## 4 Saese…    188    NA none       pale       orange            NA male  mascu… Iktotch   Iktotc…
## # … with 3 more variables: films <list>, vehicles <list>, starships <list>
# 3% of rows selected randomly
slice_sample(starwars, 
             prop=0.03)
## # A tibble: 2 x 14
##   name   height  mass hair_color skin_color eye_color birth_year sex   gender homeworld species
##   <chr>   <int> <dbl> <chr>      <chr>      <chr>          <dbl> <chr> <chr>  <chr>     <chr>  
## 1 Plo K…    188    80 none       orange     black             22 male  mascu… Dorin     Kel Dor
## 2 Clieg…    183    NA brown      fair       blue              82 male  mascu… Tatooine  Human  
## # … with 3 more variables: films <list>, vehicles <list>, starships <list>

Note that the slice functions can be combined with a grouping.

  • With slice_max you would get the maximum per group (with slice_min, the minimum per group):
presidential2 %>% 
  group_by(party) %>%
  slice_max(order_by = duration_days)
## # A tibble: 5 x 5
## # Groups:   party [2]
##   name       start      end        party      duration_days
##   <chr>      <date>     <date>     <chr>      <drtn>       
## 1 Clinton    1993-01-20 2001-01-20 Democratic 2922 days    
## 2 Obama      2009-01-20 2017-01-20 Democratic 2922 days    
## 3 Eisenhower 1953-01-20 1961-01-20 Republican 2922 days    
## 4 Reagan     1981-01-20 1989-01-20 Republican 2922 days    
## 5 Bush       2001-01-20 2009-01-20 Republican 2922 days
presidential2 %>% 
  group_by(party) %>%
  slice_min(order_by = duration_days)
## # A tibble: 2 x 5
## # Groups:   party [2]
##   name    start      end        party      duration_days
##   <chr>   <date>     <date>     <chr>      <drtn>       
## 1 Kennedy 1961-01-20 1963-11-22 Democratic 1036 days    
## 2 Ford    1974-08-09 1977-01-20 Republican  895 days

7.6.3 Extract a single column as a vector with pull:

# extract column "duration_days"
presidential2 %>% pull(duration_days)
## Time differences in days
##  [1] 2922 1036 1886 2027  895 1461 2922 1461 2922 2922 2922
# extract column "duration_days" as a vector, and name the vector using the "name" column
presidential2 %>% pull(duration_days, 
                       name=name)
## Time differences in days
## Eisenhower    Kennedy    Johnson      Nixon       Ford     Carter     Reagan       Bush 
##       2922       1036       1886       2027        895       1461       2922       1461 
##    Clinton       Bush      Obama 
##       2922       2922       2922

7.6.4 Change column order with relocate:

# move column "party" as the start
relocate(presidential, 
         party)

# move column "name" before column "party"
relocate(presidential, 
         party, 
         .before=party)

# move column "name" at the end (after last column)
relocate(presidential, 
         name, 
         .after=last_col())

# move around all columns
relocate(presidential, 
         party, start, name, end)

# rename a column as you relocate it
relocate(presidential, 
         President=name, 
         .after=last_col())

# reorganize columns alphabetically
relocate(presidential, 
         sort(tidyselect::peek_vars()))