7.2 select

select will select (and optionally rename) columns/variables in a data frame / tibble.

Select column “name” only from the presidential2 object:

name	start	end	party	duration_days
Eisenhower	1953-01-20	1961-01-20	Republican	2922 days
Kennedy	1961-01-20	1963-11-22	Democratic	1036 days
Johnson	1963-11-22	1969-01-20	Democratic	1886 days
Nixon	1969-01-20	1974-08-09	Republican	2027 days
Ford	1974-08-09	1977-01-20	Republican	895 days
Carter	1977-01-20	1981-01-20	Democratic	1461 days
Reagan	1981-01-20	1989-01-20	Republican	2922 days
Bush	1989-01-20	1993-01-20	Republican	1461 days
Clinton	1993-01-20	2001-01-20	Democratic	2922 days
Bush	2001-01-20	2009-01-20	Republican	2922 days
Obama	2009-01-20	2017-01-20	Democratic	2922 days

select(presidential2, 
       name)

Select 2 columns: party and name (in that order):

select(presidential2, 
       party, name)

Rename a column as you select it:

select(presidential2, 
       party, President=name)

Select all columns except party with the - sign:

select(presidential2, 
       -party)

Select all columns between start and party (with both columns included)

 select(presidential2, 
        start:party)

## # A tibble: 11 x 3
##    start      end        party     
##    <date>     <date>     <chr>     
##  1 1953-01-20 1961-01-20 Republican
##  2 1961-01-20 1963-11-22 Democratic
##  3 1963-11-22 1969-01-20 Democratic
##  4 1969-01-20 1974-08-09 Republican
##  5 1974-08-09 1977-01-20 Republican
##  6 1977-01-20 1981-01-20 Democratic
##  7 1981-01-20 1989-01-20 Republican
##  8 1989-01-20 1993-01-20 Republican
##  9 1993-01-20 2001-01-20 Democratic
## 10 2001-01-20 2009-01-20 Republican
## 11 2009-01-20 2017-01-20 Democratic

7.2.1 select_if

Select only columns containing characters with select_if():

# select columns containing characters:
select_if(presidential2, 
    is.character)

## # A tibble: 11 x 2
##    name       party     
##    <chr>      <chr>     
##  1 Eisenhower Republican
##  2 Kennedy    Democratic
##  3 Johnson    Democratic
##  4 Nixon      Republican
##  5 Ford       Republican
##  6 Carter     Democratic
##  7 Reagan     Republican
##  8 Bush       Republican
##  9 Clinton    Democratic
## 10 Bush       Republican
## 11 Obama      Democratic

Select only columns containing a date with the is.Date function from the lubridate package:

select_if(presidential2, 
      lubridate::is.Date)

## # A tibble: 11 x 2
##    start      end       
##    <date>     <date>    
##  1 1953-01-20 1961-01-20
##  2 1961-01-20 1963-11-22
##  3 1963-11-22 1969-01-20
##  4 1969-01-20 1974-08-09
##  5 1974-08-09 1977-01-20
##  6 1977-01-20 1981-01-20
##  7 1981-01-20 1989-01-20
##  8 1989-01-20 1993-01-20
##  9 1993-01-20 2001-01-20
## 10 2001-01-20 2009-01-20
## 11 2009-01-20 2017-01-20

HANDS-ON

For the next hands-on, we will use th starwars dataset: it contains information about the Star Wars movie characters:

name	height	mass	hair_color	skin_color	eye_color	birth_year	sex	gender
Luke Skywalker	172	77	blond	fair	blue	19.0	male	masculine
C-3PO	167	75	NA	gold	yellow	112.0	none	masculine
R2-D2	96	32	NA	white, blue	red	33.0	none	masculine
Darth Vader	202	136	none	white	yellow	41.9	male	masculine
Leia Organa	150	49	brown	light	brown	19.0	female	feminine

Create a new column BMI that contains the BMI of each character (Body Mass Index, calculated as weight in kg / (height in m)^2: we will assume that the height column is expressed as cm and the mass column is expressed as kg)
Rename column name to character_name.
Remove columns vehicles and starships.
Save all the changes into the new tibble starwarsBMI (use the %>% !).

Answer

# Create a new column BMI that contains the BMI of each character
mutate(starwars, BMI=mass/(height*0.01)^2)

# Rename column name to character_name.
rename(starwars, character_name=name)

# Remove columns `vehicles` and `starships`.
select(starwars, -c(vehicles, starships))

# Save all the changes into the new tibble starwarsBMI (use the %>% operator)
starwarsBMI <- starwars %>% 
  mutate(BMI=mass/(height*0.01)^2) %>% 
  select(-c(vehicles, starships)) %>%
  rename(character_name=name)

7.2.2 select using patterns

Some select helpers are available, that help you select columns given certain patterns in their names:

Table 7.3: select helpers
name	description
starts_with	starts with a prefix
ends_with	ends with a suffix
contains	contains a literal string
matches	matches a regular expression
num_range	matches a numerical range like x01, x02, x03

For example, select only columns from the starwars dataset which name end with “color”:

select(starwars,
       ends_with("color"))

## # A tibble: 87 x 3
##    hair_color    skin_color  eye_color
##    <chr>         <chr>       <chr>    
##  1 blond         fair        blue     
##  2 <NA>          gold        yellow   
##  3 <NA>          white, blue red      
##  4 none          white       yellow   
##  5 brown         light       brown    
##  6 brown, grey   light       blue     
##  7 brown         light       blue     
##  8 <NA>          white, red  red      
##  9 black         light       brown    
## 10 auburn, white fair        blue-gray
## # … with 77 more rows

Or which start with the letter h:

select(starwars,
       starts_with("h"))

## # A tibble: 87 x 3
##    height hair_color    homeworld
##     <int> <chr>         <chr>    
##  1    172 blond         Tatooine 
##  2    167 <NA>          Tatooine 
##  3     96 <NA>          Naboo    
##  4    202 none          Tatooine 
##  5    150 brown         Alderaan 
##  6    178 brown, grey   Tatooine 
##  7    165 brown         Tatooine 
##  8     97 <NA>          Tatooine 
##  9    183 black         Tatooine 
## 10    182 auburn, white Stewjon  
## # … with 77 more rows

If you are familiar with regular expressions, you can also use them within the matches function:

select(starwars,
       matches("^h")) # same as starts_with("h")

## # A tibble: 87 x 3
##    height hair_color    homeworld
##     <int> <chr>         <chr>    
##  1    172 blond         Tatooine 
##  2    167 <NA>          Tatooine 
##  3     96 <NA>          Naboo    
##  4    202 none          Tatooine 
##  5    150 brown         Alderaan 
##  6    178 brown, grey   Tatooine 
##  7    165 brown         Tatooine 
##  8     97 <NA>          Tatooine 
##  9    183 black         Tatooine 
## 10    182 auburn, white Stewjon  
## # … with 77 more rows

Finally, you can select columns which name match a numerical range with num_range.
For example, let’s take the billboard dataset that contains column names wk1, wk2, wk3 … up to wk76, and select only columns from wk18 to wk22:

select(billboard,
       num_range("wk", 18:22))

## # A tibble: 317 x 5
##     wk18  wk19  wk20  wk21  wk22
##    <dbl> <dbl> <dbl> <dbl> <dbl>
##  1    NA    NA    NA    NA    NA
##  2    NA    NA    NA    NA    NA
##  3    18    18    14    12     7
##  4    67    73    70    NA    NA
##  5    96    NA    NA    NA    NA
##  6    84    93    94    NA    NA
##  7    NA    NA    NA    NA    NA
##  8    67    83    86    NA    NA
##  9     2     3     4     5     5
## 10    71    79    89    NA    NA
## # … with 307 more rows