Down with OPP (Other People's Pipes)

JD Long
May 9, 2018

Where are we going?

  • dplyr can be fast to write
  • dplyr can write SQL
  • dplyr writing SQL is fast
  • One controversial idea
  • One warning
  • One hack (row-wise)

Where can I find this prez?

Hosted Slideshow: http://bit.ly/opp_dplyr

Source code on Github: http://bit.ly/opp_source

JD Who?


OPP?


Tidy by Nature

R puns rock!

sink("/dev/null")

The Unsinkable /dev/null


RenaissanceRe


We're Hiring!

NYC Risk Analyst

  • early/mid career
  • Not R centric :(
  • Team with me :)

http://bit.ly/rnr_job

dplyr - set up example

library(tidyverse)

mtcars %>%
  rownames_to_column( var = 'longname') ->
df_cars

dplyr - select

df_cars %>% 
  select(longname, cyl, hp) %>%
  head
           longname cyl  hp
1         Mazda RX4   6 110
2     Mazda RX4 Wag   6 110
3        Datsun 710   4  93
4    Hornet 4 Drive   6 110
5 Hornet Sportabout   8 175
6           Valiant   6 105

dplyr - filter

df_cars %>% 
  select(longname, cyl, hp) %>%
  filter( cyl == 8) %>%
  head
            longname cyl  hp
1  Hornet Sportabout   8 175
2         Duster 360   8 245
3         Merc 450SE   8 180
4         Merc 450SL   8 180
5        Merc 450SLC   8 180
6 Cadillac Fleetwood   8 205

dplyr - mutate

df_cars %>% 
  select(longname, cyl, hp) %>%
  filter( cyl == 8) %>%
  mutate( sqrt_hp = sqrt(hp), 
          shortname = word(longname, 1) ) %>%
  head
            longname cyl  hp  sqrt_hp shortname
1  Hornet Sportabout   8 175 13.22876    Hornet
2         Duster 360   8 245 15.65248    Duster
3         Merc 450SE   8 180 13.41641      Merc
4         Merc 450SL   8 180 13.41641      Merc
5        Merc 450SLC   8 180 13.41641      Merc
6 Cadillac Fleetwood   8 205 14.31782  Cadillac

dplyr - group_by

df_cars %>% 
  select(longname, cyl, hp) %>%
  filter( cyl == 8) %>%
  mutate( sqrt_hp = sqrt(hp), 
          shortname = word(longname, 1) ) %>%
  select( - longname) %>%
  group_by(shortname) %>%
  head
# A tibble: 6 x 4
# Groups:   shortname [4]
    cyl    hp sqrt_hp shortname
  <dbl> <dbl>   <dbl> <chr>    
1    8.  175.    13.2 Hornet   
2    8.  245.    15.7 Duster   
3    8.  180.    13.4 Merc     
4    8.  180.    13.4 Merc     
5    8.  180.    13.4 Merc     
6    8.  205.    14.3 Cadillac 

clean up - note changes

df_cars %>% 
  select(longname, cyl, hp) %>%
  mutate( shortname = word(longname, 1) ) %>%
  select( - longname)  ->
df_cars_limited
head( df_cars_limited )
  cyl  hp shortname
1   6 110     Mazda
2   6 110     Mazda
3   4  93    Datsun
4   6 110    Hornet
5   8 175    Hornet
6   6 105   Valiant

tangent: why pipe?

head(
  select(
    mutate( 
      select(df_cars, longname, cyl, hp) , 
      shortname = word(longname, 1) 
      ), 
    -longname 
    )
)
  cyl  hp shortname
1   6 110     Mazda
2   6 110     Mazda
3   4  93    Datsun
4   6 110    Hornet
5   8 175    Hornet
6   6 105   Valiant

tangent: why pipe? (cont)

df_cars_3vars <-
  select(df_cars, longname, cyl, hp)

df_cars_3vars_shortname <-
  mutate( df_cars_3vars, 
          shortname = word(df_cars_3vars$longname, 1) ) 

df_cars_3vars_shortname <-
  select( df_cars_3vars_shortname, - longname)  

head( df_cars_3vars_shortname )
  cyl  hp shortname
1   6 110     Mazda
2   6 110     Mazda
3   4  93    Datsun
4   6 110    Hornet
5   8 175    Hornet
6   6 105   Valiant

Before the tangent...

df_cars %>% 
  select(longname, cyl, hp) %>%
  mutate( shortname = word(longname, 1) ) %>%
  select( - longname)  ->
df_cars_limited
head( df_cars_limited )
  cyl  hp shortname
1   6 110     Mazda
2   6 110     Mazda
3   4  93    Datsun
4   6 110    Hornet
5   8 175    Hornet
6   6 105   Valiant

dplyr - summarize

df_cars_limited %>%
  group_by(shortname, cyl)%>% 
  summarize( avg_hp = mean(hp) )  %>%
  head(3)
# A tibble: 3 x 3
# Groups:   shortname [3]
  shortname   cyl avg_hp
  <chr>     <dbl>  <dbl>
1 AMC          8.   150.
2 Cadillac     8.   205.
3 Camaro       8.   245.

Note the grouping change. wtf?

dplyr - join

cyl_map <- data.frame(
  cyl = c(4,6,8), 
  cyl_name = c('four','six','eight'))

df_cars_limited %>% 
  left_join( cyl_map ) %>%
  head(3)
  cyl  hp shortname cyl_name
1   6 110     Mazda      six
2   6 110     Mazda      six
3   4  93    Datsun     four

dplyr - case_when (+ sort)

df_cars_limited %>% 
  mutate(Pow= case_when( hp>200 & cyl>=8 ~ 'Strong', 
                     TRUE ~ 'Weak') ) %>%
  arrange(-hp)
   cyl  hp shortname    Pow
1    8 335  Maserati Strong
2    8 264      Ford Strong
3    8 245    Duster Strong
4    8 245    Camaro Strong
5    8 230  Chrysler Strong
6    8 215   Lincoln Strong
7    8 205  Cadillac Strong
8    8 180      Merc   Weak
9    8 180      Merc   Weak
10   8 180      Merc   Weak
11   8 175    Hornet   Weak
12   8 175   Pontiac   Weak
13   6 175   Ferrari   Weak
14   8 150     Dodge   Weak
15   8 150       AMC   Weak
16   6 123      Merc   Weak
17   6 123      Merc   Weak
18   4 113     Lotus   Weak
19   6 110     Mazda   Weak
20   6 110     Mazda   Weak
21   6 110    Hornet   Weak
22   4 109     Volvo   Weak
23   6 105   Valiant   Weak
24   4  97    Toyota   Weak
25   4  95      Merc   Weak
26   4  93    Datsun   Weak
27   4  91   Porsche   Weak
28   4  66      Fiat   Weak
29   4  66      Fiat   Weak
30   4  65    Toyota   Weak
31   4  62      Merc   Weak
32   4  52     Honda   Weak

mutate vs. summarize

mutate adds a column but keeps same # rows

summarize adds a column but reduces the rows down to one row per unique group based on group_by

database time!

con <- DBI::dbConnect(RSQLite::SQLite(), 
                      path = ":memory:")
## could use memdb_frame

copy_to(con, df_cars, "df_cars",
  temporary = FALSE)

df_cars_db <- tbl(con, "df_cars")

what are these things?

class(df_cars)
[1] "data.frame"
class(as.tibble(df_cars))
[1] "tbl_df"     "tbl"        "data.frame"
class(df_cars_db)
[1] "tbl_dbi"  "tbl_sql"  "tbl_lazy" "tbl"     

so this looks sane...

df_cars_db %>% 
  group_by( cyl) %>%
  summarize( avg = mean(hp),
             std = sd(hp) )  ->
mean_hp_by_cyl

head( mean_hp_by_cyl )
# Source:   lazy query [?? x 3]
# Database: sqlite 3.22.0 []
    cyl   avg   std
  <dbl> <dbl> <dbl>
1    4.  82.6  20.9
2    6. 122.   24.3
3    8. 209.   51.0

what's in mean_hp_by_cyl?

df_cars_db %>% 
  group_by( cyl) %>%
  summarize( avg = mean(hp),
             std = sd(hp) )  %>%
  show_query 
<SQL>
SELECT 
  `cyl`, 
  AVG(`hp`) AS `avg`, 
  STDEV(`hp`) AS `std`
FROM `df_cars`
GROUP BY `cyl`

automagic subqueries

df_cars_db %>% 
  group_by( cyl) %>%
  summarize( avg = mean(hp),
             std = sd(hp) )  %>%
  filter( avg > 100) %>%
  show_query 
<SQL>
SELECT *
FROM ( SELECT `cyl`, 
       AVG(`hp`) AS `avg`, 
       STDEV(`hp`) AS `std`
       FROM `df_cars`
       GROUP BY `cyl`
     )
WHERE (`avg` > 100.0)

oh yes...

lazy af...

lazy af...

mean_hp_by_cyl %>%
  collect ->
local_mean_hp_by_cyl

head(local_mean_hp_by_cyl)
# A tibble: 3 x 3
    cyl   avg   std
  <dbl> <dbl> <dbl>
1    4.  82.6  20.9
2    6. 122.   24.3
3    8. 209.   51.0
class(local_mean_hp_by_cyl)
[1] "tbl_df"     "tbl"        "data.frame"

R Mapped to SQL

math operators: +, -, *, /, %%, ^

math functions: abs, acos, acosh, asin, asinh, atan, atan2, atanh, ceiling, cos, cosh, cot, coth, exp, floor, log, log10, round, sign, sin, sinh, sqrt, tan, tanh

logical comparisons: <, <=, !=, >=, >, ==, %in%

boolean operations: &, &&, |, ||, !, xor

basic aggregations: mean, sum, min, max, sd, var

string functions: tolower, toupper, trimws, nchar, substr

coerce types: as.numeric, as.integer, as.character

KNEAD MOAR?

vignette("sql-translation")

reprex

tbl_lazy & simulate_dbi

library(dbplyr)
data.frame(x = 1, y = 2) %>%
  tbl_lazy(src = simulate_sqlite()) ->
df_sqlite ## this is a lazy table

df_sqlite %>% 
  summarise(x = sd(x)) %>%
  show_query()
<SQL> SELECT STDEV(`x`) AS `x`
FROM `df`

simulate functions

dbi           sqlite
postgres      mysql
odbc          impala
mssql         oracle
hive          odbc_postgresql
teradata      odbc_access

Common Snags (for me)

1) Search Path:

dbSendQuery(con, 
    build_sql("set search_path to 
    '$user', public, reports, sandbox"))

2) Name Clashes on Join

## drop the duplicate field or rename
df_cars_db <- tbl(con, "df_cars") %>% select(-wt)

workflow with redshift

count(*)
349,523,453,670
  • pre define joins
  • pass around a “big table”
  • set up criteria
  • transform
  • collect

WARNING!!!

dplyr can stomp on Core R functions:

head( lag(ldeaths, 12) )
[1] 3035 2552 2704 2554 2014 1655
head( lag(ldeaths, k=12) == 
        head( lag(ldeaths, 12) ) )
[1] TRUE TRUE TRUE TRUE TRUE TRUE

WARNING!!!

Now load dplyr

library(dplyr)
head( lag(ldeaths, 12) )
Error: `x` must be a vector, not a ts object, do you want `stats::lag()`?
head( lag(ldeaths, k=12) )
Error: `x` must be a vector, not a ts object, do you want `stats::lag()`?

WARNING!!!

must clarify in your code:

library(dplyr)
head( stats::lag(ldeaths, 12) )
[1] 3035 2552 2704 2554 2014 1655
head( stats::lag(ldeaths, k=12) )
[1] 3035 2552 2704 2554 2014 1655

WTF?!?! Hidden Failure!

vec_dbl <- as.double(ldeaths)
head( lag(vec_dbl, 12) )
[1] 3035 2552 2704 2554 2014 1655
library(dplyr)
head( lag(vec_dbl, 12) )
[1] NA NA NA NA NA NA

Opinions Vary

Controversy

Right assignment after a pipe chain is teh awesome1!11!!

data %>%
  fun %>%
  more_fun ->
fun_fun_data

No left indent on the result, y'all!

row-wise design pattern

fun <- function(a, b, c){
  sum(seq(a,b,c))
}
df <- data.frame(mn=c(1,2,3), 
                 mx=c(8,13,18), 
                 rng=c(1,2,3))
df
  mn mx rng
1  1  8   1
2  2 13   2
3  3 18   3

@JennyBryan: http://bit.ly/rowwise

row-wise

Bug here… see next slide

df %>% 
  mutate( output = 
            pmap(list(a=mn, b=mx, c=rng), fun)
        )
  mn mx rng output
1  1  8   1     36
2  2 13   2     42
3  3 18   3     63

row-wise bug - the video

Video Describing my mistake: http://bit.ly/pmap_bug

I can't embed Loom video in this prez… no iframe support in *.rpres

row-wise: my bug

pmap returns a list

df %>% 
  mutate(output = 
          pmap(list(a=mn, b=mx, c=rng), fun)
        ) -> 
out_df
str(out_df)
'data.frame':   3 obs. of  4 variables:
 $ mn    : num  1 2 3
 $ mx    : num  8 13 18
 $ rng   : num  1 2 3
 $ output:List of 3
  ..$ : num 36
  ..$ : num 42
  ..$ : num 63

row-wise: my bug cont

pmap_dbl returns vec of doubles

df %>% 
  mutate(output = 
          pmap_dbl(list(a=mn, b=mx, c=rng), fun)
        ) -> 
out_df
str(out_df)
'data.frame':   3 obs. of  4 variables:
 $ mn    : num  1 2 3
 $ mx    : num  8 13 18
 $ rng   : num  1 2 3
 $ output: num  36 42 63

vector in a df cell

fun2 <- function(a, b, c){
  seq(a,b,c)
}
df %>% 
  mutate( output = 
            pmap(list(a=mn, b=mx, c=rng), fun2)
          )
  mn mx rng                 output
1  1  8   1 1, 2, 3, 4, 5, 6, 7, 8
2  2 13   2     2, 4, 6, 8, 10, 12
3  3 18   3    3, 6, 9, 12, 15, 18