My personal notes from DataCamp’s courses:


The Art of Benchmarking

General

It’s unfair to compare R’s speed with C’s: C code runs faster, but R code is simpler and faster to write.

Simply keeping R up to date is the first step toward optimization.

  • v2.0: Lazy loading, i.e. fast loading of data with minimal expense of system memory;
  • v2.13: Speeding up functions with the byte compiler (see the sketch after this list);
  • v3.0: Support for large vectors;
  • Main releases every April (e.g. 3.0, 3.1, 3.2);
  • Smaller bug-fix releases throughout the year (e.g. 3.3.1, 3.3.2, 3.3.3).
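
A minimal sketch of the byte compiler at work (my own illustration, not from the course; note that since R 3.4 functions are byte-compiled automatically on first use, so this matters mostly on older versions):

library(compiler)

# A deliberately loop-heavy function
slow_sum <- function(x) {
  total <- 0
  for (i in seq_along(x)) total <- total + x[i]
  total
}

# Byte-compile it explicitly; on R < 3.4 this typically speeds it up
fast_sum <- cmpfun(slow_sum)
fast_sum(1:10)
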
# Print the R version details using version
version
##                _                           
## platform       x86_64-w64-mingw32          
## arch           x86_64                      
## os             mingw32                     
## system         x86_64, mingw32             
## status                                     
## major          3                           
## minor          6.3                         
## year           2020                        
## month          02                          
## day            29                          
## svn rev        77875                       
## language       R                           
## version.string R version 3.6.3 (2020-02-29)
## nickname       Holding the Windsock

Benchmarking

Reading data files is a common task that we often want to optimize. R provides a native format, .rds, which, especially when used without compression, is suitable for fast reading and writing.
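
For instance, the data/movies.rds file used below could have been created from the CSV like this (a one-time conversion; compress = FALSE trades file size for read/write speed):

movies <- read.csv('data/movies.csv')
# Save as an uncompressed .rds file for fast re-reading
saveRDS(movies, 'data/movies.rds', compress = FALSE)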

# How long does it take to read movies from CSV?
system.time(read.csv('data/movies.csv'))
##    user  system elapsed 
##    0.49    0.00    0.48
# How long does it take to read movies from RDS?
system.time(readRDS('data/movies.rds'))
##    user  system elapsed 
##    0.05    0.00    0.04

Using the microbenchmark package:

# Load the microbenchmark package
library(microbenchmark)

# Compare the two functions
compare <- microbenchmark(read.csv('data/movies.csv'), 
                          readRDS('data/movies.rds'), 
                          times = 10)

# Print compare
compare
## Unit: milliseconds
##                         expr      min       lq      mean    median       uq
##  read.csv("data/movies.csv") 454.5975 458.2171 468.16173 462.89610 467.3220
##   readRDS("data/movies.rds")  55.8068  56.4693  67.94085  59.28785  68.7131
##       max neval
##  508.0392    10
##  101.7521    10

Getting hardware info and benchmarking my machine’s performance:

# Load the benchmarkme package
library(benchmarkme)

# Assign the variable ram to the amount of RAM on this machine
ram <- get_ram()
ram
## NA B
# Assign the variable cpu to the cpu specs
cpu <- get_cpu()
cpu
## $vendor_id
## [1] "GenuineIntel"
## 
## $model_name
## [1] "Intel(R) Core(TM) i5-7200U CPU @ 2.50GHz"
## 
## $no_of_cores
## [1] 4
# Run the io benchmark
res <- benchmark_io(runs = 1, size = 5)
## Preparing read/write io
## # IO benchmarks (2 tests) for size 5 MB:
##   Writing a csv with 625000 values: 1.87 (sec).
##   Reading a csv with 625000 values: 0.53 (sec).
# Plot the results
plot(res)
## You are ranked 102 out of 135 machines.
## Press return to get next plot
## You are ranked 125 out of 135 machines.

# Contribute to community
# upload_results(res)

Fine Tuning: Efficient Base R

Memory allocation

Avoid growing a vector; instead, pre-allocate it with the correct size and type. Each time you increase the size of a vector, R has to request more memory from the OS, which makes the code run slowly.

n <- 30000

# Slow code
growing <- function(n) {
    x <- NULL
    for(i in 1:n)
        x <- c(x, rnorm(1))
    x
}

# Fast code
pre_allocate <- function(n) {
    x <- numeric(n) # Pre-allocate
    for(i in 1:n) 
        x[i] <- rnorm(1)
    x
}

# Use <- with system.time() to store the results of performance measures
system.time(res_grow <- growing(n))
##    user  system elapsed 
##    1.61    0.04    1.64
system.time(res_allocate <- pre_allocate(n))
##    user  system elapsed 
##    0.06    0.00    0.06

Vectorize your code

Calling a native R function eventually leads to C or FORTRAN code, which is heavily optimized.

Use a vectorized solution whenever possible:

n <- 1e6
x <- vector("numeric", n)
microbenchmark(
  x <- rnorm(n), # vectorized solution
  {
    for(i in seq_along(x)) # not vectorized
      x[i] <- rnorm(1)
  },
  times = 10)
## Unit: milliseconds
##                                              expr       min        lq
##                                     x <- rnorm(n)   74.3023   75.5989
##  {     for (i in seq_along(x)) x[i] <- rnorm(1) } 1885.9603 1968.0757
##        mean     median        uq       max neval
##    84.91396   77.93325   83.0169  120.2617    10
##  2101.44342 1979.77340 2111.3313 2596.8162    10
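
Another sketch of the same idea (my own example, not from the course): replacing an accumulating loop with a single vectorized call, so the per-element work happens in C rather than in the R interpreter.

x <- rnorm(1e6)

# Loop version: one R-level iteration per element
total <- 0
for (i in seq_along(x)) total <- total + x[i]^2

# Vectorized version: squaring and summing are done in C
total <- sum(x^2)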

Data frames vs matrices

Work with matrices whenever possible to optimize your code. Matrices perform better, especially when extracting rows: a matrix is a single contiguous block of one type, whereas a data frame stores each column separately, so extracting a row means touching every column.

mat <- matrix(rnorm(100000), ncol = 1000)
df <- as.data.frame(mat)

# extracting columns
microbenchmark(mat[,1], df[,1])
## Unit: nanoseconds
##      expr  min   lq  mean median    uq   max neval
##  mat[, 1]  700 1100  1706   1400  2000  8100   100
##   df[, 1] 6700 7650 12693  13800 15500 52900   100
# extracting rows
microbenchmark(mat[1,], df[1,])
## Unit: microseconds
##      expr    min      lq     mean  median      uq     max neval
##  mat[1, ]    5.1    6.35   21.166   17.40   32.00    57.1   100
##   df[1, ] 5092.8 5571.40 6336.108 6093.75 6706.55 10513.8   100

Diagnosing Problems: Code Profiling

Use the profvis package to profile your code and find bottlenecks.

library(profvis)
profvis({
  data(movies, package = "ggplot2movies") # Load data
  braveheart <- movies[7288,]
  movies <- movies[movies$Action == 1,]
  plot(movies$year, movies$rating, xlab = "Year", ylab="Rating")
  model <- loess(rating ~ year, data = movies) # loess regression line
  j <- order(movies$year)
  lines(movies$year[j], model$fitted[j], col="forestgreen", lwd=2)
  points(braveheart$year, braveheart$rating, pch = 21, bg = "steelblue", 
         cex = 3)
  })

Turbo Charged Code: Parallel Programming

The parallel package allows processing on multiple cores. Not every routine can benefit from, or even be executed in, parallel; only loops with no dependencies among iterations can. Besides that, if you have few iterations, the communication between workers may not compensate for using multiple cores.

library(parallel)

play <- function() {
    total <- no_of_rolls <- 0
    while(total < 10) {
      total <- total + sample(1:6, 1)
  
      # If even. Reset to 0
      if(total %% 2 == 0) total <- 0 
      no_of_rolls <- no_of_rolls + 1
    }
    no_of_rolls
}

# Set the number of games to play
no_of_games <- 1e5

## Time serial version
system.time(serial <- sapply(1:no_of_games, function(i) play()))
##    user  system elapsed 
##    8.53    0.00    8.56
## Set up cluster
cl <- makeCluster(2) # 2 cores
clusterExport(cl, "play")

## Time parallel version
system.time(par <- parSapply(cl, 1:no_of_games, function(i) play()))
##    user  system elapsed 
##    0.04    0.02    5.41
## Stop cluster
stopCluster(cl)
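
Conversely, when each iteration is trivial, the communication overhead dominates. A minimal sketch (my own example; sqrt stands in for cheap work, and exact timings will vary by machine):

cl <- makeCluster(2)

# Cheap per-iteration work: shipping tasks to the workers usually
# costs more than the work itself, so the serial version tends to win
system.time(sapply(1:1e4, sqrt))
system.time(parSapply(cl, 1:1e4, sqrt))

stopCluster(cl)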

An excellent example using foreach with the doParallel backend:

suppressPackageStartupMessages(library(dplyr))
library(stringr)
library(janeaustenr)
library(doParallel)
## Loading required package: foreach
## Loading required package: iterators
library(foreach)
library(microbenchmark)

janeausten_words <- function() {
  # Names of the six books contained in janeaustenr
  books <- austen_books()$book %>% unique %>% as.character
  # Vector of words from all six books
  words <- sapply(books, extract_words) %>% unlist
  words
}

extract_words <- function(book_name) {
  # extract the text of the book
  text <- subset(austen_books(), book == book_name)$text
  # extract words from the text and convert to lowercase
  str_extract_all(text, boundary("word")) %>% unlist %>% tolower
}

max_frequency <- function(letter, words, min_length = 1) {
  w <- select_words(letter, words = words, min_length = min_length)
  frequency <- table(w)    
  frequency[which.max(frequency)]
}

select_words <- function(letter, words, min_length = 1) {
  min_length_words <- words[nchar(words) >= min_length]
  grep(paste0("^", letter), min_length_words, value = TRUE)
}

# Vector of words from all six books
words <- janeausten_words()

# function that runs in parallel
freq_doPar <- function(cores, min_length = 5) {
  # Register a cluster of size cores
  registerDoParallel(cores = cores)
  
  # foreach loop
  foreach(let = letters, .combine = c, 
          .export = c('max_frequency', "select_words", "words"),
          .packages = c('janeaustenr', 'stringr')) %dopar%
    max_frequency(let, words = words, min_length = min_length)
}

# function that runs sequentially
freq_seq <- function(min_length = 5) {
  foreach(let = letters, .combine = c) %do%
    max_frequency(let, words = words, min_length = min_length)
}

microbenchmark(
  { res_seq <- freq_seq() },
  { res_par <- freq_doPar(2) },
  times = 1, 
  unit = 's')
## Unit: seconds
##                              expr      min       lq     mean   median       uq
##     {     res_seq <- freq_seq() } 7.485440 7.485440 7.485440 7.485440 7.485440
##  {     res_par <- freq_doPar(2) } 7.559246 7.559246 7.559246 7.559246 7.559246
##       max neval
##  7.485440     1
##  7.559246     1
all.equal(res_par, res_seq)
## [1] TRUE

Reproducing results using Parallel Programming

You must use clusterSetRNGStream() to set the seed across a parallel cluster if you want reproducible random numbers.

The code below shows how to activate the RNG stream in parallel programming:

# Create a cluster
cl <- makeCluster(2)

# Check RNGkind on workers
clusterCall(cl, RNGkind)
## [[1]]
## [1] "Mersenne-Twister" "Inversion"        "Rejection"       
## 
## [[2]]
## [1] "Mersenne-Twister" "Inversion"        "Rejection"
# Set the RNG seed on workers
clusterSetRNGStream(cl, iseed = 100)

# Check RNGkind on workers
clusterCall(cl, RNGkind)
## [[1]]
## [1] "L'Ecuyer-CMRG" "Inversion"     "Rejection"    
## 
## [[2]]
## [1] "L'Ecuyer-CMRG" "Inversion"     "Rejection"
stopCluster(cl)

Using the RNG stream in parallel:

mean_of_rnorm <- function(n) {
  random_numbers <- rnorm(n)
  mean(random_numbers)
}

n_vec <- rep(1000, 5)

cl <- makeCluster(2)

replicate(
  n = 3,
  expr = {
    # Set the cluster's RNG stream seed to 1234
    clusterSetRNGStream(cl, iseed = 1234)
    clusterApply(cl, n_vec, mean_of_rnorm)
  }
)
##      [,1]         [,2]         [,3]        
## [1,] -0.008597904 -0.008597904 -0.008597904
## [2,] -0.006089337 -0.006089337 -0.006089337
## [3,] -0.01398052  -0.01398052  -0.01398052 
## [4,] -0.06629339  -0.06629339  -0.06629339 
## [5,] 0.004297755  0.004297755  0.004297755
stopCluster(cl)