My personal notes from DataCamp’s course


Downloading Files and Using API Clients

Introduction

Read functions like read.csv() and read.delim() can accept URLs in place of local paths. You can also use download.file() to save a local copy first.

csv_url <- paste0("http://s3.amazonaws.com/assets.datacamp.com/production",
                  "/course_1561/datasets/chickwts.csv")

# Download the file with download.file()
download.file(url = csv_url, destfile = 'data/feed_data.csv')

# Read it in with read.csv()
csv_data <- read.csv('data/feed_data.csv')
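
Reading directly from the URL also works; a quick sketch reusing csv_url (no local copy is kept):

# Read the CSV straight from the URL, without downloading it first
csv_data_direct <- read.csv(csv_url)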

APIs

R has several packages that implement clients for specific web APIs. To find one, search for "CRAN" together with the name of the API.

An example with pageviews, a client for Wikipedia's pageview API:

# Load pageviews
library(pageviews)

# Get the pageviews for "Hadley Wickham"
hadley_pageviews <- article_pageviews(project = "en.wikipedia", 
                                      "Hadley Wickham")

# Examine the resulting object
str(hadley_pageviews)
## 'data.frame':    1 obs. of  8 variables:
##  $ project    : chr "wikipedia"
##  $ language   : chr "en"
##  $ article    : chr "Hadley_Wickham"
##  $ access     : chr "all-access"
##  $ agent      : chr "all-agents"
##  $ granularity: chr "daily"
##  $ date       : POSIXct, format: "2015-10-01"
##  $ views      : num 53

Using httr to interact with APIs directly

The httr package helps you interact with APIs directly, building the requests yourself.

library(httr)

url <- paste0("https://wikimedia.org/api/rest_v1/metrics/pageviews/",
              "per-article/en.wikipedia.org/all-access/all-agents/",
              "Hadley_Wickham/daily/20170101/20170102")

# Make a GET request to url and save the results
pageview_response <- GET(url)

# Call content() to retrieve the data the server sent back
pageview_data <- content(pageview_response)

# Examine the results with str()
str(pageview_data)
## List of 1
##  $ items:List of 2
##   ..$ :List of 7
##   .. ..$ project    : chr "en.wikipedia"
##   .. ..$ article    : chr "Hadley_Wickham"
##   .. ..$ granularity: chr "daily"
##   .. ..$ timestamp  : chr "2017010100"
##   .. ..$ access     : chr "all-access"
##   .. ..$ agent      : chr "all-agents"
##   .. ..$ views      : int 45
##   ..$ :List of 7
##   .. ..$ project    : chr "en.wikipedia"
##   .. ..$ article    : chr "Hadley_Wickham"
##   .. ..$ granularity: chr "daily"
##   .. ..$ timestamp  : chr "2017010200"
##   .. ..$ access     : chr "all-access"
##   .. ..$ agent      : chr "all-agents"
##   .. ..$ views      : int 86

The function httr::http_error() checks whether the response code signals an error:

fake_url <- "http://google.com/fakepagethatdoesnotexist"

# Make the GET request
request_result <- GET(fake_url)

# Check request_result
if(http_error(request_result)){
    warning("The request failed")
} else {
    content(request_result)
}
## Warning: The request failed
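
If you need the numeric code rather than a TRUE/FALSE, httr::status_code() returns it; a quick check on the same result:

# Inspect the numeric HTTP status code (e.g. 404 for a missing page)
status_code(request_result)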

Use the query argument for parameter-based API calls:

# Create list with nationality and country elements
query_params <- list(nationality = 'americans', 
    country = 'antigua')
    
# Make parameter-based call to httpbin, with query_params
parameter_response <- GET('https://httpbin.org/get', query = query_params)

# Print parameter_response
parameter_response
## Response [https://httpbin.org/get?nationality=americans&country=antigua]
##   Date: 2020-06-14 23:22
##   Status: 200
##   Content-Type: application/json
##   Size: 470 B
## {
##   "args": {
##     "country": "antigua", 
##     "nationality": "americans"
##   }, 
##   "headers": {
##     "Accept": "application/json, text/xml, application/xml, */*", 
##     "Accept-Encoding": "deflate, gzip, br", 
##     "Host": "httpbin.org", 
##     "User-Agent": "libcurl/7.68.0 r-curl/4.3 httr/1.4.1", 
## ...

A typical pattern for consuming an API while respecting its rate limits:

# Construct a vector of 2 URLs
urls <- c('http://httpbin.org/status/404', 'http://httpbin.org/status/301')

for(url in urls){
    # Send a GET request to url
    result <- GET(url)
    # Delay for 1 second between requests
    Sys.sleep(1)
}
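
The loop above keeps only the last response. A minimal sketch that collects every response in a list (the results object is my addition, not part of the course code):

results <- list()
for(url in urls){
    # Store each response, keyed by its URL
    results[[url]] <- GET(url)
    # Delay for 1 second between requests
    Sys.sleep(1)
}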

A function tying it all together:

get_pageviews <- function(article_title){
  url <- paste(
    paste0("https://wikimedia.org/api/rest_v1/metrics/pageviews/per-article/",
           "en.wikipedia/all-access/all-agents"), 
    article_title, 
    "daily/2015100100/2015103100", 
    sep = "/"
  )   
  response <- GET(url, user_agent("my@email.com this is a test")) 
  # Is there an HTTP error?
  if(http_error(response)){ 
    # Throw an R error
    stop("the request failed") 
  }
  # Return the response's content
  content(response)
}
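
A hypothetical call (output not shown here; the function stops with an error if the request fails):

# Fetch October 2015 daily pageviews for one article
hadley_views <- get_pageviews("Hadley_Wickham")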

Handling JSON and XML

toJSON() and fromJSON() from {jsonlite}:

library(jsonlite)

# Stringify some data
jsoncars <- toJSON(mtcars[1:5,], pretty=TRUE)
jsoncars
## [
##   {
##     "mpg": 21,
##     "cyl": 6,
##     "disp": 160,
##     "hp": 110,
##     "drat": 3.9,
##     "wt": 2.62,
##     "qsec": 16.46,
##     "vs": 0,
##     "am": 1,
##     "gear": 4,
##     "carb": 4,
##     "_row": "Mazda RX4"
##   },
##   {
##     "mpg": 21,
##     "cyl": 6,
##     "disp": 160,
##     "hp": 110,
##     "drat": 3.9,
##     "wt": 2.875,
##     "qsec": 17.02,
##     "vs": 0,
##     "am": 1,
##     "gear": 4,
##     "carb": 4,
##     "_row": "Mazda RX4 Wag"
##   },
##   {
##     "mpg": 22.8,
##     "cyl": 4,
##     "disp": 108,
##     "hp": 93,
##     "drat": 3.85,
##     "wt": 2.32,
##     "qsec": 18.61,
##     "vs": 1,
##     "am": 1,
##     "gear": 4,
##     "carb": 1,
##     "_row": "Datsun 710"
##   },
##   {
##     "mpg": 21.4,
##     "cyl": 6,
##     "disp": 258,
##     "hp": 110,
##     "drat": 3.08,
##     "wt": 3.215,
##     "qsec": 19.44,
##     "vs": 1,
##     "am": 0,
##     "gear": 3,
##     "carb": 1,
##     "_row": "Hornet 4 Drive"
##   },
##   {
##     "mpg": 18.7,
##     "cyl": 8,
##     "disp": 360,
##     "hp": 175,
##     "drat": 3.15,
##     "wt": 3.44,
##     "qsec": 17.02,
##     "vs": 0,
##     "am": 0,
##     "gear": 3,
##     "carb": 2,
##     "_row": "Hornet Sportabout"
##   }
## ]
# Parse it back
fromJSON(jsoncars)
##                    mpg cyl disp  hp drat    wt  qsec vs am gear carb
## Mazda RX4         21.0   6  160 110 3.90 2.620 16.46  0  1    4    4
## Mazda RX4 Wag     21.0   6  160 110 3.90 2.875 17.02  0  1    4    4
## Datsun 710        22.8   4  108  93 3.85 2.320 18.61  1  1    4    1
## Hornet 4 Drive    21.4   6  258 110 3.08 3.215 19.44  1  0    3    1
## Hornet Sportabout 18.7   8  360 175 3.15 3.440 17.02  0  0    3    2
# Retrieve a data frame from a JSON API
data1 <- fromJSON("https://api.github.com/users/hadley/orgs")
names(data1)
##  [1] "login"              "id"                 "node_id"           
##  [4] "url"                "repos_url"          "events_url"        
##  [7] "hooks_url"          "issues_url"         "members_url"       
## [10] "public_members_url" "avatar_url"         "description"
data1$login
##  [1] "ggobi"             "rstudio"           "rstats"           
##  [4] "ropensci"          "rjournal"          "r-dbi"            
##  [7] "RConsortium"       "tidyverse"         "r-lib"            
## [10] "rstudio-education"
# Nested data frames:
data2 <- fromJSON("https://api.github.com/users/hadley/repos")
names(data2)
##  [1] "id"                "node_id"           "name"             
##  [4] "full_name"         "private"           "owner"            
##  [7] "html_url"          "description"       "fork"             
## [10] "url"               "forks_url"         "keys_url"         
## [13] "collaborators_url" "teams_url"         "hooks_url"        
## [16] "issue_events_url"  "events_url"        "assignees_url"    
## [19] "branches_url"      "tags_url"          "blobs_url"        
## [22] "git_tags_url"      "git_refs_url"      "trees_url"        
## [25] "statuses_url"      "languages_url"     "stargazers_url"   
## [28] "contributors_url"  "subscribers_url"   "subscription_url" 
## [31] "commits_url"       "git_commits_url"   "comments_url"     
## [34] "issue_comment_url" "contents_url"      "compare_url"      
## [37] "merges_url"        "archive_url"       "downloads_url"    
## [40] "issues_url"        "pulls_url"         "milestones_url"   
## [43] "notifications_url" "labels_url"        "releases_url"     
## [46] "deployments_url"   "created_at"        "updated_at"       
## [49] "pushed_at"         "git_url"           "ssh_url"          
## [52] "clone_url"         "svn_url"           "homepage"         
## [55] "size"              "stargazers_count"  "watchers_count"   
## [58] "language"          "has_issues"        "has_projects"     
## [61] "has_downloads"     "has_wiki"          "has_pages"        
## [64] "forks_count"       "mirror_url"        "archived"         
## [67] "disabled"          "open_issues_count" "license"          
## [70] "forks"             "open_issues"       "watchers"         
## [73] "default_branch"
names(data2$owner)
##  [1] "login"               "id"                  "node_id"            
##  [4] "avatar_url"          "gravatar_id"         "url"                
##  [7] "html_url"            "followers_url"       "following_url"      
## [10] "gists_url"           "starred_url"         "subscriptions_url"  
## [13] "organizations_url"   "repos_url"           "events_url"         
## [16] "received_events_url" "type"                "site_admin"
data2$owner$login
##  [1] "hadley" "hadley" "hadley" "hadley" "hadley" "hadley" "hadley" "hadley"
##  [9] "hadley" "hadley" "hadley" "hadley" "hadley" "hadley" "hadley" "hadley"
## [17] "hadley" "hadley" "hadley" "hadley" "hadley" "hadley" "hadley" "hadley"
## [25] "hadley" "hadley" "hadley" "hadley" "hadley" "hadley"
# Flatten the data into a regular non-nested data frame
names(flatten(data2))
##  [1] "id"                        "node_id"                  
##  [3] "name"                      "full_name"                
##  [5] "private"                   "html_url"                 
##  [7] "description"               "fork"                     
##  [9] "url"                       "forks_url"                
## [11] "keys_url"                  "collaborators_url"        
## [13] "teams_url"                 "hooks_url"                
## [15] "issue_events_url"          "events_url"               
## [17] "assignees_url"             "branches_url"             
## [19] "tags_url"                  "blobs_url"                
## [21] "git_tags_url"              "git_refs_url"             
## [23] "trees_url"                 "statuses_url"             
## [25] "languages_url"             "stargazers_url"           
## [27] "contributors_url"          "subscribers_url"          
## [29] "subscription_url"          "commits_url"              
## [31] "git_commits_url"           "comments_url"             
## [33] "issue_comment_url"         "contents_url"             
## [35] "compare_url"               "merges_url"               
## [37] "archive_url"               "downloads_url"            
## [39] "issues_url"                "pulls_url"                
## [41] "milestones_url"            "notifications_url"        
## [43] "labels_url"                "releases_url"             
## [45] "deployments_url"           "created_at"               
## [47] "updated_at"                "pushed_at"                
## [49] "git_url"                   "ssh_url"                  
## [51] "clone_url"                 "svn_url"                  
## [53] "homepage"                  "size"                     
## [55] "stargazers_count"          "watchers_count"           
## [57] "language"                  "has_issues"               
## [59] "has_projects"              "has_downloads"            
## [61] "has_wiki"                  "has_pages"                
## [63] "forks_count"               "mirror_url"               
## [65] "archived"                  "disabled"                 
## [67] "open_issues_count"         "forks"                    
## [69] "open_issues"               "watchers"                 
## [71] "default_branch"            "owner.login"              
## [73] "owner.id"                  "owner.node_id"            
## [75] "owner.avatar_url"          "owner.gravatar_id"        
## [77] "owner.url"                 "owner.html_url"           
## [79] "owner.followers_url"       "owner.following_url"      
## [81] "owner.gists_url"           "owner.starred_url"        
## [83] "owner.subscriptions_url"   "owner.organizations_url"  
## [85] "owner.repos_url"           "owner.events_url"         
## [87] "owner.received_events_url" "owner.type"               
## [89] "owner.site_admin"          "license.key"              
## [91] "license.name"              "license.spdx_id"          
## [93] "license.url"               "license.node_id"
# Flatten directly (more efficient):
data3 <- fromJSON("https://api.github.com/users/hadley/repos", flatten = TRUE)
identical(data3, flatten(data2))
## [1] TRUE
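
toJSON() and fromJSON() also round-trip plain lists; a minimal sketch (the values here are made up, and auto_unbox = TRUE drops the length-1 JSON arrays):

x <- list(dataset = "chickwts", rows = 71)
toJSON(x, auto_unbox = TRUE)
fromJSON(toJSON(x, auto_unbox = TRUE))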

dplyr::bind_rows() is a handy helper for turning lists parsed from JSON into a data frame:

library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
url <- paste0("https://wikimedia.org/api/rest_v1/metrics/pageviews/",
              "per-article/en.wikipedia.org/all-access/all-agents/",
              "Hadley_Wickham/daily/20170101/20170102")

# Make a GET request to url and save the results
pageview_response <- GET(url)

# Call content() to retrieve the data the server sent back
pageview_data <- content(pageview_response)

# Examine the results with str()
str(pageview_data)
## List of 1
##  $ items:List of 2
##   ..$ :List of 7
##   .. ..$ project    : chr "en.wikipedia"
##   .. ..$ article    : chr "Hadley_Wickham"
##   .. ..$ granularity: chr "daily"
##   .. ..$ timestamp  : chr "2017010100"
##   .. ..$ access     : chr "all-access"
##   .. ..$ agent      : chr "all-agents"
##   .. ..$ views      : int 45
##   ..$ :List of 7
##   .. ..$ project    : chr "en.wikipedia"
##   .. ..$ article    : chr "Hadley_Wickham"
##   .. ..$ granularity: chr "daily"
##   .. ..$ timestamp  : chr "2017010200"
##   .. ..$ access     : chr "all-access"
##   .. ..$ agent      : chr "all-agents"
##   .. ..$ views      : int 86
pageview_data[["items"]] %>% bind_rows()
## # A tibble: 2 x 7
##   project      article        granularity timestamp  access     agent      views
##   <chr>        <chr>          <chr>       <chr>      <chr>      <chr>      <int>
## 1 en.wikipedia Hadley_Wickham daily       2017010100 all-access all-agents    45
## 2 en.wikipedia Hadley_Wickham daily       2017010200 all-access all-agents    86

Similar to jsonlite for JSON, you have xml2 for XML:

library(xml2)
cd <- read_xml(xml2_example("cd_catalog.xml"))
class(cd)
## [1] "xml_document" "xml_node"
xml_structure(xml_child(cd, 1))
## <CD>
##   <TITLE>
##     {text}
##   <ARTIST>
##     {text}
##   <COUNTRY>
##     {text}
##   <COMPANY>
##     {text}
##   <PRICE>
##     {text}
##   <YEAR>
##     {text}
# Working with XPaths
xml_find_all(cd, xpath = '/CATALOG/CD/ARTIST')
## {xml_nodeset (26)}
##  [1] <ARTIST>Bob Dylan</ARTIST>
##  [2] <ARTIST>Bonnie Tylor</ARTIST>
##  [3] <ARTIST>Dolly Parton</ARTIST>
##  [4] <ARTIST>Gary More</ARTIST>
##  [5] <ARTIST>Eros Ramazzotti</ARTIST>
##  [6] <ARTIST>Bee Gees</ARTIST>
##  [7] <ARTIST>Dr.Hook</ARTIST>
##  [8] <ARTIST>Rod Stewart</ARTIST>
##  [9] <ARTIST>Andrea Bocelli</ARTIST>
## [10] <ARTIST>Percy Sledge</ARTIST>
## [11] <ARTIST>Savage Rose</ARTIST>
## [12] <ARTIST>Many</ARTIST>
## [13] <ARTIST>Kenny Rogers</ARTIST>
## [14] <ARTIST>Will Smith</ARTIST>
## [15] <ARTIST>Van Morrison</ARTIST>
## [16] <ARTIST>Jorn Hoel</ARTIST>
## [17] <ARTIST>Cat Stevens</ARTIST>
## [18] <ARTIST>Sam Brown</ARTIST>
## [19] <ARTIST>T`Pau</ARTIST>
## [20] <ARTIST>Tina Turner</ARTIST>
## ...
# Create a data frame from the XPath results
cds <- xml_find_all(cd, xpath = '/CATALOG/CD')
df <- data.frame(title = 
                   xml_text(xml_find_all(cd, xpath = '/CATALOG/CD/TITLE')),
                 artist = 
                   xml_text(xml_find_all(cd, xpath = '/CATALOG/CD/ARTIST')),
                 country =
                   xml_text(xml_find_all(cd, xpath = '/CATALOG/CD/COUNTRY')),
                 company =
                   xml_text(xml_find_all(cd, xpath = '/CATALOG/CD/COMPANY')),
                 price =
                   xml_double(xml_find_all(cd, xpath = '/CATALOG/CD/PRICE')),
                 year =
                   xml_integer(xml_find_all(cd, xpath = '/CATALOG/CD/YEAR')))
df
##                       title            artist country        company price year
## 1          Empire Burlesque         Bob Dylan     USA       Columbia  10.9 1985
## 2           Hide your heart      Bonnie Tylor      UK    CBS Records   9.9 1988
## 3             Greatest Hits      Dolly Parton     USA            RCA   9.9 1982
## 4       Still got the blues         Gary More      UK Virgin redords  10.2 1990
## 5                      Eros   Eros Ramazzotti      EU            BMG   9.9 1997
## 6            One night only          Bee Gees      UK        Polydor  10.9 1998
## 7            Sylvias Mother           Dr.Hook      UK            CBS   8.1 1973
## 8                Maggie May       Rod Stewart      UK       Pickwick   8.5 1990
## 9                   Romanza    Andrea Bocelli      EU        Polydor  10.8 1996
## 10 When a man loves a woman      Percy Sledge     USA       Atlantic   8.7 1987
## 11              Black angel       Savage Rose      EU           Mega  10.9 1995
## 12     1999 Grammy Nominees              Many     USA         Grammy  10.2 1999
## 13       For the good times      Kenny Rogers      UK   Mucik Master   8.7 1995
## 14         Big Willie style        Will Smith     USA       Columbia   9.9 1997
## 15             Tupelo Honey      Van Morrison      UK        Polydor   8.2 1971
## 16               Soulsville         Jorn Hoel  Norway            WEA   7.9 1996
## 17         The very best of       Cat Stevens      UK         Island   8.9 1990
## 18                     Stop         Sam Brown      UK        A and M   8.9 1988
## 19          Bridge of Spies             T`Pau      UK          Siren   7.9 1987
## 20           Private Dancer       Tina Turner      UK        Capitol   8.9 1983
## 21           Midt om natten        Kim Larsen      EU         Medley   7.8 1983
## 22   Pavarotti Gala Concert Luciano Pavarotti      UK          DECCA   9.9 1991
## 23      The dock of the bay      Otis Redding     USA       Atlantic   7.9 1987
## 24             Picture book        Simply Red      EU        Elektra   7.2 1985
## 25                      Red    The Communards      UK         London   7.8 1987
## 26         Unchain my heart        Joe Cocker     USA            EMI   8.2 1987
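
Relative XPaths evaluated per node are handy when some children might be missing; a sketch using xml_find_first() on the cds nodeset (it returns one result per CD, with NA for absent nodes):

# One TITLE per CD node, in the same order as cds
titles <- xml_text(xml_find_first(cds, xpath = './TITLE'))
head(titles)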

Web scraping with XPATHs

Use the rvest package to extract data from HTML web pages.

library(rvest)

# Hadley Wickham's Wikipedia page
test_url <- "https://en.wikipedia.org/wiki/Hadley_Wickham"

# Read the URL stored as "test_url" with read_html()
test_xml <- read_html(test_url)
test_xml
## {html_document}
## <html class="client-nojs" lang="en" dir="ltr">
## [1] <head>\n<meta http-equiv="Content-Type" content="text/html; charset=UTF-8 ...
## [2] <body class="mediawiki ltr sitedir-ltr mw-hide-empty-elt ns-0 ns-subject  ...
# XPath that looks for a class called vcard
xpath <- paste0("//*[contains(concat( \" \", @class, \" \" ),",
                " concat( \" \", \"vcard\", \" \" ))]")

# Use html_node() to grab the node with the XPATH
node <- html_node(x = test_xml, xpath = xpath)
node
## {html_node}
## <table class="infobox biography vcard" style="width:22em">
## [1] <tbody>\n<tr><th colspan="2" style="text-align:center;font-size:125%;font ...
# Look directly for the class fn with the css argument
page_name <- html_node(x = node, css = '.fn')
page_name
## {html_node}
## <div class="fn" style="display:inline">
# Extract the text from page_name
page_title <- html_text(page_name)
page_title
## [1] "Hadley Wickham"

Use rvest::html_table() to convert web page tables to data frames:

wiki_table <- html_table(node)
colnames(wiki_table) <- c("key", "value")
cleaned_table <- subset(wiki_table, !key == '')
str(cleaned_table)
## 'data.frame':    8 obs. of  2 variables:
##  $ key  : chr  "Born" "Alma mater" "Known for" "Awards" ...
##  $ value: chr  "(1979-10-14) 14 October 1979 (age 40)Hamilton, New Zealand" "Iowa State University, University of Auckland" "R programming language packages" "John Chambers Award (2006)\nFellow of the American Statistical Association (2015)" ...

More examples with the css argument

# Select the table elements
html_nodes(test_xml, css = 'table')
## {xml_nodeset (2)}
## [1] <table class="infobox biography vcard" style="width:22em"><tbody>\n<tr><t ...
## [2] <table class="nowraplinks hlist navbox-inner" style="border-spacing:0;bac ...
# Select elements with class = "infobox"
html_nodes(test_xml, css = '.infobox')
## {xml_nodeset (1)}
## [1] <table class="infobox biography vcard" style="width:22em"><tbody>\n<tr><t ...
# Select elements with id = "firstHeading"
html_nodes(test_xml, css = '#firstHeading')
## {xml_nodeset (1)}
## [1] <h1 id="firstHeading" class="firstHeading" lang="en">Hadley Wickham</h1>
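
CSS selectors can also be combined, for instance a tag name plus a class (a quick sketch):

# Select only <table> elements that carry the "infobox" class
html_nodes(test_xml, css = 'table.infobox')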

Wrapping everything up in a usable function that extracts the infobox from Wikipedia pages:

library(httr)
library(rvest)
library(xml2)

get_infobox <- function(title){
  base_url <- "https://en.wikipedia.org/w/api.php"
  
  # Change "Hadley Wickham" to title
  query_params <- list(action = "parse", 
    page = title, 
    format = "xml")
  
  resp <- GET(url = base_url, query = query_params)
  resp_xml <- content(resp)
  
  page_html <- read_html(xml_text(resp_xml))
  infobox_element <- html_node(x = page_html, css =".infobox")
  page_name <- html_node(x = infobox_element, css = ".fn")
  page_title <- html_text(page_name)
  
  wiki_table <- html_table(infobox_element)
  colnames(wiki_table) <- c("key", "value")
  cleaned_table <- subset(wiki_table, !wiki_table$key == "")
  name_df <- data.frame(key = "Full name", value = page_title)
  wiki_table <- rbind(name_df, cleaned_table)
  
  wiki_table
}
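
A hypothetical call (the function fetches the live page, so the exact result will vary over time):

# Get the infobox for Hadley Wickham's page
get_infobox(title = "Hadley Wickham")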