Web Scraping

Trying to replicate the examples from the rvest package.
  1. Package PDF.
  2. GitHub
  3. Last updated: 2014-27-01
library(rvest)
library(rvest)
# Parse the IMDb page for title tt1490017; later snippets reuse `lego_movie`.
lego_movie <- html("http://www.imdb.com/title/tt1490017/")

# Rating: text of the nodes matching "strong span", coerced to numeric.
rating <- as.numeric(html_text(html_nodes(lego_movie, "strong span")))
rating
## [1] 7.8
# Cast: text of every node matching "#titleCast .itemprop span".
cast <- html_text(html_nodes(lego_movie, "#titleCast .itemprop span"))
cast
##  [1] "Will Arnett"     "Elizabeth Banks" "Craig Berry"    
##  [4] "Alison Brie"     "David Burrows"   "Anthony Daniels"
##  [7] "Charlie Day"     "Amanda Farinos"  "Keith Ferguson" 
## [10] "Will Ferrell"    "Will Forte"      "Dave Franco"    
## [13] "Morgan Freeman"  "Todd Hansen"     "Jonah Hill"
# Poster: the `src` attribute of the nodes matching "#img_primary img".
poster <- html_attr(html_nodes(lego_movie, "#img_primary img"), "src")

library(rvest)
library(httr)
library(jpeg)

# Parse the same IMDb page again and re-extract the poster `src` attribute.
lego_movie <- html("http://www.imdb.com/title/tt1490017/")

poster <- html_attr(html_nodes(lego_movie, "#img_primary img"), "src")

Replicating a particular RPubs post: https://rpubs.com/aammd/kivascrape

library(dplyr)
library(tidyr)
library(magrittr)
library(rvest)


# Parsed Kiva lend page used as the worked example for the kiva_* helpers.
site <- "http://www.kiva.org/lend/774331" %>% html()


# Return the text of the nodes matching "#pageHeader h2" in `.site`.
#
# .site: a parsed page, passed as the first argument to html_nodes().
kiva_name <- function(.site) {
  heading_nodes <- html_nodes(.site, "#pageHeader h2")
  html_text(heading_nodes)
}

kiva_name(site)
## [1] "Resineros De San José De Cañas Group"
# Return the text of the nodes matching "#pageHeader .country" in `.site`.
#
# .site: a parsed page, passed as the first argument to html_nodes().
kiva_place <- function(.site) {
  country_nodes <- html_nodes(.site, "#pageHeader .country")
  html_text(country_nodes)
}

kiva_place(site)
## [1] "San José de Cañas, Mexico"
# Extract a numeric amount from the ".loanExcerpt" text of `.site`.
#
# Every run of characters other than digits and "." is stripped first; any
# trailing and then leading "." runs are removed before the conversion to
# numeric.
#
# .site: a parsed page, passed as the first argument to html_nodes().
kiva_amt <- function(.site) {
  excerpt <- html_text(html_nodes(.site, ".loanExcerpt"))
  digits_and_dots <- gsub("[^0-9.]+", "", excerpt)
  no_trailing_dots <- gsub("\\.*$", "", digits_and_dots)
  no_leading_dots <- gsub("^\\.*", "", no_trailing_dots)
  as.numeric(no_leading_dots)
}

kiva_amt(site)
## [1] 29050
# Extract a numeric value from the "#loanSummary .number" text of `.site`.
#
# Every run of characters other than digits and "." is stripped before the
# conversion to numeric.
#
# .site: a parsed page, passed as the first argument to html_nodes().
kiva_percent <- function(.site) {
  number_text <- html_text(html_nodes(.site, "#loanSummary .number"))
  as.numeric(gsub("[^0-9.]+", "", number_text))
}

kiva_percent(site)
## [1] 0
# Report whether `.site` yields any ".fullyFundedNotice" text.
#
# Returns FALSE when the extracted text is exactly character(0) (no matching
# nodes) and TRUE otherwise.
#
# .site: a parsed page, passed as the first argument to html_nodes().
kiva_funded <- function(.site) {
  notice_text <- html_text(html_nodes(.site, ".fullyFundedNotice"))
  !identical(notice_text, character(0))
}

kiva_funded(site)
## [1] TRUE
# Select the <dl> definition list(s) under "#loanSummary".
# `site` is already a parsed document (it is the result of html() on the
# Kiva URL), so it is passed to html_nodes() directly, exactly as the kiva_*
# helpers do, rather than being run through html() a second time.
loansum <- site %>%
  html_nodes("#loanSummary dl")


# Raw text of the whole definition list (terms and definitions interleaved).
loansum %>%
  html_text
## [1] "Repayment Term:\n\t\t\t\t\t\t120 months (more info)\n\t\n\t\t\t\t\t\tRepayment Schedule:\n\t\t\t\t\t\tIrregularly\n\t\n\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\tPre-Disbursed:\n\t\t\t\t\t\t\t\t\t\t\t\t\t\t\tAug 25, 2014\n\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\t\t\t\t\t\tListed\n\t\t\t\t\t\t\tOct 21, 2014\n\t\t\t\t\t\t\t\n\t\t\t\t\t\t\tCurrency Exchange Loss:\n\t\t\t\t\t\tN/A \n\t\n\t\t\t\t\t\t\t\n\t\t\t\t\t"
# Terms: text of the <dt> nodes inside the loan summary list.
html_text(html_nodes(loansum, "dt"))
## [1] "Repayment Term:"         "Repayment Schedule:"    
## [3] "Pre-Disbursed:"          "Listed"                 
## [5] "Currency Exchange Loss:"
# Definitions: text of the <dd> nodes inside the loan summary list.
html_text(html_nodes(loansum, "dd"))
## [1] "120 months (more info)" "Irregularly"           
## [3] "Aug 25, 2014"           "Oct 21, 2014"          
## [5] "N/A "
# Convert the "#loanSummary" definition list of a parsed page into a one-row
# data frame: one column per <dt> term, holding the text of the matching <dd>.
#
# .site: a parsed page, passed as the first argument to html_nodes().
# Returns: data.frame(t(defs)), i.e. a single row whose column names are
#   derived from the <dt> texts.
#
# rvest functions are namespace-qualified so the function neither depends on
# nor silently skips package attachment (require() returns FALSE instead of
# failing when a package is missing).
deflist_to_df <- function(.site) {
  deflist_xml <- rvest::html_nodes(.site, "#loanSummary dl")

  terms <- rvest::html_text(rvest::html_nodes(deflist_xml, "dt"))

  # Read the definitions from this page's own list. The previous version read
  # them from the global `loansum`, so every page was given the <dd> values of
  # whichever page `loansum` had been built from.
  defs <- rvest::html_text(rvest::html_nodes(deflist_xml, "dd"))

  names(defs) <- terms

  data.frame(t(defs))
}

deflist_to_df(site)
##          Repayment.Term. Repayment.Schedule. Pre.Disbursed.       Listed
## 1 120 months (more info)         Irregularly   Aug 25, 2014 Oct 21, 2014
##   Currency.Exchange.Loss.
## 1                    N/A
# Two explicit ids; not referenced again in the code shown here.
numvec2 <- c(786671,785489)

# Fix the RNG seed so the sampled ids below are reproducible.
set.seed(5)
# Ten values drawn from 5000:7914, each offset by 780000.
numvec <- sample(5000:7914, size = 10)+780000

# Build one lend URL per sampled id, then, per URL, try to parse the page with
# html() and store the result in a `site` column.
# The parser is wrapped in failwith(NULL, ...); presumably a page that fails
# to load yields NULL instead of aborting the whole run. Confirm against the
# installed dplyr version.
download <- data.frame(startnum = numvec) %>%
  mutate(url = paste0("http://www.kiva.org/lend/", startnum)) %>%
  group_by(url) %>%
  do(site = failwith(NULL, html)(.$url))

# Probe each downloaded page by running kiva_name() on it inside try(), then
# keep only the rows whose probe result does not contain the text "Error".
# NOTE(review): the filter matches the literal string "Error", so a page whose
# extracted name happens to contain that text would be dropped too. Confirm
# this is acceptable.
clean_download <- download %>%
  mutate(test = try(kiva_name(site))) %>%
  filter(!grepl("Error", x = test))
# Per url: scrape name, funded flag, percent, amount and place from the stored
# page (first element of the `site` list column), append the columns returned
# by deflist_to_df() for that page, and finally drop the `site` column.
output <- clean_download %>%
  group_by(url) %>% 
  mutate(name = kiva_name(site[[1]]),
         funded = kiva_funded(site[[1]]),
         percent = kiva_percent(site[[1]]),
         amount = kiva_amt(site[[1]]),
         place = kiva_place(site[[1]])) %>%
  #separate(place, c("city", "country"), sep = ", ") %>%
  do(data.frame(., deflist_to_df(.[["site"]][[1]]))) %>%
  select(-site)
library(knitr)
# Render the first four columns of `output` as a table.
output[1:4] %>%
  as.data.frame() %>%
  kable()
url test name funded
http://www.kiva.org/lend/785304 Manjurani Manjurani TRUE
http://www.kiva.org/lend/785320 Janet Janet TRUE
http://www.kiva.org/lend/785583 San Valentin Group San Valentin Group TRUE
http://www.kiva.org/lend/785828 Djiguiya Group Djiguiya Group TRUE
http://www.kiva.org/lend/786535 Hanifan Hanifan TRUE
http://www.kiva.org/lend/786996 Anonymous Anonymous FALSE
http://www.kiva.org/lend/787040 Savoeun's Group Savoeun's Group TRUE
http://www.kiva.org/lend/787349 Goutami Goutami TRUE
http://www.kiva.org/lend/787670 Marjhory Rosa Derita Marjhory Rosa Derita TRUE
http://www.kiva.org/lend/787780 Zenie Zenie TRUE
h