Webscraping Art Auction Data

February 19, 2015
By

(This article was first published on NYC Data Science Academy » R, and kindly contributed to R-bloggers)

Check out my:

Introduction

Goal

Scrape all of the data from the Blouin Art Sales Index: http://artsalesindex.artinfo.com

Method

Step 1 — Gather URLs for all artists from search directory

Step 2 — Gather URLs for all pieces from the artist bio pages

Step 3 — Scrape data for each individual piece

Step 1

#-----------Packages--------------------------------------------------------

library(dplyr)
library(rvest)
library(httr)

#************************************     STEP 1    *****************************************

#------------Definitions-----------------------------------------------------

# Search-directory endpoint plus the two query-string fragments that are
# placed before the last-name initial and before the paging offset.
url <- "http://artsalesindex.artinfo.com/asi/search/artistLanding.ai"
path1 <- "?lastName="
path2 <- "&startRowNum="


#----------Number of increments for each letter------------------------------

# One row per initial. `number` is the largest startRowNum offset requested
# for that letter; offsets advance in steps of 150 (see the seq() call below).
alpha_length <- data.frame(
  letters = LETTERS,
  number = c(9450, 25950, 18600, 12600, 4650, 10200, 15450, 16350, 1500, 5400,
             10800, 15600, 24000, 5550, 3450, 14700, 750, 13200, 25200, 9300,
             900, 6600, 11550, 750, 1950, 3750),
  stringsAsFactors = FALSE
)


#-----------Generate all of the A-Z webpages that will be scraped---------------

# Build every search URL letter by letter instead of growing `pages` with
# append() inside a nested loop. The column is referenced by its full name
# (`letters`) rather than relying on `$` partial matching.
pages <- unlist(
  Map(
    function(letter, last_offset) {
      offsets <- seq(from = 0, to = last_offset, by = 150)
      # "%.0f" keeps the offset in plain digits even if it ever grows large
      # enough for as.character() to switch to scientific notation.
      paste0(url, path1, letter, path2, sprintf("%.0f", offsets))
    },
    alpha_length$letters,
    alpha_length$number
  ),
  use.names = FALSE
)


#------------------Scrape the Artists' Names--------------------------------

# Extract the href attribute of every link found under ".artist-list li a"
# in a parsed search-results page `x`. Returns a character vector of hrefs.
parse.function <- function(x) {
  artist_links <- html_nodes(x, ".artist-list li a")
  html_attr(artist_links, name = "href")
}

# Scrape through each page and collect the artist hrefs it lists.
# Results are stored per page in a preallocated list and flattened once at
# the end, instead of re-copying `names` on every append().
n_pages <- length(pages)
scraped_links <- vector("list", n_pages)
# seq_along() makes the loop a no-op when `pages` is empty (1:0 would run
# twice). Narrow the range (e.g. seq_len(5)) for a test run.
for (i in seq_along(pages)) {
    print(c(i, "out of", n_pages))
    Sys.sleep(.5)  # wait half a second before each request
    # read_html() replaces html(), which rvest deprecated and later removed.
    art_site <- rvest::read_html(pages[i])
    scraped_links[[i]] <- parse.function(art_site)
}
# NULL when nothing was scraped, matching the original `names <- c()` start.
names <- unlist(scraped_links, use.names = FALSE)


#Text Manipulation
# Percent-encode plain spaces so each href can be appended to the host name.
names <- gsub(" ", "%20", names, fixed = TRUE)
# NOTE(review): the published listing has an empty pattern on this line, which
# gsub(fixed = TRUE) rejects as a zero-length pattern. Presumably the original
# character was a non-breaking space lost in publication -- TODO confirm
# against the raw hrefs.
names <- gsub("\u00a0", "%20", names, fixed = TRUE)
# This one href has a slash inside the artist name; replace it with an
# encoded space.
names <- gsub("_Asgar/Gabriel", "_Asgar%20Gabriel", names, fixed = TRUE)
# A bare "?" is a regex quantifier and is rejected as a pattern, so match it
# literally.
# NOTE(review): the "?" may itself stand in for a character that did not
# survive encoding -- TODO confirm.
names <- gsub("?", "", names, fixed = TRUE)
# Later code reads the hrefs as `names$.`, so the single column must be
# called ".". Built directly rather than piping into as.data.frame(names),
# which passed the old vector as `row.names`.
names <- setNames(data.frame(names, stringsAsFactors = FALSE), ".")
row.names(names) <- seq_len(nrow(names))
# Host name that the relative artist hrefs are appended to.
url <- "http://artsalesindex.artinfo.com"

#Art Profile pages to be scraped
# Vectorised: one paste0() call instead of growing `pages` row by row.
# as.character() copes with the href column being a factor (R < 4.0 default).
artist_paths <- as.character(names$.)
# paste0(url, character(0)) would return `url` itself, so guard the empty case.
pages <- if (length(artist_paths) > 0) {
    paste0(url, artist_paths)
} else {
    character(0)
}

Step 2

###********************************       STEP 2         *************************************

#------------------Scrape the Artists' Profile Pages-----------------------
# Extract the href attribute of every link found under ".results-title a"
# in a parsed artist page `x`. Returns a character vector of hrefs.
parsefunction <- function(x) {
  piece_links <- rvest::html_nodes(x, ".results-title a")
  rvest::html_attr(piece_links, name = "href")
}

# Visit every artist page and collect the hrefs of the pieces it lists.
# system.time() reports how long the whole crawl took.
art.pieces <- character()
system.time({
    n_pages <- length(pages)
    # Store each page's links in a preallocated list and flatten once at the
    # end, instead of re-copying `art.pieces` on every append().
    piece_links <- vector("list", n_pages)
    # seq_along() skips the loop entirely when `pages` is empty.
    # Narrow the range (e.g. seq_len(5)) for a test run.
    for (i in seq_along(pages)) {
        print(c(i, "out of", n_pages))
        Sys.sleep(1)  # wait one second before each request
        # read_html() replaces html(), which rvest deprecated and later removed.
        art_site_bio <- read_html(pages[i])
        piece_links[[i]] <- parsefunction(art_site_bio)
    }
    # as.character() keeps the result a character vector even if no links
    # were found (unlist() of an empty list is NULL).
    art.pieces <- as.character(unlist(piece_links, use.names = FALSE))
})
#---------------------------------------------------------------


#String final URL for piece pages
# Host name that the relative piece hrefs are appended to (the original
# assigned this same value twice in a row).
url <- "http://artsalesindex.artinfo.com"
art.pieces <- as.data.frame(art.pieces)
# Vectorised: one paste0() call instead of growing `pages` row by row.
# as.character() copes with the href column being a factor (R < 4.0 default).
piece_paths <- as.character(art.pieces$art.pieces)
# paste0(url, character(0)) would return `url` itself, so guard the empty case.
pages <- if (length(piece_paths) > 0) {
    paste0(url, piece_paths)
} else {
    character(0)
}

Step 3

##*************************************     STEP 3    ****************************************

# Empty character vectors, one per scraped field; the Step 3 loop fills
# element i of each vector for page i.
artist             <- character()
artist.nationality <- character()
title              <- character()
year               <- character()
lot.number         <- character()
auction.data       <- character()
price              <- character()
lot.details        <- character()
materials          <- character()
measurements       <- character()
description        <- character()
markings           <- character()
image.link         <- character()

# Return the `index`-th element of a scraped character vector, or the string
# "NA" (the missing-value marker this script uses) when the page yielded
# fewer than `index` matches. Taking one element explicitly also avoids the
# "number of items to replace" warning when a selector matches several nodes.
scraped_value <- function(values, index = 1L) {
    if (length(values) < index) "NA" else values[[index]]
}

# Visit every piece page and record one value per field at position i.
# seq_along() skips the loop entirely when `pages` is empty.
# Narrow the range (e.g. seq_len(5)) for a test run.
for (i in seq_along(pages)) {
    print(c(i, "out of", length(pages)))
    Sys.sleep(1)  # wait one second before each request
    # read_html() replaces html(), which rvest deprecated and later removed.
    piece_html <- read_html(pages[i])
    #-----------------Artist
    artist[i] <- piece_html %>%
        html_nodes("#artistName") %>%
        html_attr(name = "value") %>%
        scraped_value()
    #-----------------Artist Nationality
    artist.nationality[i] <- piece_html %>%
        html_nodes(".artist-nationality") %>%
        html_text() %>%
        scraped_value()
    #-----------------Title
    title[i] <- piece_html %>%
        html_nodes(".title") %>%
        html_text() %>%
        scraped_value()
    #-----------------Year
    year[i] <- piece_html %>%
        html_nodes("#artworkIndex_0 p:nth-child(3)") %>%
        html_text() %>%
        scraped_value()
    #-----------------Lot Number
    lot.number[i] <- piece_html %>%
        html_nodes(".lotnumber") %>%
        html_text() %>%
        scraped_value()
    #-----------------Auction Data
    auction.data[i] <- piece_html %>%
        html_nodes(".auctiondata") %>%
        html_text() %>%
        scraped_value()
    #-----------------Price
    price[i] <- piece_html %>%
        html_nodes(".price") %>%
        html_text() %>%
        scraped_value()
    #-----------------Lot Details
    # Fix: the original scraped ".lot-details1" into a temporary and then read
    # element 2 of the output vector `lot.details` instead. Presumably the
    # second scraped node was intended -- TODO confirm against a live page.
    lot.details[i] <- piece_html %>%
        html_nodes(".lot-details1") %>%
        html_text() %>%
        scraped_value(2L)
    #-----------------Details  -------------------------------###
    # The ".artworkdetails" nodes are read positionally: 1 = materials,
    # 2 = measurements, 3 = description, 4 = markings. Positions the page
    # does not have now get the "NA" marker; the original length check could
    # never fire because indexing past the end still returns one element.
    details <- piece_html %>% html_nodes(".artworkdetails") %>% html_text()
    #-----------------Materials
    materials[i] <- scraped_value(details, 1L)
    #-----------------Measurements
    measurements[i] <- scraped_value(details, 2L)
    #-----------------Description
    description[i] <- scraped_value(details, 3L)
    #-----------------Markings
    markings[i] <- scraped_value(details, 4L)
    #-----------------Image Link
    image.link[i] <- piece_html %>%
        html_nodes("#imageIndex_0 img") %>%
        html_attr(name = "src") %>%
        scraped_value()
}

# Assemble the per-piece vectors into one table, one row per scraped page.
# stringsAsFactors = FALSE keeps the scraped text as character columns on
# R < 4.0 as well, where data.frame() would otherwise convert them to factors.
final <- data.frame(artist = artist,
                    artist.nationality = artist.nationality,
                    title = title,
                    year = year,
                    lot.number = lot.number,
                    auction.data = auction.data,
                    price = price,
                    lot.details = lot.details,
                    materials = materials,
                    measurements = measurements,
                    description = description,
                    markings = markings,
                    image.link = image.link,
                    stringsAsFactors = FALSE)

To leave a comment for the author, please follow the link and comment on their blog: NYC Data Science Academy » R.

R-bloggers.com offers daily e-mail updates about R news and tutorials on topics such as: Data science, Big Data, R jobs, visualization (ggplot2, Boxplots, maps, animation), programming (RStudio, Sweave, LaTeX, SQL, Eclipse, git, hadoop, Web Scraping), statistics (regression, PCA, time series, trading) and more...



If you got this far, why not subscribe for updates from the site? Choose your flavor: e-mail, twitter, RSS, or facebook...

Comments are closed.

Search R-bloggers


Recent popular posts

Most visited articles of the week

  1. How to write the first for loop in R
  2. A perfect RStudio layout
  3. ML models: What they can’t learn?
  4. The myth that AI or Cognitive Analytics will replace data scientists: There is no easy button
  5. Installing R packages
  6. Finalfit, knitr and R Markdown for quick results
  7. How to Make a Histogram with Basic R
  8. How to perform a Logistic Regression in R
  9. Using apply, sapply, lapply in R

Sponsors

RSS Jobs for R users

Never miss an update!
Subscribe to R-bloggers to receive
e-mails with the latest R posts.
(You will not see this message again.)

Click here to close (This popup will not appear again)