Webscraping Art Auction Data

[This article was first published on NYC Data Science Academy » R, and kindly contributed to R-bloggers]. (You can report issues about the content on this page here.)
Want to share your content on R-bloggers? click here if you have a blog, or here if you don't.

Check out my:

Introduction

Goal

Scrape all of the data off of the Blouin Art Sales Index: http://artsalesindex.artinfo.com

Method

Step 1 — Gather URLs for all artists from search directory

Step 2 — Gather URLs for all pieces from Artist Bio Pages

Step 3 — Scrape data for each individual piece

Step 1

#-----------Packages--------------------------------------------------------

library(dplyr)
library(rvest)
library(httr)

#************************************     STEP 1    *****************************************

#------------Definitions-----------------------------------------------------

# Base search URL and the two query-string fragments that are glued around
# the last-name initial and the starting row number.
url <- "http://artsalesindex.artinfo.com/asi/search/artistLanding.ai"
path1 <- "?lastName="
path2 <- "&startRowNum="


#----------Number of increments for each letter------------------------------ 

# One row per initial: `number` is the largest startRowNum requested for that
# letter. Built in a single call so the column names are fixed up front.
alpha_length <- data.frame(
    letters = LETTERS,
    number = c(9450, 25950, 18600, 12600, 4650, 10200, 15450, 16350, 1500,
               5400, 10800, 15600, 24000, 5550, 3450, 14700, 750, 13200,
               25200, 9300, 900, 6600, 11550, 750, 1950, 3750),
    stringsAsFactors = FALSE
)


#-----------Generate all of the A-Z webpages that will be scraped---------------

# Directory pages are requested in steps of 150 rows, from 0 up to and
# including each letter's `number`.
row_offsets <- lapply(alpha_length$number,
                      function(last_row) seq(from = 0, to = last_row, by = 150))

# Repeat each letter once per offset and build every URL in one vectorised
# call. The full column name `letters` is used; the original `$letter` only
# worked through partial matching.
pages <- paste0(url, path1,
                rep(alpha_length$letters, times = lengths(row_offsets)),
                path2,
                as.character(unlist(row_offsets)))


#------------------Scrape the Artists' Names--------------------------------

#Prep our definition
# Given a parsed directory page, select every link inside the artist list
# and return the value of each link's href attribute.
parse.function <- function(x) {
    artist_links <- html_nodes(x, ".artist-list li a")
    html_attr(artist_links, name = "href")
}

# Scrape through each directory page and collect the artist hrefs.
# Results are stored per page and flattened once at the end, instead of
# re-copying a growing vector on every iteration.
hrefs_by_page <- vector("list", length(pages))
for (i in seq_along(pages)) {  #---- seq_along(pages)   <><><><><><><><><><> Change for test
    print(c(i, "out of", length(pages)))
    Sys.sleep(.5)  # pause between requests
    # read_html() is the supported replacement for the deprecated html()
    art_site <- rvest::read_html(pages[i])
    hrefs_by_page[[i]] <- parse.function(art_site)
}
# unlist() of an empty list is NULL, matching the original `c()` start value
# when there are no pages to scrape.
names <- unlist(hrefs_by_page, use.names = FALSE)


#Text Manipulation
# Clean the scraped hrefs so they can be appended to the site root as URLs.
# All substitutions use fixed = TRUE because every pattern is a literal string.
names <- gsub(" ", "%20", names, fixed = TRUE)
# NOTE(review): the published script had a second substitution here whose
# pattern was an empty string, which cannot match a real character. Presumably
# a non-ASCII whitespace character (e.g. a non-breaking space) was lost when
# the post was published -- confirm against the raw hrefs.
names <- gsub("\u00a0", "%20", names, fixed = TRUE)
names <- gsub("_Asgar/Gabriel", "_Asgar%20Gabriel", names, fixed = TRUE)
# Without fixed = TRUE the pattern "?" is a regex quantifier, not a literal
# question mark, so it did not strip the character it names.
names <- gsub("?", "", names, fixed = TRUE)

# Keep `names` as a one-column data frame whose column is called ".", the
# shape the original pipe into as.data.frame() produced. That pipe also passed
# the vector a second time as the row.names argument; building the frame
# directly gives the default 1..n row names instead.
names <- data.frame(names, stringsAsFactors = FALSE)
colnames(names) <- "."
url <- "http://artsalesindex.artinfo.com"

#Art Profile pages to be scraped
# Vectorised concatenation replaces the row-by-row append loop (and its
# per-row progress print). The guard matters because paste0() recycles a
# zero-length argument to "" and would otherwise return the bare site root.
pages <- if (nrow(names) > 0) paste0(url, names[["."]]) else character(0)

Step 2

###********************************       STEP 2         *************************************

#------------------Scrape the Artists' Profile Pages-----------------------
# Given a parsed artist profile page, select the links inside the result
# titles and return the value of each link's href attribute.
parsefunction <- function(x) {
    piece_links <- rvest::html_nodes(x, ".results-title a")
    rvest::html_attr(piece_links, name = "href")
}

# Visit every artist profile page and collect the hrefs of the listed pieces.
# Hrefs are stored per page and flattened afterwards, so the cost no longer
# grows quadratically with the number of pieces found.
hrefs_by_artist <- vector("list", length(pages))
system.time(
for (i in seq_along(pages)) {  #----- seq_along(pages) <><><><><>Change for test
       print(c(i, "out of", length(pages)))
       Sys.sleep(1)  # pause between requests
       # read_html() is the supported replacement for the deprecated html()
       art_site_bio <- read_html(pages[i])
       hrefs_by_artist[[i]] <- parsefunction(art_site_bio)
})
# as.character() keeps the result a character vector (character(0)) even
# when nothing was scraped, matching the original `character()` start value.
art.pieces <- as.character(unlist(hrefs_by_artist, use.names = FALSE))
#---------------------------------------------------------------


#String final URL for piece pages
# Site root that the relative piece hrefs are appended to (the original
# assigned this same value twice in a row).
url <- "http://artsalesindex.artinfo.com"
# Kept as a data frame with a single `art.pieces` column, as before.
art.pieces <- as.data.frame(art.pieces)
# Vectorised concatenation replaces the row-by-row append loop (and its
# per-row print). The guard matters because paste0() recycles a zero-length
# argument to "" and would otherwise return the bare site root.
pages <- if (nrow(art.pieces) > 0) {
    paste0(url, art.pieces$art.pieces)
} else {
    character(0)
}

Step 3

##*************************************     STEP 3    ****************************************

# Reduce a scraped result to a single string: the first element when one
# exists and is not missing, otherwise the literal string "NA" (the marker
# this script has always used for "not found").
first_or_na <- function(x) {
    if (length(x) == 0 || is.na(x[1])) "NA" else x[1]
}

# One slot per piece page, allocated up front rather than grown in the loop.
n_pages               <- length(pages)
artist                <- character(n_pages)
artist.nationality    <- character(n_pages)
title                 <- character(n_pages)
year                  <- character(n_pages)
lot.number            <- character(n_pages)
auction.data          <- character(n_pages)
price                 <- character(n_pages)
lot.details           <- character(n_pages)
materials             <- character(n_pages)
measurements          <- character(n_pages)
description           <- character(n_pages)
markings              <- character(n_pages)
image.link            <- character(n_pages)

for (i in seq_along(pages)) {  #-----  seq_along(pages) <><><><><><><><><><><><><><><><><>Change for test
    print(c(i, "out of", n_pages))
    Sys.sleep(1)  # pause between requests
    # read_html() is the supported replacement for the deprecated html()
    piece_html <- read_html(pages[i])
    #-----------------Artist
    artist[i] <- first_or_na(
        piece_html %>% html_nodes("#artistName") %>% html_attr(name = "value"))
    #-----------------Artist Nationality
    artist.nationality[i] <- first_or_na(
        piece_html %>% html_nodes(".artist-nationality") %>% html_text())
    #-----------------Title
    title[i] <- first_or_na(
        piece_html %>% html_nodes(".title") %>% html_text())
    #-----------------Year
    year[i] <- first_or_na(
        piece_html %>% html_nodes("#artworkIndex_0 p:nth-child(3)") %>% html_text())
    #-----------------Lot Number
    lot.number[i] <- first_or_na(
        piece_html %>% html_nodes(".lotnumber") %>% html_text())
    #-----------------Auction Data
    auction.data[i] <- first_or_na(
        piece_html %>% html_nodes(".auctiondata") %>% html_text())
    #-----------------Price
    price[i] <- first_or_na(
        piece_html %>% html_nodes(".price") %>% html_text())
    #-----------------Lot Details
    # The second ".lot-details1" node holds the value. The original indexed
    # the output vector `lot.details` here instead of the scraped text, so
    # the scraped value was never stored.
    lot_detail_text <- piece_html %>% html_nodes(".lot-details1") %>% html_text()
    lot.details[i] <- first_or_na(lot_detail_text[2])
    #-----------------Details  -------------------------------###
    # The ".artworkdetails" nodes are read by position. Indexing past the end
    # yields NA, which first_or_na() turns into the "NA" marker; the original
    # length check could never fire on an indexed element.
    details <- piece_html %>% html_nodes(".artworkdetails") %>% html_text()
    #-----------------Materials
    materials[i] <- first_or_na(details[1])
    #-----------------Measurements
    measurements[i] <- first_or_na(details[2])
    #-----------------Description
    description[i] <- first_or_na(details[3])
    #-----------------Markings
    markings[i] <- first_or_na(details[4])
    #-----------------Image Link
    image.link[i] <- first_or_na(
        piece_html %>% html_nodes("#imageIndex_0 img") %>% html_attr(name = "src"))
}

# Assemble the per-piece vectors into one table. Unnamed arguments are used,
# so each column takes the name of the vector it was built from.
final <- data.frame(
    artist, artist.nationality, title, year,
    lot.number, auction.data, price, lot.details,
    materials, measurements, description, markings,
    image.link
)

To leave a comment for the author, please follow the link and comment on their blog: NYC Data Science Academy » R.

R-bloggers.com offers daily e-mail updates about R news and tutorials about learning R and many other topics. Click here if you're looking to post or find an R/data-science job.
Want to share your content on R-bloggers? click here if you have a blog, or here if you don't.

Never miss an update!
Subscribe to R-bloggers to receive
e-mails with the latest R posts.
(You will not see this message again.)

Click here to close (This popup will not appear again)