Mining Twitter for consumer attitudes towards hotels

March 9, 2012
By

(This article was first published on ModelR » rstats, and kindly contributed to R-bloggers)

Couple of months back I read Jeffrey Breen’s presentation on mining Twitter for consumer attitudes towards airlines, so I was just curious how it would look if I estimate the sentiment toward major hotels.

So here it is:

# load twitter library
> library(twitteR)
# search for all the hilton tweets
> hilton.tweets=searchTwitter('@hilton',n=1500)
> length(hilton.tweets)
[1] 162
> class(hilton.tweets)
[1] "list"
> tweet=hilton.tweets[[1]]
> class(tweet)
[1] "status"
attr(,"package")
[1] "twitteR"
> tweet$getScreenName()
[1] "i_RAHOFA"
> tweet$getText()
> library("plyr")
> hilton.text=laply(hilton.tweets,function(t)t$getText())
> length(hilton.text)
> head(hilton.text,5)
# load list of positive and negative words for SIMPLE sentiment analysis
# you would have to download the files from a website I included below - make sure you put in the directory that you will be
# referencing
> hu.liu.pos=scan('/Users/marcinkulakowski/Downloads/r/positive-words.txt',what='character',comment.char=';')
> hu.liu.neg=scan('/Users/marcinkulakowski/Downloads/r/negative-words.txt',what='character',comment.char=';')
> pos.words=c(hu.liu.pos,'upgrade')
> neg.words=c(hu.liu.neg,'wtf','wait','waiting','epicfail','mechanical')
# sampling
> sample=c("You'reawesomeandIloveyou","Ihateandhateandhate.Soangry.Die!","Impressedandamazed:youarepeerlessinyourachievementofunparalleledmediocrity.")
> score.sentiment = function(sentences, pos.words, neg.words, .progress='none')
{
require(plyr)
require(stringr)

# we got a vector of sentences. plyr will handle a list
# or a vector as an "l" for us
# we want a simple array ("a") of scores back, so we use
# "l" + "a" + "ply" = "laply":
scores = laply(sentences, function(sentence, pos.words, neg.words) {

# clean up sentences with R's regex-driven global substitute, gsub():
sentence = gsub('[[:punct:]]', '', sentence)
sentence = gsub('[[:cntrl:]]', '', sentence)
sentence = gsub('\\d+', '', sentence)
# and convert to lower case:
sentence = tolower(sentence)

# split into words. str_split is in the stringr package
word.list = str_split(sentence, '\\s+')
# sometimes a list() is one level of hierarchy too much
words = unlist(word.list)

# compare our words to the dictionaries of positive & negative terms
pos.matches = match(words, pos.words)
neg.matches = match(words, neg.words)

# match() returns the position of the matched term or NA
# we just want a TRUE/FALSE:
pos.matches = !is.na(pos.matches)
neg.matches = !is.na(neg.matches)

# and conveniently enough, TRUE/FALSE will be treated as 1/0 by sum():
score = sum(pos.matches) - sum(neg.matches)

return(score)
}, pos.words, neg.words, .progress=.progress )

scores.df = data.frame(score=scores, text=sentences)
return(scores.df)
}
> result=score.sentiment(sample,pos.words,neg.words)
> class(result)
[1] "data.frame"
> result$score
[1] 0 0 0
> hilton.scores=score.sentiment(hilton.text,pos.words,neg.words,.progress='text')
> hilton.scores$hotel='Hilton'
> hilton.scores$code='HL'
> hist(hilton.scores$score)
# hilton histogram
> library("ggplot2")
> qplot(hilton.scores$score)
# qplot hilton
# lets search for all other major hotels
# Intercontinental
intercontinental.tweets=searchTwitter('@intercontinental',n=1500)
class(tweet)
intercontinental.text=laply(intercontinental.tweets,function(t)t$getText())
intercontinental.scores=score.sentiment(intercontinental.text,pos.words,neg.words,.progress='text')
intercontinental.scores$hotel='Intercontinental'
intercontinental.scores$code='IC'
# Wyndham
wyndham.tweets=searchTwitter('@wyndham',n=1500)
class(tweet)
wyndham.text=laply(wyndham.tweets,function(t)t$getText())
wyndham.scores=score.sentiment(wyndham.text,pos.words,neg.words,.progress='text')
wyndham.scores$hotel='Wyndham'
wyndham.scores$code='WY'
# Marriott
marriott.tweets=searchTwitter('@marriott',n=1500)
class(tweet)
marriott.text=laply(marriott.tweets,function(t)t$getText())
marriott.scores=score.sentiment(marriott.text,pos.words,neg.words,.progress='text')
marriott.scores$hotel='Marriott'
marriott.scores$code='MI'
# BestWestern
bestwestern.tweets=searchTwitter('@bestwestern',n=1500)
class(tweet)
bestwestern.text=laply(bestwestern.tweets,function(t)t$getText())
bestwestern.scores=score.sentiment(bestwestern.text,pos.words,neg.words,.progress='text')
bestwestern.scores$hotel='Bestwestern'
bestwestern.scores$code='BW'
# Starwood
starwood.tweets=searchTwitter('@starwood',n=1500)
class(tweet)
starwood.text=laply(starwood.tweets,function(t)t$getText())
starwood.scores=score.sentiment(starwood.text,pos.words,neg.words,.progress='text')
starwood.scores$hotel='Starwood'
starwood.scores$code='SW'
# Hyatt
hyatt.tweets=searchTwitter('@hyatt',n=1500)
class(tweet)
hyatt.text=laply(hyatt.tweets,function(t)t$getText())
hyatt.scores=score.sentiment(hyatt.text,pos.words,neg.words,.progress='text')
hyatt.scores$hotel='Hyatt'
hyatt.scores$code='HY'
> all.scores=rbind(intercontinental.scores,wyndham.scores,hilton.scores,marriott.scores,bestwestern.scores,starwood.scores,hyatt.scores)
# Make separate plot for each hotel
> ggplot(data=all.scores)+#ggplotworksondata.frames,always
geom_bar(mapping=aes(x=score,fill=hotel),binwidth=1)+
facet_grid(hotel~.)+#makeaseparateplotforeachhotel
theme_bw()+scale_fill_brewer()#plaindisplay,nicercolors
# Plot
> all.scores$very.pos=as.numeric(all.scores$score>=2)
> all.scores$very.neg=as.numeric(all.scores$score twitter.df=ddply(all.scores,c('hotel','code'),summarise,pos.count=sum(very.pos),neg.count=sum(very.neg))
> twitter.df$all.count=twitter.df$pos.count+twitter.df$neg.count
> twitter.df$score=round(100*twitter.df$pos.count/twitter.df$all.count)
> install.packages("doBy")
> library("doBy")
> orderBy(~-score,twitter.df)
hotel code pos.count neg.count all.count score
1 Bestwestern BW 6 0 6 100
5 Starwood SW 7 0 7 100
6 Wyndham WY 2 0 2 100
3 Hyatt HY 7 1 8 88
2 Hilton HL 15 3 18 83
4 Marriott MI 13 4 17 76
> install.packages("XML")
> library(XML)
> acsi.url='http://www.theacsi.org/index.php?option=com_content&view=article&id=147&catid=&Itemid=212&i=Hotels'
# scrape acsi website for scores
> acsi.df=readHTMLTable(acsi.url,header=T,which=1,stringsAsFactors=F)
> acsi.df=acsi.df[,c(1,18)]
> head(acsi.df,1)
> colnames(acsi.df)=c('hotel','score')
> acsi.df$score=as.numeric(acsi.df$score)
> View(acsi.df)
# ACSI Dataframe
> acsi.df$code=c('HL','SW','MI','NA','HY','NA','IC','BW','NA','WY','NA','NA','NA')
> acsi.df$score=as.numeric(acsi.df$score)
> compare.df=merge(twitter.df,acsi.df,by='code',suffixes=c('.twitter','.acsi'))
> compare.df=subset(compare.df,all.count>100)
> compare.df=merge(twitter.df,acsi.df,by='code',suffixes=c('.twitter','.acsi'))
> View(compare.df)
# scores compared
> ggplot(compare.df)+geom_point(aes(x=score.twitter,y=score.acsi,color=hotel.twitter),size=6)+ geom_smooth(aes(x=score.twitter,y=score.acsi,group=1),se=F,method="lm")+theme_bw()+opts(legend.position=c(0.85,0.85))
# final plot

Materials used:
——————————————
Jeffrey Breen’s presentation

The American Customer Satisfaction Index (Hotels)

Opinion Mining, Sentiment Analysis, Opinion Extraction


To leave a comment for the author, please follow the link and comment on his blog: ModelR » rstats.

R-bloggers.com offers daily e-mail updates about R news and tutorials on topics such as: visualization (ggplot2, Boxplots, maps, animation), programming (RStudio, Sweave, LaTeX, SQL, Eclipse, git, hadoop, Web Scraping) statistics (regression, PCA, time series, trading) and more...



If you got this far, why not subscribe for updates from the site? Choose your flavor: e-mail, twitter, RSS, or facebook...

Comments are closed.