# 来玩玩QQ群的数据

December 13, 2012
By

 This post was kindly contributed by 数据科学与R语言 - go there to comment and to read the full post.

 用户 频数 7cha18 1498 6cha4376 1209 4cha3875 1079 8cha083 691 4cha698 528 1cha65314 438 [email protected] 372 2cha1 350 5cha80 296 3cha4233 294

 6cha4376 160 4cha3875 147 7cha18 109 [email protected] 89 4cha698 87 2cha1 77 8cha083 74 1cha457 58 1cha59002 53 8cha08 50

`# 数据读入data <- read.csv('qq.csv',T,stringsAsFactors=F)data <- data[-nrow(data),] # 最后一行有问题，删除 library(stringr)library(plyr)library(lubridate)library(ggplot2)library(reshape2)library(igraph) # 数据整理# 将字符串中的日期和时间划分为不同变量temp1 <- str_split(data\$time,' ')result1 <- ldply(temp1,.fun=NULL)names(result1) <- c('date','clock') #分离年月日temp2 <- str_split(result1\$date,'/')result2 <- ldply(temp2,.fun=NULL)names(result2) <- c('year','month','day') # 分离小时分钟temp3 <- str_split(result1\$clock,':')result3 <- ldply(temp3,.fun=NULL)names(result3) <- c('hour','minutes') # 合并数据newdata <- cbind(data,result1,result2,result3) # 转换日期为时间格式newdata\$date <- ymd(newdata\$date) # 提取星期数据newdata\$wday <- wday(newdata\$date) # 转换数据格式newdata\$month <- ordered(as.numeric(newdata\$month) )newdata\$year <- ordered(newdata\$year)newdata\$day <- ordered(as.numeric(newdata\$day))newdata\$hour <- ordered(as.numeric(newdata\$hour))newdata\$wday <- ordered(newdata\$wday) # 关于时间的一元描述# 观察时间相关各变量的频数分布# 周一和周日聊天不多，难道说是周一要安心上班？qplot(wday,data=newdata,geom='bar')# 白天上班的时间聊天比较多嘛，下午形成高峰。qplot(hour,data=newdata,geom='bar') # 关于用户的频度描述# 前十大发言最多用户user <- as.data.frame(table(newdata\$id))user <- user[order(user\$Freq,decreasing=T),]user[1:10,]topuser <- user[1:10,]\$Var1 # 活跃天数计算# 将数据展开为宽表，每一行为用户，每一列为日期，对应数值为发言次数flat.day <- dcast(newdata,id~date,length,value.var='date')flat.mat <- as.matrix(flat.day[-1]) #转为矩阵# 转为0-1值，以观察是否活跃flat.mat <- ifelse(flat.mat>0,1,0)# 根据上线天数求和topday <- data.frame(flat.day[,1],apply(flat.mat,1,sum))names(topday) <- c('id','days')topday <- topday[order(topday\$days,decreasing=T),]# 获得前十大活跃用户topday[1:10,] # 观察每天的发言次数# online.day为每天的发言次数online.day <- sapply(flat.day[,-1],sum)tempdf <- data.frame(time=ymd(names(online.day )),online.day )qplot(x=time,y=online.day ,ymin=0,ymax=online.day ,      data=tempdf,geom='linerange')ggsave('2.png')# 观察到有少数峰值日，看超过200次发言以上是哪几天names(which(online.day>200) #根据flat.day数据观察每天活跃用户变化# numday为每天发言人数numday <- apply(flat.mat,2,sum)tempdf <- data.frame(time=ymd(names(numday)),numday)qplot(x=time,y=numday,ymin=0,ymax=numday,      data=tempdf,geom='linerange')ggsave('3.png')#直方图观察qplot(x=numday,data=tempdf,geom='histogram') # 当某天登录人数增加的话，发言数也会增加吗？   tempdf <- data.frame(time=ymd(names(online.day )),people=numday,                     speech=online.day) qplot(x=people,y=speech ,      data=tempdf,geom=c('point','smooth')) # 再观察十强选手的日内情况flat.hour <- dcast(newdata,id~hour,length,value.var='hour',      subset=.(id %in% topuser))# 平行坐标图hour.melt <- melt(flat.hour)p <- ggplot(data=hour.melt,aes(x=variable,y=value))p + geom_line(aes(group=id,color=id))+    theme_bw()+    opts(legend.position = "none") # 连续对话的次数，以三十分钟为间隔newdata\$realtime <- strptime(newdata\$time,'%Y/%m/%d %H:%M')# 时间排序有问题，按时间重排数据newdata2 <- newdata[order(newdata\$realtime),]# 将数据按讨论来分组group <- rep(1,11279)for (i in 2:11279) {    d <- as.numeric(difftime(newdata2\$realtime[i],                             newdata2\$realtime[i-1],                             units='mins'))    if ( d <30) {        group[i] <- group[i-1]    } else {group[i] <- group[i-1]+1}}barplot(table(group)) # 得到719多组对话newdata2\$group <- group # igraph进行十强之间的网络分析# 建立关系矩阵，如果两个用户同时在一次群讨论中出现，则计数+1newdata3 <- dcast(newdata2, id~group, sum,                   value.var='group',                  subset=.(id %in% topuser))newdata4 <- ifelse(newdata3[,-1] > 0, 1, 0)rownames(newdata4) <- newdata3[,1]relmatrix <- newdata4 %*% t(newdata4)# 很容易看出哪两个人聊得最多，6cha4376和4cha3875，有基情？deldiag <- relmatrix-diag(diag(relmatrix))which(deldiag==max(deldiag),arr.ind=T) # 根据关系矩阵画社交网络画g <- graph.adjacency(relmatrix,weighted=T,mode='undirected')g <-simplify(g)V(g)\$label<-rownames(relmatrix)V(g)\$degree<- degree(g)layout1 <- layout.fruchterman.reingold(g)egam <- 10*E(g)\$weight/max(E(g)\$weight)egam <- (log(E(g)\$weight)+1) / max(log(E(g)\$weight)+1)V(g)\$label.cex <- V(g)\$degree / max(V(g)\$degree)+ .2V(g)\$label.color <- rgb(0, 0, .2, .8)V(g)\$frame.color <- NAE(g)\$width <- egamE(g)\$color <- rgb(0, 0, 1, egam)plot(g, layout=layout1)`

Tags: ,