Voronoi Diagrams in Plotly and R

February 1, 2016
By

(This article was first published on Modern Data » R, and kindly contributed to R-bloggers)

Here’s a function which uses plotly’s R Library to overlay a voronoi diagram on top of a 2-D K-Means visualization.

Visit the Getting Started section to get Plotly up and running using R.

Here’s a quick rundown on how this function works:

  • Accepts a data frame containing x and y coordinates of a bunch of data points
  • The dataframe must have a cluster column specifying the k-means cluster to which point belongs to
  • The function uses the deldir package to compute the Voronoi tesselation
  • The function then uses some co-ordinate geometry to create the final visualization using plot_ly
  • If interested, set the parameter print.ggplot to TRUE to see a diagnostic intermediary plot
  • Try modifying the n.sd.x and n.sd.y parameters to adjust x-axis and y-axis limits

Note that this is more of a hack and I am sure there are better more elegant ways to do this. If you have suggestions let us know !

Function definition

# Function Definition ---------------------------------------------------------
VoronoiPlotly <- function(fit,  # Fit object from K-Means
                          ds,  # Data frame containing original data and clusters from K-Means
                          n.sd.x = 3,  # Controls the width of the plot
                          n.sd.y = 3,  # Controls the height of the plot
                          print.ggplot = FALSE,  # Plots a diagnostic chart using ggplot2
                          point.opacity = 0.8,  
                          point.size = 7, 
                          point.symbol = "circle",
                          point.linewidth = 2,
                          point.lineopacity = 0.5,
                          plot_bgcolor = "#ffffff",
                          paper_bgcolor = "#ffffff",
                          center.size = 15,
                          shapes.opacity = 0.5,
                          shapes.linecolor = "#404040", 
                          center.color = "#000000"){
  
  # Options
  options(stringsAsFactors = F)
  graphics.off()
  
  # Load libraries ------------------------------------------------------------
  library(plotly)
  library(deldir)
  
  # Create convenience data frames ----------------------------------------------
  centers <- data.frame(fit$centers)
  vor <- deldir(centers)
  
  # Calculate slopes
  vor.df <- data.frame(vor$dirsgs, m = (vor$dirsgs$y2- vor$dirsgs$y1)/(vor$dirsgs$x2 - vor$dirsgs$x1))
  
  # Calculate constants
  vor.df$c <- with(vor.df, ((y2 - m*x2) + (y1 - m*x1))/2)
  
  # Covnert to strings for better matching later on
  vor.df.str <- data.frame(C1 = apply(vor.df[,1:2], 1, paste, collapse = ","),
                           C2 = apply(vor.df[,3:4], 1, paste, collapse = ","))
  
  # Combine the x and y coordinates for each segment
  coord.df <- rbind(as.matrix(vor.df[1:2]), as.matrix(vor.df[,3:4]))
  
  # Convert to string
  coord.df.str <- apply(coord.df, 1, paste, collapse = ",")
  
  # Find unique strings
  count <- sapply(coord.df.str, function(x){sum(coord.df.str == x)})
  coord.df.str <- data.frame(str = coord.df.str, count = count)
  coord.df.str <- subset(coord.df.str, count == 1)
  
  # Get outer boundary co-ordinates
  outer.bound <- matrix(as.numeric(unlist(strsplit(coord.df.str$str, ","))), ncol = 2, byrow = T)
  outer.bound <- data.frame(x = outer.bound[,1], y = outer.bound[,2])
  
  # Add respective slopes and constants
  for(i in 1:nrow(outer.bound)){
    str <- coord.df.str[i,1]
    idx <- ifelse(is.na(match(str, vor.df.str$C1)), match(str, vor.df.str$C2), match(str, vor.df.str$C1))
    
    # Slope
    outer.bound$m[i] <- vor.df$m[idx]
    
    # Constants
    outer.bound$c[i] <- vor.df$c[idx]
  }
  
  # Find enclosing rectangle boundaries -----------------------------------------
  x.min <- mean(ds$x) - n.sd.x*sd(ds$x)
  x.max <- mean(ds$x) + n.sd.x*sd(ds$x)
  y.min <- mean(ds$y) - n.sd.y*sd(ds$y)
  y.max <- mean(ds$y) + n.sd.y*sd(ds$y)
  
  # Create x-axsi and y-axis limits
  xlim <- c(x.min, x.max)
  ylim <- c(y.min, y.max)
  
  # Extend outer boundary points to above rectangle ------------------------------
  for(i in 1:nrow(outer.bound)){
    # Extract x-y coordinates
    x <- outer.bound$x[i]
    y <- outer.bound$y[i]
    
    # Get slope
    m <- outer.bound$m[i]
    
    # Get slope
    c <- outer.bound$c[i]
    
    # Extend to each edge of enclosing rectangle
    ext.coord <- mat.or.vec(4,3)
    
    # Extend to left edge
    ext.coord[1,1] <- x.min
    ext.coord[1,2] <- m*x.min + c
    ext.coord[1,3] <- sqrt((ext.coord[1,1] - x)^2 + (ext.coord[1,2] - y)^2)
    
    # Extend to right edge
    ext.coord[2,1] <- x.max
    ext.coord[2,2] <- m*x.max + c
    ext.coord[2,3] <- sqrt((ext.coord[2,1] - x)^2 + (ext.coord[2,2] - y)^2)
    
    # Extend to top edge
    ext.coord[3,2] <- y.max
    ext.coord[3,1] <- (y.max - c)/m
    ext.coord[3,3] <- sqrt((ext.coord[3,1] - x)^2 + (ext.coord[3,2] - y)^2)
    
    # Extend to bottom edge
    ext.coord[4,2] <- y.min
    ext.coord[4,1] <- (y.min - c)/m
    ext.coord[4,3] <- sqrt((ext.coord[4,1] - x)^2 + (ext.coord[4,2] - y)^2)
    
    # Find the closest edge
    idx <- which.min(ext.coord[,3])
    
    x <- ext.coord[idx,1]
    y <- ext.coord[idx,2]
    
    # Insert into outer bound 
    outer.bound$x.ext[i] <- x
    outer.bound$y.ext[i] <- y
  }
  
  # Convert to string for easier searcing later on
  outer.bound.str <- apply(outer.bound[,5:6], 1, paste, collapse = ",")
  
  # Augment vor.df with extended outer bound coordinates -------------------------
  for(i in 1:nrow(outer.bound)){
    # Convert to string to help matching
    str <- paste(outer.bound[i,1:2], collapse = ",")
    
    # Match with original vor.df
    if(is.na(match(str, vor.df.str$C1))){
      idx <- match(str, vor.df.str$C2)
      vor.df[idx, 3:4] <- outer.bound[i, 5:6]
    }else{
      idx <- match(str, vor.df.str$C1)
      vor.df[idx, 1:2] <- outer.bound[i, 5:6]
    }
  }
  
  # Plot Check ------------------------------------------------------------------
  p.ggplot <- ggplot() +
    geom_point(data = centers, aes(x, y), color= "red", size = 5) +
    geom_point(data = ds, aes(x, y, color = cluster)) +
    geom_segment(data = vor.df, aes(x = x1, y = y1, xend = x2, yend = y2)) +
    geom_point(data = as.data.frame(fit$centers), aes(x, y)) +
    geom_text(data = centers, aes(x,y, label = 1:nrow(centers)), size = 10) +
    geom_point(data = outer.bound, aes(x.ext, y.ext), color = "blue", size = 5) + 
    geom_point(data = outer.bound, aes(x, y), color = "red", size = 5) + 
    geom_hline(yintercept = y.min) + 
    geom_hline(yintercept = y.max) + 
    geom_vline(xintercept = x.min) +
    geom_vline(xintercept = x.max)
  p.ggplot <- ggplotly(p.ggplot)
  if(print.ggplot == T){print(p.ggplot)}
  # -----------------------------------------------------------------------------
  
  # Function to calculate which side of line is point on ------------------------
  sideFUNC <- function(x, y, x1, y1, x2, y2){
    d <- (x - x1)*(y2-y1) - (y - y1)*(x2 - x1)
    
    return(round(d,2))
  }
  
  # Figure out the path for each polygon ----------------------------------------
  path <- list()
  
  # Loop thorough each centroid and find corrosponding edges
  for(i in 1:nrow(centers)){
    # Find each row where centeroid is available
    mat <- subset(vor.df, ind1 == i | ind2 == i)
    
    # Find all unique coordinates associated with centroid
    mat <- cbind(matrix(c(mat$x1, mat$x2), ncol = 1), matrix(c(mat$y1, mat$y2), ncol = 1))
    mat <- unique(mat)
    mat.str <- apply(mat, 1, paste, collapse = ",")
    
    # print(mat)
    
    # Find all outer boundary points asociated with centroid
    # If an outer boundary point is found, there must be atleast two
    idx <- outer.bound.str %in% mat.str
    if(sum(idx) == 2){
      # Only if two outer boundary points are found
      # then need to modify matrix and add edge end points
      
      # Find the side where all other outer boundary points are
      # Assuming all other boundary points are on the same side
      # need only one point to find this out
      p <- as.numeric(unlist(strsplit(outer.bound.str[!idx][1], split = ",")))
      
      # Line segment is defined by the two identified outer boundary points 
      p1 <- as.numeric(unlist(strsplit(outer.bound.str[idx][1], split = ",")))
      p2 <- as.numeric(unlist(strsplit(outer.bound.str[idx][2], split = ",")))
      
      # Find side
      side <- sideFUNC(p[1], p[2], p1[1], p1[2], p2[1], p2[2])
      
      # Case when only two cluster and hence only one dividing segment
      if(is.na(side)){
        side <- sideFUNC(centers[i,1], centers[i,2], p1[1], p1[2], p2[1], p2[2])
      }
      
      if(side != 0){
        
        # Find the enclosing rectangle"s endpoints that are on the opposite side
        # Top - Left
        side.check <- sideFUNC(x.min, y.max, p1[1], p1[2], p2[1], p2[2])
        if(side.check != 0){if(sign(side.check) != sign(side)) {mat <- rbind(mat, c(x.min, y.max))}}
        
        # Bottom - Left
        side.check <- sideFUNC(x.min, y.min, p1[1], p1[2], p2[1], p2[2])
        if(side.check != 0){if(sign(side.check) != sign(side)) {mat <- rbind(mat, c(x.min, y.min))}}
        
        # Top - Right
        side.check <- sideFUNC(x.max, y.max, p1[1], p1[2], p2[1], p2[2])
        if(side.check != 0){if(sign(side.check) != sign(side)) {mat <- rbind(mat, c(x.max, y.max))}}
        
        # Bottom - Right
        side.check <- sideFUNC(x.max, y.min, p1[1], p1[2], p2[1], p2[2])
        if(side.check != 0){if(sign(side.check) != sign(side)) {mat <- rbind(mat, c(x.max, y.min))}}
      }
    }
    
    # print(mat)
    # readline("Enter:")
    
    # Re-order the points to ensure it makes a convex polygon
    mat <- mat[chull(mat),]
    
    #Paste together
    path[[i]] <- paste0("M", paste0(mat[1,], collapse = ","))
    
    path[[i]] <- paste(path[[i]],
                       paste(apply(matrix(mat[-1,], ncol = 2), 1, function(x){
                         vec <- paste0(x, collapse = ",")
                         vec <- paste0("L", vec)
                       }), collapse = " "),
                       "Z")
  }
  
  # Finally plot using Plotly ---------------------------------------------------
  # crate a "shapes" list for voronoi polygons to be passed to layout()
  shapes <- list()
  cols <- RColorBrewer::brewer.pal(nrow(centers), "Paired")
  
  # Loop through each path and add params like fill color, opacity etc
  for(i in 1:length(path)){
    shapes[[i]] <- list(type = "path",
                        path = path[[i]],
                        fillcolor = cols[i],
                        opacity = shapes.opacity,
                        line = list(color = shapes.linecolor))
  }
  
  # Change colors for each cluster to allow manual spec
  for(i in 1:nrow(centers)){
    ds$color[ds$cluster == i] <- cols[i]
  }
  
  # Create plot
  # base layer
  p <- plot_ly(ds, x = x, y = y , mode = "markers", name = "Clusters", opacity = point.opacity, 
               hoverinfo = "x+y+text",
               text = paste("Cluster:",cluster),
               marker = list(symbol = point.symbol, color = color, size = point.size, 
                             line = list(color = "#262626", width = point.linewidth, opacity = point.lineopacity)),
               showlegend = F)
  
  # Add centroids
  p <- add_trace(centers, x = x, y = y, mode = "markers", name = "Cluster Centers",
                 hoverinfo = "none",
                 marker = list(color = center.color, symbol = "cross", size = center.size))
  
  # Add polygons
  p <- layout(title = "Voronoi polygons and K- Means clustering",
              paper_bgcolor = paper_bgcolor,
              plot_bgcolor = plot_bgcolor,
              xaxis = list(range = xlim, zeroline = F),
              yaxis = list(range = ylim, zeroline = F),
              shapes = shapes)

  print(p)
}

Function call

# Generate some random numbers ------------------------------------------------
set.seed(12345)
nClust <- 12 # Number of clusters
nPoints <- 2000  # Number of data points

ds <- data.frame(x = rnorm(nPoints), y = rnorm(nPoints))

# Do K-Means Clustering -------------------------------------------------------
fit <- kmeans(ds, centers = nClust)
ds <- cbind(ds, cluster = as.factor(fit$cluster))

# Call function ---------------------------------------------------------------
VoronoiPlotly(fit, ds, n.sd.x = 3, n.sd.y = 3, print.ggplot = F)

Another example

set.seed(12345)
nClust <- 8 # Number of clusters
nPoints <- 2000  # Number of data points

ds <- data.frame(x = rchisq(nPoints, 10, 0), y = rnorm(nPoints))

# Do K-Means Clustering -------------------------------------------------------
fit <- kmeans(ds, centers = nClust)
ds <- cbind(ds, cluster = as.factor(fit$cluster))

# Call function ---------------------------------------------------------------
VoronoiPlotly(fit, ds, n.sd.x = 2, n.sd.y = 3, print.ggplot = F)

To leave a comment for the author, please follow the link and comment on their blog: Modern Data » R.

R-bloggers.com offers daily e-mail updates about R news and tutorials on topics such as: Data science, Big Data, R jobs, visualization (ggplot2, Boxplots, maps, animation), programming (RStudio, Sweave, LaTeX, SQL, Eclipse, git, hadoop, Web Scraping) statistics (regression, PCA, time series, trading) and more...



If you got this far, why not subscribe for updates from the site? Choose your flavor: e-mail, twitter, RSS, or facebook...

Comments are closed.

Sponsors

Never miss an update!
Subscribe to R-bloggers to receive
e-mails with the latest R posts.
(You will not see this message again.)

Click here to close (This popup will not appear again)