#========================================================#
# Quantitative ALM, Financial Econometrics & Derivatives
# ML/DL using R, Python, Tensorflow by Sang-Heon Lee
#
# https://kiandlee.blogspot.com
#--------------------------------------------------------#
# Adams and Deventer Maximum Smoothness Forward Curve
# using the inverse matrix
#========================================================#

graphics.off()  # close any open graphics devices

# Input: market zero rates (continuously compounded) and maturities in years
df.mkt <- data.frame(
  mat = c(0.25, 1, 3, 5, 10),
  zrc = c(4.75, 4.5, 5.5, 5.25, 6.5) / 100
)

# number of market maturities (= number of quartic forward segments)
n <- length(df.mkt$mat)

# add 0-maturity zero rate (assumption: 4%)
# df <- rbind(c(0, df.mkt$zrc[1]), df.mkt)
df <- rbind(c(0, 0.04), df.mkt)

# discount factors P(0, t) = exp(-z * t)
df$DF <- with(df, exp(-zrc * mat))

# -ln(P(t(i)) / P(t(i-1))): integral of the forward curve over segment i
df$mln <- c(NA, -log(df$DF[2:(n + 1)] / df$DF[1:n]))

# powers of maturity ti^k, k = 0..5
df$t5 <- df$mat^5
df$t4 <- df$mat^4
df$t3 <- df$mat^3
df$t2 <- df$mat^2
df$t1 <- df$mat^1
df$t0 <- 1

# first differences dti^k = ti^k - t(i-1)^k
df$dt5 <- c(NA, df$t5[2:(n + 1)] - df$t5[1:n])
df$dt4 <- c(NA, df$t4[2:(n + 1)] - df$t4[1:n])
df$dt3 <- c(NA, df$t3[2:(n + 1)] - df$t3[1:n])
df$dt2 <- c(NA, df$t2[2:(n + 1)] - df$t2[1:n])
df$dt1 <- c(NA, df$t1[2:(n + 1)] - df$t1[1:n])

# construct the linear (KKT) system:
# mQ = quadratic smoothness objective, mA = constraints, vB = constraint RHS
mQ <- mA <- matrix(0, nrow = n * n, ncol = n * n)
vC <- vB <- rep(0, n * n)

# Objective function: integral of f''(t)^2 per segment; only the three
# highest-order coefficients (a, b, c) of each segment enter the objective
for (r in 1:n) {
  idx <- ((r - 1) * n + 1):(r * n - 2)
  mQ[idx, idx] <- matrix(
    with(df[r + 1, ],
         c(144 / 5 * dt5, 18 * dt4, 8 * dt3,
           18 * dt4,      12 * dt3, 6 * dt2,
           8 * dt3,       6 * dt2,  4 * dt1)),
    3, 3)
}

# Smoothness constraints: continuity of f, f', f'', f''' at interior knots
r <- 1
for (t in 1:(n - 1)) {
  mA[(r - 1) * (n - 1) + t, (1 + (t - 1) * n):(t * n - r + 1)] <-
    with(df[t + 1, ], c(t4, t3, t2, t1, t0))
  mA[(r - 1) * (n - 1) + t, (1 + t * n):((t + 1) * n - r + 1)] <-
    with(df[t + 1, ], -c(t4, t3, t2, t1, t0))
}
r <- 2
for (t in 1:(n - 1)) {
  mA[(r - 1) * (n - 1) + t, (1 + (t - 1) * n):(t * n - r + 1)] <-
    with(df[t + 1, ], c(4 * t3, 3 * t2, 2 * t1, t0))
  mA[(r - 1) * (n - 1) + t, (1 + t * n):((t + 1) * n - r + 1)] <-
    with(df[t + 1, ], -c(4 * t3, 3 * t2, 2 * t1, t0))
}
r <- 3
for (t in 1:(n - 1)) {
  mA[(r - 1) * (n - 1) + t, (1 + (t - 1) * n):(t * n - r + 1)] <-
    with(df[t + 1, ], c(12 * t2, 6 * t1, 2 * t0))
  mA[(r - 1) * (n - 1) + t, (1 + t * n):((t + 1) * n - r + 1)] <-
    with(df[t + 1, ], -c(12 * t2, 6 * t1, 2 * t0))
}
r <- 4
for (t in 1:(n - 1)) {
  mA[(r - 1) * (n - 1) + t, (1 + (t - 1) * n):(t * n - r + 1)] <-
    with(df[t + 1, ], c(24 * t1, 6 * t0))
  mA[(r - 1) * (n - 1) + t, (1 + t * n):((t + 1) * n - r + 1)] <-
    with(df[t + 1, ], -c(24 * t1, 6 * t0))
}

# bond price fitting constraints: the integral of the forward curve over
# each segment must reproduce -ln(P(ti)/P(t(i-1)))
r <- 5
for (t in 1:n) {
  mA[(r - 1) * (n - 1) + t, (1 + (t - 1) * n):(t * n)] <-
    with(df[t + 1, ], c(dt5 / 5, dt4 / 4, dt3 / 3, dt2 / 2, dt1))
}

# additional four constraints (short-end level, long-end slope/curvature);
# row_i/col_i avoid shadowing base::c() as in the original
row_i <- (n - 1) * 4 + n + 1; col_i <- n;     mA[row_i, col_i] <- 1
row_i <- (n - 1) * 4 + n + 2; col_i <- n - 1; mA[row_i, col_i] <- 1
row_i <- (n - 1) * 4 + n + 3; col_i <- (n * 4 + 1):(n * 4 + 4)
mA[row_i, col_i] <- with(df[n + 1, ], c(4 * t3, 3 * t2, 2 * t1, t0))
row_i <- (n - 1) * 4 + n + 4; col_i <- (n * 4 + 1):(n * 4 + 3)
mA[row_i, col_i] <- with(df[n + 1, ], c(12 * t2, 6 * t1, 2 * t0))

# RHS vectors
# NOTE(review): rep(0, (n-1)*(n-1)) equals the 4*(n-1) smoothness rows only
# because n = 5 here; for other n this should be rep(0, 4*(n-1)) -- confirm
vC <- rep(0, n * n)
vB <- c(rep(0, (n - 1) * (n - 1)), df$mln[2:(n + 1)], df$zrc[1], 0, 0, 0)

# assemble the KKT system  [ mQ  -t(mA) ; mA  0 ] x = [ -vC ; vB ]
AA <- rbind(cbind(mQ, -t(mA)), cbind(mA, matrix(0, n * n, n * n)))
BB <- c(-vC, vB)

# solve the linear system via the inverse matrix; the first n*n entries of
# XX are the stacked segment coefficients, the rest are Lagrange multipliers
XX <- solve(AA) %*% BB
XX
> df.mkt[, c("a", "b", "c", "d", "e")]
             a            b          c             d           e
1  3.7020077927 -3.343981336  0.8481712  1.235996e-15  0.04000000
2 -0.1354629834  0.493489440 -0.5908803  2.398419e-01  0.02500988
3  0.0080049888 -0.080382448  0.2699275 -3.340300e-01  0.16847785
4 -0.0024497295  0.045074170 -0.2946273  7.950796e-01 -0.67835432
5  0.0002985362 -0.009891144  0.1176126 -5.790532e-01  1.03931174
# save calibrated quartic coefficients (a..e) for each of the n segments
for (i in 1:n) {
  df.mkt[i, c("a", "b", "c", "d", "e")] <- XX[((i - 1) * n + 1):(i * n)]
}

# monthly grid on which to evaluate forward rate and spot rate
df.mm <- data.frame(mat = seq(0, 10, 1 / 12), y = NA, fwd = NA)

# which quartic segment each monthly maturity falls into
df.mm$seg_no <- apply(df.mm, 1, function(x) min(which(x[1] <= df.mkt$mat)))

# powers of maturity ti^k
df.mm$t5 <- df.mm$mat^5
df.mm$t4 <- df.mm$mat^4
df.mm$t3 <- df.mm$mat^3
df.mm$t2 <- df.mm$mat^2
df.mm$t1 <- df.mm$mat^1
df.mm$t0 <- 1

nr <- nrow(df.mm)  # number of monthly grid points

# first differences dti^k = ti^k - t(i-1)^k
df.mm$dt5 <- c(NA, df.mm$t5[2:nr] - df.mm$t5[1:(nr - 1)])
df.mm$dt4 <- c(NA, df.mm$t4[2:nr] - df.mm$t4[1:(nr - 1)])
df.mm$dt3 <- c(NA, df.mm$t3[2:nr] - df.mm$t3[1:(nr - 1)])
df.mm$dt2 <- c(NA, df.mm$t2[2:nr] - df.mm$t2[1:(nr - 1)])
df.mm$dt1 <- c(NA, df.mm$t1[2:nr] - df.mm$t1[1:(nr - 1)])

# time 0: forward = yield = intercept e of the first segment
df.mm$fwd[1] <- df.mkt$e[1]
df.mm$y[1]   <- df.mkt$e[1]

temp_y_sum <- 0  # running integral of the forward curve from 0
for (i in 2:nr) {
  mat    <- df.mm$mat[i]
  seg_no <- df.mm$seg_no[i]  # segment containing this maturity

  v_tn    <- df.mm[i, c("t4", "t3", "t2", "t1", "t0")]
  v_dtn   <- df.mm[i, c("dt5", "dt4", "dt3", "dt2", "dt1")]
  v_abcde <- df.mkt[seg_no, c("a", "b", "c", "d", "e")]

  # forward rate: f(t) = a*t^4 + b*t^3 + c*t^2 + d*t + e
  df.mm$fwd[i] <- sum(v_abcde * v_tn)

  # spot rate: y(t) = (1/t) * integral_0^t f(s) ds, accumulated piecewise
  temp_y_sum <- temp_y_sum +
    sum(c(1 / 5, 1 / 4, 1 / 3, 1 / 2, 1) * v_abcde * v_dtn)
  df.mm$y[i] <- (1 / mat) * temp_y_sum
}

# Draw graph
# NOTE(review): x11() requires an X11-capable interactive session;
# dev.new() is the portable alternative
x11(width = 8, height = 6)
plot(df.mkt$mat, df.mkt$zrc, col = "red", cex = 1.5, ylim = c(0, 0.12),
     xlab = "Maturity", ylab = "Interest Rate", lwd = 10,
     main = "Monthly Maximum Smoothness Forward Rates and Spot Rates")
text(df.mkt$mat, df.mkt$zrc - c(0.015, -0.01, rep(0.01, 3)),
     labels = c("3M", "1Y", "3Y", "5Y", "10Y"), cex = 1.5)
lines(df.mm$mat, df.mm$y,   col = "blue",  lwd = 5)
lines(df.mm$mat, df.mm$fwd, col = "green", lwd = 10)
legend("bottomright",
       legend = c("Spot Curve", "Forward Curve", "Input Spot Rates"),
       col = c("blue", "green", "red"), pch = c(15, 15, 16),
       border = "white", box.lty = 0, cex = 1.5)
# Cross-check: mA is square (n*n x n*n) with exactly n*n constraint rows,
# so the constraints alone determine the coefficients. Solving mA x = vB
# directly should reproduce the coefficient part of the KKT solution XX.
XX2 <- solve(mA) %*% vB
cbind(XX[1:(n * n)], XX2, XX[1:(n * n)] - XX2)
sum(abs(XX[1:(n * n)] - XX2))  # should be ~0 up to numerical error
R Packages for Data Science, you’ll learn about the tidyverse library in this lesson, which is a collection of R tools that you can use to manipulate your datasets.
You’ll also discover how to use some of the dplyr package’s key functions to select and filter data.
An R package is a collection of code, data, documentation, and tests that is easy to share.
The enormous number of packages available in R is one of the reasons for its popularity.
There’s a good probability that someone else has previously solved a problem identical to yours, and you may take advantage of utilizing those r packages.
The tidyverse library, which is a collection of fundamental R programs for data research, will be used extensively in this post.
The tidyverse library’s core contains packages that you’re likely to utilize in your daily data analysis.
There are four sections to the tidyverse library:
Funnel Chart in R-Interactive Funnel Plot »
Dplyr and tidyr are two packages in the Data Wrangling and Transformation category.
The pipe operator may be used to mix several functions, which is the package’s primary advantage.
This package does it all, from filtering to grouping data.
The readr package belongs to the Data Import and Management group. This package handles the problem of converting a flat file, such as a.csv, to a tibble.
“purrr” is a package from the Functional Programming group. This package provides a consistent set of tools for functional programming, most notably the map() family of functions for applying a function to each element of a list or vector.
The ggplot2 package is part of the Data Visualization and Exploration group.
ggplot2 is popular among data scientists for creating charts and visualizations like box plots, density plots, violin plots, tile plots, and time series plots.
One Sample Analysis in R » Quick Guide »
The tidyverse’s dplyr package includes methods for performing some of the most popular actions when working with data.
The following are the five most important dplyr functions:
The select() function chooses variables based on their names.
The filter() function filters observations based on their values.
The summarise() function computes summary statistics.
The arrange() function rearranges the rows.
The mutate() function creates new variables.
We provided a full explanation for the dplyr package in one of our older posts, which you can read by following the link below.
Data Manipulation Techniques with dplyr »
You learned in this tutorial that the tidyverse packages, such as dplyr, tidyr, readr, purr, and ggplot2, provide a plethora of capabilities for data analysis.
Selecting, filtering, summarizing, organizing, and modifying are some of the most frequent operations you’ll do when working with data, and you can even combine functions using the pipe operator to generate more powerful operations.
tidyverse in r – Complete Tutorial » Unknown Techniques
The post R Packages for Data Science appeared first on finnstats.
The current 2021 french administrative limits database (Adminexpress from IGN) is more detailed than the original version (from 50 MB zipped in 2017 to 500 MB zipped now), thanks to a more detailed geometry being currently based on the BDTOPO. However we don’t always need large scale details especially for web applications. The commune layer itself is a huge 400 MB shapefile not really usable for example in a small scale leaflet map.
Using sf::st_simplify()
in R or a similar command in QGIS on these shapefiles would create holes or overlapping polygons, shapefiles not being topologically aware. We could probably convert to lines, build topology, simplify, clean, build polygons in GRASS or ArcGis, but it’s quite a hassle…
A nice solution is using Mapshaper on mapshaper.org, or better for reproducibility using {mapshaper} in R. For such large dataset it is advised to use a node.js install instead of relying on the package’s embedded version.
On Debian-like :
> sudo apt-get install nodejs npm
or on windows : install https://nodejs.org/. If needed add C:\Users\xxxxxxxx\AppData\Roaming\npm to your $PATH.
> npm config set proxy "http://login:password@proxy:8080" # if necessary > npm install -g mapshaper
For ms_simplify()
we will set sys = TRUE
to take advantage of the node.js executable. Experiment with the other parameters to get a resolution that suits you. Here we use Visvalingam at 3%, squeezing the commune layer from 400 MB to 30 MB. From here we rebuild departement, region and epci with ms_dissolve()
commands. Then we join back with original attributes and export in a geopackage with some metadata.
library(tidyverse)
library(sf)
library(rmapshaper)
library(geojsonio)
library(janitor)
library(fs)

# ADMIN EXPRESS COG France entière édition 2021 (in WGS84)
# ftp://Admin_Express_ext:Dahnoh0eigheeFok@ftp3.ign.fr/ADMIN-EXPRESS-COG_3-0__SHP__FRA_WM_2021-05-19.7z
# also available on :
# http://files.opendatarchives.fr/professionnels.ign.fr/adminexpress/ADMIN-EXPRESS-COG_3-0__SHP__FRA_WM_2021-05-19.7z

# originals ---------------------------------------------------------------

source_ign <- "~/sig/ADMINEXPRESS/ADMIN-EXPRESS-COG_3-0__SHP__FRA_2021-05-19/ADMIN-EXPRESS-COG/1_DONNEES_LIVRAISON_2021-05-19/ADECOG_3-0_SHP_WGS84G_FRA"

# read each layer and normalize column names to snake_case
com  <- source_ign %>% path("COMMUNE.shp")     %>% read_sf() %>% clean_names()
dep  <- source_ign %>% path("DEPARTEMENT.shp") %>% read_sf() %>% clean_names()
reg  <- source_ign %>% path("REGION.SHP")      %>% read_sf() %>% clean_names()
epci <- source_ign %>% path("EPCI.shp")        %>% read_sf() %>% clean_names()

# simplify ----------------------------------------------------------------

check_sys_mapshaper()  # verify the node.js mapshaper executable is available

# ~6 min; conversion to geojson_json avoids encoding problems
com_simpl <- com %>%
  geojson_json(lat = "lat", lon = "long", group = "INSEE_COM",
               geometry = "polygon", precision = 6) %>%
  ms_simplify(keep = 0.03, method = "vis", keep_shapes = TRUE, sys = TRUE)

# rebuild the higher-level layers by dissolving simplified communes
dep_simpl  <- com_simpl %>% ms_dissolve(field = "insee_dep",  sys = TRUE)
reg_simpl  <- com_simpl %>% ms_dissolve(field = "insee_reg",  sys = TRUE)
epci_simpl <- com_simpl %>% ms_dissolve(field = "siren_epci", sys = TRUE)

# add attributes and export ----------------------------------------------

destination <- "~/donnees/ign/adminexpress_simpl.gpkg"

com_simpl %>%
  geojson_sf() %>%
  st_write(destination, layer = "commune",
           layer_options = c("IDENTIFIER=Communes Adminexpress 2021 simplifiées",
                             "DESCRIPTION=France WGS84 version COG (2021-05). Simplification mapshaper."))

dep_simpl %>%
  geojson_sf() %>%
  left_join(st_drop_geometry(dep), by = "insee_dep") %>%
  st_write(destination, layer = "departement",
           layer_options = c("IDENTIFIER=Départements Adminexpress 2021 simplifiés",
                             "DESCRIPTION=France WGS84 version COG (2021-05). Simplification mapshaper."))

reg_simpl %>%
  geojson_sf() %>%
  left_join(st_drop_geometry(reg), by = "insee_reg") %>%
  st_write(destination, layer = "region",
           layer_options = c("IDENTIFIER=Régions Adminexpress 2021 simplifiées",
                             "DESCRIPTION=France WGS84 version COG (2021-05). Simplification mapshaper."))

epci_simpl %>%
  geojson_sf() %>%
  mutate(siren_epci = str_remove(siren_epci, "200054781/")) %>%  # remove Grand Paris
  left_join(st_drop_geometry(epci), by = c("siren_epci" = "code_siren")) %>%
  st_write(destination, layer = "epci",
           layer_options = c("IDENTIFIER=EPCI Adminexpress 2021 simplifiés",
                             "DESCRIPTION=Établissement public de coopération intercommunale France WGS84 version COG (2021-05). Simplification mapshaper."))
With shiny you can add an interactive web interface to your R work, enabling also users without a technical background to profit from your analysi...
Continue reading: Learn to ‘Make a Shiny App sparkle’ with us]]>Find out how to customize a shiny
app and make it more accessible to users with our workshop on 17/11.
With shiny
you can add an interactive web interface to your R
work, enabling also users without a technical background to profit from your analysis and make data-driven decisions. Despite being a very powerful tool, a basic usage of shiny
may not deliver its full potential. Thanks to many new packages ever expanding the shiny
universe, and to a clever organization of the UI, however, it is possible to make the content of your app more user friendly and therefore more effective.
In our 3-hour hands-on workshop on Wednesday November 17th at 2 pm CET we will explore how to ‘Make a Shiny App sparkle’ by playing with aesthetics, HTML
and CSS
customization, experimenting with other shiny
-related packages, and having a look at different UI structures, layouts and ways to organize the content.
This workshop is a natural continuation of the ‘Build your first Shiny App’ workshop, and it is part of the “Shiny” learning path. However, it can also be attended as a stand alone workshop. Only minimal knowledge of shiny
is expected, whereas no HTML
or JavaScript
skills are required.
Register at this link before 10/11 to benefit from the early bird discount.
Welcome to the second part of the forester blog. In the previous part, we explained the main idea of the forester package, the motivations behind it, its advantages, and the innovations it brings to the ML world. You should definitely check it out! In this part, however, we will focus on showing the wide range of possibilities of the forester package and things you can achieve with it. We will present you the main functions of the package with their parameters and show how you can use them in your problems.
The forester package capsulizes important steps in the ML pipeline. We discussed each step in the previous part using the graph below.
Now we will try to explain how our package exactly works and what happens between the first and the last step of the process. The basic scheme of functions of our package is presented on the graph below. Notice how the colors below match particular steps on the graph above. The only thing user has to do to create a model is to run one function. Then the data preprocessing is performed. In the next step, the forester package creates models and then tunes them. Finally, the models are evaluated and compared so that the best model can be chosen. The forester package returns the DALEX object so the user is able to easily create various plots to explain the model’s predictions. We will now look closer at the particular functions of our pipeline and show what you can achieve with them.
Make functions are arguably the core of our package. Their goal is to simplify the process of creating models. Using those functions, you’re able to create basic tree-based models in just a few seconds. And simply by choosing the right arguments, you can also perform simple data preprocessing and even train your model. Let’s see how it works.
First, we’ll need some data.
# Loading libraries
library(DALEX)
library(forester)

# Creating train and test set: shuffle 5000 FIFA players, then split 4000/1000
data_shuffled <- fifa[sample(1:5000, 5000), ]
data_train <- head(data_shuffled, 4000)
data_test <- data_shuffled[4001:5000, ]
Then using this line of code you can create basic models. We will focus on ranger models, but everything works the same for the rest of the models.
# Fit a baseline ranger (random forest) regression model predicting "overall"
basic_model <- make_ranger(data = data_train, target = "overall", type = "regression", label = "Basic Ranger")
You can also perform simple data preprocessing by selecting the right parameters. You can decide what you want to do with missing data, whether you want to delete or fill it. Moreover, if you choose how many features you want to use in training, then, by using Boruta package, most important features will be chosen. In this example we will use ten most important features and we will fill NA values.
# Same model with preprocessing: fill NA values and keep the 10 most
# important features (selected via the Boruta package)
prep_model <- make_ranger(data = data_train, target = "overall", type = "regression", fill_na = TRUE, num_features = 10, label = "Prep Ranger")
Moreover, you can easily tune your models by setting tune = TRUE. You can (but don’t have to) choose a metric, based on which the model will be evaluated during the tuning process.
# Same preprocessed model with hyperparameter tuning enabled (tune = TRUE)
tuned_model <- make_ranger(data = data_train, target = "overall", type = "regression", fill_na = TRUE, num_features = 10, tune = TRUE, label = "Tuned Ranger")
After creating several different models, you may want to compare them and see which one is the best. forester provides an evaluate function that scores all models, creates a table with different metrics, and selects the best model based on the chosen metric. We can use this function to see which of our three models is the best. We have to provide the test set, of course. We can see that both preprocessing and tuning improved our model’s score.
# Evaluate all three models on the held-out test set; the returned list
# contains both a metrics table and the best model
evaluation <- evaluate(basic_model, prep_model, tuned_model,
                       data_test = data_test, target = "overall")
best_model <- evaluation$best_model
Evaluate function returns a list, that contains the best model and a data frame with the results of the evaluation, so to use best_model we have to extract it first.
Now that we know which model is the best, we can use explainable artificial intelligence (XAI) methods to explain our model. Because the forester is well adjusted to the DALEX package, we can do it easily by using functions from the DALEX package. In this example we will create a Break Down profile and a Feature Importance plot. To read more about those methods, I recommend this blog about XAI methods.
### Feature importance
# compute permutation-based feature importance with DALEX and plot the top 5
best_model <- evaluation$best_model
mp <- model_parts(best_model)
plot(mp, max_vars = 5)
### Break Down profile
# pick a single observation (drop = FALSE keeps it a one-row data frame)
nobs <- data_test[150, , drop = FALSE]
pp <- predict_parts(best_model, new_observation = nobs, type = "break_down")
plot(pp, max_vars = 5)
If you don’t want to choose a model on your own then you can use the forester function, which performs the whole pipeline for all available models in one function, and then chooses the best one. This function has very similar arguments to earlier mentioned make functions.
# Run the full forester pipeline for all available models and pick the best
f_model <- forester(data = data_train, target = "overall", type = "regression")
As we can see, using the forester package is very simple, in fact it only takes one line of code to create a tree-based model without getting into the data and its processing, and the final object is easy to use with the DALEX package. You can read more about our package on the GitHub repository.
If you are interested in other posts about explainable, fair, and responsible ML, follow #ResponsibleML on Medium.
In order to see more R related content visit https://www.r-bloggers.com
Guide through jungle of models! What’s more about the forester R package? was originally published in ResponsibleML on Medium, where people are continuing the conversation by highlighting and responding to this story.
Shiny Basics, you will learn two standards for constructing a simple Shiny application in this tutorial.
You’ll also learn how the program is structured, the components of the user interface, and where to put application logic.
How to Calculate Root Mean Square Error (RMSE) in R »
To summarise, a Shiny application consists of two major components.
The first component is the user interface, or UI, which is a web page that shows the information. It is composed of HTML components that you design with Shiny functions.
The Server is the second component. This is the location of the application logic.
Let’s make a simple Shiny application.
Here is the basic structure and to run a Shiny application. This script’s name is app.R, but you can call it whatever you like.
How to Measure Heteroscedasticity in Regression? »
Call shinyApp() in the final line of code, passing in the empty UI and server variables.
With the presence of the shinyApp() function in your code, the Run command transforms to Run App.
You can now launch the app by clicking the Run App button.
The console will have some output at this stage.
When you press the Run App button, the runApp() method is invoked. The app’s link will appear below it.
Stopping the app is as simple as clicking the Stop button.
It is important to note that the runApp() function should not be placed within the Shiny code.
Another method for creating a Shiny application is to split the UI and server code into two different files.
This is the preferred method for writing Shiny apps as they become more complicated.
How to Calculate Mean Absolute Percentage Error (MAPE) in R »
The server-side application logic code is written in a file called server.R, and the UI-specific code is written in a file called ui.R.
It is important to note that a call to shinyApp() is not required at the conclusion of either the server.R or ui.R files.
The ui.R and server.R files must be combined into a single folder.
When RStudio sees the two files combined, it recognizes them as a Shiny application.
Panel functions are used to arrange UI elements into a single panel.
For example, if you want text input and numeric input boxes on the side, you can put them in a sidebar panel.
Log Rank Test in R-Survival Curve Comparison »
Here is a list of panels that are currently available.
Panel routines return HTML div tags with certain bootstrap class characteristics.
In the application layout, you use the Layout methods to organize panels containing UI elements.
Here are several examples:
The fluidRow() function generates a fluid page layout consisting of rows and columns.
Rows ensure that all of their items are on the same line (if the browser has adequate width.)
Columns specify how much horizontal space its items should occupy inside a 12-unit wide grid. Fluid pages automatically scale their components to span the entire browser width.
The flowLayout() function arranges components from left to right and top to bottom. The sidebarLayout() function arranges items in a layout that includes a sidebar and a main section.
By default, the splitLayout() function arranges elements horizontally, splitting the available horizontal space into equal parts.
The verticalLayout() function, on the other hand, produces a container with one or more rows of content.
Control widgets are web elements with which users can interact. They enable users to provide input to the Server, which then executes the logic.
How to Remove Duplicates in R with Example »
When the user changes the widget values, the output matching to the input is also changed.
The Shiny package has a plethora of pre-built widget functions, which not only makes it easier to construct widgets but also improves their appearance.
Date input, file input, slider input, and more examples are provided. When input widgets update, the Server executes the logic and sends the result back to the corresponding UI output components.
Assume you have a program that computes the square of a number. Numeric input and output UI components are required.
You can transfer data to the server using the numeric input widget in the UI. The Server will compute the square of the input and return the result to the UI output element as shown.
You learned in this video that there are two standards for developing Shiny application files: a single file or two files named server.R and ui.R in a single folder.
You also learned about the UI components, including HTML tags, layouts, and widgets, as well as how the UI elements interact with the application logic.
How to Perform Dunnett’s Test in R » Post-hoc Test »
The post Shiny Basics-Introduction appeared first on finnstats.
What is the Atkinson index?
The Atkinson index, introduced by Atkinson (1970) (Reference 1), is a measure of inequality used in economics. Given a population with values $x_1, \dots, x_n$ and an inequality-aversion parameter $\varepsilon \ge 0$, the Atkinson index is defined as $A_\varepsilon(x) = 1 - \frac{1}{\bar{x}} \left( \frac{1}{n} \sum_{i=1}^n x_i^{1-\varepsilon} \right)^{1/(1-\varepsilon)}$ for $\varepsilon \neq 1$, and $A_1(x) = 1 - \frac{1}{\bar{x}} \left( \prod_{i=1}^n x_i \right)^{1/n}$, where $\bar{x}$ denotes the mean of the $x_i$.
If we denote the Hölder (power) mean by $M_p(x) = \left( \frac{1}{n} \sum_{i=1}^n x_i^p \right)^{1/p}$,
then the Atkinson index is simply $A_\varepsilon(x) = 1 - \dfrac{M_{1-\varepsilon}(x)}{M_1(x)}$.
While the index can be defined more generally, we restrict $\varepsilon$ to be positive. (Some of the properties in the next section would not hold otherwise.)
Properties of the Atkinson index
The Atkinson index has a number of nice properties:
Some intuition for the Atkinson index
In R, the Atkinson
function in the DescTools
package implements the Atkinson index. It is so simple that I can reproduce the whole function here (most of the function is dedicated to checking for NA values):
# Verbatim reproduction of DescTools::Atkinson (reformatted and bound to a
# name so it can be reused).
#   x         : non-negative values
#   n         : frequency weights (x is replicated n times elementwise)
#   parameter : inequality-aversion epsilon (default 0.5; 1 uses the
#               geometric-mean formula)
#   na.rm     : drop NAs before computing
# Returns NA_real_ for negative or (un-removed) missing input.
atkinson_fn <- function(x, n = rep(1, length(x)), parameter = 0.5, na.rm = FALSE) {
  x <- rep(x, n)
  if (na.rm)
    x <- na.omit(x)
  if (any(is.na(x)) || any(x < 0))
    return(NA_real_)
  if (is.null(parameter))
    parameter <- 0.5
  if (parameter == 1) {
    # epsilon = 1: one minus the ratio of geometric mean to arithmetic mean
    A <- 1 - (exp(mean(log(x))) / mean(x))
  } else {
    x <- (x / mean(x))^(1 - parameter)
    A <- 1 - mean(x)^(1 / (1 - parameter))
  }
  A
}
To get some intuition for the Atkinson index, let’s look at the index for a population consisting of just 2 people. By homogeneity, we can assume that the first person has value 1; we will denote the second person’s value by x
. We plot the Atkinson index for the population $(1, x)$ with $\varepsilon = 1$, with $x$ ranging from $10^{-4}$ to $10^{4}$:
library(DescTools)

# Atkinson index for a two-person population (1, x), epsilon = 1
x <- 10^(-40:40 / 10)
eps <- 1
atkinsonIndex <- sapply(x, function(x) Atkinson(c(1, x), parameter = eps))

# log10 x axis
par(mfrow = c(1, 2))
plot(x, atkinsonIndex, type = "l", log = "x",
     ylab = "Atkinson index for (1, x)",
     main = "Atkinson index, eps = 1 (log x-axis)")

# regular x axis
plot(x, atkinsonIndex, type = "l", xlim = c(0, 1000),
     ylab = "Atkinson index for (1, x)",
     main = "Atkinson index, eps = 1 (linear x-axis)")
The two plots show the same curve, with the only difference being the x-axis (log scale on the left, linear scale on the right). The curve is symmetric around $x = 1$ when the x-axis is on the log scale. We expect this since, by homogeneity, the index for $(1, x)$ is the same as the index for $(1, 1/x)$.
Next, we look at the Atkinson index for $(1, x)$ for a range of values of the parameter $\varepsilon$:
# Atkinson index of (1, x) for several values of epsilon
x <- 10^(0:40 / 10)
epsList <- 10^(-2:2 / 4)

# empty plot frame; curves are added one per epsilon
plot(c(1, 10^4), c(0, 1), log = "x", type = "n", xlab = "x",
     ylab = "Atkinson index for (1, x)",
     main = "Atkinson index for various epsilon")
for (i in seq_along(epsList)) {
  atkinsonIndex <- sapply(x, function(x) Atkinson(c(1, x), parameter = epsList[i]))
  lines(x, atkinsonIndex, col = i, lty = i, lwd = 2)
}
legend("topleft", legend = sprintf("%.2f", epsList),
       col = seq_along(epsList), lty = seq_along(epsList), lwd = 2)
The larger $\varepsilon$ is, the more “inequality-averse” we are. For fixed $x$, the Atkinson index for $(1, x)$ increases as $\varepsilon$ increases.
Finally, let’s look at what values the Atkinson index might take for samples taken from different distributions. In each of the panels below, we take 100 samples, each of size 1000. The samples are drawn from a log-t distribution with a given degrees of freedom (that is, the log of the values follows a t distribution). For each of these 100 samples, we compute the Atkinson index (with the default $\varepsilon = 0.5$), then make a histogram of the 100 index values. (The t distribution with df = 50 is basically indistinguishable from the standard normal distribution.)
# Histograms of Atkinson indices for log-t samples at several df values
nsim <- 100          # number of samples per df
n <- 1000            # size of each sample
dfList <- c(50, 10, 5, 3)

png("various-t-df.png", width = 900, height = 700, res = 120)
par(mfrow = c(2, 2))
set.seed(1)  # reproducible simulation
for (dfVal in dfList) {
  atkinsonIndices <- replicate(nsim, Atkinson(exp(rt(n, df = dfVal))))
  hist(atkinsonIndices, xlim = c(0, 1), xlab = "Atkinson Index",
       main = paste("Histogram of Atkinson indices, df =", dfVal))
}
dev.off()
References:
A few weeks ago, I introduced a Forecasting API that I deployed on Heroku. Under the hood, this API is built on top of ahead
(and through Python packages rpy2 and Flask); an R package for univariate and multivariate time series forecasting. As of October 13th, 2021, 5 forecasting methods are implemented in ahead
:
armagarchf
: univariate time series forecasting method using simulation of an ARMA(1, 1) – GARCH(1, 1)dynrmf
: univariate time series forecasting method adapted from forecast::nnetar
to support any
Statistical/Machine learning model (such as Ridge Regression, Random Forest, Support Vector Machines, etc.)eatf
: univariate time series forecasting method based on combinations of forecast::ets
, forecast::auto.arima
, and forecast::thetaf
ridge2f
: multivariate time series forecasting method, based on quasi-randomized networks and presented in this papervarf
: multivariate time series forecasting method using Vector AutoRegressive model (VAR, mostly here for benchmarking purpose)Here’s how to install the package:
1st method: from R-universe
In R console:
# Register the techtonique R-universe repository alongside CRAN, then install
options(repos = c(
  techtonique = "https://techtonique.r-universe.dev",
  CRAN = "https://cloud.r-project.org"
))
install.packages("ahead")
2nd method: from Github
In R console:
# Install the development version of ahead from GitHub via devtools
devtools::install_github("Techtonique/ahead")
Or
# Equivalent install using the lighter-weight remotes package
remotes::install_github("Techtonique/ahead")
And here are the packages that will be used for this demo:
# Packages used for this demo
library(ahead)         # forecasting methods (dynrmf, ridge2f, varf, ...)
library(fpp)           # example multivariate series (insurance)
library(datasets)      # base datasets (fdeaths, Nile)
library(randomForest)  # Random Forest regressor passed to dynrmf()
library(e1071)         # SVM regressor passed to dynrmf()
In this section, we illustrate dynrmf
for univariate time series forecasting, using Random Forest and SVMs. Do not hesitate to type ?dynrmf
,
?armagarchf
or ?eatf
in R console for more details and examples.
par(mfrow = c(2, 2))

# Plotting forecasts
# With a Random Forest regressor, an horizon of 20,
# and a 95% prediction interval
plot(dynrmf(fdeaths, h = 20, level = 95,
            fit_func = randomForest::randomForest,
            fit_params = list(ntree = 50), predict_func = predict))

# With a Support Vector Machine regressor, an horizon of 20,
# and a 95% prediction interval
plot(dynrmf(fdeaths, h = 20, level = 95, fit_func = e1071::svm,
            fit_params = list(kernel = "linear"), predict_func = predict))

# Same two regressors on the Nile series
plot(dynrmf(Nile, h = 20, level = 95,
            fit_func = randomForest::randomForest,
            fit_params = list(ntree = 50), predict_func = predict))
plot(dynrmf(Nile, h = 20, level = 95, fit_func = e1071::svm,
            fit_params = list(kernel = "linear"), predict_func = predict))
In this section, we illustrate ridge2f
and varf
forecasting for multivariate time series.
Do not hesitate to type ?ridge2f
or ?varf
in R console for more details on both functions.
# Forecast using ridge2
# With 2 time series lags, an horizon of 10, and a 95% prediction interval
fit_obj_ridge2 <- ahead::ridge2f(fpp::insurance, lags = 2, h = 10, level = 95)

# Forecast using VAR (benchmark)
fit_obj_VAR <- ahead::varf(fpp::insurance, lags = 2, h = 10, level = 95)

# Plotting forecasts
# fpp::insurance contains 2 time series, Quotes and TV.advert
par(mfrow = c(2, 2))
plot(fit_obj_ridge2, "Quotes")
plot(fit_obj_VAR, "Quotes")
plot(fit_obj_ridge2, "TV.advert")
plot(fit_obj_VAR, "TV.advert")
The {emayili}
package supports configuring a generic SMTP server via the server()
function. In the most recent version, v0.6.5
, we add three new functions, gmail()
, sendgrid()
and mailgun()
, which provide specific support for Gmail, SendGrid and Mailgun.
# Load emayili and enable verbose envelope printing.
library(emayili)
options(envelope.details = TRUE)
options(envelope.invisible = FALSE)
packageVersion("emayili")
# Fix: the REPL output below was pasted into the listing as bare code,
# which does not parse; keep it as an output comment instead.
#> [1] '0.6.5'
Gmail is a popular email service that’s rather pervasive and does not require an introduction. The gmail()
function makes it possible to easily use the Gmail SMTP server, requiring only username and password for authentication.
# Using server(): spell out the Gmail SMTP host and port explicitly.
smtp <- server(
  host = "smtp.gmail.com",
  port = 587,
  username = "bob@gmail.com",
  password = "bd40ef6d4a9413de9c1318a65cbae5d7"
)

# Using gmail(): equivalent shortcut with host/port baked in.
smtp <- gmail(
  username = "bob@gmail.com",
  password = "bd40ef6d4a9413de9c1318a65cbae5d7"
)
By default gmail()
will use port 587 (TLS), but you can specify the port
argument if for some reason you prefer to use port 465 (SSL).
SendGrid is a cloud-based service for sending and managing email at scale.
To send an email via SendGrid from {emayili}
use the sendgrid()
function. You’ll need to first get an API key, which will then be used as the SMTP password.
# Authenticate to SendGrid's SMTP relay; the API key is used as the password.
smtp <- sendgrid(
  password = "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"
)
Below is the lightly redacted raw content of a message sent by Bob (bob@gmail.com) to Alice (alice@gmail.com) via SendGrid using {emayili}
.
Delivered-To: alice@gmail.com Received: by 2002:a92:dc07:0:0:0:0:0 with SMTP id t7csp5413809iln; Thu, 14 Oct 2021 00:42:07 -0700 (PDT) X-Google-Smtp-Source: ABdhPJzUkPUh4K+kyOdnqDyLpZy61eYgtm91lhZddP6z3nlPOIX6Sprs0pXjtaP2UiGjSnojckAO X-Received: by 2002:a25:e755:: with SMTP id e82mr4573229ybh.528.1634197327823; Thu, 14 Oct 2021 00:42:07 -0700 (PDT) Received: from xtrwkvxq.outbound-mail.sendgrid.net (xtrwkvxq.outbound-mail.sendgrid.net. [167.89.24.164]) by mx.google.com with ESMTPS id n62si2129988ybn.282.2021.10.14.00.42.07 for <alice@gmail.com> (version=TLS1_3 cipher=TLS_AES_128_GCM_SHA256 bits=128/128); Thu, 14 Oct 2021 00:42:07 -0700 (PDT) Received: by filterdrecv-64fcb979b9-ttm9r with SMTP id filterdrecv-64fcb979b9-ttm9r-1-6167DF4E-7 2021-10-14 07:42:06.149765894 +0000 UTC m=+2619921.806202168 Received: from propane (unknown) by geopod-ismtpd-6-1 (SG) with ESMTP id WQM06PcSTEGg9g6DXD15-Q for <alice@gmail.com>; Thu, 14 Oct 2021 07:42:06.009 +0000 (UTC) Date: Thu, 14 Oct 2021 07:42:06 +0000 (UTC) X-Mailer: {emayili}-0.6.4 MIME-Version: 1.0 From: bob@gmail.com Subject: Mail via SendGrid Content-Type: text/plain; charset=us-ascii; format=flowed Content-Disposition: inline Content-Transfer-Encoding: 7bit Content-MD5: lS0sVtBIWVgzZ0e83ZhZDQ== Message-ID: <WQM06PcSTEGg9g6DXD15-Q@geopod-ismtpd-6-1> X-SG-EID: =?us-ascii?Q?LCYZPn5Un+WXHvrSNyN5bUctJrIgrTLrkNMw3pKU=2FxUypxaYVR4293QfCj3U9f?= =?us-ascii?Q?KknTRh0vtKYz3cREfpwZJXAA0MdhlRN=2FTOlo5Rv?= =?us-ascii?Q?WMV2qu7qgo3Vx01e2DuJDPvmzuG1NNVsN=2FAdfmO?= =?us-ascii?Q?TXWQ1kY=2FoRp24Dekt2E8fZSr=2Fc0Bo6ci0KOdkpH?= =?us-ascii?Q?Zxno0nvvAGn4GhTvOs0E3kvZmCRWsQyQffw=3D=3D?= To: alice@gmail.com X-Entity-ID: yskshUftiOQOXDmOqHb1EA== Hello!
The salient bits of information to extract from the email headers are:
outbound-mail.sendgrid.net
(see the Received
field) andX-SG-EID
field which identifies the message as being sent by SendGrid. The value of this field is encoded according to RFC 2047.Mailgun is another cloud-based service for sending and managing emails. To use Mailgun with {emayili}
you’ll first need to register a sending domain. This will then be assigned a username and password, which you’ll specify in the call to mailgun()
.
# Authenticate to Mailgun with the credentials assigned to the sending domain.
smtp <- mailgun(
  username = "postmaster@sandbox9ptce35fdf0b31338dec4284eb7aaa59.mailgun.org",
  password = "44d072e7g2b5f3bf23b2b642da0fe3a7-2ac825a1-a5be680a"
)
Here’s the simplified raw content of a message sent by Bob (bob@gmail.com) to Alice (alice@gmail.com) via Mailgun using {emayili}
.
Delivered-To: alice@gmail.com Received: by 2002:a92:dc07:0:0:0:0:0 with SMTP id t7csp5413647iln; Thu, 14 Oct 2021 00:41:51 -0700 (PDT) X-Received: by 2002:ad4:4b63:: with SMTP id m3mr4009333qvx.28.1634197311440; Thu, 14 Oct 2021 00:41:51 -0700 (PDT) Received: from m228-62.mailgun.net (m228-62.mailgun.net. [159.135.228.62]) by mx.google.com with UTF8SMTPS id az43si1987716qkb.315.2021.10.14.00.41.51 for <alice@gmail.com> (version=TLS1_3 cipher=TLS_AES_128_GCM_SHA256 bits=128/128); Thu, 14 Oct 2021 00:41:51 -0700 (PDT) X-Mailgun-Sending-Ip: 159.135.228.62 X-Mailgun-Sid: WyIyNDFkOCIsICJhbmRyZXdAZmF0aG9tZGF0YS5kZXYiLCAiNzY5ZTg3Il0= Received: from propane (host-92-12-243-150.as13285.net [92.12.243.150]) by smtp-out-n03.prod.us-west-2.postgun.com with SMTP id 6167df3ff3e5b80f1f369570 (version=TLS1.3, cipher=TLS_AES_128_GCM_SHA256); Thu, 14 Oct 2021 07:41:51 GMT Sender: bob=gmail.com@mail.gmail.com Message-Id: <20211014074151.535b7df0826d28e2@mail.gmail.com> Date: Thu, 14 Oct 2021 07:41:50 GMT X-Mailer: {emayili}-0.6.4 MIME-Version: 1.0 To: alice@gmail.com From: bob@gmail.com Subject: Mail via Mailgun Content-Type: text/plain; charset=utf-8; format=flowed Content-Disposition: inline Content-Transfer-Encoding: 7bit Content-MD5: lS0sVtBIWVgzZ0e83ZhZDQ== Hello!
The important pieces of information to notice in the email headers are:
Received
fields) andX-Mailgun-Sid
field which identifies the message as being sent by Mailgun.We’ll shortly be adding support for other services like Sendinblue and MailerSend.
An elegant application of Functional Data Analysis is to model longitudinal data as a curve and then study the curve’s dynamics. For example, in pharmacokinetics and other medical studies analyzing multiple measurements of drug or protein ...
Continue reading: FDA and the Dynamics of Curves]]>An elegant application of Functional Data Analysis is to model longitudinal data as a curve and then study the curve’s dynamics. For example, in pharmacokinetics and other medical studies analyzing multiple measurements of drug or protein concentrations in blood samples, it may be interest to determine if the concentrations in subjects undergoing one type of treatment rise quicker than those undergoing an alternative treatment. In this post, I will generate some plausible fake data for measurements taken over time for two groups of subjects, use the techniques of Functional Data Analysis to represent these data as a continuous curve for each subject, and look at some of the dynamic properties of the curves, in particular their velocities and accelerations.
The fdaoutlier package contains functions to generate a number of stochastic models with a mechanism to generate reasonable outliers for each type of model. The curves produced by model 4 look like they can serve as plausible synthetic concentration curves. Instead of thinking of normal curves and outliers, I imagine the two related sets of curves to be the results of two different treatments influencing some measured concentration curve. In the example below, the curves associated with treatment 2 are of the form: \[X_i(t) = \mu t(1 - t)^m + e_i(t),\] while those associated with treatment 1 are of the form: \[X_i(t) = \mu(1 - t)t^m + e_i(t)\] where:
library(fdaoutlier) library(tidyverse) library(fda) library(gganimate) library(gridExtra)
The following code generates two different sets of longitudinal data from Model 4 which is described in the vignette to the fdaoutlier package.
set.seed(95139)
n_curves <- 100  # number of simulated subjects (50 per treatment)
n_obs <- 50      # number of observations per curve

# Model 4 from the fdaoutlier vignette; the "outliers" (rate = .5) are
# re-interpreted here as a second treatment group.
mod4 <- simulation_model4(n = n_curves, p = n_obs,
                          outlier_rate = .5, seed = 50, plot = FALSE)

index <- 1:n_curves
index1 <- mod4$true_outliers  # indices of the curves assigned to treatment 1

# curves_mat is an n_curves x n_obs matrix
curves_mat <- mod4$data

# Default every curve to treatment 2.
# Fix: the vector needs one entry per curve (n_curves); the original
# rep(2, n_obs) was length 50 and only worked via silent recycling.
treat <- rep(2, n_curves)
curves <- data.frame(index, treat, curves_mat)
curves <- curves %>%
  mutate(treat = if_else((index %in% index1), 1, 2))
There are 50 curves for each treatment and 50 points for each curve. This is probably the simplest case possible, and it is good enough to show how to explore the dynamics of curves. However, if you work with real longitudinal data you know that things are rarely this simple. But please be assured that FDA can deal with considerably more complexity, including a variable number of measurements for each subject, different measurement times for each subject, and situations where you have far fewer than 50 points for each subject. I have explored some of these situations in previous posts. For example, look here to see how to work with different time points, and here for some ideas for working with sparse data. The first link above also points to basic references that should help you to get started with your data.
Now, I reformat the data into a long form data frame and plot both sets of curves.
time <- 1:n_obs

# Reshape to long form: one row per (curve, time) pair.
# Fix: repeat the time grid once per curve (n_curves) rather than the
# hard-coded 100, so the code survives a change in the number of curves.
curves_l <- pivot_longer(curves, cols = !c("index", "treat"),
                         names_to = "Xval") %>%
  mutate(time = rep(time, n_curves), .before = "treat",
         treat = as.factor(treat)) %>%
  dplyr::select(-Xval)

# Plot both treatment groups on one panel, one line per curve.
p <- curves_l %>%
  ggplot(aes(time, value, color = treat)) +
  geom_line(aes(group = index)) +
  scale_color_manual(values = c("navy blue", "dark grey")) +
  ggtitle("Model 4 Curves")
p
So far we just have plots based on a relatively small number of data points. Nevertheless, our eyes extrapolate, and we imagine seeing two sets of continuous curves with one set clearly rising to its maximum value faster than the other set. But we really don’t have continuous curves yet, we just have points.
The next step is to invoke the mathematics of FDA to embed the points in an infinite dimensional vector space where the data for each subject is modeled by a continuous function (curve). Exactly how this happens is a bit involved, but the general idea is that we create a basis and then use the data and some techniques from the linear algebra of Hilbert spaces to estimate the time dependent coefficients that enable modeling each subject’s data points as a linear combination of the basis functions. So, from here on out we are not going to be working with the raw data anymore. We will be working with vector space models of the data. The upside is that we can now have real curves (continuous functions), and can calculate the values of the functions and other properties such as the first and second derivatives at any time point. If you are interested in the math, please have a look at the references listed at the bottom of my post on Functional PCA.
Here we use the function fda::create.bspline.basis()
to create a B-Spline basis covering the interval of our observations. The plot of the curves above indicates that specifying a knot at every multiple of 5 ought to be adequate for representing our data. Note that using cubic splines n_order
= 4 ensures that the splines will have continuous first and second derivatives at the knots.
# B-spline basis over the observation window, with a knot every 5 time units.
knots <- c(seq(0, n_obs, 5)) # location of knots
n_knots <- length(knots)     # number of knots
n_order <- 4                 # cubic b-splines: order = degree (3) + 1
n_basis <- length(knots) + n_order - 2
spline_basis <- create.bspline.basis(rangeval = c(0, n_obs),
                                     nbasis = n_basis,
                                     norder = n_order)
#plot(spline_basis)
The following code uses the function fda::Data2fd()
and the spline_basis
to convert the Model 4 data points into functions and produce one fda object for each of the two groups of treatments.
# Split the raw curves by treatment group.
# df1 is an (n_curves/2) x (n_obs) matrix
df1 <- curves_mat[index1,]            # data for treatment 1
index2 <- index[!(index %in% index1)]
df2 <- curves_mat[index2,]            # data for treatment 2

# Use the b-spline basis to represent the curves as vectors in the
# function space: one fd object per group (columns = subjects);
# lambda applies a light roughness penalty.
df1_obj <- Data2fd(argvals = 1:n_obs, y = t(df1),
                   basisobj = spline_basis, lambda = 0.5)
df2_obj <- Data2fd(argvals = 1:n_obs, y = t(df2),
                   basisobj = spline_basis, lambda = 0.5)
Here we evaluate each function along with its first and second derivatives at a finer time scale than we used to originally display our data.
# Evaluate position, velocity (1st derivative) and acceleration
# (2nd derivative) on a finer grid than the original observations.
# Each eval.fd() call returns a 101 x 50 matrix (rows = times,
# columns = curves), flattened here to a vector.
tfine <- seq(0, 50, by = .5)
pos1 <- as.vector(eval.fd(tfine, df1_obj))
pos2 <- as.vector(eval.fd(tfine, df2_obj))
vel1 <- as.vector(eval.fd(tfine, df1_obj, 1))
vel2 <- as.vector(eval.fd(tfine, df2_obj, 1))
acc1 <- as.vector(eval.fd(tfine, df1_obj, 2))
acc2 <- as.vector(eval.fd(tfine, df2_obj, 2))
Note that the velocities and accelerations computed above were returned as matrices. We convert them to vectors and put them into a data frame for plotting.
# Assemble long-form data frames: one row per (time, curve).
# Fix: derive the per-curve block length from tfine instead of the magic
# number 101, which silently breaks if the evaluation grid changes.
n_time <- length(tfine)
time <- rep(tfine, 50)
id1 <- rep(1:50, each = n_time)
id2 <- rep(51:100, each = n_time)
derv1 <- data.frame(time, id1, pos1, vel1, acc1)
derv2 <- data.frame(time, id2, pos2, vel2, acc2)

# Velocity and acceleration panels for each treatment group.
pv1 <- derv1 %>% ggplot(aes(time, vel1, col = id1)) +
  geom_line(aes(group = id1)) + ggtitle("Velocity Treatment 1")
pv2 <- derv2 %>% ggplot(aes(time, vel2, col = id2)) +
  geom_line(aes(group = id2)) + ggtitle("Velocity Treatment 2")
pa1 <- derv1 %>% ggplot(aes(time, acc1, col = id1)) +
  geom_line(aes(group = id1)) + ggtitle("Acceleration Treatment 1")
pa2 <- derv2 %>% ggplot(aes(time, acc2, col = id2)) +
  geom_line(aes(group = id2)) + ggtitle("Acceleration Treatment 2")

grid.arrange(pv1, pa1, pv2, pa2, nrow = 2, ncol = 2,
             padding = unit(1, "line"))
The velocities for treatment 1 head upwards before they decrease, while the velocities for treatment 2 go straight down and then curve up again. The accelerations for treatment 1 slope downward slightly while those of treatment two slope upward. So the plots indicate that the two sets of curves look like they behave differently.
We can test for differences using the function fda::tperm.fd()
which implements a resampling method to do pointwise t-tests. The functional data representing the curves for the two samples are combined in a single array and the labels for the curves are randomly shuffled. Recalculating the maximum value of the t-statistic for each point enables computing a null distribution. Then, at each time point, the observed data are compared with the 1 - \(\alpha\) quantile of the null distribution.
The plot below shows the result of performing the t-test to compare the first derivatives of the two treatments. I use the function fda::deriv.fd()
to calculate the first derivative for each treatment on the fly, just to show another way of doing things in the fda
package. You can easily modify the code to compare accelerations.
# Permutation t-test comparing the first derivatives of the two groups.
dfdt1 <- deriv.fd(df1_obj, 1)
dfdt2 <- deriv.fd(df2_obj, 1)
tres <- tperm.fd(dfdt1, dfdt2, plotres = FALSE)

# 95% quantile of the maximum of the null-distribution t-statistics.
max_q <- tres$qval

tres_dat <- tibble(time = tres$argvals,
                   t_values = tres$Tvals,
                   q_vals = tres$qvals.pts)

# Observed t-statistics vs. pointwise and maximum null quantiles.
p <- tres_dat %>%
  ggplot(aes(time, t_values, colour = "t_value")) +
  geom_line() +
  geom_line(aes(time, q_vals, colour = "q_vals")) +
  geom_line(aes(time, max_q, colour = "max_q"), linetype = "dashed") +
  labs(x = "time", y = "") +
  ggtitle("Statistics for Pointwise t-test")
p
The blue curve shows the t-statistic for the observed values. The green curve represents the 95% quantiles, and the dashed red line is the 95% quantile of the maximum of null distribution t-statistics. The t-test confirms that the derivatives are indeed different except in the regions of overlap around time = 10 and time = 40.
Phase plots are useful for evaluating how the curves develop in the abstract phase space created by looking at position versus velocity, or velocity versus acceleration. Here we pick three curves from each treatment and plot velocity versus acceleration.
# Phase plots (velocity vs. acceleration) for three curves per treatment.
phase1 <- derv1 %>% filter(id1 %in% 15:17)
phase2 <- derv2 %>% filter(id2 %in% 51:53)

pph1 <- phase1 %>%
  ggplot(aes(vel1, acc1, col = id1)) +
  geom_point() +
  ggtitle("Treat 1 v vs. acc")
pph2 <- phase2 %>%
  ggplot(aes(vel2, acc2, col = id2)) +
  geom_point() +
  ggtitle("Treat 2 v vs. acc")

grid.arrange(pph1, pph2, ncol = 2, padding = unit(1, "line"))
Based on this small sample, it certainly appears that the curves associated with the two different treatments inhabit different regions of phase space.
Finally, it is easy and helpful to animate a phase diagram to show how a function develops in phase space over time. The animation below shows the first curve for treatment 1.
# Animate one curve's trajectory through (velocity, acceleration) phase space.
# NOTE(review): the surrounding text says "first curve for treatment 1" but
# df2_obj is the treatment-2 fd object — confirm which group was intended.
#
# Fix: eval.fd() returns 101 x 1 matrices; flatten them so the tibble holds
# plain numeric columns rather than one-column matrix columns.
pos <- as.vector(eval.fd(tfine, df2_obj[1]))
vel <- as.vector(eval.fd(tfine, df2_obj[1], 1))
acc <- as.vector(eval.fd(tfine, df2_obj[1], 2))
phase_dat <- tibble(tfine, vel, acc)

p <- ggplot(phase_dat, aes(vel, acc)) +
  geom_point() +
  ggtitle("Trajectory in Phase Space")

anim <- p + transition_time(tfine) + shadow_mark()
anim_save("anim.gif", anim)
#anim
What do data science teams need to ensure their Shiny apps work as intended once they hit “publish”? Software industry best practices — like continuous integration, deployment, ...
Continue reading: Why Your Data Science Team Might Need a Shiny Deployment Engineer]]>What do data science teams need to ensure their Shiny apps work as intended once they hit “publish”? Software industry best practices — like continuous integration, deployment, and delivery (CI/CD) — can support the creation of production-ready data tools. Teams can test algorithms and models as part of the development cycle, and data scientists can deploy their apps at any moment with confidence.
This is why Guidehouse, a leading global provider of consulting services to the public sector and commercial markets, is currently looking for a Senior Shiny Deployment Engineer. Helping drive adoption of DevOps methodologies for R- and Python-based web applications, the Senior Shiny Deployment Engineer will bridge the worlds between data science and software development.
In this post, we interview Vergil Weatherford, Associate Director on the Advanced Solutions team at Guidehouse. A long-time proponent of open-source tools, Vergil oversees the architecture needed to streamline complex data collection and smooth development of data products.
We were excited to learn more about Vergil’s work planning and implementing data infrastructure, why he drives the adoption of open source at Guidehouse, and his vision for a Senior Shiny Deployment Engineer that will contribute to the data science team’s culture of quality.
I am an energy consultant-turned-data tooling enthusiast. Day in and day out, I help my team adopt modern data science tools to boost productivity and solve our clients’ most complex challenges.
My first day on the job as a consultant was more than 12 years ago. I was given dozens of CSV files and asked to analyze the time-series residential air conditioner runtime data using Excel. I quickly found myself asking, “Is this really the best way to do this?” The answer led me on a long journey to where I am today, where I work on a specialized team dedicated to supporting code-first data science infrastructure at Guidehouse.
I mainly build infrastructure for an analytics team working on our clients’ most complex business challenges related to the clean energy transition. This work requires us to use a myriad of datasets. We work regularly with “smart meter” data, SCADA data, customer demographics, and building characteristics. We run surveys and deploy data acquisition devices to collect detailed energy usage data. We’re given time series data from IoT devices like smart thermostats and data feeds from electric vehicle (EV) charging stations. We then look across the broad spectrum of data and technologies to assess and develop solutions.
This is one of the things that makes analytics in consulting unique: getting all of these varied datasets from external sources poses challenges not found when data comes from inside an organization. We need tools that help us create meaning and value from data instead of focusing on data wrangling or cleaning.
We also need approaches that let us standardize analysis methods and develop reusable interfaces regardless of what the data looks like. The consulting industry is seeing big growth in systematizing solutions and building data products that can be quickly redeployed for different clients. “Modularity” has become the operative word. By having a strong infrastructure in place, we can use our previous work to put together solutions more quickly.
As a long-time Linux hobbyist, I am continually looking for ways to leverage open source to solve problems. So when the management team was looking for alternatives to a proprietary — and expensive! — statistical analysis platform seven years ago, I suggested R mainly due to its strong following in academia and its open source nature.
Fast-forward to the present, and our tooling looks very different than it did when I joined. We’re still using some proprietary tools where they are a good fit but our “daily driver” toolkit is mainly open source: R, Python, and Git, with Linux under the hood. Central to delivering that toolkit, we use the full suite of RStudio Team Enterprise tools to scale our data science work and share results with clients.
Our data science team has a large and talented R user base. They have experience solving all kinds of unique and challenging problems, like detecting the effects of programs designed to incentivize energy savings, forecasting electric vehicle demand, or optimizing the pathway to achieve decarbonization goals. We have built up a lot of R code over the years as we have solved these problems.
Early on, we experimented with GUI-based dashboarding tools, but the jump from analysis in R to visualization in other business intelligence tools had too many gaps. With tools like Shiny, we can seamlessly apply the skills our data scientists have built up over time.
Shiny also allows data scientists to carry the solution development process much further along the deployment lifecycle. The team is able to turn custom analyses into web applications. Then, we can use software like RStudio Connect to rapidly deploy and tweak those web applications. We’re able to iterate and innovate as quickly and often as we would like.
We have a lot of depth and expertise in being able to solve our clients’ most complex problems with R and want to improve our client’s end-to-end experience with our data products. To enable this, we recognize the need for formalizing the Software Development Lifecycle (SDLC) early in the process by bringing software industry best practices to bear.
The Senior Shiny Deployment Engineer will help us do that. However, while our main front-end framework is Shiny, we want to make sure we do not miss out on potential candidates coming from a computer science or formal software development background. The skills required to deploy robust data applications require knowledge of good SDLC practices for testing, CI/CD, and DevOps principles. Those are the sorts of things that data scientists usually have to read up on but are standard skill sets for deployment engineers.
Writing a good Shiny app and deploying that Shiny app into production are two different things. The purpose of the role is to help push client-facing solutions over the finish line. We need to build applications thoughtfully from the early stages and have strategies in place for testing, quality assurance, versioning, updates, and ongoing maintenance. This is where the Senior Shiny Deployment Engineer can really make a difference.
Team members with understanding of both dashboard development and deployment infrastructure are at a premium. When I shopped around the idea of this role with a few solution development leads, they were extremely enthusiastic and said they were very interested to learn from someone with this expertise.
We are really looking for someone who is ready to take their career to the next level: a teammate who can partner with different teams to help establish a culture of quality of the final deployed product starting in the Shiny app architecture and development phase. We look forward to the Senior Shiny Deployment Engineer working alongside other team members to embed software engineering best practices in a community of more research-focused individuals.
Working alongside a Shiny Deployment Engineer, data science teams can apply deployment best practices to their development lifecycle. Data scientists are able to automate and innovate their frameworks, and clients reap the benefits from robust data science products. Interested in applying your SDLC skills as a Senior Shiny Deployment Engineer? Apply here!
Learn more about Guidehouse:
This post is part of a series of technotes about r-universe, a new umbrella project by rOpenSci under which we experiment with various ideas for improving publication and discovery of research software in R. As the project evolves, we will post updates to document features and technical details. For more information, visit the r-universe project page.
After creating your personal universe, the dashboard on https://{yourname}.r-universe.dev
shows the version and other details for each package in your repository. We have also added a new tab that lists the available badges for the repository.
Badges provide a nice way to display the status of your repository or individual packages within external webpages, such as a README file, your homepage, or your personal or organization profile README on GitHub.
There are currently two types of badges in r-universe: individual package badges showing the current package version, and global badges (where the endpoint is prefixed with :
) that show the name, count, and status of the entire repository.
The /badges/{pkg}
API yields a badge with the current version of the given package in the repository. A common place to display this badge is in the package README file, together with the package installation instructions.
If the package is also on CRAN, you could position it next to a badge from r-pkg.org to contrast it with the current CRAN version and installation instructions, see for example the gert package README:
![runiverse-name example badge](https://ropensci.r-universe.dev/badges/:name) ![runiverse-package example badge for gert package](https://ropensci.r-universe.dev/badges/gert) ![cran-badge example for gert package](http://www.r-pkg.org/badges/version/gert)
Which looks like below. Here users can immediately see from the README that the package is available from the ropensci universe, and that the version from r-universe is higher than the CRAN version.
Besides badges for individual packages, the system also provides a few badges with information about the status of the entire package repository. For example:
/badges/:name
the name of the universe, i.e. your github username/badges/:total
the total number of packages in the repository/badges/:registry
if the latest monorepo update was successful, i.e. if your package registry is OK. The last badge in particular may be useful for yourself. It will go red when something is wrong with your package registry file. This usually means your packages.json may reference git repositories or branches that do not exist, or the system did not find a proper package in the given location.
This is what the badges look like for the ropensci universe:
![runiverse-name](https://ropensci.r-universe.dev/badges/:name) ![runiverse-registry](https://ropensci.r-universe.dev/badges/:registry) ![runiverse-total](https://ropensci.r-universe.dev/badges/:total)
One of the early adopters of R-universe had requested some options to customize the visual appearance of the badges. We have added 3 http parameters to the badges API:
scale
the size of the badge (where 1 is the default);color
the color of a badge;style
set to “flat” to get retro style badges.For example: https://ropensci.r-universe.dev/badges/:total?scale=2&color=pink&style=flat
The badgen docs show more details about these parameters. Our backend for this API is fairly straight-forward, try sending a pull request if you have ideas for other useful badges.
Date/Time: Wednesday, November 17, 3pm – 5pm GMT / 7am – 9am PST / 10am – 12 noon EST
In this session we will be looking at various learnings obtained from creating a corporate R infrastructure and developing R packages to address the unique business problems presented by clinical trials.
Architecting and maintaining an R installation across a large organisation can be challenging. How do you balance between giving individual users the ability to meet their specific needs, but also provide a standardised environment which meets regulatory requirements? As R needs expand, then internal packages also get created, and these also need to be managed.
It is often useful to create custom R packages to complement the community R packages. We are developing both open and closed source packages that we validate and deploy on our R infrastructure. We will be discussing some learnings from the NEST software development team including: project management, automation, devops, testing, integration, releasing, validation, and deployment of the in-house built R packages. We will also be presenting steps we took towards simplifying the development process to enable co-creation and collaboration with internal and external developers.
Finally, we will split into break out rooms to discuss some relevant topics on how package creation and management can be dealt with effectively.
Full list of R Consortium webinars here.
The post Webinar: R Package Management at Roche appeared first on R Consortium.
We’ve released the newest version of NIMBLE on CRAN and on our website. NIMBLE is a system for building and sharing analysis methods for statistical models, especially for hierarchical models and computationally-intensive methods (such as MCMC and SMC).
Version 0.12.1, in combination with version 0.12.0 (which was released just last week), provides a variety of new functionality (in particular enhanced WAIC functionality and adding the LKJ distribution) plus bug fixes affecting MCMC in specific narrow cases described below and that warrant upgrading for some users. The changes include:
1 2 3 4 5 6 7 8 9 10 11 12 13 | —————————————— Maturity Spot Rate ZCB Price —————————————— t r P(0,t) —————————————— 0.25 4.75% 0.9882 1 4.50% 0.9560 3 5.50% 0.8479 5 5.25% 0.7691 10 6.50% 0.5220 —————————————— | cs |
#========================================================#
# Quantitative ALM, Financial Econometrics & Derivatives
# ML/DL using R, Python, Tensorflow by Sang-Heon Lee
#
# https://kiandlee.blogspot.com
#--------------------------------------------------------#
# Adams and Deventer Maximum Smoothness Forward Curve
# using non-linear optimization
#
# Fix: the listing was garbled by extraction — every `<-` and `-` was
# mis-encoded as an en dash and a line-number gutter / "| cs |" trailer
# was fused into the code. Reconstructed as runnable ASCII R.
#========================================================#

graphics.off()  # clear all graphs
rm(list = ls()) # remove all files from your workspace

library(NlcOptim) # optimization with non-linear constraints

# Whole calculation process: given a candidate coefficient vector x0
# (a, b, c, d, e for each of the n maturity segments of the quartic
# forward curve f(t) = a*t^4 + b*t^3 + c*t^2 + d*t + e), rebuild the
# working data frame, the equality constraints and the smoothness
# objective.
#
# NOTE: throughout, `1:n+1` parses as `(1:n)+1`, i.e. rows 2..(n+1)
# (the n market maturities; row 1 is the artificial 0-maturity row).
f_calc <- function(x0) {

  # number of maturities
  n <- length(df.mkt$mat)

  # add 0-maturity zero rate (assumption)
  #df <- rbind(c(0, df.mkt$zrc[1]), df.mkt)
  df <- rbind(c(0, 0.04), df.mkt)

  # discount factor
  df$DF <- with(df, exp(-zrc*mat))

  # -ln(P(t(i))/P(t(i-1)))
  df$mln <- c(NA, -log(df$DF[1:n+1]/df$DF[1:n]))

  # ti^n
  df$t5 <- df$mat^5
  df$t4 <- df$mat^4
  df$t3 <- df$mat^3
  df$t2 <- df$mat^2
  df$t1 <- df$mat^1

  # dti = ti^n - t(i-1)^n
  df$dt5 <- c(NA, df$t5[1:n+1] - df$t5[1:n])
  df$dt4 <- c(NA, df$t4[1:n+1] - df$t4[1:n])
  df$dt3 <- c(NA, df$t3[1:n+1] - df$t3[1:n])
  df$dt2 <- c(NA, df$t2[1:n+1] - df$t2[1:n])
  df$dt1 <- c(NA, df$t1[1:n+1] - df$t1[1:n])

  # parameters to be found (one quartic per segment)
  df$e <- df$d <- df$c <- df$b <- df$a <- c(NA, rep(0, n))
  df$a[1:n+1] <- x0[(0*n+1):(1*n)]
  df$b[1:n+1] <- x0[(1*n+1):(2*n)]
  df$c[1:n+1] <- x0[(2*n+1):(3*n)]
  df$d[1:n+1] <- x0[(3*n+1):(4*n)]
  df$e[1:n+1] <- x0[(4*n+1):(5*n)]

  # difference of coefficients at interior knots : dki = ki - k(i+1)
  df$da <- c(NA, df$a[2:n] - df$a[2:n+1], NA)
  df$db <- c(NA, df$b[2:n] - df$b[2:n+1], NA)
  df$dc <- c(NA, df$c[2:n] - df$c[2:n+1], NA)
  df$dd <- c(NA, df$d[2:n] - df$d[2:n+1], NA)
  df$de <- c(NA, df$e[2:n] - df$e[2:n+1], NA)

  # forward rate
  df$fwd <- NA
  df$fwd[1:n+1] <- with(df[1:n+1,], a*t4 + b*t3 + c*t2 + d*t1 + e)
  df$fwd[1] <- df$e[2]

  # linear constraints : continuity of f, f', f'', f''' at interior knots
  df$cmln <- df$cfp3 <- df$cfp2 <- df$cfp1 <- df$cfp0 <- NA
  df$cfp0[2:n] <- with(df[2:n,], da*t4 + db*t3 + dc*t2 + dd*t1 + de)
  df$cfp1[2:n] <- with(df[2:n,], 4*da*t3 + 3*db*t2 + 2*dc*t1 + dd)
  df$cfp2[2:n] <- with(df[2:n,], 12*da*t2 + 6*db*t1 + 2*dc)
  df$cfp3[2:n] <- with(df[2:n,], 24*da*t1 + 6*db)

  # fit constraints : the integral of f over each segment must reproduce
  # the observed -ln(P(ti)/P(t(i-1)))
  df$cmln[2:(n+1)] <- with(df[2:(n+1),],
    a*dt5/5 + b*dt4/4 + c*dt3/3 + d*dt2/2 + e*dt1 - mln)

  # additional 4 constraints
  # initial(0) forward rate(f) = r0, initial f' = 0
  # terminal(T) f', f'' = 0
  const_f_0  <- df$e[2] - df$zrc[1]
  const_f1_0 <- df$d[2]
  const_f1_T <- with(df[n+1,], 4*a*t3 + 3*b*t2 + 2*c*t1 + d)
  const_f2_T <- with(df[n+1,], 12*a*t2 + 6*b*t1 + 2*c)

  # objective function : integral of f''(t)^2 over each segment
  df$obj <- NA
  df$obj[1:n+1] <- with(df[1:n+1,],
    (144/5)*dt5*a^2 + 36*dt4*a*b + 12*dt3*b^2 +
    16*dt3*a*c + 12*dt2*b*c + 4*dt1*c^2)

  return(list(df.calc = df,
              fvalue = sum(df$obj[1:n+1]),
              const = c(df$cfp0[2:n], df$cfp1[2:n],
                        df$cfp2[2:n], df$cfp3[2:n],
                        df$cmln[2:(n+1)],
                        const_f_0, const_f1_0,
                        const_f1_T, const_f2_T)))
}

# objective function for solnl
obj <- function(x) {
  lt.out <- f_calc(x)
  return(lt.out$fvalue)
}

# constraint function for solnl (equality constraints only)
con <- function(x) {
  lt.out <- f_calc(x)
  return(list(ceq = lt.out$const, c = NULL))
}

# Input : market zero rate, maturity
df.mkt <- data.frame(
  mat = c(0.25, 1, 3, 5, 10),
  zrc = c(4.75, 4.5, 5.5, 5.25, 6.5)/100
)

x0 <- rep(0.001, 25) # initial guesses
out <- solnl(x0, objfun = obj, confun = con)

# augment df.mkt with calibrated parameters
n <- length(df.mkt$mat)
df.mkt$a[1:n] <- out$par[(0*n+1):(1*n)]
df.mkt$b[1:n] <- out$par[(1*n+1):(2*n)]
df.mkt$c[1:n] <- out$par[(2*n+1):(3*n)]
df.mkt$d[1:n] <- out$par[(3*n+1):(4*n)]
df.mkt$e[1:n] <- out$par[(4*n+1):(5*n)]
df.mkt
1 2 3 4 5 6 7 | mat zrc a b c d e 1 0.25 0.0475 3.7020078185 -3.343981375 0.8481712 1.429488e-16 0.04000000 2 1.00 0.0450 -0.1354629771 0.493489420 -0.5908803 2.398419e-01 0.02500988 3 3.00 0.0550 0.0080049880 -0.080382440 0.2699275 -3.340299e-01 0.16847782 4 5.00 0.0525 -0.0024497294 0.045074169 -0.2946273 7.950796e-01 -0.67835429 5 10.00 0.0650 0.0002985362 -0.009891143 0.1176126 -5.790532e-01 1.03931172 | cs |
A few months ago, I started hacking together an online e-book on Visualising WRC Rally Stages With rayshader and R. One of the sections (Estimating speeds) described the construction of a simple speed model based around the curvature of the stage route.
As part of another sprint into some rally data tinkering, this time focusing on Visualising WRC Telemetry Data With R, I’ve extracted just the essential code for creating the speed model and split it into a more self-contained extract: Creating a Route Speed Model. The intention is that I can use this speed model to help improve interpolation within a sparse telemetry time series.
Also on the to do list is to see if I can validate – or not! – the speed model using actual telemetry.
The recipe for building the model builds up from a boundary convexity tool (bct()
) that can be found in the rLFT
processing linear features R package. This tool provides a handy routine for modeling the curvature along each point of a route, a process that also returns the co-ordinates of a center of curvature for each segment. A separate function inspired by the pracma::circlefit()
function, then finds the radius.
Because I don’t know how to write vectorised functions properly, I use the base::Vectorize()
function to do the lifting for me around a simpler, non-vectorised function.
library(devtools)

# The curvature function takes an arc defined over
# x and y coordinate lists

# circlefit, from pracma::
# Least-squares circle fit: returns c(centre_x, centre_y, radius)
circlefit = function (xp, yp, fast = TRUE) {
  if (!is.vector(xp, mode = "numeric") || !is.vector(yp, mode = "numeric"))
    stop("Arguments 'xp' and 'yp' must be numeric vectors.")
  if (length(xp) != length(yp))
    stop("Vectors 'xp' and 'yp' must be of the same length.")
  if (!fast)
    warning("Option 'fast' is deprecated and will not be used!",
            call. = FALSE, immediate. = TRUE)
  n <- length(xp)
  # Solve the linearised circle equation for centre and radius;
  # qr.solve() errors when the points are collinear (singular system)
  p <- qr.solve(cbind(xp, yp, 1), matrix(xp^2 + yp^2, ncol = 1))
  v <- c(p[1]/2, p[2]/2, sqrt((p[1]^2 + p[2]^2)/4 + p[3]))
  # RMS distance of points from the fitted circle (diagnostic only)
  rms <- sqrt(sum((sqrt((xp - v[1])^2 + (yp - v[2])^2) - v[3])^2)/n)
  #cat("RMS error:", rms, "\n")
  return(v)
}

# Radius of curvature of the arc through the supplied points
curvature = function(x, y) {
  #729181.8, 729186.1, 729190.4
  #4957667 , 4957676, 4957685
  tryCatch({
    # circlefit gives an error if we pass a straight line
    # Also hide the print statement in circlefit
    # circlefit() returns the x and y coords of the circle center
    # as well as the radius of curvature
    # We could then also calculate the angle and arc length
    circlefit(x, y)[3]
  }, error = function(err) {
    # For a straight line, return the first co-ord and Inf diameter
    # Alternatively, pass zero diameter?
    c(x[1], y[1], Inf)[3]
  })
}

# Scalar wrapper over three consecutive points, suitable for Vectorize()
curvature2 = function(x1, x2, x3, y1, y2, y3) {
  curvature(c(x1, x2, x3), c(y1, y2, y3))
}

# The base::Vectorize function provides a lazy way of
# vectorising a non-vectorised function
curvatures = Vectorize(curvature2)

# The Midpoint values are calculated by rLFT::bct()
# (fixed: the original snippet was missing the closing parenthesis)
route_convexity$radius = curvatures(lag(route_convexity$Midpoint_X),
                                    route_convexity$Midpoint_X,
                                    lead(route_convexity$Midpoint_X),
                                    lag(route_convexity$Midpoint_Y),
                                    route_convexity$Midpoint_Y,
                                    lead(route_convexity$Midpoint_Y))
A corner speed model then bins each segment into a corner type. This is inspired by the To See The Invisible rally pacenotes tutorial series by David Nafría which uses a simple numerical value to categorise the severity of each corner as well as identifying a nominal target speed for each corner category.
# Bin each segment into a corner severity category and a target speed.
#
# Inspired by David Nafria's "To See The Invisible" pacenote system:
# tighter corners (smaller radius of curvature) get a lower category
# number and a lower nominal speed.
#
# Args:
#   route_convexity: data frame with a numeric `radius` column (metres;
#                    Inf for straights).
# Returns:
#   The input with two added columns:
#   - invisible_ci: ordered factor, corner category 1 (tightest) .. 11
#   - invisible_sp: integer nominal target speed in km/h
corner_speed_model = function(route_convexity) {
  # Radius breakpoints (m) delimiting the corner categories
  invisible_bins = c(0, 10, 15, 20, 27.5, 35, 45, 60, 77.5, 100, 175, Inf)

  # Target speed (km/h) for each corner category
  invisible_speeds = c(10, 40, 50, 60, 70, 80, 95, 110, 120, 130, 145)

  route_convexity$invisible_ci = cut(route_convexity$radius,
                                     breaks = invisible_bins,
                                     labels = 1:(length(invisible_bins) - 1),
                                     ordered_result = TRUE)

  # Look the speed up directly from the category code. This replaces the
  # original second cut() call and its factor -> character -> integer
  # round trip with a single vectorised index; an NA radius yields NA.
  route_convexity$invisible_sp =
    as.integer(invisible_speeds)[as.integer(route_convexity$invisible_ci)]

  route_convexity
}
We can now build up the speed model for the route. At each step we accelerate towards a nominal sector target speed (the invisible_sp
value). We can’t accelerate infinitely fast, so our actual target accumulated speed for the segment, acc_sp
, is a simple function of the current speed and the notional target speed. We can then calculate the notional time to complete that segment, invisible_time
.
# Simple acceleration/braking model over a route's segments.
#
# Walks the segments easing the accumulated speed towards each segment's
# nominal target speed (invisible_sp), then derives per-segment and
# cumulative times plus kilometre section markers.
#
# Args:
#   route_convexity: data frame with columns invisible_sp (km/h),
#                    dist (segment length, m) and MidMeas (distance of
#                    segment midpoint into the route, m).
#   stepdist: nominal segment length (currently unused; kept for
#             interface compatibility).
# Returns:
#   The input with added columns acc_sp (km/h), segment_sp (m/s),
#   invisible_time (s), acc_time (cumulative s) and kmsection.
acceleration_model = function(route_convexity, stepdist = 10) {
  # Acceleration model
  sp = route_convexity$invisible_sp

  # Nominal starting target speed
  # If we don't set this, we don't get started moving
  sp[1] = 30

  # Crude acceleration / brake weights
  acc = 1
  dec = 1

  # NOTE(review): assumes at least 3 segments; with <= 2 the range
  # 2:(length(sp)-1) runs backwards — confirm inputs are long enough
  for (i in 2:(length(sp) - 1)) {
    # Simple linear model - accumulated speed is based on
    # the current speed and the notional segment speed

    # Accelerate up
    if (sp[i - 1] <= sp[i]) sp[i] = (sp[i - 1] + acc * sp[i]) / (1 + acc)

    # Decelerate down
    if (sp[i] > sp[i + 1]) sp[i] = (dec * sp[i] + sp[i + 1]) / (1 + dec)
  }

  route_convexity$acc_sp = sp
  # Final segment takes its nominal target speed unmodified
  route_convexity$acc_sp[length(sp)] = route_convexity$invisible_sp[length(sp)]

  # New time model
  # Also get speed in m/s for time calculation
  meters = 1000
  seconds_per_hour = 3600  # 60 * 60
  kph_unit = meters / seconds_per_hour

  # Base-R equivalents of the original dplyr mutate() chain, so the
  # function no longer requires dplyr to be attached
  route_convexity$segment_sp = route_convexity$acc_sp * kph_unit
  route_convexity$invisible_time = route_convexity$dist / route_convexity$segment_sp
  route_convexity$acc_time = cumsum(route_convexity$invisible_time)

  # So now we need to generate kilometer marks
  # We can use this to help find the time over each km
  route_convexity$kmsection = 1 + trunc(route_convexity$MidMeas / 1000)

  route_convexity
}
With the speed model, we can then generate a simple plot of the anticipated speed against distance into route:
We can also plot the accumulated time into the route:
Finally, a simple cumulative sum of the time taken to complete each segment gives us an estimate of the stage time:
# Print the anticipated stage time as "Xm Y.Ys".
#
# Sums the per-segment times, excluding the final segment.
#
# Args:
#   route_convexity: data frame with an invisible_time column (seconds).
# Returns:
#   The anticipated time in seconds, invisibly (so the printed message
#   remains the primary output, but callers can reuse the value).
anticipated_time = function(route_convexity) {
  n = nrow(route_convexity)
  # Original used 1:nrow(route_convexity)-1, i.e. 0:(n-1), which only
  # worked because the 0 index is silently dropped; seq_len is explicit
  anticipated_time = sum(route_convexity$invisible_time[seq_len(n - 1)])
  cat(paste0("Anticipated stage time: ",
             anticipated_time %/% 60, 'm ',
             round(anticipated_time %% 60, 1), 's'))
  invisible(anticipated_time)
}

# Demo call from the original post; guarded so this script can be
# sourced without a route_convexity object already in scope
if (exists("route_convexity")) {
  anticipated_time(route_convexity)
}
# Anticipated stage time: 8m 40.3s
Next on my to do list is to generate an “ideal” route from a collection of telemetry traces from different drivers on the same stage.
If we know the start and end of the route are nominally at the same location, we can normalise the route length of multiple routes, maps equidistant points onto each other, and then take the average. For example, this solution: https://stackoverflow.com/a/65341730/454773 seems to offer a sensible way forward. See also https://en.wikipedia.org/wiki/Dynamic_time_warping and https://dynamictimewarping.github.io/r/ .
I was rather surprised, though, not to find a related function in one of the ecology / animal tracking R packages that would, for example, pull out a “mean” route based on a collection of locations from a tracked animal or group of animals following the same (ish) path over a period of time. Or maybe I just didn’t spot it? (If you know of just such a function I can reuse, please let me know via the comments…)
It’s been quiet around this blog because supervising two students for Google Summer of Code has kept me pretty busy! But we have some news…
Thanks to Mr. Tejasvi Gupta and the support of GSOC, ChemoSpec and ChemoSpec2D were extended to pro...
Continue reading: GSOC 2021: New Graphics for ChemoSpec]]>It’s been quiet around this blog because supervising two students for Google Summer of Code has kept me pretty busy! But we have some news… |
Thanks to Mr. Tejasvi Gupta and the support of GSOC, ChemoSpec
and ChemoSpec2D
were extended to produce ggplot2
graphics and plotly
graphics! ggplot2
is now the default output, and the ggplot2
object is returned, so if one doesn’t like the choice of theme or any other aspect, one can customize the object to one’s desire. The ggplot2
graphics output are generally similar in layout and spirit to the base
graphics output, but significant improvements have been made in labeling data points using the ggrepel
package. The original base
graphics are still available as well. Much of this work required changes in ChemoSpecUtils
which supports the common needs of both packages.
Tejasvi did a really great job with this project, and I think users of these packages will really like the results. We have greatly expanded the pre-release testing of the graphics, and as far as we can see every thing works as intended. Of course, please file an issue if you see any problems or unexpected behavior.
To see more about how the new graphics options work, take a look at GraphicsOptions. Here are the functions that were updated:
plotSpectra
surveySpectra
surveySpectra2
reviewAllSpectra
(formerly loopThruSpectra
)plotScree
(resides in ChemoSpecUtils
)plotScores
(resides in ChemoSpecUtils
)plotLoadings
(uses patchwork
and hence plotly
isn’t available)plot2Loadings
sPlotSpectra
pcaDiag
plotSampleDist
aovPCAscores
aovPCAloadings
(uses patchwork
and hence plotly
isn’t available)Tejasvi and I are looking forward to your feedback. There are many other smaller changes that we’ll let users discover as they work. And there’s more work to be done, but other projects need attention and I need a little rest!
Get the code for this blog on GitHub
What is this tutorial and who is it for?
This tutorial is aimed mainly at R users who want to learn a bit of D3,
and specifically those who are interested in how you can incorporate D3
into your existing workflows in ...
Get the code for this blog on GitHub
This tutorial is aimed mainly at R users who want to learn a bit of D3, and specifically those who are interested in how you can incorporate D3 into your existing workflows in RStudio. It will gloss over a lot of the fundamentals of D3 and related topics (JavaScript, CSS, and HTML) to fast-forward the process of creating your first D3.js visualisation. It will therefore be far from a comprehensive guide. I’ve tried to include what I think is important, but if you have absolutely no experience with any of those topics you will almost definitely be left with some questions. Hopefully, the satisfaction of creating your first plot will inspire you to break and tweak the code I have provided to learn more.
D3.js, or just D3 as it’s more often referred to, is a JavaScript library used for creating interactive data visualisations optimised for the web. D3 stands for Data-Driven Documents. It is commonly used by those who enjoy making creative or otherwise unusual visualisations as it offers you a great deal of freedom as well as options for interactivity such as animated transitions and plot zooming.
One benefit of D3 is its aforementioned creative control. Another benefit is that rather than creating raster images (e.g. PNG, JPEG) like a lot of plotting libraries it renders your figures as SVGs (scalable vector graphics), which stay crisp no matter how far you zoom in and are generally faster to load (note: when there are many data points, an SVG may be slower than a raster image, learn more about which image file type to use in our blog post on image formats). If you are an R user, you should also care because the {r2d3} package lets you easily incorporate D3 visualisations into your R workflow, and use them in e.g. R Markdown reports or R Shiny dashboards.
The short answer is: it depends. It can be quite tricky and
time-consuming to learn D3 and all associated skills (JavaScript, HTML,
CSS) if you have no previous experience. On the other hand, learning D3
can be a fun way to take your first steps into web development
technologies. Furthermore, you may be perfectly happy with available
plotting libraries in R, e.g. {ggplot2}, as what they offer is indeed
highly flexible and suitable for interactivity. You can even save ggplot
plots as SVG with ggsave()
and svglite
. Therefore, I don’t think
learning D3 is a necessity for data visualisation, but it can be an
addition to your skill set and can be a great first step into creative
coding or web development.
If you are still with me, let’s get into {r2d3}.
{r2d3} is an R package that lets you
create D3 visualisations with R. One way it enhances this process is by
being able to translate between R objects and D3-friendly data
structures. This means that you can clean your data in R, and then just
plot it using D3 without having to go near any data wrangling using
JavaScript. Another cool feature is that you can create D3-rendering
chunks in an R Markdown file that will preview inline, so you can easily
incorporate a D3 visualisation in your reports. You can also easily add
a D3 visualisation to a Shiny app using the renderD3()
and
d3Output()
functions. If you need help with a Shiny Application, we
can
help.
OK, let’s get set up to create our first D3 visualisation in RStudio. We’re gonna be using this fun dataset on Scooby-Doo manually aggregated by user plummeye. We are gonna make a line chart that shows the cumulative total number of monsters caught by each member of Mystery Incorporated. Then we will add some unique D3 flair to it to make an unusually painful line chart worth it.
First, you’ll need to install the {r2d3} package as usual.
install.packages("r2d3")
This allows you to write D3 in RStudio in two main ways:
.js
file with some autopopulated D3 codeFor this blog post, we will be writing our code in a separate .js file, but we will be running it in an R Markdown chunk to preview it (However, it is also possible to preview your code from the script directly, but this way will hopefully show you how easily you can include D3 visualisations in an R Markdown report).
So, we will start by creating two files:
scoobydoo.Rmd
scoobydoo.js
To ensure that the files are able to interact with each other, I
recommend working in an RStudio project (File > New Project) with
both files at the .Rproj
level.
Do you need help with your Shiny app? We can advise or even take over the day-to-day running of your application services
You will need to install some packages for the cleaning steps, which you can install with this line of code:
install.packages(c("dplyr", "lubridate", "r2d3", "stringr", "tidyr", "tidytuesdayR"))
In your .Rmd file, you can copy the following steps to load necessary packages, read in the data, and clean it in preparation of our D3 visualisation. We won’t go through these steps as this blog post assumes you know R and some basic Tidyverse already! If you don’t, we offer courses to help you get started! You can download the data we will be using manually from here if you prefer reading it in from a CSV file.
# in scoobydoo.Rmd

library("dplyr")
library("tidyr")
library("stringr")
library("lubridate")

# load data from tidytuesday
tuesdata = tidytuesdayR::tt_load(2021, week = 29)
scoobydoo = tuesdata$scoobydoo

# wrangling data into nice shape:
# one row per (character, year) with yearly and cumulative catch counts
monsters_caught = scoobydoo %>%
  # keep air date plus all caught_* indicator columns
  select(date_aired, starts_with("caught")) %>%
  mutate(across(starts_with("caught"), ~ as.logical(.))) %>%
  # long format: one row per episode per character
  pivot_longer(cols = caught_fred:caught_not,
               names_to = "character",
               values_to = "monsters_caught") %>%
  drop_na() %>%
  # restrict to the five Mystery Inc. members
  filter(!(character %in% c("caught_not", "caught_other"))) %>%
  mutate(year = year(date_aired), .keep = "unused") %>%
  group_by(character, year) %>%
  summarise(caught = sum(monsters_caught), .groups = "drop_last") %>%
  mutate(
    cumulative_caught = cumsum(caught),
    # tidy names: strip the "caught_" prefix and fix the source typo
    character = str_remove(character, "caught_"),
    character = str_to_title(character),
    character = recode(character, "Daphnie" = "Daphne")
  )
I recommend investigating the resulting columns of the data by printing
monsters_caught
at this stage, as it will help you better understand
the D3 code later on. You will see that there are 5 columns, character
which contains the names of our Mystery Inc. members (Daphne, Fred,
Scooby, Shaggy, and Velma); year
which contains years between 1969 and
2021 obtained from when the episode was aired; caught
which contains
how many monsters were caught for each mystery member in each year and
cumulative_caught
which is the cumulative sum of monsters caught for
each member.
We are going to add a final column which will contain a unique colour for each character, so that our line chart will look a bit nicer. The colours are represented by hex codes obtained from official artwork of the characters.
# setting up colors for each character
# (hex codes taken from official artwork of the characters)
character_hex = tribble(
  ~ character, ~ color,
  "Fred",   "#76a2ca",
  "Velma",  "#cd7e05",
  "Scooby", "#966a00",
  "Shaggy", "#b2bb1b",
  "Daphne", "#7c68ae"
)

# attach each character's colour to the plotting data
monsters_caught = monsters_caught %>%
  inner_join(character_hex, by = "character")
We will also add a new chunk which includes the following code:
library("r2d3")

# Render scoobydoo.js against the cleaned data. D3 is pinned to v5
# because the script uses d3.nest(), which was removed in v6+.
r2d3(data = monsters_caught, script = "scoobydoo.js", d3_version = "5")
The r2d3()
function lets you communicate with our scoobydoo.js
script using the monsters_caught
tibble that we’ve created in R. As
our script is currently empty, nothing shows up when you run this line.
After we add some new code to our scoobydoo.js
script we can go back
to scoobydoo.Rmd
and re-run this line to view the output. We are
specifying our D3 version as 5
to ensure our code will continue to
work despite potentially breaking updates to D3.
Okay, let’s add some code to our D3 script. We are defining some variables as constants that set up the size of our margins, plot width and height, and some font and line sizes for later on. Defining our constants at the top makes them easy to find and change if we want to change the sizes throughout our script.
Note: Comments in JavaScript are denoted by //
, and variable names are
often written in camelCase
.
Another important concept being introduced in the code below are attributes. An SVG element has a number of properties and these can be set as attributes. For example, here we are setting the width attribute of the SVG as the width of our (upcoming) plot plus the left and the right margin (white space around the plot). Finally, we set up a group that will represent the plot inside our SVG element, and then move this plot to start where the left and top margin end using the “transform” attribute.
// in scoobydoo.js // set up constants used throughout script const margin = {top: 80, right: 100, bottom: 40, left: 60} const plotWidth = 800 - margin.left - margin.right const plotHeight = 400 - margin.top - margin.bottom const lineWidth = 3 const mediumText = 18 const bigText = 28 // set width and height of svg element (plot + margin) svg.attr("width", plotWidth + margin.left + margin.right) .attr("height", plotHeight + margin.top + margin.bottom) // create plot group and move it let plotGroup = svg.append("g") .attr("transform", "translate(" + margin.left + "," + margin.top + ")")
If we run our r2d3()
line in R Markdown again, the output is still
empty, but if we right-click on the space below our chunk and click
“Inspect Element”, we can now see that there is indeed an SVG element
(everything inside the SVG tags <svg> </svg>
), with the width and
height that we’ve provided in the SVG attributes. Getting comfortable
with using either the RStudio Developer Tools to inspect the element, or
inspecting it in a browser, will help you more easily understand D3
visualisations.
Next, let’s create some axes. At the bottom of scoobydoo.js
add the following lines, which define two functions
xAxis
and yAxis
. These will be used to scale our data to a
coordinate system.
// x-axis values to year range in data // x-axis goes from 0 to width of plot let xAxis = d3.scaleLinear() .domain(d3.extent(data, d => { return d.year; })) .range([ 0, plotWidth ]); // y-axis values to cumulative caught range // y-axis goes from height of plot to 0 let yAxis = d3.scaleLinear() .domain(d3.extent(data, d => { return d.cumulative_caught; })) .range([ plotHeight, 0]);
We set the limits of the x- and y-axes to be between the min and max of
the respective columns (returned by d3.extent
with an anonymous
function returning all values from our respective columns). We then
define the actual length of our axes to be our full plot width and plot
height. Notice that when we define the y-axis, it is defined from top to
bottom (from plot height to 0).
Then, let’s add these axes to the plot. We move the x axis to start at
the bottom of the plot, and define it with a built-in D3 function used
to create a bottom horizontal axis (d3.axisBottom
) and a left vertical
axis (d3.axisLeft
) which require a scale (which we created with
d3.scaleLinear
in our xAxis
and yAxis
functions). We also set
stroke widths and font sizes for both axes.
// add x-axis to plot // move x axis to bottom of plot (height) // format tick values as date (no comma in e.g. 2,001) // set stroke width and font size plotGroup.append("g") .attr("transform", "translate(0," + plotHeight + ")") .call(d3.axisBottom(xAxis).tickFormat(d3.format("d"))) .attr("stroke-width", lineWidth) .attr("font-size", mediumText); // add y-axis to plot // set stroke width and font size plotGroup.append("g") .call(d3.axisLeft(yAxis)) .attr("stroke-width", lineWidth) .attr("font-size", mediumText);
Now, we need to reformat our data slightly to be able to create a line chart with multiple lines. Each line will represent a Mystery Inc. member, so we want to create a hierarchical tree structure with the data for each character nested inside a separate key.
// turns data into nested structure for multiple line chart // d3.nest() no longer available in D3 v6 and above hence version set to 5 let nestedData = d3.nest() .key(d => { return d.character;}) .entries(data);
Here, d => {return d.character}
defines an anonymous function which
takes our data as an input and iterates through the character column so
we can create a separate key for each character with key()
. We then
supply the data values associated with that character inside the key
inside entries()
. You can investigate the structure of the nested data
by running nestedData
in the JavaScript console when in “Inspect
Element” mode.
Then, we create a path element which will have new class defined by us
called drawn_lines
(we can create a new class called whatever we want
in the class attribute) so that we can access this specific path element
later on. We define another anonymous function to color the line by the
hex codes in our color column. Finally, we define how we want the path
to use our data (it will be a line (d3.line
) whose x position is
determined by our year
column, and y position by our
cumulative_caught
column)
let path = plotGroup.selectAll(".drawn_lines") .data(nestedData) .enter() .append("path") // set up class so only this path element can be removed .attr("class", "drawn_lines") .attr("fill", "none") // color of lines from hex codes in data .attr("stroke", d => {return d.values[0].color}) .attr("stroke-width", lineWidth) // draw line according to data .attr("d", d => { return d3.line() .x(d => { return xAxis(d.year);}) .y(d => { return yAxis(d.cumulative_caught);}) (d.values) })
Now we will add a plot title. Create a text element for the plot title,
defining where it is anchored, the x and y position of the anchor, what
the actual text says, and its color, font size and font weight. We
append the text to the whole svg, rather than just the plot, so that the
title is above the tallest point of the y axis (end of the plotGroup
).
// create plot title svg.append("text") .attr("text-anchor", "start") .attr("x", margin.left) .attr("y", margin.top/3) .text("Monsters caught by Mystery Inc. members") .attr("fill", "black") .attr("font-size", bigText) .attr("font-weight", "bold")
Now we’ll create legend labels for each line which will identify which
character each line belongs to. Here, we create another group in our
plot that is going to contain text from nestedData
. We set some
attributes in terms of how it will look, as well as give it a custom
class name_labels
. We also decide where these labels will go, giving
them an x position slightly after the last data point on the x axis
(2021) and a y position based on the location of the final value on the
y axis (where the line ends). The text and color of the label will
depend on the character and color columns in the dataset.
// create legend labels i.e. character names plotGroup.append("g") .selectAll("text") .data(nestedData) .enter() .append("text") // add class so name_labels can be removed in drawLines() .attr("class", "name_labels") .style("font-weight", "bold") .style("font-size", mediumText) // set location for labels (at the end) .attr("x", xAxis(2021) + mediumText/2) .attr("y", (d, i) => yAxis(d.values[d.values.length-1].cumulative_caught) + mediumText/3) .attr("fill", d => {return d.values[0].color}) .text(d => {return d.values[0].character})
First, we will add a transition for the labels we just created. By
wrapping our plot-creating code in functions we can recreate the plot at
specific times. We will start by wrapping everything in the previous
chunk inside a function called drawLabels()
and add a transition which
makes the labels appear after 500 milliseconds, giving them a “fade in”
effect.
function drawLabels() { <insert code from previous chunk in here> .attr("opacity", 0) .transition() .duration(500) .attr("opacity", 1) }
We are also gonna create a transition for the lines that makes them
appear as if they’re being drawn from the start to end. Unfortunately,
the easiest way to do this involves some trickery involving the
stroke-dasharray
attribute of each line. This attribute defines the
dashed pattern of a line. So far, the lines on our plot are completely
solid. We will introduce a dash so large that the length of the dash and
the gap between each dash is longer than the width of the plot itself.
We then manipulate the offset of the dashes to make it appear that the
line is growing over time.
To do this, we need to create two functions. The first, tweenDash()
returns a function to take the stroke-dasharray
attribute of a line as
an argument, then manipulate it to get the next “frame” of the
animation. This will keep looping until the dash is covering the entire
length of the line, making it visible. And it will take 2500ms to do
this, as defined by duration(2500)
.
The other function, lineTransition()
, takes a path (i.e. line) as an
argument and passes that path’s stroke-dasharray
attribute into the
function returned by tweenDash()
. It then applies the new dash
configuration to the path. Note that when the transition ends
(.on("end", ...)
), our drawLabels
function is called. This is to
ensure that the labels appear only when the lines have fully appeared.
// tweenDash: returns an interpolator for a line's stroke-dasharray,
// growing the visible dash from zero to the line's full length so the
// line appears to be drawn over time
function tweenDash() {
  let l = this.getTotalLength(),
      i = d3.interpolateString("0," + l, l + "," + l);
  return function(t) { return i(t) };
}

// lineTransition: animate a path's stroke-dasharray over 2500ms; when
// the transition ends, draw the legend labels via drawLabels()
function lineTransition(path) {
  path.transition()
    .duration(2500)
    .attrTween("stroke-dasharray", tweenDash)
    .on("end", () => { drawLabels(); });
}
Now, wrap your line-drawing code (the code chunk starting with
let path =
) in a new function called drawLines()
. We add two new
lines at the top which removes any previously drawn lines and labels. We
chain on a call to the lineTransition()
function at the end of our
path code.
function drawLines() { // remove previously drawn lines when re-drawing plotGroup.selectAll(".drawn_lines").remove() // remove labels e.g. "Daphne" when re-drawing plotGroup.selectAll(".name_labels").remove() <code which starts with 'let path =' goes here> .call(lineTransition) }
Finally, add a line to call our new drawLines()
function at the bottom
of the script.
// kick off the initial draw of the animated lines
drawLines()
Now we have a working, animated D3 visualisation! I’ve added a button to
the blogpost to redraw the plot, but you should see the graph animate as
you re-run your r2d3()
line.
You might’ve already noticed that your local plot is of a static size and if you resize your RStudio window, your plot gets cut off. Luckily, {r2d3} comes with built-in width and height objects that change based on the size of the plot container. This means that we can use these variables to make our plot flexibly resize as we resize the window.
If we want to keep similar dimensions between the margins, plot width and height and line and text sizes, you can replace your constant-defining code at the top with the following, but you can play around with the multipliers to determine what relationships you want between sizes.
// size everything relative to the container: `width` and `height` are
// injected by {r2d3} and track the plot container's current size
const margin = {top: 0.1 * width, right: 0.125 * width, bottom: 0.05 * width, left: 0.075 * width}
const plotWidth = width - margin.left - margin.right
const plotHeight = height - margin.top - margin.bottom
// line and text sizes scale with the plot width
const lineWidth = 0.004 * plotWidth
const mediumText = 0.03 * plotWidth
const bigText = 0.04 * plotWidth
Now, if you re-run your plot, it should automatically resize when you change the size of the window. And notice, because the plot is an SVG (scalable vector graphics) element, our plot stays sharp as we make it bigger or smaller.
Get the final .Rmd and .js files
We’ve now created our first D3 visualisation from scratch using the {r2d3} package in RStudio! As you can see, creating a line chart with many lines requires a lot of code and so, if you’re creating a basic plot for non-aesthetic purposes, sticking to {ggplot2} may make more sense. However, if you want your plot to be an interactive website statement piece or a creative, user-driven exploration of data or ideas, D3 may better suit your needs. As this blogpost was aimed at beginners, the end result is not particularly dramatic, but if this has inspired you to learn more, I have provided some links to some amazing D3 creators and resources below.
If you are looking for more comprehensive materials to learn D3, I highly recommend these two video tutorials by Curran Kelleher: Data Visualization with D3.js and Data Visualization with D3, JavaScript, React. Moreover, the The D3.js Graph Gallery by Yan Holtz is a good reference website to see what kind of plots you can make and how. Check out Observable for plenty of creative community-made D3 visualisations. Finally, if you need to be convinced that you can make cool stuff in D3, I highly recommend checking out Shirley Wu, Nadieh Bremer, and Amelia Wattenberger.
For updates and revisions to this article, see the original post
Introduction
The new gslnls-package provides R bindings to nonlinear least-squares optimization with
the GNU Scientific Library (GSL) using the trust region methods implemented by the gsl_multifit_nlinear module. The gsl_multifit_nlinear module was added in GSL version 2.2 (released in August 2016) and the available nonlinear-least squares routines have been ...
The new gslnls
-package provides R bindings to nonlinear least-squares optimization with
the GNU Scientific Library (GSL) using the trust region methods implemented by the gsl_multifit_nlinear
module. The gsl_multifit_nlinear
module was added in GSL version 2.2 (released in August 2016) and the available nonlinear-least squares routines have been thoroughly tested and are well documented, see (Galassi et al. 2009).
The aim of this post is to put the GSL nonlinear least-squares routines to the test and
benchmark their optimization performance against R’s standard nls()
function based on a small selection of test problems taken from the NIST Statistical Reference Datasets (StRD) archive.
The NIST StRD Nonlinear Regression archive includes both generated and real-world nonlinear least squares problems of varying levels of difficulty. The generated datasets are designed to challenge specific computations. Real-world data include challenging datasets such as the Thurber
problem, and more benign datasets such as Misra1a
(not tested here). The certified parameter values are best-available solutions, obtained using 128-bit precision and confirmed by at least two different algorithms and software packages using analytic derivatives.
The NIST StRD archive orders the regression problems by level of difficulty (lower, moderate and higher). In this post, only the regression problems that are labeled with a higher level of difficulty are tested, as these regression models are generally tedious to fit using R’s default nls()
function, especially when the chosen starting values are not close to the least-squares solution.
Table 1 provides an overview of all evaluated test problems including regression models, certified parameter values and starting values. Except for BoxBOD
, all of the listed datasets can be loaded directly in R with the NISTnls
-package available on CRAN^{1}. For the BoxBOD
dataset, the data is parsed separately from the corresponding NIST StRD data (.dat) file.
Dataset name | # Observations | # Parameters | Regression model | Certified parameter values | Starting values | Dataset source | Reference |
---|---|---|---|---|---|---|---|
Rat42 | 9 | 3 | \(f(x) = \dfrac{b_1}{1 + \exp(b_2 - b_3 x)}\) | \([72.462, 2.6181, 0.0673]\) | \([100, 1, 0.1]\) | Observed | Ratkowsky (1983) |
MGH09 | 11 | 4 | \(f(x) = \dfrac{b_1(x^2 + b_2 x)}{x^2 + b_3x + b_4}\) | \([0.1928, 0.1913, 0.1231, 0.1361]\) | \([25, 39, 41.5, 39]\) | Generated | Kowalik and Osborne (1978) |
Thurber | 37 | 7 | \(f(x) = \dfrac{b_1 + b_2x + b_3x^2 + b_4x^3}{1 + b_5x + b_6x^2 + b_7x^3}\) | \([1288.14, 1491.08, 583.238, 75.417, 0.9663, 0.3980, 0.0497]\) | \([1000, 1000, 400, 40, 0.7, 0.3, 0.03]\) | Observed | Thurber (1979) |
MGH10 | 16 | 3 | \(f(x) = b_1 \exp \left( \dfrac{b_2}{x + b_3} \right)\) | \([0.00561, 6181.35, 345.224]\) | \([2, 400000, 25000]\) | Generated | Meyer (1970) |
Eckerle4 | 35 | 3 | \(f(x) = \dfrac{b_1}{b_2} \exp\left( -\dfrac{1}{2} \left(\dfrac{x - b_3}{b_2}\right)^2 \right)\) | \([1.5544, 4.0888, 451.541]\) | \([1, 10, 500]\) | Observed | Eckerle (1979) |
Rat43 | 15 | 4 | \(f(x) = \dfrac{b_1}{(1 + \exp(b_2 – b_3x))^{1/b_4}}\) | \([699.642, 5.2771, 0.7596, 1.2792]\) | \([100, 10, 1, 1]\) | Observed | Ratkowsky (1983) |
Bennett5 | 154 | 3 | \(f(x) = b_1(b_2 + x)^{-1/b_3}\) | \([-2523.51, 46.737, 0.9322]\) | \([-2000, 50, 0.8]\) | Observed | Bennett, Swartzendruber, and Brown (1994) |
BoxBOD | 6 | 2 | \(f(x) = b_1(1 - \exp(-b_2 x))\) | \([213.809, 0.5472]\) | \([1, 1]\) | Observed | Box et al. (1978) |
The regression models and certified parameter values are taken from their respective NIST StRD data (.dat) files. For each test problem, the NIST StRD archive provides two or three sets of parameter starting values for the purpose of testing. The starting values listed in Table 1 correspond to the most difficult sets of starting values that are generally the furthest away from the target least-squares solution.
The following plots display all observed datasets, with the (unique) predictor variable on the x-axis and the response variable on the y-axis. The overlayed continuous line corresponds to the regression model evaluated at the certified parameter values.
Convergence of the nonlinear least-squares routines is tested across a grid of algorithms and pre-selected control parameter choices. For the GSL nonlinear least-squares algorithms, all trust region methods available through the gsl_nls()
function in the gslnls
-package are evaluated, i.e. the algorithm
argument in gsl_nls()
takes the following values:
- "lm", Levenberg-Marquardt algorithm
- "lmaccel", Levenberg-Marquardt algorithm with geodesic acceleration
- "dogleg", Powell's Dogleg algorithm
- "ddogleg", Double dogleg algorithm
- "subspace2D", 2D subspace dogleg generalization

By default, if the jac
argument in the gsl_nls()
function is left unspecified, the Jacobian matrix is approximated by (forward) finite differences. Analogously, when geodesic acceleration is used and the fvv
argument is left unspecified, the second directional derivatives are approximated by (forward) finite differences. In testing the convergence of the GSL routines, the jac
argument is always left unspecified. The Levenberg-Marquardt algorithm with geodesic acceleration is evaluated both with the fvv
argument unspecified (denoted by lmaccel
) and with fvv = TRUE
in which case the second directional derivatives are calculated using symbolic differentiation (denoted by lmaccel+fvv
).
For the control parameters set with gsl_nls_control()
, only the scale
and solver
parameters are varied, see also ?gsl_nls_control
. The maximum number of iterations maxiter
is increased from the default maxiter = 50
to maxiter = 1e4
in order to remove the maximum number of iterations as a constraining factor, and the default values are used for all other control parameters available in gsl_nls_control()
.
The scale
control parameter can take the following values^{2}:
- "more", Moré rescaling. This method makes the problem scale-invariant and has been proven effective on a large class of problems.
- "levenberg", Levenberg rescaling. This method has also proven effective on a large class of problems, but is not scale-invariant. It may perform better for problems susceptible to parameter evaporation (parameters going to infinity).
- "marquardt", Marquardt rescaling. This method is scale-invariant, but it is generally considered inferior to both the Levenberg and Moré strategies.

The solver
control parameter can take on the following values^{3}:
- "qr", QR decomposition of the Jacobian. This method will produce reliable solutions in cases where the Jacobian is rank deficient or near-singular but does require more operations than the Cholesky method.
- "cholesky", Cholesky decomposition of the Jacobian. This method is faster than the QR approach, however it is susceptible to numerical instabilities if the Jacobian matrix is rank deficient or near-singular.
- "svd", SVD decomposition of the Jacobian. This method will produce the most reliable solutions for ill-conditioned Jacobians but is also the slowest.

In order to benchmark the performance of the GSL nonlinear least-squares routines against several common R alternatives, each nonlinear regression model is also fitted using the standard nls()
function, as well as the nlsLM()
function from the minpack.lm
-package.
For the nls()
function, all three available algorithms are tested, i.e. the algorithm
argument is set to respectively:
- "default", the default Gauss-Newton algorithm
- "plinear", Golub-Pereyra algorithm for partially linear least-squares models
- "port", nl2sol algorithm from the Port library

The maximum number of iterations is set to maxiter = 1e4
and the relative convergence tolerance is set to tol = sqrt(.Machine$double.eps)
to mimic the control parameters used for the GSL routines.
For the nlsLM()
function, there is only a single algorithm (Levenberg-Marquardt), so no choice needs to be made here. The maximum number of iterations is set to maxiter = 1e4
and all other control parameters are kept at their default values.
As a worked out example, we display the different NLS calls used to fit the Rat42
nonlinear regression model based on gsl_nls()
, nls()
and nlsLM()
. The Rat42
model and data are an example of fitting sigmoidal growth curves taken from (Ratkowsky 1983). The response variable is pasture yield, and the predictor variable is growing times.
y | 8.93 | 10.8 | 18.59 | 22.33 | 39.35 | 56.11 | 61.73 | 64.62 | 67.08 |
x | 9.00 | 14.0 | 21.00 | 28.00 | 42.00 | 57.00 | 63.00 | 70.00 | 79.00 |
Similar to nls()
, a minimal gsl_nls()
function call consists of the model formula
, the data and a set of starting values. By default, gsl_nls()
uses the Levenberg-Marquardt algorithm (algorithm = "lm"
) with control parameters scale = "more"
and solver = "qr"
. The starting values \((b_1 = 100, b_2 = 1, b_3 = 0.1)\) are taken from Table 1.
library(NISTnls) library(gslnls) ## gsl Levenberg-Marquadt (more+qr) rat42_gsl <- gsl_nls( fn = y ~ b1 / (1 + exp(b2 - b3 * x)), ## model data = Ratkowsky2, ## dataset start = c(b1 = 100, b2 = 1, b3 = 0.1) ## starting values ) rat42_gsl #> Nonlinear regression model #> model: y ~ b1/(1 + exp(b2 - b3 * x)) #> data: Ratkowsky2 #> b1 b2 b3 #> 72.46224 2.61808 0.06736 #> residual sum-of-squares: 8.057 #> #> Algorithm: levenberg-marquardt, (scaling: more, solver: qr) #> #> Number of iterations to convergence: 10 #> Achieved convergence tolerance: 4.619e-14
The gsl_nls()
function returns an object that inherits from the class "nls"
. For this reason, all generic functions available for "nls"
-objects are also applicable to objects returned by gsl_nls()
. For instance,
## model fit summary summary(rat42_gsl) #> #> Formula: y ~ b1/(1 + exp(b2 - b3 * x)) #> #> Parameters: #> Estimate Std. Error t value Pr(>|t|) #> b1 72.462238 1.734028 41.79 1.26e-08 *** #> b2 2.618077 0.088295 29.65 9.76e-08 *** #> b3 0.067359 0.003447 19.54 1.16e-06 *** #> --- #> Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 #> #> Residual standard error: 1.159 on 6 degrees of freedom #> #> Number of iterations to convergence: 10 #> Achieved convergence tolerance: 4.619e-14 ## profile confidence intervals confint(rat42_gsl) #> 2.5% 97.5% #> b1 68.76566669 77.19014998 #> b2 2.41558255 2.84839910 #> b3 0.05947284 0.07600439
Note that the existing predict.nls
method is extended to allow for the calculation of asymptotic confidence and prediction intervals, in addition to prediction of the expected response:
predict(rat42_gsl, interval = "prediction", level = 0.95) #> fit lwr upr #> [1,] 8.548006 5.385407 11.71060 #> [2,] 11.431085 8.235094 14.62708 #> [3,] 16.727705 13.526235 19.92917 #> [4,] 23.532240 20.326258 26.73822 #> [5,] 40.039555 36.612415 43.46669 #> [6,] 55.963267 52.689429 59.23711 #> [7,] 60.546511 57.382803 63.71022 #> [8,] 64.536158 61.311113 67.76120 #> [9,] 67.913137 64.327402 71.49887
As benchmarks to the model fits obtained with gsl_nls()
, each test problem is also fitted with calls to nls()
and minpack.lm::nlsLM()
. For the Rat42
dataset, fitting the regression model with nls()
using the default Gauss-Newton algorithm (algorithm = "default"
) fails to return a valid result:
## nls default nls( formula = y ~ b1 / (1 + exp(b2 - b3 * x)), ## model data = Ratkowsky2, ## dataset start = c(b1 = 100, b2 = 1, b3 = 0.1) ## starting values ) #> Error in nls(formula = y ~ b1/(1 + exp(b2 - b3 * x)), data = Ratkowsky2, : singular gradient
Switching to the Port algorithm (algorithm = "port"
), the nls()
call does converge to the target least-squares solution:
## nls port nls( formula = y ~ b1 / (1 + exp(b2 - b3 * x)), ## model data = Ratkowsky2, ## dataset start = c(b1 = 100, b2 = 1, b3 = 0.1), ## starting values algorithm = "port" ## algorithm ) #> Nonlinear regression model #> model: y ~ b1/(1 + exp(b2 - b3 * x)) #> data: Ratkowsky2 #> b1 b2 b3 #> 72.46224 2.61808 0.06736 #> residual sum-of-squares: 8.057 #> #> Algorithm "port", convergence message: relative convergence (4)
And the same is true when using nlsLM()
with the default Levenberg-Marquardt algorithm:
## nls LM minpack.lm::nlsLM( formula = y ~ b1 / (1 + exp(b2 - b3 * x)), ## model data = Ratkowsky2, ## dataset start = c(b1 = 100, b2 = 1, b3 = 0.1), ## starting values ) #> Nonlinear regression model #> model: y ~ b1/(1 + exp(b2 - b3 * x)) #> data: Ratkowsky2 #> b1 b2 b3 #> 72.46223 2.61808 0.06736 #> residual sum-of-squares: 8.057 #> #> Number of iterations to convergence: 8 #> Achieved convergence tolerance: 1.49e-08
The Rat42
model is partially linear in the sense that y ~ b1 * z
with z = 1 / (1 + exp(b2 - b3 * x))
, which means that the Golub-Pereyra algorithm (algorithm = "plinear"
) can also be applied in this example. Note that the model formula is updated to exclude the linear parameter b1
, and a starting value for this parameter is no longer required.
## nls plinear nls( formula = y ~ 1 / (1 + exp(b2 - b3 * x)), ## model data = Ratkowsky2, ## dataset start = c(b2 = 1, b3 = 0.1), ## starting values algorithm = "plinear" ## algorithm ) #> Nonlinear regression model #> model: y ~ 1/(1 + exp(b2 - b3 * x)) #> data: Ratkowsky2 #> b2 b3 .lin #> 2.61808 0.06736 72.46224 #> residual sum-of-squares: 8.057 #> #> Number of iterations to convergence: 9 #> Achieved convergence tolerance: 1.119e-06
The p-linear algorithm also converges successfully, with the b1
parameter now labeled as .lin
(for linear parameter) in the fitted model coefficients.
Below, the convergence status of the evaluated GSL and benchmark NLS routines is displayed for each individual test problem. The obtained convergence results are categorized according to the following status codes:
Based on the displayed results, an initial observation is that the default Gauss-Newton algorithm in nls()
fails to produce any successful model fit and returns an error for each selected test problem. The Port and (minpack.lm
) Levenberg-Marquardt algorithms show roughly similar convergence results, but only successfully converge for half of the evaluated test problems. The p-linear algorithm is somewhat special as it is only applicable for regression models that can be factored into a partially linear model. However, if applicable, the p-linear algorithm can be a powerful alternative as demonstrated by the BoxBOD
problem, where most other (general) NLS routines fail to converge. More precisely, the BoxBOD
regression model contains only two parameters, and by factoring out the linear parameter, the nonlinear model fit that needs to be optimized by the p-linear algorithm depends only on a single unknown parameter.
Regarding the GSL routines, for each test problem there exists at least one algorithm configuration that is able to produce a successful model fit. Across test problems and control parameter configurations, the GSL Levenberg-Marquardt algorithms with and without geodesic acceleration (lm
, lmaccel
, lmaccel+fvv
) appear to be the most stable, as also seen in the figure below, which displays the total number of successful model fits across test problems. In comparison to the LM algorithm without geodesic acceleration (lm
), the LM algorithm with geodesic acceleration (lmaccel
, lmaccel+fvv
) fails to converge in the Rat43
problem, but on the other hand the lmaccel+fvv
algorithm is the only GSL routine that is able to produce a successful fit in the BoxBOD
problem.
Across control parameter configurations, in terms of the scaling method, more
rescaling (the default) exhibits the most stable performance, followed by marquardt
rescaling and levenberg
rescaling. In the figure below, this is seen most prominently for the different variations of the Dogleg algorithm (dogleg
, ddogleg
, subspace2D
) and somewhat less for the Levenberg-Marquadt algorithms. In terms of the chosen solver method, the svd
solver produces slightly more robust results than the qr
and cholesky
solver methods, which is in line with the solver method section in the GSL documentation.
As supplementary information, we also display the required number of iterations to reach convergence for each successfully converged NLS routine. In case of a successful model fit, the Port algorithm requires only a small number of iterations to reach convergence. The number of iterations required by the minpack.lm
Levenberg-Marquardt algorithm and GSL Levenberg-Marquardt algorithm(s) is of the same order of magnitude. Among the GSL routines, except for the MGH09
problem, the general tendency is that the Dogleg-based algorithms (dogleg
, ddogleg
, subspace2D
) require fewer iterations than the LM-based algorithms. This is illustrated most clearly by the Rat42
and Bennett5
plots.
Based on a small collection of NIST StRD test problems, this post benchmarks the convergence properties of a number of GSL nonlinear least squares routines as well as several standard NLS algorithms that are in common use. For the tested nonlinear regression problems, the GSL algorithms show at least comparable –and often better– optimization performance than the included benchmark algorithms, using mostly standard choices and default values for the GSL trust region method control parameters. As such, the GSL trust region methods provide a useful supplement to the existing suite of nonlinear least squares fitting algorithms available in R, in particular when adequate starting values are difficult to come by and more stable optimization routines (than provided by R’s standard methods) are required.
https://www.gnu.org/software/gsl/doc/html/nls.html#c.gsl_multifit_nlinear_scale︎
https://www.gnu.org/software/gsl/doc/html/nls.html#c.gsl_multifit_nlinear_solver︎
Here, the maximum relative deviation of the fitted values with respect to the certified values is within a small tolerance range \(\epsilon\).︎
With the last Chapter of our “Introduction to R” learning path we conclude our journey towards building strong R programming foundations and skills. With the “Becoming an R deve...
Continue reading: Think like a programmeR: the workshop. Make the transition from R User to R Developer with us on November 3rd!
With the last Chapter of our “Introduction to R” learning path we conclude our journey towards building strong R programming foundations and skills. With the “Becoming an R developer” workshop we will explore how to write your own functions, we will show you how to make your code more efficient and maintainable by avoiding repetition through (implicit and explicit) looping both in base R and with tidyverse
, and we will discuss best development practices such as documentation and unit testing.
The workshop will take place on Wednesday November 3rd, at 2 p.m. CET, and as usual will be 3-hours long and focus on being hands-on, as in our professional and training experience we have seen that trying things out is the best way to actually understand them and integrate them in your way of working.
Register at this link before 27/10 and benefit from the early bird discount.
In their great article Electoral Cycles in Savings ...
Continue reading: RTutor: Does Bank Lending Increase Before Elections? Assume you are an incumbent local politician that can exert certain control on your local public savings bank. Would you try to increase lending before local elections to polish economic performance?
In their great article Electoral Cycles in Savings Bank Lending (JEEA, 2017) Florian Englmaier and Till Stowasser explore the degree of such electoral lending cycles using detailed German data and non-public banks as a control group. They also study related questions: For example, it does matter whether the election was fairly contested or not.
As part of his Master Thesis at Ulm University, Markus Reichart has created a very nice RTutor problem set that allows you to replicate key findings in an interactive fashion. He puts great emphasis in explaining in detail the underlying difference-in-difference structure that is fairly complex in the final analysis and also adds a nice own analysis about the role of the financial crisis. Like in previous RTutor problem sets, you can enter free R code in a web-based shiny app. The code will be automatically checked and you can get hints how to proceed.
You can test the problem set online on shinyapps.io
https://markusreichart.shinyapps.io/RTutorElectionsAndBankLending
or locally install the problem set, by following the installation guide at the problem set’s Github repository:
https://github.com/mareichart/RTutorElectionsAndBankLending
Here is a screenshot of one of the plots you generate in the problem set:
(Yes, part of the problem set is a discussion of the parallel trends assumption. While one surely would not mind a bit more parallel development in the shown plot, the final regressions do add additional control variables. Also trends look more parallel if averaged across states.)
If you want to learn more about RTutor, try out other problem sets, or create a problem set yourself, take a look at the Github page
https://github.com/skranz/RTutor
or at the documentation
https://skranz.github.io/RTutor
For...
Continue reading: A one-liner for generating random participant IDs. On one of the Slacks I browse, someone asked how to de-identify a column of participant IDs. The original dataset was a wait list, so the ordering of IDs itself was a sensitive feature of the data and we need to scramble the order of IDs produced.
For example, suppose we have the following repeated measures dataset.
library(tidyverse) data <- tibble::tribble( ~ participant, ~ timepoint, ~ score, "DB", 1, 7, "DB", 2, 8, "DB", 3, 8, "TW", 1, NA, "TW", 2, 9, "CF", 1, 9, "CF", 2, 8, "JH", 1, 10, "JH", 2, 10, "JH", 3, 10 )
We want to map the participant
identifiers onto some sort of
shuffled-up random IDs. Suggestions included hashing the IDs with
digest:
# This approach cryptographically compresses the input into a short # "digest". (It is not a random ID.) data %>% mutate( participant = Vectorize(digest::sha1)(participant) ) #> # A tibble: 10 x 3 #> participant timepoint score #> <chr> <dbl> <dbl> #> 1 ad61ec1247b2381922bec89483c3ce2fb67f98d9 1 7 #> 2 ad61ec1247b2381922bec89483c3ce2fb67f98d9 2 8 #> 3 ad61ec1247b2381922bec89483c3ce2fb67f98d9 3 8 #> 4 c080f9a87edc6d47f28185279fd8be068c566a37 1 NA #> 5 c080f9a87edc6d47f28185279fd8be068c566a37 2 9 #> 6 1f9da22bf684761daec27326331c58b46502a25b 1 9 #> 7 1f9da22bf684761daec27326331c58b46502a25b 2 8 #> 8 627d211747438ae59690cea8f0a8d6adf666b974 1 10 #> 9 627d211747438ae59690cea8f0a8d6adf666b974 2 10 #> 10 627d211747438ae59690cea8f0a8d6adf666b974 3 10
But this approach seems like overkill, and hashing just transforms these IDs. We want to be rid of them completely.
The uuid package provides another approach:
data %>% group_by(participant) %>% mutate( id = uuid::UUIDgenerate(use.time = FALSE) ) %>% ungroup() %>% select(-participant, participant = id) %>% relocate(participant) #> # A tibble: 10 x 3 #> participant timepoint score #> <chr> <dbl> <dbl> #> 1 384eabc6-01ef-4ffb-8e87-bc4c460532c7 1 7 #> 2 384eabc6-01ef-4ffb-8e87-bc4c460532c7 2 8 #> 3 384eabc6-01ef-4ffb-8e87-bc4c460532c7 3 8 #> 4 3f601a14-2c39-48ee-9536-0d5396be2839 1 NA #> 5 3f601a14-2c39-48ee-9536-0d5396be2839 2 9 #> 6 02f888f9-eb05-40d9-a425-89d51a24cb81 1 9 #> 7 02f888f9-eb05-40d9-a425-89d51a24cb81 2 8 #> 8 c007112b-ecd2-4c3b-b186-0fc53504dd38 1 10 #> 9 c007112b-ecd2-4c3b-b186-0fc53504dd38 2 10 #> 10 c007112b-ecd2-4c3b-b186-0fc53504dd38 3 10
Again, these IDs seem excessive: Imagine plotting data with one participant per facet.
When I create blogposts for this site, I use a function to create a new
.Rmd file with the date and a random adjective-animal
phrase for a
placeholder (e.g., 2021-06-28-mild-capybara.Rmd
). We could try that for
fun:
data %>% group_by(participant) %>% mutate( id = ids::adjective_animal() ) %>% ungroup() %>% select(-participant, participant = id) %>% relocate(participant) #> # A tibble: 10 x 3 #> participant timepoint score #> <chr> <dbl> <dbl> #> 1 chrysoprase_bushsqueaker 1 7 #> 2 chrysoprase_bushsqueaker 2 8 #> 3 chrysoprase_bushsqueaker 3 8 #> 4 hideous_cheetah 1 NA #> 5 hideous_cheetah 2 9 #> 6 powdery_siamang 1 9 #> 7 powdery_siamang 2 8 #> 8 ducal_hornshark 1 10 #> 9 ducal_hornshark 2 10 #> 10 ducal_hornshark 3 10
But that’s too whimsical (and something like hideous-cheetah
seems
disrespectful for human subjects).
One user suggested forcats::fct_anon()
:
data %>% mutate( participant = participant %>% as.factor() %>% forcats::fct_anon(prefix = "p0") ) #> # A tibble: 10 x 3 #> participant timepoint score #> <fct> <dbl> <dbl> #> 1 p04 1 7 #> 2 p04 2 8 #> 3 p04 3 8 #> 4 p02 1 NA #> 5 p02 2 9 #> 6 p03 1 9 #> 7 p03 2 8 #> 8 p01 1 10 #> 9 p01 2 10 #> 10 p01 3 10
This approach works wonderfully. The only wrinkle is that it requires converting our IDs to a factor in order to work.
match()
-makerMy approach is a nice combination of base R functions:
data %>% mutate( participant = match(participant, sample(unique(participant))) ) #> # A tibble: 10 x 3 #> participant timepoint score #> <int> <dbl> <dbl> #> 1 3 1 7 #> 2 3 2 8 #> 3 3 3 8 #> 4 1 1 NA #> 5 1 2 9 #> 6 2 1 9 #> 7 2 2 8 #> 8 4 1 10 #> 9 4 2 10 #> 10 4 3 10
match(x, table)
returns the first
positions of the x
elements in some vector table
. What is the
position in the alphabet of the letters L and Q and L again?
match(c("L", "Q", "L"), LETTERS) #> [1] 12 17 12
sample()
shuffles the values in
the table
so the order of elements is lost. The unique()
is
optional. We could just sample(data$participant)
. Then the first
position of one of the IDs might be a number larger than 4:
shuffle <- sample(data$participant) shuffle #> [1] "CF" "JH" "TW" "JH" "DB" "DB" "DB" "JH" "CF" "TW" match(data$participant, shuffle) #> [1] 5 5 5 3 3 1 1 2 2 2
For more aesthetically pleasing names, and for names that will sort
correctly, we can zero-pad the results with
sprintf()
. I am mostly
including this step so that I have it written down somewhere for my own
reference.
zero_pad <- function(xs, prefix = "", width = 0) { # use widest element if bigger than `width` width <- max(c(nchar(xs), width)) sprintf(paste0(prefix, "%0", width, "d"), xs) } data %>% mutate( participant = match(participant, sample(unique(participant))), participant = zero_pad(participant, "p", 3) ) #> # A tibble: 10 x 3 #> participant timepoint score #> <chr> <dbl> <dbl> #> 1 p003 1 7 #> 2 p003 2 8 #> 3 p003 3 8 #> 4 p004 1 NA #> 5 p004 2 9 #> 6 p002 1 9 #> 7 p002 2 8 #> 8 p001 1 10 #> 9 p001 2 10 #> 10 p001 3 10
match()
%in%
disguiseWhat happens when match()
fails to find an x
in the table? By
default, we get NA
. But we can customize the results with the
nomatch
argument.
match(c("7", "A", "L"), LETTERS) #> [1] NA 1 12 match(c("7", "A", "L"), LETTERS, nomatch = -99) #> [1] -99 1 12 match(c("7", "A", "L"), LETTERS, nomatch = 0) #> [1] 0 1 12
If we do something like this last example, then we can check whether an
element in x
has a match by checking for numbers greater than 0.
match(c("7", "A", "L"), LETTERS, nomatch = 0) > 0 #> [1] FALSE TRUE TRUE
And that is how the functions %in%
and is.element()
are implemented
behind the scenes:
c("7", "A", "L") %in% LETTERS #> [1] FALSE TRUE TRUE # The 0L means it's an integer number instead of floating point number `%in%` #> function (x, table) #> match(x, table, nomatch = 0L) > 0L #> <bytecode: 0x00000000146eef20> #> <environment: namespace:base> is.element(c("7", "A", "L"), LETTERS) #> [1] FALSE TRUE TRUE is.element #> function (el, set) #> match(el, set, 0L) > 0L #> <bytecode: 0x000000001714f110> #> <environment: namespace:base>
Last knitted on 2021-10-12. Source code on GitHub.^{1}
sessioninfo::session_info() #> - Session info --------------------------------------------------------------- #> setting value #> version R version 4.1.1 (2021-08-10) #> os Windows 10 x64 #> system x86_64, mingw32 #> ui RTerm #> language (EN) #> collate English_United States.1252 #> ctype English_United States.1252 #> tz America/Chicago #> date 2021-10-12 #> #> - Packages ------------------------------------------------------------------- #> package * version date lib source #> assertthat 0.2.1 2019-03-21 [1] CRAN (R 4.1.0) #> backports 1.2.1 2020-12-09 [1] CRAN (R 4.1.0) #> broom 0.7.9 2021-07-27 [1] CRAN (R 4.1.0) #> cellranger 1.1.0 2016-07-27 [1] CRAN (R 4.1.0) #> cli 3.0.1 2021-07-17 [1] CRAN (R 4.1.0) #> colorspace 2.0-2 2021-06-24 [1] CRAN (R 4.1.0) #> crayon 1.4.1 2021-02-08 [1] CRAN (R 4.1.0) #> DBI 1.1.1 2021-01-15 [1] CRAN (R 4.1.0) #> dbplyr 2.1.1 2021-04-06 [1] CRAN (R 4.1.0) #> digest 0.6.28 2021-09-23 [1] CRAN (R 4.1.1) #> dplyr * 1.0.7 2021-06-18 [1] CRAN (R 4.1.0) #> ellipsis 0.3.2 2021-04-29 [1] CRAN (R 4.1.0) #> evaluate 0.14 2019-05-28 [1] CRAN (R 4.1.0) #> fansi 0.5.0 2021-05-25 [1] CRAN (R 4.1.0) #> forcats * 0.5.1 2021-01-27 [1] CRAN (R 4.1.0) #> fs 1.5.0 2020-07-31 [1] CRAN (R 4.1.0) #> generics 0.1.0 2020-10-31 [1] CRAN (R 4.1.0) #> ggplot2 * 3.3.5 2021-06-25 [1] CRAN (R 4.1.0) #> git2r 0.28.0 2021-01-10 [1] CRAN (R 4.1.1) #> glue 1.4.2 2020-08-27 [1] CRAN (R 4.1.0) #> gtable 0.3.0 2019-03-25 [1] CRAN (R 4.1.0) #> haven 2.4.3 2021-08-04 [1] CRAN (R 4.1.0) #> here 1.0.1 2020-12-13 [1] CRAN (R 4.1.0) #> hms 1.1.1 2021-09-26 [1] CRAN (R 4.1.1) #> httr 1.4.2 2020-07-20 [1] CRAN (R 4.1.0) #> ids 1.0.1 2017-05-31 [1] CRAN (R 4.1.0) #> jsonlite 1.7.2 2020-12-09 [1] CRAN (R 4.1.0) #> knitr * 1.36 2021-09-29 [1] CRAN (R 4.1.1) #> lifecycle 1.0.1 2021-09-24 [1] CRAN (R 4.1.1) #> lubridate 1.7.10 2021-02-26 [1] CRAN (R 4.1.0) #> magrittr 2.0.1 2020-11-17 [1] CRAN (R 4.1.0) #> modelr 0.1.8 2020-05-19 [1] CRAN (R 4.1.0) #> munsell 0.5.0 2018-06-12 [1] 
CRAN (R 4.1.0) #> pillar 1.6.3 2021-09-26 [1] CRAN (R 4.1.1) #> pkgconfig 2.0.3 2019-09-22 [1] CRAN (R 4.1.0) #> purrr * 0.3.4 2020-04-17 [1] CRAN (R 4.1.0) #> R6 2.5.1 2021-08-19 [1] CRAN (R 4.1.1) #> ragg 1.1.3 2021-06-09 [1] CRAN (R 4.1.0) #> Rcpp 1.0.7 2021-07-07 [1] CRAN (R 4.1.0) #> readr * 2.0.2 2021-09-27 [1] CRAN (R 4.1.1) #> readxl 1.3.1 2019-03-13 [1] CRAN (R 4.1.0) #> reprex 2.0.1 2021-08-05 [1] CRAN (R 4.1.0) #> rlang 0.4.11 2021-04-30 [1] CRAN (R 4.1.0) #> rprojroot 2.0.2 2020-11-15 [1] CRAN (R 4.1.0) #> rstudioapi 0.13 2020-11-12 [1] CRAN (R 4.1.0) #> rvest 1.0.1 2021-07-26 [1] CRAN (R 4.1.0) #> scales 1.1.1 2020-05-11 [1] CRAN (R 4.1.0) #> sessioninfo 1.1.1 2018-11-05 [1] CRAN (R 4.1.0) #> stringi 1.7.5 2021-10-04 [1] CRAN (R 4.1.1) #> stringr * 1.4.0 2019-02-10 [1] CRAN (R 4.1.0) #> systemfonts 1.0.2 2021-05-11 [1] CRAN (R 4.1.0) #> textshaping 0.3.5 2021-06-09 [1] CRAN (R 4.1.0) #> tibble * 3.1.5 2021-09-30 [1] CRAN (R 4.1.1) #> tidyr * 1.1.4 2021-09-27 [1] CRAN (R 4.1.1) #> tidyselect 1.1.1 2021-04-30 [1] CRAN (R 4.1.0) #> tidyverse * 1.3.1 2021-04-15 [1] CRAN (R 4.1.0) #> tzdb 0.1.2 2021-07-20 [1] CRAN (R 4.1.0) #> utf8 1.2.2 2021-07-24 [1] CRAN (R 4.1.0) #> uuid 0.1-4 2020-02-26 [1] CRAN (R 4.1.0) #> vctrs 0.3.8 2021-04-29 [1] CRAN (R 4.1.0) #> withr 2.4.2 2021-04-18 [1] CRAN (R 4.1.0) #> xfun 0.26 2021-09-14 [1] CRAN (R 4.1.1) #> xml2 1.3.2 2020-04-23 [1] CRAN (R 4.1.0) #> #> [1] C:/Users/trist/Documents/R/win-library/4.1 #> [2] C:/Program Files/R/R-4.1.1/library
R-Tips Weekly
This article is part of R-Tips Wee...
Continue reading: How to Make a Heatmap of Customers in R [Video]]]>The ggplot2
package is an essential tool in every data scientist's toolkit. Today we show you how to use ggplot2
to make a professional heatmap that organizes customers by their sales purchasing habits.
This article is part of R-Tips Weekly, a weekly video tutorial that shows you step-by-step how to do common R coding tasks.
Here are the links to get set up.
Watch this video to learn how to make a Customer Heatmap in ggplot2
. Click the image to play the tutorial.
By the end of this tutorial, you will have created this Customer Heatmap showing purchasing preferences.
You just created a Customer Heatmap using ggplot2
. This is great, but there’s a lot more to learning data science.
If you’d like to learn data visualizations, data wrangling, shiny
apps, and data science for business with R, then read on.
It took me a long time to learn data science. I made a lot of mistakes as I fumbled through learning R. I specifically had a tough time navigating the ever increasing landscape of tools and packages, trying to pick between R and Python, and getting lost along the way.
If you feel like this, you’re not alone. Coding is tough, data science is tough, and connecting it all with the business is tough.
The good news is that, after years of learning, I was able to become a highly-rated business consultant working with Fortune 500 clients and my career advanced rapidly. More than that, I was able to help others in the community by developing open source software that has been downloaded over 1,000,000 times, and I found a real passion for coding.
In fact, that’s the driving reason that I created Business Science to help people like you and me that are struggling to learn data science for business (You can read about my personal journey here).
What I found out is that:
Data Science does not have to be difficult, it just has to be taught smartly
Anyone can learn data science fast provided they are motivated.
If you are interested in learning R and the ecosystem of tools at a deeper level, then I have a streamlined program that will get you past your struggles and improve your career in the process.
It’s called the 5-Course R-Track System. It’s an integrated system containing 5 courses that work together on a learning path. Through 5+ projects, you learn everything you need to help your organization: from data science foundations, to advanced machine learning, to web applications and deployment.
The result is that you break through previous struggles, learning from my experience & our community of 2000+ data scientists that are ready to help you succeed.
Ready to take the next step? Then let’s get started.
Want these tips every week? Join R-Tips Weekly.
In den letzten Jahren1 konnte in diversen Ländern der Aufstieg populistischer Phänomene beobachtet werden: der Aufstieg der AfD in Deutschland, die seit ihrer Gründung binnen 5 Jahren in sämtlichen deutschen Landtagen, d...
Continue reading: Umfragewerte oder Medienpräsenz: Was kommt zuerst?]]>In den letzten Jahren^{1} konnte in diversen Ländern der Aufstieg populistischer Phänomene beobachtet werden: der Aufstieg der AfD in Deutschland, die seit ihrer Gründung binnen 5 Jahren in sämtlichen deutschen Landtagen, dem Bundestag und dem europäischen Parlament vertreten ist; der Wahlsieg Donald J. Trumps in den USA; der Aufstieg der Lega in Italien und viele weitere Beispiele. Ein Verlaufsmuster, das bei jedem einzelnen dieser Aufstiege in der Öffentlichkeit auftritt ist das folgende Ping-Pong-Spiel zwischen Medien und Umfragewerten: Je größer die gesellschaftliche Bekanntheit, desto größer das Medienecho, desto größer die Bekanntheit, etc. In diesem Artikel werde ich diesen Zusammenhang empirisch untersuchen, und dabei folgenden Fragestellungen nachgehen:
1) Gibt es einen messbaren, zeitlichen Zusammenhang zwischen medialer Aufmerksamkeit für die AfD und den Umfragewerten der Partei?
2) Wenn ja, welche zeitliche Abfolge gilt: Erst Medienecho, dann Umfrageergebnis, oder umgekehrt?
In der Folge erkläre ich detailliert meine Vorgehensweise, nehme hier aber bereits die Antworten vorweg:
1) Ja, in der Phase zwischen “erster Spaltung” der AfD (Juli 2015) bis zur Bundestagswahl 2017 ist dieser Zusammenhang deutlich messbar
2) Laut Zahlenwerk am plausibelsten: Die Steigerung der Berichterstattung erfolgt knapp vorher oder genau zeitlich synchron; es lassen sich aber keine eindeutigen Aussagen treffen.
Für die Analyse verwende ich zwei verschiedene Datenquellen:
Für einen ersten Überblick der verwendeten Daten hier zunächst der Blick auf die Anzahl der veröffentlichten Artikel in der Zeit, die den String “AfD” enthalten:
Interessante Beobachtungen hierbei sind zum Einen die Spitzen, deren erste im Jahre 2017 sich einfach mit der Bundestagswahl in diesem Jahr erklären lässt. Der hohe Ausschlag im Jahr 2020 fällt auf den Februar, als die Bürgerschaftswahl in Hamburg anstand. Zum Anderen durchaus spannend ist das “Tal” zwischen 2018 und Mitte 2019, in welchem die Aufmerksamkeit für die Partei, zumindest in der Berichterstattung von zeit.de erstmal zu erlahmen schien. Erst die Bürgerschaftswahl und dann der beginnende Wahlkampf zur Bundestagswahl 2021 scheint das Interesse wieder geweckt zu haben.
Der zweite Datentopf besteht aus den aggregierten Wahlumfragen von wahlrecht.de (“Wenn nächsten Sonntag Bundestagswahl wäre…”), und ergibt folgendes Bild:
Auf den langen Zeitraum betrachtet zeigt sich ein bewegter, insgesamt leicht steigender Verlauf mit bisherigem Höchststand Anfang 2018. Der Einfluss der Corona-Pandemie mit Beginn um die Jahreswende 2019/20 scheint ein deutlich negativer zu sein, trotz (oder vielleicht gerade wegen) der Nähe der AfD zu pandemieskeptischen Kreisen.
Zur Ermittelung eines Zusammenhangs zwischen den beiden Datentöpfen wird hier die Korrelation berechnet, je höher dieser Wert ausfällt, desto deutlicher ausgeprägt ist auch der Zusammenhang zwischen beiden Größen. Hierbei ist festzuhalten, dass Korrelation keine Kausation bedeutet, d.h. der festgestellte Zusammenhang muss nicht derart ausgeprägt sein, dass das Auftreten des einen Ereignisses direkt das andere Ereignis herbeiführt.
Um die zeitliche Komponente der Untersuchung widerzuspiegeln, wird die Korrelation in einem laufenden Fenster aus paarweise zueinander geordneten Beobachtungen berechnet. Hierbei werden verschiedene zeitliche Abstände probiert, um bspw. die Bewertung “tritt ein erhöhter Umfragewert zwei Monate nach Steigerung der Berichterstattung an?” beantworten zu können. Die Wahl der Länge des zeitlichen Fensters, das jeweils über den Zeithorizont geschoben wird, hat hierbei Auswirkungen in zwei Richtungen: Ein kurzes Fenster ermöglicht den Blick in sehr kurzfristige Veränderungen des Geschehens, hat aber eine statistisch viel größere Streuung als dies ein längeres Fenster hat. In den unten gezeigten Plots habe ich eine Fensterlänge von einem halben Jahr, d.h. 18 Messpunkten gewählt.
In der folgenden Grafik sind die Verläufe von zwei verschiedenen Varianten der laufenden Korrelationsfenster angegeben:
Weiterhin wurde der Schwellwert 0.708 eingezeichnet, als der Schwellwert zur Signifikanz des Tests auf Unkorreliertheit zum Niveau 0,1% für n=18 ^{5}. Die Tatsache, dass beide Kurven über einen Zeitraum von fast einem Jahr über diesem Schwellwert rangieren, deutet auf die Deutlichkeit des Zusammenhangs hin. Der Zeitraum bezieht sich hierbei auf die vorhergehenden 18 Monate je Messpunkt, die Grafik zeigt somit, dass die Aussage “Umfragewerte und Berichterstattung steigen gleichzeitig” für den Zeitraum zwischen September 2014 bis Februar 2017 statistisch sehr plausibel bewertet wird. Das gleiche gilt aber auch für die Aussage “Umfragewerte steigen einen Monat nach erhöhter Berichterstattung”, und deutet damit auf die Unschärfen hin, die dieser Untersuchung zugrunde liegen. In der Tat steigen auch die Kurven zu anderen zeitlichen Abständen (+2 bis -2) in diesem Zeitraum auf statistisch signifikante Werte, jedoch nicht über einen derart langen Zeitraum.
Zusammengefasst: Die Fragestellung 1) kann eindeutig beantwortet werden, alle weiteren Fragestellungen entziehen sich einer eindeutigen Klärung. Dennoch halte ich die dargestellte Vorgehensweise für einen interessanten Weg, politische und mediale Phänomene messbar und bewertbar zu machen.
This post is inspired by a really approachable post on particle swarm optimisation by Adrian Tam. We’ll build a basic particle swarm optimiser in R and try to visualise the results.
Libraries
# install.packages(pacman)
pacman::p_load(dplyr, ggan...
This post is inspired by a really approachable post on particle swarm optimisation by Adrian Tam. We’ll build a basic particle swarm optimiser in R and try to visualise the results.
# Load (and, via pacman, auto-install if missing) the packages used below.
# install.packages("pacman")
pacman::p_load(dplyr, gganimate, metR)
We’ll use the Ackley’s Function here to evaluate how well the optimiser works. The function has many local optima and should pose a challenge to the optimisation routine.
# Ackley's function, shifted so the global minimum (value 0) sits at (1, 1).
# The cosine term has period 1, so shifting by the integer offset 1 keeps the
# minimum exact without modifying that term.
obj_func <- function(x, y){
  # Modifying for a different global minimum
  -20 * exp(-0.2 * sqrt(0.5 * ((x - 1)^2 + (y - 1)^2))) -
    exp(0.5 * (cos(2 * pi * x) + cos(2 * pi * y))) + exp(1) + 20
}

# Set of x and y values (search domain)
x <- seq(-10, 10, length.out = 100)
y <- seq(-10, 10, length.out = 100)

# Create a data frame that stores every permutation of x and y coordinates.
# (Idiom fix: spell out FALSE instead of the reassignable shorthand F.)
grid <- expand.grid(x, y, stringsAsFactors = FALSE)
head(grid)

# Evaluate the objective function at each x, y value
grid$z <- obj_func(grid[, 1], grid[, 2])

# Create a filled contour plot of the surface (reused by later chunks)
contour_plot <- ggplot(grid, aes(x = Var1, y = Var2)) +
  geom_contour_filled(aes(z = z), color = "black", alpha = 0.5) +
  scale_fill_brewer(palette = "Spectral") +
  theme_minimal() +
  labs(x = "x", y = "y",
       title = "Ackley's Function",
       subtitle = "Contour plot")
contour_plot
The optimiser works as such:
Or, to put it more formally:
Say we are operating in 2 dimensions (x and y coordinates). Then, for each particle i
$$x_{t+1}^i = x_{t}^i + \Delta{x_t}^i$$ $$y_{t+1}^i = y_{t}^i + \Delta{y_t}^i$$
And,
$$\Delta{x_t}^i = w\Delta{x_{t-1}^i} + c_1r_1(x_{localBest} - x_i) + c_2r_2(x_{globalBest} - x_i)$$ $$\Delta{y_t}^i = w\Delta{y_{t-1}^i} + c_1r_1(y_{localBest} - y_i) + c_2r_2(y_{globalBest} - y_i)$$
Where w
, c1
, c2
, r1
, r2
are positive constants. r1
and r2
are uniformly distributed (positive) random numbers. These random numbers need to be positive because the direction in which each particle will move is decided by where localBest
and globalBest
are. Again, localBest
is the optimal function value observed by the ith
particle and globalBest
is the optimal function value across all particles.
# Say we start with 20 particles
n_particles <- 20

# Set some initial values for the constants: inertia weight (w) and the
# attraction weights towards the personal best (c1) and global best (c2)
w <- 0.5
c1 <- 0.05
c2 <- 0.1

# Search domain in x and y coordinates
x <- seq(-10, 10, length.out = 20)
y <- seq(-10, 10, length.out = 20)

# Random, non-repeating starting positions, one row per particle.
# (Idiom fix: spell out FALSE instead of the reassignable shorthand F.)
X <- data.frame(x = sample(x, n_particles, replace = FALSE),
                y = sample(y, n_particles, replace = FALSE))

# Chart starting locations on top of the objective's contour plot
contour_plot +
  geom_point(data = X, aes(x, y), color = "red", size = 2.5) +
  labs(title = "PSO", subtitle = "Iter 0")
# Uniformly distributed (positive) initial velocities, scaled down by the
# inertia factor w from the update equation above
dX <- w * matrix(runif(n_particles * 2), ncol = 2)

# Each particle's personal best starts at its initial position
pbest <- X
pbest_obj <- obj_func(X[, 1], X[, 2])

# Global best = best personal best across the whole swarm
gbest <- pbest[which.min(pbest_obj), ]
gbest_obj <- min(pbest_obj)

# Angle from each particle towards the global best, for drawing arrows.
# atan() only covers (-90, 90) degrees, so flip by 180 when the target
# lies to the left of the particle.
X_dir <- X %>%
  mutate(g_x = gbest[1, 1],
         g_y = gbest[1, 2],
         angle = atan((g_y - y) / (g_x - x)) * 180 / pi,
         angle = ifelse(g_x < x, 180 + angle, angle))

contour_plot +
  geom_point(data = X, aes(x, y), color = "red", size = 2.5) +
  geom_arrow(data = X_dir, aes(x, y, mag = 1, angle = angle),
             direction = "ccw", pivot = 0, show.legend = FALSE) +
  labs(title = "PSO", subtitle = "Iter 0")
# Update dX based on the equation shown previously:
# inertia + pull towards personal best + pull towards global best
dX <- w * dX + c1 * runif(1) * (pbest - X) + c2 * runif(1) * (as.matrix(gbest) - X)

# Add dX to current locations
X <- X + dX

# Evaluate objective function at new positions
# Note that X[,1] is the first column i.e. x coordinates
obj <- obj_func(X[, 1], X[, 2])

# Find those points where the objective function is lower than the
# previous iteration.
# BUG FIX: the original used `obj >= pbest_obj`, which overwrote the
# personal bests with *worse* values; for minimisation it must be <=,
# matching the pso_optim() function defined later.
idx <- which(obj <= pbest_obj)

# Update locations of local best and store local best values
pbest[idx, ] <- X[idx, ]
pbest_obj[idx] <- obj[idx]

# Identify the minimum value of the objective function amongst all points
idx <- which.min(pbest_obj)

# Store as global best
gbest <- pbest[idx, ]
gbest_obj <- min(pbest_obj)

# Arrows pointing from each particle towards the new global best
X_dir <- X %>%
  mutate(g_x = gbest[1, 1],
         g_y = gbest[1, 2],
         angle = atan((g_y - y) / (g_x - x)) * 180 / pi,  # Need angles to show direction
         angle = ifelse(g_x < x, 180 + angle, angle))

contour_plot +
  geom_point(data = X, aes(x, y), color = "red", size = 2.5) +
  geom_arrow(data = X_dir, aes(x, y, mag = 1, angle = angle),
             direction = "ccw", pivot = 0, show.legend = FALSE) +
  labs(title = "PSO", subtitle = "Iter 1")
We can now encapsulate everything inside a function for ease of use.
# Final function: particle swarm optimiser (minimisation).
#
# Args:
#   obj_func    - objective function taking vectors x and y (plus optional
#                 extra arguments forwarded via ...)
#   c1, c2      - attraction weights towards the personal / global best
#   w           - inertia weight on the previous velocity
#   n_particles - swarm size
#   init_fact   - scale factor for the initial random velocities
#   n_iter      - number of iterations
#   x, y        - numeric vectors whose range defines the search domain.
#                 FIX: these were previously free variables silently looked
#                 up in the calling environment; they are now proper
#                 arguments defaulting to the article's [-10, 10] domain,
#                 so existing calls passing x =, y = keep working.
#   ...         - forwarded to obj_func.
#                 FIX: previously accepted but never actually forwarded.
#
# Returns a list with the logged particle positions (X; columns X1, X2,
# iter — relied on by the plotting code), the best objective value found
# (obj) and its location as an "x,y" string (obj_loc).
pso_optim <- function(obj_func,
                      c1 = 0.05,
                      c2 = 0.05,
                      w = 0.8,
                      n_particles = 20,
                      init_fact = 0.1,
                      n_iter = 50,
                      x = seq(-10, 10, length.out = 100),
                      y = seq(-10, 10, length.out = 100),
                      ...) {
  # Search grid spanning the range of the supplied domain vectors
  x <- seq(min(x), max(x), length.out = 100)
  y <- seq(min(y), max(y), length.out = 100)

  # Random starting positions and small random starting velocities
  X <- cbind(sample(x, n_particles, replace = FALSE),
             sample(y, n_particles, replace = FALSE))
  dX <- matrix(runif(n_particles * 2) * init_fact, ncol = 2)

  # Personal bests start at the initial positions
  pbest <- X
  pbest_obj <- obj_func(x = X[, 1], y = X[, 2], ...)

  # Global best across the swarm
  gbest <- pbest[which.min(pbest_obj), ]
  gbest_obj <- min(pbest_obj)

  # Trajectory log; iter 0 holds the starting positions
  loc_df <- data.frame(X, iter = 0)
  iter <- 1

  while (iter < n_iter) {
    # Broadcast gbest across all particle rows.
    # FIX: the original `gbest - X` recycled the length-2 gbest vector
    # column-major, pairing gbest's y-coordinate with x-positions (and
    # vice versa) for every other particle.
    G <- matrix(gbest, nrow = n_particles, ncol = 2, byrow = TRUE)

    # Velocity and position updates
    dX <- w * dX + c1 * runif(1) * (pbest - X) + c2 * runif(1) * (G - X)
    X <- X + dX

    obj <- obj_func(x = X[, 1], y = X[, 2], ...)

    # Update personal bests where the new value is at least as good
    idx <- which(obj <= pbest_obj)
    pbest[idx, ] <- X[idx, ]
    pbest_obj[idx] <- obj[idx]

    # Refresh the global best.
    # FIX: the original never updated gbest_obj inside the loop, so the
    # returned obj was the best value among the *initial* positions only.
    idx <- which.min(pbest_obj)
    gbest <- pbest[idx, ]
    gbest_obj <- pbest_obj[idx]

    # Update iteration and log the new positions
    iter <- iter + 1
    loc_df <- rbind(loc_df, data.frame(X, iter = iter))
  }

  list(X = loc_df,
       obj = gbest_obj,
       obj_loc = paste0(gbest, collapse = ","))
}
This part is fun! We can use the awesome gganimate
package to visualise the path of each point and see how the optimiser searches and converges towards an optimal value.
# Animate the logged particle positions over the objective's contour lines;
# transition_time() advances the frames along the iter column.
anim <- ggplot(out$X) +
  geom_contour(data = grid,
               aes(x = Var1, y = Var2, z = z),
               color = "black") +
  geom_point(aes(X1, X2)) +
  labs(x = "X", y = "Y") +
  transition_time(iter) +
  ease_aes("linear")
anim
A few things are worth noting here:
c1
decides how much a given point moves towards the best value that it has encountered thus far. Keeping this value low helps the optimiser converge faster.w
also helps converge faster, but if it's too high the optimiser tends to swing back and forth between solutions. Thoughts? Comments? Helpful? Not helpful? Like to see anything else added in here? Let me know!