options(repr.plot.width=8, repr.plot.height=6, lwd = 4)
library("RColorBrewer") # for pretty colors
library("tidyverse") # for string interpolation to print variables in plots.
library("latex2exp") # the TeX() function makes it possible to print latex math
= brewer.pal(12, "Paired")[c(1,2,7,8,3,4,5,6,9,10)]; colors
Analyzing internet download speeds with a Gaussian model
a notebook for the book Bayesian Learning by Mattias Villani
Problem
The maximum internet connection speed downstream in my home is 50 Mbit/sec. This maximum will typically never be reached, but my internet service provider (ISP) claims that the average speed is at least 20Mbit/sec. I want to collect some data to investigate this.
Getting started
First, some housekeeping: loading libraries and setting up colors.
Data
I collect a total of five measurements over the course of five consecutive using an speed testing internet service:
= c(15.77, 20.5, 8.26, 14.37, 21.09) x
Model
The measurements are assumed to be
The measurements are reported to have a standard deviation of
= 5^2 sigma2
Prior
I will use a prior centered on the average claimed by the ISP,
= 20
mu_0 = 5^2 tau2_0
Posterior
A normal prior for a normal model gives us a posterior which is also normal:
Let’s write a small function that computes the posterior mean and variance, and plots the prior, likelihood and posterior.
<- function(x, mu_0, tau2_0, sigma2, thetaGrid, areaPoint){
postGaussianIID
# compute posterior mean and variance
= length(x)
n = 1/(n/sigma2 + 1/tau2_0)
tau2_n = (n/sigma2)/(n/sigma2 + 1/tau2_0)
w = w*mean(x) + (1-w)*mu_0
mu_n
# plot PDFs. Likelihood is normalized.
= dnorm(thetaGrid, mean = mu_0, sd = sqrt(tau2_0))
priorPDF = dnorm(thetaGrid, mean = mu_n, sd = sqrt(tau2_n))
postPDF = 1-pnorm(areaPoint, mean = mu_n, sd = sqrt(tau2_n))
postProbAbovePoint = dnorm(thetaGrid, mean = mean(x[1:n]), sd = sqrt(sigma2/n))
normLikePDF plot(1, type="n", axes=FALSE, xlab = expression(theta), ylab = "",
xlim=c(min(thetaGrid),max(thetaGrid)),
ylim = c(0,max(priorPDF,postPDF,normLikePDF)))
axis(side = 1)
polygon(c(thetaGrid[thetaGrid>=areaPoint], max(thetaGrid), areaPoint),
c(postPDF[thetaGrid>=areaPoint], 0, 0),
col=adjustcolor(colors[4],alpha.f=0.3), border=NA)
lines(thetaGrid, priorPDF, type = "l", lwd = 4, col = colors[6])
lines(thetaGrid, normLikePDF, lwd = 4, col = colors[2])
lines(thetaGrid, postPDF, lwd = 4, col = colors[4])
legend(x = "topright", inset=.05, cex = c(1,1,1,1),
legend = c("Prior", "Likelihood (normalized)", "Posterior",
TeX(sprintf("$Pr(\\theta \\geq %2.0f | \\mathbf{x}) = %0.3f$", areaPoint, postProbAbovePoint))),
lty = c(1, 1, 1, 1), pt.lwd = c(3, 3, 3, 3),
col = c(colors[6], colors[2], colors[4], adjustcolor(colors[4],alpha.f=0.3)))
cat("Posterior mean is ", round(mu_n,3), "\n")
cat("Posterior standard deviation is ", round(sqrt(tau2_n),3), "\n")
cat("The weight on the sample mean is ", round(w,3))
return(list("mu_n" = mu_n, "tau2_n" = tau2_n, "w" = w))
}
Let us start by analyzing just the first observation
= seq(0, 40, length = 1000) # Some suitable grid of values to plot over
thetaGrid = 20 # shade the region where theta>= areaPoint (20 in my example)
areaPoint = 1
n = postGaussianIID(x[1:n], mu_0, tau2_0, sigma2, thetaGrid, areaPoint) post
Posterior mean is 17.885
Posterior standard deviation is 3.536
The weight on the sample mean is 0.5
We see that the prior and data information happen to get the same weight (w) in the posterior. That is a coincidence from the fact that the prior variance
Moving on, let’s add the next measurement to the analysis:
= 2
n = postGaussianIID(x[1:n], mu_0, tau2_0, sigma2, thetaGrid, areaPoint) post
Posterior mean is 18.757
Posterior standard deviation is 2.887
The weight on the sample mean is 0.667
We now see that the posterior is more affected by the data information than the prior information (w = 0.666).
Finally, adding all
= 5
n = postGaussianIID(x[1:n], mu_0, tau2_0, sigma2, thetaGrid, areaPoint) post
Posterior mean is 16.665
Posterior standard deviation is 2.041
The weight on the sample mean is 0.833
I am now rather sure that my average download speed is less than 20 MBit/sec since the posterior probability of