import numpy as np
import scipy.stats as sps
import pandas as pd
import matplotlib.pyplot as plt

# Set the seed for reproducibility
np.random.seed(seed=123)

# Load the data: 1000 collector-coin auctions; nBids is the bid count per auction
eBayData = pd.read_csv('https://github.com/mattiasvillani/BayesianLearningBook/raw/main/data/ebaybids/ebaybids.csv', sep = ',')
nBids = eBayData['nBids']
Modeling the number of bids in eBay coin auctions
A Jupyter notebook for the book Bayesian Learning by Mattias Villani.
The dataset contains data from 1000 auctions of collector coins. The dataset was collected and first analyzed in the article Bayesian Inference in Structural Second-Price Common Value Auctions.
Import modules and load the data
We will model these data using a Poisson distribution:
so that the posterior is also Gamma:
# Gamma density in the rate parametrization (scipy uses scale = 1/rate)
def gammaPDF(x, alpha, beta):
    """Return the Gamma(alpha, beta) pdf at x, where beta is a rate."""
    return sps.gamma.pdf(x, a=alpha, scale=1/beta)
def PostPoisson(y, alphaPrior, betaPrior, thetaPriorGrid=None, thetaPostGrid=None):
    """Summarize the posterior for a Poisson rate under a conjugate Gamma prior.

    Model: y_i ~ Poisson(theta) with theta ~ Gamma(alphaPrior, betaPrior) in the
    rate parametrization, so the posterior is Gamma(alphaPrior + sum(y), betaPrior + n).

    Parameters
    ----------
    y : array-like of non-negative counts.
    alphaPrior, betaPrior : Gamma prior hyperparameters (shape, rate).
    thetaPriorGrid, thetaPostGrid : optional grids of theta values; when BOTH are
        given, the prior and posterior densities are plotted side by side.

    Prints data/prior/posterior summaries (means, standard deviations and
    equal-tail 95% intervals).
    """
    # Conjugate update
    n = len(y)
    alphaPost = alphaPrior + np.sum(y)
    betaPost = betaPrior + n

    print('Number of data points = ' + str(len(y)))
    print('Sum of number of counts = ' + str(np.sum(y)))
    print('Mean number of counts = ' + str(np.mean(y)))
    print('Prior mean = ' + str(alphaPrior/betaPrior))
    print('Prior standard deviation = '+ str(np.sqrt(alphaPrior/(betaPrior**2))))
    print('Equal tail 95% prior interval: ' + str(sps.gamma.interval(0.95, a = alphaPrior, scale = 1/betaPrior)))
    print('Posterior mean = ' + str(round(alphaPost/betaPost,3)))
    print('Posterior standard deviation = '+ str(np.sqrt( (alphaPrior+np.sum(y))/ ((betaPrior+n)**2) ) ))
    print('Equal tail 95% posterior interval: ' + str(sps.gamma.interval(0.95, a = alphaPost, scale = 1/betaPost)))

    # Plot only when grids are supplied. The original guard
    # `thetaPriorGrid.any() != None` raised AttributeError for the default None,
    # and the densities were evaluated before the check.
    if thetaPriorGrid is not None and thetaPostGrid is not None:
        priorDens = gammaPDF(x=thetaPriorGrid, alpha=alphaPrior, beta=betaPrior)
        postDens = gammaPDF(x=thetaPostGrid, alpha=alphaPost, beta=betaPost)
        fig, ax = plt.subplots(1, 2, figsize=(15, 10))
        ax[0].plot(thetaPriorGrid, priorDens, lw=3)
        ax[0].set_xlabel(r'$\theta$')
        ax[0].set_ylabel('PDF')
        ax[0].set_title('Prior distribution')
        ax[1].plot(thetaPostGrid, postDens, lw=3, color="orange")
        ax[1].set_xlabel(r'$\theta$')
        ax[1].set_ylabel('PDF')
        ax[1].set_title('Posterior distribution')
# Prior theta ~ Gamma(2, 1/2): prior mean 4, prior sd ~2.83
alphaPrior = 2
betaPrior = 1/2
# Posterior analysis for all auctions (pass the variables, not duplicated literals)
PostPoisson(y=nBids, alphaPrior=alphaPrior, betaPrior=betaPrior,
            thetaPriorGrid=np.linspace(0.01, 12, 10000),
            thetaPostGrid=np.linspace(3.25, 4, 10000))
Number of data points = 1000
Sum of number of counts = 3635
Mean number of counts = 3.635
Prior mean = 4.0
Prior standard deviation = 2.8284271247461903
Equal tail 95% prior interval: (0.48441855708793014, 11.143286781877796)
Posterior mean = 3.635
Posterior standard deviation = 0.06027740643004339
Equal tail 95% posterior interval: (3.5179903738284697, 3.7542677655304297)
Fit of the Poisson model
Let’s plot the data along with the fitted Poisson model. We’ll keep things simple and plot the fit for the posterior mean of
def plotPoissonFit(y, alphaPrior, betaPrior):
    """Plot the empirical PMF of counts y with a fitted Poisson PMF overlaid.

    The Poisson rate is the posterior mean from a conjugate
    Gamma(alphaPrior, betaPrior) analysis (rate parametrization).

    Parameters
    ----------
    y : array-like of non-negative counts.
    alphaPrior, betaPrior : Gamma prior hyperparameters (shape, rate).
    """
    # Empirical distribution over 0, 1, ..., max(y).
    # (+1 so the maximum observed count is included; np.arange(maxY) dropped it.)
    maxY = np.max(y)
    yGrid = np.arange(maxY + 1)
    probs = [np.sum(y == k)/len(y) for k in yGrid]
    plt.bar(yGrid, probs, alpha=0.3)
    plt.xlabel('y')
    plt.ylabel('PMF')
    plt.xticks(yGrid)
    plt.title('Fitted Poisson model based on posterior mean estimate')

    # Posterior mean of theta under the conjugate Gamma prior
    n = len(y)
    alphaPost = alphaPrior + np.sum(y)
    betaPost = betaPrior + n
    postMean = alphaPost/betaPost

    # Overlay the Poisson PMF evaluated at the posterior mean rate
    poisFit = sps.poisson.pmf(yGrid, mu=postMean)
    plt.plot(yGrid, poisFit, color='orange', lw=3)
# Plot the fit for all bids
alphaPrior = 2
betaPrior = 1/2
plotPoissonFit(y=nBids, alphaPrior=alphaPrior, betaPrior=betaPrior)
Wow, that’s a terrible fit! This data does not look at all like a Poisson distribution. What can we do?
Analyzing the auction with low and high reservation prices separately.
We will later model the number of bids using a Poisson regression where we take into account several explanatory variables. But, for now, let’s split the auctions in two subsets:
i) auctions with low reservation price in relation to the item’s book value (MinBidShare<=0)
ii) auctions with high reservation price in relation to the item’s book value (MinBidShare>0)
Let’s start with the 550 auctions with low reservation prices. The prior for these auctions is set to
# Auctions with low reservation prices (reservation price at or below book value)
nBidsLow = nBids[eBayData['MinBidShare'] <= 0]

PostPoisson(y=nBidsLow, alphaPrior=4, betaPrior=1/2,
            thetaPriorGrid=np.linspace(0.01, 25, 10000),
            thetaPostGrid=np.linspace(4.8, 5.8, 10000))
Number of data points = 550
Sum of number of counts = 2927
Mean number of counts = 5.321818181818182
Prior mean = 8.0
Prior standard deviation = 4.0
Equal tail 95% prior interval: (2.17973074725265, 17.534546139484647)
Posterior mean = 5.324
Posterior standard deviation = 0.0983446153216288
Equal tail 95% posterior interval: (5.13322503650632, 5.518717305739481)
As expected, the posterior for the mean number of bids is concentrated on a larger number of bids. People like to bid on items where the seller’s reservation price is low.
Is the fit for these auctions improved? Yes it is, although there is still room for improvement:
# Plot the fit for low bids
plotPoissonFit(y=nBidsLow, alphaPrior=alphaPrior, betaPrior=betaPrior)
Below are the results for the auctions with high reservation prices. The prior is here set to
# Auctions with high reservation prices (reservation price above book value)
nBidsHigh = nBids[eBayData['MinBidShare'] > 0]

PostPoisson(y=nBidsHigh, alphaPrior=1, betaPrior=1/2,
            thetaPriorGrid=np.linspace(0.01, 12, 10000),
            thetaPostGrid=np.linspace(1.3, 1.8, 10000))
Number of data points = 450
Sum of number of counts = 708
Mean number of counts = 1.5733333333333333
Prior mean = 2.0
Prior standard deviation = 2.0
Equal tail 95% prior interval: (0.050635615968579795, 7.377758908227871)
Posterior mean = 1.574
Posterior standard deviation = 0.05910555807189499
Equal tail 95% posterior interval: (1.4600786825716714, 1.6917395497993104)
And the fit is not perfect for these bids, but better than before.
# Plot the fit for high bids
plotPoissonFit(y=nBidsHigh, alphaPrior=alphaPrior, betaPrior=betaPrior)
So, separating the bids into datasets with low and high reservation prices makes the Poisson model fit the data a lot better. Later in the book, we will use a Poisson regression with reservation price as one of the features, which gives an even more fine-grained analysis.