### CLT FOR DISCRETE RANDOM VARIABLES ###

# Suppose X1, X2, X3, ... ~ Bern(p), independently. (So
# Sn ~ Binom(n, p).)
bernoulliSnHistogram <- function(n, p) {
  # These first three lines are particular to Bernoulli.
  mu <- p
  sigmaSquared <- p * (1 - p)
  samples <- rbinom(n=10000, size=n, prob=p)
  # These last few lines are generic: histogram the
  # simulated Sn values, then overlay the normal PDF that
  # the CLT predicts.
  xs <- seq(from=min(samples), to=max(samples), length.out=100)
  ys <- sapply(xs, dnorm, mean=(n * mu), sd=sqrt(n * sigmaSquared))
  hist(samples, freq=FALSE)
  lines(x=xs, y=ys)
}
bernoulliSnHistogram(n=10, p=0.1)
bernoulliSnHistogram(n=100, p=0.1)
bernoulliSnHistogram(n=1000, p=0.1)
bernoulliSnHistogram(n=10000, p=0.1)

# Suppose X1, X2, X3, ... ~ Pois(lambda), independently.
poissonSnHistogram <- function(n, lambda) {
  mu <- lambda
  sigmaSquared <- lambda
  samples <- replicate(10000, sum(rpois(n=n, lambda=lambda)))
  xs <- seq(from=min(samples), to=max(samples), length.out=100)
  ys <- sapply(xs, dnorm, mean=(n * mu), sd=sqrt(n * sigmaSquared))
  hist(samples, freq=FALSE)
  lines(x=xs, y=ys)
}
poissonSnHistogram(n=10, lambda=0.1)
poissonSnHistogram(n=100, lambda=0.1)
poissonSnHistogram(n=1000, lambda=0.1)
poissonSnHistogram(n=10000, lambda=0.1)

# Others you could try: discrete uniform, geometric,
# negative binomial, hypergeometric.

### CLT FOR CONTINUOUS RANDOM VARIABLES ###

# Suppose X1, X2, X3, ... ~ Expo(lambda), independently.
exponentialSnHistogram <- function(n, lambda) {
  # These first three lines are particular to exponential.
  mu <- 1 / lambda
  sigmaSquared <- 1 / lambda^2
  samples <- replicate(10000, sum(rexp(n=n, rate=lambda)))
  # These last few lines are generic.
  xs <- seq(from=min(samples), to=max(samples), length.out=100)
  ys <- sapply(xs, dnorm, mean=(n * mu), sd=sqrt(n * sigmaSquared))
  hist(samples, freq=FALSE)
  lines(x=xs, y=ys)
}
exponentialSnHistogram(n=10, lambda=0.1)
exponentialSnHistogram(n=100, lambda=0.1)
exponentialSnHistogram(n=1000, lambda=0.1)
exponentialSnHistogram(n=10000, lambda=0.1)

# Suppose X1, X2, X3, ... ~ Unif(a, b), independently.
uniformSnHistogram <- function(n, a, b) {
  mu <- (a + b) / 2
  sigmaSquared <- (b - a)^2 / 12
  samples <- replicate(10000, sum(runif(n=n, min=a, max=b)))
  hist(samples)
}
uniformSnHistogram(n=10, a=0, b=10)
uniformSnHistogram(n=100, a=0, b=10)
uniformSnHistogram(n=1000, a=0, b=10)
uniformSnHistogram(n=10000, a=0, b=10)

# Suppose X1, X2, X3, ... ~ Norm(mu, sigma^2),
# independently.
normalSnHistogram <- function(n, mu, sigmaSquared) {
  samples <- replicate(10000,
                       sum(rnorm(n=n, mean=mu, sd=sqrt(sigmaSquared))))
  hist(samples)
}
normalSnHistogram(n=10, mu=0, sigmaSquared=10)
normalSnHistogram(n=100, mu=0, sigmaSquared=10)
normalSnHistogram(n=1000, mu=0, sigmaSquared=10)
normalSnHistogram(n=10000, mu=0, sigmaSquared=10)

# Others you could try: gamma, beta.

### CHALLENGES TO THE CLT ###

# Here is a synthetic data set. It's really skewed
# (asymmetric about its mean). It's so skewed that even
# its logarithm is skewed.
data <- c(1, 2, 3, 4, 5, 6, 7, 8, 9, 100, 1000)
hist(data)
hist(log(data))

# To explore how skewness affects the CLT, we're going to
# use this data set as a distribution (in the same spirit
# as bootstrapping).

# Suppose X1, X2, X3, ... ~ Empirical, independently,
# where Empirical is the distribution arising from the
# data set above. Statistical theory says that our best
# estimates for E(Xi) and V(Xi) are the sample mean and
# variance, respectively.
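# As a quick check of that claim (a sketch; the names
# below are invented just for this check): the empirical
# distribution puts mass 1 / N on each data point, so its
# mean is exactly the sample mean, and its variance is
# mean((data - mean(data))^2), which the sample variance
# sd(data)^2 matches up to a factor of (N - 1) / N.
empiricalMu <- sum(data) / length(data)
empiricalSigmaSquared <- sum((data - empiricalMu)^2) / length(data)
c(empiricalMu, mean(data))            # Identical.
c(empiricalSigmaSquared, sd(data)^2)  # Differ by the factor (N - 1) / N.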
dataSnHistogram <- function(n) {
  mu <- mean(data)
  sigmaSquared <- sd(data)^2
  samples <- replicate(10000, sum(sample(data, n, replace=TRUE)))
  hist(samples)
}
dataSnHistogram(n=10)
dataSnHistogram(n=100)
dataSnHistogram(n=1000)
dataSnHistogram(n=10000)

# Even for a data set that skewed, the CLT works pretty
# well. So let's make an even-more-skewed data set. The n
# at which the CLT works well seems a bit larger than
# before. (dataSnHistogram reads the global data, so we
# don't need to redefine it.)
data <- c(1, 2, 3, 4, 5, 6, 7, 8, 9, 10000, 100000)
hist(data)
hist(log(data))
dataSnHistogram(n=10)
dataSnHistogram(n=100)
dataSnHistogram(n=1000)
dataSnHistogram(n=10000)

# And how does the CLT handle bimodal (two-peaked) data?
data <- c(1, 2, 3, 4, 5, 6, 7, 8, 9,
          1001, 1002, 1003, 1004, 1005, 1006, 1007, 1008, 1009)
hist(data)
dataSnHistogram(n=10)
dataSnHistogram(n=100)
dataSnHistogram(n=1000)
dataSnHistogram(n=10000)

# Here is a harder challenge to the CLT: the Pareto PDF
# f(x) = 2 * x^(-3) on [1, infinity). Then X has E(X) = 2
# but infinite variance, so the CLT does not apply. Let's
# see how it does anyway, even though it's unfair to the
# poor theorem.

# rUnfair samples by inverse transform sampling: the CDF
# is F(x) = 1 - x^(-2), so F^(-1)(u) = sqrt(1 / (1 - u)).
# It draws one sample at a time, which is simple but slow.
rUnfair <- function(n) {
  if (n == 1) {
    u <- runif(1, min=0, max=1)
    sqrt(1 / (1 - u))
  } else {
    replicate(n, rUnfair(1))
  }
}
unfairSnHistogram <- function(n) {
  mu <- 2
  samples <- replicate(10000, sum(rUnfair(n=n)))
  hist(samples)
}
unfairSnHistogram(n=10)
unfairSnHistogram(n=100)
unfairSnHistogram(n=1000)   # Takes a minute or two.
unfairSnHistogram(n=10000)  # Takes tens of minutes?

### ASIDE: KERNEL DENSITY ESTIMATION ###

# One of the main problems of statistics is: Given a data
# set {a1, a2, ..., aN} of N numbers, what is the
# probability distribution from which those data were
# sampled? Kernel density estimation (KDE) is one approach
# to this problem. In its most basic form, KDE says:
# Center a normal distribution at each data point, and add
# those PDFs to make a composite PDF f.
data <- rnorm(n=10, mean=0, sd=10)
f <- function(x) {
  sum(sapply(data, function(mu) dnorm(x, mean=mu, sd=0.3))) / length(data)
}

# Here's a plot, with the data in red and f in black. f
# probably looks really spiky, because we're adding
# together a bunch of normal densities with (the same)
# small sigma^2.
xs <- seq(from=min(data), to=max(data), length.out=1000)
ys <- sapply(xs, f)
plot(x=data, y=rep(0, length(data)), col="red", pch=19,
     ylim=c(0, max(ys)))
par(new=TRUE)
plot(x=xs, y=ys, ylim=c(0, max(ys)))

# There are a couple of details. First, f must integrate
# to 1, right? The code above handles this detail (that's
# what the division by length(data) is for), as the
# following calculation shows.
integrate(Vectorize(f), -Inf, Inf)

# Exercise: Write out the PDF f of the composite
# distribution described above, in terms of
# a1, a2, ..., aN, and sigma^2. Be sure to include the
# correct scaling factor, so that f integrates to 1.

# Second, for historical reasons the standard deviation is
# called the 'bandwidth'. In the example above, the
# bandwidth is 0.3. That value is too small for this data
# set: it causes the KDE to 'overfit' the data, producing
# a PDF that is much more spiky than the one that
# generated the data. Choosing the right bandwidth is a
# little tricky.
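# To see the trouble concretely, here is a sketch that
# redraws the estimate with a few bandwidths (the helper
# fWithBandwidth is invented just for this experiment): a
# small bandwidth is spiky, a large one washes out the
# shape of the distribution.
fWithBandwidth <- function(x, h) {
  sum(sapply(data, function(mu) dnorm(x, mean=mu, sd=h))) / length(data)
}
xs <- seq(from=min(data), to=max(data), length.out=1000)
plot(x=xs, y=sapply(xs, fWithBandwidth, h=0.3), type="l")
lines(x=xs, y=sapply(xs, fWithBandwidth, h=3), col="blue")
lines(x=xs, y=sapply(xs, fWithBandwidth, h=10), col="green")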
# A better choice (but maybe still not ideal) is
# (4 / (3 * N))^(1 / 5) times the standard deviation of
# the data.
data <- rnorm(n=10, mean=0, sd=10)
bandwidth <- (4 / (3 * length(data)))^(1 / 5) * sd(data)
f <- function(x) {
  sum(sapply(data, function(mu) dnorm(x, mean=mu, sd=bandwidth))) / length(data)
}
xs <- seq(from=min(data), to=max(data), length.out=1000)
ys <- sapply(xs, f)
plot(x=data, y=rep(0, length(data)), col="red", pch=19,
     ylim=c(0, max(ys)))
par(new=TRUE)
plot(x=xs, y=ys, ylim=c(0, max(ys)))
integrate(Vectorize(f), -Inf, Inf)

# Exercise: Let X1, ..., XN be random variables,
# distributed according to the N normal distributions
# described above. Let X be a random variable distributed
# according to the composite distribution above. Compute
# E(X) (the answer is simple enough that you can even
# state it in English) and V(X) (the answer is not so
# simple).
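# Aside: R has a built-in KDE, density(), whose bw
# argument is this same kernel standard deviation. The
# (4 / (3 * N))^(1 / 5) rule above is the classic 'normal
# reference' rule of thumb (Silverman): since
# (4 / 3)^(1 / 5) is about 1.06, it is roughly
# 1.06 * sd(data) * N^(-1 / 5). A one-line comparison with
# our hand-rolled f (the plot styling will differ):
plot(density(data, bw=bandwidth))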