### CLT FOR DISCRETE RANDOM VARIABLES ###

# Suppose X1, X2, X3, ... ~ Bern(p), independently. (So
# Sn ~ Binom(n, p).)
bernoulliSnHistogram <- function(n, p) {
  # These first three lines are particular to Bernoulli.
  mu <- p
  sigmaSquared <- p * (1 - p)
  samples <- rbinom(n=10000, size=n, prob=p)
  # These last few lines are generic: histogram the
  # simulated Sn values, then overlay the normal PDF that
  # the CLT predicts.
  xs <- seq(from=min(samples), to=max(samples), length.out=100)
  ys <- sapply(xs, dnorm, mean=(n * mu), sd=sqrt(n * sigmaSquared))
  hist(samples, freq=FALSE)
  lines(x=xs, y=ys)
}
bernoulliSnHistogram(n=10, p=0.1)
bernoulliSnHistogram(n=100, p=0.1)
bernoulliSnHistogram(n=1000, p=0.1)
bernoulliSnHistogram(n=10000, p=0.1)

# Suppose X1, X2, X3, ... ~ Pois(lambda), independently.
poissonSnHistogram <- function(n, lambda) {
  mu <- lambda
  sigmaSquared <- lambda
  samples <- replicate(10000, sum(rpois(n=n, lambda=lambda)))
  xs <- seq(from=min(samples), to=max(samples), length.out=100)
  ys <- sapply(xs, dnorm, mean=(n * mu), sd=sqrt(n * sigmaSquared))
  hist(samples, freq=FALSE)
  lines(x=xs, y=ys)
}
poissonSnHistogram(n=10, lambda=0.1)
poissonSnHistogram(n=100, lambda=0.1)
poissonSnHistogram(n=1000, lambda=0.1)
poissonSnHistogram(n=10000, lambda=0.1)

# Others you could try: discrete uniform, geometric,
# negative binomial, hypergeometric.

### CLT FOR CONTINUOUS RANDOM VARIABLES ###

# Suppose X1, X2, X3, ... ~ Expo(lambda), independently.
exponentialSnHistogram <- function(n, lambda) {
  # These first three lines are particular to exponential.
  mu <- 1 / lambda
  sigmaSquared <- 1 / lambda^2
  samples <- replicate(10000, sum(rexp(n=n, rate=lambda)))
  # These last few lines are generic.
  xs <- seq(from=min(samples), to=max(samples), length.out=100)
  ys <- sapply(xs, dnorm, mean=(n * mu), sd=sqrt(n * sigmaSquared))
  hist(samples, freq=FALSE)
  lines(x=xs, y=ys)
}
exponentialSnHistogram(n=10, lambda=0.1)
exponentialSnHistogram(n=100, lambda=0.1)
exponentialSnHistogram(n=1000, lambda=0.1)
exponentialSnHistogram(n=10000, lambda=0.1)

# Suppose X1, X2, X3, ... ~ Unif(a, b), independently.
uniformSnHistogram <- function(n, a, b) {
  mu <- (a + b) / 2
  sigmaSquared <- (b - a)^2 / 12
  samples <- replicate(10000, sum(runif(n=n, min=a, max=b)))
  hist(samples)
}
uniformSnHistogram(n=10, a=0, b=10)
uniformSnHistogram(n=100, a=0, b=10)
uniformSnHistogram(n=1000, a=0, b=10)
uniformSnHistogram(n=10000, a=0, b=10)

# Suppose X1, X2, X3, ... ~ Norm(mu, sigma^2),
# independently.
normalSnHistogram <- function(n, mu, sigmaSquared) {
  samples <- replicate(10000,
                       sum(rnorm(n=n, mean=mu, sd=sqrt(sigmaSquared))))
  hist(samples)
}
normalSnHistogram(n=10, mu=0, sigmaSquared=10)
normalSnHistogram(n=100, mu=0, sigmaSquared=10)
normalSnHistogram(n=1000, mu=0, sigmaSquared=10)
normalSnHistogram(n=10000, mu=0, sigmaSquared=10)

# Others you could try: gamma, beta.

### CHALLENGES TO THE CLT ###

# Here is a synthetic data set. It's really skewed
# (asymmetric about its mean). It's so skewed that even
# its logarithm is skewed.
data <- c(1, 2, 3, 4, 5, 6, 7, 8, 9, 100, 1000)
hist(data)
hist(log(data))

# To explore how skewness affects the CLT, we're going to
# use this data set as a distribution (in the same spirit
# as bootstrapping).

# Suppose X1, X2, X3, ... ~ Empirical, independently,
# where Empirical is the distribution arising from the
# data set above. Statistical theory says that our best
# estimates for E(Xi) and V(Xi) are the sample mean and
# variance, respectively.
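# As a quick check of that claim (a sketch; the names
# below are invented just for this check): the empirical
# distribution puts mass 1 / N on each data point, so its
# mean is exactly the sample mean, and its variance is
# mean((data - mean(data))^2), which the sample variance
# sd(data)^2 matches up to a factor of (N - 1) / N.
empiricalMu <- sum(data) / length(data)
empiricalSigmaSquared <- sum((data - empiricalMu)^2) / length(data)
c(empiricalMu, mean(data))            # Identical.
c(empiricalSigmaSquared, sd(data)^2)  # Differ by the factor (N - 1) / N.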
dataSnHistogram <- function(n) {
  mu <- mean(data)
  sigmaSquared <- sd(data)^2
  samples <- replicate(10000, sum(sample(data, n, replace=TRUE)))
  hist(samples)
}
dataSnHistogram(n=10)
dataSnHistogram(n=100)
dataSnHistogram(n=1000)
dataSnHistogram(n=10000)

# Even for a data set that skewed, the CLT works pretty
# well. So let's make an even-more-skewed data set. The n
# at which the CLT works well seems a bit larger than
# before. (dataSnHistogram reads the global data, so we
# don't need to redefine it.)
data <- c(1, 2, 3, 4, 5, 6, 7, 8, 9, 10000, 100000)
hist(data)
hist(log(data))
dataSnHistogram(n=10)
dataSnHistogram(n=100)
dataSnHistogram(n=1000)
dataSnHistogram(n=10000)

# And how does the CLT handle bimodal (two-peaked) data?
data <- c(1, 2, 3, 4, 5, 6, 7, 8, 9,
          1001, 1002, 1003, 1004, 1005, 1006, 1007, 1008, 1009)
hist(data)
dataSnHistogram(n=10)
dataSnHistogram(n=100)
dataSnHistogram(n=1000)
dataSnHistogram(n=10000)

# Here is a harder challenge to the CLT: the Pareto PDF
# f(x) = 2 * x^(-3) on [1, infinity). Then X has E(X) = 2
# but infinite variance, so the CLT does not apply. Let's
# see how it does anyway, even though it's unfair to the
# poor theorem.

# rUnfair samples by inverse transform sampling: the CDF
# is F(x) = 1 - x^(-2), so F^(-1)(u) = sqrt(1 / (1 - u)).
# It draws one sample at a time, which is simple but slow.
rUnfair <- function(n) {
  if (n == 1) {
    u <- runif(1, min=0, max=1)
    sqrt(1 / (1 - u))
  } else {
    replicate(n, rUnfair(1))
  }
}
unfairSnHistogram <- function(n) {
  mu <- 2
  samples <- replicate(10000, sum(rUnfair(n=n)))
  hist(samples)
}
unfairSnHistogram(n=10)
unfairSnHistogram(n=100)
unfairSnHistogram(n=1000)   # Takes a minute or two.
unfairSnHistogram(n=10000)  # Takes tens of minutes?

### ASIDE: KERNEL DENSITY ESTIMATION ###

# One of the main problems of statistics is: Given a data
# set {a1, a2, ..., aN} of N numbers, what is the
# probability distribution from which those data were
# sampled? Kernel density estimation (KDE) is one approach
# to this problem. In its most basic form, KDE says:
# Center a normal distribution at each data point, and add
# those PDFs to make a composite PDF f.
data <- rnorm(n=10, mean=0, sd=10)
f <- function(x) {
  sum(sapply(data, function(mu) dnorm(x, mean=mu, sd=0.3))) / length(data)
}

# Here's a plot, with the data in red and f in black. f
# probably looks really spiky, because we're adding
# together a bunch of normal densities with (the same)
# small sigma^2.
xs <- seq(from=min(data), to=max(data), length.out=1000)
ys <- sapply(xs, f)
plot(x=data, y=rep(0, length(data)), col="red", pch=19,
     ylim=c(0, max(ys)))
par(new=TRUE)
plot(x=xs, y=ys, ylim=c(0, max(ys)))

# There are a couple of details. First, f must integrate
# to 1, right? The code above handles this detail (that's
# what the division by length(data) is for), as the
# following calculation shows.
integrate(Vectorize(f), -Inf, Inf)

# Exercise: Write out the PDF f of the composite
# distribution described above, in terms of
# a1, a2, ..., aN, and sigma^2. Be sure to include the
# correct scaling factor, so that f integrates to 1.

# Second, for historical reasons the standard deviation is
# called the 'bandwidth'. In the example above, the
# bandwidth is 0.3. That value is too small for this data
# set: it causes the KDE to 'overfit' the data, producing
# a PDF that is much more spiky than the one that
# generated the data. Choosing the right bandwidth is a
# little tricky.
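# To see the trouble concretely, here is a sketch that
# redraws the estimate with a few bandwidths (the helper
# fWithBandwidth is invented just for this experiment): a
# small bandwidth is spiky, a large one washes out the
# shape of the distribution.
fWithBandwidth <- function(x, h) {
  sum(sapply(data, function(mu) dnorm(x, mean=mu, sd=h))) / length(data)
}
xs <- seq(from=min(data), to=max(data), length.out=1000)
plot(x=xs, y=sapply(xs, fWithBandwidth, h=0.3), type="l")
lines(x=xs, y=sapply(xs, fWithBandwidth, h=3), col="blue")
lines(x=xs, y=sapply(xs, fWithBandwidth, h=10), col="green")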
# A better choice (but maybe still not ideal) is
# (4 / (3 * N))^(1 / 5) times the standard deviation of
# the data.
data <- rnorm(n=10, mean=0, sd=10)
bandwidth <- (4 / (3 * length(data)))^(1 / 5) * sd(data)
f <- function(x) {
  sum(sapply(data, function(mu) dnorm(x, mean=mu, sd=bandwidth))) / length(data)
}
xs <- seq(from=min(data), to=max(data), length.out=1000)
ys <- sapply(xs, f)
plot(x=data, y=rep(0, length(data)), col="red", pch=19,
     ylim=c(0, max(ys)))
par(new=TRUE)
plot(x=xs, y=ys, ylim=c(0, max(ys)))
integrate(Vectorize(f), -Inf, Inf)

# Exercise: Let X1, ..., XN be random variables,
# distributed according to the N normal distributions
# described above. Let X be a random variable distributed
# according to the composite distribution above. Compute
# E(X) (the answer is simple enough that you can even
# state it in English) and V(X) (the answer is not so
# simple).
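# Aside: R has a built-in KDE, density(), whose bw
# argument is this same kernel standard deviation. The
# (4 / (3 * N))^(1 / 5) rule above is the classic 'normal
# reference' rule of thumb (Silverman): since
# (4 / 3)^(1 / 5) is about 1.06, it is roughly
# 1.06 * sd(data) * N^(-1 / 5). A one-line comparison with
# our hand-rolled f (the plot styling will differ):
plot(density(data, bw=bandwidth))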