If you have difficulty invoking load_ext, try doing pip install rpy2 --upgrade
in your command line and restarting the Jupyter Python kernel.
%load_ext rpy2.ipython
%%R
# First time users: install packages by uncommenting the following line.
#install.packages(c('dplyr', 'ggplot2', 'foreach', 'lmtest', 'sandwich', 'doMC'))
library('plyr')
library('dplyr')
library('ggplot2')
library('foreach')
library('lmtest')
library('sandwich')
library('broom')
options(digits=3)
setwd('~/www-15-tutorial')
source('css_stats.R')
Attaching package: ‘dplyr’ The following objects are masked from ‘package:plyr’: arrange, count, desc, failwith, id, mutate, rename, summarise, summarize The following object is masked from ‘package:stats’: filter The following objects are masked from ‘package:base’: intersect, setdiff, setequal, union foreach: simple, scalable parallel programming from Revolution Analytics Use Revolution R for scalability, fault tolerance and more. http://www.revolutionanalytics.com Loading required package: zoo Attaching package: ‘zoo’ The following objects are masked from ‘package:base’: as.Date, as.Date.numeric Loading required package: grid Loading required package: lattice Loading required package: survival Loading required package: splines Loading required package: Formula Attaching package: ‘Hmisc’ The following objects are masked from ‘package:dplyr’: combine, src, summarize The following objects are masked from ‘package:plyr’: is.discrete, summarize The following objects are masked from ‘package:base’: format.pval, round.POSIXt, trunc.POSIXt, units
%%R
N <- 100
p <- 0.2
num.replications <- 2000
replications <- replicate(num.replications, mean(rbinom(N, 1, p)))
print(quantile(replications, c(0.025, 0.975))) # 95% interval
hist(replications, xlim=c(0,0.4)) # histogram
2.5% 97.5% 0.13 0.28
%%R
# Alternatively, we can use the standard deviation of the sampling distribution.
# which is approximately normal for sufficiently large N.
est.se <- sd(replications)
mean.est.p <- mean(replications)
mean.est.p
est.se
c(mean.est.p - 1.96*est.se, mean.est.p + 1.96*est.se)
[1] 0.123 0.277
The standard deviation of $k$ draws from a Bernoulli distribution is approximated by:
$$\sigma = \sqrt{\frac{p(p-1)}{N}}$$
This is the standard error for our experiment!
%%R
single.trial <- rbinom(N, 1, p)
p.single.trial <- mean(single.trial)
# normal approximation of the SE of a binomial
se <- sqrt(p.single.trial*(1-p.single.trial)/N)
se
[1] 0.0384
%%R
# norm approx 95% CI
c(p.single.trial - 1.96*se, p.single.trial + 1.96*se)
[1] 0.105 0.255
%%R
# let's construct the confidence intervals for p for a few values of p, N
d <- expand.grid(p=c(0.05, 0.25, 0.5), n=seq(100, 2000, 100))
d <- d %>% mutate(se=sqrt(p*(1-p)/n))
qplot(n, p, data=d, color=factor(p),
ylab='p (with 95% confidence interval)') +
geom_errorbar(aes(ymin=p-1.96*se, ymax=p+1.96*se))
%%R
# you can see that the standard error rapidly diminishes with N.
# (sqrt(1/n), in particular).
qplot(n, se, data=d, color=factor(p), geom='line', ylab='standard error')
Consider simulated experiment with binary treatment, $D$:
$$y = \mu + D*\delta + \epsilon$$
$\delta$ is the average treatment effect (ATE)
%%R
run.experiment <- function(n, ate) {
data.frame(
y0=rnorm(n, mean=10, sd=3)
) %>%
mutate(
y1=y0 + ate,
D=rbinom(n, 1, 0.5),
y=D*y1 + (1-D)*y0
)
}
my.experiment <- run.experiment(2000, 2.0)
head(my.experiment)
y0 y1 D y 1 9.56 11.56 0 9.56 2 9.03 11.03 1 11.03 3 13.57 15.57 0 13.57 4 12.31 14.31 1 14.31 5 7.27 9.27 1 9.27 6 7.56 9.56 1 9.56
%%R
qplot(y, geom='histogram', data=my.experiment, facets=D~.) + theme_bw()
stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this. stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.
%%R
est.ate <- function(exp) {
mean(exp[exp$D==1,]$y) - mean(exp[exp$D==0,]$y)
}
num.replications <- 1000
ates <- replicate(
num.replications,
est.ate(run.experiment(n=1e3, 2.0)))
hist(ates, main='sampling distribution of ATE')
%%R
# 95% of the estimated treated effects lie within this range
quantile(ates, c(0.025, 0.975))
# Large sample approximations let us make statements about the
# variance of the ATE over different (similar) populations (or
# possible randomizations).
est.ate(my.experiment)
with(my.experiment, t.test(y[D==1], y[D==0]))
Welch Two Sample t-test data: y[D == 1] and y[D == 0] t = 14.4, df = 1998, p-value < 2.2e-16 alternative hypothesis: true difference in means is not equal to 0 95 percent confidence interval: 1.64 2.15 sample estimates: mean of x mean of y 12.0 10.1
%%R
m <- lm(y ~ D, data=my.experiment)
est.ate <- coeftest(m)['D', 'Estimate']
est.se <- coeftest(m)['D', 'Std. Error']
c(est.ate-1.96*est.se, est.ate+1.96*est.se)
[1] 1.64 2.15
%%R
weighted.est.ate <- function(exp) {
m <- lm(y ~ D, data=exp, weights=.weights)
coeftest(m)['D', 'Estimate']
}
replicates <- iid.bootstrap(my.experiment, weighted.est.ate)
print(quantile(replicates, c(0.025, 0.975)))
hist(replicates, main='bootstrap distribution of ATE for a single experiment')
2.5% 97.5% 1.62 2.16
%%R
# Here is a function that returns confidence intervals for a dataframe
# with an outcome y and a binary treatment D. We will use this later.
get.ci <- function(experiment) {
t <- coeftest(lm(y ~ D, data=experiment))
est.ate <- t['D', 'Estimate']
est.se <- t['D', 'Std. Error']
p.value <- t['D', 'Pr(>|t|)']
return(list(est.ate=est.ate, est.se=est.se, p.value=p.value))
}
get.ci(my.experiment)
$est.ate [1] 1.89 $est.se [1] 0.132 $p.value [1] 1.54e-44
%%R
# This function runs an experiment multiple times (num.replications),
# and generates a dataframe containing estimates of the treatment effects and
# standard errors for each experiment, sorted by the size of the effect.
replicate.experiment <- function(sim.func, params, num.replications=100) {
# Note: you can parallelize this process below by using %dopar% if
# you have the 'doMC' package installed.
exps <- foreach(replication=1:num.replications, .combine=rbind) %do% {
as.data.frame(get.ci(do.call(sim.func, params)))
}
exps <- exps[order(exps$est.ate),]
exps <- mutate(exps,
rank=1:nrow(exps),
significant=sign(est.ate-1.96*est.se)==sign(est.ate+1.96*est.se)
)
exps
}
%%R
# run 300 trials of an experiment with no ATE
null.experiments <- replicate.experiment(run.experiment,
list(n=1e3, ate=0.0),
num.replications=200)
qplot(est.ate, data=null.experiments, main="sampling distribution of ATE")
stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.
%%R
# plot the results of the experiments, rank-ordered by effect size, along
# with their confidence intervals. Experiments that are "significant" have
# confidence intervals that do not cross zero and are highlighted in blue.
print(mean(null.experiments$significant))
qplot(rank, est.ate, data=null.experiments, color=significant,
xlab='experiment number (ranked by effect size)', ylab='estimated ATE',
main='demonstration of Type I errors') +
geom_pointrange(aes(ymin=est.ate-1.96*est.se, ymax=est.ate+1.96*est.se)) +
geom_point()
[1] 0.07
%%R
sad.experiments <- replicate.experiment(run.experiment,
list(n=250, ate=0.5),
num.replications=200)
qplot(rank, est.ate, data=sad.experiments, color=significant,
xlab='experiment number (ranked by effect size)', ylab='estimated ATE',
main='demonstration of Type II errors') +
geom_pointrange(aes(ymin=est.ate-1.96*est.se, ymax=est.ate+1.96*est.se)) +
geom_point() + geom_hline(yintercept=0.5)
%%R
# Fraction of experiments that are statistically significant.
# This is called the statistical power of the experiment.
print(mean(sad.experiments$significant))
### Type M errors
# when experiments are underpowered, only reporting on significant experiments
# can massively overstate effects.
with(subset(sad.experiments, significant==TRUE), mean(est.ate))
[1] 0.275 [1] 0.937
%%R
run.experiment2 <- function(n, ate, prop.treated=0.5, y.sd=3) {
data.frame(
y0=rnorm(n, mean=10, sd=y.sd)
) %>%
mutate(
y1=y0 + ate,
D=rbinom(n, 1, prop.treated),
y=D*y1 + (1-D)*y0
)
}
Let's simulate a few replications with varying proportions of users assigned to treatment and control.
%%R
exps <- foreach(p=seq(0.04, 0.96, 0.04), .combine=rbind) %do% {
varying.prop = list(n=1e3, ate=1, prop.treated=p, y.sd=3)
replications <- replicate.experiment(
sim.func=run.experiment2,
params=varying.prop,
num.replications=250
)
data.frame(
prop.treated=p,
sim.se=sd(replications$est.ate),
sim.ate=mean(replications$est.ate),
norm.est.se=head(replications$est.se, 1)
)
}
%%R
qplot(prop.treated, 1.96*sim.se, data=exps,
ylab='95% CI width', xlab='proportion of subjects in treatment',
ylim=c(0, 2.0), geom=c('line','point'),main='Precision of estimated ATE (simulation)')
%%R
qplot(prop.treated, 1.96*norm.est.se, data=exps,
ylab='95% CI width', xlab='proportion of subjects in treatment',
ylim=c(0,2.0), main='Precision of estimated ATE (approximation)')
%%R
# We will use a stripped down version that doesn't use replications for
# demonstration purposes, using get.ci() with a single trial for each
# parameterization.
n <- 1e3
true.ate <- 0.1
p=0.5
exps <- foreach(n=seq(100, 1e3, 1e2), .combine=rbind) %do% {
foreach(my.sd=c(1, 2, 3), .combine=rbind) %do% {
my.experiment <- run.experiment2(
n=n, ate=1, prop.treated=0.5, y.sd=my.sd)
my.ses <- get.ci(my.experiment)
data.frame(n=n, sd=my.sd,
est.ate=my.ses$est.ate,
est.se=my.ses$est.se)
}
}
qplot(n, 1.96*est.se, data=exps, color=factor(sd), geom=c('line', 'point')) + geom_hline(aes(yintercept=true.ate), linetype='dashed')