import numpy as np
%matplotlib inline
import matplotlib.pyplot as plt
# use a larger default font size for all figures
plt.rcParams.update({'font.size': 18})
import sys
import logging
# send log records of level INFO and above to stdout
logging.basicConfig(level=logging.INFO, stream=sys.stdout)

# seed NumPy's global random number generator so the generated data is reproducible
np.random.seed(43)

def ar_1_process(n_samples, c, phi, eps):
    '''
    Generate a correlated random sequence with the AR(1) process.

    Every value is computed from its predecessor as
    ``y[i] = c + phi * y[i - 1] + noise``, where the noise is drawn from a
    normal distribution with zero mean and standard deviation ``eps``.
    The first value is drawn from a normal distribution with mean
    ``c / (1 - phi)`` and variance ``eps**2 / (1 - phi**2)``, i.e. the
    stationary distribution of the process.

    Random numbers are taken from NumPy's global random number generator.

    Parameters
    ----------
    n_samples: :obj:`int`
        Sample size.
    c: :obj:`float`
        Constant term.
    phi: :obj:`float`
        Correlation magnitude.
    eps: :obj:`float`
        Shock magnitude.

    Returns
    -------
    :obj:`numpy.ndarray`
        Array of length ``n_samples`` containing the sequence. It is empty
        if ``n_samples`` is zero.

    Raises
    ------
    ValueError
        If ``abs(phi)`` is not smaller than 1.
    '''
    # reject invalid parameters before allocating memory or drawing numbers
    if abs(phi) >= 1:
        raise ValueError("abs(phi) must be smaller than 1.")
    ys = np.zeros(n_samples)
    # an empty sequence has no initial value to draw
    if ys.size == 0:
        return ys
    # draw initial value from normal distribution with known mean and variance
    ys[0] = np.random.normal(loc=c / (1 - phi), scale=np.sqrt(eps**2 / (1 - phi**2)))
    # the shocks are drawn one by one so that the sequence obtained for a
    # given seed stays the same
    for i in range(1, n_samples):
        ys[i] = c + phi * ys[i - 1] + np.random.normal(loc=0., scale=eps)
    return ys

# create the two synthetic data sets analysed in this tutorial

N_SAMPLES = 100_000

# constant term, correlation magnitude and shock magnitude of both AR(1) processes
C_1, PHI_1, EPS_1 = 2.0, 0.85, 2.0
C_2, PHI_2, EPS_2 = 0.05, 0.999, 1.0

logging.info("Generating data sets for the tutorial ...")

# both calls draw from the same global random stream, so their order matters
time_series_1 = ar_1_process(n_samples=N_SAMPLES, c=C_1, phi=PHI_1, eps=EPS_1)
time_series_2 = ar_1_process(n_samples=N_SAMPLES, c=C_2, phi=PHI_2, eps=EPS_2)

logging.info("Done")

INFO:root:Generating data sets for the tutorial ...
INFO:root:Done


# overview: the leading 1000 samples of each series in a single figure
fig = plt.figure(figsize=(10, 6))
plt.title("The first 1000 samples of both time series")
for series, series_label in ((time_series_1, "time series 1"),
                             (time_series_2, "time series 2")):
    plt.plot(series[:1000], label=series_label)
plt.xlabel("$i$")
plt.ylabel("$X_i$")
plt.legend()
plt.show()


# close-up of 50 consecutive samples of the first series
fig = plt.figure(figsize=(10, 6))
window = slice(1000, 1050)
plt.plot(time_series_1[window], "x")
axes = fig.axes[0]
# leave some vertical padding around the data points
axes.margins(y=0.1)
axes.set_xlabel("$i$")
axes.set_ylabel("$X_i$")
plt.show()


# the first series again, but keeping only every 800th sample
sample_indices = np.arange(2000, 42000, 800)
fig = plt.figure(figsize=(10, 6))
plt.plot(sample_indices, time_series_1[sample_indices], "x")
axes = fig.axes[0]
# leave some vertical padding around the data points
axes.margins(y=0.1)
axes.set_xlabel("$i$")
axes.set_ylabel("$X_i$")
# place one major tick every 8000 samples
axes.xaxis.set_major_locator(plt.MultipleLocator(base=8000))
plt.show()


BIN_SIZE = 2000


# SOLUTION CELL
# number of complete bins that fit into the time series
N_BINS = N_SAMPLES // BIN_SIZE
# average of the samples inside each consecutive bin
bin_avgs = np.array([
    time_series_1[start:start + BIN_SIZE].mean()
    for start in range(0, N_BINS * BIN_SIZE, BIN_SIZE)
])


# SOLUTION CELL
# the mean of the bin averages is the estimate of the measured quantity
avg = bin_avgs.mean()
# squared deviations of the bin averages from their mean
squared_deviations = np.square(bin_avgs - avg)
# standard error of the mean, using N_BINS - 1.5 in the variance estimate
sem = np.sqrt(squared_deviations.sum() / (N_BINS - 1.5) / N_BINS)


print(f"Best guess for measured quantity: {avg:.3f}")
print(f"Standard error of the mean: {sem:.3f}")

Best guess for measured quantity: 13.362
Standard error of the mean: 0.042


# SOLUTION CELL
def do_binning_analysis(data, bin_size):
    '''
    Estimate the standard error of the mean of a sequence by binning.

    The sequence is split into consecutive, non-overlapping bins of
    ``bin_size`` samples each. Samples at the end that do not fill a
    complete bin are discarded. The standard error of the mean is then
    computed from the averages of the bins.

    Parameters
    ----------
    data: array-like
        One-dimensional sequence of samples.
    bin_size: :obj:`int`
        Number of samples per bin.

    Returns
    -------
    :obj:`float`
        Standard error of the mean estimated from the bin averages.

    Raises
    ------
    ValueError
        If ``bin_size`` is smaller than 1, or if fewer than two complete
        bins fit into ``data``.
    '''
    # accept lists and tuples as well as arrays
    data = np.asarray(data)
    if bin_size < 1:
        raise ValueError("bin_size must be at least 1.")
    n_samples = len(data)
    n_bins = n_samples // bin_size
    # with fewer than two bins the spread of the bin averages is undefined
    if n_bins < 2:
        raise ValueError(
            "At least two complete bins are required: got "
            f"{n_samples} samples with a bin size of {bin_size}.")
    # drop the incomplete trailing bin, then average every row (= bin)
    bin_avgs = np.mean(data[:n_bins * bin_size].reshape((n_bins, -1)), axis=1)
    # standard deviation of the bin averages (with n_bins - 1.5 in the
    # denominator) divided by sqrt(n_bins) gives the error of their mean
    return np.std(bin_avgs, ddof=1.5) / np.sqrt(n_bins)


# SOLUTION CELL
# estimate the SEM of the first series for every bin size from 3 to 5000
sizes = np.arange(3, 5001, dtype=int)
sems = np.array(
    [do_binning_analysis(time_series_1, bin_size) for bin_size in sizes],
    dtype=float)

# SEM as a function of the bin size, on a logarithmic x axis
plt.figure(figsize=(10, 6))
plt.plot(sizes, sems, "x")
plt.xscale("log")
plt.xlabel("$N_B$")
plt.ylabel("SEM")
plt.show()


from scipy.optimize import curve_fit

# number of SEM values (counted from the smallest bin size) used in the fit
CUTOFF = 600

# bin sizes belonging to those SEM values: 3, 4, ..., CUTOFF + 2
sizes_subset = 3 + np.arange(CUTOFF, dtype=int)

def fit_fn(x, a, b, c):
    '''
    Fit model ``c - b * exp(-a * x)``.

    Parameters
    ----------
    x: :obj:`float` or array-like
        Point(s) at which the model is evaluated.
    a: :obj:`float`
        Rate in the exponent.
    b: :obj:`float`
        Prefactor of the exponential term.
    c: :obj:`float`
        Constant offset.
    '''
    decay = np.exp(-a * x)
    return c - b * decay

# fit the model to the first CUTOFF SEM values, starting from this guess for (a, b, c)
initial_guess = (0.05, 1, 0.5)
fit_params, _ = curve_fit(fit_fn, sizes_subset, sems[:CUTOFF], p0=initial_guess)

# evaluate the fitted model at every bin size
fit_sems = fit_fn(sizes, *fit_params)

# analytical reference values for the first AR(1) process
# standard deviation of the stationary distribution
stationary_variance_1 = EPS_1 ** 2 / (1 - PHI_1 ** 2)
AN_SIGMA_1 = np.sqrt(stationary_variance_1)
# correlation time derived from the correlation magnitude
AN_TAU_EXP_1 = -1 / np.log(PHI_1)
# resulting standard error of the mean for N_SAMPLES samples
AN_SEM_1 = np.sqrt(2 * AN_SIGMA_1 ** 2 * AN_TAU_EXP_1 / N_SAMPLES)


# compare the binning analysis with the fit and the analytical value
plt.figure(figsize=(10, 6))
plt.plot(sizes, sems, "x", label="binning analysis")
# horizontal line from the smallest to the largest bin size
plt.plot(sizes[[0, -1]], [AN_SEM_1, AN_SEM_1], "-.", label="analytical solution")
plt.plot(sizes, fit_sems, "-", label="fit")
plt.xscale("log")
plt.xlabel("$N_B$")
plt.ylabel("SEM")
plt.legend()
plt.show()

# the offset parameter c of the fit is reported as the final SEM
print(f"Final Standard Error of the Mean: {fit_params[2]:.4f}")
print(f"Analytical Standard Error of the Mean: {AN_SEM_1:.4f}")

Final Standard Error of the Mean: 0.0419
Analytical Standard Error of the Mean: 0.0421


# estimate the SEM of the second series for every bin size from 3 to 5000
sizes = np.arange(3, 5001, dtype=int)
sems = np.array(
    [do_binning_analysis(time_series_2, bin_size) for bin_size in sizes],
    dtype=float)

# analytical reference values for the second AR(1) process
# standard deviation of the stationary distribution
stationary_variance_2 = EPS_2 ** 2 / (1 - PHI_2 ** 2)
AN_SIGMA_2 = np.sqrt(stationary_variance_2)
# correlation time derived from the correlation magnitude
AN_TAU_EXP_2 = -1 / np.log(PHI_2)
# resulting standard error of the mean for N_SAMPLES samples
AN_SEM_2 = np.sqrt(2 * AN_SIGMA_2 ** 2 * AN_TAU_EXP_2 / N_SAMPLES)

# compare the binning analysis of the second series with the analytical value
plt.figure(figsize=(10, 6))
plt.plot(sizes, sems, "x", label="binning analysis")
# horizontal line from the smallest to the largest bin size
plt.plot(sizes[(0, -1),], np.repeat(AN_SEM_2, 2), "-.", label="analytical solution")
plt.xscale("log")
plt.xlabel("$N_B$")
plt.ylabel("SEM")
# without a legend the labels passed to plt.plot are never displayed
plt.legend()
plt.show()

Tutorial: Error Estimation - Part 1 (Introduction and Binning Analysis)

Table of contents

Data generation

Introduction

Standard deviation

Standard error of the mean

Confidence interval

Interquartile range

Now – what do we use?

Uncorrelated samples

Binning analysis

Exercise

Exercise

Exercise

Exercise

References