YWBAT (You Will Be Able To)
- apply the CLT (central limit theorem) to find the mean and standard deviation of a population
- define the CLT
- use sampling to find sampling statistics
- Watch Mr. Nystrom
- Make some data
- Apply the CLT to find stats on our data
import pandas as pd
import numpy as np
import scipy.stats as scs
import matplotlib.pyplot as plt
# Seed NumPy's global RNG so the random draws below are repeatable.
np.random.seed(42)
def make_hist(arr):
    """Show a histogram of ``arr`` on a new 5x5 figure.

    Args:
        arr: Values to bin; passed straight to ``plt.hist``.

    Returns:
        None. The figure is displayed with ``plt.show()``.
    """
    plt.figure(figsize=(5, 5))
    plt.grid(linestyle='dashed')
    plt.hist(arr, color='r', alpha=0.5)
    plt.show()
# Simulated heights of a population whose parameters are hidden from us:
# the mean, the spread and the size are themselves drawn at random.
loc = np.random.randint(low=48, high=72)
scale = np.random.randint(low=5, high=9)
popsize = np.random.randint(low=1000, high=1500)
pop = np.random.normal(loc, scale, popsize)
# Draw 30 samples of 100 heights each (no repeats within a sample) and keep
# each sample's mean.
sample_means = [
    np.random.choice(pop, size=100, replace=False).mean()
    for _ in range(30)
]

# Sampling distribution of the mean.
make_hist(sample_means)
# how do we test for normality?
# A normal distribution has skewness 0 and Pearson kurtosis 3
# (fisher=False asks for Pearson's definition instead of excess kurtosis).
scs.skew(sample_means), scs.kurtosis(sample_means, fisher=False)
# Out: (-0.7136669566700377, 3.8101325840693745)

# Three-standard-deviation band around the mean of the sample means.
means_center = np.mean(sample_means)
means_spread = np.std(sample_means)
print(means_center - 3 * means_spread, means_center + 3 * means_spread)
# Out: 51.79585182468757 56.739289606174104

means_center
# Out: 54.267570715430836
# Draw a fresh set of 30 samples of 100 heights each (no repeats within a
# sample) and keep each sample's standard deviation.
sample_stds = [
    np.random.choice(pop, size=100, replace=False).std()
    for _ in range(30)
]

# Sampling distribution of the standard deviation.
make_hist(sample_stds)
# Shape check for the sample standard deviations: skewness near 0 and
# Pearson kurtosis near 3 are what a normal distribution would give.
scs.skew(sample_stds), scs.kurtosis(sample_stds, fisher=False)
# Out: (0.07491090271341218, 2.2486691017922027)

# Three-standard-deviation band around the mean of the sample stds.
stds_center = np.mean(sample_stds)
stds_spread = np.std(sample_stds)
print(stds_center - 3 * stds_spread, stds_center + 3 * stds_spread)
# Out: 6.627438005035517 10.250677641064202

stds_center
# Out: 8.43905782304986
# spread of our sampling means?
# Standard error of the mean: standard deviation / sqrt(sample size).
# Every sample drawn above holds 100 observations, so the divisor is 10.
sample_size = 100
np.mean(sample_stds) / np.sqrt(sample_size)
# Out: 0.8439057823049859
It did: the sample-based estimates land very close to the population values (compare below).
# Mean of the sample means next to the population mean.
np.asarray(sample_means).mean(), np.mean(pop)
# Out: (54.267570715430836, 54.254315264939486)

# Mean of the sample standard deviations next to the population one.
np.asarray(sample_stds).mean(), np.std(pop)
# Out: (8.43905782304986, 8.439896288978073)
# Hours of sleep for Flatiron students: centered on 5 with a tight spread.
students = np.random.normal(loc=5, scale=0.2, size=1000)
# Hours of sleep for Flatiron employees: centered on 8 with a wider spread.
employees = np.random.normal(loc=8, scale=1.0, size=1000)
# Pool both groups into a single array.
all_flatiron = np.concatenate((students, employees))
np.shape(all_flatiron)
# Out: (2000,)
# Histogram of the pooled sleep hours (students and employees together).
make_hist(all_flatiron)
# Draw 30 samples of 100 from the pooled data and record each sample's mean
# and standard deviation. ``replace`` is left at its default here, so
# np.random.choice samples with replacement.
sample_means = []
sample_stds = []
for _ in range(30):
    samp = np.random.choice(all_flatiron, size=100)
    sample_means.append(np.mean(samp))
    sample_stds.append(np.std(samp))

# Sampling distributions of the mean and of the standard deviation.
make_hist(sample_means)
make_hist(sample_stds)
# Mean of the pooled data next to the mean of the sample means.
all_flatiron.mean(), np.asarray(sample_means).mean()
# Out: (6.506361834201236, 6.494152262884501)

# Standard deviation of the pooled data next to the mean of the sample stds.
all_flatiron.std(), np.asarray(sample_stds).mean()
# Out: (1.6728808853851138, 1.6536313679786818)
# What if we took a sample of people and got a mean of 7.0
# What is the probability that the sample came from flatiron?
# One-sample t-test of the 30 sample means against the hypothesised mean
# `mu`; a small p-value says a mean of `mu` is inconsistent with them.
mu = 7.0
scs.ttest_1samp(sample_means, mu)
# Out: Ttest_1sampResult(statistic=-16.993119437706515, pvalue=1.2900103036426018e-16)