# Distributions in Pandas

In [2]:
import pandas as pd
import numpy as np

In [ ]:
# Single Bernoulli trial: one draw with success probability 0.5, so the result is 0 or 1.
np.random.binomial(1, 0.5)

In [ ]:
# Count of successes in 1000 fair trials, divided by 1000 to give the observed proportion.
np.random.binomial(1000, 0.5)/1000

In [ ]:
# 0.01 percent written as a probability (0.0001).
# NOTE(review): the next cell reassigns chance_of_tornado before this value is used anywhere.
chance_of_tornado = 0.01/100

In [ ]:
chance_of_tornado = 0.01

# Simulate one Bernoulli trial per day: 1 means a tornado occurred that day, 0 means none.
days_simulated = 1000000
tornado_events = np.random.binomial(1, chance_of_tornado, days_simulated)

# Count every pair of consecutive days that both had a tornado by comparing
# the series with itself shifted by one day.
two_days_in_a_row = int(np.sum((tornado_events[1:] == 1) & (tornado_events[:-1] == 1)))

print('{} tornadoes back to back in {} years'.format(two_days_in_a_row, days_simulated/365))

In [ ]:
# One draw from the uniform distribution on the half-open interval [0, 1).
np.random.uniform(0, 1)

In [ ]:
# One draw from a normal distribution centered at 0.75; the scale is left at its default of 1.0.
np.random.normal(0.75)


Formula for standard deviation $$\sqrt{\frac{1}{N} \sum_{i=1}^N (x_i - \overline{x})^2}$$

In [ ]:
# Draw 1000 samples from a normal distribution with mean 0.75 (default scale).
distribution = np.random.normal(0.75, size=1000)

# Population standard deviation computed by hand: the square root of the
# mean squared deviation from the sample mean.
sample_mean = np.mean(distribution)
squared_deviations = (sample_mean - distribution) ** 2
np.sqrt(np.sum(squared_deviations) / len(distribution))

In [ ]:
# NumPy's built-in standard deviation (population form, ddof=0 by default), for comparison with the hand-written formula.
np.std(distribution)

In [ ]:
import scipy.stats as stats
# Kurtosis of the sample; SciPy uses Fisher's definition by default, so a normal distribution scores about 0.
stats.kurtosis(distribution)

In [ ]:
# Sample skewness of the normal draws; a value near 0 indicates a roughly symmetric sample.
stats.skew(distribution)

In [ ]:
# 10,000 draws from a chi-squared distribution with 2 degrees of freedom,
# followed by the sample skewness of those draws.
chi_squared_df2 = np.random.chisquare(df=2, size=10000)
stats.skew(chi_squared_df2)

In [ ]:
# 10,000 draws from a chi-squared distribution with 5 degrees of freedom,
# followed by the sample skewness of those draws.
chi_squared_df5 = np.random.chisquare(df=5, size=10000)
stats.skew(chi_squared_df5)

In [ ]:
%matplotlib inline
import matplotlib
import matplotlib.pyplot as plt

output = plt.hist([chi_squared_df2,chi_squared_df5], bins=50, histtype='step',
label=['2 degrees of freedom','5 degrees of freedom'])
plt.legend(loc='upper right')


# Hypothesis Testing

In [3]:
# Load the grades table from a CSV path given relative to the current working directory.
df = pd.read_csv('./Data_Science_with_Python/course1_downloads/grades.csv')

In [4]:
# Preview the first five rows of the grades table.
df.head()

Out[4]:
0 B73F2C11-70F0-E37D-8B10-1D20AFED50B1 92.733946 2015-11-02 06:55:34.282000000 83.030552 2015-11-09 02:22:58.938000000 67.164441 2015-11-12 08:58:33.998000000 53.011553 2015-11-16 01:21:24.663000000 47.710398 2015-11-20 13:24:59.692000000 38.168318 2015-11-22 18:31:15.934000000
1 98A0FAE0-A19A-13D2-4BB5-CFBFD94031D1 86.790821 2015-11-29 14:57:44.429000000 86.290821 2015-12-06 17:41:18.449000000 69.772657 2015-12-10 08:54:55.904000000 55.098125 2015-12-13 17:32:30.941000000 49.588313 2015-12-19 23:26:39.285000000 44.629482 2015-12-21 17:07:24.275000000
2 D0F62040-CEB0-904C-F563-2F8620916C4E 85.512541 2016-01-09 05:36:02.389000000 85.512541 2016-01-09 06:39:44.416000000 68.410033 2016-01-15 20:22:45.882000000 54.728026 2016-01-11 12:41:50.749000000 49.255224 2016-01-11 17:31:12.489000000 44.329701 2016-01-17 16:24:42.765000000
3 FFDF2B2C-F514-EF7F-6538-A6A53518E9DC 86.030665 2016-04-30 06:50:39.801000000 68.824532 2016-04-30 17:20:38.727000000 61.942079 2016-05-12 07:47:16.326000000 49.553663 2016-05-07 16:09:20.485000000 49.553663 2016-05-24 12:51:18.016000000 44.598297 2016-05-26 08:09:12.058000000
4 5ECBEEB6-F1CE-80AE-3164-E45E99473FB4 64.813800 2015-12-13 17:06:10.750000000 51.491040 2015-12-14 12:25:12.056000000 41.932832 2015-12-29 14:25:22.594000000 36.929549 2015-12-28 01:29:55.901000000 33.236594 2015-12-29 14:46:06.628000000 33.236594 2016-01-05 01:06:59.546000000
In [5]:
# Number of rows in the grades table.
len(df)

Out[5]:
2315
In [6]:
# Split the rows by when assignment 1 was submitted. The comparison is
# applied to the submission column exactly as stored, against the cutoff string.
cutoff = '2015-12-31'
submission = df['assignment1_submission']
early = df.loc[submission <= cutoff]
late = df.loc[submission > cutoff]

In [7]:
# Column means for the early group. numeric_only=True restricts the
# calculation to numeric columns: the frame also holds string columns
# (identifiers and submission timestamps), and averaging those raises
# TypeError on pandas 2.x.
early.mean(numeric_only=True)

Out[7]:
assignment1_grade    74.972741
dtype: float64
In [8]:
# Column means for the late group. numeric_only=True restricts the
# calculation to numeric columns: the frame also holds string columns
# (identifiers and submission timestamps), and averaging those raises
# TypeError on pandas 2.x.
late.mean(numeric_only=True)

Out[8]:
assignment1_grade    74.017429
dtype: float64
In [10]:
from scipy import stats
# stats.ttest_ind runs a two-sided t-test for the means of two independent
# samples; it assumes equal population variances unless equal_var=False is passed.

In [11]:
# Independent two-sample t-test on assignment 1 grades: early group vs. late group.
stats.ttest_ind(early['assignment1_grade'], late['assignment1_grade'])

Out[11]:
Ttest_indResult(statistic=1.400549944897566, pvalue=0.16148283016060577)
In [ ]:
# Independent two-sample t-test on assignment 2 grades: early group vs. late group.
stats.ttest_ind(early['assignment2_grade'], late['assignment2_grade'])

In [ ]:
# Independent two-sample t-test on assignment 3 grades: early group vs. late group.
stats.ttest_ind(early['assignment3_grade'], late['assignment3_grade'])