Skip to content

bootstrap samples in statistics #1

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 6 commits into
base: minux
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
120 changes: 120 additions & 0 deletions Bayesian Inference Python Statistics.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1,120 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# The Engineering World - A Place For Learning And Exploring\n",
"\n",
"## Bayesian Inference "
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Standard imports"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"import math"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"import numpy as np\n",
"import pandas as pd"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"import matplotlib\n",
"import matplotlib.pyplot as pp"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
"%matplotlib inline"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"ename": "ModuleNotFoundError",
"evalue": "No module named 'pymc3'",
"output_type": "error",
"traceback": [
"\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[1;31mModuleNotFoundError\u001b[0m Traceback (most recent call last)",
"\u001b[1;32m<ipython-input-5-2e5c536b7c65>\u001b[0m in \u001b[0;36m<module>\u001b[1;34m()\u001b[0m\n\u001b[1;32m----> 1\u001b[1;33m \u001b[1;32mimport\u001b[0m \u001b[0mpymc3\u001b[0m \u001b[1;32mas\u001b[0m \u001b[0mpm\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m",
"\u001b[1;31mModuleNotFoundError\u001b[0m: No module named 'pymc3'"
]
}
],
"source": [
"import pymc3 as pm"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.5"
},
"toc": {
"base_numbering": 1,
"nav_menu": {},
"number_sections": true,
"sideBar": true,
"skip_h1_title": false,
"title_cell": "Table of Contents",
"title_sidebar": "Contents",
"toc_cell": false,
"toc_position": {},
"toc_section_display": true,
"toc_window_display": false
}
},
"nbformat": 4,
"nbformat_minor": 2
}
382 changes: 382 additions & 0 deletions Cross validation Python Statistics Training.ipynb

Large diffs are not rendered by default.

689 changes: 689 additions & 0 deletions Goodness Of Fit.ipynb

Large diffs are not rendered by default.

95 changes: 95 additions & 0 deletions Hypothesis Testing in Python Statistics.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1,95 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Python statistics essential training - 04_04_testing"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Standard imports"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
"import math\n",
"import io"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [],
"source": [
"import numpy as np\n",
"import pandas as pd\n",
"\n",
"import matplotlib\n",
"import matplotlib.pyplot as pp\n",
"\n",
"%matplotlib inline"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [],
"source": [
"import scipy.stats\n",
"import scipy.optimize\n",
"import scipy.spatial"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.4"
},
"toc": {
"base_numbering": 1,
"nav_menu": {},
"number_sections": true,
"sideBar": true,
"skip_h1_title": false,
"title_cell": "Table of Contents",
"title_sidebar": "Contents",
"toc_cell": false,
"toc_position": {},
"toc_section_display": true,
"toc_window_display": false
}
},
"nbformat": 4,
"nbformat_minor": 2
}
577 changes: 577 additions & 0 deletions Logistic Regression.ipynb

Large diffs are not rendered by default.

55 changes: 55 additions & 0 deletions TEW_06_hypothesis_testing_p_value_CI.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,55 @@
#coding:utf-8
"""
------------------------------------------------
@File Name : TEW_06_hypothesis_testing_p_value_CI
@Function :
@Author : Minux
@Date : 2018/10/16
@Revised Date : 2018/10/16
------------------------------------------------
"""
import math
import io
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt

import scipy.stats
import scipy.optimize
import scipy.spatial

# Input tables, read from CSV files in the current working directory.
# The functions in this script read the columns x, y, closest and deaths from
# `cholera`, and x, y from `pumps`.
cholera = pd.read_csv('cholera.csv') # cholera data
pumps = pd.read_csv('pumps.csv') # water pump data

def Plot_Cholera_func():
    """Show the pump and cholera locations on top of the ``london.png`` image.

    Opens a 10x10 figure, draws the image with both axes spanning
    [-0.38, 0.38], then overlays the ``pumps`` points in blue and the
    ``cholera`` points as small red markers before displaying the figure.
    """
    plt.figure(figsize=(10, 10))
    london_map = plt.imread('london.png')
    map_extent = [-0.38, 0.38, -0.38, 0.38]
    plt.imshow(london_map, extent=map_extent)
    # Each layer: (table with x/y columns, marker colour, extra scatter options).
    layers = (
        (pumps, 'b', {}),
        (cholera, 'r', {'s': 3}),
    )
    for table, colour, extra in layers:
        plt.scatter(table.x, table.y, color=colour, **extra)
    plt.show()

def Data_stat_info():
    """Print summary tables of the ``cholera`` data keyed by ``closest``.

    First the number of rows for each distinct ``closest`` value, then a
    separator line, then the sum of ``deaths`` within each ``closest`` group.
    """
    rows_per_closest = cholera['closest'].value_counts()
    print(rows_per_closest)

    rule = '-' * 10
    print(rule, 'GroupBy_Closest', rule)

    deaths_per_closest = cholera.groupby('closest')['deaths'].sum()
    print(deaths_per_closest)

def simulate(n):
    """Return a DataFrame of ``n`` random draws in a single ``closest`` column.

    Each draw is one of 0, 1, 4 or 5, chosen with probabilities
    0.65, 0.15, 0.10 and 0.10 respectively, using NumPy's global random state.
    """
    labels = [0, 1, 4, 5]
    weights = [0.65, 0.15, 0.10, 0.10]
    draws = np.random.choice(labels, size=n, p=weights)
    return pd.DataFrame({'closest': draws})

def sampling_function():
    """Simulate a sampling distribution and print the resulting p-value.

    Runs 10000 simulations of 489 draws each, records how many draws in each
    simulation have ``closest == 0``, and prints 100 minus the percentile
    rank of the score 340 within those counts. The printed value is therefore
    on a 0-100 percentage scale.
    """
    zero_counts = []
    for _ in range(10000):
        trial = simulate(489)
        # value_counts() is indexed by the drawn label, so [0] is the count of label 0.
        zero_counts.append(trial.closest.value_counts()[0])
    sampling = pd.DataFrame({'counts': zero_counts})

    # To inspect the distribution visually:
    # sampling.counts.hist(histtype='step')
    # plt.show()

    # Compute the p-value:
    # the smaller the p-value, the more strongly we can reject the null hypothesis.
    percentile = scipy.stats.percentileofscore(sampling.counts, score=340)
    p_value = 100.0 - percentile
    print(p_value)


# Run the simulation only when this file is executed as a script, not when it is imported.
if __name__ == '__main__':
    sampling_function()



100 changes: 100 additions & 0 deletions TEW_07_Anova_Fitting_Models.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,100 @@
#coding:utf-8
"""
------------------------------------------------
@File Name : TEW_07_Anova_Fitting_Models
@Function :
@Author : Minux
@Date : 2018/10/23
@Revised Date : 2018/10/23
------------------------------------------------
"""
import math
import numpy
import pandas as pd
import matplotlib.pyplot as plt
import statsmodels.api as sm
import statsmodels.formula.api as smf
import numpy as np

# Load the Gapminder table from the working directory and keep only the rows for 1985.
gap_minder = pd.read_csv('gapminder.csv')
g_data = gap_minder.query('year==1985')

# Scatter-marker sizes: the population column scaled by 1e-6.
size = g_data.population * 1e-6
# Scatter-marker colours looked up from the region name.
# NOTE(review): regions absent from this mapping map to NaN — confirm that all regions in the data are covered.
colors = g_data.region.map({'Africa':'skyblue','Europe':'gold','America':'palegreen','Asia':'red'})

def plot_data():
    """Scatter-plot ``babies_per_woman`` against ``age5_surviving`` for ``g_data``.

    Markers take their colour from the module-level ``colors`` and their size
    from the module-level ``size``; they are half-transparent with a thin
    black edge. The figure is created but not shown.
    """
    marker_style = dict(
        c=colors,
        s=size,
        linewidths=0.5,
        edgecolor='k',
        alpha=0.5,
    )
    g_data.plot.scatter('age5_surviving', 'babies_per_woman', **marker_style)

# Intercept-only OLS model: the right-hand side '1' fits a single constant to babies_per_woman.
model = smf.ols(formula='babies_per_woman ~ 1', data=g_data)
# Fitted results object; its params are printed later next to the plain column mean.
grand_mean = model.fit()

def plot_fit(fit_model):
    """Plot the data and overlay the predictions of ``fit_model``.

    Draws the base scatter via ``plot_data()``, then adds the model's
    predictions for ``g_data`` as diamond markers at the same
    ``age5_surviving`` positions, coloured by ``colors``, and shows the figure.

    Args:
        fit_model: Fitted results object exposing ``predict(data)``.
    """
    plot_data()
    predicted = fit_model.predict(g_data)
    plt.scatter(
        g_data.age5_surviving,
        predicted,
        c=colors,
        s=30,
        linewidths=0.5,
        edgecolors='k',
        marker='D',
    )
    plt.show()

# Uncomment to visualise the intercept-only fit:
# plot_fit(grand_mean)
# Section banner: 'mean' centred in a 30-character field padded with '-'.
print(np.char.center('mean', 30, '-'))
'''mean'''
# Print the fitted constant and the plain column mean one after the other for comparison.
print(grand_mean.params)
print(g_data.babies_per_woman.mean())

# Section banner for the per-region fit.
print(np.char.center('group mean', 30, '-'))

'''group means'''
# '-1' drops the intercept from the formula, so the model has one coefficient per region level.
group_means = smf.ols(formula='babies_per_woman ~ -1+region', data=g_data).fit()
# Uncomment to visualise this fit:
# plot_fit(group_means)

# Print the fitted coefficients and the per-region means from groupby one after the other for comparison.
print(group_means.params)
print(g_data.groupby('region').babies_per_woman.mean())

# Section banner for the models that include age5_surviving.
print(np.char.center('surviving', 30, '-'))
# No intercept: one constant per region plus a single age5_surviving coefficient shared by all regions.
surviving = smf.ols(formula='babies_per_woman ~ -1 + region + age5_surviving', data=g_data).fit()
# Uncomment to visualise this fit:
# plot_fit(surviving)

'''add intersection term'''
# NOTE(review): "intersection" above presumably means the region-by-age5_surviving *interaction* term.
# The two adjacent string literals concatenate into one formula:
#   'age5_surviving:region' adds an age5_surviving term per region,
#   '-age5_surviving' removes the shared age5_surviving term,
#   'population' is added as a further regressor.
surviving_by_region_population = smf.ols(formula='babies_per_woman ~ -1+region+age5_surviving:region'
'-age5_surviving + population', data=g_data).fit()
# Uncomment to visualise this fit:
# plot_fit(surviving_by_region_population)
print(surviving_by_region_population.params)

'''
Measures of Goodness of Fit
Mean Squared Error of Residuals
R^2 = (Explained Variance)/(Total Variance)
F-statistic : explanatory power of the fit parameters compared to random fit vectors
'''
# Section banner for the goodness-of-fit indicators.
print(np.char.center('Statistics_Indicator',30,'-'))
def statistics_indicator(*args, models=None):
    """Print goodness-of-fit indicators for a set of fitted models.

    For every name in ``args`` a banner (the name centred in 30 characters,
    padded with ``'-'``) is printed, followed by one line per model holding
    the matching attribute:

    * ``'resid'``    -> ``mse_resid``
    * ``'rsquared'`` -> ``rsquared``
    * ``'f_value'``  -> ``fvalue``

    A name that is not listed above only gets its banner.

    Args:
        *args: Indicator names to report, in order.
        models: Optional iterable of fitted results objects exposing the
            attributes above. When omitted, the module-level ``group_means``,
            ``surviving`` and ``surviving_by_region_population`` are used.

    Returns:
        None. All output goes to stdout.
    """
    indicator_attrs = {
        'resid': 'mse_resid',
        'rsquared': 'rsquared',
        'f_value': 'fvalue',
    }
    if models is None:
        # Resolved at call time so the default always reflects the current fits.
        models = [group_means, surviving, surviving_by_region_population]
    else:
        # Materialise once: the models are iterated again for every indicator.
        models = list(models)

    for arg in args:
        print(np.char.center(arg, 30, '-'))
        # Look names up by value. The previous `arg is 'resid'` identity test
        # only worked when the interpreter happened to intern both strings,
        # and silently skipped equal strings built at runtime.
        attr = indicator_attrs.get(arg) if isinstance(arg, str) else None
        if attr is None:
            continue
        for fitted in models:
            print(getattr(fitted, attr))

# Report the three indicators; 'xx' is not a recognised indicator name, so only its banner is printed.
statistics_indicator('resid','rsquared','f_value','xx')

# Regression summary table for the model with a shared age5_surviving coefficient.
print(surviving.summary())

# ANOVA table for the per-region model.
print(sm.stats.anova_lm(group_means))












Binary file added bootstrap_sample.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading