Skip to content

cross_validation #3

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 7 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
55 changes: 55 additions & 0 deletions TEW_06_hypothesis_testing_p_value_CI.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,55 @@
#coding:utf-8
"""
------------------------------------------------
@File Name : TEW_06_hypothesis_testing_p_value_CI
@Function :
@Author : Minux
@Date : 2018/10/16
@Revised Date : 2018/10/16
------------------------------------------------
"""
import math
import io
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt

import scipy.stats
import scipy.optimize
import scipy.spatial

# Load the two tables used by the functions below. Both CSVs are read from the
# working directory at import time, so they must exist before this module runs.
cholera = pd.read_csv('cholera.csv') # cholera data
pumps = pd.read_csv('pumps.csv') # water pump data

def Plot_Cholera_func():
    """Show the pump and cholera coordinates on top of the `london.png` image.

    Pumps are drawn as blue markers and cholera records as small red markers.
    The image is stretched over the square [-0.38, 0.38] x [-0.38, 0.38].
    """
    background = plt.imread('london.png')
    plt.figure(figsize=(10, 10))
    map_extent = [-0.38, 0.38, -0.38, 0.38]
    plt.imshow(background, extent=map_extent)
    # Pumps first, then the (smaller) cholera markers on top of them.
    plt.scatter(pumps.x, pumps.y, color='b')
    plt.scatter(cholera.x, cholera.y, color='r', s=3)
    plt.show()

def Data_stat_info():
    """Print per-`closest` record counts and per-`closest` death totals.

    `closest` is presumably the identifier of the nearest pump -- confirm
    against the data set.
    """
    rule = '-' * 10
    records_per_group = cholera.closest.value_counts()
    print(records_per_group)
    print(rule, 'GroupBy_Closest', rule)
    deaths_per_group = cholera.groupby('closest').deaths.sum()
    print(deaths_per_group)

def simulate(n):
    """Return a DataFrame with `n` random `closest` labels.

    Labels are drawn from {0, 1, 4, 5} with probabilities
    0.65, 0.15, 0.10 and 0.10 using NumPy's global random state.
    """
    labels = [0, 1, 4, 5]
    weights = [0.65, 0.15, 0.10, 0.10]
    draws = np.random.choice(labels, size=n, p=weights)
    return pd.DataFrame({'closest': draws})

def sampling_function():
    """Simulate the sampling distribution of the label-0 count and print a p-value.

    Runs `simulate(489)` 10000 times, records how often label 0 occurs in
    each run, and prints 100 minus the percentile rank of the score 340
    within those counts (so the printed value is on a 0-100 scale).
    """
    zero_counts = [
        simulate(489).closest.value_counts()[0]
        for _ in range(10000)
    ]
    sampling = pd.DataFrame({'counts': zero_counts})
    # To inspect the distribution visually:
    # sampling.counts.hist(histtype='step')
    # plt.show()
    # Compute the p-value: the smaller it is, the more strongly the null
    # hypothesis can be rejected.
    percentile_rank = scipy.stats.percentileofscore(sampling.counts, score=340)
    p_value = 100.0 - percentile_rank
    print(p_value)


# Script entry point: run the simulated p-value computation.
if __name__ == '__main__':
sampling_function()



100 changes: 100 additions & 0 deletions TEW_07_Anova_Fitting_Models.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,100 @@
#coding:utf-8
"""
------------------------------------------------
@File Name : TEW_07_Anova_Fitting_Models
@Function :
@Author : Minux
@Date : 2018/10/23
@Revised Date : 2018/10/23
------------------------------------------------
"""
import math
import numpy
import pandas as pd
import matplotlib.pyplot as plt
import statsmodels.api as sm
import statsmodels.formula.api as smf
import numpy as np

# Gapminder table restricted to the rows for 1985; the plots and models in
# this file all work on `g_data`.
gap_minder = pd.read_csv('gapminder.csv')
g_data = gap_minder.query('year==1985')

# Marker size is the population scaled by 1e-6; marker colour is chosen per region.
size = 1e-6 * g_data.population
colors = g_data.region.map(
    dict(Africa='skyblue', Europe='gold', America='palegreen', Asia='red')
)

def plot_data():
    """Scatter `babies_per_woman` against `age5_surviving` for the 1985 rows.

    Uses the module-level `colors` and `size` for marker colour and area.
    Does not call `plt.show()`; the caller decides when to display.
    """
    marker_style = dict(c=colors, s=size, linewidths=0.5, edgecolor='k', alpha=0.5)
    g_data.plot.scatter(x='age5_surviving', y='babies_per_woman', **marker_style)

# Intercept-only OLS model of babies_per_woman on the 1985 rows; the fitted
# result is kept as `grand_mean`.
model = smf.ols(formula='babies_per_woman ~ 1', data=g_data)
grand_mean = model.fit()

def plot_fit(fit_model):
    """Overlay a fitted model's predictions on the 1985 scatter plot and show it.

    Args:
        fit_model: Object with a `predict(data)` method (the fitted OLS
            results in this file). Its predictions for `g_data` are drawn
            as diamond markers over the raw data.
    """
    plot_data()
    predicted = fit_model.predict(g_data)
    diamond_style = dict(c=colors, s=30, linewidths=0.5, edgecolors='k', marker='D')
    plt.scatter(g_data.age5_surviving, predicted, **diamond_style)
    plt.show()

# plot_fit(grand_mean)

# --- mean ---
# The intercept-only fit is printed next to the plain sample mean for comparison.
print(np.char.center('mean', 30, '-'))
print(grand_mean.params)
print(g_data.babies_per_woman.mean())

# --- group means ---
# One coefficient per region and no intercept; printed next to the
# per-region sample means for comparison.
print(np.char.center('group mean', 30, '-'))
group_means = smf.ols(data=g_data, formula='babies_per_woman ~ -1+region').fit()
# plot_fit(group_means)
print(group_means.params)
print(g_data.groupby('region').babies_per_woman.mean())

# --- surviving ---
# Region terms plus a single age5_surviving term.
print(np.char.center('surviving', 30, '-'))
surviving = smf.ols(
    data=g_data,
    formula='babies_per_woman ~ -1 + region + age5_surviving',
).fit()
# plot_fit(surviving)

# --- add interaction term ---
# age5_surviving enters only through its interaction with region, and
# population is added as a further term.
surviving_by_region_population = smf.ols(
    data=g_data,
    formula='babies_per_woman ~ -1+region+age5_surviving:region-age5_surviving + population',
).fit()
# plot_fit(surviving_by_region)
print(surviving_by_region_population.params)

# Measures of goodness of fit:
#   - Mean squared error of the residuals
#   - R^2 = (explained variance) / (total variance)
#   - F-statistic: explanatory power of the fit parameters compared to
#     random fit vectors
print(np.char.center('Statistics_Indicator', 30, '-'))
def statistics_indicator(*args, models=None):
    """Print one goodness-of-fit statistic per requested name for each model.

    A centred, 30-character header is printed for every name in `args`.
    For a recognised name the matching attribute of every model is printed
    on its own line; an unrecognised name prints only its header.

    Args:
        *args: Statistic names. Recognised values are 'resid'
            (attribute `mse_resid`), 'rsquared' (`rsquared`) and
            'f_value' (`fvalue`).
        models: Optional iterable of fitted results exposing those
            attributes. When omitted, the module-level `group_means`,
            `surviving` and `surviving_by_region_population` fits are used.
    """
    # Look names up by value. The previous `arg is 'resid'` tests compared
    # object identity, which only matches when both strings happen to be
    # interned and raises a SyntaxWarning on Python 3.8+.
    attribute_for = {
        'resid': 'mse_resid',
        'rsquared': 'rsquared',
        'f_value': 'fvalue',
    }
    if models is None:
        models = [group_means, surviving, surviving_by_region_population]
    else:
        # Materialise once so a generator can be reused for every name.
        models = list(models)

    for arg in args:
        print(np.char.center(arg, 30, '-'))
        attribute = attribute_for.get(arg)
        if attribute is None:
            continue
        for fitted in models:
            print(getattr(fitted, attribute))

# Report residual MSE, R^2 and F-value for the three region-based fits.
# 'xx' is not a recognised name, so only its header line is printed.
statistics_indicator('resid','rsquared','f_value','xx')

# Full regression summary of the region + age5_surviving model.
print(surviving.summary())

# ANOVA table for the group-means model.
print(sm.stats.anova_lm(group_means))












Binary file added bootstrap_sample.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
77 changes: 77 additions & 0 deletions engineering_04_boostraping_match.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,77 @@
#coding:utf-8
"""
------------------------------------------------
@File Name : engineering_04_boostraping_match
@Function :
@Author : Minux
@Date : 2018/10/12
@Revised Date : 2018/10/12
------------------------------------------------
"""
import math
import io
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import scipy.stats
import scipy.optimize
import scipy.spatial


def generate_random_grades():
    """Write 100 uniform random grades in [0, 10) to `grades.csv`.

    The file has a single `grades` column and no index column. Values come
    from NumPy's global random state.
    """
    random_grades = np.random.random_sample(100) * 10
    pd.DataFrame({'grades': random_grades}).to_csv('grades.csv', index=False)

# Grades table used by the functions below. It is read at import time, so
# grades.csv must already exist in the working directory when this module loads.
pop = pd.read_csv(r'./grades.csv')
def data_peek():
    """Print the first rows and summary statistics of `pop`, then show a grade histogram."""
    for overview in (pop.head(), pop.describe()):
        print(overview)
    grades = pop.grades
    grades.hist(histtype='step')
    plt.show()


# Bootstrap the mean of `pop.grades`: draw 10000 resamples of 100 rows with
# replacement and collect each resample's mean in `bootstrap.sample_mean`.
# `stats_flag` (default True) selects printing the 0.025 and 0.975 quantiles
# of the bootstrap means; the other path draws a step histogram of them.
# NOTE(review): indentation was lost in this copy, so it cannot be confirmed
# here whether the axvline / savefig / show calls belong to the `else` branch
# or run after the if/else -- verify against the original file.
def bootstrap_sample(stats_flag=True):
bootstrap = pd.DataFrame({'sample_mean':[pop.sample(100, replace=True).grades.mean() for _ in range(10000)]})
# print(bootstrap.head(10))
if stats_flag:
# Bounds of the central 95% of the bootstrap means.
print('quantile(0.025) is {}'.format(bootstrap.sample_mean.quantile(0.025)))
print('quantile(0.975) is {}'.format(bootstrap.sample_mean.quantile(0.975)))
else:
bootstrap.sample_mean.hist(histtype='step')
# Vertical line at the mean of the original grades.
plt.axvline(pop.grades.mean(), color='C1')
# The figure is written to disk before it is shown.
plt.savefig('bootstrap_sample.png')
plt.show()
# Two frozen normal distributions (loc 7.5 and loc 4, both with scale 1);
# they are the components of the equal-weight mixture used below.
n1 = scipy.stats.norm(7.5,1)
n2 = scipy.stats.norm(4,1)

def bimodal_distribution():
    """Plot the equal-weight mixture density of `n1` and `n2` on [0, 10]."""
    grid = np.linspace(0, 10, 100)
    mixture_density = 0.5 * n1.pdf(grid) + 0.5 * n2.pdf(grid)
    plt.plot(grid, mixture_density)
    plt.show()

def draw():
    """Return one random value from the `n1`/`n2` mixture, restricted to [0, 10].

    Each attempt picks `n1` or `n2` with equal probability and samples it;
    values outside [0, 10] are discarded and the draw is repeated.
    """
    while True:
        component = n1 if np.random.rand() < 0.5 else n2
        v = component.rvs()
        if not (0 <= v <= 10):
            continue
        return v

def data_set(n=100):
    """Return a DataFrame with a `grade` column holding `n` values from `draw()`."""
    grades = []
    for _ in range(n):
        grades.append(draw())
    return pd.DataFrame({'grade': grades})

def plot_sample_distribution():
    """Plot two step histograms of sample means on the same axes.

    The first uses 1000 freshly simulated data sets from `data_set()`; the
    second uses 1000 bootstrap resamples (100 rows, with replacement) of `pop`.
    """
    simulated_means = [data_set().grade.mean() for _ in range(1000)]
    mean = pd.DataFrame({'mean_grade': simulated_means})
    mean.mean_grade.hist(histtype='step')

    resampled_means = [
        pop.sample(100, replace=True).grades.mean()
        for _ in range(1000)
    ]
    bootstrap = pd.DataFrame({'sample_mean': resampled_means})
    bootstrap.sample_mean.hist(histtype='step')
    plt.show()

# Script entry point. Only the last demo is active; the commented-out calls
# are the other demos defined in this file and can be enabled one at a time.
if __name__ == '__main__':
# generate_random_grades()
# data_peek()
# bootstrap_sample()
# bimodal_distribution()
plot_sample_distribution()


36 changes: 36 additions & 0 deletions enginerring_01_two_quantiative_variables.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
#coding:utf-8
"""
------------------------------------------------
@File Name : enginerring_01_two_quantiative_variables
@Function :
@Author : Minux
@Date : 2018/9/17
@Revised Date : 2018/9/17
------------------------------------------------
"""
import numpy as np
import scipy.stats
import pandas as pd
import matplotlib.pyplot as plt
import re
import mailbox
import csv

# Compare Italy and the United States: life expectancy against GDP per day
# (log-scaled x axis), one marker per row of the gapminder table.
gapminder = pd.read_csv('gapminder.csv')
# print(gapminder.info())
italy = gapminder.query('country == "Italy"')

# Earlier single-country explorations, kept for reference:
# italy.plot.scatter('year','population')
# gapminder.query('country == "India"').plot.scatter('year','population',label='India')
# italy.plot.scatter('year','gdp_per_day',logy=True)
# italy.plot.scatter('gdp_per_day','life_expectancy',logx=True)

data = gapminder.query('(country == "Italy") or (country == "United States")')
# Marker sizes must line up one-to-one with the plotted rows, so derive them
# from `data` (both countries), not from `italy` alone. The Italy-only array
# has a different length than the points being drawn.
# Rows whose year is a multiple of 10 get the large marker.
size = np.where(data.year % 10 == 0, 32, 2)
color = np.where(data.country == 'Italy', 'blue', 'orange')
data.plot.scatter('gdp_per_day', 'life_expectancy', logx=True, c=color, s=size)

# NOTE(review): no label is passed to the scatter call above, so this legend
# presumably has nothing to show -- confirm whether per-country labels are wanted.
plt.legend()
plt.show()
99 changes: 99 additions & 0 deletions enginerring_02_graph_visualization.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,99 @@
#coding:utf-8
"""
------------------------------------------------
@File Name : enginerring_02_graph_visualization
@Function :
@Author : Minux
@Date : 2018/9/24
@Revised Date : 2018/9/24
------------------------------------------------
"""
import numpy as np
import scipy.stats
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib
from matplotlib import style
style.use('ggplot')
import pandas.plotting

from IPython import display
from ipywidgets import interact,widgets

import re
import mailbox
import csv


# Gapminder table shared by every function in this file; read at import time
# from the working directory.
gap_minder = pd.read_csv('gapminder.csv')

def plot_year(year, info_tag=False):
    """Draw the GDP-per-day vs. life-expectancy bubble chart for one year.

    Marker area is proportional to population, fill colour encodes
    `age5_surviving` on the reversed 'Purples' colormap (clipped to 55-100),
    and the marker edge colour encodes the region. Dotted vertical lines
    mark GDP-per-day levels 4, 16 and 64. The figure is created but not
    shown; the caller decides when to display it.

    Args:
        year: Value matched against the `year` column of `gap_minder`.
        info_tag: When true, also print the regional population totals via
            `population_info()`.
    """
    # Largest populations first, presumably so that small bubbles are drawn
    # last and stay visible on top of the big ones.
    data = gap_minder[gap_minder.year == year].sort_values('population', ascending=False)

    if info_tag:
        population_info()

    color = data.age5_surviving
    area = 3e-6 * data.population

    # Customise the edge colour per region.
    edgecolor = data.region.map(
        {'Africa': 'skyblue', 'Europe': 'gold', 'America': 'palegreen', 'Asia': 'coral'}
    )

    # plt.cla()
    # Pass the colormap by name: `matplotlib.cm.get_cmap` was deprecated and
    # later removed from Matplotlib, while the name string works everywhere.
    data.plot.scatter('gdp_per_day', 'life_expectancy', logx=True, s=area, c=color,
                      colormap='Purples_r', vmin=55, vmax=100, linewidths=1,
                      edgecolors=edgecolor, sharex=False, figsize=(10, 7))

    for level in [4, 16, 64]:
        plt.axvline(level, linestyle=':', color='k')

    plt.axis(xmin=1, xmax=500, ymin=30, ymax=100)
    plt.title('GDP-LIFE-EXPECTANCY_{}'.format(year))
    plt.xlabel('$gdp-per-day$')
    plt.ylabel('$life-expectancy$')


def population_info():
    """Print the total population per region for the year 2015."""
    rows_2015 = gap_minder[gap_minder.year == 2015]
    totals_by_region = rows_2015.groupby('region').population.sum()
    print(totals_by_region)

def dynamic_plotting_func():
    """Show the yearly bubble charts for 1965-1967 one after another.

    Each chart is displayed for two seconds in interactive mode. Every
    figure except the last is closed before the next one is drawn; the last
    one stays open in a blocking `plt.show()`.
    """
    # Alternative entry points tried earlier:
    # interact(plot_year, year=range(1960,1970))
    # population_info()
    years = [1965, 1966, 1967]
    final_year = years[-1]
    plt.ion()
    for current_year in years:
        plot_year(current_year)
        plt.pause(2)
        if current_year == final_year:
            continue
        plt.close()
    plt.ioff()
    plt.show()

def plot_matrix_func():
    """Show a scatter matrix of four indicators for the year 2015.

    The plotted columns are log10 of `gdp_per_day`, `life_expectancy`,
    `age5_surviving` and `babies_per_woman`.

    The work is done on a frame derived from `gap_minder`, not on
    `gap_minder` itself. Re-indexing the shared table in place would remove
    its `year` column, which breaks a second call to this function as well
    as `plot_year` and `population_info`.
    """
    # set_index without inplace returns a new frame, so the added column
    # below never reaches the module-level table.
    by_year = gap_minder.set_index('year')
    by_year['log10_gdp_per_day'] = np.log10(by_year['gdp_per_day'])
    columns = ['log10_gdp_per_day', 'life_expectancy', 'age5_surviving', 'babies_per_woman']
    data = by_year.loc[2015, columns]
    pandas.plotting.scatter_matrix(data, figsize=(9, 9))
    plt.show()

# Script entry point: show the 2015 scatter matrix.
if __name__ == '__main__':
plot_matrix_func()


















Loading