theengineeringworld · siliconMagic · Sep 24, 2018 · Sep 24, 2018 · Sep 24, 2018 · Oct 12, 2018
diff --git a/TEW_06_hypothesis_testing_p_value_CI.py b/TEW_06_hypothesis_testing_p_value_CI.py
@@ -0,0 +1,55 @@
+#coding:utf-8
+"""
+------------------------------------------------
+@File Name    : TEW_06_hypothesis_testing_p_value_CI
+@Function     : 
+@Author       : Minux
+@Date         : 2018/10/16
+@Revised Date : 2018/10/16
+------------------------------------------------
+"""
+import math
+import io
+import numpy as np
+import pandas as pd
+
+import matplotlib.pyplot as plt
+
+import scipy.stats
+import scipy.optimize
+import scipy.spatial
+
+# Module-level inputs shared by the functions below (read from the working directory).
+cholera = pd.read_csv('cholera.csv') # cholera data (x, y, closest, deaths columns are used below)
+pumps = pd.read_csv('pumps.csv')     # water pump data (x, y columns are used below)
+
+def Plot_Cholera_func():
+    """Overlay the pump and cholera coordinates on the London map image.
+
+    Reads ``london.png`` from the working directory, stretches it over
+    [-0.38, 0.38] on both axes of a 10x10-inch figure, then scatters the
+    module-level ``pumps`` (blue) and ``cholera`` (red, small markers)
+    x/y coordinates on top and shows the figure.
+    """
+    plt.figure(figsize=(10, 10))
+    london_map = plt.imread('london.png')
+    map_extent = [-0.38, 0.38, -0.38, 0.38]
+    plt.imshow(london_map, extent=map_extent)
+    # Pumps first, cases second, so the small red markers are drawn on top.
+    layers = ((pumps, {'color': 'b'}), (cholera, {'color': 'r', 's': 3}))
+    for frame, marker_opts in layers:
+        plt.scatter(frame.x, frame.y, **marker_opts)
+    plt.show()
+
+def Data_stat_info():
+    """Print two summaries of the module-level ``cholera`` table.
+
+    First the number of rows per ``closest`` value, then (after a separator
+    line) the sum of ``deaths`` grouped by ``closest``.
+    """
+    rows_per_closest = cholera.closest.value_counts()
+    print(rows_per_closest)
+    dashes = '-' * 10
+    print(dashes, 'GroupBy_Closest', dashes)
+    deaths_per_closest = cholera.groupby('closest').deaths.sum()
+    print(deaths_per_closest)
+
+def simulate(n):
+    """Return a DataFrame with one ``closest`` column holding ``n`` random labels.
+
+    Each label is drawn independently from 0, 1, 4 and 5 with probabilities
+    0.65, 0.15, 0.10 and 0.10, using NumPy's global random state.
+    """
+    labels = [0, 1, 4, 5]
+    weights = [0.65, 0.15, 0.10, 0.10]
+    draws = np.random.choice(labels, size=n, p=weights)
+    return pd.DataFrame({'closest': draws})
+
+def sampling_function(n_obs=489, observed=340, n_trials=10000):
+    """Estimate a one-sided p-value for the label-0 count by simulation.
+
+    Runs ``simulate(n_obs)`` ``n_trials`` times, records how many rows of
+    each simulated sample have ``closest == 0``, and ranks ``observed``
+    against that sampling distribution.
+
+    Args:
+        n_obs: number of rows drawn in every simulated sample.
+        observed: the count whose extremeness is being assessed.
+        n_trials: number of simulated samples.
+
+    Returns:
+        The p-value expressed in percent (0-100), i.e. 100 minus the
+        percentile rank of ``observed``. The same value is printed.
+    """
+    # Count the zeros directly: ``value_counts()[0]`` raises KeyError when a
+    # simulated sample happens to contain no 0 at all (small ``n_obs``).
+    zero_counts = [int((simulate(n_obs).closest == 0).sum()) for _ in range(n_trials)]
+    sampling = pd.DataFrame({'counts': zero_counts})
+    # To inspect the sampling distribution visually:
+    # sampling.counts.hist(histtype='step')
+    # plt.show()
+    # Compute the p-value: the smaller it is, the more strongly we can
+    # reject the null hypothesis.
+    p_value = 100.0 - scipy.stats.percentileofscore(sampling.counts, score=observed)
+    print(p_value)
+    return p_value
+
+
+# Script entry point: run the simulation and print the resulting p-value (in percent).
+if __name__ == '__main__':
+    sampling_function()
+
+
+
diff --git a/TEW_07_Anova_Fitting_Models.py b/TEW_07_Anova_Fitting_Models.py
@@ -0,0 +1,100 @@
+#coding:utf-8
+"""
+------------------------------------------------
+@File Name    : TEW_07_Anova_Fitting_Models
+@Function     : 
+@Author       : Minux
+@Date         : 2018/10/23
+@Revised Date : 2018/10/23
+------------------------------------------------
+"""
+import math
+import numpy
+import pandas as pd
+import matplotlib.pyplot as plt
+import statsmodels.api as sm
+import statsmodels.formula.api as smf
+import numpy as np
+
+# Load the gapminder table and keep only the rows for the year 1985.
+gap_minder = pd.read_csv('gapminder.csv')
+g_data = gap_minder.query('year==1985')
+
+# Per-row marker size (population scaled by 1e-6) and marker colour (looked up
+# from the region name) shared by the plotting helpers below.
+size = g_data.population * 1e-6
+colors = g_data.region.map({'Africa':'skyblue','Europe':'gold','America':'palegreen','Asia':'red'})
+
+def plot_data():
+    """Scatter ``babies_per_woman`` against ``age5_surviving`` for ``g_data``.
+
+    Marker colour and size come from the module-level ``colors`` and
+    ``size`` series; the figure is not shown here.
+    """
+    marker_style = dict(c=colors, s=size, linewidths=0.5, edgecolor='k', alpha=0.5)
+    g_data.plot.scatter('age5_surviving', 'babies_per_woman', **marker_style)
+
+# Intercept-only OLS fit; its single parameter is printed next to the plain
+# sample mean of babies_per_woman further down for comparison.
+model = smf.ols(formula='babies_per_woman ~ 1', data=g_data)
+grand_mean = model.fit()
+
+def plot_fit(fit_model):
+    """Plot the 1985 data and overlay the predictions of ``fit_model``.
+
+    Args:
+        fit_model: a fitted model exposing ``predict(DataFrame)``; it is
+            evaluated on the module-level ``g_data``.
+
+    The predictions are drawn as diamond markers coloured by region, then
+    the figure is shown.
+    """
+    plot_data()
+    predicted = fit_model.predict(g_data)
+    plt.scatter(
+        g_data.age5_surviving,
+        predicted,
+        c=colors,
+        s=30,
+        linewidths=0.5,
+        edgecolors='k',
+        marker='D',
+    )
+    plt.show()
+
+# plot_fit(grand_mean)
+
+# --- mean: the intercept-only parameter next to the plain sample mean ---
+print(np.char.center('mean', 30, '-'))
+print(grand_mean.params)
+print(g_data.babies_per_woman.mean())
+
+# --- group means: one coefficient per region, no global intercept ---
+print(np.char.center('group mean', 30, '-'))
+group_means = smf.ols(formula='babies_per_woman ~ -1+region', data=g_data).fit()
+# plot_fit(group_means)
+print(group_means.params)
+print(g_data.groupby('region').babies_per_woman.mean())
+
+# --- region terms plus a common age5_surviving term ---
+print(np.char.center('surviving', 30, '-'))
+surviving = smf.ols(formula='babies_per_woman ~ -1 + region + age5_surviving', data=g_data).fit()
+# plot_fit(surviving)
+
+# --- add an interaction term (age5_surviving:region) and population ---
+surviving_by_region_population = smf.ols(
+    formula='babies_per_woman ~ -1+region+age5_surviving:region'
+            '-age5_surviving + population',
+    data=g_data,
+).fit()
+# plot_fit(surviving_by_region_population)
+print(surviving_by_region_population.params)
+
+# Measures of goodness of fit:
+#   * Mean squared error of residuals
+#   * R^2 = (explained variance) / (total variance)
+#   * F-statistic: explanatory power of the fit parameters compared to
+#     random fit vectors
+print(np.char.center('Statistics_Indicator',30,'-'))
+def statistics_indicator(*args):
+    """Print one goodness-of-fit statistic per fitted model for each name given.
+
+    Args:
+        *args: statistic names. Recognised names and the model attribute
+            they print are ``'resid'`` (``mse_resid``), ``'rsquared'``
+            (``rsquared``) and ``'f_value'`` (``fvalue``). Any other name
+            only gets its header line printed.
+
+    For every recognised name the value is printed for ``group_means``,
+    ``surviving`` and ``surviving_by_region_population``, in that order.
+    """
+    # Public statistic name -> attribute read from each fitted model.
+    attribute_for = {'resid': 'mse_resid', 'rsquared': 'rsquared', 'f_value': 'fvalue'}
+    for arg in args:
+        print(np.char.center(arg, 30, '-'))
+        # Look the name up by value. The previous ``arg is 'resid'`` tests
+        # compared object identity, which only matches when the interpreter
+        # happens to intern both strings (and warns on Python 3.8+).
+        attribute = attribute_for.get(arg)
+        if attribute is None:
+            continue
+        for fitted in [group_means, surviving, surviving_by_region_population]:
+            print(getattr(fitted, attribute))
+
+# Print the three statistics for every model; 'xx' is not a recognised name,
+# so it falls through to the final `else` branch after its header is printed.
+statistics_indicator('resid','rsquared','f_value','xx')
+
+# Regression summary for the region + age5_surviving model.
+print(surviving.summary())
+
+# ANOVA table for the group-means model.
+print(sm.stats.anova_lm(group_means))
+
+
+
+
+
+
+
+
+
+
+
+
diff --git a/bootstrap_sample.png b/bootstrap_sample.png
diff --git a/engineering_04_boostraping_match.py b/engineering_04_boostraping_match.py
@@ -0,0 +1,77 @@
+#coding:utf-8
+"""
+------------------------------------------------
+@File Name    : engineering_04_boostraping_match
+@Function     : 
+@Author       : Minux
+@Date         : 2018/10/12
+@Revised Date : 2018/10/12
+------------------------------------------------
+"""
+import math
+import io
+import numpy as np
+import pandas as pd
+import matplotlib.pyplot as plt
+
+import scipy.stats
+import scipy.optimize
+import scipy.spatial
+
+
+def generate_random_grades():
+    """Write 100 uniformly random grades in [0, 10) to ``grades.csv``.
+
+    The file gets a single ``grades`` column and no index column; values
+    come from NumPy's global random state.
+    """
+    grades = np.random.random_sample(100) * 10
+    pd.DataFrame({'grades': grades}).to_csv('grades.csv', index=False)
+
+# Population of grades used by the bootstrap helpers below.
+# NOTE: this runs at import time, so ./grades.csv must already exist
+# (generate_random_grades() writes a file with that name).
+pop = pd.read_csv(r'./grades.csv')
+def data_peek():
+    """Print the first rows and summary statistics of ``pop``, then plot it.
+
+    The plot is a step histogram of the ``grades`` column, shown immediately.
+    """
+    for overview in (pop.head(), pop.describe()):
+        print(overview)
+    pop.grades.hist(histtype='step')
+    plt.show()
+
+
+def bootstrap_sample(stats_flag=True):
+    """Bootstrap the mean grade and report or plot the result.
+
+    Draws 10000 resamples of 100 rows (with replacement) from ``pop`` and
+    records the mean of ``grades`` for each one.
+
+    Args:
+        stats_flag: when true, print the 0.025 and 0.975 quantiles of the
+            resampled means; otherwise plot their step histogram with a
+            vertical line at the mean of ``pop.grades``, save the figure as
+            ``bootstrap_sample.png`` and show it.
+    """
+    resampled_means = [pop.sample(100, replace=True).grades.mean() for _ in range(10000)]
+    bootstrap = pd.DataFrame({'sample_mean': resampled_means})
+    # print(bootstrap.head(10))
+    if not stats_flag:
+        bootstrap.sample_mean.hist(histtype='step')
+        plt.axvline(pop.grades.mean(), color='C1')
+        plt.savefig('bootstrap_sample.png')
+        plt.show()
+        return
+    lower = bootstrap.sample_mean.quantile(0.025)
+    print('quantile(0.025) is {}'.format(lower))
+    upper = bootstrap.sample_mean.quantile(0.975)
+    print('quantile(0.975) is {}'.format(upper))
+# Two frozen normal distributions (loc 7.5 and loc 4, both with scale 1);
+# the functions below mix them with equal weight.
+n1 = scipy.stats.norm(7.5,1)
+n2 = scipy.stats.norm(4,1)
+
+def bimodal_distribution():
+    """Plot the equal-weight mixture density of ``n1`` and ``n2`` on [0, 10].
+
+    The density is evaluated on 100 evenly spaced points and the figure is
+    shown immediately.
+    """
+    grid = np.linspace(0, 10, 100)
+    mixture_pdf = 0.5*n1.pdf(grid) + 0.5*n2.pdf(grid)
+    plt.plot(grid, mixture_pdf)
+    plt.show()
+
+def draw():
+    """Return one random value from the ``n1``/``n2`` mixture within [0, 10].
+
+    A component is picked with probability 0.5 each and sampled; values
+    outside the closed interval [0, 10] are rejected and the draw repeats.
+    """
+    while True:
+        component = n1 if np.random.rand() < 0.5 else n2
+        v = component.rvs()
+        if not 0 <= v <= 10:
+            continue  # out of range: draw again
+        return v
+
+def data_set(n=100):
+    """Return a DataFrame whose ``grade`` column holds ``n`` values from ``draw()``.
+
+    Args:
+        n: number of values to draw (default 100).
+    """
+    drawn_grades = [draw() for _ in range(n)]
+    return pd.DataFrame({'grade': drawn_grades})
+
+def plot_sample_distribution():
+    """Compare simulated sample means with bootstrap means on one histogram.
+
+    Draws 1000 fresh data sets with ``data_set()`` and records the mean grade
+    of each, then draws 1000 bootstrap resamples (100 rows, with replacement)
+    from ``pop`` and records their mean grade. Both sets of means are plotted
+    as step histograms on the same axes and the figure is shown.
+    """
+    simulated_means = [data_set().grade.mean() for _ in range(1000)]
+    mean = pd.DataFrame({'mean_grade': simulated_means})
+    mean.mean_grade.hist(histtype='step')
+
+    resampled_means = [pop.sample(100, replace=True).grades.mean() for _ in range(1000)]
+    bootstrap = pd.DataFrame({'sample_mean': resampled_means})
+    bootstrap.sample_mean.hist(histtype='step')
+
+    plt.show()
+
+# Script entry point. The commented-out calls are the other demos defined in
+# this file; only the sampling-distribution comparison is run.
+if __name__ == '__main__':
+    # generate_random_grades()
+    # data_peek()
+    # bootstrap_sample()
+    # bimodal_distribution()
+    plot_sample_distribution()
+
+
diff --git a/enginerring_01_two_quantiative_variables.py b/enginerring_01_two_quantiative_variables.py
@@ -0,0 +1,36 @@
+#coding:utf-8
+"""
+------------------------------------------------
+@File Name    : enginerring_01_two_quantiative_variables
+@Function     : 
+@Author       : Minux
+@Date         : 2018/9/17
+@Revised Date : 2018/9/17
+------------------------------------------------
+"""
+import numpy as np
+import scipy.stats
+import pandas as pd
+import matplotlib.pyplot as plt
+import re
+import mailbox
+import csv
+
+# Flat exploration script: compare Italy and the United States on a
+# GDP-per-day vs life-expectancy scatter plot (log-scaled x axis).
+gapminder = pd.read_csv('gapminder.csv')
+# print(gapminder.info())
+italy = gapminder.query('country == "Italy"')
+# italy.plot.scatter('year','population')
+
+# gapminder.query('country == "India"').plot.scatter('year','population',label='India')
+
+# italy.plot.scatter('year','gdp_per_day',logy=True)
+
+# italy.plot.scatter('gdp_per_day','life_expectancy',logx=True)
+
+data = gapminder.query('(country == "Italy") or (country == "United States")')
+# Marker sizes need one entry per plotted row, so they are derived from
+# `data` (both countries) rather than from `italy` alone; a size array of the
+# wrong length is rejected by current Matplotlib. Years divisible by 10 get
+# the large marker.
+size = np.where(data.year%10==0,32,2)
+color = np.where(data.country == 'Italy','blue','orange')
+data.plot.scatter('gdp_per_day','life_expectancy',logx=True,c=color,s=size)
+
+plt.legend()
+plt.show()
diff --git a/enginerring_02_graph_visualization.py b/enginerring_02_graph_visualization.py
@@ -0,0 +1,99 @@
+#coding:utf-8
+"""
+------------------------------------------------
+@File Name    : enginerring_02_graph_visualization
+@Function     : 
+@Author       : Minux
+@Date         : 2018/9/24
+@Revised Date : 2018/9/24
+------------------------------------------------
+"""
+import numpy as np
+import scipy.stats
+import pandas as pd
+import matplotlib.pyplot as plt
+import matplotlib
+from matplotlib import style
+style.use('ggplot')
+import pandas.plotting
+
+from IPython import display
+from ipywidgets import interact,widgets
+
+import re
+import mailbox
+import csv
+
+
+# Module-level gapminder table read from the working directory; every
+# function below reads from it.
+gap_minder = pd.read_csv('gapminder.csv')
+
+def plot_year(year,info_tag=False):
+    """Draw the GDP-per-day vs life-expectancy bubble chart for one year.
+
+    Args:
+        year: value matched against ``gap_minder.year`` to select the rows.
+        info_tag: when true, also call ``population_info()`` before plotting.
+
+    Bubble area is proportional to population, fill colour encodes
+    ``age5_surviving`` (colour scale clipped to 55-100) and the edge colour
+    encodes the region. The figure is created but not shown.
+    """
+    # Rows are plotted in descending population order.
+    data = gap_minder[gap_minder.year==year].sort_values('population', ascending=False)
+
+    if info_tag:
+        population_info()
+
+    color = data.age5_surviving
+    area = 3e-6*data.population
+
+    # customize edge color
+    edgecolor = data.region.map({'Africa':'skyblue','Europe':'gold','America':'palegreen','Asia':'coral'})
+
+    # The colormap is passed by name: matplotlib.cm.get_cmap() is deprecated
+    # and has been removed from recent Matplotlib releases, while pandas
+    # accepts a colormap name directly.
+    data.plot.scatter('gdp_per_day','life_expectancy',logx=True,s=area,c=color,
+                      colormap='Purples_r',vmin=55,vmax=100,linewidths=1,edgecolors=edgecolor,
+                      sharex=False, figsize=(10,7))
+
+    # Dotted vertical guides at 4, 16 and 64 on the GDP-per-day axis.
+    for level in [4, 16, 64]:
+        plt.axvline(level, linestyle=':', color='k')
+
+    plt.axis(xmin=1, xmax=500, ymin=30, ymax=100)
+    plt.title('GDP-LIFE-EXPECTANCY_{}'.format(year))
+    plt.xlabel('$gdp-per-day$')
+    plt.ylabel('$life-expectancy$')
+
+
+def population_info():
+    """Print the summed 2015 population of each region in ``gap_minder``."""
+    rows_2015 = gap_minder[gap_minder.year==2015]
+    print(rows_2015.groupby('region').population.sum())
+
+def dynamic_plotting_func():
+    """Show the bubble charts for 1965, 1966 and 1967 one after another.
+
+    Each chart is displayed for 2 seconds in interactive mode; every figure
+    except the last is closed before the next is drawn, and the last one is
+    left open with a blocking ``plt.show()``.
+    """
+    # Notebook alternative: interact(plot_year, year=range(1960,1970))
+    # population_info()
+    year_list = [1965, 1966, 1967]
+    final_year = year_list[-1]
+    plt.ion()
+    for _year in year_list:
+        plot_year(_year)
+        plt.pause(2)
+        if _year == final_year:
+            continue  # keep the last figure open
+        plt.close()
+    plt.ioff()
+    plt.show()
+
+def plot_matrix_func():
+    """Show a scatter matrix of four indicators for the year 2015.
+
+    The plotted columns are ``log10_gdp_per_day`` (base-10 logarithm of
+    ``gdp_per_day``, computed here), ``life_expectancy``, ``age5_surviving``
+    and ``babies_per_woman``.
+    """
+    columns = ['log10_gdp_per_day','life_expectancy','age5_surviving','babies_per_woman']
+    # Work on a re-indexed copy. Re-indexing and extending the module-level
+    # `gap_minder` in place removed the `year` column that plot_year() and
+    # population_info() rely on, and made a second call to this function fail.
+    by_year = gap_minder.set_index('year')
+    by_year['log10_gdp_per_day'] = np.log10(by_year['gdp_per_day'])
+    data = by_year.loc[2015, columns]
+    pandas.plotting.scatter_matrix(data, figsize=(9,9))
+    plt.show()
+
+# Script entry point: show the 2015 scatter matrix.
+if __name__ == '__main__':
+    plot_matrix_func()
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+