From 186888ea5327265e6426a6fa0af833a49cd15f05 Mon Sep 17 00:00:00 2001 From: Linagyu Min Date: Mon, 24 Sep 2018 22:16:38 +0800 Subject: [PATCH 1/6] plotting_two_quantiative_minux --- enginerring_01_two_quantiative_variables.py | 36 +++++++++++++++++++++ 1 file changed, 36 insertions(+) create mode 100644 enginerring_01_two_quantiative_variables.py diff --git a/enginerring_01_two_quantiative_variables.py b/enginerring_01_two_quantiative_variables.py new file mode 100644 index 0000000..38b020b --- /dev/null +++ b/enginerring_01_two_quantiative_variables.py @@ -0,0 +1,36 @@ +#coding:utf-8 +""" +------------------------------------------------ +@File Name : enginerring_01_two_quantiative_variables +@Function : +@Author : Minux +@Date : 2018/9/17 +@Revised Date : 2018/9/17 +------------------------------------------------ +""" +import numpy as np +import scipy.stats +import pandas as pd +import matplotlib.pyplot as plt +import re +import mailbox +import csv + +gapminder = pd.read_csv('gapminder.csv') +# print(gapminder.info()) +italy = gapminder.query('country == "Italy"') +# italy.plot.scatter('year','population') + +# gapminder.query('country == "India"').plot.scatter('year','population',label='India') + +# italy.plot.scatter('year','gdp_per_day',logy=True) + +# italy.plot.scatter('gdp_per_day','life_expectancy',logx=True) + +size = np.where(italy.year%10==0,32,2) +data = gapminder.query('(country == "Italy") or (country == "United States")') +color = np.where(data.country == 'Italy','blue','orange') +data.plot.scatter('gdp_per_day','life_expectancy',logx=True,c=color,s=size) + +plt.legend() +plt.show() \ No newline at end of file From 43554f041c377bc33734f59cee55c77697a7009f Mon Sep 17 00:00:00 2001 From: Linagyu Min Date: Mon, 24 Sep 2018 23:31:51 +0800 Subject: [PATCH 2/6] plot_matrix --- enginerring_02_graph_visualization.py | 99 +++++++++++++++++++++++++++ 1 file changed, 99 insertions(+) create mode 100644 enginerring_02_graph_visualization.py diff --git a/enginerring_02_graph_visualization.py b/enginerring_02_graph_visualization.py new file mode 100644 index 0000000..cfb1e0d --- /dev/null +++ b/enginerring_02_graph_visualization.py @@ -0,0 +1,99 @@ +#coding:utf-8 +""" +------------------------------------------------ +@File Name : enginerring_02_graph_visualization +@Function : +@Author : Minux +@Date : 2018/9/24 +@Revised Date : 2018/9/24 +------------------------------------------------ +""" +import numpy as np +import scipy.stats +import pandas as pd +import matplotlib.pyplot as plt +import matplotlib +from matplotlib import style +style.use('ggplot') +import pandas.plotting + +from IPython import display +from ipywidgets import interact,widgets + +import re +import mailbox +import csv + + +gap_minder = pd.read_csv('gapminder.csv') + +def plot_year(year,info_tag=False): + data = gap_minder[gap_minder.year==year].sort_values('population', ascending=False) + + if info_tag: + population_info() + + color = data.age5_surviving + area = 3e-6*data.population + + # customize edge color + edgecolor = data.region.map({'Africa':'skyblue','Europe':'gold','America':'palegreen','Asia':'coral'}) + + # plt.cla() + data.plot.scatter('gdp_per_day','life_expectancy',logx=True,s=area,c=color, + colormap=matplotlib.cm.get_cmap('Purples_r'),vmin=55,vmax=100,linewidths=1,edgecolors=edgecolor, + sharex=False, figsize=(10,7)) + + for level in [4, 16, 64]: + plt.axvline(level, linestyle=':', color='k') + + plt.axis(xmin=1, xmax=500, ymin=30, ymax=100) + plt.title('GDP-LIFE-EXPECTANCY_{}'.format(year)) + plt.xlabel('$gdp-per-day$') + plt.ylabel('$life-expectancy$') + + +def population_info(): + res = gap_minder[gap_minder.year==2015].groupby('region').population.sum() + print(res) + +def dynamic_plotting_func(): + # interact(plot_year, year=range(1960,1970)) + # population_info() + year_list = [1965, 1966, 1967] + plt.ion() + for _year in year_list: + plot_year(_year) + plt.pause(2) + if _year != year_list[-1]: + plt.close() + plt.ioff() + plt.show() + +def plot_matrix_func(): + gap_minder.set_index('year',inplace=True) + gap_minder['log10_gdp_per_day'] = np.log10(gap_minder['gdp_per_day']) + data = gap_minder.loc[2015,['log10_gdp_per_day','life_expectancy','age5_surviving','babies_per_woman']] + pandas.plotting.scatter_matrix(data, figsize=(9,9)) + plt.show() + +if __name__ == '__main__': + plot_matrix_func() + + + + + + + + + + + + + + + + + + From bfcba8d65fc3e64bbf36dc2af886f66b83dcaf78 Mon Sep 17 00:00:00 2001 From: Linagyu Min Date: Mon, 24 Sep 2018 23:35:17 +0800 Subject: [PATCH 3/6] minux_plot_matrix From 4b72692884b8238cb6265161c02cc7790171dfc5 Mon Sep 17 00:00:00 2001 From: Linagyu Min Date: Fri, 12 Oct 2018 12:29:36 +0800 Subject: [PATCH 4/6] bootstrap samples in statistics --- bootstrap_sample.png | Bin 0 -> 12371 bytes engineering_04_boostraping_match.py | 77 +++++++++++++++++++++ grades.csv | 101 ++++++++++++++++++++++++++++ 3 files changed, 178 insertions(+) create mode 100644 bootstrap_sample.png create mode 100644 engineering_04_boostraping_match.py create mode 100644 grades.csv diff --git a/bootstrap_sample.png b/bootstrap_sample.png new file mode 100644 index 0000000000000000000000000000000000000000..51c754970e562dea038cbd4e99bba0b4b8c83660 GIT binary patch literal 12371 zcmeHt2{@E}+xMj?EmF56WGPD_w2&fcvy83m*+~+kA#0XF>P{PKY(p{@fPKRTJK_$A`5AG><1ZC~LQY;w|gxM##}>u8_X zcAepJa{czvM>`cOyxy17`ntk(v7<6raru5D+l(=!84#p$|JsxUd@1Cf zBEpCu$96L8h9{3Po`XH><1b+L;5pO6y@= z$12|+<&kq!7ZDMe>}>jCWyx5{yF@4FghfW$xpN{&9Ji>LnCk~7jxN)O;>>>XyScf= z(p9>n)!Cv+w$I%a)H~|_Qpeh3 zA`$+bdk}`ahZP*zgfbR4gSq=7og1@VoB|D9O3aLYW)~60ub#tAz5M+J>Re7~cvm>a z?$sG_ocyM{cVicne6WGAfyde5FNOV1X`xQV_#Be%M%B=8EJkbdR0b3BaX-gB*x~yh zBFqS4v40akg7{f7`@vHW4IB| zP=(Jxl}6+)67UP4O2?y>Rm8 zm#c<`uT<64TqoN^tZi&Eh{OR06CWR9dV0FIxn%hQfi&Nmtf8Z*_$%yNJrk46Cr`GI zjEpSD)A$1i%PXB>IF7N@lC+<8}5S108*bq&+<-nu?CEG((qizX^BZ z)$%#yaQ^k*wila7Sl_sDi$ug56*<1yzJ2?!<7qlh0Cu(gUKfU0m;C&ELgI(}TUQ#) z^X`M<*3F<`vkX>@Yp=t)G4$8p01=Ayfx)rz|bqy**H7I59DSz96jm z`bb3m2zN+GNJ6pm*yOhl_hCh_25MGT30&OV_*Pd}SAt+pr&vyBuNQZ}T94iK>Yk5= zDvRaCNmPBi262*a+7s__FWR?LyHXeufyxSTgJVbz83Pc`h{Dk#l@Xh z*jR1~d+}m0%9qyfE+#Infh#mFb(?PKD|V&^`>tL}N=hOK2G=Bf`t-?-tfsCWS~K(` zuz2?4jcU>p$pJA-f>Q=sVuZn+9n7RyPI}n-t%%#^?-x z|4k$pQ`2Z|9i6b~=&YimA_<%+Y2LG3R9w6rHplRA$2A5v0-iP%8a`Gqpxha4YtExV zZ8@C9>g@DBDV1TYEL9xO8cys!B1oehI}cDdy))*s|0(3$>TH!KIjb=5&XAI+;|C_2 z175$e8`Ms?*lnz@m3)69HtbePZIB`5anq&?N%Ww(;(i>dx$;zddpp7B^e=a1;eNW; z6$Q3V?3^^Ow8)V(`Bj%BK*xw5Hm>M3%%xUg<)RyL?Z12_ZdD`spI4ctSUw=Sv&!%I znBga0-*SmnK2c`i?j_%qUXtutcCV8DQulf?RrJgmLjt-*dwPs+_ZBqB+$48orDtWe zFZ~$8Tb;u7r6wl}lU@=Cza%9l=E+I#M3CK@P-=}Yk6#iq$ZjL&RAfT~@i@y1kLc5P zE)o{HZDJ?h)$q11PWQUbe!EaU-zZly6eK5Q*LFnX<*^F{DeommQ{OdeT~m_*R4UX; zXl!gwnobH@BHHAdPempY8nNe0(Op#HN|y3n_s&MPAc&>p_@-|JAvIlN;|xrQu*jJ+ z-L(9h4O|#)UERbXFiuZA|@p&I3j-i2v6C`+q<251Jcnuzl?86Ln%a zy7u;&zp$~n){7V3q70P#SOI!bXRBDTwa*W79kcMm5tnb`mq#LetakCpCPv{(Fq^mR z{QCUhB`N1ob!lno<-qQ@miLu{d9%jHZ<)F@D|-I;jbp`o1x`G*&xOIU>LE+yajS>0 z33ZK((sfcaMPQpysZ`?e+?x&P_RI66+THHS!_)-xb+)DHwDHkPo$)0 zX6mY{J`tRFWfRl9Fwsi=#CGL~goLIfaVh@cE}nG7r~5F3hpc?r)4he*5B^LgYm2>` zOPk@;41dvD)ChI;^&?bIz`1Ge5NLYOp6x_i$eA-|&=zvy#0hWcEly5OZCzcdb8}7_ zF=q{p3+#UwEBC&PF!wPu%Q4VQ$u$)Z3=Fihw_nicY;VU9fCHA9IZ$r9NZMwI?8ok~ zIoM4b-Ir$WLUxxUFM5tF$S(RF(JJk{fn2K6Ow&4?yk{)kyb7$ zDGTzW$uraU?YxNdWo26^_u}H>nv(e7c7X@s^Bg~jpZk2cC(pLku*N7{D4=REYW>HM zGmn*_eoDP6+T~Eph|12$(6O^if1jJn&cWf{!L_-8Oij%z^znMn-aZl$(*X6bgH%<+zrHTrVpri5@)orYh1z zBElqlQCaY{rp?ujO*)9oTgP5LzMPer(kr<(O&u$giX4z4 z;|3Mu8RP5`iXyguV>-|Gf|exA7mDZ26bm-dbhyOu%&K_7D;uJz_40`gsx*)bU^&o6A_V`((H=t{HftH(5nZtrb3$ zzp@%Qu$-CUbn( z$ef)g?|i?EU;R;In!N}5Na(*_A9^V~m$Prn{$w`xUx zK&1bqviYBElrQ6U`fbVF9*y#8x&jnsS3ng7+gEhwzucCn%Zotu zyF`v`w!q`dm4XhWjm7#BTaSYtFaN1iC}s0Or1^iQ2Th&pKLsmWst6ii*~!^CEGo*Y z`nW`d8m_RkYRnFcrOwq$lJh&wT3tI$4`YDRBmu{v0$mb0d{`N3m%h+?MbFSM4ZbX< zhW6)%Fa4+gR0$aalwOe)*dd@ zwT8$2R@YlgS;udF>$3q0n%OiO%aO zR8vg*9KIGEE8=Tof05+%kofB@q>P+h>K%OY#0eC&Sh!EAJ5;O{;7X?pG$V44s=+On zSsSW}4qSd0AAbnGTu%QgH0IwveR8p8+XR^3HuKA~HcOJSMA*6PP+iVh&x;rSrr%H~ ztLj2O+^Z%2z1tk*M6N`;VP2}!@?8Im6>i4@usjDyn&NVkAcN8Yk<$;Dm=1Gvsz zyk}EU3U=w?&gQJb0Z|Ni3Y~abe@b_*jo8717w&ImMSDc(t5*}HqNjoT2`D|gYGxL* zzL26YQA#1UqA$kA@`WnTBaCl;cSaudhvm8d?&;iS0AJ;z+SM(UDA}4D{!E-GXw9be zIR}RyHH?pzbDwNCFmG>8P!5Az3(_Ck5Gnm6Fpw%fKb6}gdg6q(`+!G^=IhgMusP-W zV)~g^FJFEjD01k~MRd>cde2@!71P=ZtrG}Hp2Y|MO59PuucwZM`7!YiGg>3BrECdD z>n9hMspQUBSDwGuXt0HLV9GG-9b7r%2e2^C`#*`L|AeXauY9@wClFj;k(NxqO9+4+ zI5-QVQ4Vx@2gC5~J9dNtqh4QI!2;RnDfcQQy@aX-n@Qx{IkO)>exNJ{w#8cYR9~?U z+hI*zsCn1*>`L4P4`);iD zyJQUkD_jKHhXj5(XeE~Z~Q;v^yw*9y`Xp8C=r)KM%zltV{?6X&|E@B7!mZ{_(4W`t|GA zORd@1*zomUXL(KU4*ojb(Y$D4WMou2!u_{h0*BKg{h?Q)!2(m9%0G9ROyQF1<)w+q#mxjcHFqU?hEQh)Py#J1{6av=sEVVnYR?$^7)V~&%Dj6QWL0X-hv?J{~3w^2YzD zGNy3L+uC%&`vjGlz16r~26g7u)TFKJ_MVAu%$4*rYqllH%O#Bzx0F-Bzg^kB^X6ZA zSVvY?Y&YdqG`o9!rN!9F-M4uU*J5jH>r~p0B7Uk?f*?jiL!$?@b&Thv2u539KOCq< z{_WmVrmpRJgv2c4!t{ayeL!ehpvOY9GDwz0=-`+vmNnBNcp!9a+XH;2zo17bc!Oe%uVuDTt{GVC|%Kn&+&pdWVID zrDbNSFE1}g%DSevw6+T6`=~#IEeKjVAKNJbm##TK)-<-=0@%r-{~Yr-aI&O5=5CN& zdTl$sW{Q;osJ~KtXmY6E1>`@@&dv_Cd&59=o;uZ0_$6I01GOJW?H^@G-npc)?bS6k z8K4HioBsOpxRuncul($L_bx?$N2@{k0w;1>}lUC_GH-@bj@dK_&`bzx#n zOR}$Cy#fJfgc0O)oMfld7!bdc-wr_l`QI8nKDdD#J$f`0O+28ah-2dQ+Lo4YpfcJZ+t7%V z!j!sG7LAoPGy);~KxHx5`_Mcl#+wuF4FCoA^V{(mIf98Wb$j(wEu zgg!!TR_6OW?S`!j6_ItDdg<^5sz4Vi<@@*RXoHB4zX{4Y=g*@`!Cg^FiOxgC zS{a7#6E{jEZLHHurVDxnF*#*(PY49U0;GUskldXy@?N@-nY@30B9KQ;pO9zYeZtz> z+7`Sy)hkz6#|vPg7`1D+R03%cgPsGr!>T5zeRX->wONs40(jco-LrL#jf;V<#U_z} z>!%zBGC}K`gR`UshOFc&rlsi45c-vYTeBey4}z;!=x%Eb;#nwSJ(75>`vmhGn4pFR z(kLpp`CHdJE9zuM&SZ<4WII$_LJgm<37}5rOn>RfkGq0k5n+7l1%-qro=Os*qryQ* zSh&F00d$MQa*z}Vk$3N!QN%oqne!}%hx|bHKnE%qI&5I0_%vscZuj5>RtAj!)C;^+ zNLSdTuR5`W4|o?{F|Y2GG^+E~95BFz8EM9>_&4qs*`R2zF5?YM_FMTI;o7ZIn+jlZY#gOG?WgZpggTl zAfW1vVeU=a_0{E)VS<{=XQ7B&@bACZU& z-|dW>gR05DFyYuCCaUP5Sy9jOZ|H-WtV{OpcyIW$y` z^ZIjW{ihS=|3|$8HJT8_^Ls|ZVO-a_mSuSU{^qi0WHn@uCgDUTH!)ah(+N#69?>A|KI-SQ2k1CZj-^ylzAN z!K-hxL3Vtx?dKINyb6+NDf)8fyhCXsDKq=)4DXinY1Bi37dFRxmc<0^?>8Qk5T;$Vr!?ah zW%gXD7&wUIdyr<=zh}%LjF3&0WDlxfmr8HHk{Z@7&9IdezQFa;4qUt6Uv#KV(ykkE zGXPVnmx?o&gn$C(Da?f8&X^>YJ}G+R&pEfGCntMcykiI;;}vhRyU4=p9J%K%*MEv`^8Hg&R8XK_A=jN z#=?R9d?J_FtY7nyMn+0pMV&0{u$9|8O1G82AQ^GPK6!@jld^khj7KY#{%Qm%#g74D zcE(fhob9s~bb(0_gjuJ*ytTIYyM(}-r|`K8*@yYsyia*wb-qiUiB@W4a_8K3yLxLS znvc_4W7iYY(?w0)x~+}!ZoSAR((Ov?Hz64Ljkr?hH zOP`P3$lS&zJf0L`rj5g8pIckdVd0T&o0&^Y2$NrVojDrbL5YdhzAR}*VatpiV14i= z#^>sM@i)8)5A$e+^YLvhNMXMo|0hl!ghRq474_9O)+rkj81xD{a?Ept)WTC!Q)>sl z{~r!jQW5lLVyr#7uS zx41uJ@dGs=Dk<415PbH#i~{&5Q8ErQj;{dH!R(_aW3gBv3<*z*M#G_fOAQQZ>!B`H z8(~|_+p_fTMj5xg%}`awnF`oA@G^uZuHR1_-9MqS+lNrm0Q&>-7?VBoXo2@^#@2hOzHLLC1~ z`7>W;w)?AA>$n;C#c4}*&urh6EJ<5~BZ{0G_#!~A2Jr|sqC?O7Ot<--KUsL;;rqat za#vMR8F%57gV^j}bj`1ogwJpK&HY;}th8(GA$e)3TZMtc5!|c!(E{ z2f6Zf=Fk)Yyxs>77#m_0%y;ZN`t>o};ROgXZHH@LxJ7he4;?xrW!oaCsKgab-5f;h zz5mA};qZa*&z}G)w-vm1@XyH4hB~rMGxi85fyZDA ziI7nt&eTtI=asP*8frWYI_9m&dd&4v?(9f|Ycpl-$I&<={!R?MS@-tRxE)Dh>D&EX zG}^`*X@kY0eQ+=%huD(iLlqMe6oe2_2~BpuNogwDxpKC|A6s*&5B$W$B(;c18- zzh0tL9fI8$2iZf1;#=Q=9rBe$rY{ZzDjNGj^!En!|G-=I_!;18;kw3ENxf73rJaNW9*tUJysPeS@$cRR>6yi!YgvnHhv!Z`A>P*d#i*B-Dmh)LHnjx;>#ElEogB8}7 zIBO(~@>EdI2%<}&{DIE9kbt9zus#=2!5H87wmFQbCYzF=ce=E;w$^!O2xxOE=xe*? zctt`YQ~?$o9iz7v3>*91v<&D2!pDyv7sQ2dW}-F#IM8;>b0b6#4zs(1oDr6>$_96~ zpMKB)vm!8hl8{|GQ<59h57{#018bu&u5oLAv;nVY)-|b0O6sUQZQpeaos2@PV$+LBBdrKCaO_{oKGL{X$`l| zj|ymXO`*URq^S zA)wwRufko5FPdyeYHe)|i;0 z`)6*M!{k}%#_@RP{)v?dShy6ioHBxdpS+YsB?41AZap=Pi!}H&R3Bsfr?)NzME4t@ zl%^=Ap-&uc*oW4@<7jgv|bx5Btl5Y=Bsy>V0@tsbgk^P72Gv2Xog*6H)~zLCTmQR zdv8^stFxEH z4juaF33CHk@?MMAuAEy=ge9>BDDQ?UK5b0ev13OT2yt?5)45#O=$8FnlydzWHxlz* z$@+42EQXMO7o1!AwwVkzZy}7PWWr7|diU;K2XK4w`uCa2ydIxWb2o5fb-pR<<;(r# z!iu#l>O@Mckz~c{`wYQJ`6vb*S()y6 z@H`JZJFycdT0Fl$KAZ{VW(-q`@4-IJ3==b;SHEpaNV&^u@lw`h{5hSvGKf%M7 zO&cJI>ww3YE!w2m-M26<{M)8}~KG-@{?yKGJsb?=qF628y@ z(A%@l(vVc42KEV}^iAFme*AwBKmWtX2IMcX A$^ZZW literal 0 HcmV?d00001 diff --git a/engineering_04_boostraping_match.py b/engineering_04_boostraping_match.py new file mode 100644 index 0000000..268c49c --- /dev/null +++ b/engineering_04_boostraping_match.py @@ -0,0 +1,77 @@ +#coding:utf-8 +""" +------------------------------------------------ +@File Name : engineering_04_boostraping_match +@Function : +@Author : Minux +@Date : 2018/10/12 +@Revised Date : 2018/10/12 +------------------------------------------------ +""" +import math +import io +import numpy as np +import pandas as pd +import matplotlib.pyplot as plt + +import scipy.stats +import scipy.optimize +import scipy.spatial + + +def generate_random_grades(): + df_grades = pd.DataFrame() + df_grades['grades'] = np.random.random_sample(100)*10 + df_grades.to_csv('grades.csv', index=False) + +pop = pd.read_csv(r'./grades.csv') +def data_peek(): + print(pop.head()) + print(pop.describe()) + pop.grades.hist(histtype='step') + plt.show() + + +def bootstrap_sample(stats_flag=True): + bootstrap = pd.DataFrame({'sample_mean':[pop.sample(100, replace=True).grades.mean() for _ in range(10000)]}) + # print(bootstrap.head(10)) + if stats_flag: + print('quantile(0.025) is {}'.format(bootstrap.sample_mean.quantile(0.025))) + print('quantile(0.975) is {}'.format(bootstrap.sample_mean.quantile(0.975))) + else: + bootstrap.sample_mean.hist(histtype='step') + plt.axvline(pop.grades.mean(), color='C1') + plt.savefig('bootstrap_sample.png') + plt.show() +n1 = scipy.stats.norm(7.5,1) +n2 = scipy.stats.norm(4,1) + +def bimodal_distribution(): + x = np.linspace(0, 10, 100) + plt.plot(x, 0.5*n1.pdf(x)+0.5*n2.pdf(x)) + plt.show() + +def draw(): + while True: + v = n1.rvs() if np.random.rand() < 0.5 else n2.rvs() + if 0<=v<=10: + return v + +def data_set(n=100): + return pd.DataFrame({'grade':[draw() for _ in range(n)]}) + +def plot_sample_distribution(): + mean = pd.DataFrame({'mean_grade':[data_set().grade.mean() for _ in range(1000)]}) + mean.mean_grade.hist(histtype='step') + bootstrap = pd.DataFrame({'sample_mean': [pop.sample(100, replace=True).grades.mean() for _ in range(1000)]}) + bootstrap.sample_mean.hist(histtype='step') + plt.show() + +if __name__ == '__main__': + # generate_random_grades() + # data_peek() + # bootstrap_sample() + # bimodal_distribution() + plot_sample_distribution() + + diff --git a/grades.csv b/grades.csv new file mode 100644 index 0000000..9cc8978 --- /dev/null +++ b/grades.csv @@ -0,0 +1,101 @@ +grades +2.885297375052974 +3.3787232652921926 +0.5157506783237786 +7.530575169035667 +9.180303136325387 +3.247731687659651 +4.950489892885538 +8.0015009035261 +2.6078622645074114 +6.009878816172289 +4.186469107435789 +6.682784643085828 +0.8751624089495602 +4.716497971747977 +1.4581406336496172 +3.19486133862394 +9.427085671165333 +6.284416630262431 +8.824918578038467 +7.890313682109427 +4.871002855810409 +0.6317857270649152 +4.991058184260214 +7.983451855523915 +9.962823526017296 +4.982658322178077 +6.670156683906837 +0.651839509549671 +2.3272059025484704 +5.699657540517821 +2.939261511228751 +7.686950454425226 +2.115272151014879 +4.431181488103318 +8.309128004682204 +3.6800550442415525 +9.925188218897102 +6.001069249897107 +8.825709702197125 +3.384629235908936 +0.5609961113278883 +1.819072372676147 +1.5203215232741374 +4.674224044344715 +5.662602199916055 +8.262112284506735 +0.09169696831957563 +1.7688617708731935 +2.3198470228685353 +0.8119208147379753 +7.38967923597217 +7.740937716712329 +6.02697965034129 +6.795766719596736 +6.737286587003727 +7.2815132151327155 +8.762215432029238 +1.989381334869742 +2.3957518451893645 +6.728593713620739 +3.741800153436029 +8.511174741265508 +3.1032573025601184 +7.372005794420275 +5.9909522992686535 +5.5695898456772355 +7.937879929877528 +6.5764800298040305 +0.565865516090116 +9.492054741000244 +0.3263210544678852 +2.2110798568334857 +1.7138778687188116 +6.459562568464453 +9.932064685278712 +4.013709190046729 +6.228670230996718 +0.4991353566920398 +3.661144528400065 +5.307086225597097 +8.286759912020758 +2.2077337939598807 +8.679519892820608 +9.7609243620153 +4.005484443419254 +1.8049505749718686 +6.051765899190481 +5.478650795676222 +4.099844770479421 +6.1098960622379215 +5.686654604592347 +7.229974729598268 +4.082606494697469 +0.2635738068212734 +5.92285729972296 +1.6047087942795302 +6.139898193153334 +0.6778877437965691 +7.886811347946667 +0.16635694961872893 From 5c6f824bf3f36d9da365b0c3683eedfe9821ea3e Mon Sep 17 00:00:00 2001 From: Linagyu Min Date: Tue, 16 Oct 2018 22:04:43 +0800 Subject: [PATCH 5/6] hypothesis_testing_and_p_value_Minux --- TEW_06_hypothesis_testing_p_value_CI.py | 55 +++++++++++++++++++++++++ 1 file changed, 55 insertions(+) create mode 100644 TEW_06_hypothesis_testing_p_value_CI.py diff --git a/TEW_06_hypothesis_testing_p_value_CI.py b/TEW_06_hypothesis_testing_p_value_CI.py new file mode 100644 index 0000000..6c49781 --- /dev/null +++ b/TEW_06_hypothesis_testing_p_value_CI.py @@ -0,0 +1,55 @@ +#coding:utf-8 +""" +------------------------------------------------ +@File Name : TEW_06_hypothesis_testing_p_value_CI +@Function : +@Author : Minux +@Date : 2018/10/16 +@Revised Date : 2018/10/16 +------------------------------------------------ +""" +import math +import io +import numpy as np +import pandas as pd + +import matplotlib.pyplot as plt + +import scipy.stats +import scipy.optimize +import scipy.spatial + +cholera = pd.read_csv('cholera.csv') # 霍乱数据 +pumps = pd.read_csv('pumps.csv') # 水泵数据 + +def Plot_Cholera_func(): + fig = plt.figure(figsize=(10, 10)) + img = plt.imread('london.png') + plt.imshow(img, extent=[-0.38, 0.38, -0.38, 0.38]) + plt.scatter(pumps.x, pumps.y, color='b') + plt.scatter(cholera.x, cholera.y, color='r', s=3) + plt.show() + +def Data_stat_info(): + print(cholera.closest.value_counts()) + print('-'*10,'GroupBy_Closest','-'*10) + print(cholera.groupby('closest').deaths.sum()) + +def simulate(n): + return pd.DataFrame({'closest':np.random.choice([0,1,4,5], size=n, p=[0.65, 0.15, 0.10, 0.10])}) + +def sampling_function(): + sampling = pd.DataFrame({'counts':[simulate(489).closest.value_counts()[0] for _ in range(10000)]}) + # sampling.counts.hist(histtype='step') + # plt.show() + # 计算p-value + # the smaller p-value the more strongly we can reject the null hypothesis + p_value = 100.0 - scipy.stats.percentileofscore(sampling.counts, score=340) + print(p_value) + + +if __name__ == '__main__': + sampling_function() + + + From 32ad0ba9a8b59de376c5c2ba897df8cfb48362fd Mon Sep 17 00:00:00 2001 From: Linagyu Min Date: Tue, 23 Oct 2018 15:37:03 +0800 Subject: [PATCH 6/6] ANOVA --- TEW_07_Anova_Fitting_Models.py | 100 +++++++++++++++++++++++++++++++++ 1 file changed, 100 insertions(+) create mode 100644 TEW_07_Anova_Fitting_Models.py diff --git a/TEW_07_Anova_Fitting_Models.py b/TEW_07_Anova_Fitting_Models.py new file mode 100644 index 0000000..5d8e96b --- /dev/null +++ b/TEW_07_Anova_Fitting_Models.py @@ -0,0 +1,100 @@ +#coding:utf-8 +""" +------------------------------------------------ +@File Name : TEW_07_Anova_Fitting_Models +@Function : +@Author : Minux +@Date : 2018/10/23 +@Revised Date : 2018/10/23 +------------------------------------------------ +""" +import math +import numpy +import pandas as pd +import matplotlib.pyplot as plt +import statsmodels.api as sm +import statsmodels.formula.api as smf +import numpy as np + +gap_minder = pd.read_csv('gapminder.csv') +g_data = gap_minder.query('year==1985') + +size = g_data.population * 1e-6 +colors = g_data.region.map({'Africa':'skyblue','Europe':'gold','America':'palegreen','Asia':'red'}) + +def plot_data(): + g_data.plot.scatter('age5_surviving','babies_per_woman',c=colors, s=size, linewidths=0.5, edgecolor='k', alpha=0.5) + +model = smf.ols(formula='babies_per_woman ~ 1', data=g_data) +grand_mean = model.fit() + +def plot_fit(fit_model): + plot_data() + plt.scatter(g_data.age5_surviving, fit_model.predict(g_data), c=colors, s=30, linewidths=0.5, + edgecolors='k', marker='D') + plt.show() + +# plot_fit(grand_mean) +print(np.char.center('mean', 30, '-')) +'''mean''' +print(grand_mean.params) +print(g_data.babies_per_woman.mean()) + +print(np.char.center('group mean', 30, '-')) + +'''group means''' +group_means = smf.ols(formula='babies_per_woman ~ -1+region', data=g_data).fit() +# plot_fit(group_means) + +print(group_means.params) +print(g_data.groupby('region').babies_per_woman.mean()) + +print(np.char.center('surviving', 30, '-')) +surviving = smf.ols(formula='babies_per_woman ~ -1 + region + age5_surviving', data=g_data).fit() +# plot_fit(surviving) + +'''add intersection term''' +surviving_by_region_population = smf.ols(formula='babies_per_woman ~ -1+region+age5_surviving:region' + '-age5_surviving + population', data=g_data).fit() +# plot_fit(surviving_by_region) +print(surviving_by_region_population.params) + +''' +Measure of Godness of Fit +Mean Squared Error of Residuals +R^2 = (Explained Variance)/(Total Variance) +F-statistics : explanatory power of fit parameters compared to random fit vectors +''' +print(np.char.center('Statistics_Indicator',30,'-')) +def statistics_indicator(*args): + for arg in args: + print(np.char.center(arg,30,'-')) + if arg is 'resid': + for model in [group_means, surviving, surviving_by_region_population]: + print(model.mse_resid) + elif arg is 'rsquared': + for model in [group_means, surviving, surviving_by_region_population]: + print(model.rsquared) + elif arg is 'f_value': + for model in [group_means, surviving, surviving_by_region_population]: + print(model.fvalue) + else: + continue + +statistics_indicator('resid','rsquared','f_value','xx') + +print(surviving.summary()) + +print(sm.stats.anova_lm(group_means)) + + + + + + + + + + + +