-
Notifications
You must be signed in to change notification settings - Fork 1
/
functions1.py
92 lines (76 loc) · 3.45 KB
/
functions1.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
import scipy.stats as st
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from collections import Counter
def generate_incomes(n, pj):
# On génère les revenus des parents (exprimés en logs) selon une loi normale.
# La moyenne et variance n'ont aucune incidence sur le résultat final (ie. sur le caclul de la classe de revenu)
ln_y_parent = st.norm(0,1).rvs(size=n)
# Génération d'une réalisation du terme d'erreur epsilon
residues = st.norm(0,1).rvs(size=n)
return np.exp(pj*ln_y_parent + residues), np.exp(ln_y_parent)
def quantiles(l, nb_quantiles):
size = len(l)
l_sorted = l.copy()
l_sorted = l_sorted.sort_values()
quantiles = np.round(np.arange(1, nb_quantiles+1, nb_quantiles/size) -0.5 +1./size)
q_dict = {a:int(b) for a,b in zip(l_sorted,quantiles)}
return pd.Series([q_dict[e] for e in l])
def compute_quantiles(y_child, y_parents, nb_quantiles):
y_child = pd.Series(y_child)
y_parents = pd.Series(y_parents)
c_i_child = quantiles(y_child, nb_quantiles)
c_i_parent = quantiles(y_parents, nb_quantiles)
sample = pd.concat([y_child, y_parents, c_i_child, c_i_parent], axis=1)
sample.columns = ["y_child", "y_parents", "c_i_child","c_i_parent"]
return sample
def distribution(counts, nb_quantiles):
distrib = []
total = counts["counts"].sum()
if total == 0 :
return [0] * nb_quantiles
for q_p in range(1, nb_quantiles+1):
subset = counts[counts.c_i_parent == q_p]
if len(subset):
nb = subset["counts"].values[0]
distrib += [nb / total]
else:
distrib += [0]
return distrib
def conditional_distributions(sample, nb_quantiles):
counts = sample.groupby(["c_i_child","c_i_parent"]).apply(len)
counts = counts.reset_index()
counts.columns = ["c_i_child","c_i_parent","counts"]
mat = []
for child_quantile in np.arange(nb_quantiles)+1:
subset = counts[counts.c_i_child == child_quantile]
mat += [distribution(subset, nb_quantiles)]
return np.array(mat)
def plot_conditional_distributions(p, cd, nb_quantiles):
plt.figure()
# La ligne suivante sert à afficher un graphique en "stack bars", sur ce modèle : https://matplotlib.org/gallery/lines_bars_and_markers/bar_stacked.html
cumul = np.array([0] * nb_quantiles)
for i, child_quantile in enumerate(cd):
plt.bar(np.arange(nb_quantiles)+1, child_quantile, bottom=cumul, width=0.95, label = str(i+1) +"e")
cumul = cumul + np.array(child_quantile)
plt.axis([.5, nb_quantiles*1.3 ,0 ,1])
plt.title("p=" + str(p))
plt.legend()
plt.xlabel("quantile parents")
plt.ylabel("probabilité du quantile enfant")
plt.show()
def proba_cond(c_i_parent, c_i_child, mat):
return mat[c_i_child, c_i_parent]
"""pj = 0.9 # coefficient d'élasticité du pays j
nb_quantiles = 100 # nombre de quantiles (nombre de classes de revenu)
n = 1000*nb_quantiles # taille de l'échantillon
y_child, y_parents = generate_incomes(n, pj)
sample = compute_quantiles(y_child, y_parents, nb_quantiles)
cd = conditional_distributions(sample, nb_quantiles)
#plot_conditional_distributions(pj, cd, nb_quantiles) # Cette instruction prendra du temps si nb_quantiles > 10
print(cd)
c_i_child = 5
c_i_parent = 8
p = proba_cond(c_i_parent, c_i_child, cd)
print("\nP(c_i_parent = {} | c_i_child = {}, pj = {}) = {}".format(c_i_parent, c_i_child, pj, p))"""