Update pandas>2 and networkx>3 dependencies #65

Merged 6 commits on Oct 25, 2023
2 changes: 1 addition & 1 deletion .github/workflows/build.yml
@@ -8,7 +8,7 @@ jobs:
strategy:
fail-fast: false # Don't cancel entire run if one python-version fails
matrix:
python-version: ["3.7", "3.8", "3.9"]
python-version: ["3.7", "3.8", "3.9", "3.10", "3.11"]
name: Build and test on Python ${{ matrix.python-version }}
steps:
- uses: actions/checkout@v3
2 changes: 1 addition & 1 deletion causallib/__init__.py
@@ -1 +1 @@
-__version__ = "0.9.5"
+__version__ = "0.9.6"
2 changes: 1 addition & 1 deletion causallib/contrib/requirements.txt
@@ -1,3 +1,3 @@
-f https://download.pytorch.org/whl/cpu/ # To support cpu torch installation
torch>=1.2.0
-faiss-gpu~=1.7.0
+faiss-cpu~=1.7.0 # Can also use gpu for some Python versions
2 changes: 1 addition & 1 deletion causallib/estimation/matching.py
@@ -679,7 +679,7 @@ def _matches_to_weights_single_matching(self, s, t, match_df):
name = {0: "control", 1: "treatment"}
weights.name = "{s}_to_{t}".format(s=name[s], t=name[t])
s_to_t_matches = match_df.loc[t][self.treatments_ == s].matches
-for source_idx, matches_list in s_to_t_matches.iteritems():
+for source_idx, matches_list in s_to_t_matches.items():
if matches_list:
weights.loc[source_idx] += 1
for match in matches_list:
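pandas 2.0 removed `Series.iteritems()` (deprecated since 1.5); `Series.items()` yields the same `(index, value)` pairs and is a drop-in replacement. A minimal standalone sketch of the migration, using made-up match data rather than causallib's actual structures:

```python
import pandas as pd

# Hypothetical stand-in for the matches column: source index -> list of matched indices
s_to_t_matches = pd.Series({"u1": [0, 2], "u2": [], "u3": [1]})

# pandas < 2.0: s_to_t_matches.iteritems()
# pandas >= 2.0: iteritems() was removed; items() is the drop-in replacement.
for source_idx, matches_list in s_to_t_matches.items():
    if matches_list:
        print(source_idx, "has", len(matches_list), "matches")
```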
41 changes: 27 additions & 14 deletions causallib/simulation/CausalSimulator3.py
@@ -186,7 +186,7 @@ def __init__(self, topology, var_types, prob_categories, link_types, snr, treatm

# Create a graph out of matrix topology:
self.topology = topology
-self.graph_topology = nx.from_numpy_matrix(topology.transpose(), create_using=nx.DiGraph()) # type:nx.DiGraph
+self.graph_topology = nx.from_numpy_array(topology.transpose(), create_using=nx.DiGraph()) # type:nx.DiGraph
self.graph_topology = nx.relabel_nodes(self.graph_topology,
dict(list(zip(list(range(self.m)), self.var_names))))
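networkx 3.0 removed `from_numpy_matrix` (deprecated throughout the 2.x line); `from_numpy_array` accepts the same adjacency array and `create_using` argument. A minimal sketch with a toy 3-node topology:

```python
import networkx as nx
import numpy as np

# Toy adjacency matrix; the transpose follows the simulator's convention
# of building the graph from the transposed topology.
topology = np.array([[0, 0, 0],
                     [1, 0, 0],
                     [1, 1, 0]])

# networkx >= 3.0: from_numpy_matrix is gone; from_numpy_array is the replacement.
graph = nx.from_numpy_array(topology.transpose(), create_using=nx.DiGraph())
print(sorted(graph.edges()))  # [(0, 1), (0, 2), (1, 2)]
```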

@@ -751,13 +751,17 @@ def generate_outcome_col(self, X_parents, link_type, snr, prob_category, outcome
elif outcome_type == SURVIVAL:
if survival_distribution == "expon":
rnd_state = np.random.randint(low=0, high=999999)
-param = survival_baseline * np.exp(x_outcome)
+param = survival_baseline * np.exp(x_outcome.astype(float))
x_outcome = pd.Series(
stats.expon(loc=0.0, scale=(1.0 / param)).rvs(x_outcome.size, random_state=rnd_state),
index=x_outcome.index)
cf = {i: pd.Series(
-stats.expon(loc=0.0, scale=(1 / (survival_baseline * np.exp(cf[i])))).rvs(x_outcome.size,
-random_state=rnd_state),
+stats.expon(
+loc=0.0,
+scale=(1 / (survival_baseline * np.exp(cf[i].astype(float))))).rvs(
+x_outcome.size,
+random_state=rnd_state
+),
index=x_outcome.index)
if has_treatment_parent else cf[i] for i in list(cf.keys())}
# Supplying the random state assures that the resulting outcome and cfs is consistent while sampling rvs
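The `.astype(float)` casts added around `np.exp` here and in the hunks below guard against object-dtype Series: numpy ufuncs raise a TypeError on object-dtype input (each element would need its own `exp` method) instead of computing element-wise, and the simulator's intermediate columns can come out object-dtyped. A sketch of the failure mode and the fix, with toy values and `survival_baseline` assumed to be 1.0:

```python
import numpy as np
import pandas as pd

# An object-dtype Series, as can arise from heterogeneous intermediate frames:
x_outcome = pd.Series([0.1, 0.5, 1.2], dtype=object)

try:
    np.exp(x_outcome)  # TypeError: object elements have no callable exp method
except TypeError as err:
    print("ufunc failed:", err)

# Casting first restores the vectorized float path:
param = 1.0 * np.exp(x_outcome.astype(float))
print(param)
```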
@@ -826,7 +830,7 @@ def generate_censor_col(self, X_parents, link_type, snr, prob_category, outcome_
var_name=var_name)
if survival_distribution == "expon":
# param = survival_baseline * (prob_category.iloc[0]/prob_category.loc[1]) * np.exp(x_signal) # Cox ph
-param = survival_baseline * np.exp(x_signal) # Cox ph model
+param = survival_baseline * np.exp(x_signal.astype(float)) # Cox ph model
survival_distribution = stats.expon(loc=0.0, scale=(1.0 / param))
x_censor = pd.Series(survival_distribution.rvs(size=x_signal.size), index=x_signal.index)
# scale values with censoring proportions - 0 is non censored, 1 is censored:
@@ -941,7 +945,7 @@ def _treatment_logistic_dichotomous(x_continuous, prob_category, params=None):
# compute propensities:
t = x_continuous.quantile(prob_category.iloc[1], interpolation="higher")
slope = params.get("slope", 1.0) if params is not None else 1.0
-cur_propensity = 1.0 / (1 + np.exp(slope * (x_continuous - np.repeat(t, x_continuous.size))))
+cur_propensity = 1.0 / (1 + np.exp(slope * (x_continuous - np.repeat(t, x_continuous.size)).astype(float)))
# assign the propensity values:
propensity.loc[:, columns_names[1]] = cur_propensity
propensity.loc[:, columns_names[0]] = np.ones(cur_propensity.size) - cur_propensity
@@ -968,11 +972,12 @@ def _treatment_odds_ratio(x_continuous, prob_category, snr):
- **propensity** (*pd.DataFrame*): The marginal conditional probability of treatment given covariates.
A DataFrame shaped (num_samples x num_of_possible_treatment_categories).
"""
+x_continuous = x_continuous.astype(float)
index_names = x_continuous.index
columns_names = prob_category.index
propensity = pd.DataFrame(index=index_names, columns=columns_names)
# start with filling up the odds ratio:
-for cur_category, p in prob_category.iteritems():
+for cur_category, p in prob_category.items():
t = x_continuous.quantile(p, interpolation="higher")
cur_propensity = (1.0 / (1 + np.exp((x_continuous - np.repeat(t, x_continuous.size))))) # type: pd.Series
cur_propensity = cur_propensity.div(np.ones_like(cur_propensity) - cur_propensity)
@@ -1012,8 +1017,12 @@ def _treatment_quantile_gauss_fit(x_continuous, prob_category, snr):
columns_names = prob_category.index
propensity = pd.DataFrame(index=index_names, columns=columns_names)
# section the signal into bins based on the probabilities (quantiles)
-bins = pd.qcut(x=x_continuous, q=np.cumsum(pd.Series(0, index=["null"]).append(prob_category)),
-labels=columns_names)
+x_continuous = x_continuous.astype(float)
+bins = pd.qcut(
+x=x_continuous,
+q=np.cumsum(pd.concat([pd.Series(0, index=["null"]), prob_category])),
+labels=columns_names
+)
for cur_category in columns_names:
cur_samples_mask = (bins == cur_category)
cur_samples = x_continuous[cur_samples_mask]
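`Series.append` was removed in pandas 2.0, so prepending the zero quantile before `pd.qcut` now goes through `pd.concat`. A minimal sketch of the same pattern, with hypothetical category probabilities:

```python
import numpy as np
import pandas as pd

x_continuous = pd.Series(np.random.default_rng(0).normal(size=100)).astype(float)
prob_category = pd.Series([0.3, 0.7], index=["low", "high"])  # hypothetical categories

# pandas >= 2.0: Series.append is gone; build the cumulative bin edges with pd.concat:
quantiles = np.cumsum(pd.concat([pd.Series(0, index=["null"]), prob_category]))
bins = pd.qcut(x=x_continuous, q=quantiles, labels=prob_category.index)
print(bins.value_counts())  # roughly a 30/70 split
```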
@@ -1103,8 +1112,10 @@ def _discretize_col(x_col, prob_category, method="empiric", retbins=False, bins=
res = cutoffs.sum(axis="columns")
elif method == "empiric": # discretize according to percentiles from the empirical data itself
try:
-cumulative_ps = pd.Series(0, index=["null"]).append(prob_category).cumsum()
-res, bins = pd.qcut(x=x_col, q=cumulative_ps,
+cumulative_ps = pd.concat(
+[pd.Series(0, index=["null"]), prob_category]
+).cumsum()
+res, bins = pd.qcut(x=x_col.astype(float), q=cumulative_ps,
labels=prob_category.index, retbins=True)
bins = pd.Series(data=bins, index=cumulative_ps.index)
# TODO: maybe noise this a little?
@@ -1541,7 +1552,7 @@ def generate_random_topology(n_covariates, p, n_treatments=1, n_outcomes=1, n_ce
generated_vars = covariates + treatments + outcomes + censoring
generated_vars = pd.Series(data=generated_vars, index=generated_vars)

-total_vars = given_vars.append(generated_vars)
+total_vars = pd.concat([given_vars, generated_vars])
topology = pd.DataFrame(data=0, index=total_vars, columns=total_vars, dtype=bool)

# generate between the independent given set to generated set:
@@ -1577,10 +1588,12 @@ def generate_random_topology(n_covariates, p, n_treatments=1, n_outcomes=1, n_ce
generated_types[treatments] = TREATMENT
generated_types[outcomes] = OUTCOME
generated_types[censoring] = CENSOR
-var_types = pd.Series(data=COVARIATE, index=given_vars).append(generated_types)
+var_types = pd.concat(
+[pd.Series(data=COVARIATE, index=given_vars), generated_types]
+)

# create a hidden variables mask:
-hidden_vars = given_vars.append(pd.Series(covariates)).sample(frac=p_hidden)
+hidden_vars = pd.concat([given_vars, pd.Series(covariates)]).sample(frac=p_hidden)
var_types[hidden_vars] = HIDDEN

return topology, var_types
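The remaining changes in this file are the same `Series.append` to `pd.concat` migration, including before chained calls like `.sample`. A toy sketch of the pattern with hypothetical variable names:

```python
import pandas as pd

given_vars = pd.Series(["x1", "x2"], index=["x1", "x2"])  # hypothetical
generated_vars = pd.Series(["t", "y"], index=["t", "y"])  # hypothetical

# pandas < 2.0: given_vars.append(generated_vars)
# pandas >= 2.0: pd.concat over a list of Series:
total_vars = pd.concat([given_vars, generated_vars])
print(total_vars.tolist())  # ['x1', 'x2', 't', 'y']

# The same replacement composes with downstream calls such as .sample:
hidden_vars = pd.concat([given_vars, generated_vars]).sample(frac=0.5, random_state=0)
print(hidden_vars.index.tolist())
```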
6 changes: 3 additions & 3 deletions causallib/tests/test_causal_simulator3.py
@@ -308,7 +308,7 @@ def test_random_topology_generation(self):
np.testing.assert_array_equal(T.loc[X.columns, :].sum(axis="columns"), np.zeros(5))

# Test for DAGness:
-from networkx import DiGraph, from_numpy_matrix, is_directed_acyclic_graph
+from networkx import DiGraph, from_numpy_array, is_directed_acyclic_graph
NUM_TESTS = 50
for test in range(NUM_TESTS):
n_cov = np.random.randint(low=10, high=100)
@@ -317,7 +317,7 @@
n_cen = np.random.randint(low=0, high=n_tre_out)
T, _ = CS3m.generate_random_topology(n_covariates=n_cov, p=p, n_treatments=n_tre_out, n_outcomes=n_tre_out,
n_censoring=n_cen, given_vars=[], p_hidden=0)
-G = from_numpy_matrix(T.values.transpose(), create_using=DiGraph())
+G = from_numpy_array(T.values.transpose(), create_using=DiGraph())
res = is_directed_acyclic_graph(G)
self.assertTrue(res)

@@ -350,7 +350,7 @@ def test_linear_linking(self):
outcome_types=self.no_X.outcome_types, snr=snr, effect_sizes=self.no_X.effect_sizes)
X, prop, cf = sim.generate_data(num_samples=self.NUM_SAMPLES)

-singular_values = np.linalg.svd(X.values, compute_uv=False)
+singular_values = np.linalg.svd(X.astype(float).values, compute_uv=False)
eps = 1e-10
rank = np.sum(singular_values > eps)
self.assertEqual(rank, 2,
3 changes: 2 additions & 1 deletion causallib/tests/test_overlap_weights.py
@@ -80,6 +80,7 @@ def test_ow_weights_reversed_to_propensity(self):
propensity = pd.DataFrame(propensity)
ow_weights = self.estimator.compute_weight_matrix(self.data_r_100["X"], self.data_r_100["a"],
clip_min=None, clip_max=None)
+propensity.columns = propensity.columns.astype(ow_weights.columns.dtype) # Avoid column dtype assert
pd.testing.assert_series_equal(propensity.loc[:, 0], ow_weights.loc[:, 1], check_names=False)
pd.testing.assert_series_equal(propensity.loc[:, 1], ow_weights.loc[:, 0], check_names=False)
-pd.testing.assert_index_equal(propensity.columns, ow_weights.columns)
+pd.testing.assert_index_equal(propensity.columns, ow_weights.columns)
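The column-dtype workarounds in these tests handle column Indexes that come out with different dtypes under pandas 2 (e.g., int64 versus object): `pd.testing.assert_index_equal` (and the frame/series variants) fail on a dtype mismatch even when the labels match. A minimal sketch of the failure and the alignment fix:

```python
import pandas as pd

left = pd.DataFrame({0: [0.1], 1: [0.9]})     # integer column labels -> int64 Index
right = left.copy()
right.columns = right.columns.astype(object)  # same labels, object dtype

try:
    pd.testing.assert_index_equal(left.columns, right.columns)
except AssertionError as err:
    print("dtype mismatch:", err)

# Aligning the dtype first, as the tests do, makes the assertion pass:
right.columns = right.columns.astype(left.columns.dtype)
pd.testing.assert_index_equal(left.columns, right.columns)
```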
3 changes: 3 additions & 0 deletions causallib/tests/test_survival.py
@@ -635,6 +635,9 @@ def test_marginal_kaplan_meier_curves(self):
1: lifelines_km_a1.predict(sorted(self.t.unique()))})
marginal_curves_lifelines.columns.name = 'a'
marginal_curves_lifelines.index.name = 't'
+marginal_curves_lifelines.columns = (
+marginal_curves_lifelines.columns.astype(marginal_curves_causallib.columns.dtype)
+) # Avoid column dtype assert

pd.testing.assert_frame_equal(marginal_curves_causallib, marginal_curves_causallib_lifelines)
pd.testing.assert_frame_equal(marginal_curves_causallib, marginal_curves_lifelines)
4 changes: 2 additions & 2 deletions requirements.txt
@@ -1,7 +1,7 @@
-pandas>=0.25.2,<2
+pandas>=0.25.2,<3
scipy>=0.19,<2
statsmodels>=0.9,<1
-networkx>=1.1,<3
+networkx>=1.1,<4
numpy>=1.13,<2
scikit-learn>=0.20,<1.2
matplotlib>=2.2,<4