Skip to content

Commit 9e4d5ab

Browse files
authored
Linear Regression for data read from a csv file
Linear Regression for a dataset read from a csv file
1 parent 62f2dd2 commit 9e4d5ab

File tree

1 file changed

+104
-0
lines changed

1 file changed

+104
-0
lines changed

linear_regression_01.py

Lines changed: 104 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,104 @@
1+
#!/usr/bin/env python3
2+
#
3+
# Linear Regression for a dataset read from a csv file. We applied closed form
4+
# equations to determine theta1 and theta0 (James et al., 2021). We have two
5+
# datasets: CASF-2016 Ki (de Azevedo et al., 2024) and Amsterdam Apartments
6+
# dataset (Wolf, 2022).
7+
# This code is for linear models as expressed by the following equation:
8+
# y = theta0 + theta1*X, where theta0 is the fit intercept and theta1 the
9+
# coefficient; theta0 and theta1 are the model’s parameters (Géron, 2023).
10+
#
11+
# References
12+
# de Azevedo WF Jr, Quiroga R, Villarreal MA, da Silveira NJF,
13+
# Bitencourt-Ferreira G, da Silva AD, Veit-Acosta M, Oliveira PR, Tutone M,
14+
# Biziukova N, Poroikov V, Tarasova O, Baud S. SAnDReS 2.0: Development of
15+
# machine-learning models to explore the scoring function space. J Comput Chem.
16+
# 2024 Jun 20. doi: 10.1002/jcc.27449. Epub ahead of print. PMID: 38900052.
17+
#
18+
# Géron, Aurélien. 2023. Hands-on Machine Learning with Scikit-Learn, Keras,
19+
# and TensorFlow: Concepts, Tools, and Techniques to Build Intelligent Systems.
20+
# 3rd ed. CA 95472: O’Reilly.
21+
#
22+
# James, G., Witten, D., Hastie, T., & Tibshirani, R. 2021. An introduction to
23+
# statistical learning: With applications in R (2nd ed.). Springer.
24+
#
25+
# Wolf, Andrew. The Machine Learning Simplified: A Gentle Introduction to
26+
# Supervised Learning. Andrew Wolf. Kindle Edition.
27+
#
28+
################################################################################
29+
# Dr. Walter F. de Azevedo, Jr. #
30+
# https://github.com/azevedolab #
31+
# July 20, 2024 #
32+
################################################################################
33+
#
34+
# Import section
35+
import numpy as np
36+
import reg2D_model
37+
import csv
38+
from scipy.stats import pearsonr, spearmanr
39+
40+
# Read csv file
# CASF-2016 dataset
# de Azevedo WF Jr, Quiroga R, Villarreal MA, da Silveira NJF,
# Bitencourt-Ferreira G, da Silva AD, Veit-Acosta M, Oliveira PR, Tutone M,
# Biziukova N, Poroikov V, Tarasova O, Baud S. SAnDReS 2.0: Development of
# machine-learning models to explore the scoring function space. J Comput Chem.
# 2024 Jun 20. doi: 10.1002/jcc.27449. Epub ahead of print. PMID: 38900052.
file_in = "CASF-2016_Ki_training.csv"

# Amsterdam Apartments dataset (Amsterdam)
# Wolf, Andrew. The Machine Learning Simplified: A Gentle Introduction to
# Supervised Learning. Andrew Wolf. Kindle Edition.
#file_in = "Amsterdam_apartments_training_set.csv"

# Get header from a csv file
# Only the first row is needed. The context manager closes the file even when
# one of the column lookups below raises ValueError, and newline="" is the
# mode the csv module documentation asks for when a file is given to
# csv.reader.
with open(file_in, "r", newline="") as fo:
    header = next(csv.reader(fo), None)

# An empty file has no header row; fail here with a clear message instead of
# leaving i_y and i_X undefined for the code that follows.
if header is None:
    raise ValueError("No header row found in " + file_in)

# Column positions of the target (y) and of the feature (X).
# list.index() raises ValueError when the column name is not in the header.
i_y = header.index("pKi")
i_X = header.index("Gauss 2")
#i_y = header.index("Price")
#i_X = header.index("Area")
64+
65+
# Load the numeric table
# The header row is skipped; column i_X becomes the feature vector X and
# column i_y becomes the target vector y.
data = np.genfromtxt(file_in, delimiter=",", skip_header=1)
X, y = data[:, i_X], data[:, i_y]
69+
70+
# Closed-form estimates of the slope (theta1) and the intercept (theta0) as
# defined elsewhere
# James, G., Witten, D., Hastie, T., & Tibshirani, R. 2021. An introduction to
# statistical learning: With applications in R (2nd ed.). Springer.
X_bar, y_bar = X.mean(), y.mean()
X_centered = X - X_bar
theta1 = np.sum(X_centered * (y - y_bar)) / np.sum(X_centered ** 2)
theta0 = y_bar - theta1 * X_bar

# Make predictions
# Evaluate the fitted line at the smallest and at the largest X value; the
# result is kept as a (2, 1) column, like X_new.
X_new = np.array([X.min(), X.max()]).reshape(2, 1)
y_predict = theta0 + theta1 * X_new
81+
82+
# Plotting with reg2D_model() function
theta0_str = f"{theta0:.8f}"
theta1_str = f"{theta1:.8f}"

# The title has to name the dataset that is active in file_in. The active
# axis limits and axis labels below are the CASF-2016 ones (Gauss 2 vs pKi),
# so the CASF-2016 title is the active one; switch both title and arguments
# together when changing dataset.
title_str = "CASF-2016 K$_i$ (Training Set)"
#title_str = "Amsterdam Apartments (Training Set)"
label_str = f"Predictions ( y = {theta0_str} + {theta1_str} x )"

# NOTE(review): reg2D_model is a project module not shown here; the arguments
# are passed positionally exactly as before. The commented pair of lines is
# the Amsterdam Apartments alternative to the active pair that follows it.
# The output file name is file_in with ".csv" replaced by ".png".
reg2D_model.plotting(title_str,
    #X,y,"b.",[0,120,0,120],
    #X_new,y_predict,"r-",label_str,"Area(m$^2$)","Price(10K*Euros)",
    X,y,"b.",[-200,3200,0,15],
    X_new,y_predict,"r-",label_str,"Gauss 2","pK$_i$",
    "upper left",True,file_in.replace(".csv",".png"),600)
94+
95+
# Show predictive performance
# The fitted line is evaluated on the same X used for the fit and compared
# with y: residual sum of squares (RSS), Pearson correlation (r) and Spearman
# correlation (rho), each correlation printed with its p-value.
msg_out = "\nPredictive performance"
print(msg_out)

y_full_predictions = theta0 + theta1 * X
residuals = y_full_predictions - y
rss = np.sum(np.square(residuals))

# Index 0 of each result is the correlation, index 1 its p-value.
p_corr = pearsonr(y, y_full_predictions)
s_corr = spearmanr(y, y_full_predictions)

print(f"RSS = {rss:.5f}")
print(f"r = {p_corr[0]:.5f}", f"\tp-value = {p_corr[1]:.5e}")
print(f"rho = {s_corr[0]:.5f}", f"\tp-value = {s_corr[1]:.5e}")

0 commit comments

Comments
 (0)