1
+ #!/usr/bin/env python3
2
+ #
3
+ # Linear Regression for a dataset read from a csv file. We applied closed form
4
+ # equations to determine theta1 and theta0 (James et al., 2021). We have two
5
+ # datasets: CASF-2016 Ki (de Azevedo et al., 2024) and Amsterdam Apartments
6
+ # dataset (Wolf, 2022).
7
+ # This code is for linear models as expressed by the following equation:
8
+ # y = theta0 + theta1*X, where theta0 is the fit intercept and theta1 the
9
+ # coefficient; theta0 and theta1 are the model’s parameters (Géron, 2023).
10
+ #
11
+ # References
12
+ # de Azevedo WF Jr, Quiroga R, Villarreal MA, da Silveira NJF,
13
+ # Bitencourt-Ferreira G, da Silva AD, Veit-Acosta M, Oliveira PR, Tutone M,
14
+ # Biziukova N, Poroikov V, Tarasova O, Baud S. SAnDReS 2.0: Development of
15
+ # machine-learning models to explore the scoring function space. J Comput Chem.
16
+ # 2024 Jun 20. doi: 10.1002/jcc.27449. Epub ahead of print. PMID: 38900052.
17
+ #
18
+ # Géron, Aurélien. 2023. Hands-on Machine Learning with Scikit-Learn, Keras,
19
+ # and TensorFlow: Concepts, Tools, and Techniques to Build Intelligent Systems.
20
+ # 3rd ed. CA 95472: O’Reilly.
21
+ #
22
+ # James, G., Witten, D., Hastie, T., & Tibshirani, R. 2021. An introduction to
23
+ # statistical learning: With applications in R (2nd ed.). Springer.
24
+ #
25
+ # Wolf, Andrew. The Machine Learning Simplified: A Gentle Introduction to
26
+ # Supervised Learning. Andrew Wolf. Kindle Edition.
27
+ #
28
+ ################################################################################
29
+ # Dr. Walter F. de Azevedo, Jr. #
30
+ # https://github.com/azevedolab #
31
+ # July 20, 2024 #
32
+ ################################################################################
33
+ #
34
+ # Import section
35
+ import numpy as np
36
+ import reg2D_model
37
+ import csv
38
+ from scipy .stats import pearsonr , spearmanr
39
+
40
# Read csv file
# CASF-2016 dataset
# de Azevedo WF Jr, Quiroga R, Villarreal MA, da Silveira NJF,
# Bitencourt-Ferreira G, da Silva AD, Veit-Acosta M, Oliveira PR, Tutone M,
# Biziukova N, Poroikov V, Tarasova O, Baud S. SAnDReS 2.0: Development of
# machine-learning models to explore the scoring function space. J Comput Chem.
# 2024 Jun 20. doi: 10.1002/jcc.27449. Epub ahead of print. PMID: 38900052.
file_in = "CASF-2016_Ki_training.csv"

# Amsterdam Apartments dataset (Amsterdam)
# Wolf, Andrew. The Machine Learning Simplified: A Gentle Introduction to
# Supervised Learning. Andrew Wolf. Kindle Edition.
#file_in = "Amsterdam_apartments_training_set.csv"

# Get header from a csv file
# Only the first row is read: it gives the positions of the target (i_y) and
# feature (i_X) columns. The context manager closes the file even when a
# column name is missing (list.index raises ValueError in that case), and
# newline="" is the mode the csv module asks for when reading.
with open(file_in, "r", newline="") as fo:
    header = next(csv.reader(fo), None)

# Fail here with a clear message instead of a NameError further down when the
# file has no header row at all.
if header is None:
    raise ValueError("No header row found in " + file_in)

i_y = header.index("pKi")
i_X = header.index("Gauss 2")
#i_y = header.index("Price")
#i_X = header.index("Area")
64
+
65
# Get numerical data
# The header row is skipped; feature (X) and target (y) are then sliced out
# of the loaded array by the column positions found in the header.
data = np.genfromtxt(file_in, delimiter=",", skip_header=1)
X, y = data[:, i_X], data[:, i_y]

# Calculate parameters (theta0,theta1) with the closed-form least-squares
# equations for simple linear regression:
#   theta1 = sum((X - X_bar)*(y - y_bar)) / sum((X - X_bar)**2)
#   theta0 = y_bar - theta1*X_bar
# James, G., Witten, D., Hastie, T., & Tibshirani, R. 2021. An introduction to
# statistical learning: With applications in R (2nd ed.). Springer.
X_bar, y_bar = X.mean(), y.mean()
dev_X = X - X_bar   # deviations of the feature from its mean
dev_y = y - y_bar   # deviations of the target from its mean
theta1 = np.sum(dev_X * dev_y) / np.sum(dev_X ** 2)
theta0 = y_bar - theta1 * X_bar
77
+
78
# Make predictions
# The model is a straight line, so it is evaluated only at the smallest and
# largest X values; these two points are what gets passed on for plotting.
X_new = np.array([[X.min()], [X.max()]])
y_predict = theta0 + theta1 * X_new

# Plotting with reg2D_model() function
theta0_str = "{:.8f}".format(theta0)
theta1_str = "{:.8f}".format(theta1)
# The title has to match the dataset selected in file_in. The axis labels and
# limits passed below are the CASF-2016 ones, so the CASF-2016 title is the
# active one; swap the commented lines together when switching datasets.
title_str = "CASF-2016 K$_i$ (Training Set)"
#title_str = "Amsterdam Apartments (Training Set)"
label_str = "Predictions ( y = " + theta0_str + " + " + theta1_str + " x )"
# NOTE(review): the argument meanings are inferred from the values passed
# (data + marker + axis limits, fitted line + style + legend label, axis
# labels, legend location, a flag, output file name, and 600, presumably
# dpi); confirm against reg2D_model.plotting.
reg2D_model.plotting(
    title_str,
    #X,y,"b.",[0,120,0,120],
    #X_new,y_predict,"r-",label_str,"Area(m$^2$)","Price(10K*Euros)",
    X, y, "b.", [-200, 3200, 0, 15],
    X_new, y_predict, "r-", label_str, "Gauss 2", "pK$_i$",
    "upper left", True, file_in.replace(".csv", ".png"), 600)
94
+
95
# Show predictive performance
# The fitted line is evaluated on the same X used to fit it and reported as
# residual sum of squares (RSS) plus Pearson and Spearman correlations between
# the observed y and the predictions.
msg_out = "\n Predictive performance"
print(msg_out)

y_full_predictions = theta0 + theta1 * X
residuals = y_full_predictions - y
rss = np.sum(np.square(residuals))
print("RSS = {:.5f}".format(rss))

p_corr = pearsonr(y, y_full_predictions)
s_corr = spearmanr(y, y_full_predictions)
# Each result is indexed as [0] = correlation coefficient, [1] = p-value.
for fmt, corr in (("r = {:.5f}", p_corr), ("rho = {:.5f}", s_corr)):
    print(fmt.format(corr[0]), "\t p-value = {:.5e}".format(corr[1]))
0 commit comments