
In [12]: # SIMPLE REGRESSION
# Authors: BERRIOUA MOHAMED EL BACHIR, CHAKIR OUISSAL (GMP2)

import pandas as pd
import statsmodels.formula.api as smf
import matplotlib.pyplot as plt
import seaborn as sns

In [2]: df = pd.read_csv('C:/Users/User/Desktop/simpleregression.csv')

plt.plot(df['SAT'],df['GPA'],'o')
df.head()

Out[2]:     SAT   GPA
        0  1714  2.40
        1  1664  2.52
        2  1760  2.54
        3  1685  2.74
        4  1693  2.83
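
Before fitting, a quick sanity check on the loaded data can catch problems early. A minimal sketch using plain pandas (same df as above; the row count of 84 is taken from the regression summary below):

print(df.shape)          # expect (84, 2): 84 observations of SAT and GPA
print(df.isna().sum())   # any missing values?
print(df.describe())     # basic summary statistics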

In [3]: sns.regplot(x='SAT', y='GPA', data=df)

Out[3]: <AxesSubplot:xlabel='SAT', ylabel='GPA'>
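
regplot draws the scatter together with a fitted line. As a cross-check of that line, the same slope and intercept can be recovered with numpy's least-squares polyfit; a minimal sketch (assumes df as loaded above):

import numpy as np

# Degree-1 fit returns [slope, intercept], highest power first
slope, intercept = np.polyfit(df['SAT'], df['GPA'], 1)
plt.plot(df['SAT'], df['GPA'], 'o')
plt.plot(df['SAT'], intercept + slope * df['SAT'])   # same line regplot draws
plt.xlabel('SAT')
plt.ylabel('GPA')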

In [4]: model = smf.ols(formula='GPA ~ SAT', data=df).fit()
print(model.summary())

                            OLS Regression Results
==============================================================================
Dep. Variable: GPA R-squared: 0.406
Model: OLS Adj. R-squared: 0.399
Method: Least Squares F-statistic: 56.05
Date: Fri, 03 Mar 2023 Prob (F-statistic): 7.20e-11
Time: 18:38:29 Log-Likelihood: 12.672
No. Observations: 84 AIC: -21.34
Df Residuals: 82 BIC: -16.48
Df Model: 1
Covariance Type: nonrobust
==============================================================================
coef std err t P>|t| [0.025 0.975]
------------------------------------------------------------------------------
Intercept 0.2750 0.409 0.673 0.503 -0.538 1.088
SAT 0.0017 0.000 7.487 0.000 0.001 0.002
==============================================================================
Omnibus: 12.839 Durbin-Watson: 0.950
Prob(Omnibus): 0.002 Jarque-Bera (JB): 16.155
Skew: -0.722 Prob(JB): 0.000310
Kurtosis: 4.590 Cond. No. 3.29e+04
==============================================================================

Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The condition number is large, 3.29e+04. This might indicate that there are
strong multicollinearity or other numerical problems.
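
The estimated coefficients can be read back off the results object at full precision, which is handy for manual predictions; a minimal sketch using the statsmodels API (term names as in the summary above):

b0 = model.params['Intercept']   # ≈ 0.2750
b1 = model.params['SAT']         # ≈ 0.0017 (rounded in the table)
print(b0 + b1 * 1714)            # manual prediction for SAT=1714, ≈ 3.11
print(model.rsquared)            # ≈ 0.406, as reported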

In [5]: ypred = model.predict(df['SAT'])
print(ypred)

0 3.112890
1 3.030105
2 3.189051
3 3.064875
4 3.078120
...
79 3.480452
80 3.271836
81 3.564892
82 3.523500
83 3.669201
Length: 84, dtype: float64
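
These fitted values can be turned into residuals to gauge the in-sample error; a minimal sketch (model.resid would give the same residuals directly):

import numpy as np

resid = df['GPA'] - ypred            # observed minus fitted
print(np.sqrt(np.mean(resid ** 2)))  # root mean squared error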

In [6]: ## MULTIPLE REGRESSION

In [7]: import pandas as pd
import statsmodels.api as sm

In [8]: df = pd.read_csv('C:/Users/User/Desktop/50_Startups.csv')
print(df.head())

X1 X2 X3 X4 Profit
0 165349.20 136897.80 471784.10 New York 192261.83
1 162597.70 151377.59 443898.53 California 191792.06
2 153441.51 101145.55 407934.54 Florida 191050.39
3 144372.41 118671.85 383199.62 New York 182901.99
4 142107.34 91391.77 366168.42 Florida 166187.94
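
The model below uses only the numeric columns; the categorical X4 (state) is left out. If it were wanted, the formula interface can dummy-encode it with C(). A minimal sketch under that assumption (column names as in the head above):

import statsmodels.formula.api as smf

# C(X4) expands the state names into dummy variables automatically
lm_cat = smf.ols('X1 ~ X2 + X3 + Profit + C(X4)', data=df).fit()
print(lm_cat.summary())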

In [9]: y=df['X1']
x=df[['X2','X3','Profit']]
x=sm.add_constant(x)

lm = sm.OLS(y,x).fit()

print(lm.summary())

                            OLS Regression Results
==============================================================================
Dep. Variable: X1 R-squared: 0.949
Model: OLS Adj. R-squared: 0.946
Method: Least Squares F-statistic: 284.6
Date: Fri, 03 Mar 2023 Prob (F-statistic): 1.06e-29
Time: 18:41:20 Log-Likelihood: -532.81
No. Observations: 50 AIC: 1074.
Df Residuals: 46 BIC: 1081.
Df Model: 3
Covariance Type: nonrobust
==============================================================================
coef std err t P>|t| [0.025 0.975]
------------------------------------------------------------------------------
const -5.904e+04 7474.939 -7.898 0.000 -7.41e+04 -4.4e+04
X2 0.0840 0.058 1.446 0.155 -0.033 0.201
X3 0.0052 0.020 0.264 0.793 -0.034 0.045
Profit 1.0845 0.061 17.846 0.000 0.962 1.207
==============================================================================
Omnibus: 6.269 Durbin-Watson: 1.256
Prob(Omnibus): 0.044 Jarque-Bera (JB): 5.578
Skew: 0.562 Prob(JB): 0.0615
Kurtosis: 4.189 Cond. No. 1.43e+06
==============================================================================

Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The condition number is large, 1.43e+06. This might indicate that there are
strong multicollinearity or other numerical problems.
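
The large condition number flagged in note [2] can be probed per predictor with variance inflation factors; a minimal sketch using statsmodels (x is the design matrix built above, including the constant):

from statsmodels.stats.outliers_influence import variance_inflation_factor

# One VIF per column of the design matrix; values far above ~10 suggest collinearity
for i, name in enumerate(x.columns):
    print(name, variance_inflation_factor(x.values, i))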

In [10]: ypred = lm.predict(x)
print(ypred)

0 163409.685195
1 163971.686139
2 158762.404598
3 151269.117228
4 130763.462932
5 121479.738386
6 123298.131130
7 123773.672033
8 120137.878165
9 114081.955175
10 109904.313883
11 106411.954540
12 106496.432656
13 99304.424308
14 99243.612116
15 93508.848044
16 90265.374161
17 90572.064986
18 86844.201171
19 87003.097723
20 80555.344044
21 76146.131486
22 72520.133451
23 69342.313645
24 67750.511290
25 69874.337004
26 68426.941232
27 67409.895162
28 68920.280699
29 63906.044924
30 59525.756657
31 59961.632727
32 57710.739541
33 55683.607989
34 60180.403189
35 53799.143871
36 51045.307230
37 43837.419635
38 35551.071927
39 36686.224131
40 36661.917058
41 33298.908289
42 27344.819469
43 27494.890236
44 24816.784491
45 21808.086524
46 5898.667970
47 -1511.285830
48 -16007.296988
49 -33060.005620
dtype: float64
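
As a consistency check, R² can be recomputed from these predictions and should reproduce the 0.949 in the summary; a minimal sketch:

import numpy as np

ss_res = np.sum((y - ypred) ** 2)      # residual sum of squares
ss_tot = np.sum((y - y.mean()) ** 2)   # total sum of squares
print(1 - ss_res / ss_tot)             # ≈ 0.949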

