Professional Documents
Culture Documents
'Name-Piyush Tiwari''/n' 'Section - C'/N' 'Roll - No-2001610100142'
'Name-Piyush Tiwari''/n' 'Section - C'/N' 'Roll - No-2001610100142'
'Section-‘C'\n'
'Roll_no-2001610100142')
Name-Piyush Tiwari
Section-C
Roll_no-2001610100142
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn import metrics
Mounted at /content/drive
# Load the insurance dataset from the Colab filesystem into a DataFrame.
ins=pd.read_csv('/content/new_insurance_data.csv')
# Bare expression: in a notebook cell this displays the DataFrame.
ins
num_of_steps Hospital_expenditure
NUmber_of_past_hospitalizations \
0 715428.0 4.720921e+06
0.0
1 699157.0 4.329832e+06
0.0
2 702341.0 6.884861e+06
0.0
3 700250.0 4.274774e+06
0.0
4 711584.0 3.787294e+06
0.0
... ... ...
...
1333 1091267.0 1.703805e+08
2.0
1334 1107872.0 2.015152e+08
2.0
1335 1092005.0 2.236450e+08
2.0
1336 1106821.0 2.528924e+08
3.0
1337 1100328.0 2.616317e+08
3.0
# Print the DataFrame summary (index range, column list, non-null counts, dtypes).
ins.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1338 entries, 0 to 1337
Data columns (total 13 columns):
# Column Non-Null Count Dtype
# Count the missing values in every column before any imputation.
print(ins.isnull().sum())
age 9
sex 0
bmi 3
children 5
smoker 0
Claim_Amount 14
past_consultations 6
num_of_steps 3
Hospital_expenditure 4
NUmber_of_past_hospitalizations 2
Anual_Salary 6
region 0
charges 0
dtype: int64
# (rows, columns) of the loaded dataset.
ins.shape
(1338, 13)
# Distribution of age (histogram + KDE). `histplot` replaces the deprecated
# `distplot`, and the plot is drawn explicitly on the axes created here.
fig, ax = plt.subplots(figsize=(8, 8))
sns.histplot(ins['age'], kde=True, stat='density', ax=ax)
/usr/local/lib/python3.8/dist-packages/seaborn/distributions.py:2619:
FutureWarning: `distplot` is a deprecated function and will be removed
in a future version. Please adapt your code to use either `displot` (a
figure-level function with similar flexibility) or `histplot` (an
axes-level function for histograms).
warnings.warn(msg, FutureWarning)
<matplotlib.axes._subplots.AxesSubplot at 0x7fb7cd795070>
# Impute missing ages with the column mean. The result is assigned back
# instead of calling fillna(inplace=True) on a column selection: that chained
# in-place form is deprecated in recent pandas and does not update `ins`
# under copy-on-write.
ins['age'] = ins['age'].fillna(ins['age'].mean())
print(ins.isnull().sum())
age 0
sex 0
bmi 3
children 5
smoker 0
Claim_Amount 14
past_consultations 6
num_of_steps 3
Hospital_expenditure 4
NUmber_of_past_hospitalizations 2
Anual_Salary 6
region 0
charges 0
dtype: int64
# Distribution of bmi (histogram + KDE). `histplot` replaces the deprecated
# `distplot`, and the plot is drawn explicitly on the axes created here.
fig, ax = plt.subplots(figsize=(8, 8))
sns.histplot(ins['bmi'], kde=True, stat='density', ax=ax)
/usr/local/lib/python3.8/dist-packages/seaborn/distributions.py:2619:
FutureWarning: `distplot` is a deprecated function and will be removed
in a future version. Please adapt your code to use either `displot` (a
figure-level function with similar flexibility) or `histplot` (an
axes-level function for histograms).
warnings.warn(msg, FutureWarning)
<matplotlib.axes._subplots.AxesSubplot at 0x7fb7cd656e50>
# Impute missing bmi values with the column mean (assigned back rather than
# using the deprecated chained fillna(inplace=True)).
ins['bmi'] = ins['bmi'].fillna(ins['bmi'].mean())

# Distribution of children; `histplot` replaces the deprecated `distplot`.
fig, ax = plt.subplots(figsize=(8, 8))
sns.histplot(ins['children'], kde=True, stat='density', ax=ax)
/usr/local/lib/python3.8/dist-packages/seaborn/distributions.py:2619:
FutureWarning: `distplot` is a deprecated function and will be removed
in a future version. Please adapt your code to use either `displot` (a
figure-level function with similar flexibility) or `histplot` (an
axes-level function for histograms).
warnings.warn(msg, FutureWarning)
<matplotlib.axes._subplots.AxesSubplot at 0x7fb7cd179e20>
# Impute missing children counts with the most frequent value.
# Series.mode() returns a Series (there may be ties); passing that Series to
# fillna aligns it on the index and fills at most row 0, leaving the other
# NaNs in place. Take the first mode as a scalar instead.
ins['children'] = ins['children'].fillna(ins['children'].mode()[0])

# Distribution of Claim_Amount; `histplot` replaces the deprecated `distplot`.
fig, ax = plt.subplots(figsize=(8, 8))
sns.histplot(ins['Claim_Amount'], kde=True, stat='density', ax=ax)
/usr/local/lib/python3.8/dist-packages/seaborn/distributions.py:2619:
FutureWarning: `distplot` is a deprecated function and will be removed
in a future version. Please adapt your code to use either `displot` (a
figure-level function with similar flexibility) or `histplot` (an
axes-level function for histograms).
warnings.warn(msg, FutureWarning)
<matplotlib.axes._subplots.AxesSubplot at 0x7fb7cd0bda60>
# Impute missing Claim_Amount values with the column mean (assigned back
# rather than using the deprecated chained fillna(inplace=True)).
ins['Claim_Amount'] = ins['Claim_Amount'].fillna(ins['Claim_Amount'].mean())

# Distribution of past_consultations; `histplot` replaces the deprecated
# `distplot`.
fig, ax = plt.subplots(figsize=(8, 8))
sns.histplot(ins['past_consultations'], kde=True, stat='density', ax=ax)
/usr/local/lib/python3.8/dist-packages/seaborn/distributions.py:2619:
FutureWarning: `distplot` is a deprecated function and will be removed
in a future version. Please adapt your code to use either `displot` (a
figure-level function with similar flexibility) or `histplot` (an
axes-level function for histograms).
warnings.warn(msg, FutureWarning)
<matplotlib.axes._subplots.AxesSubplot at 0x7fb7cd036fd0>
# Impute missing past_consultations values with the column mean. The original
# statement was split in the middle of the `inplace` keyword, which made it
# unparseable; it is rewritten as a single assignment.
ins['past_consultations'] = ins['past_consultations'].fillna(
    ins['past_consultations'].mean()
)

# Distribution of num_of_steps; `histplot` replaces the deprecated `distplot`.
fig, ax = plt.subplots(figsize=(8, 8))
sns.histplot(ins['num_of_steps'], kde=True, stat='density', ax=ax)
/usr/local/lib/python3.8/dist-packages/seaborn/distributions.py:2619:
FutureWarning: `distplot` is a deprecated function and will be removed
in a future version. Please adapt your code to use either `displot` (a
figure-level function with similar flexibility) or `histplot` (an
axes-level function for histograms).
warnings.warn(msg, FutureWarning)
<matplotlib.axes._subplots.AxesSubplot at 0x7fb7ccfe63a0>
# Impute missing num_of_steps values with the column mean (assigned back
# rather than using the deprecated chained fillna(inplace=True)).
ins['num_of_steps'] = ins['num_of_steps'].fillna(ins['num_of_steps'].mean())

# Distribution of Hospital_expenditure; `histplot` replaces the deprecated
# `distplot`.
fig, ax = plt.subplots(figsize=(8, 8))
sns.histplot(ins['Hospital_expenditure'], kde=True, stat='density', ax=ax)
/usr/local/lib/python3.8/dist-packages/seaborn/distributions.py:2619:
FutureWarning: `distplot` is a deprecated function and will be removed
in a future version. Please adapt your code to use either `displot` (a
figure-level function with similar flexibility) or `histplot` (an
axes-level function for histograms).
warnings.warn(msg, FutureWarning)
<matplotlib.axes._subplots.AxesSubplot at 0x7fb7cd10e490>
# Impute missing Hospital_expenditure values with the column mean (assigned
# back rather than using the deprecated chained fillna(inplace=True)).
ins['Hospital_expenditure'] = ins['Hospital_expenditure'].fillna(
    ins['Hospital_expenditure'].mean()
)

# Distribution of NUmber_of_past_hospitalizations; `histplot` replaces the
# deprecated `distplot`.
fig, ax = plt.subplots(figsize=(8, 8))
sns.histplot(
    ins['NUmber_of_past_hospitalizations'], kde=True, stat='density', ax=ax
)
/usr/local/lib/python3.8/dist-packages/seaborn/distributions.py:2619:
FutureWarning: `distplot` is a deprecated function and will be removed
in a future version. Please adapt your code to use either `displot` (a
figure-level function with similar flexibility) or `histplot` (an
axes-level function for histograms).
warnings.warn(msg, FutureWarning)
<matplotlib.axes._subplots.AxesSubplot at 0x7fb7cce26850>
# Impute missing NUmber_of_past_hospitalizations values with the column
# median. The original statement was split inside the column-name string
# literal, which made it unparseable; the name is restored on one line.
ins['NUmber_of_past_hospitalizations'] = (
    ins['NUmber_of_past_hospitalizations'].fillna(
        ins['NUmber_of_past_hospitalizations'].median()
    )
)

# Distribution of Anual_Salary; `histplot` replaces the deprecated `distplot`.
fig, ax = plt.subplots(figsize=(8, 8))
sns.histplot(ins['Anual_Salary'], kde=True, stat='density', ax=ax)
/usr/local/lib/python3.8/dist-packages/seaborn/distributions.py:2619:
FutureWarning: `distplot` is a deprecated function and will be removed
in a future version. Please adapt your code to use either `displot` (a
figure-level function with similar flexibility) or `histplot` (an
axes-level function for histograms).
warnings.warn(msg, FutureWarning)
<matplotlib.axes._subplots.AxesSubplot at 0x7fb7cce88760>
# Impute missing Anual_Salary values with the column median, then fill any
# children values that are still missing with that column's median. Results
# are assigned back instead of using the deprecated chained
# fillna(inplace=True), which does not update `ins` under copy-on-write.
ins['Anual_Salary'] = ins['Anual_Salary'].fillna(ins['Anual_Salary'].median())
ins['children'] = ins['children'].fillna(ins['children'].median())

# Confirm that no column has missing values left.
print(ins.isnull().sum())
age 0
sex 0
bmi 0
children 0
smoker 0
Claim_Amount 0
past_consultations 0
num_of_steps 0
Hospital_expenditure 0
NUmber_of_past_hospitalizations 0
Anual_Salary 0
region 0
charges 0
dtype: int64
New Section
# Descriptive statistics of the dataset.
ins.describe()

# Bar chart of the number of records for each value of `sex`.
fig, ax = plt.subplots(figsize=(6, 6))
sns.countplot(data=ins, x='sex', ax=ax)
ax.set_title('Sex Distribution')
plt.show()

# Exact counts behind the chart.
ins.sex.value_counts()
male 676
female 662
Name: sex, dtype: int64
# bmi distribution (histogram + KDE). `histplot` replaces `distplot`, which
# is deprecated and scheduled for removal from seaborn.
plt.figure(figsize=(6,6))
sns.histplot(ins['bmi'], kde=True, stat='density')
plt.title('BMI Distribution')
plt.show()
/usr/local/lib/python3.8/dist-packages/seaborn/distributions.py:2619:
FutureWarning: `distplot` is a deprecated function and will be removed
in a future version. Please adapt your code to use either `displot` (a
figure-level function with similar flexibility) or `histplot` (an
axes-level function for histograms).
warnings.warn(msg, FutureWarning)
# Bar chart of the number of records for each value of `children`.
fig, ax = plt.subplots(figsize=(6, 6))
sns.countplot(data=ins, x='children', ax=ax)
ax.set_title('Children')
plt.show()

# Exact counts behind the chart.
ins.children.value_counts()
0.0 574
1.0 326
2.0 240
3.0 156
4.0 25
5.0 17
Name: children, dtype: int64
# Bar chart of the number of records for each value of `smoker`.
fig, ax = plt.subplots(figsize=(6, 6))
sns.countplot(data=ins, x='smoker', ax=ax)
ax.set_title('smoker')
plt.show()

# Exact counts behind the chart.
ins.smoker.value_counts()
no 1064
yes 274
Name: smoker, dtype: int64
# Bar chart of the number of records for each value of `region`.
fig, ax = plt.subplots(figsize=(6, 6))
sns.countplot(data=ins, x='region', ax=ax)
ax.set_title('region')
plt.show()

# Exact counts behind the chart.
ins.region.value_counts()
southeast 364
southwest 325
northwest 325
northeast 324
Name: region, dtype: int64
/usr/local/lib/python3.8/dist-packages/seaborn/distributions.py:2619:
FutureWarning: `distplot` is a deprecated function and will be removed
in a future version. Please adapt your code to use either `displot` (a
figure-level function with similar flexibility) or `histplot` (an
axes-level function for histograms).
warnings.warn(msg, FutureWarning)
Data Pre-Processing
Encoding the categorical features
# encoding sex column: male -> 0, female -> 1
ins.replace({'sex':{'male':0,'female':1}}, inplace=True)

# Heatmap of pairwise correlations between the numeric columns. Passing the
# raw DataFrame would colour ~1300 individual rows on wildly different column
# scales, which is not informative; select_dtypes keeps this working whether
# or not every categorical column has been encoded yet.
sns.heatmap(ins.select_dtypes(include='number').corr(), cmap='coolwarm')
<matplotlib.axes._subplots.AxesSubplot at 0x7fb7ccae5940>
# Annotated correlation heatmap of the numeric columns. Annotating the raw
# DataFrame would try to print a label for every cell of every row; the
# correlation matrix has one readable cell per column pair.
plt.figure(figsize=(10,10))
sns.heatmap(ins.select_dtypes(include='number').corr(), annot=True)
<matplotlib.axes._subplots.AxesSubplot at 0x7fb7ccc11e20>
# Remove outliers with the 1.5*IQR rule on four columns and store the result
# in `insurance` (`ins` itself is left unchanged).
#
# Fixes relative to the original cell:
#   * the upper-bound test (<= q3 + 1.5*iqr) was missing for every column
#     except bmi, leaving a bare Series on the right-hand side of `&`;
#   * each filter was applied to `ins` and overwrote `insurance`, so only the
#     last column's filter survived. The masks are now combined.
# Quartiles are still computed on the full `ins` data, as before.
keep_rows = pd.Series(True, index=ins.index)
for column in ('bmi', 'past_consultations', 'Hospital_expenditure',
               'Anual_Salary'):
    q1 = ins[column].quantile(0.25)
    q3 = ins[column].quantile(0.75)
    iqr = q3 - q1
    # between() is inclusive on both ends, matching the >= / <= comparisons.
    keep_rows &= ins[column].between(q1 - 1.5 * iqr, q3 + 1.5 * iqr)
insurance = ins[keep_rows]
feature VIF
0 age 12.812811
1 sex 2.019163
2 bmi 31.954395
3 children 1.961089
4 smoker 12.461581
5 Claim_Amount 6.920514
6 past_consultations 8.401528
7 num_of_steps 71.348797
8 Hospital_expenditure 18.764176
9 NUmber_of_past_hospitalizations 16.161025
10 Anual_Salary 29.406733
11 region 2.894132
# Drop num_of_steps, then recompute the variance inflation factor (VIF) of
# every remaining numeric predictor.
ins = ins.drop(['num_of_steps'], axis=1)

# Predictors: every non-object column except the target `charges`.
# (Loop indentation was lost in the original; `and` replaces `&` because both
# operands are plain booleans.)
col_list = [
    col for col in ins.columns
    if ins[col].dtype != 'object' and col != 'charges'
]
x = ins[col_list]

vif_data = pd.DataFrame()
vif_data['feature'] = x.columns
vif_data['VIF'] = [
    variance_inflation_factor(x.values, i) for i in range(x.shape[1])
]
print(vif_data)
feature VIF
0 age 11.462460
1 sex 1.985836
2 bmi 19.410639
3 children 1.948997
4 smoker 12.148741
5 Claim_Amount 6.471067
6 past_consultations 7.634564
7 Hospital_expenditure 18.466162
8 NUmber_of_past_hospitalizations 14.198316
9 Anual_Salary 29.243856
10 region 2.614750
# Drop bmi, then recompute the variance inflation factor (VIF) of every
# remaining numeric predictor.
ins = ins.drop(['bmi'], axis=1)

# Predictors: every non-object column except the target `charges`.
# (Loop indentation was lost in the original; `and` replaces `&` because both
# operands are plain booleans.)
col_list = [
    col for col in ins.columns
    if ins[col].dtype != 'object' and col != 'charges'
]
x = ins[col_list]

vif_data = pd.DataFrame()
vif_data['feature'] = x.columns
vif_data['VIF'] = [
    variance_inflation_factor(x.values, i) for i in range(x.shape[1])
]
print(vif_data)
feature VIF
0 age 11.313883
1 sex 1.980936
2 children 1.947983
3 smoker 7.484382
4 Claim_Amount 6.140250
5 past_consultations 7.107175
6 Hospital_expenditure 18.465946
7 NUmber_of_past_hospitalizations 13.951702
8 Anual_Salary 28.828791
9 region 2.594851
# Drop Hospital_expenditure, then recompute the variance inflation factor
# (VIF) of every remaining numeric predictor.
ins = ins.drop(['Hospital_expenditure'], axis=1)

# Predictors: every non-object column except the target `charges`.
# (Loop indentation was lost in the original; `and` replaces `&` because both
# operands are plain booleans.)
col_list = [
    col for col in ins.columns
    if ins[col].dtype != 'object' and col != 'charges'
]
x = ins[col_list]

vif_data = pd.DataFrame()
vif_data['feature'] = x.columns
vif_data['VIF'] = [
    variance_inflation_factor(x.values, i) for i in range(x.shape[1])
]
print(vif_data)
feature VIF
0 age 11.301633
1 sex 1.980877
2 children 1.947983
3 smoker 7.018653
4 Claim_Amount 6.135856
5 past_consultations 7.087869
6 NUmber_of_past_hospitalizations 13.263038
7 Anual_Salary 5.024536
8 region 2.591447
# Drop NUmber_of_past_hospitalizations, then recompute the variance inflation
# factor (VIF) of every remaining numeric predictor.
ins = ins.drop(['NUmber_of_past_hospitalizations'], axis=1)

# Predictors: every non-object column except the target `charges`.
# (Loop indentation was lost in the original; `and` replaces `&` because both
# operands are plain booleans.)
col_list = [
    col for col in ins.columns
    if ins[col].dtype != 'object' and col != 'charges'
]
x = ins[col_list]

vif_data = pd.DataFrame()
vif_data['feature'] = x.columns
vif_data['VIF'] = [
    variance_inflation_factor(x.values, i) for i in range(x.shape[1])
]
print(vif_data)
feature VIF
0 age 9.157284
1 sex 1.954003
2 children 1.819917
3 smoker 7.000894
4 Claim_Amount 5.908022
5 past_consultations 6.895883
6 Anual_Salary 3.820693
7 region 2.546124
# Drop age, then recompute the variance inflation factor (VIF) of every
# remaining numeric predictor.
ins = ins.drop(['age'], axis=1)

# Predictors: every non-object column except the target `charges`.
# (Loop indentation was lost in the original; `and` replaces `&` because both
# operands are plain booleans.)
col_list = [
    col for col in ins.columns
    if ins[col].dtype != 'object' and col != 'charges'
]
x = ins[col_list]

vif_data = pd.DataFrame()
vif_data['feature'] = x.columns
vif_data['VIF'] = [
    variance_inflation_factor(x.values, i) for i in range(x.shape[1])
]
print(vif_data)
feature VIF
0 sex 1.945793
1 children 1.815134
2 smoker 4.890261
3 Claim_Amount 5.701343
4 past_consultations 6.453334
5 Anual_Salary 3.541460
6 region 2.530922
# Drop past_consultations, then recompute the variance inflation factor (VIF)
# of every remaining numeric predictor.
ins = ins.drop(['past_consultations'], axis=1)

# Predictors: every non-object column except the target `charges`.
# (Loop indentation was lost in the original; `and` replaces `&` because both
# operands are plain booleans.)
col_list = [
    col for col in ins.columns
    if ins[col].dtype != 'object' and col != 'charges'
]
x = ins[col_list]

vif_data = pd.DataFrame()
vif_data['feature'] = x.columns
vif_data['VIF'] = [
    variance_inflation_factor(x.values, i) for i in range(x.shape[1])
]
print(vif_data)
feature VIF
0 sex 1.935313
1 children 1.796418
2 smoker 4.269236
3 Claim_Amount 5.311758
4 Anual_Salary 2.417874
5 region 2.481157
# Driver Code
# NOTE(review): `Remove` is not defined in the visible part of this file. It is
# called with a one-element list that holds the whole DataFrame, so it
# presumably de-duplicates list items rather than DataFrame rows — confirm
# whether ins.drop_duplicates() was what was intended here.
duplicate = [ins]
print(Remove(duplicate))
[ sex children smoker Claim_Amount Anual_Salary region
charges
0 0 0.0 1 29087.54313 5.578497e+07 0
1121.87390
1 0 0.0 1 39053.67437 1.370089e+07 0
1131.50660
2 0 0.0 1 39023.62759 7.352311e+07 0
1135.94070
3 0 0.0 1 28185.39332 7.581968e+07 0
1136.39940
4 0 0.0 1 14697.85941 2.301232e+07 0
1137.01100
... ... ... ... ... ... ...
...
1333 1 0.0 0 63142.25346 3.101107e+09 3
55135.40209
1334 1 1.0 0 43419.95227 3.484216e+09 2
58571.07448
1335 0 3.0 0 52458.92353 3.640807e+09 3
60021.39897
1336 0 0.0 0 69927.51664 4.006359e+09 0
62592.87309
1337 1 0.0 0 63982.80926 4.117197e+09 0
63770.42801
# Last column of `ins` wrapped in a one-column DataFrame; the displayed output
# shows this column is `charges`.
pd.DataFrame(ins.iloc[:,-1])
charges
0 1121.87390
1 1131.50660
2 1135.94070
3 1136.39940
4 1137.01100
... ...
1333 55135.40209
1334 58571.07448
1335 60021.39897
1336 62592.87309
1337 63770.42801
# Report the shapes of the train/test splits.
# NOTE(review): the recorded output shows 267 training rows and 1071 test
# rows, i.e. roughly 20% train / 80% test. The train_test_split call is not
# visible here — confirm the split size or unpacking order is not inverted.
print(x_train.shape)
print(y_train.shape)
print(x_test.shape)
print(y_test.shape)
(267, 6)
(267,)
(1071, 6)
(1071,)
# Fit the regressor on the training split; the echoed repr in the output shows
# it is a LinearRegression().
regressor.fit(x_train, y_train)
LinearRegression()
Training Data
# Predict on the data the model was fitted on.
training_data_prediction = regressor.predict(x_train)

# R squared on the training split (label typo "vale" corrected to "value").
r2_train = metrics.r2_score(y_train, training_data_prediction)
print('R squared value : ', r2_train)
Testing Data
# Predict on the held-out test split.
test_data_prediction = regressor.predict(x_test)

# R squared on the test split (label typo "vale" corrected to "value").
r2_test = metrics.r2_score(y_test, test_data_prediction)
print('R squared value : ', r2_test)
# One hand-written sample with six feature values, matching the six columns
# of x_train shown in the shape output.
input_data = (21,54,7,8,44,88)

# `input_data_reshaped` was used without being defined in the visible code.
# predict() expects a 2-D array of shape (n_samples, n_features), so turn the
# tuple into a single-row array.
input_data_reshaped = np.asarray(input_data).reshape(1, -1)

prediction = regressor.predict(input_data_reshaped)
print(prediction)
/usr/local/lib/python3.8/dist-packages/sklearn/base.py:450:
UserWarning: X does not have valid feature names, but LinearRegression
was fitted with feature names
warnings.warn(