Download as pdf or txt
Download as pdf or txt
You are on page 1of 28

print ('Name-Piyush Tiwari''\n'

'Section-‘C'\n'
'Roll_no-2001610100142')
Name-Piyush Tiwari
Section-C
Roll_no-2001610100142
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn import metrics

from google.colab import drive


drive.mount('/content/drive')

Mounted at /content/drive

ins=pd.read_csv('/content/new_insurance_data.csv')

ins

age sex bmi children smoker Claim_Amount


past_consultations \
0 18.0 male 23.210 0.0 no 29087.54313
17.0
1 18.0 male 30.140 0.0 no 39053.67437
7.0
2 18.0 male 33.330 0.0 no 39023.62759
19.0
3 18.0 male 33.660 0.0 no 28185.39332
11.0
4 18.0 male 34.100 0.0 no 14697.85941
16.0
... ... ... ... ... ... ...
...
1333 33.0 female 35.530 0.0 yes 63142.25346
32.0
1334 31.0 female 38.095 1.0 yes 43419.95227
31.0
1335 52.0 male 34.485 3.0 yes 52458.92353
25.0
1336 45.0 male 30.360 0.0 yes 69927.51664
34.0
1337 54.0 female 47.410 0.0 yes 63982.80926
31.0

num_of_steps Hospital_expenditure
NUmber_of_past_hospitalizations \
0 715428.0 4.720921e+06
0.0
1 699157.0 4.329832e+06
0.0
2 702341.0 6.884861e+06
0.0
3 700250.0 4.274774e+06
0.0
4 711584.0 3.787294e+06
0.0
... ... ...
...
1333 1091267.0 1.703805e+08
2.0
1334 1107872.0 2.015152e+08
2.0
1335 1092005.0 2.236450e+08
2.0
1336 1106821.0 2.528924e+08
3.0
1337 1100328.0 2.616317e+08
3.0

Anual_Salary region charges


0 5.578497e+07 southeast 1121.87390
1 1.370089e+07 southeast 1131.50660
2 7.352311e+07 southeast 1135.94070
3 7.581968e+07 southeast 1136.39940
4 2.301232e+07 southeast 1137.01100
... ... ... ...
1333 3.101107e+09 northwest 55135.40209
1334 3.484216e+09 northeast 58571.07448
1335 3.640807e+09 northwest 60021.39897
1336 4.006359e+09 southeast 62592.87309
1337 4.117197e+09 southeast 63770.42801

[1338 rows x 13 columns]

ins.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1338 entries, 0 to 1337
Data columns (total 13 columns):
# Column Non-Null Count Dtype

0 age 1329 non-null float64


1 sex 1338 non-null object
2 bmi 1335 non-null float64
3 children 1333 non-null float64
4 smoker 1338 non-null object
5 Claim_Amount 1324 non-null float64
6 past_consultations 1332 non-null float64
7 num_of_steps 1335 non-null float64
8 Hospital_expenditure 1334 non-null float64
9 NUmber_of_past_hospitalizations 1336 non-null float64
10 Anual_Salary 1332 non-null float64
11 region 1338 non-null object
12 charges 1338 non-null float64
dtypes: float64(10), object(3)
memory usage: 136.0+ KB

print(ins.isnull().sum())

age 9
sex 0
bmi 3
children 5
smoker 0
Claim_Amount 14
past_consultations 6
num_of_steps 3
Hospital_expenditure 4
NUmber_of_past_hospitalizations 2
Anual_Salary 6
region 0
charges 0
dtype: int64

ins.shape

(1338, 13)
fig,ax=plt.subplots(figsize=(8,8))
sns.distplot(ins.age)

/usr/local/lib/python3.8/dist-packages/seaborn/distributions.py:2619:
FutureWarning: `distplot` is a deprecated function and will be removed
in a future version. Please adapt your code to use either `displot` (a
figure-level function with similar flexibility) or `histplot` (an
axes-level function for histograms).
warnings.warn(msg, FutureWarning)

<matplotlib.axes._subplots.AxesSubplot at 0x7fb7cd795070>
ins['age'].fillna(ins['age'].mean(),inplace=True)

print(ins.isnull().sum())

age 0
sex 0
bmi 3
children 5
smoker 0
Claim_Amount 14
past_consultations 6
num_of_steps 3
Hospital_expenditure 4
NUmber_of_past_hospitalizations 2
Anual_Salary 6
region 0
charges 0
dtype: int64
fig,ax=plt.subplots(figsize=(8,8))
sns.distplot(ins.bmi)

/usr/local/lib/python3.8/dist-packages/seaborn/distributions.py:2619:
FutureWarning: `distplot` is a deprecated function and will be removed
in a future version. Please adapt your code to use either `displot` (a
figure-level function with similar flexibility) or `histplot` (an
axes-level function for histograms).
warnings.warn(msg, FutureWarning)

<matplotlib.axes._subplots.AxesSubplot at 0x7fb7cd656e50>

ins['bmi'].fillna(ins['bmi'].mean(),inplace=True)

fig,ax=plt.subplots(figsize=(8,8))
sns.distplot(ins.children)

/usr/local/lib/python3.8/dist-packages/seaborn/distributions.py:2619:
FutureWarning: `distplot` is a deprecated function and will be removed
in a future version. Please adapt your code to use either `displot` (a
figure-level function with similar flexibility) or `histplot` (an
axes-level function for histograms).
warnings.warn(msg, FutureWarning)

<matplotlib.axes._subplots.AxesSubplot at 0x7fb7cd179e20>

ins['children'].fillna(ins['children'].mode(),inplace=True)

fig,ax=plt.subplots(figsize=(8,8))
sns.distplot(ins.Claim_Amount)

/usr/local/lib/python3.8/dist-packages/seaborn/distributions.py:2619:
FutureWarning: `distplot` is a deprecated function and will be removed
in a future version. Please adapt your code to use either `displot` (a
figure-level function with similar flexibility) or `histplot` (an
axes-level function for histograms).
warnings.warn(msg, FutureWarning)
<matplotlib.axes._subplots.AxesSubplot at 0x7fb7cd0bda60>

ins['Claim_Amount'].fillna(ins['Claim_Amount'].mean(),inplace=True)

fig,ax=plt.subplots(figsize=(8,8))
sns.distplot(ins.past_consultations)

/usr/local/lib/python3.8/dist-packages/seaborn/distributions.py:2619:
FutureWarning: `distplot` is a deprecated function and will be removed
in a future version. Please adapt your code to use either `displot` (a
figure-level function with similar flexibility) or `histplot` (an
axes-level function for histograms).
warnings.warn(msg, FutureWarning)

<matplotlib.axes._subplots.AxesSubplot at 0x7fb7cd036fd0>
ins['past_consultations'].fillna(ins['past_consultations'].mean(),inpl
ace=True)

fig,ax=plt.subplots(figsize=(8,8))
sns.distplot(ins.num_of_steps)

/usr/local/lib/python3.8/dist-packages/seaborn/distributions.py:2619:
FutureWarning: `distplot` is a deprecated function and will be removed
in a future version. Please adapt your code to use either `displot` (a
figure-level function with similar flexibility) or `histplot` (an
axes-level function for histograms).
warnings.warn(msg, FutureWarning)

<matplotlib.axes._subplots.AxesSubplot at 0x7fb7ccfe63a0>
ins['num_of_steps'].fillna(ins['num_of_steps'].mean(),inplace=True)

fig,ax=plt.subplots(figsize=(8,8))
sns.distplot(ins.Hospital_expenditure)

/usr/local/lib/python3.8/dist-packages/seaborn/distributions.py:2619:
FutureWarning: `distplot` is a deprecated function and will be removed
in a future version. Please adapt your code to use either `displot` (a
figure-level function with similar flexibility) or `histplot` (an
axes-level function for histograms).
warnings.warn(msg, FutureWarning)

<matplotlib.axes._subplots.AxesSubplot at 0x7fb7cd10e490>
ins['Hospital_expenditure'].fillna(ins['Hospital_expenditure'].mean(),
inplace=True)

fig,ax=plt.subplots(figsize=(8,8))
sns.distplot(ins.NUmber_of_past_hospitalizations)

/usr/local/lib/python3.8/dist-packages/seaborn/distributions.py:2619:
FutureWarning: `distplot` is a deprecated function and will be removed
in a future version. Please adapt your code to use either `displot` (a
figure-level function with similar flexibility) or `histplot` (an
axes-level function for histograms).
warnings.warn(msg, FutureWarning)

<matplotlib.axes._subplots.AxesSubplot at 0x7fb7cce26850>
ins['NUmber_of_past_hospitalizations'].fillna(ins['NUmber_of_past_hosp
italizations'].median(),inplace=True)

fig,ax=plt.subplots(figsize=(8,8))
sns.distplot(ins.Anual_Salary)

/usr/local/lib/python3.8/dist-packages/seaborn/distributions.py:2619:
FutureWarning: `distplot` is a deprecated function and will be removed
in a future version. Please adapt your code to use either `displot` (a
figure-level function with similar flexibility) or `histplot` (an
axes-level function for histograms).
warnings.warn(msg, FutureWarning)

<matplotlib.axes._subplots.AxesSubplot at 0x7fb7cce88760>
ins['Anual_Salary'].fillna(ins['Anual_Salary'].median(),inplace=True)

ins['children'].fillna(ins['children'].median(),inplace=True)

print(ins.isnull().sum())

age 0
sex 0
bmi 0
children 0
smoker 0
Claim_Amount 0
past_consultations 0
num_of_steps 0
Hospital_expenditure 0
NUmber_of_past_hospitalizations 0
Anual_Salary 0
region 0
charges 0
dtype: int64

New Section
ins.describe()

age bmi children Claim_Amount \


count 1338.000000 1338.000000 1338.000000 1338.000000
mean 39.310008 30.665112 1.090433 33361.327180
std 13.987500 6.094840 1.199619 15535.307255
min 18.000000 15.960000 0.000000 1920.136268
25% 27.000000 26.315000 0.000000 20947.645198
50% 39.310008 30.400000 1.000000 33611.286065
75% 51.000000 34.656250 2.000000 44978.873188
max 64.000000 53.130000 5.000000 77277.988480

past_consultations num_of_steps Hospital_expenditure \


count 1338.000000 1.338000e+03 1.338000e+03
mean 15.216216 9.100047e+05 1.584179e+07
std 7.450948 9.178297e+04 2.665309e+07
min 1.000000 6.954300e+05 2.945253e+04
25% 9.000000 8.474898e+05 4.084941e+06
50% 15.000000 9.140155e+05 7.500985e+06
75% 20.000000 9.715100e+05 1.090187e+07
max 40.000000 1.107872e+06 2.616317e+08

NUmber_of_past_hospitalizations Anual_Salary charges


count 1338.000000 1.338000e+03 1338.000000
mean 1.060538 3.686636e+08 13270.422265
std 0.533189 5.658157e+08 12110.011237
min 0.000000 2.747072e+06 1121.873900
25% 1.000000 7.755085e+07 4740.287150
50% 1.000000 1.419361e+08 9382.033000
75% 1.000000 3.225202e+08 16639.912515
max 3.000000 4.117197e+09 63770.428010

# Gender column
plt.figure(figsize=(6,6))
sns.countplot(x='sex', data=ins)
plt.title('Sex Distribution')
plt.show()
ins['sex'].value_counts()

male 676
female 662
Name: sex, dtype: int64

# bmi distribution
plt.figure(figsize=(6,6))
sns.distplot(ins['bmi'])
plt.title('BMI Distribution')
plt.show()

/usr/local/lib/python3.8/dist-packages/seaborn/distributions.py:2619:
FutureWarning: `distplot` is a deprecated function and will be removed
in a future version. Please adapt your code to use either `displot` (a
figure-level function with similar flexibility) or `histplot` (an
axes-level function for histograms).
warnings.warn(msg, FutureWarning)
# children column
plt.figure(figsize=(6,6))
sns.countplot(x='children', data=ins)
plt.title('Children')
plt.show()
ins['children'].value_counts()

0.0 574
1.0 326
2.0 240
3.0 156
4.0 25
5.0 17
Name: children, dtype: int64

plt.figure(figsize=(6,6))
sns.countplot(x='smoker', data=ins)
plt.title('smoker')
plt.show()
ins['smoker'].value_counts()

no 1064
yes 274
Name: smoker, dtype: int64

# region column
plt.figure(figsize=(6,6))
sns.countplot(x='region', data=ins)
plt.title('region')
plt.show()
ins['region'].value_counts()

southeast 364
southwest 325
northwest 325
northeast 324
Name: region, dtype: int64

# distribution of charges value


plt.figure(figsize=(6,6))
sns.distplot(ins['charges'])
plt.title('Charges Distribution')
plt.show()

/usr/local/lib/python3.8/dist-packages/seaborn/distributions.py:2619:
FutureWarning: `distplot` is a deprecated function and will be removed
in a future version. Please adapt your code to use either `displot` (a
figure-level function with similar flexibility) or `histplot` (an
axes-level function for histograms).
warnings.warn(msg, FutureWarning)

Data Pre-Processing
Encoding the categorical features
# encoding sex column
ins.replace({'sex':{'male':0,'female':1}}, inplace=True)

3 # encoding 'smoker' column


ins.replace({'smoker':{'yes':0,'no':1}}, inplace=True)

# encoding 'region' column


ins.replace({'region':
{'southeast':0,'southwest':1,'northeast':2,'northwest':3}},
inplace=True)

sns.heatmap(ins,cmap='coolwarm')
<matplotlib.axes._subplots.AxesSubplot at 0x7fb7ccae5940>

plt.figure(figsize=(10,10))
sns.heatmap(ins,annot=True)

<matplotlib.axes._subplots.AxesSubplot at 0x7fb7ccc11e20>
q1=ins.bmi.quantile(0.25)
q3=ins.bmi.quantile(0.75)
iqr=q3-q1
insurance=ins[(ins.bmi>=q1-1.5*iqr)&(ins.bmi<=q3+1.5*iqr)]

q1=ins.past_consultations.quantile(0.25)
q3=ins.past_consultations.quantile(0.75)
iqr=q3-q1
insurance=ins[(ins.past_consultations>=q1-
1.5*iqr)&(ins.past_consultations)]

q1=ins.Hospital_expenditure.quantile(0.25)
q3=ins.Hospital_expenditure.quantile(0.75)
iqr=q3-q1
insurance=ins[(ins.Hospital_expenditure>=q1-
1.5*iqr)&(ins.Hospital_expenditure)]

q1=ins.Anual_Salary.quantile(0.25)
q3=ins.Anual_Salary.quantile(0.75)
iqr=q3-q1
insurance=ins[(ins.Anual_Salary>=q1-1.5*iqr)&(ins.Anual_Salary)]

from statsmodels.stats.outliers_influence import


variance_inflation_factor
col_list=[]
for col in ins.columns:
if((ins[col].dtype !='object')&(col !='charges')):
col_list.append(col)
x=ins[col_list]
vif_data=pd.DataFrame()
vif_data['feature']=x.columns
vif_data['VIF']=[variance_inflation_factor(x.values,i) for i in
range(len(x.columns))]
print(vif_data)

feature VIF
0 age 12.812811
1 sex 2.019163
2 bmi 31.954395
3 children 1.961089
4 smoker 12.461581
5 Claim_Amount 6.920514
6 past_consultations 8.401528
7 num_of_steps 71.348797
8 Hospital_expenditure 18.764176
9 NUmber_of_past_hospitalizations 16.161025
10 Anual_Salary 29.406733
11 region 2.894132

ins=ins.drop(['num_of_steps'],axis=1)

col_list=[]
for col in ins.columns:
if((ins[col].dtype !='object')&(col !='charges')):
col_list.append(col)
x=ins[col_list]
vif_data=pd.DataFrame()
vif_data['feature']=x.columns
vif_data['VIF']=[variance_inflation_factor(x.values,i) for i in
range(len(x.columns))]
print(vif_data)

feature VIF
0 age 11.462460
1 sex 1.985836
2 bmi 19.410639
3 children 1.948997
4 smoker 12.148741
5 Claim_Amount 6.471067
6 past_consultations 7.634564
7 Hospital_expenditure 18.466162
8 NUmber_of_past_hospitalizations 14.198316
9 Anual_Salary 29.243856
10 region 2.614750

ins=ins.drop(['bmi'],axis=1)

col_list=[]
for col in ins.columns:
if((ins[col].dtype !='object')&(col !='charges')):
col_list.append(col)
x=ins[col_list]
vif_data=pd.DataFrame()
vif_data['feature']=x.columns
vif_data['VIF']=[variance_inflation_factor(x.values,i) for i in
range(len(x.columns))]
print(vif_data)

feature VIF
0 age 11.313883
1 sex 1.980936
2 children 1.947983
3 smoker 7.484382
4 Claim_Amount 6.140250
5 past_consultations 7.107175
6 Hospital_expenditure 18.465946
7 NUmber_of_past_hospitalizations 13.951702
8 Anual_Salary 28.828791
9 region 2.594851

ins=ins.drop(['Hospital_expenditure'],axis=1)

col_list=[]
for col in ins.columns:
if((ins[col].dtype !='object')&(col !='charges')):
col_list.append(col)
x=ins[col_list]
vif_data=pd.DataFrame()
vif_data['feature']=x.columns
vif_data['VIF']=[variance_inflation_factor(x.values,i) for i in
range(len(x.columns))]
print(vif_data)

feature VIF
0 age 11.301633
1 sex 1.980877
2 children 1.947983
3 smoker 7.018653
4 Claim_Amount 6.135856
5 past_consultations 7.087869
6 NUmber_of_past_hospitalizations 13.263038
7 Anual_Salary 5.024536
8 region 2.591447

ins=ins.drop(['NUmber_of_past_hospitalizations'],axis=1)

col_list=[]
for col in ins.columns:
if((ins[col].dtype !='object')&(col !='charges')):
col_list.append(col)
x=ins[col_list]
vif_data=pd.DataFrame()
vif_data['feature']=x.columns
vif_data['VIF']=[variance_inflation_factor(x.values,i) for i in
range(len(x.columns))]
print(vif_data)

feature VIF
0 age 9.157284
1 sex 1.954003
2 children 1.819917
3 smoker 7.000894
4 Claim_Amount 5.908022
5 past_consultations 6.895883
6 Anual_Salary 3.820693
7 region 2.546124

ins=ins.drop(['age'],axis=1)

col_list=[]
for col in ins.columns:
if((ins[col].dtype !='object')&(col !='charges')):
col_list.append(col)
x=ins[col_list]
vif_data=pd.DataFrame()
vif_data['feature']=x.columns
vif_data['VIF']=[variance_inflation_factor(x.values,i) for i in
range(len(x.columns))]
print(vif_data)

feature VIF
0 sex 1.945793
1 children 1.815134
2 smoker 4.890261
3 Claim_Amount 5.701343
4 past_consultations 6.453334
5 Anual_Salary 3.541460
6 region 2.530922

ins=ins.drop(['past_consultations'],axis=1)

col_list=[]
for col in ins.columns:
if((ins[col].dtype !='object')&(col !='charges')):
col_list.append(col)
x=ins[col_list]
vif_data=pd.DataFrame()
vif_data['feature']=x.columns
vif_data['VIF']=[variance_inflation_factor(x.values,i) for i in
range(len(x.columns))]
print(vif_data)

feature VIF
0 sex 1.935313
1 children 1.796418
2 smoker 4.269236
3 Claim_Amount 5.311758
4 Anual_Salary 2.4178745
region 2.481157

Splitting the Features and Target


x=ins.loc[:,
['children','sex','smoker','Claim_Amount','Anual_Salary','region']]
y=ins.iloc[:,-1]

Removal Of DuplicacY Data


# Python code to remove duplicate elements
def Remove(ins):
final_list = []
for num in ins:
if num not in final_list:
final_list.append(num)
return final_list

# Driver Code
duplicate = [ins]
print(Remove(duplicate))
[ sex children smoker Claim_Amount Anual_Salary region
charges
0 0 0.0 1 29087.54313 5.578497e+07 0
1121.87390
1 0 0.0 1 39053.67437 1.370089e+07 0
1131.50660
2 0 0.0 1 39023.62759 7.352311e+07 0
1135.94070
3 0 0.0 1 28185.39332 7.581968e+07 0
1136.39940
4 0 0.0 1 14697.85941 2.301232e+07 0
1137.01100
... ... ... ... ... ... ...
...
1333 1 0.0 0 63142.25346 3.101107e+09 3
55135.40209
1334 1 1.0 0 43419.95227 3.484216e+09 2
58571.07448
1335 0 3.0 0 52458.92353 3.640807e+09 3
60021.39897
1336 0 0.0 0 69927.51664 4.006359e+09 0
62592.87309
1337 1 0.0 0 63982.80926 4.117197e+09 0
63770.42801

[1338 rows x 7 columns]]

pd.DataFrame(ins.iloc[:,-1])

charges
0 1121.87390
1 1131.50660
2 1135.94070
3 1136.39940
4 1137.01100
... ...
1333 55135.40209
1334 58571.07448
1335 60021.39897
1336 62592.87309
1337 63770.42801

[1338 rows x 1 columns]

x_train, x_test, y_train, y_test = train_test_split(x, y,


test_size=0.8, random_state=0)

print(x_train.shape)
print(y_train.shape)
print(x_test.shape)
print(y_test.shape)
(267, 6)
(267,)
(1071, 6)
(1071,)

Splitting the data into Training data & Testing Data


Model Training
Linear Regression
# loading the Linear Regression model
regressor = LinearRegression()

regressor.fit(x_train, y_train)

LinearRegression()

Training Data
# prediction on training data
training_data_prediction =regressor.predict(x_train)

# R squared value
r2_train = metrics.r2_score(y_train, training_data_prediction)
print('R squared vale : ', r2_train)

R squared vale : 0.9113630114171092

Testing Data
# prediction on test data
test_data_prediction =regressor.predict(x_test)

# R squared value
r2_test = metrics.r2_score(y_test, test_data_prediction)
print('R squared vale : ', r2_test)

R squared vale : 0.9165676647531819

input_data = (21,54,7,8,44,88)

# changing input_data to a numpy array


input_data_as_numpy_array = np.asarray(input_data)

# reshape the array


input_data_reshaped = input_data_as_numpy_array.reshape(1,-1)

prediction = regressor.predict(input_data_reshaped)
print(prediction)

print('The insurance cost is USD ', prediction[0])


[3520.503511]
The insurance cost is USD 3520.503511004813

/usr/local/lib/python3.8/dist-packages/sklearn/base.py:450:
UserWarning: X does not have valid feature names, but LinearRegression
was fitted with feature names
warnings.warn(

You might also like