Download as docx, pdf, or txt
Download as docx, pdf, or txt
You are on page 1of 34

EX.NO:01 WORKING WITH PANDAS DATA FRAME

SOURCE CODE:

# Experiment 1: load an uploaded CSV into a pandas DataFrame and inspect it.
# Written as notebook cells: a bare expression shows its value when it is the
# last statement of a cell.
import io

from google.colab import files

import numpy as np
import pandas as pd

# files.upload() opens the Colab file picker and returns {filename: bytes}.
uploaded = files.upload()
if not uploaded:
    raise ValueError("no file was uploaded")

# read_csv() needs a path or a file-like object, so wrap the uploaded bytes.
# The first uploaded file is used.
csv_name = next(iter(uploaded))
d = pd.read_csv(io.BytesIO(uploaded[csv_name]))

# Summary statistics of the numeric columns
d.describe()

# First five rows (default), then the first six
d.head()

d.head(6)

# Last five rows
d.tail()

# dtypes, ndim, shape, size and columns are attributes, not methods
d.dtypes

d.ndim

d.shape

d.size

d.columns

# Select a single column by label
# NOTE(review): assumes the CSV has 'Age' and 'Salary' columns - confirm
d["Age"]

# Sorting returns a new DataFrame; ascending order is the default
a = d.sort_values(by="Age")

a1 = d.sort_values(by="Age", ascending=True)

a1

# With inplace=True the call reorders d itself and returns None, so the
# sorted frame has to be read from d rather than from the return value.
d.sort_values(by="Age", ascending=True, inplace=True)
a2 = d

a2

# Boolean mask: keep only the rows whose salary exceeds 20000
b = d[d["Salary"] > 20000]

b

1
OUTPUT:

Out[1]:
Sl.No Employee Age Sex Salary
s
S
e
r
v
i
c
e

Teacher 30. female 2.0 20000.


0 0

Officestaff 20. male 5.0 15000.


0 0

Teacher 42. male NaN 35000.


0 0

Teacher 29. female 4.0 25000.


0 0

Officestaff 30. female 6.0 15000.


0 0

Officestaff 31. female 2.0 NaN


0

Teacher Na male 5.0 25000.


N 0

Teacher 45. female 2.0 20000.


0 0

Officestaff 38. male 6.0 15000.


0 0

10 Teacher 44. male NaN 35000.


0 0

10 11 Teacher 28. female 4.0 20000.


0 0

11 12 Officestaff 33. female 3.0 NaN


0

12 13 Officestaff 30. male 8.0 20000.


0 0

13 14 Teacher 41. male 1.0 10000.


0 0
14 15 Officestaff 31. female 2
6.0 25000.
0 0

3
Out[2]:

SlNo Age Service Salary

count 21.000000 20.00000 18.000000 19.000000


0

mean 11.000000 34.60000 4.222222 21842.1052


0 63

std 6.204837 7.949843 2.073802 7632.26453


9

min 1.000000 20.00000 1.000000 10000.0000


0 00

25% 6.000000 30.00000 2.000000 15000.0000


0 00

50% 11.000000 32.00000 4.500000 20000.0000


0 00

75% 16.000000 42.00000 6.000000 25000.0000


0 00

max 21.000000 45.00000 8.000000 35000.0000


0 00

Out[3]:

Age Sex Services Salary


SlNo
e
Servic
Teacher 30.0 femal 2.0 20000.0

e
Officestaff 20.0 male 5.0 15000.0

Teacher 42.0 male NaN 35000.0

Teacher 29.0 femal 4.0 25000.0

e
Officestaff 30.0 femal 6.0 15000.0

4
Out[4]:

SlN Employees Age Sex Service Salary


o

Teacher 30.0 femal 2.0 20000.


e 0

Officestaff 20.0 male 5.0 15000.


0

Teacher 42.0 male NaN 35000.


0

Teacher 29.0 femal 4.0 25000.


e 0

Officestaff 30.0 femal 6.0 15000.


e 0
Officestaff 31.0 femal 2.0 NaN
e

5
EX.NO:02 BASIC PLOTS USING MATPLOTLIB

SOURCE CODE:

1) PLOT(X,Y)

# Line plot of a sine wave using the matplotlib gallery style.
import matplotlib.pyplot as plt
import numpy as np

plt.style.use('_mpl-gallery')

# make data: 100 evenly spaced x values and a sine wave centred on y = 4
x = np.linspace(0, 10, 100)
y = 4 + 2 * np.sin(2 * x)

# plot
fig, ax = plt.subplots()

ax.plot(x, y, linewidth=2.0)

# Fix both axes to the 0-8 window with integer ticks from 1 to 7
ax.set(xlim=(0, 8), xticks=np.arange(1, 8),
       ylim=(0, 8), yticks=np.arange(1, 8))

plt.show()

OUTPUT:

6
2) SCATTER(X,Y)

# Scatter plot of normally distributed points with random sizes and colours.
import matplotlib.pyplot as plt
import numpy as np

plt.style.use('_mpl-gallery')

# make the data (seeded so the figure is reproducible)
np.random.seed(3)
x = 4 + np.random.normal(0, 2, 24)
y = 4 + np.random.normal(0, 2, len(x))

# size and color: one random marker size and one colour value per point
sizes = np.random.uniform(15, 80, len(x))
colors = np.random.uniform(15, 80, len(x))

# plot
fig, ax = plt.subplots()

# vmin/vmax set the range of colour values mapped onto the colormap
ax.scatter(x, y, s=sizes, c=colors, vmin=0, vmax=100)

ax.set(xlim=(0, 8), xticks=np.arange(1, 8),
       ylim=(0, 8), yticks=np.arange(1, 8))

plt.show()

OUTPUT:

7
3) BAR(X,HEIGHT)

# Bar chart of eight random heights.
import matplotlib.pyplot as plt
import numpy as np

plt.style.use('_mpl-gallery')

# make data: bar centres at 0.5, 1.5, ... and seeded random heights in [2, 7)
np.random.seed(3)
x = 0.5 + np.arange(8)
y = np.random.uniform(2, 7, len(x))

# plot
fig, ax = plt.subplots()

# width=1 makes the bars touch; the white edge keeps them visually separate
ax.bar(x, y, width=1, edgecolor="white", linewidth=0.7)

ax.set(xlim=(0, 8), xticks=np.arange(1, 8),
       ylim=(0, 8), yticks=np.arange(1, 8))

plt.show()

OUTPUT:

4) STEM(X,Y)
# Stem plot of eight random values.
import matplotlib.pyplot as plt
import numpy as np

plt.style.use('_mpl-gallery')

# make data: positions 0.5, 1.5, ... and seeded random heights in [2, 7)
np.random.seed(3)
x = 0.5 + np.arange(8)
y = np.random.uniform(2, 7, len(x))

# plot
fig, ax = plt.subplots()

ax.stem(x, y)

ax.set(xlim=(0, 8), xticks=np.arange(1, 8),
       ylim=(0, 8), yticks=np.arange(1, 8))

plt.show()

OUTPUT:

9
EX.NO:3 FREQUENCY DISTRIBUTION, AVERAGES, VARIABILITY

PROGRAM:

A histogram is a plot of the frequency distribution of a numeric array, obtained by splitting the array into small, equal-sized bins.

1) FREQUENCY DISTRIBUTION

# Text histogram: count 100 random integers into ten equal-width bins and
# print one row of asterisks per bin.
import numpy as np

# 100 random integers drawn from [0, 100)
x = np.random.randint(low=0, high=100, size=100)

# Compute frequency and bins: ten bins over 0-100 give eleven bin edges
frequency, bins = np.histogram(x, bins=10, range=[0, 100])

# Pretty Print: bins[1:] are the upper edges, paired with each bin's count
for b, f in zip(bins[1:], frequency):
    print(round(b, 1), ''.join(np.repeat('*', f)))

OUTPUT:

10.0* ** * ** *

20.0* * ** * ** ***

30.0* * ** * ** *** * **

40.0* ** * ** * *

50.0* * ** * ** **

60.0* * ** * ** **

70.0* * ** * ** *** * *

80.0* ** * ** *

90.0* * ** * ** *** * ** * * **

100.0* ** * ** * *

2) AVERAGE, VARIANCE, STANDARD DEVIATION

# Mean, standard deviation and variance of the integers 0-9, with the spread
# measures computed from their definitions.
import numpy as np

# Original array
array = np.arange(10)
print(array)

# Arithmetic mean
r1 = np.average(array)
print("\nMean:", r1)

# Standard deviation: square root of the mean squared deviation from the mean
r2 = np.sqrt(np.mean((array - np.mean(array)) ** 2))
print("\nstd:", r2)

# Variance: mean squared deviation from the mean (divides by n, not n - 1)
r3 = np.mean((array - np.mean(array)) ** 2)
print("\nvariance:", r3)

OUTPUT:

[0 1 2 3 4 5 6 7 8 9]

Mean: 4.5

std: 2.8722813232690143

variance: 8.25

12
EX.NO:4 NORMAL CURVES, CORRELATION AND SCATTER PLOTS, CORRELATION COEFFICIENT

PROGRAM:

1) NORMAL CURVE

# Plot the standard normal curve and shade the area between two spec limits.
import numpy as np
import matplotlib.pyplot as plt
from scipy.stats import norm

# If using a Jupyter notebook, include the magic: %matplotlib inline

# define constants: mean, standard deviation and the lower/upper limits
mu = 998.8
sigma = 73.10
x1 = 900
x2 = 1100

# calculate the z-transform of both limits
z1 = (x1 - mu) / sigma
z2 = (x2 - mu) / sigma

# range of x in spec
x = np.arange(z1, z2, 0.001)

# entire range of x, both in and out of spec
x_all = np.arange(-10, 10, 0.001)

# mean = 0, stddev = 1, since Z-transform was calculated
y = norm.pdf(x, 0, 1)
y2 = norm.pdf(x_all, 0, 1)

# build the plot
fig, ax = plt.subplots(figsize=(9, 6))

# The style sheet name is a single word
plt.style.use('fivethirtyeight')

ax.plot(x_all, y2)

# Darker fill for the in-spec region, lighter fill under the whole curve
ax.fill_between(x, y, 0, alpha=0.3, color='b')
ax.fill_between(x_all, y2, 0, alpha=0.1)

ax.set_xlim([-4, 4])
ax.set_xlabel('# of Standard Deviations Outside the Mean')
ax.set_yticklabels([])
ax.set_title('Normal Curve')

plt.savefig('normal_curve.png', dpi=72, bbox_inches='tight')

plt.show()

OUTPUT:

2) SCATTER PLOT
# Scatter plot of a noisy increasing series.
import matplotlib.pyplot as plt
import numpy as np

# Figure size and dpi are read when a figure is created, so configure them
# before the first plotting call.
plt.rcParams.update({'figure.figsize': (10, 8), 'figure.dpi': 100})

# x is 0..49; y adds a random integer offset in [0, 30) to each x
x = np.arange(50)
y = x + np.random.randint(0, 30, 50)

plt.scatter(x, y)
plt.title('simple scatter plot')
plt.xlabel('x value')
plt.ylabel('y value')
plt.show()
OUTPUT:

16
3) CORRELATION WITH SCATTER PLOT

# Three scatter series against the same x, each labelled with its Pearson
# correlation coefficient.
import matplotlib.pyplot as plt
import numpy as np

x = np.random.randn(100)

# y1 and y2 are exact linear functions of x; y3 is independent noise
y1 = x * 5 + 9
y2 = -5 + x
y3 = np.random.randn(100)

plt.rcParams.update({'figure.figsize': (10, 8), 'figure.dpi': 100})

# np.corrcoef returns the 2x2 correlation matrix; [0, 1] is corr(x, y)
plt.scatter(x, y1, label=f'y1, correlation={np.round(np.corrcoef(x, y1)[0, 1], 2)}')
plt.scatter(x, y2, label=f'y2, correlation={np.round(np.corrcoef(x, y2)[0, 1], 2)}')
plt.scatter(x, y3, label=f'y3, correlation={np.round(np.corrcoef(x, y3)[0, 1], 2)}')

plt.title('scatter plot and correlations')
plt.legend()
plt.show()

17
OUTPUT:

18
EX.NO:5 REGRESSION

SOURCE CODE:

# Fit an ordinary least squares model with the statsmodels formula API and
# draw a regression line with seaborn.
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import statsmodels.formula.api as smf  # provides smf.ols used below

# Create a sample DataFrame
# NOTE: independent_var2 is independent_var1 + 2 and independent_var3 is
# 2 * independent_var1, so the three predictors are exactly collinear.
df = pd.DataFrame({
    'independent_var1': [1, 2, 3, 4, 5],
    'independent_var2': [3, 4, 5, 6, 7],
    'independent_var3': [2, 4, 6, 8, 10],
    'dependent_var': [5, 7, 9, 11, 13],
})

# Fit the regression model
model = smf.ols(
    formula='dependent_var ~ independent_var1 + independent_var2 + independent_var3',
    data=df,
).fit()

# Print the model summary
print(model.summary())

# Visualize the regression line (ci=None suppresses the confidence band)
sns.lmplot(x='independent_var1', y='dependent_var', data=df, ci=None)
plt.show()

OUTPUT:

OUT[1]:

OLS Regression Results

==============================================================================
Dep. Variable: Model:dependent_var
Method: Date: OLS R-squared:1.000
Time: Least Squares Thu, 30Adj.
MarR-squared:1.000
2023
No. Observations: Df Residuals:
08:40:16 F-statistic:7.606e+30 Prob (F-statistic):1.05e-46 Log-Likelihood:16
Df Model: -321.5
5 AIC: BIC:
3 -322.2
1
Covariance Type:nonrobust
19
====================================================================================
coefstd errtP>|t|[0.0250.975]
independent_var1 independent_var2
0.15383.17e-164.86e+14 0.000 0.1540.154
1.2308 9e-16 1.37e+15 0.0001.2311.231
independent_var30.3077 6.34e-164.86e+140.0000.3080.308
==============================================================================
Omnibus:nanDurbin-Watson:0.400
Prob(Omnibus):nanJarque-Bera (JB):0.770
Skew:-0.844Prob(JB):0.680
Kurtosis:2.078Cond. No4.43e+16
==============================================================================

OUT[2]:

20
EXNO: 6 IMPLEMENT Z-TEST USING STATSMODELS

SOURCE CODE:

1) ONE SAMPLE Z-TEST

from statsmodels.stats.weightstats import ztest

# IQ scores recorded for 20 patients
data = [
    88, 92, 94, 94, 96, 97, 97, 97, 99, 99,
    105, 109, 109, 109, 110, 112, 112, 113, 114, 115,
]

# One-sample z-test of the sample mean against a hypothesised mean of 100
ztest(data, value=100)

2) TWO SAMPLE Z-TEST

from statsmodels.stats.weightstats import ztest as ztest

# enter IQ levels for 20 individuals from each city
cityA = [82, 84, 85, 89, 91, 91, 92, 94, 99, 99,
         105, 109, 109, 109, 110, 112, 112, 113, 114, 114]

cityB = [90, 91, 91, 91, 95, 95, 99, 99, 108, 109,
         109, 114, 115, 116, 117, 117, 128, 129, 130, 133]

# perform two sample z-test; value=0 tests for no difference between the means
ztest(cityA, cityB, value=0)

21
OUTPUT:

ONE SAMPLE Z-TEST

(1.5976240527147705,0.1101266701438426)

TWO SAMPLE Z-TEST


(-1.9953236073282115,0.046007596761332065)

22
EXNO:7 IMPLEMENTATION OF T-TEST USING SCIPY

SOURCE CODE:

1) ONE SAMPLE T-TEST

import scipy.stats as stats

# Twelve sample measurements
data = [
    14, 14, 16, 13, 12, 17,
    15, 14, 15, 13, 15, 14,
]

# One-sample t-test of the sample mean against a population mean of 15
stats.ttest_1samp(data, popmean=15)

2) TWO SAMPLE T-TEST

import numpy as np
import scipy.stats as stats

# Twenty measurements for each of the two groups
group1 = np.array([
    14, 15, 15, 16, 13, 8, 14, 17, 16, 14,
    19, 20, 21, 15, 15, 16, 16, 13, 14, 12,
])
group2 = np.array([
    15, 17, 14, 17, 14, 8, 12, 19, 19, 14,
    17, 22, 24, 16, 13, 16, 13, 18, 15, 13,
])

# Show both variances before running the equal-variance test
print(*(np.var(sample) for sample in (group1, group2)))

# Independent two-sample t-test with equal variances assumed
stats.ttest_ind(group1, group2, equal_var=True)

23
OUTPUT:

ONE SAMPLE T-TEST

TtestResult(statistic=-1.6848470783484626,pvalue=0.12014460742498101,df=11)

TWO SAMPLE T-TEST

7.73,12.26

TtestResult(statistic=-0.6337,pvalue=0.53005)

24
EXNO:8 ANOVA TABLE

SOURCE CODE:

#enter exam scores for each group

from scipy.stats import f_oneway

# Exam scores, ten per group
group1 = [
    85, 86, 88, 75, 78,
    94, 98, 79, 71, 80,
]
group2 = [
    91, 92, 93, 85, 87,
    84, 82, 88, 95, 96,
]
group3 = [
    79, 78, 88, 94, 92,
    85, 83, 85, 82, 81,
]

# One-way ANOVA across the three groups
f_oneway(group1, group2, group3)

# Two-way ANOVA of plant height against watering frequency and sun exposure.
import numpy as np
import pandas as pd
import statsmodels.api as sm
from statsmodels.formula.api import ols

# create data: 2 watering levels x 3 sun levels x 5 plants = 30 rows
df = pd.DataFrame({
    'water': np.repeat(['daily', 'weekly'], 15),
    'sun': np.tile(np.repeat(['low', 'med', 'high'], 5), 2),
    'height': [6, 6, 6, 5, 6, 5, 5, 6, 4, 5, 6, 6, 7, 8, 7,
               3, 4, 4, 4, 5, 4, 4, 4, 4, 4, 5, 6, 6, 7, 8],
})

# view rows of data
df

# perform two-way ANOVA: both main effects plus their interaction
model = ols('height ~ C(water) + C(sun) + C(water):C(sun)', data=df).fit()
sm.stats.anova_lm(model, typ=2)

25
OUTPUT:

OUT[1]:
F_onewayResult(statistic=2.3575322551335636, pvalue=0.11384795345837218)

OUT[2]:

water sun height

0 daily low 6

1 daily low 6

2 daily low 6

3 daily low 5

4 daily low 6

5 daily med 5

6 daily med 5

7 daily med 6

8 daily med 4

9 daily med 5

sum_sq df F PR(>F)

C(water) 8.533333 1.0 16.0000 0.000527

C(sun) 24.866667 2.0 23.3125 0.000002

C(water):C(sun) 2.466667 2.0 2.3125 0.120667

Residual 12.800000 24.0 NaN NaN

26
EXNO:9 BUILDING AND VALIDATING LINEAR MODELS

SOURCE CODE:

# Fit a simple linear regression and report its error and goodness of fit.
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

# create sample data
x = np.array([1, 2, 3, 4, 5])
y = np.array([2, 4, 5, 4, 6])

# reshape data into single-column 2-D arrays for the linear regression model
x = x.reshape(-1, 1)
y = y.reshape(-1, 1)

# create linear regression model
model = LinearRegression()

# fit model to data
model.fit(x, y)

# predict on the same data the model was fitted to (there is no separate
# test set here, so the scores below measure in-sample fit)
y_pred = model.predict(x)

# calculate mean squared error and r-squared values
mse = mean_squared_error(y, y_pred)
r2 = r2_score(y, y_pred)

# print results
print("Mean squared error: ", mse)
print("R-squared value: ", r2)

27
OUTPUT:

Mean squared error: 0.4799999999999998


R-squared value: 0.7272727272727274

28
EXNO:10 BUILDING AND VALIDATING LOGISTIC MODELS

SOURCE CODE:

# Libraries: pandas for the table, scikit-learn for the model and its metrics
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    accuracy_score,
    confusion_matrix,
    f1_score,
    precision_score,
    recall_score,
)
from sklearn.model_selection import train_test_split

# Ten sample records: age, income and a 0/1 default flag
data = {
    'age': [25, 32, 28, 45, 36, 52, 40, 55, 60, 48],
    'income': [50000, 75000, 60000, 55000, 70000,
               80000, 65000, 90000, 95000, 90000],
    'default': [0, 0, 0, 1, 1, 1, 0, 1, 1, 1],
}
df = pd.DataFrame(data)

# Features are age and income; the target is the default flag
X = df[['age', 'income']]
Y = df['default']

# Hold out 20% of the rows for testing; the fixed seed keeps the split stable
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.2, random_state=42
)

# Train the classifier on the training rows
Model = LogisticRegression()
Model.fit(X_train, Y_train)

# Predict the held-out rows
Y_pred = Model.predict(X_test)

# Score the predictions against the true labels
confusion = confusion_matrix(Y_test, Y_pred)
f1 = f1_score(Y_test, Y_pred)
recall = recall_score(Y_test, Y_pred)
precision = precision_score(Y_test, Y_pred)
accuracy = accuracy_score(Y_test, Y_pred)

# Report each score as a percentage, then the confusion matrix
print("Accuracy: {:.2f}%".format(accuracy * 100))
print("Precision: {:.2f}%".format(precision * 100))
print("Recall: {:.2f}%".format(recall * 100))
print("F1-score: {:.2f}%".format(f1 * 100))
print("Confusion Matrix:\n", confusion)

29
OUTPUT:

● The evaluation metrics

Accuracy: 100.00%

Precision: 100.00%

Recall: 100.00%

F1-score: 100.00%

Confusion Matrix:

[[1 0]

[0 1]]

30
EXNO: 11 TIME SERIES ANALYSIS

PROGRAM:

SINGLE TIME SERIES

# Single time series drawn as a seaborn line plot, first with default
# styling and then with custom aesthetics.
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# create DataFrame
df = pd.DataFrame({
    'date': ['1/2/2021', '1/3/2021', '1/4/2021', '1/5/2021',
             '1/6/2021', '1/7/2021', '1/8/2021'],
    'value': [4, 7, 8, 13, 17, 15, 21],
})

# create time series plot
sns.lineplot(x='date', y='value', data=df)

# To customise line colour and line width:
# create time series plot with custom aesthetics
sns.lineplot(x='date', y='value', data=df, linewidth=3, color='purple',
             linestyle='dashed').set(title='TimeSeries Plot')

# rotate x-axis labels by 15 degrees
plt.xticks(rotation=15)

# Two time series (one per company) drawn on a single seaborn line plot.
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# create DataFrame: four dates for company A followed by the same for B
df = pd.DataFrame({
    'date': ['1/1/2021', '1/2/2021', '1/3/2021', '1/4/2021',
             '1/1/2021', '1/2/2021', '1/3/2021', '1/4/2021'],
    'sales': [4, 7, 8, 13, 17, 15, 21, 28],
    'company': ['A', 'A', 'A', 'A', 'B', 'B', 'B', 'B'],
})

# plot multiple time series: hue draws one line per company
sns.lineplot(x='date', y='sales', hue='company', data=df)

32
OUTPUT:

OUT[1]:

<AxesSubplot:xlabel='date',ylabel='value'>

OUT[2]:

([0,1,2,3,4,5,6],
[Text(0,0,''),
Text(0, 0, ''),
Text(0, 0, ''),
Text(0, 0, ''),
Text(0, 0, ''),
Text(0, 0, ''),
Text(0, 0, '')])

33
OUT[3]:

<AxesSubplot:xlabel='date',ylabel='sales'>

34

You might also like