Data Science Practicals - Ipynb


BPHE Society's

AHMEDNAGAR COLLEGE, AHMEDNAGAR


DEPARTMENT OF COMPUTER SCIENCE
TYBSc(Computer Science)

CS 358 : DATA SCIENCE PRACTICALS

Name :F

Seat no : .

CONTENTS

Sr. No Assignment Title

1 The Data Science Environment

2 Statistical Data Analysis

3 Data Preprocessing

4 Data Visualization
ASSIGNMENT 1 : THE DATA SCIENCE ENVIRONMENT

SET A

In [ ]:
#Q1. Create and view a data frame
#import the library
import pandas as pd
import numpy as np
#Enter Data
data_values={'Name':['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J'],
'Age' : [26, 28, 20, 15, 20, 16, 18, 17, 22, 21],
'Percentage' : [56,62,42,74,32,63,74,84,96,21]
}
#Create the dataframe from the dictionary of values
data=pd.DataFrame.from_dict(data_values)
data #To view the data frame

Out[ ]: Name Age Percentage

0 A 26 56

1 B 28 62

2 C 20 42

3 D 15 74

4 E 20 32

5 F 16 63

6 G 18 74

7 H 17 84

8 I 22 96

9 J 21 21

In [ ]:
#Q2.
#print shape >> number of rows - columns
data.shape

Out[ ]: (10, 3)

In [ ]:
print("Size = {} \n Shape = {}\n Number of rows = {} \n Number of Columns = {}".
format(data.size, data.shape, data.shape[0], data.shape[1]))

Size = 30
Shape = (10, 3)
Number of rows = 10
Number of Columns = 3

In [ ]:
#data types of each feature
print("data types")
data.dtypes

data types
Out[ ]: Name object
Age int64
Percentage int64
dtype: object

In [ ]:
print("Feature Names = {}, {}, {}".
format(data.columns[0], data.columns[1], data.columns[2]))

Feature Names = Name, Age, Percentage

In [ ]:
print("Description of Data")
data.info()

Description of Data
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10 entries, 0 to 9
Data columns (total 3 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 Name 10 non-null object
1 Age 10 non-null int64
2 Percentage 10 non-null int64
dtypes: int64(2), object(1)
memory usage: 368.0+ bytes
In [ ]:
#Number of columns with null entries = 0
#Number of columns with numeric data = 2
#Number of columns with categorical data = 1
#Q3. obtaining basic statistical details of the data
data.describe(include = "all")

Out[ ]: Name Age Percentage

count 10 10.000000 10.000000

unique 10 NaN NaN

top C NaN NaN

freq 1 NaN NaN

mean NaN 20.300000 60.400000

std NaN 4.191261 23.381854

min NaN 15.000000 21.000000

25% NaN 17.250000 45.500000

50% NaN 20.000000 62.500000

75% NaN 21.750000 74.000000

max NaN 28.000000 96.000000
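
The column counts noted in the comments above can also be derived directly from the data frame instead of being read off data.info(); a minimal sketch (not part of the assignment, reusing the data frame from Q1):

In [ ]:
#Sketch: compute the same column counts programmatically
print("Columns with null entries =", data.isnull().any().sum())
print("Numeric columns =", data.select_dtypes(include='number').shape[1])
print("Categorical columns =", data.select_dtypes(include='object').shape[1])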

In [ ]:
# Mean Age = 20.3 yrs ; Mean % = 60.4 %
# Standard Deviation : sd(Age) = 4.191261 ;sd(%) = 23.381854
# Minimum Age =15 yrs ; Maximum Age = 28 yrs
# Minimum % = 21% ; Maximum % = 96%

#Q4. Adding 5 rows and 1 column


data.loc[10] = ['K',21,56 ]
data.loc[11] = ['L',21,None]
data.loc[12] = ['M',None, 45]
data.loc[13] = ['K',21,56]
data.loc[14] = ['O',25,84]
data["Remarks"] = None
data #data display
Out[ ]:
Name Age Percentage Remarks

0 A 26 56 None

1 B 28 62 None

2 C 20 42 None

3 D 15 74 None

4 E 20 32 None

5 F 16 63 None

6 G 18 74 None

7 H 17 84 None

8 I 22 96 None

9 J 21 21 None

10 K 21 56 None

11 L 21 None None

12 M None 45 None

13 K 21 56 None

14 O 25 84 None

In [ ]:
#Q5.
print("Number of Observations = ", len(data.index))
print(" \nTotal missing values in a DataFrame : \n\n",
data.isnull().sum().sum())
print(" \nTotal missing values in a DataFrame : \n\n",
data.isnull().sum().sum())
print(data.duplicated().value_counts()) #number of duplicate values

Number of Observations = 15

Total missing values in a DataFrame :

17
False 14
True 1
dtype: int64
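
To see where these 17 missing values sit and which observation is duplicated, a short sketch (same data frame as above):

In [ ]:
#Sketch: per-column missing counts and the duplicated row itself
print(data.isnull().sum())      #Remarks contributes 15 missing values, Age and Percentage one each
print(data[data.duplicated()])  #the repeated ('K', 21, 56) observation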

In [ ]:
#duplicate observations = 1
#Q6. Removing a column and missing values
data2=data.drop(columns='Remarks')
data2=data2.dropna(axis=0)
#print modified data
data2

Out[ ]: Name Age Percentage

0 A 26 56

1 B 28 62

2 C 20 42

3 D 15 74

4 E 20 32

5 F 16 63

6 G 18 74

7 H 17 84

8 I 22 96

9 J 21 21

10 K 21 56

13 K 21 56

14 O 25 84
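
Note that dropna() removes the rows with missing Age/Percentage values but keeps the duplicated ('K', 21, 56) observation at index 13. If duplicates should also be removed, a sketch:

In [ ]:
#Sketch: drop duplicate observations, keeping the first occurrence (index 10)
data3 = data2.drop_duplicates()
data3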

In [ ]:
#Q7. Line plot
import matplotlib.pyplot as plt
data2.plot(x="Name",y="Percentage",
title="Line Plot of Name Vs Percentage")
plt.xlabel("Names")
plt.ylabel("Percentages")
plt.show()

In [ ]:
#Q8. Scatterplot
data2.plot.scatter(x='Name',y='Percentage',
title = "Scatterplot")
plt.show()
SET B

In [ ]:
#Q1.
from google.colab import files
data=files.upload()

Saving SOCR-HeightWeight.csv to SOCR-HeightWeight.csv

In [ ]:
import pandas as pd
data=pd.read_csv('SOCR-HeightWeight.csv')
data.head(10) #print first 10 rows

Out[ ]: Index Height(Inches) Weight(Pounds)

0 1 65.78331 112.9925

1 2 71.51521 136.4873

2 3 69.39874 153.0269

3 4 68.21660 142.3354

4 5 67.78781 144.2971

5 6 68.69784 123.3024

6 7 69.80204 141.4947

7 8 70.01472 136.4623

8 9 67.90265 112.3723

9 10 66.78236 120.6672

In [ ]:
data.tail(10) #print last 10 rows

Out[ ]: Index Height(Inches) Weight(Pounds)

24990 24991 69.97767 125.3672

24991 24992 71.91656 128.2840

24992 24993 70.96218 146.1936

24993 24994 66.19462 118.7974

24994 24995 67.21126 127.6603

24995 24996 69.50215 118.0312

24996 24997 64.54826 120.1932

24997 24998 64.69855 118.2655

24998 24999 67.52918 132.2682

24999 25000 68.87761 124.8742

In [ ]:
data.sample(20) #print 20 random rows

Out[ ]: Index Height(Inches) Weight(Pounds)

19541 19542 70.61081 123.80970


2262 2263 70.16858 128.32730

21570 21571 67.74068 113.65620

2790 2791 64.68621 94.21971

8302 8303 64.70924 130.26420

12979 12980 64.96373 123.95810

21564 21565 65.91455 113.30490

23854 23855 67.46192 119.60950

8976 8977 66.14316 134.40430

22064 22065 70.56492 143.63680

4271 4272 64.69559 116.72410

21254 21255 69.10296 143.96990

7581 7582 67.97280 123.63680

531 532 67.14847 128.58150

24211 24212 68.04343 114.95530

23597 23598 67.65681 110.02110

21629 21630 68.04764 140.07170

21042 21043 67.79875 130.18870

17338 17339 66.91180 138.93730

17868 17869 66.14494 141.46120

In [ ]:
#Q2.
print("Size = {} \n Shape of DataFrame Object = {}\n Number of rows = {} \n Number of Columns = {}".
format(data.size, data.shape, data.shape[0], data.shape[1]))
print("\n Datatypes of dataframe object")
data.dtypes

Size = 75000
Shape of DataFrame Object = (25000, 3)
Number of rows = 25000
Number of Columns = 3

Datatypes of dataframe object


Out[ ]: Index int64
Height(Inches) float64
Weight(Pounds) float64
dtype: object

In [ ]:
#Q3.
data.describe() #basic statistical details

Out[ ]: Index Height(Inches) Weight(Pounds)

count 25000.000000 25000.000000 25000.000000

mean 12500.500000 67.993114 127.079421

std 7217.022701 1.901679 11.660898

min 1.000000 60.278360 78.014760

25% 6250.750000 66.704397 119.308675

50% 12500.500000 67.995700 127.157750

75% 18750.250000 69.272958 134.892850

max 25000.000000 75.152800 170.924000

In [ ]:
#Mean Height = 67.9931 Inches ; Mean Weight = 127.0794 Pounds
#sd(Height) = 1.9017 ; sd(Weight) = 11.6609
#Minimum Height = 60.2784 Inches ; Minimum Weight = 78.0148 Pounds
#Maximum Height = 75.1528 Inches ; Maximum Weight = 170.924 Pounds

#Q4.
print("\n Description of Data")
data.info()
print("\n Number of Observations = ", len(data.index))
print(" \nTotal missing values in a DataFrame = ",data.isnull().sum().sum())

print("Number of duplicate values \n ", data.duplicated().value_counts())


Description of Data
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25000 entries, 0 to 24999
Data columns (total 3 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 Index 25000 non-null int64
1 Height(Inches) 25000 non-null float64
2 Weight(Pounds) 25000 non-null float64
dtypes: float64(2), int64(1)
memory usage: 586.1 KB

Number of Observations = 25000

Total missing values in a DataFrame = 0


Number of duplicate values
False 25000
dtype: int64

In [ ]:
#Q5.
#Add column "BMI"
data2=data.assign(BMI=data['Weight(Pounds)']/(data['Height(Inches)']*data['Height(Inches)']))
data2.head(1)

Out[ ]: Index Height(Inches) Weight(Pounds) BMI

0 1 65.78331 112.9925 0.026111

In [ ]:
#Q6.
print("Maximum BMI = ",max(data2['BMI']))
print("\n Minimum BMI = ",min(data2['BMI']))

Maximum BMI = 0.03701443692089851

Minimum BMI = 0.018591137267932455
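
These BMI values are small because the raw ratio Weight(Pounds)/Height(Inches)^2 is used. The conventional imperial BMI formula scales this ratio by 703; a sketch of that variant (an addition, not required by the assignment):

In [ ]:
#Sketch: imperial BMI = 703 * weight (lb) / height (in)^2
bmi_imperial = 703 * data2['Weight(Pounds)'] / data2['Height(Inches)']**2
print("Maximum BMI = ", bmi_imperial.max())
print("Minimum BMI = ", bmi_imperial.min())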

In [ ]:
#Q7.
data.plot(x='Weight(Pounds)',y='Height(Inches)',kind="scatter", title = "ScatterPlot of height vs weight ")
plt.show()
ASSIGNMENT 2 : STATISTICAL DATA ANALYSIS

SET A

In [ ]:
#Q1.
import numpy as np
array = np.array([[0,1],[2,3]])
print("\n Original flattened array: \n", array)
print(" \n Maximum Value of the above flattened array : \n ", np.max(array))
print(" \n Minimum Value of the above flattened array : \n ", np.min(array))

Original flattened array:


[[0 1]
[2 3]]

Maximum Value of the above flattened array :


3

Minimum Value of the above flattened array :


0

In [ ]:
#Q2.
import numpy as np
#Inserting the two data points
a=np.array((2,3))
b=np.array((4,5))
#Euclidean Distance
print("Euclidean Distance = ", np.linalg.norm(a-b))

Euclidean Distance = 2.8284271247461903

In [ ]:
#Q3. Create and view a data frame
#import the library
import pandas as pd
import numpy as np
import scipy.stats as s
#Enter Data
data_values={'Name':['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J'],
'Scores' : [56,62,42,74,32,63,74,84,96,21]
}
#Create the dataframe from the dictionary of values
data=pd.DataFrame.from_dict(data_values)
print(data) #To view the data frame
print("\n Mean Score = ",s.tmean(data["Scores"]) )
print("\n Maximum = ",max(data["Scores"]))
print("\n Minimum = ",min(data["Scores"]))
print("\n Range = ",
max(data["Scores"]) - min(data["Scores"]) )
q3,q1 = np.percentile(data["Scores"],[75,25])
print("\n Q3 = ", q3)
print("\n Q1 = ", q1)
print("\n IQR = ", q3 - q1)

Name Scores
0 A 56
1 B 62
2 C 42
3 D 74
4 E 32
5 F 63
6 G 74
7 H 84
8 I 96
9 J 21

Mean Score = 60.4

Maximum = 96

Minimum = 21

Range = 75

Q3 = 74.0

Q1 = 45.5

IQR = 28.5
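
The IQR is often used to flag outliers through the 1.5*IQR fences; a short sketch using the quartiles computed above:

In [ ]:
#Sketch: Tukey fences for outlier detection
lower = q1 - 1.5 * (q3 - q1)
upper = q3 + 1.5 * (q3 - q1)
print("Outliers = ", data["Scores"][(data["Scores"] < lower) | (data["Scores"] > upper)].tolist())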

In [ ]:
#Program to find Manhattan Distance between two points
import math
def manhattan(a, b):
    return sum(abs(val1 - val2) for val1, val2 in zip(a, b))
#consider any two points
a=[2,3]
b=[4,5]
print ("Points :",a,b)
print("\n Manhattan Distance = ", manhattan(a,b))

Points : [2, 3] [4, 5]

Manhattan Distance = 4

In [ ]:
#Q4. Program to find Manhattan distance between all pairs of points
import math
def manhattan(a, b, n):
    dist = 0
    for i in range(n):
        dist += abs(a[i] - b[i])
    return dist

In [ ]:
#Example
a=[3,5,5,6,5,4,3]
b=[-2,3,2,-5,2,3,-1]

n=len(a) #or len(b)


print("Manhattan Distance = ", manhattan(a,b,n))

Manhattan Distance = 29

In [ ]:
#Manhattan and Euclidean Distance
import scipy.spatial as sp

print("\n Manhattan Distance = ",sp.distance.minkowski(a,b,1))


print("\n Euclidean Distance = ",sp.distance.minkowski(a,b,2))

Manhattan Distance = 29.0

Euclidean Distance = 13.601470508735444

In [ ]:
#Q5.
import numpy as np
import matplotlib.pyplot as plt
n=np.array([0.5, 0.7, 1.0, 1.2, 1.3, 2.1])
b=np.array([0,1,2,3])
print("\n nums:",n)
print("\n bins:",b )
print("\n Result: \n",np.histogram(n,b))
print("\n")
plt.hist(n,b)
plt.show()

nums: [0.5 0.7 1. 1.2 1.3 2.1]

bins: [0 1 2 3]

Result:
(array([2, 3, 1]), array([0, 1, 2, 3]))
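
np.histogram uses half-open bins here ([0, 1), [1, 2)) with only the last bin closed on the right, which is why 1.0 falls in the second bin. A quick check of each value's bin (sketch):

In [ ]:
#np.digitize reports, for each value, the (1-based) index of the bin it falls into
print(np.digitize(n, b))   #expected: [1 1 2 2 2 3]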

In [ ]:
#Q6.Create and view a data frame
#import the library
import pandas as pd
import numpy as np
import scipy.stats as s
#Enter Data
data_values={'Name':['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J'],
'Graduation Percentage' : [56,62,42,74,32,63,74,84,96,21],
'Age' : [26, 28, 20, 15, 20, 16, 18, 17, 22, 21]
}
#Create the dataframe from the dictionary of values
data=pd.DataFrame.from_dict(data_values)
data #To view the data frame

Out[ ]: Name Graduation Percentage Age

0 A 56 26

1 B 62 28

2 C 42 20

3 D 74 15

4 E 32 20

5 F 63 16

6 G 74 18

7 H 84 17

8 I 96 22

9 J 21 21

In [ ]:
print("\n Average age of students = ",s.tmean(data["Age"]) )
print("\n Average Graduation Percentage = ",s.tmean(data["Graduation Percentage"]) )
print("\n All Basic Statistics of Data \n ")
data.describe(include='all')

Average age of students = 20.3

Average Graduation Percentage = 60.4

All Basic Statistics of Data

Out[ ]: Name Graduation Percentage Age

count 10 10.000000 10.000000

unique 10 NaN NaN

top C NaN NaN

freq 1 NaN NaN


mean NaN 60.400000 20.300000

std NaN 23.381854 4.191261

min NaN 21.000000 15.000000

25% NaN 45.500000 17.250000

50% NaN 62.500000 20.000000

75% NaN 74.000000 21.750000

max NaN 96.000000 28.000000

In [ ]:
print("\n Measures of Dispersion and Position in the Distribution")
r=max(data["Graduation Percentage"]) - min(data["Graduation Percentage"])
print("\n Value of Range in the Distribution = ", r)
s=round(data["Graduation Percentage"].std(),3)
print("Value of Standard Deviation in the Distribution = ", s)
v=round(data["Graduation Percentage"].var(),3)
print("Value of Variance in the Distribution = ", v)

Measures of Dispersion and Position in the Distribution

Value of Range in the Distribution = 75


Value of Standard Deviation in the Distribution = 23.382
Value of Variance in the Distribution = 546.711
SET B

In [ ]:
#Q1.
from google.colab import files
data=files.upload()

Saving iris.csv to iris.csv

In [ ]:
import pandas as pd
#Read csv file
data=pd.read_csv('iris.csv')

In [ ]:
data.sample(13)

Out[ ]: sepal.length sepal.width petal.length petal.width variety

92 5.8 2.6 4.0 1.2 Versicolor

137 6.4 3.1 5.5 1.8 Virginica

147 6.5 3.0 5.2 2.0 Virginica

84 5.4 3.0 4.5 1.5 Versicolor

30 4.8 3.1 1.6 0.2 Setosa

2 4.7 3.2 1.3 0.2 Setosa

4 5.0 3.6 1.4 0.2 Setosa

29 4.7 3.2 1.6 0.2 Setosa

19 5.1 3.8 1.5 0.3 Setosa

60 5.0 2.0 3.5 1.0 Versicolor

52 6.9 3.1 4.9 1.5 Versicolor

6 4.6 3.4 1.4 0.3 Setosa

14 5.8 4.0 1.2 0.2 Setosa

In [ ]:
from pandas.api.types import is_numeric_dtype
print("Minimum and Maximum for all numeric attributes\n")
for col in data.columns:
    if is_numeric_dtype(data[col]):
        print('%s:' % (col))
        print('\t Minimum = ', data[col].min())
        print('\t Maximum = ', data[col].max())

Minimum and Maximum for all numeric attributes

sepal.length:
Minimum = 4.3
Maximum = 7.9
sepal.width:
Minimum = 2.0
Maximum = 4.4
petal.length:
Minimum = 1.0
Maximum = 6.9
petal.width:
Minimum = 0.1
Maximum = 2.5

In [ ]:
#Q2.
print("Number of records for different variety/class attribute \n")
data['variety'].value_counts()

Number of records for different variety/class attribute

Out[ ]: Versicolor 50
Setosa 50
Virginica 50
Name: variety, dtype: int64

In [ ]:
#Q3.
import pandas as pd
from pandas.api.types import is_numeric_dtype
print("Iris Dataset : Column wise Mean and Median \n")
for col in data.columns:
    if is_numeric_dtype(data[col]):
        print('%s:' % (col))
        print('\t Mean = %.2f' % data[col].mean())
        print('\t Median = %.2f' % data[col].median())

Iris Dataset : Column wise Mean and Median

sepal.length:
Mean = 5.84
Median = 5.80
sepal.width:
Mean = 3.06
Median = 3.00
petal.length:
Mean = 3.76
Median = 4.35
petal.width:
Mean = 1.20
Median = 1.30

SET C

In [ ]:
#Q1. Program to find Minkowskii Distance between two points
from math import *
from decimal import Decimal
def nth_root(value, root):
    root_value = 1 / float(root)
    return round(Decimal(value) ** Decimal(root_value), 3)

def minkowski(a, b, n):
    return nth_root(sum(pow(abs(i - j), n) for i, j in zip(a, b)), n)

In [ ]:
a=[-1,5]
b=[2,4]
n=len(a) #OR root value
print("\n Minkowski Distance = ",minkowski(a,b,n))

Minkowski Distance = 3.162

In [ ]:
#Q2.
import numpy as np
a = np.arange(9).reshape((3,3))
print("Original flattened array:")
print(a)
print("Weighted average along the specified axis of the above flattened array:")
print(np.average(a, axis=1, weights=[1./4, 2./4, 2./4]))

Original flattened array:


[[0 1 2]
[3 4 5]
[6 7 8]]
Weighted average along the specified axis of the above flattened array:
[1.2 4.2 7.2]

In [ ]:
#Q3.
import numpy as np
x = np.array([0, 1, 3])
y = np.array([2, 4, 5])
print("\nOriginal array1:")
print(x)
print("\nOriginal array1:")
print(y)
print("\nCross-correlation of the said arrays:\n",np.cov(x, y))

Original array1:
[0 1 3]

Original array1:
[2 4 5]

Covariance matrix of the said arrays:
 [[2.33333333 2.16666667]
 [2.16666667 2.33333333]]

In [ ]:
#Q4. Wholesale Customers Data from UCI
from google.colab import files
data=files.upload()

Saving Wholesale customers data.csv to Wholesale customers data.csv

In [ ]:
import pandas as pd
#Read csv file
data=pd.read_csv('Wholesale customers data.csv')
data.describe()

Out[ ]: Channel Region Fresh Milk Grocery Frozen Detergents_Paper Delicassen

count 440.000000 440.000000 440.000000 440.000000 440.000000 440.000000 440.000000 440.000000

mean 1.322727 2.543182 12000.297727 5796.265909 7951.277273 3071.931818 2881.493182 1524.870455

std 0.468052 0.774272 12647.328865 7380.377175 9503.162829 4854.673333 4767.854448 2820.105937

min 1.000000 1.000000 3.000000 55.000000 3.000000 25.000000 3.000000 3.000000

25% 1.000000 2.000000 3127.750000 1533.000000 2153.000000 742.250000 256.750000 408.250000

50% 1.000000 3.000000 8504.000000 3627.000000 4755.500000 1526.000000 816.500000 965.500000

75% 2.000000 3.000000 16933.750000 7190.250000 10655.750000 3554.250000 3922.000000 1820.250000

max 2.000000 3.000000 112151.000000 73498.000000 92780.000000 60869.000000 40827.000000 47943.000000


In [ ]:
import pandas as pd
from pandas.api.types import is_numeric_dtype
print("Wholesale Customers Dataset : Column wise Mean for numeric attributes \n")
for col in data.columns:
    if is_numeric_dtype(data[col]):
        print('%s:' % (col))
        print('\t Mean = %.2f' % data[col].mean())
print(" \nCount total NaN at each column in a DataFrame : \n",
      data.isnull().sum())
print(" \nTotal number of missing values in the dataset : ",
      data.isnull().sum().sum())

Wholesale Customers Dataset : Column wise Mean for numeric attributes

Channel:
Mean = 1.32
Region:
Mean = 2.54
Fresh:
Mean = 12000.30
Milk:
Mean = 5796.27
Grocery:
Mean = 7951.28
Frozen:
Mean = 3071.93
Detergents_Paper:
Mean = 2881.49
Delicassen:
Mean = 1524.87

Count total NaN at each column in a DataFrame :


Channel 0
Region 0
Fresh 0
Milk 0
Grocery 0
Frozen 0
Detergents_Paper 0
Delicassen 0
dtype: int64
Total number of missing values in the dataset : 0

In [ ]:
#Q5.
from google.colab import files
data=files.upload()

Saving nursery.data.csv to nursery.data.csv

In [2]:
import pandas as pd
#Read csv file
data=pd.read_csv('nursery.data.csv')

In [5]:
data.head(5)

Out[5]: usual proper complete 1 convenient convenient.1 nonprob recommended recommend

0 usual proper complete 1 convenient convenient nonprob priority priority

1 usual proper complete 1 convenient convenient nonprob not_recom not_recom

2 usual proper complete 1 convenient convenient slightly_prob recommended recommend

3 usual proper complete 1 convenient convenient slightly_prob priority priority

4 usual proper complete 1 convenient convenient slightly_prob not_recom not_recom

In [10]:
#Group by proper
import numpy as np
data_by_proper=data.groupby('proper')
data_by_proper.count()

Out[10]: usual complete 1 convenient convenient.1 nonprob recommended recommend

proper

critical 2592 2592 2592 2592 2592 2592 2592 2592

improper 2592 2592 2592 2592 2592 2592 2592 2592


less_proper 2592 2592 2592 2592 2592 2592 2592 2592

proper 2591 2591 2591 2591 2591 2591 2591 2591

very_crit 2592 2592 2592 2592 2592 2592 2592 2592
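
The odd column names above ('complete 1', 'convenient.1', ...) indicate that nursery.data.csv has no header row, so the first record was taken as one. A sketch that reads the file with explicit attribute names (names assumed from the UCI nursery description):

In [ ]:
#Sketch: supply column names instead of letting the first record become the header
import pandas as pd
cols = ['parents', 'has_nurs', 'form', 'children', 'housing', 'finance', 'social', 'health', 'class']
nursery = pd.read_csv('nursery.data.csv', header=None, names=cols)
nursery.groupby('has_nurs').size()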

In [ ]:
#Q6.Create and view a data frame
#import the library
import pandas as pd
import numpy as np
import scipy.stats as s
#Enter Data
data_values={'Student' : ["1","2","3","4","5","6","7","8","9","10"],
'Subject 1':[41,62,35,15,21,65,84,75,42,95],
'Subject 2' : [56,62,42,74,32,63,74,84,96,21],
'Subject 3' : [26, 28, 20, 15, 20, 16, 18, 17, 22, 21],
'Subject 4' : [41,75,84,62,13,56,42,84,95,23],
'Subject 5' : [45,74,62,31,21,54,45,86,95,32]
}
#Create the dataframe from the dictionary of values
data=pd.DataFrame.from_dict(data_values)
data #To view the data frame

Out[ ]: Student Subject 1 Subject 2 Subject 3 Subject 4 Subject 5

0 1 41 56 26 41 45

1 2 62 62 28 75 74

2 3 35 42 20 84 62

3 4 15 74 15 62 31

4 5 21 32 20 13 21

5 6 65 63 16 56 54

6 7 84 74 18 42 45

7 8 75 84 17 84 86

8 9 42 96 22 95 95

9 10 95 21 21 23 32

In [ ]:
from pandas.api.types import is_numeric_dtype
from scipy.stats.mstats import gmean
import statistics as stat
print("Subject wise Mean \n")
for col in data.columns:
    if is_numeric_dtype(data[col]):
        print('%s:' % (col))
        print('\t Arithmetic Mean = %.2f' % data[col].mean())
        print('\t Geometric Mean = %.2f' % gmean(data[col]))
        print('\t Harmonic Mean = %.2f' % stat.harmonic_mean(data[col]))

Subject wise Mean

Subject 1:
Arithmetic Mean = 53.50
Geometric Mean = 46.35
Harmonic Mean = 38.71
Subject 2:
Arithmetic Mean = 60.40
Geometric Mean = 55.41
Harmonic Mean = 49.53
Subject 3:
Arithmetic Mean = 20.30
Geometric Mean = 19.93
Harmonic Mean = 19.58
Subject 4:
Arithmetic Mean = 57.50
Geometric Mean = 49.59
Harmonic Mean = 39.96
Subject 5:
Arithmetic Mean = 54.50
Geometric Mean = 49.33
Harmonic Mean = 44.27
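
As the output shows, every subject satisfies the classical ordering arithmetic mean >= geometric mean >= harmonic mean, which holds for any set of positive values. A quick check (sketch, reusing the imports above):

In [ ]:
#Sketch: verify AM >= GM >= HM for every numeric column
for col in data.columns:
    if is_numeric_dtype(data[col]):
        assert data[col].mean() >= gmean(data[col]) >= stat.harmonic_mean(data[col])
print("AM >= GM >= HM holds for all subjects")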

In [ ]:
#Q7.
from google.colab import files
data=files.upload()
Saving iris.csv to iris.csv

In [ ]:
import pandas as pd
#Read csv file
data=pd.read_csv('iris.csv')

In [ ]:
! pip install https://github.com/pandas-profiling/pandas-profiling/archive/master.zip

In [ ]:
import pandas as pd
data=pd.read_csv('iris.csv')

In [ ]:
import pandas_profiling
profile = data.profile_report(title="Statistal Data Analysis")
profile

Out[ ]:
Overview

Dataset statistics
 Number of variables              5
 Number of observations           150
 Missing cells                    0
 Missing cells (%)                0.0%
 Duplicate rows                   1
 Duplicate rows (%)               0.7%
 Total size in memory             6.0 KiB
 Average record size in memory    40.9 B

Variable types
 Numeric        4
 Categorical    1

Alerts
 Duplicates         Dataset has 1 (0.7%) duplicate rows
 High correlation   sepal.length is highly correlated with petal.length and petal.width
 High correlation   petal.length is highly correlated with sepal.length and petal.width
 High correlation   petal.width is highly correlated with sepal.length and petal.length

In [ ]:
#Saving the file
profile.to_file("Data Analysis.html")
ASSIGNMENT 3 : DATA PREPROCESSING

SET A

In [ ]:
from google.colab import files
data=files.upload()

Saving Data.csv to Data.csv

In [ ]:
import pandas as pd
import io
data = pd.read_csv('Data.csv',sep = ',')
data

Out[ ]: Country Age Salary Purchased

0 France 44.0 72000.0 No

1 Spain 27.0 48000.0 Yes

2 Germany 30.0 54000.0 No

3 Spain 38.0 61000.0 No

4 Germany 40.0 NaN Yes

5 France 35.0 58000.0 Yes

6 Spain NaN 52000.0 No

7 France 48.0 79000.0 Yes

8 Germany 50.0 83000.0 No


9 France 37.0 67000.0 Yes

In [ ]:
#Q1.a
data.describe()

Out[ ]: Age Salary

count 9.000000 9.000000

mean 38.777778 63777.777778

std 7.693793 12265.579662

min 27.000000 48000.000000

25% 35.000000 54000.000000

50% 38.000000 61000.000000

75% 44.000000 72000.000000

max 50.000000 83000.000000

In [ ]:
#b.)
print("Size = {} \n Shape of DataFrame Object = {}\n Number of rows = {} \n Number of Columns = {}".
format(data.size, data.shape, data.shape[0], data.shape[1]))

Size = 40
Shape of DataFrame Object = (10, 4)
Number of rows = 10
Number of Columns = 4

In [ ]:
#c.)
print("\n first 3 rows from Dataset")
data.head(3)

first 3 rows from Dataset


Out[ ]:
Country Age Salary Purchased

0 France 44.0 72000.0 No

1 Spain 27.0 48000.0 Yes

2 Germany 30.0 54000.0 No

In [ ]:
#Q2.
#Handling Missing values
data.fillna(data.mean())

Out[ ]: Country Age Salary Purchased

0 France 44.000000 72000.000000 No

1 Spain 27.000000 48000.000000 Yes

2 Germany 30.000000 54000.000000 No

3 Spain 38.000000 61000.000000 No

4 Germany 40.000000 63777.777778 Yes

5 France 35.000000 58000.000000 Yes

6 Spain 38.777778 52000.000000 No

7 France 48.000000 79000.000000 Yes

8 Germany 50.000000 83000.000000 No

9 France 37.000000 67000.000000 Yes

In [ ]:
#Q3. a. Applying OneHot Encoding on Country Column
from sklearn.preprocessing import OneHotEncoder
enc = OneHotEncoder(handle_unknown='ignore')
enc_data= pd.DataFrame(enc.fit_transform(data[['Country']]).toarray())
enc_data

Out[ ]:
0 1 2 3

0 1.0 0.0 0.0 0.0

1 0.0 0.0 0.0 1.0

2 0.0 0.0 1.0 0.0

3 0.0 0.0 0.0 1.0

4 0.0 0.0 1.0 0.0

5 1.0 0.0 0.0 0.0

6 0.0 0.0 0.0 1.0

7 0.0 1.0 0.0 0.0

8 0.0 0.0 1.0 0.0

9 1.0 0.0 0.0 0.0

In [ ]:
data_merge= data.join(enc_data)
data_merge

Out[ ]: Country Age Salary Purchased 0 1 2 3

0 France 44.0 72000.0 No 1.0 0.0 0.0 0.0

1 Spain 27.0 48000.0 Yes 0.0 0.0 0.0 1.0

2 Germany 30.0 54000.0 No 0.0 0.0 1.0 0.0

3 Spain 38.0 61000.0 No 0.0 0.0 0.0 1.0

4 Germany 40.0 NaN Yes 0.0 0.0 1.0 0.0

5 France 35.0 58000.0 Yes 1.0 0.0 0.0 0.0

6 Spain NaN 52000.0 No 0.0 0.0 0.0 1.0

7 France 48.0 79000.0 Yes 0.0 1.0 0.0 0.0

8 Germany 50.0 83000.0 No 0.0 0.0 1.0 0.0

9 France 37.0 67000.0 Yes 1.0 0.0 0.0 0.0
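
An equivalent one-hot encoding with readable column names can be obtained directly from pandas; a sketch (the 'Country' prefix is just illustrative):

In [ ]:
#Sketch: pandas' own one-hot encoding of the Country column
pd.get_dummies(data['Country'], prefix='Country')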


In [ ]:
#Q3. b. Applying label encoding on purchased column
from sklearn.preprocessing import LabelEncoder
labelencoder = LabelEncoder()
data['Purchased'] = labelencoder.fit_transform(data['Purchased'])
data

Out[ ]: Country Age Salary Purchased

0 France 44.0 72000.0 0

1 Spain 27.0 48000.0 1

2 Germany 30.0 54000.0 0

3 Spain 38.0 61000.0 0

4 Germany 40.0 NaN 1

5 France 35.0 58000.0 1

6 Spain NaN 52000.0 0

7 France 48.0 79000.0 1

8 Germany 50.0 83000.0 0

9 France 37.0 67000.0 1

In [ ]:
#The Purchased labels are replaced by the numbers 0 and 1,
#where 'No' is assigned 0 and 'Yes' is assigned 1.
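
The mapping can also be read back from the fitted encoder rather than inferred by eye; a sketch:

In [ ]:
#classes_ lists the original labels in the order of their assigned codes
print(labelencoder.classes_)
print(dict(zip(labelencoder.classes_, labelencoder.transform(labelencoder.classes_))))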

SET B

In [ ]:
#Q1.
from google.colab import files
data=files.upload()

Saving winequality-red.csv to winequality-red.csv

In [ ]:
import pandas as pd
#Read csv file
data=pd.read_csv('winequality-red.csv',sep=';')
data.shape

Out[ ]: (1599, 12)

In [ ]:
#Q2. Rescaling Data
import pandas, scipy, numpy
from sklearn import preprocessing
from sklearn.preprocessing import MinMaxScaler
array=data.values
#Rescale all attributes to the range [0, 1]
data_scaler=preprocessing.MinMaxScaler(feature_range=(0,1))
data_scaled = data_scaler.fit_transform(array)
print("\n Min Max Scaled Data \n \n ")
print(data_scaled.round(3))

Min Max Scaled Data

[[0.248 0.397 0.    ... 0.138 0.154 0.4  ]
 [0.283 0.521 0.    ... 0.21  0.215 0.4  ]
 [0.283 0.438 0.04  ... 0.192 0.215 0.4  ]
 ...
 [0.15  0.267 0.13  ... 0.251 0.4   0.6  ]
 [0.115 0.36  0.12  ... 0.228 0.277 0.4  ]
 [0.124 0.13  0.47  ... 0.198 0.4   0.6  ]]

In [ ]:
# This gives us values between 0 and 1.
# Rescaling is useful for neural networks, for optimization algorithms,
# for methods that use distance measures (e.g. k-nearest neighbours),
# and for methods that weight inputs (e.g. regression).
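
The cell above rescales every column of the array, including the quality score. A common variant is to scale only the input features and keep the target aside; a sketch, assuming the 'quality' column of the wine data is treated as the output variable:

In [ ]:
#Sketch: scale the inputs only, leaving the target column out
X = data.drop(columns='quality').values    #input features
y = data['quality'].values                 #output / target
X_scaled = MinMaxScaler(feature_range=(0, 1)).fit_transform(X)
print(X_scaled.round(3))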

In [ ]:
#Q3. Standardizing Data
from sklearn.preprocessing import StandardScaler
import scipy.stats as s
scaler=StandardScaler().fit(data)
std_data=scaler.transform(data)
print("\n Standardized Data \n ")
print(std_data)
print("\n Standardized Mean : ",s.tmean(std_data).round(2))
print(" Standardized Standard Deviation : ",round(std_data.std(),2))

Standardized Data

[[-0.52835961  0.96187667 -1.39147228 ... -0.57920652 -0.96024611 -0.78782264]
 [-0.29854743  1.96744245 -1.39147228 ...  0.1289504  -0.58477711 -0.78782264]
 [-0.29854743  1.29706527 -1.18607043 ... -0.04808883 -0.58477711 -0.78782264]
 ...
 [-1.1603431  -0.09955388 -0.72391627 ...  0.54204194  0.54162988   0.45084835]
 [-1.39015528  0.65462046 -0.77526673 ...  0.30598963 -0.20930812  -0.78782264]
 [-1.33270223 -1.21684919  1.02199944 ...  0.01092425  0.54162988   0.45084835]]

Standardized Mean : 0.0


Standardized Standard Deviation : 1.0

In [ ]:
#Q4. Normalizing Data
import numpy as np
import pandas as pd
import scipy.stats as s
from sklearn import preprocessing
norm_data=preprocessing.normalize(data,norm='l1')
print("\n Normalized Data \n ")
norm_data

Normalized Data

Out[ ]: array([[0.0992705 , 0.00939045, 0.        , ..., 0.00751236, 0.12610036, 0.06707466],
       [0.06338639, 0.00715129, 0.        , ..., 0.00552599, 0.07963932, 0.0406323 ],
       [0.07823549, 0.00762295, 0.00040121, ..., 0.00651962, 0.09829587, 0.05015095],
       ...,
       [0.06269796, 0.00507555, 0.00129377, ..., 0.00746404, 0.10947263, 0.05971234],
       [0.0560754 , 0.00613028, 0.00114052, ..., 0.00674806, 0.09694392, 0.04752153],
       [0.06487013, 0.00335162, 0.00508149, ..., 0.00713571, 0.11892857, 0.06487013]])
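
With norm='l1', every row is divided by the sum of the absolute values in that row, so each row of the normalized array sums to 1; a quick check (sketch):

In [ ]:
#Sketch: the L1 norm of every normalized row should be (close to) 1
import numpy as np
print(np.abs(norm_data).sum(axis=1).round(3))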

In [ ]:
#Q5. Binarizing Data
binarized_data=preprocessing.Binarizer(threshold=0.0).fit(data).transform(data)
print("\n Binarized Data \n ")
binarized_data

Binarized Data

Out[ ]: array([[1., 1., 0., ..., 1., 1., 1.],
       [1., 1., 0., ..., 1., 1., 1.],
       [1., 1., 1., ..., 1., 1., 1.],
       ...,
       [1., 1., 1., ..., 1., 1., 1.],
       [1., 1., 1., ..., 1., 1., 1.],
       [1., 1., 1., ..., 1., 1., 1.]])
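
Binarizer maps values above the threshold to 1 and values at or below it to 0; with threshold 0.0 on this all-positive dataset only exact zeros (e.g. a citric acid reading of 0.00) stay 0. A quick count (sketch):

In [ ]:
#Sketch: how many entries were binarized to 0
print("Entries at or below the threshold = ", int((binarized_data == 0).sum()))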
SET C

In [ ]:
#Q1.
from google.colab import files
data=files.upload()

Saving Student_bucketing.csv to Student_bucketing.csv

In [ ]:
import pandas as pd
import io
data= pd.read_csv('Student_bucketing.csv')
data=pd.DataFrame(data)

In [ ]:
#Q2.
print("First 5 Rows of the dataset \n ")
data.head(5)

First 5 Rows of the dataset

Out[ ]: Student_id Age Grade Employed marks

0 1 19 1st Class yes 29


1 2 20 2nd Class no 41

2 3 18 1st Class no 57

3 4 21 2nd Class no 29

4 5 19 1st Class no 57

In [ ]:
#Q3.
import pandas as pd
data['bucket']=pd.cut(data['marks'],5,
labels=['Poor','Below_average','Average','Above_average','Excellent'])
data.head(10)

Out[ ]: Student_id Age Grade Employed marks bucket

0 1 19 1st Class yes 29 Poor

1 2 20 2nd Class no 41 Below_average

2 3 18 1st Class no 57 Average

3 4 21 2nd Class no 29 Poor

4 5 19 1st Class no 57 Average

5 6 20 2nd Class yes 53 Average

6 7 19 3rd Class yes 78 Above_average

7 8 21 3rd Class yes 70 Above_average

8 9 22 3rd Class yes 97 Excellent

9 10 21 1st Class no 58 Average
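
pd.cut splits the marks range into five equal-width intervals; the bin edges it used can be recovered with retbins=True. A sketch:

In [ ]:
#Sketch: recover the equal-width bin edges behind the buckets
_, edges = pd.cut(data['marks'], 5, retbins=True)
print(edges.round(2))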


ASSIGNMENT 4 : DATA VISUALIZATION

SET A

In [ ]:
#Q1.
from matplotlib import pyplot as plt
import numpy as np
# generate random array using NumPy
a1 = np.random.randn(50)
a2 = np.random.randn(50)
plt.plot(a1,color="k",linewidth=1,linestyle=':')
plt.title("Line Chart")
plt.show()

In [ ]:
plt.scatter(a1,a2,c=np.random.randn(50) ,marker ='*',alpha = 0.9)
plt.title("Scatter Plot")
plt.show()
In [ ]:
plt.hist(a2,bins=15,facecolor ='lawngreen',edgecolor = "k",alpha=0.7)
print("Histogram")

Histogram

In [ ]:
box=plt.boxplot(a2,vert=False,patch_artist = True)
print("Boxplot")
Boxplot

In [ ]:
#Q2.
a3=np.append(a2,[[5,-4]])
plt.boxplot(a3,vert=False)
print("Boxplot with outliers")
plt.show()

Boxplot with outliers

In [ ]:
#Q3.
from matplotlib import pyplot as plt
import numpy as np
subjects=['English','Comp Sci','Maths','Physics','Statistics','Algebra','Mechanics']
marks =[45,74,62,31,21,87,95]
plt.pie(marks,labels = subjects,autopct='%1.1f%%')
print("Pie Plot")
plt.show()

Pie Plot

In [ ]:
print("Bar Plot")
bar=plt.bar(subjects,marks,color='g')
def gradientbars(bars):
    grad = np.atleast_2d(np.linspace(0, 1, 256)).T
    ax = bars[0].axes
    lim = ax.get_xlim() + ax.get_ylim()
    for bar in bars:
        bar.set_zorder(1)
        bar.set_facecolor("none")
        x, y = bar.get_xy()
        w, h = bar.get_width(), bar.get_height()
        ax.imshow(grad, extent=[x, x+w, y, y+h], aspect="auto", zorder=0)
    ax.axis(lim)

gradientbars(bar)
plt.show()

Bar Plot
In [ ]:
#Q4.
from google.colab import files
data=files.upload()

Saving iris.csv to iris.csv

In [ ]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
data= pd.read_csv("iris.csv")
sns.countplot(x='variety',data = data)
plt.title("Iris Species Count")
plt.show()
In [ ]:
#Q5.
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
data= pd.read_csv("iris.csv")
ax=plt.subplots(1,1,figsize=(10,8))
data['variety'].value_counts().plot.pie(explode=[0.1,0.1,0.1],autopct='%1.1f%%',shadow=True,figsize=(10,8))
plt.title("Iris Species %")
plt.show()
In [ ]:
#Q6.
import seaborn as sns
iris_setosa=data.loc[data["variety"]=="Setosa"]
iris_virginica=data.loc[data["variety"]=="Virginica"]
iris_versicolor=data.loc[data["variety"]=="Versicolor"]

sns.FacetGrid(data,hue="variety").map(sns.histplot,"petal.length").add_legend()
sns.FacetGrid(data,hue="variety").map(sns.histplot,"petal.width").add_legend()
sns.FacetGrid(data,hue="variety").map(sns.histplot,"sepal.length").add_legend()
sns.FacetGrid(data,hue="variety").map(sns.histplot,"sepal.width").add_legend()
plt.show()
SET B

In [ ]:
#Q1.
import pandas as pd
import matplotlib.pyplot as plt
data = pd.read_csv("iris.csv")
fig = data[data.variety=='Setosa'].plot.scatter(x='petal.length',y='petal.width',color='cyan', label='Setosa')
data[data.variety=='Versicolor'].plot.scatter(x='petal.length',y='petal.width',color='violet', label='versicolor',ax=fig)
data[data.variety=='Virginica'].plot.scatter(x='petal.length',y='petal.width',color='lawngreen', label='virginica', ax=fig)
fig.set_xlabel("Petal Length")
fig.set_ylabel("Petal Width")
fig.set_title(" Petal Length VS Width")
fig=plt.gcf()
fig.set_size_inches(12,8)
plt.show()
In [ ]:
#Q2.
import pandas as pd
import matplotlib.pyplot as plt
data = pd.read_csv("iris.csv")
fig = data[data.variety=='Setosa'].plot.scatter(x='sepal.length',y='sepal.width',color='g', label='Setosa')
data[data.variety=='Versicolor'].plot.scatter(x='sepal.length',y='sepal.width',color='r', label='versicolor',ax=fig)
data[data.variety=='Virginica'].plot.scatter(x='sepal.length',y='sepal.width',color='gold', label='virginica', ax=fig)
fig.set_xlabel("Sepal Length")
fig.set_ylabel("Sepal Width")
fig.set_title(" Sepal Length VS Width")
fig=plt.gcf()
fig.set_size_inches(12,8)
plt.show()
In [ ]:
#Q3.
import seaborn as sns
import matplotlib.pyplot as plt

def graph(a):
    sns.boxplot(x="variety", y=a, data=data)

plt.figure(figsize=(10,10))

plt.subplot(221)
graph('sepal.length')

plt.subplot(222)
graph('sepal.width')
plt.subplot(223)
graph('petal.length')

plt.subplot(224)
graph('petal.width')

plt.show()
SET C

In [ ]:
#Q1.
#Plot to compare all features of iris dataset
import seaborn as sns
import matplotlib.pyplot as plt
sns.pairplot(data,hue='variety', height=2)
plt.show()
In [ ]:
#Q2.
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

plt.figure(figsize=(10,10))

plt.subplot(221)
sns.boxplot(x="variety", y="sepal.length", data=data,palette="bwr");

plt.subplot(222)
sns.boxplot(x="variety", y="sepal.width", data=data,palette="magma")

plt.subplot(223)
sns.boxplot(x="variety", y="petal.length", data=data,palette="autumn")

plt.subplot(224)
sns.boxplot(x="variety", y="petal.width", data=data,palette="GnBu")

plt.show()
In [ ]:
#Q3.
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
data= pd.read_csv("iris.csv")
g = sns.jointplot(x="sepal.length", y="sepal.width",shade=True, data=data, kind="kde", color="b")
g.plot_joint(plt.scatter, c="gold", s=40, linewidth=1, marker="*")
g.ax_joint.collections[0].set_alpha(0)
g.set_axis_labels("$SepalLength$", "$SepalWidth$")
plt.show()

THE END
