
Week 07: Outliers, Hypothesis Testing, and Natural Language Processing

[25]: import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np

[26]: iris = pd.read_csv('iris.csv')
iris

[26]: sepal_length sepal_width petal_length petal_width species


0 5.1 3.5 1.4 0.2 setosa
1 4.9 3.0 1.4 0.2 setosa
2 4.7 3.2 1.3 0.2 setosa
3 4.6 3.1 1.5 0.2 setosa
4 5.0 3.6 1.4 0.2 setosa
.. … … … … …
145 6.7 3.0 5.2 2.3 virginica
146 6.3 2.5 5.0 1.9 virginica
147 6.5 3.0 5.2 2.0 virginica
148 6.2 3.4 5.4 2.3 virginica
149 5.9 3.0 5.1 1.8 virginica

[150 rows x 5 columns]

[27]: iris.columns

[27]: Index(['sepal_length', 'sepal_width', 'petal_length', 'petal_width',
       'species'],
      dtype='object')

[28]: import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

[29]: target_column = 'species'
X = iris.drop(target_column, axis=1)
y = iris[target_column]
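
train_test_split is imported above but never used in this notebook; a minimal sketch of how it would typically be applied to these features and labels (the test_size and random_state values are illustrative choices, not from the original):

# Hold out 20% of the rows for testing (illustrative split)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42)
print(X_train.shape, X_test.shape)  # (120, 4) (30, 4) for the 150-row iris data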

[30]: le = LabelEncoder()
y_encoded = le.fit_transform(y)
iris[target_column] = y_encoded
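
As a quick check (not part of the original notebook), the encoder's learned mapping can be inspected; LabelEncoder assigns integers in alphabetical order of the class labels:

# Show which integer each species was mapped to
for label, code in zip(le.classes_, le.transform(le.classes_)):
    print(label, '->', code)  # setosa -> 0, versicolor -> 1, virginica -> 2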

[31]: sns.heatmap(iris.corr(method='pearson'), annot=True)
plt.show()
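
To read the same information numerically rather than from the heatmap, the correlations against the encoded label can be ranked (a small addition, not in the original notebook):

# Rank features by Pearson correlation with the encoded species label
print(iris.corr(method='pearson')['species'].sort_values(ascending=False))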

# Treating outliers
var = iris['sepal_width']
[34]: var

[34]: 0 3.5
1 3.0
2 3.2
3 3.1
4 3.6

145 3.0
146 2.5
147 3.0
148 3.4
149 3.0
Name: sepal_width, Length: 150, dtype: float64

[35]: q1 = np.percentile(var, 25)
q3 = np.percentile(var, 75)
iqr = q3 - q1
lower_bound = q1 - 1.5 * iqr
upper_bound = q3 + 1.5 * iqr

outliers = [x for x in var if x < lower_bound or x > upper_bound]
outliers

[35]: [4.4, 4.1, 4.2, 2.0]
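
The same 1.5 * IQR rule can be packaged as a small helper for reuse on any numeric column. This is a sketch; the name iqr_bounds is ours, not from the notebook:

# Return the (lower, upper) fences of the 1.5 * IQR outlier rule
def iqr_bounds(series, k=1.5):
    q1, q3 = np.percentile(series, [25, 75])
    iqr = q3 - q1
    return q1 - k * iqr, q3 + k * iqr

lo, hi = iqr_bounds(var)  # values outside [lo, hi] are flagged as outliers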

[36]: median_data = var.median()
median_data

[36]: 3.0

[37]: for i in range(len(var)):
    if var[i] in outliers:
        var[i] = median_data

print("Data with Outliers Replaced by Median:\n", var)

Data with Outliers Replaced by Median:
0 3.5
1 3.0
2 3.2
3 3.1
4 3.6

145 3.0
146 2.5
147 3.0
148 3.4
149 3.0
Name: sepal_width, Length: 150, dtype: float64
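
Depending on the pandas version, the element-wise loop above can raise a SettingWithCopyWarning because var references a DataFrame column. A vectorized alternative produces the same result in one step (var_clean is a new name introduced here for illustration):

# Replace out-of-bounds values with the median using a boolean mask
var_clean = var.mask((var < lower_bound) | (var > upper_bound), median_data)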

[38]: q1 = np.percentile(var, 25)
q3 = np.percentile(var, 75)
iqr = q3 - q1
lower_bound = q1 - 1.5 * iqr
upper_bound = q3 + 1.5 * iqr

outliers = [x for x in var if x < lower_bound or x > upper_bound]
print(outliers)

[32]: import sweetviz as sv

[33]: advert_report = sv.analyze(iris)

# display the report
advert_report.show_html('Advertising.html')

Report Advertising.html was generated! NOTEBOOK/COLAB USERS: the web browser
MAY not pop up, regardless, the report IS saved in your notebook/colab files.

if len(outliers) == 0:
    print("No outliers.")

[]
No outliers.
Hypothesis
[39]: import numpy as np
from scipy.stats import kstest, norm

# Generate a sample of data that you want to test
np.random.seed(0)  # Setting a seed for reproducibility
sample_data = np.random.normal(loc=0, scale=1, size=1000)  # Sample from a normal distribution

# Perform a KS test to check whether var (sepal_width) follows a standard
# normal distribution (note: sample_data is generated but not used here)
ks_statistic, p_value = kstest(var, 'norm')

# Define the significance level (alpha)
alpha = 0.05

# Check the result of the KS test
if p_value < alpha:
    print(f"The data does NOT follow a normal distribution (p-value = {p_value})")
else:
    print(f"The data follows a normal distribution (p-value = {p_value})")

The data does NOT follow a normal distribution (p-value = 5.8803781394734095e-279)
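
Note that kstest(var, 'norm') compares against a *standard* normal N(0, 1), while sepal_width is centred near 3, so the extreme p-value above mostly reflects location and scale rather than shape. A sketch of testing the shape instead, by standardizing first (estimating the parameters from the same data makes the resulting p-value only approximate):

# Standardize var before comparing it to the standard normal CDF
ks_statistic, p_value = kstest((var - var.mean()) / var.std(), 'norm')
print(ks_statistic, p_value)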

[40]: # Generate a sample of data that you want to test
np.random.seed(0)  # Setting a seed for reproducibility
sample_data_1 = np.random.normal(0, 1, 100)  # Sample from a normal distribution

# Perform a KS test to check if sample_data_1 follows a normal distribution
ks_statistic, p_value = kstest(sample_data_1, 'norm')

# Define the significance level (alpha)
alpha = 0.05

# Check the result of the KS test
if p_value < alpha:
    print(f"The sample does NOT follow a normal distribution (p-value = {p_value})")
else:
    print(f"The sample follows a normal distribution (p-value = {p_value})")

The sample follows a normal distribution (p-value = 0.8667717341286251)

[41]: # Generate a sample of data that you want to test
np.random.seed(0)  # Setting a seed for reproducibility
sample_data_2 = np.random.uniform(0, 1, 100)  # Sample from a uniform distribution

# Perform a KS test to check if sample_data_2 follows a normal distribution
ks_statistic, p_value = kstest(sample_data_2, 'norm')

# Define the significance level (alpha)
alpha = 0.05

# Check the result of the KS test
if p_value < alpha:
    print(f"The sample does NOT follow a normal distribution (p-value = {p_value})")
else:
    print(f"The sample follows a normal distribution (p-value = {p_value})")

The sample does NOT follow a normal distribution (p-value = 7.902176095057778e-24)
Natural Language Processing
[ ]: # This is related to converting text into a vector
import pandas as pd
import numpy as np
import collections
import re

[ ]: # Sample document
doc1 = 'Game of Thrones is an amazing tv series!, Game of Thrones is the best tv series! and Game of Thrones is so great'

# Lowercase the sentence, remove punctuation, and split it into words
w_doc1 = re.sub(r'[^\w\s]', '', doc1.lower()).split()
# Print the sentence without punctuation
print(w_doc1)

['game', 'of', 'thrones', 'is', 'an', 'amazing', 'tv', 'series', 'game', 'of',
'thrones', 'is', 'the', 'best', 'tv', 'series', 'and', 'game', 'of', 'thrones',
'is', 'so', 'great']

[ ]: import nltk
from nltk.corpus import stopwords

nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
True

[ ]: stop_words = set(stopwords.words('english'))
filtered_words = [word for word in w_doc1 if word.lower() not in stop_words]

# Reconstruct the text without stop words
filtered_text = ' '.join(filtered_words)

# Print the text without stop words
print(filtered_text)

game thrones amazing tv series game thrones best tv series game thrones great
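
The collections module imported at the start of this section is otherwise unused; a small sketch of putting it to work to count word frequencies in the filtered text:

# Count how often each remaining word occurs
word_counts = collections.Counter(filtered_words)
print(word_counts.most_common(3))  # e.g. [('game', 3), ('thrones', 3), ('tv', 2)]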

[ ]: from sklearn.feature_extraction.text import CountVectorizer


doc1 = ['Game of Thrones is an amazing tv series!, Game of Thrones is the best tv series! and Game of Thrones is so great']

# Create an instance of CountVectorizer
vectorizer = CountVectorizer()
# Fit the vectorizer on the sentences and transform them into a Bag of Words representation

X = vectorizer.fit_transform(doc1)
# Get the feature names (words)
feature_names = vectorizer.get_feature_names_out()
# Convert the Bag of Words representation to a dense matrix and print it
print(X.toarray())
print("Feature names (words):", feature_names)

[[1 1 1 1 3 1 3 3 2 1 1 3 2]]
Feature names (words): ['amazing' 'an' 'and' 'best' 'game' 'great' 'is' 'of'
'series' 'so' 'the' 'thrones' 'tv']
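
The raw count array is hard to read on its own; pairing it with the feature names as a DataFrame makes the Bag of Words explicit (a presentation aid, not in the original notebook):

# Label each count with the word it corresponds to
bow_df = pd.DataFrame(X.toarray(), columns=feature_names)
print(bow_df)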
