
Week 07: Outliers, Hypothesis Testing, and Natural Language Processing

[25]: import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np

[26]: iris = pd.read_csv('iris.csv')
iris

[26]: sepal_length sepal_width petal_length petal_width species


0 5.1 3.5 1.4 0.2 setosa
1 4.9 3.0 1.4 0.2 setosa
2 4.7 3.2 1.3 0.2 setosa
3 4.6 3.1 1.5 0.2 setosa
4 5.0 3.6 1.4 0.2 setosa
.. … … … … …
145 6.7 3.0 5.2 2.3 virginica
146 6.3 2.5 5.0 1.9 virginica
147 6.5 3.0 5.2 2.0 virginica
148 6.2 3.4 5.4 2.3 virginica
149 5.9 3.0 5.1 1.8 virginica

[150 rows x 5 columns]

[27]: iris.columns

[27]: Index(['sepal_length', 'sepal_width', 'petal_length', 'petal_width',
       'species'],
      dtype='object')

[28]: import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

[29]: target_column = 'species'
X = iris.drop(target_column, axis=1)
y = iris[target_column]
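
train_test_split is imported above but never used in this notebook; a minimal sketch of how it would typically be applied to these features and labels (the test_size and random_state values are illustrative choices, not from the original):

# Hold out 20% of the rows for testing (illustrative split)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42)
print(X_train.shape, X_test.shape)  # (120, 4) (30, 4) for the 150-row iris data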

[30]: le = LabelEncoder()
y_encoded = le.fit_transform(y)
iris[target_column] = y_encoded
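
As a quick check (not part of the original notebook), the encoder's learned mapping can be inspected; LabelEncoder assigns integers in alphabetical order of the class labels:

# Show which integer each species was mapped to
for label, code in zip(le.classes_, le.transform(le.classes_)):
    print(label, '->', code)  # setosa -> 0, versicolor -> 1, virginica -> 2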

[31]: sns.heatmap(iris.corr(method='pearson'), annot=True)
plt.show()
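
To read the same information numerically rather than from the heatmap, the correlations against the encoded label can be ranked (a small addition, not in the original notebook):

# Rank features by Pearson correlation with the encoded species label
print(iris.corr(method='pearson')['species'].sort_values(ascending=False))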

# Treating outliers
var = iris['sepal_width']
[34]: var

[34]: 0 3.5
1 3.0
2 3.2
3 3.1
4 3.6

145 3.0
146 2.5
147 3.0
148 3.4
149 3.0
Name: sepal_width, Length: 150, dtype: float64

[35]: q1 = np.percentile(var, 25)
q3 = np.percentile(var, 75)
iqr = q3 - q1
lower_bound = q1 - 1.5 * iqr
upper_bound = q3 + 1.5 * iqr

outliers = [x for x in var if x < lower_bound or x > upper_bound]
outliers

[35]: [4.4, 4.1, 4.2, 2.0]
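
The same 1.5 * IQR rule can be packaged as a small helper for reuse on any numeric column. This is a sketch; the name iqr_bounds is ours, not from the notebook:

# Return the (lower, upper) fences of the 1.5 * IQR outlier rule
def iqr_bounds(series, k=1.5):
    q1, q3 = np.percentile(series, [25, 75])
    iqr = q3 - q1
    return q1 - k * iqr, q3 + k * iqr

lo, hi = iqr_bounds(var)  # values outside [lo, hi] are flagged as outliers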

[36]: median_data = var.median()
median_data

[36]: 3.0

[37]: for i in range(len(var)):
    if var[i] in outliers:
        var[i] = median_data

print("Data with Outliers Replaced by Median:\n", var)

Data with Outliers Replaced by Median:
0 3.5
1 3.0
2 3.2
3 3.1
4 3.6

145 3.0
146 2.5
147 3.0
148 3.4
149 3.0
Name: sepal_width, Length: 150, dtype: float64
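
Depending on the pandas version, the element-wise loop above can raise a SettingWithCopyWarning because var references a DataFrame column. A vectorized alternative produces the same result in one step (var_clean is a new name introduced here for illustration):

# Replace out-of-bounds values with the median using a boolean mask
var_clean = var.mask((var < lower_bound) | (var > upper_bound), median_data)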

[38]: q1 = np.percentile(var, 25)
q3 = np.percentile(var, 75)
iqr = q3 - q1
lower_bound = q1 - 1.5 * iqr
upper_bound = q3 + 1.5 * iqr

outliers = [x for x in var if x < lower_bound or x > upper_bound]
print(outliers)

[32]: import sweetviz as sv

[33]: advert_report = sv.analyze(iris)

# display the report
advert_report.show_html('Advertising.html')

Report Advertising.html was generated! NOTEBOOK/COLAB USERS: the web browser
MAY not pop up, regardless, the report IS saved in your notebook/colab files.

if len(outliers) == 0:
    print("No outliers.")

[]
No outliers.
Hypothesis
[39]: import numpy as np
from scipy.stats import kstest, norm

# Generate a sample of data that you want to test
np.random.seed(0)  # Setting a seed for reproducibility
sample_data = np.random.normal(loc=0, scale=1, size=1000)  # Sample from a normal distribution

# Perform a KS test to check whether var (sepal_width) follows a standard
# normal distribution (note: sample_data is generated but not used here)
ks_statistic, p_value = kstest(var, 'norm')

# Define the significance level (alpha)
alpha = 0.05

# Check the result of the KS test
if p_value < alpha:
    print(f"The data does NOT follow a normal distribution (p-value = {p_value})")
else:
    print(f"The data follows a normal distribution (p-value = {p_value})")

The data does NOT follow a normal distribution (p-value = 5.8803781394734095e-279)
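
Note that kstest(var, 'norm') compares against a *standard* normal N(0, 1), while sepal_width is centred near 3, so the extreme p-value above mostly reflects location and scale rather than shape. A sketch of testing the shape instead, by standardizing first (estimating the parameters from the same data makes the resulting p-value only approximate):

# Standardize var before comparing it to the standard normal CDF
ks_statistic, p_value = kstest((var - var.mean()) / var.std(), 'norm')
print(ks_statistic, p_value)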

[40]: # Generate a sample of data that you want to test
np.random.seed(0)  # Setting a seed for reproducibility
sample_data_1 = np.random.normal(0, 1, 100)  # Sample from a normal distribution

# Perform a KS test to check if sample_data_1 follows a normal distribution
ks_statistic, p_value = kstest(sample_data_1, 'norm')

# Define the significance level (alpha)
alpha = 0.05

# Check the result of the KS test
if p_value < alpha:
    print(f"The sample does NOT follow a normal distribution (p-value = {p_value})")
else:
    print(f"The sample follows a normal distribution (p-value = {p_value})")

The sample follows a normal distribution (p-value = 0.8667717341286251)

[41]: # Generate a sample of data that you want to test
np.random.seed(0)  # Setting a seed for reproducibility
sample_data_2 = np.random.uniform(0, 1, 100)  # Sample from a uniform distribution

# Perform a KS test to check if sample_data_2 follows a normal distribution
ks_statistic, p_value = kstest(sample_data_2, 'norm')

# Define the significance level (alpha)
alpha = 0.05

# Check the result of the KS test
if p_value < alpha:
    print(f"The sample does NOT follow a normal distribution (p-value = {p_value})")
else:
    print(f"The sample follows a normal distribution (p-value = {p_value})")

The sample does NOT follow a normal distribution (p-value = 7.902176095057778e-24)
Natural Language Processing
[ ]: # This is related to converting text into a vector
import pandas as pd
import numpy as np
import collections
import re

[ ]: # Sample document
doc1 = 'Game of Thrones is an amazing tv series!, Game of Thrones is the best tv series! and Game of Thrones is so great'

# Lowercase the sentence, remove punctuation, and split it into words
w_doc1 = re.sub(r'[^\w\s]', '', doc1.lower()).split()
# Print the sentence without punctuation
print(w_doc1)

['game', 'of', 'thrones', 'is', 'an', 'amazing', 'tv', 'series', 'game', 'of',
'thrones', 'is', 'the', 'best', 'tv', 'series', 'and', 'game', 'of', 'thrones',
'is', 'so', 'great']

[ ]: import nltk
from nltk.corpus import stopwords

nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
True

[ ]: stop_words = set(stopwords.words('english'))
filtered_words = [word for word in w_doc1 if word.lower() not in stop_words]

# Reconstruct the text without stop words
filtered_text = ' '.join(filtered_words)

# Print the text without stop words
print(filtered_text)

game thrones amazing tv series game thrones best tv series game thrones great
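
The collections module imported at the start of this section is otherwise unused; a small sketch of putting it to work to count word frequencies in the filtered text:

# Count how often each remaining word occurs
word_counts = collections.Counter(filtered_words)
print(word_counts.most_common(3))  # e.g. [('game', 3), ('thrones', 3), ('tv', 2)]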

[ ]: from sklearn.feature_extraction.text import CountVectorizer


doc1 = ['Game of Thrones is an amazing tv series!, Game of Thrones is the best tv series! and Game of Thrones is so great']

# Create an instance of CountVectorizer
vectorizer = CountVectorizer()
# Fit the vectorizer on the sentences and transform them into a Bag of Words representation

X = vectorizer.fit_transform(doc1)
# Get the feature names (words)
feature_names = vectorizer.get_feature_names_out()
# Convert the Bag of Words representation to a dense matrix and print it
print(X.toarray())
print("Feature names (words):", feature_names)

[[1 1 1 1 3 1 3 3 2 1 1 3 2]]
Feature names (words): ['amazing' 'an' 'and' 'best' 'game' 'great' 'is' 'of'
'series' 'so' 'the' 'thrones' 'tv']
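
The raw count array is hard to read on its own; pairing it with the feature names as a DataFrame makes the Bag of Words explicit (a presentation aid, not in the original notebook):

# Label each count with the word it corresponds to
bow_df = pd.DataFrame(X.toarray(), columns=feature_names)
print(bow_df)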
