
import pandas as pd

# Load the dataset from the uploaded file
file_path = '/content/employee_attrition_data.csv'
employee_attrition_data = pd.read_csv(file_path)

# Display the first few rows of the dataset
print("First few rows of the dataset:")
print(employee_attrition_data.head())

# Display summary information about the dataset
print("\nSummary Information of the dataset:")
print(employee_attrition_data.info())

# Calculate basic statistics for numerical columns
print("\nBasic Statistics of the dataset:")
print(employee_attrition_data.describe())

First few rows of the dataset:
   Employee_ID  Age  Gender   Department Job_Title  Years_at_Company  \
0            0   27    Male    Marketing   Manager                 9
1            1   53  Female        Sales  Engineer                10
2            2   59  Female    Marketing   Analyst                 8
3            3   42  Female  Engineering   Manager                 1
4            4   44  Female        Sales  Engineer                10

   Satisfaction_Level  Average_Monthly_Hours  Promotion_Last_5Years  Salary  \
0            0.586251                    151                      0   60132
1            0.261161                    221                      1   79947
2            0.304382                    184                      0   46958
3            0.480779                    242                      0   40662
4            0.636244                    229                      1   74307

   Attrition
0          0
1          0
2          1
3          0
4          0
Summary Information of the dataset:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 11 columns):
 #   Column                 Non-Null Count  Dtype
---  ------                 --------------  -----
 0   Employee_ID            1000 non-null   int64
 1   Age                    1000 non-null   int64
 2   Gender                 1000 non-null   object
 3   Department             1000 non-null   object
 4   Job_Title              1000 non-null   object
 5   Years_at_Company       1000 non-null   int64
 6   Satisfaction_Level     1000 non-null   float64
 7   Average_Monthly_Hours  1000 non-null   int64
 8   Promotion_Last_5Years  1000 non-null   int64
 9   Salary                 1000 non-null   int64
 10  Attrition              1000 non-null   int64
dtypes: float64(1), int64(7), object(3)
memory usage: 86.1+ KB
None

Basic Statistics of the dataset:
       Employee_ID          Age  Years_at_Company  Satisfaction_Level  \
count  1000.000000  1000.000000       1000.000000         1000.000000
mean    499.500000    42.205000          5.605000            0.505995
std     288.819436    10.016452          2.822223            0.289797
min       0.000000    25.000000          1.000000            0.001376
25%     249.750000    33.000000          3.000000            0.258866
50%     499.500000    43.000000          6.000000            0.505675
75%     749.250000    51.000000          8.000000            0.761135
max     999.000000    59.000000         10.000000            0.999979

       Average_Monthly_Hours  Promotion_Last_5Years        Salary    Attrition
count            1000.000000            1000.000000   1000.000000  1000.000000
mean              199.493000               0.486000  64624.980000     0.495000
std                29.631908               0.500054  20262.984333     0.500225
min               150.000000               0.000000  30099.000000     0.000000
25%               173.000000               0.000000  47613.500000     0.000000
50%               201.000000               0.000000  64525.000000     0.000000
75%               225.000000               1.000000  81921.000000     1.000000
max               249.000000               1.000000  99991.000000     1.000000
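
The mean of Attrition is 0.495, so the two classes are close to evenly split. A quick check along these lines (a minimal sketch, not part of the original notebook output) would confirm the balance:

# Sketch: confirm the class balance implied by the summary statistics above
print(employee_attrition_data['Attrition'].value_counts(normalize=True))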

import pandas as pd

file_path = '/content/employee_attrition_data.csv'
employee_attrition_data = pd.read_csv(file_path)

# Check for missing values
missing_values = employee_attrition_data.isnull().sum()
print("Missing values in each column:")
print(missing_values)

# One-hot encode categorical variables
encoded_data = pd.get_dummies(employee_attrition_data,
                              columns=['Gender', 'Department', 'Job_Title'])

# Display the first few rows of the encoded dataset
print("First few rows of the encoded dataset:")
print(encoded_data.head())

Missing values in each column:
Employee_ID              0
Age                      0
Gender                   0
Department               0
Job_Title                0
Years_at_Company         0
Satisfaction_Level       0
Average_Monthly_Hours    0
Promotion_Last_5Years    0
Salary                   0
Attrition                0
dtype: int64

First few rows of the encoded dataset:
   Employee_ID  Age  Years_at_Company  Satisfaction_Level  \
0            0   27                 9            0.586251
1            1   53                10            0.261161
2            2   59                 8            0.304382
3            3   42                 1            0.480779
4            4   44                10            0.636244

   Average_Monthly_Hours  Promotion_Last_5Years  Salary  Attrition  \
0                    151                      0   60132          0
1                    221                      1   79947          0
2                    184                      0   46958          1
3                    242                      0   40662          0
4                    229                      1   74307          0

   Gender_Female  Gender_Male  Department_Engineering  Department_Finance  \
0          False         True                   False               False
1           True        False                   False               False
2           True        False                   False               False
3           True        False                    True               False
4           True        False                   False               False

   Department_HR  Department_Marketing  Department_Sales  \
0          False                  True             False
1          False                 False              True
2          False                  True             False
3          False                 False             False
4          False                 False              True

   Job_Title_Accountant  Job_Title_Analyst  Job_Title_Engineer  \
0                 False              False               False
1                 False              False                True
2                 False               True               False
3                 False              False               False
4                 False              False                True

   Job_Title_HR Specialist  Job_Title_Manager
0                    False               True
1                    False              False
2                    False              False
3                    False               True
4                    False              False
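
The dummy columns print as True/False because recent pandas versions return boolean dtypes from get_dummies. If integer 0/1 columns are preferred, the dtype argument can be passed; a minimal sketch (not part of the original code):

# Sketch: request integer 0/1 dummies instead of booleans
encoded_data = pd.get_dummies(employee_attrition_data,
                              columns=['Gender', 'Department', 'Job_Title'],
                              dtype=int)

Either representation works for the steps below, since pandas and scikit-learn treat booleans as 0/1 in numerical computations.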

import matplotlib.pyplot as plt
import seaborn as sns

# Generate summary statistics for all variables
summary_statistics = encoded_data.describe()
print("Summary Statistics:")
print(summary_statistics)

# Histograms for numerical variables
fig, axes = plt.subplots(1, 3, figsize=(18, 5))

sns.histplot(encoded_data['Age'], kde=True, ax=axes[0])
axes[0].set_title('Age Distribution')

sns.histplot(encoded_data['Satisfaction_Level'], kde=True, ax=axes[1])
axes[1].set_title('Satisfaction Level Distribution')

sns.histplot(encoded_data['Salary'], kde=True, ax=axes[2])
axes[2].set_title('Salary Distribution')

plt.show()

# Count plots for original categorical variables
fig, axes = plt.subplots(1, 2, figsize=(18, 5))

sns.countplot(data=employee_attrition_data, x='Department', ax=axes[0])
axes[0].set_title('Department Count')

sns.countplot(data=employee_attrition_data, x='Job_Title', ax=axes[1])
axes[1].set_title('Job Title Count')

plt.show()

# Generate a correlation matrix
correlation_matrix = encoded_data.corr()

# Plot the correlation matrix
plt.figure(figsize=(16, 10))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm',
            fmt='.2f', linewidths=0.5)
plt.title('Correlation Matrix')
plt.show()

Summary Statistics:
       Employee_ID          Age  Years_at_Company  Satisfaction_Level  \
count  1000.000000  1000.000000       1000.000000         1000.000000
mean    499.500000    42.205000          5.605000            0.505995
std     288.819436    10.016452          2.822223            0.289797
min       0.000000    25.000000          1.000000            0.001376
25%     249.750000    33.000000          3.000000            0.258866
50%     499.500000    43.000000          6.000000            0.505675
75%     749.250000    51.000000          8.000000            0.761135
max     999.000000    59.000000         10.000000            0.999979

       Average_Monthly_Hours  Promotion_Last_5Years        Salary    Attrition
count            1000.000000            1000.000000   1000.000000  1000.000000
mean              199.493000               0.486000  64624.980000     0.495000
std                29.631908               0.500054  20262.984333     0.500225
min               150.000000               0.000000  30099.000000     0.000000
25%               173.000000               0.000000  47613.500000     0.000000
50%               201.000000               0.000000  64525.000000     0.000000
75%               225.000000               1.000000  81921.000000     1.000000
max               249.000000               1.000000  99991.000000     1.000000
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
import seaborn as sns

# Select features for clustering (excluding the target variable
# 'Attrition' and the identifier 'Employee_ID')
features = encoded_data.drop(columns=['Employee_ID', 'Attrition'])

# Apply K-means clustering
kmeans = KMeans(n_clusters=3, random_state=42)
encoded_data['Cluster'] = kmeans.fit_predict(features)

# Visualize the clusters
plt.figure(figsize=(12, 6))
sns.scatterplot(data=encoded_data, x='Satisfaction_Level',
                y='Average_Monthly_Hours', hue='Cluster', palette='viridis')
plt.title('K-means Clustering of Employees')
plt.show()

/usr/local/lib/python3.10/dist-packages/sklearn/cluster/
_kmeans.py:870: FutureWarning: The default value of `n_init` will
change from 10 to 'auto' in 1.4. Set the value of `n_init` explicitly
to suppress the warning
warnings.warn(
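
The FutureWarning only concerns the default value of n_init. As the message suggests, passing it explicitly suppresses the warning without changing the result here, since 10 is the current default; a minimal sketch (not part of the original code):

# Sketch: set n_init explicitly to suppress the FutureWarning
kmeans = KMeans(n_clusters=3, n_init=10, random_state=42)
encoded_data['Cluster'] = kmeans.fit_predict(features)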

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix

# Select features and target
X = encoded_data.drop(columns=['Employee_ID', 'Attrition', 'Cluster'])
y = encoded_data['Attrition']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                    test_size=0.2,
                                                    random_state=42)

# Apply logistic regression
logreg = LogisticRegression(max_iter=1000)
logreg.fit(X_train, y_train)

# Predict on the test set
y_pred = logreg.predict(X_test)

# Evaluate the model
classification_report_logreg = classification_report(y_test, y_pred)
confusion_matrix_logreg = confusion_matrix(y_test, y_pred)

print("Classification Report:")
print(classification_report_logreg)
print("\nConfusion Matrix:")
print(confusion_matrix_logreg)

Classification Report:
              precision    recall  f1-score   support

           0       0.51      0.59      0.55       102
           1       0.49      0.41      0.44        98

    accuracy                           0.50       200
   macro avg       0.50      0.50      0.49       200
weighted avg       0.50      0.50      0.50       200

Confusion Matrix:
[[60 42]
 [58 40]]
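
Reading the confusion matrix directly gives the same accuracy as the report: (60 + 40) correct predictions out of 200 test rows is 0.50, essentially chance level for this roughly balanced target. A minimal sketch of that check (not part of the original notebook):

# Sketch: derive accuracy from the confusion matrix entries
tn, fp, fn, tp = confusion_matrix_logreg.ravel()
accuracy = (tn + tp) / (tn + fp + fn + tp)
print(f"Accuracy from confusion matrix: {accuracy:.2f}")  # 0.50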
