Download as pdf or txt
Download as pdf or txt
You are on page 1of 5

23/07/2023, 23:52 2b

In [63]: import pandas as pd


import numpy as np
from sklearn.model_selection import cross_val_score

import matplotlib.pyplot as plt


%matplotlib inline
import math

import warnings
warnings.filterwarnings('ignore')

from sklearn import metrics

from scipy.special import legendre


from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression
import seaborn as sns
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from sklearn.metrics import recall_score
from sklearn.impute import SimpleImputer

#df = pd.read_csv('/Users/sangeetha/consolidated_output.csv')
df = pd.read_csv('/Users/sangeetha/outputweo1.csv')

to_drop = ['match_id','match_num',
'duration',
'totalsets',
'P1_setswon',
'P2_setswon','P1_gameswon','P2_gameswon',
'P1_pointswon','P2_pointswon','winner','player1_rating','player2_
'prob',
'weo_prob',
'win_play_2',
'updated_win_match_1','updated_win_match_2',
'updated_total_match_1', 'updated_total_match_2',
'player1_fg','player2_fg',
'current_rating_player1','current_rating_player2',
'updated_rating_player1','updated_rating_player2',
#'current_weo_rating_player1','current_weo_rating_player2']
'updated_elo_rating_player1','updated_elo_rating_player2']
df['win_ratio_player1'] = df['win_match_1'] / df['total_match_1']
df['win_ratio_player2'] = df['win_match_2'] / df['total_match_2']

df.drop(columns=to_drop, inplace=True)

to_drop = [
'total_match_1', 'total_match_2',
'win_match_1', 'win_match_2']
df.drop(columns=to_drop, inplace=True)

print(df.columns)

localhost:8888/nbconvert/html/Downloads/2b.ipynb?download=false 1/5
23/07/2023, 23:52 2b
Index(['year', 'player1', 'player2', 'win_play_1',
'current_weo_rating_player1', 'current_weo_rating_player2',
'win_ratio_player1', 'win_ratio_player2'],
dtype='object')

In [64]: # Handle missing values, excluding the 'match_id' column from imputation
imputer = SimpleImputer(strategy='mean')
numeric_cols = df.select_dtypes(include=[float, int]).columns
df[numeric_cols] = imputer.fit_transform(df[numeric_cols])

# Splitting the data into training and test sets based on the year (2019)
train_data = df[df['year'] < 2019]
test_data = df[df['year'] == 2019]

# Drop unnecessary columns (match_id, year) and target column (winner) from
X_train = train_data.drop(columns=['year', 'win_play_1'])
y_train = train_data['win_play_1']

# Drop unnecessary columns (match_id, year) and target column (winner) from
X_test = test_data.drop(columns=['year', 'win_play_1'])
y_test = test_data['win_play_1']

# Combine player1 and player2 names from both training and test sets for lab
combined_players = pd.concat([X_train['player1'], X_train['player2'], X_test
label_encoder = LabelEncoder()
label_encoder.fit(combined_players)

# Encode player names as categorical variables


X_train['player1'] = label_encoder.transform(X_train['player1'])
X_train['player2'] = label_encoder.transform(X_train['player2'])
X_test['player1'] = label_encoder.transform(X_test['player1'])
X_test['player2'] = label_encoder.transform(X_test['player2'])

##############################################
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)
##########################################

# Training the first classification algorithm - Random Forest


#rf_classifier = RandomForestClassifier(random_state=42)
rf_classifier = RandomForestClassifier(n_estimators=101, random_state=42)
rf_scores = cross_val_score(rf_classifier, X_train, y_train, cv=100)

rf_classifier.fit(X_train, y_train)

# Predicting outcomes using the trained Random Forest model


rf_predictions = rf_classifier.predict(X_test)

# Calculate accuracy for Random Forest


rf_accuracy = accuracy_score(y_test, rf_predictions)
rf_recall = recall_score(y_test, rf_predictions)

# Training the second classification algorithm - Support Vector Machine (SVM


svm_classifier = SVC(random_state=42)
svm_classifier.fit(X_train, y_train)

# Predicting outcomes using the trained SVM model


svm_predictions = svm_classifier.predict(X_test)

# Calculate accuracy for SVM

localhost:8888/nbconvert/html/Downloads/2b.ipynb?download=false 2/5
23/07/2023, 23:52 2b
svm_accuracy = accuracy_score(y_test, svm_predictions)
svm_recall = recall_score(y_test, svm_predictions)

##########Feature importance in random dorest


feature_importances = rf_classifier.feature_importances_

top_feature_indices = np.argsort(feature_importances)[::-1][:10]
top_features = [(f"Feature {index}", importance) for index, importance in zi

# Reporting the accuracy of both algorithms


print("Random Forest Accuracy for :", rf_accuracy)
print("SVM Accuracy for :", svm_accuracy)
print("Random Forest Recall for :", rf_recall)
print("SVM Recall for:", svm_recall)

Random Forest Accuracy for : 0.717741935483871


SVM Accuracy for : 0.7096774193548387
Random Forest Recall for : 0.7049180327868853
SVM Recall for: 0.7213114754098361

In [65]: top_feature_indices = np.argsort(feature_importances)[::-1][:10]


top_features = [(f"Feature {index}", importance) for index, importance in zi
print(top_features)

[('Feature 3', 0.22354211445044533), ('Feature 2', 0.2132744524412143), ('Fe


ature 0', 0.17385511162671627), ('Feature 1', 0.1626429559386697), ('Feature
5', 0.11334771871328483), ('Feature 4', 0.11333764682966967)]

In [66]: X_df = pd.DataFrame(X_train, columns=['Feature_' + str(i) for i in range(tra

# Create a DataFrame to store feature names and their importances


feature_importance_df = pd.DataFrame({'Feature': X_df.columns, 'Importance':

# Sort the DataFrame by importance in descending order


feature_importance_df = feature_importance_df.sort_values(by='Importance', a

# Extract the top features (e.g., top 10 features)


top_features = feature_importance_df.head(10)
print(top_features)

Feature Importance
3 Feature_3 0.223542
2 Feature_2 0.213274
0 Feature_0 0.173855
1 Feature_1 0.162643
5 Feature_5 0.113348
4 Feature_4 0.113338

In [67]: dftemp = train_data.drop(columns=['year', 'win_play_1'])


dflist = dftemp.columns.tolist
print(dflist)

<bound method IndexOpsMixin.tolist of Index(['player1', 'player2', 'current_


weo_rating_player1',
'current_weo_rating_player2', 'win_ratio_player1', 'win_ratio_player
2'],
dtype='object')>

In [54]: import numpy as np


import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix, precision_score, recall_score,
cm = confusion_matrix(y_test, rf_predictions)

localhost:8888/nbconvert/html/Downloads/2b.ipynb?download=false 3/5
23/07/2023, 23:52 2b
# Create a heatmap using seaborn
plt.figure(figsize=(6, 4))
sns.heatmap(cm, annot=True, fmt="d", cmap="Blues", cbar=False, square=True,
xticklabels=["Negative", "Positive"],
yticklabels=["Negative", "Positive"])

# Add labels and title


plt.xlabel("Predicted Label")
plt.ylabel("True Label")
plt.title("Confusion Matrix for Random Forest")

plt.show()

precision = precision_score(y_test, rf_predictions)


recall = recall_score(y_test, rf_predictions)
f1 = f1_score(y_test, rf_predictions)
accuracy = accuracy_score(y_test, rf_predictions)
print("F1 Score for Random Forest :", f1)
print("Precision for Random Forest :", precision)
print("Accuracy for Random Forest :", accuracy)
print("Recall for Random Forest :", recall)

import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, svm_predictions)

# Create a heatmap using seaborn


plt.figure(figsize=(6, 4))
sns.heatmap(cm, annot=True, fmt="d", cmap="Oranges", cbar=False, square=True
xticklabels=["Negative", "Positive"],
yticklabels=["Negative", "Positive"])

# Add labels and title


plt.xlabel("Predicted Label")
plt.ylabel("True Label")
plt.title("Confusion Matrix for SVM")

plt.show()

precision = precision_score(y_test, svm_predictions)


recall = recall_score(y_test, svm_predictions)
f1 = f1_score(y_test, svm_predictions)
accuracy = accuracy_score(y_test, svm_predictions)
print("F1 Score for SVM:", f1)
print("Precision for SVM:", precision)
print("Accuracy for SVM:", accuracy)
print("Recall for SVM:", recall)

localhost:8888/nbconvert/html/Downloads/2b.ipynb?download=false 4/5
23/07/2023, 23:52 2b

F1 Score for Random Forest : 0.7107438016528925


Precision for Random Forest : 0.7166666666666667
Accuracy for Random Forest : 0.717741935483871
Recall for Random Forest : 0.7049180327868853

F1 Score for SVM: 0.7096774193548387


Precision for SVM: 0.6984126984126984
Accuracy for SVM: 0.7096774193548387
Recall for SVM: 0.7213114754098361

In [ ]:

In [ ]:

localhost:8888/nbconvert/html/Downloads/2b.ipynb?download=false 5/5

You might also like