Import As Import As From Import Import As Matplotlib Import Import

23/07/2023, 23:52 2b
In [63]: import pandas as pd

import numpy as np
from sklearn.model_selection import cross_val_score
import matplotlib.pyplot as plt

%matplotlib inline
import math
import warnings
warnings.filterwarnings('ignore')
from sklearn import metrics
from scipy.special import legendre

from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression
import seaborn as sns
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from sklearn.metrics import recall_score
from sklearn.impute import SimpleImputer
#df = pd.read_csv('/Users/sangeetha/consolidated_output.csv')
df = pd.read_csv('/Users/sangeetha/outputweo1.csv')
to_drop = ['match_id','match_num',
'duration',
'totalsets',
'P1_setswon',
'P2_setswon','P1_gameswon','P2_gameswon',
'P1_pointswon','P2_pointswon','winner','player1_rating','player2_
'prob',
'weo_prob',
'win_play_2',
'updated_win_match_1','updated_win_match_2',
'updated_total_match_1', 'updated_total_match_2',
'player1_fg','player2_fg',
'current_rating_player1','current_rating_player2',
'updated_rating_player1','updated_rating_player2',
#'current_weo_rating_player1','current_weo_rating_player2']
'updated_elo_rating_player1','updated_elo_rating_player2']
df['win_ratio_player1'] = df['win_match_1'] / df['total_match_1']
df['win_ratio_player2'] = df['win_match_2'] / df['total_match_2']
df.drop(columns=to_drop, inplace=True)
to_drop = [
'total_match_1', 'total_match_2',
'win_match_1', 'win_match_2']
df.drop(columns=to_drop, inplace=True)
print(df.columns)
localhost:8888/nbconvert/html/Downloads/2b.ipynb?download=false 1/5
23/07/2023, 23:52 2b
Index(['year', 'player1', 'player2', 'win_play_1',
'current_weo_rating_player1', 'current_weo_rating_player2',
'win_ratio_player1', 'win_ratio_player2'],
dtype='object')
In [64]: # Handle missing values, excluding the 'match_id' column from imputation
imputer = SimpleImputer(strategy='mean')
numeric_cols = df.select_dtypes(include=[float, int]).columns
df[numeric_cols] = imputer.fit_transform(df[numeric_cols])
# Splitting the data into training and test sets based on the year (2019)
train_data = df[df['year'] < 2019]
test_data = df[df['year'] == 2019]
# Drop unnecessary columns (match_id, year) and target column (winner) from
X_train = train_data.drop(columns=['year', 'win_play_1'])
y_train = train_data['win_play_1']
# Drop unnecessary columns (match_id, year) and target column (winner) from
X_test = test_data.drop(columns=['year', 'win_play_1'])
y_test = test_data['win_play_1']
# Combine player1 and player2 names from both training and test sets for lab
combined_players = pd.concat([X_train['player1'], X_train['player2'], X_test
label_encoder = LabelEncoder()
label_encoder.fit(combined_players)
# Encode player names as categorical variables

X_train['player1'] = label_encoder.transform(X_train['player1'])
X_train['player2'] = label_encoder.transform(X_train['player2'])
X_test['player1'] = label_encoder.transform(X_test['player1'])
X_test['player2'] = label_encoder.transform(X_test['player2'])
##############################################
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)
##########################################
# Training the first classification algorithm - Random Forest

#rf_classifier = RandomForestClassifier(random_state=42)
rf_classifier = RandomForestClassifier(n_estimators=101, random_state=42)
rf_scores = cross_val_score(rf_classifier, X_train, y_train, cv=100)
rf_classifier.fit(X_train, y_train)
# Predicting outcomes using the trained Random Forest model

rf_predictions = rf_classifier.predict(X_test)
# Calculate accuracy for Random Forest

rf_accuracy = accuracy_score(y_test, rf_predictions)
rf_recall = recall_score(y_test, rf_predictions)
# Training the second classification algorithm - Support Vector Machine (SVM

svm_classifier = SVC(random_state=42)
svm_classifier.fit(X_train, y_train)
# Predicting outcomes using the trained SVM model

svm_predictions = svm_classifier.predict(X_test)
# Calculate accuracy for SVM
23/07/2023, 23:52 2b
svm_accuracy = accuracy_score(y_test, svm_predictions)
svm_recall = recall_score(y_test, svm_predictions)
##########Feature importance in random dorest

feature_importances = rf_classifier.feature_importances_
top_feature_indices = np.argsort(feature_importances)[::-1][:10]
top_features = [(f"Feature {index}", importance) for index, importance in zi
# Reporting the accuracy of both algorithms

print("Random Forest Accuracy for :", rf_accuracy)
print("SVM Accuracy for :", svm_accuracy)
print("Random Forest Recall for :", rf_recall)
print("SVM Recall for:", svm_recall)
Random Forest Accuracy for : 0.717741935483871

SVM Accuracy for : 0.7096774193548387
Random Forest Recall for : 0.7049180327868853
SVM Recall for: 0.7213114754098361
In [65]: top_feature_indices = np.argsort(feature_importances)[::-1][:10]

top_features = [(f"Feature {index}", importance) for index, importance in zi
print(top_features)
[('Feature 3', 0.22354211445044533), ('Feature 2', 0.2132744524412143), ('Fe

ature 0', 0.17385511162671627), ('Feature 1', 0.1626429559386697), ('Feature
5', 0.11334771871328483), ('Feature 4', 0.11333764682966967)]
In [66]: X_df = pd.DataFrame(X_train, columns=['Feature_' + str(i) for i in range(tra
# Create a DataFrame to store feature names and their importances

feature_importance_df = pd.DataFrame({'Feature': X_df.columns, 'Importance':
# Sort the DataFrame by importance in descending order

feature_importance_df = feature_importance_df.sort_values(by='Importance', a
# Extract the top features (e.g., top 10 features)

top_features = feature_importance_df.head(10)
print(top_features)
Feature Importance
3 Feature_3 0.223542
2 Feature_2 0.213274
0 Feature_0 0.173855
1 Feature_1 0.162643
5 Feature_5 0.113348
4 Feature_4 0.113338
In [67]: dftemp = train_data.drop(columns=['year', 'win_play_1'])

dflist = dftemp.columns.tolist
print(dflist)
<bound method IndexOpsMixin.tolist of Index(['player1', 'player2', 'current_

weo_rating_player1',
'current_weo_rating_player2', 'win_ratio_player1', 'win_ratio_player
2'],
dtype='object')>
In [54]: import numpy as np

from sklearn.metrics import confusion_matrix, precision_score, recall_score,
cm = confusion_matrix(y_test, rf_predictions)
23/07/2023, 23:52 2b
# Create a heatmap using seaborn
plt.figure(figsize=(6, 4))
sns.heatmap(cm, annot=True, fmt="d", cmap="Blues", cbar=False, square=True,
xticklabels=["Negative", "Positive"],
yticklabels=["Negative", "Positive"])
# Add labels and title

plt.xlabel("Predicted Label")
plt.ylabel("True Label")
plt.title("Confusion Matrix for Random Forest")
plt.show()
precision = precision_score(y_test, rf_predictions)

recall = recall_score(y_test, rf_predictions)
f1 = f1_score(y_test, rf_predictions)
accuracy = accuracy_score(y_test, rf_predictions)
print("F1 Score for Random Forest :", f1)
print("Precision for Random Forest :", precision)
print("Accuracy for Random Forest :", accuracy)
print("Recall for Random Forest :", recall)
import numpy as np
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, svm_predictions)
# Create a heatmap using seaborn

plt.figure(figsize=(6, 4))
sns.heatmap(cm, annot=True, fmt="d", cmap="Oranges", cbar=False, square=True
xticklabels=["Negative", "Positive"],
yticklabels=["Negative", "Positive"])
# Add labels and title

plt.xlabel("Predicted Label")
plt.ylabel("True Label")
plt.title("Confusion Matrix for SVM")
plt.show()
precision = precision_score(y_test, svm_predictions)

recall = recall_score(y_test, svm_predictions)
f1 = f1_score(y_test, svm_predictions)
accuracy = accuracy_score(y_test, svm_predictions)
print("F1 Score for SVM:", f1)
print("Precision for SVM:", precision)
print("Accuracy for SVM:", accuracy)
print("Recall for SVM:", recall)
23/07/2023, 23:52 2b
F1 Score for Random Forest : 0.7107438016528925

Precision for Random Forest : 0.7166666666666667
Accuracy for Random Forest : 0.717741935483871
Recall for Random Forest : 0.7049180327868853
F1 Score for SVM: 0.7096774193548387

Precision for SVM: 0.6984126984126984
Accuracy for SVM: 0.7096774193548387
Recall for SVM: 0.7213114754098361
In [ ]:
In [ ]:

Import As Import As From Import Import As Matplotlib Import Import

Uploaded by

Document Information

Original Title

Copyright

Available Formats

Share this document

Share or Embed Document

Sharing Options

Did you find this document useful?

Is this content inappropriate?

Copyright:

Available Formats

Import As Import As From Import Import As Matplotlib Import Import

Uploaded by

Copyright:

Available Formats

23/07/2023, 23:52 2b

In [63]: import pandas as pd

import matplotlib.pyplot as plt

from sklearn import metrics

from scipy.special import legendre

# Encode player names as categorical variables

# Training the first classification algorithm - Random Forest

# Predicting outcomes using the trained Random Forest model

# Calculate accuracy for Random Forest

# Training the second classification algorithm - Support Vector Machine (SVM

# Predicting outcomes using the trained SVM model

# Calculate accuracy for SVM

##########Feature importance in random dorest

# Reporting the accuracy of both algorithms

Random Forest Accuracy for : 0.717741935483871

In [65]: top_feature_indices = np.argsort(feature_importances)[::-1][:10]

[('Feature 3', 0.22354211445044533), ('Feature 2', 0.2132744524412143), ('Fe

In [66]: X_df = pd.DataFrame(X_train, columns=['Feature_' + str(i) for i in range(tra

# Create a DataFrame to store feature names and their importances

# Sort the DataFrame by importance in descending order

# Extract the top features (e.g., top 10 features)

In [67]: dftemp = train_data.drop(columns=['year', 'win_play_1'])

<bound method IndexOpsMixin.tolist of Index(['player1', 'player2', 'current_

In [54]: import numpy as np

# Add labels and title

precision = precision_score(y_test, rf_predictions)

# Create a heatmap using seaborn

# Add labels and title

precision = precision_score(y_test, svm_predictions)

F1 Score for Random Forest : 0.7107438016528925

F1 Score for SVM: 0.7096774193548387

You might also like