Import As Import As From Import

12/8/2019 Untitled19
In [53]: import pandas as pd

%matplotlib inline
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
df=pd.read_csv(r"C:\Users\User\Desktop\Cristano_Ronaldo_Final_v1\data.csv")
In [54]: df.head() #Printing Out the 1st 5 elements
Out[54]:
Unnamed:
match_event_id location_x location_y remaining_min power_of_shot knockout_match game_season remaining_sec distance_of_sho
0
0 0 10.0 167.0 72.0 10.0 1.0 0.0 2000-01 27.0 38
1 1 12.0 -157.0 0.0 10.0 1.0 0.0 2000-01 22.0 35
2 2 35.0 -101.0 135.0 7.0 1.0 0.0 2000-01 45.0 36
3 3 43.0 138.0 175.0 6.0 1.0 0.0 2000-01 52.0 42
4 4 155.0 0.0 0.0 NaN 2.0 0.0 2000-01 19.0 20
localhost:8888/notebooks/Untitled19.ipynb# 1/23
In [55]: pd.set_option('display.max_columns', None) #this function helps us view all the columns available
df=df.rename(columns={ df.columns[0]: "Index" })
df.head()
Out[55]:
Index match_event_id location_x location_y remaining_min power_of_shot knockout_match game_season remaining_sec distance_of_shot is
0 0 10.0 167.0 72.0 10.0 1.0 0.0 2000-01 27.0 38.0
1 1 12.0 -157.0 0.0 10.0 1.0 0.0 2000-01 22.0 35.0
2 2 35.0 -101.0 135.0 7.0 1.0 0.0 2000-01 45.0 36.0
3 3 43.0 138.0 175.0 6.0 1.0 0.0 2000-01 52.0 42.0
4 4 155.0 0.0 0.0 NaN 2.0 0.0 2000-01 19.0 20.0
In [56]: df.is_goal.unique() #checking out unique values in is_goal
Out[56]: array([nan, 0., 1.])
In [57]: pd.value_counts(df.is_goal) #counting the number of values of each unique type in is_goal
Out[57]: 0.0 13550

1.0 10879
Name: is_goal, dtype: int64
In [58]: df.area_of_shot.unique() #checking out unique values
Out[58]: array(['Right Side(R)', 'Left Side(L)', 'Left Side Center(LC)',

'Right Side Center(RC)', 'Center(C)', nan, 'Mid Ground(MG)'],
dtype=object)
In [59]: pd.value_counts(df.area_of_shot) #counting the number of values of each unique type
Out[59]: Center(C) 12761

Right Side Center(RC) 4562
Right Side(R) 4370
Left Side Center(LC) 3848
Left Side(L) 3573
Mid Ground(MG) 81
Name: area_of_shot, dtype: int64
In [60]: df.area_of_shot.fillna("Center(C)",inplace=True) #filling the missing values
In [61]: pd.value_counts(df.area_of_shot) #counting the number of values of each unique type
Out[61]: Center(C) 14263

Right Side Center(RC) 4562
Right Side(R) 4370
Left Side Center(LC) 3848
Left Side(L) 3573
Mid Ground(MG) 81
Name: area_of_shot, dtype: int64
In [62]: # Assigning integral values for each unique value

df.area_of_shot.replace({'Center(C)':1, 'Right Side Center(RC)':2,'Right Side(R)':3,'Left Side Center(LC)':4,'Left Side(
df.area_of_shot
Out[62]: 0 3
1 5
2 4
3 2
4 1
5 5
6 1
7 1
8 5
9 1
10 4
11 2
12 5
13 5
14 1
15 1
16 1
17 4
18 4
19 1
20 1
21 2
22 1
23 4
24 1
25 1
26 1
27 4
28 2
29 1
..
30667 1
30668 1
30669 1
30670 1
30671 1
30672 3
30673 3
30674 2
30675 4
30676 5
30677 4
30678 1
30679 1
30680 1
30681 1
30682 5
30683 1
30684 5
30685 1
30686 1
30687 1
30688 5
30689 6
30690 5
30691 1
30692 1
30693 1
30694 4
30695 1
30696 1
Name: area_of_shot, Length: 30697, dtype: int64
In [63]: df.area_of_shot.unique() #checking out unique values
Out[63]: array([3, 5, 4, 2, 1, 6], dtype=int64)
In [64]: df.shot_basics.unique() #checking out unique values
Out[64]: array(['Mid Range', 'Goal Area', 'Goal Line', 'Penalty Spot', nan,
'Right Corner', 'Mid Ground Line', 'Left Corner'], dtype=object)
In [65]: pd.value_counts(df.shot_basics) #counting the number of values of each unique type
Out[65]: Mid Range 11955

Goal Area 6787
Penalty Spot 5321
Goal Line 4357
Right Corner 367
Left Corner 268
Mid Ground Line 67
Name: shot_basics, dtype: int64
In [66]: df.shot_basics.fillna("Mid Range",inplace=True) #filling the missing values

pd.value_counts(df.shot_basics) #counting the number of values of each unique type
Out[66]: Mid Range 13530

Goal Area 6787
Penalty Spot 5321
Goal Line 4357
Right Corner 367
Left Corner 268
Mid Ground Line 67
Name: shot_basics, dtype: int64

df.shot_basics.replace({'Mid Range':1, 'Goal Area':2,'Penalty Spot':3,'Goal Line':4,'Right Corner':5,'Mid Ground Line':6
df.shot_basics
Out[67]: 0 1
1 1
2 1
3 1
4 2
5 1
6 2
7 2
8 4
9 4
10 3
11 1
12 4
13 1
14 4
15 1
16 2
17 1
18 1
19 1
20 1
21 1
22 4
23 1
24 4
25 2
26 1
27 3
28 1
29 4
..
30667 3
30668 1
30669 1
30670 4
30671 2
30672 1
30673 1
30674 3
30675 3
30676 1
30677 1
30678 2
30679 2
30680 2
30681 3
30682 4
30683 1
30684 1
30685 3
30686 4
30687 4
30688 1
30689 6
30690 1
30691 2
30692 1
30693 2
30694 1
30695 3
30696 4
Name: shot_basics, Length: 30697, dtype: int64
In [68]: df.range_of_shot.unique() #checking out unique values
Out[68]: array(['16-24 ft.', '8-16 ft.', 'Less Than 8 ft.', '24+ ft.', nan,
'Back Court Shot'], dtype=object)
In [69]: pd.value_counts(df.range_of_shot) #counting the number of values of each unique type
Out[69]: Less Than 8 ft. 8933

16-24 ft. 7892
8-16 ft. 6290
24+ ft. 5937
Back Court Shot 81
Name: range_of_shot, dtype: int64
In [70]: df.range_of_shot.fillna("Less Than 8 ft.",inplace=True) #filling the missing values

pd.value_counts(df.range_of_shot) #counting the number of values of each unique type
Out[70]: Less Than 8 ft. 10497

16-24 ft. 7892
8-16 ft. 6290
24+ ft. 5937
Back Court Shot 81
Name: range_of_shot, dtype: int64

df.range_of_shot.replace({'Less Than 8 ft.':1, '8-16 ft.':2,'16-24 ft.':3,'24+ ft.':4,'Back Court Shot':5},inplace=True)
df.range_of_shot
Out[71]: 0 3
1 2
2 3
3 3
4 1
5 2
6 1
7 1
8 2
9 2
10 4
11 3
12 2
13 2
14 1
15 3
16 1
17 1
18 3
19 1
20 2
21 3
22 2
23 3
24 1
25 1
26 3
27 4
28 3
29 2
..
30667 4
30668 3
30669 3
30670 1
30671 1
30672 2
30673 2
30674 4
30675 4
30676 2
30677 3
30678 1
30679 1
30680 1
30681 4
30682 2
30683 3
30684 2
30685 4
30686 2
30687 2
30688 2
30689 5
30690 2
30691 1
30692 1
30693 1
30694 3
30695 1
30696 1
Name: range_of_shot, Length: 30697, dtype: int64
In [72]: pd.value_counts(df.power_of_shot) #counting the number of values of each unique type
Out[72]: 3.0 7885

1.0 7659
4.0 6910
2.0 6399
5.0 314
6.0 37
7.0 7
Name: power_of_shot, dtype: int64
In [73]: df.power_of_shot.fillna(df.power_of_shot.mean(),inplace=True) #filling the missing values
In [74]: pd.value_counts(df.power_of_shot) #counting the number of values of each unique type
Out[74]: 3.000000 7885

1.000000 7659
4.000000 6910
2.000000 6399
2.519359 1486
5.000000 314
6.000000 37
7.000000 7
Name: power_of_shot, dtype: int64
In [75]: df.distance_of_shot.fillna(df.distance_of_shot.mean(),inplace=True) #filling the missing values
In [76]: df.distance_of_shot
Out[76]: 0 38.000000
1 35.000000
2 36.000000
3 42.000000
4 20.000000
5 34.000000
6 20.000000
7 22.000000
8 32.000000
9 32.000000
10 45.000000
11 37.000000
12 33.448884
13 29.000000
14 25.000000
15 40.000000
16 20.000000
17 45.000000
18 36.000000
19 20.000000
20 34.000000
21 38.000000
22 31.000000
23 38.000000
24 27.000000
25 20.000000
26 40.000000
27 46.000000
28 39.000000
29 28.000000
...
30667 45.000000
30668 42.000000
30669 37.000000
30670 27.000000
30671 20.000000
30672 30.000000
30673 31.000000
30674 44.000000
30675 45.000000
30676 34.000000
30677 38.000000
30678 33.448884
30679 20.000000
30680 20.000000
30681 46.000000
30682 28.000000
30683 41.000000
30684 33.000000
30685 46.000000
30686 29.000000
30687 30.000000
30688 33.000000
30689 87.000000
30690 35.000000
30691 20.000000
30692 24.000000
30693 20.000000
30694 41.000000
30695 46.000000
30696 27.000000
Name: distance_of_shot, Length: 30697, dtype: float64
In [ ]:
In [118]: df.location_x.fillna(df.location_x.mean(),inplace=True) #filling the missing values
In [82]: df.location_y.fillna(df.location_y.mean(),inplace=True) #filling the missing values
In [84]: import numpy as np

df2=df[~np.isfinite(df.is_goal)] #Assigning all the rows of df with missing values in is_goal to df2
df2
Out[84]:
Index match_event_id location_x location_y remaining_min power_of_shot knockout_match game_season remaining_sec distance_o
0 0 10.0 167.000000 72.000000 10.0 1.000000 0.0 2000-01 27.0
7 7 254.0 1.000000 28.000000 8.0 3.000000 0.0 2000-01 5.0
16 16 100.0 0.000000 0.000000 0.0 1.000000 0.0 2000-01 1.0
19 19 249.0 0.000000 0.000000 10.0 3.000000 0.0 NaN 46.0
21 21 265.0 134.000000 127.000000 9.0 3.000000 0.0 NaN 4.0
32 32 4.0 163.000000 76.000000 11.0 1.000000 0.0 2000-01 26.0
33 33 8.0 70.000000 194.000000 10.0 1.000000 0.0 2000-01 58.0
In [85]: df=df[np.isfinite(df.is_goal)] #Assigning all the rows of df with finding values in is_goal to df
df
Out[85]:
Index match_event_id location_x location_y remaining_min power_of_shot knockout_match game_season remaining_sec distance_o
1 1 12.0 -157.000000 0.000000 10.0 1.000000 0.0 2000-01 22.0 35.
2 2 35.0 -101.000000 135.000000 7.0 1.000000 0.0 2000-01 45.0 36.
3 3 43.0 138.000000 175.000000 6.0 1.000000 0.0 2000-01 52.0 42.
4 4 155.0 0.000000 0.000000 NaN 2.000000 0.0 2000-01 19.0 20.
5 5 244.0 -145.000000 -11.000000 9.0 3.000000 0.0 NaN 32.0 34.
6 6 251.0 0.000000 0.000000 8.0 2.519359 0.0 2000-01 52.0 20.
8 8 265.0 -65.000000 91.126933 6.0 3.000000 0.0 2000-01 12.0 32.
In [86]: # The column shot_id_number has a lot of nan values.It is important to fill those values since this column us needed in
# submission file. A loop is written and and the missing values of shot_id_number is assigned by adding to the correspon
# value of the Index column
i=0
while i<6268:
df2.shot_id_number.iloc[i]=df2.Index.iloc[i]+1
i=i+1
C:\ProgramData\Anaconda3\lib\site-packages\pandas\core\indexing.py:189: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame
See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-co

py (http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy)
self._setitem_with_indexer(indexer, value)
In [87]: df2.shot_id_number
Out[87]: 0 1.0
7 8.0
16 17.0
19 20.0
21 22.0
32 33.0
33 34.0
34 35.0
35 36.0
36 37.0
37 38.0
44 45.0
49 50.0
54 55.0
59 60.0
61 62.0
65 66.0
66 67.0
70 71.0
71 72.0
75 76.0
79 80.0
84 85.0
85 86.0
86 87.0
91 92.0
94 95.0
96 97.0
103 104.0
112 113.0
...
30567 30568.0
30569 30570.0
30580 30581.0
30583 30584.0
30590 30591.0
30593 30594.0
30613 30614.0
30616 30617.0
30617 30618.0
30625 30626.0
30629 30630.0
30630 30631.0
30631 30632.0
30633 30634.0
30635 30636.0
30636 30637.0
30638 30639.0
30646 30647.0
30648 30649.0
30655 30656.0
30659 30660.0
30664 30665.0
30668 30669.0
30679 30680.0
30680 30681.0
30681 30682.0
30682 30683.0
30686 30687.0
30687 30688.0
30693 30694.0
Name: shot_id_number, Length: 6268, dtype: float64
In [ ]:
In [88]: model=LogisticRegression() #Logistic regression is used
In [89]: X=df[['location_x','location_y','power_of_shot','distance_of_shot','area_of_shot','shot_basics','range_of_shot']]
# important features are taken to be fed to logistic regression
In [90]: X.head()
Out[90]:
location_x location_y power_of_shot distance_of_shot area_of_shot shot_basics range_of_shot
1 -157.0 0.0 1.0 35.0 5 1 2
2 -101.0 135.0 1.0 36.0 4 1 3
3 138.0 175.0 1.0 42.0 2 1 3
4 0.0 0.0 2.0 20.0 1 2 1
5 -145.0 -11.0 3.0 34.0 5 1 2
In [91]: Y=df[['is_goal']]
In [92]: model.fit(X,Y) #the model is trained
C:\ProgramData\Anaconda3\lib\site-packages\sklearn\linear_model\logistic.py:433: FutureWarning: Default solver will be

changed to 'lbfgs' in 0.22. Specify a solver to silence this warning.
FutureWarning)
C:\ProgramData\Anaconda3\lib\site-packages\sklearn\utils\validation.py:761: DataConversionWarning: A column-vector y wa
s passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().
y = column_or_1d(y, warn=True)
Out[92]: LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,

intercept_scaling=1, max_iter=100, multi_class='warn',
n_jobs=None, penalty='l2', random_state=None, solver='warn',
tol=0.0001, verbose=0, warm_start=False)
In [93]: X2=df2[['location_x','location_y','power_of_shot','distance_of_shot','area_of_shot','shot_basics','range_of_shot']]
In [94]: X2.head()
Out[94]:
location_x location_y power_of_shot distance_of_shot area_of_shot shot_basics range_of_shot
0 167.0 72.0 1.0 38.0 3 1 3
7 1.0 28.0 3.0 22.0 1 2 1
16 0.0 0.0 1.0 20.0 1 2 1
19 0.0 0.0 3.0 20.0 1 1 1
21 134.0 127.0 3.0 38.0 2 1 3
In [101]: k=model.predict_proba(X2) # probability of occurence of goal is predicted

print(k[:,1])
[0.40169024 0.57141636 0.61488025 ... 0.48234342 0.46908281 0.58139038]
In [102]: df2.is_goal=k
In [111]: k2=df2[['shot_id_number','is_goal']]
k2
Out[111]:
shot_id_number is_goal
0 1.0 0.598310
7 8.0 0.428584
16 17.0 0.385120
19 20.0 0.396475
21 22.0 0.600862
32 33.0 0.589593
33 34.0 0.595674
34 35.0 0.396205
35 36.0 0.403566
36 37.0 0.385752
37 38.0 0.594587
44 45.0 0.425638
49 50.0 0.628595
54 55.0 0.697189
59 60.0 0.684408
61 62.0 0.471838
65 66.0 0.532036
66 67.0 0.553568
70 71.0 0.610627
71 72.0 0.630920
75 76.0 0.652395
79 80.0 0.517826
84 85.0 0.473821
85 86.0 0.580721
86 87.0 0.578107
91 92.0 0.571578
94 95.0 0.541471
96 97.0 0.607626
103 104.0 0.606294
112 113.0 0.645887
... ... ...
30567 30568.0 0.561095
30569 30570.0 0.396179
30580 30581.0 0.611101
30583 30584.0 0.724255
30590 30591.0 0.488396
30593 30594.0 0.396179
30613 30614.0 0.578843
30616 30617.0 0.627606
30617 30618.0 0.481273
30625 30626.0 0.532360
30629 30630.0 0.534439
30630 30631.0 0.627044
30631 30632.0 0.535716
30633 30634.0 0.484675
30635 30636.0 0.432006
30636 30637.0 0.432006
30638 30639.0 0.601569
30646 30647.0 0.688807
30648 30649.0 0.463663
30655 30656.0 0.569293
30659 30660.0 0.596168
30664 30665.0 0.494496
30668 30669.0 0.615066
30679 30680.0 0.396179
30680 30681.0 0.396179
30681 30682.0 0.647187
30682 30683.0 0.593148
30686 30687.0 0.517657
30687 30688.0 0.530917
30693 30694.0 0.418610
6268 rows × 2 columns
In [115]: # the predicted values are written into a csv file

k2.to_csv("C:/Users/User/Desktop/Cristano_Ronaldo_Final_v1/sample_submission.csv",index=False)
In [ ]:

Import As Import As From Import

Uploaded by

Document Information

Original Description:

Original Title

Copyright

Available Formats

Share this document

Share or Embed Document

Sharing Options

Did you find this document useful?

Is this content inappropriate?

Copyright:

Available Formats

Import As Import As From Import

Uploaded by

Copyright:

Available Formats

12/8/2019 Untitled19

In [53]: import pandas as pd

In [54]: df.head() #Printing Out the 1st 5 elements

0 0 10.0 167.0 72.0 10.0 1.0 0.0 2000-01 27.0 38

1 1 12.0 -157.0 0.0 10.0 1.0 0.0 2000-01 22.0 35

2 2 35.0 -101.0 135.0 7.0 1.0 0.0 2000-01 45.0 36

3 3 43.0 138.0 175.0 6.0 1.0 0.0 2000-01 52.0 42

4 4 155.0 0.0 0.0 NaN 2.0 0.0 2000-01 19.0 20

0 0 10.0 167.0 72.0 10.0 1.0 0.0 2000-01 27.0 38.0

1 1 12.0 -157.0 0.0 10.0 1.0 0.0 2000-01 22.0 35.0

2 2 35.0 -101.0 135.0 7.0 1.0 0.0 2000-01 45.0 36.0

3 3 43.0 138.0 175.0 6.0 1.0 0.0 2000-01 52.0 42.0

4 4 155.0 0.0 0.0 NaN 2.0 0.0 2000-01 19.0 20.0

In [56]: df.is_goal.unique() #checking out unique values in is_goal

Out[56]: array([nan, 0., 1.])

Out[57]: 0.0 13550

In [58]: df.area_of_shot.unique() #checking out unique values

Out[58]: array(['Right Side(R)', 'Left Side(L)', 'Left Side Center(LC)',

In [59]: pd.value_counts(df.area_of_shot) #counting the number of values of each unique type

Out[59]: Center(C) 12761

In [60]: df.area_of_shot.fillna("Center(C)",inplace=True) #filling the missing values

In [61]: pd.value_counts(df.area_of_shot) #counting the number of values of each unique type

Out[61]: Center(C) 14263

In [62]: # Assigning integral values for each unique value

In [63]: df.area_of_shot.unique() #checking out unique values

Out[63]: array([3, 5, 4, 2, 1, 6], dtype=int64)

In [64]: df.shot_basics.unique() #checking out unique values

In [65]: pd.value_counts(df.shot_basics) #counting the number of values of each unique type

Out[65]: Mid Range 11955

In [66]: df.shot_basics.fillna("Mid Range",inplace=True) #filling the missing values

Out[66]: Mid Range 13530

In [67]: # Assigning integral values for each unique value

In [68]: df.range_of_shot.unique() #checking out unique values

In [69]: pd.value_counts(df.range_of_shot) #counting the number of values of each unique type

Out[69]: Less Than 8 ft. 8933

In [70]: df.range_of_shot.fillna("Less Than 8 ft.",inplace=True) #filling the missing values

Out[70]: Less Than 8 ft. 10497

In [71]: # Assigning integral values for each unique value

In [72]: pd.value_counts(df.power_of_shot) #counting the number of values of each unique type

Out[72]: 3.0 7885

In [73]: df.power_of_shot.fillna(df.power_of_shot.mean(),inplace=True) #filling the missing values

In [74]: pd.value_counts(df.power_of_shot) #counting the number of values of each unique type

Out[74]: 3.000000 7885

In [75]: df.distance_of_shot.fillna(df.distance_of_shot.mean(),inplace=True) #filling the missing values

In [118]: df.location_x.fillna(df.location_x.mean(),inplace=True) #filling the missing values

In [82]: df.location_y.fillna(df.location_y.mean(),inplace=True) #filling the missing values

In [84]: import numpy as np

0 0 10.0 167.000000 72.000000 10.0 1.000000 0.0 2000-01 27.0

7 7 254.0 1.000000 28.000000 8.0 3.000000 0.0 2000-01 5.0

16 16 100.0 0.000000 0.000000 0.0 1.000000 0.0 2000-01 1.0

19 19 249.0 0.000000 0.000000 10.0 3.000000 0.0 NaN 46.0

21 21 265.0 134.000000 127.000000 9.0 3.000000 0.0 NaN 4.0

32 32 4.0 163.000000 76.000000 11.0 1.000000 0.0 2000-01 26.0

33 33 8.0 70.000000 194.000000 10.0 1.000000 0.0 2000-01 58.0

1 1 12.0 -157.000000 0.000000 10.0 1.000000 0.0 2000-01 22.0 35.

2 2 35.0 -101.000000 135.000000 7.0 1.000000 0.0 2000-01 45.0 36.

3 3 43.0 138.000000 175.000000 6.0 1.000000 0.0 2000-01 52.0 42.

4 4 155.0 0.000000 0.000000 NaN 2.000000 0.0 2000-01 19.0 20.

5 5 244.0 -145.000000 -11.000000 9.0 3.000000 0.0 NaN 32.0 34.

6 6 251.0 0.000000 0.000000 8.0 2.519359 0.0 2000-01 52.0 20.

8 8 265.0 -65.000000 91.126933 6.0 3.000000 0.0 2000-01 12.0 32.

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-co