Python Project

You might also like

Download as pdf or txt
Download as pdf or txt
You are on page 1of 1

Project : Movielens Case Study

In [1]: #Import the required libraries


import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

from sklearn.model_selection import train_test_split


from sklearn.neighbors import KNeighborsClassifier

In [2]: df1 = pd.read_csv("movies.dat", sep='::', header=None, engine='python')


df1.columns = ['MovieID','Title','Genres']
df1.head()

Out[2]:
MovieID Title Genres

0 1 Toy Story (1995) Animation|Children's|Comedy

1 2 Jumanji (1995) Adventure|Children's|Fantasy

2 3 Grumpier Old Men (1995) Comedy|Romance

3 4 Waiting to Exhale (1995) Comedy|Drama

4 5 Father of the Bride Part II (1995) Comedy

In [3]: df1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3883 entries, 0 to 3882
Data columns (total 3 columns):
MovieID 3883 non-null int64
Title 3883 non-null object
Genres 3883 non-null object
dtypes: int64(1), object(2)
memory usage: 91.1+ KB

In [4]: df2 = pd.read_csv("ratings.dat", sep='::', header=None, engine='python')


df2.columns = ['UserID','MovieID','Rating','Timestamp']
df2.head()

Out[4]:
UserID MovieID Rating Timestamp

0 1 1193 5 978300760

1 1 661 3 978302109

2 1 914 3 978301968

3 1 3408 4 978300275

4 1 2355 5 978824291

In [5]: df2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000209 entries, 0 to 1000208
Data columns (total 4 columns):
UserID 1000209 non-null int64
MovieID 1000209 non-null int64
Rating 1000209 non-null int64
Timestamp 1000209 non-null int64
dtypes: int64(4)
memory usage: 30.5 MB

In [6]: df3 = pd.read_csv("users.dat", sep='::', header=None, engine='python')


df3.columns = ['UserID','Gender','Age','Occupation','Zip-code']
df3.head()

Out[6]:
UserID Gender Age Occupation Zip-code

0 1 F 1 10 48067

1 2 M 56 16 70072

2 3 M 25 15 55117

3 4 M 45 7 02460

4 5 M 25 20 55455

In [7]: df3.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6040 entries, 0 to 6039
Data columns (total 5 columns):
UserID 6040 non-null int64
Gender 6040 non-null object
Age 6040 non-null int64
Occupation 6040 non-null int64
Zip-code 6040 non-null object
dtypes: int64(3), object(2)
memory usage: 236.0+ KB

Merging datasets

In [8]: finalDF = pd.merge(pd.merge(df1,df2, on = 'MovieID'),df3, on = 'UserID')


finalDF.head()

Out[8]:
Zip-
MovieID Title Genres UserID Rating Timestamp Gender Age Occupation
code

Toy Story
0 1 Animation|Children's|Comedy 1 5 978824268 F 1 10 48067
(1995)

Pocahontas
1 48 Animation|Children's|Musical|Romance 1 5 978824351 F 1 10 48067
(1995)

Apollo 13
2 150 Drama 1 5 978301777 F 1 10 48067
(1995)

Star Wars:
Episode IV -
3 260 Action|Adventure|Fantasy|Sci-Fi 1 4 978300760 F 1 10 48067
A New Hope
(1977)

Schindler's
4 527 Drama|War 1 5 978824195 F 1 10 48067
List (1993)

User Age Distribution

In [9]: users_Age = df3.groupby(['Age']).size()


users_Age

Out[9]: Age
1 222
18 1103
25 2096
35 1193
45 550
50 496
56 380
dtype: int64

In [10]: plt.figure(figsize = (4,6))


users_Age.plot.bar(color='r',width=.4,alpha=0.8)
plt.title("User Age Distribution")
plt.xlabel("Age")
plt.ylabel("No of Users")
plt.show()

In [11]: plt.figure(figsize = (4,6))


plt.hist(df3['Age'])
plt.xlabel('Age Distribution')
plt.ylabel('No. of Users')
plt.show()

Overall rating by users

In [12]: df2['Rating'].unique()

Out[12]: array([5, 3, 4, 2, 1], dtype=int64)

In [13]: users_Overall_Ratings = df2.groupby(['Rating'],axis = 0).UserID.size()


print (users_Overall_Ratings)

Rating
1 56174
2 107557
3 261197
4 348971
5 226310
Name: UserID, dtype: int64

In [14]: plt.figure(figsize = (4,6))


users_Overall_Ratings.plot.bar(color='r',width=.4,alpha=0.8)
plt.xlabel('Ratings')
plt.ylabel('No. of Users')
plt.show()

User rating of the movie “Toy Story”

In [15]: MovieTitles= finalDF["Title"].unique()


toyMovie = []
for i in MovieTitles:
if i.startswith("Toy Story") == True:
toyMovie.append(i)
toyMovie

Out[15]: ['Toy Story (1995)', 'Toy Story 2 (1999)']

In [16]: Toy1995 = finalDF[finalDF['Title'] == 'Toy Story (1995)']


Toy1995_rating = Toy1995.groupby('Rating')['UserID'].count()
print(Toy1995_rating)
Toy1995_rating.plot(kind='bar')

Rating
1 16
2 61
3 345
4 835
5 820
Name: UserID, dtype: int64

Out[16]: <matplotlib.axes._subplots.AxesSubplot at 0xb8771d0>

In [17]: Toy1995_rating.plot(kind='pie')

Out[17]: <matplotlib.axes._subplots.AxesSubplot at 0xb873c50>

In [18]: Toy1999 = finalDF[finalDF['Title'] == 'Toy Story 2 (1999)']


Toy1999_rating = Toy1999.groupby('Rating')['UserID'].count()
print(Toy1999_rating)
Toy1999_rating.plot(kind='bar')

Rating
1 25
2 44
3 214
4 578
5 724
Name: UserID, dtype: int64

Out[18]: <matplotlib.axes._subplots.AxesSubplot at 0x1c97f0f0>

In [19]: Toy1999_rating.plot(kind='pie')

Out[19]: <matplotlib.axes._subplots.AxesSubplot at 0x1c98dd68>

In [20]: Toy = finalDF[(finalDF['Title'].isin(['Toy Story 2 (1999)','Toy Story (1995)']))]


Toy_rating = Toy.groupby('Rating')['UserID'].count()
print(Toy_rating)
Toy_rating.plot(kind='bar')

Rating
1 41
2 105
3 559
4 1413
5 1544
Name: UserID, dtype: int64

Out[20]: <matplotlib.axes._subplots.AxesSubplot at 0xb842550>

In [21]: Toy_rating.plot(kind='pie')

Out[21]: <matplotlib.axes._subplots.AxesSubplot at 0xb860358>

Viewership of the movie “Toy Story” by age group

In [22]: Rating_Toy_Story = (Toy).groupby('Age',axis=0).Rating.count()


plt.figure(figsize = (4,6))
Rating_Toy_Story.plot.bar(color='magenta',width=.4,alpha=0.8)
plt.xlabel('User Age rates Toy Story')
plt.ylabel('No. of Users')
plt.show()

Top 25 movies by viewership rating

In [23]: Top = finalDF.groupby('MovieID')['Rating'].count().nlargest(25)


print(Top)
Top.plot(kind='bar')
plt.xlabel('Movie ID Num')
plt.ylabel('Ratings Count')
plt.show()

MovieID
2858 3428
260 2991
1196 2990
1210 2883
480 2672
2028 2653
589 2649
2571 2590
1270 2583
593 2578
1580 2538
1198 2514
608 2513
2762 2459
110 2443
2396 2369
1197 2318
527 2304
1617 2288
1265 2278
1097 2269
2628 2250
2997 2241
318 2227
858 2223
Name: Rating, dtype: int64

In [24]: Top.plot(kind ='pie')

Out[24]: <matplotlib.axes._subplots.AxesSubplot at 0x1c9654a8>

Rating for a particular user of user id = 2696

In [25]: Rating_of_2696 = finalDF[finalDF['UserID'] == 2696].groupby('Rating')['Rating'].count()


print(Rating_of_2696)
Rating_of_2696.plot(kind='bar')
plt.xlabel('Ratings by User 2696')
plt.ylabel('Rating Counts')
plt.show()

Rating
1 2
2 3
3 3
4 11
5 1
Name: Rating, dtype: int64

In [26]: Rating_of_2696.plot(kind='pie')

Out[26]: <matplotlib.axes._subplots.AxesSubplot at 0x1c99dfd0>

In [27]: #Find out all the unique genres


finalDF['Genres'].unique

Out[27]: <bound method Series.unique of 0 Animation|Children's|Comedy


1 Animation|Children's|Musical|Romance
2 Drama
3 Action|Adventure|Fantasy|Sci-Fi
4 Drama|War
5 Children's|Drama
6 Animation|Children's|Comedy|Musical
7 Animation|Children's|Musical
8 Animation|Children's|Musical
9 Crime|Drama|Thriller
10 Animation|Children's|Musical
11 Animation
12 Animation|Comedy|Thriller
13 Animation|Children's|Musical
14 Musical|Romance
15 Adventure|Children's|Drama|Musical
16 Musical
17 Animation|Children's|Musical
18 Children's|Comedy|Musical
19 Animation|Children's|Musical
20 Musical
21 Children's|Drama|Fantasy|Sci-Fi
22 Drama
23 Action|Adventure|Comedy|Romance
24 Drama
25 Drama
26 Comedy|Sci-Fi
27 Action|Adventure|Drama
28 Drama
29 Adventure|Animation|Children's|Comedy|Musical
...
1000179 Action|Thriller
1000180 Comedy
1000181 Comedy|Romance
1000182 Sci-Fi|Thriller
1000183 Thriller
1000184 Thriller
1000185 Comedy|Drama
1000186 Action|Thriller
1000187 Comedy
1000188 Thriller
1000189 Drama|War
1000190 Horror|Romance
1000191 Action|Thriller
1000192 Animation|Children's|Comedy
1000193 Drama|Mystery|Thriller
1000194 Comedy
1000195 Horror|Mystery|Thriller
1000196 Comedy|Crime
1000197 Action|Thriller
1000198 Comedy|Drama
1000199 Drama
1000200 Drama|Thriller
1000201 Comedy
1000202 Animation|Children's
1000203 Thriller
1000204 Drama|Thriller
1000205 Comedy|Horror|Thriller
1000206 Comedy|Romance
1000207 Action|Thriller
1000208 Action|Drama
Name: Genres, Length: 1000209, dtype: object>

In [28]: #one-hot encoding for genre


ohe_genre=pd.concat([pd.get_dummies(finalDF['Genres']),finalDF.iloc[:,[0,1,3,4,5,6,7,8,9]]],axis=1)
ohe_genre.head()

Out[28]:
Action|Adventure|Anima
Action Action|Adventure Action|Adventure|Animation Action|Adventure|Animation|Children's|Fantasy

0 0 0 0 0

1 0 0 0 0

2 0 0 0 0

3 0 0 0 0

4 0 0 0 0

5 rows × 310 columns

Machine Learning

In [29]: #creating features and label by taking sample data

features = finalDF.iloc[:500,[0,7,8]]
label = finalDF.iloc[:500,4]

In [30]: features.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 500 entries, 0 to 499
Data columns (total 3 columns):
MovieID 500 non-null int64
Age 500 non-null int64
Occupation 500 non-null int64
dtypes: int64(3)
memory usage: 15.6 KB

In [31]: #train_test_split

X_train, X_test, y_train, y_test = train_test_split(features,label,test_size=0.2,random_state=912)

In [32]: #create model

model = KNeighborsClassifier(n_neighbors=15)
model.fit(X_train,y_train)

Out[32]: KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',


metric_params=None, n_jobs=None, n_neighbors=15, p=2,
weights='uniform')

In [33]: print(model.score(X_train,y_train))
print(model.score(X_test,y_test))

0.4575
0.47

Histogram for movie

In [34]: plt.figure(figsize = (4,6))


plt.hist(finalDF['MovieID'],color = 'b')
plt.xlabel('movie id')
plt.show()

Histogram for age

In [35]: plt.figure(figsize = (4,6))


plt.hist(finalDF['Age'],color = 'g')
plt.xlabel('Age Distribution')
plt.show()

Histogram for Occupation

In [36]: plt.figure(figsize = (4,6))


plt.hist(finalDF['Occupation'],color = 'r')
plt.xlabel('occupation id')
plt.show()

You might also like