Python Project

Project : Movielens Case Study
In [1]: #Import the required libraries

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
from sklearn.model_selection import train_test_split

from sklearn.neighbors import KNeighborsClassifier
In [2]: df1 = pd.read_csv("movies.dat", sep='::', header=None, engine='python')

df1.columns = ['MovieID','Title','Genres']
df1.head()
Out[2]:
MovieID Title Genres
0 1 Toy Story (1995) Animation|Children's|Comedy
1 2 Jumanji (1995) Adventure|Children's|Fantasy
2 3 Grumpier Old Men (1995) Comedy|Romance
3 4 Waiting to Exhale (1995) Comedy|Drama
4 5 Father of the Bride Part II (1995) Comedy
In [3]: df1.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3883 entries, 0 to 3882
Data columns (total 3 columns):
MovieID 3883 non-null int64
Title 3883 non-null object
Genres 3883 non-null object
dtypes: int64(1), object(2)
memory usage: 91.1+ KB
In [4]: df2 = pd.read_csv("ratings.dat", sep='::', header=None, engine='python')

df2.columns = ['UserID','MovieID','Rating','Timestamp']
df2.head()
Out[4]:
UserID MovieID Rating Timestamp
0 1 1193 5 978300760
1 1 661 3 978302109
2 1 914 3 978301968
3 1 3408 4 978300275
4 1 2355 5 978824291
In [5]: df2.info()
UserID 1000209 non-null int64
Rating 1000209 non-null int64
Timestamp 1000209 non-null int64
dtypes: int64(4)
memory usage: 30.5 MB
In [6]: df3 = pd.read_csv("users.dat", sep='::', header=None, engine='python')

df3.columns = ['UserID','Gender','Age','Occupation','Zip-code']
df3.head()
Out[6]:
UserID Gender Age Occupation Zip-code
0 1 F 1 10 48067
1 2 M 56 16 70072
2 3 M 25 15 55117
3 4 M 45 7 02460
4 5 M 25 20 55455
In [7]: df3.info()
UserID 6040 non-null int64
Gender 6040 non-null object
Age 6040 non-null int64
Occupation 6040 non-null int64
Zip-code 6040 non-null object
dtypes: int64(3), object(2)
memory usage: 236.0+ KB
Merging datasets
In [8]: finalDF = pd.merge(pd.merge(df1,df2, on = 'MovieID'),df3, on = 'UserID')

finalDF.head()
Out[8]:
Zip-
MovieID Title Genres UserID Rating Timestamp Gender Age Occupation
code
Toy Story
0 1 Animation|Children's|Comedy 1 5 978824268 F 1 10 48067
(1995)
Pocahontas
1 48 Animation|Children's|Musical|Romance 1 5 978824351 F 1 10 48067
(1995)
Apollo 13
2 150 Drama 1 5 978301777 F 1 10 48067
(1995)
Star Wars:
Episode IV -
3 260 Action|Adventure|Fantasy|Sci-Fi 1 4 978300760 F 1 10 48067
A New Hope
(1977)
Schindler's
4 527 Drama|War 1 5 978824195 F 1 10 48067
List (1993)
User Age Distribution
In [9]: users_Age = df3.groupby(['Age']).size()

users_Age
Out[9]: Age
1 222
18 1103
25 2096
35 1193
45 550
50 496
56 380
dtype: int64
In [10]: plt.figure(figsize = (4,6))

users_Age.plot.bar(color='r',width=.4,alpha=0.8)
plt.title("User Age Distribution")
plt.xlabel("Age")
plt.ylabel("No of Users")
plt.show()

plt.hist(df3['Age'])
plt.xlabel('Age Distribution')
plt.ylabel('No. of Users')
plt.show()
Overall rating by users
In [12]: df2['Rating'].unique()
Out[12]: array([5, 3, 4, 2, 1], dtype=int64)
In [13]: users_Overall_Ratings = df2.groupby(['Rating'],axis = 0).UserID.size()

print (users_Overall_Ratings)
Rating
1 56174
2 107557
3 261197
4 348971
5 226310
Name: UserID, dtype: int64

users_Overall_Ratings.plot.bar(color='r',width=.4,alpha=0.8)
plt.xlabel('Ratings')
plt.show()
User rating of the movie “Toy Story”
In [15]: MovieTitles= finalDF["Title"].unique()

toyMovie = []
for i in MovieTitles:
if i.startswith("Toy Story") == True:
toyMovie.append(i)
toyMovie
Out[15]: ['Toy Story (1995)', 'Toy Story 2 (1999)']
In [16]: Toy1995 = finalDF[finalDF['Title'] == 'Toy Story (1995)']

Toy1995_rating = Toy1995.groupby('Rating')['UserID'].count()
print(Toy1995_rating)
Toy1995_rating.plot(kind='bar')
Rating
1 16
2 61
3 345
4 835
5 820
Out[16]: <matplotlib.axes._subplots.AxesSubplot at 0xb8771d0>
In [17]: Toy1995_rating.plot(kind='pie')
Out[17]: <matplotlib.axes._subplots.AxesSubplot at 0xb873c50>
In [18]: Toy1999 = finalDF[finalDF['Title'] == 'Toy Story 2 (1999)']

Toy1999_rating = Toy1999.groupby('Rating')['UserID'].count()
print(Toy1999_rating)
Toy1999_rating.plot(kind='bar')
Rating
1 25
2 44
3 214
4 578
5 724
Out[18]: <matplotlib.axes._subplots.AxesSubplot at 0x1c97f0f0>
In [19]: Toy1999_rating.plot(kind='pie')
Out[19]: <matplotlib.axes._subplots.AxesSubplot at 0x1c98dd68>
In [20]: Toy = finalDF[(finalDF['Title'].isin(['Toy Story 2 (1999)','Toy Story (1995)']))]

Toy_rating = Toy.groupby('Rating')['UserID'].count()
print(Toy_rating)
Toy_rating.plot(kind='bar')
Rating
1 41
2 105
3 559
4 1413
5 1544
Out[20]: <matplotlib.axes._subplots.AxesSubplot at 0xb842550>
In [21]: Toy_rating.plot(kind='pie')
Out[21]: <matplotlib.axes._subplots.AxesSubplot at 0xb860358>
Viewership of the movie “Toy Story” by age group
In [22]: Rating_Toy_Story = (Toy).groupby('Age',axis=0).Rating.count()

plt.figure(figsize = (4,6))
Rating_Toy_Story.plot.bar(color='magenta',width=.4,alpha=0.8)
plt.xlabel('User Age rates Toy Story')
plt.show()
Top 25 movies by viewership rating
In [23]: Top = finalDF.groupby('MovieID')['Rating'].count().nlargest(25)

print(Top)
Top.plot(kind='bar')
plt.xlabel('Movie ID Num')
plt.ylabel('Ratings Count')
plt.show()
MovieID
2858 3428
260 2991
1196 2990
1210 2883
480 2672
2028 2653
589 2649
2571 2590
1270 2583
593 2578
1580 2538
1198 2514
608 2513
2762 2459
110 2443
2396 2369
1197 2318
527 2304
1617 2288
1265 2278
1097 2269
2628 2250
2997 2241
318 2227
858 2223
Name: Rating, dtype: int64
In [24]: Top.plot(kind ='pie')
Out[24]: <matplotlib.axes._subplots.AxesSubplot at 0x1c9654a8>
Rating for a particular user of user id = 2696
In [25]: Rating_of_2696 = finalDF[finalDF['UserID'] == 2696].groupby('Rating')['Rating'].count()

print(Rating_of_2696)
Rating_of_2696.plot(kind='bar')
plt.xlabel('Ratings by User 2696')
plt.ylabel('Rating Counts')
plt.show()
Rating
1 2
2 3
3 3
4 11
5 1
Name: Rating, dtype: int64
In [26]: Rating_of_2696.plot(kind='pie')
Out[26]: <matplotlib.axes._subplots.AxesSubplot at 0x1c99dfd0>
In [27]: #Find out all the unique genres

finalDF['Genres'].unique
Out[27]: <bound method Series.unique of 0 Animation|Children's|Comedy

1 Animation|Children's|Musical|Romance
2 Drama
3 Action|Adventure|Fantasy|Sci-Fi
4 Drama|War
5 Children's|Drama
6 Animation|Children's|Comedy|Musical
7 Animation|Children's|Musical
9 Crime|Drama|Thriller
11 Animation
12 Animation|Comedy|Thriller
14 Musical|Romance
15 Adventure|Children's|Drama|Musical
16 Musical
18 Children's|Comedy|Musical
20 Musical
21 Children's|Drama|Fantasy|Sci-Fi
22 Drama
23 Action|Adventure|Comedy|Romance
24 Drama
25 Drama
26 Comedy|Sci-Fi
27 Action|Adventure|Drama
28 Drama
29 Adventure|Animation|Children's|Comedy|Musical
...
1000179 Action|Thriller
1000180 Comedy
1000181 Comedy|Romance
1000182 Sci-Fi|Thriller
1000183 Thriller
1000184 Thriller
1000185 Comedy|Drama
1000187 Comedy
1000188 Thriller
1000189 Drama|War
1000190 Horror|Romance
1000192 Animation|Children's|Comedy
1000193 Drama|Mystery|Thriller
1000194 Comedy
1000195 Horror|Mystery|Thriller
1000196 Comedy|Crime
1000198 Comedy|Drama
1000199 Drama
1000200 Drama|Thriller
1000201 Comedy
1000202 Animation|Children's
1000203 Thriller
1000204 Drama|Thriller
1000205 Comedy|Horror|Thriller
1000206 Comedy|Romance
1000208 Action|Drama
Name: Genres, Length: 1000209, dtype: object>
In [28]: #one-hot encoding for genre

ohe_genre=pd.concat([pd.get_dummies(finalDF['Genres']),finalDF.iloc[:,[0,1,3,4,5,6,7,8,9]]],axis=1)
ohe_genre.head()
Out[28]:
Action|Adventure|Anima
Action Action|Adventure Action|Adventure|Animation Action|Adventure|Animation|Children's|Fantasy
0 0 0 0 0
1 0 0 0 0
2 0 0 0 0
3 0 0 0 0
4 0 0 0 0
5 rows × 310 columns
Machine Learning
In [29]: #creating features and label by taking sample data
features = finalDF.iloc[:500,[0,7,8]]
label = finalDF.iloc[:500,4]
In [30]: features.info()
Int64Index: 500 entries, 0 to 499
Age 500 non-null int64
Occupation 500 non-null int64
dtypes: int64(3)
memory usage: 15.6 KB
In [31]: #train_test_split
X_train, X_test, y_train, y_test = train_test_split(features,label,test_size=0.2,random_state=912)
In [32]: #create model
model = KNeighborsClassifier(n_neighbors=15)
model.fit(X_train,y_train)
Out[32]: KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',

metric_params=None, n_jobs=None, n_neighbors=15, p=2,
weights='uniform')
In [33]: print(model.score(X_train,y_train))
print(model.score(X_test,y_test))
0.4575
0.47
Histogram for movie

plt.hist(finalDF['MovieID'],color = 'b')
plt.xlabel('movie id')
plt.show()
Histogram for age

plt.hist(finalDF['Age'],color = 'g')
plt.xlabel('Age Distribution')
plt.show()
Histogram for Occupation

plt.hist(finalDF['Occupation'],color = 'r')
plt.xlabel('occupation id')
plt.show()

Python Project

Uploaded by

Copyright:

Available Formats

You might also like

Python Project

Uploaded by

Document Information

Original Description:

Copyright

Available Formats

Share this document

Share or Embed Document

Sharing Options

Did you find this document useful?

Is this content inappropriate?

Copyright:

Available Formats

Python Project

Uploaded by

Copyright:

Available Formats

Project : Movielens Case Study

In [1]: #Import the required libraries

from sklearn.model_selection import train_test_split

In [2]: df1 = pd.read_csv("movies.dat", sep='::', header=None, engine='python')

0 1 Toy Story (1995) Animation|Children's|Comedy

1 2 Jumanji (1995) Adventure|Children's|Fantasy

2 3 Grumpier Old Men (1995) Comedy|Romance

3 4 Waiting to Exhale (1995) Comedy|Drama

4 5 Father of the Bride Part II (1995) Comedy

In [4]: df2 = pd.read_csv("ratings.dat", sep='::', header=None, engine='python')

In [6]: df3 = pd.read_csv("users.dat", sep='::', header=None, engine='python')

In [8]: finalDF = pd.merge(pd.merge(df1,df2, on = 'MovieID'),df3, on = 'UserID')

User Age Distribution

In [9]: users_Age = df3.groupby(['Age']).size()

In [10]: plt.figure(figsize = (4,6))

In [11]: plt.figure(figsize = (4,6))

Overall rating by users

Out[12]: array([5, 3, 4, 2, 1], dtype=int64)

In [13]: users_Overall_Ratings = df2.groupby(['Rating'],axis = 0).UserID.size()

In [14]: plt.figure(figsize = (4,6))

User rating of the movie “Toy Story”

In [15]: MovieTitles= finalDF["Title"].unique()

Out[15]: ['Toy Story (1995)', 'Toy Story 2 (1999)']

In [16]: Toy1995 = finalDF[finalDF['Title'] == 'Toy Story (1995)']

Out[16]: <matplotlib.axes._subplots.AxesSubplot at 0xb8771d0>

Out[17]: <matplotlib.axes._subplots.AxesSubplot at 0xb873c50>

In [18]: Toy1999 = finalDF[finalDF['Title'] == 'Toy Story 2 (1999)']

Out[18]: <matplotlib.axes._subplots.AxesSubplot at 0x1c97f0f0>

Out[19]: <matplotlib.axes._subplots.AxesSubplot at 0x1c98dd68>

In [20]: Toy = finalDF[(finalDF['Title'].isin(['Toy Story 2 (1999)','Toy Story (1995)']))]

Out[20]: <matplotlib.axes._subplots.AxesSubplot at 0xb842550>

Out[21]: <matplotlib.axes._subplots.AxesSubplot at 0xb860358>

Viewership of the movie “Toy Story” by age group

In [22]: Rating_Toy_Story = (Toy).groupby('Age',axis=0).Rating.count()

Top 25 movies by viewership rating

In [23]: Top = finalDF.groupby('MovieID')['Rating'].count().nlargest(25)

In [24]: Top.plot(kind ='pie')

Out[24]: <matplotlib.axes._subplots.AxesSubplot at 0x1c9654a8>

Rating for a particular user of user id = 2696

In [25]: Rating_of_2696 = finalDF[finalDF['UserID'] == 2696].groupby('Rating')['Rating'].count()

Out[26]: <matplotlib.axes._subplots.AxesSubplot at 0x1c99dfd0>

In [27]: #Find out all the unique genres

Out[27]: <bound method Series.unique of 0 Animation|Children's|Comedy

In [28]: #one-hot encoding for genre

5 rows × 310 columns

In [29]: #creating features and label by taking sample data

X_train, X_test, y_train, y_test = train_test_split(features,label,test_size=0.2,random_state=912)

In [32]: #create model

Out[32]: KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',

Histogram for movie

In [34]: plt.figure(figsize = (4,6))

Histogram for age

In [35]: plt.figure(figsize = (4,6))

Histogram for Occupation

In [36]: plt.figure(figsize = (4,6))

You might also like