Professional Documents
Culture Documents
Lab - 4
Lab - 4
import pandas as pd
import numpy as np
import sklearn
from sklearn.decomposition import TruncatedSVD
columns = ['user_id', 'item_id', 'rating', 'timestamp']
df = pd.read_csv('ml-100k/u.data', sep='\t ', names=columns)
columns = ['item_id', 'movie title', 'release date', 'video release date', 'IMDb URL', '
unknown', 'Action', 'Adventure',
'Animation', 'Childrens', 'Comedy', 'Crime', 'Documentary', 'Drama', 'Fantasy'
, 'Film-Noir', 'Horror',
'Musical', 'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western']
movies = pd.read_csv('ml-100k/u.item', sep='|', names=columns, encoding='latin-1')
movie_names = movies[['item_id', 'movie title']]
Out[2]:
In [3]:
Out[3]:
3 Ninjas:
'Til 2 20,000
12 2001: A High 39 Year You
There 101 Days Leagues Yankee Yo
movie 1-900 Angry 187 Space Noon At Steps, of the So
Was Dalmatians in the Under ... Zulu Frankens
title (1994) Men (1997) Odyssey Mega The Horse Crazy
You (1996) Valley the Sea (1994) (19
(1957) (1968) Mountain (1935) (1997) (1994)
(1997) (1996) (1954)
(1998)
user_id
1 0 0 2 5 0 0 3 4 0 0 ... 0 0 0
2 0 0 0 0 0 0 0 0 1 0 ... 0 0 0
3 0 0 0 0 2 0 0 0 0 0 ... 0 0 0
4 0 0 0 0 0 0 0 0 0 0 ... 0 0 0
5 0 0 2 0 0 0 0 4 0 0 ... 0 0 0
In [4]:
X = rating_crosstab.T
In [5]:
SVD = TruncatedSVD(n_components=12, random_state=5)
resultant_matrix = SVD.fit_transform(X)
resultant_matrix.shape
Out[5]:
(1664, 12)
In [6]:
corr_mat = np.corrcoef(resultant_matrix)
corr_mat.shape
Out[6]:
(1664, 1664)
In [7]:
col_idx = rating_crosstab.columns.get_loc("Star Wars (1977)")
corr_specific = corr_mat[col_idx]
pd.DataFrame({'corr_specific':corr_specific, 'Movies': rating_crosstab.columns})\
.sort_values('corr_specific', ascending=False)\
.head(10)
Out[7]:
corr_specific Movies