Professional Documents
Culture Documents
Bag of Words 03 and 04 Model
Bag of Words 03 and 04 Model
In [1]:
import nltk
In [2]:
import re
In [21]:
import heapq
import numpy as np
In [4]:
paragraph = """Thank you all so very much. Thank you to the Academy.Thank you to all of you
In [5]:
dataset = nltk.sent_tokenize(paragraph)
for i in range(len(dataset)):
dataset[i] = dataset[i].lower()
dataset[i] = re.sub(r'\W',' ',dataset[i])
dataset[i] = re.sub(r'\s+',' ',dataset[i])
In [6]:
dataset
Out[6]:
In [7]:
word2count = {}
for data in dataset:
words = nltk.word_tokenize(data)
for word in words:
if word not in word2count.keys():
word2count[word] = 1
else:
word2count[word] += 1
127.0.0.1:8888/notebooks/Untitled10.ipynb?kernel_name=python3 1/4
4/15/2020 Untitled10 - Jupyter Notebook
In [8]:
word2count
Out[8]:
{'thank': 3,
'you': 4,
'all': 2,
'so': 1,
'very': 1,
'much': 1,
'to': 3,
'the': 3,
'academy': 1,
'of': 1,
'in': 1,
'this': 2,
'room': 1,
'i': 1,
'have': 1,
'congratulate': 1,
'other': 1,
'incredible': 1,
'nominees': 1,
'year': 1,
'revenant': 1,
'was': 1}
In [18]:
freq_words = heapq.nlargest(100,word2count,key=word2count.get)
127.0.0.1:8888/notebooks/Untitled10.ipynb?kernel_name=python3 2/4
4/15/2020 Untitled10 - Jupyter Notebook
In [19]:
freq_words
Out[19]:
['you',
'thank',
'to',
'the',
'all',
'this',
'so',
'very',
'much',
'academy',
'of',
'in',
'room',
'i',
'have',
'congratulate',
'other',
'incredible',
'nominees',
'year',
'revenant',
'was']
In [22]:
X = np.asarray(X)
In [23]:
Out[23]:
array([[1, 1, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
[1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0],
[0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0],
[0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1]])
In [ ]:
127.0.0.1:8888/notebooks/Untitled10.ipynb?kernel_name=python3 3/4
4/15/2020 Untitled10 - Jupyter Notebook
127.0.0.1:8888/notebooks/Untitled10.ipynb?kernel_name=python3 4/4