DSBDA 7


In [17]:

# Tokenization using NLTK


from nltk import word_tokenize, sent_tokenize
sent = "Tokenization refers to break down the text into smaller units ."
print(word_tokenize(sent))
print(sent_tokenize(sent))

['Tokenization', 'refers', 'to', 'break', 'down', 'the', 'text', 'into', 'smaller', 'units', '.']
['Tokenization refers to break down the text into smaller units .']
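
If these tokenizers raise a LookupError on a fresh NLTK install, the required corpora have to be downloaded once. A minimal setup sketch (the Punkt models back word_tokenize/sent_tokenize, and the stopword list is used in the stop-word-removal cell below):

In [ ]:
# One-time NLTK data downloads (only needed if the resources are missing).
import nltk

nltk.download('punkt')      # tokenizer models for word_tokenize / sent_tokenize
nltk.download('stopwords')  # stopword lists for the stop-word-removal cell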

In [29]:
#Text to tokenize
text = "This is a tokenize test"

In [30]:
from nltk.tokenize import word_tokenize
word_tokenize(text)

Out[30]:
['This', 'is', 'a', 'tokenize', 'test']

In [19]:
# STOP WORD REMOVAL
text = "S&P and NASDAQ are the two most popular indices in US"

In [20]:
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
stop_words = set(stopwords.words('english'))
text_tokens = word_tokenize(text)
tokens_without_sw = [word for word in text_tokens if word not in stop_words]

print(tokens_without_sw)

['S', '&', 'P', 'NASDAQ', 'two', 'popular', 'indices', 'US']
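
Note that the NLTK stopword list is all lowercase, so capitalized stopwords at the start of a sentence (e.g. 'The' or 'And') would slip past the exact-match comparison above. A case-insensitive variant (a sketch that keeps the original casing in the result) could look like this:

In [ ]:
# Case-insensitive stop word filtering (sketch): compare lowercased tokens
# against the stopword set, but keep the original tokens in the output.
tokens_without_sw_ci = [word for word in text_tokens if word.lower() not in stop_words]
print(tokens_without_sw_ci)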

In [10]:
# Stemming
text = "It's a Stemming testing"

In [11]:
parsed_text = word_tokenize(text)

In [22]:
# Initialize stemmer.
from nltk.stem.snowball import SnowballStemmer
stemmer = SnowballStemmer('english')

# Stem each word.


[(word, stemmer.stem(word)) for i, word in enumerate(parsed_text)
if word.lower() != stemmer.stem(parsed_text[i])]

Out[22]:
[('Stemming', 'stem'), ('testing', 'test')]

In [26]:
from nltk.stem import PorterStemmer

# create an object of class PorterStemmer


porter = PorterStemmer()
print(porter.stem("play"))
print(porter.stem("playing"))
print(porter.stem("plays"))
print(porter.stem("played"))

play
play
play
play
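
Porter and Snowball (the "Porter2" algorithm) agree on most words but not all. A small side-by-side comparison makes the differences visible (the word list is just illustrative; outputs are not shown here):

In [ ]:
# Compare Porter and Snowball stems for a few words (sketch).
# 'porter' and 'stemmer' are the objects created in the cells above.
for w in ["generously", "fairly", "playing", "studies"]:
    print(w, '->', porter.stem(w), '/', stemmer.stem(w))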

In [23]:
# Lemmatization
text = "This world has a lot of faces "

In [24]:
from textblob import TextBlob, Word
parsed_data= TextBlob(text).words
parsed_data

Out[24]:
WordList(['This', 'world', 'has', 'a', 'lot', 'of', 'faces'])

In [25]:
[(word, word.lemmatize()) for i, word in enumerate(parsed_data)
if word != parsed_data[i].lemmatize()]

Out[25]:
[('has', 'ha'), ('faces', 'face')]
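
'has' becomes 'ha' above because Word.lemmatize defaults to treating every token as a noun. Passing the WordNet part-of-speech tag 'v' asks for the verb lemma instead (a sketch, assuming the WordNet corpus is installed, e.g. via nltk.download('wordnet')):

In [ ]:
# POS-aware lemmatization sketch: 'v' is the WordNet tag for verbs.
from textblob import Word
print(Word("has").lemmatize())     # default (noun) lemma
print(Word("has").lemmatize("v"))  # verb lemma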

In [27]:
# POS tagging
text = 'Google is looking at buying U.K. startup for $1 billion'

In [28]:
TextBlob(text).tags

Out[28]:
[('Google', 'NNP'),
 ('is', 'VBZ'),
 ('looking', 'VBG'),
 ('at', 'IN'),
 ('buying', 'VBG'),
 ('U.K.', 'NNP'),
 ('startup', 'NN'),
 ('for', 'IN'),
 ('1', 'CD'),
 ('billion', 'CD')]
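
An equivalent tag list can be produced with NLTK directly (a sketch; the tagger model may need a one-time nltk.download('averaged_perceptron_tagger')):

In [ ]:
# POS tagging with plain NLTK instead of TextBlob (sketch).
import nltk
from nltk.tokenize import word_tokenize

print(nltk.pos_tag(word_tokenize(text)))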

In [36]:
import pandas as pd
import numpy as np

In [8]:
# CREATE WORD SET FOR CORPUS
corpus = ['data science is one of the most important fields of science',
          'this is one of the best data science courses',
          'data scientists analyze data']

In [9]:
words_set = set()

for doc in corpus:
    words = doc.split(' ')
    words_set = words_set.union(set(words))

print('Number of words in the corpus:', len(words_set))
print('The words in the corpus: \n', words_set)

Number of words in the corpus: 14


The words in the corpus:
{'the', 'analyze', 'courses', 'scientists', 'science', 'one', 'fields', 'best', 'important', 'is', 'most', 'this', 'data', 'of'}
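
With pandas already imported above, the same corpus can also be laid out as a raw term-frequency table, one row per document and one column per word in words_set; this is the count that the TF-IDF functions below normalize and weight (a minimal sketch):

In [ ]:
# Term-frequency (count) matrix for the corpus (sketch).
import pandas as pd

vocab = sorted(words_set)
tf_counts = pd.DataFrame(
    [[doc.split(' ').count(word) for word in vocab] for doc in corpus],
    columns=vocab,
)
tf_counts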

In [12]:
import math
from collections import Counter

def calculate_tf(text):
    words = text.split()
    word_count = Counter(words)
    total_words = len(words)
    tf = {word: word_count[word] / total_words for word in word_count}
    return tf

In [13]:
def calculate_idf(documents):
    total_docs = len(documents)
    idf = {}
    for doc in documents:
        words = set(doc.split())
        for word in words:
            idf[word] = idf.get(word, 0) + 1
    for word in idf:
        idf[word] = math.log(total_docs / (idf[word] + 1))  # +1 smoothing; words present in every document get a negative IDF
    return idf

In [14]:
def calculate_tfidf(tf, idf):
    tfidf = {word: tf[word] * idf.get(word, 0) for word in tf}
    return tfidf

In [15]:
def represent_document(document, idf):
    tf = calculate_tf(document)
    tfidf = calculate_tfidf(tf, idf)
    return tfidf

In [16]:
documents = [
    "This is the first document.",
    "This document is the second document.",
    "And this is the third one.",
    "Is this the first document?",
]

idf = calculate_idf(documents)
document_representation = represent_document(documents[0], idf)
print(document_representation)

{'This': 0.05753641449035617, 'is': 0.0, 'the': -0.044628710262841945, 'first': 0.05753641449035617, 'document.': 0.05753641449035617}
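
The negative weight for 'the' is a side effect of the +1 smoothing in calculate_idf: a word appearing in all N documents gets log(N / (N + 1)) < 0. Library implementations smooth differently; for comparison, a sketch using scikit-learn's TfidfVectorizer (which, unlike the code above, also lowercases, strips punctuation, and L2-normalizes each row by default, so the numbers will not match exactly):

In [ ]:
# TF-IDF with scikit-learn for comparison (sketch).
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(documents)

print(vectorizer.get_feature_names_out())
print(tfidf_matrix.toarray()[0])  # weights for the first document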

In [ ]:
