DSBDA 7


In [17]:

# Tokenization using NLTK


from nltk import word_tokenize, sent_tokenize
sent = "Tokenization refers to break down the text into smaller units ."
print(word_tokenize(sent))
print(sent_tokenize(sent))

['Tokenization', 'refers', 'to', 'break', 'down', 'the', 'text', 'into', 'smaller', 'units', '.']
['Tokenization refers to break down the text into smaller units .']
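
If these tokenizers raise a LookupError on a fresh NLTK install, the required corpora have to be downloaded once. A minimal setup sketch (the Punkt models back word_tokenize/sent_tokenize, and the stopword list is used in the stop-word-removal cell below):

In [ ]:
# One-time NLTK data downloads (only needed if the resources are missing).
import nltk

nltk.download('punkt')      # tokenizer models for word_tokenize / sent_tokenize
nltk.download('stopwords')  # stopword lists for the stop-word-removal cell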

In [29]:
#Text to tokenize
text = "This is a tokenize test"

In [30]:
from nltk.tokenize import word_tokenize
word_tokenize(text)

Out[30]:
['This', 'is', 'a', 'tokenize', 'test']

In [19]:
# STOP WORD REMOVAL
text = "S&P and NASDAQ are the two most popular indices in US"

In [20]:
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
stop_words = set(stopwords.words('english'))
text_tokens = word_tokenize(text)
tokens_without_sw = [word for word in text_tokens if word not in stop_words]

print(tokens_without_sw)

['S', '&', 'P', 'NASDAQ', 'two', 'popular', 'indices', 'US']
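
Note that the NLTK stopword list is all lowercase, so capitalized stopwords at the start of a sentence (e.g. 'The' or 'And') would slip past the exact-match comparison above. A case-insensitive variant (a sketch that keeps the original casing in the result) could look like this:

In [ ]:
# Case-insensitive stop word filtering (sketch): compare lowercased tokens
# against the stopword set, but keep the original tokens in the output.
tokens_without_sw_ci = [word for word in text_tokens if word.lower() not in stop_words]
print(tokens_without_sw_ci)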

In [10]:
# Stemming
text = "It's a Stemming testing"

In [11]:
parsed_text = word_tokenize(text)

In [22]:
# Initialize stemmer.
from nltk.stem.snowball import SnowballStemmer
stemmer = SnowballStemmer('english')

# Stem each word.


[(word, stemmer.stem(word)) for i, word in enumerate(parsed_text)
if word.lower() != stemmer.stem(parsed_text[i])]

Out[22]:
[('Stemming', 'stem'), ('testing', 'test')]

In [26]:
from nltk.stem import PorterStemmer

# create an object of class PorterStemmer


porter = PorterStemmer()
print(porter.stem("play"))
print(porter.stem("playing"))
print(porter.stem("plays"))
print(porter.stem("played"))

play
play
play
play
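
Porter and Snowball (the "Porter2" algorithm) agree on most words but not all. A small side-by-side comparison makes the differences visible (the word list is just illustrative; outputs are not shown here):

In [ ]:
# Compare Porter and Snowball stems for a few words (sketch).
# 'porter' and 'stemmer' are the objects created in the cells above.
for w in ["generously", "fairly", "playing", "studies"]:
    print(w, '->', porter.stem(w), '/', stemmer.stem(w))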

In [23]:
# Lemmatization
text = "This world has a lot of faces "

In [24]:
from textblob import TextBlob, Word
parsed_data= TextBlob(text).words
parsed_data

Out[24]:
WordList(['This', 'world', 'has', 'a', 'lot', 'of', 'faces'])

In [25]:
[(word, word.lemmatize()) for i, word in enumerate(parsed_data)
if word != parsed_data[i].lemmatize()]

Out[25]:
[('has', 'ha'), ('faces', 'face')]
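
'has' becomes 'ha' above because Word.lemmatize defaults to treating every token as a noun. Passing the WordNet part-of-speech tag 'v' asks for the verb lemma instead (a sketch, assuming the WordNet corpus is installed, e.g. via nltk.download('wordnet')):

In [ ]:
# POS-aware lemmatization sketch: 'v' is the WordNet tag for verbs.
from textblob import Word
print(Word("has").lemmatize())     # default (noun) lemma
print(Word("has").lemmatize("v"))  # verb lemma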

In [27]:
# POS tagging
text = 'Google is looking at buying U.K. startup for $1 billion'

In [28]:
TextBlob(text).tags

Out[28]:
[('Google', 'NNP'),
 ('is', 'VBZ'),
 ('looking', 'VBG'),
 ('at', 'IN'),
 ('buying', 'VBG'),
 ('U.K.', 'NNP'),
 ('startup', 'NN'),
 ('for', 'IN'),
 ('1', 'CD'),
 ('billion', 'CD')]
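
An equivalent tag list can be produced with NLTK directly (a sketch; the tagger model may need a one-time nltk.download('averaged_perceptron_tagger')):

In [ ]:
# POS tagging with plain NLTK instead of TextBlob (sketch).
import nltk
from nltk.tokenize import word_tokenize

print(nltk.pos_tag(word_tokenize(text)))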

In [36]:
import pandas as pd
import numpy as np

In [8]:
# CREATE WORD SET FOR CORPUS
corpus = ['data science is one of the most important fields of science',
          'this is one of the best data science courses',
          'data scientists analyze data']

In [9]:
words_set = set()

for doc in corpus:
    words = doc.split(' ')
    words_set = words_set.union(set(words))

print('Number of words in the corpus:', len(words_set))
print('The words in the corpus: \n', words_set)

Number of words in the corpus: 14


The words in the corpus:
{'the', 'analyze', 'courses', 'scientists', 'science', 'one', 'fields', 'best', 'important', 'is', 'most', 'this', 'data', 'of'}
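
With pandas already imported above, the same corpus can also be laid out as a raw term-frequency table, one row per document and one column per word in words_set; this is the count that the TF-IDF functions below normalize and weight (a minimal sketch):

In [ ]:
# Term-frequency (count) matrix for the corpus (sketch).
import pandas as pd

vocab = sorted(words_set)
tf_counts = pd.DataFrame(
    [[doc.split(' ').count(word) for word in vocab] for doc in corpus],
    columns=vocab,
)
tf_counts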

In [12]:
import math
from collections import Counter

def calculate_tf(text):
    words = text.split()
    word_count = Counter(words)
    total_words = len(words)
    tf = {word: word_count[word] / total_words for word in word_count}
    return tf

In [13]:
def calculate_idf(documents):
    total_docs = len(documents)
    idf = {}
    for doc in documents:
        words = set(doc.split())
        for word in words:
            idf[word] = idf.get(word, 0) + 1
    for word in idf:
        idf[word] = math.log(total_docs / (idf[word] + 1))  # +1 smoothing; words present in every document get a negative IDF
    return idf

In [14]:
def calculate_tfidf(tf, idf):
    tfidf = {word: tf[word] * idf.get(word, 0) for word in tf}
    return tfidf

In [15]:
def represent_document(document, idf):
    tf = calculate_tf(document)
    tfidf = calculate_tfidf(tf, idf)
    return tfidf

In [16]:
documents = [
    "This is the first document.",
    "This document is the second document.",
    "And this is the third one.",
    "Is this the first document?",
]

idf = calculate_idf(documents)
document_representation = represent_document(documents[0], idf)
print(document_representation)

{'This': 0.05753641449035617, 'is': 0.0, 'the': -0.044628710262841945, 'first': 0.05753641449035617, 'document.': 0.05753641449035617}
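
The negative weight for 'the' is a side effect of the +1 smoothing in calculate_idf: a word appearing in all N documents gets log(N / (N + 1)) < 0. Library implementations smooth differently; for comparison, a sketch using scikit-learn's TfidfVectorizer (which, unlike the code above, also lowercases, strips punctuation, and L2-normalizes each row by default, so the numbers will not match exactly):

In [ ]:
# TF-IDF with scikit-learn for comparison (sketch).
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(documents)

print(vectorizer.get_feature_names_out())
print(tfidf_matrix.toarray()[0])  # weights for the first document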

In [ ]:
