PR 7
Roll No :- 02
import pandas as pd
text= "Tokenization is the first step in text analytics.The process of breaking down a text paragraph into smaller ch
Tokenization
# Sentence tokenization
import nltk
nltk.download('punkt')
from nltk.tokenize import sent_tokenize
sents = sent_tokenize(text)
sents
['Tokenization is the first step in text analytics.The process of breaking down a text paragraph into smaller chunks such as words or sentences is called Tokenization.']
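Note that sent_tokenize returns a single sentence here: there is no space after the period in "analytics.The", so the punkt model finds no sentence boundary to split on. A minimal sketch of the expected behaviour once the space is restored (text2 is a hypothetical variant introduced here, not part of the original input):
text2 = "Tokenization is the first step in text analytics. The process of breaking down a text paragraph into smaller chunks such as words or sentences is called Tokenization."
sent_tokenize(text2)
# ['Tokenization is the first step in text analytics.',
#  'The process of breaking down a text paragraph into smaller chunks such as words or sentences is called Tokenization.']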
# Word tokenization
from nltk.tokenize import word_tokenize
tokens = word_tokenize(text)
print(tokens)
['Tokenization', 'is', 'the', 'first', 'step', 'in', 'text', 'analytics.The', 'process', 'of', 'breaking', 'down', 'a', 'text', 'paragraph', 'into', 'smaller', 'chunks', 'such', 'as', 'words', 'or', 'sentences', 'is', 'called', 'Tokenization', '.']
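The same missing space keeps 'analytics.The' glued together as a single token. NLTK's wordpunct_tokenize splits strictly at punctuation boundaries, so it separates the pieces; a minimal sketch:
from nltk.tokenize import wordpunct_tokenize
wordpunct_tokenize("analytics.The")
# ['analytics', '.', 'The']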
POS Tagging
from nltk import pos_tag
nltk.download('averaged_perceptron_tagger')
pos_tags = pos_tag(tokens)
print(pos_tags)
[('Tokenization', 'NN'), ('is', 'VBZ'), ('the', 'DT'), ('first', 'JJ'), ('step', 'NN'), ('in', 'IN'), ('text', 'NN'), ('analytics.The', 'NN'), ('process', 'NN'), ('of', 'IN'), ('breaking', 'VBG'), ('down', 'RP'), ('a', 'DT'), ('text', 'NN'), ('paragraph', 'NN'), ('into', 'IN'), ('smaller', 'JJR'), ('chunks', 'NNS'), ('such', 'JJ'), ('as', 'IN'), ('words', 'NNS'), ('or', 'CC'), ('sentences', 'NNS'), ('is', 'VBZ'), ('called', 'VBN'), ('Tokenization', 'NN'), ('.', '.')]
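The tags follow the Penn Treebank tag set (NN = singular noun, VBZ = third-person singular present verb, JJ = adjective, and so on). NLTK can print the definition of any tag; a minimal sketch (the 'tagsets' resource must be downloaded first, and the example output below is abbreviated):
nltk.download('tagsets')
nltk.help.upenn_tagset('VBZ')
# VBZ: verb, present tense, 3rd person singular
#     bases reconstructs marks mixes displeases ...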
Stop Words Removal
from nltk.corpus import stopwords
nltk.download('stopwords')
stop_words = set(stopwords.words("english"))
filtered_tokens = [token for token in tokens if token.lower() not in stop_words]
print(filtered_tokens)
['Tokenization', 'first', 'step', 'text', 'analytics.The', 'process', 'breaking', 'text', 'paragraph', 'smaller', 'chunks', 'words', 'sentences', 'called', 'Tokenization', '.']
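The stop-word list contains only words, so the trailing '.' token survives the filter. A minimal sketch that also drops pure-punctuation tokens (filtered_no_punct is a hypothetical name introduced here):
import string
filtered_no_punct = [t for t in filtered_tokens if t not in string.punctuation]
print(filtered_no_punct)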
Stemming
from nltk.stem import PorterStemmer
stemmer = PorterStemmer()
stemmed_tokens = [stemmer.stem(token) for token in filtered_tokens]
print(stemmed_tokens)
['token', 'first', 'step', 'text', 'analytics.th', 'process', 'break', 'text', 'paragraph', 'smaller', 'chunk', 'word', 'sentenc', 'call', 'token', '.']
Lemmatization
from nltk.stem import WordNetLemmatizer
nltk.download('wordnet')
lemmatizer = WordNetLemmatizer()
lemmatized_tokens = [lemmatizer.lemmatize(token) for token in filtered_tokens]
print(lemmatized_tokens)
['Tokenization', 'first', 'step', 'text', 'analytics.The', 'process', 'breaking', 'text', 'paragraph', 'smaller', 'chunk', 'word', 'sentence', 'called', 'Tokenization', '.']
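Comparing the two outputs: the Porter stemmer lowercases and truncates aggressively ('sentenc', 'token'), while the lemmatizer returns dictionary forms but leaves a word unchanged unless it matches the assumed part of speech (the default is noun). A minimal sketch of the difference, reusing the stemmer and lemmatizer defined above:
for token in ['sentences', 'breaking', 'Tokenization']:
    print(token, '->', stemmer.stem(token), '|', lemmatizer.lemmatize(token))
# sentences -> sentenc | sentence
# breaking -> break | breaking
# Tokenization -> token | Tokenization
print(lemmatizer.lemmatize('breaking', pos='v'))  # 'break' once tagged as a verb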
TF-IDF
from sklearn.feature_extraction.text import TfidfVectorizer
# Join the lemmatized tokens back into a space-separated string
preprocessed_text = ' '.join(lemmatized_tokens)
tfidf_vectorizer = TfidfVectorizer()
tfidf_representation = tfidf_vectorizer.fit_transform([preprocessed_text])
print('Preprocessed text:', preprocessed_text)
print("\nTF-IDF Representation:")
print(tfidf_representation.toarray())
Preprocessed text: Tokenization first step text analytics.The process breaking text paragraph smaller chunk word sentence called Tokenization .

TF-IDF Representation:
[[0.2236068 0.2236068 0.2236068 0.2236068 0.2236068 0.2236068 0.2236068
  0.2236068 0.2236068 0.2236068 0.4472136 0.2236068 0.4472136 0.2236068]]
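With a single document every term receives the same idf, so the vector above is just the L2-normalised term counts ('text' and 'tokenization' each appear twice, hence the two larger values). The idf weighting only matters across a corpus; a minimal sketch with a two-document toy corpus (the documents are made up for illustration):
docs = ["tokenization splits text into tokens",
        "lemmatization maps tokens to dictionary forms"]
vec = TfidfVectorizer()
X = vec.fit_transform(docs)
print(vec.get_feature_names_out())
print(X.toarray().round(2))
# the shared term 'tokens' is down-weighted relative to terms unique to one document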