PR 7


Name :- Nisha Ambike

Roll No :- 02

Title :- Text Analytics

import pandas as pd

# Natural Language Toolkit
import nltk

# Regular expression operations
import re

text= "Tokenization is the first step in text analytics.The process of breaking down a text paragraph into smaller ch

Tokenization
# Sentence Tokenization
from nltk.tokenize import sent_tokenize

# The punkt model must be downloaded before sent_tokenize can run
nltk.download('punkt')

sents = sent_tokenize(text)

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\ADMIN\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
True

sents

['Tokenization is the first step in text analytics.The process of breaking down a text paragraph into smaller chunks such as words or sentences is called Tokenization.']
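Note that sent_tokenize returns the whole paragraph as a single sentence: there is no space after the first period ("analytics.The"), so punkt does not detect a sentence boundary there. A quick illustrative check (fixed_text is a hypothetical variable, not part of the assignment text):

# Adding the missing space lets punkt split the paragraph into two sentences
fixed_text = text.replace("analytics.The", "analytics. The")
print(sent_tokenize(fixed_text))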

# Word Tokenization
from nltk.tokenize import word_tokenize

words = word_tokenize(text)

print(words)

['Tokenization', 'is', 'the', 'first', 'step', 'in', 'text', 'analytics.The', 'process', 'of', 'breaking', 'down', 'a', 'text', 'paragraph', 'into', 'smaller', 'chunks', 'such', 'as', 'words', 'or', 'sentences', 'is', 'called', 'Tokenization', '.']

POS Tagging
from nltk import pos_tag

# The Penn Treebank tagger model is needed for pos_tag
nltk.download('averaged_perceptron_tagger')

pos_tags = pos_tag(words)

print(pos_tags)

[('Tokenization', 'NN'), ('is', 'VBZ'), ('the', 'DT'), ('first', 'JJ'), ('step', 'NN'), ('in', 'IN'), ('text', 'NN'), ('analytics.The', 'NN'), ('process', 'NN'), ('of', 'IN'), ('breaking', 'VBG'), ('down', 'RP'), ('a', 'DT'), ('text', 'NN'), ('paragraph', 'NN'), ('into', 'IN'), ('smaller', 'JJR'), ('chunks', 'NNS'), ('such', 'JJ'), ('as', 'IN'), ('words', 'NNS'), ('or', 'CC'), ('sentences', 'NNS'), ('is', 'VBZ'), ('called', 'VBN'), ('Tokenization', 'NN'), ('.', '.')]
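The tags are Penn Treebank codes: NN is a singular noun, VBZ a third-person singular present verb, JJ an adjective, and so on. NLTK can print the definition of any tag; a small aside, assuming the 'tagsets' resource is available for download:

# Look up an unfamiliar Penn Treebank tag
nltk.download('tagsets')
nltk.help.upenn_tagset('VBZ')   # prints the description and examples for VBZ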

Stop Words Removal
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ADMIN\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
True

from nltk.corpus import stopwords

stop_words = set(stopwords.words("english"))
filtered_tokens = [w for w in words if w.lower() not in stop_words]
print(filtered_tokens)
['Tokenization', 'first', 'step', 'text', 'analytics.The', 'process', 'breaking', 'text', 'paragraph', 'smaller', 'chunks', 'words', 'sentences', 'called', 'Tokenization', '.']
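The '.' token survives filtering because punctuation is not in the stopword list. A minimal sketch that also drops punctuation, using Python's string.punctuation (an addition, not part of the original assignment):

import string

# Keep only tokens that are neither stopwords nor bare punctuation
clean_tokens = [w for w in words if w.lower() not in stop_words and w not in string.punctuation]
print(clean_tokens)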

Stemming
from nltk.stem import PorterStemmer

stemmer = PorterStemmer()

stemmed_tokens = [stemmer.stem(w) for w in filtered_tokens]

print(stemmed_tokens)

['token', 'first', 'step', 'text', 'analytics.th', 'process', 'break', 'text', 'paragraph', 'smaller', 'chunk', 'word', 'sentenc', 'call', 'token', '.']
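Porter stemming is a rule-based suffix stripper, so it can produce non-words such as 'sentenc' and 'token'. NLTK also ships the Snowball (Porter2) stemmer, which applies a slightly revised rule set; a sketch for comparison:

from nltk.stem import SnowballStemmer

snowball = SnowballStemmer("english")
print([snowball.stem(w) for w in filtered_tokens])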

Lemmatization
from nltk.stem import WordNetLemmatizer

# The WordNet corpus is needed for lemmatization
nltk.download('wordnet')

lemmatizer = WordNetLemmatizer()

lemmatized_tokens = [lemmatizer.lemmatize(w) for w in filtered_tokens]

print(lemmatized_tokens)

['Tokenization', 'first', 'step', 'text', 'analytics.The', 'process', 'breaking', 'text', 'paragraph', 'smaller', 'chunk', 'word', 'sentence', 'called', 'Tokenization', '.']
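Unlike stemming, lemmatization returns dictionary words, but WordNetLemmatizer assumes every token is a noun unless told otherwise, which is why 'breaking' and 'called' are unchanged above. Passing the part of speech fixes this:

# pos='v' tells the lemmatizer to treat the token as a verb
print(lemmatizer.lemmatize('breaking', pos='v'))   # break
print(lemmatizer.lemmatize('called', pos='v'))     # call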

2. Create a representation of the document by calculating Term Frequency and Inverse Document Frequency.

# Join the lemmatized tokens with spaces; joining with '' fuses them into one unreadable string
preprocessed_text = ' '.join(lemmatized_tokens)

from sklearn.feature_extraction.text import TfidfVectorizer

tfidf_vectorizer = TfidfVectorizer()
tfidf_representation = tfidf_vectorizer.fit_transform([preprocessed_text])
print('preprocessed text:', preprocessed_text)
print("\nTF-IDF Representation:")
print(tfidf_representation.toarray())

preprocessed text: Tokenization first step text analytics.The process breaking text paragraph smaller chunk word sentence called Tokenization .

TF-IDF Representation:
[[0.2236068 0.2236068 0.2236068 0.2236068 0.2236068 0.2236068 0.2236068
  0.2236068 0.2236068 0.2236068 0.4472136 0.2236068 0.4472136 0.2236068]]
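With a single document the IDF term is constant, so the scores above are just normalized term counts; TF-IDF only becomes informative when terms can be compared across documents. A minimal sketch with a small hypothetical corpus (get_feature_names_out is the scikit-learn 1.0+ API):

# Two toy documents (hypothetical) so that IDF can differ between terms
docs = [
    "Tokenization is the first step in text analytics.",
    "Stop word removal, stemming and lemmatization follow tokenization.",
]
vectorizer = TfidfVectorizer()
matrix = vectorizer.fit_transform(docs)
print(vectorizer.get_feature_names_out())   # vocabulary learned from both documents
print(matrix.toarray())                     # one row of TF-IDF weights per document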

