

Name:- Vipin Rawat Course:- MCA Section:- 4E

Roll No:-

Q1:- Write a Python program to tokenize a statement.

import nltk
from nltk.tokenize import word_tokenize

nltk.download('punkt')
sentence = "Tokenize this sentence."
tokens = word_tokenize(sentence)
print(tokens)

Q2:-Remove the stopwords.

import nltk
from nltk.corpus import stopwords

nltk.download('stopwords')
stop_words = set(stopwords.words('english'))
filtered_sentence = [word for word in tokens if not word.lower() in stop_words]
print(filtered_sentence)

import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')
tokens = ["the", "quick", "brown", "fox", "jumps", "over", "the", "lazy", "dog"]
stop_words = set(stopwords.words('english'))
newfiltered_sentence = [word for word in tokens if not word.lower() in stop_words]
print(newfiltered_sentence)

Q3:-To carry out Stemming or Lemmatization.



from nltk.stem import PorterStemmer

ps = PorterStemmer()
# Stem the tokens produced in Q2
stemmed_words = [ps.stem(word) for word in tokens]
print(stemmed_words)
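Q3 also mentions lemmatization, which the answer above does not show. As a minimal sketch (an addition, not part of the original answer), NLTK's WordNetLemmatizer can lemmatize the same tokens; it needs the 'wordnet' resource, and here every token is treated as a verb via pos='v'.

import nltk
from nltk.stem import WordNetLemmatizer

nltk.download('wordnet')
lemmatizer = WordNetLemmatizer()
# Lemmatize the tokens from Q2; pos='v' treats each token as a verb
lemmatized_words = [lemmatizer.lemmatize(word, pos='v') for word in tokens]
print(lemmatized_words)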

Q4:- To carry out parts-of-speech tagging.

nltk.download('averaged_perceptron_tagger')
tagged_words = nltk.pos_tag(tokens)
print(tagged_words)

import nltk
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
sentence = "This is a sample sentence"
tokens = nltk.word_tokenize(sentence)
tagged_words = nltk.pos_tag(tokens)
print(tagged_words)

Q5:- To carry out chunking of words based on parts-of-speech tagging.

nltk.download('maxent_ne_chunker')
nltk.download('words')
# ne_chunk performs named-entity chunking on the POS-tagged words from Q4
ne_chunks = nltk.ne_chunk(tagged_words)
print(ne_chunks)
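Note that ne_chunk performs named-entity chunking. Since the question wording asks for chunking based on POS tags, a minimal grammar-based sketch (an addition, reusing tagged_words from Q4) would be:

from nltk import RegexpParser

# NP chunk rule: optional determiner, any adjectives, one or more nouns
np_grammar = "NP: {<DT>?<JJ>*<NN.*>+}"
np_parser = RegexpParser(np_grammar)
np_chunks = np_parser.parse(tagged_words)
print(np_chunks)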

Q6:-Regular expression tagger.

import nltk
from nltk.tokenize import word_tokenize
from nltk import pos_tag, RegexpParser
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')

def tokenize_and_chunk_input():
    user_input = input("Enter a sentence: ")
    tokens = word_tokenize(user_input)
    tagged_tokens = pos_tag(tokens)
    grammar = r"""
      NP: {<DT|JJ|NN.*>+}
      PP: {<IN><NP>}
      VP: {<VB.*><NP|PP|CLAUSE>+$}
      CLAUSE: {<NP><VP>}
    """
    chunk_parser = RegexpParser(grammar)
    chunks = chunk_parser.parse(tagged_tokens)
    return chunks

if __name__ == "__main__":
    print("Tokenizing and chunking input from the user using NLTK...")
    chunked_input = tokenize_and_chunk_input()
    print("Chunked input:", chunked_input)

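The code above builds a regular-expression chunker. Since Q6 literally asks for a regular expression tagger, a minimal sketch of NLTK's RegexpTagger (an addition, with an illustrative pattern list) is shown below:

from nltk.tag import RegexpTagger

patterns = [
    (r'.*ing$', 'VBG'),        # gerunds
    (r'.*ed$', 'VBD'),         # simple past
    (r'.*es$|.*s$', 'NNS'),    # plural nouns
    (r'^(the|a|an)$', 'DT'),   # determiners
    (r'.*', 'NN'),             # default: noun
]
regexp_tagger = RegexpTagger(patterns)
print(regexp_tagger.tag(word_tokenize("the dogs chased a running cat")))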

Q7:- Write a program to take input from the user and carry out all the basic operations of NLP.

import nltk
from nltk.tokenize import word_tokenize
from nltk import pos_tag, RegexpParser
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
def tokenize_and_chunk_input():
    user_input = input("Enter a sentence: ")
    tokens = word_tokenize(user_input)
    tagged_tokens = pos_tag(tokens)
    grammar = r"""
      NP: {<DT|JJ|NN.*>+}
      PP: {<IN><NP>}
      VP: {<VB.*><NP|PP|CLAUSE>+$}
      CLAUSE: {<NP><VP>}
    """
    chunk_parser = RegexpParser(grammar)
    chunks = chunk_parser.parse(tagged_tokens)

    return chunks

if __name__ == "__main__":
    print("Tokenizing and chunking input from the user using NLTK...")
    chunked_input = tokenize_and_chunk_input()
    print("Chunked input:", chunked_input)

Q8:- To calculate TF-IDF (term frequency-inverse document frequency) for a given set of sentences.

from sklearn.feature_extraction.text import TfidfVectorizer


documents = [
    "This is the first document.",
    "This document is the second document.",
    "And this is the third one.",
    "Is this the first document?",
]
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(documents)
print(tfidf_matrix.toarray())
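As a small follow-up (not required by the question), the TF-IDF matrix can be used to compare documents, for example with cosine similarity:

from sklearn.metrics.pairwise import cosine_similarity

# Pairwise similarity between the four documents, based on their TF-IDF vectors
similarity = cosine_similarity(tfidf_matrix)
print(similarity)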

Q9:- Write a program to carry out count vectorization and transform each document into a vector of word counts.

from sklearn.feature_extraction.text import CountVectorizer


documents = [
    "This is the first document.",
    "This document is the second document.",
    "And this is the third one.",
    "Is this the first document?",
]
vectorizer = CountVectorizer()
bow_matrix = vectorizer.fit_transform(documents)
print(bow_matrix.toarray())
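To interpret the count matrix, the column order can be printed as well (assuming a recent scikit-learn; older versions use get_feature_names instead):

# Vocabulary in the same order as the columns of bow_matrix
print(vectorizer.get_feature_names_out())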

Q10:- Write a Python program to find similar words using a Word2Vec model.

from gensim.models import Word2Vec


sentences = [
    ["this", "is", "the", "first", "sentence", "for", "word2vec"],
    ["this", "is", "the", "second", "sentence"],
    ["yet", "another", "sentence"],
    ["one", "more", "sentence"],
    ["and", "the", "final", "sentence"],
]
model = Word2Vec(sentences, min_count=1)
similar_words = model.wv.most_similar("sentence")
print(similar_words)
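A brief usage note (an addition): the trained model also exposes the raw embedding of a word and pairwise similarities; with the default settings each vector has 100 dimensions.

# Embedding of a single word (first 10 of the 100 default dimensions)
print(model.wv['sentence'][:10])
# Cosine similarity between two words from the training vocabulary
print(model.wv.similarity('first', 'second'))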

Q11:- Write a Python program to find the common words between two given paragraphs.

import nltk
from nltk.corpus import gutenberg
nltk.download('gutenberg')
nltk.download('punkt')
paragraph1 = "Hello and welcome! How can I assist you today?"
paragraph2 = "Greetings! I'm here to help. What can I do for you?"
words_paragraph1 = set(nltk.word_tokenize(paragraph1.lower()))
words_paragraph2 = set(nltk.word_tokenize(paragraph2.lower()))

similar_words = words_paragraph1.intersection(words_paragraph2)
print("Similar words between the two paragraphs:")
print(similar_words)

Q12:- Write a Python program to tokenize the given text sentence by sentence.

import nltk
nltk.download('punkt')
def tokenize_sentences(text):
    sentences = nltk.sent_tokenize(text)
    return sentences

text = ("Tokenization is the process of breaking text into sentences. "
        "It's an important step in natural language processing.")
sentences = tokenize_sentences(text)
for sentence in sentences:
    print(sentence)

Q13:- Write a Python program to take a sample of at least 5 lines and tokenize it either by word or by sentence.

import nltk
nltk.download('punkt')
def tokenize_sentences(text):
    sentences = nltk.sent_tokenize(text)
    return sentences

text = ("Tokenization is the process of breaking text into sentences. "
        "It's an important step in natural language processing. Gjh. yhfh. Jhyfhu ")
sentences = tokenize_sentences(text)
for sentence in sentences:
    print(sentence)

# Tokenize the sample text by sentence, then each sentence by word
def tokenize_words(sentence):
    words = nltk.word_tokenize(sentence)
    return words

for sentence in tokenize_sentences(text):
    print(sentence)
    print(tokenize_words(sentence))

Q14:- Write a Python program to download sample text from the Gutenberg corpus and extract the nouns as well as the verbs. The sample text is taken from Shakespeare's Hamlet.

import nltk
from nltk.corpus import gutenberg
from nltk.tokenize import word_tokenize
from nltk.tag import pos_tag
nltk.download('gutenberg')
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
sample_text = gutenberg.raw('shakespeare-hamlet.txt')[:1000]
tokens = word_tokenize(sample_text)
tagged_tokens = pos_tag(tokens)
nouns = [word for word, pos in tagged_tokens if pos.startswith('NN')]
verbs = [word for word, pos in tagged_tokens if pos.startswith('VB')]
print("Nouns:")
print(nouns)
print("\nVerbs:")
print(verbs)
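As a small extension (not asked for in Q14), the extracted nouns can be counted with NLTK's FreqDist to see which occur most often in the sample:

from nltk import FreqDist

noun_freq = FreqDist(nouns)
print(noun_freq.most_common(5))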

Q15:- Write a Python program to remove stopwords from a given text using the built-in stopwords list from NLTK.

import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')
def remove_stopwords(text):
    stop_words = set(stopwords.words('english'))
    words = text.split()
    filtered_text = [word for word in words if word.lower() not in stop_words]
    return ' '.join(filtered_text)

text = "This is a sample sentence with some stop words that need to be removed."
filtered_text = remove_stopwords(text)
print("Filtered text:", filtered_text)



Q16:- Write a Python program to remove stopwords using a customized list of stopwords.

def remove_stopwords_custom(text, custom_stopwords):
    words = text.split()
    filtered_text = [word for word in words if word.lower() not in custom_stopwords]
    return ' '.join(filtered_text)

custom_stopwords = {'is', 'a', 'with', 'to', 'be'}
filtered_text_custom = remove_stopwords_custom(text, custom_stopwords)
print("Filtered text (custom):", filtered_text_custom)

Q17:- Write a Python program to remove stopwords using the stopword lists of other libraries (spaCy, scikit-learn, and Gensim).


import spacy

nlp = spacy.load("en_core_web_sm")
text = "This is a sample sentence with some stop words that need to be removed."

def remove_stopwords_spacy(text):
    doc = nlp(text)
    filtered_text = [token.text for token in doc if not token.is_stop]
    return ' '.join(filtered_text)

filtered_text_spacy = remove_stopwords_spacy(text)
print("Filtered text (spaCy):", filtered_text_spacy)

from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS

def remove_stopwords_sklearn(text):
    stop_words = set(ENGLISH_STOP_WORDS)
    words = text.split()
    filtered_text = [word for word in words if word.lower() not in stop_words]
    return ' '.join(filtered_text)

filtered_text_sklearn = remove_stopwords_sklearn(text)
print("Filtered text (scikit-learn):", filtered_text_sklearn)

from gensim.parsing.preprocessing import remove_stopwords as gensim_remove_stopwords


filtered_text_gensim = gensim_remove_stopwords(text)
print("Filtered text (Gensim):", filtered_text_gensim)

from nltk.stem import SnowballStemmer

# Stemming with the Snowball stemmer (compare with the Porter stemmer in Q18)
def nltk_snowball_stemming(text):
    stemmer = SnowballStemmer("english")
    tokens = text.split()
    stemmed_tokens = [stemmer.stem(token) for token in tokens]
    return " ".join(stemmed_tokens)

text = "I am going to the store to buy some apples and oranges"
stemmed_text = nltk_snowball_stemming(text)
print(stemmed_text)

Q18:- Write a Python program to demonstrate stemming using the Porter stemmer.

from nltk.stem import PorterStemmer

def nltk_stemming(text):
    stemmer = PorterStemmer()
    tokens = text.split()
    stemmed_tokens = [stemmer.stem(token) for token in tokens]
    return " ".join(stemmed_tokens)

text = "I am going to the store to buy some apples and oranges"
stemmed_text = nltk_stemming(text)
print(stemmed_text)

Q19:- Write a Python program to demonstrate stemming using the Lancaster stemmer.

from nltk.stem import LancasterStemmer


lancaster_stemmer = LancasterStemmer()
words = ["running", "flies", "swimming", "happier", "cats", "dogs"]
stemmed_words = [lancaster_stemmer.stem(word) for word in words]

for original, stemmed in zip(words, stemmed_words):
    print(f"Original: {original}, Stemmed: {stemmed}")

Q20:- Write a Python program to demonstrate stemming using regular-expression stemming.

import re

def regex_stemmer(word):
    patterns = [
        (r's$', ''),
        (r'ed$', ''),
        (r'ing$', '')
    ]
    for pattern, replacement in patterns:
        if re.search(pattern, word):
            return re.sub(pattern, replacement, word)
    return word

words = ["running", "flies", "swimming", "happier", "cats", "dogs"]
stemmed_words = [regex_stemmer(word) for word in words]
for original, stemmed in zip(words, stemmed_words):
    print(f"Original: {original}, Stemmed: {stemmed}")

from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF revisited (see Q8), this time also printing the feature names
documents = [
    "This is the first document.",
    "This document is the second document.",
    "And this is the third one.",
    "Is this the first document?",
]
tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix = tfidf_vectorizer.fit_transform(documents)
print("Feature Names:", tfidf_vectorizer.get_feature_names_out())
print("TF-IDF Matrix:")
print(tfidf_matrix.toarray())

import nltk
nltk.download('punkt')
nltk.download('hmm_treebank_pos_tagger')
nltk.download('averaged_perceptron_tagger')

# Train a simple HMM tagger on a single POS-tagged sentence (see Q21 below for the full example)
words = nltk.word_tokenize("The quick brown fox jumps over the lazy dog.")
default_pos_tags = nltk.pos_tag(words)
hmm_tagger = nltk.tag.HiddenMarkovModelTagger.train([default_pos_tags])
hmm_pos_tags = hmm_tagger.tag(words)
print("\nStochastic POS tagging (Hidden Markov Model):")
print(hmm_pos_tags)

Q21:- Write a Python program to carry out POS tagging.

import nltk
from nltk.tokenize import word_tokenize
from nltk.tag import pos_tag
from nltk.corpus import wordnet
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')
text = "The quick brown fox jumps over the lazy dog."
words = word_tokenize(text)
default_pos_tags = pos_tag(words)
print("Default POS tagging:")
print(default_pos_tags)
print("\nRule-based POS tagging (same as default tagger):")
print(default_pos_tags)
nltk.download('hmm_treebank_pos_tagger')
hmm_tagger = nltk.tag.HiddenMarkovModelTagger.train([default_pos_tags])
hmm_pos_tags = hmm_tagger.tag(words)
print("\nStochastic POS tagging (Hidden Markov Model):")
print(hmm_pos_tags)
print("\nStochastic POS tagging (Hidden Markov Model):")
print(hmm_pos_tags)
def get_wordnet_pos(tag):
    if tag.startswith('J'):
        return wordnet.ADJ
    elif tag.startswith('V'):

        return wordnet.VERB
    elif tag.startswith('N'):
        return wordnet.NOUN
    elif tag.startswith('R'):
        return wordnet.ADV
    else:
        return None
wordnet_tags = pos_tag(words)
wordnet_pos_tags = [(word, get_wordnet_pos(tag)) for word, tag in wordnet_tags]
print("\nDictionary-based POS tagging (WordNet):")
print(wordnet_pos_tags)
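The WordNet-style tags produced by get_wordnet_pos are exactly what WordNetLemmatizer expects, so a natural follow-up (an addition, reusing wordnet_pos_tags from above) is POS-aware lemmatization:

from nltk.stem import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()
# Fall back to the surface form when no WordNet POS is available
lemmas = [lemmatizer.lemmatize(word, pos) if pos else word for word, pos in wordnet_pos_tags]
print(lemmas)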

Q22:- Write a Python program to develop a chatbot that helps to diagnose simple flu symptoms.

import random
greetings = ["Hello!", "Hi there!", "Welcome!", "Greetings!"]
common_questions = [
    "What is your name?",
    "How can I help you today?",
    "What symptoms are you experiencing?",
    "Do you have any allergies?",
    "Are you currently taking any medications?",
]
responses = [
    "I'm sorry, I'm just a chatbot and cannot provide medical advice. It's best to consult with a healthcare professional.",
    "Please consult with a doctor for proper diagnosis and treatment.",
    "It's important to seek medical attention for your condition.",
    "I recommend reaching out to a healthcare professional to discuss your concerns.",
]
def get_random_greeting():
    return random.choice(greetings)

def respond(user_input):
    if user_input.endswith("?"):
        return random.choice(responses)
    else:
        return random.choice(common_questions)

def chat():
    print(get_random_greeting())
    while True:
        user_input = input(">")

        if user_input.lower() == "exit":
            break
        print(respond(user_input))

chat()

Q23:- Write a Python program to create an interactive health chatbot that suggests a possible diagnosis and home treatment for the reported symptoms.

class HealthChatbot:
    def __init__(self):
        self.symptoms = []

    def greet_user(self):
        print("Hello! I am your health chatbot. Let's check your symptoms.")

    def ask_symptoms(self):
        print("Please answer the following questions with 'yes' or 'no'.")
        self.symptoms.append(input("Do you have a fever? ").lower())
        self.symptoms.append(input("Do you have a cough? ").lower())
        self.symptoms.append(input("Do you have difficulty breathing? ").lower())

    def diagnose(self):
        fever = self.symptoms[0] == 'yes'
        cough = self.symptoms[1] == 'yes'
        difficulty_breathing = self.symptoms[2] == 'yes'
        if fever and cough and difficulty_breathing:
            print("Based on your symptoms, you may have pneumonia. Please consult a doctor immediately.")
        elif fever and cough:
            print("Based on your symptoms, you may have a common cold or flu. Get plenty of rest and fluids.")
        elif difficulty_breathing:
            print("Based on your symptoms, you may have a respiratory issue. Seek medical attention promptly.")
        else:
            print("Based on your symptoms, you seem to be generally healthy. However, if you feel unwell, consult a doctor.")

    def start(self):

        self.greet_user()
        self.ask_symptoms()
        self.diagnose()

if __name__ == "__main__":
    chatbot = HealthChatbot()
    chatbot.start()

Q24:- Write a Python program using SVM and TF-IDF to classify the given corpus of documents.

from sklearn.datasets import fetch_20newsgroups


from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
categories = ['alt.atheism', 'comp.graphics', 'sci.med', 'soc.religion.christian']
news = fetch_20newsgroups(categories=categories)
X_train, X_test, y_train, y_test = train_test_split(news.data, news.target, test_size=0.2, random_state=42)
model = make_pipeline(TfidfVectorizer(), SVC())
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

Q25:- Write a Python program to calculate the bag of words after carrying out the following preprocessing of the text: convert all the text to lowercase and replace all punctuation with a space.

from collections import Counter

def preprocess_text(text):
    # Convert to lowercase (the question asks for lowercase, not uppercase)
    text = text.lower()
    for punctuation in '!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~':
        text = text.replace(punctuation, ' ')
    return text

def create_bag_of_words(text):
    text = preprocess_text(text)
    words = text.split()
    bag_of_words = Counter(words)
    return bag_of_words

text = "HOW how is the boss is a .Simple Example is sometimes better than better."
bow = create_bag_of_words(text)
print("Bag of Words:")
for word, count in bow.items():
    print(f"{word}: {count}")
