

Name:- Vipin Rawat Course:- MCA Section:- 4E

Roll No:-

Q1:- Write a Python program to tokenize a statement.

import nltk
from nltk.tokenize import word_tokenize

nltk.download('punkt')
sentence = "Tokenize this sentence."
tokens = word_tokenize(sentence)
print(tokens)

Q2:-Remove the stopwords.

import nltk
from nltk.corpus import stopwords

nltk.download('stopwords')
stop_words = set(stopwords.words('english'))
filtered_sentence = [word for word in tokens if not word.lower() in stop_words]
print(filtered_sentence)

import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')
tokens = ["the", "quick", "brown", "fox", "jumps", "over", "the", "lazy", "dog"]
stop_words = set(stopwords.words('english'))
newfiltered_sentence = [word for word in tokens if not word.lower() in stop_words]
print(newfiltered_sentence)

Q3:-To carry out Stemming or Lemmatization.



from nltk.stem import PorterStemmer

ps = PorterStemmer()
# Stem the tokens produced in Q2
stemmed_words = [ps.stem(word) for word in tokens]
print(stemmed_words)
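Q3 also mentions lemmatization, which the answer above does not show. As a minimal sketch (an addition, not part of the original answer), NLTK's WordNetLemmatizer can lemmatize the same tokens; it needs the 'wordnet' resource, and here every token is treated as a verb via pos='v'.

import nltk
from nltk.stem import WordNetLemmatizer

nltk.download('wordnet')
lemmatizer = WordNetLemmatizer()
# Lemmatize the tokens from Q2; pos='v' treats each token as a verb
lemmatized_words = [lemmatizer.lemmatize(word, pos='v') for word in tokens]
print(lemmatized_words)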

Q4:- To carry out parts-of-speech tagging.

nltk.download('averaged_perceptron_tagger')
tagged_words = nltk.pos_tag(tokens)
print(tagged_words)

import nltk
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
sentence = "This is a sample sentence"
tokens = nltk.word_tokenize(sentence)
tagged_words = nltk.pos_tag(tokens)
print(tagged_words)

Q5:- To carry out chunking of words based on parts-of-speech tagging.

nltk.download('maxent_ne_chunker')
nltk.download('words')
# ne_chunk performs named-entity chunking on the POS-tagged words from Q4
ne_chunks = nltk.ne_chunk(tagged_words)
print(ne_chunks)
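Note that ne_chunk performs named-entity chunking. Since the question wording asks for chunking based on POS tags, a minimal grammar-based sketch (an addition, reusing tagged_words from Q4) would be:

from nltk import RegexpParser

# NP chunk rule: optional determiner, any adjectives, one or more nouns
np_grammar = "NP: {<DT>?<JJ>*<NN.*>+}"
np_parser = RegexpParser(np_grammar)
np_chunks = np_parser.parse(tagged_words)
print(np_chunks)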

Q6:-Regular expression tagger.

import nltk
from nltk.tokenize import word_tokenize
from nltk import pos_tag, RegexpParser
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')

def tokenize_and_chunk_input():
    user_input = input("Enter a sentence: ")
    tokens = word_tokenize(user_input)
    tagged_tokens = pos_tag(tokens)
    grammar = r"""
      NP: {<DT|JJ|NN.*>+}
      PP: {<IN><NP>}
      VP: {<VB.*><NP|PP|CLAUSE>+$}
      CLAUSE: {<NP><VP>}
    """
    chunk_parser = RegexpParser(grammar)
    chunks = chunk_parser.parse(tagged_tokens)
    return chunks

if __name__ == "__main__":
    print("Tokenizing and chunking input from the user using NLTK...")
    chunked_input = tokenize_and_chunk_input()
    print("Chunked input:", chunked_input)

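The code above builds a regular-expression chunker. Since Q6 literally asks for a regular expression tagger, a minimal sketch of NLTK's RegexpTagger (an addition, with an illustrative pattern list) is shown below:

from nltk.tag import RegexpTagger

patterns = [
    (r'.*ing$', 'VBG'),        # gerunds
    (r'.*ed$', 'VBD'),         # simple past
    (r'.*es$|.*s$', 'NNS'),    # plural nouns
    (r'^(the|a|an)$', 'DT'),   # determiners
    (r'.*', 'NN'),             # default: noun
]
regexp_tagger = RegexpTagger(patterns)
print(regexp_tagger.tag(word_tokenize("the dogs chased a running cat")))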

Q7:- Write a program to take input from the user and carry out all the basic operations of NLP.

import nltk
from nltk.tokenize import word_tokenize
from nltk import pos_tag, RegexpParser
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
def tokenize_and_chunk_input():
    user_input = input("Enter a sentence: ")
    tokens = word_tokenize(user_input)
    tagged_tokens = pos_tag(tokens)
    grammar = r"""
      NP: {<DT|JJ|NN.*>+}
      PP: {<IN><NP>}
      VP: {<VB.*><NP|PP|CLAUSE>+$}
      CLAUSE: {<NP><VP>}
    """
    chunk_parser = RegexpParser(grammar)
    chunks = chunk_parser.parse(tagged_tokens)

    return chunks

if __name__ == "__main__":
    print("Tokenizing and chunking input from the user using NLTK...")
    chunked_input = tokenize_and_chunk_input()
    print("Chunked input:", chunked_input)

Q8:- To calculate TF-IDF (term frequency-inverse document frequency) for a given set of sentences.

from sklearn.feature_extraction.text import TfidfVectorizer


documents = [
    "This is the first document.",
    "This document is the second document.",
    "And this is the third one.",
    "Is this the first document?",
]
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(documents)
print(tfidf_matrix.toarray())
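As a small follow-up (not required by the question), the TF-IDF matrix can be used to compare documents, for example with cosine similarity:

from sklearn.metrics.pairwise import cosine_similarity

# Pairwise similarity between the four documents, based on their TF-IDF vectors
similarity = cosine_similarity(tfidf_matrix)
print(similarity)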

Q9:- Write a program to carry out count vectorization and transform each document into a vector of word counts.

from sklearn.feature_extraction.text import CountVectorizer


documents = [
    "This is the first document.",
    "This document is the second document.",
    "And this is the third one.",
    "Is this the first document?",
]
vectorizer = CountVectorizer()
bow_matrix = vectorizer.fit_transform(documents)
print(bow_matrix.toarray())
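To interpret the count matrix, the column order can be printed as well (assuming a recent scikit-learn; older versions use get_feature_names instead):

# Vocabulary in the same order as the columns of bow_matrix
print(vectorizer.get_feature_names_out())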

Q10:- Write a Python program to find similar words using a Word2Vec model.

from gensim.models import Word2Vec


sentences = [
    ["this", "is", "the", "first", "sentence", "for", "word2vec"],
    ["this", "is", "the", "second", "sentence"],
    ["yet", "another", "sentence"],
    ["one", "more", "sentence"],
    ["and", "the", "final", "sentence"],
]
model = Word2Vec(sentences, min_count=1)
similar_words = model.wv.most_similar("sentence")
print(similar_words)
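A brief usage note (an addition): the trained model also exposes the raw embedding of a word and pairwise similarities; with the default settings each vector has 100 dimensions.

# Embedding of a single word (first 10 of the 100 default dimensions)
print(model.wv['sentence'][:10])
# Cosine similarity between two words from the training vocabulary
print(model.wv.similarity('first', 'second'))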

Q11:- Write a Python program to find the common words between two given paragraphs.

import nltk
from nltk.corpus import gutenberg
nltk.download('gutenberg')
nltk.download('punkt')
paragraph1 = "Hello and welcome! How can I assist you today?"
paragraph2 = "Greetings! I'm here to help. What can I do for you?"
words_paragraph1 = set(nltk.word_tokenize(paragraph1.lower()))
words_paragraph2 = set(nltk.word_tokenize(paragraph2.lower()))

similar_words = words_paragraph1.intersection(words_paragraph2)
print("Similar words between the two paragraphs:")
print(similar_words)

Q12:- Write a Python program to tokenize the given text sentence by sentence.

import nltk
nltk.download('punkt')
def tokenize_sentences(text):
    sentences = nltk.sent_tokenize(text)
    return sentences

text = ("Tokenization is the process of breaking text into sentences. "
        "It's an important step in natural language processing.")
sentences = tokenize_sentences(text)
for sentence in sentences:
    print(sentence)

Q13:- Write a Python program to take a sample of at least 5 lines and tokenize it either by word or by sentence.

import nltk
nltk.download('punkt')
def tokenize_sentences(text):
    sentences = nltk.sent_tokenize(text)
    return sentences

text = ("Tokenization is the process of breaking text into sentences. "
        "It's an important step in natural language processing. Gjh. yhfh. Jhyfhu ")
sentences = tokenize_sentences(text)
for sentence in sentences:
    print(sentence)

# Tokenize the sample text by sentence, then each sentence by word
def tokenize_words(sentence):
    words = nltk.word_tokenize(sentence)
    return words

for sentence in tokenize_sentences(text):
    print(sentence)
    print(tokenize_words(sentence))

Q14:- Write a Python program to download sample text from the Gutenberg corpus and extract the nouns as well as the verbs. The sample text is taken from Shakespeare's Hamlet.

import nltk
from nltk.corpus import gutenberg
from nltk.tokenize import word_tokenize
from nltk.tag import pos_tag
nltk.download('gutenberg')
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
sample_text = gutenberg.raw('shakespeare-hamlet.txt')[:1000]
tokens = word_tokenize(sample_text)
tagged_tokens = pos_tag(tokens)
nouns = [word for word, pos in tagged_tokens if pos.startswith('NN')]
verbs = [word for word, pos in tagged_tokens if pos.startswith('VB')]
print("Nouns:")
print(nouns)
print("\nVerbs:")
print(verbs)
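As a small extension (not asked for in Q14), the extracted nouns can be counted with NLTK's FreqDist to see which occur most often in the sample:

from nltk import FreqDist

noun_freq = FreqDist(nouns)
print(noun_freq.most_common(5))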

Q15:- Write a Python program to remove stopwords from a given text using the built-in stopwords list from NLTK.

import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')
def remove_stopwords(text):
    stop_words = set(stopwords.words('english'))
    words = text.split()
    filtered_text = [word for word in words if word.lower() not in stop_words]
    return ' '.join(filtered_text)

text = "This is a sample sentence with some stop words that need to be removed."
filtered_text = remove_stopwords(text)
print("Filtered text:", filtered_text)



Q16:- Write a Python program to remove stopwords using a customized list of stopwords.

def remove_stopwords_custom(text, custom_stopwords):
    words = text.split()
    filtered_text = [word for word in words if word.lower() not in custom_stopwords]
    return ' '.join(filtered_text)

custom_stopwords = {'is', 'a', 'with', 'to', 'be'}
filtered_text_custom = remove_stopwords_custom(text, custom_stopwords)
print("Filtered text (custom):", filtered_text_custom)

Q17:- Write a Python program to remove stopwords using the stopword lists of other libraries (spaCy, scikit-learn, and Gensim).


import spacy

nlp = spacy.load("en_core_web_sm")
text = "This is a sample sentence with some stop words that need to be removed."

def remove_stopwords_spacy(text):
    doc = nlp(text)
    filtered_text = [token.text for token in doc if not token.is_stop]
    return ' '.join(filtered_text)

filtered_text_spacy = remove_stopwords_spacy(text)
print("Filtered text (spaCy):", filtered_text_spacy)

from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS

def remove_stopwords_sklearn(text):
    stop_words = set(ENGLISH_STOP_WORDS)
    words = text.split()
    filtered_text = [word for word in words if word.lower() not in stop_words]
    return ' '.join(filtered_text)

filtered_text_sklearn = remove_stopwords_sklearn(text)
print("Filtered text (scikit-learn):", filtered_text_sklearn)

from gensim.parsing.preprocessing import remove_stopwords as gensim_remove_stopwords


filtered_text_gensim = gensim_remove_stopwords(text)
print("Filtered text (Gensim):", filtered_text_gensim)

from nltk.stem import SnowballStemmer

# Stemming with the Snowball stemmer (compare with the Porter stemmer in Q18)
def nltk_snowball_stemming(text):
    stemmer = SnowballStemmer("english")
    tokens = text.split()
    stemmed_tokens = [stemmer.stem(token) for token in tokens]
    return " ".join(stemmed_tokens)

text = "I am going to the store to buy some apples and oranges"
stemmed_text = nltk_snowball_stemming(text)
print(stemmed_text)

Q18:- Write a Python program to demonstrate stemming using the Porter stemmer.

from nltk.stem import PorterStemmer

def nltk_stemming(text):
    stemmer = PorterStemmer()
    tokens = text.split()
    stemmed_tokens = [stemmer.stem(token) for token in tokens]
    return " ".join(stemmed_tokens)

text = "I am going to the store to buy some apples and oranges"
stemmed_text = nltk_stemming(text)
print(stemmed_text)

Q19:- Write a Python program to demonstrate stemming using the Lancaster stemmer.

from nltk.stem import LancasterStemmer


lancaster_stemmer = LancasterStemmer()
words = ["running", "flies", "swimming", "happier", "cats", "dogs"]
stemmed_words = [lancaster_stemmer.stem(word) for word in words]

for original, stemmed in zip(words, stemmed_words):
    print(f"Original: {original}, Stemmed: {stemmed}")

Q20:- Write a Python program to demonstrate stemming using regular-expression stemming.

import re

def regex_stemmer(word):
    patterns = [
        (r's$', ''),
        (r'ed$', ''),
        (r'ing$', '')
    ]
    for pattern, replacement in patterns:
        if re.search(pattern, word):
            return re.sub(pattern, replacement, word)
    return word

words = ["running", "flies", "swimming", "happier", "cats", "dogs"]
stemmed_words = [regex_stemmer(word) for word in words]
for original, stemmed in zip(words, stemmed_words):
    print(f"Original: {original}, Stemmed: {stemmed}")

from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF revisited (see Q8), this time also printing the feature names
documents = [
    "This is the first document.",
    "This document is the second document.",
    "And this is the third one.",
    "Is this the first document?",
]
tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix = tfidf_vectorizer.fit_transform(documents)
print("Feature Names:", tfidf_vectorizer.get_feature_names_out())
print("TF-IDF Matrix:")
print(tfidf_matrix.toarray())

import nltk
nltk.download('punkt')
nltk.download('hmm_treebank_pos_tagger')
nltk.download('averaged_perceptron_tagger')

# Train a simple HMM tagger on a single POS-tagged sentence (see Q21 below for the full example)
words = nltk.word_tokenize("The quick brown fox jumps over the lazy dog.")
default_pos_tags = nltk.pos_tag(words)
hmm_tagger = nltk.tag.HiddenMarkovModelTagger.train([default_pos_tags])
hmm_pos_tags = hmm_tagger.tag(words)
print("\nStochastic POS tagging (Hidden Markov Model):")
print(hmm_pos_tags)

Q21:- Write a Python program to carry out POS tagging.

import nltk
from nltk.tokenize import word_tokenize
from nltk.tag import pos_tag
from nltk.corpus import wordnet
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')
text = "The quick brown fox jumps over the lazy dog."
words = word_tokenize(text)
default_pos_tags = pos_tag(words)
print("Default POS tagging:")
print(default_pos_tags)
print("\nRule-based POS tagging (same as default tagger):")
print(default_pos_tags)
nltk.download('hmm_treebank_pos_tagger')
hmm_tagger = nltk.tag.HiddenMarkovModelTagger.train([default_pos_tags])
hmm_pos_tags = hmm_tagger.tag(words)
print("\nStochastic POS tagging (Hidden Markov Model):")
print(hmm_pos_tags)
print("\nStochastic POS tagging (Hidden Markov Model):")
print(hmm_pos_tags)
def get_wordnet_pos(tag):
    if tag.startswith('J'):
        return wordnet.ADJ
    elif tag.startswith('V'):

        return wordnet.VERB
    elif tag.startswith('N'):
        return wordnet.NOUN
    elif tag.startswith('R'):
        return wordnet.ADV
    else:
        return None
wordnet_tags = pos_tag(words)
wordnet_pos_tags = [(word, get_wordnet_pos(tag)) for word, tag in wordnet_tags]
print("\nDictionary-based POS tagging (WordNet):")
print(wordnet_pos_tags)
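The WordNet-style tags produced by get_wordnet_pos are exactly what WordNetLemmatizer expects, so a natural follow-up (an addition, reusing wordnet_pos_tags from above) is POS-aware lemmatization:

from nltk.stem import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()
# Fall back to the surface form when no WordNet POS is available
lemmas = [lemmatizer.lemmatize(word, pos) if pos else word for word, pos in wordnet_pos_tags]
print(lemmas)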

Q22:- Write a Python program to develop a chatbot that helps to diagnose simple flu symptoms.

import random
greetings = ["Hello!", "Hi there!", "Welcome!", "Greetings!"]
common_questions = [
    "What is your name?",
    "How can I help you today?",
    "What symptoms are you experiencing?",
    "Do you have any allergies?",
    "Are you currently taking any medications?",
]
responses = [
    "I'm sorry, I'm just a chatbot and cannot provide medical advice. It's best to consult with a healthcare professional.",
    "Please consult with a doctor for proper diagnosis and treatment.",
    "It's important to seek medical attention for your condition.",
    "I recommend reaching out to a healthcare professional to discuss your concerns.",
]
def get_random_greeting():
    return random.choice(greetings)

def respond(user_input):
    if user_input.endswith("?"):
        return random.choice(responses)
    else:
        return random.choice(common_questions)

def chat():
    print(get_random_greeting())
    while True:
        user_input = input(">")

        if user_input.lower() == "exit":
            break
        print(respond(user_input))

chat()

Q23:- Write a Python program to create an interactive health chatbot that suggests a possible diagnosis and home treatment for the reported symptoms.

class HealthChatbot:
    def __init__(self):
        self.symptoms = []

    def greet_user(self):
        print("Hello! I am your health chatbot. Let's check your symptoms.")

    def ask_symptoms(self):
        print("Please answer the following questions with 'yes' or 'no'.")
        self.symptoms.append(input("Do you have a fever? ").lower())
        self.symptoms.append(input("Do you have a cough? ").lower())
        self.symptoms.append(input("Do you have difficulty breathing? ").lower())

    def diagnose(self):
        fever = self.symptoms[0] == 'yes'
        cough = self.symptoms[1] == 'yes'
        difficulty_breathing = self.symptoms[2] == 'yes'
        if fever and cough and difficulty_breathing:
            print("Based on your symptoms, you may have pneumonia. Please consult a doctor immediately.")
        elif fever and cough:
            print("Based on your symptoms, you may have a common cold or flu. Get plenty of rest and fluids.")
        elif difficulty_breathing:
            print("Based on your symptoms, you may have a respiratory issue. Seek medical attention promptly.")
        else:
            print("Based on your symptoms, you seem to be generally healthy. However, if you feel unwell, consult a doctor.")

    def start(self):

        self.greet_user()
        self.ask_symptoms()
        self.diagnose()

if __name__ == "__main__":
    chatbot = HealthChatbot()
    chatbot.start()

Q24:- Write a Python program using SVM and TF-IDF to classify the given corpus of documents.

from sklearn.datasets import fetch_20newsgroups


from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
categories = ['alt.atheism', 'comp.graphics', 'sci.med', 'soc.religion.christian']
news = fetch_20newsgroups(categories=categories)
X_train, X_test, y_train, y_test = train_test_split(news.data, news.target, test_size=0.2, random_state=42)
model = make_pipeline(TfidfVectorizer(), SVC())
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

Q25:- Write a Python program to calculate the bag of words after carrying out the following preprocessing of the text: convert all the text to lowercase and replace all punctuation with a space.

from collections import Counter

def preprocess_text(text):
    # Convert to lowercase (the question asks for lowercase, not uppercase)
    text = text.lower()
    for punctuation in '!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~':
        text = text.replace(punctuation, ' ')
    return text

def create_bag_of_words(text):
    text = preprocess_text(text)
    words = text.split()
    bag_of_words = Counter(words)
    return bag_of_words

text = "HOW how is the boss is a .Simple Example is sometimes better than better."
bow = create_bag_of_words(text)
print("Bag of Words:")
for word, count in bow.items():
    print(f"{word}: {count}")
