20BCP112_NLP Lab_LAB_Manual (1)

PANDIT DEENDAYAL ENERGY UNIVERSITYSCHOOL OF TECHNOLOGY
CSE DEPARTMENT
Natural Language Processing Lab Manual
Submitted to,
Dr. Santosh Bharti
Submitted by,
Devshree
Jadeja
(20BCP112)
1. Tokenization
1.1 Simple Tokenization
1.2 Advanced Tokenization
2. Stemming
2.1 Simple Stemming
2.2 Library Stemming
2.3 Porter Stemming
3. Lemmatization
3.1 Simple Lemmatization
3.2 Library Lemmatization
4. POS tagging using HMM
5. Turney method of Sentiment Analysis (/Unsupervised)
6. Supervised sentiment analysis
7. text summarization using TF-IDF
8. Multi-document Text Summarization
9. term -incidence matrix for information retrieval

Lab 1: Tokenization (Simple)
1. Sentence Tokenization:
def extract_sentences(paragraph):
paragraph = paragraph.replace("? ", "|")
paragraph = paragraph.replace("! ", "|")
paragraph = paragraph.replace(". ", "|")
sentences = paragraph.split("|")
sentences = [sentence.strip() for sentence in sentences if

sentence.strip()]
return sentences
paragraph = "This is the first sentence. The second one follows? And here
comes the third sentence! My email is abc@gmail.com, you can contact
here."
sentences = extract_sentences(paragraph)
for idx, sentence in enumerate(sentences, start=1):

print(f"Sentence {idx}: {sentence}")
Output:
2. Sentence Tokenization using NLTK:
import nltk
nltk.download('punkt')
from nltk.tokenize import sent_tokenize
text = "Dr. Ami Shah is a MD doctor. Please feel free to contact her on
ami@gmail.com or etc., many ways are there. " + \
"Khushi said: 'Today is her birthday. So she is very happy.' So everyone
wished her. "
print("Sentence tokenization: ", sent_tokenize(text))
Output:
Sentence tokenization: ['Dr. Ami Shah is a MD doctor.', 'Please feel free to contact her on
ami@gmail.com or etc., many ways are there.', "Khushi said: 'Today is her birthday.", "So she
is very happy.'", 'So everyone wished her.']
3. Word Tokenization using punkt:
import nltk
from nltk.tokenize import word_tokenize
text = "Dr. Ami Shah is a MD doctor. Please feel free to contact her on
ami@gmail.com etc. many ways are there. " + \
"Don't do this. " + \
"Researcher's said AI will change the word. " + \
"You can't go there. "
print("\n\nWord tokenization: " ,word_tokenize(text))
Output:
Word tokenization: ['Dr.', 'Ami', 'Shah', 'is', 'a', 'MD', 'doctor', '.', 'Please', 'feel', 'free',
'to',
'contact', 'her', 'on', 'ami', '@', 'gmail.com', 'etc', '.', 'many', 'ways', 'are', 'there', '.', 'Do', "n't",
'do',
'this', '.', 'Researcher', "'s", 'said', 'AI', 'will', 'change', 'the', 'word', '.', 'You', 'ca', "n't", 'go',
'there','.']
4. Word tokenization without NLTK:

def word_tokenize(sentence):
words = [] # Create an empty list to store the words.
current_word = "" # Initialize an empty string for the current word.
for char in sentence:

if char.isalnum() or char == "'" or char == "@": # Check if the character is a
letter, digit, or an apostrophe.
current_word += char # Add the character to the current word.
else:
if current_word: # Check if there's a current word (not empty).
words.append(current_word) # Append the current word to the list of
words.
current_word = "" # Reset the current word.
if current_word:
words.append(current_word) # Append the last word (if any) to the list.
return words
sentence = "Dr. Ami is not available today. Please feel free to contact her on
ami@gmail.com. Don't come. You can't go there."
tokens = word_tokenize(sentence)
print(tokens)
Output:
['Dr', 'Ami', 'is', 'not', 'available', 'today', 'Please', 'feel', 'free', 'to', 'contact', 'her', 'on',
'ami@gmail','com', "Don't", 'come', 'You', "can't", 'go', 'there']
import nltk
# nltk.download('punkt')
def nltk_tokenize(text):
# Use NLTK's recommended word tokenizer (which handles punctuation and
spaces)
tokens = word_tokenize(text)
return tokens
# Example usage
text = "This is an example sentence to demonstrate NLTK tokenization, including
punctuation!"
tokens = nltk_tokenize(text)
print(tokens)
Lab 2 : Stemming and Lemmatization
1. Stemming:
def stemx(word):
suffix = ['ed', 'es', 'ing', 'ship', 's', 'less', 'ion', 'ly', 'ions',
'full', 'able']
prefix = ['un', 'pre', 'dis', 're', 'il', 'im', 'in', 'de', 'mis', 'sub',
'non', 'anti']
for x in prefix:
if word.startswith(x):
word = word[len(x):]
for y in suffix:
if word.endswith(y):
word = word[:-len(y)]
return word
words = ['going', 'friends', 'friendship', 'happily', 'friendly',

'programming']
for word in words:
print(word + " : ", stemx(word))
Output:
going : go
friends : friend
friendship : friend
happily : happi
friendly : friend
programming : programm
2. Stemming using Porter Stemmer from NLTK:
from nltk.stem import PorterStemmer

ps = PorterStemmer()
words = ["friendship", "programs", "programmer", "programming",

"programmers", "predefined", "happily"]
for w in words:
print(w, " : ", ps.stem(w))
Output:
friendship : friendship
programs : program
programmer :
programm
programming : program
programmers :
programmpredefined :
predefin happily :
happili
3. Lemmatization without NLTK:
from PyDictionary import PyDictionary
class BasicLemmatizer:
def init (self):
self.dictionary = PyDictionary()
def lemmatize(self, word):

synonyms = self.dictionary.synonym(word)
if not synonyms:
return word
# If the word itself is in the synonyms list, return it
if word in synonyms:
return word
# Otherwise, return the first synonym as a potential base form
return synonyms[0]
words = ['smiling', 'died', 'purchased', 'went', 'bought', 'better',

'mice']
lemmatizer = BasicLemmatizer()
for word in words:
print(f"Lemmatized word (basic): {lemmatizer.lemmatize(word)}")
4. Lemmatization using NLTK:
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()
words1 = ['smiling', 'died', 'purchased', 'went', 'bought']
for w in words1:
print(w + " : " + lemmatizer.lemmatize(w, pos="v")) # v - verb
words2 = ['better', 'worst', 'happiest']

for x in words2:
print(x + " : ", lemmatizer.lemmatize(x, pos ="a")) # a - adjective
words3 = ['babies', 'mice', 'feet']

for y in words3:
print(y + " : ", lemmatizer.lemmatize(y, pos ="n")) # n - noun
Output:
smiling :
smiledied :
die
purchased :
purchasewent : go
bought : buy
better : good
worst : bad
happiest :
happybabies :
baby mice :
mouse feet :
foot
5. Porter Stemmer:
class PorterStemmer:
def init (self):pass
def _contains_vowel(self, word):

return any(char in "aeiou" for char in word)
# m: count of VC pair
def _measure(self, word):
return ''.join(['V' if char in "aeiou" else 'C' for char inword]).count('VC')
def _double_consonant(self, word):

if len(word) >= 2 and word[-1] == word[-2] and word[-1] not in"aeiou":
return True
return False
def _replace_suffix(self, word, old, new):if

word.endswith(old):
return word[:-len(old)] + newreturn
word
def _ends_with_cvc(self, word):if len(word)

< 3:
return False
consonants = "bcdfghjklmnpqrstvwxyz"vowels =
"aeiou"
return word[-3] in consonants and word[-2] in vowels and word[-1]in consonants
def _step1(self, word):

if word.endswith("sses"): return
word[:-4] + "ss"
elif word.endswith("ies"):return
word[:-3] + "i"
elif word.endswith("ss"):return
word
elif word.endswith("s"):return
word[:-1]
return word
if word.endswith("eed"):stem =
word[:-3]
if self._measure(stem) > 0:return
stem + "ee"
elif word.endswith("ed"):stem =
word[:-2]
if self._contains_vowel(stem):return stem
elif word.endswith("ing"):stem =
word[:-3]
if self._contains_vowel(stem):return stem
return word

if word.endswith("y") and self._contains_vowel(word[:-1]):return word[:-1] + "i"
return word
def _step4(self, word):suffixes =

{
"ational": "ate",
"tional": "tion",
"izer": "ize",
"abli": "able",
"eli": "e",
"ization": "ize",
"ation": "ate",
"biliti": "ble",
}
for key in suffixes:
if word.endswith(key):
stem = word[:-len(key)]
if self._measure(stem) > 0: return stem +
suffixes[key]
return word

suffixes = {
"icate": "ic",
"ative": "",
"alize": "al",
"iciti": "ic",
"ical": "ic",
"ful": "",
"ness": ""
}
for key in suffixes:
if word.endswith(key):
stem = word[:-len(key)]
if self._measure(stem) > 0: return stem +
suffixes[key]
return word

suffixes = ["al", "ance", "ence", "er", "ic", "able", "ible","ant", "ement",
"ment", "ent", "ou", "ism", "ate", "iti", "ous", "ive",
"ize"]
for suffix in suffixes:
if word.endswith(suffix):
stem = word[:-len(suffix)] if
self._measure(stem) > 1:
return stem
return word
def _step7a(self, word): if

word.endswith("e"):
stem = word[:-1]
if self._measure(stem) > 1:return
stem
elif self._measure(stem) == 1 and not
self._ends_with_cvc(stem):
return stem
return word
def _step7b(self, word):

if self._measure(word) > 1 and self._double_consonant(word) and
word.endswith("l"):
return word[:-1]
return word
def stem(self, word):

word = self._step1(word)
word = self._step7a(word)
word = self._step7b(word)
return word
def test_stemmer():
stemmer = PorterStemmer()
words = ["computers", "singing", "controlling", "generalizations",

"elephants", "doing", "relational","caresses"]
results = []
for word in words:

results.append((word, stemmer.stem(word)))
return print(results)
test_stemmer()
Output:
[('computers', 'comput'), ('singing', 'sing'), ('controlling', 'control'), ('generalizations',

'gener'),
('elephants', 'eleph'), ('doing', 'do'), ('relational', 'relat'), ('caresses', 'caress')]
Lab 3 : POS Tagging
Code:
from future import division #To avoid integer division

from operator import itemgetter
###Training Phase###
with open("wsj_training.txt", "r") as myfile:

tr_str = myfile.read()
tr_li = tr_str.split()
num_words_train = len(tr_li)
train_li_words = ['']
train_li_words*= num_words_train
train_li_tags = ['']
train_li_tags*= num_words_train
for i in range(num_words_train):
temp_li = tr_li[i].split("/")
train_li_words[i] = temp_li[0]
train_li_tags[i] = temp_li[1]
dict2_tag_follow_tag_ = {} # for transition probability
dict2_word_tag = {}
dict_word_tag_baseline = {} # for emission probability
for i in range(num_words_train-1):
outer_key = train_li_tags[i]
inner_key = train_li_tags[i+1]
dict2_tag_follow_tag_[outer_key]=dict2_tag_follow_tag_.get(outer_key,{})
dict2_tag_follow_tag_[outer_key][inner_key] =
dict2_tag_follow_tag_[outer_key].get(inner_key,0)
dict2_tag_follow_tag_[outer_key][inner_key]+=1
outer_key = train_li_words[i]inner_key
= train_li_tags[i]
dict2_word_tag[outer_key]=dict2_word_tag.get(outer_key,{})
dict2_word_tag[outer_key][inner_key] =
dict2_word_tag[outer_key].get(inner_key,0)
dict2_word_tag[outer_key][inner_key]+=1
dict2_tag_follow_tag_['.'] = dict2_tag_follow_tag_.get('.',{})
dict2_tag_follow_tag_['.'][train_li_tags[0]] =
dict2_tag_follow_tag_['.'].get(train_li_tags[0],0)
dict2_tag_follow_tag_['.'][train_li_tags[0]]+=1
last_index = num_words_train-1
#Accounting for the last word-tag pair outer_key =

train_li_words[last_index] inner_key =
train_li_tags[last_index]
dict2_word_tag[outer_key]=dict2_word_tag.get(outer_key,{})
dict2_word_tag[outer_key][inner_key] =
dict2_word_tag[outer_key].get(inner_key,0)
dict2_word_tag[outer_key][inner_key]+=1
for key in dict2_tag_follow_tag_: di =

dict2_tag_follow_tag_[key]s =
sum(di.values())
for innkey in di: di[innkey]
/= s
di = di.items()
di = sorted(di,key=lambda x: x[0])
dict2_tag_follow_tag_[key] = di
for key in dict2_word_tag: di =

dict2_word_tag[key]
dict_word_tag_baseline[key] = max(di, key=di.get)s =
sum(di.values())
for innkey in di: di[innkey]
/= s
di = di.items()
di = sorted(di,key=lambda x: x[0])
dict2_word_tag[key] = di
###Testing Phase###
with open("wsj_test.txt", "r") as myfile:te_str =

myfile.read()
te_li = te_str.split() num_words_test

= len(te_li)
test_li_words = [''] test_li_words*=

num_words_test
test_li_tags = [''] test_li_tags*=

num_words_test
output_li = [''] output_li*=

num_words_test
output_li_baseline = [''] output_li_baseline*=

num_words_test
num_errors = 0
num_errors_baseline = 0
for i in range(num_words_test): temp_li =

te_li[i].split("/")test_li_words[i] =
temp_li[0]test_li_tags[i] = temp_li[1]
output_li_baseline[i] = dict_word_tag_baseline.get(temp_li[0],'')#If unknown word - tag =

'NNP'
if output_li_baseline[i]=='':
output_li_baseline[i]='NNP'
if output_li_baseline[i]!=test_li_tags[i]:
num_errors_baseline+=1
if i==0: #Accounting for the 1st word in the test document for theViterbi
di_transition_probs = dict2_tag_follow_tag_['.']else:
di_transition_probs = dict2_tag_follow_tag_[output_li[i-1]]di_emission_probs =
dict2_word_tag.get(test_li_words[i],'')
#If unknown word - tag = 'NNP'if

di_emission_probs=='':
output_li[i]='NNP'
else:
max_prod_prob = 0
counter_trans = 0
counter_emis =0
prod_prob = 0
while counter_trans < len(di_transition_probs) and counter_emis <
len(di_emission_probs):
tag_tr = di_transition_probs[counter_trans][0]tag_em =
di_emission_probs[counter_emis][0]
if tag_tr < tag_em:
counter_trans+=1
elif tag_tr > tag_em:
counter_emis+=1
else:
prod_prob = di_transition_probs[counter_trans][1] *
di_emission_probs[counter_emis][1]
if prod_prob > max_prod_prob:
max_prod_prob = prod_prob
output_li[i] = tag_tr
#print "i=",i," and output=",output_li[i]
counter_trans+=1
counter_emis+=1
if output_li[i]=='': #In case there are no matching entries between the
transition tags and emission tags, we choose the most frequent emission
tag
output_li[i] = max(di_emission_probs,key=itemgetter(1))[0]
if output_li[i]!=test_li_tags[i]:
num_errors+=1
print("Fraction of errors (Baseline)

:",num_errors_baseline/num_words_test)
print("Fraction of errors (Viterbi):",num_errors/num_words_test)
print("Tags suggested by Baseline Algorithm:",output_li_baseline)
print("Tags suggested by Viterbi Algorithm:", output_li)
print("Correct tags:",test_li_tags)
Output:
Fraction of errors (Baseline) :

0.15384615384615385Fraction of errors
(Viterbi): 0.07692307692307693
Tags suggested by Baseline Algorithm: ['NNP', 'VBD', 'IN', 'VBN', 'NNP', 'VBN', 'TO', 'NNS',
'RB', 'CD', 'NN', 'IN', '.']
Tags suggested by Viterbi Algorithm: ['NNP', 'VBD', 'IN', 'VBD', 'NNP', 'VBN', 'TO', 'NNS',
'RB', 'CD', 'NN', 'IN', '.']
Correct tags: ['NNP', 'VBD', 'WDT', 'VBD', 'NNP', 'VBN', 'TO', 'NNS', 'RB', 'CD', 'NN', 'IN', '.']
Lab 4 : Sentiment Analysis
1. Sentiment Analysis using Supervised Learning Model
import nltk
from nltk.corpus import stopwords
stopwords=set(stopwords.words('english'))
pos_tweets=[('It is not impossible', 'positive'),

('You are my lovely friend', 'Positive'),
('She is beautiful girl', 'Positive'),
('He is looking handsome', 'Positive'),
('Exercise is good for health', 'Positive'),
('Today\'s weather is fantastic', 'Positive'),
('I love Mango', 'Positive')]
neg_tweets=[('You are my enemy friend', 'Negative'),

('She is looking ugly ', 'Negative'),
('He is looking horrible', 'Negative'),
('Sleeping more makes you lazy', 'Negative'),
('Today\'s weather is very bad', 'Negative'),
('I hate Banana', 'Negative')]
Senti_tweets=[]
for (words, sentiment) in pos_tweets + neg_tweets:
words_filtered=[e.lower() for e in words.split() if len(e)>=3]
Senti_tweets.append((words_filtered, sentiment))
print(Senti_tweets)
def get_words_in_tweets(tweets):
all_words=[]
for (words, sentiment) in Senti_tweets:
all_words.extend(words)
return (all_words)
def get_word_features(wordlist):
wordlist=nltk.FreqDist(wordlist)
word_features=wordlist.keys()
return word_features
word_features=get_word_features(get_words_in_tweets(Senti_tweets))
print(word_features)
word_features_filtered=[]
for w in word_features:
if w not in stopwords:
word_features_filtered.append(w)
print(word_features_filtered)
def extract_features(document):
document_words=set(document)
features={}
for word in word_features_filtered:
features['contains(%s)' %word] = (word in document_words)
return features
training_set = nltk.classify.apply_features(extract_features,
Senti_tweets)
classifier = nltk.NaiveBayesClassifier.train(training_set)
test_tweet='This is a horrible book'

print()
print("{}: Sentiment = {}".format(test_tweet,
classifier.classify(extract_features(test_tweet.split()))))
Output:
[(['not', 'impossible'], 'positive'), (['you', 'are', 'lovely', 'friend'], 'Positive'), (['she', 'beautiful',
'girl'], 'Positive'), (['looking', 'handsome'], 'Positive'), (['exercise', 'good', 'for', 'health'],
'Positive'),
(["today's", 'weather', 'fantastic'], 'Positive'), (['love', 'mango'], 'Positive'), (['you', 'are',
'enemy',
'friend'], 'Negative'), (['she', 'looking', 'ugly'], 'Negative'), (['looking', 'horrible'], 'Negative'),
(['sleeping', 'more', 'makes', 'you', 'lazy'], 'Negative'), (["today's", 'weather', 'very',
'bad'],'Negative'), (['hate', 'banana'], 'Negative')]
dict_keys(['not', 'impossible', 'you', 'are', 'lovely', 'friend', 'she', 'beautiful', 'girl', 'looking',
'handsome', 'exercise', 'good', 'for', 'health', "today's", 'weather', 'fantastic', 'love', 'mango',
'enemy', 'ugly', 'horrible', 'sleeping', 'more', 'makes', 'lazy', 'very', 'bad', 'hate', 'banana'])
['impossible', 'lovely', 'friend', 'beautiful', 'girl', 'looking', 'handsome', 'exercise', 'good',
'health',
"today's", 'weather', 'fantastic', 'love', 'mango', 'enemy', 'ugly', 'horrible', 'sleeping', 'makes',
'lazy','bad', 'hate', 'banana']
This is a horrible book: Sentiment = Negative
2. Sentiment Analysis on csv data
import pandas as pd
df = pd.read_csv('full-corpus.csv')
df.head()
tweets_data = list(zip(df['TweetText'], df['Sentiment']))
# Split the data into positive and negative tweets

pos_tweets = [(text, 'Positive') for text, sentiment in tweets_data ifsentiment.lower() ==
'positive']
neg_tweets = [(text, 'Negative') for text, sentiment in tweets_data ifsentiment.lower() ==
'negative']
import nltk
def preprocess(tweet): tweet =

tweet.lower()
tweet = re.sub('((www\.[^\s]+)|(https?://[^\s]+))', 'URL', tweet) #
Convert URLs to the word URL
tweet = re.sub('@[^\s]+', 'AT_USER', tweet) # Convert @username to
AT_USER
tweet = re.sub('[\s]+', ' ', tweet) # Remove additional whitespaces tweet =
re.sub(r'#([^\s]+)', r'\1', tweet) # Replace #word with wordtweet = tweet.strip('\'"')
# Trim
return tweet
# Extract word features

def get_word_features(wordlist): wordlist =
nltk.FreqDist(wordlist)word_features =
wordlist.keys() return word_features
import re
import nltk
# Word tokenizer
def tokenize(tweet):
return nltk.word_tokenize(tweet)
# Remove stopwords and get words from tweetsdef

get_words_in_tweets(tweets):
all_words = []
for (words, sentiment) in tweets:
all_words.extend(tokenize(words))
return all_words#
Extract features
def extract_features(document): document_words =
set(tokenize(document))features = {}
for word in word_features:
features['contains(%s)' % word] = (word in document_words)return features
# Replace the hardcoded sample tweets with the extracted tweets tweets = []
for (words, sentiment) in pos_tweets + neg_tweets:
tweets.append((preprocess(words), sentiment))
import nltk
# Create word features and train the classifier

word_features = get_word_features(get_words_in_tweets(tweets)) training_set =
nltk.classify.apply_features(extract_features, tweets)classifier =
nltk.NaiveBayesClassifier.train(training_set)
# Test the classifier (optional, if you want to test with a sample tweet)test_tweet = "It’s easy to be
brave when you’re hiding behind a keyboard.You and your Hamas friends will regret your barbaric
actions very soon." print(classifier.classify(extract_features(preprocess(test_tweet))))
from sklearn.metrics import accuracy_score, precision_score, recall_score,classification_report
# Assuming tweets is a list of (tweet, sentiment) pairs# Splitting data into

80% training and 20% testing train_size = int(len(tweets) * 0.8)
train_tweets = tweets[:train_size]test_tweets =
tweets[train_size:]
# Train the classifier with the training data

training_set = nltk.classify.apply_features(extract_features,train_tweets)
classifier = nltk.NaiveBayesClassifier.train(training_set)# Predict the sentiments
of the test set

predictions = [classifier.classify(extract_features(tweet)) for tweet, _
in test_tweets]
actual = [sentiment for _, sentiment in test_tweets]
# Calculate accuracy, precision, and recall

accuracy = accuracy_score(actual, predictions)
precision = precision_score(actual, predictions, average='weighted')
recall = recall_score(actual, predictions, average='weighted')
print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
# If you want a detailed report for each class (Positive, Negative, etc.)
print(classification_report(actual, predictions))
Output:
Negative
Lab 5 : Turney’s method of Sentiment Analysis
Code:
import math
import re
import json
import nltk
nltk.download('averaged_perceptron_tagger',quiet=True)
def loadReviews(fileName):
list_pos = []
list_neg = []
data = []
with open(fileName, 'r') as f:
for line in f:
data.append(json.loads(line))
for elem in data:

if float(elem["overall"]) >= 3.0:
list_pos.append(elem["reviewText"])
else:
list_neg.append(elem["reviewText"])
return list_pos, list_neg
def make_datasets(fileName):
all_positive_reviews, all_negative_reviews = loadReviews(fileName)
dataset = {'train': {'neg': [], 'pos': []}, 'test': {'neg': [],
'pos': []}}
dataset['train']['pos'] = (all_positive_reviews[:20000])
dataset['train']['neg'] = (all_negative_reviews[:20000])
dataset['test']['pos'] = (all_positive_reviews[-50:])
dataset['test']['neg'] = (all_negative_reviews[-50:])
return dataset
def find_pattern(postag):
tag_pattern = []
for k in range(len(postag) - 2):
if postag[k][1] == "JJ" and (postag[k + 1][1] == "NN" or
postag[k + 1][1] == "NNS"):
tag_pattern.append("".join(postag[k][0]) + " " +
"".join(postag[k + 1][0]))
elif ((postag[k][1] == "RB" or postag[k][1] == "RBR" or

postag[k][1] == "RBS") and postag[k + 1][1] == "JJ" and
postag[k + 2][1] != "NN" and postag[k + 2][1] != "NNS"):
elif postag[k][1] == "JJ" and postag[k + 1][1] == "JJ" and

postag[k + 2][1] != "NN" and postag[k + 2][
1] != "NNS":
elif (postag[k][1] == "NN" or postag[k][1] == "NNS") and

postag[k + 1][1] == "JJ" and postag[k + 2][
1] != "NN" and postag[k + 2][1] != "NNS":
elif ((postag[k][1] == "RB" or postag[k][1] == "RBR" or

postag[k][1] == "RBS") and (
postag[k + 1][1] == "VB" or postag[k + 1][1] == "VBD" or
postag[k + 1][1] == "VBN" or postag[k + 1][
1] == "VBG")):
return tag_pattern
def near_operator(phrase, word, text):

try:
string = word + r'\W+(?:\w+\W+){0,400}?' + phrase + r'|' +
phrase + r'\W+(?:\w+\W+){0,400}?' + word
freq_phrase_near_word = (len(re.findall(string, text)))
return freq_phrase_near_word
except:
return 0
class Turney(object):
def __init__(self, dataset):

self.datasets = dataset
self.pos_phrases_hits = []
self.neg_phrases_hits = []
self.pos_hits = 0.01
self.neg_hits = 0.01
self.accuracy = 0
def turney(self):
tp = 0
fp = 0
tn = 0
fn = 0
for boolean, test_klass in enumerate(['pos', 'neg']):
for i, data in enumerate(self.datasets['test'][test_klass]):
print(str(i) + " out of " +
str(len(self.datasets['test'][test_klass])) + " --> round " +
str(boolean))
phrases =
find_pattern(nltk.pos_tag(nltk.word_tokenize(data)))
if len(phrases) == 0:
continue
self.pos_phrases_hits = [0.01] * len(phrases)
self.neg_phrases_hits = [0.01] * len(phrases)
self.pos_hits = 0.01
self.neg_hits = 0.01
for train_klass in ['pos', 'neg']:

for text in self.datasets['train'][train_klass]:
for ind, phrase in enumerate(phrases):
self.pos_phrases_hits[ind] +=
near_operator(phrase, "excellent", text)
self.neg_phrases_hits[ind] +=
near_operator(phrase, "poor", text)
self.pos_hits += text.count("excellent")
self.neg_hits += text.count("poor")
res = self.calculate_sentiment(boolean)
# compute if correct prediction
if res == 1 and boolean == 0:
fp += 1
elif res == 1 and boolean == 1:
tp += 1
fn += 1
tn += 1
print("Accuracy: " + str(self.accuracy / 100))

print("True positive: " + str(tp))
print("False positive: " + str(fp))
print("True negative: " + str(tn))
print("False negative: " + str(fn))
print("Recall-positive: " + str(tp / (tp + fn)))
print("Precision-positive: " + str(tp / (tp + fp)))
print("Recall-negative: " + str(tn / (tn + fp)))
print("Precision-negative: " + str(tn / (tn + fn)))
def calculate_sentiment(self, is_negative=0):

polarities = [0] * len(self.pos_phrases_hits)
for i in range(len(self.pos_phrases_hits)):
polarities[i] = math.log(
(self.pos_phrases_hits[i] * self.neg_hits) /
(self.neg_phrases_hits[i] * self.pos_hits), 2)
pmi = sum(polarities) / len(polarities)
if (pmi > 0 and is_negative == 0) or (pmi < 0 and is_negative ==
1):
self.accuracy += 1
return 1
return 0
if __name__ == "__main__":
FILE_PATH = './Turney/reviews.json'
datasets = make_datasets(FILE_PATH)
turney = Turney(datasets)
turney.turney()
Output:
Lab 6 : Text Summarization
1. Single document summarization using TF-IDF:
import nltk nltk.download('punkt')

import nltk
nltk.download('stopwords')
import math
from nltk import sent_tokenize, word_tokenize, PorterStemmerfrom nltk.corpus
import stopwords
text = """ Unverified reports of ‘40 babies beheaded’ in Israel-Hamas war inflame social media No photo
evidence had been made public as of Thursdaymorning corroborating claims that babies had been
beheaded. Israel has published photos of dead infants after the terror attack.
Editor’s note: This story includes graphic descriptions of violent acts that some readers may find
disturbing.
A series of shocking reports have spread horrific claims of baby beheadings by Hamas militants across
social and mainstream media in recentdays, adding a particularly incendiary element to an already
violent and bitter war. But the reports are still unconfirmed, and in some cases have been retracted.
The most high-profile claim came Wednesday night when President Joe Biden said that he had seen
photographic evidence of terrorists beheading children. The White House later clarified that Biden was
referring to newsreports about beheadings, which have not included or referred to photographic
evidence.
Photos have been published by Hamas showing beheaded soldiers and the X account belonging to
Israeli Prime Minister Benjamin Netanyahu posted pictures on Thursday of babies killed and burned
by Hamas. No photo evidence had been made public as of Thursday morning corroborating claimsthat
babies had been beheaded.
Unverified information spreads quickly on social media, particularly around breaking news events,
reaching even larger audiences when it isshared by mainstream news outlets, politicians and
people with large
followings. Follow-ups that retract or add context are less likely to berepeated or reach the same
audience.
Biden’s statement followed a series of news reports and comments from Israeli officials, most of which
have since been softened or walked back. Easily debunked misinformation like fake press releases have
circulated widely since the start of the war, but such stories often die down quickly once proven false.
The claims about beheadings, difficult to verify, have continued to spread thanks in part to the lack of
clarity.
Alexei Abrahams, a disinformation researcher at McGill University in Montreal, said that even without
the allegations of beheaded babies, justthe facts themselves are horrifying enough to have the kind of
effect youexpect.
It may turn out that the slaughter was done in a particularly barbaric way. But one way or another, this
is an absolutely shocking, unprecedentedevent of violence, Abrahams said. The general concern, of
course, is that it’s going to exacerbate what is already a very fraught situation.
On Wednesday, a spokesman for Israeli Prime Minister Benjamin Netanyahu told CNN that babies and
toddlers were found with their 'heads decapitated' in southern Israel after Hamas’ attack. By Thursday
morning,an Israeli official told CNN the government had not confirmed claims of the beheadings.
A senior State Department official said Thursday morning that the agency was not in a position to
confirm the beheading claims.
Many of the reports appear to have originated from Israeli soldiers andpeople affiliated with the
Israel Defense Force (IDF).
An IDF spokesperson told Business Insider on Tuesday that soldiers had found decapitated babies, but
said Wednesday it would not investigate or provide further evidence regarding the claim. Late
Wednesday, an IDF spokesperson said in a video on X that the IDF had relative confidence of the claims.
On Thursday, in a call with a group of international journalists, ColonelGolan Vach, the head of the
IDF’s national search and rescue unit, said that he had “found one baby with his head cut.'
Marc Owen Jones, an associate professor of Middle East studies at Hamad Bin Khalifa University in
Qatar who studies misinformation, told NBC Newsthat he found that the source of the '40 babies
beheaded” allegations largely stemmed from a viral Israeli news broadcast clip that did not specifically
refer to the allegation.
Nicole Zedeck, a correspondent for the privately owned Israeli news outleti24NEWS, said in the video
that Israeli soldiers told her they’d found
“babies, their heads cut off.' The video has been viewed more than 11 million times on X, according to
its view counter. In another tweet, Zedeck wrote that soldiers told her they believe “40 babies/children
werekilled.
Somehow those two bits of information were connected, the story became ‘40 babies were beheaded,’
and in the British press today, about six or seven newspapers had it on their front pages,' Jones said.
An IDF spokesperson, Doron Spielman, told NBC News on Tuesday that hecould not confirm
i24NEWS’s report.
Yossi Landau, the head of operations for the southern region of Zaka, Israel’s volunteer civilian
emergency response organization, told CBS Newsthat he saw the bodies of beheaded children and
babies, parents and children who had been tortured and had their hands bound, and “a lot more that
cannot be described for now, because it’s very hard to describe.
By Wednesday, the claims, though still contentious, were going viral online — being used as evidence of
Hamas’ depravity. On Wednesday, the phrase “Did Hamas kill babies” saw the biggest increase in search
intereston Google of anything related to the war.
Stranger Things star Noah Schnapp posted the shocking claim to his 25 million Instagram followers:
'40 babies were beheaded and burned alive in front of their parents by Hamas.' Sen. Ted Cruz, R-Texas,
mentioned beheaded babies in a post on X, and Rep. Mike McCaul, R-Texas, echoed theallegations on
CNN.
Jones found that the '40 babies beheaded' claim had over 44 million impressions on X, with over
300,000 likes and more than 100,000 reposts.The main accounts propagating the claims were
i24NEWS and the official Israel account, Jones’ data showed.
Baby stories are very emotive. Historically, they’re stories that can be used to rationalize a very brutal
response, Jones said. It’s such a volatile information environment that such claims will inevitably be
takenout of context, both deliberately and accidentally.
"""
def _create_frequency_matrix(sentences):
frequency_matrix = {}
stopWords = set(stopwords.words("english"))ps =
PorterStemmer()
for sent in sentences:freq_table

= {}
words = word_tokenize(sent)for
word in words:
word = word.lower() word
= ps.stem(word) if word in
stopWords:
continue
if word in freq_table:
freq_table[word] += 1
else:
freq_table[word] = 1
frequency_matrix[sent[:15]] = freq_table
return frequency_matrix
def _create_tf_matrix(freq_matrix):tf_matrix = {}
for sent, f_table in freq_matrix.items():tf_table = {}
count_words_in_sentence = len(f_table)for word,

count in f_table.items():
tf_table[word] = count / count_words_in_sentencetf_matrix[sent] =
tf_table
return tf_matrix
def _create_documents_per_words(freq_matrix):
word_per_doc_table = {}
for sent, f_table in freq_matrix.items():for word, count in

f_table.items():
if word in word_per_doc_table:
word_per_doc_table[word] += 1
else:
word_per_doc_table[word] = 1
return word_per_doc_table
def _create_idf_matrix(freq_matrix, count_doc_per_words, total_documents):idf_matrix = {}
for sent, f_table in freq_matrix.items():idf_table = {}
for word in f_table.keys():

idf_table[word] = math.log10(total_documents /
float(count_doc_per_words[word]))
idf_matrix[sent] = idf_tablereturn
idf_matrix
def _score_sentences(tf_idf_matrix) -> dict:sentenceValue =

{}
for sent, f_table in tf_idf_matrix.items():

total_score_per_sentence = 0
count_words_in_sentence = len(f_table)for word,

score in f_table.items():
total_score_per_sentence += score
sentenceValue[sent] = total_score_per_sentence /
count_words_in_sentence
return sentenceValue
def _find_average_score(sentenceValue) -> int:sumValues = 0

for entry in sentenceValue:
sumValues += sentenceValue[entry]
# Average value of a sentence from original summary_text average =

(sumValues / len(sentenceValue))
return average
def _generate_summary(sentences, sentenceValue, threshold):sentence_count = 0
summary = ''
for sentence in sentences:

if sentence[:15] in sentenceValue and sentenceValue[sentence[:15]]
>= (threshold):
summary += " " + sentence
sentence_count += 1
return summary
# 1 Sentence Tokenization sentences =

sent_tokenize(text) total_documents =
len(sentences)print(total_documents)
41
# 2 Creating the Frequency matrix of the words in each sentence.freq_matrix =

_create_frequency_matrix(sentences)
# 3 Calculate Term Frequency matrix tf_matrix =

_create_tf_matrix(freq_matrix)
# 5 Calculate IDF and generate a matrix

idf_matrix = _create_idf_matrix(freq_matrix, count_doc_per_words,total_documents)
# 6 Calculate TF-IDF and generate a matrix

tf_idf_matrix = _create_tf_idf_matrix(tf_matrix, idf_matrix)
# 7 Important Algorithm: score the sentences sentence_scores =

_score_sentences(tf_idf_matrix)
# 8 Find the threshold

threshold = _find_average_score(sentence_scores)print(threshold)
0.07720921963564482
# 9 Important step: Generate the summary
summary = _generate_summary(sentences, sentence_scores, 1.3 * threshold)
print(summary)
Output:
Israel has published photos of dead infants after the terror attack. But the reports are still
unconfirmed, and in some cases have been retracted. Follow-ups that retract or add context
are less likely to be repeated or reach the same audience. It may turn out that the slaughter
was done in a particularly barbaric way. The video has been viewed more than 11 million
times on X, according to its view counter. Baby stories are very emotive.
Lab 7 : Multi document text summarization
1. Multi document summarization
articles = []
import pandas as pd
import numpy as np
articles.append({
"Title": "Article1",
"Content":""" """})
articles.append({
"Content":""" """})
articles.append({
"Content":""" """})
articles.append({
"Content":""" """})
articles.append({
"Content":""" """})
articles.append({
"Content":""" """})
articles.append({
"Content":""" """})
df = pd.DataFrame(articles)
print(df)
from sklearn.feature_extraction.text import TfidfVectorizer

# Instantiate the vectorizer with stop_words set to 'english'
tfidf_vectorizer = TfidfVectorizer(stop_words='english')
# Compute the TF-IDF vectors for the articles

tfidf_matrix = tfidf_vectorizer.fit_transform(df['Content'])
from sklearn.metrics.pairwise import linear_kernel

# Compute cosine similarities between all pairs of articles cosine_similarities =
linear_kernel(tfidf_matrix, tfidf_matrix)print(cosine_similarities)
# Extract the most representative sentence from each articlesummaries = []

for idx, article in enumerate(df['Content']):# Tokenize the
article into sentences sentences = article.split('. ')
# Compute the TF-IDF vectors for the sentences sentence_vectors =

tfidf_vectorizer.transform(sentences)
# Compute cosine similarities between the article and its sentences cosine_similarities_sentences
= linear_kernel(tfidf_matrix[idx:idx+1],
sentence_vectors).flatten()
# Find the sentence with the highest cosine similarity top_sentence_idx =

cosine_similarities_sentences.argsort()[-1]top_sentence = sentences[top_sentence_idx]
summaries.append(top_sentence)
# Add summaries to the dataframe

df['Summary'] = summaries print(df[['Title',
'Summary']])
# Extract the most representative sentence from each article (as we did before)summaries = []
for idx, article in enumerate(df['Content']):sentences =
article.split('. ')
sentence_vectors = tfidf_vectorizer.transform(sentences) cosine_similarities_sentences =
linear_kernel(tfidf_matrix[idx:idx+1],
sentence_vectors).flatten()
top_sentence_idx = cosine_similarities_sentences.argsort()[-1]top_sentence =
sentences[top_sentence_idx] summaries.append(top_sentence)
# Create a merged summary by concatenating the most representative sentences from eacharticle
merged_summary = '. '.join(summaries)
print("Merged Summary:")
print(merged_summary)
Output:
Title Content
0 Article1 Unverified reports of ‘40 babies beheaded’ in
...1 Article2 The 'horrendous toll' on children caught in
th...
2 Article3 What we actually know about the viral report
o...3 Article4 Israel releases horrific images of slain child...
4 Article5 ‘I would see and have confirmed pictures of
te...5 Article6 At least 40 babies killed, beheaded in
Israeli...
6 Article7 ‘I would see and have confirmed pictures of te…
[[1. 0.14483033 0.39585 0.2582229 0.26818106 0.20422504

0.27895569]
[0.14483033 1. 0.22168557 0.24798395 0.2299564 0.21169182
0.23901851]
[0.39585 0.22168557 1. 0.28336111 0.30100884 0.31592353
0.31263709]
[0.2582229 0.24798395 0.28336111 1. 0.20396822 0.18262805
0.21117037]
[0.26818106 0.2299564 0.30100884 0.20396822 1. 0.18920684
0.96356329]
[0.20422504 0.21169182 0.31592353 0.18262805 0.18920684 1.
0.1972829 ]
[0.27895569 0.23901851 0.31263709 0.21117037 0.96356329 0.1972829
1. ]]
Title Summary
0 Article1 In another tweet, Zedeck wrote that soldiers
t... 1 Article2 "Depriving children of access to food and
esse...
2 Article3 What happened in Kibbutz Kfar Aza is a
massacr...3 Article4 Secretary of State Antony Blinken and
NATO def...4 Article5 I never thought I’d ever — anyway.”
He did not... 5 Article6 At least 40 babies killed, beheaded
in Israeli...
6 Article7 I never thought I’d ever — anyway.” He did
not…Merged Summary:
In another tweet, Zedeck wrote that soldiers told her they believe “40 babies/children were
killed.”
“Somehow those two bits of information were connected, the story became ‘40 babies were
beheaded,’ and in the British press today, about six or seven newspapers had it on their front
pages,” Jones said.
An IDF spokesperson, Doron Spielman, told NBC News on Tuesday that he could not confirm
i24NEWS’s report.
Yossi Landau, the head of operations for the southern region of Zaka, Israel’s volunteer
civilian emergency response organization, told CBS News that he saw the bodies of beheaded
children and babies, parents and children who had been tortured and had their hands bound,
and “a lot more that cannot be described for now, because it’s very hard to describe.”
By Wednesday, the claims, though still contentious, were going viral online — being used as
evidence of Hamas’ depravity. "Depriving children of access to food and essential services
puts their lives at risk, as do attacks on civilian areas and infrastructure."
According to UNICEF, 80% of those who live in the Gaza Strip rely on some form of
humanitarian assistance.
The conflict has led to "grave humanitarian consequences," Lynn Hastings, a
humanitariancoordinator for the Gaza Strip for the United Nations Office for the
Coordination ofHumanitarian Affairs, while calling for all parties to abide by international
humanitarian law. "Civilians, especially children, medical facilities, humanitarian
personnel health workers, andjournalists must be protected," Hastings said in a
statement Tuesday. What happened in KibbutzKfar Aza is a massacre in which women,
children and toddlers and elderly were brutallybutchered in an ISIS way of action."
What happened at the kibbutz?
The Kfar Aza kibbutz is one of several self-contained Israeli settlements close to the
Gazaborder.
It is located between Netivot and Sderot - around three miles from the border in southern
Israel. Because of its proximity to Gaza and the unprecedented nature of last weekend's
incursion,which saw Hamas militants breach the usually heavily guarded border on foot - it
was one of thefirst sites they reached on Saturday.
Four days later, journalists got to see the destruction left
behind.Ramsay said the scene "can only be described as a
massacre".
"The stories here are shocking - families being woken without warning to voices outside
their houses, mums and dads hiding their children in cupboards, wine cellars and basements,
husbandsand wives becoming separated in the fight," he said.
He added it took 17 hours for help to arrive, as the IDF focused on urban areas first - leaving
residents defenceless and numbers of dead high.
Why are there reports of 'babies being beheaded'?
Claims Hamas fighters beheaded babies have only been reported by one journalist - Nicole
Zedek from i24 - and have not been verified by Sky News.
Ms Zedek was among the reporters invited to see what was left at the kibbutz on Tuesday.
In one live broadcast, which has since been viewed millions of times on X, formerly known
as Twitter, she says: "Talking to some of the soldiers here, they say what they witnessed as
they've been walking through these communities is bodies of babies with their heads cut off
and familiesgunned down in their beds.
"We can see some of these soldiers right now, comforting each other."
She is also filmed speaking to the deputy commander of the IDF's unit 71, David Ben Zion,
who describes Hamas fighters as "aggressive" and "very bad".
He says: "They cut off heads… of children, of women."
And in another live broadcast, Zedek describes "40 babies at least were taken out on
gurneys" - which is where the widely shared 40 figure comes from.
show more (open the raw output data in a text editor) ...
On Wednesday, Heinrich quoted CNN that infants and young children had been discovered
with "decapitated" bodies in the community of Kfar Aza.
An Israeli Defense Forces spokesperson also told The Intercept that they could not confirm
it officially, but they believed the report.
Yossi Landau, a representative from Israel's volunteer civilian emergency response
organization, Zaka, shared with CBS News that he witnessed the gruesome sight of children
and infants who had been decapitated.
“I saw a lot more that cannot be described for now, because it’s very hard to describe,” he
said. Nicole Zedek, a television reporter for i24NEWS based in Tel Aviv, was the initial
source toreport the allegations of child beheadings on Tuesday
Lab 8 : Term Incidence Matrix
Code:
Doc_1 = """ """
Doc_2 = """ """
Doc_3 = """ """
Doc_4 = """ """
Doc_5 = """ """
Doc_6 = """ """
Doc_7 = """ """
import nltk
import nltk
stop_words = set(stopwords.words('english'))#getting
unique terms in the documents

# set - does not allow duplicate elements
unique_terms= set() # creating an empty setfor doc in docs:

# tokenize words
for term in doc.split():
if term.lower() not in stop_words:
unique_terms.add(term)
print(unique_terms)
print(len(unique_terms))
# Initialize NLTK's word tokenizer

nltk.download('punkt') # Download the required dataset (only need to do
this once)
# Tokenize words in each document

tokenized_docs = [word_tokenize(doc) for doc in docs]
# Print tokenized documents

for i, doc_tokens in enumerate(tokenized_docs, 1):
print(f"Tokens in Doc_{i}: {doc_tokens}")
#creating term-document incidence matrix in the form of a dictionary

doc_term_matrix = {}
for term in unique_terms:

doc_term_matrix[term] = []
for doc in docs:

if term in doc:
doc_term_matrix[term].append(1)
else: doc_term_matrix[term].append(0)
for term, term_list in doc_term_matrix.items():

print(f"{term}: {term_list}")
Output:
{'contentious,', 'personnel', 'photo', 'expressed', 'body', 'killing,', 'particularly', 'spoke',

'Coordination', '"unconfirmed".', 'caught', "I'm", 'reposts.', '"Please', 'still', 'thousands',
'targeted',
'linking', 'report.', 'babies”', 'Unverified', 'called', 'comparing', '"To', 'bitter', '260',
'small', 'Kibbutz', 'retract', 'decapitated', 'another', 'large', 'unconfirmed,', 'United',
'related', 'harmed','point', 'suffering', 'founded', 'possible', 'located', 'believed', 'mums',
'Wednesday,', 'Barkat',
……..
}
1534
Tokens in Doc_1: ['Unverified', 'reports', 'of', '‘', '40', 'babies', 'beheaded', '’', 'in', 'Israel-
Hamas',
'war', 'inflame', 'social', 'media', 'No', 'photo', 'evidence', 'had', 'been', 'made', 'public',
'as', 'of',,'.',. ']
Tokens in Doc_2: ['The', "'horrendous", 'toll', "'", 'on', 'children', 'caught', 'in', 'the', 'Israel-
Gaza',
'conflict', 'Hundreds', 'of', 'children', 'have', 'been', 'killed', 'so', 'far', ',', 'with', 'the', 'true',
'total','Russell', 'said', '. ']
Tokens in Doc_3: ['What', 'we', 'actually', 'know', 'about', 'the', 'viral', 'report', 'of', 'beheaded',
'babies', 'in', 'Israel', 'One', 'journalist', 'from', 'the', 'Tel', 'Aviv-based', 'news', 'channel', 'i24',
'said','war', '. ']
Tokens in Doc_4: ['Israel', 'releases', 'horrific', 'images', 'of', 'slain', 'children', 'after', 'Hamas',
'attack', 'JERUSALEM', '/', 'TEL', 'AVIV', '/', 'BRUSSELS', '-', 'CONTENT', 'WARNING', ':',
'right', 'things', '.', ……’]
Tokens in Doc_5: ['‘', 'I', 'would', 'see', 'and', 'have', 'confirmed', 'pictures', 'of', 'terrorists',
'beheading', 'children', ',', '’', 'Joe', 'Biden', 'decries', 'Hamas', 'atrocity', 'in', 'Israel',
'Reports','attack', '. ']
Tokens in Doc_6: ['At', 'least', '40', 'babies', 'killed', ',', 'beheaded', 'in', 'Israeli', 'kibbutz',
'outside',
'Gaza', 'Strip', ',', 'reports', 'say', 'KFAR', 'AZA', ',', 'Israel', '(', 'TND', ')', '—', 'Dozens',
'of','Monday', '.']
Tokens in Doc_7: ['‘', 'I', 'would', 'see', 'and', 'have', 'confirmed', 'pictures', 'of', 'terrorists',
'beheading', 'children', ',', '’', 'Joe', 'Biden', 'decries', 'Hamas', 'atrocity', 'in', 'Israel',
'Reports',, 'the','attack', ‘… ']
contentious,: [1, 0, 0, 0, 0, 0, 0]
personnel: [0, 1, 0, 0, 0, 0, 0]
photo: [1, 0, 0, 1, 0, 0, 0]
expressed: [0, 0, 0, 0, 1, 0, 1]
body: [0, 0, 1, 1, 0, 0, 0]
killing,: [0, 1, 0, 0, 0, 0, 0]
particularly: [1, 0, 0, 1, 0, 0, 0]
spoke: [1, 1, 1, 0, 1, 0, 1]
Coordination: [0, 1, 0, 0, 0, 0, 0]
"unconfirmed".: [0, 0, 1, 0, 0, 0, 0]
caught: [0, 1, 0, 0, 0, 0, 0]
I'm: [0, 1, 0, 0, 0, 1, 0]
reposts.: [1, 0, 0, 0, 0, 0, 0]
"Please: [0, 0, 1, 0, 0, 0, 0]
still: [1, 1, 0, 0, 0, 1, 0]
thousands: [0, 0, 0, 1, 0, 0, 0]
targeted: [0, 0, 0, 1, 0, 0, 0]
linking: [0, 0, 1, 0, 0, 0, 0]
report.: [1, 0, 0, 0, 1, 0, 1]
babies”: [1, 0, 0, 0, 0, 0, 0]
Unverified: [1, 0, 0, 0, 0, 0, 0]
called: [0, 1, 0, 0, 0, 0, 0]
comparing: [0, 0, 0, 1, 0, 0, 0]
"To: [0, 0, 1, 0, 0, 0, 0]
bitter: [1, 0, 0, 0, 0, 0, 0]
260: [0, 1, 0, 0, 0, 0, 0]
small: [0, 0, 1, 0, 0, 1, 0]
Kibbutz: [0, 0, 1, 0, 0, 1, 0]
retract: [1, 0, 0, 0, 0, 0, 0]
decapitated: [1, 0, 0, 0, 1, 0, 1]
another: [1, 1, 1, 0, 1, 0, 1]
war: [1, 1, 1, 0, 0, 1, 0]
docs = ["Doc_1", "Doc_2", "Doc_3", "Doc_4", "Doc_5", "Doc_6", "Doc_7"]
# Convert the document list to a NumPy array

docs_array = np.array(docs, dtype='object')
# query to find documents containing killing and war

import numpy as np
v1 = np.array(doc_term_matrix['killing'])
v2 = np.array(doc_term_matrix['war'])
print(v1)
print(v2)
print(' ---------------')
v3 = v1 & v2
print(v3)
[0 1 0 0 0 0 0]
[1 1 1 0 0 1 0]
[0 1 0 0 0 0 0]
matching_doc_indices = np.where(v3 == 1)[0]

print("Documents where 'killing' and 'war' are both present:")
for doc_index in matching_doc_indices:
print(docs_array[doc_index])
Documents where 'killing' and 'war' are both

present:Doc_2
# query to find documents containing killing or war

import numpy as np
v1 = np.array(doc_term_matrix['killing'])
v2 = np.array(doc_term_matrix['war'])
print(v1)
print(v2)
print(' ---------------')
v = v1 | v2
print(v)
[0 1 0 0 0 0 0]
[1 1 1 0 0 1 0]
[1 1 1 0 0 1 0]
matching_doc_indices = np.where(v == 1)[0]

print("Documents where either 'killing' or 'war' are present:")
for doc_index in matching_doc_indices:
print(docs_array[doc_index])
Documents where either 'killing' or 'war' are

present:Doc_1
Doc_2
Doc_3
Doc_6
9. term-incidence matrix for information retrieval
import pandas as pd
In [ ]: import numpy as np
articles = []
In [ ]:
articles.append({
In [ ]:
"Content":"""Unverified reports of ‘40 babies beheaded’ in Israel-Hamas war inflame social media
No photo evidence had been made public as of Thursday morning corroborating claims that babies had been beheaded.
Israel has pu Editor’s note: This story includes graphic descriptions of violent acts that some readers may find
disturbing.
A series of shocking reports have spread horrific claims of baby beheadings by Hamas militants across social and mainstream med The most
high-profile claim came Wednesday night when President Joe Biden said that he had seen photographic evidence of terror Photos have been
published by Hamas showing beheaded soldiers and the X account belonging to Israeli Prime Minister Benjamin Ne Unverified information
spreads quickly on social media, particularly around breaking news events, reaching even larger audience Biden’s statement fo llowed a series
of news reports and comments from Israeli officials, most of which have since been softened Alexei Abrahams, a disinformation researcher
at McGill University in Montreal, said that even without the allegations of behead “It may turn out that the slaughter was done in a
particularly barbaric way. But one way or another, this is an absolutely shoc On Wednesday, a spokesman for Israeli Prime Minister Benjamin
Netanyahu told CNN that babies and toddlers were found with their A senior State Department official said Thursday morning th at the agency
was not in a position to confirm the beheading claims. Many of the reports appear to have originated from Israeli soldiers and people
affiliated with the Israel Defense Force (IDF).
An IDF spokesperson told Business Insider on Tuesday that soldiers had found decapitated babies, but said Wednesday it would no On
Thursday, in a call with a group of international journalists, Colonel Golan Vach, the head of the IDF’s national search and Marc Owen
Jones, an associate professor of Middle East studies at Hamad Bin Khalifa University in Qatar who studies misinforma Nicole Zedeck, a
correspondent for the privately owned Israeli news outlet i24NEWS, said in the video that Israeli soldiers tol “Somehow those two bits of
information were connected, the story became ‘40 babies were beheaded,’ and in the British press tod
An IDF spokesperson, Doron Spielman, told NBC News on Tuesday that he could not confirm i24NEWS’s report.
Yossi Landau, the head of operations for the southern region of Zaka, Israel’s volunteer civilian emergency response organizati By Wednesday,
the claims, though still contentious, were going viral online — being used as evidence of Hamas’ depravity. On We “Stranger Things” star
Noah Schnapp posted the shocking claim to his 25 million Instagram followers: “40 babies were beheaded a Jones found that the “40 babies
beheaded” claim had over 44 million impressions on X, with over 300,000 likes and more than 100
“Baby stories are very emotive. Historically, they’re stories that can be used to rationalize a very brutal response,” Jones sa
In [ ]:
articles.append({
"Content": """The 'horrendous toll' on children caught in the Israel-Gaza conflict
H ar, with the true total still unclear.
u
n
d
r
e
d
s
o
f
c
h
i
l
d
r
e
n
h
a
v
e
b
e
e
n
k
i
l
l
e
d
s
o
f
The Israel-Hamas conflict is taking a "horrendous toll" on families, humanitarian organizations like UNICEF decried this week, In the
days since Hamas' surprise assault on Israel, images from both regions have shown crying children running through the st "Nothing
justifies the killing, maiming or abducting of children -- grave rights violations which UNICEF wholeheartedly condemn According to
Palestinian authorities, 900 people have been killed in Gaza so far -- including 260 children and 230 women. The n In remarks at the
White House Tuesday, President Joe Biden described "Hamas' bloodthirstiness" as reminiscent of ISIS rampages At least 100 civilians and
soldiers have also been taken hostage by Hamas militants, Israeli officials said. Hamas leaders on M Among those abducted were 12- and
16-year-old brothers, their mother told ABC News. The woman, who asked not to be identified f "I want the world to demand the release of
those innocent civilians. I want these children and women and babies back home, and In response to the assault, Israel on Monday carried
out a "complete siege," cutting off power and blocking food and water from UNICEF is "extremely alarmed" about those measures,
spokesperson James Elder said at a press briefing Tuesday in Geneva.
"This will add another layer of suffering to the existing catastrophe faced by families in Gaza," Elder said.
"Depriving childr According to UNICEF, 80% of those who live in the Gaza Strip rely on some form of humanitarian
assistance.
The conflict has led to "grave humanitarian consequences," Lynn Hastings, a humanitarian coordinator for the Gaza Strip for the "Civilians,
especially children, medical facilities, humanitarian personnel health workers, and journalists must be protected," UNICEF has also called
on all parties to protect children from harm, in accordance with international humanitarian law.
"I remind all parties that in this war, as in all wars, it is children who suffer first and suffer most,"
Russell said.""" })
In [ ]:
articles.append({
"Content": """What we actually know about the viral report of beheaded babies in Israel
One journalist from the Tel Aviv-based news channel i24 said a soldier told her they had "witnessed… bodies of babies with thei
Reports that Israeli soldiers discovered babies that had been beheaded in the Kfar Aza kibbutz are circulating on social and tr
The Israel Defence Forces (IDF) invited foreign journalists to see the aftermath of a massacre by Hamas militants at the kibbut
Sky's chief correspondent Stuart Ramsay was among those to go and see "stretcher-bearers bringing out a small child" and a bask
But in her TV reports, one journalist from the Tel Aviv-based news channel i24 said a soldier had told her they had "witnessed…
In a statement to Sky News, the IDF said: "We cannot confirm any numbers. What happened in Kibbutz Kfar Aza is a massacre in wh
What happened at the kibbutz?
The Kfar Aza kibbutz is one of several self-contained Israeli settlements close to the Gaza border.
It is located between Netivot and Sderot - around three miles from the border in southern Israel.
Because of its proximity to Gaza and the unprecedented nature of last weekend's incursion, which saw Hamas
militants breach the Four days later, journalists got to see the destruction left behind. Ramsay said the scene
"can only be described as a massacre".
"The stories here are shocking - families being woken without warning to voices outside their houses, mums and
dads hiding thei He added it took 17 hours for help to arrive, as the IDF focused on urban areas first - leaving
residents defenceless and numbe Why are there reports of 'babies being beheaded'?
Claims Hamas fighters beheaded babies have only been reported by one journalist - Nicole Zedek from i24 - and have
not been ver Ms Zedek was among the reporters invited to see what was left at the kibbutz on Tuesday.
In one live broadcast, which has since been viewed millions of times on X, formerly known as Twitter, she says:
"Talking to som "We can see some of these soldiers right now, comforting each other."
She is also filmed speaking to the deputy commander of the IDF's unit 71, David Ben Zion, who describes Hamas
fighters as "aggr He says: "They cut off heads… of children, of women."
And in a similar claim: "We've s
another
live
broadca
st,
Zedek
describ
es "40
babies
at
least
were
taken
out on
gurneys
" -
which
is
where
the
widely
share
In an
intervi
ew with
Sky's
Mark
Austin
on
Tuesday
evening
,
Israeli
economy
ministe
r Nir
Barkat
echoed
CBS News in the US said on Wednesday that Yossi Landau, head of operations at Zaka, Israel's volunteer civilian
emergency respo But when asked directly whether "40 babies were beheaded", an IDF spokesman said children were
killed - but that reports of beh It was later reported by at least one major TV news network that the reports of
babies being beheaded had been "confirmed" by a This was subsequently attributed to Tal Heinrich, a freelance news
anchor who appears to have been drafted in by Benjamin Netan The only available public statement on the matter
from Ms Heinrich at the time of writing was an interview she conducted with L Ms Heinrich, who was quoted by LBC
as a spokesperson for Mr Netanyahu's office, said: "Toddlers, babies, I can tell you some of Replying to a later
post on X linking to a story citing her comments, she said on Wednesday evening: "Please note: We said that
'Important to separate facts from speculation'
Ramsay interviewed two IDF majors - one of whom was a spokesman.
Ramsay said: "At no point did either he, or the other major I spoke to, ever mention that Hamas had beheaded or
killed 40 babie "There is no doubt that a horrific attack took place at Kfar Aza, and it needed to be reported,
and we did see the bodies of th "But it's important to separate the facts from speculation in a situation like
this.
"To reiterate - the IDF had every opportunity to inform the world's media of any story that had become apparent
as the military And another journalist, Oren Ziv, who works for independent news outlet 972 mag, was also
present and given the opportunity to In a post on X, he said of the baby claims: "During the tour we didn't see
any evidence of this, and the army spokesperson or c Footage shows how Hamas fighters broke into the kibbutz
Adding to the confusion, the White House was forced into a remarkable climbdown on Wednesday night after President Joe Biden ap
In a speech to a Jewish community gathering in Washington, which was televised live, he said: "I never really thought that I wo
However, after Sky News' US partner NBC approached the White House for further details on President Biden's remarks, two senior
He had not in fact seen any images or had independent confirmation of child beheadings.
Digital investigations journalist Victoria Elms, who works on Sky's Data and Forensics unit, adds: "Social media
has been awash "Videos from the Syrian conflict, excerpts from video games and TikToks made months ago have all
been widely shared, falsely cl She says misinformation is often shared "unintentionally", but "there are some
In [ ]: who post and share false material with the inten "This is especially dangerous during times of conflict, where
it may be even harder than usual to independently verify informat "As the conflict draws on, we would urge users
to be vigilant when consuming online content related to the war.""", })
articles.append({
"Content": """Israel releases horrific images of slain children after Hamas attack
JERUSALEM / TEL AVIV / BRUSSELS - CONTENT WARNING: This story contains graphic details that may not be suitable
for all audienc Israel's government showed U.S. Secretary of State Antony Blinken and NATO defence ministers
graphic images of dead children an Prime Minister Benjamin Netanyahu's office also released on social media a
picture of a dead infant in a pool of blood and the Blinken, who flew into Tel Aviv earlier on Thursday, told
reporters he was shown photographs and videos of a baby riddled with "It's simply depravity in the worst
imaginable way," Blinken told a news briefing. "Images are worth a thousand words. These im Netanyahu has vowed
to annihilate Hamas following its deadly assault on unsuspecting Israeli communities on Saturday, which kil The
Israeli airforce has launched intense bombing raids on Gaza over the past five days and is massing tens of
thousands of tro Gaza authorities said more than 1,400 Palestinians, mainly civilians, including children, have
alread children, tied up and shot," he told fellow ministers by video link according t 'HORRIFIC PICTURES'
y been In a message on the social media site 'X', Netanyahu's office released what it said were "horrifying photos of babies murdered
killed It added: "Hamas is inhuman. Hamas is ISIS," comparing the Palestinian group to the Islamic State, which was notorious for its
and The images of the dead infants were included in the video played to NATO. It was not released to the public, but was later seen
more
tha
Israel
i
Defenc
e
Minist
er
Yoav
Gallan
t
played
a
video
to his
counte
rparts
at
NATO's
Brusse
ls
headqu
arters
that
he
said
showed
ho
"Child
ren
were
tied
up and
shot.
Yes, I
repeat
,
"They were horrific pictures of the attacks and the victims of the attacks," NATO Secretary General Jens
Stoltenberg told repor The White House said it had no reason to doubt the authenticity of the images.
Hamas has denied its militants harmed civilians, accusing Israel and the West of spreading false reports to
incite violence aga Deputy Hamas chief, Saleh Al-Arouri, said the group's fighters had only aimed to attack the
Israeli military and had been surpr The video shown to NATO, apparently taken from a mix of social media
published by Hamas and unidentified phone videos, showed t There were no images to suggest militants had
beheaded babies -- a particularly explosive accusation that first emerged in Isra U.S. President Joseph Biden
had suggested on Wednesday that he had seen images of children beheaded by militants. The White Hou Netanyahu
has not repeated a claim by his office earlier this week that Hamas had indeed cut off the heads of children,
nor did But medics, international human rights organizations and journalists have documented that militants
killed women, children and Foreign reporters shown sites targeted by Hamas, witnessed ruins of burnt-out houses
and streets scattered with dead residents NATO officials said they did not expect the alliance to be directly
involved in the conflict. But multiple NATO states, above a U.S. Defense Secretary Lloyd Austin said after the
In [ ]:
NATO meeting on Thursday that Washington was not placing any conditions on i })
articles.append({
"Content": """‘I would see and have confirmed pictures of terrorists beheading children,’ Joe Biden decries
Hamas atrocity Reports suggest up to 40 babies slaughtered by Hamas near Gaza Strip. US President Joe Biden
expresses horror at beheading of c Biden spoke to Jewish leaders at the White House on Wednesday and said, “I
never really thought that I would see and have confi We're now on WhatsApp. Click to join.
“I haven’t given up hope of bringing these folks home,” he said.
“If I told you, I wouldn’t be able to get them home.”
ALSO READ| Why Steve Scalise hasn't assumed the role of house speaker for Republicans yet?
According to reports, up to 40 babies were slaughtered in their homes near the Gaza Strip, which is controlled
by Hamas.
While a senior White House national security aide stated they hadn't viewed the mentioned images, another White
House official On Wednesday, Heinrich quoted CNN that infants and young children had been discovered with
"decapitated" bodies in the communit An Israeli Defense Forces spokesperson also told The Intercept that they
could not confirm it officially, but they believed the Yossi Landau, a representative from Israel's volunteer
civilian emergency response organization, Zaka, shared with CBS News tha “I saw a lot more that cannot be
described for now, because it’s very hard to describe,” he said.
Nicole Zedek, a television reporter for i24NEWS based in Tel Aviv, was the initial source to report the
allegations of child be She stated, “I mean, babies’ heads cut off. That’s what they encountered when they came
there. So as horrible as it is and and The IDF also shared a disturbing image online on Wednesday that showed a
blood-stained mattress of an Israeli child with blood Since Hamas' shocking attack on Israel, the US President
has refrained from taking press questions in various settings. He is e })
In [ ]: articles.append({
"Content": """At least 40 babies killed, beheaded in Israeli kibbutz outside Gaza Strip, reports say
KFAR channel i24NEWS, described the scene as "truly horrific."
AZA,
Israel
"No one could expect that it would be like this, the horrors that I'm hearing from these soldiers," Zedeck said.
(TND) — "As I mentione Several of the infants were also beheaded by Hamas terrorists, according to OpIndia.
Dozens
of
babies
were
reporte
dly
found
dead,
includi
ng some
that
had
been
beheade
d, in
an
Israeli
kibb
Several
journal
ists
were
let in
to the
Kfar
Aza
kibbutz
,
located
just
outside
the
Gaza
Strip,
to see
the
afterma
th of
the
attac
Nicole
Zedeck
, a
corres
ponden
t for
Israel
i
televi
sion
Zedeck went on to say that an official death count at the kibbutz is still unknown because soldiers are "still
collecting dead A kibbutz is a small Israeli agricultural community. Kibbutz are dotted throughout Israel,
primarily in the Negev Desert. Israel Defense Forces Major General Itai Veruv described the scene in Kfar Aza as
a "massacre" Tuesday, calling it unlike somet “It’s not a war, it’s not a battlefield, it’s a massacre,” Veruv
told The Times of Israel. “You see the babies, their mothers a The murders at Kfar Aza represent just a fraction
of the death and destruction caused by Hamas terrorists. Videos reviewed Mond An Israeli family of five was
reportedly killed by Hamas terrorists during the invasion. The family, which included three child American
families have begun to plead with the Biden administration for assistance finding their missing loved ones in
In [ ]: Israel. })
articles.append({
"Content": """‘I would see and have confirmed pictures of terrorists beheading children,’ Joe Biden decries
Hamas atrocity Reports suggest up to 40 babies slaughtered by Hamas near Gaza Strip. US President Joe Biden
expresses horror at beheading of c Biden spoke to Jewish leaders at the White House on Wednesday and said, “I
never really thought that I would see and have confi “I haven’t given up hope of bringing these folks home,” he
said.
“If I told you, I wouldn’t be able to get them home.”
According to reports, up to 40 babies were slaughtered in their homes near the Gaza Strip, which is controlled
by Hamas.
While a senior White House national security aide stated they hadn't viewed the mentioned images, another White
House official On Wednesday, Heinrich quoted CNN that infants and young children had been discovered with
"decapitated" bodies in the communit An Israeli Defense Forces spokesperson also told The Intercept that they
could not confirm it officially, but they believed the Yossi Landau, a representative from Israel's volunteer
civilian emergency response organization, Zaka, shared with CBS News tha “I saw a lot more that cannot be
described for now, because it’s very hard to describe,” he said.
Nicole Zedek, a television reporter for i24NEWS based in Tel Aviv, was the initial source to report the
allegations of child be She stated, “I mean, babies’ heads cut off. That’s what they encountered when they came
there. So as horrible as it is and and The IDF also shared a disturbing image online on Wednesday that showed a
blood-stained mattress of an Israeli child with blood Since Hamas' shocking attack on Israel, the US President
has refrained from taking press questions in various settings. He is e })
In [ ]: articles[0]
Out[ ]: {'Title': 'Article1',
'Content': 'Unverified reports of ‘40 babies beheaded’ in Israel-Hamas war inflame social media\nNo photo evidence had been m ade
public as of Thursday morning corroborating claims that babies had been beheaded. Israel has published photos of dead infa nts after the
terror attack.\nEditor’s note: This story includes graphic descriptions of violent acts that some readers may fi nd disturbing.\nA series
of shocking reports have spread horrific claims of baby beheadings by Hamas militants across social a nd mainstream media in recent days,
adding a particularly incendiary element to an already violent and bitter war. But the rep orts are still unconfirmed, and in some cases
have been retracted.\nThe most high-profile claim came Wednesday night when Pres ident Joe Biden said that he had seen photographic
evidence of terrorists beheading children. The White House later clarified that Biden was referring to news reports about beheadings,
which have not included or referred to photographic evidence.\nPhot os have been published by Hamas showing beheaded soldiers and the X
account belonging to Israeli Prime Minister Benjamin Netan yahu posted pictures on Thursday of babies killed and burned by Hamas. No
photo evidence had been made public as of Thursday m orning corroborating claims that babies had been beheaded.\nUnverified information
spreads quickly on social media, particular ly around breaking news events, reaching even larger audiences when it is shared by
mainstream news outlets, politicians and p eople with large followings. Follow-ups that retract or add context are less likely to be
repeated or reach the same audienc e.\nBiden’s statement followed a series of news reports and comments from Israeli officials, most of
which have since been sof tened or walked back. Easily debunked misinformation like\xa0fake press releases\xa0have circulated widely
since the start of the war, but such stories often die down quickly once proven false. The claims about beheadings, difficult to verify,
have con tinued to spread thanks in part to the lack of clarity.\nAlexei Abrahams, a disinformation researcher at McGill University in
Montreal, said that even without the allegations of beheaded babies, “just the facts themselves are horrifying enough to have the kind of
effect you expect.”\n“It may turn out that the slaughter was done in a particularly barbaric way. But one way or a nother, this is an
absolutely shocking, unprecedented event of violence,” Abrahams said. “The general concern, of course, is t hat it’s going to exacerbate
what is already a very fraught situation.”\nOn Wednesday, a spokesman for Israeli Prime Minister Benjamin Netanyahu told CNN that babies
and toddlers were found with their “heads decapitated” in southern Israel after Hamas’ attack. By Thursday morning, an Israeli
official\xa0told CNN\xa0the government had not confirmed claims of the beheadings.\nA senior State Department official said Thursday
morning that the agency was not in a position to confirm the beheading claim s.\nMany of the reports appear to have originated
from\xa0Israeli soldiers\xa0and people affiliated with the Israel Defense Fo rce (IDF).\nAn IDF spokesperson told Business Insider on
Tuesday that soldiers had found decapitated babies, but said Wednesda y it\xa0would not investigate\xa0or provide further evidence
regarding the claim. Late Wednesday, an IDF spokesperson\xa0said in a video on X\xa0that the IDF had “relative confidence” of the
claims.\nOn Thursday, in a call with a group of international journalists, Colonel Golan Vach, the head of the IDF’s national search and
rescue unit, said that he had “found one baby with his head cut.”\n\xa0Marc Owen Jones, an associate professor of Middle East studies at
Hamad Bin Khalifa University in Qatar wh o studies misinformation, told NBC News that he found that the source of the “40 babies
beheaded” allegations largely stemmed from a viral Israeli\xa0news broadcast clip\xa0that did not specifically refer to the
allegation.\nNicole Zedeck, a correspond ent for\xa0the privately owned Israeli news outlet i24NEWS, said in the video that Israeli
soldiers told her they’d found “bab ies, their heads cut off.” The video has been viewed more than 11 million times on X, according to
its view counter. In anothe r tweet, Zedeck wrote that soldiers told her they believe “40 babies/children were killed.”\n“Somehow those
two bits of inform ation were connected, the story became ‘40 babies were beheaded,’ and in the British press today, about six or seven
newspaper s had it on their front pages,” Jones said.\nAn IDF spokesperson, Doron Spielman, told NBC News on Tuesday that he could not c
onfirm i24NEWS’s report.\nYossi Landau, the head of operations for the southern region of Zaka, Israel’s volunteer civilian em ergency
response organization,\xa0told CBS News\xa0that he saw\xa0the bodies of beheaded children and babies, parents and chil dren who had been
tortured and had their hands bound, and “a lot more that cannot be described for now, because it’s very hard to describe.” \nBy Wednesday,
the claims, though still contentious, were going viral online — being used as evidence of Hamas’ depravity. On Wednesday, the phrase “Did
Hamas kill babies” saw the biggest increase in search interest on Google of anything related to the war.\n“Stranger Things” star Noah
Schnapp\xa0posted\xa0the shocking claim to his 25 million Instagram follower
s: “40 babies were beheaded and burned alive in front of their parents by Hamas.” Sen. Ted Cruz, R-Texas, mentioned beheaded b abies
in\xa0a post on X, and Rep. Mike McCaul, R-Texas, echoed the allegations on CNN.\nJones found that the “40 babies behead ed” claim
had\xa0over 44 million impressions\xa0on X, with over 300,000 likes and more than 100,000 reposts. The main accounts propagating the claims
were i24NEWS and the official Israel account, Jones’ data showed.\n\n“Baby stories are very emotive. Hi storically, they’re stories that
can be used to rationalize a very brutal response,” Jones said. “It’s such a volatile informa tion environment that such clai ms will
inevitably be taken out of context, both deliberately and accidentally.”'}
In [ ]: df = pd.DataFrame(articles)
In [ ]: print(df)
Title Content
0 Article1 Unverified reports of ‘40 babies beheaded’ in ...
1 Article2 The 'horrendous toll' on children caught in th...
2 Article3 What we actually know about the viral report o...
3 Article4 Israel releases horrific images of slain child...
4 Article5 ‘I would see and have confirmed pictures of te...
5 Article6 At least 40 babies killed, beheaded in Israeli...
6 Article7 ‘I would see and have confirmed pictures of te...
In [ ]: !pip install nltk
Requirement already satisfied: nltk in /usr/local/lib/python3.10/dist-packages (3.8.1)

Requirement already satisfied: click in /usr/local/lib/python3.10/dist-packages (from nltk) (8.1.7)
Requirement already satisfied: joblib in /usr/local/lib/python3.10/dist-packages (from nltk) (1.3.2)
Requirement already satisfied: regex>=2021.8.3 in /usr/local/lib/python3.10/dist-packages (from nltk) (2023.6.3)
Requirement already satisfied: tqdm in /usr/local/lib/python3.10/dist-packages (from nltk) (4.66.1)
In [ ]: import nltk
[nltk_data] Downloading package stopwords to /root/nltk_data...

[nltk_data] Package stopwords is already up-to-date!
Out[ ]: True
In [ ]: stop_words = set(stopwords.words('english'))
In [ ]: from nltk.tokenize import word_tokenize

unique_words = set()
all_words = []
for content in df['Content']:
words = word_tokenize(content)
filtered_words = [word.lower() for word in words if word.isalpha() and word.lower() not in stop_words]
all_words.append(filtered_words)
unique_words.update(filtered_words)
In [ ]:
matrix = []
for word in unique_words:

row = [word]
for article_words in all_words:
row.append(1 if word in article_words else 0)
matrix.append(row)
columns = ['Word/Article'] + df['Title'].tolist()

incidence_df = pd.DataFrame(matrix, columns=columns)
In [ ]: incidence_df
Out[ ]:
Word/Article Article1 Article2 Article3 Article4 Article5 Article6 Article7
0 footage 0 0 1 0 0 0 0
1 acts 1 0 0 0 0 0 0
2 reconsider 0 0 0 0 1 0 1
3 hearing 0 0 1 0 0 1 0
4 islamic 0 0 0 1 0 0 0
... ... ... ... ... ... ... ... ...
1095 needed 0 0 1 0 0 0 0
1096 ones 0 0 0 0 0 1 0
1097 seen 1 0 1 1 0 0 0
1098 hope 0 0 0 0 1 0 1
1099 residents 0 0 1 1 0 1 0
1100 rows × 8 columns

In [ ]: incidence_df.to_csv("abc.csv")
In [ ]: word_to_check = "residents"
# List of articles
articles_to_check = ['Article1', 'Article4', 'Article6']
result = incidence_df.loc[incidence_df['Word/Article'] == word_to_check, articles_to_check]
if not result.empty:
presence = {article: bool(row) for article, row in
result.iloc[0].items()} else:
presence = {article: False for article in articles_to_check}
# Print the results

for article, is_present in presence.items():
print(f"Word '{word_to_check}' in {article}: {is_present}")
Word 'residents' in Article1: False
Word 'residents' in Article4: True
Word 'residents' in Article6: True
In [ ]: # Words that must be present and absent

words_present = ["residents"]
words_absent = ["hope"]
# Filter articles based on the

conditions filtered_articles = []
# Check if each article meets the criteria

for col in incidence_df.columns[1:]:
if all((word in incidence_df['Word/Article'].values and incidence_df.loc[incidence_df['Word/Article'] == word, col].iloc[0] all((word
in incidence_df['Word/Article'].values and incidence_df.loc[incidence_df['Word/Article'] == word, col].iloc[0]
filtered_articles.append(col)
print("Articles that match the criteria:", filtered_articles)
Articles that match the criteria: ['Article3', 'Article4', 'Article6']
In [ ]: words_present = ["residents"]
words_absent = ["hope"]
result = int('1111111', 2)
for word in words_present:
if word in incidence_df['Word/Article'].values:
binary_representation = ''.join(map(str,
incidence_df[incidence_df['Word/Article'] == word].iloc[0, 1:].values))
result &= int(binary_representation, 2)
for word in words_absent:

if word in incidence_df['Word/Article'].values:
binary_representation = ''.join(map(str,
incidence_df[incidence_df['Word/Article'] == word].iloc[0, 1:].values))
result &= ~int(binary_representation, 2)
binary_result = bin(result)[2:].zfill(7)
matching_articles = [f"Article{i+1}" for i, bit in

enumerate(binary_result) if bit == '1']
print("Articles that match the criteria:", matching_articles)
Articles that match the criteria: ['Article3', 'Article4', 'Article6']

20BCP112_NLP Lab_LAB_Manual (1)

Uploaded by

Copyright:

Available Formats

You might also like

20BCP112_NLP Lab_LAB_Manual (1)

Uploaded by

Document Information

Copyright

Available Formats

Share this document

Share or Embed Document

Sharing Options

Did you find this document useful?

Is this content inappropriate?

Copyright:

Available Formats

20BCP112_NLP Lab_LAB_Manual (1)

Uploaded by

Copyright:

Available Formats

PANDIT DEENDAYAL ENERGY UNIVERSITYSCHOOL OF TECHNOLOGY

Natural Language Processing Lab Manual

4. POS tagging using HMM

5. Turney method of Sentiment Analysis (/Unsupervised)

6. Supervised sentiment analysis

7. text summarization using TF-IDF

8. Multi-document Text Summarization

9. term -incidence matrix for information retrieval

sentences = [sentence.strip() for sentence in sentences if

for idx, sentence in enumerate(sentences, start=1):

2. Sentence Tokenization using NLTK:

print("Sentence tokenization: ", sent_tokenize(text))

3. Word Tokenization using punkt:

from nltk.tokenize import word_tokenize

print("\n\nWord tokenization: " ,word_tokenize(text))

4. Word tokenization without NLTK:

for char in sentence:

words = ['going', 'friends', 'friendship', 'happily', 'friendly',

2. Stemming using Porter Stemmer from NLTK:

from nltk.stem import PorterStemmer

words = ["friendship", "programs", "programmer", "programming",

3. Lemmatization without NLTK:

from PyDictionary import PyDictionary

def lemmatize(self, word):

words = ['smiling', 'died', 'purchased', 'went', 'bought', 'better',

4. Lemmatization using NLTK:

from nltk.stem import WordNetLemmatizer

words2 = ['better', 'worst', 'happiest']

words3 = ['babies', 'mice', 'feet']

def _contains_vowel(self, word):

def _double_consonant(self, word):

def _replace_suffix(self, word, old, new):if

def _ends_with_cvc(self, word):if len(word)

def _step1(self, word):

def _step3(self, word):

def _step4(self, word):suffixes =

def _step5(self, word):

def _step6(self, word):

def _step7a(self, word): if

def _step7b(self, word):

def stem(self, word):

words = ["computers", "singing", "controlling", "generalizations",

for word in words:

[('computers', 'comput'), ('singing', 'sing'), ('controlling', 'control'), ('generalizations',

from future import division #To avoid integer division

with open("wsj_training.txt", "r") as myfile:

dict2_tag_follow_tag_ = {} # for transition probability

dict_word_tag_baseline = {} # for emission probability

#Accounting for the last word-tag pair outer_key =

for key in dict2_tag_follow_tag_: di =

for key in dict2_word_tag: di =

with open("wsj_test.txt", "r") as myfile:te_str =

te_li = te_str.split() num_words_test

test_li_words = [''] test_li_words*=

test_li_tags = [''] test_li_tags*=

output_li = [''] output_li*=

output_li_baseline = [''] output_li_baseline*=

for i in range(num_words_test): temp_li =

output_li_baseline[i] = dict_word_tag_baseline.get(temp_li[0],'')#If unknown word - tag =

def init(self, dataset):