INFORMATION RETRIEVAL SYSTEMS LAB

DOCUMENTATION

A Project Report submitted in partial fulfilment of the requirements

for the award of the degree of

BACHELOR OF TECHNOLOGY

IN

COMPUTER SCIENCE AND ENGINEERING

Submitted by

122010304057 Kalyan

Under the esteemed guidance of

Dr. Prem Kumar Singh

Associate Professor

DEPARTMENT OF COMPUTER SCIENCE & ENGINEERING

GITAM

(Deemed to be University)

VISAKHAPATNAM

OCTOBER 202
DEPARTMENT OF COMPUTER SCIENCE AND ENGINEERING
GITAM SCHOOL OF TECHNOLOGY GITAM

(Deemed to be University)

CERTIFICATE

This is to certify that the project report entitled “Lab Documentation” is a bona fide
record of work carried out by Kalyan (122010304057), submitted in partial
fulfilment of the requirements for the award of the degree of Bachelor of Technology in
Computer Science and Engineering.

Dr. Prem Kumar Singh

Professor
Task 1
Write a program to classify attendance information using a 75 percent eligibility threshold.

no_of_classes = [2, 3, 5, 2, 6, 5, 3, 4]
no_attended = [2, 2, 3, 1, 5, 4, 3, 2]

for i in range(len(no_of_classes)):
    print(f"Attendance for the day: {no_attended[i]}/{no_of_classes[i]}")

percent_attendance = (sum(no_attended) / sum(no_of_classes)) * 100

print("\nFinal Attendance:")
if percent_attendance >= 75:
    print(f"Student is eligible for exam with {percent_attendance}% attendance")
else:
    print(f"Student is not eligible for exam with {percent_attendance}% attendance")

Output:
Task 2
Implement precision and recall measures using the digital library of GITAM-KRC.

def calculate_precision_recall(y_true, y_pred):
    TP = sum((y_pred[i] == 1) and (y_true[i] == 1) for i in range(len(y_pred)))
    FP = sum((y_pred[i] == 1) and (y_true[i] == 0) for i in range(len(y_pred)))
    FN = sum((y_pred[i] == 0) and (y_true[i] == 1) for i in range(len(y_pred)))
    print(f"TP: {TP}\nFP: {FP}\nFN: {FN}\n")
    precision = TP / (TP + FP)
    recall = TP / (TP + FN)
    return precision, recall

y_true = [1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 1]
y_pred = [1, 0, 1, 0, 0, 1, 1, 1, 0, 1, 1, 1]
print(f"y_true: {y_true}\ny_pred: {y_pred}\n")

pre, rec = calculate_precision_recall(y_true, y_pred)
print(f"Precision: {pre}\nRecall: {rec}")

Output:
Task 3
Implement any one of the indexing algorithms.

class Index_algo:
    def __init__(self):
        self.index = {}

    def add_document(self, doc_id, content, documents):
        words = content.split()
        documents.append(words)

    def search(self, query, documents):
        for sub_doc in documents:
            if query in sub_doc:
                doc_index, quer_index = documents.index(sub_doc), sub_doc.index(query)
                print(f"Document '{' '.join(documents[doc_index])}' contains '{query}' at position [{quer_index}]")

index = Index_algo()
documents = []

for i in range(int(input("Enter No of documents: "))):
    index.add_document(doc_id=i, documents=documents, content=input("Enter: "))

index.search(query=input("Enter tokens: "), documents=documents)
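The class above keeps a self.index dictionary that is never populated; the search scans every document instead. A minimal sketch of how a true inverted index could fill and query that dictionary (an illustration only, with a hypothetical InvertedIndex class, not part of the submitted program):

class InvertedIndex:
    def __init__(self):
        self.index = {}  # term -> list of (doc_id, position) postings

    def add_document(self, doc_id, content):
        # Record each word's document id and position once, at indexing time
        for position, word in enumerate(content.split()):
            self.index.setdefault(word, []).append((doc_id, position))

    def search(self, query):
        # Lookup is a single dictionary access instead of a scan over all documents
        return self.index.get(query, [])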


Task 4
Implement the n-gram algorithm.

def generate_ngrams(text, n):
    words = text.split()
    words.insert(0, "#")
    print(words)
    ngrams = []
    if len(words) % 2 != 0:
        words.append("#")
    for i in range(0, len(words) - n + 1):
        ngram = ' '.join(words[i:i + n])
        ngrams.append(ngram)
    return ngrams

text = input("Enter sentence: ")
n = int(input("n-gram n value: "))

ngrams = generate_ngrams(text, n)
for ngram in ngrams:
    print(ngram)
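The "#" padding and the even-length check above are specific to this lab's variant. For comparison, a minimal sketch of the usual sliding-window word n-gram formulation without padding (an illustration only, not the submitted version):

def word_ngrams(text, n):
    words = text.split()
    # Slide a window of size n over the word list
    return [' '.join(words[i:i + n]) for i in range(len(words) - n + 1)]

# e.g. word_ngrams("information retrieval systems lab", 2)
# -> ['information retrieval', 'retrieval systems', 'systems lab']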

Output:
Task 5
Implement the k-means clustering algorithm for information extraction.

import numpy as np

text_data = [
    "The world is thriving in social media.",
    "Social media has been the biggest platform for people to express themself.",
    "Some people still prefer to lead a private life.",
    "Many Problems throughout the world are expressed in the online platform.",
    "The amount of data the social media companies need to handle is more.",
    "Many companies are worth billions as they succeed by the no of people using their platform giving them lots of data to generate ads.",
    "Online privacy is a doom nowadays"
]

k = 3
np.random.seed(0)
centroids = np.random.choice(len(text_data), k, replace=False)
centroid_texts = [text_data[i] for i in centroids]

def similarity(text1, text2):
    # Word-overlap similarity: shared words divided by the combined vocabulary sizes
    words1 = set(text1.lower().split())
    words2 = set(text2.lower().split())
    common_words = len(words1.intersection(words2))
    return common_words / (len(words1) + len(words2))

cluster_assignments = [-1] * len(text_data)
max_iterations = 100

for _ in range(max_iterations):
    # Assign every text to its most similar centroid (the centroid texts themselves stay fixed)
    for i, text in enumerate(text_data):
        similarities = [similarity(text, centroid_texts[j]) for j in range(k)]
        cluster_assignments[i] = np.argmax(similarities)

clusters = {}
for i, cluster_id in enumerate(cluster_assignments):
    if cluster_id not in clusters:
        clusters[cluster_id] = []
    clusters[cluster_id].append(text_data[i])

for cluster_id, cluster_texts in clusters.items():
    print(f"Cluster {cluster_id + 1}:")
    for text in cluster_texts:
        print(f"- {text}")
    print()
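In standard k-means the centroids are recomputed after every assignment pass, which the program above omits. A minimal sketch of such an update step for this text setting, done k-medoids style since the items are texts rather than numeric vectors (it continues from the variables defined above and is not part of the submitted program):

# After an assignment pass, re-pick each cluster's centroid as the member text
# that is, on average, most similar to the other members of its cluster.
for cluster_id in set(cluster_assignments):
    members = [text_data[i] for i in range(len(text_data)) if cluster_assignments[i] == cluster_id]
    centroid_texts[cluster_id] = max(
        members,
        key=lambda t: sum(similarity(t, other) for other in members))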

Output:
Task 6
Implement the SDD search algorithm.

import numpy as np

documents = np.array([
    [0.5, 0.4, 0.1, 0.3],
    [0.2, 0.3, 0.1, 0.3],
    [0.0, 0.5, 0.0, 0.1],
    [0.3, 0.1, 0.1, 0.4]
])

class_labels = np.array([0, 2, 0, 1])
query_vector = np.array([0.1, 0.3, 0.0, 0.2])

alpha = 1.0   # Weight for the original query
beta = 0.65   # Weight for relevant documents
gamma = 0.25  # Weight for non-relevant documents

relevant_indices = np.where(class_labels == 0)[0]
non_relevant_indices = np.where(class_labels == 1)[0]

relevant_centroid = np.mean(documents[relevant_indices], axis=0)
print(relevant_centroid)
non_relevant_centroid = np.mean(documents[non_relevant_indices], axis=0)
print(non_relevant_centroid)

# Rocchio formula: q' = alpha * q + beta * centroid(relevant) - gamma * centroid(non-relevant)
updated_query_vector = alpha * query_vector + beta * relevant_centroid - gamma * non_relevant_centroid

# Print the updated query vector
print("\nUpdated Query Vector:")
print(updated_query_vector)
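As a hand check of the sample values above (not the captured program output): the relevant centroid is the mean of documents 1 and 3, i.e. [0.25, 0.45, 0.05, 0.2]; the non-relevant centroid is document 4, i.e. [0.3, 0.1, 0.1, 0.4]; so the updated query vector works out to [0.1875, 0.5675, 0.0075, 0.23].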

Output:
Task 7
Implement the Knuth-Morris-Pratt (KMP) pattern matching algorithm.

def compute_lps(pattern):
    lps = [0] * len(pattern)
    j = 0  # Length of the previous longest prefix suffix
    for i in range(1, len(pattern)):
        while j > 0 and pattern[i] != pattern[j]:
            j = lps[j - 1]
        if pattern[i] == pattern[j]:
            j += 1
        lps[i] = j
    return lps

def kmp_search(text, pattern):
    n = len(text)
    m = len(pattern)
    lps = compute_lps(pattern)
    i = 0  # Index for text[]
    j = 0  # Index for pattern[]
    flag = False
    while i < n:
        if pattern[j] == text[i]:
            i += 1
            j += 1
        if j == m:
            print(f"Pattern found at index {i - j}")
            j = lps[j - 1]
            flag = True
        elif i < n and pattern[j] != text[i]:
            if j != 0:
                j = lps[j - 1]
            else:
                i += 1
    if not flag:
        print("Pattern not found!")

# Example usage:
text = "I am a computer science under graduate"
pattern = "computer s"
kmp_search(text, pattern)
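The example pattern above has no repeated prefix, so its LPS table is all zeros. An additional check where the failure function actually comes into play (an illustrative call, not from the lab output):

# For pattern "abab" the LPS table is [0, 0, 1, 2], and the search below
# reports matches at indices 0 and 5.
kmp_search("ababcabab", "abab")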

Output:
Task 8
Implement relevance feedback for YouTube search.

import random

# Sample search results (replace with actual YouTube API data)
search_results = [
    "Video 1: How to Bake a Cake",
    "Video 2: Python Programming Tutorial",
    "Video 3: Travel Vlog - Exploring Paris",
    "Video 4: Machine Learning Basics",
    "Video 5: Cooking Delicious Pasta"
]

while 1:
    user_query = input("Enter your search query\nenter 'end' to break:")
    if user_query == "end":
        exit()
    print("Search Results:")
    for i, result in enumerate(search_results, start=1):
        print(f"{i}. {result}")
    selected_video_index = int(input("Enter the number of the video you watched (0 to exit): "))
    if selected_video_index == 0:
        exit()
    user_feedback = input("Did you like the video? (y/n): ")
    if user_feedback.lower() == "y":
        relevance_increase = 0.1
    else:
        relevance_increase = -0.1
    relevance_scores = [random.uniform(0, 1) for _ in search_results]
    relevance_scores[selected_video_index - 1] += relevance_increase
    sorted_results = [result for _, result in sorted(zip(relevance_scores, search_results), reverse=True)]
    print("\nUpdated Search Results:")
    for i, result in enumerate(sorted_results, start=1):
        print(f"{i}. {result}")
    search_results = sorted_results
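Because relevance_scores is redrawn at random on every pass, the ±0.1 feedback only influences the ordering for the current iteration. A minimal sketch of how the scores could persist across searches instead (an alternative variant, not the submitted version; it reuses the names defined above):

# Initialise once, before the loop, so feedback accumulates across searches:
relevance_scores = [0.5 for _ in search_results]

# Inside the loop, apply only the feedback update (no re-randomisation) and
# sort the scores together with the results so they stay aligned:
relevance_scores[selected_video_index - 1] += relevance_increase
pairs = sorted(zip(relevance_scores, search_results), reverse=True)
relevance_scores = [score for score, _ in pairs]
search_results = [result for _, result in pairs]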

Output:
Task 9
Implement the PageRank algorithm for information retrieval.

import numpy as np

web_graph = np.array([
    [0, 1, 1, 1],
    [1, 0, 0, 1],
    [0, 1, 0, 1],
    [1, 0, 1, 0]
])

num_pages = len(web_graph)
pagerank = np.ones(num_pages) / num_pages
print(pagerank)

damping_factor = 0.65
tolerance = 1e-6

while True:
    new_pagerank = np.zeros(num_pages)
    for i in range(num_pages):
        for j in range(num_pages):
            if web_graph[j, i] == 1:
                new_pagerank[i] += pagerank[j] / np.sum(web_graph[j, :])
    new_pagerank = (1 - damping_factor) / num_pages + damping_factor * new_pagerank
    if np.sum(np.abs(new_pagerank - pagerank)) < tolerance:
        break
    pagerank = new_pagerank

sorted_pages = np.argsort(pagerank)[::-1]
for i in sorted_pages:
    print(f"Page {i + 1}: PageRank = {pagerank[i]:.4f}")

Output:
Task 10
Implement one of the similarity measurement algorithms.

from collections import Counter
import math

documents = [
    "This is the first document.",
    "This document is the second.",
    "And this is the third one.",
    "Is this the first document?"
]

def preprocess(doc):
    return doc.lower().split()

def compute_tfidf(document, query_terms):
    document_terms = preprocess(document)
    tf = {term: document_terms.count(term) / len(document_terms) for term in query_terms}
    idf = {term: math.log(len(documents) / (1 + sum(
        1 for doc in documents if term in preprocess(doc))) + 1) for term in query_terms}
    tfidf = {term: tf[term] * idf[term] for term in query_terms}
    return tfidf

def cosine_similarity(doc_tfidf, query_tfidf):
    dot_product = sum(doc_tfidf[term] * query_tfidf[term] for term in query_tfidf)
    doc_magnitude = math.sqrt(sum(val ** 2 for val in doc_tfidf.values()))
    query_magnitude = math.sqrt(sum(val ** 2 for val in query_tfidf.values()))
    return dot_product / (doc_magnitude * query_magnitude)

query = "This is similarity doc"
query_terms = preprocess(query)
query_tfidf = compute_tfidf(query, query_terms)

document_similarities = [cosine_similarity(compute_tfidf(doc, query_terms), query_tfidf)
                         for doc in documents]

ranked_documents = sorted(enumerate(document_similarities), key=lambda x: x[1], reverse=True)

for idx, similarity in ranked_documents:
    print(f"Document {idx + 1}: Similarity Score = {similarity:.2f}")
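If scikit-learn is available, the ranking can be cross-checked with its TF-IDF vectoriser and cosine similarity (a sketch only, reusing documents and query from above; sklearn's function is aliased to avoid shadowing the cosine_similarity defined earlier):

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity as sk_cosine

vectorizer = TfidfVectorizer()
doc_matrix = vectorizer.fit_transform(documents)   # TF-IDF vectors for the four documents
query_vec = vectorizer.transform([query])          # TF-IDF vector for the query
scores = sk_cosine(query_vec, doc_matrix)[0]
for idx in scores.argsort()[::-1]:
    print(f"Document {idx + 1}: {scores[idx]:.2f}")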
Output:
