INFORMATION RETRIEVAL SYSTEMS LAB

DOCUMENTATION

A Project Report submitted in partial fulfilment of the requirements

for the award of the degree of

BACHELOR OF TECHNOLOGY

IN

COMPUTER SCIENCE AND ENGINEERING

Submitted by

122010304057 Kalyan

Under the esteemed guidance of

Dr. Prem Kumar Singh

Associate Professor

DEPARTMENT OF COMPUTER SCIENCE & ENGINEERING

GITAM

(Deemed to be University)

VISAKHAPATNAM

OCTOBER 202
DEPARTMENT OF COMPUTER SCIENCE AND ENGINEERING
GITAM SCHOOL OF TECHNOLOGY GITAM

(Deemed to be University)

CERTIFICATE

This is to certify that the project report entitled “Lab Documentation” is a bona fide
record of work carried out by Kalyan (122010304057), submitted in partial
fulfilment of the requirements for the award of the degree of Bachelor of Technology in
Computer Science and Engineering.

Dr. Prem Kumar Singh

Professor
Task 1
Write a program to classify attendance information using a 75 percent eligibility threshold.

no_of_classes = [2, 3, 5, 2, 6, 5, 3, 4]
no_attended = [2, 2, 3, 1, 5, 4, 3, 2]

for i in range(len(no_of_classes)):
    print(f"Attendance for the day: {no_attended[i]}/{no_of_classes[i]}")

percent_attendance = (sum(no_attended) / sum(no_of_classes)) * 100

print("\nFinal Attendance:")
if percent_attendance >= 75:
    print(f"Student is eligible for exam with {percent_attendance}% attendance")
else:
    print(f"Student is not eligible for exam with {percent_attendance}% attendance")

Output:
Task 2
Implement precision and recall measures using the digital library of GITAM-KRC.

def calculate_precision_recall(y_true, y_pred):
    TP = sum((y_pred[i] == 1) and (y_true[i] == 1) for i in range(len(y_pred)))
    FP = sum((y_pred[i] == 1) and (y_true[i] == 0) for i in range(len(y_pred)))
    FN = sum((y_pred[i] == 0) and (y_true[i] == 1) for i in range(len(y_pred)))
    print(f"TP: {TP}\nFP: {FP}\nFN: {FN}\n")
    precision = TP / (TP + FP)
    recall = TP / (TP + FN)
    return precision, recall

y_true = [1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 1]
y_pred = [1, 0, 1, 0, 0, 1, 1, 1, 0, 1, 1, 1]
print(f"y_true: {y_true}\ny_pred: {y_pred}\n")

pre, rec = calculate_precision_recall(y_true, y_pred)
print(f"Precision: {pre}\nRecall: {rec}")

Output:
Task 3
Implement any one of the indexing algorithms.

class Index_algo:
    def __init__(self):
        self.index = {}

    def add_document(self, doc_id, content, documents):
        words = content.split()
        documents.append(words)

    def search(self, query, documents):
        for sub_doc in documents:
            if query in sub_doc:
                doc_index, quer_index = documents.index(sub_doc), sub_doc.index(query)
                print(f"Document '{' '.join(documents[doc_index])}' contains '{query}' at position [{quer_index}]")

index = Index_algo()
documents = []

for i in range(int(input("Enter No of documents: "))):
    index.add_document(doc_id=i, documents=documents, content=input("Enter: "))

index.search(query=input("Enter tokens: "), documents=documents)
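The class above keeps a self.index dictionary that is never populated; the search scans every document instead. A minimal sketch of how a true inverted index could fill and query that dictionary (an illustration only, with a hypothetical InvertedIndex class, not part of the submitted program):

class InvertedIndex:
    def __init__(self):
        self.index = {}  # term -> list of (doc_id, position) postings

    def add_document(self, doc_id, content):
        # Record each word's document id and position once, at indexing time
        for position, word in enumerate(content.split()):
            self.index.setdefault(word, []).append((doc_id, position))

    def search(self, query):
        # Lookup is a single dictionary access instead of a scan over all documents
        return self.index.get(query, [])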


Task 4
Implement the n-gram algorithm.

def generate_ngrams(text, n):
    words = text.split()
    words.insert(0, "#")
    print(words)
    ngrams = []
    if len(words) % 2 != 0:
        words.append("#")
    for i in range(0, len(words) - n + 1):
        ngram = ' '.join(words[i:i + n])
        ngrams.append(ngram)
    return ngrams

text = input("Enter sentence: ")
n = int(input("n-gram n value: "))

ngrams = generate_ngrams(text, n)
for ngram in ngrams:
    print(ngram)
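The "#" padding and the even-length check above are specific to this lab's variant. For comparison, a minimal sketch of the usual sliding-window word n-gram formulation without padding (an illustration only, not the submitted version):

def word_ngrams(text, n):
    words = text.split()
    # Slide a window of size n over the word list
    return [' '.join(words[i:i + n]) for i in range(len(words) - n + 1)]

# e.g. word_ngrams("information retrieval systems lab", 2)
# -> ['information retrieval', 'retrieval systems', 'systems lab']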

Output:
Task 5
Implement the k-means clustering algorithm for information extraction.

import numpy as np

text_data = [
    "The world is thriving in social media.",
    "Social media has been the biggest platform for people to express themself.",
    "Some people still prefer to lead a private life.",
    "Many Problems throughout the world are expressed in the online platform.",
    "The amount of data the social media companies need to handle is more.",
    "Many companies are worth billions as they succeed by the no of people using their platform giving them lots of data to generate ads.",
    "Online privacy is a doom nowadays"
]

k = 3
np.random.seed(0)
centroids = np.random.choice(len(text_data), k, replace=False)
centroid_texts = [text_data[i] for i in centroids]

def similarity(text1, text2):
    # Word-overlap similarity: shared words divided by the combined vocabulary sizes
    words1 = set(text1.lower().split())
    words2 = set(text2.lower().split())
    common_words = len(words1.intersection(words2))
    return common_words / (len(words1) + len(words2))

cluster_assignments = [-1] * len(text_data)
max_iterations = 100

for _ in range(max_iterations):
    # Assign every text to its most similar centroid (the centroid texts themselves stay fixed)
    for i, text in enumerate(text_data):
        similarities = [similarity(text, centroid_texts[j]) for j in range(k)]
        cluster_assignments[i] = np.argmax(similarities)

clusters = {}
for i, cluster_id in enumerate(cluster_assignments):
    if cluster_id not in clusters:
        clusters[cluster_id] = []
    clusters[cluster_id].append(text_data[i])

for cluster_id, cluster_texts in clusters.items():
    print(f"Cluster {cluster_id + 1}:")
    for text in cluster_texts:
        print(f"- {text}")
    print()
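In standard k-means the centroids are recomputed after every assignment pass, which the program above omits. A minimal sketch of such an update step for this text setting, done k-medoids style since the items are texts rather than numeric vectors (it continues from the variables defined above and is not part of the submitted program):

# After an assignment pass, re-pick each cluster's centroid as the member text
# that is, on average, most similar to the other members of its cluster.
for cluster_id in set(cluster_assignments):
    members = [text_data[i] for i in range(len(text_data)) if cluster_assignments[i] == cluster_id]
    centroid_texts[cluster_id] = max(
        members,
        key=lambda t: sum(similarity(t, other) for other in members))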

Output:
Task 6
Implement the SDD search algorithm.

import numpy as np

documents = np.array([
    [0.5, 0.4, 0.1, 0.3],
    [0.2, 0.3, 0.1, 0.3],
    [0.0, 0.5, 0.0, 0.1],
    [0.3, 0.1, 0.1, 0.4]
])

class_labels = np.array([0, 2, 0, 1])
query_vector = np.array([0.1, 0.3, 0.0, 0.2])

alpha = 1.0   # Weight for the original query
beta = 0.65   # Weight for relevant documents
gamma = 0.25  # Weight for non-relevant documents

relevant_indices = np.where(class_labels == 0)[0]
non_relevant_indices = np.where(class_labels == 1)[0]

relevant_centroid = np.mean(documents[relevant_indices], axis=0)
print(relevant_centroid)
non_relevant_centroid = np.mean(documents[non_relevant_indices], axis=0)
print(non_relevant_centroid)

# Rocchio formula: q' = alpha * q + beta * centroid(relevant) - gamma * centroid(non-relevant)
updated_query_vector = alpha * query_vector + beta * relevant_centroid - gamma * non_relevant_centroid

# Print the updated query vector
print("\nUpdated Query Vector:")
print(updated_query_vector)
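As a hand check of the sample values above (not the captured program output): the relevant centroid is the mean of documents 1 and 3, i.e. [0.25, 0.45, 0.05, 0.2]; the non-relevant centroid is document 4, i.e. [0.3, 0.1, 0.1, 0.4]; so the updated query vector works out to [0.1875, 0.5675, 0.0075, 0.23].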

Output:
Task 7
Implement the Knuth-Morris-Pratt (KMP) pattern matching algorithm.

def compute_lps(pattern):
    lps = [0] * len(pattern)
    j = 0  # Length of the previous longest prefix suffix
    for i in range(1, len(pattern)):
        while j > 0 and pattern[i] != pattern[j]:
            j = lps[j - 1]
        if pattern[i] == pattern[j]:
            j += 1
        lps[i] = j
    return lps

def kmp_search(text, pattern):
    n = len(text)
    m = len(pattern)
    lps = compute_lps(pattern)
    i = 0  # Index for text[]
    j = 0  # Index for pattern[]
    flag = False
    while i < n:
        if pattern[j] == text[i]:
            i += 1
            j += 1
        if j == m:
            print(f"Pattern found at index {i - j}")
            j = lps[j - 1]
            flag = True
        elif i < n and pattern[j] != text[i]:
            if j != 0:
                j = lps[j - 1]
            else:
                i += 1
    if not flag:
        print("Pattern not found!")

# Example usage:
text = "I am a computer science under graduate"
pattern = "computer s"
kmp_search(text, pattern)
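The example pattern above has no repeated prefix, so its LPS table is all zeros. An additional check where the failure function actually comes into play (an illustrative call, not from the lab output):

# For pattern "abab" the LPS table is [0, 0, 1, 2], and the search below
# reports matches at indices 0 and 5.
kmp_search("ababcabab", "abab")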

Output:
Task 8
Implement relevance feedback for YouTube search.

import random

# Sample search results (replace with actual YouTube API data)
search_results = [
    "Video 1: How to Bake a Cake",
    "Video 2: Python Programming Tutorial",
    "Video 3: Travel Vlog - Exploring Paris",
    "Video 4: Machine Learning Basics",
    "Video 5: Cooking Delicious Pasta"
]

while 1:
    user_query = input("Enter your search query\nenter 'end' to break:")
    if user_query == "end":
        exit()
    print("Search Results:")
    for i, result in enumerate(search_results, start=1):
        print(f"{i}. {result}")
    selected_video_index = int(input("Enter the number of the video you watched (0 to exit): "))
    if selected_video_index == 0:
        exit()
    user_feedback = input("Did you like the video? (y/n): ")
    if user_feedback.lower() == "y":
        relevance_increase = 0.1
    else:
        relevance_increase = -0.1
    relevance_scores = [random.uniform(0, 1) for _ in search_results]
    relevance_scores[selected_video_index - 1] += relevance_increase
    sorted_results = [result for _, result in sorted(zip(relevance_scores, search_results), reverse=True)]
    print("\nUpdated Search Results:")
    for i, result in enumerate(sorted_results, start=1):
        print(f"{i}. {result}")
    search_results = sorted_results
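Because relevance_scores is redrawn at random on every pass, the ±0.1 feedback only influences the ordering for the current iteration. A minimal sketch of how the scores could persist across searches instead (an alternative variant, not the submitted version; it reuses the names defined above):

# Initialise once, before the loop, so feedback accumulates across searches:
relevance_scores = [0.5 for _ in search_results]

# Inside the loop, apply only the feedback update (no re-randomisation) and
# sort the scores together with the results so they stay aligned:
relevance_scores[selected_video_index - 1] += relevance_increase
pairs = sorted(zip(relevance_scores, search_results), reverse=True)
relevance_scores = [score for score, _ in pairs]
search_results = [result for _, result in pairs]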

Output:
Task 9
Implement the PageRank algorithm for information retrieval.

import numpy as np

web_graph = np.array([
    [0, 1, 1, 1],
    [1, 0, 0, 1],
    [0, 1, 0, 1],
    [1, 0, 1, 0]
])

num_pages = len(web_graph)
pagerank = np.ones(num_pages) / num_pages
print(pagerank)

damping_factor = 0.65
tolerance = 1e-6

while True:
    new_pagerank = np.zeros(num_pages)
    for i in range(num_pages):
        for j in range(num_pages):
            if web_graph[j, i] == 1:
                new_pagerank[i] += pagerank[j] / np.sum(web_graph[j, :])
    new_pagerank = (1 - damping_factor) / num_pages + damping_factor * new_pagerank
    if np.sum(np.abs(new_pagerank - pagerank)) < tolerance:
        break
    pagerank = new_pagerank

sorted_pages = np.argsort(pagerank)[::-1]
for i in sorted_pages:
    print(f"Page {i + 1}: PageRank = {pagerank[i]:.4f}")

Output:
Task 10
Implement one of the similarity measurement algorithms.

from collections import Counter
import math

documents = [
    "This is the first document.",
    "This document is the second.",
    "And this is the third one.",
    "Is this the first document?"
]

def preprocess(doc):
    return doc.lower().split()

def compute_tfidf(document, query_terms):
    document_terms = preprocess(document)
    tf = {term: document_terms.count(term) / len(document_terms) for term in query_terms}
    idf = {term: math.log(len(documents) / (1 + sum(
        1 for doc in documents if term in preprocess(doc))) + 1) for term in query_terms}
    tfidf = {term: tf[term] * idf[term] for term in query_terms}
    return tfidf

def cosine_similarity(doc_tfidf, query_tfidf):
    dot_product = sum(doc_tfidf[term] * query_tfidf[term] for term in query_tfidf)
    doc_magnitude = math.sqrt(sum(val ** 2 for val in doc_tfidf.values()))
    query_magnitude = math.sqrt(sum(val ** 2 for val in query_tfidf.values()))
    return dot_product / (doc_magnitude * query_magnitude)

query = "This is similarity doc"
query_terms = preprocess(query)
query_tfidf = compute_tfidf(query, query_terms)

document_similarities = [cosine_similarity(compute_tfidf(doc, query_terms), query_tfidf)
                         for doc in documents]

ranked_documents = sorted(enumerate(document_similarities), key=lambda x: x[1], reverse=True)

for idx, similarity in ranked_documents:
    print(f"Document {idx + 1}: Similarity Score = {similarity:.2f}")
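If scikit-learn is available, the ranking can be cross-checked with its TF-IDF vectoriser and cosine similarity (a sketch only, reusing documents and query from above; sklearn's function is aliased to avoid shadowing the cosine_similarity defined earlier):

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity as sk_cosine

vectorizer = TfidfVectorizer()
doc_matrix = vectorizer.fit_transform(documents)   # TF-IDF vectors for the four documents
query_vec = vectorizer.transform([query])          # TF-IDF vector for the query
scores = sk_cosine(query_vec, doc_matrix)[0]
for idx in scores.argsort()[::-1]:
    print(f"Document {idx + 1}: {scores[idx]:.2f}")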
Output:
