tp2 Indexation

You might also like

Download as txt, pdf, or txt
Download as txt, pdf, or txt
You are on page 1 of 1

ex tp2

import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
import re

class TextAnalyzer:
    """Build term-count and TF-IDF tables for a collection of texts.

    Both vectorizers use :meth:`tokenize` as their tokenizer, so the two
    result tables are built from the same tokens.
    """

    def __init__(self, text_list):
        """Store the documents and create the two (unfitted) vectorizers.

        Args:
            text_list: Iterable of document strings. It is passed unchanged
                to each vectorizer's ``fit_transform``.
        """
        self.text_list = text_list
        # token_pattern=None: scikit-learn ignores its built-in token pattern
        # whenever a custom tokenizer is supplied and emits a UserWarning
        # unless the pattern is explicitly disabled.
        self.vectorizer = CountVectorizer(
            tokenizer=self.tokenize, token_pattern=None
        )
        self.tfidf_vectorizer = TfidfVectorizer(
            tokenizer=self.tokenize, token_pattern=None
        )

    def tokenize(self, text):
        """Return the lower-cased word tokens of *text*.

        A token is a maximal run of ``\\w`` characters, so single-character
        words and digit runs are kept, while punctuation and whitespace act
        as separators.

        Args:
            text: The string to split.

        Returns:
            A list of token strings in order of appearance (empty for text
            without any word character).
        """
        return re.findall(r'\b\w+\b', text.lower())

    def _vectorize(self, vectorizer):
        """Fit *vectorizer* on the stored texts and tabulate the result."""
        matrix = vectorizer.fit_transform(self.text_list)
        # The feature names are read only after fitting; they label the
        # columns of the densified matrix.
        return pd.DataFrame(
            matrix.toarray(), columns=vectorizer.get_feature_names_out()
        )

    def analyze_count(self):
        """Fit the count vectorizer on the stored texts.

        Returns:
            A ``pandas.DataFrame`` holding the dense output of
            ``CountVectorizer.fit_transform``, with columns named after the
            vectorizer's feature names.
        """
        return self._vectorize(self.vectorizer)

    def analyze_tfidf(self):
        """Fit the TF-IDF vectorizer on the stored texts.

        Returns:
            A ``pandas.DataFrame`` holding the dense output of
            ``TfidfVectorizer.fit_transform``, with columns named after the
            vectorizer's feature names.
        """
        return self._vectorize(self.tfidf_vectorizer)

# Driver: run both analyses over the corpus and print the resulting tables.
# NOTE(review): `textes` is not defined in the visible part of the file;
# presumably it is a list of document strings set up beforehand - confirm.
analyzer = TextAnalyzer(textes)

# Term counts: label and table go out in a single print call, each
# followed by a newline.
count_results = analyzer.analyze_count()
print("CountVectorizer results:", count_results, sep="\n")

# TF-IDF weights: the label's leading "\n" leaves a blank line between the
# two tables.
tfidf_results = analyzer.analyze_tfidf()
print("\nTfidfVectorizer results:", tfidf_results, sep="\n")

You might also like