TP1 NLP
# https://www.analyticsvidhya.com/blog/2022/02/machine-learning-techniques-for-text-representation-in-
import numpy as np
# Define the corpus of text
corpus = ['This is the first document #',
'This docment is the second document.',
'And this is the third one!'
]
print(corpus)
type(corpus)
['This is the first document #', 'This docment is the second document.', 'And this is the third one!']
list
import re
# Remove punctuation and digits; note that str(corpus) flattens the list into a single string
Corpus_without_Car = re.sub('[.!?#123456789]', '', str(corpus))
print(Corpus_without_Car)
['This is the first document ', 'This docment is the second document', 'And this is the third one']
Corpus_minus = Corpus_without_Car.lower()
print(Corpus_minus)
['this is the first document ', 'this docment is the second document', 'and this is the third one']
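A per-document variant worth noting (a sketch, not in the original TP): cleaning each string in the list separately keeps the corpus as a Python list and avoids the ast.literal_eval round-trip used further down.
# Same character filter and lowercasing, applied document by document
clean_corpus = [re.sub('[.!?#123456789]', '', doc).lower() for doc in corpus]
print(clean_corpus)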
# Fix the spelling mistake 'docment' -> 'document'
corpus_correct = Corpus_minus.replace('docment', 'document')
print(corpus_correct)
['this is the first document ', 'this document is the second document', 'and this is the third one']
import spacy
nlp = spacy.load('en_core_web_sm')  # assumes the standard small English model is installed
c = nlp(corpus_correct)
Corpus_lem=(" ".join([token.lemma_ for token in c]))
print(Corpus_lem)
type(Corpus_lem)
[ ' this be the first document ' , ' this document be the second document ' , ' and this be the third one ' ]
str
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
def remove_stop_words(review):
    # Keep only the words that are not in NLTK's English stop-word list
    stop_words = stopwords.words('english')
    review_minus_sw = [word for word in review.split() if word not in stop_words]
    return ' '.join(review_minus_sw)
Corpus_withoutstopwords=remove_stop_words(str(Corpus_lem))
print(Corpus_withoutstopwords)
type(Corpus_withoutstopwords)
[ ' first document ' , ' document second document ' , ' third one ' ]
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data] Unzipping corpora/stopwords.zip.
str
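Why 'be' disappears from the output: NLTK's English stop-word list contains 'be', 'this', 'the' and 'and', so the lemmatized auxiliaries are filtered out. A quick check:
sw = stopwords.words('english')
print(len(sw))                   # 179 on recent NLTK releases
print('be' in sw, 'and' in sw)   # True True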
import ast
corpus=ast.literal_eval(Corpus_withoutstopwords)
type(corpus)
print(corpus)
[' first document ', ' document second document ', ' third one ']
print(corpus[0].split())
print(corpus[1].split())
print(corpus[2].split())
['first', 'document']
['document', 'second', 'document']
['third', 'one']
# Bag Of Words
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer()
Bow = cv.fit_transform(corpus)
# Vocabulary
cv.vocabulary_
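For reference (recomputed from this corpus; the notebook output was not preserved): CountVectorizer assigns column indices alphabetically, so cv.vocabulary_ here is {'document': 0, 'first': 1, 'one': 2, 'second': 3, 'third': 4}.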
pd.DataFrame(Bow.toarray(), columns=cv.get_feature_names_out())
   document  first  one  second  third
0         1      1    0       0      0
1         2      0    0       1      0
2         0      0    1       0      1
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf_vectorizer = TfidfVectorizer(norm=None)
X = tfidf_vectorizer.fit_transform(corpus)
pd.DataFrame(X.toarray(), columns=tfidf_vectorizer.get_feature_names_out())
#tfidf_vectorizer.vocabulary_
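As a sanity check (not part of the original TP), scikit-learn's default smooth idf, idf(t) = ln((1 + n) / (1 + df(t))) + 1, can be reproduced by hand; with norm=None each cell of X is simply count * idf:
import numpy as np
n_docs = 3
idf_document = np.log((1 + n_docs) / (1 + 2)) + 1  # df('document') = 2 -> ~1.2877
idf_rest = np.log((1 + n_docs) / (1 + 1)) + 1      # df = 1 for every other term -> ~1.6931
print(idf_document, idf_rest)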
# One-Hot Encoding: each word becomes a binary vector over the CountVectorizer vocabulary
# (reconstructed: the original cell was only partially preserved)
vocab = cv.get_feature_names_out()
one_hot_vectors = []
for sentence in corpus:
    sentence_vectors = [[1 if w == word else 0 for w in vocab] for word in sentence.split()]
    one_hot_vectors.append(sentence_vectors)
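With this reconstruction, the word 'first' in the first sentence maps to [0, 1, 0, 0, 0]: a 1 in the 'first' column of the alphabetical vocabulary and 0 everywhere else.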
# Word2Vect
import string
import nltk
from nltk.corpus import brown
from gensim.models import Word2Vec
from sklearn.decomposition import PCA
from matplotlib import pyplot
nltk.download("brown")
# Brown was the first million-word electronic corpus of English,
# created in 1961 at Brown University.
document = brown.sents()
data = []
for sent in document:
    new_sent = []
    for word in sent:
        new_word = word.lower()
        if new_word[0] not in string.punctuation:
            new_sent.append(new_word)
    if len(new_sent) > 0:
        data.append(new_sent)
# Creating Word2Vec model
model = Word2Vec(
    sentences=data,
    vector_size=50,
    window=10,
    epochs=20,
)
print(new_sent)
['from', 'what', 'i', 'was', 'able', 'to', 'gauge', 'in', 'a', 'swift', 'greedy', 'glance',
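A quick, informal way to eyeball the trained embeddings (results vary between runs, since Word2Vec training is stochastic):
# Nearest neighbours of 'france' in the Brown-trained vector space
print(model.wv.most_similar('france', topn=3))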
vectors = []
for sentence in corpus:
    sentence_vectors = []
    #print(sentence)
    for word in sentence.split():
        vector = model.wv[word]
        sentence_vectors.append(vector)
    vectors.append(sentence_vectors)
    #print(sentence_vectors)
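A common follow-up, sketched here though it is not in the original TP: average the word vectors of each sentence to get one fixed-size embedding per sentence, usable by a downstream classifier.
# One 50-dimensional vector per sentence (numpy was imported as np at the top)
sentence_embeddings = [np.mean(sent_vecs, axis=0) for sent_vecs in vectors]
print(sentence_embeddings[0].shape)  # (50,)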
#Visualizing data
words = ["france", "germany", "india", "truck", "boat", "road"]
X = model.wv[words]
pca = PCA(n_components=2)
result = pca.fit_transform(X)
pyplot.scatter(result[:, 0], result[:, 1])
for i, word in enumerate(words):
    pyplot.annotate(word, xy=(result[i, 0], result[i, 1]))
pyplot.show()
#Visualizing data
words = ["plane", "car", "boat" , "house", "cat", "dog", "pet","china", "russia", "moscow","house"]
X = model.wv[words]
pca = PCA(n_components=2)
result = pca.fit_transform(X)
pyplot.scatter(result[:, 0], result[:, 1])
for i, word in enumerate(words):
    pyplot.annotate(word, xy=(result[i, 0], result[i, 1]))
pyplot.show()
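Optional check on the same pca object: how much of the variance the 2-D projection actually keeps; with 50-dimensional vectors this is usually well below 1.
print(pca.explained_variance_ratio_.sum())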