
TP1_NLP_Nettoyage_Vectoristaion.ipynb - Colaboratory https://colab.research.google.com/drive/1ElEK51PmCL2j_pKibcPiBb...

# https://www.analyticsvidhya.com/blog/2022/02/machine-learning-techniques-for-text-representation-in-

!pip install spacy


#!python3 -m spacy download fr_core_news_md
!python -m spacy download en_core_web_sm
import spacy
# Load the small English pipeline; the parser and NER are not needed for lemmatisation
nlp = spacy.load("en_core_web_sm", disable=['parser', 'ner'])

import numpy as np
# Define the corpus of text
# NB: "docment" is deliberately misspelled; the TextBlob step below corrects it.
corpus = ['This is the first document #',
          'This docment is the second document.',
          'And this is the third one!']
print(corpus)
type(corpus)

['This is the first document #', 'This docment is the second document.', 'And this is the third one!']
list

import re
# Remove punctuation and digits. Note: str(corpus) converts the list to its string
# representation, so the cleaning is applied to the repr of the whole list at once.
Corpus_without_Car = re.sub('[.!?#123456789]', "", str(corpus))
print(Corpus_without_Car)

['This is the first document ', 'This docment is the second document', 'And this is the third one']

Corpus_minus = Corpus_without_Car.lower()
print(Corpus_minus)

['this is the first document ', 'this docment is the second document', 'and this is the third one']

from textblob import TextBlob

# Spelling correction: fixes the deliberate "docment" typo
corpus_correct = TextBlob(Corpus_minus).correct()
print(corpus_correct)

['this is the first document ', 'this document is the second document', 'and this is the third one']
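TextBlob's correct() fixes simple misspellings word by word; a minimal single-word check (expected output assumed):

print(TextBlob("docment").correct())   # expected: document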

# Lemmatisation with spaCy (e.g. "is" -> "be")
c = nlp(str(corpus_correct))
Corpus_lem = " ".join([token.lemma_ for token in c])
print(Corpus_lem)
type(Corpus_lem)

[ ' this be the first document ' , ' this document be the second document ' , ' and this be the third one ' ]
str
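To see what the spaCy lemmatizer does token by token, a quick illustrative probe (the expected lemmas are assumptions, not captured output):

for token in nlp("she was reading the documents"):
    print(token.text, "->", token.lemma_)
# expected: she -> she, was -> be, reading -> read, the -> the, documents -> document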


import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords

def remove_stop_words(review):
    stop_words = stopwords.words('english')
    review_minus_sw = [word for word in review.split() if word not in stop_words]
    return ' '.join(review_minus_sw)

Corpus_withoutstopwords = remove_stop_words(Corpus_lem)
print(Corpus_withoutstopwords)
type(Corpus_withoutstopwords)

[ ' first document ' , ' document second document ' , ' third one ' ]
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data] Unzipping corpora/stopwords.zip.
str
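For reference, NLTK's English stopword list is sizable; a quick look (the exact count depends on the NLTK release):

print(len(stopwords.words('english')))   # ~179 in recent NLTK releases
print(stopwords.words('english')[:5])    # ['i', 'me', 'my', 'myself', 'we']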

import ast
# Parse the cleaned string back into a real Python list
corpus = ast.literal_eval(Corpus_withoutstopwords)
type(corpus)
print(corpus)

[' first document ', ' document second document ', ' third one ']

print(corpus[0].split())
print(corpus[1].split())
print(corpus[2].split())

['first', 'document']
['document', 'second', 'document']
['third', 'one']

# Bag Of Words
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer()
Bow = cv.fit_transform(corpus)
# Vocabulary learned by the vectorizer
cv.vocabulary_
pd.DataFrame(Bow.toarray(), columns=cv.get_feature_names_out())

   document  first  one  second  third
0         1      1    0       0      0
1         2      0    0       1      0
2         0      0    1       0      1
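The column order above comes from cv.vocabulary_, which maps each term to its column index; scikit-learn assigns indices alphabetically (key order in the printed dict may vary):

print(cv.vocabulary_)
# expected: {'document': 0, 'first': 1, 'one': 2, 'second': 3, 'third': 4}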


# Bag Of Words: N-grams (here, bigrams)


from sklearn.feature_extraction.text import CountVectorizer
cv1 = CountVectorizer(ngram_range=(2, 2))   # (2, 2): extract bigrams only
Bow = cv1.fit_transform(corpus)
cv1.vocabulary_
pd.DataFrame(Bow.toarray(), columns=cv1.get_feature_names_out())

   document second  first document  second document  third one
0                0               1                0          0
1                1               0                1          0
2                0               0                0          1
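ngram_range is inclusive on both ends, so (1, 2) would mix unigrams and bigrams in a single vocabulary; a short sketch of what that yields here (expected feature names assumed):

cv12 = CountVectorizer(ngram_range=(1, 2))
cv12.fit(corpus)
print(cv12.get_feature_names_out())
# expected: ['document' 'document second' 'first' 'first document' 'one'
#            'second' 'second document' 'third' 'third one']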

import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf_vectorizer = TfidfVectorizer(norm=None)   # norm=None: keep raw tf-idf weights
X = tfidf_vectorizer.fit_transform(corpus)
pd.DataFrame(X.toarray(), columns=tfidf_vectorizer.get_feature_names_out())
# tfidf_vectorizer.vocabulary_

   document     first       one    second     third
0  1.287682  1.693147  0.000000  0.000000  0.000000
1  2.575364  0.000000  0.000000  1.693147  0.000000
2  0.000000  0.000000  1.693147  0.000000  1.693147
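With norm=None these weights can be checked by hand: scikit-learn's default (smooth_idf=True) computes tf x idf with idf(t) = ln((1 + n) / (1 + df(t))) + 1, where n is the number of documents and df(t) the number containing t. A sanity check under those assumptions:

import numpy as np
n = 3                                         # documents in the corpus
print(np.log((1 + n) / (1 + 1)) + 1)          # ~1.693147: idf of terms appearing in 1 document
print(2 * (np.log((1 + n) / (1 + 2)) + 1))    # ~2.575364: 'document' counted twice in doc 1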

# One-Hot Encoding

# Create a set of unique words in the corpus
unique_words = set()
for sentence in corpus:
    for word in sentence.split():
        unique_words.add(word.lower())
# print(unique_words)

# Create a dictionary to map each unique word to an index
word_to_index = {}
for i, word in enumerate(unique_words):
    word_to_index[word] = i
# print(word_to_index)

# Create one-hot encoded vectors for each word in the corpus
one_hot_vectors = []
for sentence in corpus:
    sentence_vectors = []
    for word in sentence.split():
        vector = np.zeros(len(unique_words))
        vector[word_to_index[word.lower()]] = 1
        sentence_vectors.append(vector)
    one_hot_vectors.append(sentence_vectors)

# Print the one-hot encoded vectors for the first sentence
print("One-hot encoded vectors for the first sentence:")
for vector in one_hot_vectors[0]:
    print(vector)

One-hot encoded vectors for the first sentence:
[0. 0. 1. 0. 0.]
[0. 1. 0. 0. 0.]
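The same word-level encoding can be produced with scikit-learn; a minimal sketch (assumes scikit-learn >= 1.2, where the dense-output flag is named sparse_output; the encoder sorts categories alphabetically, so column order may differ from the manual version above):

from sklearn.preprocessing import OneHotEncoder
tokens = [[w.lower()] for s in corpus for w in s.split()]   # one token per row
encoder = OneHotEncoder(sparse_output=False)
print(encoder.fit_transform(tokens))   # one row per word, one column per unique word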

# Word2Vec
import string
import nltk
from nltk.corpus import brown
from gensim.models import Word2Vec
from sklearn.decomposition import PCA
from matplotlib import pyplot
nltk.download("brown")
# Brown was the first million-word electronic corpus of English,
# created in 1961 at Brown University.

document = brown.sents()
data = []
for sent in document:
    new_sent = []
    for word in sent:
        new_word = word.lower()
        if new_word[0] not in string.punctuation:
            new_sent.append(new_word)
    if len(new_sent) > 0:
        data.append(new_sent)

# Creating the Word2Vec model
model = Word2Vec(
    sentences=data,      # tokenized sentences from the Brown corpus
    vector_size=50,      # dimensionality of the word vectors
    window=10,           # context window size
    epochs=20,           # training passes over the corpus
)

# Vector for a word
print("Vector for first:")
print(model.wv["first"])
print()

# Finding the most similar words: clustering
print("3 words most similar to house")
words = model.wv.most_similar("house", topn=3)
for word in words:
    print(word)
print()


[nltk_data] Downloading package brown to /root/nltk_data...
[nltk_data] Unzipping corpora/brown.zip.
Vector for first:
[ 0.5427254 2.1625948 2.53315 -1.3183608 0.5042789 1.5566359
2.4983366 -1.6894214 -1.3073367 0.04183732 3.56399 -2.6840935
-3.1445699 1.4281458 -1.2328478 1.5474263 2.7416852 2.0436335
-0.14823055 -0.753534 -3.9907327 -3.0632262 -0.06643599 -1.3816391
-0.07032058 -0.76562387 0.22765635 2.4657087 2.4059327 -3.8330004
0.27207714 0.61375684 2.0443316 -1.383371 -2.5633101 2.3859336
-2.3591983 0.63201773 -3.441393 4.301292 -2.9129264 -0.23044603
-3.5020788 -0.41193867 0.44809148 2.4430702 1.2778424 -2.492245
-0.11138619 1.251281 ]

3 words most similar to house


('car', 0.6748865842819214)
('office', 0.641631543636322)
('front', 0.639775276184082)
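The trained vectors also support direct similarity queries and simple vector arithmetic; on a corpus as small as Brown the analogy results are noisy, so treat these as illustrative:

print(model.wv["first"].shape)               # (50,), as set by vector_size
print(model.wv.similarity("house", "car"))   # cosine similarity, ~0.67 per the list above
print(model.wv.most_similar(positive=["woman", "king"], negative=["man"], topn=3))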

# new_sent still holds the last Brown sentence processed in the loop above
print(new_sent)

['from', 'what', 'i', 'was', 'able', 'to', 'gauge', 'in', 'a', 'swift', 'greedy', 'glance',

vectors = []
for sentence in corpus:
    sentence_vectors = []
    # print(sentence)
    for word in sentence.split():
        vector = model.wv[word]   # KeyError if a word is missing from the vocabulary
        sentence_vectors.append(vector)
    vectors.append(sentence_vectors)
    # print(sentence_vectors)

# Print the matrix of sentences: vector of the first word of the first sentence
# print(vector)
print(vectors[0][0])

[ 0.5427254 2.1625948 2.53315 -1.3183608 0.5042789 1.5566359
2.4983366 -1.6894214 -1.3073367 0.04183732 3.56399 -2.6840935
-3.1445699 1.4281458 -1.2328478 1.5474263 2.7416852 2.0436335
-0.14823055 -0.753534 -3.9907327 -3.0632262 -0.06643599 -1.3816391
-0.07032058 -0.76562387 0.22765635 2.4657087 2.4059327 -3.8330004
0.27207714 0.61375684 2.0443316 -1.383371 -2.5633101 2.3859336
-2.3591983 0.63201773 -3.441393 4.301292 -2.9129264 -0.23044603
-3.5020788 -0.41193867 0.44809148 2.4430702 1.2778424 -2.492245
-0.11138619 1.251281 ]
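A common way to collapse these per-word vectors into a single fixed-size vector per sentence is to average them; a minimal sketch:

sentence_embeddings = [np.mean(sent_vecs, axis=0) for sent_vecs in vectors]
print(sentence_embeddings[0].shape)   # (50,): same dimensionality as the word vectors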

# Visualizing the embeddings in 2-D with PCA
words = ["france", "germany", "india", "truck", "boat", "road"]
X = model.wv[words]
pca = PCA(n_components=2)
result = pca.fit_transform(X)
pyplot.scatter(result[:, 0], result[:, 1])
for i, word in enumerate(words):
    pyplot.annotate(word, xy=(result[i, 0], result[i, 1]))
pyplot.show()

# Visualizing a second set of words
words = ["plane", "car", "boat", "house", "cat", "dog", "pet", "china", "russia", "moscow"]
X = model.wv[words]
pca = PCA(n_components=2)
result = pca.fit_transform(X)
pyplot.scatter(result[:, 0], result[:, 1])
for i, word in enumerate(words):
    pyplot.annotate(word, xy=(result[i, 0], result[i, 1]))
pyplot.show()
