TP1 NLP
# https://www.analyticsvidhya.com/blog/2022/02/machine-learning-techniques-for-text-representation-in-
import numpy as np
# Define the corpus of text
corpus = ['This is the first document #',
'This docment is the second document.',
'And this is the third one!'
]
print(corpus)
type(corpus)
['This is the first document #', 'This docment is the second document.', 'And this is the third one!']
list
import re
# Remove punctuation and digits; note that str(corpus) flattens the list into a single string
Corpus_without_Car = re.sub('[.!?#123456789]', '', str(corpus))
print(Corpus_without_Car)
['This is the first document ', 'This docment is the second document', 'And this is the third one']
Corpus_minus = Corpus_without_Car.lower()
print(Corpus_minus)
['this is the first document ', 'this docment is the second document', 'and this is the third one']
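A per-document variant worth noting (a sketch, not in the original TP): cleaning each string in the list separately keeps the corpus as a Python list and avoids the ast.literal_eval round-trip used further down.
# Same character filter and lowercasing, applied document by document
clean_corpus = [re.sub('[.!?#123456789]', '', doc).lower() for doc in corpus]
print(clean_corpus)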
# Fix the spelling mistake 'docment' -> 'document'
corpus_correct = Corpus_minus.replace('docment', 'document')
print(corpus_correct)
['this is the first document ', 'this document is the second document', 'and this is the third one']
import spacy
nlp = spacy.load('en_core_web_sm')  # assumes the standard small English model is installed
c = nlp(corpus_correct)
Corpus_lem=(" ".join([token.lemma_ for token in c]))
print(Corpus_lem)
type(Corpus_lem)
[ ' this be the first document ' , ' this document be the second document ' , ' and this be the third one ' ]
str
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
def remove_stop_words(review):
    # Keep only the words that are not in NLTK's English stop-word list
    stop_words = stopwords.words('english')
    review_minus_sw = [word for word in review.split() if word not in stop_words]
    return ' '.join(review_minus_sw)
Corpus_withoutstopwords=remove_stop_words(str(Corpus_lem))
print(Corpus_withoutstopwords)
type(Corpus_withoutstopwords)
[ ' first document ' , ' document second document ' , ' third one ' ]
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data] Unzipping corpora/stopwords.zip.
str
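Why 'be' disappears from the output: NLTK's English stop-word list contains 'be', 'this', 'the' and 'and', so the lemmatized auxiliaries are filtered out. A quick check:
sw = stopwords.words('english')
print(len(sw))                   # 179 on recent NLTK releases
print('be' in sw, 'and' in sw)   # True True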
import ast
corpus=ast.literal_eval(Corpus_withoutstopwords)
type(corpus)
print(corpus)
[' first document ', ' document second document ', ' third one ']
print(corpus[0].split())
print(corpus[1].split())
print(corpus[2].split())
['first', 'document']
['document', 'second', 'document']
['third', 'one']
# Bag Of Words
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer()
Bow = cv.fit_transform(corpus)
# Vocabulary
cv.vocabulary_
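For reference (recomputed from this corpus; the notebook output was not preserved): CountVectorizer assigns column indices alphabetically, so cv.vocabulary_ here is {'document': 0, 'first': 1, 'one': 2, 'second': 3, 'third': 4}.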
pd.DataFrame(Bow.toarray(), columns=cv.get_feature_names_out())
   document  first  one  second  third
0         1      1    0       0      0
1         2      0    0       1      0
2         0      0    1       0      1
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf_vectorizer = TfidfVectorizer(norm=None)
X = tfidf_vectorizer.fit_transform(corpus)
pd.DataFrame(X.toarray(), columns=tfidf_vectorizer.get_feature_names_out())
#tfidf_vectorizer.vocabulary_
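As a sanity check (not part of the original TP), scikit-learn's default smooth idf, idf(t) = ln((1 + n) / (1 + df(t))) + 1, can be reproduced by hand; with norm=None each cell of X is simply count * idf:
import numpy as np
n_docs = 3
idf_document = np.log((1 + n_docs) / (1 + 2)) + 1  # df('document') = 2 -> ~1.2877
idf_rest = np.log((1 + n_docs) / (1 + 1)) + 1      # df = 1 for every other term -> ~1.6931
print(idf_document, idf_rest)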
# One-Hot Encoding: each word becomes a binary vector over the CountVectorizer vocabulary
# (reconstructed: the original cell was only partially preserved)
vocab = cv.get_feature_names_out()
one_hot_vectors = []
for sentence in corpus:
    sentence_vectors = [[1 if w == word else 0 for w in vocab] for word in sentence.split()]
    one_hot_vectors.append(sentence_vectors)
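With this reconstruction, the word 'first' in the first sentence maps to [0, 1, 0, 0, 0]: a 1 in the 'first' column of the alphabetical vocabulary and 0 everywhere else.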
# Word2Vect
import string
import nltk
from nltk.corpus import brown
from gensim.models import Word2Vec
from sklearn.decomposition import PCA
from matplotlib import pyplot
nltk.download("brown")
# Brown was the first million-word electronic corpus of English,
# created in 1961 at Brown University.
document = brown.sents()
data = []
for sent in document:
    new_sent = []
    for word in sent:
        new_word = word.lower()
        if new_word[0] not in string.punctuation:
            new_sent.append(new_word)
    if len(new_sent) > 0:
        data.append(new_sent)
# Creating Word2Vec model
model = Word2Vec(
    sentences=data,
    vector_size=50,
    window=10,
    epochs=20,
)
print(new_sent)
['from', 'what', 'i', 'was', 'able', 'to', 'gauge', 'in', 'a', 'swift', 'greedy', 'glance',
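A quick, informal way to eyeball the trained embeddings (results vary between runs, since Word2Vec training is stochastic):
# Nearest neighbours of 'france' in the Brown-trained vector space
print(model.wv.most_similar('france', topn=3))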
vectors = []
for sentence in corpus:
    sentence_vectors = []
    #print(sentence)
    for word in sentence.split():
        vector = model.wv[word]
        sentence_vectors.append(vector)
    vectors.append(sentence_vectors)
    #print(sentence_vectors)
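A common follow-up, sketched here though it is not in the original TP: average the word vectors of each sentence to get one fixed-size embedding per sentence, usable by a downstream classifier.
# One 50-dimensional vector per sentence (numpy was imported as np at the top)
sentence_embeddings = [np.mean(sent_vecs, axis=0) for sent_vecs in vectors]
print(sentence_embeddings[0].shape)  # (50,)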
#Visualizing data
words = ["france", "germany", "india", "truck", "boat", "road"]
X = model.wv[words]
pca = PCA(n_components=2)
result = pca.fit_transform(X)
pyplot.scatter(result[:, 0], result[:, 1])
for i, word in enumerate(words):
    pyplot.annotate(word, xy=(result[i, 0], result[i, 1]))
pyplot.show()
#Visualizing data
words = ["plane", "car", "boat" , "house", "cat", "dog", "pet","china", "russia", "moscow","house"]
X = model.wv[words]
pca = PCA(n_components=2)
result = pca.fit_transform(X)
pyplot.scatter(result[:, 0], result[:, 1])
for i, word in enumerate(words):
    pyplot.annotate(word, xy=(result[i, 0], result[i, 1]))
pyplot.show()
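Optional check on the same pca object: how much of the variance the 2-D projection actually keeps; with 50-dimensional vectors this is usually well below 1.
print(pca.explained_variance_ratio_.sum())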