
Code 1: libraries

# Basic libraries to manipulate data


import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Libraries for text mining


import nltk
from nltk.tokenize import RegexpTokenizer
from nltk.stem import WordNetLemmatizer, PorterStemmer
from nltk.corpus import stopwords
import re
from collections import Counter
from wordcloud import WordCloud
from ast import literal_eval
from textblob import TextBlob #sentiment
from PIL import Image
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('stopwords')

# Libraries for predictive models


from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC, LinearSVC, NuSVC
from sklearn.naive_bayes import BernoulliNB, MultinomialNB
from sklearn.linear_model import LinearRegression
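
The TextBlob import above is flagged for sentiment, although no sentiment code appears in the later listings. As a minimal sketch of how TextBlob is typically used for that purpose (the example sentence is purely illustrative, not from the original):

# Illustrative sentiment check with TextBlob
example = TextBlob("The product arrived quickly and works great")
print(example.sentiment.polarity)      # -1 (negative) to 1 (positive)
print(example.sentiment.subjectivity)  # 0 (objective) to 1 (subjective)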

Code 2: text cleaning


# Lowercase conversion, HTML tag removal, URL removal, digit removal,
# tokenization, stopword removal, stemming, and lemmatization.

def preprocess(sentence):
    sentence = str(sentence)
    sentence = sentence.lower()
    sentence = sentence.replace('{html}', "")       # drop literal '{html}' markers
    cleanr = re.compile('<.*?>')
    cleantext = re.sub(cleanr, '', sentence)        # strip HTML tags
    rem_url = re.sub(r'http\S+', '', cleantext)     # strip URLs
    rem_num = re.sub('[0-9]+', '', rem_url)         # strip digits
    tokenizer = RegexpTokenizer(r'\w+')
    tokens = tokenizer.tokenize(rem_num)            # keep word tokens only
    filtered_words = [w for w in tokens
                      if len(w) > 2 and w not in stopwords.words('english')]
    stem_words = [PorterStemmer().stem(w) for w in filtered_words]
    lemma_words = [WordNetLemmatizer().lemmatize(w) for w in stem_words]
    return " ".join(lemma_words)                    # return the stemmed and lemmatized text
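
A typical way to apply this cleaner is over a pandas text column. The DataFrame and column names below are assumptions for illustration, not part of the original listing:

# Hypothetical usage: df and its 'review' column are assumed to exist
df['clean_text'] = df['review'].map(preprocess)
print(df[['review', 'clean_text']].head())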

Code 3: TF-IDF vectorization
tokenizer = RegexpTokenizer(r'\w+')
vectorizer = TfidfVectorizer(ngram_range=(1, 2), tokenizer=tokenizer.tokenize)
full_text = list(x_train.values) + list(x_valid.values)
vectorizer.fit(full_text)
train_vectorized = vectorizer.transform(x_train)
test_vectorized = vectorizer.transform(x_valid)
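
With the TF-IDF matrices in place, any of the classifiers imported in Code 1 can be fit on them. A minimal sketch with LogisticRegression, assuming y_train and y_valid hold the labels matching x_train and x_valid (those names are assumptions):

# Hypothetical follow-up: fit a classifier on the vectorized text
model = LogisticRegression(max_iter=1000)
model.fit(train_vectorized, y_train)
print(model.score(test_vectorized, y_valid))  # mean accuracy on the validation split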
