
text-processing

March 24, 2024

[1]: import nltk

#tokenizing
from nltk.tokenize import WordPunctTokenizer
from nltk.tokenize import word_tokenize
from nltk.tokenize import sent_tokenize
from nltk.tokenize import RegexpTokenizer

#stopwords
from nltk.corpus import stopwords

#regexp
import re

# pandas dataframe
import pandas as pd

#import count vectorizer
from sklearn.feature_extraction.text import CountVectorizer

[2]: nltk.download()

showing info https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/index.xml

[2]: True

[3]: #load the data used in the book examples into the Python environment:

from nltk.book import *

*** Introductory Examples for the NLTK Book ***


Loading text1, …, text9 and sent1, …, sent9
Type the name of the text or sentence to view it.
Type: 'texts()' or 'sents()' to list the materials.
text1: Moby Dick by Herman Melville 1851
text2: Sense and Sensibility by Jane Austen 1811
text3: The Book of Genesis
text4: Inaugural Address Corpus

text5: Chat Corpus
text6: Monty Python and the Holy Grail
text7: Wall Street Journal
text8: Personals Corpus
text9: The Man Who Was Thursday by G . K . Chesterton 1908
This command loaded 9 of the text examples available from the corpora package (only
a small number of them!). It assigned the variable names text1 through text9 to these
examples. If you type a variable name, you get a description of that text.
[4]: text1

[4]: <Text: Moby Dick by Herman Melville 1851>

Note that the first sentence of Moby Dick is “Call me Ishmael.” and that
this sentence has already been separated into tokens in the variable sent1.
[5]: #The variables sent1 through sent9 have been set to be a list of tokens of the first sentence of each text.

sent1

[5]: ['Call', 'me', 'Ishmael', '.']

[ ]:

0.1 Counting
#gives the total number of tokens (words and punctuation) in the text

len(text1)

[8]: 260819

[7]: #to find out how many unique words there are, not counting repetitions (the set of distinct tokens)

sorted(set(text1))

#Or we can just find the length of such a list (here computed for text3):


len(sorted(set(text3)))

[7]: 2789

[12]: #Or we can print just the first 30 words in the sorted list:
sorted(set(text3))[:30]

[12]: ['!',
"'",

'(',
')',
',',
',)',
'.',
'.)',
':',
';',
';)',
'?',
'?)',
'A',
'Abel',
'Abelmizraim',
'Abidah',
'Abide',
'Abimael',
'Abimelech',
'Abr',
'Abrah',
'Abraham',
'Abram',
'Accad',
'Achbor',
'Adah',
'Adam',
'Adbeel',
'Admah']

[13]: #to count how many times the word 'Moby' appears in text1
text1.count("Moby")

[13]: 84
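
A related measure from the NLTK book is lexical diversity: the number of distinct tokens divided by the total number of tokens. A minimal sketch (not executed here), using text3 loaded above:

[ ]: # lexical diversity: distinct tokens / total tokens (sketch, not run here)
def lexical_diversity(text):
    return len(set(text)) / len(text)

lexical_diversity(text3)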

[ ]:

0.2 Processing Text


Let's use the Gutenberg corpus. NLTK includes a small selection of texts from the Project
Gutenberg electronic text archive, which contains some 25,000 free electronic books.
[19]: # You can then view some books obtained from the Gutenberg on-line book project:
nltk.corpus.gutenberg.fileids()

[19]: ['austen-emma.txt',
'austen-persuasion.txt',
'austen-sense.txt',
'bible-kjv.txt',
'blake-poems.txt',
'bryant-stories.txt',
'burgess-busterbrown.txt',
'carroll-alice.txt',
'chesterton-ball.txt',
'chesterton-brown.txt',
'chesterton-thursday.txt',
'edgeworth-parents.txt',
'melville-moby_dick.txt',
'milton-paradise.txt',
'shakespeare-caesar.txt',
'shakespeare-hamlet.txt',
'shakespeare-macbeth.txt',
'whitman-leaves.txt']

[22]: #view the first file

file1 = nltk.corpus.gutenberg.fileids()[0]
file1

[22]: 'austen-emma.txt'

[33]: #We can get the original text, using the raw function:

emmatext = nltk.corpus.gutenberg.raw(file1)

emmatext[:120] #Since this is quite long, we can view part of it, e.g. the first 120 characters

#len(emmatext) #count of total characters

[33]: '[Emma by Jane Austen 1816]\n\nVOLUME I\n\nCHAPTER I\n\n\nEmma Woodhouse,
handsome, clever, and rich, with a comfortable home\nan'

0.3 1. Tokenization
NLTK has several tokenizers available to break the raw text into tokens; we will use one that
separates by white space and also by special characters (punctuation)

0.3.1 Word Tokenization


[32]: emmatokens = nltk.wordpunct_tokenize(emmatext)

len(emmatokens) #total token count

#view the tokenized text
emmatokens[:15]

[32]: ['[',
'Emma',
'by',
'Jane',
'Austen',
'1816',
']',
'VOLUME',
'I',
'CHAPTER',
'I',
'Emma',
'Woodhouse',
',',
'handsome']

[34]: #Example
sentence="I have no money at the moment."
nltk.wordpunct_tokenize(sentence)

[34]: ['I', 'have', 'no', 'money', 'at', 'the', 'moment', '.']

[36]: #using word_tokenize


text = "God is Great! I won a lottery."
print(word_tokenize(text))

['God', 'is', 'Great', '!', 'I', 'won', 'a', 'lottery', '.']

[39]: #using the RegexpTokenizer


text="God is Great! I won a lottery."
tokenizer = RegexpTokenizer(r"[\w']+")

tokenizer.tokenize(text)

[39]: ['God', 'is', 'Great', 'I', 'won', 'a', 'lottery']
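
The WordPunctTokenizer class imported above behaves like wordpunct_tokenize; the main practical difference from word_tokenize shows up on contractions. A small sketch (not executed here):

[ ]: # word_tokenize should keep a contraction as two tokens (e.g. 'Do', "n't"),
# while WordPunctTokenizer splits at every punctuation boundary (e.g. 'Don', "'", 't')
contraction = "Don't stop."
print(word_tokenize(contraction))
print(WordPunctTokenizer().tokenize(contraction))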

0.3.2 Sentence Tokenization


[44]: #by using nltk library

text1 = "God is Great! I won a lottery."


print(sent_tokenize(text1))

['God is Great!', 'I won a lottery.']

[45]: text2="Let us understand the difference between sentence & word tokenizer. It is going to be a simple example."

text2.split(". ")

[45]: ['Let us understand the difference between sentence & word tokenizer',
'It is going to be a simple example.']
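
Applying sent_tokenize to the same string shows the difference: the sentence tokenizer keeps the terminating punctuation on each sentence, whereas the naive split(". ") drops it from all but the last. A sketch (not executed here):

[ ]: # sent_tokenize should keep the final '.' on each sentence
print(sent_tokenize(text2))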

[ ]:

0.4 2. Stopwords
[19]: #look at the stopwords list
print(stopwords.words('english'))

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're",
"you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he',
'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's",
'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what',
'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is',
'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having',
'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or',
'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about',
'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above',
'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under',
'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why',
'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some',
'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', 'too', 'very',
's', 't', 'can', 'will', 'just', 'don', "don't", 'should', "should've", 'now',
'd', 'll', 'm', 'o', 're', 've', 'y', 'ain', 'aren', "aren't", 'couldn',
"couldn't", 'didn', "didn't", 'doesn', "doesn't", 'hadn', "hadn't", 'hasn',
"hasn't", 'haven', "haven't", 'isn', "isn't", 'ma', 'mightn', "mightn't",
'mustn', "mustn't", 'needn', "needn't", 'shan', "shan't", 'shouldn',
"shouldn't", 'wasn', "wasn't", 'weren', "weren't", 'won', "won't", 'wouldn',
"wouldn't"]

[49]: sent1="""He determined to drop his litigation with the monastry, and relinguish his claims to the wood-cuting and
fishery rihgts at once. He was the more ready to do this becuase the rights had become much less valuable, and he had
indeed the vaguest idea where the wood and river in question were."""

# set of stop words


stop_words = set(stopwords.words('english'))

# tokens of words

word_tokens = word_tokenize(sent1)
word_tokens[:10]

[49]: ['He',
'determined',
'to',
'drop',
'his',
'litigation',
'with',
'the',
'monastry',
',']

[50]: #empty list to collect the text with the stop words removed
filtered_sentence = []

# filter out the stop words
for w in word_tokens:
    if w not in stop_words:
        filtered_sentence.append(w)

print("\nOriginal Sentence \n")
print(" ".join(word_tokens))

print("\nFiltered Sentence \n")
print(" ".join(filtered_sentence))

Original Sentence

He determined to drop his litigation with the monastry , and relinguish his
claims to the wood-cuting and fishery rihgts at once . He was the more ready to
do this becuase the rights had become much less valuable , and he had indeed the
vaguest idea where the wood and river in question were .

Filtered Sentence

He determined drop litigation monastry , relinguish claims wood-cuting fishery
rihgts . He ready becuase rights become much less valuable , indeed vaguest idea
wood river question .
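
Note that 'He' survived the filter: the stopword list is all lowercase and the membership test above is case-sensitive. A sketch of case-insensitive filtering (not executed here), lowercasing each token before the test:

[ ]: # lowercase each token before checking it against the stopword set
filtered_ci = [w for w in word_tokens if w.lower() not in stop_words]
print(" ".join(filtered_ci))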

0.5 3. Normalizing Word Formats
0.6 3.1 Lowercase
[51]: #Example
sentence="I have NO moNey at tHE moMent."

sentence.lower()

[51]: 'i have no money at the moment.'

[53]: #for already tokenized text


emmawords = [w.lower() for w in emmatokens]
emmawords[:15]

[53]: ['[',
'emma',
'by',
'jane',
'austen',
'1816',
']',
'volume',
'i',
'chapter',
'i',
'emma',
'woodhouse',
',',
'handsome']

[55]: # We can further view the words by getting the unique words and sorting them:
emmavocab = sorted(set(emmawords))
emmavocab[:10]

[55]: ['!', '!"', '!"--', "!'", "!'--", '!)--', '!--', '!--"', '!--(', '!--`']

[25]: #uppercased
sentence.upper()

#check Table 3.2 for more operations on strings (Chapter 3, Section 3.2 of the NLTK book)

[25]: 'I HAVE NO MONEY AT THE MOMENT.'

[26]: #select a set of words from the tokenized text


shortwords=emmawords[11:111]
shortwords[:10]

[26]: ['emma', 'woodhouse', ',', 'handsome', ',', 'clever', ',', 'and', 'rich', ',']

[27]: #get the frequency count for each word

shortdist = FreqDist(shortwords)
shortdist.keys()

for word in shortdist.keys():
    print(word, shortdist[word])

emma 1
woodhouse 1
, 8
handsome 1
clever 1
and 4
rich 1
with 2
a 3
comfortable 1
home 1
happy 1
disposition 1
seemed 1
to 3
unite 1
some 1
of 6
the 4
best 1
blessings 1
existence 1
; 2
had 3
lived 1
nearly 1
twenty 1
- 1
one 1
years 1
in 2
world 1
very 2
little 1
distress 1
or 1
vex 1
her 4

. 2
she 1
was 1
youngest 1
two 1
daughters 1
most 1
affectionate 1
indulgent 1
father 1
consequence 1
sister 1
' 1
s 1
marriage 1
been 1
mistress 1
his 1
house 1
from 1
early 1
period 1
mother 1
died 1
too 1
long 1
ago 1
for 1
have 1
more 1
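
FreqDist also provides most_common(), which returns (word, count) pairs sorted by frequency and is usually more convenient than looping over keys(). A sketch (not executed here):

[ ]: # the 10 most frequent tokens in shortwords, as (word, count) pairs
shortdist.most_common(10)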

0.7 3.2 Stemming


NLTK includes several off-the-shelf stemmers; two of them, Porter and Lancaster, are described
in Section 3.6 of the NLTK book. To use these stemmers, you first create them:
[58]: porter = nltk.PorterStemmer()
lancaster = nltk.LancasterStemmer()

[61]: #regular-cased text- porter stemmer


emmaregstem = [porter.stem(t) for t in emmatokens]
emmaregstem[1:10]

[61]: ['emma', 'by', 'jane', 'austen', '1816', ']', 'volum', 'i', 'chapter']

[30]: #lowercased text


emmalowerstem = [porter.stem(t) for t in emmawords]
emmalowerstem[1:10]

[30]: ['emma', 'by', 'jane', 'austen', '1816', ']', 'volum', 'i', 'chapter']

[31]: #regular-cased text - lancaster stemmer


emmaregstem1 = [lancaster.stem(t) for t in emmatokens]
emmaregstem1[1:10]

[31]: ['emm', 'by', 'jan', 'aust', '1816', ']', 'volum', 'i', 'chapt']

[70]: #building our own simple stemmer by making a list of suffixes to take off.

def stem(word):
    for suffix in ['ing', 'ly', 'ed', 'ious', 'ies', 'ive', 'es', 's']:
        if word.endswith(suffix):
            return word[:-len(suffix)]
    return word

#try the above stemmer with 'friends'


stem('friends')

[70]: 'friend'

[71]: stem('relatives')

[71]: 'relativ'

0.8 3.3 Lemmatizing


NLTK has a lemmatizer that uses the WordNet on-line thesaurus as a dictionary to look up the
root form (lemma) of each word.
[74]: wnl = nltk.WordNetLemmatizer()
emmalemma=[wnl.lemmatize(t) for t in emmawords]
emmalemma[1:10]

[74]: ['emma', 'by', 'jane', 'austen', '1816', ']', 'volume', 'i', 'chapter']

[82]: wnl.lemmatize('friends')
wnl.lemmatize('relatives')

[82]: 'relative'
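
By default lemmatize() treats every word as a noun; passing a part-of-speech tag changes the WordNet lookup. A sketch (not executed here), using 'v' for verbs:

[ ]: wnl.lemmatize('running')            # noun lookup: left unchanged
wnl.lemmatize('running', pos='v')   # verb lookup: should reduce to 'run'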

0.9 4. Regex: Regular Expressions for Detecting Word Patterns


[83]: emmatext[:100]

[83]: '[Emma by Jane Austen 1816]\n\nVOLUME I\n\nCHAPTER I\n\n\nEmma Woodhouse,
handsome, clever, and rich, with a'

[85]: #use the replace function to replace all the newline characters '\n' with a space ' '
newemmatext = emmatext.replace('\n', ' ')
shorttext = newemmatext[:150]

#redefined the variable shorttext to be the first 150 characters, without newlines
shorttext

[85]: '[Emma by Jane Austen 1816] VOLUME I CHAPTER I Emma Woodhouse, handsome,
clever, and rich, with a comfortable home and happy disposition, seemed to'

[38]: pword = re.compile(r'\w+')


#re.findall will find the substrings that matched anywhere in the string.

re.findall(pword, shorttext)

[38]: ['Emma',
'by',
'Jane',
'Austen',
'1816',
'VOLUME',
'I',
'CHAPTER',
'I',
'Emma',
'Woodhouse',
'handsome',
'clever',
'and',
'rich',
'with',
'a',
'comfortable',
'home',
'and',
'happy',
'disposition',
'seemed',
'to']

[39]: #re.findall will find the substrings that matched anywhere in the specialtext.
specialtext = 'U.S.A. poster-print costs $12.40, with 10% off.'
re.findall(pword, specialtext)

[39]: ['U', 'S', 'A', 'poster', 'print', 'costs', '12', '40', 'with', '10', 'off']

[40]: #to match tokens, allowing words to have an internal hyphen
ptoken = re.compile(r'(\w+(-\w+)*)')
re.findall(ptoken, specialtext)

[40]: [('U', ''),
('S', ''),
('A', ''),
('poster-print', '-print'),
('costs', ''),
('12', ''),
('40', ''),
('with', ''),
('10', ''),
('off', '')]

[41]: #to match abbreviations that might have a "." inside, like U.S.A.
#We only allow capital letters
pabbrev = re.compile(r'(([A-Z]\.)+)')
re.findall(pabbrev, specialtext)

[41]: [('U.S.A.', 'A.')]

[42]: #combine it with the words pattern to match either words or abbreviations
ptoken = re.compile(r'(\w+(-\w+)*|([A-Z]\.)+)')
re.findall(ptoken, specialtext)

[42]: [('U', '', ''),
('S', '', ''),
('A', '', ''),
('poster-print', '-print', ''),
('costs', '', ''),
('12', '', ''),
('40', '', ''),
('with', '', ''),
('10', '', ''),
('off', '', '')]

[43]: #the order of the matching patterns really matters if an earlier pattern
#matches part of what you want to match
ptoken = re.compile(r'(([A-Z]\.)+|\w+(-\w+)*)')
re.findall(ptoken, specialtext)

[43]: [('U.S.A.', 'A.', ''),
('poster-print', '', '-print'),
('costs', '', ''),
('12', '', ''),
('40', '', ''),
('with', '', ''),
('10', '', ''),
('off', '', '')]

[44]: #add an expression to match the currency


ptoken = re.compile(r'(([A-Z]\.)+|\w+(-\w+)*|\$?\d+(\.\d+)?)')
re.findall(ptoken, specialtext)

[44]: [('U.S.A.', 'A.', '', ''),
('poster-print', '', '-print', ''),
('costs', '', '', ''),
('$12.40', '', '', '.40'),
('with', '', '', ''),
('10', '', '', ''),
('off', '', '', '')]

Regular Expression Tokenizer using the NLTK Tokenizer


[45]: #We can make a prettier regular expression that is equivalent to this one by
#using Python’s triple quotes that allows a string to go across multiple
#lines without adding a newline character

# abbreviations, e.g. U.S.A.
# words with internal hyphens
# currency, like $12.40

ptoken = re.compile(r'''([A-Z]\.)+
    | \w+(-\w+)*
    | \$?\d+(\.\d+)?
    ''', re.X)
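
The compiled verbose pattern can be tested on specialtext just like the earlier ones; findall still returns tuples because of the capturing groups. A sketch (not executed here):

[ ]: re.findall(ptoken, specialtext)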

[46]: # abbreviations, e.g. U.S.A.
# words with optional internal hyphens
# currency and percentages, e.g. $12.40, 82%
# ellipsis, e.g. hmm..., well...
# these are separate tokens; includes ], [

pattern = r''' (?x) [A-Z][a-z]+\.| (?:[A-Z]\.)+|
    | \w+(?:-\w+)*
    | \$?\d+(?:\.\d+)?%?
    | \.\.\.
    | [][.,;"'?():-_']'''

[47]: nltk.regexp_tokenize(shorttext[:30], pattern)

[47]: ['',
'[',

'',
'Emma',
'',
'',
'by',
'',
'',
'Jane',
'',
'',
'Austen',
'',
'',
'1816',
'',
']',
'',
'',
'',
'VO',
'']

[48]: nltk.regexp_tokenize(specialtext, pattern)

[48]: ['U.S.A.',
'',
'',
'poster-print',
'',
'',
'costs',
'',
'',
'$12.40',
'',
',',
'',
'',
'with',
'',
'',
'10',
'',
'',
'',
'off',
'',

'.',
'']

https://www.nltk.org/book/ch03.html#tab-re-symbols

0.10 Document-Term Matrix (DTM)


[87]: # Let's start with a 'toy' corpus
CORPUS = [
'the sky is blue',
'sky is blue and sky is beautiful',
'the beautiful sky is so blue',
'i love blue cheese'
]

[90]: #assign the count vectorizer to a variable


countvectorizer=CountVectorizer()

DTM=pd.DataFrame(countvectorizer.fit_transform(CORPUS).toarray(),
columns=countvectorizer.get_feature_names_out(),index=None)

DTM

[90]:    and  beautiful  blue  cheese  is  love  sky  so  the
      0    0          0     1       0   1     0    1   0    1
      1    1          1     1       0   2     0    2   0    0
      2    0          1     1       0   1     0    1   1    1
      3    0          0     1       1   0     1    0   0    0
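
Each row of the DTM is one document and each column one vocabulary term, so column sums give corpus-wide term frequencies. A sketch (not executed here):

[ ]: # total count of each term across the whole toy corpus
DTM.sum(axis=0).sort_values(ascending=False)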

[ ]:

