
text-processing

March 24, 2024

[1]: import nltk

#tokenizing
from nltk.tokenize import WordPunctTokenizer
from nltk.tokenize import word_tokenize
from nltk.tokenize import sent_tokenize
from nltk.tokenize import RegexpTokenizer

#stopwords
from nltk.corpus import stopwords

#regexp
import re

# pandas dataframe
import pandas as pd

#import count vectorizer
from sklearn.feature_extraction.text import CountVectorizer

[2]: nltk.download()

showing info https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/index.xml

[2]: True

[3]: #load the data used in the book examples into the Python environment:

from nltk.book import *

*** Introductory Examples for the NLTK Book ***


Loading text1, …, text9 and sent1, …, sent9
Type the name of the text or sentence to view it.
Type: 'texts()' or 'sents()' to list the materials.
text1: Moby Dick by Herman Melville 1851
text2: Sense and Sensibility by Jane Austen 1811
text3: The Book of Genesis
text4: Inaugural Address Corpus

text5: Chat Corpus
text6: Monty Python and the Holy Grail
text7: Wall Street Journal
text8: Personals Corpus
text9: The Man Who Was Thursday by G . K . Chesterton 1908
This command loaded 9 of the text examples available from the corpora package (only
a small number of them!). It assigned the variable names text1 through text9 to these
examples. If you type a variable name, you get a description of that text.
[4]: text1

[4]: <Text: Moby Dick by Herman Melville 1851>

Note that the first sentence of Moby Dick is “Call me Ishmael.” and that
this sentence has already been separated into tokens in the variable sent1.
[5]: #The variables sent1 through sent9 have been set to be a list of tokens of the first sentence of each text.

sent1

[5]: ['Call', 'me', 'Ishmael', '.']

[ ]:

0.1 Counting
#gives the total number of tokens (words and punctuation) in the text

len(text1)

[8]: 260819

[7]: #to find out how many unique words there are, not counting repetitions (the set of distinct tokens)

sorted(set(text1))

#Or we can just find the length of such a list (here computed for text3):


len(sorted(set(text3)))

[7]: 2789

[12]: #Or we can print just the first 30 words in the sorted list:
sorted(set(text3))[:30]

[12]: ['!',
"'",

'(',
')',
',',
',)',
'.',
'.)',
':',
';',
';)',
'?',
'?)',
'A',
'Abel',
'Abelmizraim',
'Abidah',
'Abide',
'Abimael',
'Abimelech',
'Abr',
'Abrah',
'Abraham',
'Abram',
'Accad',
'Achbor',
'Adah',
'Adam',
'Adbeel',
'Admah']

[13]: #to count how many times the word 'Moby' appears in text1
text1.count("Moby")

[13]: 84
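
A related measure from the NLTK book is lexical diversity: the number of distinct tokens divided by the total number of tokens. A minimal sketch (not executed here), using text3 loaded above:

[ ]: # lexical diversity: distinct tokens / total tokens (sketch, not run here)
def lexical_diversity(text):
    return len(set(text)) / len(text)

lexical_diversity(text3)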

[ ]:

0.2 Processing Text


Let's use the Gutenberg corpus. NLTK includes a small selection of texts from the Project
Gutenberg electronic text archive, which contains some 25,000 free electronic books.
[19]: # You can then view some books obtained from the Gutenberg on-line book project:
nltk.corpus.gutenberg.fileids()

[19]: ['austen-emma.txt',
'austen-persuasion.txt',
'austen-sense.txt',
'bible-kjv.txt',
'blake-poems.txt',
'bryant-stories.txt',
'burgess-busterbrown.txt',
'carroll-alice.txt',
'chesterton-ball.txt',
'chesterton-brown.txt',
'chesterton-thursday.txt',
'edgeworth-parents.txt',
'melville-moby_dick.txt',
'milton-paradise.txt',
'shakespeare-caesar.txt',
'shakespeare-hamlet.txt',
'shakespeare-macbeth.txt',
'whitman-leaves.txt']

[22]: #view the first file

file1 = nltk.corpus.gutenberg.fileids()[0]
file1

[22]: 'austen-emma.txt'

[33]: #We can get the original text, using the raw function:

emmatext = nltk.corpus.gutenberg.raw(file1)

emmatext[:120] #Since this is quite long, we can view part of it, e.g. the first 120 characters

#len(emmatext) #count of total characters

[33]: '[Emma by Jane Austen 1816]\n\nVOLUME I\n\nCHAPTER I\n\n\nEmma Woodhouse,
handsome, clever, and rich, with a comfortable home\nan'

0.3 1. Tokenization
NLTK has several tokenizers available to break the raw text into tokens; we will use one that
separates by white space and also by special characters (punctuation)

0.3.1 Word Tokenization


[32]: emmatokens = nltk.wordpunct_tokenize(emmatext)

len(emmatokens) #total token count

#view the tokenized text
emmatokens[:15]

[32]: ['[',
'Emma',
'by',
'Jane',
'Austen',
'1816',
']',
'VOLUME',
'I',
'CHAPTER',
'I',
'Emma',
'Woodhouse',
',',
'handsome']

[34]: #Example
sentence="I have no money at the moment."
nltk.wordpunct_tokenize(sentence)

[34]: ['I', 'have', 'no', 'money', 'at', 'the', 'moment', '.']

[36]: #using word_tokenize


text = "God is Great! I won a lottery."
print(word_tokenize(text))

['God', 'is', 'Great', '!', 'I', 'won', 'a', 'lottery', '.']

[39]: #using the RegexpTokenizer


text="God is Great! I won a lottery."
tokenizer = RegexpTokenizer(r"[\w']+")

tokenizer.tokenize(text)

[39]: ['God', 'is', 'Great', 'I', 'won', 'a', 'lottery']
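
The WordPunctTokenizer class imported above behaves like wordpunct_tokenize; the main practical difference from word_tokenize shows up on contractions. A small sketch (not executed here):

[ ]: # word_tokenize should keep a contraction as two tokens (e.g. 'Do', "n't"),
# while WordPunctTokenizer splits at every punctuation boundary (e.g. 'Don', "'", 't')
contraction = "Don't stop."
print(word_tokenize(contraction))
print(WordPunctTokenizer().tokenize(contraction))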

0.3.2 Sentence Tokenization


[44]: #by using nltk library

text1 = "God is Great! I won a lottery."


print(sent_tokenize(text1))

['God is Great!', 'I won a lottery.']

[45]: text2="Let us understand the difference between sentence & word tokenizer. It is going to be a simple example."

text2.split(". ")

[45]: ['Let us understand the difference between sentence & word tokenizer',
'It is going to be a simple example.']
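
Applying sent_tokenize to the same string shows the difference: the sentence tokenizer keeps the terminating punctuation on each sentence, whereas the naive split(". ") drops it from all but the last. A sketch (not executed here):

[ ]: # sent_tokenize should keep the final '.' on each sentence
print(sent_tokenize(text2))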

[ ]:

0.4 2. Stopwords
[19]: #look at the stopwords list
print(stopwords.words('english'))

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're",
"you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he',
'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's",
'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what',
'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is',
'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having',
'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or',
'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about',
'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above',
'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under',
'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why',
'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some',
'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', 'too', 'very',
's', 't', 'can', 'will', 'just', 'don', "don't", 'should', "should've", 'now',
'd', 'll', 'm', 'o', 're', 've', 'y', 'ain', 'aren', "aren't", 'couldn',
"couldn't", 'didn', "didn't", 'doesn', "doesn't", 'hadn', "hadn't", 'hasn',
"hasn't", 'haven', "haven't", 'isn', "isn't", 'ma', 'mightn', "mightn't",
'mustn', "mustn't", 'needn', "needn't", 'shan', "shan't", 'shouldn',
"shouldn't", 'wasn', "wasn't", 'weren', "weren't", 'won', "won't", 'wouldn',
"wouldn't"]

[49]: sent1="""He determined to drop his litigation with the monastry, and relinguish his claims to the wood-cuting and
fishery rihgts at once. He was the more ready to do this becuase the rights had become much less valuable, and he had
indeed the vaguest idea where the wood and river in question were."""

# set of stop words


stop_words = set(stopwords.words('english'))

# tokens of words

word_tokens = word_tokenize(sent1)
word_tokens[:10]

[49]: ['He',
'determined',
'to',
'drop',
'his',
'litigation',
'with',
'the',
'monastry',
',']

[50]: #empty list to collect the text with the stop words removed
filtered_sentence = []

# filter out the stop words
for w in word_tokens:
    if w not in stop_words:
        filtered_sentence.append(w)

print("\nOriginal Sentence \n")
print(" ".join(word_tokens))

print("\nFiltered Sentence \n")
print(" ".join(filtered_sentence))

Original Sentence

He determined to drop his litigation with the monastry , and relinguish his
claims to the wood-cuting and fishery rihgts at once . He was the more ready to
do this becuase the rights had become much less valuable , and he had indeed the
vaguest idea where the wood and river in question were .

Filtered Sentence

He determined drop litigation monastry , relinguish claims wood-cuting fishery
rihgts . He ready becuase rights become much less valuable , indeed vaguest idea
wood river question .
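
Note that 'He' survived the filter: the stopword list is all lowercase and the membership test above is case-sensitive. A sketch of case-insensitive filtering (not executed here), lowercasing each token before the test:

[ ]: # lowercase each token before checking it against the stopword set
filtered_ci = [w for w in word_tokens if w.lower() not in stop_words]
print(" ".join(filtered_ci))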

0.5 3. Normalizing Word Formats
0.6 3.1 Lowercase
[51]: #Example
sentence="I have NO moNey at tHE moMent."

sentence.lower()

[51]: 'i have no money at the moment.'

[53]: #for already tokenized text


emmawords = [w.lower() for w in emmatokens]
emmawords[:15]

[53]: ['[',
'emma',
'by',
'jane',
'austen',
'1816',
']',
'volume',
'i',
'chapter',
'i',
'emma',
'woodhouse',
',',
'handsome']

[55]: # We can further view the words by getting the unique words and sorting them:
emmavocab = sorted(set(emmawords))
emmavocab[:10]

[55]: ['!', '!"', '!"--', "!'", "!'--", '!)--', '!--', '!--"', '!--(', '!--`']

[25]: #uppercased
sentence.upper()

#check Table 3.2 for more operations on strings (Chapter 3, Section 3.2 of the NLTK book)

[25]: 'I HAVE NO MONEY AT THE MOMENT.'

[26]: #select a set of words from the tokenized text


shortwords=emmawords[11:111]
shortwords[:10]

[26]: ['emma', 'woodhouse', ',', 'handsome', ',', 'clever', ',', 'and', 'rich', ',']

[27]: #get the frequency count for each word

shortdist = FreqDist(shortwords)
shortdist.keys()

for word in shortdist.keys():
    print(word, shortdist[word])

emma 1
woodhouse 1
, 8
handsome 1
clever 1
and 4
rich 1
with 2
a 3
comfortable 1
home 1
happy 1
disposition 1
seemed 1
to 3
unite 1
some 1
of 6
the 4
best 1
blessings 1
existence 1
; 2
had 3
lived 1
nearly 1
twenty 1
- 1
one 1
years 1
in 2
world 1
very 2
little 1
distress 1
or 1
vex 1
her 4

. 2
she 1
was 1
youngest 1
two 1
daughters 1
most 1
affectionate 1
indulgent 1
father 1
consequence 1
sister 1
' 1
s 1
marriage 1
been 1
mistress 1
his 1
house 1
from 1
early 1
period 1
mother 1
died 1
too 1
long 1
ago 1
for 1
have 1
more 1
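
FreqDist also provides most_common(), which returns (word, count) pairs sorted by frequency and is usually more convenient than looping over keys(). A sketch (not executed here):

[ ]: # the 10 most frequent tokens in shortwords, as (word, count) pairs
shortdist.most_common(10)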

0.7 3.2 Stemming


NLTK includes several off-the-shelf stemmers; two of them, Porter and Lancaster, are described
in Section 3.6 of the NLTK book. To use these stemmers, you first create them:
[58]: porter = nltk.PorterStemmer()
lancaster = nltk.LancasterStemmer()

[61]: #regular-cased text- porter stemmer


emmaregstem = [porter.stem(t) for t in emmatokens]
emmaregstem[1:10]

[61]: ['emma', 'by', 'jane', 'austen', '1816', ']', 'volum', 'i', 'chapter']

[30]: #lowercased text


emmalowerstem = [porter.stem(t) for t in emmawords]
emmalowerstem[1:10]

[30]: ['emma', 'by', 'jane', 'austen', '1816', ']', 'volum', 'i', 'chapter']

[31]: #regular-cased text - lancaster stemmer


emmaregstem1 = [lancaster.stem(t) for t in emmatokens]
emmaregstem1[1:10]

[31]: ['emm', 'by', 'jan', 'aust', '1816', ']', 'volum', 'i', 'chapt']

[70]: #building our own simple stemmer by making a list of suffixes to take off.

def stem(word):
    for suffix in ['ing', 'ly', 'ed', 'ious', 'ies', 'ive', 'es', 's']:
        if word.endswith(suffix):
            return word[:-len(suffix)]
    return word

#try the above stemmer with 'friends'


stem('friends')

[70]: 'friend'

[71]: stem('relatives')

[71]: 'relativ'

0.8 3.3 Lemmatizing


NLTK has a lemmatizer that uses the WordNet on-line thesaurus as a dictionary to look up the
root form (lemma) of each word.
[74]: wnl = nltk.WordNetLemmatizer()
emmalemma=[wnl.lemmatize(t) for t in emmawords]
emmalemma[1:10]

[74]: ['emma', 'by', 'jane', 'austen', '1816', ']', 'volume', 'i', 'chapter']

[82]: wnl.lemmatize('friends')
wnl.lemmatize('relatives')

[82]: 'relative'
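
By default lemmatize() treats every word as a noun; passing a part-of-speech tag changes the WordNet lookup. A sketch (not executed here), using 'v' for verbs:

[ ]: wnl.lemmatize('running')            # noun lookup: left unchanged
wnl.lemmatize('running', pos='v')   # verb lookup: should reduce to 'run'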

0.9 4. Regex: Regular Expressions for Detecting Word Patterns


[83]: emmatext[:100]

[83]: '[Emma by Jane Austen 1816]\n\nVOLUME I\n\nCHAPTER I\n\n\nEmma Woodhouse,
handsome, clever, and rich, with a'

[85]: #use the replace function to replace all the newline characters '\n' with a space ' '
newemmatext = emmatext.replace('\n', ' ')
shorttext = newemmatext[:150]

#redefined the variable shorttext to be the first 150 characters, without newlines
shorttext

[85]: '[Emma by Jane Austen 1816] VOLUME I CHAPTER I Emma Woodhouse, handsome,
clever, and rich, with a comfortable home and happy disposition, seemed to'

[38]: pword = re.compile(r'\w+')


#re.findall will find the substrings that matched anywhere in the string.

re.findall(pword, shorttext)

[38]: ['Emma',
'by',
'Jane',
'Austen',
'1816',
'VOLUME',
'I',
'CHAPTER',
'I',
'Emma',
'Woodhouse',
'handsome',
'clever',
'and',
'rich',
'with',
'a',
'comfortable',
'home',
'and',
'happy',
'disposition',
'seemed',
'to']

[39]: #re.findall will find the substrings that matched anywhere in the specialtext.
specialtext = 'U.S.A. poster-print costs $12.40, with 10% off.'
re.findall(pword, specialtext)

[39]: ['U', 'S', 'A', 'poster', 'print', 'costs', '12', '40', 'with', '10', 'off']

[40]: #to match tokens, allowing words to have an internal hyphen
ptoken = re.compile(r'(\w+(-\w+)*)')
re.findall(ptoken, specialtext)

[40]: [('U', ''),
('S', ''),
('A', ''),
('poster-print', '-print'),
('costs', ''),
('12', ''),
('40', ''),
('with', ''),
('10', ''),
('off', '')]

[41]: #to match abbreviations that might have a "." inside, like U.S.A.
#We only allow capital letters
pabbrev = re.compile(r'(([A-Z]\.)+)')
re.findall(pabbrev, specialtext)

[41]: [('U.S.A.', 'A.')]

[42]: #combine it with the words pattern to match either words or abbreviations
ptoken = re.compile(r'(\w+(-\w+)*|([A-Z]\.)+)')
re.findall(ptoken, specialtext)

[42]: [('U', '', ''),
('S', '', ''),
('A', '', ''),
('poster-print', '-print', ''),
('costs', '', ''),
('12', '', ''),
('40', '', ''),
('with', '', ''),
('10', '', ''),
('off', '', '')]

[43]: #the order of the matching patterns really matters if an earlier pattern
#matches part of what you want to match
ptoken = re.compile(r'(([A-Z]\.)+|\w+(-\w+)*)')
re.findall(ptoken, specialtext)

[43]: [('U.S.A.', 'A.', ''),
('poster-print', '', '-print'),
('costs', '', ''),
('12', '', ''),
('40', '', ''),
('with', '', ''),
('10', '', ''),
('off', '', '')]

[44]: #add an expression to match the currency


ptoken = re.compile(r'(([A-Z]\.)+|\w+(-\w+)*|\$?\d+(\.\d+)?)')
re.findall(ptoken, specialtext)

[44]: [('U.S.A.', 'A.', '', ''),
('poster-print', '', '-print', ''),
('costs', '', '', ''),
('$12.40', '', '', '.40'),
('with', '', '', ''),
('10', '', '', ''),
('off', '', '', '')]

Regular Expression Tokenizer using the NLTK Tokenizer


[45]: #We can make a prettier regular expression that is equivalent to this one by
#using Python’s triple quotes that allows a string to go across multiple
#lines without adding a newline character

# abbreviations, e.g. U.S.A.
# words with internal hyphens
# currency, like $12.40

ptoken = re.compile(r'''([A-Z]\.)+
    | \w+(-\w+)*
    | \$?\d+(\.\d+)?
    ''', re.X)
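
The compiled verbose pattern can be tested on specialtext just like the earlier ones; findall still returns tuples because of the capturing groups. A sketch (not executed here):

[ ]: re.findall(ptoken, specialtext)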

[46]: # abbreviations, e.g. U.S.A.
# words with optional internal hyphens
# currency and percentages, e.g. $12.40, 82%
# ellipsis, e.g. hmm..., well...
# these are separate tokens; includes ], [

pattern = r''' (?x) [A-Z][a-z]+\.| (?:[A-Z]\.)+|
    | \w+(?:-\w+)*
    | \$?\d+(?:\.\d+)?%?
    | \.\.\.
    | [][.,;"'?():-_']'''

[47]: nltk.regexp_tokenize(shorttext[:30], pattern)

[47]: ['',
'[',

'',
'Emma',
'',
'',
'by',
'',
'',
'Jane',
'',
'',
'Austen',
'',
'',
'1816',
'',
']',
'',
'',
'',
'VO',
'']

[48]: nltk.regexp_tokenize(specialtext, pattern)

[48]: ['U.S.A.',
'',
'',
'poster-print',
'',
'',
'costs',
'',
'',
'$12.40',
'',
',',
'',
'',
'with',
'',
'',
'10',
'',
'',
'',
'off',
'',

'.',
'']

https://www.nltk.org/book/ch03.html#tab-re-symbols

0.10 Document-Term Matrix (DTM)


[87]: # Let's start with a 'toy' corpus
CORPUS = [
'the sky is blue',
'sky is blue and sky is beautiful',
'the beautiful sky is so blue',
'i love blue cheese'
]

[90]: #assign the count vectorizer to a variable


countvectorizer=CountVectorizer()

DTM=pd.DataFrame(countvectorizer.fit_transform(CORPUS).toarray(),
columns=countvectorizer.get_feature_names_out(),index=None)

DTM

[90]:    and  beautiful  blue  cheese  is  love  sky  so  the
      0    0          0     1       0   1     0    1   0    1
      1    1          1     1       0   2     0    2   0    0
      2    0          1     1       0   1     0    1   1    1
      3    0          0     1       1   0     1    0   0    0
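
Each row of the DTM is one document and each column one vocabulary term, so column sums give corpus-wide term frequencies. A sketch (not executed here):

[ ]: # total count of each term across the whole toy corpus
DTM.sum(axis=0).sort_values(ascending=False)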

[ ]:

