
J.K. INSTITUTE OF APPLIED PHYSICS AND TECHNOLOGY

Natural Language Processing


Assignment

NAME:- VIVEK KUMAR ARYA


ROLL NO.:- 15
SEMESTER:- B.Tech 8th Semester
BRANCH:- Computer Science & Engineering
1. Write a program to find the minimum edit distance between two input strings.

def editDistance(str1, str2, m, n):
    # If one string is empty, the distance is the length of the other
    if m == 0:
        return n
    if n == 0:
        return m

    # Last characters match: no cost, recurse on the remainders
    if str1[m-1] == str2[n-1]:
        return editDistance(str1, str2, m-1, n-1)

    # Last characters differ: try all three edits and take the cheapest
    return 1 + min(editDistance(str1, str2, m, n-1),    # Insert
                   editDistance(str1, str2, m-1, n),    # Remove
                   editDistance(str1, str2, m-1, n-1))  # Replace

str1 = "sunday"
str2 = "saturday"
print(editDistance(str1, str2, len(str1), len(str2)))
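
The plain recursion recomputes the same subproblems many times and runs in exponential time. A bottom-up dynamic-programming sketch runs in O(mn); editDistanceDP is an added illustration (same unit costs for insert, remove and replace as above):

def editDistanceDP(str1, str2):
    m, n = len(str1), len(str2)
    # dp[i][j] holds the edit distance between str1[:i] and str2[:j]
    dp = [[0] * (n + 1) for _ in range(m + 1)]
    for i in range(m + 1):
        for j in range(n + 1):
            if i == 0:
                dp[i][j] = j                          # insert all of str2[:j]
            elif j == 0:
                dp[i][j] = i                          # remove all of str1[:i]
            elif str1[i - 1] == str2[j - 1]:
                dp[i][j] = dp[i - 1][j - 1]           # characters match: no cost
            else:
                dp[i][j] = 1 + min(dp[i][j - 1],      # Insert
                                   dp[i - 1][j],      # Remove
                                   dp[i - 1][j - 1])  # Replace
    return dp[m][n]

print(editDistanceDP("sunday", "saturday"))  # prints 3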

2. Write a program to implement the SOUNDEX algorithm.


def soundex(query: str):
    query = query.lower()
    letters = [char for char in query if char.isalpha()]

    # A single-letter name is just padded with zeros
    if len(query) == 1:
        return query + "000"

    # Drop vowels and the letters y, h, w (everywhere except the first letter)
    to_remove = ('a', 'e', 'i', 'o', 'u', 'y', 'h', 'w')

    first_letter = letters[0]
    letters = [char for char in letters[1:] if char not in to_remove]

    if len(letters) == 0:
        return first_letter + "000"

    # Map the remaining consonants to their Soundex digits
    to_replace = {('b', 'f', 'p', 'v'): 1, ('c', 'g', 'j', 'k', 'q', 's', 'x', 'z'): 2,
                  ('d', 't'): 3, ('l',): 4, ('m', 'n'): 5, ('r',): 6}

    first_digit = next((value for group, value in to_replace.items()
                        if first_letter in group), None)
    letters = [value
               for char in letters
               for group, value in to_replace.items()
               if char in group]

    # Collapse runs of identical adjacent digits into a single digit
    letters = [char for ind, char in enumerate(letters)
               if ind == len(letters) - 1 or char != letters[ind + 1]]

    # If the first digit duplicates the code of the first letter, it is absorbed
    # by the letter; otherwise the letter is simply prepended
    if first_digit == letters[0]:
        letters[0] = query[0]
    else:
        letters.insert(0, query[0])

    first_letter = letters[0]
    letters = letters[1:]

    # Keep at most three digits, padding with zeros if necessary
    letters = [char for char in letters if isinstance(char, int)][:3]
    while len(letters) < 3:
        letters.append(0)
    letters.insert(0, first_letter)

    return "".join(str(l) for l in letters)

print(soundex("King"))
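
Two quick checks: "Robert" and "Rupert" should map to the same code, r163 (lowercase because the implementation above lowercases its input, and "King" gives k520):

print(soundex("Robert"))  # r163
print(soundex("Rupert"))  # r163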

3. Design a tokenizer, sentence segmenter and morphological analyser using any FST toolkit.
from nltk.tokenize import sent_tokenize, word_tokenize

text = "Natural language processing (NLP) is a field " + \
       "of computer science, artificial intelligence " + \
       "and computational linguistics concerned with " + \
       "the interactions between computers and human " + \
       "(natural) languages, and, in particular, " + \
       "concerned with programming computers to " + \
       "fruitfully process large natural language " + \
       "corpora. Challenges in natural language " + \
       "processing frequently involve natural " + \
       "language understanding, natural language " + \
       "generation (frequently from formal, machine" + \
       "-readable logical forms), connecting language " + \
       "and machine perception, managing human-" + \
       "computer dialog systems, or some combination " + \
       "thereof."

print(sent_tokenize(text))
print(word_tokenize(text))
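
NLTK covers the tokenizer and sentence-segmenter parts above, but it does not ship an FST toolkit for the morphological-analyser part. As a stand-in, a minimal sketch using NLTK's WordNetLemmatizer (a dictionary-based lemmatizer, not a true finite-state analyser; the word/POS pairs below are illustrative assumptions):

from nltk.stem import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()
# Each pair is (surface form, WordNet POS tag: n = noun, v = verb)
for word, pos in [('languages', 'n'), ('interactions', 'n'), ('concerned', 'v')]:
    print(word, '->', lemmatizer.lemmatize(word, pos=pos))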

5. Write a program to perform stemming on a given text file. Use Porter’s Stemmer.

from nltk.stem.porter import PorterStemmer

stemmer = PorterStemmer()

plurals = ['caresses', 'flies', 'dies', 'mules', 'denied',
           'died', 'agreed', 'owned', 'humbled', 'sized',
           'meeting', 'stating', 'siezing', 'itemization',
           'sensational', 'traditional', 'reference', 'colonizer',
           'plotted']

singles = [stemmer.stem(plural) for plural in plurals]
print(' '.join(singles))
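
The problem statement asks for a given text file; the word list above only exercises the stemmer. A minimal file-based sketch reusing the stemmer above (the filename input.txt is a placeholder):

from nltk.tokenize import word_tokenize

with open('input.txt', encoding='utf-8') as fh:
    words = word_tokenize(fh.read())
print(' '.join(stemmer.stem(w) for w in words))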

6. Use any tagger to tag a text file. Now write a program to find the most
likely tag in the tagged text.

import nltk

tokens = nltk.word_tokenize("where there is a will, there will be a way.")
print("Parts of Speech : ", nltk.pos_tag(tokens))

8. Write a program to check whether a given grammar is a Context Free Grammar (CFG) or not.

terminals = ['a']
nonterminals = ['S']

# Each production is a (left side, right side) pair; here S -> aS and S -> a
grammar = [('S', ['a', 'S']), ('S', ['a'])]

regular = True
leftRG = False
rightRG = False

for leftSide, rightSide in grammar:
    for nonterminal in nonterminals:
        if not (leftRG or rightRG):
            if len(rightSide) > 1:
                if nonterminal in rightSide[0]:
                    leftRG = True
                elif nonterminal in rightSide[-1]:
                    rightRG = True
                else:
                    regular = regular and not (nonterminal in rightSide)
        if rightRG:
            # In a right-regular grammar a nonterminal may only appear last
            regular = regular and not (nonterminal in rightSide[:-1])
        if leftRG:
            # In a left-regular grammar a nonterminal may only appear first
            regular = regular and not (nonterminal in rightSide[1:])

print(regular)
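
As a quick sanity check (an illustrative grammar, not part of the assignment input), replacing the grammar above with

grammar = [('S', ['a', 'S', 'a'])]  # S -> aSa: the nonterminal sits in the middle

and rerunning the script makes it print False: neither the left-regular nor the right-regular pattern matches, so the middle occurrence of S trips the final check.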

9. Write a program to convert a given CFG to CNF (Chomsky Normal Form).

from string import ascii_letters as letters
import copy
import re

# Remove large rules (more than 2 symbols in the right part, e.g. A -> BCD)
def large(rules, let, voc):
    # Work on a deep copy, as the dictionary's size changes during the process
    new_dict = copy.deepcopy(rules)
    for key in new_dict:
        values = new_dict[key]
        for i in range(len(values)):
            # Check if we have a rule violation
            if len(values[i]) > 2:
                # A -> BCD gives 1) A -> BE (if E is the first "free"
                # letter from the letters pool) and 2) E -> CD
                for j in range(0, len(values[i]) - 2):
                    if j == 0:
                        # replace the first rule
                        rules[key][i] = rules[key][i][0] + let[0]
                    else:
                        # add the new intermediate rules
                        rules.setdefault(new_key, []).append(values[i][j] + let[0])
                    voc.append(let[0])
                    # save the letter, as it'll be used in the next rule
                    new_key = copy.deepcopy(let[0])
                    # remove the letter from the free letters list
                    let.remove(let[0])
                # the last 2 symbols always remain the same
                rules.setdefault(new_key, []).append(values[i][-2:])
    return rules, let, voc

# Remove empty rules (A -> e)
def empty(rules, voc):
    # list with the keys of empty rules
    e_list = []
    # find the rules that produce the empty string and add their keys to the list
    new_dict = copy.deepcopy(rules)
    for key in new_dict:
        values = new_dict[key]
        for i in range(len(values)):
            # if the key gives an empty state and is not in the list, add it
            if values[i] == 'e' and key not in e_list:
                e_list.append(key)
                # remove the empty state
                rules[key].remove(values[i])
                # if the key doesn't contain any values, remove it from the dictionary
                if len(rules[key]) == 0:
                    if key in voc:
                        voc.remove(key)
                    rules.pop(key, None)
    # propagate the deleted empty rules
    new_dict = copy.deepcopy(rules)
    for key in new_dict:
        values = new_dict[key]
        for i in range(len(values)):
            # check for rules in the form A->BC or A->CB, where B is in e_list
            # and C is in the vocabulary
            if len(values[i]) == 2:
                # check for a rule in the form A->BC, excluding the case that
                # gives A->A as a result
                if values[i][0] in e_list and key != values[i][1]:
                    rules.setdefault(key, []).append(values[i][1])
                # check for a rule in the form A->CB, excluding the case that
                # gives A->A as a result
                if values[i][1] in e_list and key != values[i][0]:
                    if values[i][0] != values[i][1]:
                        rules.setdefault(key, []).append(values[i][0])
    return rules, voc

# Remove short rules (A -> B)
def short(rules, voc):
    # create a dictionary in the form letter: letter (at the beginning
    # D(A) = {A})
    D = dict(zip(voc, voc))
    # transform each value from string to list, to be able to insert more values
    for key in D:
        D[key] = list(D[key])
    # for every letter A of the vocabulary, if B->C, B in D(A) and C not in D(A),
    # add C to D(A)
    for letter in voc:
        for key in rules:
            if key in D[letter]:
                values = rules[key]
                for i in range(len(values)):
                    if len(values[i]) == 1 and values[i] not in D[letter]:
                        D.setdefault(letter, []).append(values[i])
    rules, D = short1(rules, D)
    return rules, D

def short1(rules, D):
    # remove the short rules (those whose right side has length 1)
    new_dict = copy.deepcopy(rules)
    for key in new_dict:
        values = new_dict[key]
        for i in range(len(values)):
            if len(values[i]) == 1:
                rules[key].remove(values[i])
                if len(rules[key]) == 0:
                    rules.pop(key, None)
    # replace each rule A->BC with A->B'C', where B' in D(B) and C' in D(C)
    for key in rules:
        values = rules[key]
        for i in range(len(values)):
            # search all possible B' in D(B)
            for j in D[values[i][0]]:
                # search all possible C' in D(C)
                for k in D[values[i][1]]:
                    # concatenate B' and C' and insert the new rule
                    if j + k not in values:
                        rules.setdefault(key, []).append(j + k)
    return rules, D

# Insert rules S->BC for every A->BC where A in D(S)-{S}
def final_rules(rules, D, S):
    for let in D[S]:
        # skip S itself and any key that has no rules left
        if let != S and let in rules:
            for v in rules[let]:
                if v not in rules.setdefault(S, []):
                    rules[S].append(v)
    return rules

# Print the rules
def print_rules(rules):
    for key in rules:
        values = rules[key]
        for i in range(len(values)):
            print(key + '->' + values[i])
    return 1

def main():
    rules = {}
    voc = []

    # This list is going to be our "letters pool" for naming new states
    let = list(letters[26:]) + list(letters[:25])
    let.remove('e')

    # Number of grammar rules
    while True:
        userInput = input('Give number of rules: ')
        try:
            # check that N is an integer >= 2
            N = int(userInput)
            if N < 2:
                print('N must be a number >= 2!')
            else:
                break
        except ValueError:
            print("That's not an int!")

    # Initial state
    while True:
        S = input('Give initial state: ')
        if not re.match("[a-zA-Z]$", S):
            print('Initial state must be a single character!')
        else:
            break

    print('+------------------------------------------------------+')
    print('|Give rules in the form A B (space-delimited), for A->B|')
    print('|or A BCD, if more than one states in the right part |')
    print('|(without spaces between right part members). |')
    print('+------------------------------------------------------+')

    for i in range(N):
        # A rule is actually in the form fr->to; the user types "fr to".
        fr, to = map(str, input('Rule #' + str(i + 1) + ': ').split())

        # Remove the given letters from the "letters pool"
        for l in fr:
            if l != 'e' and l not in voc:
                voc.append(l)
            if l in let:
                let.remove(l)
        for l in to:
            if l != 'e' and l not in voc:
                voc.append(l)
            if l in let:
                let.remove(l)

        # Insert the rule into the dictionary
        rules.setdefault(fr, []).append(to)

    # remove large rules and print the new rules
    print('\nRules after large rules removal')
    rules, let, voc = large(rules, let, voc)
    print_rules(rules)

    # remove empty rules and print the new rules
    print('\nRules after empty rules removal')
    rules, voc = empty(rules, voc)
    print_rules(rules)

    # remove short rules and print the new rules
    print('\nRules after short rules removal')
    rules, D = short(rules, voc)
    print_rules(rules)

    print('\nFinal rules')
    rules = final_rules(rules, D, S)
    print_rules(rules)

if __name__ == '__main__':
    main()
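
An illustrative session (the grammar S -> ASA, A -> a, A -> e is an assumption chosen to exercise all three removal stages; user input is shown after each prompt):

Give number of rules: 3
Give initial state: S
Rule #1: S ASA
Rule #2: A a
Rule #3: A e

The program then prints the rule set after each stage (large, empty and short rule removal), followed by the final rules; the exact intermediate names depend on which letters of the pool are still free.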

10. Write a program to implement a basic top-down parser. Use an appropriate grammar rule.

#include <iostream>
#include <cstring>
using namespace std;

class parse
{
    int nt, t, m[20][20], i, s, n, p1, q, k, j;
    char p[30][30], n1[20], t1[20], ch, b, c, f[30][30], fl[30][30];
public:
    int scant(char);
    int scannt(char);
    void process();
    void input();
};

// return the index of a nonterminal, or -1 if it is unknown
int parse::scannt(char a)
{
    int c = -1, i;
    for (i = 0; i < nt; i++)
        if (n1[i] == a)
            return i;
    return c;
}

// return the index of a terminal, or -1 if it is unknown
int parse::scant(char b)
{
    int c1 = -1, j;
    for (j = 0; j < t; j++)
        if (t1[j] == b)
            return j;
    return c1;
}

void parse::input()
{
    cout << "Enter the number of productions:";
    cin >> n;
    // productions are typed as strings like S->aB, so p[i][0] is the
    // left-hand side and the right-hand side starts at index 3 ('e' = epsilon)
    cout << "Enter the productions one by one" << endl;
    for (i = 0; i < n; i++)
        cin >> p[i];
    nt = 0;
    t = 0;
}

void parse::process()
{
    // collect the nonterminals (left-hand sides)
    for (i = 0; i < n; i++)
        if (scannt(p[i][0]) == -1)
            n1[nt++] = p[i][0];
    // collect the terminals from the right-hand sides
    for (i = 0; i < n; i++)
        for (j = 3; j < (int)strlen(p[i]); j++)
            if (p[i][j] != 'e')
                if (scannt(p[i][j]) == -1)
                    if (scant(p[i][j]) == -1)
                        t1[t++] = p[i][j];
    t1[t++] = '$';
    // initialise the parsing table
    for (i = 0; i < nt; i++)
        for (j = 0; j < t; j++)
            m[i][j] = -1;
    for (i = 0; i < nt; i++)
    {
        cout << "Enter first[" << n1[i] << "]:";
        cin >> f[i];
    }
    for (i = 0; i < nt; i++)
    {
        cout << "Enter follow[" << n1[i] << "]:";
        cin >> fl[i];
    }
    // fill the table: one entry per production, driven by FIRST/FOLLOW
    for (i = 0; i < n; i++)
    {
        p1 = scannt(p[i][0]);
        if ((q = scant(p[i][3])) != -1)
            m[p1][q] = i;
        if ((q = scannt(p[i][3])) != -1)
            for (j = 0; j < (int)strlen(f[q]); j++)
                m[p1][scant(f[q][j])] = i;
        if (p[i][3] == 'e')
            for (j = 0; j < (int)strlen(fl[p1]); j++)
                m[p1][scant(fl[p1][j])] = i;
    }
    // print the predictive parsing table
    for (i = 0; i < t; i++)
        cout << "\t" << t1[i];
    cout << endl;
    for (j = 0; j < nt; j++)
    {
        cout << n1[j];
        for (i = 0; i < t; i++)
        {
            cout << "\t" << " ";
            if (m[j][i] != -1)
                cout << p[m[j][i]];
        }
        cout << endl;
    }
}

int main()
{
    parse p;
    p.input();
    p.process();
    return 0;
}
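
An illustrative session for the grammar S -> aS | e (user input shown after each prompt; note that this program expects the FIRST and FOLLOW sets to be typed in by the user rather than computing them itself):

Enter the number of productions:2
Enter the productions one by one
S->aS
S->e
Enter first[S]:a
Enter follow[S]:$

The program then prints the predictive parsing table, one row per nonterminal and one column per terminal (including $).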

11. Write a program to implement the Earley parsing algorithm.

class State(object):
    def __init__(self, label, rules, dot_idx, start_idx, end_idx, idx, made_from, producer):
        self.label = label
        self.rules = rules
        self.dot_idx = dot_idx
        self.start_idx = start_idx
        self.end_idx = end_idx
        self.idx = idx
        self.made_from = made_from
        self.producer = producer

    def next(self):
        """Returns the tag after the dot"""
        return self.rules[self.dot_idx]

    def complete(self):
        return len(self.rules) == self.dot_idx

    def __eq__(self, other):
        return (self.label == other.label and
                self.rules == other.rules and
                self.dot_idx == other.dot_idx and
                self.start_idx == other.start_idx and
                self.end_idx == other.end_idx)

    def __str__(self):
        rule_string = ''
        for i, rule in enumerate(self.rules):
            if i == self.dot_idx:
                rule_string += '\\bullet '
            rule_string += rule + ' '
        if self.dot_idx == len(self.rules):
            rule_string += '\\bullet'
        return 'S%d %s -> %s [%d, %d] %s %s' % (self.idx, self.label, rule_string, self.start_idx,
                                                self.end_idx, self.made_from, self.producer)

class Earley:
    def __init__(self, words, grammar, terminals):
        self.chart = [[] for _ in range(len(words) + 1)]
        self.current_id = 0
        self.words = words
        self.grammar = grammar
        self.terminals = terminals

    def get_new_id(self):
        self.current_id += 1
        return self.current_id - 1

    def is_terminal(self, tag):
        return tag in self.terminals

    def is_complete(self, state):
        return len(state.rules) == state.dot_idx

    def enqueue(self, state, chart_entry):
        # only add a state once per chart entry
        if state not in self.chart[chart_entry]:
            self.chart[chart_entry].append(state)
        else:
            self.current_id -= 1

    def predictor(self, state):
        for production in self.grammar[state.next()]:
            self.enqueue(State(state.next(), production, 0, state.end_idx, state.end_idx,
                               self.get_new_id(), [], 'predictor'), state.end_idx)

    def scanner(self, state):
        if self.words[state.end_idx] in self.grammar[state.next()]:
            self.enqueue(State(state.next(), [self.words[state.end_idx]], 1, state.end_idx,
                               state.end_idx + 1, self.get_new_id(), [], 'scanner'),
                         state.end_idx + 1)

    def completer(self, state):
        for s in self.chart[state.start_idx]:
            if (not s.complete() and s.next() == state.label and
                    s.end_idx == state.start_idx and s.label != 'gamma'):
                self.enqueue(State(s.label, s.rules, s.dot_idx + 1, s.start_idx, state.end_idx,
                                   self.get_new_id(), s.made_from + [state.idx], 'completer'),
                             state.end_idx)

    def parse(self):
        self.enqueue(State('gamma', ['S'], 0, 0, 0, self.get_new_id(), [], 'dummy start state'), 0)

        for i in range(len(self.words) + 1):
            for state in self.chart[i]:
                if not state.complete() and not self.is_terminal(state.next()):
                    self.predictor(state)
                elif i != len(self.words) and not state.complete() and self.is_terminal(state.next()):
                    self.scanner(state)
                else:
                    self.completer(state)

    def __str__(self):
        res = ''
        for i, chart in enumerate(self.chart):
            res += '\nChart[%d]\n' % i
            for state in chart:
                res += str(state) + '\n'
        return res

def test():
    grammar = {
        'S': [['NP', 'VP'], ['Aux', 'NP', 'VP'], ['VP']],
        'NP': [['Det', 'Nominal'], ['Proper-Noun']],
        'Nominal': [['Noun'], ['Noun', 'Nominal']],
        'VP': [['Verb'], ['Verb', 'NP']],
        'Det': ['that', 'this', 'a'],
        'Noun': ['book', 'flight', 'meal', 'money'],
        'Verb': ['book', 'include', 'prefer'],
        'Aux': ['does'],
        'Prep': ['from', 'to', 'on'],
        'Proper-Noun': ['Houston', 'TWA']
    }
    terminals = ['Det', 'Noun', 'Verb', 'Aux', 'Prep', 'Proper-Noun']

    earley = Earley(['book', 'that', 'flight'], grammar, terminals)
    earley.parse()
    print(earley)
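    # Sanity check (an added illustration, not part of the original listing):
    # a completed S state starting at 0 in the last chart means the whole
    # input was accepted by the grammar.
    accepted = any(s.label == 'S' and s.complete() and s.start_idx == 0
                   for s in earley.chart[-1])
    print('Accepted:', accepted)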

if __name__ == '__main__':
    test()

12. Write a program that takes a word as input and outputs its various senses and parts of speech. Use English WordNet and its APIs.

import nltk
from nltk.wsd import lesk
from nltk.corpus import wordnet
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

sentence = "I went to the bank to deposit my money"

stop_words = set(stopwords.words('english'))
tokens = set(word_tokenize(sentence))
sent = tokens.difference(stop_words)

# Disambiguate each content word against the full tokenized sentence
context = word_tokenize(sentence)
for sense in sent:
    best = lesk(context, sense)
    if best is not None:
        print(sense + ' ---- ' + best.definition())
        print()

print('*****************************************************************')
print(nltk.pos_tag(context))
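
The loop above picks one sense per word via Lesk. To list every WordNet sense of a single input word together with its part of speech, which is what the problem statement literally asks for, a short sketch (the word 'bank' is an illustrative choice):

word = 'bank'
for syn in wordnet.synsets(word):
    # syn.pos() is one of n / v / a / s / r (noun, verb, adjective, satellite adjective, adverb)
    print(syn.name(), syn.pos(), '-', syn.definition())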
