
J.K. INSTITUTE OF APPLIED PHYSICS AND TECHNOLOGY

Natural Language Processing


Assignment

NAME:- VIVEK KUMAR ARYA


ROLL NO.:- 15
SEMESTER:- B.Tech 8th Semester
BRANCH:- Computer Science & Engineering
1. Write a program to find the minimum edit distance between two input strings.

def editDistance(str1, str2, m, n):
    # If one string is empty, the distance is the length of the other
    if m == 0:
        return n
    if n == 0:
        return m

    # Last characters match: no cost, recurse on the remainders
    if str1[m-1] == str2[n-1]:
        return editDistance(str1, str2, m-1, n-1)

    # Last characters differ: try all three edits and take the cheapest
    return 1 + min(editDistance(str1, str2, m, n-1),    # Insert
                   editDistance(str1, str2, m-1, n),    # Remove
                   editDistance(str1, str2, m-1, n-1))  # Replace

str1 = "sunday"
str2 = "saturday"
print(editDistance(str1, str2, len(str1), len(str2)))
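
The plain recursion recomputes the same subproblems many times and runs in exponential time. A bottom-up dynamic-programming sketch runs in O(mn); editDistanceDP is an added illustration (same unit costs for insert, remove and replace as above):

def editDistanceDP(str1, str2):
    m, n = len(str1), len(str2)
    # dp[i][j] holds the edit distance between str1[:i] and str2[:j]
    dp = [[0] * (n + 1) for _ in range(m + 1)]
    for i in range(m + 1):
        for j in range(n + 1):
            if i == 0:
                dp[i][j] = j                          # insert all of str2[:j]
            elif j == 0:
                dp[i][j] = i                          # remove all of str1[:i]
            elif str1[i - 1] == str2[j - 1]:
                dp[i][j] = dp[i - 1][j - 1]           # characters match: no cost
            else:
                dp[i][j] = 1 + min(dp[i][j - 1],      # Insert
                                   dp[i - 1][j],      # Remove
                                   dp[i - 1][j - 1])  # Replace
    return dp[m][n]

print(editDistanceDP("sunday", "saturday"))  # prints 3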

2. Write a program to implement the SOUNDEX algorithm.


def soundex(query: str):
    query = query.lower()
    letters = [char for char in query if char.isalpha()]

    # A single-letter name is just padded with zeros
    if len(query) == 1:
        return query + "000"

    # Drop vowels and the letters y, h, w (everywhere except the first letter)
    to_remove = ('a', 'e', 'i', 'o', 'u', 'y', 'h', 'w')

    first_letter = letters[0]
    letters = [char for char in letters[1:] if char not in to_remove]

    if len(letters) == 0:
        return first_letter + "000"

    # Map the remaining consonants to their Soundex digits
    to_replace = {('b', 'f', 'p', 'v'): 1, ('c', 'g', 'j', 'k', 'q', 's', 'x', 'z'): 2,
                  ('d', 't'): 3, ('l',): 4, ('m', 'n'): 5, ('r',): 6}

    first_digit = next((value for group, value in to_replace.items()
                        if first_letter in group), None)
    letters = [value
               for char in letters
               for group, value in to_replace.items()
               if char in group]

    # Collapse runs of identical adjacent digits into a single digit
    letters = [char for ind, char in enumerate(letters)
               if ind == len(letters) - 1 or char != letters[ind + 1]]

    # If the first digit duplicates the code of the first letter, it is absorbed
    # by the letter; otherwise the letter is simply prepended
    if first_digit == letters[0]:
        letters[0] = query[0]
    else:
        letters.insert(0, query[0])

    first_letter = letters[0]
    letters = letters[1:]

    # Keep at most three digits, padding with zeros if necessary
    letters = [char for char in letters if isinstance(char, int)][:3]
    while len(letters) < 3:
        letters.append(0)
    letters.insert(0, first_letter)

    return "".join(str(l) for l in letters)

print(soundex("King"))
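
Two quick checks: "Robert" and "Rupert" should map to the same code, r163 (lowercase because the implementation above lowercases its input, and "King" gives k520):

print(soundex("Robert"))  # r163
print(soundex("Rupert"))  # r163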

3. Design a tokenizer, sentence segmenter and morphological analyser using any FST toolkit.
from nltk.tokenize import sent_tokenize, word_tokenize

text = "Natural language processing (NLP) is a field " + \
       "of computer science, artificial intelligence " + \
       "and computational linguistics concerned with " + \
       "the interactions between computers and human " + \
       "(natural) languages, and, in particular, " + \
       "concerned with programming computers to " + \
       "fruitfully process large natural language " + \
       "corpora. Challenges in natural language " + \
       "processing frequently involve natural " + \
       "language understanding, natural language " + \
       "generation (frequently from formal, machine" + \
       "-readable logical forms), connecting language " + \
       "and machine perception, managing human-" + \
       "computer dialog systems, or some combination " + \
       "thereof."

print(sent_tokenize(text))
print(word_tokenize(text))
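
NLTK covers the tokenizer and sentence-segmenter parts above, but it does not ship an FST toolkit for the morphological-analyser part. As a stand-in, a minimal sketch using NLTK's WordNetLemmatizer (a dictionary-based lemmatizer, not a true finite-state analyser; the word/POS pairs below are illustrative assumptions):

from nltk.stem import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()
# Each pair is (surface form, WordNet POS tag: n = noun, v = verb)
for word, pos in [('languages', 'n'), ('interactions', 'n'), ('concerned', 'v')]:
    print(word, '->', lemmatizer.lemmatize(word, pos=pos))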

5. Write a program to perform stemming on a given text file. Use Porter’s Stemmer.

from nltk.stem.porter import PorterStemmer

stemmer = PorterStemmer()

plurals = ['caresses', 'flies', 'dies', 'mules', 'denied',
           'died', 'agreed', 'owned', 'humbled', 'sized',
           'meeting', 'stating', 'siezing', 'itemization',
           'sensational', 'traditional', 'reference', 'colonizer',
           'plotted']

singles = [stemmer.stem(plural) for plural in plurals]
print(' '.join(singles))
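
The problem statement asks for a given text file; the word list above only exercises the stemmer. A minimal file-based sketch reusing the stemmer above (the filename input.txt is a placeholder):

from nltk.tokenize import word_tokenize

with open('input.txt', encoding='utf-8') as fh:
    words = word_tokenize(fh.read())
print(' '.join(stemmer.stem(w) for w in words))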

6. Use any tagger to tag a text file. Now write a program to find the most
likely tag in the tagged text.

import nltk

tokens = nltk.word_tokenize("where there is a will, there will be a way.")
print("Parts of Speech : ", nltk.pos_tag(tokens))

8. Write a program to check whether a given grammar is a Context Free Grammar (CFG) or not.

terminals = ['a']
nonterminals = ['S']

# Each production is a (left side, right side) pair; here S -> aS and S -> a
grammar = [('S', ['a', 'S']), ('S', ['a'])]

regular = True
leftRG = False
rightRG = False

for leftSide, rightSide in grammar:
    for nonterminal in nonterminals:
        if not (leftRG or rightRG):
            if len(rightSide) > 1:
                if nonterminal in rightSide[0]:
                    leftRG = True
                elif nonterminal in rightSide[-1]:
                    rightRG = True
                else:
                    regular = regular and not (nonterminal in rightSide)
        if rightRG:
            # In a right-regular grammar a nonterminal may only appear last
            regular = regular and not (nonterminal in rightSide[:-1])
        if leftRG:
            # In a left-regular grammar a nonterminal may only appear first
            regular = regular and not (nonterminal in rightSide[1:])

print(regular)
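
As a quick sanity check (an illustrative grammar, not part of the assignment input), replacing the grammar above with

grammar = [('S', ['a', 'S', 'a'])]  # S -> aSa: the nonterminal sits in the middle

and rerunning the script makes it print False: neither the left-regular nor the right-regular pattern matches, so the middle occurrence of S trips the final check.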

9. Write a program to convert a given CFG to CNF (Chomsky Normal Form).

from string import ascii_letters as letters
import copy
import re

# Remove large rules (more than 2 symbols in the right part, e.g. A -> BCD)
def large(rules, let, voc):
    # Work on a deep copy, as the dictionary's size changes during the process
    new_dict = copy.deepcopy(rules)
    for key in new_dict:
        values = new_dict[key]
        for i in range(len(values)):
            # Check if we have a rule violation
            if len(values[i]) > 2:
                # A -> BCD gives 1) A -> BE (if E is the first "free"
                # letter from the letters pool) and 2) E -> CD
                for j in range(0, len(values[i]) - 2):
                    if j == 0:
                        # replace the first rule
                        rules[key][i] = rules[key][i][0] + let[0]
                    else:
                        # add the new intermediate rules
                        rules.setdefault(new_key, []).append(values[i][j] + let[0])
                    voc.append(let[0])
                    # save the letter, as it'll be used in the next rule
                    new_key = copy.deepcopy(let[0])
                    # remove the letter from the free letters list
                    let.remove(let[0])
                # the last 2 symbols always remain the same
                rules.setdefault(new_key, []).append(values[i][-2:])
    return rules, let, voc

# Remove empty rules (A -> e)
def empty(rules, voc):
    # list with the keys of empty rules
    e_list = []
    # find the rules that produce the empty string and add their keys to the list
    new_dict = copy.deepcopy(rules)
    for key in new_dict:
        values = new_dict[key]
        for i in range(len(values)):
            # if the key gives an empty state and is not in the list, add it
            if values[i] == 'e' and key not in e_list:
                e_list.append(key)
                # remove the empty state
                rules[key].remove(values[i])
                # if the key doesn't contain any values, remove it from the dictionary
                if len(rules[key]) == 0:
                    if key in voc:
                        voc.remove(key)
                    rules.pop(key, None)
    # propagate the deleted empty rules
    new_dict = copy.deepcopy(rules)
    for key in new_dict:
        values = new_dict[key]
        for i in range(len(values)):
            # check for rules in the form A->BC or A->CB, where B is in e_list
            # and C is in the vocabulary
            if len(values[i]) == 2:
                # check for a rule in the form A->BC, excluding the case that
                # gives A->A as a result
                if values[i][0] in e_list and key != values[i][1]:
                    rules.setdefault(key, []).append(values[i][1])
                # check for a rule in the form A->CB, excluding the case that
                # gives A->A as a result
                if values[i][1] in e_list and key != values[i][0]:
                    if values[i][0] != values[i][1]:
                        rules.setdefault(key, []).append(values[i][0])
    return rules, voc

# Remove short rules (A -> B)
def short(rules, voc):
    # create a dictionary in the form letter: letter (at the beginning
    # D(A) = {A})
    D = dict(zip(voc, voc))
    # transform each value from string to list, to be able to insert more values
    for key in D:
        D[key] = list(D[key])
    # for every letter A of the vocabulary, if B->C, B in D(A) and C not in D(A),
    # add C to D(A)
    for letter in voc:
        for key in rules:
            if key in D[letter]:
                values = rules[key]
                for i in range(len(values)):
                    if len(values[i]) == 1 and values[i] not in D[letter]:
                        D.setdefault(letter, []).append(values[i])
    rules, D = short1(rules, D)
    return rules, D

def short1(rules, D):
    # remove the short rules (those whose right side has length 1)
    new_dict = copy.deepcopy(rules)
    for key in new_dict:
        values = new_dict[key]
        for i in range(len(values)):
            if len(values[i]) == 1:
                rules[key].remove(values[i])
                if len(rules[key]) == 0:
                    rules.pop(key, None)
    # replace each rule A->BC with A->B'C', where B' in D(B) and C' in D(C)
    for key in rules:
        values = rules[key]
        for i in range(len(values)):
            # search all possible B' in D(B)
            for j in D[values[i][0]]:
                # search all possible C' in D(C)
                for k in D[values[i][1]]:
                    # concatenate B' and C' and insert the new rule
                    if j + k not in values:
                        rules.setdefault(key, []).append(j + k)
    return rules, D

# Insert rules S->BC for every A->BC where A in D(S)-{S}
def final_rules(rules, D, S):
    for let in D[S]:
        # skip S itself and any key that has no rules left
        if let != S and let in rules:
            for v in rules[let]:
                if v not in rules.setdefault(S, []):
                    rules[S].append(v)
    return rules

# Print the rules
def print_rules(rules):
    for key in rules:
        values = rules[key]
        for i in range(len(values)):
            print(key + '->' + values[i])
    return 1

def main():
    rules = {}
    voc = []

    # This list is going to be our "letters pool" for naming new states
    let = list(letters[26:]) + list(letters[:25])
    let.remove('e')

    # Number of grammar rules
    while True:
        userInput = input('Give number of rules: ')
        try:
            # check that N is an integer >= 2
            N = int(userInput)
            if N < 2:
                print('N must be a number >= 2!')
            else:
                break
        except ValueError:
            print("That's not an int!")

    # Initial state
    while True:
        S = input('Give initial state: ')
        if not re.match("[a-zA-Z]$", S):
            print('Initial state must be a single character!')
        else:
            break

    print('+------------------------------------------------------+')
    print('|Give rules in the form A B (space-delimited), for A->B|')
    print('|or A BCD, if more than one states in the right part |')
    print('|(without spaces between right part members). |')
    print('+------------------------------------------------------+')

    for i in range(N):
        # A rule is actually in the form fr->to; the user types "fr to".
        fr, to = map(str, input('Rule #' + str(i + 1) + ': ').split())

        # Remove the given letters from the "letters pool"
        for l in fr:
            if l != 'e' and l not in voc:
                voc.append(l)
            if l in let:
                let.remove(l)
        for l in to:
            if l != 'e' and l not in voc:
                voc.append(l)
            if l in let:
                let.remove(l)

        # Insert the rule into the dictionary
        rules.setdefault(fr, []).append(to)

    # remove large rules and print the new rules
    print('\nRules after large rules removal')
    rules, let, voc = large(rules, let, voc)
    print_rules(rules)

    # remove empty rules and print the new rules
    print('\nRules after empty rules removal')
    rules, voc = empty(rules, voc)
    print_rules(rules)

    # remove short rules and print the new rules
    print('\nRules after short rules removal')
    rules, D = short(rules, voc)
    print_rules(rules)

    print('\nFinal rules')
    rules = final_rules(rules, D, S)
    print_rules(rules)

if __name__ == '__main__':
    main()
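
An illustrative session (the grammar S -> ASA, A -> a, A -> e is an assumption chosen to exercise all three removal stages; user input is shown after each prompt):

Give number of rules: 3
Give initial state: S
Rule #1: S ASA
Rule #2: A a
Rule #3: A e

The program then prints the rule set after each stage (large, empty and short rule removal), followed by the final rules; the exact intermediate names depend on which letters of the pool are still free.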

10. Write a program to implement a basic top-down parser. Use an appropriate grammar rule.

#include <iostream>
#include <cstring>
using namespace std;

class parse
{
    int nt, t, m[20][20], i, s, n, p1, q, k, j;
    char p[30][30], n1[20], t1[20], ch, b, c, f[30][30], fl[30][30];
public:
    int scant(char);
    int scannt(char);
    void process();
    void input();
};

// return the index of a nonterminal, or -1 if it is unknown
int parse::scannt(char a)
{
    int c = -1, i;
    for (i = 0; i < nt; i++)
        if (n1[i] == a)
            return i;
    return c;
}

// return the index of a terminal, or -1 if it is unknown
int parse::scant(char b)
{
    int c1 = -1, j;
    for (j = 0; j < t; j++)
        if (t1[j] == b)
            return j;
    return c1;
}

void parse::input()
{
    cout << "Enter the number of productions:";
    cin >> n;
    // productions are typed as strings like S->aB, so p[i][0] is the
    // left-hand side and the right-hand side starts at index 3 ('e' = epsilon)
    cout << "Enter the productions one by one" << endl;
    for (i = 0; i < n; i++)
        cin >> p[i];
    nt = 0;
    t = 0;
}

void parse::process()
{
    // collect the nonterminals (left-hand sides)
    for (i = 0; i < n; i++)
        if (scannt(p[i][0]) == -1)
            n1[nt++] = p[i][0];
    // collect the terminals from the right-hand sides
    for (i = 0; i < n; i++)
        for (j = 3; j < (int)strlen(p[i]); j++)
            if (p[i][j] != 'e')
                if (scannt(p[i][j]) == -1)
                    if (scant(p[i][j]) == -1)
                        t1[t++] = p[i][j];
    t1[t++] = '$';
    // initialise the parsing table
    for (i = 0; i < nt; i++)
        for (j = 0; j < t; j++)
            m[i][j] = -1;
    for (i = 0; i < nt; i++)
    {
        cout << "Enter first[" << n1[i] << "]:";
        cin >> f[i];
    }
    for (i = 0; i < nt; i++)
    {
        cout << "Enter follow[" << n1[i] << "]:";
        cin >> fl[i];
    }
    // fill the table: one entry per production, driven by FIRST/FOLLOW
    for (i = 0; i < n; i++)
    {
        p1 = scannt(p[i][0]);
        if ((q = scant(p[i][3])) != -1)
            m[p1][q] = i;
        if ((q = scannt(p[i][3])) != -1)
            for (j = 0; j < (int)strlen(f[q]); j++)
                m[p1][scant(f[q][j])] = i;
        if (p[i][3] == 'e')
            for (j = 0; j < (int)strlen(fl[p1]); j++)
                m[p1][scant(fl[p1][j])] = i;
    }
    // print the predictive parsing table
    for (i = 0; i < t; i++)
        cout << "\t" << t1[i];
    cout << endl;
    for (j = 0; j < nt; j++)
    {
        cout << n1[j];
        for (i = 0; i < t; i++)
        {
            cout << "\t" << " ";
            if (m[j][i] != -1)
                cout << p[m[j][i]];
        }
        cout << endl;
    }
}

int main()
{
    parse p;
    p.input();
    p.process();
    return 0;
}
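
An illustrative session for the grammar S -> aS | e (user input shown after each prompt; note that this program expects the FIRST and FOLLOW sets to be typed in by the user rather than computing them itself):

Enter the number of productions:2
Enter the productions one by one
S->aS
S->e
Enter first[S]:a
Enter follow[S]:$

The program then prints the predictive parsing table, one row per nonterminal and one column per terminal (including $).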

11. Write a program to implement the Earley parsing algorithm.

class State(object):
    def __init__(self, label, rules, dot_idx, start_idx, end_idx, idx, made_from, producer):
        self.label = label
        self.rules = rules
        self.dot_idx = dot_idx
        self.start_idx = start_idx
        self.end_idx = end_idx
        self.idx = idx
        self.made_from = made_from
        self.producer = producer

    def next(self):
        """Returns the tag after the dot"""
        return self.rules[self.dot_idx]

    def complete(self):
        return len(self.rules) == self.dot_idx

    def __eq__(self, other):
        return (self.label == other.label and
                self.rules == other.rules and
                self.dot_idx == other.dot_idx and
                self.start_idx == other.start_idx and
                self.end_idx == other.end_idx)

    def __str__(self):
        rule_string = ''
        for i, rule in enumerate(self.rules):
            if i == self.dot_idx:
                rule_string += '\\bullet '
            rule_string += rule + ' '
        if self.dot_idx == len(self.rules):
            rule_string += '\\bullet'
        return 'S%d %s -> %s [%d, %d] %s %s' % (self.idx, self.label, rule_string, self.start_idx,
                                                self.end_idx, self.made_from, self.producer)

class Earley:
    def __init__(self, words, grammar, terminals):
        self.chart = [[] for _ in range(len(words) + 1)]
        self.current_id = 0
        self.words = words
        self.grammar = grammar
        self.terminals = terminals

    def get_new_id(self):
        self.current_id += 1
        return self.current_id - 1

    def is_terminal(self, tag):
        return tag in self.terminals

    def is_complete(self, state):
        return len(state.rules) == state.dot_idx

    def enqueue(self, state, chart_entry):
        # only add a state once per chart entry
        if state not in self.chart[chart_entry]:
            self.chart[chart_entry].append(state)
        else:
            self.current_id -= 1

    def predictor(self, state):
        for production in self.grammar[state.next()]:
            self.enqueue(State(state.next(), production, 0, state.end_idx, state.end_idx,
                               self.get_new_id(), [], 'predictor'), state.end_idx)

    def scanner(self, state):
        if self.words[state.end_idx] in self.grammar[state.next()]:
            self.enqueue(State(state.next(), [self.words[state.end_idx]], 1, state.end_idx,
                               state.end_idx + 1, self.get_new_id(), [], 'scanner'),
                         state.end_idx + 1)

    def completer(self, state):
        for s in self.chart[state.start_idx]:
            if (not s.complete() and s.next() == state.label and
                    s.end_idx == state.start_idx and s.label != 'gamma'):
                self.enqueue(State(s.label, s.rules, s.dot_idx + 1, s.start_idx, state.end_idx,
                                   self.get_new_id(), s.made_from + [state.idx], 'completer'),
                             state.end_idx)

    def parse(self):
        self.enqueue(State('gamma', ['S'], 0, 0, 0, self.get_new_id(), [], 'dummy start state'), 0)

        for i in range(len(self.words) + 1):
            for state in self.chart[i]:
                if not state.complete() and not self.is_terminal(state.next()):
                    self.predictor(state)
                elif i != len(self.words) and not state.complete() and self.is_terminal(state.next()):
                    self.scanner(state)
                else:
                    self.completer(state)

    def __str__(self):
        res = ''
        for i, chart in enumerate(self.chart):
            res += '\nChart[%d]\n' % i
            for state in chart:
                res += str(state) + '\n'
        return res

def test():
    grammar = {
        'S': [['NP', 'VP'], ['Aux', 'NP', 'VP'], ['VP']],
        'NP': [['Det', 'Nominal'], ['Proper-Noun']],
        'Nominal': [['Noun'], ['Noun', 'Nominal']],
        'VP': [['Verb'], ['Verb', 'NP']],
        'Det': ['that', 'this', 'a'],
        'Noun': ['book', 'flight', 'meal', 'money'],
        'Verb': ['book', 'include', 'prefer'],
        'Aux': ['does'],
        'Prep': ['from', 'to', 'on'],
        'Proper-Noun': ['Houston', 'TWA']
    }
    terminals = ['Det', 'Noun', 'Verb', 'Aux', 'Prep', 'Proper-Noun']

    earley = Earley(['book', 'that', 'flight'], grammar, terminals)
    earley.parse()
    print(earley)
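    # Sanity check (an added illustration, not part of the original listing):
    # a completed S state starting at 0 in the last chart means the whole
    # input was accepted by the grammar.
    accepted = any(s.label == 'S' and s.complete() and s.start_idx == 0
                   for s in earley.chart[-1])
    print('Accepted:', accepted)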

if __name__ == '__main__':
    test()

12. Write a program that takes a word as input and outputs its various senses and parts of speech. Use English WordNet and its APIs.

import nltk
from nltk.wsd import lesk
from nltk.corpus import wordnet
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

sentence = "I went to the bank to deposit my money"

stop_words = set(stopwords.words('english'))
tokens = set(word_tokenize(sentence))
sent = tokens.difference(stop_words)

# Disambiguate each content word against the full tokenized sentence
context = word_tokenize(sentence)
for sense in sent:
    best = lesk(context, sense)
    if best is not None:
        print(sense + ' ---- ' + best.definition())
        print()

print('*****************************************************************')
print(nltk.pos_tag(context))
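
The loop above picks one sense per word via Lesk. To list every WordNet sense of a single input word together with its part of speech, which is what the problem statement literally asks for, a short sketch (the word 'bank' is an illustrative choice):

word = 'bank'
for syn in wordnet.synsets(word):
    # syn.pos() is one of n / v / a / s / r (noun, verb, adjective, satellite adjective, adverb)
    print(syn.name(), syn.pos(), '-', syn.definition())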
