import sgmllib


class MyParser(sgmllib.SGMLParser):
    "a simple parser class"

    def parse(self, s):
        self.feed(s)
        self.close()

    def __init__(self, verbose=0):
        "Initialize an object, passing verbose to the superclass"
        sgmllib.SGMLParser.__init__(self, verbose)
        self.inside_a_element = 0
        self.descriptions = []
        self.hyperlinks = []

    def start_a(self, attributes):
        # Entering an <a> tag: remember absolute hyperlinks from its href attribute
        self.inside_a_element = 1
        for name, value in attributes:
            if name == "href" and "http://" in value:
                self.hyperlinks.append(value)

    def end_a(self):
        self.inside_a_element = 0

    def handle_data(self, data):
        # Text inside an <a> element is treated as that link's description
        if self.inside_a_element == 1:
            self.descriptions.append(data)

    def get_hyperlinks(self):
        "returns hyperlinks found"
        return self.hyperlinks

    def get_descriptions(self):
        return self.descriptions
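# Usage sketch (commented out so the script below runs unchanged): feed MyParser a
# small HTML snippet and read back what it collected. The snippet is made up for
# illustration only.
#
#   sample_html = '<a href="http://example.com">Example</a>'
#   p = MyParser()
#   p.parse(sample_html)
#   print p.get_hyperlinks()    # ['http://example.com']
#   print p.get_descriptions()  # ['Example']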

import urllib
import threading
from Queue import PriorityQueue, Queue


class URLGetter(threading.Thread):
    "Fetches a single URL in its own thread and collects the links it contains"

    def __init__(self, ud):
        # ud is a (depth, url) pair taken from the priority queue
        self.url = ud[1]
        self.result = {}
        self.parser = MyParser()
        self.depth = ud[0]
        threading.Thread.__init__(self)

    def get_result(self):
        return self.result

    def run(self):
        try:
            print "parsing\n"
            f = urllib.urlopen(self.url)
            contents = f.read()
            f.close()
            try:
                self.parser.parse(contents)
            except:
                print "Exception while parsing....."
            # Every link found on this page is recorded at the current depth
            for link in self.parser.get_hyperlinks():
                self.result[link] = self.depth
        except:
            print "Could not open document: %s" % self.url


class crawler:
    def __init__(self, target, maxdepth=3, MAXTHREADS=4):
        self.maxdepth = maxdepth  # maximum depth of crawling
        self.to_visit = PriorityQueue()  # (depth, url) pairs, shallowest first
        self.to_visit.put((1, target))
        self.results = {}  # url -> depth at which it was found
        # Bounded queue of running workers; also caps concurrency
        self.q = Queue(MAXTHREADS - 1)

    def producer(self):
        # Start a worker thread for each queued URL
        while (not self.to_visit.empty()) or (not self.q.empty()):
            print "remain to visit: " + str(self.to_visit.qsize()), "\nthreads running: " + str(self.q.qsize()) + "\n"
            thread = URLGetter(self.to_visit.get())
            thread.start()
            self.q.put(thread, True)

    def consumer(self):
        # Collect finished workers and queue newly discovered URLs
        while (not self.to_visit.empty()) or (not self.q.empty()):
            thread = self.q.get(True)
            thread.join()
            res = thread.get_result()
            for url in res.keys():
                try:
                    # URL already recorded at some depth: skip re-queueing it
                    if self.results[url] is not None:
                        print "skipping, depth =", self.results[url]
                except KeyError:
                    # First time this URL is seen
                    print res[url], " ", self.maxdepth, "\n"
                    if res[url] <= self.maxdepth:
                        self.results[url] = res[url]
                        self.to_visit.put((res[url] + 1, url))

    def crawl(self):
        prod_thread = threading.Thread(target=self.producer)
        cons_thread = threading.Thread(target=self.consumer)
        prod_thread.start()
        cons_thread.start()
        prod_thread.join()
        cons_thread.join()

    def get_results(self):
        return self.results.keys()


# single threaded version
class webcrawler:
    def __init__(self):
        pass

    def _webcrawl(self, seed, depth, search_text, l, next_urls):
        import urllib
        print "Depth:", depth
        print "Seed:", seed
        if depth > 0:
            r = urllib.urlopen(seed)
            s = r.read()
            r.close()
            print s
            if search_text in s:
                l.append(seed)
            p = MyParser()
            p.parse(s)
            urls = p.get_hyperlinks()
            for url in urls:
                if url not in next_urls:
                    next_urls.append(url)
                    self._webcrawl(url, depth - 1, search_text, l, next_urls)

    def webcrawl(self, seed, depth, search_text):
        a = []
        self._webcrawl(seed, depth, search_text, a, [])
        print a


def main():
    a = crawler("http://www.google.com", 1)
    a.crawl()
    print a.get_results()
    f = open("/home/moshe/Desktop/50 weeks/week1.py", "w")
    # results maps url -> depth
    for l, d in a.results.items():
        f.write(str(d) + l + "\n")
    f.close()


main()
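# Usage sketch for the single-threaded webcrawler class above (commented out;
# the seed URL, depth and search text are illustrative values only):
#
#   wc = webcrawler()
#   wc.webcrawl("http://www.google.com", 2, "search")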
