"""Routines for interacting with SourceForge interfaces""" import cgi import re import urllib import urlparse def urldecode(query): d = cgi.parse_qs(query) for k, v in d.items(): if len(v) != 1: raise ValueError, "unexpected duplicate entry" d[k] = v[0] return d class SummaryParser: rx_href = re.compile('HREF="(\S*\?[A-Za-z0-9=&_]+)"') VERBOSE = 0 def __init__(self, root_url, funcs, verbose=None): if verbose: self.VERBOSE = verbose self.root_url = root_url self.offset = 0 self.hrefs = [] self.funcs = {} self.next = None for func in funcs: self.funcs[func] = 1 def get_hrefs(self): return self.hrefs def load(self, _url, offset=None): url = urlparse.urljoin(self.root_url, _url) if self.VERBOSE: print "loading", url if offset is not None: self.offset = offset f = urllib.urlopen(url) resp = f.read() f.close() self.parse(resp) def parse(self, buf): for line in buf.split("\n"): line_offset = 0 while 1: mo = self.rx_href.search(line, line_offset) if mo: self.handle_href_match(mo, line) line_offset = mo.end(1) else: break if self.VERBOSE: print "found %d hrefs" % len(self.hrefs) if self.next: self.load_next() def handle_href_match(self, mo, line): query = mo.group(1) d = self.parse_query(query) self.handle_query(query, d, line) def handle_query(self, query, dict, line): if self.VERBOSE: print query if not dict.has_key('func'): return if dict['func'] == 'browse' and dict.has_key('offset'): off = int(dict['offset']) if off > self.offset: self.next = query, dict if self.keep_func(dict['func']): self.hrefs.append((query, dict, line)) def keep_func(self, func): if self.funcs.has_key(func): return 1 def parse_query(self, href): i = href.find("?") return urldecode(href[i+1:]) def load_next(self): assert self.next is not None query, dict = self.next self.next = None new_offset = int(dict['offset']) self.load(query, new_offset)