#!/usr/bin/env python

# Parse a seealso file, and add its contents to a pickled dictionary.

import os, sys
import urllib, pickle
from xml.dom import minidom

# Dublin Core namespace URI, used to locate the dc:title, dc:identifier
# and dc:creator metadata elements in the fetched seealso XML document.
DC_NS = 'http://purl.org/dc/elements/1.1/'

def main ():
    """Fetch a 'seealso' XML document from a URL and merge its entries
    into a pickled dictionary stored on disk.

    Usage: prog URL database-filename

    The database maps 'module name' -> list of
    (href, document_title, document_url, author, title, excerpt) tuples.
    Entries whose href already appears under a module replace the old entry.
    Exits with status 1 if the required arguments are missing.
    """
    if len(sys.argv) < 3:
        # Bug fix: the original printed the literal '%s' (the format arg
        # was never applied).  Usage errors also belong on stderr.
        print >> sys.stderr, 'Usage: %s URL database-filename' % sys.argv[0]
        sys.exit(1)

    url = sys.argv[1]
    db_file = sys.argv[2]

    # Fetch the raw XML data; close the handle even if read() fails.
    # (Py2 urllib handles are not context managers, hence try/finally.)
    f = urllib.urlopen(url)
    try:
        data = f.read()
    finally:
        f.close()

    # Parse XML data
    dom = minidom.parseString(data)
    L = []

    def get_text (node):
        """Return the concatenated text of node's text/element children,
        recursing into elements, with surrounding whitespace stripped.
        NOTE: attribute and CDATA nodes are deliberately ignored."""
        t = ""
        for c in node.childNodes:
            if c.nodeType == c.TEXT_NODE:
                t += c.nodeValue
            elif c.nodeType == c.ELEMENT_NODE:
                t += get_text(c)
        return t.strip()

    # Document-level Dublin Core metadata; each element is optional.
    author = document_title = document_url = None

    document_title_nodes = list(dom.getElementsByTagNameNS(DC_NS, 'title'))
    document_url_nodes = list(dom.getElementsByTagNameNS(DC_NS, 'identifier'))
    author_nodes = list(dom.getElementsByTagNameNS(DC_NS, 'creator'))

    # The format allows at most one of each document-level element.
    assert len(document_title_nodes) <= 1
    assert len(document_url_nodes) <= 1
    assert len(author_nodes) <= 1

    if document_title_nodes:
        document_title = get_text(document_title_nodes[0])
    if author_nodes:
        author = get_text(author_nodes[0])
    if document_url_nodes:
        document_url = get_text(document_url_nodes[0])
        # Discard non-absolute identifiers.  ('' fails startswith too, so
        # the original redundant empty-string test is folded in; https
        # URLs are now accepted as well as http.)
        if not document_url.startswith(('http://', 'https://')):
            document_url = None

    # Collect one record per <target> of each <item>.
    for item in dom.getElementsByTagNameNS(None, 'item'):
        href = item.getAttributeNS(None, 'href')
        title_node = item.getElementsByTagNameNS(None, 'title')[0]
        title = get_text(title_node)

        # XXX this will ignore emphasized chunks of text
        excerpt_nodes = item.getElementsByTagNameNS(None, 'excerpt')
        if excerpt_nodes:
            excerpt = get_text(excerpt_nodes[0])
        else:
            excerpt = None

        # Multiple target elements are allowed; each gets its own record.
        for t in item.getElementsByTagNameNS(None, 'target'):
            target = get_text(t)
            L.append((target, href, document_title, document_url, author,
                      title, excerpt))

    # Update the database: a pickled dict mapping 'module name' -> list of
    # (href, document_title, document_url, author, title, excerpt) tuples.
    if not os.path.exists(db_file):
        db = {}
    else:
        # Renamed from 'input' (shadowed the builtin); close on error too.
        db_in = open(db_file, 'rb')
        try:
            db = pickle.load(db_in)
        finally:
            db_in.close()

    for entry in L:
        # If this href is already listed under the module, drop the old
        # entry so the new one replaces it.
        module = entry[0]
        url = entry[1]
        exlist = db.get(module, [])
        exlist = [t for t in exlist if t[0] != url]

        # Append to list
        exlist.append(entry[1:])

        # Reinsert (the list comprehension created a new list)
        db[module] = exlist

    # Write back atomically enough for this tool's purposes.
    output = open(db_file, 'wb')
    try:
        pickle.dump(db, output)
    finally:
        output.close()

# Run only when executed as a script, not when imported as a module.
if __name__ == '__main__':
    main()
    
