#!/Users/andrewk/source/p/python/python.exe

# Example: indexer that records info about all the files in a directory tree.

import os, sys
import itertools, functools
import cPickle

#
# Management of the list of indexing functions.
#

# Maps a filename extension, including its leading dot (e.g. '.txt'), to the
# indexing function registered for it.  Entries are added by register() and
# the functions are called by index_file() as func(path, record_func).
_indexers = {}

def register(ext, func):
    """Registers the function 'func' as the indexer for files whose
    names end in the extension 'ext'.

    'ext' is normally given without its leading dot ('txt'); a leading
    dot ('.txt') is accepted too and means the same thing.  Registering
    an extension a second time replaces the earlier function.

    'func' is called by index_file() as func(path, record_func).

    >>> is_indexable_filename('foo.jpg')
    False
    >>> register('jpg', None)
    >>> is_indexable_filename('foo.jpg')
    True
    >>> _indexers.clear()
    >>> register('.jpg', None)
    >>> is_indexable_filename('foo.jpg')
    True
    >>> _indexers.clear()
    """
    # os.path.splitext() returns extensions with exactly one leading dot,
    # so store keys in that form however the caller spelled 'ext'.
    # Without the lstrip, '.txt' would be stored as '..txt' and never match.
    _indexers['.' + ext.lstrip('.')] = func

def is_indexable_filename (fn):
    """Returns true if there's an indexer available for the given filename.

    The decision is based only on the filename's extension, as split off
    by os.path.splitext(); the comparison is case-sensitive.

    >>> register('txt', None)
    >>> is_indexable_filename('foo.txt')
    True
    >>> is_indexable_filename('foo.jpg')
    False
    >>> is_indexable_filename('FOO.TXT')
    False
    >>> _indexers.clear()
    """
    ext = os.path.splitext(fn)[1]
    # The 'in' operator replaces the deprecated dict.has_key() method.
    return ext in _indexers

def is_ignorable_directory (dirname):
    """Tell whether a directory should be skipped when scanning a tree.

    Only version-control bookkeeping directories are skipped; the name
    must match exactly.

    >>> is_ignorable_directory('.svn')
    True
    >>> is_ignorable_directory('CVS')
    True
    >>> is_ignorable_directory('text')
    False
    """
    if dirname == '.svn':
        return True
    return dirname == 'CVS'

def remove_punctuation (word):
    """Strips punctuation from both ends of 'word' and returns the result.
    Punctuation inside the word is kept; a word made up only of
    punctuation comes back as the empty string.

    >>> remove_punctuation('test')
    'test'
    >>> remove_punctuation('comma,')
    'comma'
    >>> remove_punctuation('()')
    ''
    """
    # Characters treated as punctuation at either end of a word.
    punctuation = ',.?!"\'()[]#*\\'
    return word.strip(punctuation)

#
# Functions for indexing directories and files
#

def index (*args):
    """Index any number of directory trees and return the index.

    Starts from whatever load_index() returns, adds the contents of the
    tree rooted at each path in 'args', writes the result back with
    save_index(), and returns the index data structure.
    """
    index_data = load_index()
    for root in args:
        index_tree(index_data, root)
    save_index(index_data)
    return index_data

def index_tree (idx, path):
    """Walk the directory tree rooted at 'path' and index every file in it
    that has a registered indexer.  Ignorable directories are not entered.
    """
    for dirpath, dirnames, filenames in os.walk(path):
        # Prune in place: os.walk() only descends into the names that
        # are still in 'dirnames'.
        dirnames[:] = [name for name in dirnames
                       if not is_ignorable_directory(name)]

        # Keep only the files some indexer can handle, then index them.
        indexable = [name for name in filenames
                     if is_indexable_filename(name)]
        for name in indexable:
            index_file(idx, os.path.join(dirpath, name))

def index_file (idx, path):
    """Run the indexer registered for the extension of 'path'.

    The indexer is called with the path and a function that records
    entries into 'idx'.  Callers must only pass paths for which
    is_indexable_filename() is true; this is asserted.
    """
    assert is_indexable_filename(path)
    extension = os.path.splitext(path)[1]

    # Bind 'idx' so the indexer only has to supply word, path and line.
    _indexers[extension](path, functools.partial(record, idx))


#
# Index data structure
#
# The index is a big dictionary of dictionaries:
#   { word => { (filename, line number) tuple => 1 } }
# i.e. each word maps to a dictionary whose keys are the places it occurs.
# 

def lookup (idx, word):
    """Generate a (filename, line number) tuple for every index entry
    recorded under 'word'.  Yields nothing if the word is not in the index.
    """
    entries = idx.get(word, ())
    for filename, line_number in entries:
        yield filename, line_number
    
def record (idx, word, path, line=None):
    """Note in 'idx' that 'word' occurs at the given path and line number,
    and return 'idx'.  The line number may be None.  Recording the same
    occurrence again leaves the existing entry untouched.

    >>> record({}, 'word', '/path', None)
    {'word': {('/path', None): 1}}
    >>> record({}, 'word', '/path', 42)
    {'word': {('/path', 42): 1}}
    """
    occurrences = idx.setdefault(word, {})
    # setdefault() only stores the 1 when the occurrence is new.
    occurrences.setdefault((path, line), 1)
    return idx

def load_index ():
    """Read index from disk.

    Returns the object unpickled from /tmp/index (the file written by
    save_index()), or a new empty dictionary if that file does not exist.
    """
    index_filename = '/tmp/index'
    if not os.path.exists(index_filename):
        return {}

    # NOTE(review): unpickling can execute arbitrary code, and /tmp is
    # usually writable by every local user.  Only run this where the
    # contents of /tmp/index can be trusted.
    infile = open(index_filename, 'rb')
    try:
        # try/finally so the file is closed even if unpickling fails.
        return cPickle.load(infile)
    finally:
        infile.close()

def save_index (idx):
    """Write index to disk.

    Pickles 'idx' to /tmp/index, then prints the number of words in
    the index.
    """
    output = open('/tmp/index', 'wb')
    try:
        # Protocol -1 selects the highest pickle protocol available.
        cPickle.dump(idx, output, -1)
    finally:
        # Close the file even if pickling fails.
        output.close()

    # A single parenthesised argument prints the same text whether 'print'
    # is a statement or a function.
    print('%d words in index' % len(idx))

#
# File analysis functions
#

def text_inspector (input_file, record_func):
    """Indexer for plain-text files.

    Reads the file named 'input_file' line by line, splits each line on
    whitespace, lower-cases every word and strips surrounding punctuation,
    and calls record_func(word, input_file, line_num) for each non-empty
    result.  Line numbers start at 1.
    """
    text = open(input_file, 'r')
    try:
        line_num = 0
        for line in text:
            line_num += 1
            for word in line.split():
                word = remove_punctuation(word.lower())
                # Words that were nothing but punctuation are skipped.
                if word:
                    record_func(word, input_file, line_num)
    finally:
        # Close the file promptly, even if record_func raises.
        text.close()


if __name__ == '__main__':
    # Script entry point.  With '-t' anywhere on the command line, run the
    # module's doctests and exit without indexing anything.
    if '-t' in sys.argv[1:]:
        import doctest
        doctest.testmod()
        raise SystemExit

    # Only plain-text files have an indexer registered here.
    register('txt', text_inspector)
    #register('jpg', jpg_inspector)
    #register('gif', gif_inspector)

    # Every command-line argument is passed to index() as the root of a
    # directory tree to index.
    idx = index(*sys.argv[1:])

    # Look up a sample word and print each (filename, line) entry found.
    for filename, line in lookup(idx, 'the'):
        print filename, line
    

# Exercises:
# * Matching lines are output in random order.  Output them in sorted order.
#   [5] (One-line change)
# * Use itertools.groupby() for better output, i.e. file.txt: 1 3 4 5
#   [10]
# * Remove file entries before adding new ones. [15]