#!/Users/andrewk/source/p/python/python.exe

# Example: indexer that records info about all the files in a directory tree.

import os
import sys
import itertools
import functools

try:
    import cPickle as pickle    # Python 2: C-accelerated implementation
except ImportError:
    import pickle               # Python 3: cPickle was folded into pickle

#
# Management of the list of indexing functions.
#

# Maps a filename extension (including the leading dot) to the indexing
# function registered for it.
_indexers = {}


def register(ext, func):
    """Registers the function 'func' as the indexer for files whose
    extension is 'ext' (given without the leading dot).

    >>> is_indexable_filename('foo.jpg')
    False
    >>> register('jpg', None)
    >>> is_indexable_filename('foo.jpg')
    True
    >>> _indexers.clear()
    """
    _indexers['.' + ext] = func


def is_indexable_filename(fn):
    """Returns true if there's an indexer available for the given filename.

    >>> register('txt', None)
    >>> is_indexable_filename('foo.txt')
    True
    >>> is_indexable_filename('foo.jpg')
    False
    """
    ext = os.path.splitext(fn)[1]
    # 'in' replaces dict.has_key(), which no longer exists in Python 3.
    return ext in _indexers


def is_ignorable_directory(dirname):
    """Return true if the directory with the given name shouldn't be scanned.

    >>> is_ignorable_directory('.svn')
    True
    >>> is_ignorable_directory('text')
    False
    """
    return dirname in ('.svn', 'CVS')


def remove_punctuation(word):
    """Removes leading and trailing punctuation characters from a word.
    May return the empty string.

    >>> remove_punctuation('test')
    'test'
    >>> remove_punctuation('comma,')
    'comma'
    >>> remove_punctuation('()')
    ''
    """
    return word.strip(',.?!"\'()[]#*\\')


#
# Functions for indexing directories and files
#

def index(*args):
    """Index the directory trees rooted at the specified paths.  Can take
    any number of arguments.

    The previously saved index is loaded first, the new entries are
    merged into it, and the result is written back to disk.
    Returns the index data structure.
    """
    idx = load_index()
    for path in args:
        index_tree(idx, path)
    save_index(idx)
    return idx


def index_tree(idx, path):
    """Index the contents of the files in the directory tree rooted at
    'path', adding the entries to 'idx'.
    """
    for dirpath, dirnames, filenames in os.walk(path):
        # Remove ignorable directories.  The list must be modified in
        # place: os.walk() consults this same list object to decide
        # which subdirectories to descend into.
        dirnames[:] = [d for d in dirnames if not is_ignorable_directory(d)]

        # Index the files we have an indexer for; skip everything else.
        for fn in filenames:
            if is_indexable_filename(fn):
                index_file(idx, os.path.join(dirpath, fn))


def index_file(idx, path):
    """Index the contents of a single file, adding the entries to 'idx'.
    It's assumed that an indexing function will be found for the file's
    type.
    """
    assert is_indexable_filename(path)
    ext = os.path.splitext(path)[1]
    indexer = _indexers[ext]
    # The indexer gets a callback with the index already bound, so it
    # only has to supply (word, path, line).
    record_func = functools.partial(record, idx)
    indexer(path, record_func)


#
# Index data structure
#
# The index is a big dictionary:
#   { word => { (filename, line number) => 1 } }
# The inner dictionary is used as a set of (filename, line number) keys.
#

# Default on-disk location of the pickled index.
_INDEX_FILENAME = '/tmp/index'


def lookup(idx, word):
    """Return an iterator over the (filename, line number) pairs for the
    files and lines containing the requested word.  Yields nothing if
    the word isn't in the index.
    """
    # Iterating over the inner dictionary yields its keys, which are
    # the (filename, line number) tuples.
    for filename, line in idx.get(word, ()):
        yield (filename, line)


def record(idx, word, path, line=None):
    """Add an index entry for the given word, using the specified path
    and line number.  The line number can be None.  Recording the same
    entry twice has no further effect.  Returns 'idx'.

    >>> record({}, 'word', '/path', None)
    {'word': {('/path', None): 1}}
    >>> record({}, 'word', '/path', 42)
    {'word': {('/path', 42): 1}}
    """
    idx.setdefault(word, {}).setdefault((path, line), 1)
    return idx


def load_index(index_filename=_INDEX_FILENAME):
    """Read index from disk.

    Returns the unpickled index stored in 'index_filename', or an empty
    dictionary if that file doesn't exist.
    """
    if not os.path.exists(index_filename):
        return {}
    # NOTE(security): unpickling can execute arbitrary code, and the
    # default location is in world-writable /tmp.  Only load index files
    # that you trust.
    with open(index_filename, 'rb') as index_file_obj:
        return pickle.load(index_file_obj)


def save_index(idx, index_filename=_INDEX_FILENAME):
    """Write index to disk.

    Pickles 'idx' into 'index_filename' using the highest available
    pickle protocol, then prints the number of words in the index.
    """
    with open(index_filename, 'wb') as output:
        pickle.dump(idx, output, -1)
    # Formatted as a single string so the output is the same whether
    # print is a statement (Python 2) or a function (Python 3).
    print('%d words in index' % len(idx))


#
# File analysis functions
#

def text_inspector(input_file, record_func):
    """Indexing function for plain text files.

    Splits each line of the file named 'input_file' on whitespace,
    lowercases each word and strips surrounding punctuation, then calls
    record_func(word, input_file, line_num) for every non-empty result.
    Line numbers start at 1.
    """
    with open(input_file, 'r') as text:
        for line_num, line in enumerate(text, 1):
            for word in line.split():
                word = remove_punctuation(word.lower())
                if word != '':
                    record_func(word, input_file, line_num)


if __name__ == '__main__':
    if '-t' in sys.argv[1:]:
        import doctest
        doctest.testmod()
        raise SystemExit

    register('txt', text_inspector)
    #register('jpg', jpg_inspector)
    #register('gif', gif_inspector)
    idx = index(*sys.argv[1:])

    # Look up a word
    for filename, line in lookup(idx, 'the'):
        print('%s %s' % (filename, line))

# Exercises:
# * Matching lines are output in random order.  Output them in sorted order.
#   [5] (One-line change)
# * Use itertools.groupby() for better output, i.e. file.txt: 1 3 4 5
#   [10]
# * Remove file entries before adding new ones. [15]