Indexed String Search
Just for experimentation, I created an indexed string search system based on hashing, implemented in Python for fast prototyping. The results were rather shocking actually. I had a hunch that it could be slow, but not this slow. The C-based string search I made earlier, that searches file by file directly, is a lot faster.
The principle for this script is to create a hashed database that holds all the words with their location. In theory, this is quicker than searching through the all files again and again. I think the main problem is the gigantic database that gets created, which is 10 times larger than the actual files to search. This takes a very long time to load into memory every time the script is called.
Anyway, here is the code. It works, but it's very slow:
#!/usr/bin/python import os.path import pickle class SearchDatabase(object): def __init__(self): self._db = dict() def _visitor(self, arg, dirname, names): for name in names: filename = os.path.join(dirname, name) if os.path.isfile(filename): fh = open(filename, "r") for line_no, line in enumerate(fh): location = "%s:%d" % (filename, line_no + 1) for word in line.split(): if not word.upper() in self._db: self._db[word.upper()] = dict() self._db[word.upper()][location] = line.rstrip() fh.close() def create(self, directory): os.path.walk(directory, self._visitor, None) def save(self, filename): fh = open(filename, "wb") pickle.dump(self._db, fh, pickle.HIGHEST_PROTOCOL) fh.close() def load(self, filename): fh = open(filename, "rb") self._db = pickle.load(fh) fh.close() def locate_and_display(self, word): try: for location in self._db[word.upper()]: print "%s:%s" % (location, self._db[word.upper()][location]) except KeyError: return if __name__ == "__main__": import sys import getopt def usage_and_exit(): print "Usage: %s <options> <word>" % (sys.argv[0]) print "Options:" print " -h Display this help and exit." print " -d DIR Create database by traversing directory DIR." print " -f FILE Use FILE as database filename." print "" sys.exit(1) try: opts, args = getopt.getopt(sys.argv[1:], "hd:f:") except getopt.GetoptError as err: print "Error:", str(err) usage_and_exit() directory = None db_filename = "locate.db" for o, a in opts: if o == '-h': usage_and_exit() elif o == '-d': directory = a elif o == '-f': db_filename = a sdb = SearchDatabase() if directory: # Create Mode if not os.path.isdir(directory): print "Error: invalid directory" usage_and_exit() sdb.create(directory) sdb.save(db_filename) else: # Search Mode if len(args) == 0: print "Error: please specify a search word" usage_and_exit() sdb.load(db_filename) for arg in args: sdb.locate_and_display(arg)