Indexed String Search
Just for experimentation, I created an indexed string search system based on hashing, implemented in Python for fast prototyping. The results were rather shocking actually. I had a hunch that it could be slow, but not this slow. The C-based string search I made earlier, that searches file by file directly, is a lot faster.
The principle for this script is to create a hashed database that holds all the words with their location. In theory, this is quicker than searching through the all files again and again. I think the main problem is the gigantic database that gets created, which is 10 times larger than the actual files to search. This takes a very long time to load into memory every time the script is called.
Anyway, here is the code. It works, but it's very slow:
#!/usr/bin/python
import os.path
import pickle
class SearchDatabase(object):
def __init__(self):
self._db = dict()
def _visitor(self, arg, dirname, names):
for name in names:
filename = os.path.join(dirname, name)
if os.path.isfile(filename):
fh = open(filename, "r")
for line_no, line in enumerate(fh):
location = "%s:%d" % (filename, line_no + 1)
for word in line.split():
if not word.upper() in self._db:
self._db[word.upper()] = dict()
self._db[word.upper()][location] = line.rstrip()
fh.close()
def create(self, directory):
os.path.walk(directory, self._visitor, None)
def save(self, filename):
fh = open(filename, "wb")
pickle.dump(self._db, fh, pickle.HIGHEST_PROTOCOL)
fh.close()
def load(self, filename):
fh = open(filename, "rb")
self._db = pickle.load(fh)
fh.close()
def locate_and_display(self, word):
try:
for location in self._db[word.upper()]:
print "%s:%s" % (location, self._db[word.upper()][location])
except KeyError:
return
if __name__ == "__main__":
import sys
import getopt
def usage_and_exit():
print "Usage: %s <options> <word>" % (sys.argv[0])
print "Options:"
print " -h Display this help and exit."
print " -d DIR Create database by traversing directory DIR."
print " -f FILE Use FILE as database filename."
print ""
sys.exit(1)
try:
opts, args = getopt.getopt(sys.argv[1:], "hd:f:")
except getopt.GetoptError as err:
print "Error:", str(err)
usage_and_exit()
directory = None
db_filename = "locate.db"
for o, a in opts:
if o == '-h':
usage_and_exit()
elif o == '-d':
directory = a
elif o == '-f':
db_filename = a
sdb = SearchDatabase()
if directory: # Create Mode
if not os.path.isdir(directory):
print "Error: invalid directory"
usage_and_exit()
sdb.create(directory)
sdb.save(db_filename)
else: # Search Mode
if len(args) == 0:
print "Error: please specify a search word"
usage_and_exit()
sdb.load(db_filename)
for arg in args:
sdb.locate_and_display(arg)