#!/usr/bin/python
"""Report the word type and token counts of a text file.

Usage: <script> FILE

Writes one tab-separated line to standard output containing, in order:
the digits that occur in FILE's path, the number of distinct words
(types) and the total number of words (tokens).  Words are compared
case-insensitively after common punctuation has been removed.

The code runs unchanged on Python 2.7 and Python 3.
"""
import re
import sys
from collections import Counter

# Punctuation characters deleted from the text before it is split into words.
_PUNCTUATION = re.compile(r"[,\"'!?.;]")

# Words are separated by runs of spaces and/or line breaks.  "\r" is listed
# explicitly so that CRLF / CR files are handled without the "U" open mode,
# which was removed in Python 3.11.
# NOTE: tabs are deliberately not separators, matching the original script.
_SEPARATORS = re.compile("[ \r\n]+")


def _count_words(text):
    """Return a Counter mapping each lower-cased word in *text* to its count.

    Empty strings produced by blank lines, a trailing newline, or leading and
    trailing spaces are discarded so they are not counted as words.
    """
    stripped = _PUNCTUATION.sub("", text)
    return Counter(
        word.lower() for word in _SEPARATORS.split(stripped) if word
    )


def _read_text(filename):
    """Return the full contents of *filename*, closing the file afterwards."""
    with open(filename) as handle:
        return handle.read()


def main(argv=None):
    """Run the command-line program and return its exit status.

    *argv* defaults to ``sys.argv``; ``argv[1]`` is the file to analyse.
    Returns 0 on success, or 1 (after writing a usage message to standard
    error) when no file name was supplied.
    """
    if argv is None:
        argv = sys.argv

    try:
        filename = argv[1]
    except IndexError:
        sys.stderr.write("Usage: %s FILE\n" % argv[0])
        return 1

    counts = _count_words(_read_text(filename))
    type_count = len(counts)
    token_count = sum(counts.values())

    # The identifier column is every digit in the path as given, in order.
    file_num = re.sub("[^0-9]", "", filename)

    sys.stdout.write("%s\t%s\t%s\n" % (file_num, type_count, token_count))
    return 0


if __name__ == "__main__":
    sys.exit(main())