#!/usr/bin/python
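# dsdr.py: a small extractive summarizer. It splits the input files into
# sentences, removes stopwords and lemmatizes, builds term-frequency vectors,
# and then greedily picks summary sentences; the selection step appears to
# follow the DSDR linear-reconstruction algorithm (see the comments below).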
from numpy import matrix
import nltk.data
from nltk.corpus import stopwords
import re
from nltk.stem.wordnet import WordNetLemmatizer
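# classes is this repository's local module; it provides the
# sentenceRepresentation and Vector helpers used below.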
from classes import *
import sys, getopt
def usage():
    '''Print the command line usage of the program'''
    print "Usage: " + sys.argv[0] + " [OPTIONS] FILE..."
    print "See " + sys.argv[0] + " -h for more details"
# TODO([email protected]): Remove this function from this file and seperate it into a module.
def removeStopwords(sentences):
    '''Remove stop words and lemmatize each sentence. The sentences are split into words before lemmatizing.'''
    # TODO([email protected]): Add the part of speech to each word it produces
    ret = []
    orig = []
    stmr = WordNetLemmatizer()
    stop = set(stopwords.words('english'))  # build the stopword set once, not once per word
    for sen in sentences:
        orig.append(sen)
        sen = [stmr.lemmatize(word.lower(), 'v') for word in re.sub(r"[^\w]", " ", sen).split() if word.lower() not in stop]
        ret.append(sen)
    return ret, orig
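# For example, removeStopwords(["The dogs were running."]) returns roughly
# ([['dog', 'run']], ['The dogs were running.']); the exact lemmas depend on
# the installed WordNet data.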
# TODO([email protected]): Add more command line options
args = sys.argv[1:]
try:
    opts, files = getopt.getopt(args, "h")  # getopt returns (parsed options, leftover arguments)
except getopt.GetoptError:
    usage()
    sys.exit(1)
for o, _ in opts:
    if o == "-h":
        usage()
        sys.exit(0)
if len(files) == 0:
    usage()
    sys.exit(1)
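# Load NLTK's pre-trained Punkt sentence tokenizer (requires the "punkt" data package).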
tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
sentence = []
for tempfile in files:
    fp = open(tempfile)
    data = fp.read()
    data = tokenizer.tokenize(data)
    for sen in data:
        sentence.append(sen)
    fp.close()
sentence,original_sentence = removeStopwords(sentence)
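# Build the vocabulary: every distinct lemmatized, non-stopword term seen in any sentence.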
bag_of_words = []
for sen in sentence:
    for word in sen:
        if word not in bag_of_words:
            bag_of_words.append(word)
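# Represent each sentence as a term-frequency vector over bag_of_words; B
# collects these rows and global_vector accumulates corpus-wide counts.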
global_vector = [0 for x in range(len(bag_of_words)) ]
sentence_temp = []
B = []
i = 0
for sen in sentence:
    v = [0 for x in range(len(bag_of_words))]
    for word in sen:
        v[bag_of_words.index(word)] += 1
        global_vector[bag_of_words.index(word)] += 1
    if len(sen) > 0:
        sentence_temp.append(sentenceRepresentation(sen, v, original_sentence[i]))
        B.append(v)  # only keep rows for non-empty sentences so B stays aligned with sentence_temp
    i = i + 1
sentence = sentence_temp
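# global_vector is wrapped in the repo's Vector class; it is only consumed by
# the commented-out cosine ordering below.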
global_vector = Vector(global_vector)
#sentence = sorted(sentence,key= lambda x: global_vector.cosine(x.words))
summary = []
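# Greedy sentence selection. This looks like the linear-reconstruction
# ("DSDR-lin") greedy step from Document Summarization Based on Data
# Reconstruction (He et al., AAAI 2012): repeatedly pick the sentence j that
# maximizes ||A[:, j]||^2 / (lambda + A[j, j]) over the sentence Gram matrix
# A, then subtract the pick's contribution from A. lambda = 0.7 below is an
# assumed regularization weight, not a value prescribed by the paper.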
print "How many sentences : "
n = int(raw_input())
B = matrix(B)
print B
B0 = B.T*B
B0 = B0/.7
for i in range(n):
for j in range(len(sentence)):
temp = B0.T[j]*B0.T[j].T
sentence[j].score = temp[0,0] / (1 + B[j,j])
sentence = sorted(sentence,key = lambda x:x.score)
sentence.reverse()
summary.append(sentence[0].original)
temp = B0.T[j].T*B0.T[j]
temp = temp / (1 + B[j,j])
B0 = B0 - temp
for sen in summary:
    print sen
    print "\n"
# TODO([email protected]): Add the sentence regeneration
# TODO: Document all functions used within our code, including the ones that we created.
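# Example run (assumes the NLTK "punkt", "stopwords" and "wordnet" corpora
# have been downloaded, e.g. via nltk.download()):
#   $ python dsdr.py article.txt
#   How many sentences :
#   3
#   ... three summary sentences are printed ...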