19
19
import time
20
20
import scandir
21
21
from collections import Counter
22
- from hashlib import sha1
22
+ from hashlib import sha256
23
23
from naiveBayesClassifier import tokenizer
24
24
from naiveBayesClassifier .trainer import Trainer
25
25
from naiveBayesClassifier .classifier import Classifier
@@ -43,6 +43,8 @@ def getFiles(dir, notRecursive):
43
43
if notRecursive :
44
44
for filename in os .listdir (dir ):
45
45
filePath = os .path .join (dir ,filename )
46
+ if os .path .isdir (filePath ):
47
+ continue
46
48
yield filePath
47
49
# Recursive
48
50
else :
@@ -82,19 +84,19 @@ def parseSampleDir(dir, notRecursive=False, generateInfo=False, onlyRelevantExte
82
84
pass
83
85
84
86
# Extract strings from file
85
- ( strings , sha1sum ) = extractStrings (filePath , generateInfo )
87
+ ( strings , sha256sum ) = extractStrings (filePath , generateInfo )
86
88
87
89
# Skip if SHA256 already known - avoid duplicate files
88
- if sha1sum in known_sha1sums :
90
+ if sha256sum in known_sha1sums :
89
91
#if args.debug:
90
92
print "[-] Skipping strings from %s due to MD5 duplicate detection" % filePath
91
93
continue
92
94
93
95
# Add SHA256 value
94
96
if generateInfo :
95
- known_sha1sums .append (sha1sum )
97
+ known_sha1sums .append (sha256sum )
96
98
file_info [filePath ] = {}
97
- file_info [filePath ]["hash" ] = sha1sum
99
+ file_info [filePath ]["hash" ] = sha256sum
98
100
99
101
# Magic evaluation
100
102
if not args .nomagic :
@@ -114,7 +116,7 @@ def parseSampleDir(dir, notRecursive=False, generateInfo=False, onlyRelevantExte
114
116
file_info [fileName ]["hashes" ] = []
115
117
file_info [fileName ]["folder_names" ] = []
116
118
file_info [fileName ]["count" ] += 1
117
- file_info [fileName ]["hashes" ].append (sha1sum )
119
+ file_info [fileName ]["hashes" ].append (sha256sum )
118
120
if folderName not in file_info [fileName ]["folder_names" ]:
119
121
file_info [fileName ]["folder_names" ].append (folderName )
120
122
@@ -189,7 +191,7 @@ def extractStrings(filePath, generateInfo):
189
191
f .close ()
190
192
# Generate SHA256
191
193
if generateInfo :
192
- sha1sum = sha1 (data ).hexdigest ()
194
+ sha256sum = sha256 (data ).hexdigest ()
193
195
194
196
# Read strings
195
197
strings = re .findall ("[\x1f -\x7e ]{6,}" , data )
@@ -214,7 +216,7 @@ def extractStrings(filePath, generateInfo):
214
216
traceback .print_exc ()
215
217
pass
216
218
217
- return cleaned_strings , sha1sum
219
+ return cleaned_strings , sha256sum
218
220
219
221
220
222
def sampleStringEvaluation (sample_string_stats , good_strings , file_info ):
@@ -260,65 +262,66 @@ def sampleStringEvaluation(sample_string_stats, good_strings, file_info):
260
262
print "Appending %s to %s" % ( string , fileName )
261
263
inverse_stats [fileName ].append (string )
262
264
263
- # SUPER RULE GENERATION -------------------------------------------
264
-
265
- super_rules = []
266
- if not args .nosuper and not args .inverse :
265
+ # SUPER RULE GENERATION -------------------------------------------
267
266
268
- # SUPER RULES GENERATOR - preliminary work
269
- # If a string occurs more than once in different files
270
- if sample_string_stats [string ]["count" ] > 1 :
271
- if args .debug :
272
- print "OVERLAP Count: %s\n String: \" %s\" %s" % ( sample_string_stats [string ]["count" ], string , "\n FILE: " .join (sample_string_stats [string ]["files" ]) )
273
- # Create a combination string from the file set that matches to that string
274
- combi = ":" .join (sorted (sample_string_stats [string ]["files" ]))
275
- # print "STRING: " + string
276
- # print "COMBI: " + combi
277
- # If combination not yet known
278
- if combi not in combinations :
279
- combinations [combi ] = {}
280
- combinations [combi ]["count" ] = 1
281
- combinations [combi ]["strings" ] = []
282
- combinations [combi ]["strings" ].append (string )
283
- combinations [combi ]["files" ] = sample_string_stats [string ]["files" ]
284
- else :
285
- combinations [combi ]["count" ] += 1
286
- combinations [combi ]["strings" ].append (string )
287
- # Set the maximum combination count
288
- if combinations [combi ]["count" ] > max_combi_count :
289
- max_combi_count = combinations [combi ]["count" ]
290
- # print "Max Combi Count set to: %s" % max_combi_count
291
-
292
- print "[+] Generating Super Rules ... (a lot of foo magic)"
293
- for combi_count in range (max_combi_count , 1 , - 1 ):
294
- for combi in combinations :
295
- if combi_count == combinations [combi ]["count" ]:
296
- #print "Count %s - Combi %s" % ( str(combinations[combi]["count"]), combi )
297
- # Filter the string set
298
- #print "BEFORE"
299
- #print len(combinations[combi]["strings"])
300
- #print combinations[combi]["strings"]
301
- string_set = combinations [combi ]["strings" ]
267
+ super_rules = []
268
+ if not args .nosuper and not args .inverse :
269
+
270
+ # SUPER RULES GENERATOR - preliminary work
271
+ # If a string occurs more than once in different files
272
+ # print sample_string_stats[string]["count"]
273
+ if sample_string_stats [string ]["count" ] > 1 :
274
+ if args .debug :
275
+ print "OVERLAP Count: %s\n String: \" %s\" %s" % ( sample_string_stats [string ]["count" ], string , "\n FILE: " .join (sample_string_stats [string ]["files" ]) )
276
+ # Create a combination string from the file set that matches to that string
277
+ combi = ":" .join (sorted (sample_string_stats [string ]["files" ]))
278
+ # print "STRING: " + string
279
+ # print "COMBI: " + combi
280
+ # If combination not yet known
281
+ if combi not in combinations :
282
+ combinations [combi ] = {}
283
+ combinations [combi ]["count" ] = 1
302
284
combinations [combi ]["strings" ] = []
303
- combinations [combi ]["strings" ] = filterStringSet (string_set )
304
- #print combinations[combi]["strings"]
305
- #print "AFTER"
306
- #print len(combinations[combi]["strings"])
307
- # Combi String count after filtering
308
- #print "String count after filtering: %s" % str(len(combinations[combi]["strings"]))
309
-
310
- # If the string set of the combination has a required size
311
- if len (combinations [combi ]["strings" ]) >= int (args .rc ):
312
- # Remove the files in the combi rule from the simple set
313
- if args .nosimple :
314
- for file in combinations [combi ]["files" ]:
315
- if file in file_strings :
316
- del file_strings [file ]
317
- # Add it as a super rule
318
- print "[-] Adding Super Rule with %s strings." % str (len (combinations [combi ]["strings" ]))
319
- #if args.debug:
320
- #print "Rule Combi: %s" % combi
321
- super_rules .append (combinations [combi ])
285
+ combinations [combi ]["strings" ].append (string )
286
+ combinations [combi ]["files" ] = sample_string_stats [string ]["files" ]
287
+ else :
288
+ combinations [combi ]["count" ] += 1
289
+ combinations [combi ]["strings" ].append (string )
290
+ # Set the maximum combination count
291
+ if combinations [combi ]["count" ] > max_combi_count :
292
+ max_combi_count = combinations [combi ]["count" ]
293
+ # print "Max Combi Count set to: %s" % max_combi_count
294
+
295
+ print "[+] Generating Super Rules ... (a lot of foo magic)"
296
+ for combi_count in range (max_combi_count , 1 , - 1 ):
297
+ for combi in combinations :
298
+ if combi_count == combinations [combi ]["count" ]:
299
+ #print "Count %s - Combi %s" % ( str(combinations[combi]["count"]), combi )
300
+ # Filter the string set
301
+ #print "BEFORE"
302
+ #print len(combinations[combi]["strings"])
303
+ #print combinations[combi]["strings"]
304
+ string_set = combinations [combi ]["strings" ]
305
+ combinations [combi ]["strings" ] = []
306
+ combinations [combi ]["strings" ] = filterStringSet (string_set )
307
+ #print combinations[combi]["strings"]
308
+ #print "AFTER"
309
+ #print len(combinations[combi]["strings"])
310
+ # Combi String count after filtering
311
+ #print "String count after filtering: %s" % str(len(combinations[combi]["strings"]))
312
+
313
+ # If the string set of the combination has a required size
314
+ if len (combinations [combi ]["strings" ]) >= int (args .rc ):
315
+ # Remove the files in the combi rule from the simple set
316
+ if args .nosimple :
317
+ for file in combinations [combi ]["files" ]:
318
+ if file in file_strings :
319
+ del file_strings [file ]
320
+ # Add it as a super rule
321
+ print "[-] Adding Super Rule with %s strings." % str (len (combinations [combi ]["strings" ]))
322
+ #if args.debug:
323
+ #print "Rule Combi: %s" % combi
324
+ super_rules .append (combinations [combi ])
322
325
323
326
# Return all data
324
327
return (file_strings , combinations , super_rules , inverse_stats )
@@ -618,8 +621,11 @@ def generateGeneralCondition(file_info):
618
621
619
622
try :
620
623
for filePath in file_info :
621
- magic = file_info_mal [filePath ]["magic" ]
622
- size = file_info_mal [filePath ]["size" ]
624
+ # Short file name info used for inverse generation has no magic/size fields
625
+ if "magic" not in file_info [filePath ]:
626
+ continue
627
+ magic = file_info [filePath ]["magic" ]
628
+ size = file_info [filePath ]["size" ]
623
629
if magic not in magic_headers and magic != "" :
624
630
magic_headers .append (magic )
625
631
if size not in file_sizes :
@@ -641,6 +647,9 @@ def generateGeneralCondition(file_info):
641
647
condition = "{0}" .format (getFileRange (max (file_sizes )))
642
648
643
649
except Exception , e :
650
+ if args .debug :
651
+ traceback .print_exc ()
652
+ exit (1 )
644
653
print "[E] ERROR while generating general condition - check the global rule and remove it if it's faulty"
645
654
646
655
return condition
@@ -858,6 +867,7 @@ def createRules(file_strings, super_rules, file_info, inverse_stats):
858
867
if not args .nosuper and not args .inverse :
859
868
860
869
fh .write ("/* Super Rules ------------------------------------------------------------- */\n \n " )
870
+ super_rule_names = []
861
871
862
872
print "[+] Generating super rules ..."
863
873
printed_combi = {}
@@ -880,7 +890,11 @@ def createRules(file_strings, super_rules, file_info, inverse_stats):
880
890
rule_name += "_" + cleanedName
881
891
882
892
# Shorten rule name
883
- rule_name = rule_name [:127 ]
893
+ rule_name = rule_name [:124 ]
894
+ # Add count if rule name already taken
895
+ if rule_name not in super_rule_names :
896
+ rule_name = "%s_%s" % (rule_name , super_rule_count )
897
+ super_rule_names .append (rule_name )
884
898
885
899
# Create a list of files
886
900
file_listing = ", " .join (file_list )
@@ -906,7 +920,7 @@ def createRules(file_strings, super_rules, file_info, inverse_stats):
906
920
rule += "\t \t date = \" %s\" \n " % getTimestampBasic ()
907
921
rule += "\t \t super_rule = 1\n "
908
922
for i , filePath in enumerate (super_rule ["files" ]):
909
- rule += "\t \t hash%s = \" %s\" \n " % (str (i ) + 1 , file_info [filePath ]["hash" ])
923
+ rule += "\t \t hash%s = \" %s\" \n " % (str (i + 1 ) , file_info [filePath ]["hash" ])
910
924
911
925
rule += "\t strings:\n "
912
926
@@ -990,9 +1004,9 @@ def getRuleStrings(elements):
990
1004
991
1005
# Checking string length
992
1006
is_fullword = True
993
- if len (string ) > 80 :
1007
+ if len (string ) > args . s :
994
1008
# cut string
995
- string = string [:80 ].rstrip ("\\ " )
1009
+ string = string [:args . s ].rstrip ("\\ " )
996
1010
# not fullword anymore
997
1011
is_fullword = False
998
1012
# Show as fullword
@@ -1171,7 +1185,7 @@ def printWelcome():
1171
1185
print " Yara Rule Generator"
1172
1186
print " by Florian Roth"
1173
1187
print " July 2015"
1174
- print " Version 0.14.1 "
1188
+ print " Version 0.14.2 "
1175
1189
print " "
1176
1190
print "###############################################################################"
1177
1191
0 commit comments