Skip to content

Commit 7794c45

Browse files
committed
v0.14.2
- Fixed Bug in Super Rule Generation - Switched from SHA1 to SHA256 (convenience purposes) - General rule bugfix - Bugfix max string length application Former-commit-id: e896d38 [formerly 4522fd3] Former-commit-id: dc38785
1 parent 260b989 commit 7794c45

File tree

5 files changed

+98
-84
lines changed

5 files changed

+98
-84
lines changed

README.md

+12-12
Original file line numberDiff line numberDiff line change
@@ -106,7 +106,7 @@ See the following blog post for a more detailed description on how to use yarGen
106106

107107
![Generator Run](./screens/yargen-running.png)
108108

109-
![Output Rule](./screens/output-rule-0.11.png)
109+
![Output Rule](./screens/output-rule-0.14.1.png)
110110

111111
As you can see in the screenshot above, you'll get a rule that contains strings that are not found in the goodware strings database.
112112

@@ -118,7 +118,7 @@ To get a more generic rule, remove string $s5, which is very specific for this c
118118

119119
### Use the shipped database (FAST) to create some rules
120120

121-
python yarGen.py -m X:\MAL\Case1401
121+
```python yarGen.py -m X:\MAL\Case1401```
122122

123123
Use the shipped database of goodware strings and scan the malware directory
124124
"X:\MAL" recursively. Create rules for all files included in this directory and
@@ -130,37 +130,37 @@ directory.
130130
yarGen will by default use the top 20 strings based on their score. To see how a
131131
certain string in the rule scored, use the "--score" parameter.
132132

133-
python yarGen.py --score -m X:\MAL\Case1401
133+
```python yarGen.py --score -m X:\MAL\Case1401```
134134

135135
### Use only strings with a certain minimum score
136136

137137
In order to use only strings for your rules that match a certain minimum score, use the "-z" parameter. It is good practice to first create rules with "--score" and then perform a second run with a minimum score set for your sample set via "-z".
138138

139-
python yarGen.py --score -z 5 -m X:\MAL\Case1401
139+
```python yarGen.py --score -z 5 -m X:\MAL\Case1401```
140140

141141
### Preset author and reference
142142

143-
python yarGen.py -a "Florian Roth" -r "http://goo.gl/c2qgFx" -m /opt/mal/case_441 -o case441.yar
143+
```python yarGen.py -a "Florian Roth" -r "http://goo.gl/c2qgFx" -m /opt/mal/case_441 -o case441.yar```
144144

145145
### Exclude strings from Goodware samples
146146

147-
python yarGen.py --excludegood -m /opt/mal/case_441
147+
```python yarGen.py --excludegood -m /opt/mal/case_441```
148148

149149
### Suppress simple rule if already covered by a super rule
150150

151-
python yarGen.py --nosimple -m /opt/mal/case_441
151+
```python yarGen.py --nosimple -m /opt/mal/case_441```
152152

153153
### Show debugging output
154154

155-
python yarGen.py --debug -m /opt/mal/case_441
155+
```python yarGen.py --debug -m /opt/mal/case_441```
156156

157157
### Create a new goodware strings database
158158

159-
python yarGen.py -c -g C:\Windows\System32
159+
```python yarGen.py -c -g C:\Windows\System32```
160160

161161
### Update the goodware strings database (append new strings to the old ones)
162162

163-
python yarGen.py -u -g "C:\Program Files"
163+
```python yarGen.py -u -g "C:\Program Files"```
164164

165165
### Inverse rule creation (still beta)
166166

@@ -178,8 +178,8 @@ E.g. you want to create inverse rules for all Windows executables in the System3
178178

179179
yarGen then creates rules that identify e.g. the file name "cmd.exe" in a path ending with "System32" and check whether the file contains certain necessary strings. If the strings don't show up, the rule will fire. This indicates a replaced system file or a malware file that tries to masquerade as a system file.
180180

181-
python yarGen.py --inverse -oe -m G:\goodware\
181+
```python yarGen.py --inverse -oe -m G:\goodware\```
182182

183183
You can also instruct yarGen not to include the file path but solely rely on the filename.
184184

185-
python yarGen.py --inverse -oe --nodirname -m G:\goodware\
185+
```python yarGen.py --inverse -oe --nodirname -m G:\goodware\```

screens/output-rule-0.11.png

-135 KB
Binary file not shown.

screens/output-rule-0.14.1.png

168 KB
Loading

screens/yargen-running.png

18.1 KB
Loading

yarGen.py

+86-72
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,7 @@
1919
import time
2020
import scandir
2121
from collections import Counter
22-
from hashlib import sha1
22+
from hashlib import sha256
2323
from naiveBayesClassifier import tokenizer
2424
from naiveBayesClassifier.trainer import Trainer
2525
from naiveBayesClassifier.classifier import Classifier
@@ -43,6 +43,8 @@ def getFiles(dir, notRecursive):
4343
if notRecursive:
4444
for filename in os.listdir(dir):
4545
filePath = os.path.join(dir,filename)
46+
if os.path.isdir(filePath):
47+
continue
4648
yield filePath
4749
# Recursive
4850
else:
@@ -82,19 +84,19 @@ def parseSampleDir(dir, notRecursive=False, generateInfo=False, onlyRelevantExte
8284
pass
8385

8486
# Extract strings from file
85-
( strings, sha1sum ) = extractStrings(filePath, generateInfo)
87+
( strings, sha256sum ) = extractStrings(filePath, generateInfo)
8688

8789
# Skip if MD5 already known - avoid duplicate files
88-
if sha1sum in known_sha1sums:
90+
if sha256sum in known_sha1sums:
8991
#if args.debug:
9092
print "[-] Skipping strings from %s due to MD5 duplicate detection" % filePath
9193
continue
9294

9395
# Add md5 value
9496
if generateInfo:
95-
known_sha1sums.append(sha1sum)
97+
known_sha1sums.append(sha256sum)
9698
file_info[filePath] = {}
97-
file_info[filePath]["hash"] = sha1sum
99+
file_info[filePath]["hash"] = sha256sum
98100

99101
# Magic evaluation
100102
if not args.nomagic:
@@ -114,7 +116,7 @@ def parseSampleDir(dir, notRecursive=False, generateInfo=False, onlyRelevantExte
114116
file_info[fileName]["hashes"] = []
115117
file_info[fileName]["folder_names"] = []
116118
file_info[fileName]["count"] += 1
117-
file_info[fileName]["hashes"].append(sha1sum)
119+
file_info[fileName]["hashes"].append(sha256sum)
118120
if folderName not in file_info[fileName]["folder_names"]:
119121
file_info[fileName]["folder_names"].append(folderName)
120122

@@ -189,7 +191,7 @@ def extractStrings(filePath, generateInfo):
189191
f.close()
190192
# Generate md5
191193
if generateInfo:
192-
sha1sum = sha1(data).hexdigest()
194+
sha256sum = sha256(data).hexdigest()
193195

194196
# Read strings
195197
strings = re.findall("[\x1f-\x7e]{6,}", data)
@@ -214,7 +216,7 @@ def extractStrings(filePath, generateInfo):
214216
traceback.print_exc()
215217
pass
216218

217-
return cleaned_strings, sha1sum
219+
return cleaned_strings, sha256sum
218220

219221

220222
def sampleStringEvaluation(sample_string_stats, good_strings, file_info):
@@ -260,65 +262,66 @@ def sampleStringEvaluation(sample_string_stats, good_strings, file_info):
260262
print "Appending %s to %s" % ( string, fileName )
261263
inverse_stats[fileName].append(string)
262264

263-
# SUPER RULE GENERATION -------------------------------------------
264-
265-
super_rules = []
266-
if not args.nosuper and not args.inverse:
265+
# SUPER RULE GENERATION -------------------------------------------
267266

268-
# SUPER RULES GENERATOR - preliminary work
269-
# If a string occurs more than once in different files
270-
if sample_string_stats[string]["count"] > 1:
271-
if args.debug:
272-
print "OVERLAP Count: %s\nString: \"%s\"%s" % ( sample_string_stats[string]["count"], string, "\nFILE: ".join(sample_string_stats[string]["files"]) )
273-
# Create a combination string from the file set that matches to that string
274-
combi = ":".join(sorted(sample_string_stats[string]["files"]))
275-
# print "STRING: " + string
276-
# print "COMBI: " + combi
277-
# If combination not yet known
278-
if combi not in combinations:
279-
combinations[combi] = {}
280-
combinations[combi]["count"] = 1
281-
combinations[combi]["strings"] = []
282-
combinations[combi]["strings"].append(string)
283-
combinations[combi]["files"] = sample_string_stats[string]["files"]
284-
else:
285-
combinations[combi]["count"] += 1
286-
combinations[combi]["strings"].append(string)
287-
# Set the maximum combination count
288-
if combinations[combi]["count"] > max_combi_count:
289-
max_combi_count = combinations[combi]["count"]
290-
# print "Max Combi Count set to: %s" % max_combi_count
291-
292-
print "[+] Generating Super Rules ... (a lot of foo magic)"
293-
for combi_count in range(max_combi_count, 1, -1):
294-
for combi in combinations:
295-
if combi_count == combinations[combi]["count"]:
296-
#print "Count %s - Combi %s" % ( str(combinations[combi]["count"]), combi )
297-
# Filter the string set
298-
#print "BEFORE"
299-
#print len(combinations[combi]["strings"])
300-
#print combinations[combi]["strings"]
301-
string_set = combinations[combi]["strings"]
267+
super_rules = []
268+
if not args.nosuper and not args.inverse:
269+
270+
# SUPER RULES GENERATOR - preliminary work
271+
# If a string occurs more than once in different files
272+
# print sample_string_stats[string]["count"]
273+
if sample_string_stats[string]["count"] > 1:
274+
if args.debug:
275+
print "OVERLAP Count: %s\nString: \"%s\"%s" % ( sample_string_stats[string]["count"], string, "\nFILE: ".join(sample_string_stats[string]["files"]) )
276+
# Create a combination string from the file set that matches to that string
277+
combi = ":".join(sorted(sample_string_stats[string]["files"]))
278+
# print "STRING: " + string
279+
# print "COMBI: " + combi
280+
# If combination not yet known
281+
if combi not in combinations:
282+
combinations[combi] = {}
283+
combinations[combi]["count"] = 1
302284
combinations[combi]["strings"] = []
303-
combinations[combi]["strings"] = filterStringSet(string_set)
304-
#print combinations[combi]["strings"]
305-
#print "AFTER"
306-
#print len(combinations[combi]["strings"])
307-
# Combi String count after filtering
308-
#print "String count after filtering: %s" % str(len(combinations[combi]["strings"]))
309-
310-
# If the string set of the combination has a required size
311-
if len(combinations[combi]["strings"]) >= int(args.rc):
312-
# Remove the files in the combi rule from the simple set
313-
if args.nosimple:
314-
for file in combinations[combi]["files"]:
315-
if file in file_strings:
316-
del file_strings[file]
317-
# Add it as a super rule
318-
print "[-] Adding Super Rule with %s strings." % str(len(combinations[combi]["strings"]))
319-
#if args.debug:
320-
#print "Rule Combi: %s" % combi
321-
super_rules.append(combinations[combi])
285+
combinations[combi]["strings"].append(string)
286+
combinations[combi]["files"] = sample_string_stats[string]["files"]
287+
else:
288+
combinations[combi]["count"] += 1
289+
combinations[combi]["strings"].append(string)
290+
# Set the maximum combination count
291+
if combinations[combi]["count"] > max_combi_count:
292+
max_combi_count = combinations[combi]["count"]
293+
# print "Max Combi Count set to: %s" % max_combi_count
294+
295+
print "[+] Generating Super Rules ... (a lot of foo magic)"
296+
for combi_count in range(max_combi_count, 1, -1):
297+
for combi in combinations:
298+
if combi_count == combinations[combi]["count"]:
299+
#print "Count %s - Combi %s" % ( str(combinations[combi]["count"]), combi )
300+
# Filter the string set
301+
#print "BEFORE"
302+
#print len(combinations[combi]["strings"])
303+
#print combinations[combi]["strings"]
304+
string_set = combinations[combi]["strings"]
305+
combinations[combi]["strings"] = []
306+
combinations[combi]["strings"] = filterStringSet(string_set)
307+
#print combinations[combi]["strings"]
308+
#print "AFTER"
309+
#print len(combinations[combi]["strings"])
310+
# Combi String count after filtering
311+
#print "String count after filtering: %s" % str(len(combinations[combi]["strings"]))
312+
313+
# If the string set of the combination has a required size
314+
if len(combinations[combi]["strings"]) >= int(args.rc):
315+
# Remove the files in the combi rule from the simple set
316+
if args.nosimple:
317+
for file in combinations[combi]["files"]:
318+
if file in file_strings:
319+
del file_strings[file]
320+
# Add it as a super rule
321+
print "[-] Adding Super Rule with %s strings." % str(len(combinations[combi]["strings"]))
322+
#if args.debug:
323+
#print "Rule Combi: %s" % combi
324+
super_rules.append(combinations[combi])
322325

323326
# Return all data
324327
return (file_strings, combinations, super_rules, inverse_stats)
@@ -618,8 +621,11 @@ def generateGeneralCondition(file_info):
618621

619622
try:
620623
for filePath in file_info:
621-
magic = file_info_mal[filePath]["magic"]
622-
size = file_info_mal[filePath]["size"]
624+
# Short file name info used for inverse generation has no magic/size fields
625+
if "magic" not in file_info[filePath]:
626+
continue
627+
magic = file_info[filePath]["magic"]
628+
size = file_info[filePath]["size"]
623629
if magic not in magic_headers and magic != "":
624630
magic_headers.append(magic)
625631
if size not in file_sizes:
@@ -641,6 +647,9 @@ def generateGeneralCondition(file_info):
641647
condition = "{0}".format(getFileRange(max(file_sizes)))
642648

643649
except Exception, e:
650+
if args.debug:
651+
traceback.print_exc()
652+
exit(1)
644653
print "[E] ERROR while generating general condition - check the global rule and remove it if it's faulty"
645654

646655
return condition
@@ -858,6 +867,7 @@ def createRules(file_strings, super_rules, file_info, inverse_stats):
858867
if not args.nosuper and not args.inverse:
859868

860869
fh.write("/* Super Rules ------------------------------------------------------------- */\n\n")
870+
super_rule_names = []
861871

862872
print "[+] Generating super rules ..."
863873
printed_combi = {}
@@ -880,7 +890,11 @@ def createRules(file_strings, super_rules, file_info, inverse_stats):
880890
rule_name += "_" + cleanedName
881891

882892
# Shorten rule name
883-
rule_name = rule_name[:127]
893+
rule_name = rule_name[:124]
894+
# Add count if rule name already taken
895+
if rule_name not in super_rule_names:
896+
rule_name = "%s_%s" % (rule_name, super_rule_count)
897+
super_rule_names.append(rule_name)
884898

885899
# Create a list of files
886900
file_listing = ", ".join(file_list)
@@ -906,7 +920,7 @@ def createRules(file_strings, super_rules, file_info, inverse_stats):
906920
rule += "\t\tdate = \"%s\"\n" % getTimestampBasic()
907921
rule += "\t\tsuper_rule = 1\n"
908922
for i, filePath in enumerate(super_rule["files"]):
909-
rule += "\t\thash%s = \"%s\"\n" % (str(i)+1, file_info[filePath]["hash"])
923+
rule += "\t\thash%s = \"%s\"\n" % (str(i+1), file_info[filePath]["hash"])
910924

911925
rule += "\tstrings:\n"
912926

@@ -990,9 +1004,9 @@ def getRuleStrings(elements):
9901004

9911005
# Checking string length
9921006
is_fullword = True
993-
if len(string) > 80:
1007+
if len(string) > args.s:
9941008
# cut string
995-
string = string[:80].rstrip("\\")
1009+
string = string[:args.s].rstrip("\\")
9961010
# not fullword anymore
9971011
is_fullword = False
9981012
# Show as fullword
@@ -1171,7 +1185,7 @@ def printWelcome():
11711185
print " Yara Rule Generator"
11721186
print " by Florian Roth"
11731187
print " July 2015"
1174-
print " Version 0.14.1"
1188+
print " Version 0.14.2"
11751189
print " "
11761190
print "###############################################################################"
11771191

0 commit comments

Comments
 (0)