10
10
import argparse
11
11
import itertools
12
12
import contextlib
13
- from typing import Set , Dict , List , Tuple , Union , Literal , Callable , Iterable , Optional , Sequence
13
+ from typing import Set , Dict , List , Tuple , Literal , Callable , Iterable , Optional , Sequence
14
14
from dataclasses import field , dataclass
15
15
16
16
import pefile
30
30
from floss .qs .db .expert import ExpertStringDatabase
31
31
from floss .qs .db .winapi import WindowsApiStringDatabase
32
32
33
- MIN_STR_LEN = 6
34
-
35
33
logger = logging .getLogger ("quantumstrand" )
36
34
37
35
@@ -45,6 +43,7 @@ def timing(msg: str):
45
43
46
44
@dataclass
47
45
class Range :
46
+ "a range of contiguous integer values, such as offsets within a byte sequence"
48
47
offset : int
49
48
length : int
50
49
@@ -53,22 +52,13 @@ def end(self) -> int:
53
52
return self .offset + self .length
54
53
55
54
def slice (self , offset , size ) -> "Range" :
56
- "create a new range thats a sub-range of this one"
55
+ "create a new range that's a sub-range of this one, using relative offsets"
57
56
assert offset < self .length
58
57
assert offset + size <= self .length
59
58
return Range (self .offset + offset , size )
60
59
61
- def __contains__ (self , other : Union [int , "Range" ]) -> bool :
62
- if isinstance (other , int ):
63
- # this range strictly contains the point
64
- return self .offset <= other < self .end
65
- elif isinstance (other , Range ):
66
- # this range strictly contains the other one
67
- return (other .offset in self ) and (other .end in self )
68
- else :
69
- raise TypeError (f"unsupported type: { type (other )} " )
70
-
71
60
def __iter__ (self ):
61
+ "iterate over the values in this range"
72
62
yield from range (self .offset , self .end )
73
63
74
64
def __repr__ (self ):
@@ -91,10 +81,11 @@ class Slice:
91
81
92
82
@property
93
83
def data (self ) -> bytes :
84
+ "get the bytes in this slice, copying the data out"
94
85
return self .buf [self .range .offset : self .range .end ]
95
86
96
87
def slice (self , offset , size ) -> "Slice" :
97
- "create a new slice thats a sub-slice of this one"
88
+ "create a new slice that's a sub-slice of this one, using relative offsets"
98
89
return Slice (self .buf , self .range .slice (offset , size ))
99
90
100
91
@classmethod
@@ -130,6 +121,7 @@ def offset(self) -> int:
130
121
return self .string .slice .range .offset
131
122
132
123
124
+ MIN_STR_LEN = 6
133
125
ASCII_BYTE = r" !\"#\$%&\'\(\)\*\+,-\./0123456789:;<=>\?@ABCDEFGHIJKLMNOPQRSTUVWXYZ\[\]\^_`abcdefghijklmnopqrstuvwxyz\{\|\}\\\~\t" .encode (
134
126
"ascii"
135
127
)
@@ -138,7 +130,7 @@ def offset(self) -> int:
138
130
139
131
140
132
def extract_ascii_strings (slice : Slice , n : int = MIN_STR_LEN ) -> Iterable [ExtractedString ]:
141
- """Extract ASCII strings from the given binary data."" "
133
+ "enumerate ASCII strings in the given binary data"
142
134
143
135
if not slice .range .length :
144
136
return
@@ -158,7 +150,7 @@ def extract_ascii_strings(slice: Slice, n: int = MIN_STR_LEN) -> Iterable[Extrac
158
150
159
151
160
152
def extract_unicode_strings (slice : Slice , n : int = MIN_STR_LEN ) -> Iterable [ExtractedString ]:
161
- """Extract naive UTF-16 strings from the given binary data."" "
153
+ "enumerate naive UTF-16 strings in the given binary data"
162
154
163
155
if not slice .range .length :
164
156
return
@@ -183,6 +175,7 @@ def extract_unicode_strings(slice: Slice, n: int = MIN_STR_LEN) -> Iterable[Extr
183
175
184
176
185
177
def extract_strings (slice : Slice , n : int = MIN_STR_LEN ) -> Iterable [ExtractedString ]:
178
+ "enumerate ASCII and naive UTF-16 strings in the given binary data"
186
179
return list (
187
180
sorted (
188
181
itertools .chain (extract_ascii_strings (slice , n ), extract_unicode_strings (slice , n )),
@@ -464,9 +457,14 @@ def load_databases() -> Sequence[Tagger]:
464
457
465
458
data_path = pathlib .Path (floss .qs .db .oss .__file__ ).parent / "data"
466
459
460
+ # below, I use `if True` blocks to delineate the different databases.
461
+ # these could be functions, at the expense of more visual noise.
462
+ # note that each one creates a closure over the database object.
463
+
467
464
if True :
468
465
winapi_database = floss .qs .db .winapi .WindowsApiStringDatabase .from_dir (data_path / "winapi" )
469
466
467
+ # note closure over winapi_database
470
468
def winapi_database_tagger (s : ExtractedString ) -> Sequence [Tag ]:
471
469
return query_winapi_name_database (winapi_database , s .string )
472
470
@@ -475,6 +473,7 @@ def winapi_database_tagger(s: ExtractedString) -> Sequence[Tag]:
475
473
if True :
476
474
capa_expert_database = ExpertStringDatabase .from_file (data_path / "expert" / "capa.jsonl" )
477
475
476
+ # note closure over capa_expert_database
478
477
def capa_expert_database_tagger (s : ExtractedString ) -> Sequence [Tag ]:
479
478
return query_expert_string_database (capa_expert_database , s .string )
480
479
@@ -487,6 +486,7 @@ def capa_expert_database_tagger(s: ExtractedString) -> Sequence[Tag]:
487
486
488
487
library_databases .append (OpenSourceStringDatabase .from_file (data_path / "crt" / "msvc_v143.jsonl.gz" ))
489
488
489
+ # note closure over library_databases
490
490
def library_databases_tagger (s : ExtractedString ) -> Sequence [Tag ]:
491
491
return query_library_string_databases (library_databases , s .string )
492
492
@@ -501,6 +501,7 @@ def library_databases_tagger(s: ExtractedString) -> Sequence[Tag]:
501
501
StringGlobalPrevalenceDatabase .from_file (data_path / "gp" / "cwindb-dotnet.jsonl.gz" )
502
502
)
503
503
504
+ # note closure over global_prevalence_database
504
505
def global_prevalence_database_tagger (s : ExtractedString ) -> Sequence [Tag ]:
505
506
return query_global_prevalence_database (global_prevalence_database , s .string )
506
507
@@ -509,6 +510,7 @@ def global_prevalence_database_tagger(s: ExtractedString) -> Sequence[Tag]:
509
510
if True :
510
511
global_prevalence_hash_database_xaa = StringHashDatabase .from_file (data_path / "gp" / "xaa-hashes.bin" )
511
512
513
+ # note closure over global_prevalence_hash_database_xaa
512
514
def global_prevalence_hash_database_xaa_tagger (s : ExtractedString ) -> Sequence [Tag ]:
513
515
return query_global_prevalence_hash_database (global_prevalence_hash_database_xaa , s .string )
514
516
@@ -517,6 +519,7 @@ def global_prevalence_hash_database_xaa_tagger(s: ExtractedString) -> Sequence[T
517
519
if True :
518
520
global_prevalence_hash_database_yaa = StringHashDatabase .from_file (data_path / "gp" / "yaa-hashes.bin" )
519
521
522
+ # note closure over global_prevalence_hash_database_yaa
520
523
def global_prevalence_hash_database_yaa_tagger (s : ExtractedString ) -> Sequence [Tag ]:
521
524
return query_global_prevalence_hash_database (global_prevalence_hash_database_yaa , s .string )
522
525
@@ -527,6 +530,27 @@ def global_prevalence_hash_database_yaa_tagger(s: ExtractedString) -> Sequence[T
527
530
528
531
@dataclass
529
532
class Layout (abc .ABC ):
533
+ """
534
+ recursively describe a region of data, as a tree.
535
+ the compute_layout routines construct this tree.
536
+
537
+ each node in the tree (Layout), describes a range of the data.
538
+ it may have children, which describe sub-ranges of the data.
539
+ children don't overlap nor extend before/beyond the parent range.
540
+ children are ordered by their offset in the data.
541
+ children don't have to be contiguous - there can be gaps, or none at all.
542
+ there are routines for traversing to the prior/next sibling, if any,
543
+ and accessor properties for the parent and children.
544
+
545
+ each node has a nice human readable name.
546
+ each node has a list of strings that are contained by the node;
547
+ these strings don't overlap with any children strings, they're only found in the gaps.
548
+
549
+ note that `Layout` is the abstract base class for nodes in the tree.
550
+ subclasses are used to represent different types of regions,
551
+ such as a PE file, a section, a segment, or a resource.
552
+ subclasses can provide more specific behavior when it comes to tagging strings.
553
+ """
530
554
slice : Slice
531
555
532
556
# human readable name
@@ -588,10 +612,12 @@ def add_child(self, child: "Layout"):
588
612
589
613
@property
590
614
def offset (self ) -> int :
615
+ "convenience: the start offset of this layout's range"
591
616
return self .slice .range .offset
592
617
593
618
@property
594
619
def end (self ) -> int :
620
+ "convenience: the end offset (offset + length) of this layout's range"
595
621
return self .slice .range .end
596
622
597
623
def tag_strings (self , taggers : Sequence [Tagger ]):
@@ -650,8 +676,7 @@ class SectionLayout(Layout):
650
676
651
677
@dataclass
652
678
class SegmentLayout (Layout ):
653
- """region not covered by any section"""
654
-
679
+ """region not covered by any section, such as the PE header or an overlay"""
655
680
pass
656
681
657
682
@@ -1014,7 +1039,7 @@ def render_strings(
1014
1039
# rsrc: BINARY/102/0 (pe)
1015
1040
return render_strings (console , layout .children [0 ], tag_rules , depth , name_hint = layout .name )
1016
1041
1017
- BORDER_STYLE = Style ( color = "grey50" )
1042
+ BORDER_STYLE = MUTED_STYLE
1018
1043
1019
1044
name = layout .name
1020
1045
if name_hint :
0 commit comments