Skip to content

Commit 4d92299

Browse files
qs: documentation
1 parent c0268f2 commit 4d92299

File tree

1 file changed

+45
-20
lines changed

1 file changed

+45
-20
lines changed

floss/qs/main.py

Lines changed: 45 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,7 @@
1010
import argparse
1111
import itertools
1212
import contextlib
13-
from typing import Set, Dict, List, Tuple, Union, Literal, Callable, Iterable, Optional, Sequence
13+
from typing import Set, Dict, List, Tuple, Literal, Callable, Iterable, Optional, Sequence
1414
from dataclasses import field, dataclass
1515

1616
import pefile
@@ -30,8 +30,6 @@
3030
from floss.qs.db.expert import ExpertStringDatabase
3131
from floss.qs.db.winapi import WindowsApiStringDatabase
3232

33-
MIN_STR_LEN = 6
34-
3533
logger = logging.getLogger("quantumstrand")
3634

3735

@@ -45,6 +43,7 @@ def timing(msg: str):
4543

4644
@dataclass
4745
class Range:
46+
"a range of contiguous integer values, such as offsets within a byte sequence"
4847
offset: int
4948
length: int
5049

@@ -53,22 +52,13 @@ def end(self) -> int:
5352
return self.offset + self.length
5453

5554
def slice(self, offset, size) -> "Range":
56-
"create a new range thats a sub-range of this one"
55+
"create a new range that's a sub-range of this one, using relative offsets"
5756
assert offset < self.length
5857
assert offset + size <= self.length
5958
return Range(self.offset + offset, size)
6059

61-
def __contains__(self, other: Union[int, "Range"]) -> bool:
62-
if isinstance(other, int):
63-
# this range strictly contains the point
64-
return self.offset <= other < self.end
65-
elif isinstance(other, Range):
66-
# this range strictly contains the other one
67-
return (other.offset in self) and (other.end in self)
68-
else:
69-
raise TypeError(f"unsupported type: {type(other)}")
70-
7160
def __iter__(self):
61+
"iterate over the values in this range"
7262
yield from range(self.offset, self.end)
7363

7464
def __repr__(self):
@@ -91,10 +81,11 @@ class Slice:
9181

9282
@property
9383
def data(self) -> bytes:
84+
"get the bytes in this slice, copying the data out"
9485
return self.buf[self.range.offset : self.range.end]
9586

9687
def slice(self, offset, size) -> "Slice":
97-
"create a new slice thats a sub-slice of this one"
88+
"create a new slice that's a sub-slice of this one, using relative offsets"
9889
return Slice(self.buf, self.range.slice(offset, size))
9990

10091
@classmethod
@@ -130,6 +121,7 @@ def offset(self) -> int:
130121
return self.string.slice.range.offset
131122

132123

124+
MIN_STR_LEN = 6
133125
ASCII_BYTE = r" !\"#\$%&\'\(\)\*\+,-\./0123456789:;<=>\?@ABCDEFGHIJKLMNOPQRSTUVWXYZ\[\]\^_`abcdefghijklmnopqrstuvwxyz\{\|\}\\\~\t".encode(
134126
"ascii"
135127
)
@@ -138,7 +130,7 @@ def offset(self) -> int:
138130

139131

140132
def extract_ascii_strings(slice: Slice, n: int = MIN_STR_LEN) -> Iterable[ExtractedString]:
141-
"""Extract ASCII strings from the given binary data."""
133+
"enumerate ASCII strings in the given binary data"
142134

143135
if not slice.range.length:
144136
return
@@ -158,7 +150,7 @@ def extract_ascii_strings(slice: Slice, n: int = MIN_STR_LEN) -> Iterable[Extrac
158150

159151

160152
def extract_unicode_strings(slice: Slice, n: int = MIN_STR_LEN) -> Iterable[ExtractedString]:
161-
"""Extract naive UTF-16 strings from the given binary data."""
153+
"enumerate naive UTF-16 strings in the given binary data"
162154

163155
if not slice.range.length:
164156
return
@@ -183,6 +175,7 @@ def extract_unicode_strings(slice: Slice, n: int = MIN_STR_LEN) -> Iterable[Extr
183175

184176

185177
def extract_strings(slice: Slice, n: int = MIN_STR_LEN) -> Iterable[ExtractedString]:
178+
"enumerate ASCII and naive UTF-16 strings in the given binary data"
186179
return list(
187180
sorted(
188181
itertools.chain(extract_ascii_strings(slice, n), extract_unicode_strings(slice, n)),
@@ -464,9 +457,14 @@ def load_databases() -> Sequence[Tagger]:
464457

465458
data_path = pathlib.Path(floss.qs.db.oss.__file__).parent / "data"
466459

460+
# below I use `if True` blocks to delineate the different databases.
461+
# these could be functions, at the expense of more visual noise.
462+
# note that each one creates a closure over the database object.
463+
467464
if True:
468465
winapi_database = floss.qs.db.winapi.WindowsApiStringDatabase.from_dir(data_path / "winapi")
469466

467+
# note closure over winapi_database
470468
def winapi_database_tagger(s: ExtractedString) -> Sequence[Tag]:
471469
return query_winapi_name_database(winapi_database, s.string)
472470

@@ -475,6 +473,7 @@ def winapi_database_tagger(s: ExtractedString) -> Sequence[Tag]:
475473
if True:
476474
capa_expert_database = ExpertStringDatabase.from_file(data_path / "expert" / "capa.jsonl")
477475

476+
# note closure over capa_expert_database
478477
def capa_expert_database_tagger(s: ExtractedString) -> Sequence[Tag]:
479478
return query_expert_string_database(capa_expert_database, s.string)
480479

@@ -487,6 +486,7 @@ def capa_expert_database_tagger(s: ExtractedString) -> Sequence[Tag]:
487486

488487
library_databases.append(OpenSourceStringDatabase.from_file(data_path / "crt" / "msvc_v143.jsonl.gz"))
489488

489+
# note closure over library_databases
490490
def library_databases_tagger(s: ExtractedString) -> Sequence[Tag]:
491491
return query_library_string_databases(library_databases, s.string)
492492

@@ -501,6 +501,7 @@ def library_databases_tagger(s: ExtractedString) -> Sequence[Tag]:
501501
StringGlobalPrevalenceDatabase.from_file(data_path / "gp" / "cwindb-dotnet.jsonl.gz")
502502
)
503503

504+
# note closure over global_prevalence_database
504505
def global_prevalence_database_tagger(s: ExtractedString) -> Sequence[Tag]:
505506
return query_global_prevalence_database(global_prevalence_database, s.string)
506507

@@ -509,6 +510,7 @@ def global_prevalence_database_tagger(s: ExtractedString) -> Sequence[Tag]:
509510
if True:
510511
global_prevalence_hash_database_xaa = StringHashDatabase.from_file(data_path / "gp" / "xaa-hashes.bin")
511512

513+
# note closure over global_prevalence_hash_database_xaa
512514
def global_prevalence_hash_database_xaa_tagger(s: ExtractedString) -> Sequence[Tag]:
513515
return query_global_prevalence_hash_database(global_prevalence_hash_database_xaa, s.string)
514516

@@ -517,6 +519,7 @@ def global_prevalence_hash_database_xaa_tagger(s: ExtractedString) -> Sequence[T
517519
if True:
518520
global_prevalence_hash_database_yaa = StringHashDatabase.from_file(data_path / "gp" / "yaa-hashes.bin")
519521

522+
# note closure over global_prevalence_hash_database_yaa
520523
def global_prevalence_hash_database_yaa_tagger(s: ExtractedString) -> Sequence[Tag]:
521524
return query_global_prevalence_hash_database(global_prevalence_hash_database_yaa, s.string)
522525

@@ -527,6 +530,27 @@ def global_prevalence_hash_database_yaa_tagger(s: ExtractedString) -> Sequence[T
527530

528531
@dataclass
529532
class Layout(abc.ABC):
533+
"""
534+
recursively describe a region of data, as a tree.
535+
the compute_layout routines construct this tree.
536+
537+
each node in the tree (Layout) describes a range of the data.
538+
it may have children, which describe sub-ranges of the data.
539+
children don't overlap nor extend before/beyond the parent range.
540+
children are ordered by their offset in the data.
541+
children don't have to be contiguous - there can be gaps, or none at all.
542+
there are routines for traversing to the prior/next sibling, if any,
543+
and accessor properties for the parent and children.
544+
545+
each node has a nice human readable name.
546+
each node has a list of strings that are contained by the node;
547+
these strings don't overlap with any children strings, they're only found in the gaps.
548+
549+
note that `Layout` is the abstract base class for nodes in the tree.
550+
subclasses are used to represent different types of regions,
551+
such as a PE file, a section, a segment, or a resource.
552+
subclasses can provide more specific behavior when it comes to tagging strings.
553+
"""
530554
slice: Slice
531555

532556
# human readable name
@@ -588,10 +612,12 @@ def add_child(self, child: "Layout"):
588612

589613
@property
590614
def offset(self) -> int:
615+
"convenience"
591616
return self.slice.range.offset
592617

593618
@property
594619
def end(self) -> int:
620+
"convenience"
595621
return self.slice.range.end
596622

597623
def tag_strings(self, taggers: Sequence[Tagger]):
@@ -650,8 +676,7 @@ class SectionLayout(Layout):
650676

651677
@dataclass
652678
class SegmentLayout(Layout):
653-
"""region not covered by any section"""
654-
679+
"""region not covered by any section, such as PE header or overlay"""
655680
pass
656681

657682

@@ -1014,7 +1039,7 @@ def render_strings(
10141039
# rsrc: BINARY/102/0 (pe)
10151040
return render_strings(console, layout.children[0], tag_rules, depth, name_hint=layout.name)
10161041

1017-
BORDER_STYLE = Style(color="grey50")
1042+
BORDER_STYLE = MUTED_STYLE
10181043

10191044
name = layout.name
10201045
if name_hint:

0 commit comments

Comments
 (0)