Skip to content

Commit e2e36ed

Browse files
qs: re-enable structure hints
closes #770
1 parent b3de4cd commit e2e36ed

File tree

2 files changed

+123
-85
lines changed

2 files changed

+123
-85
lines changed

floss/qs/main.py

Lines changed: 123 additions & 84 deletions
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,6 @@
1616
import pefile
1717
import colorama
1818
import lancelot
19-
import intervaltree
2019
import rich.traceback
2120
from rich.text import Text
2221
from rich.style import Style
@@ -196,7 +195,7 @@ def Span(text: str, style: Style = DEFAULT_STYLE) -> Text:
196195

197196
PADDING_WIDTH = 2
198197
OFFSET_WIDTH = 8
199-
STRUCTURE_WIDTH = 16
198+
STRUCTURE_WIDTH = 20
200199

201200

202201
def render_string_padding():
@@ -291,8 +290,9 @@ def render_string_structure(s: TaggedString):
291290
ret = Text()
292291

293292
if s.structure:
294-
structure = Span("/" + s.structure, style=MUTED_STYLE)
295-
structure.align("left", STRUCTURE_WIDTH)
293+
structure = Span(s.structure, style=Style(color="blue"))
294+
structure.align("left", STRUCTURE_WIDTH - 1)
295+
ret.append(Span("/", style=MUTED_STYLE))
296296
ret.append(structure)
297297
else:
298298
ret.append_text(Span(" " * STRUCTURE_WIDTH))
@@ -528,6 +528,74 @@ def global_prevalence_hash_database_yaa_tagger(s: ExtractedString) -> Sequence[T
528528
return ret
529529

530530

531+
@dataclass
class Structure:
    """
    a named region of the file that belongs to a known file format structure,
    such as a section header or an import table entry.
    """

    # the bytes occupied by the structure
    slice: Slice

    # human readable label for the structure, like "section header" or "import table"
    name: str
535+
536+
537+
def collect_pe_structures(slice: Slice, pe: pefile.PE) -> Sequence[Structure]:
    """
    collect the regions of a PE file that belong to recognized structures,
    so that strings found in those regions can be labeled with the structure name.

    recognized structures:
      - "section header": the region pefile reports for each section header.
      - "import table": the name of each imported DLL and of each imported symbol.

    args:
      slice: the slice from which each structure's region is carved,
        via `slice.slice(offset, size)`.
      pe: the parsed PE file.

    returns:
      the section header structures (ordered by PointerToRawData),
      followed by the import table structures.
    """
    structures = []

    for section in sorted(pe.sections, key=lambda s: s.PointerToRawData):
        structures.append(
            Structure(
                slice=slice.slice(section.get_file_offset(), section.sizeof()),
                name="section header",
            )
        )

    # DIRECTORY_ENTRY_IMPORT is only present when the PE has a parsed import directory.
    for dll in getattr(pe, "DIRECTORY_ENTRY_IMPORT", ()):
        try:
            dll_name = dll.dll.decode("ascii")
        except UnicodeDecodeError:
            # a DLL whose name is not ASCII is skipped entirely, including its symbols.
            continue

        try:
            dll_name_offset = pe.get_offset_from_rva(dll.struct.Name)
        except pefile.PEFormatError:
            # a corrupt descriptor may carry a name RVA that does not map into the file.
            # hints are best-effort: drop just this name rather than abort the analysis.
            dll_name_offset = None

        if dll_name_offset is not None:
            structures.append(
                Structure(
                    slice=slice.slice(dll_name_offset, len(dll_name)),
                    name="import table",
                )
            )

        for entry in dll.imports:
            # an entry without a name (or without a known name offset)
            # has no name string in the file to point at.
            if entry.name is None or entry.name_offset is None:
                continue

            try:
                symbol_name = entry.name.decode("ascii")
            except UnicodeDecodeError:
                continue

            structures.append(
                Structure(
                    slice=slice.slice(entry.name_offset, len(symbol_name)),
                    name="import table",
                )
            )

    # TODO: other structures
    # export table
    # certificate data
    # rich header

    return structures
597+
598+
531599
@dataclass
532600
class Layout(abc.ABC):
533601
"""
@@ -551,6 +619,7 @@ class Layout(abc.ABC):
551619
such as a PE file, a section, a segment, or a resource.
552620
subclasses can provide more specific behavior when it comes to tagging strings.
553621
"""
622+
554623
slice: Slice
555624

556625
# human readable name
@@ -645,6 +714,37 @@ def tag_strings(self, taggers: Sequence[Tagger]):
645714
for child in self.children:
646715
child.tag_strings(taggers)
647716

717+
def mark_structures(self, structures: Optional[Tuple[Dict[int, "Structure"], ...]] = (), **kwargs):
    """
    mark the structures that might be associated with each string, recursively.
    this means that the TaggedStrings may now have a non-empty .structure field.

    this can be overridden, if a subclass has a way of parsing structures,
    such as a PE file and all its data.

    args:
      structures: lookup tables mapping a file offset to the Structure found there.
        the tables are consulted in order and the first one with an entry
        at a string's offset wins. None is treated like an empty tuple.
      kwargs: passed through unchanged to each child's mark_structures.
    """
    # normalize None before recursing: overrides may concatenate onto this tuple,
    # which would raise a TypeError if None were passed along as-is.
    structures = structures or ()

    if structures:
        for string in self.strings:
            for structures_by_address in structures:
                # only the offset at which the string starts is looked up.
                structure = structures_by_address.get(string.offset)
                if structure:
                    string.structure = structure.name
                    break

    for child in self.children:
        child.mark_structures(structures=structures, **kwargs)
735+
736+
737+
@dataclass
class SectionLayout(Layout):
    """layout associated with a PE section."""

    # the pefile section structure that this layout corresponds to
    section: pefile.SectionStructure
740+
741+
742+
@dataclass
class SegmentLayout(Layout):
    """
    layout for a region that is not covered by any section,
    such as the PE header or an overlay.
    """
747+
648748

649749
@dataclass
650750
class PELayout(Layout):
@@ -654,6 +754,8 @@ class PELayout(Layout):
654754
# file offsets of bytes that are recognized as code
655755
code_offsets: Set[int]
656756

757+
structures_by_address: Dict[int, Structure]
758+
657759
def tag_strings(self, taggers: Sequence[Tagger]):
658760
def check_is_reloc_tagger(s: ExtractedString) -> Sequence[Tag]:
659761
return check_is_reloc(self.reloc_offsets, s)
@@ -668,88 +770,23 @@ def check_is_code_tagger(s: ExtractedString) -> Sequence[Tag]:
668770

669771
super().tag_strings(taggers)
670772

671-
672-
@dataclass
673-
class SectionLayout(Layout):
674-
section: pefile.SectionStructure
675-
676-
677-
@dataclass
678-
class SegmentLayout(Layout):
679-
"""region not covered by any section, such as PE header or overlay"""
680-
pass
773+
def mark_structures(self, structures=(), **kwargs):
    """
    mark structures on the children of this PE, recursively.

    section and segment children are given this PE's own structure lookup table,
    appended after any tables inherited from the caller.
    any other kind of child only receives the inherited tables.

    args:
      structures: lookup tables inherited from the caller, each mapping
        a file offset to a Structure. None is treated like an empty tuple.
      kwargs: passed through unchanged to each child's mark_structures.

    NOTE(review): strings attached directly to this layout are not marked here,
    only those of its children - confirm that a PE layout never holds strings itself.
    """
    # Layout.mark_structures allows None, and tuple concatenation below requires a tuple.
    inherited = tuple(structures or ())
    with_pe_structures = inherited + (self.structures_by_address,)

    for child in self.children:
        if isinstance(child, (SectionLayout, SegmentLayout)):
            # expected child of a PE
            child.mark_structures(structures=with_pe_structures, **kwargs)
        else:
            # unexpected child of a PE
            # maybe like a resource or overlay, etc.
            # which is fine - but we don't expect it to know about the PE structures.
            child.mark_structures(structures=inherited, **kwargs)
681783

682784

683785
@dataclass
class ResourceLayout(Layout):
    """
    layout for a resource (presumably one embedded in the file - confirm against the caller).
    adds no fields or behavior beyond Layout.
    """
686788

687789

688-
@dataclass
689-
class Structure:
690-
slice: Slice
691-
name: str
692-
693-
694-
def compute_file_structures(slice: Slice, pe: pefile.PE) -> Sequence[Structure]:
695-
structures = []
696-
697-
for section in sorted(pe.sections, key=lambda s: s.PointerToRawData):
698-
offset = section.get_file_offset()
699-
size = section.sizeof()
700-
701-
structures.append(
702-
Structure(
703-
slice=slice.slice(offset, size),
704-
name="section header",
705-
)
706-
)
707-
708-
if hasattr(pe, "DIRECTORY_ENTRY_IMPORT"):
709-
for dll in pe.DIRECTORY_ENTRY_IMPORT:
710-
try:
711-
dll_name = dll.dll.decode("ascii")
712-
except UnicodeDecodeError:
713-
continue
714-
715-
rva = dll.struct.Name
716-
size = len(dll_name)
717-
offset = pe.get_offset_from_rva(rva)
718-
719-
structures.append(
720-
Structure(
721-
slice=slice.slice(offset, size),
722-
name="import table",
723-
)
724-
)
725-
726-
for entry in dll.imports:
727-
if entry.name is None:
728-
continue
729-
730-
if entry.name_offset is None:
731-
continue
732-
733-
try:
734-
symbol_name = entry.name.decode("ascii")
735-
except UnicodeDecodeError:
736-
continue
737-
738-
offset = entry.name_offset
739-
size = len(symbol_name)
740-
741-
structures.append(
742-
Structure(
743-
slice=slice.slice(offset, size),
744-
name="import table",
745-
)
746-
)
747-
748-
# TODO: other structures
749-
750-
return structures
751-
752-
753790
def compute_pe_layout(slice: Slice) -> Layout:
754791
data = slice.data
755792

@@ -758,12 +795,13 @@ def compute_pe_layout(slice: Slice) -> Layout:
758795
except pefile.PEFormatError as e:
759796
raise ValueError("pefile failed to load workspace") from e
760797

761-
structures = compute_file_structures(slice, pe)
798+
structures = collect_pe_structures(slice, pe)
762799
reloc_offsets = get_reloc_offsets(slice, pe)
763800

764-
structures_by_range = intervaltree.IntervalTree()
765-
for interval in structures:
766-
structures_by_range.addi(interval.slice.range.offset, interval.slice.range.end, interval)
801+
structures_by_address = {}
802+
for structure in structures:
803+
for offset in structure.slice.range:
804+
structures_by_address[offset] = structure
767805

768806
# lancelot only accepts bytes, not mmap
769807
with timing("lancelot: load workspace"):
@@ -792,6 +830,7 @@ def compute_pe_layout(slice: Slice) -> Layout:
792830
name="pe",
793831
reloc_offsets=reloc_offsets,
794832
code_offsets=code_offsets,
833+
structures_by_address=structures_by_address,
795834
)
796835

797836
for section in pe.sections:
@@ -1168,7 +1207,7 @@ def main():
11681207
taggers = load_databases()
11691208
layout.tag_strings(taggers)
11701209

1171-
# TODO: figure out how to mark structures
1210+
layout.mark_structures()
11721211

11731212
# remove tags from libraries that have too few matches (five, by default).
11741213
remove_false_positive_lib_strings(layout)

setup.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -58,7 +58,6 @@
5858
extras_require={
5959
"qs": [
6060
"colorama==0.4.6",
61-
"intervaltree==3.1.0",
6261
"python-lancelot==0.8.6",
6362
],
6463
"dev": [

0 commit comments

Comments
 (0)