16
16
import pefile
17
17
import colorama
18
18
import lancelot
19
- import intervaltree
20
19
import rich .traceback
21
20
from rich .text import Text
22
21
from rich .style import Style
@@ -196,7 +195,7 @@ def Span(text: str, style: Style = DEFAULT_STYLE) -> Text:
196
195
197
196
# widths, in characters, of columns in the rendered output.
# NOTE(review): presumably the padding and offset columns - confirm against the render functions.
PADDING_WIDTH = 2
OFFSET_WIDTH = 8
# total width of the structure column, including its leading "/" marker.
STRUCTURE_WIDTH = 20
200
199
201
200
202
201
def render_string_padding ():
@@ -291,8 +290,9 @@ def render_string_structure(s: TaggedString):
291
290
ret = Text ()
292
291
293
292
if s .structure :
294
- structure = Span ("/" + s .structure , style = MUTED_STYLE )
295
- structure .align ("left" , STRUCTURE_WIDTH )
293
+ structure = Span (s .structure , style = Style (color = "blue" ))
294
+ structure .align ("left" , STRUCTURE_WIDTH - 1 )
295
+ ret .append (Span ("/" , style = MUTED_STYLE ))
296
296
ret .append (structure )
297
297
else :
298
298
ret .append_text (Span (" " * STRUCTURE_WIDTH ))
@@ -528,6 +528,74 @@ def global_prevalence_hash_database_yaa_tagger(s: ExtractedString) -> Sequence[T
528
528
return ret
529
529
530
530
531
@dataclass
class Structure:
    """
    a named file structure together with the region that it occupies.
    """

    # region covered by the structure
    slice: Slice
    # human readable name of the structure, such as "import table"
    name: str
535
+
536
+
537
def collect_pe_structures(slice: Slice, pe: pefile.PE) -> Sequence[Structure]:
    """
    collect the regions of a PE file that belong to recognized structures.

    currently this covers the section headers and the names referenced by
    the import table (both the DLL names and the imported symbol names).

    args:
      slice: the slice holding the PE file; each structure is a sub-slice of it.
      pe: the parsed PE file.

    returns: the structures found, section headers first, then import names.
    """
    found = []

    def record(name: str, offset: int, size: int):
        # every structure is described by a sub-slice of the input slice.
        found.append(Structure(slice=slice.slice(offset, size), name=name))

    # visit sections in order of their PointerToRawData.
    for header in sorted(pe.sections, key=lambda s: s.PointerToRawData):
        record("section header", header.get_file_offset(), header.sizeof())

    # the attribute may be missing entirely, in which case there is nothing to walk.
    for library in getattr(pe, "DIRECTORY_ENTRY_IMPORT", ()):
        try:
            library_name = library.dll.decode("ascii")
        except UnicodeDecodeError:
            # skips the whole library, including all of its imported symbols.
            continue

        # the DLL name is referenced by RVA, so translate it to a file offset.
        record("import table", pe.get_offset_from_rva(library.struct.Name), len(library_name))

        for symbol in library.imports:
            # skip entries that have no name or no known name offset
            if symbol.name is None or symbol.name_offset is None:
                continue

            try:
                symbol_name = symbol.name.decode("ascii")
            except UnicodeDecodeError:
                continue

            record("import table", symbol.name_offset, len(symbol_name))

    # TODO: other structures
    # export table
    # certificate data
    # rich header

    return found
597
+
598
+
531
599
@dataclass
532
600
class Layout (abc .ABC ):
533
601
"""
@@ -551,6 +619,7 @@ class Layout(abc.ABC):
551
619
such as a PE file, a section, a segment, or a resource.
552
620
subclasses can provide more specific behavior when it comes to tagging strings.
553
621
"""
622
+
554
623
slice : Slice
555
624
556
625
# human readable name
@@ -645,6 +714,37 @@ def tag_strings(self, taggers: Sequence[Tagger]):
645
714
for child in self .children :
646
715
child .tag_strings (taggers )
647
716
717
def mark_structures(self, structures: Optional[Tuple[Dict[int, "Structure"], ...]] = (), **kwargs):
    """
    mark the structures that might be associated with each string, recursively.
    this means that the TaggedStrings may now have a non-empty .structure field.

    this can be overridden, if a subclass has a way of parsing structures,
    such as a PE file and all its data.

    args:
      structures: lookup tables that map an offset to the structure covering it.
        only each string's .offset is looked up; tables are consulted in order
        and the first hit wins. None is treated the same as an empty tuple.
      kwargs: passed through, untouched, to each child.
    """
    # normalize None (which the signature allows) into a tuple,
    # so that children, and any overrides that extend the tuple, never see None.
    structures = tuple(structures) if structures else ()

    if structures:
        for string in self.strings:
            for structures_by_address in structures:
                structure = structures_by_address.get(string.offset)
                if structure is not None:
                    string.structure = structure.name
                    # first matching table wins
                    break

    for child in self.children:
        child.mark_structures(structures=structures, **kwargs)
735
+
736
+
737
@dataclass
class SectionLayout(Layout):
    """region that corresponds to a section of a PE file."""

    # the pefile section structure that this layout describes
    section: pefile.SectionStructure
740
+
741
+
742
@dataclass
class SegmentLayout(Layout):
    """
    region not covered by any section, such as PE header or overlay.

    declares no fields or methods of its own; only the type distinguishes it.
    """
747
+
648
748
649
749
@dataclass
650
750
class PELayout (Layout ):
@@ -654,6 +754,8 @@ class PELayout(Layout):
654
754
# file offsets of bytes that are recognized as code
655
755
code_offsets : Set [int ]
656
756
757
+ structures_by_address : Dict [int , Structure ]
758
+
657
759
def tag_strings (self , taggers : Sequence [Tagger ]):
658
760
def check_is_reloc_tagger (s : ExtractedString ) -> Sequence [Tag ]:
659
761
return check_is_reloc (self .reloc_offsets , s )
@@ -668,88 +770,23 @@ def check_is_code_tagger(s: ExtractedString) -> Sequence[Tag]:
668
770
669
771
super ().tag_strings (taggers )
670
772
671
-
672
- @ dataclass
673
- class SectionLayout ( Layout ):
674
- section : pefile . SectionStructure
675
-
676
-
677
- @ dataclass
678
- class SegmentLayout ( Layout ):
679
- """region not covered by any section, such as PE header or overlay"""
680
- pass
773
def mark_structures(self, structures=(), **kwargs):
    """
    mark structures on the strings of each child, recursively,
    offering the structures of this PE to the children that are sections or segments.

    note that strings held directly by this layout (self.strings) are not marked here.

    args:
      structures: lookup tables inherited from the caller.
        None, or any other iterable of tables, is accepted and normalized to a tuple.
      kwargs: passed through, untouched, to each child.
    """
    # normalize first, so that the concatenation below cannot fail on None or a list.
    inherited = tuple(structures) if structures else ()
    with_pe_structures = inherited + (self.structures_by_address,)

    for child in self.children:
        if isinstance(child, (SectionLayout, SegmentLayout)):
            # expected child of a PE
            child.mark_structures(structures=with_pe_structures, **kwargs)
        else:
            # unexpected child of a PE
            # maybe like a resource or overlay, etc.
            # which is fine - but we don't expect it to know about the PE structures.
            child.mark_structures(structures=inherited, **kwargs)
681
783
682
784
683
785
@dataclass
class ResourceLayout(Layout):
    """layout for a resource, going by the name; declares nothing beyond Layout."""
686
788
687
789
688
- @dataclass
689
- class Structure :
690
- slice : Slice
691
- name : str
692
-
693
-
694
- def compute_file_structures (slice : Slice , pe : pefile .PE ) -> Sequence [Structure ]:
695
- structures = []
696
-
697
- for section in sorted (pe .sections , key = lambda s : s .PointerToRawData ):
698
- offset = section .get_file_offset ()
699
- size = section .sizeof ()
700
-
701
- structures .append (
702
- Structure (
703
- slice = slice .slice (offset , size ),
704
- name = "section header" ,
705
- )
706
- )
707
-
708
- if hasattr (pe , "DIRECTORY_ENTRY_IMPORT" ):
709
- for dll in pe .DIRECTORY_ENTRY_IMPORT :
710
- try :
711
- dll_name = dll .dll .decode ("ascii" )
712
- except UnicodeDecodeError :
713
- continue
714
-
715
- rva = dll .struct .Name
716
- size = len (dll_name )
717
- offset = pe .get_offset_from_rva (rva )
718
-
719
- structures .append (
720
- Structure (
721
- slice = slice .slice (offset , size ),
722
- name = "import table" ,
723
- )
724
- )
725
-
726
- for entry in dll .imports :
727
- if entry .name is None :
728
- continue
729
-
730
- if entry .name_offset is None :
731
- continue
732
-
733
- try :
734
- symbol_name = entry .name .decode ("ascii" )
735
- except UnicodeDecodeError :
736
- continue
737
-
738
- offset = entry .name_offset
739
- size = len (symbol_name )
740
-
741
- structures .append (
742
- Structure (
743
- slice = slice .slice (offset , size ),
744
- name = "import table" ,
745
- )
746
- )
747
-
748
- # TODO: other structures
749
-
750
- return structures
751
-
752
-
753
790
def compute_pe_layout (slice : Slice ) -> Layout :
754
791
data = slice .data
755
792
@@ -758,12 +795,13 @@ def compute_pe_layout(slice: Slice) -> Layout:
758
795
except pefile .PEFormatError as e :
759
796
raise ValueError ("pefile failed to load workspace" ) from e
760
797
761
- structures = compute_file_structures (slice , pe )
798
+ structures = collect_pe_structures (slice , pe )
762
799
reloc_offsets = get_reloc_offsets (slice , pe )
763
800
764
- structures_by_range = intervaltree .IntervalTree ()
765
- for interval in structures :
766
- structures_by_range .addi (interval .slice .range .offset , interval .slice .range .end , interval )
801
+ structures_by_address = {}
802
+ for structure in structures :
803
+ for offset in structure .slice .range :
804
+ structures_by_address [offset ] = structure
767
805
768
806
# lancelot only accepts bytes, not mmap
769
807
with timing ("lancelot: load workspace" ):
@@ -792,6 +830,7 @@ def compute_pe_layout(slice: Slice) -> Layout:
792
830
name = "pe" ,
793
831
reloc_offsets = reloc_offsets ,
794
832
code_offsets = code_offsets ,
833
+ structures_by_address = structures_by_address ,
795
834
)
796
835
797
836
for section in pe .sections :
@@ -1168,7 +1207,7 @@ def main():
1168
1207
taggers = load_databases ()
1169
1208
layout .tag_strings (taggers )
1170
1209
1171
- # TODO: figure out how to mark structures
1210
+ layout . mark_structures ()
1172
1211
1173
1212
# remove tags from libraries that have too few matches (five, by default).
1174
1213
remove_false_positive_lib_strings (layout )
0 commit comments