|
| 1 | +// Copyright 2025 The Go Authors. All rights reserved. |
| 2 | +// Use of this source code is governed by a BSD-style |
| 3 | +// license that can be found in the LICENSE file. |
| 4 | + |
| 5 | +// Package asm provides a simple parser for Go assembly files. |
| 6 | +package asm |
| 7 | + |
| 8 | +import ( |
| 9 | + "bufio" |
| 10 | + "bytes" |
| 11 | + "fmt" |
| 12 | + "strings" |
| 13 | + "unicode" |
| 14 | +) |
| 15 | + |
// Kind classifies the role an identifier plays in an assembly file:
// a definition of some sort, or a reference.
type Kind uint8

// The defined kinds. Their numeric values double as indices into
// kindString, so the two declarations must be kept in step.
const (
	Invalid Kind = iota // reserved zero value; not used by Ident
	Ref                 // arbitrary reference to symbol or control label
	Text                // definition of TEXT (function) symbol
	Global              // definition of GLOBL (var) symbol
	Data                // initialization of GLOBL (var) symbol; effectively a reference
	Label               // definition of control label
)

// kindString holds the lower-case name of each defined Kind,
// indexed by the Kind's numeric value.
var kindString = [...]string{
	Invalid: "invalid",
	Ref:     "ref",
	Text:    "text",
	Global:  "global",
	Data:    "data",
	Label:   "label",
}

// String returns the lower-case name of k. A value outside the
// defined range is rendered numerically, as "Kind(N)".
func (k Kind) String() string {
	if idx := int(k); idx >= len(kindString) {
		return fmt.Sprintf("Kind(%d)", k)
	}
	return kindString[k]
}
| 43 | + |
// A File represents a parsed file of Go assembly language.
type File struct {
	// Idents holds the identifiers that Parse found, in order of
	// appearance in the file.
	Idents []Ident

	// TODO(adonovan): use token.File? This may be important in a
	// future in which analyzers can report diagnostics in .s files.
}
| 51 | + |
| 52 | +// Ident represents an identifier in an assembly file. |
| 53 | +type Ident struct { |
| 54 | + Name string // symbol name (after correcting [·∕]); Name[0]='.' => current package |
| 55 | + Offset int // zero-based byte offset |
| 56 | + Kind Kind |
| 57 | +} |
| 58 | + |
| 59 | +// End returns the identifier's end offset. |
| 60 | +func (id Ident) End() int { return id.Offset + len(id.Name) } |
| 61 | + |
| 62 | +// Parse extracts identifiers from Go assembly files. |
| 63 | +// Since it is a best-effort parser, it never returns an error. |
| 64 | +func Parse(content []byte) *File { |
| 65 | + var idents []Ident |
| 66 | + offset := 0 // byte offset of start of current line |
| 67 | + |
| 68 | + // TODO(adonovan) use a proper tokenizer that respects |
| 69 | + // comments, string literals, line continuations, etc. |
| 70 | + scan := bufio.NewScanner(bytes.NewReader(content)) |
| 71 | + for ; scan.Scan(); offset += len(scan.Bytes()) + len("\n") { |
| 72 | + line := scan.Text() |
| 73 | + |
| 74 | + // Strip comments. |
| 75 | + if idx := strings.Index(line, "//"); idx >= 0 { |
| 76 | + line = line[:idx] |
| 77 | + } |
| 78 | + |
| 79 | + // Skip blank lines. |
| 80 | + if strings.TrimSpace(line) == "" { |
| 81 | + continue |
| 82 | + } |
| 83 | + |
| 84 | + // Check for label definitions (ending with colon). |
| 85 | + if colon := strings.IndexByte(line, ':'); colon > 0 { |
| 86 | + label := strings.TrimSpace(line[:colon]) |
| 87 | + if isIdent(label) { |
| 88 | + idents = append(idents, Ident{ |
| 89 | + Name: label, |
| 90 | + Offset: offset + strings.Index(line, label), |
| 91 | + Kind: Label, |
| 92 | + }) |
| 93 | + continue |
| 94 | + } |
| 95 | + } |
| 96 | + |
| 97 | + // Split line into words. |
| 98 | + words := strings.Fields(line) |
| 99 | + if len(words) == 0 { |
| 100 | + continue |
| 101 | + } |
| 102 | + |
| 103 | + // A line of the form |
| 104 | + // TEXT ·sym<ABIInternal>(SB),NOSPLIT,$12 |
| 105 | + // declares a text symbol "·sym". |
| 106 | + if len(words) > 1 { |
| 107 | + kind := Invalid |
| 108 | + switch words[0] { |
| 109 | + case "TEXT": |
| 110 | + kind = Text |
| 111 | + case "GLOBL": |
| 112 | + kind = Global |
| 113 | + case "DATA": |
| 114 | + kind = Data |
| 115 | + } |
| 116 | + if kind != Invalid { |
| 117 | + sym := words[1] |
| 118 | + sym = cutBefore(sym, ",") // strip ",NOSPLIT,$12" etc |
| 119 | + sym = cutBefore(sym, "(") // "sym(SB)" -> "sym" |
| 120 | + sym = cutBefore(sym, "<") // "sym<ABIInternal>" -> "sym" |
| 121 | + sym = strings.TrimSpace(sym) |
| 122 | + if isIdent(sym) { |
| 123 | + // (The Index call assumes sym is not itself "TEXT" etc.) |
| 124 | + idents = append(idents, Ident{ |
| 125 | + Name: cleanup(sym), |
| 126 | + Kind: kind, |
| 127 | + Offset: offset + strings.Index(line, sym), |
| 128 | + }) |
| 129 | + } |
| 130 | + continue |
| 131 | + } |
| 132 | + } |
| 133 | + |
| 134 | + // Find references in the rest of the line. |
| 135 | + pos := 0 |
| 136 | + for _, word := range words { |
| 137 | + // Find actual position of word within line. |
| 138 | + tokenPos := strings.Index(line[pos:], word) |
| 139 | + if tokenPos < 0 { |
| 140 | + panic(line) |
| 141 | + } |
| 142 | + tokenPos += pos |
| 143 | + pos = tokenPos + len(word) |
| 144 | + |
| 145 | + // Reject probable instruction mnemonics (e.g. MOV). |
| 146 | + if len(word) >= 2 && word[0] != '·' && |
| 147 | + !strings.ContainsFunc(word, unicode.IsLower) { |
| 148 | + continue |
| 149 | + } |
| 150 | + |
| 151 | + if word[0] == '$' { |
| 152 | + word = word[1:] |
| 153 | + tokenPos++ |
| 154 | + |
| 155 | + // Reject probable immediate values (e.g. "$123"). |
| 156 | + if !strings.ContainsFunc(word, isNonDigit) { |
| 157 | + continue |
| 158 | + } |
| 159 | + } |
| 160 | + |
| 161 | + // Reject probably registers (e.g. "PC"). |
| 162 | + if len(word) <= 3 && !strings.ContainsFunc(word, unicode.IsLower) { |
| 163 | + continue |
| 164 | + } |
| 165 | + |
| 166 | + // Probable identifier reference. |
| 167 | + // |
| 168 | + // TODO(adonovan): handle FP symbols correctly; |
| 169 | + // sym+8(FP) is essentially a comment about |
| 170 | + // stack slot 8, not a reference to a symbol |
| 171 | + // with a declaration somewhere; so they form |
| 172 | + // an equivalence class without a canonical |
| 173 | + // declaration. |
| 174 | + // |
| 175 | + // TODO(adonovan): handle pseudoregisters and field |
| 176 | + // references such as: |
| 177 | + // MOVD $runtime·g0(SB), g // pseudoreg |
| 178 | + // MOVD R0, g_stackguard0(g) // field ref |
| 179 | + |
| 180 | + sym := cutBefore(word, "(") // "·sym(SB)" => "sym" |
| 181 | + sym = cutBefore(sym, "+") // "sym+8(FP)" => "sym" |
| 182 | + sym = cutBefore(sym, "<") // "sym<ABIInternal>" =>> "sym" |
| 183 | + if isIdent(sym) { |
| 184 | + idents = append(idents, Ident{ |
| 185 | + Name: cleanup(sym), |
| 186 | + Kind: Ref, |
| 187 | + Offset: offset + tokenPos, |
| 188 | + }) |
| 189 | + } |
| 190 | + } |
| 191 | + } |
| 192 | + |
| 193 | + _ = scan.Err() // ignore scan errors |
| 194 | + |
| 195 | + return &File{Idents: idents} |
| 196 | +} |
| 197 | + |
| 198 | +// isIdent reports whether s is a valid Go assembly identifier. |
| 199 | +func isIdent(s string) bool { |
| 200 | + for i, r := range s { |
| 201 | + if !isIdentRune(r, i) { |
| 202 | + return false |
| 203 | + } |
| 204 | + } |
| 205 | + return len(s) > 0 |
| 206 | +} |
| 207 | + |
// cutBefore returns the prefix of s that precedes the first
// occurrence of sep, or all of s if sep does not occur in it.
func cutBefore(s, sep string) string {
	i := strings.Index(s, sep)
	if i < 0 {
		return s
	}
	return s[:i]
}
| 215 | + |
// repl rewrites the stand-in runes used in assembly source to the
// punctuation they represent in linker symbol names.
var repl = strings.NewReplacer(
	"·", ".", // U+00B7 MIDDLE DOT stands for a period
	"∕", "/", // U+2215 DIVISION SLASH stands for a slash
)

// cleanup converts a symbol name from assembler syntax to linker
// syntax by applying repl to it.
func cleanup(sym string) string {
	linkName := repl.Replace(sym)
	return linkName
}
| 225 | + |
// isNonDigit reports whether r is anything other than a decimal
// digit, as judged by unicode.IsDigit.
func isNonDigit(r rune) bool {
	digit := unicode.IsDigit(r)
	return !digit
}
| 227 | + |
| 228 | +// -- plundered from GOROOT/src/cmd/asm/internal/asm/parse.go -- |
| 229 | + |
// isIdentRune reports whether ch may appear in an identifier at
// index i. Letters, underscore, center dot (·) and division slash (∕)
// are accepted at any index; digits are accepted at any index but 0.
func isIdentRune(ch rune, i int) bool {
	switch {
	case unicode.IsLetter(ch):
		return true
	case ch == '_': // Underscore; traditional.
		return true
	case ch == '\u00B7': // Represents the period in runtime.exit. U+00B7 '·' middle dot
		return true
	case ch == '\u2215': // Represents the slash in runtime/debug.setGCPercent. U+2215 '∕' division slash
		return true
	case unicode.IsDigit(ch):
		// Digits are OK only after the first character.
		return i > 0
	}
	return false
}
0 commit comments