gopls/internal/util/asm: better assembly parsing

adonovan · gopherbot · commit d81d6fcce1a2 · 2025-03-04T11:48:12.000-08:00
This CL adds a rudimentary parser for symbols in Go .s files. It is a placeholder for a more principled implementation, but it is sufficient to make Definition support control labels (also in this CL) and for a cross-references index (future work). + test of Definition on control label + test of asm.Parse Updates golang/go#71754 Change-Id: I2ff19b4ade130c051197d6b097a1a3dbcd95555a Reviewed-on: https://go-review.googlesource.com/c/tools/+/654335 LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com> Reviewed-by: Jonathan Amsterdam <jba@google.com> Auto-Submit: Alan Donovan <adonovan@google.com>
diff --git a/gopls/internal/goasm/definition.go b/gopls/internal/goasm/definition.go
@@ -2,20 +2,20 @@
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 
+// Package goasm provides language-server features for files in Go
+// assembly language (https://go.dev/doc/asm).
 package goasm
 
 import (
-	"bytes"
 	"context"
 	"fmt"
 	"go/token"
-	"strings"
-	"unicode"
 
 	"golang.org/x/tools/gopls/internal/cache"
 	"golang.org/x/tools/gopls/internal/cache/metadata"
 	"golang.org/x/tools/gopls/internal/file"
 	"golang.org/x/tools/gopls/internal/protocol"
+	"golang.org/x/tools/gopls/internal/util/asm"
 	"golang.org/x/tools/gopls/internal/util/morestrings"
 	"golang.org/x/tools/internal/event"
 )
@@ -41,21 +41,27 @@ func Definition(ctx context.Context, snapshot *cache.Snapshot, fh file.Handle, p
 		return nil, err
 	}
 
+	// Parse the assembly.
+	//
+	// TODO(adonovan): make this just another
+	// attribute of the type-checked cache.Package.
+	file := asm.Parse(content)
+
 	// Figure out the selected symbol.
 	// For now, just find the identifier around the cursor.
-	//
-	// TODO(adonovan): use a real asm parser; see cmd/asm/internal/asm/parse.go.
-	// Ideally this would just be just another attribute of the
-	// type-checked cache.Package.
-	nonIdentRune := func(r rune) bool { return !isIdentRune(r) }
-	i := bytes.LastIndexFunc(content[:offset], nonIdentRune)
-	j := bytes.IndexFunc(content[offset:], nonIdentRune)
-	if j < 0 || j == 0 {
-		return nil, nil // identifier runs to EOF, or not an identifier
+	var found *asm.Ident
+	for _, id := range file.Idents {
+		if id.Offset <= offset && offset <= id.End() {
+			found = &id
+			break
+		}
 	}
-	sym := string(content[i+1 : offset+j])
-	sym = strings.ReplaceAll(sym, "·", ".") // (U+00B7 MIDDLE DOT)
-	sym = strings.ReplaceAll(sym, "∕", "/") // (U+2215 DIVISION SLASH)
+	if found == nil {
+		return nil, fmt.Errorf("not an identifier")
+	}
+
+	// Resolve a symbol with a "." prefix to the current package.
+	sym := found.Name
 	if sym != "" && sym[0] == '.' {
 		sym = string(mp.PkgPath) + sym
 	}
@@ -92,18 +98,23 @@ func Definition(ctx context.Context, snapshot *cache.Snapshot, fh file.Handle, p
 		if err == nil {
 			return []protocol.Location{loc}, nil
 		}
-	}
 
-	// TODO(adonovan): support jump to var, block label, and other
-	// TEXT, DATA, and GLOBAL symbols in the same file. Needs asm parser.
+	} else {
+		// local symbols (funcs, vars, labels)
+		for _, id := range file.Idents {
+			if id.Name == found.Name &&
+				(id.Kind == asm.Text || id.Kind == asm.Global || id.Kind == asm.Label) {
 
-	return nil, nil
-}
+				loc, err := mapper.OffsetLocation(id.Offset, id.End())
+				if err != nil {
+					return nil, err
+				}
+				return []protocol.Location{loc}, nil
+			}
+		}
+	}
 
-// The assembler allows center dot (· U+00B7) and
-// division slash (∕ U+2215) to work as identifier characters.
-func isIdentRune(r rune) bool {
-	return unicode.IsLetter(r) || unicode.IsDigit(r) || r == '_' || r == '·' || r == '∕'
+	return nil, nil
 }
 
 // TODO(rfindley): avoid the duplicate column mapping here, by associating a
diff --git a/gopls/internal/golang/assembly.go b/gopls/internal/golang/assembly.go
@@ -10,6 +10,9 @@ package golang
 // - ./codeaction.go - computes the symbol and offers the CodeAction command.
 // - ../server/command.go - handles the command by opening a web page.
 // - ../server/server.go - handles the HTTP request and calls this function.
+//
+// For language-server behavior in Go assembly language files,
+// see [golang.org/x/tools/gopls/internal/goasm].
 
 import (
 	"bytes"
diff --git a/gopls/internal/test/marker/testdata/definition/asm.txt b/gopls/internal/test/marker/testdata/definition/asm.txt
@@ -26,6 +26,9 @@ var _ = ff // pacify unusedfunc analyzer
 TEXT ·ff(SB), $16                       //@ loc(ffasm, "ff"), def("ff", ffgo)
         CALL    example·com∕b·B         //@ def("com", bB)
         JMP     ·ff                     //@ def("ff", ffgo)
+	JMP     label			//@ def("label", label)
+label:					//@ loc(label,"label")
+        RET
 
 -- b/b.go --
 package b
diff --git a/gopls/internal/util/asm/parse.go b/gopls/internal/util/asm/parse.go
@@ -0,0 +1,245 @@
+// Copyright 2025 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// Package asm provides a simple parser for Go assembly files.
+package asm
+
+import (
+	"bufio"
+	"bytes"
+	"fmt"
+	"strings"
+	"unicode"
+)
+
+// Kind classifies the role that an identifier plays in an assembly file.
+type Kind uint8
+
+// The kinds of identifier. Text, Global, and Label are definitions;
+// Ref and Data refer to a symbol defined elsewhere.
+const (
+	Invalid Kind = iota // reserved zero value; not used by Ident
+	Ref                 // arbitrary reference to symbol or control label
+	Text                // definition of TEXT (function) symbol
+	Global              // definition of GLOBL (var) symbol
+	Data                // initialization of GLOBL (var) symbol; effectively a reference
+	Label               // definition of control label
+)
+
+// kindString holds the name of each declared Kind, indexed by its value.
+var kindString = [...]string{
+	Invalid: "invalid",
+	Ref:     "ref",
+	Text:    "text",
+	Global:  "global",
+	Data:    "data",
+	Label:   "label",
+}
+
+// String returns the lower-case name of a declared kind,
+// or "Kind(N)" for a value outside the declared range.
+func (k Kind) String() string {
+	if i := int(k); i >= len(kindString) {
+		return fmt.Sprintf("Kind(%d)", k)
+	}
+	return kindString[k]
+}
+
+// A File represents a parsed file of Go assembly language.
+type File struct {
+	Idents []Ident // identifiers found by Parse
+
+	// TODO(adonovan): use token.File? This may be important in a
+	// future in which analyzers can report diagnostics in .s files.
+}
+
+// Ident represents an identifier in an assembly file.
+type Ident struct {
+	Name   string // symbol name (after correcting [·∕]); Name[0]='.' => current package
+	Offset int    // zero-based byte offset
+	Kind   Kind
+}
+
+// End returns the byte offset just past the identifier in the source.
+//
+// Name is in linker syntax, but offsets refer to the source text, in
+// which each "." was written as "·" (2 bytes in UTF-8) and each "/" as
+// "∕" (3 bytes). A valid assembly identifier cannot contain a literal
+// '.' or '/', so every such byte in Name stands for one of the wider
+// source characters, and its extra width is added back here.
+func (id Ident) End() int {
+	n := len(id.Name)
+	n += strings.Count(id.Name, ".") * (len("·") - len("."))
+	n += strings.Count(id.Name, "/") * (len("∕") - len("/"))
+	return id.Offset + n
+}
+
+// Parse extracts identifiers from the content of a Go assembly file.
+// Since it is a best-effort parser, it never returns an error.
+//
+// The input is processed one line at a time. After any "//" comment
+// is stripped, a line yields one of:
+//   - a Label, if the text before its first ':' is a valid identifier;
+//   - a Text, Global, or Data symbol, if its first word is
+//     TEXT, GLOBL, or DATA respectively;
+//   - otherwise, a Ref for each word that looks like a symbol reference.
+//
+// Each Ident's Offset is a byte offset into content. Names other than
+// labels are converted to linker syntax by cleanup.
+func Parse(content []byte) *File {
+	var idents []Ident
+	offset := 0 // byte offset of start of current line
+
+	// TODO(adonovan) use a proper tokenizer that respects
+	// comments, string literals, line continuations, etc.
+	scan := bufio.NewScanner(bytes.NewReader(content))
+
+	// No line is longer than the input, so a line that exceeds
+	// bufio.MaxScanTokenSize does not end the scan prematurely.
+	scan.Buffer(nil, len(content)+1)
+
+	// Keep the line terminator in each token so that offset advances
+	// by exactly the number of bytes consumed. (bufio.ScanLines drops
+	// both "\n" and "\r\n", so assuming a one-byte terminator would
+	// skew every offset that follows a CRLF line.) The terminator is
+	// white space and is ignored by the logic below.
+	scan.Split(func(data []byte, atEOF bool) (int, []byte, error) {
+		if i := bytes.IndexByte(data, '\n'); i >= 0 {
+			return i + 1, data[:i+1], nil
+		}
+		if atEOF && len(data) > 0 {
+			return len(data), data, nil // final line, no terminator
+		}
+		return 0, nil, nil // request more data
+	})
+
+	for ; scan.Scan(); offset += len(scan.Bytes()) {
+		line := scan.Text()
+
+		// Strip comments.
+		if idx := strings.Index(line, "//"); idx >= 0 {
+			line = line[:idx]
+		}
+
+		// Skip blank lines.
+		if strings.TrimSpace(line) == "" {
+			continue
+		}
+
+		// Check for label definitions (ending with colon).
+		if colon := strings.IndexByte(line, ':'); colon > 0 {
+			label := strings.TrimSpace(line[:colon])
+			if isIdent(label) {
+				idents = append(idents, Ident{
+					Name:   label,
+					Offset: offset + strings.Index(line, label),
+					Kind:   Label,
+				})
+				continue
+			}
+		}
+
+		// Split line into words.
+		words := strings.Fields(line)
+		if len(words) == 0 {
+			continue
+		}
+
+		// A line of the form
+		//    TEXT ·sym<ABIInternal>(SB),NOSPLIT,$12
+		// declares a text symbol "·sym".
+		if len(words) > 1 {
+			kind := Invalid
+			switch words[0] {
+			case "TEXT":
+				kind = Text
+			case "GLOBL":
+				kind = Global
+			case "DATA":
+				kind = Data
+			}
+			if kind != Invalid {
+				sym := words[1]
+				sym = cutBefore(sym, ",") // strip ",NOSPLIT,$12" etc
+				sym = cutBefore(sym, "(") // "sym(SB)" -> "sym"
+				sym = cutBefore(sym, "<") // "sym<ABIInternal>" -> "sym"
+				sym = strings.TrimSpace(sym)
+				if isIdent(sym) {
+					// Search for sym only after the directive, so
+					// that a symbol such as "T" or "DATA" is not
+					// mistakenly found within the directive itself.
+					start := strings.Index(line, words[0]) + len(words[0])
+					idents = append(idents, Ident{
+						Name:   cleanup(sym),
+						Kind:   kind,
+						Offset: offset + start + strings.Index(line[start:], sym),
+					})
+				}
+				continue
+			}
+		}
+
+		// Find references in the rest of the line.
+		pos := 0
+		for _, word := range words {
+			// Find actual position of word within line.
+			tokenPos := strings.Index(line[pos:], word)
+			if tokenPos < 0 {
+				panic(line) // unreachable: words are fields of line, in order
+			}
+			tokenPos += pos
+			pos = tokenPos + len(word)
+
+			// Reject probable instruction mnemonics (e.g. MOV).
+			// A leading middle dot marks a symbol in the current
+			// package, never a mnemonic. (The dot is two bytes in
+			// UTF-8, so it must be tested using HasPrefix; comparing
+			// word[0] with '·' never matches.)
+			if len(word) >= 2 && !strings.HasPrefix(word, "·") &&
+				!strings.ContainsFunc(word, unicode.IsLower) {
+				continue
+			}
+
+			if word[0] == '$' {
+				word = word[1:]
+				tokenPos++
+
+				// Reject probable immediate values (e.g. "$123").
+				if !strings.ContainsFunc(word, isNonDigit) {
+					continue
+				}
+			}
+
+			// Reject probable registers (e.g. "PC"),
+			// again excepting symbols in the current package.
+			if len(word) <= 3 && !strings.HasPrefix(word, "·") &&
+				!strings.ContainsFunc(word, unicode.IsLower) {
+				continue
+			}
+
+			// Probable identifier reference.
+			//
+			// TODO(adonovan): handle FP symbols correctly;
+			// sym+8(FP) is essentially a comment about
+			// stack slot 8, not a reference to a symbol
+			// with a declaration somewhere; so they form
+			// an equivalence class without a canonical
+			// declaration.
+			//
+			// TODO(adonovan): handle pseudoregisters and field
+			// references such as:
+			//    MOVD	$runtime·g0(SB), g      // pseudoreg
+			//    MOVD	R0, g_stackguard0(g)    // field ref
+
+			sym := cutBefore(word, "(") // "·sym(SB)" => "·sym"
+			sym = cutBefore(sym, "+")   // "sym+8(FP)" => "sym"
+			sym = cutBefore(sym, "<")   // "sym<ABIInternal>" => "sym"
+			if isIdent(sym) {
+				idents = append(idents, Ident{
+					Name:   cleanup(sym),
+					Kind:   Ref,
+					Offset: offset + tokenPos,
+				})
+			}
+		}
+	}
+
+	_ = scan.Err() // ignore scan errors: parsing is best-effort
+
+	return &File{Idents: idents}
+}
+
+// isIdent reports whether s is a non-empty string in which
+// every rune is permitted at its position by isIdentRune.
+func isIdent(s string) bool {
+	if s == "" {
+		return false
+	}
+	for pos, ch := range s {
+		if ok := isIdentRune(ch, pos); !ok {
+			return false
+		}
+	}
+	return true
+}
+
+// cutBefore truncates s at the first occurrence of sep.
+// If sep does not occur in s, it returns s unchanged.
+func cutBefore(s, sep string) string {
+	// When sep is absent, Cut reports all of s as the "before" part.
+	head, _, _ := strings.Cut(s, sep)
+	return head
+}
+
+// repl maps the assembler's stand-in characters to the
+// punctuation that they represent in linker symbol names.
+var repl = strings.NewReplacer(
+	"·", ".", // (U+00B7 MIDDLE DOT)
+	"∕", "/", // (U+2215 DIVISION SLASH)
+)
+
+// cleanup returns sym with each middle dot replaced by a period and
+// each division slash replaced by a slash, turning a symbol name in
+// assembler syntax into linker syntax.
+func cleanup(sym string) string {
+	linkName := repl.Replace(sym)
+	return linkName
+}
+
+// isNonDigit reports whether r is anything other than a Unicode decimal digit.
+func isNonDigit(r rune) bool {
+	if unicode.IsDigit(r) {
+		return false
+	}
+	return true
+}
+
+// -- plundered from GOROOT/src/cmd/asm/internal/asm/parse.go --
+
+// isIdentRune reports whether ch may appear at byte index i of an
+// assembly identifier. Letters and underscore are always accepted, as
+// are center dot (· U+00B7, which represents the period in
+// runtime.exit) and division slash (∕ U+2215, which represents the
+// slash in runtime/debug.setGCPercent). Digits are accepted only
+// after the first character.
+func isIdentRune(ch rune, i int) bool {
+	switch {
+	case unicode.IsLetter(ch), ch == '_', ch == '\u00B7', ch == '\u2215':
+		return true
+	case unicode.IsDigit(ch):
+		return i > 0
+	}
+	return false
+}
diff --git a/gopls/internal/util/asm/parse_test.go b/gopls/internal/util/asm/parse_test.go