bug fix to encode_arm64.s: some registers overwritten in memmove call

AWSjswinney · AWSjswinney · commit f81760ec4c92 · 2020-10-02T15:49:34.000Z
In encode_arm64.s, encodeBlock, two of the registers added during the port from
amd64 were not saved or restored for the memmove call. Instead of saving them,
just recalculate their values. Additionally, I made a few small changes to
improve things since I've learned a bit more about ARMv8 assembly.
 - The CMP instruction accepts an immediate as the first argument
 - use LDP/STP instead of SIMD instructions

The change to use the load-pair and store-pair instructions instead of the SIMD
instructions results in some modest performance improvements as meastured on
Neoverse N1 (Graviton 2).

name              old time/op    new time/op    delta
WordsDecode1e1-2    25.9ns ± 1%    26.1ns ± 1%  +0.66%  (p=0.005 n=10+10)
WordsDecode1e2-2     107ns ± 0%     105ns ± 0%  -1.87%  (p=0.000 n=10+10)
WordsDecode1e3-2     953ns ± 0%     901ns ± 0%  -5.50%  (p=0.000 n=10+10)
WordsDecode1e4-2    10.6µs ± 0%     9.9µs ± 2%  -6.60%  (p=0.000 n=7+10)
WordsDecode1e5-2     170µs ± 1%     164µs ± 1%  -3.12%  (p=0.000 n=10+9)
WordsDecode1e6-2    1.71ms ± 0%    1.66ms ± 0%  -2.98%  (p=0.000 n=10+10)
WordsEncode1e1-2    22.0ns ± 1%    21.9ns ± 1%  -0.67%  (p=0.006 n=8+10)
WordsEncode1e2-2     248ns ± 0%     245ns ± 0%  -1.21%  (p=0.002 n=8+10)
WordsEncode1e3-2    2.50µs ± 0%    2.49µs ± 0%    ~     (p=0.103 n=10+9)
WordsEncode1e4-2    27.8µs ± 3%    28.0µs ± 2%    ~     (p=0.075 n=10+10)
WordsEncode1e5-2     339µs ± 0%     343µs ± 0%  +1.18%  (p=0.000 n=9+10)
WordsEncode1e6-2    3.39ms ± 0%    3.42ms ± 0%  +0.94%  (p=0.000 n=10+10)
RandomEncode-2      74.8µs ± 1%    77.1µs ± 1%  +3.16%  (p=0.000 n=10+10)
_UFlat0-2           68.8µs ± 1%    66.4µs ± 2%  -3.54%  (p=0.000 n=10+10)
_UFlat1-2            770µs ± 0%     740µs ± 1%  -3.93%  (p=0.000 n=10+10)
_UFlat2-2           6.57µs ± 0%    6.55µs ± 0%  -0.25%  (p=0.000 n=8+10)
_UFlat3-2            183ns ± 0%     178ns ± 1%  -2.84%  (p=0.000 n=9+10)
_UFlat4-2           9.76µs ± 1%    9.56µs ± 0%  -2.07%  (p=0.000 n=10+9)
_UFlat5-2            301µs ± 0%     293µs ± 0%  -2.67%  (p=0.000 n=9+10)
_UFlat6-2            280µs ± 1%     267µs ± 1%  -4.63%  (p=0.000 n=10+10)
_UFlat7-2            241µs ± 0%     230µs ± 1%  -4.68%  (p=0.000 n=9+10)
_UFlat8-2            745µs ± 0%     715µs ± 1%  -4.11%  (p=0.000 n=10+10)
_UFlat9-2           1.01ms ± 0%    0.96ms ± 0%  -4.60%  (p=0.000 n=10+10)
_UFlat10-2          62.3µs ± 1%    59.3µs ± 1%  -4.72%  (p=0.000 n=10+9)
_UFlat11-2           258µs ± 0%     252µs ± 1%  -2.56%  (p=0.000 n=10+10)
_ZFlat0-2            135µs ± 1%     132µs ± 1%  -1.88%  (p=0.000 n=10+8)
_ZFlat1-2           1.76ms ± 0%    1.74ms ± 0%  -1.00%  (p=0.000 n=9+9)
_ZFlat2-2           9.54µs ± 0%    9.84µs ± 5%  +3.18%  (p=0.000 n=10+10)
_ZFlat3-2            449ns ± 0%     447ns ± 0%  -0.38%  (p=0.000 n=10+9)
_ZFlat4-2           15.6µs ± 0%    16.0µs ± 4%    ~     (p=0.118 n=9+10)
_ZFlat5-2            560µs ± 1%     555µs ± 1%  -0.89%  (p=0.000 n=9+9)
_ZFlat6-2            531µs ± 0%     534µs ± 0%  +0.64%  (p=0.000 n=10+10)
_ZFlat7-2            466µs ± 0%     468µs ± 0%  +0.32%  (p=0.003 n=10+10)
_ZFlat8-2           1.42ms ± 0%    1.42ms ± 0%  +0.43%  (p=0.000 n=10+10)
_ZFlat9-2           1.93ms ± 0%    1.94ms ± 0%  +0.44%  (p=0.000 n=10+10)
_ZFlat10-2           120µs ± 0%     121µs ± 3%    ~     (p=0.436 n=9+9)
_ZFlat11-2           433µs ± 0%     437µs ± 0%  +1.03%  (p=0.000 n=10+10)
ExtendMatch-2       9.77µs ± 0%    9.76µs ± 0%  -0.13%  (p=0.050 n=10+10)

As measured on Cortex-A53 (Raspberry Pi 3)

name              old time/op    new time/op    delta
WordsDecode1e1-4     152ns ± 2%     151ns ± 0%    ~     (p=0.536 n=10+8)
WordsDecode1e2-4     639ns ± 0%     617ns ± 0%  -3.54%  (p=0.000 n=9+8)
WordsDecode1e3-4    6.74µs ± 2%    6.35µs ± 0%  -5.75%  (p=0.000 n=10+9)
WordsDecode1e4-4    66.7µs ± 0%    63.5µs ± 0%  -4.69%  (p=0.000 n=9+9)
WordsDecode1e5-4     715µs ± 0%     684µs ± 0%  -4.38%  (p=0.000 n=8+8)
WordsDecode1e6-4    6.87ms ± 2%    6.53ms ± 1%  -4.99%  (p=0.000 n=10+9)
WordsEncode1e1-4     127ns ± 2%     126ns ± 0%    ~     (p=0.065 n=10+9)
WordsEncode1e2-4    1.58µs ± 0%    1.57µs ± 0%  -0.99%  (p=0.000 n=8+8)
WordsEncode1e3-4    15.1µs ± 0%    14.9µs ± 0%  -1.46%  (p=0.000 n=9+8)
WordsEncode1e4-4     148µs ± 0%     148µs ± 4%    ~     (p=0.497 n=9+10)
WordsEncode1e5-4    1.54ms ± 0%    1.54ms ± 0%  +0.12%  (p=0.012 n=10+8)
WordsEncode1e6-4    14.4ms ± 0%    14.4ms ± 1%  -0.47%  (p=0.015 n=9+8)
RandomEncode-4      1.13ms ± 1%    1.13ms ± 1%    ~     (p=0.529 n=10+10)
_UFlat0-4            294µs ± 0%     288µs ± 1%  -2.08%  (p=0.000 n=9+9)
_UFlat1-4           3.05ms ± 1%    2.98ms ± 1%  -2.22%  (p=0.000 n=9+9)
_UFlat2-4           37.3µs ± 0%    37.4µs ± 1%    ~     (p=0.093 n=8+9)
_UFlat3-4            909ns ± 0%     914ns ± 2%    ~     (p=0.526 n=8+10)
_UFlat4-4           58.7µs ± 0%    58.1µs ± 0%  -1.09%  (p=0.000 n=8+10)
_UFlat5-4           1.22ms ± 0%    1.19ms ± 1%  -2.14%  (p=0.000 n=8+8)
_UFlat6-4           1.03ms ± 0%    0.99ms ± 0%  -3.28%  (p=0.000 n=9+8)
_UFlat7-4            895µs ± 0%     861µs ± 0%  -3.79%  (p=0.000 n=8+8)
_UFlat8-4           2.83ms ± 0%    2.75ms ± 0%  -2.88%  (p=0.000 n=7+8)
_UFlat9-4           3.85ms ± 1%    3.73ms ± 1%  -3.03%  (p=0.000 n=8+9)
_UFlat10-4           286µs ± 0%     282µs ± 0%  -1.59%  (p=0.000 n=9+9)
_UFlat11-4          1.06ms ± 0%    1.02ms ± 0%  -3.58%  (p=0.000 n=8+9)
_ZFlat0-4            620µs ± 0%     620µs ± 1%    ~     (p=0.963 n=9+8)
_ZFlat1-4           9.49ms ± 1%    9.67ms ± 3%  +1.87%  (p=0.000 n=9+10)
_ZFlat2-4           61.8µs ± 0%    62.3µs ± 3%    ~     (p=0.829 n=8+10)
_ZFlat3-4           2.80µs ± 1%    2.79µs ± 0%  -0.55%  (p=0.000 n=8+8)
_ZFlat4-4            108µs ± 0%     109µs ± 0%  +0.55%  (p=0.000 n=10+8)
_ZFlat5-4           2.59ms ± 2%    2.58ms ± 1%    ~     (p=0.274 n=10+8)
_ZFlat6-4           2.39ms ± 3%    2.40ms ± 1%    ~     (p=0.631 n=10+10)
_ZFlat7-4           2.11ms ± 0%    2.08ms ± 1%  -1.23%  (p=0.000 n=10+9)
_ZFlat8-4           6.86ms ± 0%    6.92ms ± 1%  +0.78%  (p=0.000 n=9+8)
_ZFlat9-4           9.42ms ± 0%    9.40ms ± 1%    ~     (p=0.606 n=8+9)
_ZFlat10-4           620µs ± 1%     621µs ± 4%    ~     (p=0.173 n=8+10)
_ZFlat11-4          1.94ms ± 0%    1.93ms ± 0%  -0.52%  (p=0.001 n=9+8)
ExtendMatch-4       69.3µs ± 2%    69.2µs ± 0%    ~     (p=0.515 n=10+8)
diff --git a/decode_arm64.s b/decode_arm64.s
@@ -70,7 +70,7 @@ loop:
 	// x := uint32(src[s] >> 2)
 	// switch
 	MOVW $60, R1
-	ADD  R4>>2, ZR, R4
+	LSRW $2, R4, R4
 	CMPW R4, R1
 	BLS  tagLit60Plus
 
@@ -111,13 +111,12 @@ doLit:
 	// is contiguous in memory and so it needs to leave enough source bytes to
 	// read the next tag without refilling buffers, but Go's Decode assumes
 	// contiguousness (the src argument is a []byte).
-	MOVD $16, R1
-	CMP  R1, R4
-	BGT  callMemmove
-	CMP  R1, R2
-	BLT  callMemmove
-	CMP  R1, R3
-	BLT  callMemmove
+	CMP $16, R4
+	BGT callMemmove
+	CMP $16, R2
+	BLT callMemmove
+	CMP $16, R3
+	BLT callMemmove
 
 	// !!! Implement the copy from src to dst as a 16-byte load and store.
 	// (Decode's documentation says that dst and src must not overlap.)
@@ -130,9 +129,8 @@ doLit:
 	// Note that on arm64, it is legal and cheap to issue unaligned 8-byte or
 	// 16-byte loads and stores. This technique probably wouldn't be as
 	// effective on architectures that are fussier about alignment.
-
-	VLD1 0(R6), [V0.B16]
-	VST1 [V0.B16], 0(R7)
+	LDP 0(R6), (R14, R15)
+	STP (R14, R15), 0(R7)
 
 	// d += length
 	// s += length
@@ -210,8 +208,7 @@ tagLit61:
 	B     doLit
 
 tagLit62Plus:
-	MOVW $62, R1
-	CMPW R1, R4
+	CMPW $62, R4
 	BHI  tagLit63
 
 	// case x == 62:
@@ -273,10 +270,9 @@ tagCopy:
 	// We have a copy tag. We assume that:
 	//	- R3 == src[s] & 0x03
 	//	- R4 == src[s]
-	MOVD $2, R1
-	CMP  R1, R3
-	BEQ  tagCopy2
-	BGT  tagCopy4
+	CMP $2, R3
+	BEQ tagCopy2
+	BGT tagCopy4
 
 	// case tagCopy1:
 	// s += 2
@@ -346,13 +342,11 @@ doCopy:
 	// }
 	// copy 16 bytes
 	// d += length
-	MOVD $16, R1
-	MOVD $8, R0
-	CMP  R1, R4
+	CMP  $16, R4
 	BGT  slowForwardCopy
-	CMP  R0, R5
+	CMP  $8, R5
 	BLT  slowForwardCopy
-	CMP  R1, R14
+	CMP  $16, R14
 	BLT  slowForwardCopy
 	MOVD 0(R15), R2
 	MOVD R2, 0(R7)
@@ -426,8 +420,7 @@ makeOffsetAtLeast8:
 	//   // The two previous lines together means that d-offset, and therefore
 	//   // R15, is unchanged.
 	// }
-	MOVD $8, R1
-	CMP  R1, R5
+	CMP  $8, R5
 	BGE  fixUpSlowForwardCopy
 	MOVD (R15), R3
 	MOVD R3, (R7)
@@ -477,9 +470,7 @@ verySlowForwardCopy:
 	ADD  $1, R15, R15
 	ADD  $1, R7, R7
 	SUB  $1, R4, R4
-	MOVD $0, R1
-	CMP  R1, R4
-	BNE  verySlowForwardCopy
+	CBNZ R4, verySlowForwardCopy
 	B    loop
 
 	// The code above handles copy tags.
diff --git a/encode_arm64.s b/encode_arm64.s
@@ -35,11 +35,9 @@ TEXT ·emitLiteral(SB), NOSPLIT, $32-56
 	MOVW R3, R4
 	SUBW $1, R4, R4
 
-	MOVW $60, R2
-	CMPW R2, R4
+	CMPW $60, R4
 	BLT  oneByte
-	MOVW $256, R2
-	CMPW R2, R4
+	CMPW $256, R4
 	BLT  twoBytes
 
 threeBytes:
@@ -98,8 +96,7 @@ TEXT ·emitCopy(SB), NOSPLIT, $0-48
 
 loop0:
 	// for length >= 68 { etc }
-	MOVW $68, R2
-	CMPW R2, R3
+	CMPW $68, R3
 	BLT  step1
 
 	// Emit a length 64 copy, encoded as 3 bytes.
@@ -112,9 +109,8 @@ loop0:
 
 step1:
 	// if length > 64 { etc }
-	MOVD $64, R2
-	CMP  R2, R3
-	BLE  step2
+	CMP $64, R3
+	BLE step2
 
 	// Emit a length 60 copy, encoded as 3 bytes.
 	MOVD $0xee, R2
@@ -125,11 +121,9 @@ step1:
 
 step2:
 	// if length >= 12 || offset >= 2048 { goto step3 }
-	MOVD $12, R2
-	CMP  R2, R3
+	CMP  $12, R3
 	BGE  step3
-	MOVW $2048, R2
-	CMPW R2, R11
+	CMPW $2048, R11
 	BGE  step3
 
 	// Emit the remaining copy, encoded as 2 bytes.
@@ -295,27 +289,24 @@ varTable:
 	// var table [maxTableSize]uint16
 	//
 	// In the asm code, unlike the Go code, we can zero-initialize only the
-	// first tableSize elements. Each uint16 element is 2 bytes and each VST1
-	// writes 64 bytes, so we can do only tableSize/32 writes instead of the
-	// 2048 writes that would zero-initialize all of table's 32768 bytes.
-	// This clear could overrun the first tableSize elements, but it won't
-	// overrun the allocated stack size.
+	// first tableSize elements. Each uint16 element is 2 bytes and each
+	// iterations writes 64 bytes, so we can do only tableSize/32 writes
+	// instead of the 2048 writes that would zero-initialize all of table's
+	// 32768 bytes. This clear could overrun the first tableSize elements, but
+	// it won't overrun the allocated stack size.
 	ADD  $128, RSP, R17
 	MOVD R17, R4
 
 	// !!! R6 = &src[tableSize]
 	ADD R6<<1, R17, R6
 
-	// zero the SIMD registers
-	VEOR V0.B16, V0.B16, V0.B16
-	VEOR V1.B16, V1.B16, V1.B16
-	VEOR V2.B16, V2.B16, V2.B16
-	VEOR V3.B16, V3.B16, V3.B16
-
 memclr:
-	VST1.P [V0.B16, V1.B16, V2.B16, V3.B16], 64(R4)
-	CMP    R4, R6
-	BHI    memclr
+	STP.P (ZR, ZR), 64(R4)
+	STP   (ZR, ZR), -48(R4)
+	STP   (ZR, ZR), -32(R4)
+	STP   (ZR, ZR), -16(R4)
+	CMP   R4, R6
+	BHI   memclr
 
 	// !!! R6 = &src[0]
 	MOVD R7, R6
@@ -404,8 +395,7 @@ fourByteMatch:
 	// on inputMargin in encode.go.
 	MOVD R7, R3
 	SUB  R10, R3, R3
-	MOVD $16, R2
-	CMP  R2, R3
+	CMP  $16, R3
 	BLE  emitLiteralFastPath
 
 	// ----------------------------------------
@@ -454,18 +444,21 @@ inlineEmitLiteralMemmove:
 	MOVD R3, 24(RSP)
 
 	// Finish the "d +=" part of "d += emitLiteral(etc)".
-	ADD  R3, R8, R8
-	MOVD R7, 80(RSP)
-	MOVD R8, 88(RSP)
-	MOVD R15, 120(RSP)
-	CALL runtime·memmove(SB)
-	MOVD 64(RSP), R5
-	MOVD 72(RSP), R6
-	MOVD 80(RSP), R7
-	MOVD 88(RSP), R8
-	MOVD 96(RSP), R9
-	MOVD 120(RSP), R15
-	B    inner1
+	ADD   R3, R8, R8
+	MOVD  R7, 80(RSP)
+	MOVD  R8, 88(RSP)
+	MOVD  R15, 120(RSP)
+	CALL  runtime·memmove(SB)
+	MOVD  64(RSP), R5
+	MOVD  72(RSP), R6
+	MOVD  80(RSP), R7
+	MOVD  88(RSP), R8
+	MOVD  96(RSP), R9
+	MOVD  120(RSP), R15
+	ADD   $128, RSP, R17
+	MOVW  $0xa7bd, R16
+	MOVKW $(0x1e35<<16), R16
+	B     inner1
 
 inlineEmitLiteralEnd:
 	// End inline of the emitLiteral call.
@@ -489,9 +482,9 @@ emitLiteralFastPath:
 	// Note that on arm64, it is legal and cheap to issue unaligned 8-byte or
 	// 16-byte loads and stores. This technique probably wouldn't be as
 	// effective on architectures that are fussier about alignment.
-	VLD1 0(R10), [V0.B16]
-	VST1 [V0.B16], 0(R8)
-	ADD  R3, R8, R8
+	LDP 0(R10), (R0, R1)
+	STP (R0, R1), 0(R8)
+	ADD R3, R8, R8
 
 inner1:
 	// for { etc }