@@ -35,11 +35,9 @@ TEXT ·emitLiteral(SB), NOSPLIT, $32-56
35
35
MOVW R3, R4
36
36
SUBW $1 , R4, R4
37
37
38
- MOVW $60 , R2
39
- CMPW R2, R4
38
+ CMPW $60 , R4
40
39
BLT oneByte
41
- MOVW $256 , R2
42
- CMPW R2, R4
40
+ CMPW $256 , R4
43
41
BLT twoBytes
44
42
45
43
threeBytes:
@@ -98,8 +96,7 @@ TEXT ·emitCopy(SB), NOSPLIT, $0-48
98
96
99
97
loop0:
100
98
// for length >= 68 { etc }
101
- MOVW $68 , R2
102
- CMPW R2, R3
99
+ CMPW $68 , R3
103
100
BLT step1
104
101
105
102
// Emit a length 64 copy, encoded as 3 bytes.
@@ -112,9 +109,8 @@ loop0:
112
109
113
110
step1:
114
111
// if length > 64 { etc }
115
- MOVD $64 , R2
116
- CMP R2, R3
117
- BLE step2
112
+ CMP $64 , R3
113
+ BLE step2
118
114
119
115
// Emit a length 60 copy, encoded as 3 bytes.
120
116
MOVD $0xee , R2
@@ -125,11 +121,9 @@ step1:
125
121
126
122
step2:
127
123
// if length >= 12 || offset >= 2048 { goto step3 }
128
- MOVD $12 , R2
129
- CMP R2, R3
124
+ CMP $12 , R3
130
125
BGE step3
131
- MOVW $2048 , R2
132
- CMPW R2, R11
126
+ CMPW $2048 , R11
133
127
BGE step3
134
128
135
129
// Emit the remaining copy, encoded as 2 bytes.
@@ -295,27 +289,24 @@ varTable:
295
289
// var table [maxTableSize]uint16
296
290
//
297
291
// In the asm code, unlike the Go code, we can zero-initialize only the
298
- // first tableSize elements. Each uint16 element is 2 bytes and each VST1
299
- // writes 64 bytes, so we can do only tableSize/32 writes instead of the
300
- // 2048 writes that would zero-initialize all of table's 32768 bytes.
301
- // This clear could overrun the first tableSize elements, but it won't
302
- // overrun the allocated stack size.
292
+ // first tableSize elements. Each uint16 element is 2 bytes and each
293
+ // iteration writes 64 bytes, so we can do only tableSize/32 writes
294
+ // instead of the 2048 writes that would zero-initialize all of table's
295
+ // 32768 bytes. This clear could overrun the first tableSize elements, but
296
+ // it won't overrun the allocated stack size.
303
297
ADD $128 , RSP, R17
304
298
MOVD R17, R4
305
299
306
300
// !!! R6 = &table[tableSize]
307
301
ADD R6<<1 , R17, R6
308
302
309
- // zero the SIMD registers
310
- VEOR V0.B16, V0.B16, V0.B16
311
- VEOR V1.B16, V1.B16, V1.B16
312
- VEOR V2.B16, V2.B16, V2.B16
313
- VEOR V3.B16, V3.B16, V3.B16
314
-
315
303
memclr:
316
- VST1.P [V0.B16, V1.B16, V2.B16, V3.B16], 64 (R4)
317
- CMP R4, R6
318
- BHI memclr
304
+ STP.P (ZR, ZR), 64 (R4)
305
+ STP (ZR, ZR), -48 (R4)
306
+ STP (ZR, ZR), -32 (R4)
307
+ STP (ZR, ZR), -16 (R4)
308
+ CMP R4, R6
309
+ BHI memclr
319
310
320
311
// !!! R6 = &src[0]
321
312
MOVD R7, R6
@@ -404,8 +395,7 @@ fourByteMatch:
404
395
// on inputMargin in encode.go.
405
396
MOVD R7, R3
406
397
SUB R10, R3, R3
407
- MOVD $16 , R2
408
- CMP R2, R3
398
+ CMP $16 , R3
409
399
BLE emitLiteralFastPath
410
400
411
401
// ----------------------------------------
@@ -454,18 +444,21 @@ inlineEmitLiteralMemmove:
454
444
MOVD R3, 24 (RSP)
455
445
456
446
// Finish the "d +=" part of "d += emitLiteral(etc)".
457
- ADD R3, R8, R8
458
- MOVD R7, 80 (RSP)
459
- MOVD R8, 88 (RSP)
460
- MOVD R15, 120 (RSP)
461
- CALL runtime·memmove(SB)
462
- MOVD 64 (RSP), R5
463
- MOVD 72 (RSP), R6
464
- MOVD 80 (RSP), R7
465
- MOVD 88 (RSP), R8
466
- MOVD 96 (RSP), R9
467
- MOVD 120 (RSP), R15
468
- B inner1
447
+ ADD R3, R8, R8
448
+ MOVD R7, 80 (RSP)
449
+ MOVD R8, 88 (RSP)
450
+ MOVD R15, 120 (RSP)
451
+ CALL runtime·memmove(SB)
452
+ MOVD 64 (RSP), R5
453
+ MOVD 72 (RSP), R6
454
+ MOVD 80 (RSP), R7
455
+ MOVD 88 (RSP), R8
456
+ MOVD 96 (RSP), R9
457
+ MOVD 120 (RSP), R15
458
+ ADD $128 , RSP, R17
459
+ MOVW $0xa7bd , R16
460
+ MOVKW $(0x1e35 <<16 ), R16
461
+ B inner1
469
462
470
463
inlineEmitLiteralEnd:
471
464
// End inline of the emitLiteral call.
@@ -489,9 +482,9 @@ emitLiteralFastPath:
489
482
// Note that on arm64, it is legal and cheap to issue unaligned 8-byte or
490
483
// 16-byte loads and stores. This technique probably wouldn't be as
491
484
// effective on architectures that are fussier about alignment.
492
- VLD1 0 (R10), [V0.B16]
493
- VST1 [V0.B16] , 0 (R8)
494
- ADD R3, R8, R8
485
+ LDP 0 (R10), (R0, R1)
486
+ STP (R0, R1) , 0 (R8)
487
+ ADD R3, R8, R8
495
488
496
489
inner1:
497
490
// for { etc }
0 commit comments