Skip to content

Commit 4883749

Browse files
CL/aarch64: implement the wasm SIMD i32x4.dot_i16x8_s instruction
This patch implements, for aarch64, the following wasm SIMD extension: the i32x4.dot_i16x8_s instruction (WebAssembly/simd#127).
It also updates dependencies as follows, so that the new instruction can be parsed, decoded, etc.:
* wat to 1.0.27
* wast to 26.0.1
* wasmparser to 0.65.0
* wasmprinter to 0.2.12
The changes are straightforward:
* new CLIF instruction `widening_pairwise_dot_product_s`
* translation from wasm into `widening_pairwise_dot_product_s`
* new AArch64 instructions `smull`, `smull2` (part of the `VecRRR` group)
* translation from `widening_pairwise_dot_product_s` to `smull ; smull2 ; addv`
There is no testcase in this commit, because the testcases live in a separate repo. The implementation has nevertheless been tested.
1 parent 54a97f7 commit 4883749

File tree

25 files changed

+228
-54
lines changed

25 files changed

+228
-54
lines changed

Cargo.lock

Lines changed: 26 additions & 26 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

Cargo.toml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -37,12 +37,12 @@ anyhow = "1.0.19"
3737
target-lexicon = { version = "0.11.0", default-features = false }
3838
pretty_env_logger = "0.4.0"
3939
file-per-thread-logger = "0.1.1"
40-
wat = "1.0.26"
40+
wat = "1.0.27"
4141
libc = "0.2.60"
4242
log = "0.4.8"
4343
rayon = "1.2.1"
4444
humantime = "1.3.0"
45-
wasmparser = "0.63"
45+
wasmparser = "0.65"
4646

4747
[dev-dependencies]
4848
env_logger = "0.7.1"

cranelift/codegen/Cargo.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -30,7 +30,7 @@ peepmatic-traits = { path = "../peepmatic/crates/traits", optional = true, versi
3030
peepmatic-runtime = { path = "../peepmatic/crates/runtime", optional = true, version = "0.67.0" }
3131
regalloc = { version = "0.0.31" }
3232
souper-ir = { version = "1", optional = true }
33-
wast = { version = "25.0.0", optional = true }
33+
wast = { version = "26.0.1", optional = true }
3434
# It is a goal of the cranelift-codegen crate to have minimal external dependencies.
3535
# Please don't add any unless they are essential to the task of creating binary
3636
# machine code. Integration tests that need external dependencies can be

cranelift/codegen/meta/src/shared/instructions.rs

Lines changed: 35 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4078,6 +4078,41 @@ pub(crate) fn define(
40784078
.operands_out(vec![a]),
40794079
);
40804080

4081+
let I16x8 = &TypeVar::new(
4082+
"I16x8",
4083+
"A SIMD vector type containing 8 integer lanes each 16 bits wide.",
4084+
TypeSetBuilder::new()
4085+
.ints(16..16)
4086+
.simd_lanes(8..8)
4087+
.includes_scalars(false)
4088+
.build(),
4089+
);
4090+
4091+
let x = &Operand::new("x", I16x8);
4092+
let y = &Operand::new("y", I16x8);
4093+
let a = &Operand::new("a", &I16x8.merge_lanes());
4094+
4095+
ig.push(
4096+
Inst::new(
4097+
"widening_pairwise_dot_product_s",
4098+
r#"
4099+
Takes corresponding elements in `x` and `y`, performs a sign-extending length-doubling
4100+
multiplication on them, then adds adjacent pairs of elements to form the result. For
4101+
example, if the input vectors are `[x3, x2, x1, x0]` and `[y3, y2, y1, y0]`, it produces
4102+
the vector `[r1, r0]`, where `r1 = sx(x3) * sx(y3) + sx(x2) * sx(y2)` and
4103+
`r0 = sx(x1) * sx(y1) + sx(x0) * sx(y0)`, and `sx(n)` sign-extends `n` to twice its width.
4104+
4105+
This will double the lane width and halve the number of lanes. So the resulting
4106+
vector has the same number of bits as `x` and `y` do (individually).
4107+
4108+
See https://github.com/WebAssembly/simd/pull/127 for background info.
4109+
"#,
4110+
&formats.binary,
4111+
)
4112+
.operands_in(vec![x, y])
4113+
.operands_out(vec![a]),
4114+
);
4115+
40814116
let IntTo = &TypeVar::new(
40824117
"IntTo",
40834118
"A larger integer type with the same number of lanes",

cranelift/codegen/src/isa/aarch64/inst/args.rs

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -677,6 +677,9 @@ impl VectorSize {
677677
}
678678
}
679679

680+
/// Produces a `VectorSize` with lanes twice as wide. Note that if the resulting
681+
/// size would exceed 128 bits, then the number of lanes is also halved, so as to
682+
/// ensure that the result size is at most 128 bits.
680683
pub fn widen(&self) -> VectorSize {
681684
match self {
682685
VectorSize::Size8x8 => VectorSize::Size16x8,
@@ -689,6 +692,7 @@ impl VectorSize {
689692
}
690693
}
691694

695+
/// Produces a `VectorSize` that has the same lane width, but half as many lanes.
692696
pub fn halve(&self) -> VectorSize {
693697
match self {
694698
VectorSize::Size8x16 => VectorSize::Size8x8,

cranelift/codegen/src/isa/aarch64/inst/emit.rs

Lines changed: 6 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1950,11 +1950,13 @@ impl MachInstEmit for Inst {
19501950
(0b001_01110_00_1 | enc_size << 1, 0b100000)
19511951
}
19521952
VecALUOp::Zip1 => (0b01001110_00_0 | enc_size << 1, 0b001110),
1953+
VecALUOp::Smull => (0b000_01110_00_1 | enc_size << 1, 0b110000),
1954+
VecALUOp::Smull2 => (0b010_01110_00_1 | enc_size << 1, 0b110000),
19531955
};
1954-
let top11 = if is_float {
1955-
top11 | (q << 9) | enc_float_size << 1
1956-
} else {
1957-
top11 | (q << 9)
1956+
let top11 = match alu_op {
1957+
VecALUOp::Smull | VecALUOp::Smull2 => top11,
1958+
_ if is_float => top11 | (q << 9) | enc_float_size << 1,
1959+
_ => top11 | (q << 9),
19581960
};
19591961
sink.put4(enc_vec_rrr(top11, rm, bit15_10, rn, rd));
19601962
}

cranelift/codegen/src/isa/aarch64/inst/emit_tests.rs

Lines changed: 72 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3243,6 +3243,78 @@ fn test_aarch64_binemit() {
32433243
"zip1 v9.2d, v20.2d, v17.2d",
32443244
));
32453245

3246+
insns.push((
3247+
Inst::VecRRR {
3248+
alu_op: VecALUOp::Smull,
3249+
rd: writable_vreg(16),
3250+
rn: vreg(12),
3251+
rm: vreg(1),
3252+
size: VectorSize::Size8x16,
3253+
},
3254+
"90C1210E",
3255+
"smull v16.8h, v12.8b, v1.8b",
3256+
));
3257+
3258+
insns.push((
3259+
Inst::VecRRR {
3260+
alu_op: VecALUOp::Smull,
3261+
rd: writable_vreg(2),
3262+
rn: vreg(13),
3263+
rm: vreg(6),
3264+
size: VectorSize::Size16x8,
3265+
},
3266+
"A2C1660E",
3267+
"smull v2.4s, v13.4h, v6.4h",
3268+
));
3269+
3270+
insns.push((
3271+
Inst::VecRRR {
3272+
alu_op: VecALUOp::Smull,
3273+
rd: writable_vreg(8),
3274+
rn: vreg(12),
3275+
rm: vreg(14),
3276+
size: VectorSize::Size32x4,
3277+
},
3278+
"88C1AE0E",
3279+
"smull v8.2d, v12.2s, v14.2s",
3280+
));
3281+
3282+
insns.push((
3283+
Inst::VecRRR {
3284+
alu_op: VecALUOp::Smull2,
3285+
rd: writable_vreg(16),
3286+
rn: vreg(12),
3287+
rm: vreg(1),
3288+
size: VectorSize::Size8x16,
3289+
},
3290+
"90C1214E",
3291+
"smull2 v16.8h, v12.16b, v1.16b",
3292+
));
3293+
3294+
insns.push((
3295+
Inst::VecRRR {
3296+
alu_op: VecALUOp::Smull2,
3297+
rd: writable_vreg(2),
3298+
rn: vreg(13),
3299+
rm: vreg(6),
3300+
size: VectorSize::Size16x8,
3301+
},
3302+
"A2C1664E",
3303+
"smull2 v2.4s, v13.8h, v6.8h",
3304+
));
3305+
3306+
insns.push((
3307+
Inst::VecRRR {
3308+
alu_op: VecALUOp::Smull2,
3309+
rd: writable_vreg(8),
3310+
rn: vreg(12),
3311+
rm: vreg(14),
3312+
size: VectorSize::Size32x4,
3313+
},
3314+
"88C1AE4E",
3315+
"smull2 v8.2d, v12.4s, v14.4s",
3316+
));
3317+
32463318
insns.push((
32473319
Inst::VecMisc {
32483320
op: VecMisc2::Not,

0 commit comments

Comments
 (0)