Skip to content

error : couldn't allocate output register for constraint 'w' #141022

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
scoutzeng opened this issue May 22, 2025 · 5 comments
Open

error : couldn't allocate output register for constraint 'w' #141022

scoutzeng opened this issue May 22, 2025 · 5 comments

Comments

@scoutzeng
Copy link

scoutzeng commented May 22, 2025

Environment:
Clang version: 19.1.1
Visual Studio 17 2022

When I compile an ARM inline-asm snippet with clang-cl, the compiler reports that it couldn't allocate an output register for constraint 'w'.

/*
 * Reproducer from the issue report: tests `a` against `b` byte by byte across
 * four 16-byte vectors and reduces the result to eight bytes using AArch64
 * inline asm, which are then returned packed into one 64-bit value.
 *
 * NOTE(review): the eight uint8_t outputs bound with "=w" appear to be the
 * operands behind the reported clang error ("couldn't allocate output
 * register for constraint 'w'"). The code is intentionally left unchanged so
 * that it still reproduces the problem.
 */
__mmask64 _mm512_test_epi8_mask(__m512i a, __m512i b)
{
    /* Per-byte mask constants; g_mask_epi8 is defined outside this snippet. */
    uint8x16_t mask_and = vld1q_u8(g_mask_epi8);
    __m512i tmp;
    /* For each of the four 16-byte parts: per-byte test of a against b
     * (vtstq_u8), then AND every lane with the matching mask_and byte.
     * NOTE(review): presumably mask_and holds one distinct bit per lane within
     * each 8-byte half so the sums below form bitmasks — confirm against the
     * definition of g_mask_epi8. */
    tmp.vect_u8[0] = vandq_u8(vtstq_u8(a.vect_u8[0], b.vect_u8[0]), mask_and);
    tmp.vect_u8[1] = vandq_u8(vtstq_u8(a.vect_u8[1], b.vect_u8[1]), mask_and);
    tmp.vect_u8[2] = vandq_u8(vtstq_u8(a.vect_u8[2], b.vect_u8[2]), mask_and);
    tmp.vect_u8[3] = vandq_u8(vtstq_u8(a.vect_u8[3], b.vect_u8[3]), mask_and);
    /* One reduced byte per 8-lane half: r[2*i] comes from the low half of
     * part i, r[2*i + 1] from its high half. */
    uint8_t r[8];
    /* Three phases, in order:
     *   1. addv: sum the low 8 bytes of t0..t3 into r0, r2, r4, r6.
     *   2. ins:  copy the high 64-bit element (d[1]) of each tN over its low
     *            one (d[0]); this is why t0..t3 are read-write ("+w").
     *   3. addv: sum the (former high) 8 bytes of t0..t3 into r1, r3, r5, r7. */
    __asm__ __volatile__ (
        "addv %b[r0], %[t0].8b              \n\t"
        "addv %b[r2], %[t1].8b              \n\t"
        "addv %b[r4], %[t2].8b              \n\t"
        "addv %b[r6], %[t3].8b              \n\t"
        "ins %[t0].d[0], %[t0].d[1]         \n\t"
        "ins %[t1].d[0], %[t1].d[1]         \n\t"
        "ins %[t2].d[0], %[t2].d[1]         \n\t"
        "ins %[t3].d[0], %[t3].d[1]         \n\t"
        "addv %b[r1], %[t0].8b              \n\t"
        "addv %b[r3], %[t1].8b              \n\t"
        "addv %b[r5], %[t2].8b              \n\t"
        "addv %b[r7], %[t3].8b              \n\t"
        :[r0]"=w"(r[0]), [r1]"=w"(r[1]), [r2]"=w"(r[2]), [r3]"=w"(r[3]), [r4]"=w"(r[4]), [r5]"=w"(r[5]), [r6]"=w"(r[6]),
         [r7]"=w"(r[7]), 
         [t0]"+w"(tmp.vect_u8[0]), [t1]"+w"(tmp.vect_u8[1]), [t2]"+w"(tmp.vect_u8[2]), [t3]"+w"(tmp.vect_u8[3])
    );
    /* Reload the eight result bytes as a single 64-bit lane and return it. */
    uint64x1_t res = vreinterpret_u64_u8(vld1_u8((const uint8_t *)r));
    return vget_lane_u64(res, 0);
}
@scoutzeng
Copy link
Author

LLVM's documentation lists the 'w' constraint as supported (see https://llvm.org/docs/LangRef.html?referer=https%3A%2F%2Fcloud.tencent.com%2Fdeveloper%2Fask%2Fsof%2F114457294#supported-constraint-code-list), so I'm confused by this error. Is this a compiler issue?

@llvmbot
Copy link
Member

llvmbot commented May 22, 2025

@llvm/issue-subscribers-backend-x86

Author: None (scoutzeng)

Environment: Clang version 19.1.1, Visual Studio 17 2022

When I compile an ARM inline-asm snippet with clang-cl, the compiler reports that it couldn't allocate an output register for constraint 'w'.

/*
 * Reproducer from the issue report: tests `a` against `b` byte by byte across
 * four 16-byte vectors and reduces the result to eight bytes using AArch64
 * inline asm, which are then returned packed into one 64-bit value.
 *
 * NOTE(review): the eight uint8_t outputs bound with "=w" appear to be the
 * operands behind the reported clang error ("couldn't allocate output
 * register for constraint 'w'"). The code is intentionally left unchanged so
 * that it still reproduces the problem.
 */
__mmask64 _mm512_test_epi8_mask(__m512i a, __m512i b)
{
    /* Per-byte mask constants; g_mask_epi8 is defined outside this snippet. */
    uint8x16_t mask_and = vld1q_u8(g_mask_epi8);
    __m512i tmp;
    /* For each of the four 16-byte parts: per-byte test of a against b
     * (vtstq_u8), then AND every lane with the matching mask_and byte.
     * NOTE(review): presumably mask_and holds one distinct bit per lane within
     * each 8-byte half so the sums below form bitmasks — confirm against the
     * definition of g_mask_epi8. */
    tmp.vect_u8[0] = vandq_u8(vtstq_u8(a.vect_u8[0], b.vect_u8[0]), mask_and);
    tmp.vect_u8[1] = vandq_u8(vtstq_u8(a.vect_u8[1], b.vect_u8[1]), mask_and);
    tmp.vect_u8[2] = vandq_u8(vtstq_u8(a.vect_u8[2], b.vect_u8[2]), mask_and);
    tmp.vect_u8[3] = vandq_u8(vtstq_u8(a.vect_u8[3], b.vect_u8[3]), mask_and);
    /* One reduced byte per 8-lane half: r[2*i] comes from the low half of
     * part i, r[2*i + 1] from its high half. */
    uint8_t r[8];
    /* Three phases, in order:
     *   1. addv: sum the low 8 bytes of t0..t3 into r0, r2, r4, r6.
     *   2. ins:  copy the high 64-bit element (d[1]) of each tN over its low
     *            one (d[0]); this is why t0..t3 are read-write ("+w").
     *   3. addv: sum the (former high) 8 bytes of t0..t3 into r1, r3, r5, r7. */
    __asm__ __volatile__ (
        "addv %b[r0], %[t0].8b              \n\t"
        "addv %b[r2], %[t1].8b              \n\t"
        "addv %b[r4], %[t2].8b              \n\t"
        "addv %b[r6], %[t3].8b              \n\t"
        "ins %[t0].d[0], %[t0].d[1]         \n\t"
        "ins %[t1].d[0], %[t1].d[1]         \n\t"
        "ins %[t2].d[0], %[t2].d[1]         \n\t"
        "ins %[t3].d[0], %[t3].d[1]         \n\t"
        "addv %b[r1], %[t0].8b              \n\t"
        "addv %b[r3], %[t1].8b              \n\t"
        "addv %b[r5], %[t2].8b              \n\t"
        "addv %b[r7], %[t3].8b              \n\t"
        :[r0]"=w"(r[0]), [r1]"=w"(r[1]), [r2]"=w"(r[2]), [r3]"=w"(r[3]), [r4]"=w"(r[4]), [r5]"=w"(r[5]), [r6]"=w"(r[6]),
         [r7]"=w"(r[7]), 
         [t0]"+w"(tmp.vect_u8[0]), [t1]"+w"(tmp.vect_u8[1]), [t2]"+w"(tmp.vect_u8[2]), [t3]"+w"(tmp.vect_u8[3])
    );
    /* Reload the eight result bytes as a single 64-bit lane and return it. */
    uint64x1_t res = vreinterpret_u64_u8(vld1_u8((const uint8_t *)r));
    return vget_lane_u64(res, 0);
}

@llvmbot
Copy link
Member

llvmbot commented May 22, 2025

@llvm/issue-subscribers-backend-aarch64

Author: None (scoutzeng)

Environment: Clang version 19.1.1, Visual Studio 17 2022

When I compile an ARM inline-asm snippet with clang-cl, the compiler reports that it couldn't allocate an output register for constraint 'w'.

/*
 * Reproducer from the issue report: tests `a` against `b` byte by byte across
 * four 16-byte vectors and reduces the result to eight bytes using AArch64
 * inline asm, which are then returned packed into one 64-bit value.
 *
 * NOTE(review): the eight uint8_t outputs bound with "=w" appear to be the
 * operands behind the reported clang error ("couldn't allocate output
 * register for constraint 'w'"). The code is intentionally left unchanged so
 * that it still reproduces the problem.
 */
__mmask64 _mm512_test_epi8_mask(__m512i a, __m512i b)
{
    /* Per-byte mask constants; g_mask_epi8 is defined outside this snippet. */
    uint8x16_t mask_and = vld1q_u8(g_mask_epi8);
    __m512i tmp;
    /* For each of the four 16-byte parts: per-byte test of a against b
     * (vtstq_u8), then AND every lane with the matching mask_and byte.
     * NOTE(review): presumably mask_and holds one distinct bit per lane within
     * each 8-byte half so the sums below form bitmasks — confirm against the
     * definition of g_mask_epi8. */
    tmp.vect_u8[0] = vandq_u8(vtstq_u8(a.vect_u8[0], b.vect_u8[0]), mask_and);
    tmp.vect_u8[1] = vandq_u8(vtstq_u8(a.vect_u8[1], b.vect_u8[1]), mask_and);
    tmp.vect_u8[2] = vandq_u8(vtstq_u8(a.vect_u8[2], b.vect_u8[2]), mask_and);
    tmp.vect_u8[3] = vandq_u8(vtstq_u8(a.vect_u8[3], b.vect_u8[3]), mask_and);
    /* One reduced byte per 8-lane half: r[2*i] comes from the low half of
     * part i, r[2*i + 1] from its high half. */
    uint8_t r[8];
    /* Three phases, in order:
     *   1. addv: sum the low 8 bytes of t0..t3 into r0, r2, r4, r6.
     *   2. ins:  copy the high 64-bit element (d[1]) of each tN over its low
     *            one (d[0]); this is why t0..t3 are read-write ("+w").
     *   3. addv: sum the (former high) 8 bytes of t0..t3 into r1, r3, r5, r7. */
    __asm__ __volatile__ (
        "addv %b[r0], %[t0].8b              \n\t"
        "addv %b[r2], %[t1].8b              \n\t"
        "addv %b[r4], %[t2].8b              \n\t"
        "addv %b[r6], %[t3].8b              \n\t"
        "ins %[t0].d[0], %[t0].d[1]         \n\t"
        "ins %[t1].d[0], %[t1].d[1]         \n\t"
        "ins %[t2].d[0], %[t2].d[1]         \n\t"
        "ins %[t3].d[0], %[t3].d[1]         \n\t"
        "addv %b[r1], %[t0].8b              \n\t"
        "addv %b[r3], %[t1].8b              \n\t"
        "addv %b[r5], %[t2].8b              \n\t"
        "addv %b[r7], %[t3].8b              \n\t"
        :[r0]"=w"(r[0]), [r1]"=w"(r[1]), [r2]"=w"(r[2]), [r3]"=w"(r[3]), [r4]"=w"(r[4]), [r5]"=w"(r[5]), [r6]"=w"(r[6]),
         [r7]"=w"(r[7]), 
         [t0]"+w"(tmp.vect_u8[0]), [t1]"+w"(tmp.vect_u8[1]), [t2]"+w"(tmp.vect_u8[2]), [t3]"+w"(tmp.vect_u8[3])
    );
    /* Reload the eight result bytes as a single 64-bit lane and return it. */
    uint64x1_t res = vreinterpret_u64_u8(vld1_u8((const uint8_t *)r));
    return vget_lane_u64(res, 0);
}

@efriedma-quic
Copy link
Collaborator

It looks like the compiler is specifically unhappy that the type of the output value is uint8_t, as opposed to something wider (uint32_t etc.). I guess it's a bug, since gcc is happy with it...

As a workaround, you can change the output value to a wider integer.

Reduced testcase:

/* Reduced testcase: an empty asm template with a single char-sized output
 * operand bound to the "w" constraint. Kept minimal on purpose. */
void z() {
    char x; asm("":"=w"(x));
}

@davemgreen
Copy link
Collaborator

Out of interest - why do you need inline assembly, and can you just use addp to accumulate the bits? Something like

    /* Suggested intrinsic-only alternative to the inline asm: combine the four
     * tmp vectors with repeated vpaddq_u8 calls (two pairs, then the pair of
     * results, then the result with itself) and take the low half.
     * NOTE(review): posted as an approximate sketch; how the value maps onto
     * the original function's __mmask64 return type is not shown — confirm
     * before use. */
    uint8x16_t x = vpaddq_u8(tmp. vect_u8[0], tmp. vect_u8[1]);
    uint8x16_t y = vpaddq_u8(tmp. vect_u8[2], tmp. vect_u8[3]);
    x = vpaddq_u8(x,y);
    return vget_low_u8(vpaddq_u8(x, x));

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Projects
None yet
Development

No branches or pull requests

5 participants