On 3/12/20 7:58 AM, LIU Zhiwei wrote:
> +target_ulong HELPER(vmpopc_m)(void *v0, void *vs2, CPURISCVState *env,
> +                              uint32_t desc)
> +{
> +    target_ulong cnt = 0;
> +    uint32_t mlen = vext_mlen(desc);
> +    uint32_t vm = vext_vm(desc);
> +    uint32_t vl = env->vl;
> +    int i;
> +
> +    for (i = 0; i < vl; i++) {
> +        if (vm || vext_elem_mask(v0, mlen, i)) {
> +            if (vext_elem_mask(vs2, mlen, i)) {
> +                cnt++;
> +            }
> +        }
> +    }
> +    return cnt;
> +}
This is ok as-is, so
Reviewed-by: Richard Henderson <[email protected]>
But you can do better: create an array, similar to arm's
pred_esz_masks[], indexed by log2(mlen), so that ctpop64 can count
up to 64 mask bits at a time instead of one element per iteration.
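E.g., with one significant bit per mlen-bit mask element, the table
would be something like this (the entries past mlen == 8 extend arm's
pattern; a sketch only, not tested):

static const uint64_t pred_mlen_masks[7] = {
    0xffffffffffffffffull,  /* mlen ==  1 */
    0x5555555555555555ull,  /* mlen ==  2 */
    0x1111111111111111ull,  /* mlen ==  4 */
    0x0101010101010101ull,  /* mlen ==  8 */
    0x0001000100010001ull,  /* mlen == 16 */
    0x0000000100000001ull,  /* mlen == 32 */
    0x0000000000000001ull,  /* mlen == 64 */
};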
mask = pred_mlen_masks[log2_mlen];

/* Split vl into n complete uint64_t words of mask bits,
   plus r leftover elements in a final partial word.  */
n = vl >> (6 - log2_mlen);
r = vl & ((64 >> log2_mlen) - 1);
if (r) {
    rmask = extract64(mask, 0, r << log2_mlen);
} else {
    rmask = 0;
}

if (vm) {
    /* Unmasked: count the significant bit of every vs2 element.  */
    for (i = 0; i < n; i++) {
        uint64_t j = ((uint64_t *)vs2)[i];
        cnt += ctpop64(j & mask);
    }
    if (rmask) {
        uint64_t j = ((uint64_t *)vs2)[n];
        cnt += ctpop64(j & rmask);
    }
} else {
    /* Masked: require the corresponding v0 mask bit as well.  */
    for (i = 0; i < n; i++) {
        uint64_t j = ((uint64_t *)vs2)[i];
        uint64_t k = ((uint64_t *)v0)[i];
        cnt += ctpop64(j & k & mask);
    }
    if (rmask) {
        uint64_t j = ((uint64_t *)vs2)[n];
        uint64_t k = ((uint64_t *)v0)[n];
        cnt += ctpop64(j & k & rmask);
    }
}
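The separate rmask step is what keeps mask bits beyond vl in the
final partial word from being counted.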
r~