Issue 172903
Summary [RegAlloc]
Labels new issue
Assignees
Reporter JoernEngel
    I was told to submit testcases for bad register allocation.  Here you go!

Performance is dominated by the 40 decode_one(). Ideally they should generate 4 instructions each, but without the "ifdef __BMI2__" hint I get 5 instructions and about 10% worse performance. Drawback of using shrx is the extra register spent on a constant value.  In this case it is worth spending the extra register, determined by experimentation and benchmarks.  I have no idea how a compiler would automatically make the right decision on unknown code, this might be a challenge.

Otherwise the generated code is quite good, almost pleasant to read.  Gcc-12 is storing the constant (53) 6 times per loop iteration, clang-16 does it once outside the loop.


#include <endian.h>

typedef unsigned char u8;
typedef unsigned short u16;
typedef unsigned u32;
typedef unsigned long long u64;
typedef u64 u64u __attribute__((may_alias, aligned(1)));

static inline u64 read_le64(const void *buf) { return le64toh(*(const u64u *)buf); }

static inline u64 shrx(u64 val, u64 s)
{
	u64 ret;
	asm("shrx %[s],%[v],%[r]\n\t"
		:[r]"=r"(ret)
		:[v]"r"(val), [s]"r"(s)
		:);
	return ret;
}

static inline u64 decode_one(u64 acc, u8 *op, const u16 table[2048])
{
	u64 slot;
	/* shrx is faster than mov+shr */
#ifdef __BMI2__
	slot = shrx(acc, 53);
#else
	slot = acc >> 53;
#endif
	u16 entry = table[slot];
	*op = entry>>8;
	acc <<= entry&0xff;
	return acc;
}

static inline u64 reload_one(u64 acc, const void **ipp)
{
	const void *ip = *ipp;

	int bits = __builtin_ctzll(acc);
	int bytes = bits/8;
	ip -= bytes;
	acc = read_le64(ip)|1;
	acc <<= bits&7;

	*ipp = ip;
	return acc;
}

void dec_bitstream_mod8(void *dst, u64 dlen,
		const void *src, u64 slen[8],
		const u16 table[2048])
{
	const void *ip[8];
	ip[0] = src+slen[0] - 8;
	for (int i=1; i<8; i++)
		ip[i] = ip[i-1] + slen[i];
	u8 *op = dst;
	u64 r[8];
	for (int i=0; i<8; i++)
		r[i] = read_le64(ip[i])|1;
	u64 o=0;
	for (; o+40<=dlen; o+=40) {
		r[0] = decode_one(r[0], op+o+000, table);
		r[1] = decode_one(r[1], op+o+001, table);
		r[2] = decode_one(r[2], op+o+002, table);
		r[3] = decode_one(r[3], op+o+003, table);
		r[4] = decode_one(r[4], op+o+004, table);
		r[5] = decode_one(r[5], op+o+005, table);
		r[6] = decode_one(r[6], op+o+006, table);
		r[7] = decode_one(r[7], op+o+007, table);

		r[0] = decode_one(r[0], op+o+010, table);
		r[1] = decode_one(r[1], op+o+011, table);
		r[2] = decode_one(r[2], op+o+012, table);
		r[3] = decode_one(r[3], op+o+013, table);
		r[4] = decode_one(r[4], op+o+014, table);
		r[5] = decode_one(r[5], op+o+015, table);
		r[6] = decode_one(r[6], op+o+016, table);
		r[7] = decode_one(r[7], op+o+017, table);

		r[0] = decode_one(r[0], op+o+020, table);
		r[1] = decode_one(r[1], op+o+021, table);
		r[2] = decode_one(r[2], op+o+022, table);
		r[3] = decode_one(r[3], op+o+023, table);
		r[4] = decode_one(r[4], op+o+024, table);
		r[5] = decode_one(r[5], op+o+025, table);
		r[6] = decode_one(r[6], op+o+026, table);
		r[7] = decode_one(r[7], op+o+027, table);

		r[0] = decode_one(r[0], op+o+030, table);
		r[1] = decode_one(r[1], op+o+031, table);
		r[2] = decode_one(r[2], op+o+032, table);
		r[3] = decode_one(r[3], op+o+033, table);
		r[4] = decode_one(r[4], op+o+034, table);
		r[5] = decode_one(r[5], op+o+035, table);
		r[6] = decode_one(r[6], op+o+036, table);
		r[7] = decode_one(r[7], op+o+037, table);

		r[0] = decode_one(r[0], op+o+040, table);
		r[1] = decode_one(r[1], op+o+041, table);
		r[2] = decode_one(r[2], op+o+042, table);
		r[3] = decode_one(r[3], op+o+043, table);
		r[4] = decode_one(r[4], op+o+044, table);
		r[5] = decode_one(r[5], op+o+045, table);
		r[6] = decode_one(r[6], op+o+046, table);
		r[7] = decode_one(r[7], op+o+047, table);

		r[0] = reload_one(r[0], &ip[0]);
		r[1] = reload_one(r[1], &ip[1]);
		r[2] = reload_one(r[2], &ip[2]);
		r[3] = reload_one(r[3], &ip[3]);
		r[4] = reload_one(r[4], &ip[4]);
		r[5] = reload_one(r[5], &ip[5]);
		r[6] = reload_one(r[6], &ip[6]);
		r[7] = reload_one(r[7], &ip[7]);
	}
	for (;; o+=8) {
		/* weird indentation is easier to read */
#		pragma GCC diagnostic push
#		pragma GCC diagnostic ignored "-Wmisleading-indentation"
		if (o+0>=dlen) break; r[0] = decode_one(r[0], op+o+0, table);
		if (o+1>=dlen) break; r[1] = decode_one(r[1], op+o+1, table);
		if (o+2>=dlen) break; r[2] = decode_one(r[2], op+o+2, table);
		if (o+3>=dlen) break; r[3] = decode_one(r[3], op+o+3, table);
		if (o+4>=dlen) break; r[4] = decode_one(r[4], op+o+4, table);
		if (o+5>=dlen) break; r[5] = decode_one(r[5], op+o+5, table);
		if (o+6>=dlen) break; r[6] = decode_one(r[6], op+o+6, table);
		if (o+7>=dlen) break; r[7] = decode_one(r[7], op+o+7, table);
#		pragma GCC diagnostic pop
	}
}
_______________________________________________
llvm-bugs mailing list
[email protected]
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-bugs

Reply via email to