#include <math.h>
#include <stdio.h>
#include <unistd.h>
#include <sys/mman.h>
#include <stdlib.h>

#include "perf.h"
#include "stat.h"
#include "lfsr.h"

/*
 * Counter bumped up and down by wipe_btb().  It is volatile so the compiler
 * cannot optimize the updates (and the branches guarding them) away.
 */
volatile unsigned long acc = 0;

/*
 * Exponent of the wipe_btb() iteration count (1 << LFSR_BITS) and the
 * argument handed to lfsr_taps().  Can be overridden at build time.
 */
#ifndef LFSR_BITS
#define LFSR_BITS 16
#endif

/*
 * wipe_btb - execute a long run of LFSR-steered conditional branches
 *
 * Loops (1 << LFSR_BITS) times.  Every iteration takes two branches whose
 * direction is chosen by the low bit of 'val', and 'val' is advanced with
 * lfsr(val, lfsr_taps(LFSR_BITS)) after each branch.  Each branch bumps the
 * global 'acc' up or down.
 *
 * NOTE(review): going by the name and the WIPE_BTB switch in main(), this is
 * presumably meant to disturb branch-predictor / BTB state between
 * measurements -- confirm.  noinline prevents it from being inlined into
 * its caller.
 */
void __attribute__((noinline)) wipe_btb(void)

{
	u32 val = 1;	/* LFSR state, seeded with 1 */
	int i;

	for (i=0; i<1<<LFSR_BITS; i++) {
		/* Branch 1: low bit set -> increment, clear -> decrement. */
		if (val & 1)
			acc++;
		else
			acc--;
		val = lfsr(val, lfsr_taps(LFSR_BITS));

		/*
		 * Branch 2: same bit-to-action mapping as branch 1, but the
		 * condition is written inverted (low bit clear -> decrement).
		 */
		if (~val & 1)
			acc--;
		else
			acc++;
		val = lfsr(val, lfsr_taps(LFSR_BITS));
	}
}

/*
 * Width of 'long' in bits, assuming 8-bit bytes.  Because it expands to a
 * sizeof expression it cannot be tested in #if directives.
 */
#define BITS_PER_LONG	(sizeof(long) * 8)

/*
 * Number of iterations of the measurement loop in main().  Can be
 * overridden at build time.
 */
#ifndef LOOPS
#define LOOPS 1000000
#endif

#ifdef SOFTFLS

/*
 * __fls - find the index of the most significant set bit (software version)
 * @word: value to search; should be non-zero
 *
 * Binary search: at each step, if the upper half of the remaining window is
 * empty, the answer lies in the lower half, so the index is reduced and the
 * word is shifted up to keep the window at the top of the register.
 *
 * Returns the bit index, counting from 0 for the least significant bit.
 * For @word == 0 every step fires and the result is 0, which cannot be told
 * apart from @word == 1.
 */
static __always_inline unsigned long __fls(unsigned long word)
{
	int num = BITS_PER_LONG - 1;

	/*
	 * The 32-bit step only applies when long is 64 bits wide.  The shift
	 * counts are written relative to BITS_PER_LONG so that this (then dead)
	 * code never contains a shift by the full width of the type, which
	 * would be undefined behaviour on 32-bit targets.
	 */
	if (BITS_PER_LONG == 64 &&
	    !(word & (~0ul << (BITS_PER_LONG - 32)))) {
		num -= 32;
		word <<= BITS_PER_LONG - 32;
	}
	if (!(word & (~0ul << (BITS_PER_LONG-16)))) {
		num -= 16;
		word <<= 16;
	}
	if (!(word & (~0ul << (BITS_PER_LONG-8)))) {
		num -= 8;
		word <<= 8;
	}
	if (!(word & (~0ul << (BITS_PER_LONG-4)))) {
		num -= 4;
		word <<= 4;
	}
	if (!(word & (~0ul << (BITS_PER_LONG-2)))) {
		num -= 2;
		word <<= 2;
	}
	/* Last step: only the top bit is left to test, no shift needed. */
	if (!(word & (~0ul << (BITS_PER_LONG-1))))
		num -= 1;

	return num;
}

#else

/*
 * __fls - index of the most significant set bit, via an x86 bit-scan
 * instruction
 * @word: value to scan
 *
 * When LZCNT is defined to a non-zero value the "rep; bsr" form is emitted;
 * its result is treated as a leading-zero count and converted to a bit
 * index.  Otherwise plain BSR is used and its result is returned directly.
 *
 * NOTE(review): "rep; bsr" is presumably relied on to decode as LZCNT; on a
 * CPU without LZCNT it would run as plain BSR and the conversion below would
 * give wrong results -- confirm the build only sets LZCNT where supported.
 * NOTE(review): BSR's result for a zero source is not defined by the
 * architecture; the int_sqrt() variant that calls this only does so for
 * x > 1.
 */
static __always_inline unsigned long __fls(unsigned long word)
{
#if LZCNT
	/* 'word' is reused as the output operand: it now holds the count. */
	asm("rep; bsr %1,%0"
		: "=r" (word)
		: "rm" (word));
	/* leading zeros -> index of the highest set bit */
	return BITS_PER_LONG - 1 - word;
#else
	/* 'word' is reused as the output operand: it now holds the index. */
	asm("bsr %1,%0"
		: "=r" (word)
		: "rm" (word));
	return word;
#endif
}

#endif


#ifdef NEW

/*
 * int_sqrt - integer square root (NEW variant)
 * @x: value whose root is wanted
 *
 * Bit-by-bit method: 'bit' walks down through the powers of four and each
 * round decides whether the matching bit belongs in the root.
 *
 * Build-time switches:
 *   FLS    - start 'bit' at the even bit position at or below the highest
 *            set bit of @x (via __fls()) instead of at the top of the word.
 *   ANSHUL - additionally skip starting positions that are larger than @x.
 *
 * main() checks the result against floor(sqrt(x)) when VALIDATE is set.
 */
unsigned long __attribute__((noinline)) int_sqrt(unsigned long x)
{
	unsigned long root = 0;
	unsigned long bit;

	/* 0 and 1 are returned unchanged. */
	if (x < 2)
		return x;

#ifdef FLS
	bit = 1UL << (__fls(x) & ~1UL);
#else
	bit = 1UL << (BITS_PER_LONG - 2);
#endif

#ifdef ANSHUL
	for (; bit > x; bit >>= 2)
		;
#endif

	for (; bit != 0; bit >>= 2) {
		unsigned long trial = root + bit;

		root >>= 1;

		/* The candidate fits: consume it and set this bit in the root. */
		if (trial <= x) {
			x -= trial;
			root += bit;
		}
	}

	return root;
}

#elif LINUS


/*
 * int_sqrt - integer square root (LINUS variant)
 * @x: value whose root is wanted
 *
 * First grows 'bit' from 64 in steps of a factor of four until it is no
 * smaller than @x (or the next step would shift it out of the word), then
 * runs the usual bit-by-bit root extraction downwards from there.
 *
 * main() checks the result against floor(sqrt(x)) when VALIDATE is set.
 */
unsigned long __attribute__((noinline)) int_sqrt(unsigned long x)
{
	unsigned long bit = 64;
	unsigned long root = 0;

	/* 0 and 1 are returned unchanged. */
	if (x < 2)
		return x;

	/* Scale up; the first step is always taken, so bit >= 256 afterwards. */
	for (;;) {
		unsigned long next = bit << 2;

		if (next == 0)		/* would overflow the word */
			break;
		bit = next;
		if (bit >= x)
			break;
	}

	/* bit is non-zero here, so the body runs at least once. */
	while (bit != 0) {
		unsigned long trial = root + bit;

		root >>= 1;

		/* The candidate fits: consume it and set this bit in the root. */
		if (trial <= x) {
			x -= trial;
			root += bit;
		}
		bit >>= 2;
	}

	return root;
}

#else

/*
 * int_sqrt - integer square root (baseline variant)
 * @x: value whose root is wanted
 *
 * Starts 'bit' at the highest power of four that fits in a long, lowers it
 * until it no longer exceeds @x, then extracts the root bit by bit.
 *
 * main() checks the result against floor(sqrt(x)) when VALIDATE is set.
 */
unsigned long __attribute__((noinline)) int_sqrt(unsigned long x)
{
	unsigned long rem = x;
	unsigned long root = 0;
	unsigned long bit;

	/* Find the starting power of four; ends at 0 when x is 0. */
	for (bit = 1UL << (BITS_PER_LONG - 2); bit > rem; bit >>= 2)
		;

	for (; bit != 0; bit >>= 2) {
		unsigned long trial = root + bit;

		/* The candidate fits: consume it and set this bit in the root. */
		if (rem >= trial) {
			rem -= trial;
			root += bit << 1;
		}
		root >>= 1;
	}

	return root;
}

#endif

/* Counter to measure; defaults to CPU cycles, can be overridden at build time. */
#ifndef EVENT
#define EVENT PERF_COUNT_HW_CPU_CYCLES
#endif

/*
 * Description of the event handed to create_event() by main().
 *
 * NOTE(review): assumes struct perf_event_attr is the Linux perf ABI struct
 * made visible through perf.h; per perf_event_open(2), exclude_kernel drops
 * kernel-mode events from the count and pinned asks for the counter to stay
 * on the CPU -- confirm.
 */
static struct perf_event_attr attr_event = {
	.type = PERF_TYPE_HARDWARE,
	.config = EVENT,
	.exclude_kernel = 1,
	.pinned = 1,
};

/*
 * create_event - open a perf event and map its first page
 * @attr: event description, passed unchanged to sys_perf_event_open()
 *
 * Maps one page of the event file descriptor read-only and shared, then
 * closes the descriptor; the mapping stays valid after the close.  Any
 * failure is reported with perror() and ends the process via exit(-1).
 *
 * Returns the address of the mapping.  It is never unmapped here.
 *
 * NOTE(review): the remaining sys_perf_event_open() arguments presumably
 * follow perf_event_open(2): pid 0, cpu -1, group_fd -1, flags 0 -- confirm
 * against the wrapper in perf.h.
 */
static void *create_event(struct perf_event_attr *attr)
{
	void *event;
	int fd = sys_perf_event_open(attr, 0, -1, -1, 0);

	if (fd < 0) {
		perror("sys_perf_event_open");
		exit(-1);
	}

	event = mmap(NULL, sysconf(_SC_PAGESIZE), PROT_READ, MAP_SHARED, fd, 0);
	/* MAP_FAILED is the documented failure value of mmap(). */
	if (event == MAP_FAILED) {
		perror("mmap");
		exit(-1);
	}

	close(fd);

	/*
	 * NOTE(review): result deliberately discarded; presumably an initial
	 * read to touch the page before the timed loop -- confirm.
	 */
	mmap_read_pinned(event);

	return event;
}

/*
 * main - measure the cost of int_sqrt() in units of the configured event
 *
 * For every i in [0, LOOPS):
 *   - two back-to-back counter reads give the measurement overhead
 *     (collected in stats_nop);
 *   - two counter reads around int_sqrt(i) give the raw cost
 *     (collected in stats_event).
 * With VALIDATE, each result is compared to floor(sqrt(i)) and mismatches
 * are printed.  With WIPE_BTB, wipe_btb() runs after each iteration.
 *
 * Finally prints the difference of the two averages together with the
 * combined error from sum_errors().  Returns 0.
 */
int main(void)
{
	void *event = create_event(&attr_event);
	struct stats stats_nop;
	struct stats stats_event;
	unsigned long i;
	u64 val;

	init_stats(&stats_nop);
	init_stats(&stats_event);

	for (i = 0; i < LOOPS; i++) {
		unsigned long a;

#if 1
		/* Baseline: cost of reading the counter twice in a row. */
		val = mmap_read_pinned(event);
		barrier();
		update_stats(&stats_nop, mmap_read_pinned(event) - val);

		/* Measurement: barriers keep the call between the two reads. */
		val = mmap_read_pinned(event);
		barrier();
		a = int_sqrt(i);
		barrier();
		update_stats(&stats_event, mmap_read_pinned(event) - val);
#endif


#ifdef VALIDATE
		{
			unsigned long b = floor(sqrt(i));
			/* All three values are unsigned long, hence %lu. */
			if (a != b)
				printf("%lu %lu %lu\n", i, a, b);
		}
#endif

#ifdef WIPE_BTB
		wipe_btb();
#endif
	}

	printf("event: %f +- %f\n",
	       avg_stats(&stats_event) - avg_stats(&stats_nop),
	       sum_errors(stddev_stats(&stats_nop), stddev_stats(&stats_event)));

	return 0;
}
