On Mon, Nov 12, 2012 at 8:09 PM, Bogdan Harjoc <har...@gmail.com> wrote:

> Initial results from a small .ccache (3.0) dir:
>

The previous results were bogus (I was diffing .gz compressed objects).

I tried the bsdiff approach on a new .ccache folder obtained from compiling
8 linux kernels (various v3.x versions) configured as allnoconfig.

Stats:
- 22MB
- 4030 objects (gzipped)
- 503 cache hits

After applying diffs:
- 8.4MB (1862 files) were selected for diffs
- 1.5MB (17%) compressed patches resulted
- applying the 1862 diffs took 1.21 sec (so 0.65ms per patch)

I got the 0.65ms figure using a script that runs gunzip -c and bspatch for
each patched file in the cache. To find out and subtract the overhead of
creating the gunzip and bspatch processes, I ran the loop with just
"gunzip; bspatch --help > null" instead of the normal "gunzip; bspatch". So
the result should cover just the actual patch applying (ccache does the
gunzip anyway).

The test was done on a 3.4Ghz i7-2600.

My conclusions are:
- performance impact should be negligible (when compressing files offline
-- e.g. when make aborts with "no space left on device")
- savings in size vary (7MB out of 22MB in this case)
- the patches will need to have the "old filename" embedded

If people post results from their cache folders and there is interest, I
can work on an implementation.

Attached are the updated sources for the test. I normally run them in a
"testccache" directory, since they create two more folders with data.

(I posted this mail privately just to Andrew by mistake 3 days ago,
reposting it to ccache-list now)

Cheers,
Bogdan

Attachment: bspatch-all.sh
Description: Bourne shell script

#include <stdio.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <errno.h>
#include <dirent.h>
#include <string.h>
#include <stdlib.h>
#include <limits.h>
#include <ctype.h>

const char *BSDIFF = "bsdiff";
//const char *BSDIFF = "~/build/bsdiff-4.3/bsdiff";

struct Func {
	Func *next;

	char *name;
};

struct Obj {
	Obj *next;

	char *name;
	char *expanded;
	int nfuncs;
	Func *funcs;
	
	Obj *match;
	int score;
};

const int HASH_SIZE = 19031;
Obj *objs[HASH_SIZE] = {};
#define NAME_HASH(name) ((*(int *)(name)) % HASH_SIZE)

Obj *read_obj(const char *strbase, int a, int b, const char *namesuff)
{
	Obj *o = new Obj;
	o->next = 0;
	o->name = new char[64];
	o->expanded = new char[128];
	o->nfuncs = 0;
	o->funcs = 0;
	o->match = 0;
	o->score = 0;
	
	o->name = strdup(namesuff);

	int nc = snprintf(o->expanded, 128, "%x/%x/%s", a, b, namesuff);
	if (nc <= 0 || nc >= 64) abort();
	
	char strpath[PATH_MAX];
	sprintf(strpath, "%s/%s", strbase, namesuff);

	FILE *f = fopen(strpath, "rb");
	if (!f) {
		printf("open %s\n", namesuff);
		abort();
	}

	for (;;) {
		static char line[PATH_MAX];
		if (!fgets(line, PATH_MAX, f))
			break;

		for (int l=strlen(line); l && isspace(line[l-1]); line[--l]=0);


		Func **h = &o->funcs;
		
		Func *fn = new Func;
		fn->name = strdup(line);
		
		fn->next = *h;
		*h = fn;

		o->nfuncs++;
	}

	fclose(f);

	return o;
}

#define max(a,b) (((a) > (b)) ? (a) : (b))

void compare(Obj *o1, Obj *o2)
{
	/*
	if (strcmp(o1->name, "978911edd002afb6ecb9611659327e3e-3475537") ||
		strcmp(o2->name, "4794bbcacf3bc42cdf1a44cf89523949-3429991"))
		return;
	*/

	if (o1->nfuncs < 4 || o2->nfuncs < 4) return;
	int r = 100*o1->nfuncs/o2->nfuncs;
	if (r < 80 || r > 125) return;
	if (!strcmp(o1->name, o2->name)) return;

	Func *f1 = o2->funcs;
	Func *f2 = o1->funcs;
	
	int m=0;
	while (f1) {
		int r = 0;
		while (f2 && (r = strcmp(f1->name, f2->name)) < 0) // reversed since we pushed fns at the head
			f2 = f2->next;

		if (f2 && r==0) // match
			m++;

		f1 = f1->next;
	}
	
	int score = 1000*m/max(o1->nfuncs, o2->nfuncs);
	if (score > 800 && score > o1->score) {
		o1->score = score;
		o1->match = o2;
	}
}

int mk_file_dirs(char *file)
{
	char *ptr = file;

	for (;;) {
		ptr = strchr(ptr, '/');
		if (!ptr) return 0;

		*ptr = 0;
		int ret = mkdir(file, 0750);
		*ptr = '/';

		if (ret < 0 && errno != EEXIST) {
			fprintf(stderr, "%s: errno=%s (%d)\n", file, strerror(errno), errno);
			return -1;
		}

		ptr++;
	}
}

int main()
{
	int nobjs=0;

	for (int a=0; a<0x10; a++)
		for (int b=0; b<0x10; b++) {
			char strbase[PATH_MAX];
			sprintf(strbase, "strings/.ccache/%x/%x", a, b);

			DIR *dh = opendir(strbase);
			if (!dh) continue;

			for (;;) {
				struct dirent *de = readdir(dh);
				if (!de)
					break;
				if (strchr(de->d_name, '.'))
					continue;
				
				Obj *o = read_obj(strbase, a, b, de->d_name);
				
				Obj **h = &objs[NAME_HASH(o->name)];
				o->next = *h;
				*h = o;

				nobjs++;
			}

			closedir(dh);
		}
	
	long long total_raw=0;
	long long total_bsd=0;
	int nbsd=0;

	for (int h1=0; h1<HASH_SIZE; h1++) {
		fprintf(stderr, "\r%d%%", h1*100/HASH_SIZE);
		fflush(stderr);

		for (Obj *o1=objs[h1]; o1; o1=o1->next) {
			// Look for a match in all the objects that are after it in our hash.
			// If we get a match, it means we can store o1 as a delta from o2.

			for (int h2=h1; h2<HASH_SIZE; h2++) {
				Obj *o2 = (h2==h1) ? o1 : objs[h2];
				
				for (; o2; o2=o2->next)
					compare(o1, o2);
			}

			if (o1->match) {
				char bin1[PATH_MAX];
				char bin2[PATH_MAX];
				sprintf(bin1, "%s/.ccache/%s", getenv("HOME"), o1->expanded);
				sprintf(bin2, "%s/.ccache/%s", getenv("HOME"), o1->match->expanded);

				char cmd[PATH_MAX];

				sprintf(cmd, "gunzip -c %s > a.o", bin1);
				if (system(cmd)) { printf("%s\n", cmd); abort(); }

				sprintf(cmd, "gunzip -c %s > b.o", bin2);
				if (system(cmd)) { printf("%s\n", cmd); abort(); }
				
				char patch[PATH_MAX];
				sprintf(patch, "patches/.ccache/%s", o1->expanded);

				if (mk_file_dirs(patch))
					abort();

				snprintf(cmd, PATH_MAX, "%s a.o b.o %s", BSDIFF, patch);
				if (system(cmd)) { printf("%s\n", cmd); abort(); }
				
				struct stat st;

				if (stat(bin1, &st))
					{ printf("stat %s\n", bin1); abort(); }
				off_t s1 = st.st_size;

				if (stat(patch, &st))
					{ printf("%s\n", patch); abort(); }
				off_t sd = st.st_size;
			
				/*if (stat(bin2, &st))
					{ printf("stat %s\n", bin2); abort(); }
				off_t s2 = st.st_size;

				printf("r %ld \tsc %d\tn1 %d\tn2 %d\ts1 %ld\ts2 %ld\tsd %ld\t%s %s\n", 
						sd*100/max(s1, s2),
						o1->score, o1->nfuncs, o1->match->nfuncs, 
						s1, s2, sd,
						o1->expanded, o1->match->expanded);*/

				total_raw += s1;
				total_bsd += sd;
	
				nbsd++;
			}
		}
	}

	remove("a.o");
	remove("b.o");

	fprintf(stderr, "\n%d files compressed (%lld -> %lld bytes, %lld%%) \n", nbsd, total_raw, total_bsd, 100*total_bsd/total_raw);
	
	return 0;
}

Attachment: get-symbols.sh
Description: Bourne shell script

_______________________________________________
ccache mailing list
ccache@lists.samba.org
https://lists.samba.org/mailman/listinfo/ccache

Reply via email to