Initial results from a small .ccache (3.0) dir:

- 6476 objects
- 300MB
- probably about 500-1000 compiles/recompiles of around 100 small to large
projects

The test was:
1. Find the candidates for compression, based on: objdump -t | grep " g "
(defined symbols). If two objects had at least 4 symbols defined, and 85%
of them were identical, the files were selected for compression.
2. Run bsdiff on the selected pairs of files, and collect the total raw
size, and the resulting compressed size.

The results are:
4459 out of 6476 files compressed (6099674 -> 629795 bytes)

So roughly 90% compression rate, for a random .ccache folder.

I attached sources for the test (first run "./get-symbols.sh", then
"./find-similar").

Will post results for a more favorable scenario (multiple builds of
different versions of the same project) if time permits.
#include <stdio.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <dirent.h>
#include <string.h>
#include <stdlib.h>
#include <limits.h>
#include <ctype.h>

const char *BSDIFF = "~/build/bsdiff-4.3/bsdiff";

struct Func {
	Func *next;

	char *name;
};

struct Obj {
	Obj *next;

	char *name;
	char *path;
	int nfuncs;
	Func *funcs;
	
	Obj *match;
	int common;
};

const int HASH_SIZE = 19031;
Obj *objs[HASH_SIZE] = {};
#define NAME_HASH(name) ((*(int *)(name)) % HASH_SIZE)

Obj *read_obj(const char *base, int a, int b, const char *namesuff)
{
	Obj *o = new Obj;
	o->next = 0;
	o->name = new char[64];
	o->path = new char[128];
	o->nfuncs = 0;
	o->funcs = 0;
	o->common = 0;
	o->match = 0;
	
	int nc = snprintf(o->name, 64, "%x%x%s", a, b, namesuff);
	if (nc <= 0 || nc >= 64) abort();

	snprintf(o->path, 128, "%s/%s", base, namesuff);
	FILE *f = fopen(o->path, "rb");
	for (;;) {
		static char line[PATH_MAX];
		if (!fgets(line, PATH_MAX, f))
			break;

		for (int l=strlen(line); l && isspace(line[l-1]); line[--l]=0);


		Func **h = &o->funcs;
		
		Func *fn = new Func;
		fn->name = strdup(line);
		
		fn->next = *h;
		*h = fn;

		o->nfuncs++;
	}

	fclose(f);

	return o;
}

void compare(Obj *o1, Obj *o2)
{
	/*
	if (strcmp(o1->name, "978911edd002afb6ecb9611659327e3e-3475537") ||
		strcmp(o2->name, "4794bbcacf3bc42cdf1a44cf89523949-3429991"))
		return;
	*/

	if (o1->nfuncs < 4 || o2->nfuncs < 4) return;
	int r = 100*o1->nfuncs/o2->nfuncs;
	if (r < 80 || r > 125) return;
	if (!strcmp(o1->name, o2->name)) return;

	Func *f1 = o2->funcs;
	Func *f2 = o1->funcs;
	
	int m=0;
	while (f1) {
		int r = 0;
		while (f2 && (r = strcmp(f1->name, f2->name)) < 0) // reversed since we pushed fns at the head
			f2 = f2->next;

		if (f2 && r==0) // match
			m++;

		f1 = f1->next;
	}
	
	if (m > o1->common) {
		o1->common = m;
		o1->match = o2;
	}
}

int main()
{
	int nobjs=0;

	for (int a=0; a<0x10; a++)
		for (int b=0; b<0x10; b++) {
			char base[] = "strings/.ccache/1/2";
			sprintf(base, "strings/.ccache/%x/%x", a, b);
			DIR *dh = opendir(base);
			if (!dh) continue;

			for (;;) {
				struct dirent *de = readdir(dh);
				if (!de)
					break;
				if (strchr(de->d_name, '.'))
					continue;
				
				Obj *o = read_obj(base, a, b, de->d_name);
				
				Obj **h = &objs[NAME_HASH(o->name)];
				o->next = *h;
				*h = o;

				nobjs++;
			}

			closedir(dh);
		}
	
	int total_raw=0;
	int total_bsd=0;
	int nbsd=0;

	for (int h1=0; h1<HASH_SIZE; h1++) {
		printf("\r%d%%", h1*100/HASH_SIZE);

		for (Obj *o1=objs[h1]; o1; o1=o1->next) {
			// Look for a match in all the objects that are after it in our hash.
			// If we get a match, it means we can store o1 as a delta from o2.

			for (int h2=h1; h2<HASH_SIZE; h2++) {
				Obj *o2 = (h2==h1) ? o1 : objs[h2];
				
				for (; o2; o2=o2->next)
					compare(o1, o2);
			}

			if (o1->nfuncs && 100*o1->common/o1->nfuncs >= 85) {
				char cmd[PATH_MAX];
				snprintf(cmd, PATH_MAX, "%s %s %s tmp.bsd", BSDIFF, o1->path, o1->match->path);
				if (system(cmd))
					{ printf("%s\n", cmd); abort(); }
				
				struct stat st;
				if (stat(o1->path, &st))
					{ printf("%s\n", o1->path); abort(); }
				total_raw += st.st_size;

				if (stat("tmp.bsd", &st))
					{ printf("%s (tmp.bsd)\n", o1->path); abort(); }
				total_bsd += st.st_size;
				
				nbsd++;
			}
		}
	}

	remove("tmp.bsd");

	printf("\n%d files compressed (%d -> %d bytes)\n", nbsd, total_raw, total_bsd);
	
	return 0;
}

Attachment: get-symbols.sh
Description: Bourne shell script

_______________________________________________
ccache mailing list
ccache@lists.samba.org
https://lists.samba.org/mailman/listinfo/ccache

Reply via email to