On Mon, Nov 12, 2012 at 8:09 PM, Bogdan Harjoc <har...@gmail.com> wrote:
> Initial results from a small .ccache (3.0) dir: > The previous results were bogus (I was diffing .gz compressed objects). I tried the bsdiff approach on a new .ccache folder obtained from compiling 8 linux kernels (various v3.x versions) configured as allnoconfig. Stats: - 22MB - 4030 objects (gzipped) - 503 cache hits After applying diffs: - 8.4MB (1862 files) were selected for diffs - 1.5MB (17%) compressed patches resulted - applying the 1862 diffs took 1.21 sec (so 0.65ms per patch) I got the 0.65ms figure using a script that runs gunzip -c and bspatch for each patched file in the cache. To find out and subtract the overhead of creating the gunzip and bspatch processes, I ran the loop with just "gunzip; bspatch --help > null" instead of the normal "gunzip; bspatch". So the result should cover just the actual patch applying (ccache does the gunzip anyway). The test was done on a 3.4Ghz i7-2600. My conclusions are: - performance impact should be negligible (when compressing files offline -- e.g. when make aborts with "no space left on device") - savings in size vary (7MB out of 22MB in this case) - the patches will need to have the "old filename" embedded If people post results from their cache folders and there is interest, I can work on an implementation. Attached are the updated sources for the test. I normally run them in a "testccache" directory, since they create two more folders with data. (I posted this mail privately just to Andrew by mistake 3 days ago, reposting it to ccache-list now) Cheers, Bogdan
bspatch-all.sh
Description: Bourne shell script
#include <stdio.h> #include <sys/types.h> #include <sys/stat.h> #include <errno.h> #include <dirent.h> #include <string.h> #include <stdlib.h> #include <limits.h> #include <ctype.h> const char *BSDIFF = "bsdiff"; //const char *BSDIFF = "~/build/bsdiff-4.3/bsdiff"; struct Func { Func *next; char *name; }; struct Obj { Obj *next; char *name; char *expanded; int nfuncs; Func *funcs; Obj *match; int score; }; const int HASH_SIZE = 19031; Obj *objs[HASH_SIZE] = {}; #define NAME_HASH(name) ((*(int *)(name)) % HASH_SIZE) Obj *read_obj(const char *strbase, int a, int b, const char *namesuff) { Obj *o = new Obj; o->next = 0; o->name = new char[64]; o->expanded = new char[128]; o->nfuncs = 0; o->funcs = 0; o->match = 0; o->score = 0; o->name = strdup(namesuff); int nc = snprintf(o->expanded, 128, "%x/%x/%s", a, b, namesuff); if (nc <= 0 || nc >= 64) abort(); char strpath[PATH_MAX]; sprintf(strpath, "%s/%s", strbase, namesuff); FILE *f = fopen(strpath, "rb"); if (!f) { printf("open %s\n", namesuff); abort(); } for (;;) { static char line[PATH_MAX]; if (!fgets(line, PATH_MAX, f)) break; for (int l=strlen(line); l && isspace(line[l-1]); line[--l]=0); Func **h = &o->funcs; Func *fn = new Func; fn->name = strdup(line); fn->next = *h; *h = fn; o->nfuncs++; } fclose(f); return o; } #define max(a,b) (((a) > (b)) ? (a) : (b)) void compare(Obj *o1, Obj *o2) { /* if (strcmp(o1->name, "978911edd002afb6ecb9611659327e3e-3475537") || strcmp(o2->name, "4794bbcacf3bc42cdf1a44cf89523949-3429991")) return; */ if (o1->nfuncs < 4 || o2->nfuncs < 4) return; int r = 100*o1->nfuncs/o2->nfuncs; if (r < 80 || r > 125) return; if (!strcmp(o1->name, o2->name)) return; Func *f1 = o2->funcs; Func *f2 = o1->funcs; int m=0; while (f1) { int r = 0; while (f2 && (r = strcmp(f1->name, f2->name)) < 0) // reversed since we pushed fns at the head f2 = f2->next; if (f2 && r==0) // match m++; f1 = f1->next; } int score = 1000*m/max(o1->nfuncs, o2->nfuncs); if (score > 800 && score > o1->score) { o1->score = score; o1->match = o2; } } int mk_file_dirs(char *file) { char *ptr = file; for (;;) { ptr = strchr(ptr, '/'); if (!ptr) return 0; *ptr = 0; int ret = mkdir(file, 0750); *ptr = '/'; if (ret < 0 && errno != EEXIST) { fprintf(stderr, "%s: errno=%s (%d)\n", file, strerror(errno), errno); return -1; } ptr++; } } int main() { int nobjs=0; for (int a=0; a<0x10; a++) for (int b=0; b<0x10; b++) { char strbase[PATH_MAX]; sprintf(strbase, "strings/.ccache/%x/%x", a, b); DIR *dh = opendir(strbase); if (!dh) continue; for (;;) { struct dirent *de = readdir(dh); if (!de) break; if (strchr(de->d_name, '.')) continue; Obj *o = read_obj(strbase, a, b, de->d_name); Obj **h = &objs[NAME_HASH(o->name)]; o->next = *h; *h = o; nobjs++; } closedir(dh); } long long total_raw=0; long long total_bsd=0; int nbsd=0; for (int h1=0; h1<HASH_SIZE; h1++) { fprintf(stderr, "\r%d%%", h1*100/HASH_SIZE); fflush(stderr); for (Obj *o1=objs[h1]; o1; o1=o1->next) { // Look for a match in all the objects that are after it in our hash. // If we get a match, it means we can store o1 as a delta from o2. for (int h2=h1; h2<HASH_SIZE; h2++) { Obj *o2 = (h2==h1) ? o1 : objs[h2]; for (; o2; o2=o2->next) compare(o1, o2); } if (o1->match) { char bin1[PATH_MAX]; char bin2[PATH_MAX]; sprintf(bin1, "%s/.ccache/%s", getenv("HOME"), o1->expanded); sprintf(bin2, "%s/.ccache/%s", getenv("HOME"), o1->match->expanded); char cmd[PATH_MAX]; sprintf(cmd, "gunzip -c %s > a.o", bin1); if (system(cmd)) { printf("%s\n", cmd); abort(); } sprintf(cmd, "gunzip -c %s > b.o", bin2); if (system(cmd)) { printf("%s\n", cmd); abort(); } char patch[PATH_MAX]; sprintf(patch, "patches/.ccache/%s", o1->expanded); if (mk_file_dirs(patch)) abort(); snprintf(cmd, PATH_MAX, "%s a.o b.o %s", BSDIFF, patch); if (system(cmd)) { printf("%s\n", cmd); abort(); } struct stat st; if (stat(bin1, &st)) { printf("stat %s\n", bin1); abort(); } off_t s1 = st.st_size; if (stat(patch, &st)) { printf("%s\n", patch); abort(); } off_t sd = st.st_size; /*if (stat(bin2, &st)) { printf("stat %s\n", bin2); abort(); } off_t s2 = st.st_size; printf("r %ld \tsc %d\tn1 %d\tn2 %d\ts1 %ld\ts2 %ld\tsd %ld\t%s %s\n", sd*100/max(s1, s2), o1->score, o1->nfuncs, o1->match->nfuncs, s1, s2, sd, o1->expanded, o1->match->expanded);*/ total_raw += s1; total_bsd += sd; nbsd++; } } } remove("a.o"); remove("b.o"); fprintf(stderr, "\n%d files compressed (%lld -> %lld bytes, %lld%%) \n", nbsd, total_raw, total_bsd, 100*total_bsd/total_raw); return 0; }
get-symbols.sh
Description: Bourne shell script
_______________________________________________ ccache mailing list ccache@lists.samba.org https://lists.samba.org/mailman/listinfo/ccache