Here is how I find big files that have gotten into the repository.
The shell script grovels through the repository history listing every
file in every commit in the history of HEAD, along with its length.
Duplicates are removed from this list, and then a space-use analysis
is done of the files, grouping them by the directories they are
recorded within.  The output graph shows how much space is attributed
to each directory and file, with output for small directories and
files suppressed.

The script is crude in that it doesn't take into account space that is
saved by compression, either of a file individually or multiple
similar files in a pack file.  It also doesn't notice if identical
files are stored under different names in different commits.  But it
works well enough for finding that gigabyte file that has crept into
the respository.

Once you've obtained the blob hash, you can use git-find-blob to
determine exactly how it is referenced.

Dale
----------------------------------------------------------------------
#! /bin/bash

# git-space-use
# List a tree showing the space breakdown of all the different files
# in all commits.

# Generate all the commit identifiers.
git rev-list --all |
# Process each commit.
while read COMMIT
do
    # List all the files in the commit, with their lengths.
    git ls-tree --full-tree -l -r $COMMIT |
    while read MODE TYPE HASH LENGTH NAME
    do
        # Output a line like 'du' would make.
        echo $LENGTH$'\t'./"$NAME"/$HASH
    done
done |
# Remove duplicate mentions of the same file with the same contents.
sort -u -nr | tee ${TMPDIR:-/tmp}/${0##*/}.$$ |
# Assemble a space-use graph.
dugraph -c
----------------------------------------------------------------------
/* dugraph.c - program to make a pretty graph out of a du report */

/* Option -c forces the sizes of directories with listed children to be
   calculated from the sizes of the listed children.  This helps if the
   input du listing has had subtrees chopped out of it (for instance, if
   they really reside on other filesystems than the root of the tree), but
   generally requires a -a listing to give sensible results, otherwise the
   calculated size for a directory containing subdirectories will ignore
   the files contained directly in the directory. */

/* Must be compiled with gcc rather than g++, because the types of function
   arguments are given in the old style. */

/* Define I64 to use unsigned 64-bit integers for sizes and totals. */
#define I64

#ifdef I64
  #define SIZE uint64_t
  /* In C99, uint64_t is defined in stdint.h. */
  #include <stdint.h>
  /* Get the macro PRIu64 which is the modifer-and-format-letter for uint64_t. 
*/
  #define __STDC_FORMAT_MACROS
  #include <inttypes.h>
#else
  #define SIZE unsigned long int
#endif

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* number of lines the listing should occupy */
int length = 60;
/* message for suppressed directories */
#define SUPPRESSED  "(etc.)"

/* format of a tree node */
struct node {
            struct node *lson;  /* left son */
            struct node *rbrother;/* right brother */
            SIZE   size;   /* size of directory in kbytes */
            SIZE   cum_size;    /* size of everything listed before
                                 * this directory */
            int         loc;        /* location we will print it at */
            int         print_col;/* column to print name in */
            int         print_limit;
                                /* location we can't print on or
                                 * after */
            int         last;   /* are we last son of our father? */
            char            name[1];    /* name */
          };

/* root of the tree */
struct node *root = NULL;
/* total size of things listed */
SIZE    total_size;
/* current line number we are on (0-origin) */
int         current_line = 0;
/* list of where to put bars */
int         bar_list[50];
/* number of bars in the list */
int         bar_count = 0;
/* set to force recalculation of sizes of directories */
int         calculate_sizes = 0;

/* declare functions */
void            read_input();
struct node *insert_in_tree();
void            dfs();
void            dfs1();
void            missing_sizes();
void            calc_cum_size();
void            sort();
void            calc_loc();
void            blank();
void            mark_last();
void            calc_pc();
void            output();
void            position();

int main(argc, argv)
     int    argc;
     char   **argv;
    {
    struct node *t; /* scratch */

    /* process the options */
    if (argv[1] != NULL && strcmp(argv[1], "-c") == 0)
        {
        calculate_sizes = 1;
        }

    /* read the input and form a tree */
    read_input();
    root->size = 0;
    /* put sizes on entries that have none */
    dfs(NULL, missing_sizes);
    /* sort each directory */
    dfs(sort, NULL);
    /* calculate cumulative size before each entry */
    root->cum_size = 0;
    dfs(calc_cum_size, NULL);
    /* calculate the total size */
    total_size = 0;
    for (t = root->lson; t != NULL; t = t->rbrother)
        total_size += t->size;
    /* if total_size is 0, increase it to 1 so division by it will not fail. */
    if (total_size == 0)
        total_size = 1;
    /* calculate the location of each directory */
    /* blank out subdirectories that get scrunched together at the bottom */
    root->print_limit = length;
    dfs(calc_loc, blank);
    /* print out the tree */
    for (t = root->lson; t != NULL; t = t->rbrother)
        {
        /* mark the last son of each directory */
        /* figure out the print columns */
        t->print_col = 0;
        dfs1(calc_pc, mark_last, t);
        dfs1(output, NULL, t);
        }
    /* put blank space at end */
    position(length);

    exit(0);
    }

/* read input and form a tree */
void read_input()
    {
    SIZE   size;        /* size read from input */
    char            name[1000]; /* directory name read from input */
    int         r;          /* return code */
    struct node *t;

    /* Insert a slash as name[0].  Since we read the name into name+1,
     * this leaves us with the ability to prepend a slash to names.
     * We do this when the read name starts with a slash, so that the
     * name "/a/b/c" is parsed as the sequence of directories "/", "a",
     * "b", "c", by the trick of prepending a slash and then searching
     * for the slash between components starting at one char after
     * the previous component. */
    name[0] = '/';
    /* make the dummy node at the top of the tree */
    root = (struct node *)malloc(sizeof (struct node));
    root->name[0] = '\0';
    root->lson = NULL;
    /* read the next line of input */
    for (;;)
        {
        #ifdef I64
          r = fscanf(stdin, "%" PRIu64 " %[^\n]", &size, name+1);
        #else
          r = fscanf(stdin, "%lu %[^\n]", &size, name+1);
        #endif
        /* exit if we have finished the input */
        if (r == EOF)
            {
            break;
            }
        /* skip this line and try reading again if there was a conversion
           error */
        if (r != 2)
            {
            fscanf(stdin, "%*[^\n] ");
            continue;
            }
        /* insert (or find) the directory in the tree and save its size */
        /* Prepend a slash if the read name starts with a slash.
         * But, if the entire read name is a single slash, use "/" as the
         * name.  This gives the correct result, since "/" as a directory 
         * name is the true name of the root directory, "", followed by
         * a redundant "/".  Thus, the name should be "", but all absolute
         * directory names, we prepend a slash to before passing to
         * insert_in_tree. */
        t = insert_in_tree(name[1] == '/' && name[2] != '\0' ? name : name+1);
        if (!calculate_sizes || t->lson == NULL)
            t->size = size;
        }
    }

/* insert (or find) a directory in the tree */
struct node *insert_in_tree(name)
    char    *name;      /* name of the directory */
    {
    struct node *t; /* pointer for searching down through tree */
    char            *np;    /* points to next part of directory name to be
                     * examined */
    struct node *t1;    /* scratch pointer */
    char            *np1;/* scratch pointer */

    /* read through the name, one directory-part at a time, and hunt
     * down the tree, constructing nodes as needed */
    for (t = root, np = name; np != NULL; np = np1)
        {
        /* extract the next directory-part */
        /* This search is done starting with the second character, so that
         * we will always find a component with at least one character. */
        if ((np1 = strchr(np+1, '/')) != NULL)
            {
            /* we found a slash, replace it with a null, and position
             * np1 to point to the remainder of the name */
            *np1++ = '\0';
            }
        /* else */
            /* we found no shash, so we are at the end of the name
             * np1 has been set to NULL for us by strchr */
        /* search the sons of this node for a node with the proper name */
        for (t1 = t->lson; t1 != NULL && strcmp(t1->name, np) != 0;
                t1 = t1->rbrother)
            ;
        /* did we find one? */
        if (t1 != NULL)
            /* yes, go to it */
            t = t1;
        else
            {
            /* no, make one */
            t1 = (struct node *)malloc(sizeof(struct node) + strlen(np));
            strcpy(t1->name, np);
            t1->lson = NULL;
            t1->rbrother = NULL;
            t1->size = 0;
            /* insert it in tree */
            t1->rbrother = t->lson;
            t->lson = t1;
            if (!calculate_sizes)
                {
                t->size = 0;
                }
            t = t1;
            }
        }
    return t;
    }

/* depth-first-search routine */
void dfs(pre_routine, post_routine)
    void    (*pre_routine)();   /* routine to execute before scanning
                         * descendants */
    void    (*post_routine)();  /* routine to execute after scanning
                         * descendants */
    {
    dfs1(pre_routine, post_routine, root);
    }

/* depth-first-search service routine */
void dfs1(pre_routine, post_routine, t)
    void    (*pre_routine)();   /* routine to execute before scanning
                         * descendants */
    void    (*post_routine)();  /* routine to execute after scanning
                         * descendants */
    struct node *t;     /* node to operate on */
    {
    struct node *t1;        /* scratch pointer */

    /* if it exists, execute the pre-routine */
    if (pre_routine != NULL)
        (*pre_routine)(t);
    /* call self on sons of this node */
    for (t1 = t->lson; t1 != NULL; t1 = t1->rbrother)
        dfs1(pre_routine, post_routine, t1);
    /* if it exists, execute the post-routine */
    if (post_routine != NULL)
        (*post_routine)(t);
    }

/* add missing sizes */
void missing_sizes(t)
    struct node *t;
    {
    struct node *t1;        /* scratch pointer */
    unsigned long   s;      /* scratch */

    if (t->size == 0)
        {
        /* size is missing, we have to calcuate it */
        s = 0;
        for (t1 = t->lson; t1 != NULL; t1 = t1->rbrother)
            s += t1->size;
        t->size = s;
        }
    }

/* calculate the culumative sizes of sons */
void calc_cum_size(t)
    struct node *t;
    {
    SIZE   cs;      /* scratch */
    struct node *t1;        /* scratch pointer */

    if (t == root)
        cs = 0;
    else
        {
        /* figure out how much is in the directory itself */
        for (t1 = t->lson, cs = 0; t1 != NULL; t1 = t1->rbrother)
            {
            cs += t1->size;
            }
        /* cs is the size accounted for by subdirectories */
        cs = t->size - cs;
        }
    /* cs is the size of the files in the directory itself */
    for (t1 = t->lson; t1 != NULL; t1 = t1->rbrother)
        {
        t1->cum_size = cs;
        cs += t1->size;
        }
    }

/* sort the directories under a directory */
void sort(t)
    struct node *t;
    {
    struct node *p1, *p2, *p3, *pp;     /* scratch pointers */
    int         nodes, n;               /* scratch */

    /* count the number of nodes */
    nodes = 0;
    for (p1 = t->lson; p1 != NULL; p1 = p1->rbrother)
        nodes++;
    /* just a simple and inefficient bubble sort */
    for (n = 1; n < nodes; n++)
        for (p1 = NULL, p2 = t->lson, p3 = p2->rbrother; p3 != NULL;
                p1 = p2, p2 = p3, p3 = p3->rbrother)
            {
            if (p2->size < p3->size)
                {
                /* exchange the nodes p2 and p3 */
                pp = p3->rbrother;
                p3->rbrother = p2;
                p2->rbrother = pp;
                if (p1 != NULL)
                    p1->rbrother = p3;
                else
                    t->lson = p3;
                /* exchange the values of p2 and p3 */
                pp = p2;
                p2 = p3;
                p3 = pp;
                }
            }
    }

/* calculate the print location */
void calc_loc(t)
    struct node *t;
    {
    SIZE   cs;      /* scratch */
    struct node *t1, *t2;   /* scratch pointers */
    int         print_limit;
                        /* location next directory after t will
                         * be printed */

    if (t == root)
        cs = 0;
    else
        {
        /* figure out how much is in the directory itself */
        for (t1 = t->lson, cs = 0; t1 != NULL; t1 = t1->rbrother)
            {
            cs += t1->size;
            }
        /* cs is the size accounted for by subdirectories */
        cs = t->size - cs;
        }
    /* cs is the size of the files in the directory itself */
    /* convert cs to lines */
    cs = cs*length/total_size + t->loc;
    /* calculate where next directory after t will be */
    print_limit = t->print_limit;
    /* assign locations */
    for (t1 = t->lson, t2 = NULL; t1 != NULL; t2 = t1, t1 = t1->rbrother)
        {
        /* make sure we don't run into next directory */
        if (cs >= print_limit)
            {
            cs = print_limit-1;
            }
        t1->loc = cs;
        if (t2 != NULL)
            t2->print_limit = cs;
        cs += t1->size*length/total_size;
        }
    if (t2 != NULL)
        t2->print_limit = print_limit;
    }

/* figure out which directories to blank out */
void blank(t)
    struct node *t;
    {
    struct node *t1, *t2, *t3;      /* loop pointers */

    /* return if there aren't at least two sons */
    if (t->lson == NULL || t->lson->rbrother == NULL)
        return;
    for (t1 = NULL, t2 = t->lson, t3 = t2->rbrother; t3 != NULL;
            t1 = t2, t2 = t3, t3 = t3->rbrother)
        if (t2->loc == t3->loc)
            {
            /* replace t1 and succeeding nodes with "(etc.)" */
            t3 = (struct node *)malloc(sizeof (struct node) +
                sizeof (SUPPRESSED) - 1);
            strcpy(t3->name, SUPPRESSED);
            t3->lson = t3->rbrother = NULL;
            t3->loc = t2->loc;
            if (t1 == NULL)
                t->lson = t3;
            else
                t1->rbrother = t3;
            }
    }

/* mark the last son of each directory */
void mark_last(t)
    struct node *t;
    {
    struct node *t1, *t2;   /* scratch pointers */
    t->last = 0;
    for (t1 = t->lson, t2 = NULL; t1 != NULL; t2 = t1, t1 = t1->rbrother)
        ;
    if (t2 != NULL)
        t2->last = 1;
    }

/* calculate the print columns */
void calc_pc(t)
    struct node *t;
    {
    struct node *t1;        /* scratch pointer */
    int         c;      /* column sons will be printed in */

    c = t->print_col + strlen(t->name) + 5;
    for (t1 = t->lson; t1 != NULL; t1 = t1->rbrother)   
        t1->print_col = c;
    }

/* write the output */
void output(t)
    struct node *t;
    {
    position(t->loc);
    printf("--%s%s", t->name, (t->lson != NULL ? "--+" : ""));
    /* remove the bar for our father if we are the last son */
    if (t->last)
        bar_count--;
    /* add the location of the bar to the bar list if we have a son */
    if (t->lson != NULL)
        {
        bar_list[bar_count] = t->print_col + strlen(t->name) + 5 - 1;
        bar_count++;
        }
    }

/* position to a specific line */
void position(line)
    int line;       /* line number */
    {
    int i;          /* counts through the bar list */
    int j;          /* current column number */

    /* for every line we need to go down */
    for (; current_line < line; current_line++)
        {
        putchar('\n');
        /* print the bars for this line */
        j = 0;
        for (i = 0; i < bar_count; i++)
            {
            for (; j < bar_list[i]; j++)
                putchar(' ');
            if (current_line == line-1 && i == bar_count-1)
                putchar('+');
            else
                putchar('|');
            j++;
            }
        }
    }
----------------------------------------------------------------------

-- 
You received this message because you are subscribed to the Google Groups "Git 
for human beings" group.
To unsubscribe from this group and stop receiving emails from it, send an email 
to git-users+unsubscr...@googlegroups.com.
For more options, visit https://groups.google.com/d/optout.

Reply via email to