Re: [PATCH 27/34] shortlog: release strbuf after use in insert_one_record()

2017-09-08 Thread Junio C Hamano
Jeff King  writes:

> IOW, something like the patch below, which pushes the re-parsing out to
> the stdin code-path, and lets the internal traversal format directly
> into the final buffer. It seems to be about 3% faster than the existing
> code, and fixes the leak (by dropping that variable entirely).

Wow, that is s logical a conclusion that I somewhat feel ashamed
that I didn't think of it myself.  

Nicely done.

>
> -Peff
>
> ---
> diff --git a/builtin/shortlog.c b/builtin/shortlog.c
> index 43c4799ea9..e29875b843 100644
> --- a/builtin/shortlog.c
> +++ b/builtin/shortlog.c
> @@ -52,26 +52,8 @@ static void insert_one_record(struct shortlog *log,
> const char *oneline)
>  {
>   struct string_list_item *item;
> - const char *mailbuf, *namebuf;
> - size_t namelen, maillen;
> - struct strbuf namemailbuf = STRBUF_INIT;
> - struct ident_split ident;
>  
> - if (split_ident_line(, author, strlen(author)))
> - return;
> -
> - namebuf = ident.name_begin;
> - mailbuf = ident.mail_begin;
> - namelen = ident.name_end - ident.name_begin;
> - maillen = ident.mail_end - ident.mail_begin;
> -
> - map_user(>mailmap, , , , );
> - strbuf_add(, namebuf, namelen);
> -
> - if (log->email)
> - strbuf_addf(, " <%.*s>", (int)maillen, mailbuf);
> -
> - item = string_list_insert(>list, namemailbuf.buf);
> + item = string_list_insert(>list, author);
>  
>   if (log->summary)
>   item->util = (void *)(UTIL_TO_INT(item) + 1);
> @@ -114,9 +96,33 @@ static void insert_one_record(struct shortlog *log,
>   }
>  }
>  
> +static int parse_stdin_author(struct shortlog *log,
> +struct strbuf *out, const char *in)
> +{
> + const char *mailbuf, *namebuf;
> + size_t namelen, maillen;
> + struct ident_split ident;
> +
> + if (split_ident_line(, in, strlen(in)))
> + return -1;
> +
> + namebuf = ident.name_begin;
> + mailbuf = ident.mail_begin;
> + namelen = ident.name_end - ident.name_begin;
> + maillen = ident.mail_end - ident.mail_begin;
> +
> + map_user(>mailmap, , , , );
> + strbuf_add(out, namebuf, namelen);
> + if (log->email)
> + strbuf_addf(out, " <%.*s>", (int)maillen, mailbuf);
> +
> + return 0;
> +}
> +
>  static void read_from_stdin(struct shortlog *log)
>  {
>   struct strbuf author = STRBUF_INIT;
> + struct strbuf mapped_author = STRBUF_INIT;
>   struct strbuf oneline = STRBUF_INIT;
>   static const char *author_match[2] = { "Author: ", "author " };
>   static const char *committer_match[2] = { "Commit: ", "committer " };
> @@ -134,9 +140,15 @@ static void read_from_stdin(struct shortlog *log)
>   while (strbuf_getline_lf(, stdin) != EOF &&
>  !oneline.len)
>   ; /* discard blanks */
> - insert_one_record(log, v, oneline.buf);
> +
> + strbuf_reset(_author);
> + if (parse_stdin_author(log, _author, v) < 0)
> + continue;
> +
> + insert_one_record(log, mapped_author.buf, oneline.buf);
>   }
>   strbuf_release();
> + strbuf_release(_author);
>   strbuf_release();
>  }
>  
> @@ -153,7 +165,9 @@ void shortlog_add_commit(struct shortlog *log, struct 
> commit *commit)
>   ctx.date_mode.type = DATE_NORMAL;
>   ctx.output_encoding = get_log_output_encoding();
>  
> - fmt = log->committer ? "%cn <%ce>" : "%an <%ae>";
> + fmt = log->committer ?
> + (log->email ? "%cN <%cE>" : "%cN") :
> + (log->email ? "%aN <%aE>" : "%aN");
>  
>   format_commit_message(commit, fmt, , );
>   if (!log->summary) {


Re: [PATCH 27/34] shortlog: release strbuf after use in insert_one_record()

2017-09-07 Thread Jeff King
On Thu, Sep 07, 2017 at 11:56:48PM -0400, Jeff King wrote:

> > True; I do not think string_list API does.  But for this particular
> > application, I suspect that we can by looking at the util field of
> > the item returned.  A newly created one has NULL, but we always make
> > it non-NULL before leaving this function.
> 
> Yeah, I agree that would work here.
> 
> I also wondered if we could get away with avoiding the malloc entirely
> here. Especially in the "shortlog -n" case, it is identical to the name
> field we already have in ident.name. So ideally we'd do a lookup to see
> if we have the entry before allocating anything (since we do one lookup
> per commit, but only insert once per unique author).
> 
> But that doesn't quite work, because ident.name doesn't put to a
> NUL-terminated string, and string_list only handles strings.

I happened to look at this more while digging on an unrelated shortlog
bug. I think the whole thing could actually be reorganized a bit.

We call insert_one_record() from shortlog_add_commit(). The latter
formats "%an <%ae>", only to have the former parse it back to its
constituent parts. That seems rather silly.

This is an artifact of shortlog's original mode, which was to parse "git
log" output. But for an internal traversal, we can just format the
correct item right off the bat. That part of insert_one_record() is also
where we handle the mailmap mapping. But again, the internal traversal
can just "%aE" to format that correctly in the first place.

IOW, something like the patch below, which pushes the re-parsing out to
the stdin code-path, and lets the internal traversal format directly
into the final buffer. It seems to be about 3% faster than the existing
code, and fixes the leak (by dropping that variable entirely).

-Peff

---
diff --git a/builtin/shortlog.c b/builtin/shortlog.c
index 43c4799ea9..e29875b843 100644
--- a/builtin/shortlog.c
+++ b/builtin/shortlog.c
@@ -52,26 +52,8 @@ static void insert_one_record(struct shortlog *log,
  const char *oneline)
 {
struct string_list_item *item;
-   const char *mailbuf, *namebuf;
-   size_t namelen, maillen;
-   struct strbuf namemailbuf = STRBUF_INIT;
-   struct ident_split ident;
 
-   if (split_ident_line(, author, strlen(author)))
-   return;
-
-   namebuf = ident.name_begin;
-   mailbuf = ident.mail_begin;
-   namelen = ident.name_end - ident.name_begin;
-   maillen = ident.mail_end - ident.mail_begin;
-
-   map_user(>mailmap, , , , );
-   strbuf_add(, namebuf, namelen);
-
-   if (log->email)
-   strbuf_addf(, " <%.*s>", (int)maillen, mailbuf);
-
-   item = string_list_insert(>list, namemailbuf.buf);
+   item = string_list_insert(>list, author);
 
if (log->summary)
item->util = (void *)(UTIL_TO_INT(item) + 1);
@@ -114,9 +96,33 @@ static void insert_one_record(struct shortlog *log,
}
 }
 
+static int parse_stdin_author(struct shortlog *log,
+  struct strbuf *out, const char *in)
+{
+   const char *mailbuf, *namebuf;
+   size_t namelen, maillen;
+   struct ident_split ident;
+
+   if (split_ident_line(, in, strlen(in)))
+   return -1;
+
+   namebuf = ident.name_begin;
+   mailbuf = ident.mail_begin;
+   namelen = ident.name_end - ident.name_begin;
+   maillen = ident.mail_end - ident.mail_begin;
+
+   map_user(>mailmap, , , , );
+   strbuf_add(out, namebuf, namelen);
+   if (log->email)
+   strbuf_addf(out, " <%.*s>", (int)maillen, mailbuf);
+
+   return 0;
+}
+
 static void read_from_stdin(struct shortlog *log)
 {
struct strbuf author = STRBUF_INIT;
+   struct strbuf mapped_author = STRBUF_INIT;
struct strbuf oneline = STRBUF_INIT;
static const char *author_match[2] = { "Author: ", "author " };
static const char *committer_match[2] = { "Commit: ", "committer " };
@@ -134,9 +140,15 @@ static void read_from_stdin(struct shortlog *log)
while (strbuf_getline_lf(, stdin) != EOF &&
   !oneline.len)
; /* discard blanks */
-   insert_one_record(log, v, oneline.buf);
+
+   strbuf_reset(_author);
+   if (parse_stdin_author(log, _author, v) < 0)
+   continue;
+
+   insert_one_record(log, mapped_author.buf, oneline.buf);
}
strbuf_release();
+   strbuf_release(_author);
strbuf_release();
 }
 
@@ -153,7 +165,9 @@ void shortlog_add_commit(struct shortlog *log, struct 
commit *commit)
ctx.date_mode.type = DATE_NORMAL;
ctx.output_encoding = get_log_output_encoding();
 
-   fmt = log->committer ? "%cn <%ce>" : "%an <%ae>";
+   fmt = log->committer ?
+   (log->email ? "%cN <%cE>" : "%cN") :
+   (log->email ? "%aN <%aE>" : "%aN");
 

Re: [PATCH 27/34] shortlog: release strbuf after use in insert_one_record()

2017-09-07 Thread Jeff King
On Fri, Sep 08, 2017 at 09:33:38AM +0900, Junio C Hamano wrote:

> >> An alterative, as this is the only place we add to log->list, could
> >> be to make log->list take ownership of the string by not adding a
> >> _release() here but instead _detach(), I guess.
> >
> > I agree that would be better, but I think it's a little tricky. The
> > string_list_insert() call may make a new entry, or it may return an
> > existing one. We'd still need to free in the latter case. I'm not sure
> > the string_list interface makes it easy to tell the difference.
> 
> True; I do not think string_list API does.  But for this particular
> application, I suspect that we can by looking at the util field of
> the item returned.  A newly created one has NULL, but we always make
> it non-NULL before leaving this function.

Yeah, I agree that would work here.

I also wondered if we could get away with avoiding the malloc entirely
here. Especially in the "shortlog -n" case, it is identical to the name
field we already have in ident.name. So ideally we'd do a lookup to see
if we have the entry before allocating anything (since we do one lookup
per commit, but only insert once per unique author).

But that doesn't quite work, because ident.name doesn't put to a
NUL-terminated string, and string_list only handles strings.

We _can_ reuse the same buffer over and over:

diff --git a/builtin/shortlog.c b/builtin/shortlog.c
index 43c4799ea9..7328abf4a1 100644
--- a/builtin/shortlog.c
+++ b/builtin/shortlog.c
@@ -54,7 +54,7 @@ static void insert_one_record(struct shortlog *log,
struct string_list_item *item;
const char *mailbuf, *namebuf;
size_t namelen, maillen;
-   struct strbuf namemailbuf = STRBUF_INIT;
+   static struct strbuf namemailbuf = STRBUF_INIT;
struct ident_split ident;
 
if (split_ident_line(, author, strlen(author)))
@@ -66,6 +66,7 @@ static void insert_one_record(struct shortlog *log,
maillen = ident.mail_end - ident.mail_begin;
 
map_user(>mailmap, , , , );
+   strbuf_reset();
strbuf_add(, namebuf, namelen);
 
if (log->email)

That saves the malloc, if not the extra copying. It shows about a 1%
speed up on "git shortlog -ns" on linux.git. Probably that's not worth
caring too much about, but it also "solves" the leaking problem (I'm not
sure if the speedup is from calling malloc less frequently, or from
lowering our peak heap usage due to fixing the leak).

-Peff


Re: [PATCH 27/34] shortlog: release strbuf after use in insert_one_record()

2017-09-07 Thread Junio C Hamano
Jeff King  writes:

> On Thu, Sep 07, 2017 at 04:51:12AM +0900, Junio C Hamano wrote:
>
>> > diff --git a/builtin/shortlog.c b/builtin/shortlog.c
>> > index 43c4799ea9..48af16c681 100644
>> > --- a/builtin/shortlog.c
>> > +++ b/builtin/shortlog.c
>> > @@ -50,66 +50,67 @@ static int compare_by_list(const void *a1, const void 
>> > *a2)
>> >  static void insert_one_record(struct shortlog *log,
>> >  const char *author,
>> >  const char *oneline)
>> >  {
>> > ...
>> >item = string_list_insert(>list, namemailbuf.buf);
>> > +  strbuf_release();
>> 
>> As log->list.strdup_strings is set to true in shortlog_init(),
>> namemailbuf.buf will leak without this.
>> 
>> An alterative, as this is the only place we add to log->list, could
>> be to make log->list take ownership of the string by not adding a
>> _release() here but instead _detach(), I guess.
>
> I agree that would be better, but I think it's a little tricky. The
> string_list_insert() call may make a new entry, or it may return an
> existing one. We'd still need to free in the latter case. I'm not sure
> the string_list interface makes it easy to tell the difference.

True; I do not think string_list API does.  But for this particular
application, I suspect that we can by looking at the util field of
the item returned.  A newly created one has NULL, but we always make
it non-NULL before leaving this function.

>> It is also curious that at the end of shortlog_output(), we set the
>> log->list.strdup_strings again to true immediately before calling
>> string_list_clear() on it.  I suspect that is unnecessary?
>
> I think so. I was curious whether I might have introduced this weirdness
> when I refactored this code a year or so ago. But no, it looks like it
> dates all the way back to the very first version of shortlog.c.

Hmph.


Re: [PATCH 27/34] shortlog: release strbuf after use in insert_one_record()

2017-09-06 Thread Jeff King
On Thu, Sep 07, 2017 at 04:51:12AM +0900, Junio C Hamano wrote:

> > diff --git a/builtin/shortlog.c b/builtin/shortlog.c
> > index 43c4799ea9..48af16c681 100644
> > --- a/builtin/shortlog.c
> > +++ b/builtin/shortlog.c
> > @@ -50,66 +50,67 @@ static int compare_by_list(const void *a1, const void 
> > *a2)
> >  static void insert_one_record(struct shortlog *log,
> >   const char *author,
> >   const char *oneline)
> >  {
> > ...
> > item = string_list_insert(>list, namemailbuf.buf);
> > +   strbuf_release();
> 
> As log->list.strdup_strings is set to true in shortlog_init(),
> namemailbuf.buf will leak without this.
> 
> An alterative, as this is the only place we add to log->list, could
> be to make log->list take ownership of the string by not adding a
> _release() here but instead _detach(), I guess.

I agree that would be better, but I think it's a little tricky. The
string_list_insert() call may make a new entry, or it may return an
existing one. We'd still need to free in the latter case. I'm not sure
the string_list interface makes it easy to tell the difference.

> It is also curious that at the end of shortlog_output(), we set the
> log->list.strdup_strings again to true immediately before calling
> string_list_clear() on it.  I suspect that is unnecessary?

I think so. I was curious whether I might have introduced this weirdness
when I refactored this code a year or so ago. But no, it looks like it
dates all the way back to the very first version of shortlog.c.

-Peff


Re: [PATCH 27/34] shortlog: release strbuf after use in insert_one_record()

2017-09-06 Thread Junio C Hamano
Rene Scharfe  writes:

> Signed-off-by: Rene Scharfe 
> ---
>  builtin/shortlog.c | 1 +
>  1 file changed, 1 insertion(+)
>
> diff --git a/builtin/shortlog.c b/builtin/shortlog.c
> index 43c4799ea9..48af16c681 100644
> --- a/builtin/shortlog.c
> +++ b/builtin/shortlog.c
> @@ -50,66 +50,67 @@ static int compare_by_list(const void *a1, const void *a2)
>  static void insert_one_record(struct shortlog *log,
> const char *author,
> const char *oneline)
>  {
> ...
>   item = string_list_insert(>list, namemailbuf.buf);
> + strbuf_release();

As log->list.strdup_strings is set to true in shortlog_init(),
namemailbuf.buf will leak without this.

An alterative, as this is the only place we add to log->list, could
be to make log->list take ownership of the string by not adding a
_release() here but instead _detach(), I guess.

It is also curious that at the end of shortlog_output(), we set the
log->list.strdup_strings again to true immediately before calling
string_list_clear() on it.  I suspect that is unnecessary?



[PATCH 27/34] shortlog: release strbuf after use in insert_one_record()

2017-08-30 Thread Rene Scharfe
Signed-off-by: Rene Scharfe 
---
 builtin/shortlog.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/builtin/shortlog.c b/builtin/shortlog.c
index 43c4799ea9..48af16c681 100644
--- a/builtin/shortlog.c
+++ b/builtin/shortlog.c
@@ -50,66 +50,67 @@ static int compare_by_list(const void *a1, const void *a2)
 static void insert_one_record(struct shortlog *log,
  const char *author,
  const char *oneline)
 {
struct string_list_item *item;
const char *mailbuf, *namebuf;
size_t namelen, maillen;
struct strbuf namemailbuf = STRBUF_INIT;
struct ident_split ident;
 
if (split_ident_line(, author, strlen(author)))
return;
 
namebuf = ident.name_begin;
mailbuf = ident.mail_begin;
namelen = ident.name_end - ident.name_begin;
maillen = ident.mail_end - ident.mail_begin;
 
map_user(>mailmap, , , , );
strbuf_add(, namebuf, namelen);
 
if (log->email)
strbuf_addf(, " <%.*s>", (int)maillen, mailbuf);
 
item = string_list_insert(>list, namemailbuf.buf);
+   strbuf_release();
 
if (log->summary)
item->util = (void *)(UTIL_TO_INT(item) + 1);
else {
const char *dot3 = log->common_repo_prefix;
char *buffer, *p;
struct strbuf subject = STRBUF_INIT;
const char *eol;
 
/* Skip any leading whitespace, including any blank lines. */
while (*oneline && isspace(*oneline))
oneline++;
eol = strchr(oneline, '\n');
if (!eol)
eol = oneline + strlen(oneline);
if (starts_with(oneline, "[PATCH")) {
char *eob = strchr(oneline, ']');
if (eob && (!eol || eob < eol))
oneline = eob + 1;
}
while (*oneline && isspace(*oneline) && *oneline != '\n')
oneline++;
format_subject(, oneline, " ");
buffer = strbuf_detach(, NULL);
 
if (dot3) {
int dot3len = strlen(dot3);
if (dot3len > 5) {
while ((p = strstr(buffer, dot3)) != NULL) {
int taillen = strlen(p) - dot3len;
memcpy(p, "/.../", 5);
memmove(p + 5, p + dot3len, taillen + 
1);
}
}
}
 
if (item->util == NULL)
item->util = xcalloc(1, sizeof(struct string_list));
string_list_append(item->util, buffer);
}
 }
-- 
2.14.1