I wrote:
> On my machine, the combination of these two ideas reduces the
> runtime of the example above from ~150 seconds to ~53 seconds,
> or nearly 3x better. I see something like a 2% improvement on
> Joel's test corpus, which might just be noise. So this isn't
> any sort of universal panacea, but it sure helps when LACON
> evaluation is the bottleneck.
After another round of testing, I really can't see any improvement
at all from that patch on anything except the original Tcl test
case. Indeed, a lot of cases seem very slightly worse, perhaps
because compact() now has to make two passes over all the arcs.
So that's leaving me a bit dissatisfied with it; I'm going to
stick it on the back burner for now, in hopes of a better idea.
However, in a different line of thought, I realized that the
memory allocation logic could use some polishing. It gives out
ten arcs per NFA state initially, and then adds ten more at a time.
However, that's not very bright when you look at the actual usage
patterns, because most states have only one or two out-arcs,
but some have lots and lots. I instrumented things to gather
stats about arcs-per-state on your larger corpus, and I got this,
where the seond column is the total fraction of states having
the given number of arcs or fewer:
arcs | cum_fraction
------+------------------------
0 | 0.03152871318455725868
1 | 0.55852399556959499493
2 | 0.79408539124378449284
3 | 0.86926656199366447221
4 | 0.91726891675794579062
5 | 0.92596934405572457792
6 | 0.93491612836055807037
7 | 0.94075102352639209644
8 | 0.94486598829672779379
9 | 0.94882085883928361399
10 | 0.95137992908336444821
11 | 0.95241399914559696173
12 | 0.95436547669138874594
13 | 0.95534682472329051385
14 | 0.95653340893356523452
15 | 0.95780804864876924571
16 | 0.95902387577636979702
17 | 0.95981494467267418552
18 | 0.96048662216159976997
19 | 0.96130294229052153065
20 | 0.96196856160309755204
...
3238 | 0.99999985870142624926
3242 | 0.99999987047630739515
4095 | 0.99999987342002768163
4535 | 0.99999987930746825457
4642 | 0.99999988225118854105
4706 | 0.99999989402606968694
5890 | 0.99999989696978997342
6386 | 0.99999990874467111931
7098 | 0.99999991168839140579
7751 | 0.99999994701303484347
7755 | 0.99999998233767828116
7875 | 0.99999998822511885410
8049 | 1.00000000000000000000
So it seemed clear to me that we should only give out a couple of arcs
per state initially, but then let it ramp up faster than 10 arcs per
additional malloc. After a bit of fooling I have the attached.
This does nothing for the very largest examples in the corpus (the
ones that cause "regex too complex") --- those were well over the
REG_MAX_COMPILE_SPACE limit before and they still are. But all the
rest get nicely smaller. The average pg_regcomp memory consumption
drops from ~89K to ~48K.
regards, tom lane
diff --git a/src/backend/regex/regc_nfa.c b/src/backend/regex/regc_nfa.c
index 69929eddfc..411c86f4ff 100644
--- a/src/backend/regex/regc_nfa.c
+++ b/src/backend/regex/regc_nfa.c
@@ -142,9 +142,12 @@ newstate(struct nfa *nfa)
{
s = nfa->free;
nfa->free = s->next;
+ /* the state's free arcs stay as they are */
}
else
{
+ int i;
+
if (nfa->v->spaceused >= REG_MAX_COMPILE_SPACE)
{
NERR(REG_ETOOBIG);
@@ -157,9 +160,16 @@ newstate(struct nfa *nfa)
return NULL;
}
nfa->v->spaceused += sizeof(struct state);
- s->oas.next = NULL;
- s->free = NULL;
- s->noas = 0;
+ s->lastab = NULL;
+ /* initialize free arcs; this should match allocarc() */
+ for (i = 0; i < NINITARCS - 1; i++)
+ {
+ s->inita[i].type = 0;
+ s->inita[i].freechain = &s->inita[i + 1];
+ }
+ s->inita[i].type = 0;
+ s->inita[i].freechain = NULL;
+ s->free = &s->inita[0];
}
assert(nfa->nstates >= 0);
@@ -255,11 +265,11 @@ destroystate(struct nfa *nfa,
struct arcbatch *abnext;
assert(s->no == FREESTATE);
- for (ab = s->oas.next; ab != NULL; ab = abnext)
+ for (ab = s->lastab; ab != NULL; ab = abnext)
{
abnext = ab->next;
+ nfa->v->spaceused -= ARCBATCHSIZE(ab->narcs);
FREE(ab);
- nfa->v->spaceused -= sizeof(struct arcbatch);
}
s->ins = NULL;
s->outs = NULL;
@@ -377,41 +387,39 @@ allocarc(struct nfa *nfa,
{
struct arc *a;
- /* shortcut */
- if (s->free == NULL && s->noas < ABSIZE)
- {
- a = &s->oas.a[s->noas];
- s->noas++;
- return a;
- }
-
/* if none at hand, get more */
if (s->free == NULL)
{
struct arcbatch *newAb;
- int i;
+ size_t narcs;
+ size_t i;
if (nfa->v->spaceused >= REG_MAX_COMPILE_SPACE)
{
NERR(REG_ETOOBIG);
return NULL;
}
- newAb = (struct arcbatch *) MALLOC(sizeof(struct arcbatch));
+ narcs = (s->lastab != NULL) ? s->lastab->narcs * 2 : FIRSTABSIZE;
+ if (narcs > MAXABSIZE)
+ narcs = MAXABSIZE;
+ newAb = (struct arcbatch *) MALLOC(ARCBATCHSIZE(narcs));
if (newAb == NULL)
{
NERR(REG_ESPACE);
return NULL;
}
- nfa->v->spaceused += sizeof(struct arcbatch);
- newAb->next = s->oas.next;
- s->oas.next = newAb;
+ nfa->v->spaceused += ARCBATCHSIZE(narcs);
+ newAb->narcs = narcs;
+ newAb->next = s->lastab;
+ s->lastab = newAb;
- for (i = 0; i < ABSIZE; i++)
+ for (i = 0; i < narcs - 1; i++)
{
newAb->a[i].type = 0;
newAb->a[i].freechain = &newAb->a[i + 1];
}
- newAb->a[ABSIZE - 1].freechain = NULL;
+ newAb->a[i].type = 0;
+ newAb->a[i].freechain = NULL;
s->free = &newAb->a[0];
}
assert(s->free != NULL);
@@ -3413,7 +3421,7 @@ dumparc(struct arc *a,
FILE *f)
{
struct arc *aa;
- struct arcbatch *ab;
+ bool found = false;
fprintf(f, "\t");
switch (a->type)
@@ -3451,15 +3459,29 @@ dumparc(struct arc *a,
}
if (a->from != s)
fprintf(f, "?%d?", a->from->no);
- for (ab = &a->from->oas; ab != NULL; ab = ab->next)
+
+ /*
+ * Check that arc belongs to its from-state's arc space. This is
+ * approximate, since we don't verify the arc isn't misaligned within the
+ * space; but it's enough to catch the likely problem of an arc being
+ * misassigned to a different parent state.
+ */
+ if (a >= a->from->inita && a < a->from->inita + NINITARCS)
+ found = true; /* member of initial arcset */
+ else
{
- for (aa = &ab->a[0]; aa < &ab->a[ABSIZE]; aa++)
- if (aa == a)
+ struct arcbatch *ab;
+
+ for (ab = a->from->lastab; ab != NULL; ab = ab->next)
+ {
+ if (a >= ab->a && a < ab->a + ab->narcs)
+ {
+ found = true;
break; /* NOTE BREAK OUT */
- if (aa < &ab->a[ABSIZE]) /* propagate break */
- break; /* NOTE BREAK OUT */
+ }
+ }
}
- if (ab == NULL)
+ if (!found)
fprintf(f, "?!?"); /* not in allocated space */
fprintf(f, "->");
if (a->to == NULL)
diff --git a/src/include/regex/regguts.h b/src/include/regex/regguts.h
index 306525eb5f..65e6ace9d1 100644
--- a/src/include/regex/regguts.h
+++ b/src/include/regex/regguts.h
@@ -294,12 +294,23 @@ struct arc
struct arc *colorchainRev; /* back-link in color's arc chain */
};
+/*
+ * We include NINITARCS arcs directly in the parent state, to save a separate
+ * malloc for states with few out-arcs. If more arcs are needed, the first
+ * arcbatch has size FIRSTABSIZE; each succeeding batch doubles in size,
+ * up to MAXABSIZE.
+ */
+#define NINITARCS 2 /* 2 is enough for about 80% of states */
+#define FIRSTABSIZE 8 /* 2+8 is enough for 95% of states */
+#define MAXABSIZE 64 /* don't waste too much space per arcbatch */
+
struct arcbatch
{ /* for bulk allocation of arcs */
- struct arcbatch *next;
-#define ABSIZE 10
- struct arc a[ABSIZE];
+ struct arcbatch *next; /* chain link */
+ size_t narcs; /* number of arcs allocated in this arcbatch */
+ struct arc a[FLEXIBLE_ARRAY_MEMBER];
};
+#define ARCBATCHSIZE(n) ((n) * sizeof(struct arc) + offsetof(struct arcbatch, a))
struct state
{
@@ -312,10 +323,10 @@ struct state
struct arc *outs; /* chain of outarcs */
struct arc *free; /* chain of free arcs */
struct state *tmp; /* temporary for traversal algorithms */
- struct state *next; /* chain for traversing all */
- struct state *prev; /* back chain */
- struct arcbatch oas; /* first arcbatch, avoid malloc in easy case */
- int noas; /* number of arcs used in first arcbatch */
+ struct state *next; /* chain for traversing all live states */
+ struct state *prev; /* back-link in chain of all live states */
+ struct arcbatch *lastab; /* chain of associated arcbatches */
+ struct arc inita[NINITARCS]; /* a few arcs to avoid extra malloc */
};
struct nfa
@@ -413,7 +424,7 @@ struct cnfa
*/
#ifndef REG_MAX_COMPILE_SPACE
#define REG_MAX_COMPILE_SPACE \
- (100000 * sizeof(struct state) + 100000 * sizeof(struct arcbatch))
+ (100000 * sizeof(struct state) + 2000000 * sizeof(struct arc))
#endif
/*