diff --git a/src/backend/access/transam/parallel.c b/src/backend/access/transam/parallel.c
index 2dad3e8..eee9a19 100644
--- a/src/backend/access/transam/parallel.c
+++ b/src/backend/access/transam/parallel.c
@@ -456,7 +456,8 @@ LaunchParallelWorkers(ParallelContext *pcxt)
 	 * fails.  It wouldn't help much anyway, because registering the worker in
 	 * no way guarantees that it will start up and initialize successfully.
 	 */
-	for (i = 0; i < pcxt->nworkers; ++i)
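+	/* Skip any workers that were already launched by a previous call. */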
+	for (i = pcxt->nworkers_launched; i < pcxt->nworkers; ++i)
 	{
 		memcpy(worker.bgw_extra, &i, sizeof(int));
 		if (!any_registrations_failed &&
@@ -480,8 +480,10 @@ LaunchParallelWorkers(ParallelContext *pcxt)
 			 */
 			any_registrations_failed = true;
 			pcxt->worker[i].bgwhandle = NULL;
-			pfree(pcxt->worker[i].error_mqh);
-			pcxt->worker[i].error_mqh = NULL;
+			/*
+			 * Keep error_mqh around: if the remaining workers are launched
+			 * by a later call, they will still need their error queues.
+			 */
 		}
 	}
 
diff --git a/src/backend/executor/nodeGather.c b/src/backend/executor/nodeGather.c
index 1e5b1b7..958423c 100644
--- a/src/backend/executor/nodeGather.c
+++ b/src/backend/executor/nodeGather.c
@@ -167,7 +167,8 @@ ExecGather(GatherState *node)
 				node->nreaders = 0;
 				node->nextreader = 0;
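+				/* Size the array for all planned workers; more may be launched later. */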
 				node->reader =
-					palloc(pcxt->nworkers_launched * sizeof(TupleQueueReader *));
+					palloc(gather->num_workers * sizeof(TupleQueueReader *));
 
 				for (i = 0; i < pcxt->nworkers_launched; ++i)
 				{
@@ -346,6 +346,40 @@ gather_readnext(GatherState *gatherstate)
 		if (nvisited >= gatherstate->nreaders)
 		{
 			/*
+			 * We are about to wait for workers to send tuples.  One reason
+			 * we may have to wait is that fewer workers were launched than
+			 * the planner requested.  If so, try to launch the remaining
+			 * workers before waiting, but only while none of the previously
+			 * launched workers has finished, so that the query can still
+			 * run with the full degree of parallelism that was planned.
+			 */
+			if ((gatherstate->pei->pcxt->nworkers_launched < gatherstate->pei->pcxt->nworkers) &&
+				(gatherstate->nreaders == gatherstate->nworkers_launched))
+			{
+				int			i;
+				MemoryContext oldContext;
+
+				/* Create the new readers in the per-query context. */
+				oldContext = MemoryContextSwitchTo(gatherstate->ps.state->es_query_cxt);
+
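+				/* Try to launch the remaining planned workers. */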
+				LaunchParallelWorkers(gatherstate->pei->pcxt);
+				gatherstate->nworkers_launched = gatherstate->pei->pcxt->nworkers_launched;
+
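+				/* Create a tuple queue reader for each newly launched worker. */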
+				for (i = gatherstate->nreaders; i < gatherstate->nworkers_launched; ++i)
+				{
+					shm_mq_set_handle(gatherstate->pei->tqueue[i],
+									  gatherstate->pei->pcxt->worker[i].bgwhandle);
+					gatherstate->reader[gatherstate->nreaders++] =
+						CreateTupleQueueReader(gatherstate->pei->tqueue[i],
+											   gatherstate->funnel_slot->tts_tupleDescriptor);
+				}
+
+				MemoryContextSwitchTo(oldContext);
+			}
+
+			/*
 			 * If (still) running plan locally, return NULL so caller can
 			 * generate another tuple from the local copy of the plan.
 			 */
