Dear all, as requested,
See attached patch 9 of 13
Best Regards,
Jerry
commit f9c3f968aab22e1ff4782a721da30bcd839e614f
Author: Andre Vehreschild <[email protected]>
Date: Fri Sep 5 14:40:24 2025 +0200
Fortran: Fix creating shared memory on macOS.
On MacOS mmap() very often does not respect the provided base address
for the shared memory segment. On the other hand the mutexes have to be
on the same (virtual) address for each process to function properly.
Therefore try a configurable number of times to get the same address for
the shared memory segment on MacOS. If that fails the user is notified
and the program terminates.
gcc/fortran/ChangeLog:
* invoke.texi: Document new environment variable GFORTRAN_IMAGE_
RESTARTS_LIMIT.
libgfortran/ChangeLog:
* caf/shmem.c (_gfortran_caf_finalize): Ensure all memory is
freeed.
* caf/shmem/allocator.c (allocator_shared_malloc): Just assert
that an index is within its bounds.
* caf/shmem/shared_memory.c (shared_memory_init): When shared
memory can not be placed at desired address, exit the image with
a certain code to let the supervisor restart the image.
(shared_memory_cleanup): Only the supervisor must unlink the shm
object.
* caf/shmem/supervisor.c (GFORTRAN_ENV_IMAGE_RESTARTS_LIMITS):
New environment variable.
(get_image_restarts_limit): Get the limit on image restarts
(accumulates over all) form the environment variable or default
to 4000.
(ensure_shmem_initialization): Add error handling.
(startWorker): Start a single worker/image.
(kill_all_images): Kill all images.
(supervisor_main_loop): When a worker/image reports a shared
memory issue just try to restart it.
* caf/shmem/thread_support.c (initialize_shared_mutex): Mark
mutex robust on plattforms that support it.
(initialize_shared_errorcheck_mutex): Same.
diff --git a/gcc/fortran/invoke.texi b/gcc/fortran/invoke.texi
index 6e8e13a982b..c544037033a 100644
--- a/gcc/fortran/invoke.texi
+++ b/gcc/fortran/invoke.texi
@@ -2350,6 +2350,15 @@ memory.
Warning: Choosing a large shared memory size may produce large coredumps!
+@env{GFORTRAN_IMAGE_RESTARTS_LIMIT}: On certain platforms, esp. MacOS, the
+shared memory segment needs to be placed on the same (virtual) address in every
+image or synchronisation primitives do not work as expected. Unfortunately are
+some OSes somewhat arbitrary on when they can do this. When the OS is not able
+to fullfill the request, then the image aborts itsself and is restarted by the
+supervisor untill the OS complies. This environment variable limits the total
+number of restarts of all images having an issue with shared memory segment
+placement. The default value is 4000.
+
The shared memory coarray library internally uses some additional environment
variables, which will be overwritten without notice or may result in failure to
start. These are: @code{GFORTRAN_IMAGE_NUM}, @code{GFORTRAN_SHMEM_PID} and
diff --git a/libgfortran/caf/shmem.c b/libgfortran/caf/shmem.c
index 266feab3e45..446e5f54483 100644
--- a/libgfortran/caf/shmem.c
+++ b/libgfortran/caf/shmem.c
@@ -152,6 +152,7 @@ _gfortran_caf_finalize (void)
free_team_list (caf_teams_formed);
caf_teams_formed = NULL;
+ shared_memory_cleanup (&local->sm);
free (local);
thread_support_cleanup ();
diff --git a/libgfortran/caf/shmem/allocator.c b/libgfortran/caf/shmem/allocator.c
index 2a22abb2a80..bd88f33e200 100644
--- a/libgfortran/caf/shmem/allocator.c
+++ b/libgfortran/caf/shmem/allocator.c
@@ -101,6 +101,8 @@ allocator_shared_malloc (allocator *a, size_t size)
sz = next_power_of_two (size);
act_size = sz > sizeof (bucket) ? sz : sizeof (bucket);
bucket_list_index = __builtin_clzl (act_size);
+ assert (bucket_list_index
+ < (int) (sizeof (a->s->free_bucket_head) / sizeof (shared_mem_ptr)));
if (SHMPTR_IS_NULL (a->s->free_bucket_head[bucket_list_index]))
return shared_memory_get_mem_with_alignment (a->shm, act_size, MAX_ALIGN);
diff --git a/libgfortran/caf/shmem/shared_memory.c b/libgfortran/caf/shmem/shared_memory.c
index d0789a4bac6..0659e6ba023 100644
--- a/libgfortran/caf/shmem/shared_memory.c
+++ b/libgfortran/caf/shmem/shared_memory.c
@@ -29,6 +29,7 @@ see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
#include "libgfortran.h"
#include "allocator.h"
#include "shared_memory.h"
+#include "supervisor.h"
#include <assert.h>
#include <fcntl.h>
@@ -194,7 +195,7 @@ shared_memory_init (shared_memory_act *mem, size_t size)
else
{
#ifdef HAVE_MMAP
- mem->shm_fd = shm_open (shm_name, O_RDWR, 0);
+ mem->shm_fd = shm_open (shm_name, O_RDWR, 0600);
if (mem->shm_fd == -1)
{
perror ("opening shared memory segment failed.");
@@ -212,7 +213,14 @@ shared_memory_init (shared_memory_act *mem, size_t size)
#ifdef HAVE_MMAP
mem->glbl.base
= mmap (base_ptr, size, PROT_READ | PROT_WRITE, MAP_SHARED, mem->shm_fd, 0);
- if (mem->glbl.base == MAP_FAILED)
+ if (base_ptr && mem->glbl.base != base_ptr)
+ {
+ /* The supervisor will start us again. */
+ close (mem->shm_fd);
+ free (local);
+ exit (210);
+ }
+ else if (!base_ptr && !mem->glbl.base)
{
perror ("mmap failed");
exit (1);
@@ -249,9 +257,6 @@ shared_memory_init (shared_memory_act *mem, size_t size)
void
shared_memory_cleanup (shared_memory_act *mem)
{
- char shm_name[NAME_MAX];
-
- snprintf (shm_name, NAME_MAX, "/gfor-shm-%s", shared_memory_get_env ());
#ifdef HAVE_MMAP
int res = munmap (mem->glbl.base, mem->size);
if (res)
@@ -263,11 +268,18 @@ shared_memory_cleanup (shared_memory_act *mem)
{
perror ("closing shm file handle failed. Trying to continue...");
}
- res = shm_unlink (shm_name);
- if (res == -1)
+ if (this_image.image_num == -1)
{
- perror ("shm_unlink failed");
- exit (1);
+ char shm_name[NAME_MAX];
+
+ snprintf (shm_name, NAME_MAX, "/gfor-shm-%s", shared_memory_get_env ());
+ /* Only the supervisor is to delete the shm-file. */
+ res = shm_unlink (shm_name);
+ if (res == -1)
+ {
+ perror ("shm_unlink failed");
+ exit (1);
+ }
}
#elif defined(WIN32)
if (!UnmapViewOfFile (mem->glbl.base))
diff --git a/libgfortran/caf/shmem/supervisor.c b/libgfortran/caf/shmem/supervisor.c
index c39ffc6715c..780ab4a45c0 100644
--- a/libgfortran/caf/shmem/supervisor.c
+++ b/libgfortran/caf/shmem/supervisor.c
@@ -43,6 +43,7 @@ see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
#define GFORTRAN_ENV_NUM_IMAGES "GFORTRAN_NUM_IMAGES"
#define GFORTRAN_ENV_SHARED_MEMORY_SIZE "GFORTRAN_SHARED_MEMORY_SIZE"
#define GFORTRAN_ENV_IMAGE_NUM "GFORTRAN_IMAGE_NUM"
+#define GFORTRAN_ENV_IMAGE_RESTARTS_LIMITS "GFORTRAN_IMAGE_RESTARTS_LIMIT"
image_local *local = NULL;
@@ -68,6 +69,21 @@ get_image_num_from_envvar (void)
return nimages;
}
+/* Get the number of restarts allowed when the shared memory could not be placed
+at the same location in each image. This is mostly important for MacOS, because
+this OS acts somewhat arbitrary/indeterministic. */
+
+static unsigned
+get_image_restarts_limit (void)
+{
+ char *limit_chars;
+ unsigned limit = 4000;
+ limit_chars = getenv (GFORTRAN_ENV_IMAGE_RESTARTS_LIMITS);
+ if (limit_chars)
+ limit = atoi (limit_chars);
+ return limit;
+}
+
/* Get the amount of memory for the shared memory block. This is picked from
an environment variable. If that is not there, pick a reasonable default.
Note that on a 64-bit system which allows overcommit, there is no penalty in
@@ -157,6 +173,11 @@ ensure_shmem_initialization (void)
return;
local = malloc (sizeof (image_local));
+ if (!local)
+ {
+ caf_runtime_error ("can not initialize memory for local cache");
+ exit (1);
+ }
#if defined(_SC_PAGE_SIZE)
pagesize = sysconf (_SC_PAGE_SIZE);
#elif defined(WIN32)
@@ -234,6 +255,54 @@ ensure_shmem_initialization (void)
extern char **environ;
#endif
+static bool
+startWorker (image *im __attribute__ ((unused)),
+ char ***argv __attribute__ ((unused)))
+{
+#ifdef HAVE_FORK
+ caf_shmem_pid new_pid;
+ if ((new_pid = fork ()))
+ {
+ im->supervisor->images[im->image_num]
+ = (image_tracker) {new_pid, IMAGE_OK};
+ return false;
+ }
+ else
+ {
+ if (new_pid == -1)
+ caf_runtime_error ("error spawning child\n");
+ static char **new_env;
+ static char num_image[32];
+ size_t n = 2; /* Add one env-var and one for the term NULL. */
+
+ /* Count the number of entries in the current environment. */
+ for (char **e = environ; *e; ++e, ++n)
+ ;
+ new_env = (char **) malloc (sizeof (char *) * n);
+ memcpy (new_env, environ, sizeof (char *) * (n - 2));
+ snprintf (num_image, 32, "%s=%d", GFORTRAN_ENV_IMAGE_NUM, im->image_num);
+ new_env[n - 2] = num_image;
+ new_env[n - 1] = NULL;
+ if (execve ((*argv)[0], *argv, new_env) == -1)
+ {
+ perror ("execve failed");
+ }
+ exit (255);
+ }
+#endif
+ return true;
+}
+
+#ifndef WIN32
+static void
+kill_all_images (supervisor *m)
+{
+ for (int j = 0; j < local->total_num_images; j++)
+ if (m->images[j].status == IMAGE_OK)
+ kill (m->images[j].pid, SIGKILL);
+}
+#endif
+
/* argc and argv may not be used on certain OSes. Flag them unused therefore.
*/
int
@@ -254,40 +323,19 @@ supervisor_main_loop (int *argc __attribute__ ((unused)),
GetCurrentDirectory (cdLen, currentDir);
#else
int chstatus;
+ unsigned restarts = 0, restarts_limit;
+ restarts_limit = get_image_restarts_limit ();
#endif
*exit_code = 0;
shared_memory_set_env (getpid ());
- m = this_image.supervisor;
+ im.supervisor = m = this_image.supervisor;
for (im.image_num = 0; im.image_num < local->total_num_images; im.image_num++)
{
#ifdef HAVE_FORK
- caf_shmem_pid new_pid;
- if ((new_pid = fork ()))
- {
- if (new_pid == -1)
- caf_runtime_error ("error spawning child\n");
- m->images[im.image_num] = (image_tracker) {new_pid, IMAGE_OK};
- }
- else
- {
- static char **new_env;
- static char num_image[32];
- size_t n = 2; /* Add one env-var and one for the term NULL. */
-
- /* Count the number of entries in the current environment. */
- for (char **e = environ; *e; ++e, ++n)
- ;
- new_env = (char **) malloc (sizeof (char *) * n);
- memcpy (new_env, environ, sizeof (char *) * (n - 2));
- snprintf (num_image, 32, "%s=%d", GFORTRAN_ENV_IMAGE_NUM,
- im.image_num);
- new_env[n - 2] = num_image;
- new_env[n - 1] = NULL;
- execve ((*argv)[0], *argv, new_env);
- return 1;
- }
+ if (startWorker (&im, argv))
+ return 1;
#elif defined(WIN32)
LPTCH new_env;
size_t n = 0, es;
@@ -345,6 +393,14 @@ supervisor_main_loop (int *argc __attribute__ ((unused)),
#ifdef HAVE_FORK
caf_shmem_pid finished_pid = wait (&chstatus);
int j;
+
+ if (finished_pid == -1)
+ {
+ /* Skip wait having an issue. */
+ perror ("wait failed");
+ --i;
+ continue;
+ }
if (WIFEXITED (chstatus) && !WEXITSTATUS (chstatus))
{
for (j = 0;
@@ -365,35 +421,56 @@ supervisor_main_loop (int *argc __attribute__ ((unused)),
j < local->total_num_images && m->images[j].pid != finished_pid;
j++)
;
- dprintf (2, "ERROR: Image %d(pid: %d) failed with %d.\n", j + 1,
- finished_pid, WTERMSIG (chstatus));
- if (j == local->total_num_images)
+ if (WEXITSTATUS (chstatus) == 210)
{
- if (finished_pid == getpid ())
+ --i;
+ im.image_num = j;
+ ++restarts;
+ if (restarts > restarts_limit)
{
- dprintf (2,
- "WARNING: Supervisor process got signal %d. Killing "
- "childs and exiting.\n",
- WTERMSIG (chstatus));
- for (j = 0; j < local->total_num_images; j++)
- {
- if (m->images[j].status == IMAGE_OK)
- kill (m->images[j].pid, SIGKILL);
- }
+ kill_all_images (m);
+ caf_runtime_error (
+ "After restarting images %d times, no common state on "
+ "shared memory could be reached. Giving up...",
+ restarts);
exit (1);
}
- dprintf (2,
- "WARNING: Got signal %d for unknown process %d. "
- "Ignoring and trying to continue.\n",
- WTERMSIG (chstatus), finished_pid);
+ if (startWorker (&im, argv))
+ return 1;
continue;
}
- m->images[j].status = IMAGE_FAILED;
- atomic_fetch_add (&m->failed_images, 1);
- if (*exit_code < WTERMSIG (chstatus))
- *exit_code = WTERMSIG (chstatus);
- else if (*exit_code == 0)
- *exit_code = 1;
+ else
+ {
+ dprintf (2,
+ "ERROR: Image %d(pid: %d) failed with signal %d, "
+ "exitstatus %d.\n",
+ j + 1, finished_pid, WTERMSIG (chstatus),
+ WEXITSTATUS (chstatus));
+ if (j == local->total_num_images)
+ {
+ if (finished_pid == getpid ())
+ {
+ dprintf (
+ 2,
+ "WARNING: Supervisor process got signal %d. Killing "
+ "childs and exiting.\n",
+ WTERMSIG (chstatus));
+ kill_all_images (m);
+ exit (1);
+ }
+ dprintf (2,
+ "WARNING: Got signal %d for unknown process %d. "
+ "Ignoring and trying to continue.\n",
+ WTERMSIG (chstatus), finished_pid);
+ continue;
+ }
+ m->images[j].status = IMAGE_FAILED;
+ atomic_fetch_add (&m->failed_images, 1);
+ if (*exit_code < WTERMSIG (chstatus))
+ *exit_code = WTERMSIG (chstatus);
+ else if (*exit_code == 0)
+ *exit_code = 1;
+ }
}
/* Trigger waiting sync images aka sync_table. */
for (j = 0; j < local->total_num_images; j++)
diff --git a/libgfortran/caf/shmem/thread_support.c b/libgfortran/caf/shmem/thread_support.c
index e2c53627c2f..dcd8b00b788 100755
--- a/libgfortran/caf/shmem/thread_support.c
+++ b/libgfortran/caf/shmem/thread_support.c
@@ -50,6 +50,9 @@ initialize_shared_mutex (caf_shmem_mutex *mutex)
pthread_mutexattr_t mattr;
ERRCHECK (pthread_mutexattr_init (&mattr));
ERRCHECK (pthread_mutexattr_setpshared (&mattr, PTHREAD_PROCESS_SHARED));
+#ifdef PTHREAD_MUTEX_ROBUST
+ ERRCHECK (pthread_mutexattr_setrobust (&mattr, PTHREAD_MUTEX_ROBUST));
+#endif
ERRCHECK (pthread_mutex_init (mutex, &mattr));
ERRCHECK (pthread_mutexattr_destroy (&mattr));
}
@@ -61,6 +64,9 @@ initialize_shared_errorcheck_mutex (caf_shmem_mutex *mutex)
ERRCHECK (pthread_mutexattr_init (&mattr));
ERRCHECK (pthread_mutexattr_settype (&mattr, PTHREAD_MUTEX_ERRORCHECK));
ERRCHECK (pthread_mutexattr_setpshared (&mattr, PTHREAD_PROCESS_SHARED));
+#ifdef PTHREAD_MUTEX_ROBUST
+ ERRCHECK (pthread_mutexattr_setrobust (&mattr, PTHREAD_MUTEX_ROBUST));
+#endif
ERRCHECK (pthread_mutex_init (mutex, &mattr));
ERRCHECK (pthread_mutexattr_destroy (&mattr));
}
@@ -248,7 +254,7 @@ bm_set_mask (volatile unsigned long mask[], const int size)
mask[i] = ~0UL >> (ULONGBITS - rem);
}
-__attribute_used__ static bool
+__attribute__ ((used)) static bool
bm_is_none (volatile unsigned long mask[], const int size)
{
const int entries = size / ULONGBITS;