Hi All,

I'm using dmtcp version 2.5.2 on CentOS Linux release 7.4.1708 (Core) 
and getting a segfault on the first checkpoint after restarting for the 
second time.  The program counts from 1 to n, displaying the even numbers 
on the terminal and writing the odd numbers to a file.  I've 
generated a backtrace from the core file that was dumped, and it seems 
there is an issue with memory being out of bounds when writing the new 
checkpoint file after the second restart.  I'm unsure how to proceed 
in tracking down this issue.  Can I get some help in figuring this out? 
I've pasted some information from my session, the ckpt files and the 
backtrace below.  If there is any other information needed, please let 
me know.

Please advise, thanks!

Jim


Below is a snip from a session:

/* begin session snip */

[jal@dev-intel18 dmtcp]$ dmtcp_launch --checkpoint-open-files --tmpdir 
~/dmtcp -i 3 ./count.exe 200 foo.out
   2
    4
    6
    8
   10
   12
^C
[jal@dev-intel18 dmtcp]$ dmtcp_restart --tmpdir ~/dmtcp -i 3 
ckpt_count.exe_b56a5d1b54403d5-40000-ae8c81eb05070.dmtcp
    8
   10
   12
   14
^C
[jal@dev-intel18 dmtcp]$ dmtcp_restart --tmpdir ~/dmtcp -i 3 
ckpt_count.exe_b56a5d1b54403d5-40000-ae8c81eb05070.dmtcp
   14
   16
   18
Segmentation fault (core dumped)
[jal@dev-intel18 dmtcp]$

/* end session snip */


When this occurs, a temporary checkpoint file (.dmtcp.temp) is left behind 
in the same directory as the checkpoint file:

/* begin ckpt file listing */

-rw-------   1 jal staff 3.1M Nov 26 19:41 
ckpt_count.exe_b56a5d1b54403d5-40000-ae8c81eb05070.dmtcp
-rw-------   1 jal staff 1.1K Nov 26 19:41 
ckpt_count.exe_b56a5d1b54403d5-40000-ae8c81eb05070.dmtcp.temp

/* end ckpt file listing */


The following is the backtrace from the core file that was generated:

/* begin bt full */

[jal@dev-intel18 dmtcp]$ gdb ./count.exe core.38259
GNU gdb (GDB) Red Hat Enterprise Linux 7.6.1-100.el7_4.1
Copyright (C) 2013 Free Software Foundation, Inc.
License GPLv3+: GNU GPL version 3 or later 
<http://gnu.org/licenses/gpl.html>
This is free software: you are free to change and redistribute it.
There is NO WARRANTY, to the extent permitted by law.  Type "show copying"
and "show warranty" for details.
This GDB was configured as "x86_64-redhat-linux-gnu".
For bug reporting instructions, please see:
<http://www.gnu.org/software/gdb/bugs/>...
Reading symbols from /mnt/home/jal/dmtcp/count.exe...done.

warning: core file may not match specified executable file.
[New LWP 38272]
[New LWP 38259]
[Thread debugging using libthread_db enabled]
Using host libthread_db library "/usr/lib64/libthread_db.so.1".
Failed to read a valid object file image from memory.
Core was generated by `'.
Program terminated with signal 11, Segmentation fault.
#0  0x00002b4ae65f1ca9 in jalib::JAllocDispatcher::free 
(p=0x2aaaaaaab008) at ../../include/../jalib/jalloc.h:48
48            JAllocDispatcher::deallocate(_p, *_p+sizeof(size_t));
(gdb) bt full
#0  0x00002b4ae65f1ca9 in jalib::JAllocDispatcher::free 
(p=0x2aaaaaaab008) at ../../include/../jalib/jalloc.h:48
         _p = 0x2aaaaaaab000
#1  0x00002b4ae68063e9 in dmtcp::ProcSelfMaps::~ProcSelfMaps 
(this=0x2b4ae675c348, __in_chrg=<optimized out>) at procselfmaps.cpp:93
No locals.
#2  0x00002b4ae67dff3f in mtcp_writememoryareas (fd=6) at writeckpt.cpp:140
         area = {{{addr = 0xffffffffff600000 "H\307\300`", __addr = 
18446744073699065856}, {
               endAddr = 0xffffffffff601000 <Address 0xffffffffff601000 
out of bounds>, __endAddr = 18446744073699069952}, {size = 4096,
               __size = 4096}, {offset = 0, __offset = 0}, {prot = 5, 
__prot = 5}, {flags = 18, __flags = 18}, {devmajor = 0, __devmajor = 0}, {
               devminor = 0, __devminor = 0}, {inodenum = 0, __inodenum 
= 0}, properties = 0,
             name = 
"[vsyscall]\000al/dmtcp/dmtcp-jal@dev-intel18/dmtcpSharedArea.b56a5d1b54403d5-40000-ae8c7db0dfe00.ae8cfee1afc17\000J+\000\000`fO\350J+\000\000\f\311^\346J+\000\000\360\325}\346J+\000\000B#p\346J+\000\000\000\000\000\000\000\000\000\000\230\177O\350J+",
 
'\000' <repeats 26 times>, "\060\000\000\000\000\000\000\000 
gO\350J+\000\000"...},
           _padding = 
"\000\000`\377\377\377\377\377\000\020`\377\377\377\377\377\000\020", 
'\000' <repeats 14 times>, "\005\000\000\000\000\000\000\000\022", 
'\000' <repeats 39 times>, 
"[vsyscall]\000al/dmtcp/dmtcp-jal@dev-intel18/dmtcpSharedArea.b56a5d1b54403d5-40000-ae8c7db0dfe00.ae8cfee1afc17\000J+\000\000`fO\350J+\000\000"...}
         stack_was_seen = 0
         __FUNCTION__ = "mtcp_writememoryareas"
#3  0x00002b4ae67df50c in dmtcp::CkptSerializer::writeCkptImage 
(mtcpHdr=0x2b4ae84f9120, mtcpHdrLen=4096) at ckptserializer.cpp:423
         ckptFilename = 
"/mnt/home/jal/dmtcp/ckpt_count.exe_b56a5d1b54403d5-40000-ae8c81eb05070.dmtcp"
         tempCkptFilename = 
"/mnt/home/jal/dmtcp/ckpt_count.exe_b56a5d1b54403d5-40000-ae8c81eb05070.dmtcp.temp"
         __FUNCTION__ = "writeCkptImage"
         use_compression = true
         fdCkptFileOnDisk = 4
         fd = 6
#4  0x00002b4ae67e34ea in checkpointhread (dummy=0x0) at threadlist.cpp:398
         rwLock = {__data = {__lock = 0, __nr_readers = 0, 
__readers_wakeup = 0, __writer_wakeup = 0, __nr_readers_queued = 1, 
__nr_writers_queued = 0,
             __writer = 38272, __shared = 0, __pad1 = 0, __pad2 = 0, 
__flags = 0},
           __size = '\000' <repeats 16 times>, 
"\001\000\000\000\000\000\000\000\200\225", '\000' <repeats 29 times>, 
__align = 0}
         computation_generation = 3
         mtcpHdr = {{signature = "MTCP_HEADER_v2.2\n", '\000' <repeats 
14 times>, saved_brk = 0x1992000, restore_addr = 0x2b4ae78fb000,
             restore_size = 10485760, vdsoStart = 0x7ffe97dd3000, 
vdsoEnd = 0x7ffe97dd5000, vvarStart = 0x0, vvarEnd = 0x0,
             post_restart = 0x2b4ae67e4a83 
<dmtcp::ThreadList::postRestart()>,
             post_restart_debug = 0x2b4ae67e4a40 
<dmtcp::ThreadList::postRestartDebug()>, motherofall_tls_info = {fs = 0, 
gs = 0, gdtentrytls = {{
                   entry_number = 0, base_addr = 3866284224, limit = 
11082, seg_32bit = 0, contents = 0, read_exec_only = 0, limit_in_pages = 0,
                   seg_not_present = 0, useable = 0, lm = 0}}}, 
tls_pid_offset = 724, tls_tid_offset = 720, myinfo_gs = 0},
           _padding = "MTCP_HEADER_v2.2\n", '\000' <repeats 16 times>, " 
\231\001\000\000\000\000\000\260\217\347J+\000\000\000\000\240\000\000\000\000\000\000\060ݗ\376\177\000\000\000Pݗ\376\177",
 
'\000' <repeats 18 times>, 
"\203J~\346J+\000\000@J~\346J+\000\000\000\000\000\000\000\000\000\000\300\320r\346J+\000\000\000\000\000\000\324\002\000\000\320\002",
 
'\000' <repeats 3965 times>}
         __FUNCTION__ = "checkpointhread"
#5  0x00002b4ae67daa7f in pthread_start (arg=0x2b4ae6730388) at 
threadwrappers.cpp:159
         threadArg = 0x2b4ae6730388
         thread_arg = 0x0
         pthread_fn = 0x2b4ae67e3036 <checkpointhread(void*)>
         virtualTid = 0
         __FUNCTION__ = "pthread_start"
         set = {__val = {2048, 0 <repeats 15 times>}}
         result = 0x2b4ae84fba80
---Type <return> to continue, or q <return> to quit---
         __wrapperExecutionLockAcquired = false
#6  0x00002b4ae6e1ce25 in start_thread (arg=0x2b4ae84fc700) at 
pthread_create.c:308
         __res = <optimized out>
         pd = 0x2b4ae84fc700
         now = <optimized out>
         unwind_buf = {cancel_jmp_buf = {{jmp_buf = {47600725116672, 
7167173562728005828, 37668, 47600725117376, 47600725116672, 47600704869552,
                 7167173562673479876, 7167187699045187780}, 
mask_was_saved = 0}}, priv = {pad = {0x0, 0x0, 0x2b4ae67c55ec
      <dmtcp::DmtcpWorker::eventHook(eDmtcpEvent, 
_DmtcpEventData_t*)+194>, 0x0}, data = {prev = 0x0, cleanup = 0x0, 
canceltype = -428059156}}}
         not_first_call = <optimized out>
         pagesize_m1 = <optimized out>
         sp = <optimized out>
         freesize = <optimized out>
#7  0x00002b4ae67da71a in clone_start (arg=0x2b4ae6748008) at 
threadwrappers.cpp:68
         thread = 0x2b4ae6748008
         ret = 1
#8  0x00002b4ae66fec7d in clone_start (arg=0x2b4ae6730448) at 
pid/pid_miscwrappers.cpp:110
         threadArg = 0x2b4ae6730448
         fn = 0x2b4ae67da6c3 <clone_start(void*)>
         thread_arg = 0x2b4ae6748008
         virtualTid = 40002
#9  0x00002b4ae694634d in clone () at 
../sysdeps/unix/sysv/linux/x86_64/clone.S:113
No locals.
(gdb)

/* end bt full */

_______________________________________________
Dmtcp-forum mailing list
Dmtcp-forum@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/dmtcp-forum

Reply via email to