Changeset: 6d5dbd832675 for MonetDB
URL: http://dev.monetdb.org/hg/MonetDB?cmd=changeset;node=6d5dbd832675
Modified Files:
monetdb5/mal/Makefile.ag
monetdb5/mal/mal.c
monetdb5/mal/mal_client.c
monetdb5/mal/mal_client.h
monetdb5/modules/mal/wlcr.c
monetdb5/modules/mal/wlcr.h
monetdb5/modules/mal/wlcr.mal
monetdb5/optimizer/opt_wlcr.c
sql/backends/monet5/sql_scenario.c
sql/backends/monet5/sql_wlcr.c
sql/backends/monet5/sql_wlcr.h
sql/backends/monet5/sql_wlcr.mal
sql/scripts/60_wlcr.sql
sql/test/wlcr/Tests/All
sql/test/wlcr/Tests/wlc01.py
sql/test/wlcr/Tests/wlr01.py
sql/test/wlcr/Tests/wlr01.stable.err
sql/test/wlcr/Tests/wlr01.stable.out
sql/test/wlcr/Tests/wlr20.py
sql/test/wlcr/Tests/wlr20.stable.err
sql/test/wlcr/Tests/wlr20.stable.out
sql/test/wlcr/Tests/wlr30.py
sql/test/wlcr/Tests/wlr40.py
Branch: wlcr
Log Message:
A new, simplified replication interface.
- the replication process can be controlled by the transaction id
- masterClock(), replicaClock(), replicaBacklog() to inspect state.
- focus on update queries only.
diffs (truncated from 2251 to 300 lines):
diff --git a/monetdb5/mal/Makefile.ag b/monetdb5/mal/Makefile.ag
--- a/monetdb5/mal/Makefile.ag
+++ b/monetdb5/mal/Makefile.ag
@@ -10,6 +10,7 @@ INCLUDES = ../../common/options \
../../clients/mapilib \
../../gdk \
../optimizer \
+ ../modules/mal \
$(openssl_CFLAGS)
MTSAFE
diff --git a/monetdb5/mal/mal.c b/monetdb5/mal/mal.c
--- a/monetdb5/mal/mal.c
+++ b/monetdb5/mal/mal.c
@@ -36,6 +36,7 @@ int have_hge;
#include "mal_runtime.h"
#include "mal_resource.h"
#include "opt_statistics.h"
+#include "wlcr.h"
MT_Lock mal_contextLock MT_LOCK_INITIALIZER("mal_contextLock");
MT_Lock mal_namespaceLock MT_LOCK_INITIALIZER("mal_namespaceLock");
@@ -124,6 +125,7 @@ void mserver_reset(int exit)
str err = 0;
GDKprepareExit();
+ WLCreset();
MCstopClients(0);
setHeartbeat(-1);
stopProfiler();
diff --git a/monetdb5/mal/mal_client.c b/monetdb5/mal/mal_client.c
--- a/monetdb5/mal/mal_client.c
+++ b/monetdb5/mal/mal_client.c
@@ -243,7 +243,6 @@ MCinitClientRecord(Client c, oid user, b
c->exception_buf_initialized = 0;
c->error_row = c->error_fld = c->error_msg = c->error_input = NULL;
c->wlcr_kind = 0;
- c->wlcr_mode = 0;
c->wlcr = NULL;
#ifndef HAVE_EMBEDDED /* no authentication in embedded mode */
{
@@ -400,7 +399,6 @@ freeClient(Client c)
if( c->wlcr)
freeMalBlk(c->wlcr);
c->wlcr_kind = 0;
- c->wlcr_mode = 0;
c->wlcr = NULL;
}
if (t)
diff --git a/monetdb5/mal/mal_client.h b/monetdb5/mal/mal_client.h
--- a/monetdb5/mal/mal_client.h
+++ b/monetdb5/mal/mal_client.h
@@ -177,7 +177,6 @@ typedef struct CLIENT {
* This allows a single server to act as both a master and a replica.
*/
int wlcr_kind; // used by master to characterise the compound
transaction
- int wlcr_mode; // used by replica to control rerunning the transaction
MalBlkPtr wlcr;
/*
diff --git a/monetdb5/modules/mal/wlcr.c b/monetdb5/modules/mal/wlcr.c
--- a/monetdb5/modules/mal/wlcr.c
+++ b/monetdb5/modules/mal/wlcr.c
@@ -9,118 +9,127 @@
/*
* (c) 2017 Martin Kersten
* This module collects the workload-capture-replay statements during
transaction execution,
- * also known as asynchronous logical replication management.
+ * also known as asynchronous logical replication management. It can be used
for
+ * multiple purposes: BACKUP, REPLICATION, and REPLAY
*
- * The goal is to easily clone a master database.
+ * For a BACKUP we need either a complete update log from the beginning, or
+ * a binary snapshot with a collection of logs recording its changes since.
+ * To ensure transaction ACID properties, the log record should be stored on
+ * disk within the transaction brackets, which may cause a serious IO load.
+ * (Tip, store these logs files on an SSD or NVM)
*
+ * For REPLICATION, also called a database clone or slave, we take a snapshot
and the
+ * log files that reflect the recent changes. The log updates are replayed
against
+ * the snapshot until a specific time point is reached.
+ *
+ * Some systems also use the logical logs to REPLAY all (expensive) queries
+ * against the database.
+ *
+ * The goal of this module is to ease BACKUP and REPLICATION of a master
database
+ * with a time-bounded delay.
+ * Such a clone is a database replica that aids in query workload sharing,
+ * database versioning, and (re-)partitioning.
+ *
+ * Simplicity and ease of end-user control has been the driving argument here.
*
* IMPLEMENTATION
* The underlying assumption of the techniques deployed is that the database
- * resides on a proper (global) file system to guarantees recovery from most
- * storage system related failures. Such as RAID disks.
- * Furthermore, when deployed in a Cloud setting, the database recides in the
- * global file system.
+ * resides on a proper (global/distributed) file system to guarantee recovery
+ * from most storage system related failures, e.g. using RAID disks or
LSFsystems.
*
- * A database can be set once into 'master' mode only once using the SQL
command:
+ * A database can be set into 'master' mode only once using the SQL command:
* CALL master()
+ * An alternative path to the log records can be given to reduce the storage
cost,
+ * e.g. a nearby SSD.
+ * By default, it creates a directory .../dbfarm/dbname/master to hold all
+ * necessary information for the creation of a database clone.
*
- * It creates a directory .../dbfarm/dbname/master to hold all necessary
information
- * for the creation and maintenance of replicas.
- * A configuration file is added to keep track on the status of the master.
+ * A master configuration file is added to the database directory to keep the
state.
* It contains the following key=value pairs:
- * snapshot=<path to a binary snapshot>
+ * snapshot=<path to a snapshot directory>
* logs=<path to the wlcr log directory>
- * state=<started, paused,(resume), stopped>
- * firstbatch=<first batch file to be applied>
- * batches=<last batch file to be applied>
- * drift=<maximal delay before transactions are seen globally, in
seconds>
- * threshold=<min response time for queries to be kept>
- * rollbock=<flag to indicate keeping the aborted transactions as
well>
+ * state=<started, stopped>
+ * batches=<next available batch file to be applied>
+ * drift=<maximal delay before transactions are published as a
separate log, in seconds>
+ * write=<timestamp of the last transaction recorded>
*
- * Every replica should start off with a copy of binary snapshot identified by
'snapshot'
- * by default stored in .../dbfarm/dbname/master/bat. An alternative path can
be given
- * to reduce the storage cost at the expense of slower recovery time (e.g. AWS
glacier).
- * A missing path to the snapshot denotes that we can start rebuilding with an
empty database instead.
- * The log files are stored as master/<dbname>_<batchnumber>.
+ * A missing path to the snapshot denotes that we can start the clone with an
empty database.
+ * The log files are stored as master/<dbname>_<batchnumber>. They belong to
the snapshot.
*
* Each wlcr log file contains a serial log of committed compound transactions.
* The log records are represented as ordinary MAL statement blocks, which
* are executed in serial mode. (parallelism can be considered for large
updates later)
- * Each transaction job is identified by the owner of the query,
- * commit/rollback status, its starting time and runtime (in ms).
+ * Each transaction job is identified by the owner of the query, its starting
time and runtime (in ms).
+ * The log-record should end with a commit.
*
- * Update queries are always logged and pure queries can be limited to those
- * that satisfy an minimum execution threshold.
- * CALL logthreshold(duration)
- * The threshold is given in milliseconds.
- * The threshold setting is saved and affects all future master log records.
- * The default for a production system version should be set to -1, which
ignores all pure queries.
- *
- * The aborted transactions can also be gathered using the call
- * CALL logrollback(1);
- * Such queries may be helpful in the analysis of transactions with failures.
- *
- * A transaction log is owned by the master. He decides when the log may be
globally used.
- * The trigger for this is the allowed drift. A new transaction log is created
when
+ * A transaction log is created by the master. He decides when the log may be
globally used.
+ * The trigger for this is the allowed 'drift'. A new transaction log is
created when
* the system has been collecting logs for some time (drift in seconds).
* The drift determines the maximal window of transactions loss that is
permitted.
- * The maximum drift can be set using a SQL command. Setting it to zero leads
to a
- * log file per transaction and may cause a large overhead for short running
transactions.
+ * The maximum drift can be set using a SQL command, e.g.
+ * CALL drift(duration)
+ * Setting it to zero leads to a log file per transaction and may cause a
large log directory.
+ * A default of 5 minutes should balance polling overhead.
*
* A minor problem here is that we should ensure that the log file is closed
even if there
- * are no transactions running. It is solved with a separate monitor thread.
- * After closing, the replicas can see from the master configuration file that
a log is available.
+ * are no transactions running. It is solved with a separate monitor thread,
which ensures
+ * that the logs are flushed at least after 'drift' seconds since the first
logrecord was created.
+ * After closing, the replicas can see from the master configuration file that
a new log batch is available.
*
- * The transaction loggin can be temporarily paused using the command
- * CALL master(2)
- * This mode should be uses sparingly. For example if you plan to perform a
COPY INTO LOCKED mode
- * and want to avoid an avalanche of update records.
- *
- * Logging is resumed using the command
- * CALL master(3)
- * A warning is issued when during the suspension update transactions have
been issued.
- * The final step is to close transaction logging with the command
- * CALL master(4).
- * It typically is the end-of-life-time for a snapshot and its log files.
+ * The final step is to stop transaction logging with the command
+ * CALL stopmaster.
+ * It typically is the end-of-life-time for a snapshot. For example, when
planning to do
+ * a large bulk load of the database, stopping logging avoids a double write
into the
+ * database. The database can be brought back into wlcr mode using a fresh
snapshot.
*
*[TODO] A more secure way to set a database into master mode is to use the
command
* monetdb master <dbname> [ <optional snapshot path>]
* which locks the database, takes a save copy, initializes the state chance.
*
* A fresh replica can be constructed as follows:
- * monetdb replica <dbname> <mastername>
+ * monetdb replicate <dbname> <mastername>
*
* Instead of using the monetdb command line we can use the SQL calls directly
* master() and replicate(), provided we start with a fresh database.
*
- * REPLICAS
+ * CLONE
*
- * A fresh database can be turned into a replica using the call
- * CALL replicate("mastername")
+ * Every clone should start off with a copy of the binary snapshot identified
by 'snapshot'.
+ * A fresh database can be turned into a clone using the call
+ * CALL replicate('mastername')
* It will grab the latest snapshot of the master and applies all
- * known log files before releasing the system. Progress of
+ * available log files before releasing the system. Progress of
* the replication can be monitored using the -fraw option in mclient.
+ * The master has no knowledge about the number of clones and their
whereabouts.
*
- * It will iterate through the log files, applying all transactions.
- * It excludes catalog and update queries, which are always executed.
- * Queries are simply ignored.
+ * The clone process will iterate in the background through the log files,
+ * applying all update transactions.
*
- * The alternative is to also replay the queries .
- * CALL replaythreshold(threshold)
- * In this mode all pure queries are also executed for which the reported
threshold exceeds the argument.
- * Enabling the query log collects the execution times for these queries.
+ * An optional timestamp or transaction id can be added to apply the logs until
+ * a given moment. This is particularly handy when an unexpected
+ * disastrous user action (drop persistent table) has to be recovered from.
+ *
+ * CALL replicate('mastername');
+ * CALL replicate('mastername',NOW()); -- stops after we are in sync
+ * ...
+ * CALL replicate(NOW()); -- partial roll forward
+ * ...
+ * CALL replicate(); -- continue undisturbed
+ *
+ * SELECT replicaClock();
+ * returns the timestamp of the last replicated transaction.
+ * SELECT replicaBacklog();
+ * returns the number of pending transactions to be in sync with the master.
+ * SELECT masterClock();
+ * return the timestamp of the last committed transaction in the master.
*
* Any failure encountered during a log replay terminates the replication
process,
- * leaving a message in the merovingian log.
- *
- * The replica creation can be suspended at the master and at the clone.
- * It will continue after the corresponding resume* operation is issues.
+ * leaving a message in the merovingian log configuration.
*
* The wlcr files purposely have a textual format derived from the MAL
statements.
- * Simplicity and ease of control has been the driving argument here.
+ * This provides a stepping stone for remote execution later.
*
* [TODO] consider the roll forward of SQL session variables, i.e.
optimizer_pipe (for now assume default pipe).
- * [TODO] The status of the master/replica should be accessible for inspection
* [TODO] the user might want to indicate a time-stamp, to rebuild to a
certain point
*
*/
@@ -129,23 +138,20 @@
#include "mal_builder.h"
#include "wlcr.h"
-static MT_Lock wlcr_lock MT_LOCK_INITIALIZER("wlcr_lock");
+MT_Lock wlcr_lock MT_LOCK_INITIALIZER("wlcr_lock");
-static str wlcr_snapshot= 0; // The location of the snapshot against which the
logs work
-static str wlcr_logs = 0; // The location in the global file store for
the logs
-static stream *wlcr_fd = 0;
-static int wlcr_start = 0; // time stamp of first transaction in log file
-static int wlcr_state = 0; // The current status of the in the life cycle
-static int wlcr_tag = 0; // number of database chancing transactions
-static int wlcr_pausetag = 0; // number of database chancing transactions
when pausing
+static char wlc_snapshot[PATHLENGTH]; // The location of the snapshot against
which the logs work
+static lng wlc_start= 0; // Start time of first
transaction
+static stream *wlc_fd = 0;
// These properties are needed by the replica to direct the roll-forward.
-int wlcr_threshold = 0; // should be set to -1 for production
-str wlcr_dbname = 0; // The master database name
-int wlcr_firstbatch = 0; // first log file associated with the snapshot
-int wlcr_batches = 0; // identifier of next batch
-int wlcr_drift = 10; // maximal period covered by a single log file
in seconds
-int wlcr_rollback= 0; // also log the aborted queries.
+char wlc_dir[PATHLENGTH]; // The location in the global file store for
the logs
+char wlc_name[IDLENGTH]; // The master database name
+lng wlc_id = 0; // next transaction id
+int wlc_state = 0; // The current status of the in the
life cycle
+char wlc_write[26]; // The timestamp of the last committed
transaction
+int wlc_batches = 0; // identifier of next batch
+int wlc_drift = 10; // maximal period covered by a single log file
in seconds
/* The database snapshots are binary copies of the dbfarm/database/bat
* New snapshots are created currently using the 'monetdb snapshot <db>'
command
@@ -157,67 +163,67 @@ int wlcr_rollback= 0; // also log the a
int
WLCused(void)
{
_______________________________________________
checkin-list mailing list
[email protected]
https://www.monetdb.org/mailman/listinfo/checkin-list