ArielGlenn has submitted this change and it was merged.
Change subject: sql2txt: convert sql dumps to input format suitable for LOAD DATA INFILE
......................................................................
sql2txt: convert sql dumps to input format suitable for LOAD DATA INFILE
Change-Id: Ifa9b4cdbb1015343b77befe7fe1bf1b4931b4381
---
M xmlfileutils/Makefile
M xmlfileutils/filebuffers.c
M xmlfileutils/mwxml2sql.c
M xmlfileutils/mwxml2sql.h
M xmlfileutils/sqlutils.c
5 files changed, 167 insertions(+), 89 deletions(-)
Approvals:
ArielGlenn: Verified; Looks good to me, approved
diff --git a/xmlfileutils/Makefile b/xmlfileutils/Makefile
index 41c5562..b673439 100644
--- a/xmlfileutils/Makefile
+++ b/xmlfileutils/Makefile
@@ -12,22 +12,31 @@
SHELL=/bin/sh
-all: mwxml2sql
+all: mwxml2sql sql2txt
mwxml2sql: mwxml2sql.o filebuffers.o xmltags.o mwxmlelts.o sqlutils.o
$(CC) $(CFLAGS) $(LDFLAGS) -o mwxml2sql mwxml2sql.o filebuffers.o \
xmltags.o mwxmlelts.o sqlutils.o -lssl -lcrypto -lbz2 -lz
-install: mwxml2sql
+sql2txt: sql2txt.o filebuffers.o sqlutils.o
+ $(CC) $(CFLAGS) $(LDFLAGS) -o sql2txt sql2txt.o filebuffers.o
sqlutils.o \
+ -lcrypto -lbz2 -lz
+
+install: mwxml2sql sql2txt
if ( test ! -d $(PREFIX)/bin ) ; then mkdir -p $(PREFIX)/bin ; fi
cp -f mwxml2sql $(PREFIX)/bin/mwxml2sql
+ cp -f sql2txt $(PREFIX)/bin/sql2txt
chmod a+x $(PREFIX)/bin/mwxml2sql
+ chmod a+x $(PREFIX)/bin/sql2txt
clean:
- rm -f *.o *.a mwxml2sql
+ rm -f *.o *.a mwxml2sql sql2txt
mwxml2sql.o: mwxml2sql.c mwxml2sql.h
$(CC) $(CFLAGS) -c mwxml2sql.c
+
+sql2txt.o: sql2txt.c mwxml2sql.h
+ $(CC) $(CFLAGS) -c sql2txt.c
filebuffers.o: filebuffers.c mwxml2sql.h
$(CC) $(CFLAGS) -c filebuffers.c
@@ -53,6 +62,7 @@
$(DISTNAME)/mwxml2sql.h \
$(DISTNAME)/filebuffers.c \
$(DISTNAME)/mwxml2sql.c \
+ $(DISTNAME)/sql2txt.c \
$(DISTNAME)/sqlutils.c \
$(DISTNAME)/xmltags.c \
$(DISTNAME)/Makefile \
diff --git a/xmlfileutils/filebuffers.c b/xmlfileutils/filebuffers.c
index f7144da..1640043 100644
--- a/xmlfileutils/filebuffers.c
+++ b/xmlfileutils/filebuffers.c
@@ -18,6 +18,84 @@
#include "mwxml2sql.h"
/*
+ args:
+ file_name name of file
+ verbose 1 to write to stderr information about the
+ filename as it is processed, 0 for quiet mode
+
+ returns:
+ name of file without suffix, or NULL on error
+ known suffixes are BZSUFFIX, GZSUFFIX, TXTSUFFIX (.gz .bz2 .txt)
+*/
+char *get_filebase(char *file_name, int verbose) {
+  char *start = NULL, *copy = NULL;
+  size_t base_len = 0;
+
+  /*
+    returns a freshly allocated copy of file_name with its final
+    extension removed, but only when that extension is one of
+    BZSUFFIX, GZSUFFIX or TXTSUFFIX; any other name is copied whole,
+    so that get_filebase() followed by get_filesuffix() always
+    reconstructs the original name. the caller frees the result.
+    returns NULL if file_name is NULL or memory cannot be allocated.
+    a message is written to stderr only when verbose is greater than 1
+    and a suffix was actually removed.
+  */
+  if (!file_name) return(NULL);
+
+  start = strrchr(file_name, '.');
+  if (start != NULL &&
+      (!strcmp(start, BZSUFFIX) ||
+       !strcmp(start, GZSUFFIX) ||
+       !strcmp(start, TXTSUFFIX))) {
+    base_len = (size_t)(start - file_name);
+    copy = (char *)malloc(base_len + 1);
+    if (!copy) {
+      fprintf(stderr,"failed to get memory for output filename\n");
+      return(NULL);
+    }
+    memcpy(copy, file_name, base_len);
+    copy[base_len] = '\0';
+    /* report before returning; this message used to sit after the return */
+    if (verbose > 1)
+      fprintf(stderr,"passed %s and returning base %s\n", file_name, copy);
+    return(copy);
+  }
+
+  /* no dot, or an extension we do not recognize: keep the name intact */
+  copy = (char *)malloc(strlen(file_name) + 1);
+  if (!copy) {
+    fprintf(stderr,"failed to get memory for output filename\n");
+    return(NULL);
+  }
+  strcpy(copy, file_name);
+  return(copy);
+}
+
+/*
+ args:
+ file_name name of file
+ verbose greater than 1 to write to stderr information about the
+ filename as it is processed, otherwise quiet
+
+ returns:
+ file suffix if there is one, or the empty string if there is none,
+ or NULL on error
+ known suffixes are BZSUFFIX, GZSUFFIX, TXTSUFFIX (.gz .bz2 .txt)
+*/
+char *get_filesuffix(char *file_name, int verbose) {
+  char *dot = NULL, *result = NULL;
+  int known = 0;
+
+  /*
+    hands back a newly allocated string: the final extension of
+    file_name when it is BZSUFFIX, GZSUFFIX or TXTSUFFIX, and the
+    empty string in every other case. NULL means file_name was NULL
+    or the allocation failed. the caller owns the returned memory.
+  */
+  if (file_name == NULL) return(NULL);
+
+  dot = strrchr(file_name, '.');
+  if (dot != NULL) {
+    known = (strcmp(dot, BZSUFFIX) == 0 ||
+             strcmp(dot, GZSUFFIX) == 0 ||
+             strcmp(dot, TXTSUFFIX) == 0);
+  }
+
+  if (!known) {
+    /* nothing recognizable to report: return "" rather than NULL */
+    result = (char *)malloc(1);
+    if (result == NULL) {
+      fprintf(stderr,"failed to get memory for output filename\n");
+      return(NULL);
+    }
+    result[0] = '\0';
+    return(result);
+  }
+
+  /* the extension runs from the last dot to the end of the name */
+  result = (char *)malloc(strlen(dot) + 1);
+  if (result == NULL) {
+    fprintf(stderr,"failed to get memory for output filename\n");
+    return(NULL);
+  }
+  strcpy(result, dot);
+  if (verbose > 1)
+    fprintf(stderr,"passed %s and returning suffix %s\n", file_name, result);
+  return(result);
+}
+
+/*
args:
contents - data to compress
compressed_length - where length of compressed data will be stored
@@ -495,7 +573,8 @@
suffix suffix of filename
mwv list of structures with information about the MediaWiki
versions for which sql output in these files will
- be produced
+ be produced. If this is NULL then only one file will
+ be produced, without a version name in it
returns:
allocated and filled in output file structure on success
@@ -511,10 +590,13 @@
output_file_t *init_output_file(char *basename, char *suffix, mw_version_t
*mwv) {
output_file_t *outf, *current, *head = NULL;
mw_version_t *next = NULL;
+ int do_once = 1;
+ char *version = NULL;
- /* do this now for each mwv... */
- while (mwv) {
- next = mwv->next;
+ /* do this now for each mwv... or once if mwv is NULL */
+ while (mwv || do_once) {
+ do_once = 0;
+ if (mwv) next = mwv->next;
outf = (output_file_t *)malloc(sizeof(output_file_t));
if (!outf) {
@@ -535,18 +617,21 @@
if (basename == NULL) {
outf->filetype = PLAINTEXT;
- outf->fd = stdin;
+ outf->fd = stdout;
continue;
}
/* "basename-" + version + suffix (if there is one) */
- outf->filename = (char *)malloc(strlen(basename) +
(suffix?strlen(suffix):0) + strlen(outf->mwv->version) + 2);
+ if (outf->mwv) version = outf->mwv->version;
+ else version = NULL;
+
+ outf->filename = (char *)malloc(strlen(basename) +
(suffix?strlen(suffix):0) + strlen(version) + 2);
if (!outf->filename) {
fprintf(stderr,"failed to get memory for output file information\n");
free_output_file(head);
return(NULL);
}
- sprintf(outf->filename, "%s-%s%s", basename, outf->mwv->version,
suffix?suffix:"0");
+ sprintf(outf->filename, "%s%s%s%s", basename, version?"-":"",
version?version:"", suffix?suffix:"");
if (!suffix) {
outf->filetype = PLAINTEXT;
outf->fd = fopen (outf->filename, "w");
diff --git a/xmlfileutils/mwxml2sql.c b/xmlfileutils/mwxml2sql.c
index b7353ec..90fd252 100644
--- a/xmlfileutils/mwxml2sql.c
+++ b/xmlfileutils/mwxml2sql.c
@@ -180,84 +180,6 @@
exit(-1);
}
-/*
- args:
- file_name name of file
- verbose 1 to write to stderr information about the
- filename as it is processed, 0 for quiet mode
-
- returns:
- name of file without suffix, or NULL on error
- known suffixes are BZSUFFIX, GZSUFFIX, TXTSUFFIX (.gz .bz2 .txt)
-*/
-char *get_filebase(char *file_name, int verbose) {
- char *start = NULL, *copy = NULL;
-
- if (!file_name) return(NULL);
- if ((start = strrchr(file_name, '.')) != NULL) {
- if (!strcmp(start, BZSUFFIX) ||
- !strcmp(start, GZSUFFIX) ||
- !strcmp(start, TXTSUFFIX)) {
- }
- copy = (char *)malloc(start - file_name +1);
- if (!copy) {
- fprintf(stderr,"failed to get memory for output filename\n");
- return(NULL);
- }
- strncpy(copy, file_name, start - file_name);
- copy[start - file_name] = '\0';
- return(copy);
- if (verbose > 1)
- fprintf(stderr,"passed %s and returning base %s\n", file_name, copy);
- }
- copy = (char *)malloc(strlen(file_name) +1);
- if (!copy) {
- fprintf(stderr,"failed to get memory for output filename\n");
- return(NULL);
- }
- strcpy(copy, file_name);
- return(copy);
-}
-
-/*
- args:
- file_name name of file
- verbose 1 to write to stderr information about the
- filename as it is processed, 0 for quiet mode
-
- returns:
- file suffix if there is one, or the empty string if there is none,
- or NULL on error
- known suffixes are BZSUFFIX, GZSUFFIX, TXTSUFFIX (.gz .bz2 .txt)
-*/
-char *get_filesuffix(char *file_name, int verbose) {
- char *start = NULL, *copy = NULL;
-
- if (!file_name) return(NULL);
- if ((start = strrchr(file_name, '.')) != NULL) {
- if (!strcmp(start, BZSUFFIX) ||
- !strcmp(start, GZSUFFIX) ||
- !strcmp(start, TXTSUFFIX)) {
- copy = (char *)malloc(file_name+strlen(file_name) - start +1);
- if (!copy) {
- fprintf(stderr,"failed to get memory for output filename\n");
- return(NULL);
- }
- strcpy(copy, start);
- if (verbose > 1)
- fprintf(stderr,"passed %s and returning suffix %s\n", file_name, copy);
- return(copy);
- }
- }
- copy = (char *)malloc(1);
- if (!copy) {
- fprintf(stderr,"failed to get memory for output filename\n");
- return(NULL);
- }
- copy[0] = '\0';
- return(copy);
-}
-
int main(int argc, char **argv) {
int optindex=0;
int optc = 0;
diff --git a/xmlfileutils/mwxml2sql.h b/xmlfileutils/mwxml2sql.h
index 7638563..35a715d 100644
--- a/xmlfileutils/mwxml2sql.h
+++ b/xmlfileutils/mwxml2sql.h
@@ -210,6 +210,7 @@
void print_sql_field(FILE *f, char *field, int isstring, int islast);
void copy_sql_field(char *outbuf, char *field, int isstring, int islast);
char *sql_escape(char *s, int s_size, char *out, int out_size);
+char *tab_escape(char *s, int s_size, char *out, int out_size);
void title_escape(char *t);
char *un_xml_escape(char *value, char *output, int last);
void digits_only(char *buf);
diff --git a/xmlfileutils/sqlutils.c b/xmlfileutils/sqlutils.c
index 4577e4f..03a1cc5 100644
--- a/xmlfileutils/sqlutils.c
+++ b/xmlfileutils/sqlutils.c
@@ -178,7 +178,7 @@
returns:
pointer to the next byte in s to be processed, or to NULL if all
- bytes were processed
+ bytes were processed
this function escapes character strings for input to mysql,
adding a trailing '\0' to the result
@@ -250,6 +250,66 @@
/*
args:
+ s string to escape
+ s_size length of string to escape
+ out holder for result
+ out_size size of holder for result
+
+ returns:
+ pointer to the next byte in s to be processed, or to NULL if all
+ bytes were processed
+
+ this function escapes tabs in character strings for input to LOAD DATA INFILE,
+ adding a trailing '\0' to the result (you should pass a string that
+ already has the remainder of the mysql escapes applied)
+
+ if s_size is 0, the string to escape must be null terminated
+ and its length is not checked.
+*/
+char *tab_escape(char *s, int s_size, char *out, int out_size) {
+  char *from = s;
+  char *to = out;
+  int copied = 0;
+  int ind = 0;
+
+  /*
+    copies s into out, replacing every tab byte with the two bytes
+    backslash + 't', and terminates out with '\0'.
+
+    a positive s_size is a hard bound: exactly s_size bytes are read
+    and s need not be null terminated. with s_size of 0 (or less) the
+    input is read up to its terminating '\0'.
+
+    returns NULL once all input has been consumed, or a pointer to the
+    first unprocessed byte of s when out filled up; the caller can
+    call again from there (adjusting s_size itself if it uses one).
+  */
+  while (s_size > 0 ? ind < s_size : *from != '\0') {
+    /* always keep room for a two-byte escape plus the trailing '\0' */
+    if (copied + 3 > out_size) {
+      if (out_size > 0) *to = '\0';
+      return(from);
+    }
+    if (*from == '\t') {
+      *to++ = '\\';
+      *to++ = 't';
+      copied += 2;
+    }
+    else {
+      *to++ = *from;
+      copied++;
+    }
+    from++;
+    ind++;
+  }
+  /* the room check above guarantees space for this byte when out_size > 0 */
+  if (out_size > 0) *to = '\0';
+  return(NULL);
+}
+
+/*
+ args:
t null-terminated title string to be converted
this function converts the supplied page title to its canonical
--
To view, visit https://gerrit.wikimedia.org/r/50167
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings
Gerrit-MessageType: merged
Gerrit-Change-Id: Ifa9b4cdbb1015343b77befe7fe1bf1b4931b4381
Gerrit-PatchSet: 1
Gerrit-Project: operations/dumps
Gerrit-Branch: ariel
Gerrit-Owner: ArielGlenn <[email protected]>
Gerrit-Reviewer: ArielGlenn <[email protected]>
_______________________________________________
MediaWiki-commits mailing list
[email protected]
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits