ArielGlenn has submitted this change and it was merged.

Change subject: sql2txt: convert sql dumps to input format suitable for LOAD 
DATA INFILE
......................................................................


sql2txt: convert sql dumps to input format suitable for LOAD DATA INFILE

Change-Id: Ifa9b4cdbb1015343b77befe7fe1bf1b4931b4381
---
M xmlfileutils/Makefile
M xmlfileutils/filebuffers.c
M xmlfileutils/mwxml2sql.c
M xmlfileutils/mwxml2sql.h
M xmlfileutils/sqlutils.c
5 files changed, 167 insertions(+), 89 deletions(-)

Approvals:
  ArielGlenn: Verified; Looks good to me, approved



diff --git a/xmlfileutils/Makefile b/xmlfileutils/Makefile
index 41c5562..b673439 100644
--- a/xmlfileutils/Makefile
+++ b/xmlfileutils/Makefile
@@ -12,22 +12,31 @@
 
 SHELL=/bin/sh
 
-all: mwxml2sql
+all: mwxml2sql sql2txt
 
 mwxml2sql: mwxml2sql.o filebuffers.o xmltags.o mwxmlelts.o sqlutils.o
        $(CC) $(CFLAGS) $(LDFLAGS) -o mwxml2sql mwxml2sql.o filebuffers.o \
            xmltags.o mwxmlelts.o sqlutils.o -lssl -lcrypto -lbz2 -lz
 
-install: mwxml2sql
+sql2txt: sql2txt.o filebuffers.o sqlutils.o
+       $(CC) $(CFLAGS) $(LDFLAGS) -o sql2txt sql2txt.o filebuffers.o sqlutils.o \
+           -lcrypto -lbz2 -lz
+
+install: mwxml2sql sql2txt
        if ( test ! -d $(PREFIX)/bin ) ; then mkdir -p $(PREFIX)/bin ; fi
        cp -f mwxml2sql $(PREFIX)/bin/mwxml2sql
+       cp -f sql2txt $(PREFIX)/bin/sql2txt
        chmod a+x $(PREFIX)/bin/mwxml2sql
+       chmod a+x $(PREFIX)/bin/sql2txt
 
 clean: 
-       rm -f *.o *.a mwxml2sql
+       rm -f *.o *.a mwxml2sql sql2txt
 
 mwxml2sql.o: mwxml2sql.c mwxml2sql.h
        $(CC) $(CFLAGS) -c mwxml2sql.c
+
+sql2txt.o: sql2txt.c mwxml2sql.h
+       $(CC) $(CFLAGS) -c sql2txt.c
 
 filebuffers.o: filebuffers.c mwxml2sql.h
        $(CC) $(CFLAGS) -c filebuffers.c
@@ -53,6 +62,7 @@
           $(DISTNAME)/mwxml2sql.h \
           $(DISTNAME)/filebuffers.c \
           $(DISTNAME)/mwxml2sql.c \
+          $(DISTNAME)/sql2txt.c \
           $(DISTNAME)/sqlutils.c \
           $(DISTNAME)/xmltags.c \
           $(DISTNAME)/Makefile \
diff --git a/xmlfileutils/filebuffers.c b/xmlfileutils/filebuffers.c
index f7144da..1640043 100644
--- a/xmlfileutils/filebuffers.c
+++ b/xmlfileutils/filebuffers.c
@@ -18,6 +18,84 @@
 #include "mwxml2sql.h"
 
 /*
+   args:
+      file_name      name of file
+      verbose        has no effect here; the verbose > 1 message is unreachable
+
+  returns:
+      newly malloc'd copy of file_name cut at its last '.' (the whole
+      name if it has no '.'), or NULL on error; any suffix is removed
+      since the BZSUFFIX, GZSUFFIX, TXTSUFFIX test has an empty body
+*/
+char *get_filebase(char *file_name, int verbose) {
+  char *start = NULL, *copy = NULL;
+
+  if (!file_name) return(NULL);
+  if ((start = strrchr(file_name, '.')) != NULL) { /* start: last '.' in the name */
+    if (!strcmp(start, BZSUFFIX) ||
+       !strcmp(start, GZSUFFIX) ||
+       !strcmp(start, TXTSUFFIX)) {
+    } /* NOTE(review): empty body, so the code below runs for any suffix */
+    copy = (char *)malloc(start - file_name +1); /* bytes before the '.', plus '\0' */
+    if (!copy) {
+      fprintf(stderr,"failed to get memory for output filename\n");
+      return(NULL);
+    }
+    strncpy(copy, file_name, start - file_name);
+    copy[start - file_name] = '\0'; /* strncpy copied no terminator */
+    return(copy);
+    if (verbose > 1) /* never reached: follows the return above */
+      fprintf(stderr,"passed %s and returning base %s\n", file_name, copy);
+  }
+  copy = (char *)malloc(strlen(file_name) +1); /* no '.': copy the whole name */
+  if (!copy) {
+    fprintf(stderr,"failed to get memory for output filename\n");
+    return(NULL);
+  }
+  strcpy(copy, file_name);
+  return(copy);
+}
+
+/*
+   args:
+      file_name      name of file
+      verbose        greater than 1 to write to stderr the file name
+                     and the suffix found, otherwise quiet
+
+  returns:
+      newly malloc'd copy of the text from the last '.' on, if it is a known
+      suffix; a malloc'd empty string otherwise; or NULL on error
+      known suffixes are BZSUFFIX, GZSUFFIX, TXTSUFFIX (presumably .bz2 .gz .txt)
+*/
+char *get_filesuffix(char *file_name, int verbose) {
+  char *start = NULL, *copy = NULL;
+
+  if (!file_name) return(NULL);
+  if ((start = strrchr(file_name, '.')) != NULL) { /* start: last '.' in the name */
+    if (!strcmp(start, BZSUFFIX) ||
+       !strcmp(start, GZSUFFIX) ||
+       !strcmp(start, TXTSUFFIX)) {
+      copy = (char *)malloc(file_name+strlen(file_name) - start +1); /* suffix length plus '\0' */
+      if (!copy) {
+       fprintf(stderr,"failed to get memory for output filename\n");
+       return(NULL);
+      }
+      strcpy(copy, start); /* the copy keeps the leading '.' */
+      if (verbose > 1)
+       fprintf(stderr,"passed %s and returning suffix %s\n", file_name, copy);
+      return(copy);
+    }
+  }
+  copy = (char *)malloc(1); /* no '.' or unknown suffix: return a malloc'd "" */
+  if (!copy) {
+    fprintf(stderr,"failed to get memory for output filename\n");
+    return(NULL);
+  }
+  copy[0] = '\0';
+  return(copy);
+}
+
+/*
   args:
      contents             - data to compress
      compressed_length    - where length of compressed data will be stored
@@ -495,7 +573,8 @@
     suffix       suffix of filename
     mwv          list of structures with information about the MediaWiki
                  versions for which sql output in these files will
-                 be produced
+                 be produced. If this is NULL then only one file will
+                 be produced, without a version name in it
 
   returns:
     allocated and filled in output file structure on success
@@ -511,10 +590,13 @@
 output_file_t *init_output_file(char *basename, char *suffix, mw_version_t *mwv) {
   output_file_t *outf, *current, *head = NULL;
   mw_version_t *next = NULL;
+  int do_once = 1;
+  char *version = NULL;
 
-  /* do this now for each mwv... */
-  while (mwv) {
-    next = mwv->next;
+  /* do this now for each mwv... or once if mwv is NULL */
+  while (mwv || do_once) {
+    do_once = 0;
+    if (mwv) next = mwv->next;
 
     outf = (output_file_t *)malloc(sizeof(output_file_t));
     if (!outf) {
@@ -535,18 +617,21 @@
 
     if (basename == NULL) {
       outf->filetype = PLAINTEXT;
-      outf->fd = stdin;
+      outf->fd = stdout;
       continue;
     }
 
     /* "basename-" + version + suffix (if there is one) */
-    outf->filename = (char *)malloc(strlen(basename) + (suffix?strlen(suffix):0) + strlen(outf->mwv->version) + 2);
+    if (outf->mwv) version = outf->mwv->version;
+    else version = NULL;
+
+    outf->filename = (char *)malloc(strlen(basename) + (suffix?strlen(suffix):0) + strlen(version) + 2);
     if (!outf->filename) {
       fprintf(stderr,"failed to get memory for output file information\n");
       free_output_file(head);
       return(NULL);
     }
-    sprintf(outf->filename, "%s-%s%s", basename, outf->mwv->version, suffix?suffix:"0");
+    sprintf(outf->filename, "%s%s%s%s", basename, version?"-":"", version?version:"", suffix?suffix:"");
     if (!suffix) {
       outf->filetype = PLAINTEXT;
       outf->fd = fopen (outf->filename, "w");
diff --git a/xmlfileutils/mwxml2sql.c b/xmlfileutils/mwxml2sql.c
index b7353ec..90fd252 100644
--- a/xmlfileutils/mwxml2sql.c
+++ b/xmlfileutils/mwxml2sql.c
@@ -180,84 +180,6 @@
   exit(-1);
 }
 
-/*
-   args:
-      file_name      name of file
-      verbose        1 to write to stderr information about the
-                     filename as it is processed, 0 for quiet mode
-
-  returns:
-      name of file without suffix, or NULL on error
-      known suffixes are BZSUFFIX, GZSUFFIX, TXTSUFFIX (.gz .bz2 .txt)
-*/
-char *get_filebase(char *file_name, int verbose) {
-  char *start = NULL, *copy = NULL;
-
-  if (!file_name) return(NULL);
-  if ((start = strrchr(file_name, '.')) != NULL) {
-    if (!strcmp(start, BZSUFFIX) ||
-       !strcmp(start, GZSUFFIX) ||
-       !strcmp(start, TXTSUFFIX)) {
-       }
-      copy = (char *)malloc(start - file_name +1);
-      if (!copy) {
-       fprintf(stderr,"failed to get memory for output filename\n");
-       return(NULL);
-      }
-      strncpy(copy, file_name, start - file_name);
-      copy[start - file_name] = '\0';
-      return(copy);
-      if (verbose > 1)
-       fprintf(stderr,"passed %s and returning base %s\n", file_name, copy);
-  }
-  copy = (char *)malloc(strlen(file_name) +1);
-  if (!copy) {
-    fprintf(stderr,"failed to get memory for output filename\n");
-    return(NULL);
-  }
-  strcpy(copy, file_name);
-  return(copy);
-}
-
-/*
-   args:
-      file_name      name of file
-      verbose        1 to write to stderr information about the
-                     filename as it is processed, 0 for quiet mode
-
-  returns:
-      file suffix if there is one, or the empty string if there is none,
-      or NULL on error
-      known suffixes are BZSUFFIX, GZSUFFIX, TXTSUFFIX (.gz .bz2 .txt)
-*/
-char *get_filesuffix(char *file_name, int verbose) {
-  char *start = NULL, *copy = NULL;
-
-  if (!file_name) return(NULL);
-  if ((start = strrchr(file_name, '.')) != NULL) {
-    if (!strcmp(start, BZSUFFIX) ||
-       !strcmp(start, GZSUFFIX) ||
-       !strcmp(start, TXTSUFFIX)) {
-      copy = (char *)malloc(file_name+strlen(file_name) - start +1);
-      if (!copy) {
-       fprintf(stderr,"failed to get memory for output filename\n");
-       return(NULL);
-      }
-      strcpy(copy, start);
-      if (verbose > 1)
-       fprintf(stderr,"passed %s and returning suffix %s\n", file_name, copy);
-      return(copy);
-    }
-  }
-  copy = (char *)malloc(1);
-  if (!copy) {
-    fprintf(stderr,"failed to get memory for output filename\n");
-    return(NULL);
-  }
-  copy[0] = '\0';
-  return(copy);
-}
-
 int main(int argc, char **argv) {
   int optindex=0;
   int optc = 0;
diff --git a/xmlfileutils/mwxml2sql.h b/xmlfileutils/mwxml2sql.h
index 7638563..35a715d 100644
--- a/xmlfileutils/mwxml2sql.h
+++ b/xmlfileutils/mwxml2sql.h
@@ -210,6 +210,7 @@
 void print_sql_field(FILE *f, char *field, int isstring, int islast);
 void copy_sql_field(char *outbuf, char *field, int isstring, int islast);
 char *sql_escape(char *s, int s_size, char *out, int out_size);
+char *tab_escape(char *s, int s_size, char *out, int out_size);
 void title_escape(char *t);
 char *un_xml_escape(char *value, char *output, int last);
 void digits_only(char *buf);
diff --git a/xmlfileutils/sqlutils.c b/xmlfileutils/sqlutils.c
index 4577e4f..03a1cc5 100644
--- a/xmlfileutils/sqlutils.c
+++ b/xmlfileutils/sqlutils.c
@@ -178,7 +178,7 @@
 
    returns:
       pointer to the next byte in s to be processed, or to NULL if all
-      bytes were processed 
+      bytes were processed
 
    this function escapes character strings for input to mysql,
    adding a trailing '\0' to the result
@@ -250,6 +250,66 @@
 
 /*
   args:
+     s           string to escape
+     s_size      length of string to escape, or 0 if s is null terminated
+     out         holder for result
+     out_size    size of holder for result
+
+   returns:
+      pointer to the next byte in s to be processed if out filled up,
+      or NULL if all bytes were processed
+
+   this function escapes tabs (as backslash + 't') in character strings
+   for input to LOAD DATA INFILE, adding a trailing '\0' to the result
+   (pass a string that already has the other mysql escapes applied)
+
+   NOTE(review): the loop runs while *from is nonzero OR ind < s_size, so with
+   a nonzero s_size it stops at the first '\0' at or after s_size bytes - confirm
+*/
+char *tab_escape(char *s, int s_size, char *out, int out_size) {
+  char c;
+  char *from ;
+  char *to;
+  int copied = 0;
+  int ind = 0;
+
+  from = s;
+  to = out;
+  while (*from || ind < s_size) {
+    if (copied +3 > out_size) { /* need room for two more bytes plus the '\0' */
+      /* null terminate here and return where to resume in s */
+      *to = '\0';
+      return(from);
+    }
+    switch (*from) {
+    case '\t':
+      c = 't'; /* written out below as backslash + 't' */
+      break;
+    default: /* any other byte is copied through unchanged */
+      c = 0;
+      *to = *from;
+      to++;
+      copied++;
+      from++;
+      ind++;
+    }
+    if (c) { /* only set for a tab: emit the two-byte escape */
+      *to = '\\';
+      to++;
+      copied++;
+      *to = c;
+      to++;
+      copied++;
+      from++;
+      ind++;
+    }
+  }
+  *to = '\0';
+  return(NULL);
+}
+
+/*
+  args:
     t    null-terminated title string to be converted
 
   this function converts the supplied page title to its canonical

-- 
To view, visit https://gerrit.wikimedia.org/r/50167
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings

Gerrit-MessageType: merged
Gerrit-Change-Id: Ifa9b4cdbb1015343b77befe7fe1bf1b4931b4381
Gerrit-PatchSet: 1
Gerrit-Project: operations/dumps
Gerrit-Branch: ariel
Gerrit-Owner: ArielGlenn <[email protected]>
Gerrit-Reviewer: ArielGlenn <[email protected]>

_______________________________________________
MediaWiki-commits mailing list
[email protected]
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits

Reply via email to