[MediaWiki-commits] [Gerrit] actually add sha1.{h, c}, base36.c, sql2txt.c from my local r... - change (operations/dumps)

ArielGlenn (Code Review) Mon, 04 Mar 2013 10:17:20 -0800

ArielGlenn has submitted this change and it was merged.

Change subject: actually add sha1.{h,c}, base36.c, sql2txt.c from my local repo 
>_<
......................................................................



actually add sha1.{h,c}, base36.c, sql2txt.c from my local repo >_<

Change-Id: Id76d8790fdeaa0c5db8dfa8078adf4f54bda1ab4
---
A xmlfileutils/base36.c
A xmlfileutils/sha1.c
A xmlfileutils/sha1.h
A xmlfileutils/sql2txt.c
4 files changed, 982 insertions(+), 0 deletions(-)

Approvals:
  ArielGlenn: Looks good to me, approved
  jenkins-bot: Verified



diff --git a/xmlfileutils/base36.c b/xmlfileutils/base36.c
new file mode 100644
index 0000000..4db7275
--- /dev/null
+++ b/xmlfileutils/base36.c
@@ -0,0 +1,199 @@
+#include <stdio.h>
+#include <string.h>
+#include <stdlib.h>
+
+/*
+  these routines are used solely to convert a sha1 string
+  to base 36 for mediawiki revision table entries. grrrrr
+  what a waste
+*/
+
+/*
+   args:
+     in       array of ints holding the number, most significant word
+              first, 3 bytes (24 bits) per int; the upper 4th byte of
+              each int must be zero, it is needed as headroom for the
+              carry during the multiword division
+     in_copy  pre-allocated array same size as in, which is used as the
+              working copy of the number during the conversion and can
+              be ignored afterwards
+     temp     no longer used; kept so that existing callers still compile
+     in_len   length of integer array
+     out      pre-allocated array of integers into which the result
+              will be placed, one base-36 digit per int, most significant
+              digit first.  no checks are made as to the length being
+              sufficient, this is the caller's responsibility,
+              in_len*24/5 + 1 (the length needed for base 32) is enough
+
+   returns:
+      number of base 36 digits in the result; this is always at least 1,
+      the value zero is returned as the single digit 0
+
+   this function converts an integer array to an array of base 36 digits
+   by dividing the number by 36 over and over and collecting the remainders.
+   the input value is not altered
+   the argument out will contain the result
+*/
+int tobase36(unsigned int *in, unsigned int *in_copy, unsigned int *temp, int in_len, unsigned int *out) {
+  int first = 0;      /* index of the most significant non-zero word */
+  int out_len = 0;
+  int i, j;
+
+  (void) temp;
+
+  for (i = 0; i < in_len; i++) in_copy[i] = in[i];
+
+  /* test the bound before reading so we never look past the array */
+  while (first < in_len && !in_copy[first]) first++;
+
+  while (first < in_len) {
+    unsigned int carry = 0;
+
+    /* one long division of what is left of the number by 36, in place;
+       carry is always < 36 so (carry << 24) | word fits in 30 bits */
+    for (i = first; i < in_len; i++) {
+      unsigned int chunk = (carry << 24) | in_copy[i];
+
+      in_copy[i] = chunk / 36;
+      carry = chunk % 36;
+    }
+    /* the remainder is the next digit, least significant digit first */
+    out[out_len++] = carry;
+    while (first < in_len && !in_copy[first]) first++;
+  }
+
+  /* the value was zero (or the array was empty) */
+  if (!out_len) out[out_len++] = 0;
+
+  /* reverse in place so that the most significant digit comes first */
+  for (i = 0, j = out_len - 1; i < j; i++, j--) {
+    unsigned int swap = out[i];
+
+    out[i] = out[j];
+    out[j] = swap;
+  }
+  return(out_len);
+}
+
+/*
+   args:
+     c       character representing a hex digit, either case
+
+   returns   corresponding integer value (0 through 15), or -1 if c
+             is not a hex digit
+
+   this function converts a single char (interpreted as
+   a hex digit) to int
+*/
+int char2int(char c) {
+  static const char map[] = "0123456789abcdef";
+  const char *hit;
+
+  /* strchr would match the terminating null of map */
+  if (c == '\0') return(-1);
+
+  if (c >= 'A' && c <= 'F') c = (char) (c - 'A' + 'a');
+
+  hit = strchr(map, c);
+  if (!hit) return(-1);
+  return((int) (hit - map));
+}
+
+/*
+  args:
+     s           character string representing hex digits (anything
+                 char2int does not understand gives a garbage result)
+     len         length of s (it does not need to be null-terminated)
+     intbuf      pre-allocated array of integers into which the result
+                 will be placed, 3 bytes per int.  no checks
+                 are made as to the length being sufficient, this is the
+                 caller's responsibility, len/6 + 1 is enough
+
+  returns:
+    length of int buf used
+
+  this function packs an array of characters representing hex digits
+  into an array of ints, 3 bytes (6 hex digits) per int, most significant
+  digits first.  when len is not a multiple of 6 the first int holds the
+  len%6 leading digits; if those digits are all 0 that int is dropped
+*/
+int hexstring2int(char *s, int len, unsigned int *intbuf) {
+  int s_ind = 0, int_ind = 0;
+  int lead;
+  int i;
+  unsigned int word = 0;
+
+  intbuf[0] = 0;
+  if (len <= 0) return(0);
+
+  /* short leading group; count it out up front rather than comparing
+     a rising index against a shrinking length */
+  lead = len % 6;
+  for (i = 0; i < lead; i++) {
+    word = (word << 4) | (unsigned int) char2int(s[s_ind++]);
+  }
+  if (word) intbuf[int_ind++] = word;
+
+  /* what is left is an exact number of 6 digit groups */
+  while (s_ind < len) {
+    word = 0;
+    for (i = 0; i < 6; i++) {
+      word = (word << 4) | (unsigned int) char2int(s[s_ind++]);
+    }
+    intbuf[int_ind++] = word;
+  }
+  return(int_ind);
+}
+
+/*
+  args:
+     i      integer to convert, 0 through 35
+
+  returns:
+     the base-36 digit for i: '0'-'9' for 0-9, 'a'-'z' for 10-35
+
+  this function looks up the character that represents a single
+  base-36 digit value.  i is not range checked
+*/
+char int2char(int i) {
+  static const char digits[] = "0123456789abcdefghijklmnopqrstuvwxyz";
+
+  return(digits[i]);
+}
+
+/*
+   args:
+      int_buf        array of ints, one base-36 digit per int
+      int_buf_len    length of array of ints
+      s              pre-allocated buffer into which the representation
+                     of the int array will be placed.  the string is
+                     then null-terminated. no check is made to determine
+                     that the buffer is large enough, that is the caller's
+                     responsibility, int_buf_len + 1 is enough.
+
+   this function converts an array of integers, each element representing
+   a base-36 value, into a character string representing the array
+   leading 0's in the integer value are omitted, so an array holding
+   nothing but 0's gives the empty string
+*/
+void int2string(unsigned int *int_buf, int int_buf_len, char *s) {
+  int int_buf_ind = 0, s_ind = 0;
+
+  /* skip leading 0's; test the bound first so we never read past the end */
+  while (int_buf_ind < int_buf_len && !int_buf[int_buf_ind]) int_buf_ind++;
+
+  while (int_buf_ind < int_buf_len) {
+    s[s_ind++] = int2char(int_buf[int_buf_ind++]);
+  }
+  s[s_ind] = '\0';
+  return;
+}
+
+/*
+  typical usage: 
+
+int main() {
+  char s_in[41];
+  int s_in_len;
+  unsigned int copy[41];
+  unsigned int temp[41];
+
+  unsigned int num_buf[7];
+  int num_buf_len;
+
+  unsigned int output[34];
+  char s_out[35];
+
+  int out_len;
+
+  strcpy(s_in, "560913458ecab77ad7989fa33fa4e5ddce2b367e");
+  s_in_len = strlen(s_in);
+  num_buf_len = hexstring2int(s_in, s_in_len, num_buf);
+  out_len = tobase36(num_buf, copy, temp, num_buf_len, output);
+  int2string(output, out_len, s_out);
+  fprintf(stderr,"result is %s\n", s_out);
+  exit(0);
+}
+
+*/
diff --git a/xmlfileutils/sha1.c b/xmlfileutils/sha1.c
new file mode 100644
index 0000000..1044441
--- /dev/null
+++ b/xmlfileutils/sha1.c
@@ -0,0 +1,394 @@
+/*
+Christophe Devine 
[email protected]
+http://www.cr0.net:8040/code/crypto/
+*/
+/*
+ *  FIPS-180-1 compliant SHA-1 implementation
+ *
+ *  Copyright (C) 2001-2003  Christophe Devine
+ *
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2 of the License, or
+ *  (at your option) any later version.
+ *
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with this program; if not, write to the Free Software
+ *  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ */
+
+#include <string.h>
+
+#include "sha1.h"
+/* uncomment the following line to run the test suite */
+
+/* #define TEST  */
+
+/* load the 4 bytes b[i]..b[i+3] into n, most significant byte first */
+#define GET_UINT32(n,b,i)                       \
+{                                               \
+    (n) = ( (uint32) (b)[(i)    ] << 24 )       \
+        | ( (uint32) (b)[(i) + 1] << 16 )       \
+        | ( (uint32) (b)[(i) + 2] <<  8 )       \
+        | ( (uint32) (b)[(i) + 3]       );      \
+}
+
+/* store the low 32 bits of n into b[i]..b[i+3], most significant byte
+   first; the cast to uint8 throws away everything above each byte */
+#define PUT_UINT32(n,b,i)                       \
+{                                               \
+    (b)[(i)    ] = (uint8) ( (n) >> 24 );       \
+    (b)[(i) + 1] = (uint8) ( (n) >> 16 );       \
+    (b)[(i) + 2] = (uint8) ( (n) >>  8 );       \
+    (b)[(i) + 3] = (uint8) ( (n)       );       \
+}
+
+/*
+ * reset a context so that a new message can be hashed: the byte
+ * counter is cleared and the five state words are set to their
+ * starting values
+ */
+void sha1_starts( sha1_context *ctx )
+{
+    static const uint32 initial_state[5] =
+    {
+        0x67452301, 0xEFCDAB89, 0x98BADCFE, 0x10325476, 0xC3D2E1F0
+    };
+    int word;
+
+    ctx->total[0] = ctx->total[1] = 0;
+
+    for( word = 0; word < 5; word++ )
+    {
+        ctx->state[word] = initial_state[word];
+    }
+}
+
+/*
+ * mix one 64 byte block of message data into ctx->state.
+ * ctx->total and ctx->buffer are not touched here.
+ */
+void sha1_process( sha1_context *ctx, uint8 data[64] )
+{
+    uint32 temp, W[16], A, B, C, D, E;
+
+    /* read the block as sixteen big-endian 32 bit words */
+    GET_UINT32( W[0],  data,  0 );
+    GET_UINT32( W[1],  data,  4 );
+    GET_UINT32( W[2],  data,  8 );
+    GET_UINT32( W[3],  data, 12 );
+    GET_UINT32( W[4],  data, 16 );
+    GET_UINT32( W[5],  data, 20 );
+    GET_UINT32( W[6],  data, 24 );
+    GET_UINT32( W[7],  data, 28 );
+    GET_UINT32( W[8],  data, 32 );
+    GET_UINT32( W[9],  data, 36 );
+    GET_UINT32( W[10], data, 40 );
+    GET_UINT32( W[11], data, 44 );
+    GET_UINT32( W[12], data, 48 );
+    GET_UINT32( W[13], data, 52 );
+    GET_UINT32( W[14], data, 56 );
+    GET_UINT32( W[15], data, 60 );
+
+/* rotate the low 32 bits of x left by n; x is masked before the right
+   shift so bits above bit 31 (uint32 may be wider than 32 bits) never
+   come back down into the low word */
+#define S(x,n) ((x << n) | ((x & 0xFFFFFFFF) >> (32 - n)))
+
+/* word t of the message schedule for t >= 16, computed in place: W is
+   used as a 16 entry circular buffer indexed by t & 0x0F */
+#define R(t)                                            \
+(                                                       \
+    temp = W[(t -  3) & 0x0F] ^ W[(t - 8) & 0x0F] ^     \
+           W[(t - 14) & 0x0F] ^ W[ t      & 0x0F],      \
+    ( W[t & 0x0F] = S(temp,1) )                         \
+)
+
+/* one round, using whichever F and K are defined at that point.  the
+   five variables are not shuffled; each call below passes them in a
+   rotated order instead */
+#define P(a,b,c,d,e,x)                                  \
+{                                                       \
+    e += S(a,5) + F(b,c,d) + K + x; b = S(b,30);        \
+}
+
+    A = ctx->state[0];
+    B = ctx->state[1];
+    C = ctx->state[2];
+    D = ctx->state[3];
+    E = ctx->state[4];
+
+/* rounds 0-19: bitwise select, x chooses between y and z */
+#define F(x,y,z) (z ^ (x & (y ^ z)))
+#define K 0x5A827999
+
+    P( A, B, C, D, E, W[0]  );
+    P( E, A, B, C, D, W[1]  );
+    P( D, E, A, B, C, W[2]  );
+    P( C, D, E, A, B, W[3]  );
+    P( B, C, D, E, A, W[4]  );
+    P( A, B, C, D, E, W[5]  );
+    P( E, A, B, C, D, W[6]  );
+    P( D, E, A, B, C, W[7]  );
+    P( C, D, E, A, B, W[8]  );
+    P( B, C, D, E, A, W[9]  );
+    P( A, B, C, D, E, W[10] );
+    P( E, A, B, C, D, W[11] );
+    P( D, E, A, B, C, W[12] );
+    P( C, D, E, A, B, W[13] );
+    P( B, C, D, E, A, W[14] );
+    P( A, B, C, D, E, W[15] );
+    P( E, A, B, C, D, R(16) );
+    P( D, E, A, B, C, R(17) );
+    P( C, D, E, A, B, R(18) );
+    P( B, C, D, E, A, R(19) );
+
+#undef K
+#undef F
+
+/* rounds 20-39: xor of the three words */
+#define F(x,y,z) (x ^ y ^ z)
+#define K 0x6ED9EBA1
+
+    P( A, B, C, D, E, R(20) );
+    P( E, A, B, C, D, R(21) );
+    P( D, E, A, B, C, R(22) );
+    P( C, D, E, A, B, R(23) );
+    P( B, C, D, E, A, R(24) );
+    P( A, B, C, D, E, R(25) );
+    P( E, A, B, C, D, R(26) );
+    P( D, E, A, B, C, R(27) );
+    P( C, D, E, A, B, R(28) );
+    P( B, C, D, E, A, R(29) );
+    P( A, B, C, D, E, R(30) );
+    P( E, A, B, C, D, R(31) );
+    P( D, E, A, B, C, R(32) );
+    P( C, D, E, A, B, R(33) );
+    P( B, C, D, E, A, R(34) );
+    P( A, B, C, D, E, R(35) );
+    P( E, A, B, C, D, R(36) );
+    P( D, E, A, B, C, R(37) );
+    P( C, D, E, A, B, R(38) );
+    P( B, C, D, E, A, R(39) );
+
+#undef K
+#undef F
+
+/* rounds 40-59: bitwise majority of the three words */
+#define F(x,y,z) ((x & y) | (z & (x | y)))
+#define K 0x8F1BBCDC
+
+    P( A, B, C, D, E, R(40) );
+    P( E, A, B, C, D, R(41) );
+    P( D, E, A, B, C, R(42) );
+    P( C, D, E, A, B, R(43) );
+    P( B, C, D, E, A, R(44) );
+    P( A, B, C, D, E, R(45) );
+    P( E, A, B, C, D, R(46) );
+    P( D, E, A, B, C, R(47) );
+    P( C, D, E, A, B, R(48) );
+    P( B, C, D, E, A, R(49) );
+    P( A, B, C, D, E, R(50) );
+    P( E, A, B, C, D, R(51) );
+    P( D, E, A, B, C, R(52) );
+    P( C, D, E, A, B, R(53) );
+    P( B, C, D, E, A, R(54) );
+    P( A, B, C, D, E, R(55) );
+    P( E, A, B, C, D, R(56) );
+    P( D, E, A, B, C, R(57) );
+    P( C, D, E, A, B, R(58) );
+    P( B, C, D, E, A, R(59) );
+
+#undef K
+#undef F
+
+/* rounds 60-79: xor again, with the last constant */
+#define F(x,y,z) (x ^ y ^ z)
+#define K 0xCA62C1D6
+
+    P( A, B, C, D, E, R(60) );
+    P( E, A, B, C, D, R(61) );
+    P( D, E, A, B, C, R(62) );
+    P( C, D, E, A, B, R(63) );
+    P( B, C, D, E, A, R(64) );
+    P( A, B, C, D, E, R(65) );
+    P( E, A, B, C, D, R(66) );
+    P( D, E, A, B, C, R(67) );
+    P( C, D, E, A, B, R(68) );
+    P( B, C, D, E, A, R(69) );
+    P( A, B, C, D, E, R(70) );
+    P( E, A, B, C, D, R(71) );
+    P( D, E, A, B, C, R(72) );
+    P( C, D, E, A, B, R(73) );
+    P( B, C, D, E, A, R(74) );
+    P( A, B, C, D, E, R(75) );
+    P( E, A, B, C, D, R(76) );
+    P( D, E, A, B, C, R(77) );
+    P( C, D, E, A, B, R(78) );
+    P( B, C, D, E, A, R(79) );
+
+#undef K
+#undef F
+
+    /* add this block's result into the running state */
+    ctx->state[0] += A;
+    ctx->state[1] += B;
+    ctx->state[2] += C;
+    ctx->state[3] += D;
+    ctx->state[4] += E;
+}
+
+/*
+ * feed length bytes from input into the hash.  complete 64 byte blocks
+ * are processed right away, whatever is left over is kept in
+ * ctx->buffer until more data (or sha1_finish) completes the block.
+ * may be called any number of times between sha1_starts and sha1_finish.
+ */
+void sha1_update( sha1_context *ctx, uint8 *input, uint32 length )
+{
+    uint32 left, fill;
+
+    if( ! length ) return;
+
+    left = ctx->total[0] & 0x3F;   /* bytes already waiting in ctx->buffer */
+    fill = 64 - left;              /* bytes needed to complete that block */
+
+    /* total[0] is kept to 32 bits by hand since uint32 may be wider */
+    ctx->total[0] += length;
+    ctx->total[0] &= 0xFFFFFFFF;
+
+    /* the low word wrapped around, carry into the high word */
+    if( ctx->total[0] < length )
+        ctx->total[1]++;
+
+    /* top up and process the partly filled buffer first */
+    if( left && length >= fill )
+    {
+        memcpy( (void *) (ctx->buffer + left),
+                (void *) input, fill );
+        sha1_process( ctx, ctx->buffer );
+        length -= fill;
+        input  += fill;
+        left = 0;
+    }
+
+    /* whole blocks are processed straight from the caller's memory */
+    while( length >= 64 )
+    {
+        sha1_process( ctx, input );
+        length -= 64;
+        input  += 64;
+    }
+
+    /* keep the tail for next time */
+    if( length )
+    {
+        memcpy( (void *) (ctx->buffer + left),
+                (void *) input, length );
+    }
+}
+
+/* padding bytes: a single 0x80 followed by zeros */
+static uint8 sha1_padding[64] =
+{
+ 0x80, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+};
+
+/*
+ * finish the hash: append the padding and the message length, then
+ * write the 20 byte digest, most significant byte of each state word
+ * first.  ctx is modified by the padding, so call sha1_starts again
+ * before hashing another message with it.
+ */
+void sha1_finish( sha1_context *ctx, uint8 digest[20] )
+{
+    uint32 last, padn;
+    uint32 high, low;
+    uint8 msglen[8];
+
+    /* message length in bits (byte count times 8), split into two
+       32 bit halves; must be taken before the padding is counted */
+    high = ( ctx->total[0] >> 29 )
+         | ( ctx->total[1] <<  3 );
+    low  = ( ctx->total[0] <<  3 );
+
+    PUT_UINT32( high, msglen, 0 );
+    PUT_UINT32( low,  msglen, 4 );
+
+    /* pad up to 56 bytes into a block, going into the next block if
+       needed, so that the 8 length bytes end exactly on a block boundary */
+    last = ctx->total[0] & 0x3F;
+    padn = ( last < 56 ) ? ( 56 - last ) : ( 120 - last );
+
+    sha1_update( ctx, sha1_padding, padn );
+    sha1_update( ctx, msglen, 8 );
+
+    PUT_UINT32( ctx->state[0], digest,  0 );
+    PUT_UINT32( ctx->state[1], digest,  4 );
+    PUT_UINT32( ctx->state[2], digest,  8 );
+    PUT_UINT32( ctx->state[3], digest, 12 );
+    PUT_UINT32( ctx->state[4], digest, 16 );
+}
+
+#ifdef TEST
+
+#include <stdlib.h>
+#include <stdio.h>
+
+/*
+ * those are the standard FIPS-180-1 test vectors.
+ * val[i] is the expected digest of msg[i]; the third digest has no
+ * msg entry, its input (one million times 'a') is generated in main
+ */
+
+static char *msg[] =
+{
+    "abc",
+    "abcdbcdecdefdefgefghfghighijhijkijkljklmklmnlmnomnopnopq",
+    NULL
+};
+
+static char *val[] =
+{
+    "a9993e364706816aba3e25717850c26c9cd0d89d",
+    "84983e441c3bd26ebaae4aa1f95129e5e54670f1",
+    "34aa973cd4c4daa4f61eeb2bdbad27316534016f"
+};
+
+/*
+ * without arguments: run the three validation tests, returning 1 as
+ * soon as one of them fails.
+ * with an argument: print the sha1 of the named file followed by the
+ * file name; returns 1 if the file cannot be opened or read.
+ */
+int main( int argc, char *argv[] )
+{
+    FILE *f;
+    int i, j;
+    size_t got;
+    char output[41];
+    sha1_context ctx;
+    unsigned char buf[1000];
+    unsigned char sha1sum[20];
+
+    if( argc < 2 )
+    {
+        printf( "\n SHA-1 Validation Tests:\n\n" );
+
+        for( i = 0; i < 3; i++ )
+        {
+            printf( " Test %d ", i + 1 );
+
+            sha1_starts( &ctx );
+
+            if( i < 2 )
+            {
+                sha1_update( &ctx, (uint8 *) msg[i],
+                             strlen( msg[i] ) );
+            }
+            else
+            {
+                /* 1000 buffers of 1000 bytes: one million 'a' */
+                memset( buf, 'a', 1000 );
+
+                for( j = 0; j < 1000; j++ )
+                {
+                    sha1_update( &ctx, (uint8 *) buf, 1000 );
+                }
+            }
+
+            sha1_finish( &ctx, sha1sum );
+
+            for( j = 0; j < 20; j++ )
+            {
+                sprintf( output + j * 2, "%02x", sha1sum[j] );
+            }
+
+            if( memcmp( output, val[i], 40 ) )
+            {
+                printf( "failed!\n" );
+                return( 1 );
+            }
+
+            printf( "passed.\n" );
+        }
+
+        printf( "\n" );
+    }
+    else
+    {
+        if( ! ( f = fopen( argv[1], "rb" ) ) )
+        {
+            perror( "fopen" );
+            return( 1 );
+        }
+
+        sha1_starts( &ctx );
+
+        /* fread returns a size_t; keep it in one instead of an int */
+        while( ( got = fread( buf, 1, sizeof( buf ), f ) ) > 0 )
+        {
+            sha1_update( &ctx, buf, (uint32) got );
+        }
+
+        /* a short read is either end of file or an error, and a digest
+           of a partly read file must not be reported as if it were good */
+        if( ferror( f ) )
+        {
+            perror( "fread" );
+            fclose( f );
+            return( 1 );
+        }
+        fclose( f );
+
+        sha1_finish( &ctx, sha1sum );
+
+        for( j = 0; j < 20; j++ )
+        {
+            printf( "%02x", sha1sum[j] );
+        }
+
+        printf( "  %s\n", argv[1] );
+    }
+
+    return( 0 );
+}
+
+#endif
+
diff --git a/xmlfileutils/sha1.h b/xmlfileutils/sha1.h
new file mode 100644
index 0000000..228a56e
--- /dev/null
+++ b/xmlfileutils/sha1.h
@@ -0,0 +1,29 @@
+/*
+Christophe Devine 
[email protected]
+http://www.cr0.net:8040/code/crypto/
+*/
+#ifndef _SHA1_H
+#define _SHA1_H
+
+/* the integer types are macros so that a build can supply its own */
+#ifndef uint8
+#define uint8  unsigned char
+#endif
+
+/* unsigned long is at least 32 bits but may be wider; sha1.c masks
+   with 0xFFFFFFFF where the extra bits would matter */
+#ifndef uint32
+#define uint32 unsigned long int
+#endif
+
+/* state of one hash computation */
+typedef struct
+{
+    uint32 total[2];    /* bytes hashed so far: [0] low 32 bits, [1] high */
+    uint32 state[5];    /* the five running hash words */
+    uint8 buffer[64];   /* input waiting for the rest of its 64 byte block */
+}
+sha1_context;
+
+/* call order: sha1_starts once, sha1_update any number of times,
+   then sha1_finish to get the 20 byte digest */
+void sha1_starts( sha1_context *ctx );
+void sha1_update( sha1_context *ctx, uint8 *input, uint32 length );
+void sha1_finish( sha1_context *ctx, uint8 digest[20] );
+
+#endif /* sha1.h */
diff --git a/xmlfileutils/sql2txt.c b/xmlfileutils/sql2txt.c
new file mode 100644
index 0000000..3d64a82
--- /dev/null
+++ b/xmlfileutils/sql2txt.c
@@ -0,0 +1,360 @@
+#include <stdio.h>
+#include <getopt.h>
+#include <string.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+#include <stdlib.h>
+#include <errno.h>
+#include <sys/types.h>
+#include <regex.h>
+#include <ctype.h>
+#include <inttypes.h>
+#include <time.h>
+#include <bzlib.h>
+#include <zlib.h>
+#include <stdarg.h>
+
+#include "mwxml2sql.h"
+
+/* progress counters, shown in every error message as (lines:tuples) */
+int lines_done = 0;
+int tuples_done = 0;
+
+/*
+   args:
+     message   printf-style format string, or NULL
+     ...       arguments for the format string
+
+   this function writes one error line to stderr: a prefix holding the
+   number of lines and tuples done so far, then the formatted message
+   ("unknown error" if message is NULL), then a newline
+*/
+void show_error(char *message, ...) {
+  va_list args;
+
+  va_start(args, message);
+
+  fprintf(stderr,"Error encountered: (%d:%d) ", lines_done, tuples_done);
+  if (!message) {
+    fprintf(stderr,"unknown error");
+  }
+  else {
+    vfprintf(stderr, message, args);
+  }
+  fprintf(stderr,"\n");
+
+  va_end(args);
+}
+
+/*
+   args:
+     f               output file
+     start           first byte of the text to write
+     end             one past the last byte of the text to write
+     starting_quote  if nonzero, put a ' before the text
+     ending_quote    if nonzero, put a ' after the text
+     verbose         if nonzero, show what is written on stderr
+
+   returns:
+     whatever put_line returns
+
+   this function writes (part of) one field to the output, with the
+   quotes asked for.  text that does not fit in the local buffer is cut
+   short and an error is shown, rather than writing past the buffer
+*/
+int write_field(output_file_t *f, char *start, char *end, int starting_quote, int ending_quote, int verbose) {
+  /* callers pass at most TEXT_BUF_LEN*2 + 5 bytes (the contents of their
+     TEXT_BUF_LEN*2 + 6 byte buffer); add two quotes and the null */
+  char out_buf[TEXT_BUF_LEN*2 + 8];
+  size_t room = sizeof(out_buf) - 3;
+  size_t len = (end > start) ? (size_t) (end - start) : 0;
+  char *ind = out_buf;
+
+  if (len > room) {
+    show_error("field text of %lu bytes is too long to write, cut to %lu bytes", (unsigned long) len, (unsigned long) room);
+    len = room;
+  }
+
+  if (starting_quote) *ind++ = '\'';
+  memcpy(ind, start, len);
+  ind += len;
+  if (ending_quote) *ind++ = '\'';
+  *ind = '\0';
+
+  if (verbose) fprintf(stderr,"put field: <%s>\n", out_buf);
+  return(put_line(f, out_buf));
+}
+
+/*
+   args:
+     sql       input file, its buffer is refilled with get_line as needed
+     text      output file
+     start     beginning of a field in the input buffer
+     verbose   passed on to write_field
+
+   returns:
+     for a quoted field, pointer to the char after the closing quote
+     for an unquoted field, pointer to the , or ) that ended it
+     NULL if the input ran out in the middle of the field
+
+   this function scans one field and writes it to the output in one or
+   more pieces, each piece passed through load_data_escape first.
+   NOTE(review): load_data_escape, get_line and sql->leftover are
+   declared elsewhere; what is said about them below is inferred from
+   how they are used here -- confirm against mwxml2sql.h
+*/
+char *do_field(input_file_t *sql, output_file_t *text, char *start, int 
verbose) {
+  /* should be at the beginning of a field. either a leading ' or the data.
+     our job: read in data, til we get to ..
+     - closing ' if we opened with one
+     - , or ) if there was no open quote
+     end of buffer first, in which case we write out what we have,
+     saving a few chars in case of escapes I guess, move them in etc
+     and refill buffer, then keep going with the above
+     once we get to that or as we get to that we write what we have
+     and put start at uh...the comma or the ) if there is one, else
+     to NULL if we hit eof? blergh
+  */
+  int quoted = 0;
+  char *ind = NULL;
+  int first_write = 1;  /* cleared after the first piece, so that the
+                           opening quote is written only once */
+  char load_data_escaped_buf[TEXT_BUF_LEN*2 + 6];
+  int donulls = 1;      /* last argument to load_data_escape for all but
+                           the end of a quoted field */
+
+  while (*start == ' ') start++;
+
+  if (*start == '\'') {
+    quoted++;
+    start+=1;
+  }
+  ind = start;
+  while (1) {
+    /* closing quote: write the last piece and stop just past the quote */
+    if (quoted && *ind == '\'') {
+      load_data_escape(start, ind-start, load_data_escaped_buf, 
sizeof(load_data_escaped_buf), 0);
+      write_field(text, load_data_escaped_buf, load_data_escaped_buf + 
strlen(load_data_escaped_buf), first_write&&quoted, 1&&quoted, verbose);
+      start = ind+1;
+      return(start);
+    }
+    /* end of an unquoted field: stop on the delimiter itself
+       (quoted is 0 here, so no quotes are written) */
+    else if (!quoted && (*ind == ',' || *ind == ')' )) {
+      load_data_escape(start, ind-start, load_data_escaped_buf, 
sizeof(load_data_escaped_buf), donulls);
+      write_field(text, load_data_escaped_buf, load_data_escaped_buf + 
strlen(load_data_escaped_buf), first_write&&quoted, 1&&quoted, verbose);
+      first_write = 0;
+      start = ind;
+      return(start);
+    }
+    /* end of buffer in the middle of the field: write what we have
+       without a closing quote, refill and carry on from the new buffer */
+    else if (!*ind) {
+      load_data_escape(start, ind-start, load_data_escaped_buf, 
sizeof(load_data_escaped_buf), donulls);
+      write_field(text, load_data_escaped_buf, load_data_escaped_buf + 
strlen(load_data_escaped_buf), first_write&&quoted, 0, verbose);
+      first_write = 0;
+      if (!get_line(sql)) {
+       show_error("abrupt end to data after or in field %s\n", start);
+       return(NULL);
+      }
+      start = sql->in_buf->content;
+      ind = start;
+    }
+    else {
+      /* move ind along, skipping over escaped crap etc. */
+      if (*ind == '\\') {
+       ind++;
+       if (!*ind) {
+         /* the buffer ends right after a backslash: write everything
+            before the backslash and park the backslash in sql->leftover
+            (presumably get_line puts it in front of the next chunk) */
+         sql->leftover[0] = '\\';
+         sql->leftover[1] = '\0';
+         load_data_escape(start, ind-start-1, load_data_escaped_buf, 
sizeof(load_data_escaped_buf), donulls);
+         write_field(text, load_data_escaped_buf, load_data_escaped_buf + 
strlen(load_data_escaped_buf), first_write&&quoted, 0, verbose);
+         first_write = 0;
+         if (!get_line(sql)) {
+           show_error("abrupt end to data after backslash in field %s\n", 
start);
+           return(NULL);
+         }
+         start = sql->in_buf->content;
+         ind = start;
+       }
+       else ind++; /* skip the escaped char, so that \' does not end the field */
+      }
+      else ind++;
+    }
+  }
+  return(NULL);
+}
+
+/*
+   we are at the ) that ends a tuple and we want the ( that starts the
+   next one.  returns a pointer to the first ( at or after start, or
+   NULL if the rest of the line has none.  sql and verbose are not used
+*/
+char *find_next_tuple(input_file_t *sql, char *start, int verbose) {
+  char *cursor;
+
+  for (cursor = start; *cursor != '('; cursor++) {
+    if (*cursor == '\0') return(NULL); /* end of full line */
+  }
+  return(cursor);
+}
+
+/* if we have a partial line we had better deal with it here, so
+   that when we return to the caller an entire tuple has in fact been
+   processed, with the next piece of the line preloaded into buffer
+   expect *start to be '(' = start of tuple
+
+   the fields of the tuple are written to text separated by tabs.
+
+   returns: pointer to the ( of the next tuple on the line, or NULL if
+   there is no further tuple, the input ran out, or the tuple was
+   malformed (an error is shown for a malformed tuple)
+*/
+char *do_tuple(input_file_t *sql, output_file_t *text, char *start, int 
verbose) {
+  int first = 1;  /* no tab in front of the first field */
+  char buf[2];    /* the field separator, as a string for put_line */
+
+  buf[0] = '\t';
+  buf[1] = '\0';
+  while (*start == ' ') start++;
+
+  if (*start == '(') start++;
+  else {
+    show_error("expected ( for beginning of tuple, got this: %s\n", start);
+    return(NULL);
+  }
+  /* the buffer ended right after the ( */
+  if (!*start) {
+    if (get_line(sql) == NULL) return(NULL);
+    start = sql->in_buf->content;
+  }
+  while (start && *start) {
+    if (first) first = 0;
+    else {
+      put_line(text, buf);
+    }
+    start = do_field(sql, text, start, verbose);
+    /* we should now be at either ')' or ',', we want to skip to:
+       next ( if there is one, or .. .';' (which should indicate end of line,
+       so expect that)
+    */
+
+    /* do_field gave up (input ran out): try for more input anyway */
+    if (!start) {
+      if (get_line(sql) == NULL) return(NULL);
+      start = sql->in_buf->content;
+    }
+
+    while (*start == ' ') start++;
+
+    /* if we ran out of data right after a tuple = (xx,yyy,...zzz) then
+       refill the buffer
+       if we run out in the middle of a field do_field will handle that case */
+    if (!*start) {
+      if (get_line(sql) == NULL) return(NULL);
+      start = sql->in_buf->content;
+    }
+    /* end of this tuple: hand back the start of the next one, if any */
+    if (*start == ')') {
+      start = find_next_tuple(sql, start, verbose);
+      return(start);
+    }
+    /* another field follows */
+    else if (*start == ',') {
+      start++;
+      if (!*start) { /* try to refill the buffer */
+       if (get_line(sql) == NULL) return(NULL);
+       start = sql->in_buf->content;
+      }
+    }
+    else {
+      show_error("tuple has unexpected data: <%s>", start);
+      return(NULL);
+    }
+  }
+  return NULL;
+}
+
+/* if we have a partial line we had better deal with it here, so
+   that when we return to the caller an entire line has in fact been
+   processed
+
+   args:
+     sql       input file, with the beginning of a line in its buffer
+     text      output file
+     verbose   if nonzero, show the start of the line on stderr
+
+   returns:
+     0, always (see the fixme at the end)
+
+   this function writes every tuple of one line of input to text, one
+   output line per tuple.  lines without tuples are skipped
+*/
+int do_line(input_file_t *sql, output_file_t *text, int verbose) {
+  static const char insert_kw[] = "INSERT ";
+  static const char values_kw[] = " VALUES (";
+  char *content = sql->in_buf->content;
+  char *start = NULL;
+  char eol[2];
+
+  /* %.3s stops at the end of the string, so a line shorter than three
+     chars does not have us printing whatever follows its terminator */
+  if (verbose) fprintf(stderr,"processing line starting <%.3s>\n", content);
+
+  /* input may start with INSERT ... VALUES (
+     or simply with a leading (
+     newline means end of tuple or tuples
+     anything else doesn't have tuples so we ignore it
+  */
+  if (!strncmp(content, insert_kw, sizeof(insert_kw) - 1)) {
+    start = strstr(content, values_kw);
+    if (!start) return(0);
+    /* step over everything but the ( itself, do_tuple wants to see it */
+    start += sizeof(values_kw) - 2;
+  }
+  else if (content[0] == '(') start = content;
+  else return(0); /* don't process this line, it doesn't have a data tuple */
+
+  eol[0] = '\n';
+  eol[1] = '\0';
+  while (start) {
+    start = do_tuple(sql, text, start, verbose);
+    tuples_done++;
+    put_line(text, eol);
+  }
+
+  /* fixme we should actually capture error returns from do_tuple and
+     return with -1 here */
+  return(0);
+}
+
+/*
+   args:
+     whoami    name of calling program
+     message   message to print out before usage information, if any
+               this should not end in a newline
+
+   this function prints usage information for the program to stderr
+   and then exits with status -1; it does not return
+*/
+void usage(char *whoami, char *message) {
+  if (message) {
+    fprintf(stderr,"%s\n\n",message);
+  }
+  /* the option names shown here must match optvalues in main */
+  fprintf(stderr,"Usage: %s [--sqlfile filename] [--textfile filename] [--verbose] [--help]\n", whoami);
+  fprintf(stderr,"\n");
+  fprintf(stderr,"Reads a possibly compressed stream of MySQL INSERT statements and converts\n");
+  fprintf(stderr,"it to tab-separated output suitable for import via LOAD DATA INFILE\n");
+  fprintf(stderr,"\n");
+  fprintf(stderr,"Arguments:\n");
+  fprintf(stderr,"\n");
+  fprintf(stderr,"sqlfile   (s):   name of sqlfile from which to read INSERT statements; if none\n");
+  fprintf(stderr,"                 is specified, data will be read from stdin.  If a filename is\n");
+  fprintf(stderr,"                 specified that ends in .gz or .bz2, the file will silently be\n");
+  fprintf(stderr,"                 decompressed.\n");
+  fprintf(stderr,"textfile  (t):   name of file to which to write output; if none is specified,\n");
+  fprintf(stderr,"                 data will be written to stdout. If a filename is specified that\n");
+  fprintf(stderr,"                 ends in .gz or .bz2, the file will be gz or bz2 compressed.\n");
+  fprintf(stderr,"help      (h):   print this help message and exit\n");
+  fprintf(stderr,"verbose   (v):   write progress information to stderr.\n");
+  exit(-1);
+}
+
+/*
+   reads INSERT statements from the sql file (or stdin) and writes the
+   tuples in them as tab-separated lines to the text file (or stdout).
+   exits with 0 when all of the input has been read, 1 if a file could
+   not be set up or a line could not be processed, and via usage() for
+   bad options or --help
+*/
+int main(int argc, char **argv) {
+  int optindex=0;
+  int optc = 0;
+  int result;
+
+  int help = 0;
+  int verbose = 0;
+
+  char *sql_file = NULL;  /* contains mysql insert commands */
+  char *text_file = NULL; /* output */
+
+  input_file_t *sql = NULL;
+  output_file_t *text = NULL;
+
+  char *filebase = NULL;
+  char *filesuffix = NULL;
+
+  /* the last member of each entry is what getopt_long returns for the
+     long option, so it has to be a letter that the switch below handles:
+     --sqlfile acts as -s and --textfile as -t */
+  struct option optvalues[] = {
+    {"sqlfile", required_argument, NULL, 's'},
+    {"textfile", required_argument, NULL, 't'},
+    {"help", no_argument, NULL, 'h'},
+    {"verbose", no_argument, NULL, 'v'},
+    {NULL, 0, NULL, 0}
+  };
+
+  while (1) {
+    optc=getopt_long(argc,argv,"hs:t:v", optvalues, &optindex);
+    if (optc==-1) break;
+
+    switch(optc) {
+    case 's':
+      sql_file = optarg;
+      break;
+    case 't':
+      text_file = optarg;
+      break;
+    case 'h':
+      help++;
+      break;
+    case 'v':
+      verbose++;
+      break;
+    default:
+      /* no newline at the end, usage adds its own */
+      usage(argv[0],"unknown option or other error");
+    }
+  }
+
+  if (help) usage(argv[0], NULL);
+
+  sql = init_input_file(sql_file);
+  if (!sql) exit(1);
+
+  if (!text_file)
+    text = init_output_file(NULL, NULL, NULL);
+  else {
+    /* take apart the name if needed and shove in the prefix, then the suffix */
+
+    filebase = get_filebase(text_file, verbose);
+    filesuffix = get_filesuffix(text_file, verbose);
+    text = init_output_file(filebase, filesuffix, NULL);
+  }
+  /* same check as for the input file; assumes init_output_file returns
+     NULL when it fails, as init_input_file is taken to do above */
+  if (!text) exit(1);
+
+  if (verbose) fprintf(stderr,"Input and output files opened\n");
+
+  while (1) {
+    if (get_line(sql) == NULL) break;
+    result = do_line(sql, text, verbose);
+    if (result) {
+      fprintf(stderr,"error encountered scanning sql file\n");
+      exit(1);
+    }
+    lines_done++;
+    if (verbose && !(lines_done%1000)) fprintf(stderr,"%d lines processed\n", lines_done);
+  }
+
+  /* final count, unless the loop above has just printed it */
+  if (verbose && (lines_done%1000)) fprintf(stderr,"%d lines processed\n", lines_done);
+
+  close_input_file(sql);
+  free_input_file(sql);
+
+  close_output_file(text);
+  free_output_file(text);
+
+  exit(0);
+}

-- 
To view, visit https://gerrit.wikimedia.org/r/52057
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings

Gerrit-MessageType: merged
Gerrit-Change-Id: Id76d8790fdeaa0c5db8dfa8078adf4f54bda1ab4
Gerrit-PatchSet: 1
Gerrit-Project: operations/dumps
Gerrit-Branch: ariel
Gerrit-Owner: ArielGlenn <[email protected]>
Gerrit-Reviewer: ArielGlenn <[email protected]>
Gerrit-Reviewer: jenkins-bot

_______________________________________________
MediaWiki-commits mailing list
[email protected]
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits

[MediaWiki-commits] [Gerrit] actually add sha1.{h, c}, base36.c, sql2txt.c from my local r... - change (operations/dumps)

Reply via email to