ArielGlenn has submitted this change and it was merged.
Change subject: actually add sha1.{h,c}, base36.c, sql2txt.c from my local repo
>_<
......................................................................
actually add sha1.{h,c}, base36.c, sql2txt.c from my local repo >_<
Change-Id: Id76d8790fdeaa0c5db8dfa8078adf4f54bda1ab4
---
A xmlfileutils/base36.c
A xmlfileutils/sha1.c
A xmlfileutils/sha1.h
A xmlfileutils/sql2txt.c
4 files changed, 982 insertions(+), 0 deletions(-)
Approvals:
ArielGlenn: Looks good to me, approved
jenkins-bot: Verified
diff --git a/xmlfileutils/base36.c b/xmlfileutils/base36.c
new file mode 100644
index 0000000..4db7275
--- /dev/null
+++ b/xmlfileutils/base36.c
@@ -0,0 +1,199 @@
+#include <stdio.h>
+#include <string.h>
+#include <stdlib.h>
+
+/*
+ these routines are used solely to convert a sha1 string
+ to base 36 for mediawiki revision table entries. grrrrr
+ what a waste
+*/
+
+/*
+ args:
+ in array of ints, 3 bytes per int (leave the upper 4th byte
+ free, it's needed for overflow for multibyte calculations)
+ in_copy pre-allocated array same size as in, which will be
+ altered during the conversion and can be ignored afterwards
+ temp pre-allocated array same size as in, which will be used
+ for temp results during the conversion and can be ignored
+ afterwards
+ in_len length of integer array
+ out pre-allocated array of integers into which the result
+ will be placed, one base-36 digit per int. no checks
+ are made as to the length being sufficient, this is the
+ caller's responsibility, in_len*24/5 + 1 (length needed
+ for base 32) should be enough
+
+ returns:
+ number of base 36 digits in the result
+
+ this function converts an integer array to an array of base 36 digits.
+ the input value is not altered
+ the argument out will contain the result
+*/
+/*
+  Converts a number held as in_len words of 24 bits each (most significant
+  word first) into base-36 digits, most significant digit first, by repeated
+  long division by 36.
+
+  in is not modified; in_copy and temp are scratch space and each needs only
+  in_len entries. out receives the digits; the number of digits is returned.
+  A value of zero (including in_len == 0) yields the single digit 0.
+*/
+int tobase36(unsigned int *in, unsigned int *in_copy, unsigned int *temp, int in_len, unsigned int *out) {
+  unsigned int remainder, current, quotient, swap;
+  int len = 0, quot_len, out_ind = 0;
+  int first = 0, i, lo, hi;
+
+  /* drop leading zero words; the index is tested before the element is read */
+  while (first < in_len && !in[first]) first++;
+  for (i = first; i < in_len; i++) in_copy[len++] = in[i];
+
+  do {
+    /* one pass of long division: in_copy / 36 -> temp, remainder is the next digit */
+    remainder = 0;
+    quot_len = 0;
+    for (i = 0; i < len; i++) {
+      /* remainder < 36 and each word < 2^24, so this stays below 2^30 */
+      current = (remainder << 24) | in_copy[i];
+      quotient = current / 36;
+      remainder = current % 36;
+      /* suppress only LEADING zero words of the quotient; interior zero
+         words carry positional weight and must be kept */
+      if (quot_len || quotient) temp[quot_len++] = quotient;
+    }
+    out[out_ind++] = remainder;
+
+    for (i = 0; i < quot_len; i++) in_copy[i] = temp[i];
+    len = quot_len;
+  } while (len > 0);
+
+  /* digits were produced least significant first; reverse them in place so
+     no scratch buffer larger than in_len is ever needed */
+  for (lo = 0, hi = out_ind - 1; lo < hi; lo++, hi--) {
+    swap = out[lo];
+    out[lo] = out[hi];
+    out[hi] = swap;
+  }
+  return(out_ind);
+}
+
+/*
+ args:
+ c character representing a hex digit, lower case
+
+ returns corresponding integer value
+
+ this function converts a single char (interpreted as
+ a hex digit) to int
+*/
+/*
+  Returns the value (0-15) of the hex digit c, accepting both lower and
+  upper case. Returns -1 if c is not a hex digit; in particular the NUL
+  character is rejected rather than matching the map's terminator.
+*/
+int char2int(char c) {
+  static const char lower[] = "0123456789abcdef";
+  static const char upper[] = "0123456789ABCDEF";
+  const char *hit;
+
+  /* strchr would otherwise "find" '\0' at the end of the map */
+  if (c == '\0') return(-1);
+
+  hit = strchr(lower, c);
+  if (hit) return((int)(hit - lower));
+
+  hit = strchr(upper, c);
+  if (hit) return((int)(hit - upper));
+
+  return(-1);
+}
+
+/*
+ args:
+ s character string representing hex digits
+ len length of s (it does not need to be null-terminated)
+ intbuf pre-allocated array of integers into which the result
+ will be placed, 3 bytes per int. no checks
+ are made as to the length being sufficient, this is the
+ caller's responsibility, strlen(s)/6 + 1 is enough
+
+ returns:
+ length of int buf used
+
+ this function packs an array of characters representing hex digits
+ into an array of ints, 3 bytes per int
+*/
+/*
+  Packs len hex digits from s (most significant first, no terminator needed)
+  into intbuf, 24 bits (6 digits) per word. When len is not a multiple of 6
+  the first word holds the len%6 leading digits; that word is dropped if its
+  value is zero. Returns the number of words written.
+
+  The digits are assumed to be valid hex; they are not validated here.
+*/
+int hexstring2int(char *s, int len, unsigned int *intbuf) {
+  int s_ind = 0, int_ind = 0;
+  int leading, i;
+  unsigned int word;
+
+  intbuf[0] = 0;
+  if (len <= 0) return(0);
+
+  /* short leading group: exactly len%6 digits, counted against a fixed
+     bound (not against a length that shrinks while we read) */
+  leading = len % 6;
+  for (i = 0; i < leading; i++) {
+    intbuf[0] = char2int(s[s_ind++]) | (intbuf[0] << 4);
+  }
+  if (intbuf[0]) int_ind++;
+
+  /* whatever is left is a whole number of 6-digit groups */
+  while (s_ind < len) {
+    word = 0;
+    for (i = 0; i < 6; i++) {
+      word = char2int(s[s_ind++]) | (word << 4);
+    }
+    intbuf[int_ind++] = word;
+  }
+  return(int_ind);
+}
+
+/*
+ args:
+ i integer to convert
+
+ returns:
+ character corresponding to the base-36 value of the int
+
+ this function converts a single integer (of value less than 36) to
+ its character representation
+*/
+/* Looks up the character for one base-36 digit value: 0-9 then a-z. */
+char int2char(int i) {
+  static const char digit_chars[] = "0123456789abcdefghijklmnopqrstuvwxyz";
+
+  return(*(digit_chars + i));
+}
+
+/*
+ args:
+ int_buf array of ints, one base-36 digit per int
+ int_buf_len length of array of ints
+ s pre-allocated buffer into which the representation
+ of the int array will be placed. the string is
+ then null-terminated. no check is made to determine
+ that the buffer is large enough, that is the caller's
+ responsibility, int_buf_len + 1 is enough.
+
+ this function converts an array of integers, each element representing
+ a base-36 value, into a character string representing the array
+ leading 0's in the integer value are omitted
+*/
+/*
+  Renders int_buf (one base-36 digit per entry, most significant first) as a
+  NUL-terminated string in s, which must hold int_buf_len + 1 bytes.
+  Leading zero digits are omitted, but the last digit is always kept so that
+  a value of zero renders as "0". An empty array renders as "".
+*/
+void int2string(unsigned int *int_buf, int int_buf_len, char *s) {
+  int digit_ind = 0, s_ind = 0;
+
+  /* skip leading 0's; the index is checked before the element is read,
+     and the scan stops one short of the end to keep a final digit */
+  while (digit_ind < int_buf_len - 1 && !int_buf[digit_ind]) digit_ind++;
+
+  for (; digit_ind < int_buf_len; digit_ind++) {
+    s[s_ind++] = int2char(int_buf[digit_ind]);
+  }
+  s[s_ind] = '\0';
+  return;
+}
+
+/*
+ typical usage:
+
+int main() {
+ char s_in[41];
+ int s_in_len;
+ unsigned int copy[41];
+ unsigned int temp[41];
+
+ unsigned int num_buf[7];
+ int num_buf_len;
+
+ unsigned int output[34];
+ char s_out[35];
+
+ int out_len;
+
+ strcpy(s_in, "560913458ecab77ad7989fa33fa4e5ddce2b367e");
+ s_in_len = strlen(s_in);
+ num_buf_len = hexstring2int(s_in, s_in_len, num_buf);
+ out_len = tobase36(num_buf, copy, temp, num_buf_len, output);
+ int2string(output, out_len, s_out);
+ fprintf(stderr,"result is %s\n", s_out);
+ exit(0);
+}
+
+*/
diff --git a/xmlfileutils/sha1.c b/xmlfileutils/sha1.c
new file mode 100644
index 0000000..1044441
--- /dev/null
+++ b/xmlfileutils/sha1.c
@@ -0,0 +1,394 @@
+/*
+Christophe Devine
[email protected]
+http://www.cr0.net:8040/code/crypto/
+*/
+/*
+ * FIPS-180-1 compliant SHA-1 implementation
+ *
+ * Copyright (C) 2001-2003 Christophe Devine
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+
+#include <string.h>
+
+#include "sha1.h"
+/* uncomment the following line to run the test suite */
+
+/* #define TEST */
+
+#define GET_UINT32(n,b,i) /* n = 32-bit value read big-endian from bytes b[i]..b[i+3] */ \
+{ \
+ (n) = ( (uint32) (b)[(i) ] << 24 ) \
+ | ( (uint32) (b)[(i) + 1] << 16 ) \
+ | ( (uint32) (b)[(i) + 2] << 8 ) \
+ | ( (uint32) (b)[(i) + 3] ); \
+}
+
+#define PUT_UINT32(n,b,i) /* stores the low 32 bits of n big-endian into b[i]..b[i+3]; the uint8 casts discard higher bits */ \
+{ \
+ (b)[(i) ] = (uint8) ( (n) >> 24 ); \
+ (b)[(i) + 1] = (uint8) ( (n) >> 16 ); \
+ (b)[(i) + 2] = (uint8) ( (n) >> 8 ); \
+ (b)[(i) + 3] = (uint8) ( (n) ); \
+}
+
+/* Resets ctx for a new message: zero byte count, standard SHA-1 initial state. */
+void sha1_starts( sha1_context *ctx )
+{
+    static const uint32 initial_state[5] =
+    {
+        0x67452301, 0xEFCDAB89, 0x98BADCFE, 0x10325476, 0xC3D2E1F0
+    };
+    int word;
+
+    for( word = 0; word < 5; word++ )
+    {
+        ctx->state[word] = initial_state[word];
+    }
+
+    ctx->total[1] = 0;
+    ctx->total[0] = 0;
+}
+
+void sha1_process( sha1_context *ctx, uint8 data[64] ) /* folds one 64-byte block of input into ctx->state (80 rounds) */
+{
+ uint32 temp, W[16], A, B, C, D, E; /* W is a 16-word window reused in place by R(); A..E are the working registers */
+
+ GET_UINT32( W[0], data, 0 ); /* load the block as sixteen big-endian 32-bit words */
+ GET_UINT32( W[1], data, 4 );
+ GET_UINT32( W[2], data, 8 );
+ GET_UINT32( W[3], data, 12 );
+ GET_UINT32( W[4], data, 16 );
+ GET_UINT32( W[5], data, 20 );
+ GET_UINT32( W[6], data, 24 );
+ GET_UINT32( W[7], data, 28 );
+ GET_UINT32( W[8], data, 32 );
+ GET_UINT32( W[9], data, 36 );
+ GET_UINT32( W[10], data, 40 );
+ GET_UINT32( W[11], data, 44 );
+ GET_UINT32( W[12], data, 48 );
+ GET_UINT32( W[13], data, 52 );
+ GET_UINT32( W[14], data, 56 );
+ GET_UINT32( W[15], data, 60 );
+
+#define S(x,n) ((x << n) | ((x & 0xFFFFFFFF) >> (32 - n))) /* 32-bit rotate left by n; the mask is needed because uint32 may be wider than 32 bits */
+
+#define R(t) /* computes schedule word t from earlier words and stores it back into the 16-entry window */ \
+( \
+ temp = W[(t - 3) & 0x0F] ^ W[(t - 8) & 0x0F] ^ \
+ W[(t - 14) & 0x0F] ^ W[ t & 0x0F], \
+ ( W[t & 0x0F] = S(temp,1) ) \
+)
+
+#define P(a,b,c,d,e,x) /* one round: accumulates into e and rotates b; callers permute the register arguments instead of moving values */ \
+{ \
+ e += S(a,5) + F(b,c,d) + K + x; b = S(b,30); \
+}
+
+ A = ctx->state[0]; /* start from the current chaining state */
+ B = ctx->state[1];
+ C = ctx->state[2];
+ D = ctx->state[3];
+ E = ctx->state[4];
+
+#define F(x,y,z) (z ^ (x & (y ^ z))) /* round function for rounds 0-19 */
+#define K 0x5A827999 /* round constant for rounds 0-19 */
+
+ P( A, B, C, D, E, W[0] );
+ P( E, A, B, C, D, W[1] );
+ P( D, E, A, B, C, W[2] );
+ P( C, D, E, A, B, W[3] );
+ P( B, C, D, E, A, W[4] );
+ P( A, B, C, D, E, W[5] );
+ P( E, A, B, C, D, W[6] );
+ P( D, E, A, B, C, W[7] );
+ P( C, D, E, A, B, W[8] );
+ P( B, C, D, E, A, W[9] );
+ P( A, B, C, D, E, W[10] );
+ P( E, A, B, C, D, W[11] );
+ P( D, E, A, B, C, W[12] );
+ P( C, D, E, A, B, W[13] );
+ P( B, C, D, E, A, W[14] );
+ P( A, B, C, D, E, W[15] );
+ P( E, A, B, C, D, R(16) );
+ P( D, E, A, B, C, R(17) );
+ P( C, D, E, A, B, R(18) );
+ P( B, C, D, E, A, R(19) );
+
+#undef K
+#undef F
+
+#define F(x,y,z) (x ^ y ^ z) /* round function for rounds 20-39 */
+#define K 0x6ED9EBA1 /* round constant for rounds 20-39 */
+
+ P( A, B, C, D, E, R(20) );
+ P( E, A, B, C, D, R(21) );
+ P( D, E, A, B, C, R(22) );
+ P( C, D, E, A, B, R(23) );
+ P( B, C, D, E, A, R(24) );
+ P( A, B, C, D, E, R(25) );
+ P( E, A, B, C, D, R(26) );
+ P( D, E, A, B, C, R(27) );
+ P( C, D, E, A, B, R(28) );
+ P( B, C, D, E, A, R(29) );
+ P( A, B, C, D, E, R(30) );
+ P( E, A, B, C, D, R(31) );
+ P( D, E, A, B, C, R(32) );
+ P( C, D, E, A, B, R(33) );
+ P( B, C, D, E, A, R(34) );
+ P( A, B, C, D, E, R(35) );
+ P( E, A, B, C, D, R(36) );
+ P( D, E, A, B, C, R(37) );
+ P( C, D, E, A, B, R(38) );
+ P( B, C, D, E, A, R(39) );
+
+#undef K
+#undef F
+
+#define F(x,y,z) ((x & y) | (z & (x | y))) /* round function for rounds 40-59 */
+#define K 0x8F1BBCDC /* round constant for rounds 40-59 */
+
+ P( A, B, C, D, E, R(40) );
+ P( E, A, B, C, D, R(41) );
+ P( D, E, A, B, C, R(42) );
+ P( C, D, E, A, B, R(43) );
+ P( B, C, D, E, A, R(44) );
+ P( A, B, C, D, E, R(45) );
+ P( E, A, B, C, D, R(46) );
+ P( D, E, A, B, C, R(47) );
+ P( C, D, E, A, B, R(48) );
+ P( B, C, D, E, A, R(49) );
+ P( A, B, C, D, E, R(50) );
+ P( E, A, B, C, D, R(51) );
+ P( D, E, A, B, C, R(52) );
+ P( C, D, E, A, B, R(53) );
+ P( B, C, D, E, A, R(54) );
+ P( A, B, C, D, E, R(55) );
+ P( E, A, B, C, D, R(56) );
+ P( D, E, A, B, C, R(57) );
+ P( C, D, E, A, B, R(58) );
+ P( B, C, D, E, A, R(59) );
+
+#undef K
+#undef F
+
+#define F(x,y,z) (x ^ y ^ z) /* round function for rounds 60-79 */
+#define K 0xCA62C1D6 /* round constant for rounds 60-79 */
+
+ P( A, B, C, D, E, R(60) );
+ P( E, A, B, C, D, R(61) );
+ P( D, E, A, B, C, R(62) );
+ P( C, D, E, A, B, R(63) );
+ P( B, C, D, E, A, R(64) );
+ P( A, B, C, D, E, R(65) );
+ P( E, A, B, C, D, R(66) );
+ P( D, E, A, B, C, R(67) );
+ P( C, D, E, A, B, R(68) );
+ P( B, C, D, E, A, R(69) );
+ P( A, B, C, D, E, R(70) );
+ P( E, A, B, C, D, R(71) );
+ P( D, E, A, B, C, R(72) );
+ P( C, D, E, A, B, R(73) );
+ P( B, C, D, E, A, R(74) );
+ P( A, B, C, D, E, R(75) );
+ P( E, A, B, C, D, R(76) );
+ P( D, E, A, B, C, R(77) );
+ P( C, D, E, A, B, R(78) );
+ P( B, C, D, E, A, R(79) );
+
+#undef K
+#undef F
+
+ ctx->state[0] += A; /* add this block's result into the chaining state; bits above 32 are ignored when the digest is written */
+ ctx->state[1] += B;
+ ctx->state[2] += C;
+ ctx->state[3] += D;
+ ctx->state[4] += E;
+}
+
+/*
+ * Feeds length bytes from input into the hash. Whole 64-byte blocks are
+ * processed immediately; a trailing partial block is kept in ctx->buffer
+ * until a later call (or sha1_finish) completes it.
+ */
+void sha1_update( sha1_context *ctx, uint8 *input, uint32 length )
+{
+    uint32 buffered;    /* bytes already waiting in ctx->buffer */
+    uint32 room;        /* bytes needed to complete that block  */
+
+    if( length == 0 )
+        return;
+
+    buffered = ctx->total[0] & 0x3F;
+    room = 64 - buffered;
+
+    /* 64-bit byte count kept as two 32-bit halves; carry into the high half */
+    ctx->total[0] = ( ctx->total[0] + length ) & 0xFFFFFFFF;
+    if( ctx->total[0] < length )
+        ctx->total[1]++;
+
+    /* top up and flush a previously buffered partial block */
+    if( buffered != 0 && length >= room )
+    {
+        memcpy( (void *) (ctx->buffer + buffered), (void *) input, room );
+        sha1_process( ctx, ctx->buffer );
+        input += room;
+        length -= room;
+        buffered = 0;
+    }
+
+    /* full blocks are hashed straight from the caller's memory */
+    for( ; length >= 64; length -= 64, input += 64 )
+    {
+        sha1_process( ctx, input );
+    }
+
+    /* stash the tail for next time */
+    if( length != 0 )
+    {
+        memcpy( (void *) (ctx->buffer + buffered), (void *) input, length );
+    }
+}
+
+/* Padding source: a single 0x80 byte followed by zeros (the remaining
+   63 elements are zero-initialized by the language). */
+static uint8 sha1_padding[64] = { 0x80 };
+
+/*
+ * Completes the hash: pads the message so that the 8-byte length field ends
+ * a 64-byte block, appends the message length in bits, and writes the five
+ * state words big-endian into digest.
+ */
+void sha1_finish( sha1_context *ctx, uint8 digest[20] )
+{
+    uint8 bit_count[8];
+    uint32 bits_high, bits_low;
+    uint32 used, pad_len;
+    int word;
+
+    /* byte count * 8, taken before the padding updates change the totals */
+    bits_high = ( ctx->total[0] >> 29 ) | ( ctx->total[1] << 3 );
+    bits_low = ( ctx->total[0] << 3 );
+
+    PUT_UINT32( bits_high, bit_count, 0 );
+    PUT_UINT32( bits_low, bit_count, 4 );
+
+    used = ctx->total[0] & 0x3F;
+    if( used < 56 )
+        pad_len = 56 - used;
+    else
+        pad_len = 120 - used;
+
+    sha1_update( ctx, sha1_padding, pad_len );
+    sha1_update( ctx, bit_count, 8 );
+
+    for( word = 0; word < 5; word++ )
+    {
+        PUT_UINT32( ctx->state[word], digest, word * 4 );
+    }
+}
+
+#ifdef TEST
+
+#include <stdlib.h>
+#include <stdio.h>
+
+/*
+ * those are the standard FIPS-180-1 test vectors
+ */
+
+static char *msg[] =
+{
+    "abc",
+    "abcdbcdecdefdefgefghfghighijhijkijkljklmklmnlmnomnopnopq",
+    NULL
+};
+
+static char *val[] =
+{
+    "a9993e364706816aba3e25717850c26c9cd0d89d",
+    "84983e441c3bd26ebaae4aa1f95129e5e54670f1",
+    "34aa973cd4c4daa4f61eeb2bdbad27316534016f"
+};
+
+/*
+ * Test driver, built only when TEST is defined.
+ * With no argument: runs the three validation vectors and returns 1 on the
+ * first mismatch, 0 otherwise.
+ * With an argument: prints the SHA-1 of the named file; returns 1 if the
+ * file cannot be opened or read.
+ */
+int main( int argc, char *argv[] )
+{
+    FILE *f;
+    int i, j;
+    size_t nread;
+    char output[41];
+    sha1_context ctx;
+    unsigned char buf[1000];
+    unsigned char sha1sum[20];
+
+    if( argc < 2 )
+    {
+        printf( "\n SHA-1 Validation Tests:\n\n" );
+
+        for( i = 0; i < 3; i++ )
+        {
+            printf( " Test %d ", i + 1 );
+
+            sha1_starts( &ctx );
+
+            if( i < 2 )
+            {
+                sha1_update( &ctx, (uint8 *) msg[i],
+                             strlen( msg[i] ) );
+            }
+            else
+            {
+                /* third vector: one million 'a' bytes, fed 1000 at a time */
+                memset( buf, 'a', 1000 );
+
+                for( j = 0; j < 1000; j++ )
+                {
+                    sha1_update( &ctx, (uint8 *) buf, 1000 );
+                }
+            }
+
+            sha1_finish( &ctx, sha1sum );
+
+            for( j = 0; j < 20; j++ )
+            {
+                sprintf( output + j * 2, "%02x", sha1sum[j] );
+            }
+
+            if( memcmp( output, val[i], 40 ) )
+            {
+                printf( "failed!\n" );
+                return( 1 );
+            }
+
+            printf( "passed.\n" );
+        }
+
+        printf( "\n" );
+    }
+    else
+    {
+        if( ! ( f = fopen( argv[1], "rb" ) ) )
+        {
+            perror( "fopen" );
+            return( 1 );
+        }
+
+        sha1_starts( &ctx );
+
+        /* fread returns size_t; keep it unsigned rather than squeezing it into an int */
+        while( ( nread = fread( buf, 1, sizeof( buf ), f ) ) > 0 )
+        {
+            sha1_update( &ctx, buf, (uint32) nread );
+        }
+
+        /* a short read can mean an error rather than end of file */
+        if( ferror( f ) )
+        {
+            perror( "fread" );
+            fclose( f );
+            return( 1 );
+        }
+
+        fclose( f );
+
+        sha1_finish( &ctx, sha1sum );
+
+        for( j = 0; j < 20; j++ )
+        {
+            printf( "%02x", sha1sum[j] );
+        }
+
+        printf( " %s\n", argv[1] );
+    }
+
+    return( 0 );
+}
+
+#endif
+
diff --git a/xmlfileutils/sha1.h b/xmlfileutils/sha1.h
new file mode 100644
index 0000000..228a56e
--- /dev/null
+++ b/xmlfileutils/sha1.h
@@ -0,0 +1,29 @@
+/*
+Christophe Devine
[email protected]
+http://www.cr0.net:8040/code/crypto/
+*/
+#ifndef _SHA1_H
+#define _SHA1_H
+
+#ifndef uint8
+#define uint8 unsigned char /* one byte of message or digest data */
+#endif
+
+#ifndef uint32
+#define uint32 unsigned long int /* at least 32 bits and possibly wider, which is why sha1.c masks with 0xFFFFFFFF */
+#endif
+
+typedef struct
+{
+ uint32 total[2]; /* bytes hashed so far: [0] is the low 32 bits, [1] counts carries out of [0] */
+ uint32 state[5]; /* running hash state; becomes the 20-byte digest in sha1_finish */
+ uint8 buffer[64]; /* input bytes that do not yet make up a full 64-byte block */
+}
+sha1_context;
+
+void sha1_starts( sha1_context *ctx ); /* reset ctx before hashing a new message */
+void sha1_update( sha1_context *ctx, uint8 *input, uint32 length ); /* hash length more bytes; may be called repeatedly */
+void sha1_finish( sha1_context *ctx, uint8 digest[20] ); /* pad, append the length and write the 20-byte digest */
+
+#endif /* sha1.h */
diff --git a/xmlfileutils/sql2txt.c b/xmlfileutils/sql2txt.c
new file mode 100644
index 0000000..3d64a82
--- /dev/null
+++ b/xmlfileutils/sql2txt.c
@@ -0,0 +1,360 @@
+#include <stdio.h>
+#include <getopt.h>
+#include <string.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+#include <stdlib.h>
+#include <errno.h>
+#include <sys/types.h>
+#include <regex.h>
+#include <ctype.h>
+#include <inttypes.h>
+#include <time.h>
+#include <bzlib.h>
+#include <zlib.h>
+#include <stdarg.h>
+
+#include "mwxml2sql.h"
+
+int lines_done = 0; /* input lines handled so far; incremented in main, shown by show_error */
+int tuples_done = 0; /* do_tuple calls made so far; incremented in do_line, shown by show_error */
+
+/*
+  Writes one error line to stderr: a prefix carrying the current line and
+  tuple counters, then the printf-style message (or "unknown error" when
+  message is NULL), then a newline.
+*/
+void show_error(char *message, ...) {
+  va_list args;
+
+  va_start(args, message);
+
+  fprintf(stderr,"Error encountered: (%d:%d) ", lines_done, tuples_done);
+  if (message == NULL) {
+    fputs("unknown error", stderr);
+  }
+  else {
+    vfprintf(stderr, message, args);
+  }
+  fputc('\n', stderr);
+
+  va_end(args);
+}
+
+/*
+  args:
+     f               output file
+     start, end      the field text is the bytes [start, end)
+     starting_quote  nonzero to emit a ' before the text
+     ending_quote    nonzero to emit a ' after the text
+     verbose         nonzero to echo the field to stderr
+
+  returns:
+     whatever put_line returns for the assembled field, or -1 without
+     writing anything if the field does not fit the local buffer
+     (NOTE(review): put_line's return convention is not visible here;
+     confirm -1 cannot be mistaken for success)
+
+  this function copies the field, with optional surrounding quotes, into a
+  local buffer and hands it to put_line
+*/
+int write_field(output_file_t *f, char *start, char *end, int starting_quote, int ending_quote, int verbose) {
+  /* the longest text do_field can pass is TEXT_BUF_LEN*2 + 5 bytes; add two
+     quotes and the terminating NUL */
+  char out_buf[TEXT_BUF_LEN*2 + 8];
+  char *ind = out_buf;
+  size_t text_len;
+
+  text_len = (end > start) ? (size_t)(end - start) : 0;
+
+  /* refuse, rather than overrun the stack buffer */
+  if (text_len > sizeof(out_buf) - 3) {
+    show_error("field of %lu bytes is too long to write", (unsigned long)text_len);
+    return(-1);
+  }
+
+  if (starting_quote) {
+    *ind = '\'';
+    ind++;
+  }
+  memcpy(ind, start, text_len);
+  ind += text_len;
+  if (ending_quote) {
+    *ind = '\'';
+    ind++;
+  }
+  *ind = '\0';
+  if (verbose) fprintf(stderr,"put field: <%s>\n", out_buf);
+  return(put_line(f, out_buf));
+}
+
+/*
+  args:
+     sql      input file; its buffer is refilled with get_line when a field
+              runs past the end of the current buffer
+     text     output file
+     start    beginning of a field: optional spaces, then either an opening '
+              or the first character of unquoted data
+     verbose  passed through to write_field
+
+  returns:
+     for a quoted field, a pointer just past the closing '
+     for an unquoted field, a pointer to the ',' or ')' that ended it
+     NULL if the input ended in the middle of the field
+
+  this function scans one field, passes each piece through load_data_escape
+  and writes it with write_field. a backslash and the character after it are
+  stepped over together, so an escaped ' does not end a quoted field. when
+  the buffer ends with a lone backslash it is saved in sql->leftover before
+  the refill.
+*/
+char *do_field(input_file_t *sql, output_file_t *text, char *start, int verbose) {
+  int quoted = 0;
+  char *ind = NULL;
+  int first_write = 1;  /* only the first piece written gets the opening quote */
+  char load_data_escaped_buf[TEXT_BUF_LEN*2 + 6];
+  int donulls = 1;
+
+  while (*start == ' ') start++;
+
+  if (*start == '\'') {
+    quoted++;
+    start+=1;
+  }
+  ind = start;
+  while (1) {
+    if (quoted && *ind == '\'') {
+      /* closing quote: write the last piece and stop just past the quote */
+      load_data_escape(start, ind-start, load_data_escaped_buf, sizeof(load_data_escaped_buf), 0);
+      write_field(text, load_data_escaped_buf, load_data_escaped_buf + strlen(load_data_escaped_buf), first_write&quoted, 1&quoted, verbose);
+      start = ind+1;
+      return(start);
+    }
+    else if (!quoted && (*ind == ',' || *ind == ')' )) {
+      /* unquoted field ends at the separator, which is left for the caller */
+      load_data_escape(start, ind-start, load_data_escaped_buf, sizeof(load_data_escaped_buf), donulls);
+      write_field(text, load_data_escaped_buf, load_data_escaped_buf + strlen(load_data_escaped_buf), first_write&quoted, 1&quoted, verbose);
+      return(ind);
+    }
+    else if (!*ind) {
+      /* buffer exhausted mid-field: flush what we have (no closing quote
+         yet) and carry on in the next buffer */
+      load_data_escape(start, ind-start, load_data_escaped_buf, sizeof(load_data_escaped_buf), donulls);
+      write_field(text, load_data_escaped_buf, load_data_escaped_buf + strlen(load_data_escaped_buf), first_write&quoted, 0, verbose);
+      first_write = 0;
+      if (!get_line(sql)) {
+        show_error("abrupt end to data after or in field %s\n", start);
+        return(NULL);
+      }
+      start = sql->in_buf->content;
+      ind = start;
+    }
+    else {
+      /* move ind along, skipping over escaped characters */
+      if (*ind == '\\') {
+        ind++;
+        if (!*ind) {
+          /* the backslash is the last character in the buffer: hold it
+             back in leftover and write everything before it */
+          sql->leftover[0] = '\\';
+          sql->leftover[1] = '\0';
+          load_data_escape(start, ind-start-1, load_data_escaped_buf, sizeof(load_data_escaped_buf), donulls);
+          write_field(text, load_data_escaped_buf, load_data_escaped_buf + strlen(load_data_escaped_buf), first_write&quoted, 0, verbose);
+          first_write = 0;
+          if (!get_line(sql)) {
+            show_error("abrupt end to data after backslash in field %s\n", start);
+            return(NULL);
+          }
+          start = sql->in_buf->content;
+          ind = start;
+        }
+        else ind++;
+      }
+      else ind++;
+    }
+  }
+  return(NULL);
+}
+
+/* we are at ) and we need to find ( */
+char *find_next_tuple(input_file_t *sql, char *start, int verbose) {
+  /* the search stops at the terminating NUL, so NULL here means the rest of
+     the line holds no further '(' ; sql and verbose are not used */
+  return(strchr(start, '('));
+}
+
+/* if we have a partial line we had better deal with it here, so
+ that when we return to the caller an entire tuple has in fact been
+ processed, with the next piece of the line preloaded into buffer
+ expect *start to be '(' = start of tuple
+*/
+char *do_tuple(input_file_t *sql, output_file_t *text, char *start, int
verbose) { /* converts one "(f1,f2,...)" tuple to tab-separated fields; returns the next '(' on the line, or NULL when there is none or on error */
+ int first = 1; /* cleared after the first field so that separators go only between fields */
+ char buf[2]; /* the separator written between fields */
+
+ buf[0] = '\t';
+ buf[1] = '\0';
+ while (*start == ' ') start++;
+
+ if (*start == '(') start++;
+ else {
+ show_error("expected ( for beginning of tuple, got this: %s\n", start);
+ return(NULL);
+ }
+ if (!*start) { /* buffer ended right after '(': refill before reading fields */
+ if (get_line(sql) == NULL) return(NULL);
+ start = sql->in_buf->content;
+ }
+ while (start && *start) {
+ if (first) first = 0;
+ else {
+ put_line(text, buf);
+ }
+ start = do_field(sql, text, start, verbose);
+ /* we should now be at either ')' or ',', we want to skip to:
+ next ( if there is one, or .. .';' (which should indicate end of line,
+ so expect that)
+ */
+
+ if (!start) { /* do_field returned NULL: try to carry on with the next buffer */
+ if (get_line(sql) == NULL) return(NULL);
+ start = sql->in_buf->content;
+ }
+
+ while (*start == ' ') start++;
+
+ /* if we ran out of data right after a tuple = (xx,yyy,...zzz) then refill the buffer
+ if we run out in the middle of a field do_field will handle that case */
+ if (!*start) {
+ if (get_line(sql) == NULL) return(NULL);
+ start = sql->in_buf->content;
+ }
+ if (*start == ')') { /* end of this tuple: hand back the start of the next one, if any */
+ start = find_next_tuple(sql, start, verbose);
+ return(start);
+ }
+ else if (*start == ',') { /* another field follows */
+ start++;
+ if (!*start) { /* try to refill the buffer */
+ if (get_line(sql) == NULL) return(NULL);
+ start = sql->in_buf->content;
+ }
+ }
+ else {
+ show_error("tuple has unexpected data: <%s>", start);
+ return(NULL);
+ }
+ }
+ return NULL;
+}
+
+/* if we have a partial line we had better deal with it here, so
+ that when we return to the caller an entire line has in fact been processed
+*/
+/*
+  Converts every tuple on the current input line, writing a newline after
+  each one. A line is processed only if it begins with "INSERT " and
+  contains " VALUES (", or begins directly with '(' ; any other line is
+  ignored. Always returns 0.
+*/
+int do_line(input_file_t *sql, output_file_t *text, int verbose) {
+  static const char insert_kw[] = "INSERT ";
+  static const char values_kw[] = " VALUES (";
+  char *start = NULL;
+  char buf[2];
+
+  /* %.3s stops at the terminator, so lines shorter than three characters
+     are not read past their end */
+  if (verbose) fprintf(stderr,"processing line starting <%.3s>\n", sql->in_buf->content);
+
+  /* input may start with INSERT ... VALUES (
+     or simply with a leading (
+     newline means end of tuple or tuples
+     anything else doesn't have tuples so we ignore it
+  */
+  if (!strncmp(sql->in_buf->content, insert_kw, sizeof(insert_kw) - 1)) {
+    start = strstr(sql->in_buf->content, values_kw);
+    if (!start) return(0);  /* INSERT without a data tuple */
+    /* step to the '(' that ends the keyword */
+    start += sizeof(values_kw) - 2;
+  }
+  else if (sql->in_buf->content[0] != '(') {
+    return(0);  /* don't process this line, it doesn't have a data tuple */
+  }
+  else start = sql->in_buf->content;
+
+  buf[0] = '\n';
+  buf[1] = '\0';
+  while (start) {
+    start = do_tuple(sql, text, start, verbose);
+    tuples_done++;
+    put_line(text, buf);
+  }
+
+  /* fixme we should actually capture error returns from do_tuple and
+     return with -1 here; do_tuple returns NULL both at the normal end of a
+     line and on error, so the two cannot be told apart from here */
+  return(0);
+}
+
+/*
+ args:
+ whoami name of calling program
+ message message to print out before usage information, if any
+ this should not end in a newline
+
+ this function prints usage information for the program to stderr and exits
+*/
+/* Prints message (if any) and the usage text to stderr, then exits with
+   status -1; it does not return. */
+void usage(char *whoami, char *message) {
+  if (message) {
+    fprintf(stderr,"%s\n\n",message);
+  }
+  fprintf(stderr,"Usage: %s [--sqlfile filename] [--txtfile filename] [--verbose] [--help]\n", whoami);
+  fprintf(stderr,"\n");
+  fprintf(stderr,"Reads a possibly compressed stream of MySQL INSERT statements and converts\n");
+  fprintf(stderr,"it to tab-separated output suitable for import via LOAD DATA INFILE\n");
+  fprintf(stderr,"\n");
+  fprintf(stderr,"Arguments:\n");
+  fprintf(stderr,"\n");
+  fprintf(stderr,"sqlfile (s): name of sqlfile from which to read INSERT statements; if none\n");
+  fprintf(stderr," is specified, data will be read from stdin. If a filename is\n");
+  fprintf(stderr," specified that ends in .gz or .bz2, the file will silently be\n");
+  fprintf(stderr," decompressed.\n");
+  fprintf(stderr,"txtfile (t): name of file to which to write output; if none is specified,\n");
+  fprintf(stderr," data will be written to stdout. If a filename is specified that\n");
+  fprintf(stderr," ends in .gz or .bz2, the file will be gz or bz2 compressed.\n");
+  fprintf(stderr,"help (h): print this help message and exit\n");
+  fprintf(stderr,"verbose (v): write progress information to stderr.\n");
+  exit(-1);
+}
+
+/*
+  Entry point: parses the options, opens the input (sqlfile or stdin) and
+  the output (txtfile or stdout), then converts the input one line at a
+  time with do_line. Exits 0 on success, 1 if a file cannot be opened or
+  do_line reports an error.
+*/
+int main(int argc, char **argv) {
+  int optindex=0;
+  int optc = 0;
+  int result;
+
+  int help = 0;
+  int verbose = 0;
+
+  char *sql_file = NULL;   /* contains mysql insert commands */
+  char *text_file = NULL;  /* output */
+
+  input_file_t *sql = NULL;
+  output_file_t *text = NULL;
+
+  char *filebase = NULL;
+  char *filesuffix = NULL;
+
+  /* each long option returns the same value as its short form so that the
+     switch below handles both; "txtfile" is the name shown by usage() and
+     "textfile" is kept as an alias */
+  struct option optvalues[] = {
+    {"sqlfile", required_argument, NULL, 's'},
+    {"txtfile", required_argument, NULL, 't'},
+    {"textfile", required_argument, NULL, 't'},
+    {"help", no_argument, NULL, 'h'},
+    {"verbose", no_argument, NULL, 'v'},
+    {NULL, 0, NULL, 0}
+  };
+
+  while (1) {
+    optc=getopt_long(argc,argv,"hs:t:v", optvalues, &optindex);
+    if (optc==-1) break;
+
+    switch(optc) {
+    case 's':
+      sql_file = optarg;
+      break;
+    case 't':
+      text_file = optarg;
+      break;
+    case 'h':
+      help++;
+      break;
+    case 'v':
+      verbose++;
+      break;
+    default:
+      /* usage() adds the newline itself and does not return */
+      usage(argv[0],"unknown option or other error");
+    }
+  }
+
+  if (help) usage(argv[0], NULL);
+
+  sql = init_input_file(sql_file);
+  if (!sql) exit(1);
+
+  if (!text_file)
+    text = init_output_file(NULL, NULL, NULL);
+  else {
+    /* take apart the name if needed and shove in the prefix, then the suffix */
+
+    filebase = get_filebase(text_file, verbose);
+    filesuffix = get_filesuffix(text_file, verbose);
+    text = init_output_file(filebase, filesuffix, NULL);
+  }
+  /* mirrors the check on the input file; assumes init_output_file returns
+     NULL on failure */
+  if (!text) exit(1);
+
+  if (verbose) fprintf(stderr,"Input and output files opened\n");
+
+  while (1) {
+    if (get_line(sql) == NULL) break;
+    result = do_line(sql, text, verbose);
+    if (result) {
+      fprintf(stderr,"error encountered scanning sql file\n");
+      exit(1);
+    }
+    lines_done++;
+    if (verbose && !(lines_done%1000)) fprintf(stderr,"%d lines processed\n", lines_done);
+  }
+
+  /* final count, unless the loop above has just printed it */
+  if (verbose && (lines_done%1000)) fprintf(stderr,"%d lines processed\n", lines_done);
+
+  close_input_file(sql);
+  free_input_file(sql);
+
+  close_output_file(text);
+  free_output_file(text);
+
+  exit(0);
+}
--
To view, visit https://gerrit.wikimedia.org/r/52057
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings
Gerrit-MessageType: merged
Gerrit-Change-Id: Id76d8790fdeaa0c5db8dfa8078adf4f54bda1ab4
Gerrit-PatchSet: 1
Gerrit-Project: operations/dumps
Gerrit-Branch: ariel
Gerrit-Owner: ArielGlenn <[email protected]>
Gerrit-Reviewer: ArielGlenn <[email protected]>
Gerrit-Reviewer: jenkins-bot
_______________________________________________
MediaWiki-commits mailing list
[email protected]
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits