ArielGlenn has uploaded a new change for review.
https://gerrit.wikimedia.org/r/50175
Change subject: build static binaries, makefile fixes, sha1 field fix
......................................................................
build static binaries, makefile fixes, sha1 field fix
- static binary build targets now provided, folks must provide
their own copies of statically built libz.a and libbz2.a
- moved off of dependency on openssl libs, now using Christophe
Devine's sha1 code, updated README to reflect change in dependencies
- sha1 digest must be converted to base36 for db, added this
- added missing files to make dist
Change-Id: I0a2db652602c34f0c901b4ec80b4dff507f845ba
---
M xmlfileutils/Makefile
M xmlfileutils/README
M xmlfileutils/mwxml2sql.h
M xmlfileutils/mwxmlelts.c
4 files changed, 70 insertions(+), 19 deletions(-)
git pull ssh://gerrit.wikimedia.org:29418/operations/dumps
refs/changes/75/50175/1
diff --git a/xmlfileutils/Makefile b/xmlfileutils/Makefile
index b673439..4d27faf 100644
--- a/xmlfileutils/Makefile
+++ b/xmlfileutils/Makefile
@@ -14,13 +14,13 @@
all: mwxml2sql sql2txt
-mwxml2sql: mwxml2sql.o filebuffers.o xmltags.o mwxmlelts.o sqlutils.o
+mwxml2sql: mwxml2sql.o filebuffers.o xmltags.o mwxmlelts.o sqlutils.o base36.o
sha1.o
$(CC) $(CFLAGS) $(LDFLAGS) -o mwxml2sql mwxml2sql.o filebuffers.o \
- xmltags.o mwxmlelts.o sqlutils.o -lssl -lcrypto -lbz2 -lz
+ xmltags.o mwxmlelts.o sqlutils.o base36.o sha1.o -lbz2 -lz
sql2txt: sql2txt.o filebuffers.o sqlutils.o
$(CC) $(CFLAGS) $(LDFLAGS) -o sql2txt sql2txt.o filebuffers.o
sqlutils.o \
- -lcrypto -lbz2 -lz
+ -lbz2 -lz
install: mwxml2sql sql2txt
if ( test ! -d $(PREFIX)/bin ) ; then mkdir -p $(PREFIX)/bin ; fi
@@ -31,6 +31,16 @@
clean:
rm -f *.o *.a mwxml2sql sql2txt
+
+static: mwxml2sql_static sql2txt_static
+
+mwxml2sql_static: mwxml2sql
+ $(CC) $(CFLAGS) -static -static-libgcc -o mwxml2sql_static mwxml2sql.o
filebuffers.o \
+ xmltags.o mwxmlelts.o sqlutils.o base36.o sha1.o -L. -lbz2 -lz
+
+sql2txt_static: sql2txt
+ $(CC) $(CFLAGS) -static -static-libgcc -o sql2txt_static sql2txt.o
filebuffers.o sqlutils.o \
+ sha1.o -L. -lbz2 -lz
mwxml2sql.o: mwxml2sql.c mwxml2sql.h
$(CC) $(CFLAGS) -c mwxml2sql.c
@@ -50,6 +60,12 @@
sqlutils.o: sqlutils.c mwxml2sql.h
$(CC) $(CFLAGS) -c sqlutils.c
+base36.o: base36.c
+ $(CC) $(CFLAGS) -c base36.c
+
+sha1.o: sha1.c sha1.h
+ $(CC) $(CFLAGS) -c sha1.c
+
distclean:
rm -f $(DISTNAME)
rm -f *.tar.gz
@@ -65,6 +81,10 @@
$(DISTNAME)/sql2txt.c \
$(DISTNAME)/sqlutils.c \
$(DISTNAME)/xmltags.c \
+ $(DISTNAME)/mwxmlelts.c \
+ $(DISTNAME)/sha1.h \
+ $(DISTNAME)/sha1.c \
+ $(DISTNAME)/base36.c \
$(DISTNAME)/Makefile \
$(DISTNAME)/COPYING \
$(DISTNAME)/README \
diff --git a/xmlfileutils/README b/xmlfileutils/README
index e6c1397..9f5268d 100644
--- a/xmlfileutils/README
+++ b/xmlfileutils/README
@@ -6,11 +6,10 @@
xml dumps from the Wikimedia projects and that is all
that it's intended to do.
-To install this program, you will need to have the libssl,
-libcrypto, libz and bz2 development libraries installed,
-as well as the gcc toolchain or an equivalent C compiler
-and its supporting libraries. You'll also need the 'make'
-utility.
+To install this program, you will need to have the libz and
+bz2 development libraries installed, as well as the gcc
+toolchain or an equivalent C compiler and its supporting
+libraries. You'll also need the 'make' utility.
This program has been tested only on 64-bit Linux. You can
try building it on other platforms but without any support
@@ -101,3 +100,14 @@
This does NOT support dumps from wikis with LiquidThread enabled.
That's a feature set for a future version.
+
+LICENSE
+
+The files sha1.c and sha1.h are released by Christophe Devine under
+GPLv2 (see the file COPYING in this directory). His web site is
+no longer available and the code has since been folded into many
+other projects but you can find it via archive.org:
+http://web.archive.org/web/20031123112259/http://www.cr0.net:8040/code/crypto/sha1/
+
+The remaining files are copyright Ariel Glenn 2013 and also released
+under the GPLv2 (see again the file COPYING in this directory).
diff --git a/xmlfileutils/mwxml2sql.h b/xmlfileutils/mwxml2sql.h
index 35a715d..f4f4ce1 100644
--- a/xmlfileutils/mwxml2sql.h
+++ b/xmlfileutils/mwxml2sql.h
@@ -18,6 +18,8 @@
#include <zlib.h>
#include <stdarg.h>
+#include "sha1.h"
+
#define VERSION "0.0.1"
#define MAX_TAG_NAME_LEN 256
@@ -251,6 +253,12 @@
char *get_filesuffix(char *file_name, int verbose);
int do_file_header(input_file_t *f, int skipschema, char **schema, siteinfo_t
**s, int verbose);
+int tobase36(unsigned int *in, unsigned int *in_copy, unsigned int *temp, int
in_len, unsigned int *out);
+int char2int(char c);
+int hexstring2int(char *s, int len, unsigned int *intbuf);
+char int2char(int i);
+void int2string(unsigned int *int_buf, int int_buf_len, char *s);
+
static inline int mwv_any_greater(mw_version_t *mwv,int mj,int mn ) {
mw_version_t *head = mwv;
diff --git a/xmlfileutils/mwxmlelts.c b/xmlfileutils/mwxmlelts.c
index e37b7de..1e7e281 100644
--- a/xmlfileutils/mwxmlelts.c
+++ b/xmlfileutils/mwxmlelts.c
@@ -18,6 +18,8 @@
#include "mwxml2sql.h"
+#define SHA_DIGEST_LENGTH 20
+
char page_in_process[MAX_ID_LEN];
int page_rows_written;
int rev_rows_written;
@@ -311,9 +313,9 @@
int todo_length;
char *todo, *todo_new;
int text_length = 0;
- SHA_CTX ctx;
+ sha1_context ctx;
unsigned char sha1[SHA_DIGEST_LENGTH];
- char sha1_string[SHA_DIGEST_LENGTH*2];
+ unsigned char sha1_string[SHA_DIGEST_LENGTH*2 +1];
int i=0;
char *compressed_content = NULL;
int compressed_length = 0;
@@ -322,7 +324,14 @@
char compressed_buf[TEXT_BUF_LEN_PADDED];
char *compressed_ptr = NULL;
- if (get_sha1) SHA1_Init(&ctx);
+ unsigned int sha1_copy[SHA_DIGEST_LENGTH*2 +1];
+ unsigned int sha1_temp[SHA_DIGEST_LENGTH*2 +1];
+ unsigned int sha1_num[SHA_DIGEST_LENGTH/3 +1];
+ int sha1_num_len;
+ unsigned int sha1_b36[SHA_DIGEST_LENGTH*8/5 + 6];
+ int sha1_b36_len;
+
+ if (get_sha1) sha1_starts(&ctx);
ind = strstr(f->in_buf->content, "<text");
if (!ind) return(0);
@@ -362,7 +371,7 @@
if (!endtag) {
leftover = un_xml_escape(ind, raw, 0);
if (get_text_length) text_length+= strlen(raw);
- if (get_sha1) SHA1_Update(&ctx, raw, strlen(raw));
+ if (get_sha1) sha1_update(&ctx, (unsigned char *)raw, strlen(raw));
if (text_compress) {
/* FIXME do something with this return value */
compressed_ptr = gzipit(raw, &compressed_length, compressed_buf,
sizeof(compressed_buf));
@@ -406,7 +415,7 @@
un_xml_escape(ind, raw, 1);
*endtag = '<';
if (get_text_length) text_length+= strlen(raw);
- if (get_sha1) SHA1_Update(&ctx, raw, strlen(raw));
+ if (get_sha1) sha1_update(&ctx, (unsigned char *)raw, strlen(raw));
if (text_compress) {
/* FIXME do something with this return value */
compressed_ptr = gzipit(raw, &compressed_length, compressed_buf,
sizeof(compressed_buf));
@@ -467,12 +476,16 @@
so we don't have to compute it.
*/
if (get_sha1) {
- SHA1_Final(sha1, &ctx);
- /* fixme is this really the best way? look at it later */
- for (i=0; i < SHA_DIGEST_LENGTH; i++) {
- sprintf((char*)&(sha1_string[i*2]), "%02x", sha1[i]);
- }
- sprintf(r->sha1, "%s", sha1);
+ sha1_finish(&ctx, sha1);
+
+ /* base36 conversion, blah */
+ for (i=0; i < SHA_DIGEST_LENGTH; i++)
+ sprintf((char*)&(sha1_string[i*2]), "%02x", sha1[i]);
+
+ /* sha1_num_len = hexstring2int((char *)sha1_string,
SHA_DIGEST_LENGTH*2, sha1_num);*/
+ sha1_num_len = hexstring2int((char *)sha1_string, SHA_DIGEST_LENGTH*2,
sha1_num);
+ sha1_b36_len = tobase36(sha1_num, sha1_copy, sha1_temp, sha1_num_len,
sha1_b36);
+ int2string(sha1_b36, sha1_b36_len, r->sha1);
}
if (verbose > 1) fprintf(stderr,"text info: insert end of line written\n");
--
To view, visit https://gerrit.wikimedia.org/r/50175
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings
Gerrit-MessageType: newchange
Gerrit-Change-Id: I0a2db652602c34f0c901b4ec80b4dff507f845ba
Gerrit-PatchSet: 1
Gerrit-Project: operations/dumps
Gerrit-Branch: ariel
Gerrit-Owner: ArielGlenn <[email protected]>
_______________________________________________
MediaWiki-commits mailing list
[email protected]
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits