ArielGlenn has uploaded a new change for review.
https://gerrit.wikimedia.org/r/72005
Change subject: mwbzutils: clean up makefile and source in prep for debian
packaging
......................................................................
mwbzutils: clean up makefile and source in prep for debian packaging
* cleanup install, add deinstall targets
* make distclean actually do that
* generate man pages with help2man
* add or redo all usage messages to conform with help2man
* add version and copyright info to all programs
Change-Id: Id7ddd9edb5b2e22f896166a23cf49d28a010007b
---
M xmldumps-backup/mwbzutils/Makefile
M xmldumps-backup/mwbzutils/checkforbz2footer.c
M xmldumps-backup/mwbzutils/dumpbz2filefromoffset.c
M xmldumps-backup/mwbzutils/dumplastbz2block.c
M xmldumps-backup/mwbzutils/findpageidinbz2xml.c
M xmldumps-backup/mwbzutils/recompressxml.c
M xmldumps-backup/mwbzutils/writeuptopageid.c
7 files changed, 411 insertions(+), 155 deletions(-)
git pull ssh://gerrit.wikimedia.org:29418/operations/dumps
refs/changes/05/72005/1
diff --git a/xmldumps-backup/mwbzutils/Makefile
b/xmldumps-backup/mwbzutils/Makefile
index 5fcd560..f3a3c44 100644
--- a/xmldumps-backup/mwbzutils/Makefile
+++ b/xmldumps-backup/mwbzutils/Makefile
@@ -16,22 +16,38 @@
# 2010-2010: see the file COPYING for details.
# ------------------------------------------------------------------
-CC=gcc
-LDFLAGS=
-BIGFILES=-D_FILE_OFFSET_BITS=64
-CFLAGS=-Wall -Winline -O2 -g $(BIGFILES)
-PREFIX=/usr/local
+VERSION = "0.0.3"
+CC = gcc
+LDFLAGS =
+BIGFILES = -D_FILE_OFFSET_BITS=64
+CFLAGS = -Wall -Winline -O2 -g $(BIGFILES) -DVERSION=\"$(VERSION)\"
-SHELL=/bin/sh
+# Build all of the utilities; the man pages are added to this target by the
+# second 'build' rule further down.
+build: checkforbz2footer dumpbz2filefromoffset \
+ dumplastbz2block findpageidinbz2xml \
+ recompressxml writeuptopageid
-OBJSBZ= bzlibfuncs.o
+NAME_CHECKFORBZ2FOOTER = "Check if bzip2 file ends with bz2 magic footer"
+NAME_DUMPBZ2FILEFROMOFFSET = "Write MediaWiki XML pages from bzip2 file
starting from offset"
+NAME_DUMPLASTBZ2BLOCK = "Find last bz2 block in bzip2 file and dump
contents"
+NAME_FINDPAGEIDINBZ2XML = "Display offset of bz2 block for given page id
in bzip2 MediaWiki XML file"
+NAME_RECOMPRESSXML = "Bz2 compress MediaWiki XML input in batches of
pages"
+NAME_WRITEUPTOPAGEID = "Write range of page content from MediaWiki XML
input"
-all: checkforbz2footer \
- dumpbz2filefromoffset \
- dumplastbz2block \
- findpageidinbz2xml \
- recompressxml \
- writeuptopageid
+# Installation prefix; override on the command line (e.g. PREFIX=/usr) when
+# packaging. DESTDIR, when set, is prepended to stage the install tree.
+PREFIX = /usr/local
+BINDIR = $(DESTDIR)$(PREFIX)/bin/
+MANDIR = $(DESTDIR)$(PREFIX)/share/man/man1/
+
+GZIP = /bin/gzip
+HELP2MAN = /usr/bin/help2man
+SHELL = /bin/sh
+
+DISTNAME = mwbzutils-$(VERSION)
+
+OBJSBZ = bzlibfuncs.o
+
+build: checkforbz2footer dumpbz2filefromoffset \
+ dumplastbz2block findpageidinbz2xml \
+ recompressxml writeuptopageid \
+ manpages
dumplastbz2block: $(OBJSBZ) mwbzlib.o dumplastbz2block.o
$(CC) $(CFLAGS) $(LDFLAGS) -o dumplastbz2block dumplastbz2block.o
mwbzlib.o $(OBJSBZ) -lbz2
@@ -51,25 +67,61 @@
writeuptopage: $(OBJSBZ) writeuptopageid.o
$(CC) $(CFLAGS) $(LDFLAGS) -o writeuptopageid writeuptopageid.o -lbz2
+manpages: dumplastbz2block.1.gz findpageidinbz2xml.1.gz \
+ checkforbz2footer.1.gz dumpbz2filefromoffset.1.gz \
+ recompressxml.1.gz writeuptopageid.1.gz
+
+dumplastbz2block.1.gz : dumplastbz2block
+ $(HELP2MAN) --section 1 --no-info --name $(NAME_DUMPLASTBZ2BLOCK) \
+ --no-discard-stderr ./dumplastbz2block | $(GZIP) >
dumplastbz2block.1.gz
+findpageidinbz2xml.1.gz : findpageidinbz2xml
+ $(HELP2MAN) --section 1 --no-info --name $(NAME_FINDPAGEIDINBZ2XML) \
+ --no-discard-stderr ./findpageidinbz2xml | $(GZIP) >
findpageidinbz2xml.1.gz
+checkforbz2footer.1.gz : checkforbz2footer
+ $(HELP2MAN) --section 1 --no-info --name $(NAME_CHECKFORBZ2FOOTER) \
+ --no-discard-stderr ./checkforbz2footer | $(GZIP) >
checkforbz2footer.1.gz
+dumpbz2filefromoffset.1.gz : dumpbz2filefromoffset
+ $(HELP2MAN) --section 1 --no-info --name $(NAME_DUMPBZ2FILEFROMOFFSET) \
+ --no-discard-stderr ./dumpbz2filefromoffset | $(GZIP) >
dumpbz2filefromoffset.1.gz
+recompressxml.1.gz : recompressxml
+ $(HELP2MAN) --section 1 --no-info --name $(NAME_RECOMPRESSXML) \
+ --no-discard-stderr ./recompressxml | $(GZIP) >
recompressxml.1.gz
+writeuptopageid.1.gz : writeuptopageid
+ $(HELP2MAN) --section 1 --no-info --name $(NAME_WRITEUPTOPAGEID) \
+ --no-discard-stderr ./writeuptopageid | $(GZIP) >
writeuptopageid.1.gz
+
+
+# Install the programs into BINDIR and the compressed man pages into MANDIR.
+# The man pages are generated files, so make sure they exist before copying.
+install: manpages
install: dumplastbz2block findpageidinbz2xml checkforbz2footer
dumpbz2filefromoffset recompressxml writeuptopageid
- if ( test ! -d $(PREFIX)/bin ) ; then mkdir -p $(PREFIX)/bin ; fi
- cp -f dumplastbz2block $(PREFIX)/bin/dumplastbz2block
- cp -f findpageidinbz2xml $(PREFIX)/bin/findpageidinbz2xml
- cp -f checkforbz2footer $(PREFIX)/bin/checkforbz2footer
- cp -f dumpbz2filefromoffset $(PREFIX)/bin/dumpbz2filefromoffset
- cp -f recompressxml $(PREFIX)/bin/recompressxml
- cp -f writeuptopageid $(PREFIX)/bin/writeuptopageid
- chmod a+x $(PREFIX)/bin/dumplastbz2block
- chmod a+x $(PREFIX)/bin/findpageidinbz2xml
- chmod a+x $(PREFIX)/bin/checkforbz2footer
- chmod a+x $(PREFIX)/bin/dumpbz2filefromoffset
- chmod a+x $(PREFIX)/bin/recompressxml
- chmod a+x $(PREFIX)/bin/writeuptopageid
+ install --directory $(BINDIR)
+ install --mode=755 dumplastbz2block $(BINDIR)
+ install --mode=755 findpageidinbz2xml $(BINDIR)
+ install --mode=755 checkforbz2footer $(BINDIR)
+ install --mode=755 dumpbz2filefromoffset $(BINDIR)
+ install --mode=755 recompressxml $(BINDIR)
+ install --mode=755 writeuptopageid $(BINDIR)
+ install --directory $(MANDIR)
+ install --mode=644 dumplastbz2block.1.gz $(MANDIR)
+ install --mode=644 findpageidinbz2xml.1.gz $(MANDIR)
+ install --mode=644 checkforbz2footer.1.gz $(MANDIR)
+ install --mode=644 dumpbz2filefromoffset.1.gz $(MANDIR)
+ install --mode=644 recompressxml.1.gz $(MANDIR)
+ install --mode=644 writeuptopageid.1.gz $(MANDIR)
+
+# Remove the programs from BINDIR and their man pages from MANDIR.
+deinstall:
+ rm -f $(BINDIR)dumplastbz2block
+ rm -f $(BINDIR)findpageidinbz2xml
+ rm -f $(BINDIR)checkforbz2footer
+ rm -f $(BINDIR)dumpbz2filefromoffset
+ rm -f $(BINDIR)recompressxml
+ rm -f $(BINDIR)writeuptopageid
+ rm -f $(MANDIR)dumplastbz2block.1.gz
+ rm -f $(MANDIR)findpageidinbz2xml.1.gz
+ rm -f $(MANDIR)checkforbz2footer.1.gz
+ rm -f $(MANDIR)dumpbz2filefromoffset.1.gz
+ rm -f $(MANDIR)recompressxml.1.gz
+ rm -f $(MANDIR)writeuptopageid.1.gz
clean:
rm -f *.o *.a dumplastbz2block findpageidinbz2xml \
checkforbz2footer dumpbz2filefromoffset \
recompressxml writeuptopageid
+ rm -f dumplastbz2block.1.gz findpageidinbz2xml.1.gz \
+ checkforbz2footer.1.gz dumpbz2filefromoffset.1.gz \
+ recompressxml.1.gz writeuptopageid.1.gz
bzlibfuncs.o: bzlibfuncs.c bzlib.h bzlib_private.h
$(CC) $(CFLAGS) -c bzlibfuncs.c
@@ -90,9 +142,10 @@
writeuptopageid.o: writeuptopageid.c
$(CC) $(CFLAGS) -c writeuptopageid.c
+# Remove everything 'clean' removes, plus the $(DISTNAME) link created by
+# 'dist' and any *.tar.gz files.
distclean: clean
+ rm -f $(DISTNAME)
+ rm -f *.tar.gz
-DISTNAME=mwbzutils-0.0.3
dist:
rm -f $(DISTNAME)
ln -s -f . $(DISTNAME)
diff --git a/xmldumps-backup/mwbzutils/checkforbz2footer.c
b/xmldumps-backup/mwbzutils/checkforbz2footer.c
index 7ff9f7e..b6ad199 100644
--- a/xmldumps-backup/mwbzutils/checkforbz2footer.c
+++ b/xmldumps-backup/mwbzutils/checkforbz2footer.c
@@ -8,20 +8,52 @@
#include <errno.h>
#include "mwbzutils.h"
-/*
- Check to see whether a file ends with a bz2 footer or not
- (i.e. if it is truncated or corrupted).
- This is a crude but fast test for integrity; we don't
- check the CRC at the end of fthe stream, nor do we check the
- bit padding in the last byte of the file.
+/*
+ * Write the optional message followed by the full help text to stderr,
+ * then exit with status -1.  Pass NULL to print the help text alone.
+ */
+void usage(char *message) {
+ char * help =
+"Usage: checkforbz2footer [--version|--help]\n"
+" or: checkforbz2footer <infile>\n\n"
+"Check whether the specified bzip2 compressed file ends with a bz2 footer\n"
+"or not (i.e. if it is truncated or corrupted).\n"
+"This is a crude but fast test for integrity; we don't check the CRC at\n"
+"the end of the stream, nor do we check the bit padding in the last byte\n"
+"of the file.\n\n"
+"Exits with 0 if the file has the bz2 footer, 1 if the file does not have\n"
+"the footer and -1 on error.\n\n"
+"Options:\n\n"
+"Flags:\n\n"
+" -h, --help Show this help message\n"
+" -v, --version Display the version of this program and exit\n\n"
+"Arguments:\n\n"
+" <infile> Name of the file to check\n\n"
+"Report bugs in checkforbz2footer to <https://bugzilla.wikimedia.org/>.\n\n"
+"See also:\n\n"
+" dumpbz2filefromoffset(1), dumplastbz2block(1), findpageidinbz2xml(1)\n"
+" recompressxml(1), writeuptopageid(1)\n\n";
+ if (message) {
+ fprintf(stderr,"%s\n\n",message);
+ }
+ fprintf(stderr,"%s",help);
+ exit(-1);
+}
- Arguments: the name of the file to check, presumably
- a bzipped file.
- Outputs: none.
- Exits with 0 if the file contains the footer at the end,
- -1 if the file does not contain the footer or there is an error.
-*/
-
+/*
+ * Print the program name, the given version string and the license notice
+ * to stdout, then exit successfully (the usual behaviour for --version).
+ */
+void show_version(char *version_string) {
+ char * copyright =
+"Copyright (C) 2011, 2012, 2013 Ariel T. Glenn. All rights reserved.\n\n"
+"This program is free software: you can redistribute it and/or modify it\n"
+"under the terms of the GNU General Public License as published by the\n"
+"Free Software Foundation, either version 2 of the License, or (at your\n"
+"option) any later version.\n\n"
+"This program is distributed in the hope that it will be useful, but\n"
+"WITHOUT ANY WARRANTY; without even the implied warranty of \n"
+"MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General\n"
+"Public License for more details.\n\n"
+"You should have received a copy of the GNU General Public License along\n"
+"with this program. If not, see <http://www.gnu.org/licenses/>\n\n"
+ "Written by Ariel T. Glenn.\n";
+ fprintf(stdout,"checkforbz2footer %s\n", version_string);
+ fprintf(stdout,"%s",copyright);
+ exit(0);
+}
int main(int argc, char **argv) {
@@ -30,9 +62,13 @@
bz_info_t bfile;
if (argc != 2) {
- fprintf(stderr,"usage: %s infile\n", argv[0]);
+ usage("Missing option or argument.");
exit(-1);
}
+
+ if (!strcmp(argv[1], "--help") || !strcmp(argv[1], "-h")) usage(NULL);
+ if (!strcmp(argv[1], "--version") || !strcmp(argv[1], "-v"))
show_version(VERSION);
+
fin = open (argv[1], O_RDONLY);
if (fin < 0) {
fprintf(stderr,"failed to open file %s for read\n", argv[1]);
diff --git a/xmldumps-backup/mwbzutils/dumpbz2filefromoffset.c
b/xmldumps-backup/mwbzutils/dumpbz2filefromoffset.c
index 5066bb9..03b2b9b 100644
--- a/xmldumps-backup/mwbzutils/dumpbz2filefromoffset.c
+++ b/xmldumps-backup/mwbzutils/dumpbz2filefromoffset.c
@@ -10,8 +10,56 @@
#include <regex.h>
#include "mwbzutils.h"
+void usage(char *message) {
+ char * help =
+"Usage: dumpbz2filefromoffset [--version|--help]\n"
+" or: dumpbz2filefromoffset <infile> <offset>\n\n"
+"Find the first bz2 block in a file after the specified offset, uncompress\n"
+"and write contents from that point on to stdout, starting with the first\n"
+"<page> tag encountered.\n\n"
+"The starting <mediawiki> tag and the <siteinfo> header from the file will\n"
+"be written out first.\n\n"
+"Note that some bytes from the very last block may be lost if the blocks are\n"
+"not byte-aligned. This is due to the bzip2 crc at the eof being wrong.\n\n"
+"Exits with BZ_OK on success, various BZ_ errors otherwise.\n\n"
+"Options:\n\n"
+"Flags:\n\n"
+" -h, --help Show this help message\n"
+" -v, --version Display the version of this program and exit\n\n"
+"Arguments:\n\n"
+" <infile> Name of the file to check\n"
+" <offset> byte in the file from which to start processing\n\n"
+"Report bugs in dumpbz2filefromoffset to
<https://bugzilla.wikimedia.org/>.\n\n"
+"See also checkforbz2footer(1), dumplastbz2block(1), findpageidinbz2xml(1),\n"
+ "recompressxml(1), writeuptopageid(1)\n\n";
+ if (message) {
+ fprintf(stderr,"%s\n\n",message);
+ }
+ fprintf(stderr,"%s",help);
+ exit(-1);
+}
+
+/*
+ * Print the program name, the given version string and the license notice
+ * to stdout, then exit successfully (the usual behaviour for --version).
+ */
+void show_version(char *version_string) {
+ char * copyright =
+"Copyright (C) 2011, 2012, 2013 Ariel T. Glenn. All rights reserved.\n\n"
+"This program is free software: you can redistribute it and/or modify it\n"
+"under the terms of the GNU General Public License as published by the\n"
+"Free Software Foundation, either version 2 of the License, or (at your\n"
+"option) any later version.\n\n"
+"This program is distributed in the hope that it will be useful, but\n"
+"WITHOUT ANY WARRANTY; without even the implied warranty of \n"
+"MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General\n"
+"Public License for more details.\n\n"
+"You should have received a copy of the GNU General Public License along\n"
+"with this program. If not, see <http://www.gnu.org/licenses/>\n\n"
+ "Written by Ariel T. Glenn.\n";
+ fprintf(stdout,"dumpbz2filefromoffset %s\n", version_string);
+ fprintf(stdout,"%s",copyright);
+ exit(0);
+}
+
/*
- dump the <meadiawiki> header (up through
+ dump the <mediawiki> header (up through
</siteinfo> close tag) found at the
beginning of xml dump files.
returns:
@@ -206,37 +254,18 @@
return(0);
}
-/*
- find the first bz2 block after the specified offset,
- uncompress from that point on, write out the
- contents starting with the first <page> tag,
- prefacing first with the <mediawiki> header from
- the beginning of the file, up through </siteinfo>.
-
- note that we may lose some bytes from the very last
- block if the blocks are bit shifted, because the
- bzip crc at end of file will be wrong. (needs testing to
- find a workaround, simply not feeding in the crc doesn't
- suffice)
-
- for purposes of the XML dumps this is fine, since we use
- this tool to generate prefetch data starting from
- a given pageid, rather than needing to uncompress
- gigabytes of data to get to the point in the file
- we want.
-
- returns:
- BZ_OK on success, various BZ_ errors otherwise.
-*/
int main(int argc, char **argv) {
int fin, res;
off_t position;
- if (argc != 3) {
- fprintf(stderr,"usage: %s infile position\n", argv[0]);
+ if (argc < 2 || argc > 3) {
+ usage("Missing or bad options/arguments");
exit(-1);
}
+ if (!strcmp(argv[1], "--help") || !strcmp(argv[1], "-h")) usage(NULL);
+ if (!strcmp(argv[1], "--version") || !strcmp(argv[1], "-v"))
show_version(VERSION);
+
fin = open (argv[1], O_RDONLY);
if (fin < 0) {
fprintf(stderr,"failed to open file %s for read\n", argv[1]);
diff --git a/xmldumps-backup/mwbzutils/dumplastbz2block.c
b/xmldumps-backup/mwbzutils/dumplastbz2block.c
index 34d5601..ab441ad 100644
--- a/xmldumps-backup/mwbzutils/dumplastbz2block.c
+++ b/xmldumps-backup/mwbzutils/dumplastbz2block.c
@@ -9,22 +9,52 @@
#include <inttypes.h>
#include "mwbzutils.h"
+void usage(char *message) {
+ char * help =
+"Usage: dumplastbz2block [--version|--help]\n"
+" or: dumplastbz2block <infile>\n\n"
+"Find the last bz2 block marker in a file and dump whatever can be\n"
+"decompressed after that point. The header of the file must be intact\n"
+"in order for any output to be produced.\n"
+"This will produce output for truncated files as well, as long as there\n"
+"is 'enough' data after the block marker.\n"
+"Exits with 0 if some decompressed data was written, 1 if no data could\n"
+"be uncompressed and -1 on error.\n\n"
+"Options:\n\n"
+"Flags:\n\n"
+" -h, --help Show this help message\n"
+" -v, --version Display the version of this program and exit\n\n"
+"Arguments:\n\n"
+" <infile> Name of the file to process\n\n"
+"Report bugs in dumplastbz2block to <https://bugzilla.wikimedia.org/>.\n\n"
+"See also checkforbz2footer(1), dumpbz2filefromoffset(1),
findpageidinbz2xml(1),\n"
+"recompressxml(1), writeuptopageid(1)\n\n";
+ if (message) {
+ fprintf(stderr,"%s\n\n",message);
+ }
+ fprintf(stderr,"%s",help);
+ exit(-1);
+}
-/*
- Find the last bz2 block marker in a file
- and dump whatever can be decompressed after
- that point. The header of the file must
- be intact in order for any output to be produced.
- This will produce output for truncated files as well,
- as long as there is "enough" data after the block
- marker.
+/*
+ * Print the program name, the given version string and the license notice
+ * to stdout, then exit successfully (the usual behaviour for --version).
+ */
+void show_version(char *version_string) {
+ char * copyright =
+"Copyright (C) 2011, 2012, 2013 Ariel T. Glenn. All rights reserved.\n\n"
+"This program is free software: you can redistribute it and/or modify it\n"
+"under the terms of the GNU General Public License as published by the\n"
+"Free Software Foundation, either version 2 of the License, or (at your\n"
+"option) any later version.\n\n"
+"This program is distributed in the hope that it will be useful, but\n"
+"WITHOUT ANY WARRANTY; without even the implied warranty of \n"
+"MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General\n"
+"Public License for more details.\n\n"
+"You should have received a copy of the GNU General Public License along\n"
+"with this program. If not, see <http://www.gnu.org/licenses/>\n\n"
+ "Written by Ariel T. Glenn.\n";
+ fprintf(stdout,"dumplastbz2block %s\n", version_string);
+ fprintf(stdout,"%s",copyright);
+ exit(0);
+}
- Arguments: the name of the file to check, presumably
- a bzipped file.
- Outputs: the decompressed data at the end of the file.
- Exits with 0 if decompression of some data can be done,
- 1 if decompression fails, and -1 on error.
-*/
int main(int argc, char **argv) {
@@ -38,9 +68,12 @@
int length = 5000; /* output buffer size */
if (argc != 2) {
- fprintf(stderr,"usage: %s infile\n", argv[0]);
+ usage("Missing option or argument.");
exit(-1);
}
+
+ if (!strcmp(argv[1], "--help") || !strcmp(argv[1], "-h")) usage(NULL);
+ if (!strcmp(argv[1], "--version") || !strcmp(argv[1], "-v"))
show_version(VERSION);
fin = open (argv[1], O_RDONLY);
if (fin < 0) {
@@ -96,4 +129,3 @@
close(fin);
exit(0);
}
-
diff --git a/xmldumps-backup/mwbzutils/findpageidinbz2xml.c
b/xmldumps-backup/mwbzutils/findpageidinbz2xml.c
index f00da48..f403a8b 100644
--- a/xmldumps-backup/mwbzutils/findpageidinbz2xml.c
+++ b/xmldumps-backup/mwbzutils/findpageidinbz2xml.c
@@ -13,6 +13,63 @@
#include <zlib.h>
#include "mwbzutils.h"
+/*
+ * Write the optional message followed by the full help text to stderr,
+ * then exit with status -1.  Pass NULL to print the help text alone.
+ * The option letters listed here must match the optvalues table in main().
+ */
+void usage(char *message) {
+ char * help =
+"Usage: findpageidinbz2xml --filename file --pageid id [--stubfile] [--useapi]
[--verbose]\n"
+" [--help] [--version]\n\n"
+"Show the offset of the bz2 block in the specified MediaWiki XML dump file\n"
+"containing the given page id. This assumes that the bz2 header of the file\n"
+"is intact and that page ids are steadily increasing throughout the file.\n\n"
+"If the page id is found, a line in the following format will be written to
stdout:\n"
+" position:xxxxx pageid:nnn\n\n"
+"where 'xxxxx' is the offset of the block from the beginning of the file,
and\n"
+"'nnn' is the id of the first page encountered in that block.\n\n"
+"Note:\n"
+"This program may use the MediaWiki api to find page ids from revision ids\n"
+"if 'useapi' is specified.\n"
+"It may use a stub file to find page ids from rev ids if 'stubfile' is
specified.\n"
+"It will only do one of the above if it has been reading from the file for
some\n"
+"large number of iterations without finding a page tag (some pages have >
500K\n"
+"revisions and a heck of a lot of text).\n"
+"If both 'useapi' and 'stubfile' are specified, the api will be used as it is
faster.\n\n"
+"Exits with 0 on success, -1 on error.\n\n"
+"Options:\n\n"
+" -f, --filename name of file to search\n"
+" -p, --pageid page_id of page for which to search\n"
+" -s, --stubfile name of MediaWiki XML stub file to fall back on (see
'Note' above)\n"
+" -a, --useapi fall back to the api if stuck (see 'Note' above)\n"
+" -v, --verbose show search process; specify multiple times for more
output\n"
+" -h, --help Show this help message\n"
+" -V, --version Display the version of this program and exit\n\n"
+"Report bugs in findpageidinbz2xml to <https://bugzilla.wikimedia.org/>.\n\n"
+"See also checkforbz2footer(1), dumpbz2filefromoffset(1),
dumplastbz2block(1),\n"
+ "recompressxml(1), writeuptopageid(1)\n\n";
+ if (message) {
+ fprintf(stderr,"%s\n\n",message);
+ }
+ fprintf(stderr,"%s",help);
+ exit(-1);
+}
+
+/*
+ * Print the program name, the given version string and the license notice
+ * to stdout, then exit successfully (the usual behaviour for --version).
+ */
+void show_version(char *version_string) {
+ char * copyright =
+"Copyright (C) 2011, 2012, 2013 Ariel T. Glenn. All rights reserved.\n\n"
+"This program is free software: you can redistribute it and/or modify it\n"
+"under the terms of the GNU General Public License as published by the\n"
+"Free Software Foundation, either version 2 of the License, or (at your\n"
+"option) any later version.\n\n"
+"This program is distributed in the hope that it will be useful, but\n"
+"WITHOUT ANY WARRANTY; without even the implied warranty of \n"
+"MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General\n"
+"Public License for more details.\n\n"
+"You should have received a copy of the GNU General Public License along\n"
+"with this program. If not, see <http://www.gnu.org/licenses/>\n\n"
+ "Written by Ariel T. Glenn.\n";
+ fprintf(stdout,"findpageidinbz2xml %s\n", version_string);
+ fprintf(stdout,"%s",copyright);
+ exit(0);
+}
+
/*
find the first bz2 block marker in the file,
from its current position,
@@ -484,36 +541,6 @@
}
}
-
-void usage(char *whoami, char *message) {
- if (message) {
- fprintf(stderr,message);
- }
- fprintf(stderr,"usage: %s --filename file --pageid id [--stubfile]
[--useapi] [--verbose]\n", whoami);
- exit(1);
-}
-
-/*
- given a bzipped and possibly truncated file, and a page id,
- hunt for the page id in the file; this assume that the
- bz2 header is intact and that page ids are steadily increasing
- throughout the file.
-
- writes the offset of the relevant block (from beginning of file)
- and the first pageid found in that block, to stdout
-
- it may use the api to find page ids from rev ids if use_api is specified
- it may use a stub file to find page ids from rev ids if stubfile is specified
- it will only do these if it has been reading from awhile without
- findind a page tag (some pages have > 500K revisions and a heck of
- a lot of text)
- if both use_api and stubfile are specified, we will use_api, it's faster
-
- format of output:
- position:xxxxx pageid:nnn
-
- returns: 0 on success, -1 on error
-*/
int main(int argc, char **argv) {
int fin, res, page_id=0;
off_t position, interval, file_size;
@@ -529,20 +556,22 @@
struct option optvalues[] = {
{"filename", 1, 0, 'f'},
+ {"help", 0, 0, 'h'},
{"pageid", 1, 0, 'p'},
{"useapi", 0, 0, 'a'},
{"verbose", 0, 0, 'v'},
+ {"version", 0, 0, 'V'},
{"stubfile", 1, 0, 's'},
{NULL, 0, NULL, 0}
};
while (1) {
- optc=getopt_long_only(argc,argv,"filename:pageid:useapi:stubfile:verbose",
optvalues, &optindex);
+
optc=getopt_long_only(argc,argv,"filename:help:pageid:useapi:stubfile:verbose:version",
optvalues, &optindex);
if (optc=='f') {
filename=optarg;
}
else if (optc=='p') {
- if (!(isdigit(optarg[0]))) usage(argv[0],NULL);
+ if (!(isdigit(optarg[0]))) usage(NULL);
page_id=atoi(optarg);
}
else if (optc=='a')
@@ -551,18 +580,22 @@
use_stub=1;
stubfile = optarg;
}
+ else if (optc=='h')
+ usage(NULL);
else if (optc=='v')
verbose++;
+ else if (optc=='V')
+ show_version(VERSION);
else if (optc==-1) break;
- else usage(argv[0],"Unknown option or other error\n");
+ else usage("Unknown option or other error\n");
}
if (! filename || ! page_id) {
- usage(argv[0],NULL);
+ usage(NULL);
}
if (page_id <1) {
- usage(argv[0], "Please specify a page_id >= 1.\n");
+ usage("Please specify a page_id >= 1.\n");
}
fin = open (filename, O_RDONLY);
diff --git a/xmldumps-backup/mwbzutils/recompressxml.c
b/xmldumps-backup/mwbzutils/recompressxml.c
index be6cc92..417cdb6 100644
--- a/xmldumps-backup/mwbzutils/recompressxml.c
+++ b/xmldumps-backup/mwbzutils/recompressxml.c
@@ -31,6 +31,54 @@
bz_stream strm_indx;
+/*
+ * Write the optional message followed by the full help text to stderr,
+ * then exit with status -1.  Pass NULL to print the help text alone.
+ * The help text is one concatenated string literal: only the final piece
+ * may be followed by a semicolon.
+ */
+void usage(char *message) {
+ char * help =
+"Usage: recompressxml --pagesperstream n [--buildindex filename] [--verbose]\n"
+" or: recompressxml [--version|--help]\n\n"
+"Reads a stream of XML pages from stdin and writes to stdout the bz2
compressed\n"
+"data, one bz2 stream (header, blocks, footer) per specified number of
pages.\n\n"
+"Options:\n\n"
+" -p, --pagesperstream: Compress this number of pages in each complete\n"
+" bz2stream before opening a new stream. The
siteinfo\n"
+" header is written to a separate stream at the
beginning\n"
+" of all output, and the closing mediawiki tag is
written\n"
+" into a separate stream at the end.\n"
+" -b, --buildindex: Generate a file containing an index of pages ids and
titles\n"
+" per stream. Each line contains:
offset-to-stream:pageid:pagetitle\n"
+" If filename ends in '.bz2' the file will be written
in bz2 format.\n"
+" -v, --verbose: Write lots of debugging output to stderr. This
option can be used\n"
+" multiple times to increase verbosity.\n"
+" -h, --help Show this help message\n"
+" -V, --version Display the version of this program and exit\n\n"
+"Report bugs in recompressxml to <https://bugzilla.wikimedia.org/>.\n\n"
+"See also checkforbz2footer(1), dumpbz2filefromoffset(1),
dumplastbz2block(1),\n"
+"findpageidinbz2xml(1), writeuptopageid(1)\n\n";
+ if (message) {
+ fprintf(stderr,"%s\n\n",message);
+ }
+ fprintf(stderr,"%s",help);
+ exit(-1);
+}
+
+/*
+ * Print the program name, the given version string and the license notice
+ * to stdout, then exit successfully (the usual behaviour for --version).
+ */
+void show_version(char *version_string) {
+ char * copyright =
+"Copyright (C) 2011, 2012, 2013 Ariel T. Glenn. All rights reserved.\n\n"
+"This program is free software: you can redistribute it and/or modify it\n"
+"under the terms of the GNU General Public License as published by the\n"
+"Free Software Foundation, either version 2 of the License, or (at your\n"
+"option) any later version.\n\n"
+"This program is distributed in the hope that it will be useful, but\n"
+"WITHOUT ANY WARRANTY; without even the implied warranty of \n"
+"MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General\n"
+"Public License for more details.\n\n"
+"You should have received a copy of the GNU General Public License along\n"
+"with this program. If not, see <http://www.gnu.org/licenses/>\n\n"
+ "Written by Ariel T. Glenn.\n";
+ fprintf(stdout,"recompressxml %s\n", version_string);
+ fprintf(stdout,"%s",copyright);
+ exit(0);
+}
+
void setupIndexBz2Stream() {
int bz_verbosity = 0;
int bz_workFactor = 0;
@@ -257,27 +305,6 @@
return;
}
-void usage(char *whoami, char *message) {
- if (message) {
- fprintf(stderr,"%s",message);
- }
- fprintf(stderr,"Usage: %s --pagesperstream n [--buildindex indexfilename]
[--verbose]\n\n", whoami);
- fprintf(stderr,"Reads a stream of XML pages from stdin,\n");
- fprintf(stderr,"and writes to stdout the bz2 compressed\n");
- fprintf(stderr,"data, one bz2 stream per count pages.\n\n");
- fprintf(stderr,"Options:\n");
- fprintf(stderr,"pagesperstream: compress this many pages in each complete
bz2stream before\n");
- fprintf(stderr," opening a new stream. The siteinfo header
is written to a\n");
- fprintf(stderr," separate stream at the beginning of all
output, and the closing\n");
- fprintf(stderr," mediawiki tag is written into a separate
stream at the end.\n");
- fprintf(stderr,"buildindex: generate a file containing an index of pages
ids and titles\n");
- fprintf(stderr," per stream. Each line contains:
offset-to-stream:pageid:pagetitle\n");
- fprintf(stderr," If filename ends in '.bz2' the file will be
written in bz2 format.\n");
- fprintf(stderr,"verbose: produce lots of debugging output to stderr.
This option can be used\n");
- fprintf(stderr," multiple times to increase verbosity.\n");
- exit(-1);
-}
-
int main(int argc, char **argv) {
int optindex=0;
int optc;
@@ -285,8 +312,10 @@
struct option optvalues[] = {
{"buildindex", 1, 0, 'b'},
+ {"help", 0, 0, 'h'},
{"pagesperstream", 1, 0, 'p'},
{"verbose", 0, 0, 'v'},
+ {"version", 0, 0, 'V'},
{NULL, 0, NULL, 0}
};
@@ -301,18 +330,22 @@
if (optc=='b') {
indexFilename = optarg;
}
+ else if (optc=='h')
+ usage(NULL);
else if (optc=='p') {
- if (!(isdigit(optarg[0]))) usage(argv[0],NULL);
+ if (!(isdigit(optarg[0]))) usage(NULL);
count=atoi(optarg);
}
else if (optc=='v')
verbose++;
+ else if (optc=='V')
+ show_version(VERSION);
else if (optc==-1) break;
- else usage(argv[0],"unknown option or other error\n");
+ else usage("unknown option or other error\n");
}
if (count <= 0) {
- usage(argv[0],"bad or no argument given for count.\n");
+ usage("bad or no argument given for count.\n");
}
if (indexFilename) {
@@ -321,7 +354,7 @@
}
indexfd = fopen(indexFilename, "w");
if (! indexfd) {
- usage(argv[0],"failed to open index file for write.\n");
+ usage("failed to open index file for write.\n");
}
if (!strcmp(indexFilename+(strlen(indexFilename)-4),".bz2")) {
if (verbose) {
diff --git a/xmldumps-backup/mwbzutils/writeuptopageid.c
b/xmldumps-backup/mwbzutils/writeuptopageid.c
index ea608df..4df5c99 100644
--- a/xmldumps-backup/mwbzutils/writeuptopageid.c
+++ b/xmldumps-backup/mwbzutils/writeuptopageid.c
@@ -10,13 +10,52 @@
namespaces will one project want? */
#define MAXHEADERLEN 524289
-void usage(char *me) {
- fprintf(stderr,"Usage: %s startPageID [endPageID]\n",me);
- fprintf(stderr,"Copies the contents of an XML file starting with and
including startPageID\n");
- fprintf(stderr,"and up to but not including endPageID. This program is used
in processing XML\n");
- fprintf(stderr,"dump files that were only partially written, as well as in
writing partial\n");
- fprintf(stderr,"stub files for reruns of those dump files.\n");
- fprintf(stderr,"If endPageID is ommitted, all pages starting from
startPageID will be copied.\n");
+/*
+ * Write the optional message followed by the full help text to stderr,
+ * then exit with status -1.  Pass NULL to print the help text alone.
+ * The end page id is optional: main() accepts one or two arguments.
+ */
+void usage(char *message) {
+ char * help =
+"Usage: writeuptopageid [--version|--help]\n"
+" or: writeuptopageid <startpageid> [<endpageid>]\n\n"
+"Reads a MediaWiki XML file from stdin and writes a range of pages from the
file\n"
+"to stdout, starting with and including the startpageid, up to but not
including\n"
+"the endpageid.\n"
+"This program can be used in processing XML dump files that were only
partially\n"
+"written, as well as in writing partial stub files for reruns of those dump
files.\n"
+"If endPageID is omitted, all pages starting from startPageID will be
copied.\n\n"
+"Options:\n\n"
+"Flags:\n\n"
+" -h, --help Show this help message\n"
+" -v, --version Display the version of this program and exit\n\n"
+"Arguments:\n\n"
+" <startpageid> id of the first page to write\n"
+" <endpageid> id of the page at which to stop writing; if omitted, all
pages through eof\n"
+" will be written\n\n"
+"Report bugs in writeuptopageid to <https://bugzilla.wikimedia.org/>.\n\n"
+"See also checkforbz2footer(1), dumpbz2filefromoffset(1),
dumplastbz2block(1),\n"
+ "findpageidinbz2xml(1), recompressxml(1)\n\n";
+ if (message) {
+ fprintf(stderr,"%s\n\n",message);
+ }
+ fprintf(stderr,"%s",help);
+ exit(-1);
+}
+
+
+/*
+ * Print the program name, the given version string and the license notice
+ * to stdout, then exit successfully (the usual behaviour for --version).
+ */
+void show_version(char *version_string) {
+ char * copyright =
+"Copyright (C) 2011, 2012, 2013 Ariel T. Glenn. All rights reserved.\n\n"
+"This program is free software: you can redistribute it and/or modify it\n"
+"under the terms of the GNU General Public License as published by the\n"
+"Free Software Foundation, either version 2 of the License, or (at your\n"
+"option) any later version.\n\n"
+"This program is distributed in the hope that it will be useful, but\n"
+"WITHOUT ANY WARRANTY; without even the implied warranty of \n"
+"MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General\n"
+"Public License for more details.\n\n"
+"You should have received a copy of the GNU General Public License along\n"
+"with this program. If not, see <http://www.gnu.org/licenses/>\n\n"
+ "Written by Ariel T. Glenn.\n";
+ fprintf(stdout,"writeuptopageid %s\n", version_string);
+ fprintf(stdout,"%s",copyright);
+ exit(0);
+}
/* note that even if we have only read a partial line
@@ -131,9 +170,12 @@
char mem[MAXHEADERLEN];
if (argc < 2 || argc > 3) {
- usage(argv[0]);
+ usage(NULL);
exit(-1);
}
+
+ if (!strcmp(argv[1], "--help") || !strcmp(argv[1], "-h")) usage(NULL);
+ if (!strcmp(argv[1], "--version") || !strcmp(argv[1], "-v"))
show_version(VERSION);
errno = 0;
startPageID = strtol(argv[1], &nonNumeric, 10);
@@ -141,8 +183,7 @@
*nonNumeric != 0 ||
nonNumeric == (char *) &startPageID ||
errno != 0) {
- fprintf (stderr,"The value you entered for startPageID must be a positive
integer.\n");
- usage(argv[0]);
+ usage("The value you entered for startPageID must be a positive integer.");
exit(-1);
}
if (argc == 3) {
@@ -151,8 +192,7 @@
*nonNumeric != 0 ||
nonNumeric == (char *) &endPageID ||
errno != 0) {
- fprintf (stderr,"The value you entered for endPageID must be a positive
integer.\n");
- usage(argv[0]);
+ usage("The value you entered for endPageID must be a positive
integer.\n");
exit(-1);
}
}
--
To view, visit https://gerrit.wikimedia.org/r/72005
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings
Gerrit-MessageType: newchange
Gerrit-Change-Id: Id7ddd9edb5b2e22f896166a23cf49d28a010007b
Gerrit-PatchSet: 1
Gerrit-Project: operations/dumps
Gerrit-Branch: ariel
Gerrit-Owner: ArielGlenn <[email protected]>
_______________________________________________
MediaWiki-commits mailing list
[email protected]
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits