[MediaWiki-commits] [Gerrit] mwbzutils: clean up makefile and source in prep for debian p... - change (operations/dumps)

ArielGlenn (Code Review) Thu, 04 Jul 2013 12:48:43 -0700

ArielGlenn has uploaded a new change for review.

  https://gerrit.wikimedia.org/r/72005



Change subject: mwbzutils: clean up makefile and source in prep for debian 
packaging
......................................................................

mwbzutils: clean up makefile and source in prep for debian packaging

* cleanup install, add deinstall targets
* make distclean actually do that
* generate man pages with help2man
* add or redo all usage messages to conform with help2man
* add version and copyright info to all programs

Change-Id: Id7ddd9edb5b2e22f896166a23cf49d28a010007b
---
M xmldumps-backup/mwbzutils/Makefile
M xmldumps-backup/mwbzutils/checkforbz2footer.c
M xmldumps-backup/mwbzutils/dumpbz2filefromoffset.c
M xmldumps-backup/mwbzutils/dumplastbz2block.c
M xmldumps-backup/mwbzutils/findpageidinbz2xml.c
M xmldumps-backup/mwbzutils/recompressxml.c
M xmldumps-backup/mwbzutils/writeuptopageid.c
7 files changed, 411 insertions(+), 155 deletions(-)


  git pull ssh://gerrit.wikimedia.org:29418/operations/dumps 
refs/changes/05/72005/1

diff --git a/xmldumps-backup/mwbzutils/Makefile 
b/xmldumps-backup/mwbzutils/Makefile
index 5fcd560..f3a3c44 100644
--- a/xmldumps-backup/mwbzutils/Makefile
+++ b/xmldumps-backup/mwbzutils/Makefile
@@ -16,22 +16,38 @@
 # 2010-2010: see the file COPYING for details.
 # ------------------------------------------------------------------
 
-CC=gcc
-LDFLAGS=
-BIGFILES=-D_FILE_OFFSET_BITS=64
-CFLAGS=-Wall -Winline -O2 -g $(BIGFILES)
-PREFIX=/usr/local
+VERSION        = "0.0.3"
+CC             = gcc
+LDFLAGS        =
+BIGFILES       = -D_FILE_OFFSET_BITS=64
+CFLAGS         = -Wall -Winline -O2 -g $(BIGFILES) -DVERSION=\"$(VERSION)\"
 
-SHELL=/bin/sh
+# "all" was the default goal of the previous Makefile; keep it as an alias
+# for "build" (defined further down with its complete prerequisite list).
+all: build
 
-OBJSBZ= bzlibfuncs.o
+NAME_CHECKFORBZ2FOOTER       = "Check if bzip2 file ends with bz2 magic footer"
+NAME_DUMPBZ2FILEFROMOFFSET   = "Write MediaWiki XML pages from bzip2 file starting from offset"
+NAME_DUMPLASTBZ2BLOCK        = "Find last bz2 block in bzip2 file and dump contents"
+NAME_FINDPAGEIDINBZ2XML      = "Display offset of bz2 block for given page id in bzip2 MediaWiki XML file"
+NAME_RECOMPRESSXML           = "Bz2 compress MediaWiki XML input in batches of pages"
+NAME_WRITEUPTOPAGEID         = "Write range of page content from MediaWiki XML input"
 
-all: checkforbz2footer \
-       dumpbz2filefromoffset \
-       dumplastbz2block \
-       findpageidinbz2xml \
-       recompressxml \
-       writeuptopageid
+# PREFIX may be overridden on the command line, e.g. "make PREFIX=/usr install".
+PREFIX         = /usr/local
+BINDIR         = $(DESTDIR)$(PREFIX)/bin/
+MANDIR         = $(DESTDIR)$(PREFIX)/share/man/man1/
+
+GZIP           = /bin/gzip
+HELP2MAN       = /usr/bin/help2man
+SHELL          = /bin/sh
+
+DISTNAME       = mwbzutils-$(VERSION)
+OBJSBZ         = bzlibfuncs.o
+
+build: checkforbz2footer dumpbz2filefromoffset dumplastbz2block \
+       findpageidinbz2xml recompressxml writeuptopageid \
+       manpages
 
 dumplastbz2block: $(OBJSBZ) mwbzlib.o dumplastbz2block.o
        $(CC) $(CFLAGS) $(LDFLAGS) -o dumplastbz2block dumplastbz2block.o 
mwbzlib.o  $(OBJSBZ) -lbz2
@@ -51,25 +67,61 @@
 writeuptopage: $(OBJSBZ) writeuptopageid.o
        $(CC) $(CFLAGS) $(LDFLAGS) -o writeuptopageid writeuptopageid.o -lbz2
 
+manpages: dumplastbz2block.1.gz findpageidinbz2xml.1.gz \
+       checkforbz2footer.1.gz dumpbz2filefromoffset.1.gz \
+       recompressxml.1.gz writeuptopageid.1.gz
+
+dumplastbz2block.1.gz : dumplastbz2block
+       $(HELP2MAN) --section 1 --no-info --name $(NAME_DUMPLASTBZ2BLOCK) \
+               --no-discard-stderr ./dumplastbz2block | $(GZIP) > $@
+findpageidinbz2xml.1.gz : findpageidinbz2xml
+       $(HELP2MAN) --section 1 --no-info --name $(NAME_FINDPAGEIDINBZ2XML) \
+               --no-discard-stderr ./findpageidinbz2xml | $(GZIP) > $@
+checkforbz2footer.1.gz : checkforbz2footer
+       $(HELP2MAN) --section 1 --no-info --name $(NAME_CHECKFORBZ2FOOTER) \
+               --no-discard-stderr ./checkforbz2footer | $(GZIP) > $@
+dumpbz2filefromoffset.1.gz : dumpbz2filefromoffset
+       $(HELP2MAN) --section 1 --no-info --name $(NAME_DUMPBZ2FILEFROMOFFSET) \
+               --no-discard-stderr ./dumpbz2filefromoffset | $(GZIP) > $@
+recompressxml.1.gz : recompressxml
+       $(HELP2MAN) --section 1 --no-info --name $(NAME_RECOMPRESSXML) \
+               --no-discard-stderr ./recompressxml | $(GZIP) > $@
+writeuptopageid.1.gz : writeuptopageid
+       $(HELP2MAN) --section 1 --no-info --name $(NAME_WRITEUPTOPAGEID) \
+               --no-discard-stderr ./writeuptopageid | $(GZIP) > $@
+
+
 install: dumplastbz2block findpageidinbz2xml checkforbz2footer 
dumpbz2filefromoffset recompressxml writeuptopageid
-       if ( test ! -d $(PREFIX)/bin ) ; then mkdir -p $(PREFIX)/bin ; fi
-       cp -f dumplastbz2block $(PREFIX)/bin/dumplastbz2block
-       cp -f findpageidinbz2xml $(PREFIX)/bin/findpageidinbz2xml
-       cp -f checkforbz2footer $(PREFIX)/bin/checkforbz2footer
-       cp -f dumpbz2filefromoffset $(PREFIX)/bin/dumpbz2filefromoffset
-       cp -f recompressxml $(PREFIX)/bin/recompressxml
-       cp -f writeuptopageid $(PREFIX)/bin/writeuptopageid
-       chmod a+x $(PREFIX)/bin/dumplastbz2block
-       chmod a+x $(PREFIX)/bin/findpageidinbz2xml
-       chmod a+x $(PREFIX)/bin/checkforbz2footer
-       chmod a+x $(PREFIX)/bin/dumpbz2filefromoffset
-       chmod a+x $(PREFIX)/bin/recompressxml
-       chmod a+x $(PREFIX)/bin/writeuptopageid
+       install --directory                         $(BINDIR)
+       install --mode=755   dumplastbz2block       $(BINDIR)
+       install --mode=755   findpageidinbz2xml     $(BINDIR)
+       install --mode=755   checkforbz2footer      $(BINDIR)
+       install --mode=755   dumpbz2filefromoffset  $(BINDIR)
+       install --mode=755   recompressxml          $(BINDIR)
+       install --mode=755   writeuptopageid        $(BINDIR)
+       install --directory                         $(MANDIR)
+       install --mode=644   dumplastbz2block.1.gz       $(MANDIR)
+       install --mode=644   findpageidinbz2xml.1.gz     $(MANDIR)
+       install --mode=644   checkforbz2footer.1.gz      $(MANDIR)
+       install --mode=644   dumpbz2filefromoffset.1.gz  $(MANDIR)
+       install --mode=644   recompressxml.1.gz          $(MANDIR)
+       install --mode=644   writeuptopageid.1.gz        $(MANDIR)
+# deinstall removes exactly the files that install places in BINDIR and MANDIR.
+deinstall:
+       rm -f $(BINDIR)dumplastbz2block       $(MANDIR)dumplastbz2block.1.gz
+       rm -f $(BINDIR)findpageidinbz2xml     $(MANDIR)findpageidinbz2xml.1.gz
+       rm -f $(BINDIR)checkforbz2footer      $(MANDIR)checkforbz2footer.1.gz
+       rm -f $(BINDIR)dumpbz2filefromoffset  $(MANDIR)dumpbz2filefromoffset.1.gz
+       rm -f $(BINDIR)recompressxml          $(MANDIR)recompressxml.1.gz
+       rm -f $(BINDIR)writeuptopageid        $(MANDIR)writeuptopageid.1.gz
 
 clean: 
        rm -f *.o *.a dumplastbz2block findpageidinbz2xml \
                checkforbz2footer dumpbz2filefromoffset \
                recompressxml writeuptopageid
+       rm -f dumplastbz2block.1.gz findpageidinbz2xml.1.gz \
+               checkforbz2footer.1.gz dumpbz2filefromoffset.1.gz \
+               recompressxml.1.gz writeuptopageid.1.gz
 
 bzlibfuncs.o: bzlibfuncs.c bzlib.h bzlib_private.h
        $(CC) $(CFLAGS) -c bzlibfuncs.c
@@ -90,9 +142,10 @@
 writeuptopageid.o: writeuptopageid.c
        $(CC) $(CFLAGS) -c writeuptopageid.c
 
-distclean: clean
+distclean:
+       rm -f $(DISTNAME)
+       rm -f *.tar.gz
 
-DISTNAME=mwbzutils-0.0.3
 dist: 
        rm -f $(DISTNAME)
        ln -s -f . $(DISTNAME)
diff --git a/xmldumps-backup/mwbzutils/checkforbz2footer.c 
b/xmldumps-backup/mwbzutils/checkforbz2footer.c
index 7ff9f7e..b6ad199 100644
--- a/xmldumps-backup/mwbzutils/checkforbz2footer.c
+++ b/xmldumps-backup/mwbzutils/checkforbz2footer.c
@@ -8,20 +8,52 @@
 #include <errno.h>
 #include "mwbzutils.h"
 
-/* 
-   Check to see whether a file ends with a bz2 footer or not
-   (i.e. if it is truncated or corrupted). 
-   This is a crude but fast test for integrity; we don't 
-   check the CRC at the end of fthe stream, nor do we check the
-   bit padding in the last byte of the file.
+void usage(char *message) {  /* message: optional error text, may be NULL */
+  char * help =
+"Usage: checkforbz2footer [--version|--help]\n"
+"   or: checkforbz2footer <infile>\n\n"
+"Check whether the specified bzip2 compressed file ends with a bz2 footer\n"
+"or not (i.e. if it is truncated or corrupted).\n"
+"This is a crude but fast test for integrity; we don't check the CRC at\n"
+"the end of the stream, nor do we check the bit padding in the last byte\n"
+"of the file.\n\n"
+"Exits with 0 if the file has the bz2 footer, 1 if the file does not have\n"
+"the footer and -1 on error.\n\n"
+"Options:\n\n"
+"Flags:\n\n"
+"  -h, --help       Show this help message\n"
+"  -v, --version    Display the version of this program and exit\n\n"
+"Arguments:\n\n"
+"  <infile>         Name of the file to check\n\n"
+"Report bugs in checkforbz2footer to <https://bugzilla.wikimedia.org/>.\n\n"
+"See also:\n\n"
+"  dumpbz2filefromoffset(1), dumplastbz2block(1), findpageidinbz2xml(1)\n"
+"  recompressxml(1), writeuptopageid(1)\n\n";
+  if (message) {
+    fprintf(stderr,"%s\n\n",message);
+  }
+  fprintf(stderr,"%s",help);
+  exit(-1);  /* every caller, including --help, exits with failure status */
+}
 
-   Arguments: the name of the file to check, presumably 
-   a bzipped file. 
-   Outputs: none.
-   Exits with 0 if the file contains the footer at the end, 
-   -1 if the file does not contain the footer or there is an error.
-*/
-
+void show_version(char *version_string) {  /* print version + license, then exit */
+  char * copyright =
+"Copyright (C) 2011, 2012, 2013 Ariel T. Glenn.  All rights reserved.\n\n"
+"This program is free software: you can redistribute it and/or modify it\n"
+"under the  terms of the GNU General Public License as published by the\n"
+"Free Software Foundation, either version 2 of the License, or (at your\n"
+"option) any later version.\n\n"
+"This  program  is  distributed  in the hope that it will be useful, but\n"
+"WITHOUT ANY WARRANTY; without even the implied warranty of \n"
+"MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General\n"
+"Public License for more details.\n\n"
+"You should have received a copy of the GNU General Public License along\n"
+"with this program.  If not, see <http://www.gnu.org/licenses/>\n\n"
+    "Written by Ariel T. Glenn.\n";
+  fprintf(stdout,"checkforbz2footer %s\n", version_string);  /* requested output goes to stdout */
+  fprintf(stdout,"%s",copyright);
+  exit(0);  /* a fulfilled --version request is a success, not an error */
+}
 
 int main(int argc, char **argv) {
 
@@ -30,9 +62,13 @@
   bz_info_t bfile;
 
   if (argc != 2) {
-    fprintf(stderr,"usage: %s infile\n", argv[0]);
+    usage("Missing option or argument.");
     exit(-1);
   }
+
+  if (!strcmp(argv[1], "--help") || !strcmp(argv[1], "-h")) usage(NULL);
+  if (!strcmp(argv[1], "--version") || !strcmp(argv[1], "-v")) 
show_version(VERSION);
+
   fin = open (argv[1], O_RDONLY);
   if (fin < 0) {
     fprintf(stderr,"failed to open file %s for read\n", argv[1]);
diff --git a/xmldumps-backup/mwbzutils/dumpbz2filefromoffset.c 
b/xmldumps-backup/mwbzutils/dumpbz2filefromoffset.c
index 5066bb9..03b2b9b 100644
--- a/xmldumps-backup/mwbzutils/dumpbz2filefromoffset.c
+++ b/xmldumps-backup/mwbzutils/dumpbz2filefromoffset.c
@@ -10,8 +10,56 @@
 #include <regex.h>
 #include "mwbzutils.h"
 
+void usage(char *message) {  /* message: optional error text, may be NULL */
+  char * help =
+"Usage: dumpbz2filefromoffset [--version|--help]\n"
+"   or: dumpbz2filefromoffset <infile> <offset>\n\n"
+"Find the first bz2 block in a file after the specified offset, uncompress\n"
+"and write contents from that point on to stdout, starting with the first\n"
+"<page> tag encountered.\n\n"
+"The starting <mediawiki> tag and the <siteinfo> header from the file will\n"
+"be written out first.\n\n"
+"Note that some bytes from the very last block may be lost if the blocks are\n"
+"not byte-aligned. This is due to the bzip2 crc at the eof being wrong.\n\n"
+"Exits with BZ_OK on success, various BZ_ errors otherwise.\n\n"
+"Options:\n\n"
+"Flags:\n\n"
+"  -h, --help       Show this help message\n"
+"  -v, --version    Display the version of this program and exit\n\n"
+"Arguments:\n\n"
+"  <infile>         Name of the file to check\n"
+"  <offset>         byte in the file from which to start processing\n\n"
+"Report bugs in dumpbz2filefromoffset to <https://bugzilla.wikimedia.org/>.\n\n"
+"See also checkforbz2footer(1), dumplastbz2block(1), findpageidinbz2xml(1),\n"
+    "recompressxml(1), writeuptopageid(1)\n\n";
+  if (message) {
+    fprintf(stderr,"%s\n\n",message);
+  }
+  fprintf(stderr,"%s",help);
+  exit(-1);  /* every caller, including --help, exits with failure status */
+}
+
+void show_version(char *version_string) {  /* print version + license, then exit */
+  char * copyright =
+"Copyright (C) 2011, 2012, 2013 Ariel T. Glenn.  All rights reserved.\n\n"
+"This program is free software: you can redistribute it and/or modify it\n"
+"under the  terms of the GNU General Public License as published by the\n"
+"Free Software Foundation, either version 2 of the License, or (at your\n"
+"option) any later version.\n\n"
+"This  program  is  distributed  in the hope that it will be useful, but\n"
+"WITHOUT ANY WARRANTY; without even the implied warranty of \n"
+"MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General\n"
+"Public License for more details.\n\n"
+"You should have received a copy of the GNU General Public License along\n"
+"with this program.  If not, see <http://www.gnu.org/licenses/>\n\n"
+    "Written by Ariel T. Glenn.\n";
+  fprintf(stdout,"dumpbz2filefromoffset %s\n", version_string);  /* requested output goes to stdout */
+  fprintf(stdout,"%s",copyright);
+  exit(0);  /* a fulfilled --version request is a success, not an error */
+}
+
 /* 
-   dump the <meadiawiki> header (up through
+   dump the <mediawiki> header (up through
    </siteinfo> close tag) found at the 
    beginning of xml dump files. 
    returns:
@@ -206,37 +254,18 @@
   return(0);
 }
 
-/*
-  find the first bz2 block after the specified offset,
-  uncompress from that point on, write out the
-  contents starting with the first <page> tag,
-  prefacing first with the <mediawiki> header from
-  the beginning of the file, up through </siteinfo>.
-
-  note that we may lose some bytes from the very last
-  block if the blocks are bit shifted, because the
-  bzip crc at end of file will be wrong.  (needs testing to
-  find a workaround, simply not feeding in the crc doesn't
-  suffice)
-
-  for purposes of the XML dumps this is fine, since we use
-  this tool to generate prefetch data starting from
-  a given pageid, rather than needing to uncompress
-  gigabytes of data to get to the point in the file
-  we want.
-
-  returns:
-    BZ_OK on success, various BZ_ errors otherwise.
-*/
 int main(int argc, char **argv) {
   int fin, res;
   off_t position;
 
-  if (argc != 3) {
-    fprintf(stderr,"usage: %s infile position\n", argv[0]);
+  if (argc < 2 || argc > 3) {
+    usage("Missing or bad options/arguments");
     exit(-1);
   }
 
+  if (!strcmp(argv[1], "--help") || !strcmp(argv[1], "-h")) usage(NULL);
+  if (!strcmp(argv[1], "--version") || !strcmp(argv[1], "-v")) 
show_version(VERSION);
+
   fin = open (argv[1], O_RDONLY);
   if (fin < 0) {
     fprintf(stderr,"failed to open file %s for read\n", argv[1]);
diff --git a/xmldumps-backup/mwbzutils/dumplastbz2block.c 
b/xmldumps-backup/mwbzutils/dumplastbz2block.c
index 34d5601..ab441ad 100644
--- a/xmldumps-backup/mwbzutils/dumplastbz2block.c
+++ b/xmldumps-backup/mwbzutils/dumplastbz2block.c
@@ -9,22 +9,52 @@
 #include <inttypes.h>
 #include "mwbzutils.h"
 
+void usage(char *message) {  /* message: optional error text, may be NULL */
+  char * help =
+"Usage: dumplastbz2block [--version|--help]\n"
+"   or: dumplastbz2block <infile>\n\n"
+"Find the last bz2 block marker in a file and dump whatever can be\n"
+"decompressed after that point.  The header of the file must be intact\n"
+"in order for any output to be produced.\n"
+"This will produce output for truncated files as well, as long as there\n"
+"is 'enough' data after the block marker.\n"
+"Exits with 0 if some decompressed data was written, 1 if no data could\n"
+"be uncompressed and -1 on error.\n\n"
+"Options:\n\n"
+"Flags:\n\n"
+"  -h, --help       Show this help message\n"
+"  -v, --version    Display the version of this program and exit\n\n"
+"Arguments:\n\n"
+"  <infile>         Name of the file to process\n\n"
+"Report bugs in dumplastbz2block to <https://bugzilla.wikimedia.org/>.\n\n"
+"See also checkforbz2footer(1), dumpbz2filefromoffset(1), findpageidinbz2xml(1),\n"
+"recompressxml(1), writeuptopageid(1)\n\n";
+  if (message) {
+    fprintf(stderr,"%s\n\n",message);
+  }
+  fprintf(stderr,"%s",help);
+  exit(-1);  /* every caller, including --help, exits with failure status */
+}
 
-/* 
-   Find the last bz2 block marker in a file
-   and dump whatever can be decompressed after
-   that point.  The header of the file must
-   be intact in order for any output to be produced.
-   This will produce output for truncated files as well,
-   as long as there is "enough" data after the block 
-   marker.
+void show_version(char *version_string) {  /* print version + license, then exit */
+  char * copyright =
+"Copyright (C) 2011, 2012, 2013 Ariel T. Glenn.  All rights reserved.\n\n"
+"This program is free software: you can redistribute it and/or modify it\n"
+"under the  terms of the GNU General Public License as published by the\n"
+"Free Software Foundation, either version 2 of the License, or (at your\n"
+"option) any later version.\n\n"
+"This  program  is  distributed  in the hope that it will be useful, but\n"
+"WITHOUT ANY WARRANTY; without even the implied warranty of \n"
+"MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General\n"
+"Public License for more details.\n\n"
+"You should have received a copy of the GNU General Public License along\n"
+"with this program.  If not, see <http://www.gnu.org/licenses/>\n\n"
+    "Written by Ariel T. Glenn.\n";
+  fprintf(stdout,"dumplastbz2block %s\n", version_string);  /* requested output goes to stdout */
+  fprintf(stdout,"%s",copyright);
+  exit(0);  /* a fulfilled --version request is a success, not an error */
+}
 
-   Arguments: the name of the file to check, presumably 
-   a bzipped file. 
-   Outputs: the decompressed data at the end of the file.
-   Exits with 0 if decompression of some data can be done,
-   1 if decompression fails, and -1 on error.
-*/
 
 int main(int argc, char **argv) {
 
@@ -38,9 +68,12 @@
   int length = 5000; /* output buffer size */
 
   if (argc != 2) {
-    fprintf(stderr,"usage: %s infile\n", argv[0]);
+    usage("Missing option or argument.");
     exit(-1);
   }
+
+  if (!strcmp(argv[1], "--help") || !strcmp(argv[1], "-h")) usage(NULL);
+  if (!strcmp(argv[1], "--version") || !strcmp(argv[1], "-v")) 
show_version(VERSION);
 
   fin = open (argv[1], O_RDONLY);
   if (fin < 0) {
@@ -96,4 +129,3 @@
   close(fin);
   exit(0);
 }
-
diff --git a/xmldumps-backup/mwbzutils/findpageidinbz2xml.c 
b/xmldumps-backup/mwbzutils/findpageidinbz2xml.c
index f00da48..f403a8b 100644
--- a/xmldumps-backup/mwbzutils/findpageidinbz2xml.c
+++ b/xmldumps-backup/mwbzutils/findpageidinbz2xml.c
@@ -13,6 +13,63 @@
 #include <zlib.h>
 #include "mwbzutils.h"
 
+void usage(char *message) {  /* message: optional error text, may be NULL */
+  char * help =
+"Usage: findpageidinbz2xml --filename file --pageid id [--stubfile file] [--useapi] [--verbose]\n"
+"       [--help] [--version]\n\n"
+"Show the offset of the bz2 block in the specified MediaWiki XML dump file\n"
+"containing the given page id.  This assumes that the bz2 header of the file\n"
+"is intact and that page ids are steadily increasing throughout the file.\n\n"
+"If the page id is found, a line in the following format will be written to stdout:\n"
+"    position:xxxxx pageid:nnn\n\n"
+"where 'xxxxx' is the offset of the block from the beginning of the file, and\n"
+"'nnn' is the id of the first page encountered in that block.\n\n"
+"Note:\n"
+"This program may use the MediaWiki api to find page ids from revision ids\n"
+"if 'useapi' is specified.\n"
+"It may use a stub file to find page ids from rev ids if 'stubfile' is specified.\n"
+"It will only do one of the above if it has been reading from the file for some\n"
+"large number of iterations without finding a page tag (some pages have > 500K\n"
+"revisions and a heck of a lot of text).\n"
+"If both 'useapi' and 'stubfile' are specified, the api will be used as it is faster.\n\n"
+"Exits with 0 on success, -1 on error.\n\n"
+"Options:\n\n"
+"  -f, --filename   name of file to search\n"
+"  -p, --pageid     page_id of page for which to search\n"
+"  -s, --stubfile   name of MediaWiki XML stub file to fall back on (see 'Note' above)\n"
+"  -a, --useapi     fall back to the api if stuck (see 'Note' above)\n"
+"  -v, --verbose    show search process; specify multiple times for more output\n"
+"  -h, --help       Show this help message\n"
+"  -V, --version    Display the version of this program and exit\n\n"
+"Report bugs in findpageidinbz2xml to <https://bugzilla.wikimedia.org/>.\n\n"
+"See also checkforbz2footer(1), dumpbz2filefromoffset(1), dumplastbz2block(1),\n"
+    "recompressxml(1), writeuptopageid(1)\n\n";
+  if (message) {
+    fprintf(stderr,"%s\n\n",message);
+  }
+  fprintf(stderr,"%s",help);
+  exit(-1);  /* every caller, including --help, exits with failure status */
+}
+
+void show_version(char *version_string) {  /* print version + license, then exit */
+  char * copyright =
+"Copyright (C) 2011, 2012, 2013 Ariel T. Glenn.  All rights reserved.\n\n"
+"This program is free software: you can redistribute it and/or modify it\n"
+"under the  terms of the GNU General Public License as published by the\n"
+"Free Software Foundation, either version 2 of the License, or (at your\n"
+"option) any later version.\n\n"
+"This  program  is  distributed  in the hope that it will be useful, but\n"
+"WITHOUT ANY WARRANTY; without even the implied warranty of \n"
+"MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General\n"
+"Public License for more details.\n\n"
+"You should have received a copy of the GNU General Public License along\n"
+"with this program.  If not, see <http://www.gnu.org/licenses/>\n\n"
+    "Written by Ariel T. Glenn.\n";
+  fprintf(stdout,"findpageidinbz2xml %s\n", version_string);  /* requested output goes to stdout */
+  fprintf(stdout,"%s",copyright);
+  exit(0);  /* a fulfilled --version request is a success, not an error */
+}
+
 /* 
    find the first bz2 block marker in the file, 
    from its current position,
@@ -484,36 +541,6 @@
   }
 }
 
-
-void usage(char *whoami, char *message) {
-  if (message) {
-    fprintf(stderr,message);
-  }
-  fprintf(stderr,"usage: %s --filename file --pageid id [--stubfile] 
[--useapi] [--verbose]\n", whoami);
-  exit(1);
-}
-
-/*
-  given a bzipped and possibly truncated file, and a page id, 
-  hunt for the page id in the file; this assume that the
-  bz2 header is intact and that page ids are steadily increasing
-  throughout the file. 
-
-  writes the offset of the relevant block (from beginning of file) 
-  and the first pageid found in that block, to stdout
-
-  it may use the api to find page ids from rev ids if use_api is specified
-  it may use a stub file to find page ids from rev ids if stubfile is specified
-  it will only do these if it has been reading from awhile without
-  findind a page tag (some pages have > 500K revisions and a heck of
-  a lot of text)
-  if both use_api and stubfile are specified, we will use_api, it's faster
-
-  format of output:
-     position:xxxxx pageid:nnn
-
-  returns: 0 on success, -1 on error
-*/
 int main(int argc, char **argv) {
   int fin, res, page_id=0;
   off_t position, interval, file_size;
@@ -529,20 +556,22 @@
 
   struct option optvalues[] = {
     {"filename", 1, 0, 'f'},
+    {"help", 0, 0, 'h'},
     {"pageid", 1, 0, 'p'},
     {"useapi", 0, 0, 'a'},
     {"verbose", 0, 0, 'v'},
+    {"version", 0, 0, 'V'},
     {"stubfile", 1, 0, 's'},
     {NULL, 0, NULL, 0}
   };
 
   while (1) {
-    optc=getopt_long_only(argc,argv,"filename:pageid:useapi:stubfile:verbose", 
optvalues, &optindex);
+    
optc=getopt_long_only(argc,argv,"filename:help:pageid:useapi:stubfile:verbose:version",
 optvalues, &optindex);
     if (optc=='f') {
      filename=optarg;
     }
     else if (optc=='p') {
-      if (!(isdigit(optarg[0]))) usage(argv[0],NULL);
+      if (!(isdigit(optarg[0]))) usage(NULL);
       page_id=atoi(optarg);
     }
     else if (optc=='a') 
@@ -551,18 +580,22 @@
       use_stub=1;
       stubfile = optarg;
     }
+    else if (optc=='h')
+      usage(NULL);
     else if (optc=='v') 
       verbose++;
+    else if (optc=='V') 
+      show_version(VERSION);
     else if (optc==-1) break;
-    else usage(argv[0],"Unknown option or other error\n");
+    else usage("Unknown option or other error\n");
   }
 
   if (! filename || ! page_id) {
-    usage(argv[0],NULL);
+    usage(NULL);
   }
 
   if (page_id <1) {
-    usage(argv[0], "Please specify a page_id >= 1.\n");
+    usage("Please specify a page_id >= 1.\n");
   }
 
   fin = open (filename, O_RDONLY);
diff --git a/xmldumps-backup/mwbzutils/recompressxml.c 
b/xmldumps-backup/mwbzutils/recompressxml.c
index be6cc92..417cdb6 100644
--- a/xmldumps-backup/mwbzutils/recompressxml.c
+++ b/xmldumps-backup/mwbzutils/recompressxml.c
@@ -31,6 +31,54 @@
 
 bz_stream strm_indx;
 
+void usage(char *message) {  /* message: optional error text, may be NULL */
+  char * help =
+"Usage: recompressxml --pagesperstream n [--buildindex filename] [--verbose]\n"
+"   or: recompressxml [--version|--help]\n\n"
+"Reads a stream of XML pages from stdin and writes to stdout the bz2 compressed\n"
+"data, one bz2 stream (header, blocks, footer) per specified number of pages.\n\n"
+"Options:\n\n"
+"  -p, --pagesperstream:  Compress this number of pages in each complete\n"
+"                         bz2stream before opening a new stream.  The siteinfo\n"
+"                         header is written to a separate stream at the beginning\n"
+"                         of all output, and the closing mediawiki tag is written\n"
+"                         into a separate stream at the end.\n"
+"  -b, --buildindex:      Generate a file containing an index of pages ids and titles\n"
+"                         per stream.  Each line contains: offset-to-stream:pageid:pagetitle\n"
+"                         If filename ends in '.bz2' the file will be written in bz2 format.\n"
+"  -v, --verbose:         Write lots of debugging output to stderr.  This option can be used\n"
+"                         multiple times to increase verbosity.\n"
+"  -h, --help             Show this help message\n"
+"  -V, --version          Display the version of this program and exit\n\n"
+"Report bugs in recompressxml to <https://bugzilla.wikimedia.org/>.\n\n"
+"See also checkforbz2footer(1), dumpbz2filefromoffset(1), dumplastbz2block(1),\n"
+"findpageidinbz2xml(1), writeuptopageid(1)\n\n";
+  if (message) {
+    fprintf(stderr,"%s\n\n",message);
+  }
+  fprintf(stderr,"%s",help);
+  exit(-1);  /* every caller, including --help, exits with failure status */
+}
+
+void show_version(char *version_string) {  /* print version + license, then exit */
+  char * copyright =
+"Copyright (C) 2011, 2012, 2013 Ariel T. Glenn.  All rights reserved.\n\n"
+"This program is free software: you can redistribute it and/or modify it\n"
+"under the  terms of the GNU General Public License as published by the\n"
+"Free Software Foundation, either version 2 of the License, or (at your\n"
+"option) any later version.\n\n"
+"This  program  is  distributed  in the hope that it will be useful, but\n"
+"WITHOUT ANY WARRANTY; without even the implied warranty of \n"
+"MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General\n"
+"Public License for more details.\n\n"
+"You should have received a copy of the GNU General Public License along\n"
+"with this program.  If not, see <http://www.gnu.org/licenses/>\n\n"
+    "Written by Ariel T. Glenn.\n";
+  fprintf(stdout,"recompressxml %s\n", version_string);  /* requested output goes to stdout */
+  fprintf(stdout,"%s",copyright);
+  exit(0);  /* a fulfilled --version request is a success, not an error */
+}
+
 void setupIndexBz2Stream() {
   int bz_verbosity = 0;
   int bz_workFactor = 0;
@@ -257,27 +305,6 @@
   return;
 }
 
-void usage(char *whoami, char *message) {
-  if (message) {
-    fprintf(stderr,"%s",message);
-  }
-  fprintf(stderr,"Usage: %s --pagesperstream n [--buildindex indexfilename] 
[--verbose]\n\n", whoami);
-  fprintf(stderr,"Reads a stream of XML pages from stdin,\n");
-  fprintf(stderr,"and writes to stdout the bz2 compressed\n");
-  fprintf(stderr,"data, one bz2 stream per count pages.\n\n");
-  fprintf(stderr,"Options:\n");
-  fprintf(stderr,"pagesperstream: compress this many pages in each complete 
bz2stream before\n");
-  fprintf(stderr,"                opening a new stream.  The siteinfo header 
is written to a\n");
-  fprintf(stderr,"                separate stream at the beginning of all 
output, and the closing\n");
-  fprintf(stderr,"                mediawiki tag is written into a separate 
stream at the end.\n");
-  fprintf(stderr,"buildindex:     generate a file containing an index of pages 
ids and titles\n");
-  fprintf(stderr,"                per stream.  Each line contains: 
offset-to-stream:pageid:pagetitle\n");
-  fprintf(stderr,"                If filename ends in '.bz2' the file will be 
written in bz2 format.\n");
-  fprintf(stderr,"verbose:        produce lots of debugging output to stderr.  
This option can be used\n");
-  fprintf(stderr,"                multiple times to increase verbosity.\n");
-  exit(-1);
-}
-
 int main(int argc, char **argv) {
   int optindex=0;
   int optc;
@@ -285,8 +312,10 @@
 
   struct option optvalues[] = {
     {"buildindex", 1, 0, 'b'},
+    {"help", 0, 0, 'h'},
     {"pagesperstream", 1, 0, 'p'},
     {"verbose", 0, 0, 'v'},
+    {"version", 0, 0, 'V'},
     {NULL, 0, NULL, 0}
   };
 
@@ -301,18 +330,22 @@
     if (optc=='b') {
       indexFilename = optarg;
     }
+    else if (optc=='h') 
+      usage(NULL);
     else if (optc=='p') {
-      if (!(isdigit(optarg[0]))) usage(argv[0],NULL);
+      if (!(isdigit(optarg[0]))) usage(NULL);
       count=atoi(optarg);
     }
     else if (optc=='v') 
       verbose++;
+    else if (optc=='V') 
+      show_version(VERSION);
     else if (optc==-1) break;
-    else usage(argv[0],"unknown option or other error\n");
+    else usage("unknown option or other error\n");
   }
 
   if (count <= 0) {
-    usage(argv[0],"bad or no argument given for count.\n");
+    usage("bad or no argument given for count.\n");
   }
 
   if (indexFilename) {
@@ -321,7 +354,7 @@
     }
     indexfd = fopen(indexFilename, "w");
     if (! indexfd) {
-      usage(argv[0],"failed to open index file for write.\n");
+      usage("failed to open index file for write.\n");
     }
     if (!strcmp(indexFilename+(strlen(indexFilename)-4),".bz2")) {
       if (verbose) {
diff --git a/xmldumps-backup/mwbzutils/writeuptopageid.c 
b/xmldumps-backup/mwbzutils/writeuptopageid.c
index ea608df..4df5c99 100644
--- a/xmldumps-backup/mwbzutils/writeuptopageid.c
+++ b/xmldumps-backup/mwbzutils/writeuptopageid.c
@@ -10,13 +10,52 @@
    namespaces will one project want? */
 #define MAXHEADERLEN 524289
 
-void usage(char *me) {
-  fprintf(stderr,"Usage: %s startPageID [endPageID]\n",me);
-  fprintf(stderr,"Copies the contents of an XML file starting with and 
including startPageID\n");
-  fprintf(stderr,"and up to but not including endPageID. This program is used 
in processing XML\n");
-  fprintf(stderr,"dump files that were only partially written, as well as in 
writing partial\n");
-  fprintf(stderr,"stub files for reruns of those dump files.\n");
-  fprintf(stderr,"If endPageID is ommitted, all pages starting from 
startPageID will be copied.\n");
+void usage(char *message) {  /* message: optional error text, may be NULL */
+  char * help =
+"Usage: writeuptopageid [--version|--help]\n"
+"   or: writeuptopageid <startpageid> [<endpageid>]\n\n"
+"Reads a MediaWiki XML file from stdin and writes a range of pages from the file\n"
+"to stdout, starting with and including the startpageid, up to but not including\n"
+"the endpageid.\n"
+"This program can be used in processing XML dump files that were only partially\n"
+"written, as well as in writing partial stub files for reruns of those dump files.\n"
+"If endpageid is omitted, all pages starting from startpageid will be copied.\n\n"
+"Options:\n\n"
+"Flags:\n\n"
+"  -h, --help       Show this help message\n"
+"  -v, --version    Display the version of this program and exit\n\n"
+"Arguments:\n\n"
+"  <startpageid>    id of the first page to write\n"
+"  <endpageid>      id of the page at which to stop writing; if omitted, all pages through eof\n"
+"                   will be written\n\n"
+"Report bugs in writeuptopageid to <https://bugzilla.wikimedia.org/>.\n\n"
+"See also checkforbz2footer(1), dumpbz2filefromoffset(1), dumplastbz2block(1),\n"
+    "findpageidinbz2xml(1), recompressxml(1)\n\n";
+  if (message) {
+    fprintf(stderr,"%s\n\n",message);
+  }
+  fprintf(stderr,"%s",help);
+  exit(-1);  /* every caller, including --help, exits with failure status */
+}
+
+
+void show_version(char *version_string) {  /* print version + license, then exit */
+  char * copyright =
+"Copyright (C) 2011, 2012, 2013 Ariel T. Glenn.  All rights reserved.\n\n"
+"This program is free software: you can redistribute it and/or modify it\n"
+"under the  terms of the GNU General Public License as published by the\n"
+"Free Software Foundation, either version 2 of the License, or (at your\n"
+"option) any later version.\n\n"
+"This  program  is  distributed  in the hope that it will be useful, but\n"
+"WITHOUT ANY WARRANTY; without even the implied warranty of \n"
+"MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General\n"
+"Public License for more details.\n\n"
+"You should have received a copy of the GNU General Public License along\n"
+"with this program.  If not, see <http://www.gnu.org/licenses/>\n\n"
+    "Written by Ariel T. Glenn.\n";
+  fprintf(stdout,"writeuptopageid %s\n", version_string);  /* requested output goes to stdout */
+  fprintf(stdout,"%s",copyright);
+  exit(0);  /* a fulfilled --version request is a success, not an error */
 }
 
 /* note that even if we have only read a partial line
@@ -131,9 +170,12 @@
   char mem[MAXHEADERLEN];
 
   if (argc < 2 || argc > 3) {
-    usage(argv[0]);
+    usage(NULL);
     exit(-1);
   }
+
+  if (!strcmp(argv[1], "--help") || !strcmp(argv[1], "-h")) usage(NULL);
+  if (!strcmp(argv[1], "--version") || !strcmp(argv[1], "-v")) 
show_version(VERSION);
 
   errno = 0;
   startPageID = strtol(argv[1], &nonNumeric, 10);
@@ -141,8 +183,7 @@
       *nonNumeric != 0 ||
       nonNumeric == (char *) &startPageID || 
       errno != 0) {
-    fprintf (stderr,"The value you entered for startPageID must be a positive 
integer.\n");
-    usage(argv[0]);
+    usage("The value you entered for startPageID must be a positive integer.");
     exit(-1);
   }
   if (argc == 3) {
@@ -151,8 +192,7 @@
        *nonNumeric != 0 ||
        nonNumeric == (char *) &endPageID || 
        errno != 0) {
-      fprintf (stderr,"The value you entered for endPageID must be a positive 
integer.\n");
-      usage(argv[0]);
+      usage("The value you entered for endPageID must be a positive 
integer.\n");
       exit(-1);
     }
   }

-- 
To view, visit https://gerrit.wikimedia.org/r/72005
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings

Gerrit-MessageType: newchange
Gerrit-Change-Id: Id7ddd9edb5b2e22f896166a23cf49d28a010007b
Gerrit-PatchSet: 1
Gerrit-Project: operations/dumps
Gerrit-Branch: ariel
Gerrit-Owner: ArielGlenn <[email protected]>

_______________________________________________
MediaWiki-commits mailing list
[email protected]
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits

[MediaWiki-commits] [Gerrit] mwbzutils: clean up makefile and source in prep for debian p... - change (operations/dumps)

Reply via email to