[MediaWiki-commits] [Gerrit] operations...mwbzutils[master]: little tool that displays the last page id in bz2 xml conten...

ArielGlenn (Code Review) Sun, 12 Feb 2017 13:45:07 -0800

ArielGlenn has uploaded a new change for review. ( 
https://gerrit.wikimedia.org/r/337341 )


Change subject: little tool that displays the last page id in bz2 xml content 
file
......................................................................

little tool that displays the last page id in bz2 xml content file

I need this to check some files for the current en wp run and it
will be handy later for file integrity checking.

[WIP] not completely tested yet so not ready for commit

Change-Id: I2688e461dec59d6d742de93da317c0c80c4d90e4
---
M xmldumps-backup/mwbzutils/Makefile
A xmldumps-backup/mwbzutils/getlastpageidinbz2xml.c
M xmldumps-backup/mwbzutils/mwbzlib.c
3 files changed, 321 insertions(+), 12 deletions(-)


  git pull ssh://gerrit.wikimedia.org:29418/operations/dumps/mwbzutils 
refs/changes/41/337341/1

diff --git a/xmldumps-backup/mwbzutils/Makefile 
b/xmldumps-backup/mwbzutils/Makefile
index b67f09c..5106c4e 100644
--- a/xmldumps-backup/mwbzutils/Makefile
+++ b/xmldumps-backup/mwbzutils/Makefile
@@ -16,7 +16,7 @@
 # 2010-2013: see the file COPYING for details.
 # ------------------------------------------------------------------
 
-VERSION        = "0.0.5"
+VERSION        = "0.0.6"
 CC            ?= gcc
 BIGFILES       = -D_FILE_OFFSET_BITS=64
 CPPFLAGS      += $(BIGFILES) -DVERSION=\"$(VERSION)\"
@@ -24,12 +24,15 @@
 
 build: checkforbz2footer dumpbz2filefromoffset \
        dumplastbz2block findpageidinbz2xml \
-       recompressxml writeuptopageid compressedmanpages
+       recompressxml writeuptopageid compressedmanpages \
+       getlastpageidinbz2xml
+
 
 NAME_CHECKFORBZ2FOOTER       = "Check if bzip2 file ends with bz2 magic footer"
 NAME_DUMPBZ2FILEFROMOFFSET   = "Write MediaWiki XML pages from bzip2 file 
starting from offset"
 NAME_DUMPLASTBZ2BLOCK        = "Find last bz2 block in bzip2 file and dump 
contents"
 NAME_FINDPAGEIDINBZ2XML      = "Display offset of bz2 block for given page id 
in bzip2 MediaWiki XML file"
+# man page summary for getlastpageidinbz2xml; the variable name must match
+# the $(NAME_GETLASTPAGEIDINBZ2XML) reference in the getlastpageidinbz2xml.1 rule
+NAME_GETLASTPAGEIDINBZ2XML   = "Display last page id in bzip2 MediaWiki XML file"
 NAME_RECOMPRESSXML           = "Bz2 compress MediaWiki XML input in batches of 
pages"
 NAME_WRITEUPTOPAGEID         = "Write range of page content from MediaWiki XML 
input"
 
@@ -59,6 +62,9 @@
 
 findpageidinbz2xml: $(OBJSBZ) mwbzlib.o httptiny.o findpageidinbz2xml.o
        $(CC) $(LDFLAGS) -o findpageidinbz2xml findpageidinbz2xml.o httptiny.o 
$(OBJS) $(LIBS) -lz
+
+getlastpageidinbz2xml: $(OBJSBZ) mwbzlib.o getlastpageidinbz2xml.o
+       $(CC) $(LDFLAGS) -o getlastpageidinbz2xml getlastpageidinbz2xml.o 
$(OBJS) $(LIBS)
 
 recompressxml: $(OBJSBZ) recompressxml.o
        $(CC) $(LDFLAGS) -o recompressxml recompressxml.o $(LIBS)
@@ -95,6 +101,9 @@
 findpageidinbz2xml.1 : findpageidinbz2xml
        $(HELP2MAN) --section 1 --no-info --name $(NAME_FINDPAGEIDINBZ2XML) \
                --no-discard-stderr ./findpageidinbz2xml > 
docs/findpageidinbz2xml.1
+getlastpageidinbz2xml.1 : getlastpageidinbz2xml
+       $(HELP2MAN) --section 1 --no-info --name $(NAME_GETLASTPAGEIDINBZ2XML) \
+               --no-discard-stderr ./getlastpageidinbz2xml > 
docs/getlastpageidinbz2xml.1
 recompressxml.1 : recompressxml
        $(HELP2MAN) --section 1 --no-info --name $(NAME_RECOMPRESSXML) \
                --no-discard-stderr ./recompressxml > docs/recompressxml.1
@@ -103,12 +112,13 @@
                --no-discard-stderr ./writeuptopageid > docs/writeuptopageid.1
 
 install: dumplastbz2block findpageidinbz2xml checkforbz2footer 
dumpbz2filefromoffset \
-       recompressxml writeuptopageid compressedmanpages
+       recompressxml writeuptopageid compressedmanpages getlastpageidinbz2xml
        install --directory                         $(BINDIR)
        install --mode=755   checkforbz2footer      $(BINDIR)
        install --mode=755   dumplastbz2block       $(BINDIR)
        install --mode=755   dumpbz2filefromoffset  $(BINDIR)
        install --mode=755   findpageidinbz2xml     $(BINDIR)
+       install --mode=755   getlastpageidinbz2xml  $(BINDIR)
        install --mode=755   recompressxml          $(BINDIR)
        install --mode=755   writeuptopageid        $(BINDIR)
        install --directory                         $(MANDIR)
@@ -121,6 +131,7 @@
 uninstall:
        rm -f $(BINDIR)dumplastbz2block
        rm -f $(BINDIR)findpageidinbz2xml
+       rm -f $(BINDIR)getlastpageidinbz2xml
        rm -f $(BINDIR)checkforbz2footer
        rm -f $(BINDIR)dumpbz2filefromoffset
        rm -f $(BINDIR)recompressxml
@@ -132,6 +143,7 @@
 
 clean: 
        rm -f *.o *.a dumplastbz2block findpageidinbz2xml \
+               getlastpageidinbz2xml \
                checkforbz2footer dumpbz2filefromoffset \
                recompressxml writeuptopageid docs/*.1.gz
 
diff --git a/xmldumps-backup/mwbzutils/getlastpageidinbz2xml.c 
b/xmldumps-backup/mwbzutils/getlastpageidinbz2xml.c
new file mode 100644
index 0000000..5493d59
--- /dev/null
+++ b/xmldumps-backup/mwbzutils/getlastpageidinbz2xml.c
@@ -0,0 +1,296 @@
+#include <unistd.h>
+#include <getopt.h>
+#include <stdio.h>
+#include <string.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+#include <stdlib.h>
+#include <errno.h>
+#include <sys/types.h>
+#include <regex.h>
+#include <inttypes.h>
+#include <zlib.h>
+#include "mwbzutils.h"
+
+/*
+ * Print an optional error message followed by the usage text to stderr,
+ * then terminate the process via exit(-1); this function does not return.
+ *
+ * message: text printed (followed by a blank line) before the help text,
+ *          or NULL to print only the help text.
+ */
+void usage(char *message) {
+  char * help =
+"Usage: getlastpageidinbz2xml --filename file [--verbose]\n"
+"       [--help] [--version]\n\n"
+"Show the last page id in the specified MediaWiki XML dump file.\n"
+"This assumes that the last bz2 block(s) of the file are intact.\n"
+"Exits with 0 on success, -1 on error.\n\n"
+"Options:\n\n"
+"  -f, --filename   name of file to search\n"
+"  -v, --verbose    show search process; specify multiple times for more output\n"
+"  -h, --help       Show this help message\n"
+"  -V, --version    Display the version of this program and exit\n\n"
+"Report bugs in getlastpageidinbz2xml to <https://phabricator.wikimedia.org/>.\n\n"
+"See also dumpbz2filefromoffset(1), dumplastbz2block(1), findpageidinbz2xml(1),\n"
+    "recompressxml(1), writeuptopageid(1)\n\n";
+  if (message) {
+    fprintf(stderr,"%s\n\n",message);
+  }
+  fprintf(stderr,"%s",help);
+  exit(-1);
+}
+
+/*
+ * Write the program name with the supplied version string, followed by the
+ * copyright and licence notice, to stderr; then end the process with
+ * exit(-1). Does not return.
+ */
+void show_version(char *version_string) {
+  static const char notice[] =
+    "Copyright (C) 2017 Ariel T. Glenn.  All rights reserved.\n\n"
+    "This program is free software: you can redistribute it and/or modify it\n"
+    "under the  terms of the GNU General Public License as published by the\n"
+    "Free Software Foundation, either version 2 of the License, or (at your\n"
+    "option) any later version.\n\n"
+    "This  program  is  distributed  in the hope that it will be useful, but\n"
+    "WITHOUT ANY WARRANTY; without even the implied warranty of \n"
+    "MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General\n"
+    "Public License for more details.\n\n"
+    "You should have received a copy of the GNU General Public License along\n"
+    "with this program.  If not, see <http://www.gnu.org/licenses/>\n\n"
+    "Written by Ariel T. Glenn.\n";
+
+  fprintf(stderr, "getlastpageidinbz2xml %s\n", version_string);
+  fputs(notice, stderr);
+  exit(-1);
+}
+
+/*
+ * Scan the unread part of buffer for MediaWiki <page> headers and record
+ * the id of the last one found.
+ *
+ * For every match, pinfo->page_id is set to the matched id and
+ * pinfo->position / pinfo->bits_shifted are copied from bfile->block_start
+ * and bfile->bits_shifted. If nothing matches, pinfo is left untouched.
+ *
+ * No updates are made to the buffer about consumed data; the caller
+ * is responsible for that.
+ *
+ * verbose is currently unused.
+ */
+void find_last_pageid_in_buffer(buf_info_t *buffer, page_info_t *pinfo,
+                               bz_info_t *bfile, int verbose) {
+  regmatch_t match_page_id[3];
+  regex_t compiled_page_id;
+
+  char *page_id = "<page>\n[ ]+<title>[^<]+</title>\n([ ]+<ns>[0-9]+</ns>\n)?[ ]+<id>([0-9]+)</id>\n";
+
+  char *match_from;
+
+  /* nothing to search: leave before any regex resources are set up */
+  if (buffer_is_empty(buffer)) return;
+
+  if (regcomp(&compiled_page_id, page_id, REG_EXTENDED) != 0) {
+    fprintf(stderr,"failed to compile page id regex\n");
+    exit(-1);
+  }
+
+  /* NOTE(review): regexec needs NUL-terminated input; presumably the
+     buffer helpers guarantee that -- confirm */
+  match_from = (char *)buffer->next_to_read;
+  while (regexec(&compiled_page_id, match_from, 3, match_page_id, 0) == 0) {
+    /* found one, yay */
+    if (match_page_id[2].rm_so >= 0) {
+      /* match offsets are relative to match_from, not to the buffer start */
+      pinfo->page_id = atoi(match_from + match_page_id[2].rm_so);
+      pinfo->position = bfile->block_start;
+      pinfo->bits_shifted = bfile->bits_shifted;
+      /* get ready to search rest of buffer */
+      match_from += match_page_id[0].rm_eo;
+    }
+    else {
+      /* should never happen */
+      fprintf(stderr,"regex gone bad...\n");
+      exit(-1);
+    }
+  }
+  regfree(&compiled_page_id);
+  return;
+}
+
+
+/*
+ * Reset pinfo so that page_id, position and bits_shifted all hold -1.
+ */
+void init_pinfo(page_info_t *pinfo) {
+  pinfo->page_id = -1;
+  pinfo->position = (off_t)-1;
+  pinfo->bits_shifted = -1;
+}
+
+/*
+   get the last page id after position in file
+   expect position to be the start of a bz2 block
+   if a pageid is found, the structure pinfo will be updated accordingly
+
+   reading stops when bfile->eof is set, when bfile->position passes upto,
+   or when get_buffer_of_uncompressed_data reports an error (< 0).
+   the bz2 stream in bfile is ended and the work buffer is freed on
+   every return path.
+
+   returns:
+      1 if a pageid found,
+      0 if no pageid found,
+      -1 on error
+*/
+int get_last_page_id_after_offset(int fin, page_info_t *pinfo,
+                                 bz_info_t *bfile, off_t upto, int verbose) {
+  int length=5000; /* output buffer size */
+  int result;
+
+  buf_info_t *b;
+  /* bytes carried over between reads, in case a page tag or page id tag
+     was cut off in the middle at the end of the buffer */
+  const int KEEP = 310;
+
+  b = init_buffer(length);
+  init_pinfo(pinfo);
+
+  /* try to fill the buffer, unless of course we hit eof */
+  /* could be a case where they read no bytes, more bytes are avail in buffer,
+     we hit eof. what then? */
+  while (get_buffer_of_uncompressed_data(b, fin, bfile, FORWARD) >= 0 && (! bfile->eof)) {
+    find_last_pageid_in_buffer(b, pinfo, bfile, verbose);
+    /* did we hit eof? then th-th-that's all folks */
+    if (bfile->eof)
+      break;
+
+    /*
+      We keep reading more buffers because we want the _last_ pageid,
+      not the first one
+    */
+    else if (buffer_is_empty(b)) {
+      /* entire buffer is now available for next read */
+      bfile->strm.next_out = (char *)b->buffer;
+      bfile->strm.avail_out = bfile->bufout_size;
+      b->next_to_fill = b->buffer;
+    }
+    else if (b->bytes_avail> KEEP) {
+      /* dump contents of buffer except last KEEP chars,
+         move those to front so we can keep reading.
+      */
+      move_bytes_to_buffer_start(b, b->end - KEEP, KEEP);
+      bfile->strm.next_out = (char *)b->next_to_fill;
+      bfile->strm.avail_out = b->end - b->next_to_fill;
+    }
+    else {
+      /* move available bytes (don't have KEEP) up to front */
+      move_bytes_to_buffer_start(b, b->next_to_read, b->bytes_avail);
+      bfile->strm.next_out = (char *)b->next_to_fill;
+      bfile->strm.avail_out = b->end - b->next_to_fill;
+    }
+    if (bfile->position > upto) {
+      /* we're done */
+      break;
+    }
+  }
+
+  if (bfile->eof || bfile->position > upto) {
+    /* see what's left in the buffer after eof. maybe we got something good */
+    find_last_pageid_in_buffer(b, pinfo, bfile, verbose);
+    if (pinfo->page_id == -1) result = 0; /* not found */
+    else if (pinfo->page_id > 0) result = 1; /* found */
+    else result = -1; /* error */
+  }
+  else {
+    /* we have an error from get_buffer_of_uncompressed_data */
+    result = -1;
+  }
+
+  /* single cleanup path shared by the success and error cases */
+  BZ2_bzDecompressEnd(&(bfile->strm));
+  free_buffer(b);
+  free(b);
+  return(result);
+}
+
+
+/*
+ * Report on stderr that no page id could be found, close the input
+ * descriptor fin and end the process with exit status 1. Does not return.
+ */
+int giveup(int fin) {
+  fputs("Failed to find any page ids in file, exiting\n", stderr);
+  close(fin);
+  exit(1);
+}
+
+/*
+ * Entry point: parse the command line, open the bz2 file named by
+ * --filename, and walk backwards from the end of the file one bz2 block at
+ * a time until a block containing a page id is found. The last page id in
+ * that block is written as "page_id:<n>" and the process exits with 0.
+ *
+ * Exits with 1 if the file cannot be opened or no page id can be found
+ * (see giveup); usage() and show_version() exit on their own.
+ */
+int main(int argc, char **argv) {
+  int fin, res, page_id=0;
+  off_t block_end, block_start, upto;
+  page_info_t pinfo;
+  char *filename = NULL;
+  int optindex=0;
+  bz_info_t bfile;
+  int verbose = 0;
+  int optc;
+  int result;
+
+  /* every long option advertised by usage() must be listed here,
+     otherwise getopt_long_only reports it as unknown */
+  struct option optvalues[] = {
+    {"filename", 1, 0, 'f'},
+    {"help", 0, 0, 'h'},
+    {"verbose", 0, 0, 'v'},
+    {"version", 0, 0, 'V'},
+    {NULL, 0, NULL, 0}
+  };
+
+  while (1) {
+    optc=getopt_long_only(argc,argv,"f:hvV", optvalues, &optindex);
+    if (optc=='f') {
+     filename=optarg;
+    }
+    else if (optc=='h')
+      usage(NULL);
+    else if (optc=='v')
+      verbose++;
+    else if (optc=='V')
+      show_version(VERSION);
+    else if (optc==-1) break;
+    else usage("Unknown option or other error\n");
+  }
+
+  if (! filename) {
+    usage(NULL);
+  }
+
+  fin = open (filename, O_RDONLY);
+  if (fin < 0) {
+    fprintf(stderr,"Failed to open file %s for read\n", filename);
+    exit(1);
+  }
+
+  bfile.file_size = get_file_size(fin);
+  bfile.footer = init_footer();
+  bfile.marker = init_marker();
+  result = check_file_for_footer(fin, &bfile);
+  if (result == -1) {
+    bfile.position = bfile.file_size;
+  }
+  else {
+    bfile.position = bfile.file_size - (off_t)11; /* size of footer, perhaps with 1 byte extra */
+  }
+  bfile.position -=(off_t)6; /* size of marker */
+  bfile.initialized = 0;
+  bfile.bytes_read = 0;
+
+  /* start at end of file */
+  block_end = bfile.position;
+  upto = block_end;
+
+  block_start = (off_t)-1;
+  page_id = 0;
+
+  /* the loop only ends once a nonzero page id was found; every failure
+     path inside it leaves through giveup(), which exits */
+  while (!page_id) {
+    bfile.initialized = 0;
+    /* calling this explicitly without setting bfile.initialized to 0 above,
+       does not fix problem, we get the -2 param errors again */
+
+    /* this init does not malloc anything */
+    init_decompress(&bfile);
+
+    /* this calls init_decompress which calls BZ2_bzDecompressInit which sets
+       strm->s and then strm->s->strm = strm
+       but it must not do it every time, when I add the above then the
+       initialize works, why?? */
+    block_start = find_first_bz2_block_from_offset(&bfile, fin, block_end, BACKWARD);
+
+    if (block_start <= (off_t) 0) giveup(fin);
+    /* this calls get_buffer_of_uncompressed_data which calls
+       get_and_decompress_data which COULD call init_bz2_file, let's see
+       if it does or not... not the second time! let's force it */
+    BZ2_bzDecompressEnd (&(bfile.strm));
+
+    res = get_last_page_id_after_offset(fin, &pinfo, &bfile, upto, verbose);
+    if (res > 0) {
+      page_id = pinfo.page_id;
+    }
+    else {
+      /* look for previous block */
+      /* FIXME this must be broken somehow. */
+      upto = block_end;
+      block_end = block_start - (off_t) 1;
+      if (block_end <= (off_t) 0) giveup(fin);
+    }
+    /* this seems not to free enough stuff, check around. we have a leak */
+    /* even after adding this above we still have the same leak */
+    BZ2_bzDecompressEnd (&(bfile.strm));
+  }
+
+  /* NOTE(review): the result goes to stderr, not stdout -- confirm this is
+     intended for a tool whose purpose is to display the page id */
+  fprintf(stderr, "page_id:%d\n", page_id);
+  close(fin);
+  exit(0);
+}
diff --git a/xmldumps-backup/mwbzutils/mwbzlib.c 
b/xmldumps-backup/mwbzutils/mwbzlib.c
index 152ce98..3bdc1a3 100644
--- a/xmldumps-backup/mwbzutils/mwbzlib.c
+++ b/xmldumps-backup/mwbzutils/mwbzlib.c
@@ -299,7 +299,8 @@
   off_t seekresult;
 
   bfile->bufin_size = BUFINSIZE;
-  bfile->marker = init_marker();
+  if (bfile->marker == NULL)
+    bfile->marker = init_marker();
   bfile->bytes_read = 0;
   bfile->bytes_written = 0;
   bfile->eof = 0;
@@ -439,7 +440,6 @@
     bfile->strm.next_out = (char *)bfile->bufout;
     bfile->strm.avail_out = bfile->bufout_size;
   }
-
   ret = BZ_OK;
   while (BZ_OK == ret && bfile->bytes_written == 0) {
     ret = BZ2_bzDecompress_mine ( &(bfile->strm) );
@@ -512,12 +512,12 @@
 }
 
 void dumpbuf_info_t(buf_info_t *b) {
-  fprintf(stdout, "\n");
-  fprintf(stdout, "b->buffer: %ld\n", (long int) b->buffer);
-  fprintf(stdout, "b->end: %ld\n", (long int) b->end);
-  fprintf(stdout, "b->next_to_read: %ld\n", (long int) b->next_to_read);
-  fprintf(stdout, "b->next_to_fill: %ld\n", (long int) b->next_to_fill);
-  fprintf(stdout, "b->bytes_avail: %ld\n", (long int) b->bytes_avail);
+  fprintf(stderr, "\n");
+  fprintf(stderr, "b->buffer: %ld\n", (long int) b->buffer);
+  fprintf(stderr, "b->end: %ld\n", (long int) b->end);
+  fprintf(stderr, "b->next_to_read: %ld\n", (long int) b->next_to_read);
+  fprintf(stderr, "b->next_to_fill: %ld\n", (long int) b->next_to_fill);
+  fprintf(stderr, "b->bytes_avail: %ld\n", (long int) b->bytes_avail);
 }
 
 /* 
@@ -640,7 +640,8 @@
   unsigned char buffout[5000];
 
   bfile->bufin_size = BUFINSIZE;
-  bfile->marker = init_marker();
+  if (bfile->marker == NULL)
+    bfile->marker = init_marker();
   bfile->position = position;
   bfile->block_start = (off_t)-1;
   bfile->bytes_read = 0;

-- 
To view, visit https://gerrit.wikimedia.org/r/337341
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings

Gerrit-MessageType: newchange
Gerrit-Change-Id: I2688e461dec59d6d742de93da317c0c80c4d90e4
Gerrit-PatchSet: 1
Gerrit-Project: operations/dumps/mwbzutils
Gerrit-Branch: master
Gerrit-Owner: ArielGlenn <ar...@wikimedia.org>

_______________________________________________
MediaWiki-commits mailing list
MediaWiki-commits@lists.wikimedia.org
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits

[MediaWiki-commits] [Gerrit] operations...mwbzutils[master]: little tool that displays the last page id in bz2 xml conten...

Reply via email to