ArielGlenn has submitted this change and it was merged. ( https://gerrit.wikimedia.org/r/348138 )

Change subject: process zero-length text entries as regular sql inserts
......................................................................


process zero-length text entries as regular sql inserts

Previously we've just been silently dropping them, but there's
no need: MediaWiki will happily display a zero-length entry.

Change-Id: Icd8c9eccf8d2c9b550f1fad2f0f462a53819f9bd
---
M xmlfileutils/mwxmlelts.c
1 file changed, 54 insertions(+), 4 deletions(-)

Approvals:
  ArielGlenn: Looks good to me, approved
  jenkins-bot: Verified



diff --git a/xmlfileutils/mwxmlelts.c b/xmlfileutils/mwxmlelts.c
index de0c475..083bba6 100644
--- a/xmlfileutils/mwxmlelts.c
+++ b/xmlfileutils/mwxmlelts.c
@@ -267,6 +267,8 @@
 
 /*
    <text xml:space="preserve">#REDIRECT [[Computer accessibility]] {{R from CamelCase}}</text> (text content file)
+   OR
+   <text xml:space="preserve" />(text content file)
    (most will be multiple lines)
 
    args:
@@ -340,8 +342,54 @@
     while (*ind == ' ') ind++;
     while (*ind && !(*ind == '/' && *(ind+1) == '>') && *ind != '>') ind++;
   }
-  if (*ind != '>') return(0); /* other options are: no close of tag on line, weird... or
-                                tag ends in /> which means no content, probably deleted */
+
+  /* handle the special case of a zero-length entry, this can be legit */
+  if (*ind == '/' && *(ind+1) == '>') {
+    /* tag ends in /> which means no content, probably deleted */
+    if (text_bytes_written == 0) {
+      strcpy(buf,"BEGIN;\n");
+      put_line_all(sqlt, buf);
+      snprintf(buf, sizeof(buf), "INSERT %s INTO %s (old_id, old_text, old_flags) VALUES\n", insert_ignore?"IGNORE":"", t->text);
+      put_line_all(sqlt, buf);
+    }
+    else {
+      strcpy(buf,",\n");
+      put_line_all(sqlt, buf);
+    }
+    snprintf(buf, sizeof(buf),"(%s, '",r->text_id);
+    put_line_all(sqlt, buf);
+    strcpy(buf,"', ");
+    put_line_all(sqlt, buf);
+    sprintf(buf,"'%s')", "utf-8");
+    put_line_all(sqlt, buf);
+
+    if (get_text_length) text_length = 0;
+
+    if (get_sha1) {
+      /* probably this does nothing, check it */
+      sha1_update(&ctx, (unsigned char *)raw, 0);
+
+      sha1_finish(&ctx, sha1);
+
+      /* base36 conversion, blah */
+      for (i=0; i < SHA_DIGEST_LENGTH; i++)
+        sprintf((char*)&(sha1_string[i*2]), "%02x", sha1[i]);
+
+      /*    sha1_num_len = hexstring2int((char *)sha1_string, SHA_DIGEST_LENGTH*2, sha1_num);*/
+      sha1_num_len = hexstring2int((char *)sha1_string, SHA_DIGEST_LENGTH*2, sha1_num);
+      sha1_b36_len = tobase36(sha1_num, sha1_copy, sha1_temp, sha1_num_len, sha1_b36);
+      int2string(sha1_b36, sha1_b36_len, r->sha1);
+    }
+
+    /* NULL or not, the caller can figure it out. we just advance the pointer. */
+    get_line(f);
+
+    if (verbose > 1) fprintf(stderr,"text info: insert end of line written\n");
+    return(1);
+  }
+
+  if (*ind != '>') return(0); /* other options are: no close of tag on line, weird */
+
 
   ind++;  /* skip that closing '.' */
 
@@ -351,6 +399,7 @@
     put_line_all(sqlt, buf);
     snprintf(buf, sizeof(buf), "INSERT %s INTO %s (old_id, old_text, old_flags) VALUES\n", insert_ignore?"IGNORE":"", t->text);
     put_line_all(sqlt, buf);
+    if (get_text_length) sprintf(r->text_len, "%d", text_length);
   }
   else {
     strcpy(buf,",\n");
@@ -659,8 +708,9 @@
       return(0);
     }
   }
-
-  /*       <text id="382338088" bytes="57" />  */
+  /*       <text id="382338088" bytes="57" />
+     but can also be:
+           <text id="382338088" bytes="0" />  */
   get_elt_with_attrs(stubs, TEXT, NULL, 0, attrs, MAX_ATTRS_STR_LEN);
   if (verbose > 1) fprintf(stderr,"text tag found, %s\n", attrs);
   attrs_ptr = attrs;

-- 
To view, visit https://gerrit.wikimedia.org/r/348138
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings

Gerrit-MessageType: merged
Gerrit-Change-Id: Icd8c9eccf8d2c9b550f1fad2f0f462a53819f9bd
Gerrit-PatchSet: 2
Gerrit-Project: operations/dumps
Gerrit-Branch: ariel
Gerrit-Owner: ArielGlenn <[email protected]>
Gerrit-Reviewer: ArielGlenn <[email protected]>
Gerrit-Reviewer: jenkins-bot <>

_______________________________________________
MediaWiki-commits mailing list
[email protected]
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits

Reply via email to