ArielGlenn has submitted this change and it was merged. ( https://gerrit.wikimedia.org/r/348138 )
Change subject: process zero-length text entries as regular sql inserts
......................................................................
process zero-length text entries as regular sql inserts
Previously we've just been silently dropping them, but there's
no need: MediaWiki will happily display the zero-length entry.
Change-Id: Icd8c9eccf8d2c9b550f1fad2f0f462a53819f9bd
---
M xmlfileutils/mwxmlelts.c
1 file changed, 54 insertions(+), 4 deletions(-)
Approvals:
ArielGlenn: Looks good to me, approved
jenkins-bot: Verified
diff --git a/xmlfileutils/mwxmlelts.c b/xmlfileutils/mwxmlelts.c
index de0c475..083bba6 100644
--- a/xmlfileutils/mwxmlelts.c
+++ b/xmlfileutils/mwxmlelts.c
@@ -267,6 +267,8 @@
/*
<text xml:space="preserve">#REDIRECT [[Computer accessibility]] {{R from
CamelCase}}</text> (text content file)
+ OR
+ <text xml:space="preserve" />(text content file)
(most will be multiple lines)
args:
@@ -340,8 +342,54 @@
while (*ind == ' ') ind++;
while (*ind && !(*ind == '/' && *(ind+1) == '>') && *ind != '>') ind++;
}
- if (*ind != '>') return(0); /* other options are: no close of tag on line, weird... or
- tag ends in /> which means no content, probably deleted */
+
+ /* handle the special case of a zero-length entry, this can be legit */
+ if (*ind == '/' && *(ind+1) == '>') {
+ /* tag ends in /> which means no content, probably deleted */
+ if (text_bytes_written == 0) {
+ strcpy(buf,"BEGIN;\n");
+ put_line_all(sqlt, buf);
+ snprintf(buf, sizeof(buf), "INSERT %s INTO %s (old_id, old_text, old_flags) VALUES\n", insert_ignore?"IGNORE":"", t->text);
+ put_line_all(sqlt, buf);
+ }
+ else {
+ strcpy(buf,",\n");
+ put_line_all(sqlt, buf);
+ }
+ snprintf(buf, sizeof(buf),"(%s, '",r->text_id);
+ put_line_all(sqlt, buf);
+ strcpy(buf,"', ");
+ put_line_all(sqlt, buf);
+ sprintf(buf,"'%s')", "utf-8");
+ put_line_all(sqlt, buf);
+
+ if (get_text_length) text_length = 0;
+
+ if (get_sha1) {
+ /* probably this does nothing, check it */
+ sha1_update(&ctx, (unsigned char *)raw, 0);
+
+ sha1_finish(&ctx, sha1);
+
+ /* base36 conversion, blah */
+ for (i=0; i < SHA_DIGEST_LENGTH; i++)
+ sprintf((char*)&(sha1_string[i*2]), "%02x", sha1[i]);
+
+ /* sha1_num_len = hexstring2int((char *)sha1_string, SHA_DIGEST_LENGTH*2, sha1_num);*/
+ sha1_num_len = hexstring2int((char *)sha1_string, SHA_DIGEST_LENGTH*2, sha1_num);
+ sha1_b36_len = tobase36(sha1_num, sha1_copy, sha1_temp, sha1_num_len, sha1_b36);
+ int2string(sha1_b36, sha1_b36_len, r->sha1);
+ }
+
+ /* NULL or not, the caller can figure it out. we just advance the pointer. */
+ get_line(f);
+
+ if (verbose > 1) fprintf(stderr,"text info: insert end of line written\n");
+ return(1);
+ }
+
+ if (*ind != '>') return(0); /* other options are: no close of tag on line, weird */
+
ind++; /* skip that closing '.' */
@@ -351,6 +399,7 @@
put_line_all(sqlt, buf);
snprintf(buf, sizeof(buf), "INSERT %s INTO %s (old_id, old_text, old_flags) VALUES\n", insert_ignore?"IGNORE":"", t->text);
put_line_all(sqlt, buf);
+ if (get_text_length) sprintf(r->text_len, "%d", text_length);
}
else {
strcpy(buf,",\n");
@@ -659,8 +708,9 @@
return(0);
}
}
-
- /* <text id="382338088" bytes="57" /> */
+ /* <text id="382338088" bytes="57" />
+ but can also be:
+ <text id="382338088" bytes="0" /> */
get_elt_with_attrs(stubs, TEXT, NULL, 0, attrs, MAX_ATTRS_STR_LEN);
if (verbose > 1) fprintf(stderr,"text tag found, %s\n", attrs);
attrs_ptr = attrs;
--
To view, visit https://gerrit.wikimedia.org/r/348138
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings
Gerrit-MessageType: merged
Gerrit-Change-Id: Icd8c9eccf8d2c9b550f1fad2f0f462a53819f9bd
Gerrit-PatchSet: 2
Gerrit-Project: operations/dumps
Gerrit-Branch: ariel
Gerrit-Owner: ArielGlenn <[email protected]>
Gerrit-Reviewer: ArielGlenn <[email protected]>
Gerrit-Reviewer: jenkins-bot <>
_______________________________________________
MediaWiki-commits mailing list
[email protected]
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits