Awight has uploaded a new change for review.
https://gerrit.wikimedia.org/r/192737
Change subject: Calculate text length during input processing
......................................................................
Calculate text length during input processing
Change-Id: Ie4fc6f204d11908b20fe6589b75d6dbb8ac302d9
---
M src/org/mediawiki/importer/SqlWriter15.java
M src/org/mediawiki/importer/XmlDumpReader.java
2 files changed, 4 insertions(+), 23 deletions(-)
git pull ssh://gerrit.wikimedia.org:29418/mediawiki/tools/mwdumper refs/changes/37/192737/1
diff --git a/src/org/mediawiki/importer/SqlWriter15.java b/src/org/mediawiki/importer/SqlWriter15.java
index 4960220..cd02773 100644
--- a/src/org/mediawiki/importer/SqlWriter15.java
+++ b/src/org/mediawiki/importer/SqlWriter15.java
@@ -89,28 +89,6 @@
lastRevision = revision;
}
- private static int lengthUtf8(String s) {
- final int slen = s.length();
- final char[] buf = Buffer.get(slen);
- s.getChars(0, slen, buf, 0);
- int len = 0;
- for (int i = 0; i < slen; i++) {
- char c = buf[i];
- if (c < 0x80)
- len++;
- else if (c < 0x800)
- len+=2;
- else if (c < 0xD800 || c >= 0xE000)
- len+=3;
- else {
- // Surrogate pairs are assumed to be valid.
- len+=4;
- i++;
- }
- }
- return len;
- }
-
private void updatePage(Page page, Revision revision) throws IOException {
bufferInsertRow("page", new Object[][] {
{"page_id", new Integer(page.Id)},
@@ -123,7 +101,7 @@
{"page_random", traits.getRandom()},
{"page_touched", traits.getCurrentTime()},
{"page_latest", new Integer(revision.Id)},
- {"page_len", new Integer(lengthUtf8(revision.Text))}});
+ {"page_len", revision.Bytes}});
checkpoint();
}
diff --git a/src/org/mediawiki/importer/XmlDumpReader.java b/src/org/mediawiki/importer/XmlDumpReader.java
index b162b55..d5df43e 100644
--- a/src/org/mediawiki/importer/XmlDumpReader.java
+++ b/src/org/mediawiki/importer/XmlDumpReader.java
@@ -438,6 +438,9 @@
void readText() {
rev.Text = bufferContentsOrNull();
if (rev.Text==null && !deleted) rev.Text = ""; //NOTE: null means deleted/supressed
+ if (rev.Bytes == null) {
+ rev.Bytes = new Integer(rev.Text.getBytes("UTF-8").length);
+ }
}
void readSha1() {
--
To view, visit https://gerrit.wikimedia.org/r/192737
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings
Gerrit-MessageType: newchange
Gerrit-Change-Id: Ie4fc6f204d11908b20fe6589b75d6dbb8ac302d9
Gerrit-PatchSet: 1
Gerrit-Project: mediawiki/tools/mwdumper
Gerrit-Branch: master
Gerrit-Owner: Awight <[email protected]>
_______________________________________________
MediaWiki-commits mailing list
[email protected]
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits