Awight has uploaded a new change for review.

  https://gerrit.wikimedia.org/r/192737

Change subject: Calculate text length during input processing
......................................................................

Calculate text length during input processing

Change-Id: Ie4fc6f204d11908b20fe6589b75d6dbb8ac302d9
---
M src/org/mediawiki/importer/SqlWriter15.java
M src/org/mediawiki/importer/XmlDumpReader.java
2 files changed, 4 insertions(+), 23 deletions(-)


  git pull ssh://gerrit.wikimedia.org:29418/mediawiki/tools/mwdumper refs/changes/37/192737/1

diff --git a/src/org/mediawiki/importer/SqlWriter15.java b/src/org/mediawiki/importer/SqlWriter15.java
index 4960220..cd02773 100644
--- a/src/org/mediawiki/importer/SqlWriter15.java
+++ b/src/org/mediawiki/importer/SqlWriter15.java
@@ -89,28 +89,6 @@
                lastRevision = revision;
        }
        
-       private static int lengthUtf8(String s) {
-               final int slen = s.length();
-               final char[] buf = Buffer.get(slen);
-               s.getChars(0, slen, buf, 0);
-               int len = 0;
-               for (int i = 0; i < slen; i++) {
-                       char c = buf[i];
-                       if (c < 0x80)
-                               len++;
-                       else if (c < 0x800)
-                               len+=2;
-                       else if (c < 0xD800 || c >= 0xE000)
-                               len+=3;
-                       else {
-                               // Surrogate pairs are assumed to be valid.
-                               len+=4;
-                               i++;
-                       }
-               }
-               return len;
-       }
-       
        private void updatePage(Page page, Revision revision) throws IOException {
                bufferInsertRow("page", new Object[][] {
                                {"page_id", new Integer(page.Id)},
@@ -123,7 +101,7 @@
                                {"page_random", traits.getRandom()},
                                {"page_touched", traits.getCurrentTime()},
                                {"page_latest", new Integer(revision.Id)},
-                               {"page_len", new Integer(lengthUtf8(revision.Text))}});
+                               {"page_len", revision.Bytes}});
                checkpoint();
        }
 
diff --git a/src/org/mediawiki/importer/XmlDumpReader.java b/src/org/mediawiki/importer/XmlDumpReader.java
index b162b55..d5df43e 100644
--- a/src/org/mediawiki/importer/XmlDumpReader.java
+++ b/src/org/mediawiki/importer/XmlDumpReader.java
@@ -438,6 +438,9 @@
        void readText() {
                rev.Text = bufferContentsOrNull();
                if (rev.Text==null && !deleted) rev.Text = ""; //NOTE: null means deleted/supressed
+               if (rev.Bytes == null) {
+                       rev.Bytes = new Integer(rev.Text.getBytes("UTF-8").length);
+               }
        }
 
        void readSha1() {

-- 
To view, visit https://gerrit.wikimedia.org/r/192737
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings

Gerrit-MessageType: newchange
Gerrit-Change-Id: Ie4fc6f204d11908b20fe6589b75d6dbb8ac302d9
Gerrit-PatchSet: 1
Gerrit-Project: mediawiki/tools/mwdumper
Gerrit-Branch: master
Gerrit-Owner: Awight <[email protected]>

_______________________________________________
MediaWiki-commits mailing list
[email protected]
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits

Reply via email to