Brion VIBBER has submitted this change and it was merged.

Change subject: [WIP] Calculate text length during input processing
......................................................................


[WIP] Calculate text length during input processing

Change-Id: Ie4fc6f204d11908b20fe6589b75d6dbb8ac302d9
---
M src/org/mediawiki/importer/SqlWriter15.java
M src/org/mediawiki/importer/XmlDumpReader.java
2 files changed, 11 insertions(+), 23 deletions(-)

Approvals:
  Brion VIBBER: Verified; Looks good to me, approved



diff --git a/src/org/mediawiki/importer/SqlWriter15.java b/src/org/mediawiki/importer/SqlWriter15.java
index 4960220..cd02773 100644
--- a/src/org/mediawiki/importer/SqlWriter15.java
+++ b/src/org/mediawiki/importer/SqlWriter15.java
@@ -89,28 +89,6 @@
                lastRevision = revision;
        }
        
-       private static int lengthUtf8(String s) {
-               final int slen = s.length();
-               final char[] buf = Buffer.get(slen);
-               s.getChars(0, slen, buf, 0);
-               int len = 0;
-               for (int i = 0; i < slen; i++) {
-                       char c = buf[i];
-                       if (c < 0x80)
-                               len++;
-                       else if (c < 0x800)
-                               len+=2;
-                       else if (c < 0xD800 || c >= 0xE000)
-                               len+=3;
-                       else {
-                               // Surrogate pairs are assumed to be valid.
-                               len+=4;
-                               i++;
-                       }
-               }
-               return len;
-       }
-       
        private void updatePage(Page page, Revision revision) throws IOException {
                bufferInsertRow("page", new Object[][] {
                                {"page_id", new Integer(page.Id)},
@@ -123,7 +101,7 @@
                                {"page_random", traits.getRandom()},
                                {"page_touched", traits.getCurrentTime()},
                                {"page_latest", new Integer(revision.Id)},
-                               {"page_len", new Integer(lengthUtf8(revision.Text))}});
+                               {"page_len", revision.Bytes}});
                checkpoint();
        }
 
diff --git a/src/org/mediawiki/importer/XmlDumpReader.java b/src/org/mediawiki/importer/XmlDumpReader.java
index 8431b5e..01c01c7 100644
--- a/src/org/mediawiki/importer/XmlDumpReader.java
+++ b/src/org/mediawiki/importer/XmlDumpReader.java
@@ -27,6 +27,7 @@
 
 import java.io.IOException;
 import java.io.InputStream;
+import java.io.UnsupportedEncodingException;
 import java.util.Calendar;
 import java.util.GregorianCalendar;
 import java.util.HashMap;
@@ -425,6 +426,15 @@
        void readText() {
                rev.Text = bufferContentsOrNull();
                if (rev.Text==null && !deleted) rev.Text = ""; //NOTE: null means deleted/supressed
+               if (rev.Bytes == null) {
+                       try {
+                               rev.Bytes = rev.Text.getBytes("UTF-8").length;
+                       } catch (UnsupportedEncodingException ex) {
+                               // FIXME: What should we use as a default value on failure?
+                               // This is probably unreachable...
+                               rev.Bytes = -1;
+                       }
+               }
        }
 
        void readSha1() {

-- 
To view, visit https://gerrit.wikimedia.org/r/192737
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings

Gerrit-MessageType: merged
Gerrit-Change-Id: Ie4fc6f204d11908b20fe6589b75d6dbb8ac302d9
Gerrit-PatchSet: 4
Gerrit-Project: mediawiki/tools/mwdumper
Gerrit-Branch: master
Gerrit-Owner: Awight <[email protected]>
Gerrit-Reviewer: ArielGlenn <[email protected]>
Gerrit-Reviewer: Brion VIBBER <[email protected]>

_______________________________________________
MediaWiki-commits mailing list
[email protected]
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits

Reply via email to