[
https://issues.apache.org/jira/browse/HADOOP-3481?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel
]
NOMURA Yoshihide updated HADOOP-3481:
-------------------------------------
Status: Patch Available (was: Open)
*** LineRecordReader-org.java Thu May 15 16:20:15 2008
--- LineRecordReader.java Fri May 30 19:11:09 2008
***************
*** 18,26 ****
package org.apache.hadoop.mapred;
import java.io.IOException;
import java.io.InputStream;
! import java.io.OutputStream;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
--- 18,27 ----
package org.apache.hadoop.mapred;
+ import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.InputStream;
! import java.nio.charset.Charset;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
***************
*** 53,58 ****
--- 54,60 ----
private int bufferLength = 0;
// the current position in the buffer
private int bufferPosn = 0;
+ private Charset charset = Charset.forName("UTF-8");
/**
* Create a line reader that reads from the given stream using the
***************
*** 76,81 ****
--- 78,92 ----
*/
public LineReader(InputStream in, Configuration conf) throws IOException {
this(in, conf.getInt("io.file.buffer.size", DEFAULT_BUFFER_SIZE));
+ Charset charset = null;
+ try {
+ charset = Charset.forName(conf.get("io.file.defaultEncoding", "utf-8"));
+ } catch (Exception e) {
+ // nop
+ }
+ if (charset != null) {
+ this.charset = charset;
+ }
}
/**
***************
*** 105,110 ****
--- 116,122 ----
*/
public int readLine(Text str) throws IOException {
str.clear();
+ ByteArrayOutputStream buf = new ByteArrayOutputStream();
boolean hadFinalNewline = false;
boolean hadFinalReturn = false;
boolean hitEndOfFile = false;
***************
*** 138,154 ****
}
int length = bufferPosn - startPosn - (hadFinalReturn ? 1 : 0);
if (length >= 0) {
! str.append(buffer, startPosn, length);
}
}
int newlineLength = (hadFinalNewline ? 1 : 0) + (hadFinalReturn ? 1 : 0);
if (!hitEndOfFile) {
int length = bufferPosn - startPosn - newlineLength;
if (length > 0) {
! str.append(buffer, startPosn, length);
}
}
! return str.getLength() + newlineLength;
}
}
--- 150,169 ----
}
int length = bufferPosn - startPosn - (hadFinalReturn ? 1 : 0);
if (length >= 0) {
! buf.write(buffer, startPosn, length);
}
}
int newlineLength = (hadFinalNewline ? 1 : 0) + (hadFinalReturn ? 1 : 0);
if (!hitEndOfFile) {
int length = bufferPosn - startPosn - newlineLength;
if (length > 0) {
! buf.write(buffer, startPosn, length);
}
}
! buf.close();
! String tmpstr = new String(buf.toByteArray(), charset);
! str.set(tmpstr);
! return buf.size() + newlineLength;
}
}
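
For illustration only (not part of the patch): a minimal standalone sketch of the decoding step the patch performs, i.e. buffer the raw line bytes, decode them with the configured charset, and store the result back into a Text, which always holds UTF-8 internally. The class and method names below are hypothetical; the io.file.defaultEncoding key and the UTF-8 fallback are taken from the patch above.

  import java.io.ByteArrayOutputStream;
  import java.nio.charset.Charset;
  import org.apache.hadoop.conf.Configuration;
  import org.apache.hadoop.io.Text;

  // Hypothetical helper, for illustration of the decoding approach.
  public class EncodedLineDecoder {
    public static void decodeInto(byte[] lineBytes, int off, int len,
                                  Configuration conf, Text str) {
      // Fall back to UTF-8 if the key is unset, mirroring the patch.
      Charset charset = Charset.forName(
          conf.get("io.file.defaultEncoding", "UTF-8"));
      // Collect the line bytes, as the patch does with a ByteArrayOutputStream.
      ByteArrayOutputStream buf = new ByteArrayOutputStream();
      buf.write(lineBytes, off, len);
      // Decode with the source charset; Text.set(String) re-encodes as UTF-8.
      str.set(new String(buf.toByteArray(), charset));
    }
  }

Because Text re-encodes the decoded String as UTF-8, downstream mappers see UTF-8 regardless of the on-disk encoding, which is the point of decoding in the reader rather than in user code.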
> TextInputFormat should support character encoding settings
> ----------------------------------------------------------
>
> Key: HADOOP-3481
> URL: https://issues.apache.org/jira/browse/HADOOP-3481
> Project: Hadoop Core
> Issue Type: Improvement
> Components: mapred
> Affects Versions: 0.17.0
> Environment: Windows XP SP3
> Reporter: NOMURA Yoshihide
>
> I need to read text files in a character encoding other than UTF-8,
> but I think TextInputFormat does not support other encodings.
> I suggest that TextInputFormat support an encoding setting like this:
> conf.set("io.file.defaultEncoding", "MS932");
> I will submit a patch candidate.
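
For illustration, a hedged sketch of how a job driver might set the proposed key once the patch is applied. The key name and the MS932 value come from the issue description above; the surrounding 0.17-era mapred boilerplate (JobConf, TextInputFormat, FileInputFormat) and the class name are assumptions.

  import org.apache.hadoop.fs.Path;
  import org.apache.hadoop.mapred.FileInputFormat;
  import org.apache.hadoop.mapred.JobConf;
  import org.apache.hadoop.mapred.TextInputFormat;

  // Hypothetical driver, for illustration only.
  public class Ms932TextJob {
    public static void main(String[] args) throws Exception {
      JobConf conf = new JobConf(Ms932TextJob.class);
      // Proposed setting from this issue: decode input lines as MS932.
      conf.set("io.file.defaultEncoding", "MS932");
      conf.setInputFormat(TextInputFormat.class);
      FileInputFormat.setInputPaths(conf, new Path(args[0]));
      // ... configure mapper, reducer, and output path, then submit
      // the job with JobClient.runJob(conf).
    }
  }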
--
This message is automatically generated by JIRA.
-
You can reply to this email to add a comment to the issue online.