baeminbo commented on code in PR #32398:
URL: https://github.com/apache/beam/pull/32398#discussion_r1746721126
##########
sdks/java/core/src/main/java/org/apache/beam/sdk/io/TextSource.java:
##########
@@ -377,106 +392,61 @@ private boolean readDefaultLine() throws IOException {
return true;
}
- /**
- * Loosely based upon <a
- *
href="https://github.com/hanborq/hadoop/blob/master/src/core/org/apache/hadoop/util/LineReader.java">Hadoop
- * LineReader.java</a>
- *
- * <p>Note that this implementation fixes an issue where a partial match
against the delimiter
- * would have been lost if the delimiter crossed at the buffer boundaries
during reading.
- */
private boolean readCustomLine() throws IOException {
- assert !eof;
+ checkState(!eof);
+ checkNotNull(delimiter);
+ checkNotNull(
+ delimiterFinder, "DelimiterFinder must not be null if custom
delimiter is used.");
long bytesConsumed = 0;
- int delPosn = 0;
- EOF:
- for (; ; ) {
- int startPosn = bufferPosn; // starting from where we left off the
last time
+ delimiterFinder.reset();
- // Read the next chunk from the file, ensure that we read at least one
byte
- // or reach EOF.
+ while (true) {
+ int startPosn = bufferPosn;
while (bufferPosn >= bufferLength) {
startPosn = bufferPosn = 0;
byteBuffer.clear();
bufferLength = inChannel.read(byteBuffer);
- // If we are at EOF then try to create the last value from the
buffer.
if (bufferLength < 0) {
eof = true;
- // Write any partial delimiter now that we are at EOF
- if (delPosn != 0) {
- str.write(delimiter, 0, delPosn);
- }
-
- // Don't return an empty record if the file ends with a delimiter
if (str.size() == 0) {
return false;
}
+ // Not ending with a delimiter.
currentValue = str.toString(StandardCharsets.UTF_8.name());
- break EOF;
+ break;
}
}
- int prevDelPosn = delPosn;
- DELIMITER_MATCH:
- {
- if (delPosn > 0) {
- // slow-path: Handle the case where we only matched part of the
delimiter, possibly
- // adding that to str fixing up any partially consumed delimiter
if we don't match the
- // whole delimiter
- for (; bufferPosn < bufferLength; ++bufferPosn) {
- if (buffer[bufferPosn] == delimiter[delPosn]) {
- delPosn++;
- if (delPosn == delimiter.length) {
- bufferPosn++;
- break DELIMITER_MATCH; // Skip matching the delimiter using
the fast path
- }
- } else {
- // Add to str any previous partial delimiter since we didn't
match the whole
- // delimiter
- str.write(delimiter, 0, prevDelPosn);
- if (buffer[bufferPosn] == delimiter[0]) {
- delPosn = 1;
- } else {
- delPosn = 0;
- }
- break; // Leave this loop and use the fast-path delimiter
matching
- }
- }
- }
+ if (eof) {
+ break;
+ }
- // fast-path: Look for the delimiter within the buffer
- for (; bufferPosn < bufferLength; ++bufferPosn) {
- if (buffer[bufferPosn] == delimiter[delPosn]) {
- delPosn++;
- if (delPosn == delimiter.length) {
- bufferPosn++;
- break;
- }
- } else if (buffer[bufferPosn] == delimiter[0]) {
- delPosn = 1;
- } else {
- delPosn = 0;
- }
+ boolean delimiterFound = false;
+ for (; bufferPosn < bufferLength; ++bufferPosn) {
Review Comment:
I changed the location of `startPosn` just before the loop. If it's
initialized in the loop, it looks like:
```
int startPosn;
for (startPosn = bufferPosn; bufferPosn < bufferLength;
++bufferPosn) { <-- looks a bit weird to me as `startPosn` is initialized only
and not changed in the loop.
if (delimiterFinder.feed(buffer[bufferPosn])) {
++bufferPosn;
delimiterFound = true;
break;
}
}
int readLength = bufferPosn - startPosn; <-- `startPosn` is used
here. It cannot be in the scope of the loop above.
bytesConsumed += readLength;
...
```
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]