[
https://issues.apache.org/jira/browse/NIFI-2876?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=15880552#comment-15880552
]
ASF GitHub Bot commented on NIFI-2876:
--------------------------------------
Github user markap14 commented on a diff in the pull request:
https://github.com/apache/nifi/pull/1214#discussion_r102726151
--- Diff:
nifi-commons/nifi-utils/src/main/java/org/apache/nifi/stream/io/util/TextLineDemarcator.java
---
@@ -95,52 +71,61 @@ public OffsetInfo nextOffsetInfo() {
*
* @return offset info
*/
- public OffsetInfo nextOffsetInfo(byte[] startsWith) {
+ public OffsetInfo nextOffsetInfo(byte[] startsWith) throws IOException
{
OffsetInfo offsetInfo = null;
- int lineLength = 0;
- byte[] token = null;
- lineLoop:
- while (this.bufferLength != -1) {
+ byte previousByteVal = 0;
+ byte[] data = null;
+ nextTokenLoop:
+ while (data == null && this.bufferLength != -1) {
if (this.index >= this.bufferLength) {
this.fill();
}
+ int delimiterSize = 0;
if (this.bufferLength != -1) {
- int i;
byte byteVal;
+ int i;
for (i = this.index; i < this.bufferLength; i++) {
byteVal = this.buffer[i];
- lineLength++;
- int crlfLength = computeEol(byteVal, i + 1);
- if (crlfLength > 0) {
- i += crlfLength;
- if (crlfLength == 2) {
- lineLength++;
- }
- offsetInfo = new OffsetInfo(this.offset,
lineLength, crlfLength);
+
+ if (byteVal == 10) {
+ delimiterSize = previousByteVal == 13 ? 2 : 1;
+ } else if (previousByteVal == 13) {
+ delimiterSize = 1;
+ i--;
+ }
+ previousByteVal = byteVal;
+ if (delimiterSize > 0) {
+ this.index = i + 1;
+ int size = Math.max(1, this.index - this.mark);
+ offsetInfo = new OffsetInfo(this.offset, size,
delimiterSize);
+ this.offset += size;
if (startsWith != null) {
- token = this.extractDataToken(lineLength);
+ data = this.extractDataToken(size);
}
this.mark = this.index;
- break lineLoop;
+ break nextTokenLoop;
}
}
this.index = i;
+ } else {
+ delimiterSize = previousByteVal == 13 || previousByteVal
== 10 ? 1 : 0;
+ if (offsetInfo == null) {
+ int size = this.index - this.mark;
+ if (size > 0) {
+ offsetInfo = new OffsetInfo(this.offset, size,
delimiterSize);
+ this.offset += size;
+ }
+ }
+ if (startsWith != null) {
+ data = this.extractDataToken(this.index - this.mark);
+ }
}
}
- // EOF where last char(s) are not CRLF.
- if (lineLength > 0 && offsetInfo == null) {
- offsetInfo = new OffsetInfo(this.offset, lineLength, 0);
- if (startsWith != null) {
- token = this.extractDataToken(lineLength);
- }
- }
- this.offset += lineLength;
- // checks if the new line starts with 'startsWith' chars
- if (startsWith != null) {
+ if (startsWith != null && data != null) {
for (int i = 0; i < startsWith.length; i++) {
byte sB = startsWith[i];
- if (token != null && sB != token[i]) {
+ if (data != null && sB != data[i]) {
--- End diff --
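`data` is guaranteed non-null here because it is already checked in the enclosing `if (startsWith != null && data != null)`, so the `data != null` test on this line is redundant.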
> Refactor TextLineDemarcator and StreamDemarcator into a common abstract class
> -----------------------------------------------------------------------------
>
> Key: NIFI-2876
> URL: https://issues.apache.org/jira/browse/NIFI-2876
> Project: Apache NiFi
> Issue Type: Improvement
> Reporter: Oleg Zhurakousky
> Assignee: Oleg Zhurakousky
> Priority: Minor
> Fix For: 1.2.0
>
>
> Based on the work performed as part of NIFI-2851, we now
> have a new class with significantly faster logic to perform demarcation of
> the InputStream (TextLineDemarcator). This new class's starting point
> was the existing LineDemarcator. The two now share ~60-70% of their code,
> which should be extracted into a common abstract class; the new (faster)
> demarcation logic should also be incorporated into StreamDemarcator.
--
This message was sent by Atlassian JIRA
(v6.3.15#6346)