[jira] [Updated] (IMPALA-7278) distinct clause is not working as expected with custom UDFs

2018-07-11 Thread Tim Armstrong (JIRA)


 [ 
https://issues.apache.org/jira/browse/IMPALA-7278?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel
 ]

Tim Armstrong updated IMPALA-7278:
--
Description: 
The DISTINCT clause returns unexpected results when the query uses a custom UDF.

Custom UDF Definition:

udf.h file:

==
{code}
#ifndef IMPALA_UDF_SAMPLE_UDF_H
#define IMPALA_UDF_SAMPLE_UDF_H

#include "udf.h"

using namespace impala_udf;

// When compiled as C++, wrap the declaration in extern "C" so that udf_clear
// gets C linkage (an unmangled symbol name).
#ifdef __cplusplus
extern "C"
{
#endif

 

// Custom UDF entry point: takes the function context and a single string
// argument (passed by non-const reference) and returns a string value.
StringVal udf_clear(FunctionContext* context, StringVal& sInput);
#ifdef __cplusplus
}
#endif
#endif
{code}

udf.cpp:


{code}
#include "clear.h"

// Builds a StringVal whose bytes are a copy of sInput (no transformation is
// applied to the data) and returns it.
//
// NOTE(review): the buffer backing the result is released with
// context->Free() before the function returns. If StringVal keeps the raw
// pointer rather than copying the bytes (TODO confirm against udf.h), the
// returned value refers to freed memory, which would be consistent with the
// wrong DISTINCT output shown in this report.
StringVal udf_clear(
 FunctionContext* context,
 StringVal& sInput /* input string; its bytes are copied into the result */
 )
{
 // Fixed-size 100-byte scratch buffer obtained from the function context.
 unsigned char* pReturnData = context->Allocate( 100 );
 // Zero-fill the whole buffer (NULL is used here as the fill value 0).
 memset( pReturnData, NULL, 100);
 // NOTE(review): sInput.len is not checked against the 100-byte buffer and
 // sInput.ptr is not checked for null before copying.
 memcpy(pReturnData, sInput.ptr, sInput.len );
 StringVal sResult( pReturnData );
 // The result length is set explicitly to the input length.
 sResult.len = sInput.len;
 // The buffer is freed here, before sResult is handed back to the caller.
 context->Free( (uint8_t*)pReturnData );
 return sResult;
}
{code}
CMakeLists.txt:

===
{code}
project (clear)
 ADD_LIBRARY (clear2.8_RHEL SHARED clear.cpp )
 TARGET_LINK_LIBRARIES (clear2.8_RHEL libImpalaUdf.a )
 SET_TARGET_PROPERTIES (clear2.8_RHEL PROPERTIES SUFFIX ".so")
 SET_TARGET_PROPERTIES (clear2.8_RHEL PROPERTIES PREFIX "")
 INSTALL ( TARGETS clear2.8_RHEL DESTINATION . )

Query Syntax:

CREATE TABLE clear (c1 STRING, c2 STRING) row format delimited fields 
terminated by ',' stored as textfile;
LOAD DATA INPATH '/user/clear.csv' OVERWRITE INTO TABLE clear;

Query: describe clear
+--++-+
| name | type | comment |
+--++-+
| c1 | string | |
| c2 | string | |
+--++-+
Fetched 2 row(s) in 0.04s

select * from clear;
+-+-+
| c1 | c2 |
+-+-+
| 111 | 111 |
| 111 | 111 |
| 22 | 22 |
| 44 | 44 |
| 22 | 22 |
| 333 | 333 |
| 333 | 333 |
+-+-+
Fetched 7 row(s) in 0.14s

select distinct udf_clear(c1),c2 from clear;
+---+-+
| default.udf_clear(c1) | c2 |
+---+-+
| {color:#d04437}*22* {color}| 44 |   <== this should be *44* 
| 22 | 22 |
| 333 | 333 |
| 111 | 111 |
+---+-+
Fetched 4 row(s) in 0.24s
{code}
 
Expected result:
{code}
select distinct c1,c2 from clear;

+-+-+
| c1 | c2 |
+-+-+
| 44 | 44 |
| 22 | 22 |
| 333 | 333 |
| 111 | 111 |
+-+-+
Fetched 4 row(s) in 0.25s
 {code}

  was:
The DISTINCT clause returns unexpected results when the query uses a custom UDF.

Custom UDF Definition:

udf.h file:

==

#ifndef IMPALA_UDF_SAMPLE_UDF_H
#define IMPALA_UDF_SAMPLE_UDF_H

#include "udf.h"

using namespace impala_udf;

// When compiled as C++, wrap the declaration in extern "C" so that udf_clear
// gets C linkage (an unmangled symbol name).
#ifdef __cplusplus
extern "C"
{
#endif

 

// Custom UDF entry point: takes the function context and a single string
// argument (passed by non-const reference) and returns a string value.
StringVal udf_clear(FunctionContext* context, StringVal& sInput);
#ifdef __cplusplus
}
#endif
#endif

udf.cpp:



#include "clear.h"

// Builds a StringVal whose bytes are a copy of sInput (no transformation is
// applied to the data) and returns it.
//
// NOTE(review): the buffer backing the result is released with
// context->Free() before the function returns. If StringVal keeps the raw
// pointer rather than copying the bytes (TODO confirm against udf.h), the
// returned value refers to freed memory, which would be consistent with the
// wrong DISTINCT output shown in this report.
StringVal udf_clear(
 FunctionContext* context,
 StringVal& sInput /* input string; its bytes are copied into the result */
 )
{
 // Fixed-size 100-byte scratch buffer obtained from the function context.
 unsigned char* pReturnData = context->Allocate( 100 );
 // Zero-fill the whole buffer (NULL is used here as the fill value 0).
 memset( pReturnData, NULL, 100);
 // NOTE(review): sInput.len is not checked against the 100-byte buffer and
 // sInput.ptr is not checked for null before copying.
 memcpy(pReturnData, sInput.ptr, sInput.len );
 StringVal sResult( pReturnData );
 // The result length is set explicitly to the input length.
 sResult.len = sInput.len;
 // The buffer is freed here, before sResult is handed back to the caller.
 context->Free( (uint8_t*)pReturnData );
 return sResult;
}

CMakeLists.txt:

===

project (clear)
 ADD_LIBRARY (clear2.8_RHEL SHARED clear.cpp )
 TARGET_LINK_LIBRARIES (clear2.8_RHEL libImpalaUdf.a )
 SET_TARGET_PROPERTIES (clear2.8_RHEL PROPERTIES SUFFIX ".so")
 SET_TARGET_PROPERTIES (clear2.8_RHEL PROPERTIES PREFIX "")
 INSTALL ( TARGETS clear2.8_RHEL DESTINATION . )

Query Syntax:

CREATE TABLE clear (c1 STRING, c2 STRING) row format delimited fields 
terminated by ',' stored as textfile;
LOAD DATA INPATH '/user/clear.csv' OVERWRITE INTO TABLE clear;

Query: describe clear
+--++-+
| name | type | comment |
+--++-+
| c1 | string | |
| c2 | string | |
+--++-+
Fetched 2 row(s) in 0.04s

select * from clear;
+-+-+
| c1 | c2 |
+-+-+
| 111 | 111 |
| 111 | 111 |
| 22 | 22 |
| 44 | 44 |
| 22 | 22 |
| 333 | 333 |
| 333 | 333 |
+-+-+
Fetched 7 row(s) in 0.14s

select distinct udf_clear(c1),c2 from clear;
+---+-+
| default.udf_clear(c1) | c2 |
+---+-+
| {color:#d04437}*22* {color}| 44 |   <== this should be *44* 
| 22 | 22 |
| 333 | 333 |
| 111 | 111 |
+---+-+
Fetched 4 row(s) in 0.24s

 

Expected result:

select distinct c1,c2 from clear;

+-+-+
| c1 | c2 |
+-+-+
| 44 | 44 |
| 22 | 22 |
| 333 | 333 |
| 111 | 111 |
+-+-+
Fetched 4 row(s) in 0.25s

 


> distinct 

[jira] [Updated] (IMPALA-7278) distinct clause is not working as expected with custom UDFs

2018-07-11 Thread Tim Armstrong (JIRA)


 [ 
https://issues.apache.org/jira/browse/IMPALA-7278?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel
 ]

Tim Armstrong updated IMPALA-7278:
--
Description: 
The DISTINCT clause returns unexpected results when the query uses a custom UDF.

Custom UDF Definition:

udf.h file:
{code}
#ifndef IMPALA_UDF_SAMPLE_UDF_H
#define IMPALA_UDF_SAMPLE_UDF_H

#include "udf.h"

using namespace impala_udf;

// When compiled as C++, wrap the declaration in extern "C" so that udf_clear
// gets C linkage (an unmangled symbol name).
#ifdef __cplusplus
extern "C"
{
#endif

// Custom UDF entry point: takes the function context and a single string
// argument (passed by non-const reference) and returns a string value.
StringVal udf_clear(FunctionContext* context, StringVal& sInput);
#ifdef __cplusplus
}
#endif
#endif
{code}

udf.cpp:

{code}
#include "clear.h"

// Builds a StringVal whose bytes are a copy of sInput (no transformation is
// applied to the data) and returns it.
//
// NOTE(review): the buffer backing the result is released with
// context->Free() before the function returns. If StringVal keeps the raw
// pointer rather than copying the bytes (TODO confirm against udf.h), the
// returned value refers to freed memory, which would be consistent with the
// wrong DISTINCT output shown in this report.
StringVal udf_clear(
 FunctionContext* context,
 StringVal& sInput /* input string; its bytes are copied into the result */
 )
{
 // Fixed-size 100-byte scratch buffer obtained from the function context.
 unsigned char* pReturnData = context->Allocate( 100 );
 // Zero-fill the whole buffer (NULL is used here as the fill value 0).
 memset( pReturnData, NULL, 100);
 // NOTE(review): sInput.len is not checked against the 100-byte buffer and
 // sInput.ptr is not checked for null before copying.
 memcpy(pReturnData, sInput.ptr, sInput.len );
 StringVal sResult( pReturnData );
 // The result length is set explicitly to the input length.
 sResult.len = sInput.len;
 // The buffer is freed here, before sResult is handed back to the caller.
 context->Free( (uint8_t*)pReturnData );
 return sResult;
}
{code}
CMakeLists.txt:
{code}
project (clear)
 ADD_LIBRARY (clear2.8_RHEL SHARED clear.cpp )
 TARGET_LINK_LIBRARIES (clear2.8_RHEL libImpalaUdf.a )
 SET_TARGET_PROPERTIES (clear2.8_RHEL PROPERTIES SUFFIX ".so")
 SET_TARGET_PROPERTIES (clear2.8_RHEL PROPERTIES PREFIX "")
 INSTALL ( TARGETS clear2.8_RHEL DESTINATION . )

Query Syntax:

CREATE TABLE clear (c1 STRING, c2 STRING) row format delimited fields 
terminated by ',' stored as textfile;
LOAD DATA INPATH '/user/clear.csv' OVERWRITE INTO TABLE clear;

Query: describe clear
+--++-+
| name | type | comment |
+--++-+
| c1 | string | |
| c2 | string | |
+--++-+
Fetched 2 row(s) in 0.04s

select * from clear;
+-+-+
| c1 | c2 |
+-+-+
| 111 | 111 |
| 111 | 111 |
| 22 | 22 |
| 44 | 44 |
| 22 | 22 |
| 333 | 333 |
| 333 | 333 |
+-+-+
Fetched 7 row(s) in 0.14s

select distinct udf_clear(c1),c2 from clear;
+---+-+
| default.udf_clear(c1) | c2 |
+---+-+
| {color:#d04437}*22* {color}| 44 |   <== this should be *44* 
| 22 | 22 |
| 333 | 333 |
| 111 | 111 |
+---+-+
Fetched 4 row(s) in 0.24s
{code}
 
Expected result:
{code}
select distinct c1,c2 from clear;
+-+-+
| c1 | c2 |
+-+-+
| 44 | 44 |
| 22 | 22 |
| 333 | 333 |
| 111 | 111 |
+-+-+
Fetched 4 row(s) in 0.25s
 {code}

  was:
The DISTINCT clause returns unexpected results when the query uses a custom UDF.

Custom UDF Definition:

udf.h file:

==
{code}
#ifndef IMPALA_UDF_SAMPLE_UDF_H
#define IMPALA_UDF_SAMPLE_UDF_H

#include "udf.h"

using namespace impala_udf;

// When compiled as C++, wrap the declaration in extern "C" so that udf_clear
// gets C linkage (an unmangled symbol name).
#ifdef __cplusplus
extern "C"
{
#endif

 

// Custom UDF entry point: takes the function context and a single string
// argument (passed by non-const reference) and returns a string value.
StringVal udf_clear(FunctionContext* context, StringVal& sInput);
#ifdef __cplusplus
}
#endif
#endif
{code}

udf.cpp:


{code}
#include "clear.h"

// Builds a StringVal whose bytes are a copy of sInput (no transformation is
// applied to the data) and returns it.
//
// NOTE(review): the buffer backing the result is released with
// context->Free() before the function returns. If StringVal keeps the raw
// pointer rather than copying the bytes (TODO confirm against udf.h), the
// returned value refers to freed memory, which would be consistent with the
// wrong DISTINCT output shown in this report.
StringVal udf_clear(
 FunctionContext* context,
 StringVal& sInput /* input string; its bytes are copied into the result */
 )
{
 // Fixed-size 100-byte scratch buffer obtained from the function context.
 unsigned char* pReturnData = context->Allocate( 100 );
 // Zero-fill the whole buffer (NULL is used here as the fill value 0).
 memset( pReturnData, NULL, 100);
 // NOTE(review): sInput.len is not checked against the 100-byte buffer and
 // sInput.ptr is not checked for null before copying.
 memcpy(pReturnData, sInput.ptr, sInput.len );
 StringVal sResult( pReturnData );
 // The result length is set explicitly to the input length.
 sResult.len = sInput.len;
 // The buffer is freed here, before sResult is handed back to the caller.
 context->Free( (uint8_t*)pReturnData );
 return sResult;
}
{code}
CMakeLists.txt:

===
{code}
project (clear)
 ADD_LIBRARY (clear2.8_RHEL SHARED clear.cpp )
 TARGET_LINK_LIBRARIES (clear2.8_RHEL libImpalaUdf.a )
 SET_TARGET_PROPERTIES (clear2.8_RHEL PROPERTIES SUFFIX ".so")
 SET_TARGET_PROPERTIES (clear2.8_RHEL PROPERTIES PREFIX "")
 INSTALL ( TARGETS clear2.8_RHEL DESTINATION . )

Query Syntax:

CREATE TABLE clear (c1 STRING, c2 STRING) row format delimited fields 
terminated by ',' stored as textfile;
LOAD DATA INPATH '/user/clear.csv' OVERWRITE INTO TABLE clear;

Query: describe clear
+--++-+
| name | type | comment |
+--++-+
| c1 | string | |
| c2 | string | |
+--++-+
Fetched 2 row(s) in 0.04s

select * from clear;
+-+-+
| c1 | c2 |
+-+-+
| 111 | 111 |
| 111 | 111 |
| 22 | 22 |
| 44 | 44 |
| 22 | 22 |
| 333 | 333 |
| 333 | 333 |
+-+-+
Fetched 7 row(s) in 0.14s

select distinct udf_clear(c1),c2 from clear;
+---+-+
| default.udf_clear(c1) | c2 |
+---+-+
| {color:#d04437}*22* {color}| 44 |   <== this should be *44* 
| 22 | 22 |
| 333 | 333 |
| 111 | 111 |
+---+-+
Fetched 4 row(s) in 0.24s
{code}
 
Expected result:
{code}
select distinct c1,c2 from clear;

+-+-+
| c1 | c2 |
+-+-+
| 44 | 44 |
| 22 | 22 |
| 333 | 333 |
| 111 | 111 |
+-+-+
Fetched 4 row(s) in 0.25s
 {code}


>