Thanks a bunch for replying! I quickly ran this - the TAGS_US data in HDFS is 
in parquet format.

select distinct typeof(cvalue) 

from hdfs.drill.TAGS_US;

+----------+

|  EXPR$0  |

+----------+

| VARCHAR  |

+----------+



Same with the table in Redshift. I changed my query to specifically cast the 
columns to VARCHAR again - 

select count(*) 

from redshift.reports.public.us_tags as a 

join hdfs.drill.TAGS_US as b

on cast(b.cvalue as varchar) = cast(a.tag_value as varchar) ;


I see the same error again.

Here is the explain plan for the query - 

select count(*) 
from hdfs.drill.TAGS_US as a
join redshift.reports.public.us_tags as b
on a.cvalue = b.tag_value;

Error: SYSTEM ERROR: IllegalStateException: Already had POJO for id 
(java.lang.Integer) 
[com.fasterxml.jackson.annotation.ObjectIdGenerator$IdKey@3372bbe8]


+------+------+
| text | json |
+------+------+
| 00-00    Screen
00-01      StreamAgg(group=[{}], EXPR$0=[$SUM0($0)])
00-02        UnionExchange
01-01          StreamAgg(group=[{}], EXPR$0=[COUNT()])
01-02            Project($f0=[0])
01-03              HashJoin(condition=[=($0, $1)], joinType=[inner])
01-05                Scan(groupscan=[ParquetGroupScan 
[entries=[ReadEntryWithPath 
[path=hdfs://ec2-XX-XX-XX-XX.compute-1.amazonaws.com:8020/drill/TAGS_US]], 
selectionRoot=hdfs://ec2-XX-XX-XX-XX.compute-1.amazonaws.com:8020/drill/TAGS_US,
 numFiles=1, usedMetadataFile=false, columns=[`cvalue`]]])
01-04                BroadcastExchange
02-01                  Project(tag_value=[$2])
02-02                    Jdbc(sql=[SELECT *
FROM "reports"."public"."us_tags"])
 | {
  "head" : {
    "version" : 1,
    "generator" : {
      "type" : "ExplainHandler",
      "info" : ""
    },
    "type" : "APACHE_DRILL_PHYSICAL",
    "options" : [ ],
    "queue" : 0,
    "resultMode" : "EXEC"
  },
  "graph" : [ {
    "pop" : "jdbc-scan",
    "@id" : 0,
    "sql" : "SELECT *\nFROM \"reports\".\"public\".\"us_tags\"",
    "config" : {
      "type" : "jdbc",
      "driver" : "com.amazon.redshift.jdbc4.Driver",
      "url" : "",
      "username" : "",
      "password" : "",
      "enabled" : true
    },
    "userName" : "",
    "cost" : 0.0
  }, {
    "pop" : "project",
    "@id" : 131073,
    "exprs" : [ {
      "ref" : "`tag_value`",
      "expr" : "`tag_value`"
    } ],
    "child" : 0,
    "initialAllocation" : 1000000,
    "maxAllocation" : 10000000000,
    "cost" : 100.0
  }, {
    "pop" : "broadcast-exchange",
    "@id" : 65540,
    "child" : 131073,
    "initialAllocation" : 1000000,
    "maxAllocation" : 10000000000,
    "cost" : 100.0
  }, {
    "pop" : "parquet-scan",
    "@id" : 65541,
    "userName" : "XXXX",
    "entries" : [ {
      "path" : 
"hdfs://ec2-XX-XX-XX-XX.compute-1.amazonaws.com:8020/drill/TAGS_US"
    } ],
    "storage" : {
      "type" : "file",
      "enabled" : true,
      "connection" : "hdfs://ec2-XX-XX-XX-XX.compute-1.amazonaws.com:8020/",
      "workspaces" : {
        "root" : {
          "location" : "/",
          "writable" : true,
          "defaultInputFormat" : null
        },
        "tmp" : {
          "location" : "/tmp",
          "writable" : true,
          "defaultInputFormat" : null
        },
        "drill" : {
          "location" : "/drill",
          "writable" : true,
          "defaultInputFormat" : "tsv"
        },
        "drill2" : {
          "location" : "/drill",
          "writable" : true,
          "defaultInputFormat" : "csv"
        }
      },
      "formats" : {
        "psv" : {
          "type" : "text",
          "extensions" : [ "tbl" ],
          "delimiter" : "|"
        },
        "csv" : {
          "type" : "text",
          "extensions" : [ "csv" ],
          "delimiter" : ","
        },
        "tsv" : {
          "type" : "text",
          "extensions" : [ "tsv" ],
          "delimiter" : "\t"
        },
        "parquet" : {
          "type" : "parquet"
        },
        "json" : {
          "type" : "json"
        },
        "avro" : {
          "type" : "avro"
        },
        "sequencefile" : {
          "type" : "sequencefile",
          "extensions" : [ "seq" ]
        },
        "csvh" : {
          "type" : "text",
          "extensions" : [ "csvh" ],
          "extractHeader" : true,
          "delimiter" : ","
        }
      }
    },
    "format" : {
      "type" : "parquet"
    },
    "columns" : [ "`cvalue`" ],
    "selectionRoot" : 
"hdfs://ec2-XX-XX-XX-XX.compute-1.amazonaws.com:8020/drill/TAGS_US",
    "fileSet" : [ "/drill/TAGS_US/0_0_1.parquet", 
"/drill/TAGS_US/0_0_0.parquet" ],
    "cost" : 4.1667342E7
  }, {
    "pop" : "hash-join",
    "@id" : 65539,
    "left" : 65541,
    "right" : 65540,
    "conditions" : [ {
      "relationship" : "EQUALS",
      "left" : "`cvalue`",
      "right" : "`tag_value`"
    } ],
    "joinType" : "INNER",
    "initialAllocation" : 1000000,
    "maxAllocation" : 10000000000,
    "cost" : 4.1667342E7
  }, {
    "pop" : "project",
    "@id" : 65538,
    "exprs" : [ {
      "ref" : "`$f0`",
      "expr" : "0"
    } ],
    "child" : 65539,
    "initialAllocation" : 1000000,
    "maxAllocation" : 10000000000,
    "cost" : 4.1667342E7
  }, {
    "pop" : "streaming-aggregate",
    "@id" : 65537,
    "child" : 65538,
    "keys" : [ ],
    "exprs" : [ {
      "ref" : "`EXPR$0`",
      "expr" : "count(1) "
    } ],
    "initialAllocation" : 1000000,
    "maxAllocation" : 10000000000,
    "cost" : 1.0
  }, {
    "pop" : "union-exchange",
    "@id" : 2,
    "child" : 65537,
    "initialAllocation" : 1000000,
    "maxAllocation" : 10000000000,
    "cost" : 1.0
  }, {
    "pop" : "streaming-aggregate",
    "@id" : 1,
    "child" : 2,
    "keys" : [ ],
    "exprs" : [ {
      "ref" : "`EXPR$0`",
      "expr" : "$sum0(`EXPR$0`) "
    } ],
    "initialAllocation" : 1000000,
    "maxAllocation" : 10000000000,
    "cost" : 1.0
  }, {
    "pop" : "screen",
    "@id" : 0,
    "child" : 1,
    "initialAllocation" : 1000000,
    "maxAllocation" : 10000000000,
    "cost" : 1.0
  } ]
} |
+------+------+




> On Mon, Jan 4, 2016 at 9:42 PM, Andries Engelbrecht 
> <[email protected]> wrote:
> Perhaps check the data type of all the fields being used for the join.
> 
> Select cvalue, TYPEOF(cvalue) from hdfs...... limit 10
> 
> and similar for  tag_value on redshift.
> 
> You can then do a predicate to find records where the data type may be 
> different.
>  where typeof(<field>) not like '<data type of field>'
> 
> I believe there was a nice write up on the topic, but can't find it now.
> 
> 
> --Andries
> 
> 
> 
> 
> > On Jan 3, 2016, at 8:45 PM, Rohit Kulkarni <[email protected]> 
> > wrote:
> >
> > Hello all,
> >
> > I am sure some of you, if not all, must have seen this error at some
> > point -
> >
> > *Error: SYSTEM ERROR: IllegalStateException: Already had POJO for id
> > (java.lang.Integer)
> > [com.fasterxml.jackson.annotation.ObjectIdGenerator$IdKey@3372bbe8]*
> >
> > ​I am trying to do a join between Redshift (JDBC​) and HDFS like this -
> >
> >
> >
> >
> > *select count(*) from hdfs.drill.TAGS_US as a right join
> > redshift.reports.public.us_tags as b on a.cvalue = b.tag_value;*
> >
> >
> > I don't see anything wrong in the query. The two individual tables return
> > proper data when fired a query separately. Is something missing or am I
> > doing something wrong?
> >
> > Would very much appreciate your help! Thanks!!
> >
> > --
> > Warm Regards,
> > Rohit Kulkarni



-- 
Warm Regards,
Rohit Kulkarni
Mo.: +91 89394 63593

Reply via email to