[jira] [Updated] (SPARK-27689) Error to execute hive views with spark

2019-05-20 Thread Juan Antonio (JIRA)


[ https://issues.apache.org/jira/browse/SPARK-27689?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel ]

Juan Antonio updated SPARK-27689:
-
Description: 
I get a Python error when I execute the following code using Hive views, but the same code works correctly when I run it against the Hive tables directly.

*Hive databases:*
{code:java}
CREATE DATABASE schema_p LOCATION "hdfs:///tmp/schema_p";
{code}
*Hive tables:*
{code:java}
CREATE TABLE schema_p.product(
 id_product string,
 name string,
 country string,
 city string,
 start_date string,
 end_date string
 )
 ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe' 
 STORED AS INPUTFORMAT 'org.apache.hadoop.mapred.TextInputFormat' 
 OUTPUTFORMAT 'org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat'
 LOCATION 'hdfs:///tmp/schema_p/product';
{code}
{code:java}
CREATE TABLE schema_p.person_product(
 id_person string,
 id_product string,
 country string,
 city string,
 price string,
 start_date string,
 end_date string
 )
 ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe' 
 STORED AS INPUTFORMAT 'org.apache.hadoop.mapred.TextInputFormat' 
 OUTPUTFORMAT 'org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat'
 LOCATION 'hdfs:///tmp/schema_p/person_product';
{code}
*Hive views:*
{code:java}
CREATE VIEW schema_p.product_v AS SELECT CAST(id_product AS INT) AS id_product, name AS name, country AS country, city AS city, CAST(start_date AS DATE) AS start_date, CAST(end_date AS DATE) AS end_date FROM schema_p.product;

CREATE VIEW schema_p.person_product_v AS SELECT CAST(id_person AS INT) AS id_person, CAST(id_product AS INT) AS id_product, country AS country, city AS city, CAST(price AS DECIMAL(38,8)) AS price, CAST(start_date AS DATE) AS start_date, CAST(end_date AS DATE) AS end_date FROM schema_p.person_product;
{code}
*Code*:
{code:java}
def read_tables(sc):
  in_dict = { 'product': 'product_v', 'person_product': 'person_product_v' }
  data_dict = {}
  for n, d in in_dict.iteritems():
    data_dict[n] = sc.read.table(d)
  return data_dict

def get_population(tables, ref_date_str):
  product = tables['product']
  person_product = tables['person_product']

  count_prod = person_product.groupBy('id_product').agg(F.count('id_product').alias('count_prod'))
  person_product_join = person_product.join(product,'id_product')
  person_count = person_product_join.join(count_prod,'id_product')

  final = person_product_join.join(person_count, 'id_person', 'left')

  return final

import pyspark.sql.functions as F
import functools
from pyspark.sql.functions import col
from pyspark.sql.functions import add_months, lit, count, coalesce

spark.sql('use schema_p')
data_dict = read_tables(spark)
data_dict

population = get_population(data_dict, '2019-04-30')
population.show()
{code}
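The exception below comes from the last join: per the traceback (line 10 of get_population), both sides of final descend from the same scan of person_product_v, so it is effectively a self-join through the view. A minimal sketch of just that shape, assuming the views above exist and spark is a Hive-enabled session:
{code:python}
import pyspark.sql.functions as F

# Minimal sketch: both 'left' and 'right' trace back to the same view scan,
# so the final join below is a self-join through person_product_v.
pp = spark.read.table('schema_p.person_product_v')
prod = spark.read.table('schema_p.product_v')

left = pp.join(prod, 'id_product')
counts = pp.groupBy('id_product').agg(F.count('id_product').alias('count_prod'))
right = left.join(counts, 'id_product')

# Reported to raise the AnalysisException below on 2.3.0/2.3.3/2.4.3,
# while the same joins over the base tables work.
left.join(right, 'id_person', 'left').show()
{code}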
*Error:*
{code:java}
Traceback (most recent call last):
  File "<stdin>", line 1, in <module>
  File "<stdin>", line 10, in get_population
  File "/usr/hdp/current/spark2-client/python/pyspark/sql/dataframe.py", line 931, in join
    jdf = self._jdf.join(other._jdf, on, how)
  File "/usr/hdp/current/spark2-client/python/lib/py4j-0.10.6-src.zip/py4j/java_gateway.py", line 1160, in __call__
  File "/usr/hdp/current/spark2-client/python/pyspark/sql/utils.py", line 69, in deco
    raise AnalysisException(s.split(': ', 1)[1], stackTrace)
pyspark.sql.utils.AnalysisException: u'Resolved attribute(s) id_person#103,start_date#108,id_product#104,end_date#109,price#107,country#105,city#106 missing from price#4,id_product#1,start_date#5,end_date#6,id_person#0,city#3,country#2 in operator !Project [cast(id_person#103 as int) AS id_person#76, cast(id_product#104 as int) AS id_product#77, cast(country#105 as string) AS country#78, cast(city#106 as string) AS city#79, cast(price#107 as decimal(38,8)) AS price#80, cast(start_date#108 as date) AS start_date#81, cast(end_date#109 as date) AS end_date#82]. Attribute(s) with the same name appear in the operation: id_person,start_date,id_product,end_date,price,country,city. Please check if the right attribute(s) are used.;;
Project [id_person#0, id_product#1, country#2, city#3, price#4, start_date#5, end_date#6, name#29, country#30, city#31, start_date#32, end_date#33, id_product#104, country#105, city#106, price#107, start_date#108, end_date#109, name#137, country#138, city#139, start_date#140, end_date#141, count_prod#61L]
+- Join LeftOuter, (id_person#0 = id_person#103)
   :- Project [id_product#1, id_person#0, country#2, city#3, price#4, start_date#5, end_date#6, name#29, country#30, city#31, start_date#32, end_date#33]
   :  +- Join Inner, (id_product#1 = id_product#28)
   :     :- SubqueryAlias person_product_v
   :     :  +- View (`schema_p`.`person_product_v`, [id_person#0,id_product#1,country#2,city#3,price#4,start_date#5,end_date#6])
   :     :     +- Project [cast(id_person#7 as int) AS id_person#0, cast(id_product#8 as int) AS id_product#1, cast(country#9 as string) AS country#2, cast(city#10 as string) AS city#3, 
{code}
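A possible workaround, sketched under the assumption that the self-join only exists to attach the per-product count: computing count_prod with a window function keeps everything in one lineage and never joins a DataFrame against its own view scan. The helper name is hypothetical, and column-for-column equivalence with get_population is not guaranteed:
{code:python}
import pyspark.sql.functions as F
from pyspark.sql import Window

# Hypothetical workaround sketch: a window over id_product replaces
# groupBy + join back onto the same lineage.
def get_population_workaround(tables):
  product = tables['product']
  person_product = tables['person_product']

  w = Window.partitionBy('id_product')
  joined = person_product.join(product, 'id_product')
  return joined.withColumn('count_prod', F.count('id_product').over(w))
{code}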

[jira] [Updated] (SPARK-27689) Error to execute hive views with spark

2019-05-17 Thread Juan Antonio (JIRA)


[ https://issues.apache.org/jira/browse/SPARK-27689?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel ]

Juan Antonio updated SPARK-27689:
-
Priority: Major  (was: Minor)

> Error to execute hive views with spark
> --
>
> Key: SPARK-27689
> URL: https://issues.apache.org/jira/browse/SPARK-27689
> Project: Spark
>  Issue Type: Bug
>  Components: SQL
>Affects Versions: 2.3.0, 2.3.3, 2.4.3
>Reporter: Juan Antonio
>Priority: Major
>

[jira] [Updated] (SPARK-27689) Error to execute hive views with spark

2019-05-15 Thread Juan Antonio (JIRA)


[ https://issues.apache.org/jira/browse/SPARK-27689?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel ]

Juan Antonio updated SPARK-27689:
-
Priority: Minor  (was: Major)


[jira] [Updated] (SPARK-27689) Error to execute hive views with spark

2019-05-15 Thread Hyukjin Kwon (JIRA)


[ https://issues.apache.org/jira/browse/SPARK-27689?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel ]

Hyukjin Kwon updated SPARK-27689:
-
Priority: Major  (was: Critical)


[jira] [Updated] (SPARK-27689) Error to execute hive views with spark

2019-05-15 Thread Hyukjin Kwon (JIRA)


[ https://issues.apache.org/jira/browse/SPARK-27689?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel ]

Hyukjin Kwon updated SPARK-27689:
-
Description: 
I get a Python error when I execute the following code using Hive views, but the same code works correctly when I run it against the Hive tables directly.

*Hive databases:*
{code:java}
CREATE DATABASE schema_p LOCATION "hdfs:///tmp/schema_p";
{code}
*Hive tables:*
{code:java}
CREATE TABLE schema_p.person(
 id_person string, 
 identifier string, 
 gender string, 
 start_date string, 
 end_date string)
 ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe' 
 STORED AS INPUTFORMAT 'org.apache.hadoop.mapred.TextInputFormat' 
 OUTPUTFORMAT 'org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat'
 LOCATION 'hdfs:///tmp/schema_p/person';
{code}
{code:java}
CREATE TABLE schema_p.product(
 id_product string,
 name string,
 country string,
 city string,
 start_date string,
 end_date string
 )
 ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe' 
 STORED AS INPUTFORMAT 'org.apache.hadoop.mapred.TextInputFormat' 
 OUTPUTFORMAT 'org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat'
 LOCATION 'hdfs:///tmp/schema_p/product';
{code}
{code:java}
CREATE TABLE schema_p.person_product(
 id_person string,
 id_product string,
 country string,
 city string,
 price string,
 start_date string,
 end_date string
 )
 ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe' 
 STORED AS INPUTFORMAT 'org.apache.hadoop.mapred.TextInputFormat' 
 OUTPUTFORMAT 'org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat'
 LOCATION 'hdfs:///tmp/schema_p/person_product';
{code}
*Hive views:*
{code:java}
CREATE VIEW schema_p.person_v AS SELECT CAST(id_person AS INT) AS id_person, CAST(identifier AS INT) AS identifier, gender AS gender, CAST(start_date AS DATE) AS start_date, CAST(end_date AS DATE) AS end_date FROM schema_p.person;

CREATE VIEW schema_p.product_v AS SELECT CAST(id_product AS INT) AS id_product, name AS name, country AS country, city AS city, CAST(start_date AS DATE) AS start_date, CAST(end_date AS DATE) AS end_date FROM schema_p.product;

CREATE VIEW schema_p.person_product_v AS SELECT CAST(id_person AS INT) AS id_person, CAST(id_product AS INT) AS id_product, country AS country, city AS city, CAST(price AS DECIMAL(38,8)) AS price, CAST(start_date AS DATE) AS start_date, CAST(end_date AS DATE) AS end_date FROM schema_p.person_product;
{code}
*Code*:
{code:java}
def read_tables(sc):
  in_dict = { 'person': 'person_v', 'product': 'product_v', 'person_product': 'person_product_v' }
  data_dict = {}
  for n, d in in_dict.iteritems():
    data_dict[n] = sc.read.table(d)
  return data_dict

def get_population(tables, ref_date_str):
  person = tables['person']
  product = tables['product']
  person_product = tables['person_product']
  person_product_join = person_product.join(product,'id_product')
  count_prod = person_product.groupBy('id_product').agg(F.count('id_product').alias('count_prod'))
  person_count = person_product_join.join(count_prod,'id_product')
  final1 = person_product_join.join(person_count, 'id_person', 'left')
  final = final1.withColumn('reference_date', F.lit(ref_date_str))
  return final

import pyspark.sql.functions as F
import functools
from pyspark.sql.functions import col
from pyspark.sql.functions import add_months, lit, count, coalesce

spark.sql('use schema_p')
data_dict = read_tables(spark)
data_dict

population = get_population(data_dict, '2019-04-30')
population.show()
{code}
 

*Error:*
{code:java}
File "", line 1, in 
 File "", line 10, in get_population
 File "/usr/hdp/current/spark2-client/python/pyspark/sql/dataframe.py", line 
931, in join
 jdf = self._jdf.join(other._jdf, on, how)
 File 
"/usr/hdp/current/spark2-client/python/lib/py4j-0.10.6-src.zip/py4j/java_gateway.py",
 line 1160, in __call__
 File "/usr/hdp/current/spark2-client/python/pyspark/sql/utils.py", line 69, in 
deco
 raise AnalysisException(s.split(': ', 1)[1], stackTrace)
 pyspark.sql.utils.AnalysisException: u'Resolved attribute(s) 
id_product#124,end_date#129,city#126,price#127,start_date#128,id_person#123,country#125
 missing from 
city#47,price#48,start_date#49,id_product#45,end_date#50,id_person#44,country#46
 in operator !Project [cast(id_person#123 as int) AS id_person#96, 
cast(id_product#124 as int) AS id_product#97, cast(country#125 as string) AS 
country#98, cast(city#126 as string) AS city#99, cast(price#127 as 
decimal(38,8)) AS price#100, cast(start_date#128 as date) AS start_date#101, 
cast(end_date#129 as date) AS end_date#102|#123 as int) AS id_person#96, 
cast(id_product#124 as int) AS id_product#97, cast(country#125 as string) AS 
country#98, cast(city#126 as string) AS city#99, cast(price#127 as 
decimal(38,8)) AS price#100, cast(start_date#128 as date) AS start_date#101, 
cast(end_date#129 as date) AS 

[jira] [Updated] (SPARK-27689) Error to execute hive views with spark

2019-05-13 Thread Yuming Wang (JIRA)


[ https://issues.apache.org/jira/browse/SPARK-27689?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel ]

Yuming Wang updated SPARK-27689:
