[
https://issues.apache.org/jira/browse/SPARK-27689?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel
]
Juan Antonio updated SPARK-27689:
---------------------------------
Description:
I have a python error when I execute the following code using hive views but it
works correctly when I run it with hive tables.
*Hive databases:*
{code:java}
CREATE DATABASE schema_p LOCATION "hdfs:///tmp/schema_p";
{code}
*Hive tables:*
{code:java}
CREATE TABLE schema_p.product(
id_product string,
name string,
country string,
city string,
start_date string,
end_date string
)
ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe'
STORED AS INPUTFORMAT 'org.apache.hadoop.mapred.TextInputFormat'
OUTPUTFORMAT 'org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat'
LOCATION 'hdfs:///tmp/schema_p/product';
{code}
{code:java}
CREATE TABLE schema_p.person_product(
id_person string,
id_product string,
country string,
city string,
price string,
start_date string,
end_date string
)
ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe'
STORED AS INPUTFORMAT 'org.apache.hadoop.mapred.TextInputFormat'
OUTPUTFORMAT 'org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat'
LOCATION 'hdfs:///tmp/schema_p/person_product';
{code}
*Hive views:*
{code:java}
CREATE VIEW schema_p.product_v AS SELECT CAST(id_product AS INT) AS id_product,
name AS name, country AS country, city AS city, CAST(start_date AS DATE) AS
start_date, CAST(end_date AS DATE) AS end_date FROM schema_p.product;
CREATE VIEW schema_p.person_product_v AS SELECT CAST(id_person AS INT) AS
id_person, CAST(id_product AS INT) AS id_product, country AS country, city AS
city, CAST(price AS DECIMAL(38,8)) AS price, CAST(start_date AS DATE) AS
start_date, CAST(end_date AS DATE) AS end_date FROM schema_p.person_product;
{code}
*Code*:
{code:java}
def read_tables(sc):
in_dict = { 'product': 'product_v', 'person_product': 'person_product_v' }
data_dict = {}
for n, d in in_dict.iteritems():
data_dict[n] = sc.read.table(d)
return data_dict
def get_population(tables, ref_date_str):
product = tables['product']
person_product = tables['person_product']
count_prod
=person_product.groupBy('id_product').agg(F.count('id_product').alias('count_prod'))
person_product_join = person_product.join(product,'id_product')
person_count = person_product_join.join(count_prod,'id_product')
final = person_product_join.join(person_count, 'id_person', 'left')
return final
import pyspark.sql.functions as F
import functools
from pyspark.sql.functions import col
from pyspark.sql.functions import add_months, lit, count, coalesce
spark.sql('use schema_p')
data_dict = read_tables(spark)
data_dict
population = get_population(data_dict, '2019-04-30')
population.show()
{code}
*Error:*
{code:java}
Traceback (most recent call last):
File "<stdin>", line 1, in <module>
File "<stdin>", line 10, in get_population
File "/usr/hdp/current/spark2-client/python/pyspark/sql/dataframe.py", line
931, in join
jdf = self._jdf.join(other._jdf, on, how)
File
"/usr/hdp/current/spark2-client/python/lib/py4j-0.10.6-src.zip/py4j/java_gateway.py",
line 1160, in __call__
File "/usr/hdp/current/spark2-client/python/pyspark/sql/utils.py", line 69, in
deco
raise AnalysisException(s.split(': ', 1)[1], stackTrace)
pyspark.sql.utils.AnalysisException: u'Resolved attribute(s)
id_person#103,start_date#108,id_product#104,end_date#109,price#107,country#105,city#106
missing from
price#4,id_product#1,start_date#5,end_date#6,id_person#0,city#3,country#2 in
operator !Project [cast(id_person#103 as int) AS id_person#76,
cast(id_product#104 as int) AS id_product#77, cast(country#105 as string) AS
country#78, cast(city#106 as string) AS city#79, cast(price#107 as
decimal(38,8)) AS price#80, cast(start_date#108 as date) AS start_date#81,
cast(end_date#109 as date) AS end_date#82]. Attribute(s) with the same name
appear in the operation:
id_person,start_date,id_product,end_date,price,country,city. Please check if
the right attribute(s) are used.;;
Project [id_person#0, id_product#1, country#2, city#3, price#4, start_date#5,
end_date#6, name#29, country#30, city#31, start_date#32, end_date#33,
id_product#104, country#105, city#106, price#107, start_date#108, end_date#109,
name#137, country#138, city#139, start_date#140, end_date#141, count_prod#61L]
+- Join LeftOuter, (id_person#0 = id_person#103)
:- Project [id_product#1, id_person#0, country#2, city#3, price#4,
start_date#5, end_date#6, name#29, country#30, city#31, start_date#32,
end_date#33]
: +- Join Inner, (id_product#1 = id_product#28)
: :- SubqueryAlias person_product_v
: : +- View (`schema_p`.`person_product_v`,
[id_person#0,id_product#1,country#2,city#3,price#4,start_date#5,end_date#6])
: : +- Project [cast(id_person#7 as int) AS id_person#0, cast(id_product#8 as
int) AS id_product#1, cast(country#9 as string) AS country#2, cast(city#10 as
string) AS city#3, cast(price#11 as decimal(38,8)) AS price#4,
cast(start_date#12 as date) AS start_date#5, cast(end_date#13 as date) AS
end_date#6]
: : +- Project [cast(id_person#14 as int) AS id_person#7, cast(id_product#15 as
int) AS id_product#8, country#16 AS country#9, city#17 AS city#10,
cast(price#18 as decimal(38,8)) AS price#11, cast(start_date#19 as date) AS
start_date#12, cast(end_date#20 as date) AS end_date#13]
: : +- SubqueryAlias person_product
: : +- HiveTableRelation `schema_p`.`person_product`,
org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe, [id_person#14,
id_product#15, country#16, city#17, price#18, start_date#19, end_date#20]
: +- SubqueryAlias product_v
: +- View (`schema_p`.`product_v`,
[id_product#28,name#29,country#30,city#31,start_date#32,end_date#33])
: +- Project [cast(id_product#34 as int) AS id_product#28, cast(name#35 as
string) AS name#29, cast(country#36 as string) AS country#30, cast(city#37 as
string) AS city#31, cast(start_date#38 as date) AS start_date#32,
cast(end_date#39 as date) AS end_date#33]
: +- Project [cast(id_product#40 as int) AS id_product#34, name#41 AS name#35,
country#42 AS country#36, city#43 AS city#37, cast(start_date#44 as date) AS
start_date#38, cast(end_date#45 as date) AS end_date#39]
: +- SubqueryAlias product
: +- HiveTableRelation `schema_p`.`product`,
org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe, [id_product#40, name#41,
country#42, city#43, start_date#44, end_date#45]
+- Project [id_product#104, id_person#103, country#105, city#106, price#107,
start_date#108, end_date#109, name#137, country#138, city#139, start_date#140,
end_date#141, count_prod#61L]
+- Join Inner, (id_product#104 = id_product#77)
:- Project [id_product#104, id_person#103, country#105, city#106, price#107,
start_date#108, end_date#109, name#137, country#138, city#139, start_date#140,
end_date#141]
: +- Join Inner, (id_product#104 = id_product#136)
: :- SubqueryAlias person_product_v
: : +- View (`schema_p`.`person_product_v`,
[id_person#103,id_product#104,country#105,city#106,price#107,start_date#108,end_date#109])
: : +- Project [cast(id_person#0 as int) AS id_person#103, cast(id_product#1 as
int) AS id_product#104, cast(country#2 as string) AS country#105, cast(city#3
as string) AS city#106, cast(price#4 as decimal(38,8)) AS price#107,
cast(start_date#5 as date) AS start_date#108, cast(end_date#6 as date) AS
end_date#109]
: : +- Project [cast(id_person#7 as int) AS id_person#0, cast(id_product#8 as
int) AS id_product#1, cast(country#9 as string) AS country#2, cast(city#10 as
string) AS city#3, cast(price#11 as decimal(38,8)) AS price#4,
cast(start_date#12 as date) AS start_date#5, cast(end_date#13 as date) AS
end_date#6]
: : +- Project [cast(id_person#14 as int) AS id_person#7, cast(id_product#15 as
int) AS id_product#8, country#16 AS country#9, city#17 AS city#10,
cast(price#18 as decimal(38,8)) AS price#11, cast(start_date#19 as date) AS
start_date#12, cast(end_date#20 as date) AS end_date#13]
: : +- SubqueryAlias person_product
: : +- HiveTableRelation `schema_p`.`person_product`,
org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe, [id_person#14,
id_product#15, country#16, city#17, price#18, start_date#19, end_date#20]
: +- SubqueryAlias product_v
: +- View (`schema_p`.`product_v`,
[id_product#136,name#137,country#138,city#139,start_date#140,end_date#141])
: +- Project [cast(id_product#28 as int) AS id_product#136, cast(name#29 as
string) AS name#137, cast(country#30 as string) AS country#138, cast(city#31 as
string) AS city#139, cast(start_date#32 as date) AS start_date#140,
cast(end_date#33 as date) AS end_date#141]
: +- Project [cast(id_product#34 as int) AS id_product#28, cast(name#35 as
string) AS name#29, cast(country#36 as string) AS country#30, cast(city#37 as
string) AS city#31, cast(start_date#38 as date) AS start_date#32,
cast(end_date#39 as date) AS end_date#33]
: +- Project [cast(id_product#40 as int) AS id_product#34, name#41 AS name#35,
country#42 AS country#36, city#43 AS city#37, cast(start_date#44 as date) AS
start_date#38, cast(end_date#45 as date) AS end_date#39]
: +- SubqueryAlias product
: +- HiveTableRelation `schema_p`.`product`,
org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe, [id_product#40, name#41,
country#42, city#43, start_date#44, end_date#45]
+- Aggregate [id_product#77], [id_product#77, count(id_product#77) AS
count_prod#61L]
+- SubqueryAlias person_product_v
+- View (`schema_p`.`person_product_v`,
[id_person#76,id_product#77,country#78,city#79,price#80,start_date#81,end_date#82])
+- !Project [cast(id_person#103 as int) AS id_person#76, cast(id_product#104 as
int) AS id_product#77, cast(country#105 as string) AS country#78, cast(city#106
as string) AS city#79, cast(price#107 as decimal(38,8)) AS price#80,
cast(start_date#108 as date) AS start_date#81, cast(end_date#109 as date) AS
end_date#82]
+- Project [cast(id_person#7 as int) AS id_person#0, cast(id_product#8 as int)
AS id_product#1, cast(country#9 as string) AS country#2, cast(city#10 as
string) AS city#3, cast(price#11 as decimal(38,8)) AS price#4,
cast(start_date#12 as date) AS start_date#5, cast(end_date#13 as date) AS
end_date#6]
+- Project [cast(id_person#14 as int) AS id_person#7, cast(id_product#15 as
int) AS id_product#8, country#16 AS country#9, city#17 AS city#10,
cast(price#18 as decimal(38,8)) AS price#11, cast(start_date#19 as date) AS
start_date#12, cast(end_date#20 as date) AS end_date#13]
+- SubqueryAlias person_product
+- HiveTableRelation `schema_p`.`person_product`,
org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe, [id_person#14,
id_product#15, country#16, city#17, price#18, start_date#19, end_date#20]
'{code}
was:
I have a python error when I execute the following code using hive views but it
works correctly when I run it with hive tables.
*Hive databases:*
{code:java}
CREATE DATABASE schema_p LOCATION "hdfs:///tmp/schema_p";
{code}
*Hive tables:*
{code:java}
CREATE TABLE schema_p.person(
id_person string,
identifier string,
gender string,
start_date string,
end_date string)
ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe'
STORED AS INPUTFORMAT 'org.apache.hadoop.mapred.TextInputFormat'
OUTPUTFORMAT 'org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat'
LOCATION 'hdfs:///tmp/schema_p/person';
{code}
{code:java}
CREATE TABLE schema_p.product(
id_product string,
name string,
country string,
city string,
start_date string,
end_date string
)
ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe'
STORED AS INPUTFORMAT 'org.apache.hadoop.mapred.TextInputFormat'
OUTPUTFORMAT 'org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat'
LOCATION 'hdfs:///tmp/schema_p/product';
{code}
{code:java}
CREATE TABLE schema_p.person_product(
id_person string,
id_product string,
country string,
city string,
price string,
start_date string,
end_date string
)
ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe'
STORED AS INPUTFORMAT 'org.apache.hadoop.mapred.TextInputFormat'
OUTPUTFORMAT 'org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat'
LOCATION 'hdfs:///tmp/schema_p/person_product';
{code}
*Hive views:*
{code:java}
CREATE VIEW schema_p.person_v AS SELECT CAST(id_person AS INT) AS id_person,
CAST(identifier AS INT) AS identifier, gender AS gender, CAST(start_date AS
DATE) AS start_date, CAST(end_date AS DATE) AS end_date FROM schema_p.person;
CREATE VIEW schema_p.product_v AS SELECT CAST(id_product AS INT) AS
id_product, name AS name, country AS country, city AS city, CAST(start_date AS
DATE) AS start_date, CAST(end_date AS DATE) AS end_date FROM schema_p.product;
CREATE VIEW schema_p.person_product_v AS SELECT CAST(id_person AS INT) AS
id_person, CAST(id_product AS INT) AS id_product, country AS country, city AS
city, CAST(price AS DECIMAL(38,8)) AS price, CAST(start_date AS DATE) AS
start_date, CAST(end_date AS DATE) AS end_date FROM schema_p.person_product;
{code}
*********************************************
*Code*:
{code}
def read_tables(sc):
in_dict = { 'person': 'person_v', 'product': 'product_v', 'person_product':
'person_product_v' }
data_dict = {}
for n, d in in_dict.iteritems():
data_dict[n] = sc.read.table(d)
return data_dict
def get_population(tables, ref_date_str):
person = tables['person']
product = tables['product']
person_product = tables['person_product']
person_product_join = person_product.join(product,'id_product')
count_prod =
person_product.groupBy('id_product').agg(F.count('id_product').alias('count_prod'))
person_count = person_product_join.join(count_prod,'id_product')
final1 = person_product_join.join(person_count, 'id_person', 'left')
final = final1.withColumn('reference_date', F.lit(ref_date_str))
return final
import pyspark.sql.functions as F
import functools
from pyspark.sql.functions import col
from pyspark.sql.functions import add_months, lit, count, coalesce
spark.sql('use schema_p')
data_dict = read_tables(spark)
data_dict
population = get_population(data_dict, '2019-04-30')
population.show(){code}
*Error:*
{code:java}
File "<stdin>", line 1, in <module>
File "<stdin>", line 10, in get_population
File "/usr/hdp/current/spark2-client/python/pyspark/sql/dataframe.py", line
931, in join
jdf = self._jdf.join(other._jdf, on, how)
File
"/usr/hdp/current/spark2-client/python/lib/py4j-0.10.6-src.zip/py4j/java_gateway.py",
line 1160, in __call__
File "/usr/hdp/current/spark2-client/python/pyspark/sql/utils.py", line 69, in
deco
raise AnalysisException(s.split(': ', 1)[1], stackTrace)
pyspark.sql.utils.AnalysisException: u'Resolved attribute(s)
id_product#124,end_date#129,city#126,price#127,start_date#128,id_person#123,country#125
missing from
city#47,price#48,start_date#49,id_product#45,end_date#50,id_person#44,country#46
in operator !Project [cast(id_person#123 as int) AS id_person#96,
cast(id_product#124 as int) AS id_product#97, cast(country#125 as string) AS
country#98, cast(city#126 as string) AS city#99, cast(price#127 as
decimal(38,8)) AS price#100, cast(start_date#128 as date) AS start_date#101,
cast(end_date#129 as date) AS end_date#102|#123 as int) AS id_person#96,
cast(id_product#124 as int) AS id_product#97, cast(country#125 as string) AS
country#98, cast(city#126 as string) AS city#99, cast(price#127 as
decimal(38,8)) AS price#100, cast(start_date#128 as date) AS start_date#101,
cast(end_date#129 as date) AS end_date#102]. Attribute(s) with the same name
appear in the operation:
id_product,end_date,city,price,start_date,id_person,country. Please check if
the right attribute(s) are used.;;
Project [id_person#44, id_product#45, country#46, city#47, price#48,
start_date#49, end_date#50, name#21, country#22, city#23, start_date#24,
end_date#25, id_product#124, country#125, city#126, price#127, start_date#128,
end_date#129, name#157, country#158, city#159, start_date#160, end_date#161,
count_prod#93L|#44, id_product#45, country#46, city#47, price#48,
start_date#49, end_date#50, name#21, country#22, city#23, start_date#24,
end_date#25, id_product#124, country#125, city#126, price#127, start_date#128,
end_date#129, name#157, country#158, city#159, start_date#160, end_date#161,
count_prod#93L]
+- Join LeftOuter, (id_person#44 = id_person#123)
:- Project [id_product#45, id_person#44, country#46, city#47, price#48,
start_date#49, end_date#50, name#21, country#22, city#23, start_date#24,
end_date#25|#45, id_person#44, country#46, city#47, price#48, start_date#49,
end_date#50, name#21, country#22, city#23, start_date#24, end_date#25]
: +- Join Inner, (id_product#45 = id_product#20)
: :- SubqueryAlias person_product_v
: : +- View (`schema_p`.`person_product_v`,
[id_person#44,id_product#45,country#46,city#47,price#48,start_date#49,end_date#50|#44,id_product#45,country#46,city#47,price#48,start_date#49,end_date#50])
: : +- Project [cast(id_person#51 as int) AS id_person#44, cast(id_product#52
as int) AS id_product#45, cast(country#53 as string) AS country#46,
cast(city#54 as string) AS city#47, cast(price#55 as decimal(38,8)) AS
price#48, cast(start_date#56 as date) AS start_date#49, cast(end_date#57 as
date) AS end_date#50|#51 as int) AS id_person#44, cast(id_product#52 as int) AS
id_product#45, cast(country#53 as string) AS country#46, cast(city#54 as
string) AS city#47, cast(price#55 as decimal(38,8)) AS price#48,
cast(start_date#56 as date) AS start_date#49, cast(end_date#57 as date) AS
end_date#50]
: : +- Project [cast(id_person#58 as int) AS id_person#51, cast(id_product#59
as int) AS id_product#52, country#60 AS country#53, city#61 AS city#54,
cast(price#62 as decimal(38,8)) AS price#55, cast(start_date#63 as date) AS
start_date#56, cast(end_date#64 as date) AS end_date#57|#58 as int) AS
id_person#51, cast(id_product#59 as int) AS id_product#52, country#60 AS
country#53, city#61 AS city#54, cast(price#62 as decimal(38,8)) AS price#55,
cast(start_date#63 as date) AS start_date#56, cast(end_date#64 as date) AS
end_date#57]
: : +- SubqueryAlias person_product
: : +- HiveTableRelation `schema_p`.`person_product`,
org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe, [id_person#58,
id_product#59, country#60, city#61, price#62, start_date#63, end_date#64|#58,
id_product#59, country#60, city#61, price#62, start_date#63, end_date#64]
: +- SubqueryAlias product_v
: +- View (`schema_p`.`product_v`,
[id_product#20,name#21,country#22,city#23,start_date#24,end_date#25|#20,name#21,country#22,city#23,start_date#24,end_date#25])
: +- Project [cast(id_product#26 as int) AS id_product#20, cast(name#27 as
string) AS name#21, cast(country#28 as string) AS country#22, cast(city#29 as
string) AS city#23, cast(start_date#30 as date) AS start_date#24,
cast(end_date#31 as date) AS end_date#25|#26 as int) AS id_product#20,
cast(name#27 as string) AS name#21, cast(country#28 as string) AS country#22,
cast(city#29 as string) AS city#23, cast(start_date#30 as date) AS
start_date#24, cast(end_date#31 as date) AS end_date#25]
: +- Project [cast(id_product#32 as int) AS id_product#26, name#33 AS name#27,
country#34 AS country#28, city#35 AS city#29, cast(start_date#36 as date) AS
start_date#30, cast(end_date#37 as date) AS end_date#31|#32 as int) AS
id_product#26, name#33 AS name#27, country#34 AS country#28, city#35 AS
city#29, cast(start_date#36 as date) AS start_date#30, cast(end_date#37 as
date) AS end_date#31]
: +- SubqueryAlias product
: +- HiveTableRelation `schema_p`.`product`,
org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe, [id_product#32, name#33,
country#34, city#35, start_date#36, end_date#37|#32, name#33, country#34,
city#35, start_date#36, end_date#37]
+- Project [id_product#124, id_person#123, country#125, city#126, price#127,
start_date#128, end_date#129, name#157, country#158, city#159, start_date#160,
end_date#161, count_prod#93L|#124, id_person#123, country#125, city#126,
price#127, start_date#128, end_date#129, name#157, country#158, city#159,
start_date#160, end_date#161, count_prod#93L]
+- Join Inner, (id_product#124 = id_product#97)
:- Project [id_product#124, id_person#123, country#125, city#126, price#127,
start_date#128, end_date#129, name#157, country#158, city#159, start_date#160,
end_date#161|#124, id_person#123, country#125, city#126, price#127,
start_date#128, end_date#129, name#157, country#158, city#159, start_date#160,
end_date#161]
: +- Join Inner, (id_product#124 = id_product#156)
: :- SubqueryAlias person_product_v
: : +- View (`schema_p`.`person_product_v`,
[id_person#123,id_product#124,country#125,city#126,price#127,start_date#128,end_date#129|#123,id_product#124,country#125,city#126,price#127,start_date#128,end_date#129])
: : +- Project [cast(id_person#44 as int) AS id_person#123, cast(id_product#45
as int) AS id_product#124, cast(country#46 as string) AS country#125,
cast(city#47 as string) AS city#126, cast(price#48 as decimal(38,8)) AS
price#127, cast(start_date#49 as date) AS start_date#128, cast(end_date#50 as
date) AS end_date#129|#44 as int) AS id_person#123, cast(id_product#45 as int)
AS id_product#124, cast(country#46 as string) AS country#125, cast(city#47 as
string) AS city#126, cast(price#48 as decimal(38,8)) AS price#127,
cast(start_date#49 as date) AS start_date#128, cast(end_date#50 as date) AS
end_date#129]
: : +- Project [cast(id_person#51 as int) AS id_person#44, cast(id_product#52
as int) AS id_product#45, cast(country#53 as string) AS country#46,
cast(city#54 as string) AS city#47, cast(price#55 as decimal(38,8)) AS
price#48, cast(start_date#56 as date) AS start_date#49, cast(end_date#57 as
date) AS end_date#50|#51 as int) AS id_person#44, cast(id_product#52 as int) AS
id_product#45, cast(country#53 as string) AS country#46, cast(city#54 as
string) AS city#47, cast(price#55 as decimal(38,8)) AS price#48,
cast(start_date#56 as date) AS start_date#49, cast(end_date#57 as date) AS
end_date#50]
: : +- Project [cast(id_person#58 as int) AS id_person#51, cast(id_product#59
as int) AS id_product#52, country#60 AS country#53, city#61 AS city#54,
cast(price#62 as decimal(38,8)) AS price#55, cast(start_date#63 as date) AS
start_date#56, cast(end_date#64 as date) AS end_date#57|#58 as int) AS
id_person#51, cast(id_product#59 as int) AS id_product#52, country#60 AS
country#53, city#61 AS city#54, cast(price#62 as decimal(38,8)) AS price#55,
cast(start_date#63 as date) AS start_date#56, cast(end_date#64 as date) AS
end_date#57]
: : +- SubqueryAlias person_product
: : +- HiveTableRelation `schema_p`.`person_product`,
org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe, [id_person#58,
id_product#59, country#60, city#61, price#62, start_date#63, end_date#64|#58,
id_product#59, country#60, city#61, price#62, start_date#63, end_date#64]
: +- SubqueryAlias product_v
: +- View (`schema_p`.`product_v`,
[id_product#156,name#157,country#158,city#159,start_date#160,end_date#161|#156,name#157,country#158,city#159,start_date#160,end_date#161])
: +- Project [cast(id_product#20 as int) AS id_product#156, cast(name#21 as
string) AS name#157, cast(country#22 as string) AS country#158, cast(city#23 as
string) AS city#159, cast(start_date#24 as date) AS start_date#160,
cast(end_date#25 as date) AS end_date#161|#20 as int) AS id_product#156,
cast(name#21 as string) AS name#157, cast(country#22 as string) AS country#158,
cast(city#23 as string) AS city#159, cast(start_date#24 as date) AS
start_date#160, cast(end_date#25 as date) AS end_date#161]
: +- Project [cast(id_product#26 as int) AS id_product#20, cast(name#27 as
string) AS name#21, cast(country#28 as string) AS country#22, cast(city#29 as
string) AS city#23, cast(start_date#30 as date) AS start_date#24,
cast(end_date#31 as date) AS end_date#25|#26 as int) AS id_product#20,
cast(name#27 as string) AS name#21, cast(country#28 as string) AS country#22,
cast(city#29 as string) AS city#23, cast(start_date#30 as date) AS
start_date#24, cast(end_date#31 as date) AS end_date#25]
: +- Project [cast(id_product#32 as int) AS id_product#26, name#33 AS name#27,
country#34 AS country#28, city#35 AS city#29, cast(start_date#36 as date) AS
start_date#30, cast(end_date#37 as date) AS end_date#31|#32 as int) AS
id_product#26, name#33 AS name#27, country#34 AS country#28, city#35 AS
city#29, cast(start_date#36 as date) AS start_date#30, cast(end_date#37 as
date) AS end_date#31]
: +- SubqueryAlias product
: +- HiveTableRelation `schema_p`.`product`,
org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe, [id_product#32, name#33,
country#34, city#35, start_date#36, end_date#37|#32, name#33, country#34,
city#35, start_date#36, end_date#37]
+- Aggregate [id_product#97|#97], [id_product#97, count(id_product#97) AS
count_prod#93L|#97, count(id_product#97) AS count_prod#93L]
+- SubqueryAlias person_product_v
+- View (`schema_p`.`person_product_v`,
[id_person#96,id_product#97,country#98,city#99,price#100,start_date#101,end_date#102|#96,id_product#97,country#98,city#99,price#100,start_date#101,end_date#102])
+- !Project [cast(id_person#123 as int) AS id_person#96, cast(id_product#124
as int) AS id_product#97, cast(country#125 as string) AS country#98,
cast(city#126 as string) AS city#99, cast(price#127 as decimal(38,8)) AS
price#100, cast(start_date#128 as date) AS start_date#101, cast(end_date#129 as
date) AS end_date#102|#123 as int) AS id_person#96, cast(id_product#124 as int)
AS id_product#97, cast(country#125 as string) AS country#98, cast(city#126 as
string) AS city#99, cast(price#127 as decimal(38,8)) AS price#100,
cast(start_date#128 as date) AS start_date#101, cast(end_date#129 as date) AS
end_date#102]
+- Project [cast(id_person#51 as int) AS id_person#44, cast(id_product#52 as
int) AS id_product#45, cast(country#53 as string) AS country#46, cast(city#54
as string) AS city#47, cast(price#55 as decimal(38,8)) AS price#48,
cast(start_date#56 as date) AS start_date#49, cast(end_date#57 as date) AS
end_date#50|#51 as int) AS id_person#44, cast(id_product#52 as int) AS
id_product#45, cast(country#53 as string) AS country#46, cast(city#54 as
string) AS city#47, cast(price#55 as decimal(38,8)) AS price#48,
cast(start_date#56 as date) AS start_date#49, cast(end_date#57 as date) AS
end_date#50]
+- Project [cast(id_person#58 as int) AS id_person#51, cast(id_product#59 as
int) AS id_product#52, country#60 AS country#53, city#61 AS city#54,
cast(price#62 as decimal(38,8)) AS price#55, cast(start_date#63 as date) AS
start_date#56, cast(end_date#64 as date) AS end_date#57|#58 as int) AS
id_person#51, cast(id_product#59 as int) AS id_product#52, country#60 AS
country#53, city#61 AS city#54, cast(price#62 as decimal(38,8)) AS price#55,
cast(start_date#63 as date) AS start_date#56, cast(end_date#64 as date) AS
end_date#57]
+- SubqueryAlias person_product
+- HiveTableRelation `schema_p`.`person_product`,
org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe, [id_person#58,
id_product#59, country#60, city#61, price#62, start_date#63, end_date#64|#58,
id_product#59, country#60, city#61, price#62, start_date#63, end_date#64]
{code}
> Error to execute hive views with spark
> --------------------------------------
>
> Key: SPARK-27689
> URL: https://issues.apache.org/jira/browse/SPARK-27689
> Project: Spark
> Issue Type: Bug
> Components: SQL
> Affects Versions: 2.3.0, 2.3.3, 2.4.3
> Reporter: Juan Antonio
> Priority: Major
>
> I have a python error when I execute the following code using hive views but
> it works correctly when I run it with hive tables.
> *Hive databases:*
> {code:java}
> CREATE DATABASE schema_p LOCATION "hdfs:///tmp/schema_p";
> {code}
> *Hive tables:*
> {code:java}
> CREATE TABLE schema_p.product(
> id_product string,
> name string,
> country string,
> city string,
> start_date string,
> end_date string
> )
> ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe'
> STORED AS INPUTFORMAT 'org.apache.hadoop.mapred.TextInputFormat'
> OUTPUTFORMAT 'org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat'
> LOCATION 'hdfs:///tmp/schema_p/product';
> {code}
> {code:java}
> CREATE TABLE schema_p.person_product(
> id_person string,
> id_product string,
> country string,
> city string,
> price string,
> start_date string,
> end_date string
> )
> ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe'
> STORED AS INPUTFORMAT 'org.apache.hadoop.mapred.TextInputFormat'
> OUTPUTFORMAT 'org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat'
> LOCATION 'hdfs:///tmp/schema_p/person_product';
> {code}
> *Hive views:*
> {code:java}
> CREATE VIEW schema_p.product_v AS SELECT CAST(id_product AS INT) AS
> id_product, name AS name, country AS country, city AS city, CAST(start_date
> AS DATE) AS start_date, CAST(end_date AS DATE) AS end_date FROM
> schema_p.product;
>
> CREATE VIEW schema_p.person_product_v AS SELECT CAST(id_person AS INT) AS
> id_person, CAST(id_product AS INT) AS id_product, country AS country, city AS
> city, CAST(price AS DECIMAL(38,8)) AS price, CAST(start_date AS DATE) AS
> start_date, CAST(end_date AS DATE) AS end_date FROM schema_p.person_product;
> {code}
> *Code*:
> {code:java}
> def read_tables(sc):
> in_dict = { 'product': 'product_v', 'person_product': 'person_product_v' }
> data_dict = {}
> for n, d in in_dict.iteritems():
> data_dict[n] = sc.read.table(d)
> return data_dict
> def get_population(tables, ref_date_str):
> product = tables['product']
> person_product = tables['person_product']
> count_prod
> =person_product.groupBy('id_product').agg(F.count('id_product').alias('count_prod'))
> person_product_join = person_product.join(product,'id_product')
> person_count = person_product_join.join(count_prod,'id_product')
> final = person_product_join.join(person_count, 'id_person', 'left')
> return final
> import pyspark.sql.functions as F
> import functools
> from pyspark.sql.functions import col
> from pyspark.sql.functions import add_months, lit, count, coalesce
> spark.sql('use schema_p')
> data_dict = read_tables(spark)
> data_dict
> population = get_population(data_dict, '2019-04-30')
> population.show()
> {code}
> *Error:*
> {code:java}
> Traceback (most recent call last):
> File "<stdin>", line 1, in <module>
> File "<stdin>", line 10, in get_population
> File "/usr/hdp/current/spark2-client/python/pyspark/sql/dataframe.py", line
> 931, in join
> jdf = self._jdf.join(other._jdf, on, how)
> File
> "/usr/hdp/current/spark2-client/python/lib/py4j-0.10.6-src.zip/py4j/java_gateway.py",
> line 1160, in __call__
> File "/usr/hdp/current/spark2-client/python/pyspark/sql/utils.py", line 69,
> in deco
> raise AnalysisException(s.split(': ', 1)[1], stackTrace)
> pyspark.sql.utils.AnalysisException: u'Resolved attribute(s)
> id_person#103,start_date#108,id_product#104,end_date#109,price#107,country#105,city#106
> missing from
> price#4,id_product#1,start_date#5,end_date#6,id_person#0,city#3,country#2 in
> operator !Project [cast(id_person#103 as int) AS id_person#76,
> cast(id_product#104 as int) AS id_product#77, cast(country#105 as string) AS
> country#78, cast(city#106 as string) AS city#79, cast(price#107 as
> decimal(38,8)) AS price#80, cast(start_date#108 as date) AS start_date#81,
> cast(end_date#109 as date) AS end_date#82]. Attribute(s) with the same name
> appear in the operation:
> id_person,start_date,id_product,end_date,price,country,city. Please check if
> the right attribute(s) are used.;;
> Project [id_person#0, id_product#1, country#2, city#3, price#4, start_date#5,
> end_date#6, name#29, country#30, city#31, start_date#32, end_date#33,
> id_product#104, country#105, city#106, price#107, start_date#108,
> end_date#109, name#137, country#138, city#139, start_date#140, end_date#141,
> count_prod#61L]
> +- Join LeftOuter, (id_person#0 = id_person#103)
> :- Project [id_product#1, id_person#0, country#2, city#3, price#4,
> start_date#5, end_date#6, name#29, country#30, city#31, start_date#32,
> end_date#33]
> : +- Join Inner, (id_product#1 = id_product#28)
> : :- SubqueryAlias person_product_v
> : : +- View (`schema_p`.`person_product_v`,
> [id_person#0,id_product#1,country#2,city#3,price#4,start_date#5,end_date#6])
> : : +- Project [cast(id_person#7 as int) AS id_person#0, cast(id_product#8 as
> int) AS id_product#1, cast(country#9 as string) AS country#2, cast(city#10 as
> string) AS city#3, cast(price#11 as decimal(38,8)) AS price#4,
> cast(start_date#12 as date) AS start_date#5, cast(end_date#13 as date) AS
> end_date#6]
> : : +- Project [cast(id_person#14 as int) AS id_person#7, cast(id_product#15
> as int) AS id_product#8, country#16 AS country#9, city#17 AS city#10,
> cast(price#18 as decimal(38,8)) AS price#11, cast(start_date#19 as date) AS
> start_date#12, cast(end_date#20 as date) AS end_date#13]
> : : +- SubqueryAlias person_product
> : : +- HiveTableRelation `schema_p`.`person_product`,
> org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe, [id_person#14,
> id_product#15, country#16, city#17, price#18, start_date#19, end_date#20]
> : +- SubqueryAlias product_v
> : +- View (`schema_p`.`product_v`,
> [id_product#28,name#29,country#30,city#31,start_date#32,end_date#33])
> : +- Project [cast(id_product#34 as int) AS id_product#28, cast(name#35 as
> string) AS name#29, cast(country#36 as string) AS country#30, cast(city#37 as
> string) AS city#31, cast(start_date#38 as date) AS start_date#32,
> cast(end_date#39 as date) AS end_date#33]
> : +- Project [cast(id_product#40 as int) AS id_product#34, name#41 AS
> name#35, country#42 AS country#36, city#43 AS city#37, cast(start_date#44 as
> date) AS start_date#38, cast(end_date#45 as date) AS end_date#39]
> : +- SubqueryAlias product
> : +- HiveTableRelation `schema_p`.`product`,
> org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe, [id_product#40, name#41,
> country#42, city#43, start_date#44, end_date#45]
> +- Project [id_product#104, id_person#103, country#105, city#106, price#107,
> start_date#108, end_date#109, name#137, country#138, city#139,
> start_date#140, end_date#141, count_prod#61L]
> +- Join Inner, (id_product#104 = id_product#77)
> :- Project [id_product#104, id_person#103, country#105, city#106, price#107,
> start_date#108, end_date#109, name#137, country#138, city#139,
> start_date#140, end_date#141]
> : +- Join Inner, (id_product#104 = id_product#136)
> : :- SubqueryAlias person_product_v
> : : +- View (`schema_p`.`person_product_v`,
> [id_person#103,id_product#104,country#105,city#106,price#107,start_date#108,end_date#109])
> : : +- Project [cast(id_person#0 as int) AS id_person#103, cast(id_product#1
> as int) AS id_product#104, cast(country#2 as string) AS country#105,
> cast(city#3 as string) AS city#106, cast(price#4 as decimal(38,8)) AS
> price#107, cast(start_date#5 as date) AS start_date#108, cast(end_date#6 as
> date) AS end_date#109]
> : : +- Project [cast(id_person#7 as int) AS id_person#0, cast(id_product#8 as
> int) AS id_product#1, cast(country#9 as string) AS country#2, cast(city#10 as
> string) AS city#3, cast(price#11 as decimal(38,8)) AS price#4,
> cast(start_date#12 as date) AS start_date#5, cast(end_date#13 as date) AS
> end_date#6]
> : : +- Project [cast(id_person#14 as int) AS id_person#7, cast(id_product#15
> as int) AS id_product#8, country#16 AS country#9, city#17 AS city#10,
> cast(price#18 as decimal(38,8)) AS price#11, cast(start_date#19 as date) AS
> start_date#12, cast(end_date#20 as date) AS end_date#13]
> : : +- SubqueryAlias person_product
> : : +- HiveTableRelation `schema_p`.`person_product`,
> org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe, [id_person#14,
> id_product#15, country#16, city#17, price#18, start_date#19, end_date#20]
> : +- SubqueryAlias product_v
> : +- View (`schema_p`.`product_v`,
> [id_product#136,name#137,country#138,city#139,start_date#140,end_date#141])
> : +- Project [cast(id_product#28 as int) AS id_product#136, cast(name#29 as
> string) AS name#137, cast(country#30 as string) AS country#138, cast(city#31
> as string) AS city#139, cast(start_date#32 as date) AS start_date#140,
> cast(end_date#33 as date) AS end_date#141]
> : +- Project [cast(id_product#34 as int) AS id_product#28, cast(name#35 as
> string) AS name#29, cast(country#36 as string) AS country#30, cast(city#37 as
> string) AS city#31, cast(start_date#38 as date) AS start_date#32,
> cast(end_date#39 as date) AS end_date#33]
> : +- Project [cast(id_product#40 as int) AS id_product#34, name#41 AS
> name#35, country#42 AS country#36, city#43 AS city#37, cast(start_date#44 as
> date) AS start_date#38, cast(end_date#45 as date) AS end_date#39]
> : +- SubqueryAlias product
> : +- HiveTableRelation `schema_p`.`product`,
> org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe, [id_product#40, name#41,
> country#42, city#43, start_date#44, end_date#45]
> +- Aggregate [id_product#77], [id_product#77, count(id_product#77) AS
> count_prod#61L]
> +- SubqueryAlias person_product_v
> +- View (`schema_p`.`person_product_v`,
> [id_person#76,id_product#77,country#78,city#79,price#80,start_date#81,end_date#82])
> +- !Project [cast(id_person#103 as int) AS id_person#76, cast(id_product#104
> as int) AS id_product#77, cast(country#105 as string) AS country#78,
> cast(city#106 as string) AS city#79, cast(price#107 as decimal(38,8)) AS
> price#80, cast(start_date#108 as date) AS start_date#81, cast(end_date#109 as
> date) AS end_date#82]
> +- Project [cast(id_person#7 as int) AS id_person#0, cast(id_product#8 as
> int) AS id_product#1, cast(country#9 as string) AS country#2, cast(city#10 as
> string) AS city#3, cast(price#11 as decimal(38,8)) AS price#4,
> cast(start_date#12 as date) AS start_date#5, cast(end_date#13 as date) AS
> end_date#6]
> +- Project [cast(id_person#14 as int) AS id_person#7, cast(id_product#15 as
> int) AS id_product#8, country#16 AS country#9, city#17 AS city#10,
> cast(price#18 as decimal(38,8)) AS price#11, cast(start_date#19 as date) AS
> start_date#12, cast(end_date#20 as date) AS end_date#13]
> +- SubqueryAlias person_product
> +- HiveTableRelation `schema_p`.`person_product`,
> org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe, [id_person#14,
> id_product#15, country#16, city#17, price#18, start_date#19, end_date#20]
> '{code}
--
This message was sent by Atlassian JIRA
(v7.6.3#76005)
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]