[jira] [Updated] (SPARK-27689) Error to execute hive views with spark
[ https://issues.apache.org/jira/browse/SPARK-27689?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel ] Juan Antonio updated SPARK-27689: - Description: I have a python error when I execute the following code using hive views but it works correctly when I run it with hive tables. *Hive databases:* {code:java} CREATE DATABASE schema_p LOCATION "hdfs:///tmp/schema_p"; {code} *Hive tables:* {code:java} CREATE TABLE schema_p.product( id_product string, name string, country string, city string, start_date string, end_date string ) ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe' STORED AS INPUTFORMAT 'org.apache.hadoop.mapred.TextInputFormat' OUTPUTFORMAT 'org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat' LOCATION 'hdfs:///tmp/schema_p/product'; {code} {code:java} CREATE TABLE schema_p.person_product( id_person string, id_product string, country string, city string, price string, start_date string, end_date string ) ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe' STORED AS INPUTFORMAT 'org.apache.hadoop.mapred.TextInputFormat' OUTPUTFORMAT 'org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat' LOCATION 'hdfs:///tmp/schema_p/person_product'; {code} *Hive views:* {code:java} CREATE VIEW schema_p.product_v AS SELECT CAST(id_product AS INT) AS id_product, name AS name, country AS country, city AS city, CAST(start_date AS DATE) AS start_date, CAST(end_date AS DATE) AS end_date FROM schema_p.product; CREATE VIEW schema_p.person_product_v AS SELECT CAST(id_person AS INT) AS id_person, CAST(id_product AS INT) AS id_product, country AS country, city AS city, CAST(price AS DECIMAL(38,8)) AS price, CAST(start_date AS DATE) AS start_date, CAST(end_date AS DATE) AS end_date FROM schema_p.person_product; {code} *Code*: {code:java} def read_tables(sc): in_dict = { 'product': 'product_v', 'person_product': 'person_product_v' } data_dict = {} for n, d in in_dict.iteritems(): data_dict[n] = sc.read.table(d) return data_dict def get_population(tables, ref_date_str): product = tables['product'] person_product = tables['person_product'] count_prod =person_product.groupBy('id_product').agg(F.count('id_product').alias('count_prod')) person_product_join = person_product.join(product,'id_product') person_count = person_product_join.join(count_prod,'id_product') final = person_product_join.join(person_count, 'id_person', 'left') return final import pyspark.sql.functions as F import functools from pyspark.sql.functions import col from pyspark.sql.functions import add_months, lit, count, coalesce spark.sql('use schema_p') data_dict = read_tables(spark) data_dict population = get_population(data_dict, '2019-04-30') population.show() {code} *Error:* {code:java} Traceback (most recent call last): File "", line 1, in File "", line 10, in get_population File "/usr/hdp/current/spark2-client/python/pyspark/sql/dataframe.py", line 931, in join jdf = self._jdf.join(other._jdf, on, how) File "/usr/hdp/current/spark2-client/python/lib/py4j-0.10.6-src.zip/py4j/java_gateway.py", line 1160, in __call__ File "/usr/hdp/current/spark2-client/python/pyspark/sql/utils.py", line 69, in deco raise AnalysisException(s.split(': ', 1)[1], stackTrace) pyspark.sql.utils.AnalysisException: u'Resolved attribute(s) id_person#103,start_date#108,id_product#104,end_date#109,price#107,country#105,city#106 missing from price#4,id_product#1,start_date#5,end_date#6,id_person#0,city#3,country#2 in operator !Project [cast(id_person#103 as int) AS id_person#76, cast(id_product#104 as int) AS id_product#77, cast(country#105 as string) AS country#78, cast(city#106 as string) AS city#79, cast(price#107 as decimal(38,8)) AS price#80, cast(start_date#108 as date) AS start_date#81, cast(end_date#109 as date) AS end_date#82]. Attribute(s) with the same name appear in the operation: id_person,start_date,id_product,end_date,price,country,city. Please check if the right attribute(s) are used.;; Project [id_person#0, id_product#1, country#2, city#3, price#4, start_date#5, end_date#6, name#29, country#30, city#31, start_date#32, end_date#33, id_product#104, country#105, city#106, price#107, start_date#108, end_date#109, name#137, country#138, city#139, start_date#140, end_date#141, count_prod#61L] +- Join LeftOuter, (id_person#0 = id_person#103) :- Project [id_product#1, id_person#0, country#2, city#3, price#4, start_date#5, end_date#6, name#29, country#30, city#31, start_date#32, end_date#33] : +- Join Inner, (id_product#1 = id_product#28) : :- SubqueryAlias person_product_v : : +- View (`schema_p`.`person_product_v`, [id_person#0,id_product#1,country#2,city#3,price#4,start_date#5,end_date#6]) : : +- Project [cast(id_person#7 as int) AS id_person#0, cast(id_product#8 as int) AS id_product#1, cast(country#9 as string) AS country#2, cast(city#10 as string) AS city#3,
[jira] [Updated] (SPARK-27689) Error to execute hive views with spark
[ https://issues.apache.org/jira/browse/SPARK-27689?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel ] Juan Antonio updated SPARK-27689: - Priority: Major (was: Minor) > Error to execute hive views with spark > -- > > Key: SPARK-27689 > URL: https://issues.apache.org/jira/browse/SPARK-27689 > Project: Spark > Issue Type: Bug > Components: SQL >Affects Versions: 2.3.0, 2.3.3, 2.4.3 >Reporter: Juan Antonio >Priority: Major > > I have a python error when I execute the following code using hive views but > it works correctly when I run it with hive tables. > *Hive databases:* > {code:java} > CREATE DATABASE schema_p LOCATION "hdfs:///tmp/schema_p"; > {code} > *Hive tables:* > {code:java} > CREATE TABLE schema_p.person( > id_person string, > identifier string, > gender string, > start_date string, > end_date string) > ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe' > STORED AS INPUTFORMAT 'org.apache.hadoop.mapred.TextInputFormat' > OUTPUTFORMAT 'org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat' > LOCATION 'hdfs:///tmp/schema_p/person'; > {code} > {code:java} > CREATE TABLE schema_p.product( > id_product string, > name string, > country string, > city string, > start_date string, > end_date string > ) > ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe' > STORED AS INPUTFORMAT 'org.apache.hadoop.mapred.TextInputFormat' > OUTPUTFORMAT 'org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat' > LOCATION 'hdfs:///tmp/schema_p/product'; > {code} > {code:java} > CREATE TABLE schema_p.person_product( > id_person string, > id_product string, > country string, > city string, > price string, > start_date string, > end_date string > ) > ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe' > STORED AS INPUTFORMAT 'org.apache.hadoop.mapred.TextInputFormat' > OUTPUTFORMAT 'org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat' > LOCATION 'hdfs:///tmp/schema_p/person_product'; > {code} > *Hive views:* > {code:java} > CREATE VIEW schema_p.person_v AS SELECT CAST(id_person AS INT) AS id_person, > CAST(identifier AS INT) AS identifier, gender AS gender, CAST(start_date AS > DATE) AS start_date, CAST(end_date AS DATE) AS end_date FROM schema_p.person; > CREATE VIEW schema_p.product_v AS SELECT CAST(id_product AS INT) AS > id_product, name AS name, country AS country, city AS city, CAST(start_date > AS DATE) AS start_date, CAST(end_date AS DATE) AS end_date FROM > schema_p.product; > CREATE VIEW schema_p.person_product_v AS SELECT CAST(id_person AS INT) AS > id_person, CAST(id_product AS INT) AS id_product, country AS country, city AS > city, CAST(price AS DECIMAL(38,8)) AS price, CAST(start_date AS DATE) AS > start_date, CAST(end_date AS DATE) AS end_date FROM schema_p.person_product; > {code} > * > *Code*: > > {code} > def read_tables(sc): > in_dict = { 'person': 'person_v', 'product': 'product_v', 'person_product': > 'person_product_v' } > data_dict = {} > for n, d in in_dict.iteritems(): > data_dict[n] = sc.read.table(d) > return data_dict > def get_population(tables, ref_date_str): > person = tables['person'] > product = tables['product'] > person_product = tables['person_product'] > person_product_join = person_product.join(product,'id_product') > count_prod = > person_product.groupBy('id_product').agg(F.count('id_product').alias('count_prod')) > person_count = person_product_join.join(count_prod,'id_product') > final1 = person_product_join.join(person_count, 'id_person', 'left') > final = final1.withColumn('reference_date', F.lit(ref_date_str)) > return final > import pyspark.sql.functions as F > import functools > from pyspark.sql.functions import col > from pyspark.sql.functions import add_months, lit, count, coalesce > spark.sql('use schema_p') > data_dict = read_tables(spark) > data_dict > population = get_population(data_dict, '2019-04-30') > population.show(){code} > > *Error:* > {code:java} > File "", line 1, in > File "", line 10, in get_population > File "/usr/hdp/current/spark2-client/python/pyspark/sql/dataframe.py", line > 931, in join > jdf = self._jdf.join(other._jdf, on, how) > File > "/usr/hdp/current/spark2-client/python/lib/py4j-0.10.6-src.zip/py4j/java_gateway.py", > line 1160, in __call__ > File "/usr/hdp/current/spark2-client/python/pyspark/sql/utils.py", line 69, > in deco > raise AnalysisException(s.split(': ', 1)[1], stackTrace) > pyspark.sql.utils.AnalysisException: u'Resolved attribute(s) > id_product#124,end_date#129,city#126,price#127,start_date#128,id_person#123,country#125 > missing from > city#47,price#48,start_date#49,id_product#45,end_date#50,id_person#44,country#46 > in operator
[jira] [Updated] (SPARK-27689) Error to execute hive views with spark
[ https://issues.apache.org/jira/browse/SPARK-27689?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel ] Juan Antonio updated SPARK-27689: - Priority: Minor (was: Major) > Error to execute hive views with spark > -- > > Key: SPARK-27689 > URL: https://issues.apache.org/jira/browse/SPARK-27689 > Project: Spark > Issue Type: Bug > Components: SQL >Affects Versions: 2.3.0, 2.3.3, 2.4.3 >Reporter: Juan Antonio >Priority: Minor > > I have a python error when I execute the following code using hive views but > it works correctly when I run it with hive tables. > *Hive databases:* > {code:java} > CREATE DATABASE schema_p LOCATION "hdfs:///tmp/schema_p"; > {code} > *Hive tables:* > {code:java} > CREATE TABLE schema_p.person( > id_person string, > identifier string, > gender string, > start_date string, > end_date string) > ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe' > STORED AS INPUTFORMAT 'org.apache.hadoop.mapred.TextInputFormat' > OUTPUTFORMAT 'org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat' > LOCATION 'hdfs:///tmp/schema_p/person'; > {code} > {code:java} > CREATE TABLE schema_p.product( > id_product string, > name string, > country string, > city string, > start_date string, > end_date string > ) > ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe' > STORED AS INPUTFORMAT 'org.apache.hadoop.mapred.TextInputFormat' > OUTPUTFORMAT 'org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat' > LOCATION 'hdfs:///tmp/schema_p/product'; > {code} > {code:java} > CREATE TABLE schema_p.person_product( > id_person string, > id_product string, > country string, > city string, > price string, > start_date string, > end_date string > ) > ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe' > STORED AS INPUTFORMAT 'org.apache.hadoop.mapred.TextInputFormat' > OUTPUTFORMAT 'org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat' > LOCATION 'hdfs:///tmp/schema_p/person_product'; > {code} > *Hive views:* > {code:java} > CREATE VIEW schema_p.person_v AS SELECT CAST(id_person AS INT) AS id_person, > CAST(identifier AS INT) AS identifier, gender AS gender, CAST(start_date AS > DATE) AS start_date, CAST(end_date AS DATE) AS end_date FROM schema_p.person; > CREATE VIEW schema_p.product_v AS SELECT CAST(id_product AS INT) AS > id_product, name AS name, country AS country, city AS city, CAST(start_date > AS DATE) AS start_date, CAST(end_date AS DATE) AS end_date FROM > schema_p.product; > CREATE VIEW schema_p.person_product_v AS SELECT CAST(id_person AS INT) AS > id_person, CAST(id_product AS INT) AS id_product, country AS country, city AS > city, CAST(price AS DECIMAL(38,8)) AS price, CAST(start_date AS DATE) AS > start_date, CAST(end_date AS DATE) AS end_date FROM schema_p.person_product; > {code} > * > *Code*: > > {code} > def read_tables(sc): > in_dict = { 'person': 'person_v', 'product': 'product_v', 'person_product': > 'person_product_v' } > data_dict = {} > for n, d in in_dict.iteritems(): > data_dict[n] = sc.read.table(d) > return data_dict > def get_population(tables, ref_date_str): > person = tables['person'] > product = tables['product'] > person_product = tables['person_product'] > person_product_join = person_product.join(product,'id_product') > count_prod = > person_product.groupBy('id_product').agg(F.count('id_product').alias('count_prod')) > person_count = person_product_join.join(count_prod,'id_product') > final1 = person_product_join.join(person_count, 'id_person', 'left') > final = final1.withColumn('reference_date', F.lit(ref_date_str)) > return final > import pyspark.sql.functions as F > import functools > from pyspark.sql.functions import col > from pyspark.sql.functions import add_months, lit, count, coalesce > spark.sql('use schema_p') > data_dict = read_tables(spark) > data_dict > population = get_population(data_dict, '2019-04-30') > population.show(){code} > > *Error:* > {code:java} > File "", line 1, in > File "", line 10, in get_population > File "/usr/hdp/current/spark2-client/python/pyspark/sql/dataframe.py", line > 931, in join > jdf = self._jdf.join(other._jdf, on, how) > File > "/usr/hdp/current/spark2-client/python/lib/py4j-0.10.6-src.zip/py4j/java_gateway.py", > line 1160, in __call__ > File "/usr/hdp/current/spark2-client/python/pyspark/sql/utils.py", line 69, > in deco > raise AnalysisException(s.split(': ', 1)[1], stackTrace) > pyspark.sql.utils.AnalysisException: u'Resolved attribute(s) > id_product#124,end_date#129,city#126,price#127,start_date#128,id_person#123,country#125 > missing from > city#47,price#48,start_date#49,id_product#45,end_date#50,id_person#44,country#46 > in operator
[jira] [Updated] (SPARK-27689) Error to execute hive views with spark
[ https://issues.apache.org/jira/browse/SPARK-27689?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel ] Hyukjin Kwon updated SPARK-27689: - Priority: Major (was: Critical) > Error to execute hive views with spark > -- > > Key: SPARK-27689 > URL: https://issues.apache.org/jira/browse/SPARK-27689 > Project: Spark > Issue Type: Bug > Components: SQL >Affects Versions: 2.3.0, 2.3.3, 2.4.3 >Reporter: Juan Antonio >Priority: Major > > I have a python error when I execute the following code using hive views but > it works correctly when I run it with hive tables. > *Hive databases:* > {code:java} > CREATE DATABASE schema_p LOCATION "hdfs:///tmp/schema_p"; > {code} > *Hive tables:* > {code:java} > CREATE TABLE schema_p.person( > id_person string, > identifier string, > gender string, > start_date string, > end_date string) > ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe' > STORED AS INPUTFORMAT 'org.apache.hadoop.mapred.TextInputFormat' > OUTPUTFORMAT 'org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat' > LOCATION 'hdfs:///tmp/schema_p/person'; > {code} > {code:java} > CREATE TABLE schema_p.product( > id_product string, > name string, > country string, > city string, > start_date string, > end_date string > ) > ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe' > STORED AS INPUTFORMAT 'org.apache.hadoop.mapred.TextInputFormat' > OUTPUTFORMAT 'org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat' > LOCATION 'hdfs:///tmp/schema_p/product'; > {code} > {code:java} > CREATE TABLE schema_p.person_product( > id_person string, > id_product string, > country string, > city string, > price string, > start_date string, > end_date string > ) > ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe' > STORED AS INPUTFORMAT 'org.apache.hadoop.mapred.TextInputFormat' > OUTPUTFORMAT 'org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat' > LOCATION 'hdfs:///tmp/schema_p/person_product'; > {code} > *Hive views:* > {code:java} > CREATE VIEW schema_p.person_v AS SELECT CAST(id_person AS INT) AS id_person, > CAST(identifier AS INT) AS identifier, gender AS gender, CAST(start_date AS > DATE) AS start_date, CAST(end_date AS DATE) AS end_date FROM schema_p.person; > CREATE VIEW schema_p.product_v AS SELECT CAST(id_product AS INT) AS > id_product, name AS name, country AS country, city AS city, CAST(start_date > AS DATE) AS start_date, CAST(end_date AS DATE) AS end_date FROM > schema_p.product; > CREATE VIEW schema_p.person_product_v AS SELECT CAST(id_person AS INT) AS > id_person, CAST(id_product AS INT) AS id_product, country AS country, city AS > city, CAST(price AS DECIMAL(38,8)) AS price, CAST(start_date AS DATE) AS > start_date, CAST(end_date AS DATE) AS end_date FROM schema_p.person_product; > {code} > * > *Code*: > > {code} > def read_tables(sc): > in_dict = { 'person': 'person_v', 'product': 'product_v', 'person_product': > 'person_product_v' } > data_dict = {} > for n, d in in_dict.iteritems(): > data_dict[n] = sc.read.table(d) > return data_dict > def get_population(tables, ref_date_str): > person = tables['person'] > product = tables['product'] > person_product = tables['person_product'] > person_product_join = person_product.join(product,'id_product') > count_prod = > person_product.groupBy('id_product').agg(F.count('id_product').alias('count_prod')) > person_count = person_product_join.join(count_prod,'id_product') > final1 = person_product_join.join(person_count, 'id_person', 'left') > final = final1.withColumn('reference_date', F.lit(ref_date_str)) > return final > import pyspark.sql.functions as F > import functools > from pyspark.sql.functions import col > from pyspark.sql.functions import add_months, lit, count, coalesce > spark.sql('use schema_p') > data_dict = read_tables(spark) > data_dict > population = get_population(data_dict, '2019-04-30') > population.show(){code} > > *Error:* > {code:java} > File "", line 1, in > File "", line 10, in get_population > File "/usr/hdp/current/spark2-client/python/pyspark/sql/dataframe.py", line > 931, in join > jdf = self._jdf.join(other._jdf, on, how) > File > "/usr/hdp/current/spark2-client/python/lib/py4j-0.10.6-src.zip/py4j/java_gateway.py", > line 1160, in __call__ > File "/usr/hdp/current/spark2-client/python/pyspark/sql/utils.py", line 69, > in deco > raise AnalysisException(s.split(': ', 1)[1], stackTrace) > pyspark.sql.utils.AnalysisException: u'Resolved attribute(s) > id_product#124,end_date#129,city#126,price#127,start_date#128,id_person#123,country#125 > missing from > city#47,price#48,start_date#49,id_product#45,end_date#50,id_person#44,country#46 > in
[jira] [Updated] (SPARK-27689) Error to execute hive views with spark
[ https://issues.apache.org/jira/browse/SPARK-27689?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel ] Hyukjin Kwon updated SPARK-27689: - Description: I have a python error when I execute the following code using hive views but it works correctly when I run it with hive tables. *Hive databases:* {code:java} CREATE DATABASE schema_p LOCATION "hdfs:///tmp/schema_p"; {code} *Hive tables:* {code:java} CREATE TABLE schema_p.person( id_person string, identifier string, gender string, start_date string, end_date string) ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe' STORED AS INPUTFORMAT 'org.apache.hadoop.mapred.TextInputFormat' OUTPUTFORMAT 'org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat' LOCATION 'hdfs:///tmp/schema_p/person'; {code} {code:java} CREATE TABLE schema_p.product( id_product string, name string, country string, city string, start_date string, end_date string ) ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe' STORED AS INPUTFORMAT 'org.apache.hadoop.mapred.TextInputFormat' OUTPUTFORMAT 'org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat' LOCATION 'hdfs:///tmp/schema_p/product'; {code} {code:java} CREATE TABLE schema_p.person_product( id_person string, id_product string, country string, city string, price string, start_date string, end_date string ) ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe' STORED AS INPUTFORMAT 'org.apache.hadoop.mapred.TextInputFormat' OUTPUTFORMAT 'org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat' LOCATION 'hdfs:///tmp/schema_p/person_product'; {code} *Hive views:* {code:java} CREATE VIEW schema_p.person_v AS SELECT CAST(id_person AS INT) AS id_person, CAST(identifier AS INT) AS identifier, gender AS gender, CAST(start_date AS DATE) AS start_date, CAST(end_date AS DATE) AS end_date FROM schema_p.person; CREATE VIEW schema_p.product_v AS SELECT CAST(id_product AS INT) AS id_product, name AS name, country AS country, city AS city, CAST(start_date AS DATE) AS start_date, CAST(end_date AS DATE) AS end_date FROM schema_p.product; CREATE VIEW schema_p.person_product_v AS SELECT CAST(id_person AS INT) AS id_person, CAST(id_product AS INT) AS id_product, country AS country, city AS city, CAST(price AS DECIMAL(38,8)) AS price, CAST(start_date AS DATE) AS start_date, CAST(end_date AS DATE) AS end_date FROM schema_p.person_product; {code} * *Code*: {code} def read_tables(sc): in_dict = { 'person': 'person_v', 'product': 'product_v', 'person_product': 'person_product_v' } data_dict = {} for n, d in in_dict.iteritems(): data_dict[n] = sc.read.table(d) return data_dict def get_population(tables, ref_date_str): person = tables['person'] product = tables['product'] person_product = tables['person_product'] person_product_join = person_product.join(product,'id_product') count_prod = person_product.groupBy('id_product').agg(F.count('id_product').alias('count_prod')) person_count = person_product_join.join(count_prod,'id_product') final1 = person_product_join.join(person_count, 'id_person', 'left') final = final1.withColumn('reference_date', F.lit(ref_date_str)) return final import pyspark.sql.functions as F import functools from pyspark.sql.functions import col from pyspark.sql.functions import add_months, lit, count, coalesce spark.sql('use schema_p') data_dict = read_tables(spark) data_dict population = get_population(data_dict, '2019-04-30') population.show(){code} *Error:* {code:java} File "", line 1, in File "", line 10, in get_population File "/usr/hdp/current/spark2-client/python/pyspark/sql/dataframe.py", line 931, in join jdf = self._jdf.join(other._jdf, on, how) File "/usr/hdp/current/spark2-client/python/lib/py4j-0.10.6-src.zip/py4j/java_gateway.py", line 1160, in __call__ File "/usr/hdp/current/spark2-client/python/pyspark/sql/utils.py", line 69, in deco raise AnalysisException(s.split(': ', 1)[1], stackTrace) pyspark.sql.utils.AnalysisException: u'Resolved attribute(s) id_product#124,end_date#129,city#126,price#127,start_date#128,id_person#123,country#125 missing from city#47,price#48,start_date#49,id_product#45,end_date#50,id_person#44,country#46 in operator !Project [cast(id_person#123 as int) AS id_person#96, cast(id_product#124 as int) AS id_product#97, cast(country#125 as string) AS country#98, cast(city#126 as string) AS city#99, cast(price#127 as decimal(38,8)) AS price#100, cast(start_date#128 as date) AS start_date#101, cast(end_date#129 as date) AS end_date#102|#123 as int) AS id_person#96, cast(id_product#124 as int) AS id_product#97, cast(country#125 as string) AS country#98, cast(city#126 as string) AS city#99, cast(price#127 as decimal(38,8)) AS price#100, cast(start_date#128 as date) AS start_date#101, cast(end_date#129 as date) AS
[jira] [Updated] (SPARK-27689) Error to execute hive views with spark
[ https://issues.apache.org/jira/browse/SPARK-27689?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel ] Yuming Wang updated SPARK-27689: Description: I have a python error when I execute the following code using hive views but it works correctly when I run it with hive tables. *Hive databases:* CREATE DATABASE schema_p LOCATION "hdfs:///tmp/schema_p"; *Hive tables:* CREATE TABLE schema_p.person( id_person string, identifier string, gender string, start_date string, end_date string) ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe' STORED AS INPUTFORMAT 'org.apache.hadoop.mapred.TextInputFormat' OUTPUTFORMAT 'org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat' LOCATION 'hdfs:///tmp/schema_p/person'; CREATE TABLE schema_p.product( id_product string, name string, country string, city string, start_date string, end_date string ) ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe' STORED AS INPUTFORMAT 'org.apache.hadoop.mapred.TextInputFormat' OUTPUTFORMAT 'org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat' LOCATION 'hdfs:///tmp/schema_p/product'; CREATE TABLE schema_p.person_product( id_person string, id_product string, country string, city string, price string, start_date string, end_date string ) ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe' STORED AS INPUTFORMAT 'org.apache.hadoop.mapred.TextInputFormat' OUTPUTFORMAT 'org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat' LOCATION 'hdfs:///tmp/schema_p/person_product'; *Hive views:* CREATE VIEW schema_p.person_v AS SELECT CAST(id_person AS INT) AS id_person, CAST(identifier AS INT) AS identifier, gender AS gender, CAST(start_date AS DATE) AS start_date, CAST(end_date AS DATE) AS end_date FROM schema_p.person; CREATE VIEW schema_p.product_v AS SELECT CAST(id_product AS INT) AS id_product, name AS name, country AS country, city AS city, CAST(start_date AS DATE) AS start_date, CAST(end_date AS DATE) AS end_date FROM schema_p.product; CREATE VIEW schema_p.person_product_v AS SELECT CAST(id_person AS INT) AS id_person, CAST(id_product AS INT) AS id_product, country AS country, city AS city, CAST(price AS DECIMAL(38,8)) AS price, CAST(start_date AS DATE) AS start_date, CAST(end_date AS DATE) AS end_date FROM schema_p.person_product; * *Code*: {code:python} def read_tables(sc): in_dict = { 'person': 'person_v', 'product': 'product_v', 'person_product': 'person_product_v' } data_dict = {} for n, d in in_dict.iteritems(): data_dict[n] = sc.read.table(d) return data_dict def get_population(tables, ref_date_str): person = tables['person'] product = tables['product'] person_product = tables['person_product'] person_product_join = person_product.join(product,'id_product') count_prod = person_product.groupBy('id_product').agg(F.count('id_product').alias('count_prod')) person_count = person_product_join.join(count_prod,'id_product') final1 = person_product_join.join(person_count, 'id_person', 'left') final = final1.withColumn('reference_date', F.lit(ref_date_str)) return final import pyspark.sql.functions as F import functools from pyspark.sql.functions import col from pyspark.sql.functions import add_months, lit, count, coalesce spark.sql('use schema_p') data_dict = read_tables(spark) data_dict population = get_population(data_dict, '2019-04-30') population.show(){code} * *Error:* File "", line 1, in File "", line 10, in get_population File "/usr/hdp/current/spark2-client/python/pyspark/sql/dataframe.py", line 931, in join jdf = self._jdf.join(other._jdf, on, how) File "/usr/hdp/current/spark2-client/python/lib/py4j-0.10.6-src.zip/py4j/java_gateway.py", line 1160, in __call__ File "/usr/hdp/current/spark2-client/python/pyspark/sql/utils.py", line 69, in deco raise AnalysisException(s.split(': ', 1)[1], stackTrace) pyspark.sql.utils.AnalysisException: u'Resolved attribute(s) id_product#124,end_date#129,city#126,price#127,start_date#128,id_person#123,country#125 missing from city#47,price#48,start_date#49,id_product#45,end_date#50,id_person#44,country#46 in operator !Project [cast(id_person#123 as int) AS id_person#96, cast(id_product#124 as int) AS id_product#97, cast(country#125 as string) AS country#98, cast(city#126 as string) AS city#99, cast(price#127 as decimal(38,8)) AS price#100, cast(start_date#128 as date) AS start_date#101, cast(end_date#129 as date) AS end_date#102|#123 as int) AS id_person#96, cast(id_product#124 as int) AS id_product#97, cast(country#125 as string) AS country#98, cast(city#126 as string) AS city#99, cast(price#127 as decimal(38,8)) AS price#100, cast(start_date#128 as date) AS start_date#101, cast(end_date#129 as date) AS end_date#102]. Attribute(s) with the same name appear in the