Hi Mathew, thanks for answering this, I've also tried with a simple case class and it works fine. I'm using this case class structure, which is failing:
import java.text.SimpleDateFormat import java.util.Calendar import scala.annotation.tailrec trait TabbedToString { _: Product => override def toString: String = excludeSome(productIterator.mkString(",").replaceAll("None", "null")) @tailrec private def excludeSome(s: String): String = { val someIdx = s.indexOf("Some(") if (someIdx < 0) { s } else { val endIdx = s.indexOf(")", someIdx) excludeSome(s.substring(0, someIdx + 5) + s.substring(endIdx + 1)) } } } case class PartialCallEndModule(failoverCorrelationId: String = null) extends TabbedToString case class PartialCallBeginModule(failoverCorrelationId: String = null) extends TabbedToString case class CallingPartyAddress(callingPartyAddress: String) extends TabbedToString case class EarlyMedia(sdpOfferTimestamp: String= null, sdpAnswerTimestamp: String= null, earlyMediaSdp: String= null, earlyMediaInitiatorFlag: String = null) extends TabbedToString case class MessageBody(bodyContentType: String= null, bodyContentLength: String= null, bodyContentDisposition: String= null, bodyOriginator: String = null) extends TabbedToString case class TgppModule(primaryDeviceLinePort: String= null, calledAssertedIdentity: String= null, calledAssertedPresentationIndicator: String= null, sdp: String= null, mediaInitiatorFlag: String= null, earlyMediaList: Array[EarlyMedia], messageBodyList: Array[MessageBody], sipErrorCode: String = null, callingPartyAddressList: Array[CallingPartyAddress]) extends TabbedToString case class CorrelationInfo(key: String= null, creator: String= null, originatorNetwork: String= null, terminatorNetwork: String= null, otherInfoInPCV: String = null) extends TabbedToString case class IpModule(route: String= null, networkCallID: String= null, codec: String= null, accessDeviceAddress: String= null, accessCallID: String= null, accessNetworkInfo: String= null, correlationInfo: CorrelationInfo= null, chargingFunctionAddresses: String= null, codecUsage: String= null, routingNumber: String= null, pCamelLocInfo: String= null, pCamelMscAddress: String= null, pCamelCellIDorLAI: String= null, userAgent: String= null, gets: String = null) extends TabbedToString case class Location(location: String, locationType: String) extends TabbedToString case class LocationInformation(location: Location = null, locationType: String = null) case class Host(group: String= null, userId: String= null, userNumber: String= null, groupNumber: String = null) extends TabbedToString case class FlexibleSeatingHost(hostGroup: String= null, hostUserId: String= null, hostUserNumber: String= null, hostGroupNumber:String = null) case class ServiceExtension(serviceName: String= null, invocationTime: String= null, facResult: String= null, host: Host= null, pushToTalk: String= null, relatedCallId: String= null, mediaSelection: String= null, action: String= null, result: String= null, `type`: String= null, startTime: String= null, stopTime: String= null, confId: String= null, locationActivationResult: String= null, locationDeactivationResult: String= null, callRetrieveResult: String= null, charge: String= null, currency: String= null, time: String= null, sum: String= null, callBridgeResult: String= null, nightServiceActivationMOResult: String= null, nightServiceDeactivationMOResult: String= null, forcedForwardingActivationResult: String= null, forcedForwardingDeactivationResult: String= null, outgoingCallCenterCallFACResult: String= null, outgoingPersonalCallFACResult: String= null, outgoingCallCenterPhoneNumber: String= null, outgoingCallCenterUserId: String= null, outgoingCallCenterGroupNumber: String= null, routingNumber: String= null, preAlertingDuration: String= null, conferenceId: String= null, role: String= null, bridge: String= null, owner: String= null, ownerDN: String= null, title: String= null, projectCode: String= null, recordingDuration: String= null, transactionId: String= null, mobilityNumber: String= null, mobilityRoutingNumber: String= null, recordingTrigger: String= null, recordingDestination: String= null, recordingResult: String= null, sccCallId: String= null, sccNumber: String= null, sccCause: String= null, targetHungGroupId: String= null, flexibleSeatingHost: FlexibleSeatingHost = null) extends TabbedToString case class CenterxModule(group: String= null, department: String= null, accountCode: String= null, authorizationCode: String= null, cbfAuthorizationCode: String= null, callingPartyCategory: String= null, outsideAccessCode: String= null, originalCalledNumber: String= null, originalCalledNumberContext: String= null, originalCalledPresentationIndicator: String= null, originalCalledReason: String= null, redirectingNumber: String= null, redirectingNumberContext: String= null, redirectingPresentationIndicator: String= null, redirectingReason: String= null, trunkGroupName: String= null, trunkGroupInfo: String= null, chargeNumber: String= null, relatedCallId: String= null, relatedCallIdReason: String= null, faxMessaging: String= null, twoStageDiallingDigits: String= null, recallType: String= null, originationMethod: String= null, serviceExtensionList: Array[ServiceExtension], prepaidStatus: String= null, configurableCLID: String= null, virtualOnNetType: String= null, officeZone: String= null, primaryZone: String= null, roamingMscAddress: String= null, customSchemaVersion: String= null, locationList: Array[Location], locationUsage: String= null, cicInsertedAsCac: String = null, extTrackingId: String = null) extends TabbedToString case class RecordId(eventCounter: String, systemId: String, date: String, systemTimeZone: String) extends TabbedToString case class HeaderModule(recordId: RecordId, serviceProvider: String = null, `type`: String) extends TabbedToString case class BasicModule(userNumber: String = null, groupNumber: String = null, direction: String= null, asCallType: String= null, callingNumber: String= null, callingNumberContext: String= null, callingPresentationNumber: String= null, callingPresentationNumberContext: String= null, callingAssertedNumber: String= null, callingAssertedNumberContext: String= null, dialableCallingNumber: String= null, callingPresentationIndicator:String= null, dialedDigits: String= null, dialedDigitsContext: String= null, calledNumber: String= null, calledNumberContext: String= null, networkTranslatedNumber: String= null, networkTranslatedNumberContext: String= null, networkTranslatedGroup: String= null, startTime: String= null, userTimeZone: String= null, localCallId: String= null, remoteCallId: String= null, answerIndicator: String= null, answerTime: String= null, releaseTime: String= null, terminationCause: String= null, q850Cause: String= null, carrierIdentificationCode: String= null, callCategory: String= null, networkCallType: String= null, chargeIndicator: String= null, typeOfNetwork: String= null, releasingParty: String= null, userId: String= null, otherPartyName: String= null, otherPartyNamePresentationIndicator: String= null, clidPermitted: String= null, receivedCallingNumber: String= null, namePermitted: String = null) extends TabbedToString // enriched classes case class CompletedCDRs(completed: List[RichCDR], incompleted: List[CDR]) case class DonoUnidadeTelefonica(id: Long, nome: String) case class CDR(headerModule: HeaderModule, basicModule: BasicModule = null, centerxModule: CenterxModule = null , ipModule: IpModule = null, tgppModule: TgppModule = null, partialCallBeginModule: PartialCallBeginModule = null, partialCallEndModule: PartialCallEndModule = null) extends TabbedToString 2017-05-09 4:54 GMT-03:00 Matthew cao <cybea...@gmail.com>: > Hi, > I have tried simple test like this: > case class A(id: Long) > val sample = spark.range(0,10).as[A] > sample.createOrReplaceTempView("sample") > val df = spark.emptyDataset[A] > val df1 = spark.sql("select * from sample").as[A] > df.union(df1) > > It runs ok. And for nullabillity I thought that issue has been fixed: > https://issues.apache.org/jira/browse/SPARK-18058 > I think you can check your spark version and schema of dataset again? Hope > this help. > > Best, > > On 2017年5月9日, at 04:56, Dirceu Semighini Filho <dirceu.semigh...@gmail.com> > wrote: > > Ok, great, > Well I havn't provided a good example of what I'm doing. Let's assume that > my case class is > case class A(tons of fields, with sub classes) > > val df = sqlContext.sql("select * from a").as[A] > > val df2 = spark.emptyDataset[A] > > df.union(df2) > > This code will throw the exception. > Is this expected? I assume that when I do as[A] it will convert the schema > to the case class schema, and it shouldn't throw the exception, or this > will be done lazy when the union is been processed? > > > > 2017-05-08 17:50 GMT-03:00 Burak Yavuz <brk...@gmail.com>: > >> Yes, unfortunately. This should actually be fixed, and the union's schema >> should have the less restrictive of the DataFrames. >> >> On Mon, May 8, 2017 at 12:46 PM, Dirceu Semighini Filho < >> dirceu.semigh...@gmail.com> wrote: >> >>> HI Burak, >>> By nullability you mean that if I have the exactly the same schema, but >>> one side support null and the other doesn't, this exception (in union >>> dataset) will be thrown? >>> >>> >>> >>> 2017-05-08 16:41 GMT-03:00 Burak Yavuz <brk...@gmail.com>: >>> >>>> I also want to add that generally these may be caused by the >>>> `nullability` field in the schema. >>>> >>>> On Mon, May 8, 2017 at 12:25 PM, Shixiong(Ryan) Zhu < >>>> shixi...@databricks.com> wrote: >>>> >>>>> This is because RDD.union doesn't check the schema, so you won't see >>>>> the problem unless you run RDD and hit the incompatible column problem. >>>>> For >>>>> RDD, You may not see any error if you don't use the incompatible column. >>>>> >>>>> Dataset.union requires compatible schema. You can print ds.schema and >>>>> ds1.schema and check if they are same. >>>>> >>>>> On Mon, May 8, 2017 at 11:07 AM, Dirceu Semighini Filho < >>>>> dirceu.semigh...@gmail.com> wrote: >>>>> >>>>>> Hello, >>>>>> I've a very complex case class structure, with a lot of fields. >>>>>> When I try to union two datasets of this class, it doesn't work with >>>>>> the following error : >>>>>> ds.union(ds1) >>>>>> Exception in thread "main" org.apache.spark.sql.AnalysisException: >>>>>> Union can only be performed on tables with the compatible column types >>>>>> >>>>>> But when use it's rdd, the union goes right: >>>>>> ds.rdd.union(ds1.rdd) >>>>>> res8: org.apache.spark.rdd.RDD[ >>>>>> >>>>>> Is there any reason for this to happen (besides a bug ;) ) >>>>>> >>>>>> >>>>>> >>>>> >>>> >>> >> > >