When I run the script below I get lots of '()' output. Can anyone guide me why 
I get no data in B (PIg version=0.12.1 and A dumps OK)
TIA!!!!



A = load 'hdfs:///user/hduser/smsCorpus_en_2012.04.30_all.xml' using 
org.apache.pig.piggybank.storage.XMLLoader('message') 
    as (x:chararray);
describe A;


B = foreach A GENERATE FLATTEN(REGEX_EXTRACT_ALL(x, 
'<message>\\n\\s*<text>(.*)</text>\\n\\s*    
<source>\\n\\s*<srcNumber>(.*)</srcNumber>\\n\\s*<phoneModel 
(.*)/>\\n\\s*<userProfile>\\n\\s*<userID>(.*)</userID>\\n\\s*<age>(.*)</age>\\n\\s*<gender>(.*)</gender>\\n\\s*<nativeSpeaker>(.*)</nativeSpeaker>\\n\\s*<country>(.*)</country>\\n\\s*<city>(.*)</city>\\n\\s*<experience>(.*)</experience>\\n\\s*<frequency>(.*)</frequency>\\n\\s*<inputMethod>(.*)</inputMethod>\\n\\s*</userProfile>\\n\\s*</source>\\n\\s*<destination
 
(.*)>\\n\\s*<destNumber>(.*)</destNumber>\\n\\s*</destination>\\n\\s*<messageProfile
 (.*)/>\\n\\s*<collectionMethod (.*)/>\\n\\s*</message>')) 
as (SMStext:chararray, srcNumber:chararray, phoneModel:chararray,
    userID:chararray, age:chararray, gender:chararray, nativeSpeaker:chararray, 
    country:chararray, city:chararray, experience:chararray, 
frequency:chararray, 
    inputMethod:chararray, destination:chararray, destNumber:chararray, 
messageProfile:chararray, collectionMethod:chararray);
     
describe B;
dump B;
    
    /* EXAMPLE DATA FROM NUS SMS CORPUS 

<message id="1">
      <text>K</text>
      <source>
    <srcNumber>79780a9dbe83fd1e5dd2bd2543e7da2a</srcNumber>
    <phoneModel manufactuer="Nokia" smartphone="unknown"/>
    <userProfile>
      <userID>79780a9dbe83fd1e5dd2bd2543e7da2a</userID>
      <age>21-25</age>
      <gender>unknown</gender>
      <nativeSpeaker>yes</nativeSpeaker>
      <country>India</country>
      <city>Tiruppur</city>
      <experience>3 to 5 years</experience>
      <frequency>More than 50 SMS daily</frequency>
      <inputMethod>Multi-tap</inputMethod>
    </userProfile>
      </source>
      <destination country="unknown">
    <destNumber>0ffc7585148560b7520931d354c00a9b</destNumber>
      </destination>
      <messageProfile language="en" time="2010.10.24 11:59" type="send"/>
      <collectionMethod collector="Tao Chen" method="SMS Export" 
time="2010/11"/>
    </message>

    */

Reply via email to