Author: mattmann
Date: Tue Apr 12 22:07:54 2011
New Revision: 1091581

URL: http://svn.apache.org/viewvc?rev=1091581&view=rev
Log:
- fix for OODT-172 Improvements to the Filename Extractor

Modified:
    oodt/trunk/CHANGES.txt
    oodt/trunk/metadata/src/main/java/org/apache/oodt/cas/metadata/extractors/FilenameTokenConfig.java
    oodt/trunk/metadata/src/main/java/org/apache/oodt/cas/metadata/extractors/FilenameTokenExtractorMetKeys.java
    oodt/trunk/metadata/src/main/java/org/apache/oodt/cas/metadata/extractors/FilenameTokenMetExtractor.java
    oodt/trunk/metadata/src/main/resources/examples/filename.extractor.config.xml

Modified: oodt/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/oodt/trunk/CHANGES.txt?rev=1091581&r1=1091580&r2=1091581&view=diff
==============================================================================
--- oodt/trunk/CHANGES.txt (original)
+++ oodt/trunk/CHANGES.txt Tue Apr 12 22:07:54 2011
@@ -4,6 +4,8 @@ Apache OODT Change Log
 Release 0.3-SNAPSHOT (in progress)
 --------------------------------------------
 
+* OODT-172 Improvements to the Filename Extractor (mattmann)
+
 * OODT-170 cas-catalog shuffles query results . . . order is lost (bfoster)
 
 * OODT-169 Pushpull dirstruct xml files fail to replace global 

Modified: oodt/trunk/metadata/src/main/java/org/apache/oodt/cas/metadata/extractors/FilenameTokenConfig.java
URL: http://svn.apache.org/viewvc/oodt/trunk/metadata/src/main/java/org/apache/oodt/cas/metadata/extractors/FilenameTokenConfig.java?rev=1091581&r1=1091580&r2=1091581&view=diff
==============================================================================
--- oodt/trunk/metadata/src/main/java/org/apache/oodt/cas/metadata/extractors/FilenameTokenConfig.java (original)
+++ oodt/trunk/metadata/src/main/java/org/apache/oodt/cas/metadata/extractors/FilenameTokenConfig.java Tue Apr 12 22:07:54 2011
@@ -15,12 +15,26 @@
  * limitations under the License.
  */
 
-
 package org.apache.oodt.cas.metadata.extractors;
 
+//JDK imports
+import java.io.File;
+import java.io.FileInputStream;
+import java.io.FileNotFoundException;
+import java.text.SimpleDateFormat;
+import java.util.List;
+
 //OODT imports
 import org.apache.oodt.cas.metadata.MetExtractorConfig;
+import org.apache.oodt.cas.metadata.Metadata;
+import org.apache.oodt.cas.metadata.util.PathUtils;
+import static org.apache.oodt.cas.metadata.extractors.FilenameTokenExtractorMetKeys.*;
+import org.apache.oodt.pcs.input.PGEConfigFileException;
+import org.apache.oodt.pcs.input.PGEConfigFileReader;
 import org.apache.oodt.pcs.input.PGEConfigurationFile;
+import org.apache.oodt.pcs.input.PGEGroup;
+import org.apache.oodt.pcs.input.PGEScalar;
+import org.apache.oodt.pcs.input.PGEVector;
 
 /**
  * 
@@ -42,6 +56,64 @@ public class FilenameTokenConfig impleme
     this.conf = conf;
   }
 
+  public void setConfig(String confFilePath) throws FileNotFoundException,
+      PGEConfigFileException {
+    this.conf = new PGEConfigFileReader().read(new FileInputStream(new File(
+        confFilePath)));
+  }
+
+  public SimpleDateFormat getDateFormatter() {
+    return new SimpleDateFormat(this.conf.getPgeSpecificGroups().get(
+        PRODUCTION_DATE_TIME_GROUP).getScalar(DATETIME_SCALAR).getValue());
+  }
+
+  public String getTokenDelimeterScalar() {
+    return this.conf.getPgeSpecificGroups().get(TOKEN_LIST_GROUP).getScalar(
+        TOKEN_DELIMETER_SCALAR).getValue();
+  }
+
+  public List<String> getTokenMetKeyNames() {
+    return (List<String>) (List<?>) this.conf.getPgeSpecificGroups().get(
+        TOKEN_LIST_GROUP).getVector(TOKEN_MET_KEYS_VECTOR).getElements();
+  }
+
+  public Metadata getSubstringOffsetMet(File file) {
+    PGEGroup substrOffsetGroup = this.conf.getPgeSpecificGroups().get(
+        SUBSTRING_OFFSET_GROUP);
+    Metadata met = new Metadata();
+    String filename = file.getName();
+
+    for (PGEVector vec : substrOffsetGroup.getVectors().values()) {
+      String metKeyName = vec.getName();
+      int offset = Integer.valueOf((String) vec.getElements().get(0)) - 1;
+      int length = Integer.valueOf((String) vec.getElements().get(1));
+      String metVal = filename.substring(offset, offset + length).trim();
+      met.addMetadata(metKeyName, metVal);
+    }
+
+    return met;
+  }
+
+  public Metadata getCommonMet() {
+    PGEGroup commonMetGroup = this.conf.getPgeSpecificGroups().get(
+        COMMON_METADATA_GROUP);
+    Metadata met = new Metadata();
+    for (String scalarName : commonMetGroup.getScalars().keySet()) {
+      PGEScalar scalar = commonMetGroup.getScalar(scalarName);
+      met.addMetadata(scalar.getName(), PathUtils.replaceEnvVariables(scalar
+          .getValue()));
+    }
+
+    for (String vecName : commonMetGroup.getVectors().keySet()) {
+      PGEVector vec = commonMetGroup.getVector(vecName);
+      for (String val : (List<String>) (List<?>) vec.getElements()) {
+        met.addMetadata(vecName, PathUtils.replaceEnvVariables(val));
+      }
+    }
+
+    return met;
+  }
+
   /**
    * @return the conf
    */

Modified: oodt/trunk/metadata/src/main/java/org/apache/oodt/cas/metadata/extractors/FilenameTokenExtractorMetKeys.java
URL: http://svn.apache.org/viewvc/oodt/trunk/metadata/src/main/java/org/apache/oodt/cas/metadata/extractors/FilenameTokenExtractorMetKeys.java?rev=1091581&r1=1091580&r2=1091581&view=diff
==============================================================================
--- oodt/trunk/metadata/src/main/java/org/apache/oodt/cas/metadata/extractors/FilenameTokenExtractorMetKeys.java (original)
+++ oodt/trunk/metadata/src/main/java/org/apache/oodt/cas/metadata/extractors/FilenameTokenExtractorMetKeys.java Tue Apr 12 22:07:54 2011
@@ -28,10 +28,20 @@ package org.apache.oodt.cas.metadata.ext
  */
 public interface FilenameTokenExtractorMetKeys {
   
-  String TIME_FORMAT_STRING_SCALAR = "TimeFormatString";
+  public static final String TIME_FORMAT_STRING_SCALAR = "TimeFormatString";
   
-  String SUBSTRING_OFFSET_GROUP = "SubstringOffsetGroup";
+  public static final String SUBSTRING_OFFSET_GROUP = "SubstringOffsetGroup";
+    
+  public static final String TOKEN_LIST_GROUP = "TokenNameListGroup";
   
-  String COMMON_METADATA_GROUP = "CommonMetadata";
+  public static final String TOKEN_DELIMETER_SCALAR = "Delimeter";
+
+  public static final String TOKEN_MET_KEYS_VECTOR = "TokenMetKeys";
+
+  public static final String PRODUCTION_DATE_TIME_GROUP = "ProductionDateTimeGroup";
+
+  public static final String DATETIME_SCALAR = "DateTimeFormat";
+
+  public static final String COMMON_METADATA_GROUP = "CommonMetadata";  
 
 }

Modified: oodt/trunk/metadata/src/main/java/org/apache/oodt/cas/metadata/extractors/FilenameTokenMetExtractor.java
URL: http://svn.apache.org/viewvc/oodt/trunk/metadata/src/main/java/org/apache/oodt/cas/metadata/extractors/FilenameTokenMetExtractor.java?rev=1091581&r1=1091580&r2=1091581&view=diff
==============================================================================
--- oodt/trunk/metadata/src/main/java/org/apache/oodt/cas/metadata/extractors/FilenameTokenMetExtractor.java (original)
+++ oodt/trunk/metadata/src/main/java/org/apache/oodt/cas/metadata/extractors/FilenameTokenMetExtractor.java Tue Apr 12 22:07:54 2011
@@ -15,23 +15,22 @@
  * limitations under the License.
  */
 
-
 package org.apache.oodt.cas.metadata.extractors;
 
 //JDK imports
 import java.io.File;
-import java.util.logging.Level;
+import java.text.ParseException;
+import java.util.Calendar;
+import java.util.GregorianCalendar;
+import java.util.List;
 import java.util.logging.Logger;
 
 //OODT imports
 import org.apache.oodt.cas.metadata.Metadata;
 import org.apache.oodt.cas.metadata.exceptions.MetExtractionException;
 import org.apache.oodt.cas.metadata.extractors.CmdLineMetExtractor;
-import org.apache.oodt.cas.metadata.util.PathUtils;
-import org.apache.oodt.pcs.input.PGEConfigurationFile;
-import org.apache.oodt.pcs.input.PGEGroup;
-import org.apache.oodt.pcs.input.PGEScalar;
-import org.apache.oodt.pcs.input.PGEVector;
+import org.apache.oodt.commons.date.DateUtils;
+
 
 /**
  * 
@@ -45,8 +44,9 @@ import org.apache.oodt.pcs.input.PGEVect
  */
 public class FilenameTokenMetExtractor extends CmdLineMetExtractor implements
     FilenameTokenExtractorMetKeys {
-  
-  private static final Logger LOG = Logger.getLogger(FilenameTokenMetExtractor.class.getName());
+
+  private static final Logger LOG = Logger
+      .getLogger(FilenameTokenMetExtractor.class.getName());
 
   /**
    * Default constructor.
@@ -59,51 +59,45 @@ public class FilenameTokenMetExtractor e
   /*
    * (non-Javadoc)
    * 
-   * @see
-   * org.apache.oodt.cas.metadata.AbstractMetExtractor#extrMetadata(java.io
+   * @see org.apache.oodt.cas.metadata.AbstractMetExtractor#extrMetadata(java.io
    * .File)
    */
   @Override
   protected Metadata extrMetadata(File file) throws MetExtractionException {
-    Metadata met = getCoreMet(file);
-    addCommonMetadata(met);
-    return met;
-  }
-  
-  public static void main(String [] args) throws Exception{
-    processMain(args, new FilenameTokenMetExtractor());
-  }
-
-  private Metadata getCoreMet(File file) {
-    PGEGroup substrOffsetGroup = getConf().getPgeSpecificGroups().get(
-        SUBSTRING_OFFSET_GROUP);
     Metadata met = new Metadata();
     String filename = file.getName();
+    List<String> metKeyTokens = ((FilenameTokenConfig) this.config)
+        .getTokenMetKeyNames();
+    String[] filenameToks = filename.split("\\.")[0]
+        .split(((FilenameTokenConfig) this.config).getTokenDelimeterScalar());
+    for (int i = 0; i < filenameToks.length; i++) {
+      String keyName = metKeyTokens.get(i);
+      String keyVal = filenameToks[i];
+      if (keyName.equals("ProductionDateTime")) {
+        Calendar cal = GregorianCalendar.getInstance();
+        try {
+          cal.setTime(((FilenameTokenConfig) this.config).getDateFormatter()
+              .parse(keyVal));
+        } catch (ParseException e) {
+          throw new MetExtractionException(e.getMessage());
+        }
+        keyVal = DateUtils.toString(cal);
+      }
 
-    for (PGEVector vec : substrOffsetGroup.getVectors().values()) {
-      String metKeyName = vec.getName();
-      LOG.log(Level.FINE, "Extracting key: ["+metKeyName+"]");
-      int offset = Integer.valueOf((String) vec.getElements().get(0)) - 1;
-      int length = Integer.valueOf((String) vec.getElements().get(1));
-      String metVal = filename.substring(offset, offset + length).trim();
-      met.addMetadata(metKeyName, metVal);
+      met.addMetadata(keyName, keyVal);
     }
 
-    return met;
-  }
+    Metadata commonMet = ((FilenameTokenConfig) this.config).getCommonMet();
+    met.addMetadata(commonMet.getHashtable());
+    met.addMetadata(((FilenameTokenConfig)this.config).getSubstringOffsetMet(file));
 
-  private void addCommonMetadata(Metadata met) {
-    PGEGroup commonMetGroup = this.getConf().getPgeSpecificGroups().get(
-        COMMON_METADATA_GROUP);
-
-    for (PGEScalar metScalar : commonMetGroup.getScalars().values()) {
-      met.addMetadata(metScalar.getName(), PathUtils.replaceEnvVariables(
-          metScalar.getValue(), met));
-    }
+    met.addMetadata("Filename", file.getName());
+    met.addMetadata("FileLocation", file.getParentFile().getAbsolutePath());
+    return met;
   }
 
-  private PGEConfigurationFile getConf() {
-    return ((FilenameTokenConfig) this.config).getConf();
+  public static void main(String[] args) throws Exception {
+    processMain(args, new FilenameTokenMetExtractor());
   }
 
 }

Modified: oodt/trunk/metadata/src/main/resources/examples/filename.extractor.config.xml
URL: http://svn.apache.org/viewvc/oodt/trunk/metadata/src/main/resources/examples/filename.extractor.config.xml?rev=1091581&r1=1091580&r2=1091581&view=diff
==============================================================================
--- oodt/trunk/metadata/src/main/resources/examples/filename.extractor.config.xml (original)
+++ oodt/trunk/metadata/src/main/resources/examples/filename.extractor.config.xml Tue Apr 12 22:07:54 2011
@@ -16,6 +16,24 @@ License for the specific language govern
 the License.
 -->
 <input>
+
+   <group name="TokenNameListGroup">
+     <scalar name="Delimeter">_</scalar>
+     <vector name="TokenMetKeys">
+    
+     <!-- 
+     [sourcename]_[tablename]_[productiondatetime].[ext]
+     
+     ex file name: datasource_tablename_20101129122700.txt
+     -->
+       <element>SourceName</element>
+       <element>TableName</element>
+       <element>ProductionDateTime</element>
+     </vector>
+  
+   </group>
+
+
     <group name="SubstringOffsetGroup">
      <!--  the indices for the substring met key selection -->
      <!--  the first element is the starting index in the string -->
@@ -47,8 +65,17 @@ the License.
       </vector>
       
    </group>
+   
+   <group name="ProductionDateTimeGroup">
+     <scalar name="DateTimeFormat">yyyyMMddHHmmss</scalar>
+   </group>   
     
     <group name="CommonMetadata">
+       <!--  can now use environment variable replacement in any of the values for 
+             scalars or vectors, just use CAS bracket style [ENV VAR NAME]
+         
+             Also can use vector to indicate multiple values for a particular met field.
+        -->
         <scalar name="DataVersion">1.0</scalar>
         <scalar name="CollectionName">Products extracted by the OODT Filename Met Extractor</scalar>
         <scalar name="DataProvider">OODT</scalar>


Reply via email to