Author: mattmann
Date: Tue Apr 12 22:07:54 2011
New Revision: 1091581
URL: http://svn.apache.org/viewvc?rev=1091581&view=rev
Log:
- fix for OODT-172 Improvements to the Filename Extractor
Modified:
oodt/trunk/CHANGES.txt
oodt/trunk/metadata/src/main/java/org/apache/oodt/cas/metadata/extractors/FilenameTokenConfig.java
oodt/trunk/metadata/src/main/java/org/apache/oodt/cas/metadata/extractors/FilenameTokenExtractorMetKeys.java
oodt/trunk/metadata/src/main/java/org/apache/oodt/cas/metadata/extractors/FilenameTokenMetExtractor.java
oodt/trunk/metadata/src/main/resources/examples/filename.extractor.config.xml
Modified: oodt/trunk/CHANGES.txt
URL:
http://svn.apache.org/viewvc/oodt/trunk/CHANGES.txt?rev=1091581&r1=1091580&r2=1091581&view=diff
==============================================================================
--- oodt/trunk/CHANGES.txt (original)
+++ oodt/trunk/CHANGES.txt Tue Apr 12 22:07:54 2011
@@ -4,6 +4,8 @@ Apache OODT Change Log
Release 0.3-SNAPSHOT (in progress)
--------------------------------------------
+* OODT-172 Improvements to the Filename Extractor (mattmann)
+
* OODT-170 cas-catalog shuffles query results . . . order is lost (bfoster)
* OODT-169 Pushpull dirstruct xml files fail to replace global
Modified:
oodt/trunk/metadata/src/main/java/org/apache/oodt/cas/metadata/extractors/FilenameTokenConfig.java
URL:
http://svn.apache.org/viewvc/oodt/trunk/metadata/src/main/java/org/apache/oodt/cas/metadata/extractors/FilenameTokenConfig.java?rev=1091581&r1=1091580&r2=1091581&view=diff
==============================================================================
---
oodt/trunk/metadata/src/main/java/org/apache/oodt/cas/metadata/extractors/FilenameTokenConfig.java
(original)
+++
oodt/trunk/metadata/src/main/java/org/apache/oodt/cas/metadata/extractors/FilenameTokenConfig.java
Tue Apr 12 22:07:54 2011
@@ -15,12 +15,26 @@
* limitations under the License.
*/
-
package org.apache.oodt.cas.metadata.extractors;
+//JDK imports
+import java.io.File;
+import java.io.FileInputStream;
+import java.io.FileNotFoundException;
+import java.text.SimpleDateFormat;
+import java.util.List;
+
//OODT imports
import org.apache.oodt.cas.metadata.MetExtractorConfig;
+import org.apache.oodt.cas.metadata.Metadata;
+import org.apache.oodt.cas.metadata.util.PathUtils;
+import static
org.apache.oodt.cas.metadata.extractors.FilenameTokenExtractorMetKeys.*;
+import org.apache.oodt.pcs.input.PGEConfigFileException;
+import org.apache.oodt.pcs.input.PGEConfigFileReader;
import org.apache.oodt.pcs.input.PGEConfigurationFile;
+import org.apache.oodt.pcs.input.PGEGroup;
+import org.apache.oodt.pcs.input.PGEScalar;
+import org.apache.oodt.pcs.input.PGEVector;
/**
*
@@ -42,6 +56,64 @@ public class FilenameTokenConfig impleme
this.conf = conf;
}
+ public void setConfig(String confFilePath) throws FileNotFoundException,
+ PGEConfigFileException {
+ this.conf = new PGEConfigFileReader().read(new FileInputStream(new File(
+ confFilePath)));
+ }
+
+ public SimpleDateFormat getDateFormatter() {
+ return new SimpleDateFormat(this.conf.getPgeSpecificGroups().get(
+ PRODUCTION_DATE_TIME_GROUP).getScalar(DATETIME_SCALAR).getValue());
+ }
+
+ public String getTokenDelimeterScalar() {
+ return this.conf.getPgeSpecificGroups().get(TOKEN_LIST_GROUP).getScalar(
+ TOKEN_DELIMETER_SCALAR).getValue();
+ }
+
+ public List<String> getTokenMetKeyNames() {
+ return (List<String>) (List<?>) this.conf.getPgeSpecificGroups().get(
+ TOKEN_LIST_GROUP).getVector(TOKEN_MET_KEYS_VECTOR).getElements();
+ }
+
+ /**
+  * Extracts metadata from fixed positions in the file's name, one field per
+  * vector of the {@code SUBSTRING_OFFSET_GROUP} group.
+  *
+  * Each vector's name is the metadata key. Its first element is a 1-based
+  * start position in the file name and its second element is the length of
+  * the substring to take; the substring is trimmed before being stored.
+  *
+  * @param file
+  *          the file whose name (not full path) is sliced.
+  * @return a new {@link Metadata} holding one value per vector.
+  */
+ public Metadata getSubstringOffsetMet(File file) {
+   PGEGroup offsetGroup = this.conf.getPgeSpecificGroups().get(
+       SUBSTRING_OFFSET_GROUP);
+   Metadata offsetMet = new Metadata();
+   String name = file.getName();
+
+   for (PGEVector spec : offsetGroup.getVectors().values()) {
+     String key = spec.getName();
+     // Configured start positions are 1-based; substring() is 0-based.
+     int start = Integer.parseInt((String) spec.getElements().get(0)) - 1;
+     int end = start + Integer.parseInt((String) spec.getElements().get(1));
+     offsetMet.addMetadata(key, name.substring(start, end).trim());
+   }
+
+   return offsetMet;
+ }
+
+ /**
+  * Collects the metadata declared in the {@code COMMON_METADATA_GROUP} group.
+  *
+  * Every scalar contributes one value under the scalar's name, and every
+  * vector contributes one value per element under the vector's key. Each
+  * value is passed through {@link PathUtils#replaceEnvVariables} first.
+  *
+  * @return a new {@link Metadata} with the common metadata values.
+  */
+ public Metadata getCommonMet() {
+   PGEGroup group = this.conf.getPgeSpecificGroups().get(
+       COMMON_METADATA_GROUP);
+   Metadata commonMet = new Metadata();
+
+   // Scalars: a single value per key.
+   for (String name : group.getScalars().keySet()) {
+     PGEScalar scalar = group.getScalar(name);
+     String key = scalar.getName();
+     String resolved = PathUtils.replaceEnvVariables(scalar.getValue());
+     commonMet.addMetadata(key, resolved);
+   }
+
+   // Vectors: every element becomes another value under the same key.
+   for (String name : group.getVectors().keySet()) {
+     List<String> values = (List<String>) (List<?>) group.getVector(name)
+         .getElements();
+     for (String value : values) {
+       commonMet.addMetadata(name, PathUtils.replaceEnvVariables(value));
+     }
+   }
+
+   return commonMet;
+ }
+
/**
* @return the conf
*/
Modified:
oodt/trunk/metadata/src/main/java/org/apache/oodt/cas/metadata/extractors/FilenameTokenExtractorMetKeys.java
URL:
http://svn.apache.org/viewvc/oodt/trunk/metadata/src/main/java/org/apache/oodt/cas/metadata/extractors/FilenameTokenExtractorMetKeys.java?rev=1091581&r1=1091580&r2=1091581&view=diff
==============================================================================
---
oodt/trunk/metadata/src/main/java/org/apache/oodt/cas/metadata/extractors/FilenameTokenExtractorMetKeys.java
(original)
+++
oodt/trunk/metadata/src/main/java/org/apache/oodt/cas/metadata/extractors/FilenameTokenExtractorMetKeys.java
Tue Apr 12 22:07:54 2011
@@ -28,10 +28,20 @@ package org.apache.oodt.cas.metadata.ext
*/
public interface FilenameTokenExtractorMetKeys {
- String TIME_FORMAT_STRING_SCALAR = "TimeFormatString";
+ public static final String TIME_FORMAT_STRING_SCALAR = "TimeFormatString";
- String SUBSTRING_OFFSET_GROUP = "SubstringOffsetGroup";
+ public static final String SUBSTRING_OFFSET_GROUP = "SubstringOffsetGroup";
+
+ public static final String TOKEN_LIST_GROUP = "TokenNameListGroup";
- String COMMON_METADATA_GROUP = "CommonMetadata";
+ public static final String TOKEN_DELIMETER_SCALAR = "Delimeter";
+
+ public static final String TOKEN_MET_KEYS_VECTOR = "TokenMetKeys";
+
+ public static final String PRODUCTION_DATE_TIME_GROUP =
"ProductionDateTimeGroup";
+
+ public static final String DATETIME_SCALAR = "DateTimeFormat";
+
+ public static final String COMMON_METADATA_GROUP = "CommonMetadata";
}
Modified:
oodt/trunk/metadata/src/main/java/org/apache/oodt/cas/metadata/extractors/FilenameTokenMetExtractor.java
URL:
http://svn.apache.org/viewvc/oodt/trunk/metadata/src/main/java/org/apache/oodt/cas/metadata/extractors/FilenameTokenMetExtractor.java?rev=1091581&r1=1091580&r2=1091581&view=diff
==============================================================================
---
oodt/trunk/metadata/src/main/java/org/apache/oodt/cas/metadata/extractors/FilenameTokenMetExtractor.java
(original)
+++
oodt/trunk/metadata/src/main/java/org/apache/oodt/cas/metadata/extractors/FilenameTokenMetExtractor.java
Tue Apr 12 22:07:54 2011
@@ -15,23 +15,22 @@
* limitations under the License.
*/
-
package org.apache.oodt.cas.metadata.extractors;
//JDK imports
import java.io.File;
-import java.util.logging.Level;
+import java.text.ParseException;
+import java.util.Calendar;
+import java.util.GregorianCalendar;
+import java.util.List;
import java.util.logging.Logger;
//OODT imports
import org.apache.oodt.cas.metadata.Metadata;
import org.apache.oodt.cas.metadata.exceptions.MetExtractionException;
import org.apache.oodt.cas.metadata.extractors.CmdLineMetExtractor;
-import org.apache.oodt.cas.metadata.util.PathUtils;
-import org.apache.oodt.pcs.input.PGEConfigurationFile;
-import org.apache.oodt.pcs.input.PGEGroup;
-import org.apache.oodt.pcs.input.PGEScalar;
-import org.apache.oodt.pcs.input.PGEVector;
+import org.apache.oodt.commons.date.DateUtils;
+
/**
*
@@ -45,8 +44,9 @@ import org.apache.oodt.pcs.input.PGEVect
*/
public class FilenameTokenMetExtractor extends CmdLineMetExtractor implements
FilenameTokenExtractorMetKeys {
-
- private static final Logger LOG =
Logger.getLogger(FilenameTokenMetExtractor.class.getName());
+
+ private static final Logger LOG = Logger
+ .getLogger(FilenameTokenMetExtractor.class.getName());
/**
* Default constructor.
@@ -59,51 +59,45 @@ public class FilenameTokenMetExtractor e
/*
* (non-Javadoc)
*
- * @see
- * org.apache.oodt.cas.metadata.AbstractMetExtractor#extrMetadata(java.io
+ * @see
org.apache.oodt.cas.metadata.AbstractMetExtractor#extrMetadata(java.io
* .File)
*/
@Override
protected Metadata extrMetadata(File file) throws MetExtractionException {
- Metadata met = getCoreMet(file);
- addCommonMetadata(met);
- return met;
- }
-
- public static void main(String [] args) throws Exception{
- processMain(args, new FilenameTokenMetExtractor());
- }
-
- private Metadata getCoreMet(File file) {
- PGEGroup substrOffsetGroup = getConf().getPgeSpecificGroups().get(
- SUBSTRING_OFFSET_GROUP);
Metadata met = new Metadata();
String filename = file.getName();
+ List<String> metKeyTokens = ((FilenameTokenConfig) this.config)
+ .getTokenMetKeyNames();
+ String[] filenameToks = filename.split("\\.")[0]
+ .split(((FilenameTokenConfig) this.config).getTokenDelimeterScalar());
+ for (int i = 0; i < filenameToks.length; i++) {
+ String keyName = metKeyTokens.get(i);
+ String keyVal = filenameToks[i];
+ if (keyName.equals("ProductionDateTime")) {
+ Calendar cal = GregorianCalendar.getInstance();
+ try {
+ cal.setTime(((FilenameTokenConfig) this.config).getDateFormatter()
+ .parse(keyVal));
+ } catch (ParseException e) {
+ throw new MetExtractionException(e.getMessage());
+ }
+ keyVal = DateUtils.toString(cal);
+ }
- for (PGEVector vec : substrOffsetGroup.getVectors().values()) {
- String metKeyName = vec.getName();
- LOG.log(Level.FINE, "Extracting key: ["+metKeyName+"]");
- int offset = Integer.valueOf((String) vec.getElements().get(0)) - 1;
- int length = Integer.valueOf((String) vec.getElements().get(1));
- String metVal = filename.substring(offset, offset + length).trim();
- met.addMetadata(metKeyName, metVal);
+ met.addMetadata(keyName, keyVal);
}
- return met;
- }
+ Metadata commonMet = ((FilenameTokenConfig) this.config).getCommonMet();
+ met.addMetadata(commonMet.getHashtable());
+
met.addMetadata(((FilenameTokenConfig)this.config).getSubstringOffsetMet(file));
- private void addCommonMetadata(Metadata met) {
- PGEGroup commonMetGroup = this.getConf().getPgeSpecificGroups().get(
- COMMON_METADATA_GROUP);
-
- for (PGEScalar metScalar : commonMetGroup.getScalars().values()) {
- met.addMetadata(metScalar.getName(), PathUtils.replaceEnvVariables(
- metScalar.getValue(), met));
- }
+ met.addMetadata("Filename", file.getName());
+ met.addMetadata("FileLocation", file.getParentFile().getAbsolutePath());
+ return met;
}
- private PGEConfigurationFile getConf() {
- return ((FilenameTokenConfig) this.config).getConf();
+ public static void main(String[] args) throws Exception {
+ processMain(args, new FilenameTokenMetExtractor());
}
}
Modified:
oodt/trunk/metadata/src/main/resources/examples/filename.extractor.config.xml
URL:
http://svn.apache.org/viewvc/oodt/trunk/metadata/src/main/resources/examples/filename.extractor.config.xml?rev=1091581&r1=1091580&r2=1091581&view=diff
==============================================================================
---
oodt/trunk/metadata/src/main/resources/examples/filename.extractor.config.xml
(original)
+++
oodt/trunk/metadata/src/main/resources/examples/filename.extractor.config.xml
Tue Apr 12 22:07:54 2011
@@ -16,6 +16,24 @@ License for the specific language govern
the License.
-->
<input>
+
+ <group name="TokenNameListGroup">
+ <scalar name="Delimeter">_</scalar>
+ <vector name="TokenMetKeys">
+
+ <!--
+ [sourcename]_[tablename]_[productiondatetime].[ext]
+
+ ex file name: datasource_tablename_20101129122700.txt
+ -->
+ <element>SourceName</element>
+ <element>TableName</element>
+ <element>ProductionDateTime</element>
+ </vector>
+
+ </group>
+
+
<group name="SubstringOffsetGroup">
<!-- the indices for the substring met key selection -->
<!-- the first element is the starting index in the string -->
@@ -47,8 +65,17 @@ the License.
</vector>
</group>
+
+ <group name="ProductionDateTimeGroup">
+ <scalar name="DateTimeFormat">yyyyMMddHHmmss</scalar>
+ </group>
<group name="CommonMetadata">
+ <!-- Environment variable replacement can now be used in any scalar or
+ vector value; just use the CAS bracket style: [ENV VAR NAME].
+
+ A vector can also be used to give multiple values for a particular
+ met field.
+ -->
<scalar name="DataVersion">1.0</scalar>
<scalar name="CollectionName">Products extracted by the OODT Filename
Met Extractor</scalar>
<scalar name="DataProvider">OODT</scalar>