Author: gates Date: Thu Oct 9 10:31:58 2008 New Revision: 703209 URL: http://svn.apache.org/viewvc?rev=703209&view=rev Log: Pig-472 Added RegExLoader to piggybank, an abstract loader class to parse text files via regular expressions
Added: incubator/pig/trunk/contrib/piggybank/java/src/main/java/org/apache/pig/piggybank/storage/RegExLoader.java incubator/pig/trunk/contrib/piggybank/java/src/test/java/org/apache/pig/piggybank/test/storage/TestHelper.java incubator/pig/trunk/contrib/piggybank/java/src/test/java/org/apache/pig/piggybank/test/storage/TestRegExLoader.java Modified: incubator/pig/trunk/CHANGES.txt Modified: incubator/pig/trunk/CHANGES.txt URL: http://svn.apache.org/viewvc/incubator/pig/trunk/CHANGES.txt?rev=703209&r1=703208&r2=703209&view=diff ============================================================================== --- incubator/pig/trunk/CHANGES.txt (original) +++ incubator/pig/trunk/CHANGES.txt Thu Oct 9 10:31:58 2008 @@ -354,3 +354,6 @@ PIG-342: Fix DistinctDataBag to recalculate size after it has spilled. (bdimcheff via gates) + PIG-472: Added RegExLoader to piggybank, an abstract loader class to parse + text files via regular espressions (spackest via gates) + Added: incubator/pig/trunk/contrib/piggybank/java/src/main/java/org/apache/pig/piggybank/storage/RegExLoader.java URL: http://svn.apache.org/viewvc/incubator/pig/trunk/contrib/piggybank/java/src/main/java/org/apache/pig/piggybank/storage/RegExLoader.java?rev=703209&view=auto ============================================================================== --- incubator/pig/trunk/contrib/piggybank/java/src/main/java/org/apache/pig/piggybank/storage/RegExLoader.java (added) +++ incubator/pig/trunk/contrib/piggybank/java/src/main/java/org/apache/pig/piggybank/storage/RegExLoader.java Thu Oct 9 10:31:58 2008 @@ -0,0 +1,98 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license agreements. See the + * NOTICE file distributed with this work for additional information regarding copyright ownership. The ASF + * licenses this file to you under the Apache License, Version 2.0 (the "License"); you may not use this file + * except in compliance with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software distributed under the License is + * distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and limitations under the License. + */ + +package org.apache.pig.piggybank.storage; + +import java.io.IOException; +import java.io.OutputStream; +import java.nio.charset.Charset; +import java.util.ArrayList; +import java.util.regex.Matcher; +import java.util.regex.Pattern; + +import org.apache.pig.ReversibleLoadStoreFunc; +import org.apache.pig.data.DataAtom; +import org.apache.pig.data.Datum; +import org.apache.pig.data.Tuple; +import org.apache.pig.impl.io.BufferedPositionedInputStream; + +/** + * RegExLoader is an abstract class used to parse logs based on a regular expression. + * + * There is a single abstract method, getPattern which needs to return a Pattern. Each group will be returned + * as a different DataAtom. + * + * Look to org.apache.pig.piggybank.storage.apachelog.CommonLogLoader for example usage. 
+ */ + +public abstract class RegExLoader implements ReversibleLoadStoreFunc { + protected BufferedPositionedInputStream in = null; + long end = Long.MAX_VALUE; + private byte recordDel = (byte) '\n'; + private String fieldDel = "\t"; + final private static Charset utf8 = Charset.forName("UTF8"); + OutputStream os; + + abstract public Pattern getPattern(); + + public RegExLoader() { + } + + public Tuple getNext() throws IOException { + if (in == null || in.getPosition() > end) { + return null; + } + + Pattern pattern = getPattern(); + Matcher matcher = pattern.matcher(""); + + String line; + if ((line = in.readLine(utf8, recordDel)) != null) { + if (line.length() > 0 && line.charAt(line.length() - 1) == '\r') + line = line.substring(0, line.length() - 1); + + matcher.reset(line); + if (matcher.find()) { + ArrayList<Datum> list = new ArrayList<Datum>(); + + for (int i = 1; i <= matcher.groupCount(); i++) { + list.add(new DataAtom(matcher.group(i))); + } + return new Tuple(list); + } + } + return null; + } + + public void bindTo(String fileName, BufferedPositionedInputStream in, long offset, long end) throws IOException { + this.in = in; + this.end = end; + + // Since we are not block aligned we throw away the first + // record and could on a different instance to read it + if (offset != 0) { + getNext(); + } + } + + public void bindTo(OutputStream os) throws IOException { + this.os = os; + } + + public void putNext(Tuple f) throws IOException { + os.write((f.toDelimitedString(this.fieldDel) + (char) this.recordDel).getBytes("utf8")); + } + + public void finish() throws IOException { + } +} Added: incubator/pig/trunk/contrib/piggybank/java/src/test/java/org/apache/pig/piggybank/test/storage/TestHelper.java URL: http://svn.apache.org/viewvc/incubator/pig/trunk/contrib/piggybank/java/src/test/java/org/apache/pig/piggybank/test/storage/TestHelper.java?rev=703209&view=auto ============================================================================== --- 
incubator/pig/trunk/contrib/piggybank/java/src/test/java/org/apache/pig/piggybank/test/storage/TestHelper.java (added) +++ incubator/pig/trunk/contrib/piggybank/java/src/test/java/org/apache/pig/piggybank/test/storage/TestHelper.java Thu Oct 9 10:31:58 2008 @@ -0,0 +1,79 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license agreements. See the + * NOTICE file distributed with this work for additional information regarding copyright ownership. The ASF + * licenses this file to you under the Apache License, Version 2.0 (the "License"); you may not use this file + * except in compliance with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software distributed under the License is + * distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and limitations under the License. 
+ */ + +package org.apache.pig.piggybank.test.storage; + +import java.io.File; +import java.io.PrintWriter; +import java.util.ArrayList; +import java.util.regex.Matcher; +import java.util.regex.Pattern; + +import junit.framework.TestCase; + +import org.apache.pig.data.DataAtom; +import org.apache.pig.data.Tuple; +import org.junit.Test; + +public class TestHelper extends TestCase { + @Test + public void testTest() { + assertTrue(true); + } + + + public static ArrayList<String[]> getExpected(ArrayList<String[]> data, Pattern pattern) { + ArrayList<String[]> expected = new ArrayList<String[]>(); + for (int i = 0; i < data.size(); i++) { + String string = data.get(i)[0]; + Matcher matcher = pattern.matcher(string); + matcher.groupCount(); + matcher.find(); + String[] toAdd = new String[] { matcher.group(1), matcher.group(2), matcher.group(3) }; + expected.add(toAdd); + } + + return expected; + } + + private static String join(String delimiter, String[] strings) { + String string = strings[0]; + for (int i = 1; i < strings.length; i++) { + string += delimiter + strings[i]; + } + return string; + } + + public static void examineTuple(ArrayList<String[]> expectedData, Tuple tuple, int tupleCount) { + for (int i = 0; i < tuple.arity(); i++) { + DataAtom dataAtom = tuple.getAtomField(i); + String expected = expectedData.get(tupleCount)[i]; + String actual = dataAtom.toString(); + assertEquals(expected, actual); + } + } + + public static String createTempFile(ArrayList<String[]> myData, String delimiter) throws Exception { + File tmpFile = File.createTempFile("test", ".txt"); + if (tmpFile.exists()) { + tmpFile.delete(); + } + PrintWriter pw = new PrintWriter(tmpFile); + for (int i = 0; i < myData.size(); i++) { + pw.println(join(delimiter, myData.get(i))); + } + pw.close(); + tmpFile.deleteOnExit(); + return tmpFile.getAbsolutePath(); + } +} Added: incubator/pig/trunk/contrib/piggybank/java/src/test/java/org/apache/pig/piggybank/test/storage/TestRegExLoader.java URL: 
http://svn.apache.org/viewvc/incubator/pig/trunk/contrib/piggybank/java/src/test/java/org/apache/pig/piggybank/test/storage/TestRegExLoader.java?rev=703209&view=auto ============================================================================== --- incubator/pig/trunk/contrib/piggybank/java/src/test/java/org/apache/pig/piggybank/test/storage/TestRegExLoader.java (added) +++ incubator/pig/trunk/contrib/piggybank/java/src/test/java/org/apache/pig/piggybank/test/storage/TestRegExLoader.java Thu Oct 9 10:31:58 2008 @@ -0,0 +1,71 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license agreements. See the + * NOTICE file distributed with this work for additional information regarding copyright ownership. The ASF + * licenses this file to you under the Apache License, Version 2.0 (the "License"); you may not use this file + * except in compliance with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software distributed under the License is + * distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and limitations under the License. 
+ */ + +package org.apache.pig.piggybank.test.storage; + +import java.io.InputStream; +import java.util.ArrayList; +import java.util.Properties; +import java.util.regex.Pattern; + +import junit.framework.TestCase; + +import org.apache.pig.PigServer.ExecType; +import org.apache.pig.data.Tuple; +import org.apache.pig.impl.PigContext; +import org.apache.pig.impl.io.BufferedPositionedInputStream; +import org.apache.pig.impl.io.FileLocalizer; +import org.apache.pig.piggybank.storage.RegExLoader; +import org.junit.Test; + +public class TestRegExLoader extends TestCase { + private static String patternString = "(\\w+),(\\w+);(\\w+)"; + private final static Pattern pattern = Pattern.compile(patternString); + + class DummyRegExLoader extends RegExLoader { + @Override + public Pattern getPattern() { + return Pattern.compile(patternString); + } + } + + public static ArrayList<String[]> data = new ArrayList<String[]>(); + static { + data.add(new String[] { "1,one;i" }); + data.add(new String[] { "2,two;ii" }); + data.add(new String[] { "3,three;iii" }); + } + + @Test + public void testLoadFromBindTo() throws Exception { + String filename = TestHelper.createTempFile(data, " "); + DummyRegExLoader dummyRegExLoader = new DummyRegExLoader(); + PigContext pigContext = new PigContext(ExecType.LOCAL, new Properties()); + InputStream inputStream = FileLocalizer.open(filename, pigContext); + dummyRegExLoader.bindTo(filename, new BufferedPositionedInputStream(inputStream), 0, Long.MAX_VALUE); + + ArrayList<String[]> expected = TestHelper.getExpected(data, pattern); + int tupleCount = 0; + + while (true) { + Tuple tuple = dummyRegExLoader.getNext(); + if (tuple == null) + break; + else { + TestHelper.examineTuple(expected, tuple, tupleCount); + tupleCount++; + } + } + assertEquals(data.size(), tupleCount); + } +}