Repository: incubator-samoa
Updated Branches:
  refs/heads/master fe61d1e74 -> 1b3529983
SAMOA-26: Fix the ArffLoader bug (asp188)

Fix #24

Project: http://git-wip-us.apache.org/repos/asf/incubator-samoa/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-samoa/commit/1b352998
Tree: http://git-wip-us.apache.org/repos/asf/incubator-samoa/tree/1b352998
Diff: http://git-wip-us.apache.org/repos/asf/incubator-samoa/diff/1b352998

Branch: refs/heads/master
Commit: 1b35299837b1ba2376d99e004e777c65d1cff6f5
Parents: fe61d1e
Author: Gianmarco De Francisci Morales <[email protected]>
Authored: Fri May 15 10:45:38 2015 +0300
Committer: Gianmarco De Francisci Morales <[email protected]>
Committed: Fri May 15 10:45:38 2015 +0300

----------------------------------------------------------------------
 .../yahoo/labs/samoa/instances/ArffLoader.java | 105 +++++++++++------
 .../yahoo/labs/samoa/instances/Attribute.java | 62 ++++++-----
 .../labs/samoa/instances/DenseInstance.java | 6 +-
 .../labs/samoa/instances/ArffLoaderTest.java | 108 +++++++++++++++++++
 4 files changed, 211 insertions(+), 70 deletions(-)
----------------------------------------------------------------------

http://git-wip-us.apache.org/repos/asf/incubator-samoa/blob/1b352998/samoa-instances/src/main/java/com/yahoo/labs/samoa/instances/ArffLoader.java
----------------------------------------------------------------------
diff --git a/samoa-instances/src/main/java/com/yahoo/labs/samoa/instances/ArffLoader.java b/samoa-instances/src/main/java/com/yahoo/labs/samoa/instances/ArffLoader.java
index feb5702..dc22bb8 100644
--- a/samoa-instances/src/main/java/com/yahoo/labs/samoa/instances/ArffLoader.java
+++ b/samoa-instances/src/main/java/com/yahoo/labs/samoa/instances/ArffLoader.java
@@ -19,6 +19,7 @@ package com.yahoo.labs.samoa.instances;
  * limitations under the License.
* #L% */ + import java.io.BufferedReader; import java.io.IOException; import java.io.Reader; @@ -30,7 +31,6 @@ import java.util.logging.Level; import java.util.logging.Logger; /** - * * @author abifet */ public class ArffLoader implements Serializable { @@ -87,15 +87,16 @@ public class ArffLoader implements Serializable { while (numAttribute == 0 && streamTokenizer.ttype != StreamTokenizer.TT_EOF) { // For each line while (streamTokenizer.ttype != StreamTokenizer.TT_EOL - && streamTokenizer.ttype != StreamTokenizer.TT_EOF) { + && streamTokenizer.ttype != StreamTokenizer.TT_EOF) { // For each item if (streamTokenizer.ttype == StreamTokenizer.TT_NUMBER) { // System.out.println(streamTokenizer.nval + "Num "); this.setValue(instance, numAttribute, streamTokenizer.nval, true); - numAttribute++; + //numAttribute++; - } else if (streamTokenizer.sval != null && (streamTokenizer.ttype == StreamTokenizer.TT_WORD - || streamTokenizer.ttype == 34)) { + } else if (streamTokenizer.sval != null && ( + streamTokenizer.ttype == StreamTokenizer.TT_WORD + || streamTokenizer.ttype == 34 || streamTokenizer.ttype == 39)) { // System.out.println(streamTokenizer.sval + "Str"); boolean isNumeric = attributes.get(numAttribute).isNumeric(); double value; @@ -104,12 +105,14 @@ public class ArffLoader implements Serializable { } else if (isNumeric == true) { value = Double.valueOf(streamTokenizer.sval).doubleValue(); } else { - value = this.instanceInformation.attribute(numAttribute).indexOfValue(streamTokenizer.sval); + value = this.instanceInformation.attribute(numAttribute).indexOfValue( + streamTokenizer.sval); } this.setValue(instance, numAttribute, value, isNumeric); - numAttribute++; + //numAttribute++; } + numAttribute++; streamTokenizer.nextToken(); } streamTokenizer.nextToken(); @@ -119,13 +122,15 @@ public class ArffLoader implements Serializable { } catch (IOException ex) { Logger.getLogger(ArffLoader.class.getName()).log(Level.SEVERE, null, ex); } + //System.out.println(instance); 
return (numAttribute > 0) ? instance : null; } private void setValue(Instance instance, int numAttribute, double value, boolean isNumber) { double valueAttribute; - if (isNumber && this.instanceInformation.attribute(numAttribute).isNominal) { - valueAttribute = this.instanceInformation.attribute(numAttribute).indexOfValue(Double.toString(value)); + if (this.instanceInformation.attribute(numAttribute).isNominal) { + valueAttribute = value; + //this.instanceInformation.attribute(numAttribute).indexOfValue(Double.toString(value)); // System.out.println(value +"/"+valueAttribute+" "); } else { @@ -144,7 +149,7 @@ public class ArffLoader implements Serializable { private Instance readInstanceSparse() { // Return a Sparse Instance Instance instance = new SparseInstance(1.0, null); // (this.instanceInformation.numAttributes() - // + 1); + // + 1); // System.out.println(this.instanceInformation.numAttributes()); int numAttribute; ArrayList<Double> attributeValues = new ArrayList<Double>(); @@ -154,7 +159,7 @@ public class ArffLoader implements Serializable { streamTokenizer.nextToken(); // Remove the '{' char // For each line while (streamTokenizer.ttype != StreamTokenizer.TT_EOL - && streamTokenizer.ttype != StreamTokenizer.TT_EOF) { + && streamTokenizer.ttype != StreamTokenizer.TT_EOF) { while (streamTokenizer.ttype != '}') { // For each item // streamTokenizer.nextToken(); @@ -171,18 +176,22 @@ public class ArffLoader implements Serializable { if (streamTokenizer.ttype == StreamTokenizer.TT_NUMBER) { // System.out.print(streamTokenizer.nval + " "); - this.setSparseValue(instance, indexValues, attributeValues, numAttribute, streamTokenizer.nval, true); + this.setSparseValue(instance, indexValues, attributeValues, numAttribute, + streamTokenizer.nval, true); // numAttribute++; - } else if (streamTokenizer.sval != null && (streamTokenizer.ttype == StreamTokenizer.TT_WORD + } else if (streamTokenizer.sval != null && ( + streamTokenizer.ttype == StreamTokenizer.TT_WORD || 
streamTokenizer.ttype == 34)) { // System.out.print(streamTokenizer.sval + "-"); if (attributes.get(numAttribute).isNumeric()) { this.setSparseValue(instance, indexValues, attributeValues, numAttribute, - Double.valueOf(streamTokenizer.sval).doubleValue(), true); + Double.valueOf(streamTokenizer.sval).doubleValue(), true); } else { - this.setSparseValue(instance, indexValues, attributeValues, numAttribute, this.instanceInformation - .attribute(numAttribute).indexOfValue(streamTokenizer.sval), false); + this.setSparseValue(instance, indexValues, attributeValues, numAttribute, + this.instanceInformation + .attribute(numAttribute).indexOfValue(streamTokenizer.sval), + false); } } streamTokenizer.nextToken(); @@ -202,16 +211,19 @@ public class ArffLoader implements Serializable { arrayIndexValues[i] = indexValues.get(i).intValue(); arrayAttributeValues[i] = attributeValues.get(i).doubleValue(); } - instance.addSparseValues(arrayIndexValues, arrayAttributeValues, this.instanceInformation.numAttributes()); + instance.addSparseValues(arrayIndexValues, arrayAttributeValues, + this.instanceInformation.numAttributes()); return instance; } - private void setSparseValue(Instance instance, List<Integer> indexValues, List<Double> attributeValues, - int numAttribute, double value, boolean isNumber) { + private void setSparseValue(Instance instance, List<Integer> indexValues, + List<Double> attributeValues, + int numAttribute, double value, boolean isNumber) { double valueAttribute; if (isNumber && this.instanceInformation.attribute(numAttribute).isNominal) { - valueAttribute = this.instanceInformation.attribute(numAttribute).indexOfValue(Double.toString(value)); + valueAttribute = + this.instanceInformation.attribute(numAttribute).indexOfValue(Double.toString(value)); } else { valueAttribute = value; } @@ -235,7 +247,7 @@ public class ArffLoader implements Serializable { streamTokenizer.nextToken(); // Remove the '{' char // For each line while (streamTokenizer.ttype != 
StreamTokenizer.TT_EOL - && streamTokenizer.ttype != StreamTokenizer.TT_EOF) { + && streamTokenizer.ttype != StreamTokenizer.TT_EOF) { while (streamTokenizer.ttype != '}') { // For each item // streamTokenizer.nextToken(); @@ -249,15 +261,18 @@ public class ArffLoader implements Serializable { this.setValue(instance, numAttribute, streamTokenizer.nval, true); // numAttribute++; - } else if (streamTokenizer.sval != null && (streamTokenizer.ttype == StreamTokenizer.TT_WORD + } else if (streamTokenizer.sval != null && ( + streamTokenizer.ttype == StreamTokenizer.TT_WORD || streamTokenizer.ttype == 34)) { // System.out.print(streamTokenizer.sval + // "/"+this.instanceInformation.attribute(numAttribute).indexOfValue(streamTokenizer.sval)+" "); if (attributes.get(numAttribute).isNumeric()) { - this.setValue(instance, numAttribute, Double.valueOf(streamTokenizer.sval).doubleValue(), true); + this.setValue(instance, numAttribute, + Double.valueOf(streamTokenizer.sval).doubleValue(), true); } else { this.setValue(instance, numAttribute, - this.instanceInformation.attribute(numAttribute).indexOfValue(streamTokenizer.sval), false); + this.instanceInformation.attribute(numAttribute) + .indexOfValue(streamTokenizer.sval), false); // numAttribute++; } } @@ -287,7 +302,8 @@ public class ArffLoader implements Serializable { while (streamTokenizer.ttype != StreamTokenizer.TT_EOF) { // For each line // if (streamTokenizer.ttype == '@') { - if (streamTokenizer.ttype == StreamTokenizer.TT_WORD && streamTokenizer.sval.startsWith("@") == true) { + if (streamTokenizer.ttype == StreamTokenizer.TT_WORD + && streamTokenizer.sval.startsWith("@") == true) { // streamTokenizer.nextToken(); String token = streamTokenizer.sval.toUpperCase(); if (token.startsWith("@RELATION")) { @@ -305,22 +321,12 @@ public class ArffLoader implements Serializable { String type = streamTokenizer.sval; // System.out.println("* " + name + ":" + type + " "); if (streamTokenizer.ttype == '{') { + 
parseDoubleBrackests(name); + } else if (streamTokenizer.ttype == 10) {//for the buggy non-formal input arff file streamTokenizer.nextToken(); - List<String> attributeLabels = new ArrayList<String>(); - while (streamTokenizer.ttype != '}') { - - if (streamTokenizer.sval != null) { - attributeLabels.add(streamTokenizer.sval); - // System.out.print(streamTokenizer.sval + ","); - } else { - attributeLabels.add(Double.toString(streamTokenizer.nval)); - // System.out.print(streamTokenizer.nval + ","); - } - - streamTokenizer.nextToken(); + if (streamTokenizer.ttype == '{') { + parseDoubleBrackests(name); } - // System.out.println(); - attributes.add(new Attribute(name, attributeLabels)); } else { // Add attribute attributes.add(new Attribute(name)); @@ -341,6 +347,27 @@ public class ArffLoader implements Serializable { return new InstanceInformation(relation, attributes); } + private void parseDoubleBrackests(String name) throws IOException { + + streamTokenizer.nextToken(); + List<String> attributeLabels = new ArrayList<String>(); + while (streamTokenizer.ttype != '}') { + + if (streamTokenizer.sval != null) { + attributeLabels.add(streamTokenizer.sval); + // System.out.print(streamTokenizer.sval + ","); + } else { + attributeLabels.add(Double.toString(streamTokenizer.nval)); + // System.out.print(streamTokenizer.nval + ","); + } + + streamTokenizer.nextToken(); + } + // System.out.println(); + attributes.add(new Attribute(name, attributeLabels)); + + } + private void initStreamTokenizer(Reader reader) { BufferedReader br = new BufferedReader(reader); http://git-wip-us.apache.org/repos/asf/incubator-samoa/blob/1b352998/samoa-instances/src/main/java/com/yahoo/labs/samoa/instances/Attribute.java ---------------------------------------------------------------------- diff --git a/samoa-instances/src/main/java/com/yahoo/labs/samoa/instances/Attribute.java b/samoa-instances/src/main/java/com/yahoo/labs/samoa/instances/Attribute.java index 8609d6e..6ebd678 100644 --- 
a/samoa-instances/src/main/java/com/yahoo/labs/samoa/instances/Attribute.java +++ b/samoa-instances/src/main/java/com/yahoo/labs/samoa/instances/Attribute.java @@ -32,37 +32,38 @@ import java.util.List; import java.util.Map; /** - * * @author abifet */ public class Attribute implements Serializable { public static final String ARFF_ATTRIBUTE = "@attribute"; public static final String ARFF_ATTRIBUTE_NUMERIC = "NUMERIC"; + public static final String ARFF_ATTRIBUTE_NOMINAL = "NOMINAL"; + public static final String ARFF_ATTRIBUTE_DATE = "DATE"; /** - * - */ + * + */ protected boolean isNominal; /** - * - */ + * + */ protected boolean isNumeric; /** - * - */ + * + */ protected boolean isDate; /** - * - */ + * + */ protected String name; /** - * - */ + * + */ protected List<String> attributeValues; /** - * + * * @return */ public List<String> getAttributeValues() { @@ -70,12 +71,12 @@ public class Attribute implements Serializable { } /** - * - */ + * + */ protected int index; /** - * + * * @param string */ public Attribute(String string) { @@ -84,7 +85,7 @@ public class Attribute implements Serializable { } /** - * + * * @param attributeName * @param attributeValues */ @@ -95,14 +96,14 @@ public class Attribute implements Serializable { } /** - * - */ + * + */ public Attribute() { this(""); } /** - * + * * @return */ public boolean isNominal() { @@ -110,7 +111,7 @@ public class Attribute implements Serializable { } /** - * + * * @return */ public String name() { @@ -118,7 +119,7 @@ public class Attribute implements Serializable { } /** - * + * * @param value * @return */ @@ -127,7 +128,7 @@ public class Attribute implements Serializable { } /** - * + * * @return */ public boolean isNumeric() { @@ -135,20 +136,19 @@ public class Attribute implements Serializable { } /** - * + * * @return */ public int numValues() { if (isNumeric()) { return 0; - } - else { + } else { return attributeValues.size(); } } /** - * + * * @return */ public int index() { // RuleClassifier @@ 
-167,7 +167,7 @@ public class Attribute implements Serializable { private Map<String, Integer> valuesStringAttribute; /** - * + * * @param value * @return */ @@ -198,7 +198,13 @@ public class Attribute implements Serializable { text.append(ARFF_ATTRIBUTE).append(" ").append(Utils.quote(this.name)).append(" "); - text.append(ARFF_ATTRIBUTE_NUMERIC); + if (isNominal) { + text.append(ARFF_ATTRIBUTE_NOMINAL); + } else if (isNumeric) { + text.append(ARFF_ATTRIBUTE_NUMERIC); + } else if (isDate) { + text.append(ARFF_ATTRIBUTE_DATE); + } return text.toString(); } http://git-wip-us.apache.org/repos/asf/incubator-samoa/blob/1b352998/samoa-instances/src/main/java/com/yahoo/labs/samoa/instances/DenseInstance.java ---------------------------------------------------------------------- diff --git a/samoa-instances/src/main/java/com/yahoo/labs/samoa/instances/DenseInstance.java b/samoa-instances/src/main/java/com/yahoo/labs/samoa/instances/DenseInstance.java index 984675e..57d1bfd 100644 --- a/samoa-instances/src/main/java/com/yahoo/labs/samoa/instances/DenseInstance.java +++ b/samoa-instances/src/main/java/com/yahoo/labs/samoa/instances/DenseInstance.java @@ -25,7 +25,6 @@ package com.yahoo.labs.samoa.instances; */ /** - * * @author abifet */ public class DenseInstance extends SingleLabelInstance { @@ -62,9 +61,10 @@ public class DenseInstance extends SingleLabelInstance { public String toString() { StringBuffer text = new StringBuffer(); - for (int i = 0; i < this.instanceInformation.numAttributes(); i++) { - if (i > 0) + for (int i = 0; i < this.instanceData.numAttributes(); i++) { + if (i > 0) { text.append(","); + } text.append(this.value(i)); } text.append(",").append(this.weight()); http://git-wip-us.apache.org/repos/asf/incubator-samoa/blob/1b352998/samoa-instances/src/test/java/com/yahoo/labs/samoa/instances/ArffLoaderTest.java ---------------------------------------------------------------------- diff --git 
a/samoa-instances/src/test/java/com/yahoo/labs/samoa/instances/ArffLoaderTest.java b/samoa-instances/src/test/java/com/yahoo/labs/samoa/instances/ArffLoaderTest.java new file mode 100644 index 0000000..62fd7b7 --- /dev/null +++ b/samoa-instances/src/test/java/com/yahoo/labs/samoa/instances/ArffLoaderTest.java @@ -0,0 +1,108 @@ +package com.yahoo.labs.samoa.instances; + +/* + * #%L + * SAMOA + * %% + * Copyright (C) 2014 - 2015 Apache Software Foundation + * %% + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ * #L% + */ + +import org.junit.Before; +import org.junit.Test; + +import java.io.StringReader; + +import static org.junit.Assert.assertEquals; + +public class ArffLoaderTest { + + private ArffLoader loader; + + private StringReader reader; + + @Before + public void setUp() { + String inputString = "@relation test.txt\n" + + "\n" + + "@attribute Dur numeric\n" + + "@attribute Proto {udp,tcp,icmp,arp,ipx/spx,ipv6-icmp,pim,esp,igmp,rtcp,rtp,ipv6,udt}\n" + + "@attribute Dir {' <->',' <?>',' ->',' ?>',' who',' <-',' <?'}\n" + + "@attribute State {CON,PA_PA,PA_FRA, ...}\n" + + "@attribute sTos numeric\n" + + "@attribute dTos numeric\n" + + "@attribute TotPkts numeric\n" + + "@attribute TotBytes numeric\n" + + "@attribute SrcBytes numeric\n" + + "@attribute class {Background,Normal,Botnet}\n" + + "\n" + + "@data\n" + + "\n" + + "1065.731934,udp,' <->',...,0,0,2,252,145,Background\n" + + "1471.787109,udp,' <->',CON,0,0,2,252,145,Background"; + reader = new StringReader(inputString); + int size = 0; + int classAttribute = 10; + loader = new ArffLoader(reader, size, classAttribute); + + } + + @Test + public void testGetHeader() { + InstanceInformation header = loader.getStructure(); + assertEquals(10, header.numAttributes()); + assertEquals(9, header.classIndex()); + assertEquals(true, header.attribute(0).isNumeric()); + assertEquals(false, header.attribute(1).isNumeric()); + assertEquals(false, header.attribute(2).isNumeric()); + assertEquals(false, header.attribute(3).isNumeric()); + assertEquals(true, header.attribute(4).isNumeric()); + assertEquals(true, header.attribute(5).isNumeric()); + assertEquals(true, header.attribute(6).isNumeric()); + assertEquals(true, header.attribute(7).isNumeric()); + assertEquals(true, header.attribute(8).isNumeric()); + assertEquals(false, header.attribute(9).isNumeric()); + + assertEquals(7, header.attribute(2).numValues()); + assertEquals(" <->", header.attribute(2).value(0)); + assertEquals(" <?>", header.attribute(2).value(1)); + 
assertEquals(" ->", header.attribute(2).value(2)); + assertEquals(" ?>", header.attribute(2).value(3)); + assertEquals(" who", header.attribute(2).value(4)); + assertEquals(" <-", header.attribute(2).value(5)); + assertEquals(" <?", header.attribute(2).value(6)); + + assertEquals(3, header.attribute(9).numValues()); + assertEquals("Background", header.attribute(9).value(0)); + assertEquals("Normal", header.attribute(9).value(1)); + assertEquals("Botnet", header.attribute(9).value(2)); + + } + + @Test + public void testReadInstance() { + Instance instance = loader.readInstance(reader); + assertEquals(1065.731934, instance.value(0), 0); + assertEquals(0, instance.value(1), 0); + assertEquals(0, instance.value(2), 0); + assertEquals(3, instance.value(3), 0); + assertEquals(0, instance.value(4), 0); + assertEquals(0, instance.value(5), 0); + assertEquals(2, instance.value(6), 0); + assertEquals(252, instance.value(7), 0); + assertEquals(145, instance.value(8), 0); + assertEquals(0, instance.value(9), 0); + } +} \ No newline at end of file
