SAMOA-43: Add TextReader
Project: http://git-wip-us.apache.org/repos/asf/incubator-samoa/repo Commit: http://git-wip-us.apache.org/repos/asf/incubator-samoa/commit/473a4193 Tree: http://git-wip-us.apache.org/repos/asf/incubator-samoa/tree/473a4193 Diff: http://git-wip-us.apache.org/repos/asf/incubator-samoa/diff/473a4193 Branch: refs/heads/master Commit: 473a4193179c8e97364fe071baf199eb4b38d371 Parents: 82b3ef3 Author: abifet <[email protected]> Authored: Fri Aug 21 12:09:37 2015 +0800 Committer: Albert Bifet <[email protected]> Committed: Wed Mar 16 15:33:36 2016 +0100 ---------------------------------------------------------------------- .../org/apache/samoa/streams/TextGenerator.java | 205 +++++++++++++++++++ 1 file changed, 205 insertions(+) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/incubator-samoa/blob/473a4193/samoa-api/src/main/java/org/apache/samoa/streams/TextGenerator.java ---------------------------------------------------------------------- diff --git a/samoa-api/src/main/java/org/apache/samoa/streams/TextGenerator.java b/samoa-api/src/main/java/org/apache/samoa/streams/TextGenerator.java new file mode 100644 index 0000000..c165f33 --- /dev/null +++ b/samoa-api/src/main/java/org/apache/samoa/streams/TextGenerator.java @@ -0,0 +1,205 @@ +package org.apache.samoa.streams; + +/* + * #%L + * SAMOA + * %% + * Copyright (C) 2014 - 2015 Apache Software Foundation + * %% + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * #L% + */ + +import com.github.javacliparser.IntOption; +import org.apache.samoa.instances.*; +import org.apache.samoa.moa.core.InstanceExample; +import org.apache.samoa.moa.core.ObjectRepository; +import org.apache.samoa.moa.options.AbstractOptionHandler; +import org.apache.samoa.moa.streams.InstanceStream; +import org.apache.samoa.moa.tasks.TaskMonitor; + +import java.util.ArrayList; +import java.util.Random; + +/** + * Text generator that simulates sentiment analysis on tweets. + */ +public class TextGenerator extends AbstractOptionHandler implements InstanceStream { + + private static final long serialVersionUID = 3028905554604259131L; + + public IntOption numAttsOption = new IntOption("numAtts", 'a', + "The number of attributes to generate.", 1000, 0, Integer.MAX_VALUE); + + public IntOption instanceRandomSeedOption = new IntOption( + "instanceRandomSeed", 'i', + "Seed for random generation of instances.", 1); + + protected InstancesHeader streamHeader; + + protected Random instanceRandom; + + protected int[] wordTwitterGenerator; + protected double[] freqTwitterGenerator; + protected double[] sumFreqTwitterGenerator; + protected int[] classTwitterGenerator; + + protected int sizeTable; + protected double probPositive = 0.1; + protected double probNegative = 0.1; + protected double zipfExponent = 1.5; + protected double lengthTweet = 15; + + protected int countTweets = 0; + + @Override + public InstancesHeader getHeader() { + return this.streamHeader; + } + + @Override + public long estimatedRemainingInstances() { + return -1; + } + + @Override + public boolean hasMoreInstances() { + return true; + } + + @Override + public InstanceExample nextInstance() { + int[] votes; + double[] attVals; + attVals = new double[this.numAttsOption.getValue() + 1]; + + do { + int length = (int) (lengthTweet * (1.0 + this.instanceRandom.nextGaussian())); + if (length < 1) length = 1; + votes = new int[3]; + for (int j = 0; j < length; j++) { + double rand = this.instanceRandom.nextDouble(); + //binary search + int i = 0; + int min = 0; + int max = sizeTable - 1; + int mid; + do { + mid = (min + max) / 2; + if (rand > this.sumFreqTwitterGenerator[mid]) { + min = mid + 1; + } else { + max = mid - 1; + } + } while ((this.sumFreqTwitterGenerator[mid] != rand) && (min <= max)); + + attVals[this.wordTwitterGenerator[mid]] = 1; + votes[this.classTwitterGenerator[mid]]++; + + } + } while (votes[1] == votes[2]); + + Instance inst = new DenseInstance(1.0, attVals); + inst.setDataset(getHeader()); + inst.setClassValue((votes[1] > votes[2]) ? 0 : 1); + this.countTweets++; + return new InstanceExample(inst); + } + + @Override + public boolean isRestartable() { + return true; + } + + @Override + public void restart() { + + this.sizeTable = this.numAttsOption.getValue(); + + //Prepare table of words to generate tweets + this.wordTwitterGenerator = new int[sizeTable]; + this.freqTwitterGenerator = new double[sizeTable]; + this.sumFreqTwitterGenerator = new double[sizeTable]; + this.classTwitterGenerator = new int[sizeTable]; + + this.countTweets = 0; + + double sum = 0; + this.instanceRandom = new Random(this.instanceRandomSeedOption.getValue()); + for (int i = 0; i < this.sizeTable; i++) { + this.wordTwitterGenerator[i] = i + 1; + this.freqTwitterGenerator[i] = 1.0 / Math.pow(i + 1, zipfExponent); + sum += this.freqTwitterGenerator[i]; + this.sumFreqTwitterGenerator[i] = sum; + double rand = this.instanceRandom.nextDouble(); + this.classTwitterGenerator[i] = (rand < probPositive ? 1 : (rand < probNegative + probPositive ? 2 : 0)); + } + for (int i = 0; i < this.sizeTable; i++) { + this.freqTwitterGenerator[i] /= sum; + this.sumFreqTwitterGenerator[i] /= sum; + } + + } + + @Override + protected void prepareForUseImpl(TaskMonitor monitor, ObjectRepository repository) { + generateHeader(); + restart(); + } + + @Override + public void getDescription(StringBuilder sb, int indent) { + + } + private void generateHeader() { + ArrayList<Attribute> attributes = new ArrayList(); + for (int i = 0; i < this.numAttsOption.getValue(); i++) { + attributes.add(new Attribute("att" + (i + 1))); + } + ArrayList<String> classLabels = new ArrayList(); + for (int i = 0; i < 2; i++) { + classLabels.add("class" + (i + 1)); + } + attributes.add(new Attribute("class", classLabels)); + this.streamHeader = new InstancesHeader(new Instances( + getCLICreationString(InstanceStream.class), attributes, 0)); + this.streamHeader.setClassIndex(this.streamHeader.numAttributes() - 1); + } + + + public void changePolarity(int numberWords) { + for (int i = 0; i < numberWords; ) { + int randWord = this.instanceRandom.nextInt(this.sizeTable); + int polarity = this.classTwitterGenerator[randWord]; + if (polarity == 1) { + this.classTwitterGenerator[i] = 2; + i++; + } + if (polarity == 2) { + this.classTwitterGenerator[i] = 1; + i++; + } + } + } + + public void changeFreqWords(int numberWords) { + for (int i = 0; i < numberWords; i++) { + int randWordTo = this.instanceRandom.nextInt(this.sizeTable); + int randWordFrom = this.instanceRandom.nextInt(this.sizeTable); + this.wordTwitterGenerator[randWordTo] = randWordFrom; + this.wordTwitterGenerator[randWordFrom] = randWordTo; + } + } + + +}
