Author: vincenzo
Date: Mon Dec 12 06:26:29 2005
New Revision: 356257

URL: http://svn.apache.org/viewcvs?rev=356257&view=rev
Log:
1) Fixed JAMES-387 (java.lang.ClassCastException: java.lang.Integer).
2) Some enhancements to reduce memory footprint.
Modified: james/server/trunk/src/java/org/apache/james/transport/mailets/BayesianAnalysis.java james/server/trunk/src/java/org/apache/james/util/BayesianAnalyzer.java james/server/trunk/src/java/org/apache/james/util/JDBCBayesianAnalyzer.java Modified: james/server/trunk/src/java/org/apache/james/transport/mailets/BayesianAnalysis.java URL: http://svn.apache.org/viewcvs/james/server/trunk/src/java/org/apache/james/transport/mailets/BayesianAnalysis.java?rev=356257&r1=356256&r2=356257&view=diff ============================================================================== --- james/server/trunk/src/java/org/apache/james/transport/mailets/BayesianAnalysis.java (original) +++ james/server/trunk/src/java/org/apache/james/transport/mailets/BayesianAnalysis.java Mon Dec 12 06:26:29 2005 @@ -340,8 +340,10 @@ try { // this is synchronized to avoid concurrent update of the corpus synchronized(JDBCBayesianAnalyzer.DATABASE_LOCK) { + analyzer.tokenCountsClear(); analyzer.loadHamNSpam(conn); analyzer.buildCorpus(); + analyzer.tokenCountsClear(); } log("BayesianAnalysis Corpus loaded"); Modified: james/server/trunk/src/java/org/apache/james/util/BayesianAnalyzer.java URL: http://svn.apache.org/viewcvs/james/server/trunk/src/java/org/apache/james/util/BayesianAnalyzer.java?rev=356257&r1=356256&r2=356257&view=diff ============================================================================== --- james/server/trunk/src/java/org/apache/james/util/BayesianAnalyzer.java (original) +++ james/server/trunk/src/java/org/apache/james/util/BayesianAnalyzer.java Mon Dec 12 06:26:29 2005 @@ -261,14 +261,21 @@ public void clear() { corpus.clear(); - hamTokenCounts.clear(); - spamTokenCounts.clear(); + tokenCountsClear(); hamMessageCount = 0; spamMessageCount = 0; } /** + * Clears token counters. + */ + public void tokenCountsClear() { + hamTokenCounts.clear(); + spamTokenCounts.clear(); + } + + /** * Public setter for corpus. * * @param corpus The new corpus. 
@@ -289,17 +296,19 @@ */ public void buildCorpus() { //Combine the known ham & spam tokens. - corpus.putAll(hamTokenCounts); - corpus.putAll(spamTokenCounts); + Set set = new HashSet(hamTokenCounts.size() + spamTokenCounts.size()); + set.addAll(hamTokenCounts.keySet()); + set.addAll(spamTokenCounts.keySet()); + Map tempCorpus = new HashMap(set.size()); //Iterate through all the tokens and compute their new //individual probabilities. - Iterator i = corpus.keySet().iterator(); + Iterator i = set.iterator(); while (i.hasNext()) { String token = (String) i.next(); - - corpus.put(token, new Double(computeProbability(token))); + tempCorpus.put(token, new Double(computeProbability(token))); } + setCorpus(tempCorpus); } /** @@ -335,13 +344,17 @@ //Build a set of the tokens in the Stream. Set tokens = parse(stream); + // Get the corpus to use in this run + // A new corpus may be being built in the meantime + Map workCorpus = getCorpus(); + //Assign their probabilities from the Corpus (using an additional //calculation to determine spamminess). - SortedSet tokenProbabilityStrengths = getTokenProbabilityStrengths(tokens); + SortedSet tokenProbabilityStrengths = getTokenProbabilityStrengths(tokens, workCorpus); //Compute and return the overall probability that the //stream is SPAM. - return computeOverallProbability(tokenProbabilityStrengths); + return computeOverallProbability(tokenProbabilityStrengths, workCorpus); } /** @@ -575,9 +588,10 @@ * The ordering is from the highest strength to the lowest strength. * * @param tokens + * @param workCorpus * @return SortedSet of TokenProbabilityStrength objects. */ - private SortedSet getTokenProbabilityStrengths(Set tokens) { + private SortedSet getTokenProbabilityStrengths(Set tokens, Map workCorpus) { //Convert to a SortedSet of token probability strengths. 
SortedSet tokenProbabilityStrengths = new TreeSet(); @@ -587,14 +601,15 @@ tps.token = (String) i.next(); - if (corpus.containsKey(tps.token)) { - tps.strength = Math.abs(0.5 - ((Double) corpus.get(tps.token)).doubleValue()); + if (workCorpus.containsKey(tps.token)) { + tps.strength = Math.abs(0.5 - ((Double) workCorpus.get(tps.token)).doubleValue()); } else { //This token has never been seen before, //we'll give it initially the default probability. Double corpusProbability = new Double(DEFAULT_TOKEN_PROBABILITY); tps.strength = Math.abs(0.5 - DEFAULT_TOKEN_PROBABILITY); + boolean isTokenDegeneratedFound = false; Collection degeneratedTokens = buildDegenerated(tps.token); Iterator iDegenerated = degeneratedTokens.iterator(); @@ -602,17 +617,21 @@ double strengthDegenerated; while (iDegenerated.hasNext()) { tokenDegenerated = (String) iDegenerated.next(); - if (corpus.containsKey(tokenDegenerated)) { - Double probabilityTemp = (Double) corpus.get(tokenDegenerated); + if (workCorpus.containsKey(tokenDegenerated)) { + Double probabilityTemp = (Double) workCorpus.get(tokenDegenerated); strengthDegenerated = Math.abs(0.5 - probabilityTemp.doubleValue()); if (strengthDegenerated > tps.strength) { + isTokenDegeneratedFound = true; tps.strength = strengthDegenerated; corpusProbability = probabilityTemp; } } } - synchronized(corpus) { - corpus.put(tps.token, corpusProbability); + // to reduce memory usage, put in the corpus only if the probability is different from (stronger than) the default + if (isTokenDegeneratedFound) { + synchronized(workCorpus) { + workCorpus.put(tps.token, corpusProbability); + } } } @@ -672,9 +691,10 @@ * the tokenProbabilities SortedSet. * * @param tokenProbabilities + * @param workCorpus * @return Computed spamminess. 
*/ - private double computeOverallProbability(SortedSet tokenProbabilityStrengths) { + private double computeOverallProbability(SortedSet tokenProbabilityStrengths, Map workCorpus) { double p = 1.0; double np = 1.0; double tempStrength = 0.5; @@ -686,9 +706,15 @@ // System.out.println(tps); - p *= ((Double) corpus.get(tps.token)).doubleValue(); - np *= (1.0 - ((Double) corpus.get(tps.token)).doubleValue()); - // System.out.println("Token:" + tps.token + ", p=" + ((Double) corpus.get(tps.token)).doubleValue() + ", overall p=" + p / (p + np)); + double theDoubleValue = DEFAULT_TOKEN_PROBABILITY; // initialize it to the default + Double theDoubleObject = (Double) workCorpus.get(tps.token); + // if either the original token or a degeneration was found use the double value, otherwise use the default + if (theDoubleObject != null) { + theDoubleValue = theDoubleObject.doubleValue(); + } + p *= theDoubleValue; + np *= (1.0 - theDoubleValue); + // System.out.println("Token:" + tps.token + ", p=" + theDoubleValue + ", overall p=" + p / (p + np)); } return (p / (p + np)); Modified: james/server/trunk/src/java/org/apache/james/util/JDBCBayesianAnalyzer.java URL: http://svn.apache.org/viewcvs/james/server/trunk/src/java/org/apache/james/util/JDBCBayesianAnalyzer.java?rev=356257&r1=356256&r2=356257&view=diff ============================================================================== --- james/server/trunk/src/java/org/apache/james/util/JDBCBayesianAnalyzer.java (original) +++ james/server/trunk/src/java/org/apache/james/util/JDBCBayesianAnalyzer.java Mon Dec 12 06:26:29 2005 @@ -162,7 +162,12 @@ Map ham = getHamTokenCounts(); while (rs.next()) { - ham.put(rs.getString(1), new Integer(rs.getInt(2))); + String token = rs.getString(1); + int count = rs.getInt(2); + // to reduce memory, use the token only if the count is > 1 + if (count > 1) { + ham.put(token, new Integer(count)); + } } //Verbose. 
delegatedLog("Ham tokens count: " + ham.size()); @@ -176,7 +181,12 @@ Map spam = getSpamTokenCounts(); while (rs.next()) { - spam.put(rs.getString(1), new Integer(rs.getInt(2))); + String token = rs.getString(1); + int count = rs.getInt(2); + // to reduce memory, use the token only if the count is > 1 + if (count > 1) { + spam.put(token, new Integer(count)); + } } //Verbose. --------------------------------------------------------------------- To unsubscribe, e-mail: [EMAIL PROTECTED] For additional commands, e-mail: [EMAIL PROTECTED]