Note, though, that this is not yet released: it is currently only on trunk (it will be included in 2.9).

Mike
On Tue, Apr 7, 2009 at 1:19 PM, Michael McCandless <luc...@mikemccandless.com> wrote: > I think the new contrib/collation package may address this use case? > It converts each term to its CollationKey, outside of Lucene. > > Mike > > On Tue, Apr 7, 2009 at 7:36 AM, Federica Falini Data Management S.p.A > <ffal...@datamanagement.it> wrote: >> Good morning, >> In Lucene 2.2 i have made modification to Term.java, TermBuffer.java (see >> below) in order to have Term enumerations sorted case-insensitive (when a >> field is not-tokenized): >> TermEnum terms = reader.terms(new Term("myFieldNotTokenized", "")); >> while ("myFieldNotTokenized".equals(terms.term().field())) { >> >> System.out.println( " " + terms.term()); >> if (!terms.next()) break; >> } >> >> For example, instead to obtain this sort on TermEnum: >> >> Annales >> Cafè >> Zucche >> cafe >> >> i need to obtain this : >> >> Annales >> cafe >> Cafè >> Zucche >> >> Now in Lucene 2.4 i find it difficult because the package "index" is changed >> a lot; can i have some indications to keep my sort? >> Thanks in advance >> Federica >> >> Term.java: >> package org.apache.lucene.index; >> >> import java.text.CollationKey; >> >> /** >> * Licensed to the Apache Software Foundation (ASF) under one or more >> * contributor license agreements. See the NOTICE file distributed with >> * this work for additional information regarding copyright ownership. >> * The ASF licenses this file to You under the Apache License, Version 2.0 >> * (the "License"); you may not use this file except in compliance with >> * the License. You may obtain a copy of the License at >> * >> * http://www.apache.org/licenses/LICENSE-2.0 >> * >> * Unless required by applicable law or agreed to in writing, software >> * distributed under the License is distributed on an "AS IS" BASIS, >> * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
>> * See the License for the specific language governing permissions and >> * limitations under the License. >> */ >> >> /** >> A Term represents a word from text. This is the unit of search. It is >> composed of two elements, the text of the word, as a string, and the name >> of >> the field that the text occured in, an interned string. >> >> Note that terms may represent more than words from text fields, but also >> things like dates, email addresses, urls, etc. */ >> >> public final class Term implements Comparable, java.io.Serializable { >> String field; >> String text; >> transient CollationKey ckText; >> >> /** Constructs a Term with the given field and text. >> * <p>Note that a null field or null text value results in undefined >> * behavior for most Lucene APIs that accept a Term parameter. */ >> public Term(String fld, String txt) { >> >> this(fld, txt, true); >> } >> Term(String fld, String txt, boolean intern) { >> field = intern ? fld.intern() : fld; // field names are interned >> text = txt; // unless already known to be >> ckText = OpacCollator.getInstancePool().getCollationKey(text); >> >> } >> >> /** Returns the field of this term, an interned string. The field >> indicates >> the part of a document which this term came from. */ >> public final String field() { return field; } >> >> /** Returns the text of this term. In the case of words, this is simply >> the >> text of the word. In the case of dates and other types, this is an >> encoding of the object as a string. */ >> public final String text() { return text; } >> >> /** >> * Optimized construction of new Terms by reusing same field as this Term >> * - avoids field.intern() overhead >> * @param text The text of the new term (field is implicitly same as this >> Term instance) >> * @return A new Term >> */ >> public Term createTerm(String text) >> { >> return new Term(field,text,false); >> } >> >> /** Compares two terms, returning true iff they have the same >> field and text. 
*/ >> public final boolean equals(Object o) { >> if (o == this) >> return true; >> if (o == null) >> return false; >> if (!(o instanceof Term)) >> return false; >> Term other = (Term)o; >> //return field == other.field && text.equals(other.text); >> return field == other.field && text.equalsIgnoreCase(other.text); >> } >> >> /** Combines the hashCode() of the field and the text. */ >> public final int hashCode() { >> return field.hashCode() + text.hashCode(); >> } >> >> public int compareTo(Object other) { >> return compareTo((Term)other); >> } >> >> /** Compares two terms, returning a negative integer if this >> term belongs before the argument, zero if this term is equal to the >> argument, and a positive integer if this term belongs after the >> argument. >> >> The ordering of terms is first by field, then by text.*/ >> // public final int compareTo(Term other) { >> // if (field == other.field) // fields are interned >> // return text.compareTo(other.text); >> // else >> // return field.compareTo(other.field); >> // } >> public final int compareTo(Term other) { >> >> if (field == other.field) { // fields are interned >> return ckText.compareTo(other.ckText); >> } else { >> // per il field basta il compareToIgnoreCase delle stringhe >> return field.compareToIgnoreCase(other.field); >> } >> } >> >> /** Resets the field and text of a Term. 
*/ >> final void set(String fld, String txt) { >> field = fld; >> text = txt; >> ckText = OpacCollator.getInstancePool().getCollationKey(text); >> } >> >> public final String toString() { return field + ":" + text; } >> >> private void readObject(java.io.ObjectInputStream in) >> throws java.io.IOException, ClassNotFoundException >> { >> in.defaultReadObject(); >> field = field.intern(); >> ckText = OpacCollator.getInstancePool().getCollationKey(text); >> } >> >> >> } >> >> >> TermBuffer.java: >> public final int compareTo(TermBuffer other) { >> if (field == other.field) { // fields are interned >> return compareChars(text, textLength, other.text, other.textLength); >> } >> else { >> return field.compareTo(other.field); >> >> } >> } >> >> private static final int compareChars(char[] v1, int len1, >> char[] v2, int len2) { >> String v1s = new String(v1,0,len1); >> String v2s = new String(v2,0,len2); >> OpacCollator oc = OpacCollator.getInstancePool(); >> CollationKey v1k = oc.getCollationKey(v1s); >> CollationKey v2k = oc.getCollationKey(v2s); >> return v1k.compareTo(v2k); >> } >> >> OpacCollator.java >> package org.apache.lucene.index; >> >> import java.text.CollationKey; >> import java.text.Collator; >> import java.text.ParseException; >> import java.text.RuleBasedCollator; >> import java.util.BitSet; >> import java.util.Locale; >> >> import org.apache.log4j.Logger; >> >> /** >> * Collator che considera lo spazio '\u0020' come primo carattere >> dell'ordinamento.L'underscore >> * sembra essere il primo di tutti >> */ >> public class OpacCollator extends RuleBasedCollator { >> /** >> * logger >> */ >> private static Logger log = >> Logger.getLogger(OpacCollator.class.getName()); >> private static String spacesRules = null; >> /** progressivo identificativo oggetto */ >> // private static int seqIdPool = 0; >> /** identificativo dell'oggetto */ >> private int id = 0; >> >> private final static int POOL = >> 
Integer.parseInt(System.getProperty("sebina.opac.collatorpool", "256")); >> private final static OpacCollator[] collatorPool = new OpacCollator[POOL]; >> /** la posizione 'true' indica disponibile, la posizione 'false' indica in >> uso */ >> private static BitSet bs = new BitSet(POOL); >> // private static int cntPool = 0; >> static { >> RuleBasedCollator it_ITcollator = (RuleBasedCollator) >> Collator.getInstance(new Locale("it", "IT")); >> // CASPH: per scaffafle virtuale Rimuove tutte le occorrenze del >> carattere di tabulazione nella stringa del collator >> spacesRules = it_ITcollator.getRules().replaceAll("='\t'=", "="); >> spacesRules = spacesRules.replaceAll(";'\t' ;", ";"); >> // Aggiunto il carattere di tabulazione come prioritario allo spazio >> spacesRules = spacesRules.replaceAll("<'_'", "<'\t'<'\u0020'<'_'"); >> >> for (int i = 0; i < POOL; i++) { >> try { >> collatorPool[i] = new OpacCollator(i); >> bs.set(i); >> } catch (ParseException e) { >> log.fatal("Rules:" + it_ITcollator.getRules(), e); >> break; >> } >> } >> log.info("dimensione pool: " + POOL); >> } >> >> public static synchronized OpacCollator getInstancePool() { >> int pos = -1; >> while (pos < 0) { >> pos = bs.nextSetBit(0); >> } >> bs.clear(pos); >> // log.debug("getting pool#" + pos); >> return collatorPool[pos]; >> } >> >> /** >> * Constructor for OpacCollator. >> * >> * @throws ParseException >> */ >> private OpacCollator() throws ParseException { >> super(spacesRules); >> this.setStrength(Collator.PRIMARY); >> } >> >> /** >> * Constructor for OpacCollator. 
>> * >> * @param id >> * @throws ParseException >> */ >> public OpacCollator(int ident) throws ParseException { >> this(); >> this.id = ident; >> } >> >> /** >> * @see java.text.Collator#getCollationKey(String) >> */ >> public CollationKey getCollationKey(String arg0) { >> CollationKey ck = super.getCollationKey(arg0); >> bs.set(this.id); >> return ck; >> } >> >> } >> >> >> -- >> >> >> Federica FALINI >> Divisione Beni Culturali >> >> >> >> tel: +39.0544.503.886 >> fax: +39.0544.461697 >> e-mail: ffal...@datamanagement.it >> web: http://www.datamanagement.it >> >> 48100 - Ravenna (RA) >> Via S.Cavina, n 7 >> Italy >> ________________________________ >> Questo messaggio di posta elettronica contiene informazioni di carattere >> confidenziale rivolte esclusivamente al destinatario sopra indicato. E' >> vietato l'uso, la diffusione, distribuzione o riproduzione da parte di ogni >> altra persona. Nel caso aveste ricevuto questo messaggio di posta >> elettronica per errore, siete pregati di segnalarlo immediatamente al >> mittente e distruggere quanto ricevuto (compresi i file allegati) senza >> farne copia. This e-mail transmission may contain legally privileged and/or >> confidential information. Please do not read it if you are not the intended >> recipient(S). Any use, distribution, reproduction or disclosure by any other >> person is strictly prohibited. If you have received this e-mail in error, >> please notify the sender and destroy the original transmission and its >> attachments without reading or saving it in any manner. > --------------------------------------------------------------------- To unsubscribe, e-mail: java-dev-unsubscr...@lucene.apache.org For additional commands, e-mail: java-dev-h...@lucene.apache.org