We already keep two copies of every indexed field:
1> a field that stores the text with its exact case (used for case-sensitive search)
2> a field that stores the lowercased text (used for case-insensitive search)
Let me find an example for the collation analyzer.
Here is the .java attachment that was lost:
package index.search.util;
import java.io.IOException;
import java.io.Serializable;
import java.util.Comparator;
import org.apache.lucene.index.AtomicReaderContext;
import org.apache.lucene.index.BinaryDocValues;
import org.apache.lucene.search.FieldCache;
import org.apache.lucene.search.FieldComparator;
import org.apache.lucene.search.FieldComparatorSource;
import org.apache.lucene.util.Bits;
import org.apache.lucene.util.BytesRef;
/**
 * A {@link FieldComparatorSource} whose comparator orders string field values
 * "alpha-numerically": runs of digits are compared by numeric magnitude instead
 * of character by character, so {@code "a2"} sorts before {@code "a10"}.
 *
 * <p>Ordering rules applied while walking both strings from left to right:
 * <ul>
 * <li>digits sort before all other characters, and letters sort after all
 * other characters;</li>
 * <li>two digit runs are compared by value; when the values are equal the run
 * with more leading zeros sorts first;</li>
 * <li>two letters, or two other characters, are compared by UTF-16 code unit
 * (so the comparison is case sensitive);</li>
 * <li>a string that is a proper prefix of another sorts first, and the empty
 * string sorts before everything.</li>
 * </ul>
 *
 * <p>The comparator only reports the natural order; ascending or descending
 * order is chosen through the {@code reverse} flag of the SortField that uses
 * this source.
 */
public class AlphaNumericFieldComparatorSource extends FieldComparatorSource
    implements Serializable
{
    private static final long serialVersionUID = 1L;

    /**
     * Comparator implementing the alpha-numeric order, usable both as a
     * Lucene {@link FieldComparator} and as a plain {@link Comparator}.
     *
     * <p>Documents without a value for the field are represented by the empty
     * string, so they sort together with (and equal to) empty values.
     */
    private static class AlphaNumericFieldComparator extends FieldComparator<String>
        implements Comparator<String>
    {
        // Character classes, in ascending sort order.
        private static final int CLASS_DIGIT = 0;
        private static final int CLASS_OTHER = 1;
        private static final int CLASS_LETTER = 2;

        /** Value held by each competitive slot; null when used as a plain Comparator. */
        private final String[] values;
        /** Name of the field being sorted; null when used as a plain Comparator. */
        private final String field;
        /** Scratch buffer reused for every doc-values lookup. */
        private final BytesRef scratch = new BytesRef();
        private BinaryDocValues docTerms;
        private Bits docsWithField;
        private String bottom;

        /**
         * Creates an instance that is only usable through
         * {@link #compare(String, String)}; the per-document methods must not
         * be called on it because no field or slot storage is set up.
         */
        AlphaNumericFieldComparator()
        {
            values = null;
            field = null;
        }

        /**
         * Creates a comparator for sorting search hits.
         *
         * @param numHits number of slots to allocate
         * @param field name of the field whose values are compared
         */
        AlphaNumericFieldComparator(final int numHits, final String field)
        {
            values = new String[numHits];
            this.field = field;
        }

        /**
         * Reads the field value of a document in the current reader.
         *
         * @param doc document id relative to the current reader
         * @return the value decoded as UTF-8, or the empty string when the
         *         document has no value for the field
         */
        private String readValue(final int doc)
        {
            docTerms.get(doc, scratch);
            if (scratch.length == 0 && !docsWithField.get(doc))
            {
                return "";
            }
            // utf8ToString copies the bytes, so reusing the scratch buffer is safe.
            return scratch.utf8ToString();
        }

        @Override
        public int compareBottom(final int doc)
        {
            // A missing value reads as "", which compare() already ranks lowest
            // and equal to another "", keeping this consistent with
            // compare(int, int).
            return compare(bottom, readValue(doc));
        }

        @Override
        public void copy(final int slot, final int doc)
        {
            values[slot] = readValue(doc);
        }

        @Override
        public FieldComparator<String> setNextReader(final AtomicReaderContext context)
            throws IOException
        {
            docTerms = FieldCache.DEFAULT.getTerms(context.reader(), field, true);
            docsWithField = FieldCache.DEFAULT.getDocsWithField(context.reader(), field);
            return this;
        }

        @Override
        public void setBottom(final int bottom)
        {
            this.bottom = values[bottom];
        }

        @Override
        public String value(final int slot)
        {
            return values[slot];
        }

        /**
         * Compares two possibly-null values; {@code null} sorts before any
         * non-null value and equal to another {@code null}.
         */
        @Override
        public int compareValues(final String val1, final String val2)
        {
            if (val1 == null)
            {
                return val2 == null ? 0 : -1;
            }
            if (val2 == null)
            {
                return 1;
            }
            return compare(val1, val2);
        }

        @Override
        public int compareDocToValue(final int doc, final String value)
        {
            // Go through compareValues so a null value is ordered, not dereferenced.
            return compareValues(readValue(doc), value);
        }

        @Override
        public int compare(final int slot1, final int slot2)
        {
            return compareValues(values[slot1], values[slot2]);
        }

        /**
         * Compares two strings in alpha-numeric order (see the class
         * documentation of the enclosing source for the rules).
         *
         * <p>All cursor state is local, so this method may be called
         * concurrently and re-entrantly.
         *
         * @param string1 first string, must not be null
         * @param string2 second string, must not be null
         * @return a negative value, zero, or a positive value as
         *         {@code string1} sorts before, equal to, or after
         *         {@code string2}
         */
        @Override
        public int compare(final String string1, final String string2)
        {
            final int length1 = string1.length();
            final int length2 = string2.length();
            int index1 = 0;
            int index2 = 0;
            while (index1 < length1 && index2 < length2)
            {
                final char char1 = string1.charAt(index1);
                final char char2 = string2.charAt(index2);
                final int class1 = charClass(char1);
                final int class2 = charClass(char2);
                if (class1 != class2)
                {
                    return class1 - class2;
                }
                if (class1 == CLASS_DIGIT)
                {
                    final int end1 = digitRunEnd(string1, index1);
                    final int end2 = digitRunEnd(string2, index2);
                    final int result = compareDigitRuns(string1, index1, end1,
                        string2, index2, end2);
                    if (result != 0)
                    {
                        return result;
                    }
                    // Resume exactly at the first character after each run so
                    // that character still takes part in the comparison.
                    index1 = end1;
                    index2 = end2;
                }
                else
                {
                    if (char1 != char2)
                    {
                        // char values are at most 0xFFFF, so this cannot overflow.
                        return char1 - char2;
                    }
                    index1++;
                    index2++;
                }
            }
            // Whichever string still has characters left sorts last. Both
            // operands are non-negative, so the subtraction cannot overflow.
            return (length1 - index1) - (length2 - index2);
        }

        /** Maps a character to its sort class: digit, then other, then letter. */
        private static int charClass(final char c)
        {
            if (Character.isDigit(c))
            {
                return CLASS_DIGIT;
            }
            return Character.isLetter(c) ? CLASS_LETTER : CLASS_OTHER;
        }

        /** Returns the index just past the run of digits that starts at {@code start}. */
        private static int digitRunEnd(final String s, final int start)
        {
            int end = start;
            while (end < s.length() && Character.isDigit(s.charAt(end)))
            {
                end++;
            }
            return end;
        }

        /** Returns the index of the first character in [start, end) that is not '0'. */
        private static int skipZeros(final String s, final int start, final int end)
        {
            int index = start;
            while (index < end && s.charAt(index) == '0')
            {
                index++;
            }
            return index;
        }

        /**
         * Compares the digit runs {@code string1[start1, end1)} and
         * {@code string2[start2, end2)} by numeric magnitude.
         *
         * <p>Leading zeros are ignored for the magnitude. The run with more
         * significant digits is larger; with equally many, the first differing
         * digit decides. Runs of equal magnitude are ordered by leading-zero
         * count, more zeros first.
         */
        private static int compareDigitRuns(final String string1, final int start1,
            final int end1, final String string2, final int start2, final int end2)
        {
            final int first1 = skipZeros(string1, start1, end1);
            final int first2 = skipZeros(string2, start2, end2);
            final int significant1 = end1 - first1;
            final int significant2 = end2 - first2;
            if (significant1 != significant2)
            {
                return significant1 - significant2;
            }
            for (int offset = 0; offset < significant1; offset++)
            {
                final int diff = string1.charAt(first1 + offset)
                    - string2.charAt(first2 + offset);
                if (diff != 0)
                {
                    return diff;
                }
            }
            // Equal magnitude: the run with more leading zeros sorts first.
            return (first2 - start2) - (first1 - start1);
        }
    }

    /**
     * Creates the alpha-numeric comparator for a field.
     *
     * @param fieldname name of the field to sort on
     * @param numHits number of slots the comparator must provide
     * @param sortPos position of this sort criterion (unused)
     * @param reversed whether the sort is reversed (unused here)
     * @return a new comparator bound to {@code fieldname}
     * @see org.apache.lucene.search.FieldComparatorSource#newComparator(java.lang.String,
     *      int, int, boolean)
     */
    @Override
    public FieldComparator<String> newComparator(final String fieldname,
        final int numHits, final int sortPos, final boolean reversed)
        throws IOException
    {
        return new AlphaNumericFieldComparator(numHits, fieldname);
    }

    /**
     * Returns a comparator applying the same alpha-numeric order to plain
     * strings, for sorting collections outside of a search.
     *
     * @return comparator for non-null strings
     */
    public Comparator<String> getAlphaNumericComparator()
    {
        return new AlphaNumericFieldComparator();
    }
}
-----Original Message-----
From: Uwe Schindler [mailto:[email protected]]
Sent: Tuesday, November 12, 2013 1:57 PM
To: [email protected]
Subject: RE: Alphanumeric Field Comparison : Lucene 4.5
Hi,
What are you intending to do? The example code is lost!
In general, to sort alphanumeric/lexical on a human readable field, you would
use the collation functionalities (needs a separate field for sorting
containing the collation keys) provided by Lucene.
Use
http://lucene.apache.org/core/4_5_1/analyzers-common/org/apache/lucene/collation/CollationKeyAnalyzer.html
to index the field and then you can do a simple native sort on this field
(SortField.STRING).
Uwe
-----
Uwe Schindler
H.-H.-Meier-Allee 63, D-28213 Bremen
<http://www.thetaphi.de/> http://www.thetaphi.de
eMail: [email protected]<mailto:[email protected]>
From: Umashanker, Srividhya [mailto:[email protected]]
Sent: Tuesday, November 12, 2013 5:00 AM
To: [email protected]<mailto:[email protected]>
Subject: Alphanumeric Field Comparison : Lucene 4.5
Group –
We are looking at sorting Lucene docs based on a field in alphanumeric order,
as we expect the fields to contain alphanumeric characters.
Attached is the AlphaNumericFieldComparatorSource and below is the snippet of
its usage.
final SortField sortField_id = new SortField(FieldName._id.name(), new
AlphaNumericFieldComparatorSource(), false);
Is anyone using an easier approach? If so, please share it, along with any
other alternatives that you have tried.
Thanks!
-Vidhya