Added: ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/cased/encoder/BsvEncoder.java URL: http://svn.apache.org/viewvc/ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/cased/encoder/BsvEncoder.java?rev=1881994&view=auto ============================================================================== --- ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/cased/encoder/BsvEncoder.java (added) +++ ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/cased/encoder/BsvEncoder.java Fri Sep 25 00:59:37 2020 @@ -0,0 +1,86 @@ +package org.apache.ctakes.dictionary.cased.encoder; + + +import org.apache.ctakes.dictionary.cased.util.bsv.BsvFileParser; +import org.apache.ctakes.dictionary.cased.util.bsv.StringArrayCreator; +import org.apache.ctakes.dictionary.lookup2.util.CuiCodeUtil; +import org.apache.ctakes.utils.env.EnvironmentVariable; +import org.apache.log4j.Logger; +import org.apache.uima.UimaContext; + +import java.io.IOException; +import java.util.*; + +/** + * @author SPF , chip-nlp + * @version %I% + * @since 8/18/2020 + */ +final public class BsvEncoder implements TermEncoder { + + static public final String ENCODER_TYPE = "BSV"; + + static private final Logger LOGGER = Logger.getLogger( "BsvEncoder" ); + + + private final InMemoryEncoder _delegate; + + public BsvEncoder( final String name, final UimaContext uimaContext ) { + this( name, EnvironmentVariable.getEnv( name + "_file", uimaContext ) ); + } + + public BsvEncoder( final String name, final String bsvPath ) { + final Map<Long, Collection<TermEncoding>> encodingMap = parseBsvFile( name, bsvPath ); + _delegate = new InMemoryEncoder( name, encodingMap ); + } + + + /** + * {@inheritDoc} + */ + @Override + public String getName() { + return _delegate.getName(); + } + + + /** + * {@inheritDoc} + */ + @Override + public Collection<TermEncoding> getEncodings( final long cuiCode 
) { + return _delegate.getEncodings( cuiCode ); + } + + + /** + * Create a map of {@link TermEncoding} Objects + * by parsing a bsv file. The file should have a columnar format: + * <p> + * CUI|Code + * </p> + * + * @param bsvFilePath path to file containing term rows and bsv columns + * @return map of all cuis and codes read from the bsv file + */ + static private Map<Long, Collection<TermEncoding>> parseBsvFile( final String name, final String bsvFilePath ) { + final Collection<String[]> columnCollection = new HashSet<>(); + try { + columnCollection.addAll( BsvFileParser.parseBsvFile( bsvFilePath, new StringArrayCreator( 2 ) ) ); + } catch ( IOException ioE ) { + LOGGER.error( ioE.getMessage() ); + } + if ( columnCollection.isEmpty() ) { + return Collections.emptyMap(); + } + final Map<Long, Collection<TermEncoding>> encodingMap = new HashMap<>(); + for ( String[] columns : columnCollection ) { + final long cuiCode = CuiCodeUtil.getInstance().getCuiCode( columns[ 0 ] ); + final TermEncoding termEncoding = new TermEncoding( name, columns[ 1 ].trim() ); + encodingMap.computeIfAbsent( cuiCode, l -> new HashSet<>() ).add( termEncoding ); + } + return encodingMap; + } + + +}
Added: ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/cased/encoder/BsvListEncoder.java URL: http://svn.apache.org/viewvc/ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/cased/encoder/BsvListEncoder.java?rev=1881994&view=auto ============================================================================== --- ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/cased/encoder/BsvListEncoder.java (added) +++ ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/cased/encoder/BsvListEncoder.java Fri Sep 25 00:59:37 2020 @@ -0,0 +1,85 @@ +package org.apache.ctakes.dictionary.cased.encoder; + + +import org.apache.ctakes.core.util.StringUtil; +import org.apache.ctakes.dictionary.lookup2.util.CuiCodeUtil; +import org.apache.ctakes.utils.env.EnvironmentVariable; +import org.apache.log4j.Logger; +import org.apache.uima.UimaContext; + +import java.util.*; + +/** + * @author SPF , chip-nlp + * @version %I% + * @since 8/18/2020 + */ +final public class BsvListEncoder implements TermEncoder { + + static public final String ENCODER_TYPE = "BSV_LIST"; + + static private final Logger LOGGER = Logger.getLogger( "BsvListEncoder" ); + + + private final InMemoryEncoder _delegate; + + public BsvListEncoder( final String name, final UimaContext uimaContext ) { + this( name, EnvironmentVariable.getEnv( name + "_list", uimaContext ) ); + } + + public BsvListEncoder( final String name, final String bsvList ) { + final Map<Long, Collection<TermEncoding>> encodingMap = parseList( name, bsvList ); + LOGGER.info( "Parsed " + encodingMap.size() + " encodings for " + name ); + _delegate = new InMemoryEncoder( name, encodingMap ); + } + + + /** + * {@inheritDoc} + */ + @Override + public String getName() { + return _delegate.getName(); + } + + + /** + * {@inheritDoc} + */ + @Override + public Collection<TermEncoding> getEncodings( final long cuiCode ) { + 
return _delegate.getEncodings( cuiCode ); + } + + + /** + * Create a map of {@link TermEncoding} Objects + * by parsing a bsv file. The file should have a columnar format: + * <p> + * CUI|Code + * </p> + * + * @param bsvList path to file containing term rows and bsv columns + * @return map of all cuis and codes read from the bsv file + */ + static private Map<Long, Collection<TermEncoding>> parseList( final String name, final String bsvList ) { + if ( bsvList.isEmpty() ) { + LOGGER.error( "List of term encodings is empty for " + name ); + return Collections.emptyMap(); + } + final Map<Long, Collection<TermEncoding>> encodingMap = new HashMap<>(); + for ( String encoding : StringUtil.fastSplit( bsvList, '|' ) ) { + final String[] keyValue = StringUtil.fastSplit( encoding, ':' ); + if ( keyValue.length != 2 ) { + LOGGER.warn( "Improper Key : Value pair for Term Encoding " + encoding ); + continue; + } + final long cuiCode = CuiCodeUtil.getInstance().getCuiCode( keyValue[ 0 ] ); + final TermEncoding termEncoding = new TermEncoding( name, keyValue[ 1 ].trim() ); + encodingMap.computeIfAbsent( cuiCode, l -> new HashSet<>() ).add( termEncoding ); + } + return encodingMap; + } + + +} Added: ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/cased/encoder/CodeSchema.java URL: http://svn.apache.org/viewvc/ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/cased/encoder/CodeSchema.java?rev=1881994&view=auto ============================================================================== --- ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/cased/encoder/CodeSchema.java (added) +++ ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/cased/encoder/CodeSchema.java Fri Sep 25 00:59:37 2020 @@ -0,0 +1,59 @@ +package org.apache.ctakes.dictionary.cased.encoder; + +import java.util.Arrays; +import java.util.Collection; +import java.util.HashSet; + 
+/** + * @author SPF , chip-nlp + * @version %I% + * @since 8/25/2020 + */ +public enum CodeSchema { + TUI( "int", String.class, "TUI" ), + PREFERRED_TEXT( "text", String.class, "PREFTEXT", "PREF_TEXT", "PREFERRED_TEXT" ), + UNKNOWN( "text", String.class, "UNKNOWN" ); + + + private final String _codeFormat; + private final Class<?> _codeClass; + private final Collection<String> _names; + + + CodeSchema( final String codeFormat, final Class<?> codeClass, final String... names ) { + _codeFormat = codeFormat; + _codeClass = codeClass; + _names = new HashSet<>( Arrays.asList( names ) ); + } + + + public String getCodeFormat() { + return _codeFormat; + } + + public Class<?> getCodeClass() { + return _codeClass; + } + + public Collection<String> getNames() { + return _names; + } + + public boolean isSchema( final TermEncoding encoding ) { + return isSchema( encoding.getSchema() ); + } + + public boolean isSchema( final String name ) { + return _names.contains( name.toUpperCase() ); + } + + + static public CodeSchema getSchema( final String name ) { + return Arrays.stream( CodeSchema.values() ) + .filter( c -> c.isSchema( name ) ) + .findFirst() + .orElse( UNKNOWN ); + } + + +} Added: ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/cased/encoder/EncoderStore.java URL: http://svn.apache.org/viewvc/ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/cased/encoder/EncoderStore.java?rev=1881994&view=auto ============================================================================== --- ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/cased/encoder/EncoderStore.java (added) +++ ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/cased/encoder/EncoderStore.java Fri Sep 25 00:59:37 2020 @@ -0,0 +1,42 @@ +package org.apache.ctakes.dictionary.cased.encoder; + + +import java.util.ArrayList; +import java.util.Collection; + +/** + * @author 
SPF , chip-nlp + * @version %I% + * @since 8/17/2020 + */ +public enum EncoderStore { + INSTANCE; + + static public EncoderStore getInstance() { + return INSTANCE; + } + + + private final Collection<TermEncoder> _encoders = new ArrayList<>(); + + public boolean addEncoder( final TermEncoder encoder ) { + final String name = encoder.getName(); + synchronized ( _encoders ) { + final boolean present = _encoders.stream() + .map( TermEncoder::getName ) + .anyMatch( name::equals ); + if ( present ) { + // Encoder with given name already exists. + return false; + } + _encoders.add( encoder ); + return true; + } + } + + + public Collection<TermEncoder> getEncoders() { + return _encoders; + } + +} Added: ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/cased/encoder/InMemoryEncoder.java URL: http://svn.apache.org/viewvc/ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/cased/encoder/InMemoryEncoder.java?rev=1881994&view=auto ============================================================================== --- ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/cased/encoder/InMemoryEncoder.java (added) +++ ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/cased/encoder/InMemoryEncoder.java Fri Sep 25 00:59:37 2020 @@ -0,0 +1,46 @@ +package org.apache.ctakes.dictionary.cased.encoder; + + +import java.util.Collection; +import java.util.Collections; +import java.util.Map; + +/** + * @author SPF , chip-nlp + * @version %I% + * @since 8/18/2020 + */ +final public class InMemoryEncoder implements TermEncoder { + + private final String _name; + + // Map of rare tokens to terms that contain those tokens. 
Used like "First Word Token Lookup" but faster + private final Map<Long, Collection<TermEncoding>> _encodingMap; + + /** + * @param name unique name for dictionary + * @param encodingMap Map with a cui code as key, and TermEncoding Collection as value + */ + public InMemoryEncoder( final String name, final Map<Long, Collection<TermEncoding>> encodingMap ) { + _name = name; + _encodingMap = encodingMap; + } + + /** + * {@inheritDoc} + */ + @Override + public String getName() { + return _name; + } + + /** + * {@inheritDoc} + */ + @Override + public Collection<TermEncoding> getEncodings( final long cuiCode ) { + return _encodingMap.getOrDefault( cuiCode, Collections.emptyList() ); + } + + +} Added: ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/cased/encoder/JdbcEncoder.java URL: http://svn.apache.org/viewvc/ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/cased/encoder/JdbcEncoder.java?rev=1881994&view=auto ============================================================================== --- ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/cased/encoder/JdbcEncoder.java (added) +++ ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/cased/encoder/JdbcEncoder.java Fri Sep 25 00:59:37 2020 @@ -0,0 +1,185 @@ +package org.apache.ctakes.dictionary.cased.encoder; + + +import org.apache.ctakes.dictionary.cased.table.column.CodeType; +import org.apache.ctakes.dictionary.cased.table.column.SchemaCode; +import org.apache.ctakes.dictionary.cased.util.jdbc.JdbcUtil; +import org.apache.log4j.Logger; +import org.apache.uima.UimaContext; + +import java.sql.PreparedStatement; +import java.sql.ResultSet; +import java.sql.SQLException; +import java.util.ArrayList; +import java.util.Collection; +import java.util.List; + +import static org.apache.ctakes.dictionary.cased.util.jdbc.JdbcUtil.*; + + +/** + * @author SPF , chip-nlp + * @version %I% + 
* @since 8/18/2020 + */ +final public class JdbcEncoder implements TermEncoder { + + static public final String ENCODER_TYPE = "JDBC"; + + static private final Logger LOGGER = Logger.getLogger( "JdbcEncoder" ); + + private final String _name; + private final PreparedStatement _selectCodeStatement; + private final CodeType _codeType; + + + public JdbcEncoder( final String name, final UimaContext uimaContext ) throws SQLException { + this( name, + getParameterValue( name, "driver", uimaContext, HSQL_DRIVER ), + getParameterValue( name, "url", uimaContext, "" ), + getParameterValue( name, "table", uimaContext, name.toUpperCase() ), + getParameterValue( name, "user", uimaContext, DEFAULT_USER ), + getParameterValue( name, "pass", uimaContext, DEFAULT_PASS ), + getParameterValue( name, "class", uimaContext, CodeType.TEXT.name() ) ); + } + + /** + * @param name unique name for dictionary + * @param jdbcDriver - + * @param jdbcUrl - + * @param tableName - + * @param jdbcUser - + * @param jdbcPass - + * @param codeType - + */ + public JdbcEncoder( final String name, + final String jdbcDriver, + final String jdbcUrl, + final String tableName, + final String jdbcUser, + final String jdbcPass, + final String codeType ) throws SQLException { + _name = name; + _selectCodeStatement = JdbcUtil.createPreparedStatement( name, + jdbcDriver, jdbcUrl, jdbcUser, jdbcPass, tableName, SchemaCode.CUI.name() ); + LOGGER.info( "Connected to " + name + " table " + tableName ); + _codeType = CodeType.getCodeType( codeType ); + } + + public String getName() { + return _name; + } + + + /** + * {@inheritDoc} + */ + @Override + public Collection<TermEncoding> getEncodings( final long cuiCode ) { + switch ( _codeType ) { + case TEXT: + return getTextEncodings( cuiCode, SchemaCode.SCHEMA_CODE.getColumn() ); + case LONG: + return getLongEncodings( cuiCode, SchemaCode.SCHEMA_CODE.getColumn() ); + case INT: + return getIntEncodings( cuiCode, SchemaCode.SCHEMA_CODE.getColumn() ); + case TUI: + return 
getTuiEncodings( cuiCode, SchemaCode.SCHEMA_CODE.getColumn() ); + case PREF_TEXT: + return getPrefTextEncodings( cuiCode, SchemaCode.SCHEMA_CODE.getColumn() ); + } + return getTextEncodings( cuiCode, SchemaCode.SCHEMA_CODE.getColumn() ); + } + + + private Collection<TermEncoding> getTextEncodings( final long cuiCode, final int column ) { + final List<TermEncoding> encodings = new ArrayList<>(); + try { + JdbcUtil.fillSelectCall( _selectCodeStatement, cuiCode ); + final ResultSet resultSet = _selectCodeStatement.executeQuery(); + while ( resultSet.next() ) { + encodings.add( new TermEncoding( getName(), resultSet.getString( column ) ) ); + } + // Though the ResultSet interface documentation states that there are automatic closures, + // it is up to the driver to implement this behavior ... historically some drivers have not done so + resultSet.close(); + } catch ( SQLException e ) { + LOGGER.error( e.getMessage() ); + } + return encodings; + } + + + private Collection<TermEncoding> getLongEncodings( final long cuiCode, final int column ) { + final List<TermEncoding> encodings = new ArrayList<>(); + try { + JdbcUtil.fillSelectCall( _selectCodeStatement, cuiCode ); + final ResultSet resultSet = _selectCodeStatement.executeQuery(); + while ( resultSet.next() ) { + encodings.add( new TermEncoding( getName(), resultSet.getLong( column ) ) ); + } + // Though the ResultSet interface documentation states that there are automatic closures, + // it is up to the driver to implement this behavior ... 
historically some drivers have not done so + resultSet.close(); + } catch ( SQLException e ) { + LOGGER.error( e.getMessage() ); + } + return encodings; + } + + + private Collection<TermEncoding> getIntEncodings( final long cuiCode, final int column ) { + final List<TermEncoding> encodings = new ArrayList<>(); + try { + JdbcUtil.fillSelectCall( _selectCodeStatement, cuiCode ); + final ResultSet resultSet = _selectCodeStatement.executeQuery(); + while ( resultSet.next() ) { + encodings.add( new TermEncoding( getName(), resultSet.getInt( column ) ) ); + } + // Though the ResultSet interface documentation states that there are automatic closures, + // it is up to the driver to implement this behavior ... historically some drivers have not done so + resultSet.close(); + } catch ( SQLException e ) { + LOGGER.error( e.getMessage() ); + } + return encodings; + } + + + private Collection<TermEncoding> getTuiEncodings( final long cuiCode, final int column ) { + final List<TermEncoding> encodings = new ArrayList<>(); + try { + JdbcUtil.fillSelectCall( _selectCodeStatement, cuiCode ); + final ResultSet resultSet = _selectCodeStatement.executeQuery(); + while ( resultSet.next() ) { + encodings.add( new TermEncoding( CodeSchema.TUI.name(), resultSet.getInt( column ) ) ); + } + // Though the ResultSet interface documentation states that there are automatic closures, + // it is up to the driver to implement this behavior ... 
historically some drivers have not done so + resultSet.close(); + } catch ( SQLException e ) { + LOGGER.error( e.getMessage() ); + } + return encodings; + } + + + private Collection<TermEncoding> getPrefTextEncodings( final long cuiCode, final int column ) { + final List<TermEncoding> encodings = new ArrayList<>(); + try { + JdbcUtil.fillSelectCall( _selectCodeStatement, cuiCode ); + final ResultSet resultSet = _selectCodeStatement.executeQuery(); + while ( resultSet.next() ) { + encodings.add( new TermEncoding( CodeSchema.PREFERRED_TEXT.name(), resultSet.getString( column ) ) ); + } + // Though the ResultSet interface documentation states that there are automatic closures, + // it is up to the driver to implement this behavior ... historically some drivers have not done so + resultSet.close(); + } catch ( SQLException e ) { + LOGGER.error( e.getMessage() ); + } + return encodings; + } + + +} Added: ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/cased/encoder/TermEncoder.java URL: http://svn.apache.org/viewvc/ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/cased/encoder/TermEncoder.java?rev=1881994&view=auto ============================================================================== --- ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/cased/encoder/TermEncoder.java (added) +++ ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/cased/encoder/TermEncoder.java Fri Sep 25 00:59:37 2020 @@ -0,0 +1,31 @@ +package org.apache.ctakes.dictionary.cased.encoder; + +import org.apache.ctakes.dictionary.cased.lookup.DiscoveredTerm; + +import java.util.Collection; + +/** + * @author SPF , chip-nlp + * @version %I% + * @since 8/17/2020 + */ +public interface TermEncoder { + + /** + * The Type identifier and Name are used to maintain a collection of term encoders, + * so the combination of Type and Name should be unique for each 
encoder if possible. + * + * @return simple name for the encoder + */ + String getName(); + + + default Collection<TermEncoding> getEncodings( final DiscoveredTerm discoveredTerm ) { + return getEncodings( discoveredTerm.getCuiCode() ); + } + + + Collection<TermEncoding> getEncodings( final long cuiCode ); + + +} Added: ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/cased/encoder/TermEncoding.java URL: http://svn.apache.org/viewvc/ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/cased/encoder/TermEncoding.java?rev=1881994&view=auto ============================================================================== --- ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/cased/encoder/TermEncoding.java (added) +++ ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/cased/encoder/TermEncoding.java Fri Sep 25 00:59:37 2020 @@ -0,0 +1,42 @@ +package org.apache.ctakes.dictionary.cased.encoder; + +import jdk.nashorn.internal.ir.annotations.Immutable; + + +/** + * @author SPF , chip-nlp + * @version %I% + * @since 8/18/2020 + */ +@Immutable +final public class TermEncoding { + + private final String _schema; + private final Object _schemaCode; + + + public TermEncoding( final String schema, + final Object schemaCode ) { + _schema = schema; + _schemaCode = schemaCode; + } + + public String getSchema() { + return _schema; + } + + public Object getSchemaCode() { + return _schemaCode; + } + + public boolean equals( final Object object ) { + return object instanceof TermEncoding + && ((TermEncoding)object).getSchema().equals( getSchema() ) + && ((TermEncoding)object).getSchemaCode().equals( getSchemaCode() ); + } + + public int hashCode() { + return (_schema + '_' + _schemaCode).hashCode(); + } + +} Added: ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/cased/lookup/CandidateTerm.java URL: 
http://svn.apache.org/viewvc/ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/cased/lookup/CandidateTerm.java?rev=1881994&view=auto ============================================================================== --- ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/cased/lookup/CandidateTerm.java (added) +++ ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/cased/lookup/CandidateTerm.java Fri Sep 25 00:59:37 2020 @@ -0,0 +1,227 @@ +package org.apache.ctakes.dictionary.cased.lookup; + + +import jdk.nashorn.internal.ir.annotations.Immutable; +import org.apache.ctakes.core.util.StringUtil; +import org.apache.ctakes.dictionary.cased.util.tokenize.TokenizedTerm; + +import java.util.Arrays; + + +/** + * @author SPF , chip-nlp + * @version %I% + * @since 8/14/2020 + */ +@Immutable +final public class CandidateTerm { + + + private final long _cuiCode; + + private final String[] _prefixes; + private final String _rareWord; + private final String[] _suffixes; + final private boolean _allUpperCase; + final private boolean _allLowerCase; + final private boolean _matchesLookupCase; + private final int _rank; + private final int _instances; + + final private int _hashCode; + + + public CandidateTerm( final TokenizedTerm tokenizedTerm, final int rareWordIndex ) { + _cuiCode = tokenizedTerm.getCui(); + final String[] tokens = tokenizedTerm.getTokens(); + _prefixes = rareWordIndex == 0 + ? 
new String[ 0 ] + : Arrays.copyOf( tokens, rareWordIndex ); + _rareWord = tokens[ rareWordIndex ]; + final int suffixLength = tokens.length - rareWordIndex - 1; + _suffixes = new String[ suffixLength ]; + System.arraycopy( tokens, rareWordIndex + 1, _suffixes, 0, suffixLength ); + _allUpperCase = tokenizedTerm.isAllUpperCase(); + _allLowerCase = tokenizedTerm.isAllLowerCase(); + _matchesLookupCase = true; + _hashCode = (_cuiCode + "_" + String.join( " ", tokens )).hashCode(); + _rank = 1; + _instances = 1; + } + + + public CandidateTerm( final long cuiCode, + final String[] tokens, + final int rareWordIndex, + final boolean lookupAllUpper, + final boolean lookupAllLower, + final int rank, + final int instances ) { + _cuiCode = cuiCode; + _prefixes = rareWordIndex == 0 + ? new String[ 0 ] + : Arrays.copyOf( tokens, rareWordIndex ); + _rareWord = tokens[ rareWordIndex ]; + final int suffixLength = tokens.length - rareWordIndex - 1; + _suffixes = new String[ suffixLength ]; + System.arraycopy( tokens, rareWordIndex + 1, _suffixes, 0, suffixLength ); + boolean anyCaps = false; + boolean anyLower = false; + for ( char c : String.join( "", tokens ).toCharArray() ) { + if ( Character.isUpperCase( c ) ) { + anyCaps = true; + } else if ( Character.isLowerCase( c ) ) { + anyLower = true; + } + if ( anyCaps && anyLower ) { + break; + } + } + _allUpperCase = anyCaps && !anyLower; + _allLowerCase = anyLower && !anyCaps; + + _hashCode = (cuiCode + "_" + String.join( " ", tokens )).hashCode(); + _matchesLookupCase = _allUpperCase == lookupAllUpper && _allLowerCase == lookupAllLower; + _rank = rank; + _instances = instances; + } + + + public CandidateTerm( final long cuiCode, + final String prefix, + final String rareWord, + final String suffix, + final boolean lookupAllUpper, + final boolean lookupAllLower, + final int rank, + final int instances ) { + _cuiCode = cuiCode; + _prefixes = prefix.isEmpty() + ? 
new String[ 0 ] + : StringUtil.fastSplit( prefix, ' ' ); + _rareWord = rareWord; + _suffixes = suffix.isEmpty() + ? new String[ 0 ] + : StringUtil.fastSplit( suffix, ' ' ); + boolean anyCaps = false; + boolean anyLower = false; + for ( char c : (prefix + rareWord + suffix).toCharArray() ) { + if ( Character.isUpperCase( c ) ) { + anyCaps = true; + } else if ( Character.isLowerCase( c ) ) { + anyLower = true; + } + if ( anyCaps && anyLower ) { + break; + } + } + _allUpperCase = anyCaps && !anyLower; + _allLowerCase = anyLower && !anyCaps; + _hashCode = (cuiCode + "_" + + (prefix.isEmpty() ? "" : prefix + " ") + + rareWord + + (suffix.isEmpty() ? "" : " " + suffix)) + .hashCode(); + _matchesLookupCase = _allUpperCase == lookupAllUpper && _allLowerCase == lookupAllLower; + _rank = rank; + _instances = instances; + } + + + /** + * @return umls cui for the term + */ + public Long getCuiCode() { + return _cuiCode; + } + + + /** + * @return each token in the term as a separate String + */ + public String[] getTokens() { + final String[] tokens = new String[ _prefixes.length + 1 + _suffixes.length ]; + System.arraycopy( _prefixes, 0, tokens, 0, _prefixes.length ); + tokens[ _prefixes.length ] = _rareWord; + System.arraycopy( _suffixes, 0, tokens, _prefixes.length + 1, _suffixes.length ); + return tokens; + } + + + public String[] getPrefixes() { + return _prefixes; + } + + public String[] getLowerPrefixes() { + if ( isAllLowerCase() ) { + return _prefixes; + } + return Arrays.stream( _prefixes ).map( String::toLowerCase ).toArray( String[]::new ); + } + + + public String[] getSuffixes() { + return _suffixes; + } + + public String[] getLowerSuffixes() { + if ( isAllLowerCase() ) { + return _suffixes; + } + return Arrays.stream( _suffixes ).map( String::toLowerCase ).toArray( String[]::new ); + } + + /** + * @return the index of the rare word used for indexing in the token array + */ + public int getRareWordIndex() { + return _prefixes.length; + } + + + public int 
getTokenCount() { + return _prefixes.length + 1 + _suffixes.length; + } + + public boolean isAllUpperCase() { + return _allUpperCase; + } + + public boolean isAllLowerCase() { + return _allLowerCase; + } + + public boolean matchesLookupCase() { + return _matchesLookupCase; + } + + public int getRank() { + return _rank; + } + + public int getInstances() { + return _instances; + } + + /** + * {@inheritDoc} + */ + @Override + public boolean equals( final Object value ) { + return value instanceof CandidateTerm && value.hashCode() == hashCode(); +// if ( !(value instanceof LookupTerm) ) { +// return false; +// } +// final LookupTerm other = (LookupTerm)value; +// return other.getCuiCode().equals( _cuiCode ) && Arrays.equals( other.getTokens(), getTokens() ); + } + + /** + * {@inheritDoc} + */ + @Override + public int hashCode() { + return _hashCode; + } + +} Added: ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/cased/lookup/ContiguousLookupEngine.java URL: http://svn.apache.org/viewvc/ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/cased/lookup/ContiguousLookupEngine.java?rev=1881994&view=auto ============================================================================== --- ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/cased/lookup/ContiguousLookupEngine.java (added) +++ ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/cased/lookup/ContiguousLookupEngine.java Fri Sep 25 00:59:37 2020 @@ -0,0 +1,236 @@ +package org.apache.ctakes.dictionary.cased.lookup; + +import org.apache.ctakes.core.util.Pair; +import org.apache.ctakes.dictionary.cased.dictionary.CasedDictionary; +import org.apache.log4j.Logger; + +import java.util.*; + +/** + * @author SPF , chip-nlp + * @version %I% + * @since 8/18/2020 + */ +final public class ContiguousLookupEngine { + + static private final Logger LOGGER = Logger.getLogger( 
"ContiguousLookupEngine" ); + + + /** + * Given a dictionary, tokens, and lookup token indices, populate a terms collection with discovered terms + * + * @param dictionary - + * @param lookupTokens - + * @return map of text spans to terms discovered at those text spans. + */ + public final Map<Pair<Integer>, Collection<DiscoveredTerm>> findTerms( final CasedDictionary dictionary, + final List<LookupToken> lookupTokens, + final int consecutiveSkipMax, + final int totalSkipMax ) { + final Map<Pair<Integer>, Collection<DiscoveredTerm>> discoveredTermMap = new HashMap<>(); + int lookupTokenIndex = -1; + Collection<CandidateTerm> candidateTerms; + for ( LookupToken lookupToken : lookupTokens ) { + lookupTokenIndex++; + if ( !lookupToken.isValidIndexToken() ) { + continue; + } + candidateTerms = dictionary.getCandidateTerms( lookupToken ); + if ( candidateTerms == null || candidateTerms.isEmpty() ) { + continue; + } + for ( CandidateTerm candidateTerm : candidateTerms ) { + if ( candidateTerm.getTokenCount() == 1 ) { + // Single word term, add and move on + discoveredTermMap.computeIfAbsent( lookupToken.getTextSpan(), s -> new HashSet<>() ) + .add( new DiscoveredTerm( candidateTerm ) ); + continue; + } + if ( candidateTerm.getPrefixes().length >= lookupTokenIndex + || lookupTokenIndex + candidateTerm.getSuffixes().length >= lookupTokens.size() ) { + // term will extend beyond window + continue; + } + if ( isMismatch( getPrefixMatch( candidateTerm, lookupTokens, lookupTokenIndex ) ) ) { + continue; + } + if ( isMismatch( getSuffixMatch( candidateTerm, lookupTokens, lookupTokenIndex ) ) ) { + continue; + } + final int spanBegin = lookupTokens.get( lookupTokenIndex - candidateTerm.getPrefixes().length ).getBegin(); + final int spanEnd = lookupTokens.get( lookupTokenIndex + candidateTerm.getSuffixes().length ).getEnd(); + discoveredTermMap.computeIfAbsent( new Pair<>( spanBegin, spanEnd ), s -> new HashSet<>() ) + .add( new DiscoveredTerm( candidateTerm ) ); + } + } + return 
discoveredTermMap; + } + + + static private final Pair<Integer> HIT = new Pair<>( 0, 0 ); + static private final Pair<Integer> MISS = new Pair<>( -1, -1 ); + + static private boolean isMismatch( final Pair<Integer> skips ) { + return MISS.equals( skips ); + } + + + /** + * Hopefully the jit will inline this method + * + * @param candidateTerm rare word term to check for match + * @param allTokens all tokens in a window + * @param lookupTokenIndex index of first token in allTokens to check + * @return the consecutiveSkips and totalSkips required to make the prefix fit the tokens. -1,-1 if no fit. + */ + public static Pair<Integer> getPrefixMatch( final CandidateTerm candidateTerm, + final List<LookupToken> allTokens, + final int lookupTokenIndex ) { + final String[] prefixes = candidateTerm.getPrefixes(); + final String[] lowerPrefixes = candidateTerm.getLowerPrefixes(); + if ( prefixes.length == 0 ) { + return HIT; + } + int tokenIndex = lookupTokenIndex - 1; + LookupToken lookupToken = allTokens.get( tokenIndex ); + for ( int i = prefixes.length - 1; i >= 0; i-- ) { + if ( candidateTerm.isAllUpperCase() ) { + if ( !lookupToken.isAllUpperCase() ) { + return MISS; + } + if ( !prefixes[ i ].equals( lookupToken.getText() ) ) { + return MISS; + } + } + if ( !candidateTerm.isAllUpperCase() && !candidateTerm.isAllLowerCase() ) { + if ( !prefixes[ i ].equals( lookupToken.getText() ) ) { + return MISS; + } + } + if ( lowerPrefixes[ i ].equals( lookupToken.getLowerText() ) ) { + tokenIndex--; + lookupToken = allTokens.get( tokenIndex ); + continue; + } + // the token normal didn't match + return MISS; + } + // the token normal matched + return HIT; + } + + /** + * Hopefully the jit will inline this method + * + * @param candidateTerm rare word term to check for match + * @param allTokens all tokens in a window + * @param lookupTokenIndex index of first token in allTokens to check + * @return the consecutiveSkips and totalSkips required to make the prefix fit the tokens. 
-1,-1 if no fit. + */ + public static Pair<Integer> getSuffixMatch( final CandidateTerm candidateTerm, + final List<LookupToken> allTokens, + final int lookupTokenIndex ) { + final String[] suffixes = candidateTerm.getSuffixes(); + // TODO - Do we really want lower-case candidates? + // They should be stored in the dictionary as the desired case. + final String[] lowerSuffixes = candidateTerm.getLowerSuffixes(); + if ( suffixes.length == 0 ) { + return HIT; + } + int tokenIndex = lookupTokenIndex + 1; + LookupToken lookupToken = allTokens.get( tokenIndex ); + for ( int i = 0; i < suffixes.length; i++ ) { + if ( candidateTerm.isAllUpperCase() ) { + if ( !lookupToken.isAllUpperCase() ) { + return MISS; + } + if ( !suffixes[ i ].equals( lookupToken.getText() ) ) { + return MISS; + } + } + if ( !candidateTerm.isAllUpperCase() && !candidateTerm.isAllLowerCase() ) { + if ( !suffixes[ i ].equals( lookupToken.getText() ) ) { + return MISS; + } + } + if ( lowerSuffixes[ i ].equals( lookupToken.getLowerText() ) ) { + tokenIndex--; + lookupToken = allTokens.get( tokenIndex ); + continue; + } + // the token normal didn't match + return MISS; + } + // the token normal matched + return HIT; + } + + + /** + * Hopefully the jit will inline this method + * + * @param candidateTerm rare word term to check for match + * @param allTokens all tokens in a window + * @param lookupTokenIndex index of first token in allTokens to check + * @param consecutiveSkipMax - + * @param totalSkipMax - + * @return the consecutiveSkips and totalSkips required to make the prefix fit the tokens. -1,-1 if no fit. 
+ */ + public static Pair<Integer> getPrefixMatch( final CandidateTerm candidateTerm, + final List<LookupToken> allTokens, + final int lookupTokenIndex, + final int consecutiveSkipMax, + final int totalSkipMax ) { + final String[] prefixes = candidateTerm.getPrefixes(); + if ( prefixes.length == 0 ) { + return HIT; + } + int tokenIndex = lookupTokenIndex - 1; + for ( int i = prefixes.length - 1; i >= 0; i-- ) { + if ( prefixes[ i ].equals( allTokens.get( tokenIndex ).getText() ) ) { + tokenIndex--; + continue; + } + // the token normal didn't match + // TODO Add overlap logic ... + return MISS; + } + // the token normal matched + return HIT; + } + + + /** + * Hopefully the jit will inline this method + * + * @param candidateTerm rare word term to check for match + * @param allTokens all tokens in a window + * @param lookupTokenIndex index of first token in allTokens to check + * @param consecutiveSkipMax - + * @param totalSkipMax - + * @return the consecutiveSkips and totalSkips required to make the prefix fit the tokens. -1,-1 if no fit. + */ + public static Pair<Integer> getSuffixMatch( final CandidateTerm candidateTerm, + final List<LookupToken> allTokens, + final int lookupTokenIndex, + final int consecutiveSkipMax, + final int totalSkipMax ) { + final String[] suffixes = candidateTerm.getSuffixes(); + if ( suffixes.length == 0 ) { + return HIT; + } + int tokenIndex = lookupTokenIndex + 1; + for ( String suffix : suffixes ) { + if ( suffix.equals( allTokens.get( tokenIndex ).getText() ) ) { + tokenIndex++; + continue; + } + // the token normal didn't match + // TODO Add overlap logic ... 
+ return MISS; + } + // the token normal matched + return HIT; + } + +} Added: ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/cased/lookup/DiscoveredTerm.java URL: http://svn.apache.org/viewvc/ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/cased/lookup/DiscoveredTerm.java?rev=1881994&view=auto ============================================================================== --- ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/cased/lookup/DiscoveredTerm.java (added) +++ ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/cased/lookup/DiscoveredTerm.java Fri Sep 25 00:59:37 2020 @@ -0,0 +1,66 @@ +package org.apache.ctakes.dictionary.cased.lookup; + + +import jdk.nashorn.internal.ir.annotations.Immutable; + +/** + * @author SPF , chip-nlp + * @version %I% + * @since 8/17/2020 + */ +@Immutable +final public class DiscoveredTerm { + + private final long _cuiCode; + private final int _consecutiveSkips; + private final int _totalSkips; + private final boolean _matchesLookupCase; + private final int _rank; + private final int _instances; + + public DiscoveredTerm( final CandidateTerm candidateTerm ) { + this( candidateTerm, 0, 0 ); + } + + public DiscoveredTerm( final CandidateTerm candidateTerm, + final int consecutiveSkips, + final int totalSkips ) { + _cuiCode = candidateTerm.getCuiCode(); + _consecutiveSkips = consecutiveSkips; + _totalSkips = totalSkips; + _matchesLookupCase = candidateTerm.matchesLookupCase(); + _rank = candidateTerm.getRank(); + _instances = candidateTerm.getInstances(); + } + + public long getCuiCode() { + return _cuiCode; + } + + public boolean matchesLookupCase() { + return _matchesLookupCase; + } + + /** + * @return rank, where 1 is the "best". + */ + public int getRank() { + return _rank; + } + + /** + * @return number of source vocabularies that have this synonym for this cui. 
+ */ + public int getInstances() { + return _instances; + } + + public int getTotalSkips() { + return _totalSkips; + } + + public int getConsecutiveSkips() { + return _consecutiveSkips; + } + +} Added: ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/cased/lookup/LookupEngine.java URL: http://svn.apache.org/viewvc/ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/cased/lookup/LookupEngine.java?rev=1881994&view=auto ============================================================================== --- ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/cased/lookup/LookupEngine.java (added) +++ ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/cased/lookup/LookupEngine.java Fri Sep 25 00:59:37 2020 @@ -0,0 +1,236 @@ +package org.apache.ctakes.dictionary.cased.lookup; + + +import org.apache.ctakes.core.util.Pair; +import org.apache.ctakes.dictionary.cased.dictionary.CasedDictionary; +import org.apache.log4j.Logger; + +import java.util.*; + +/** + * @author SPF , chip-nlp + * @version %I% + * @since 8/17/2020 + */ +public class LookupEngine { + + static private final Logger LOGGER = Logger.getLogger( "LookupEngine" ); + + + /** + * Given a dictionary, tokens, and lookup token indices, populate a terms collection with discovered terms + * + * @param dictionary - + * @param lookupTokens - + * @return map of text spans to terms discovered at those text spans. 
+ */ + public final Map<Pair<Integer>, Collection<DiscoveredTerm>> findTerms( final CasedDictionary dictionary, + final List<LookupToken> lookupTokens, + final int consecutiveSkipMax, + final int totalSkipMax ) { + final Map<Pair<Integer>, Collection<DiscoveredTerm>> discoveredTermMap = new HashMap<>(); + int lookupTokenIndex = -1; + Collection<CandidateTerm> candidateTerms; + for ( LookupToken lookupToken : lookupTokens ) { + lookupTokenIndex++; + if ( !lookupToken.isValidIndexToken() ) { + continue; + } + candidateTerms = dictionary.getCandidateTerms( lookupToken ); + if ( candidateTerms == null || candidateTerms.isEmpty() ) { + continue; + } + for ( CandidateTerm candidateTerm : candidateTerms ) { + if ( candidateTerm.getTokenCount() == 1 ) { + // Single word term, add and move on + discoveredTermMap.computeIfAbsent( lookupToken.getTextSpan(), s -> new HashSet<>() ) + .add( new DiscoveredTerm( candidateTerm ) ); + continue; + } + if ( candidateTerm.getPrefixes().length >= lookupTokenIndex + || lookupTokenIndex + candidateTerm.getSuffixes().length >= lookupTokens.size() ) { + // term will extend beyond window + continue; + } + if ( isMismatch( getPrefixMatch( candidateTerm, lookupTokens, lookupTokenIndex ) ) ) { + continue; + } + if ( isMismatch( getSuffixMatch( candidateTerm, lookupTokens, lookupTokenIndex ) ) ) { + continue; + } + final int spanBegin = lookupTokens.get( lookupTokenIndex - candidateTerm.getPrefixes().length ).getBegin(); + final int spanEnd = lookupTokens.get( lookupTokenIndex + candidateTerm.getSuffixes().length ).getEnd(); + discoveredTermMap.computeIfAbsent( new Pair<>( spanBegin, spanEnd ), s -> new HashSet<>() ) + .add( new DiscoveredTerm( candidateTerm ) ); + } + } + return discoveredTermMap; + } + + + static private final Pair<Integer> HIT = new Pair<>( 0, 0 ); + static private final Pair<Integer> MISS = new Pair<>( -1, -1 ); + + static private boolean isMismatch( final Pair<Integer> skips ) { + return MISS.equals( skips ); + } + + + /** 
+ * Hopefully the jit will inline this method + * + * @param candidateTerm rare word term to check for match + * @param allTokens all tokens in a window + * @param lookupTokenIndex index of first token in allTokens to check + * @return the consecutiveSkips and totalSkips required to make the prefix fit the tokens. -1,-1 if no fit. + */ + public static Pair<Integer> getPrefixMatch( final CandidateTerm candidateTerm, + final List<LookupToken> allTokens, + final int lookupTokenIndex ) { + final String[] prefixes = candidateTerm.getPrefixes(); + final String[] lowerPrefixes = candidateTerm.getLowerPrefixes(); + if ( prefixes.length == 0 ) { + return HIT; + } + int tokenIndex = lookupTokenIndex - 1; + LookupToken lookupToken = allTokens.get( tokenIndex ); + for ( int i = prefixes.length - 1; i >= 0; i-- ) { + if ( candidateTerm.isAllUpperCase() ) { + if ( !lookupToken.isAllUpperCase() ) { + return MISS; + } + if ( !prefixes[ i ].equals( lookupToken.getText() ) ) { + return MISS; + } + } + if ( !candidateTerm.isAllUpperCase() && !candidateTerm.isAllLowerCase() ) { + if ( !prefixes[ i ].equals( lookupToken.getText() ) ) { + return MISS; + } + } + if ( lowerPrefixes[ i ].equals( lookupToken.getLowerText() ) ) { + tokenIndex--; + lookupToken = allTokens.get( tokenIndex ); + continue; + } + // the token normal didn't match + return MISS; + } + // the token normal matched + return HIT; + } + + /** + * Hopefully the jit will inline this method + * + * @param candidateTerm rare word term to check for match + * @param allTokens all tokens in a window + * @param lookupTokenIndex index of first token in allTokens to check + * @return the consecutiveSkips and totalSkips required to make the prefix fit the tokens. -1,-1 if no fit. 
+ */ + public static Pair<Integer> getSuffixMatch( final CandidateTerm candidateTerm, + final List<LookupToken> allTokens, + final int lookupTokenIndex ) { + final String[] suffixes = candidateTerm.getSuffixes(); + final String[] lowerSuffixes = candidateTerm.getLowerSuffixes(); + if ( suffixes.length == 0 ) { + return HIT; + } + int tokenIndex = lookupTokenIndex + 1; + LookupToken lookupToken = allTokens.get( tokenIndex ); + for ( int i = 0; i < suffixes.length; i++ ) { + if ( candidateTerm.isAllUpperCase() ) { + if ( !lookupToken.isAllUpperCase() ) { + return MISS; + } + if ( !suffixes[ i ].equals( lookupToken.getText() ) ) { + return MISS; + } + } + if ( !candidateTerm.isAllUpperCase() && !candidateTerm.isAllLowerCase() ) { + if ( !suffixes[ i ].equals( lookupToken.getText() ) ) { + return MISS; + } + } + if ( lowerSuffixes[ i ].equals( lookupToken.getLowerText() ) ) { + tokenIndex--; + lookupToken = allTokens.get( tokenIndex ); + continue; + } + // the token normal didn't match + return MISS; + } + // the token normal matched + return HIT; + } + + +// /** +// * Hopefully the jit will inline this method +// * +// * @param candidateTerm rare word term to check for match +// * @param allTokens all tokens in a window +// * @param lookupTokenIndex index of first token in allTokens to check +// * @param consecutiveSkipMax - +// * @param totalSkipMax - +// * @return the consecutiveSkips and totalSkips required to make the prefix fit the tokens. -1,-1 if no fit. 
+// */ +// public static Pair<Integer> getPrefixMatch( final CandidateTerm candidateTerm, +// final List<LookupToken> allTokens, +// final int lookupTokenIndex, +// final int consecutiveSkipMax, +// final int totalSkipMax ) { +// final String[] prefixes = candidateTerm.getPrefixes(); +// if ( prefixes.length == 0 ) { +// return HIT; +// } +// int tokenIndex = lookupTokenIndex-1; +// for ( int i = prefixes.length-1; i >=0; i-- ) { +// if ( prefixes[ i ].equals( allTokens.get( tokenIndex ).getText() ) ) { +// tokenIndex--; +// continue; +// } +// // the token normal didn't match +// // TODO Add overlap logic ... +// return MISS; +// } +// // the token normal matched +// return HIT; +// } +// +// +// /** +// * Hopefully the jit will inline this method +// * +// * @param candidateTerm rare word term to check for match +// * @param allTokens all tokens in a window +// * @param lookupTokenIndex index of first token in allTokens to check +// * @param consecutiveSkipMax - +// * @param totalSkipMax - +// * @return the consecutiveSkips and totalSkips required to make the prefix fit the tokens. -1,-1 if no fit. +// */ +// public static Pair<Integer> getSuffixMatch( final CandidateTerm candidateTerm, +// final List<LookupToken> allTokens, +// final int lookupTokenIndex, +// final int consecutiveSkipMax, +// final int totalSkipMax ) { +// final String[] suffixes = candidateTerm.getSuffixes(); +// if ( suffixes.length == 0 ) { +// return HIT; +// } +// int tokenIndex = lookupTokenIndex+1; +// for ( String suffix : suffixes ) { +// if ( suffix.equals( allTokens.get( tokenIndex ).getText() ) ) { +// tokenIndex++; +// continue; +// } +// // the token normal didn't match +// // TODO Add overlap logic ... 
+// return MISS; +// } +// // the token normal matched +// return HIT; +// } + + +} Added: ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/cased/lookup/LookupToken.java URL: http://svn.apache.org/viewvc/ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/cased/lookup/LookupToken.java?rev=1881994&view=auto ============================================================================== --- ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/cased/lookup/LookupToken.java (added) +++ ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/cased/lookup/LookupToken.java Fri Sep 25 00:59:37 2020 @@ -0,0 +1,124 @@ +package org.apache.ctakes.dictionary.cased.lookup; + + +import jdk.nashorn.internal.ir.annotations.Immutable; +import org.apache.ctakes.core.util.Pair; +import org.apache.ctakes.typesystem.type.syntax.BaseToken; + +/** + * @author SPF , chip-nlp + * @version %I% + * @since 8/13/2020 + */ +@Immutable +final public class LookupToken { + + final private Pair<Integer> _textSpan; + final private String _text; + final private boolean _allCaps; + final private boolean _allLower; + final private boolean _isValidIndexToken; + + public LookupToken( final BaseToken baseToken, final boolean isValidIndexToken ) { + _textSpan = new Pair<>( baseToken.getBegin(), baseToken.getEnd() ); + // All case-sensitivity is handled here. This is the text in the note. 
+ _text = baseToken.getCoveredText(); + boolean anyCaps = false; + boolean anyLower = false; + for ( char c : _text.toCharArray() ) { + if ( Character.isUpperCase( c ) ) { + anyCaps = true; + } else if ( Character.isLowerCase( c ) ) { + anyLower = true; + } + if ( anyCaps && anyLower ) { + break; + } + } + _allCaps = anyCaps && !anyLower; + _allLower = anyLower && !anyCaps; + _isValidIndexToken = isValidIndexToken; + } + + /** + * @return a span with the start and end indices used for this lookup token + */ + public Pair<Integer> getTextSpan() { + return _textSpan; + } + + /** + * @return the start index used for this lookup token + */ + public int getBegin() { + return _textSpan.getValue1(); + } + + /** + * @return the end index used for this lookup token + */ + public int getEnd() { + return _textSpan.getValue2(); + } + + /** + * @return the length of the text span in characters + */ + public int getLength() { + return _text.length(); + } + + /** + * @return the actual text in the document for the lookup token, regardless of case. + */ + public String getText() { + return _text; + } + + /** + * @return the actual text in the document for the lookup token, regardless of case. + */ + public String getLowerText() { + if ( _allLower ) { + return _text; + } + return _text.toLowerCase(); + } + + /** + * @return true if the text characters are all upper case. + */ + public boolean isAllUpperCase() { + return _allCaps; + } + + /** + * @return true if the text characters are all lower case. + */ + public boolean isAllLowerCase() { + return _allLower; + } + + public boolean isValidIndexToken() { + return _isValidIndexToken; + } + + /** + * Two lookup tokens are equal iff the spans are equal. 
+ * + * @param value - + * @return true if {@code value} is a {@code FastLookupToken} and has a span equal to this token's span + */ + public boolean equals( final Object value ) { + return value instanceof LookupToken + && _textSpan.equals( ((LookupToken)value).getTextSpan() ); + } + + /** + * @return hashCode created from the Span + */ + public int hashCode() { + return _textSpan.hashCode(); + } + +} Added: ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/cased/table/column/CodeType.java URL: http://svn.apache.org/viewvc/ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/cased/table/column/CodeType.java?rev=1881994&view=auto ============================================================================== --- ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/cased/table/column/CodeType.java (added) +++ ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/cased/table/column/CodeType.java Fri Sep 25 00:59:37 2020 @@ -0,0 +1,35 @@ +package org.apache.ctakes.dictionary.cased.table.column; + +/** + * @author SPF , chip-nlp + * @version %I% + * @since 8/18/2020 + */ +public enum CodeType { + INT, + LONG, + TEXT, + TUI, + PREF_TEXT; +// VARCHAR(48) , BIGINT , FLOAT , INTEGER ?? + + + /** + * Sending a nonexistant name to enum .valueof( .. ) will throw an IllegalArgumentException. + * + * @param name - + * @return - + */ + static public CodeType getCodeType( final String name ) { + final String upper = name.toUpperCase(); + for ( CodeType codeType : CodeType.values() ) { + if ( codeType.name().equals( upper ) ) { + return codeType; + } + } + // Return TEXT as a default. 
/**
 * Columns of a schema code table, each paired with a fixed column number.
 * NOTE(review): the numbers look like 1-based column indices - confirm against the table readers.
 *
 * @author SPF , chip-nlp
 * @version %I%
 * @since 8/18/2020
 */
public enum SchemaCode {
   CUI( 1 ),
   SCHEMA_CODE( 2 );

   private final int _column;

   /**
    * @param column number reported by {@link #getColumn()}
    */
   SchemaCode( final int column ) {
      this._column = column;
   }

   /**
    * @return the column number assigned to this constant
    */
   public int getColumn() {
      return this._column;
   }

}
RANK( 5, Integer.class ), + INSTANCES( 6, Integer.class ); + + final private int _column; + final private Class<?> _class; + + Synonym( final int column, final Class<?> clazz ) { + _column = column; + _class = clazz; + } + + public int getColumn() { + return _column; + } + + public Class<?> getClassType() { + return _class; + } + +} Added: ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/cased/util/bsv/BsvFileParser.java URL: http://svn.apache.org/viewvc/ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/cased/util/bsv/BsvFileParser.java?rev=1881994&view=auto ============================================================================== --- ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/cased/util/bsv/BsvFileParser.java (added) +++ ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/cased/util/bsv/BsvFileParser.java Fri Sep 25 00:59:37 2020 @@ -0,0 +1,51 @@ +package org.apache.ctakes.dictionary.cased.util.bsv; + + +import org.apache.ctakes.core.resource.FileLocator; + +import java.io.BufferedReader; +import java.io.IOException; +import java.io.InputStreamReader; +import java.util.ArrayList; +import java.util.Collection; + +/** + * @author SPF , chip-nlp + * @version %I% + * @since 8/18/2020 + */ +final public class BsvFileParser { + + private BsvFileParser() { + } + + + static public Collection<String[]> parseBsvFile( final String bsvFilePath ) throws IOException { + return parseBsvFile( bsvFilePath, Integer.MAX_VALUE ); + } + + + static public Collection<String[]> parseBsvFile( final String bsvFilePath, + final int columnCount ) throws IOException { + return parseBsvFile( bsvFilePath, new StringArrayCreator( columnCount ) ); + } + + + static public <T> Collection<T> parseBsvFile( final String bsvFilePath, + final BsvObjectCreator<T> objectCreator ) throws IOException { + final Collection<T> bsvObjects = new ArrayList<>(); 
+ final BufferedReader reader + = new BufferedReader( new InputStreamReader( FileLocator.getAsStream( bsvFilePath ) ) ); + String line = reader.readLine(); + while ( line != null ) { + final T bsvObject = objectCreator.createBsvObject( line ); + if ( bsvObject != null ) { + bsvObjects.add( bsvObject ); + } + line = reader.readLine(); + } + return bsvObjects; + } + + +} Added: ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/cased/util/bsv/BsvObjectCreator.java URL: http://svn.apache.org/viewvc/ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/cased/util/bsv/BsvObjectCreator.java?rev=1881994&view=auto ============================================================================== --- ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/cased/util/bsv/BsvObjectCreator.java (added) +++ ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/cased/util/bsv/BsvObjectCreator.java Fri Sep 25 00:59:37 2020 @@ -0,0 +1,35 @@ +package org.apache.ctakes.dictionary.cased.util.bsv; + +import org.apache.ctakes.core.util.StringUtil; + +import java.util.Arrays; + +/** + * @author SPF , chip-nlp + * @version %I% + * @since 8/18/2020 + */ +public interface BsvObjectCreator<T> { + + T createBsvObject( final String[] columns ); + + default T createBsvObject( final String line ) { + if ( isCommentLine( line ) ) { + return null; + } + final String[] columns = StringUtil.fastSplit( line, '|' ); + if ( isAnyColumnEmpty( columns ) ) { + return null; + } + return createBsvObject( columns ); + } + + default boolean isCommentLine( final String line ) { + return line.isEmpty() || line.startsWith( "//" ) || line.startsWith( "#" ); + } + + default boolean isAnyColumnEmpty( final String[] columns ) { + return Arrays.stream( columns ).anyMatch( c -> c.trim().isEmpty() ); + } + +} Added: 
ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/cased/util/bsv/StringArrayCreator.java URL: http://svn.apache.org/viewvc/ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/cased/util/bsv/StringArrayCreator.java?rev=1881994&view=auto ============================================================================== --- ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/cased/util/bsv/StringArrayCreator.java (added) +++ ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/cased/util/bsv/StringArrayCreator.java Fri Sep 25 00:59:37 2020 @@ -0,0 +1,29 @@ +package org.apache.ctakes.dictionary.cased.util.bsv; + + +/** + * @author SPF , chip-nlp + * @version %I% + * @since 8/18/2020 + */ +public class StringArrayCreator implements BsvObjectCreator<String[]> { + + private final int _columnCount; + + public StringArrayCreator() { + this( Integer.MAX_VALUE ); + } + + public StringArrayCreator( final int columnCount ) { + _columnCount = columnCount; + } + + public String[] createBsvObject( final String[] columns ) { + if ( _columnCount != Integer.MAX_VALUE && columns.length != _columnCount ) { + return null; + } + return columns; + } + + +} Added: ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/cased/util/jdbc/JdbcUtil.java URL: http://svn.apache.org/viewvc/ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/cased/util/jdbc/JdbcUtil.java?rev=1881994&view=auto ============================================================================== --- ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/cased/util/jdbc/JdbcUtil.java (added) +++ ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/cased/util/jdbc/JdbcUtil.java Fri Sep 25 00:59:37 2020 @@ -0,0 +1,100 @@ +package 
org.apache.ctakes.dictionary.cased.util.jdbc; + + +import org.apache.ctakes.dictionary.lookup2.util.JdbcConnectionFactory; +import org.apache.ctakes.utils.env.EnvironmentVariable; +import org.apache.uima.UimaContext; + +import java.sql.Connection; +import java.sql.PreparedStatement; +import java.sql.SQLException; + +/** + * @author SPF , chip-nlp + * @version %I% + * @since 8/18/2020 + */ +final public class JdbcUtil { + + private JdbcUtil() { + } + + static public final String HSQL_DRIVER = "org.hsqldb.jdbcDriver"; + static public final String UPPER_TABLE = "UPPER"; + static public final String MIXED_TABLE = "MIXED"; + static public final String LOWER_TABLE = "LOWER"; + static public final String DEFAULT_USER = "sa"; + static public final String DEFAULT_PASS = ""; + + + static public String getParameterValue( final String rootName, + final String parameterName, + final UimaContext uimaContext, + final String defaultValue ) { + final String value = EnvironmentVariable.getEnv( rootName + '_' + parameterName, uimaContext ); + if ( value != null && !value.equals( EnvironmentVariable.NOT_PRESENT ) ) { + return value; + } + return defaultValue; + } + + + static public PreparedStatement createPreparedStatement( final String name, + final String jdbcDriver, + final String jdbcUrl, + final String jdbcUser, + final String jdbcPass, + final String tableName, + final String indexName ) throws SQLException { + if ( jdbcDriver == null || jdbcDriver.isEmpty() ) { + throw new SQLException( "No JDBC Driver specified for " + name ); + } + if ( jdbcUrl == null || jdbcUrl.isEmpty() ) { + throw new SQLException( "No URL specified for " + name ); + } + if ( tableName == null || tableName.isEmpty() ) { + throw new SQLException( "No Table specified for " + name ); + } + // DO NOT use try with resources here. 
Try with resources uses a closable and closes it when exiting the try + final Connection connection = JdbcConnectionFactory.getInstance() + .getConnection( jdbcDriver, jdbcUrl, jdbcUser, jdbcPass ); + if ( connection == null ) { + throw new SQLException( "Could not connect to " + name ); + } + return createSelectCall( connection, tableName, indexName ); + } + + + /** + * @return an sql call to use for term lookup + * @throws SQLException if the {@code PreparedStatement} could not be created or changed + */ + static private PreparedStatement createSelectCall( final Connection connection, + final String table, + final String index ) throws SQLException { + final String lookupSql = "SELECT * FROM " + table + " WHERE " + index + " = ?"; + return connection.prepareStatement( lookupSql ); + } + + /** + * @param statement an sql call to use for lookup + * @param text of the text to use for lookup + * @throws SQLException if the {@code PreparedStatement} could not be created or changed + */ + static public void fillSelectCall( final PreparedStatement statement, final String text ) throws SQLException { + statement.clearParameters(); + statement.setString( 1, text ); + } + + /** + * @param statement an sql call to use for lookup + * @param value of the long to use for lookup + * @throws SQLException if the {@code PreparedStatement} could not be created or changed + */ + static public void fillSelectCall( final PreparedStatement statement, final long value ) throws SQLException { + statement.clearParameters(); + statement.setLong( 1, value ); + } + + +} Added: ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/cased/util/textspan/ContiguousTextSpan.java URL: http://svn.apache.org/viewvc/ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/cased/util/textspan/ContiguousTextSpan.java?rev=1881994&view=auto ============================================================================== --- 
ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/cased/util/textspan/ContiguousTextSpan.java (added) +++ ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/cased/util/textspan/ContiguousTextSpan.java Fri Sep 25 00:59:37 2020 @@ -0,0 +1,53 @@ +package org.apache.ctakes.dictionary.cased.util.textspan; + +import org.apache.ctakes.core.util.Pair; + + +/** + * @author SPF , chip-nlp + * @version %I% + * @since 8/19/2020 + */ +public final class ContiguousTextSpan implements MagicTextSpan { + private final Pair<Integer> _span; + + public ContiguousTextSpan( final int begin, final int end ) { + this( new Pair<>( begin, end ) ); + } + + public ContiguousTextSpan( final Pair<Integer> span ) { + _span = span; + } + + public Pair<Integer> toIntPair() { + return _span; + } + + public int getBegin() { + return _span.getValue1(); + } + + public int getEnd() { + return _span.getValue2(); + } + + /** + * {@inheritDoc} + */ + @Override + public boolean equals( final Object other ) { + return other instanceof ContiguousTextSpan + && ((ContiguousTextSpan)other).getBegin() == getBegin() + && ((ContiguousTextSpan)other).getEnd() == getEnd(); + } + + /** + * {@inheritDoc} + */ + @Override + public int hashCode() { + return _span.hashCode(); + } + + +} Added: ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/cased/util/textspan/DiscontiguousTextSpan.java URL: http://svn.apache.org/viewvc/ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/cased/util/textspan/DiscontiguousTextSpan.java?rev=1881994&view=auto ============================================================================== --- ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/cased/util/textspan/DiscontiguousTextSpan.java (added) +++ 
package org.apache.ctakes.dictionary.cased.util.textspan;

import org.apache.ctakes.core.util.Pair;

import java.util.*;

/**
 * A {@link MagicTextSpan} with gaps.  It is described by one enclosing (begin, end) pair plus the
 * contiguous pieces that remain inside that pair once the missing ranges have been cut out.
 *
 * @author SPF , chip-nlp
 * @version %I%
 * @since 8/19/2020
 */
public final class DiscontiguousTextSpan implements MagicTextSpan {
   private final Pair<Integer> _span;
   private final Collection<MagicTextSpan> _presentTextSpans;

   /**
    * @param span         enclosing (begin, end) pair
    * @param missingSpans (begin, end) pairs of the ranges that are absent from the enclosing span
    */
   private DiscontiguousTextSpan( final Pair<Integer> span, final Collection<Pair<Integer>> missingSpans ) {
      _span = span;
      _presentTextSpans = createPresentSpans( span, missingSpans );
   }

   /**
    * @return the enclosing pair instance given to the constructor
    */
   @Override
   public Pair<Integer> toIntPair() {
      return _span;
   }

   /**
    * {@inheritDoc}
    */
   @Override
   public int getBegin() {
      return _span.getValue1();
   }

   /**
    * {@inheritDoc}
    */
   @Override
   public int getEnd() {
      return _span.getValue2();
   }


   /**
    * @return the contiguous pieces of this span that are actually present.  This is the internal collection.
    */
   public Collection<MagicTextSpan> getPresentSpans() {
      return _presentTextSpans;
   }

   /**
    * Walks the missing ranges in order of their begin offset and collects what lies between them:
    * one piece in front of each missing range and one final piece running to the end of the enclosing span.
    *
    * @param span         enclosing (begin, end) pair
    * @param missingSpans (begin, end) pairs of the absent ranges, in any order
    * @return the set of contiguous pieces left over
    */
   static private Collection<MagicTextSpan> createPresentSpans( final Pair<Integer> span,
                                                                final Collection<Pair<Integer>> missingSpans ) {
      final List<Pair<Integer>> orderedGaps = new ArrayList<>( missingSpans );
      orderedGaps.sort( Comparator.comparingInt( Pair::getValue1 ) );

      final Collection<MagicTextSpan> pieces = new HashSet<>( missingSpans.size() + 1 );
      int pieceBegin = span.getValue1();
      for ( final Pair<Integer> gap : orderedGaps ) {
         // The piece in front of this gap runs up to where the gap begins ...
         pieces.add( new ContiguousTextSpan( pieceBegin, gap.getValue1() ) );
         // ... and the next piece starts where the gap ends.
         pieceBegin = gap.getValue2();
      }
      pieces.add( new ContiguousTextSpan( pieceBegin, span.getValue2() ) );
      return pieces;
   }

   /**
    * @param candidate some text span
    * @return true if at least one present piece of this span {@code contains} the candidate
    */
   private boolean isWithinPresentSpan( final MagicTextSpan candidate ) {
      return getPresentSpans().stream().anyMatch( piece -> piece.contains( candidate ) );
   }

   /**
    * @param candidate some text span
    * @return true if at least one present piece of this span {@code fullyContains} the candidate
    */
   private boolean isFullyWithinPresentSpan( final MagicTextSpan candidate ) {
      return getPresentSpans().stream().anyMatch( piece -> piece.fullyContains( candidate ) );
   }

   /**
    * The enclosing span must contain the given span.  Beyond that, a contiguous span must fit within a single
    * present piece, and every present piece of a discontiguous span must fit within some present piece of this span.
    * Any other kind of span only needs to pass the enclosing check.
    *
    * @param textSpan another text span
    * @return true if the given span lies within the present pieces of this span as described above
    */
   @Override
   public boolean containsAll( final MagicTextSpan textSpan ) {
      if ( !contains( textSpan ) ) {
         return false;
      }
      if ( textSpan instanceof ContiguousTextSpan ) {
         return isWithinPresentSpan( textSpan );
      }
      if ( textSpan instanceof DiscontiguousTextSpan ) {
         return ((DiscontiguousTextSpan)textSpan).getPresentSpans()
                                                 .stream()
                                                 .allMatch( this::isWithinPresentSpan );
      }
      return true;
   }

   /**
    * The enclosing span must fully contain the given span.  Beyond that, a contiguous span must be fully contained
    * by a single present piece.  For a discontiguous span every one of its present pieces must be contained by some
    * present piece of this span, and at least one of them must be fully contained.
    * Any other kind of span only needs to pass the enclosing check.
    *
    * @param textSpan another text span
    * @return true if the given span lies within the present pieces of this span as described above
    */
   @Override
   public boolean fullyContainsAll( final MagicTextSpan textSpan ) {
      if ( !fullyContains( textSpan ) ) {
         return false;
      }
      if ( textSpan instanceof ContiguousTextSpan ) {
         return isFullyWithinPresentSpan( textSpan );
      }
      if ( textSpan instanceof DiscontiguousTextSpan ) {
         final Collection<MagicTextSpan> theirPieces = ((DiscontiguousTextSpan)textSpan).getPresentSpans();
         return theirPieces.stream().allMatch( this::isWithinPresentSpan )
                && theirPieces.stream().anyMatch( this::isFullyWithinPresentSpan );
      }
      return true;
   }


   /**
    * Equal when the other object is also a discontiguous span with the same begin, end and present pieces.
    * {@inheritDoc}
    */
   @Override
   public boolean equals( final Object other ) {
      if ( !(other instanceof DiscontiguousTextSpan) ) {
         return false;
      }
      final DiscontiguousTextSpan that = (DiscontiguousTextSpan)other;
      return that.getBegin() == getBegin()
             && that.getEnd() == getEnd()
             && that.getPresentSpans().equals( getPresentSpans() );
   }

   /**
    * {@inheritDoc}
    */
   @Override
   public int hashCode() {
      final int enclosingHash = _span.hashCode();
      final int piecesHash = _presentTextSpans.hashCode();
      return enclosingHash + piecesHash;
   }

}
// ===== org/apache/ctakes/dictionary/cased/util/textspan/MagicTextSpan.java =====
package org.apache.ctakes.dictionary.cased.util.textspan;


import org.apache.ctakes.core.util.Pair;

/**
 * A span of text described by a begin offset and an end offset.
 * Text spans are begin inclusive and end exclusive.
 *
 * @author SPF , chip-nlp
 * @version %I%
 * @since 8/19/2020
 */
public interface MagicTextSpan {

   /**
    * @return begin offset of the span (inclusive)
    */
   int getBegin();

   /**
    * @return end offset of the span (exclusive)
    */
   int getEnd();

   /**
    * @return a new pair holding (begin, end)
    */
   default Pair<Integer> toIntPair() {
      return new Pair<>( getBegin(), getEnd() );
   }

   /**
    * @return end minus begin
    */
   default int getWidth() {
      return getEnd() - getBegin();
   }

   /**
    * NOTE: TextSpans are begin inclusive end exclusive.
    * So, 1 is subtracted from the end when comparing to another begin
    *
    * @param textSpan another textspan
    * @return true if there is overlap between the two text spans
    */
   default boolean overlaps( final MagicTextSpan textSpan ) {
      return !(textSpan.getEnd() - 1 < getBegin()) && !(textSpan.getBegin() > getEnd() - 1);
   }

   /**
    * @param textSpan another textspan
    * @return true if the other span begins at or after this begin and ends at or before this end
    */
   default boolean contains( final MagicTextSpan textSpan ) {
      return getBegin() <= textSpan.getBegin() && textSpan.getEnd() <= getEnd();
   }

   /**
    * Like {@link #contains(MagicTextSpan)}, but at least one of the two bounds must be strictly inside this span.
    * A span with identical begin and end is therefore not fully contained.
    *
    * @param textSpan another textspan
    * @return true if this span contains the other span and extends beyond it on at least one side
    */
   default boolean fullyContains( final MagicTextSpan textSpan ) {
      return (getBegin() < textSpan.getBegin() && textSpan.getEnd() <= getEnd())
             || (getBegin() <= textSpan.getBegin() && textSpan.getEnd() < getEnd());
   }

   /**
    * For discontiguous spans, every part of this span must include every part of that span.
    * This default does not look at parts; it is the same as {@link #contains(MagicTextSpan)}.
    *
    * @param textSpan another textspan
    * @return true if this span contains the other span
    */
   default boolean containsAll( MagicTextSpan textSpan ) {
      return contains( textSpan );
   }

   /**
    * For discontiguous spans, every part of this span must include every part of that span.
    * This default does not look at parts; it is the same as {@link #fullyContains(MagicTextSpan)}.
    *
    * @param textSpan another textspan
    * @return true if this span fully contains the other span
    */
   default boolean fullyContainsAll( MagicTextSpan textSpan ) {
      return fullyContains( textSpan );
   }

}

// ===== org/apache/ctakes/dictionary/cased/util/tokenize/TokenizedTerm.java =====
package org.apache.ctakes.dictionary.cased.util.tokenize;

import org.apache.ctakes.dictionary.lookup2.util.CuiCodeUtil;

import java.util.*;
import java.util.stream.Collectors;

/**
 * A dictionary term split into tokens, together with the code of its cui and two flags that tell
 * whether the cased characters of the original text were all upper case or all lower case.
 * All fields are final and set once in the constructor.  {@link #getTokens()} hands out the internal
 * array without copying it, so callers must not modify the returned array.
 *
 * @author SPF , chip-nlp
 * @version %I%
 * @since 8/17/2020
 */
final public class TokenizedTerm {

   // Lower case word beginnings after which a dash stays part of the token.
   static private final Collection<String> PREFIXES = new HashSet<>( Arrays.asList(
         "e-",
         "a-",
         "u-",
         "x-",
         "agro-",
         "ante-",
         "anti-",
         "arch-",
         "be-",
         "bi-",
         "bio-",
         "co-",
         "counter-",
         "cross-",
         "cyber-",
         "de-",
         "eco-",
         "ex-",
         "extra-",
         "inter-",
         "intra-",
         "macro-",
         "mega-",
         "micro-",
         "mid-",
         "mini-",
         "multi-",
         "neo-",
         "non-",
         "over-",
         "pan-",
         "para-",
         "peri-",
         "post-",
         "pre-",
         "pro-",
         "pseudo-",
         "quasi-",
         "re-",
         "semi-",
         "sub-",
         "super-",
         "tri-",
         "ultra-",
         "un-",
         "uni-",
         "vice-",
         // From email from Colin Warner <[email protected]> on 7/25/2010
         "electro-",
         // NOTE(review): "gasto-" may have been meant as "gastro-"; left unchanged - confirm against the tokenizer
         "gasto-",
         "homo-",
         "hetero-",
         "ortho-",
         "phospho-" ) );

   // Lower case word endings in front of which a dash stays part of the token.
   static private final Collection<String> SUFFIXES = new HashSet<>( Arrays.asList(
         "-esque",
         "-ette",
         "-fest",
         "-fold",
         "-gate",
         "-itis",
         "-less",
         "-most",
         "-o-torium",
         "-rama",
         "-wise" ) );

   // Locale.ROOT keeps the upper case forms plain ASCII whatever the default locale of the jvm is.
   static private final Collection<String> UPPER_PREFIXES = PREFIXES.stream()
                                                                    .map( prefix -> prefix.toUpperCase( Locale.ROOT ) )
                                                                    .collect( Collectors.toSet() );

   static private final Collection<String> UPPER_SUFFIXES = SUFFIXES.stream()
                                                                    .map( suffix -> suffix.toUpperCase( Locale.ROOT ) )
                                                                    .collect( Collectors.toSet() );


   final private String[] _tokens;
   final private boolean _allUpperCase;
   final private boolean _allLowerCase;
   final private Long _cui;
   final private int _hashcode;

   /**
    * @param cui  cui of the term; converted to its code with {@code CuiCodeUtil}
    * @param text text of the term; split on whitespace and then into tokens
    */
   public TokenizedTerm( final String cui, final String text ) {
      _cui = CuiCodeUtil.getInstance().getCuiCode( cui );
      _tokens = getTermTokens( text );
      boolean anyCaps = false;
      boolean anyLower = false;
      final int length = text.length();
      for ( int i = 0; i < length; i++ ) {
         final char c = text.charAt( i );
         if ( Character.isUpperCase( c ) ) {
            anyCaps = true;
         } else if ( Character.isLowerCase( c ) ) {
            anyLower = true;
         }
         if ( anyCaps && anyLower ) {
            // Mixed case, nothing more to learn.
            break;
         }
      }
      _allUpperCase = anyCaps && !anyLower;
      _allLowerCase = anyLower && !anyCaps;
      // Hash exactly what equals compares (tokens and cui code), so that equal terms share a hash code
      // even when their original texts differed, for instance in the amount of whitespace.
      _hashcode = 31 * Arrays.hashCode( _tokens ) + Objects.hashCode( _cui );
   }

   /**
    * @return code of the cui for this term
    */
   public long getCui() {
      return _cui;
   }

   /**
    * @return tokens of the term.  This is the internal array, not a copy.
    */
   public String[] getTokens() {
      return _tokens;
   }

   /**
    * @return true if the text had at least one upper case character and no lower case character
    */
   public boolean isAllUpperCase() {
      return _allUpperCase;
   }

   /**
    * @return true if the text had at least one lower case character and no upper case character
    */
   public boolean isAllLowerCase() {
      return _allLowerCase;
   }


   /**
    * @param text text of a term
    * @return tokens of every whitespace separated word in the text, in order.  Empty for empty text.
    */
   static private String[] getTermTokens( final String text ) {
      if ( text.isEmpty() ) {
         return new String[ 0 ];
      }
      return Arrays.stream( text.split( "\\s+" ) )
                   .map( TokenizedTerm::getTokens )
                   .flatMap( Collection::stream )
                   .toArray( String[]::new );
   }

   /**
    * Splits a single word into tokens.  Letters and digits are gathered into the current token.
    * A dash stays inside the current token when it follows a known prefix or precedes a known suffix.
    * An apostrophe before a final s, or a period before a single final digit, begins a new token.
    * Any other character ends the current token and becomes a token of its own.
    *
    * @param word a word without whitespace
    * @return tokens of the word, in order
    */
   // TODO should this be exactly the same as getTokens in TextTokenizer (dictionary gui code) ?  probably ...
   static private List<String> getTokens( final String word ) {
      final List<String> tokens = new ArrayList<>();
      final StringBuilder sb = new StringBuilder();
      final int count = word.length();
      for ( int i = 0; i < count; i++ ) {
         final char c = word.charAt( i );
         if ( Character.isLetterOrDigit( c ) ) {
            sb.append( c );
            continue;
         }
         if ( c == '-' && (isPrefix( sb.toString() ) || isSuffix( word, i + 1 )) ) {
            // what precedes is a prefix or what follows is a suffix so append the dash to the current word and move on
            sb.append( c );
            continue;
         }
         if ( (c == '\'' && isOwnerApostrophe( word, i + 1 ))
              || (c == '.' && isNumberDecimal( word, i + 1 )) ) {
            // what follows is an 's or .# so add the preceding and move on
            if ( sb.length() != 0 ) {
               tokens.add( createToken( sb ) );
               sb.setLength( 0 );
            }
            sb.append( c );
            continue;
         }
         // Wasn't a special symbol for consideration, so add the previous and symbol separately
         if ( sb.length() != 0 ) {
            tokens.add( createToken( sb ) );
            sb.setLength( 0 );
         }
         tokens.add( "" + c );
      }
      if ( sb.length() != 0 ) {
         tokens.add( createToken( sb ) );
      }
      return tokens;
   }

   /**
    * @param sb characters gathered for one token
    * @return the token text
    */
   static private String createToken( final StringBuilder sb ) {
      return sb.toString();
   }

   /**
    * @param word characters gathered in front of a dash
    * @return true if the word plus a dash is a known prefix, either all lower case or all upper case
    */
   static private boolean isPrefix( final String word ) {
      return PREFIXES.contains( word + "-" ) || UPPER_PREFIXES.contains( word + "-" );
   }

   /**
    * @param word       the whole word
    * @param startIndex index of the first character after a dash
    * @return true if the run of letters and digits at startIndex, led by a dash, is a known suffix,
    * either all lower case or all upper case
    */
   static private boolean isSuffix( final String word, final int startIndex ) {
      if ( word.length() <= startIndex ) {
         return false;
      }
      final String nextCharTerm = getNextCharTerm( word.substring( startIndex ) );
      if ( nextCharTerm.isEmpty() ) {
         return false;
      }
      return SUFFIXES.contains( "-" + nextCharTerm ) || UPPER_SUFFIXES.contains( "-" + nextCharTerm );
   }

   /**
    * @param word       the whole word
    * @param startIndex index of the first character after an apostrophe
    * @return true if exactly one character remains at startIndex and it is a lower case s
    */
   static private boolean isOwnerApostrophe( final CharSequence word, final int startIndex ) {
      return word.length() == startIndex + 1 && word.charAt( startIndex ) == 's';
   }

   /**
    * @param word       the whole word
    * @param startIndex index of the first character after a period
    * @return true if exactly one character remains at startIndex and it is a digit
    */
   static private boolean isNumberDecimal( final CharSequence word, final int startIndex ) {
      // Bizarre scenario in which ctakes tokenizes ".2" as a fraction, but not ".22"
      return word.length() == startIndex + 1 && Character.isDigit( word.charAt( startIndex ) );
   }

   /**
    * @param word some text
    * @return the leading run of letters and digits; the whole text if it holds nothing else
    */
   static private String getNextCharTerm( final String word ) {
      final int count = word.length();
      for ( int i = 0; i < count; i++ ) {
         final char c = word.charAt( i );
         if ( !Character.isLetterOrDigit( c ) ) {
            return word.substring( 0, i );
         }
      }
      return word;
   }

   /**
    * Two terms are equal when they have the same tokens in the same order and the same cui code.
    * {@inheritDoc}
    */
   @Override
   public boolean equals( final Object value ) {
      return value instanceof TokenizedTerm
             && Arrays.equals( _tokens, ((TokenizedTerm)value)._tokens )
             && Objects.equals( _cui, ((TokenizedTerm)value)._cui );
   }

   /**
    * @return hash code computed once at construction from the tokens and the cui code
    */
   @Override
   public int hashCode() {
      return _hashcode;
   }


}
