I resolved my own issue. Here is the updated script:

#!/bin/sh
#
# Demonstrates a word_delimiter token filter whose type_table reclassifies
# '+', '-' and '@' as ALPHANUM, then runs match queries for "HER2+",
# "HER2-" and "HER2@" against a small sample index.
#
# Requires an Elasticsearch node reachable at localhost:9200.
# NOTE(review): the "string" field type and typed URLs suggest an older
# Elasticsearch release -- confirm the target version before reuse.
#
# Every request body is kept on a single logical line or inside one quoted
# block: a raw line break inside a JSON string literal is invalid JSON, and a
# line break after `-d` detaches the payload from its curl command.

# Create the index with a custom analyzer (whitespace tokenizer, lowercase,
# then the word_delimiter filter) and map it onto the "msg" field of the
# "specialchars" type.
curl -XPUT 'http://localhost:9200/specialchars' -d '{
    "settings" : {
        "index" : {
            "number_of_shards" : 1,
            "number_of_replicas" : 1
        },
        "analysis" : {
            "filter" : {
                "special_character_splitter" : {
                    "type" : "word_delimiter",
                    "split_on_numerics":false,
                    "type_table": ["+ => ALPHANUM", "- => ALPHANUM", "@ => ALPHANUM"]
                }
            },
            "analyzer" : {
                "schar_analyzer" : {
                    "type" : "custom",
                    "tokenizer" : "whitespace",
                    "filter" : ["lowercase", "special_character_splitter"]
                }
            }
        }
    },
    "mappings" : {
        "specialchars" : {
            "properties" : {
                "msg" : {
                    "type" : "string",
                    "analyzer" : "schar_analyzer"
                }
            }
        }
    }
}'

# Index three sample documents under the "specialchars" type, i.e. the type
# that the mapping above attaches schar_analyzer to.
curl -XPOST 'localhost:9200/specialchars/specialchars/1' \
    -d '{"msg" : "HER2+ Breast Cancer"}'
curl -XPOST 'localhost:9200/specialchars/specialchars/2' \
    -d '{"msg" : "Non-Small Cell Lung Cancer"}'
curl -XPOST 'localhost:9200/specialchars/specialchars/3' \
    -d '{"msg" : "c.2573T>G NSCLC"}'

# Refresh so the documents indexed above are visible to the searches below.
curl -XPOST 'localhost:9200/specialchars/_refresh'

# Show the tokens produced for the "msg" field. The two alternative inputs
# stay disabled; uncomment both lines of a pair to run it.
curl -XGET 'localhost:9200/specialchars/_analyze?field=msg&pretty=1' \
    -d 'HER2+ Breast Cancer'
#curl -XGET 'localhost:9200/specialchars/_analyze?field=msg&pretty=1' \
#    -d 'Non-Small Cell Lung Cancer'
#curl -XGET 'localhost:9200/specialchars/_analyze?field=msg&pretty=1' \
#    -d 'c.2573T>G NSCLC'

# Run one match query per special character; each label is printed before
# the corresponding response. URLs are quoted so the shell never treats the
# '?' as a glob character.
printf '%s\n' 'HER2+'
curl -XGET 'localhost:9200/specialchars/specialchars/_search?pretty' -d '{
    "query" : {
        "match" : {
            "msg" : {
                "query" : "HER2+"
            }
        }
    }
}'

printf '%s\n' 'HER2-'
curl -XGET 'localhost:9200/specialchars/specialchars/_search?pretty' -d '{
    "query" : {
        "match" : {
            "msg" : {
                "query" : "HER2-"
            }
        }
    }
}'

printf '%s\n' 'HER2@'
curl -XGET 'localhost:9200/specialchars/specialchars/_search?pretty' -d '{
    "query" : {
        "match" : {
            "msg" : {
                "query" : "HER2@"
            }
        }
    }
}'

# Clean up: drop the sample index.
curl -X DELETE 'localhost:9200/specialchars'


On Friday, October 17, 2014 4:57:52 PM UTC-7, Nick Tackes wrote:
>
> Hello, I am experimenting with word_delimiter and have an example with a
> special character that is indexed.  The character is in the type table for
> the word delimiter.  Analysis of the tokenization looks good, but when I
> attempt to do a match query it doesn't seem to respect the tokenization as
> expected.
> The example indexes 'HER2+ Breast Cancer'.  Tokenization is 'her2+',
> 'breast', 'cancer', which is good.  Searching for 'HER2\\+' results in a
> hit, but so does searching for 'HER2\\-'.
>
> #!/bin/sh
> curl -XPUT 'http://localhost:9200/specialchars' -d '{
>     "settings" : {
>         "index" : {
>             "number_of_shards" : 1,
>             "number_of_replicas" : 1
>         },  
>         "analysis" : {
>             "filter" : {
>                 "special_character_spliter" : {
>                     "type" : "word_delimiter",
>                     "split_on_numerics":false,
>                     "type_table": ["+ => ALPHA", "- => ALPHA"]
>                 }
>             },
>             "analyzer" : {
>                 "schar_analyzer" : {
>                     "type" : "custom",
>                     "tokenizer" : "whitespace",
>                     "filter" : ["lowercase", "special_character_spliter"]
>                 }
>             }
>         }
>     },
>     "mappings" : {
>         "specialchars" : {
>             "properties" : {
>                 "msg" : {
>                     "type" : "string",
>                     "analyzer" : "schar_analyzer"
>                 }
>             }
>         }
>     }
> }'
>
> curl -XPOST localhost:9200/specialchars/1 -d '{"msg" : "HER2+ Breast Cancer"}'
> curl -XPOST localhost:9200/specialchars/2 -d '{"msg" : "Non-Small Cell Lung Cancer"}'
> curl -XPOST localhost:9200/specialchars/3 -d '{"msg" : "c.2573T>G NSCLC"}'
>
> curl -XPOST localhost:9200/specialchars/_refresh
>
> curl -XGET 'localhost:9200/specialchars/_analyze?field=msg&pretty=1' -d "HER2+ Breast Cancer"
> #curl -XGET 'localhost:9200/specialchars/_analyze?field=msg&pretty=1' -d "Non-Small Cell Lung Cancer"
> #curl -XGET 'localhost:9200/specialchars/_analyze?field=msg&pretty=1' -d "c.2573T>G NSCLC"
>
> printf "HER2+\n"
> curl -XGET localhost:9200/specialchars/_search?pretty -d '{
>     "query" : {
>     "match" : {
>         "msg" : {
>             "query" : "HER2\\+"
>            }
>     }
>     }
> }'
>
> printf "HER2-\n"
> curl -XGET localhost:9200/specialchars/_search?pretty -d '{
>     "query" : {
>     "match" : {
>         "msg" : {
>             "query" : "HER2\\-"
>            }
>     }
>     }
> }'
>
> curl -X DELETE localhost:9200/specialchars
>

-- 
You received this message because you are subscribed to the Google Groups 
"elasticsearch" group.
To unsubscribe from this group and stop receiving emails from it, send an email 
to [email protected].
To view this discussion on the web visit 
https://groups.google.com/d/msgid/elasticsearch/ad8ebeac-a75d-461d-920d-cba1a25a3226%40googlegroups.com.
For more options, visit https://groups.google.com/d/optout.

Reply via email to