Hi All,
In order to check the functions, I got a Twitter stream using the ESB Twitter
connector. Part of that stream is shown below.
<statuses>
<statuses>
<metadata>
<result_type>popular</result_type>
<iso_language_code>it</iso_language_code>
</metadata>
<created_at>Tue Sep 02 13:09:08 +0000 2014</created_at>
<id>506790930058199040</id>
<id_str>506790930058199040</id_str>
<text>Donate. Comunque donate. 💪❤️accetto la nomination all'
#icebucketchallange di veronicazimbaro e…http://t.co/eYm7WFFZd4</text>
<source><a href="http://instagram.com"
rel="nofollow">Instagram</a></source>
<truncated>false</truncated>
<in_reply_to_status_id/>
<in_reply_to_status_id_str/>
<in_reply_to_user_id/>
<in_reply_to_user_id_str/>
<in_reply_to_screen_name/>
<user>
<id>563995182</id>
<id_str>563995182</id_str>
<name>Martina Maccari</name>
<screen_name>MartinaZoev</screen_name>
<location>Torino</location>
<description>Prendere o lasciare.</description>
<url>http://t.co/P8zSwRXvoM</url>
<entities>
<url>
<urls>
<url>http://t.co/P8zSwRXvoM</url>
<expanded_url>http://www.zoodizoev.com</expanded_url>
<display_url>zoodizoev.com</display_url>
<indices>0</indices>
<indices>22</indices>
</urls>
</url>
<description/>
</entities>
<protected>false</protected>
<followers_count>18057</followers_count>
<friends_count>96</friends_count>
<listed_count>68</listed_count>
<created_at>Thu Apr 26 17:54:59 +0000 2012</created_at>
<favourites_count>144</favourites_count>
<utc_offset>7200</utc_offset>
<time_zone>Rome</time_zone>
<geo_enabled>true</geo_enabled>
<verified>false</verified>
<statuses_count>2094</statuses_count>
<lang>it</lang>
<contributors_enabled>false</contributors_enabled>
<is_translator>false</is_translator>
<is_translation_enabled>false</is_translation_enabled>
<profile_background_color>131516</profile_background_color>
<profile_background_image_url>
http://abs.twimg.com/images/themes/theme14/bg.gif
</profile_background_image_url>
<profile_background_image_url_https>
https://abs.twimg.com/images/themes/theme14/bg.gif
</profile_background_image_url_https>
<profile_background_tile>false</profile_background_tile>
<profile_image_url>
http://pbs.twimg.com/profile_images/378800000761725544/1efa8c9032ac97c42619986fc52adb7a_normal.jpeg
</profile_image_url>
<profile_image_url_https>
https://pbs.twimg.com/profile_images/378800000761725544/1efa8c9032ac97c42619986fc52adb7a_normal.jpeg
</profile_image_url_https>
<profile_banner_url>
https://pbs.twimg.com/profile_banners/563995182/1384873417
</profile_banner_url>
<profile_link_color>F518DB</profile_link_color>
<profile_sidebar_border_color>EEEEEE</profile_sidebar_border_color>
<profile_sidebar_fill_color>EFEFEF</profile_sidebar_fill_color>
<profile_text_color>333333</profile_text_color>
<profile_use_background_image>true</profile_use_background_image>
<default_profile>false</default_profile>
<default_profile_image>false</default_profile_image>
<following/>
<follow_request_sent/>
<notifications/>
</user>
<geo/>
<coordinates/>
<place/>
<contributors/>
<retweet_count>41</retweet_count>
<favorite_count>62</favorite_count>
<entities>
<hashtags>
<text>icebucketchallange</text>
<indices>55</indices>
<indices>74</indices>
</hashtags>
<urls>
<url>http://t.co/eYm7WFFZd4</url>
<expanded_url>http://instagram.com/p/scbsmeurVA/</expanded_url>
<display_url>instagram.com/p/scbsmeurVA/</display_url>
<indices>97</indices>
<indices>119</indices>
</urls>
</entities>
<favorited>false</favorited>
<retweeted>false</retweeted>
<possibly_sensitive>false</possibly_sensitive>
<lang>it</lang>
</statuses>
<statuses>
<metadata>
<iso_language_code>und</iso_language_code>
<result_type>popular</result_type>
</metadata>
<created_at>Tue Sep 02 21:52:21 +0000 2014</created_at>
<id>506922604003725313</id>
<id_str>506922604003725313</id_str>
<text>#RT #icebucketchallange http://t.co/UXXzndKwH1</text>
<source><a href="http://twitter.com/download/android"
rel="nofollow">Twitter for Android</a></source>
<truncated>false</truncated>
<in_reply_to_status_id/>
<in_reply_to_status_id_str/>
<in_reply_to_user_id/>
<in_reply_to_user_id_str/>
<in_reply_to_screen_name/>
<user>
<id>1103956165</id>
<id_str>1103956165</id_str>
<name>Defensive Backs</name>
<screen_name>DB__TWEETS</screen_name>
<location>#NoFlyZone</location>
<description>blown a coverage? make up for it... missed a tackle?
execute more... got burned? learn from your mistake... got scored on? dont
let it happen again!!!</description>
<url/>
<entities>
<description/>
</entities>
<protected>false</protected>
<followers_count>5185</followers_count>
<friends_count>2222</friends_count>
<listed_count>2</listed_count>
<created_at>Sat Jan 19 15:24:29 +0000 2013</created_at>
<favourites_count>3579</favourites_count>
<utc_offset>-18000</utc_offset>
<time_zone>Central Time (US &amp; Canada)</time_zone>
<geo_enabled>true</geo_enabled>
<verified>false</verified>
<statuses_count>4018</statuses_count>
<lang>en</lang>
<contributors_enabled>false</contributors_enabled>
<is_translator>false</is_translator>
<is_translation_enabled>false</is_translation_enabled>
<profile_background_color>C0DEED</profile_background_color>
<profile_background_image_url>
http://abs.twimg.com/images/themes/theme1/bg.png
</profile_background_image_url>
<profile_background_image_url_https>
https://abs.twimg.com/images/themes/theme1/bg.png
</profile_background_image_url_https>
<profile_background_tile>false</profile_background_tile>
<profile_image_url>
http://pbs.twimg.com/profile_images/483757082978430977/YrXT9l4B_normal.jpeg
</profile_image_url>
<profile_image_url_https>
https://pbs.twimg.com/profile_images/483757082978430977/YrXT9l4B_normal.jpeg
</profile_image_url_https>
<profile_banner_url>
https://pbs.twimg.com/profile_banners/1103956165/1394929163
</profile_banner_url>
<profile_link_color>0084B4</profile_link_color>
<profile_sidebar_border_color>C0DEED</profile_sidebar_border_color>
<profile_sidebar_fill_color>DDEEF6</profile_sidebar_fill_color>
<profile_text_color>333333</profile_text_color>
<profile_use_background_image>true</profile_use_background_image>
<default_profile>true</default_profile>
<default_profile_image>false</default_profile_image>
<following/>
<follow_request_sent/>
<notifications/>
</user>
<geo/>
<coordinates/>
<place/>
<contributors/>
<retweet_count>20</retweet_count>
<favorite_count>14</favorite_count>
<entities>
<hashtags>
<text>RT</text>
<indices>0</indices>
<indices>3</indices>
</hashtags>
<hashtags>
<text>icebucketchallange</text>
<indices>4</indices>
<indices>23</indices>
</hashtags>
<media>
<id>506922601248063488</id>
<id_str>506922601248063488</id_str>
<indices>24</indices>
<indices>46</indices>
<media_url>http://pbs.twimg.com/media/Bwjza4gCAAA9xto.jpg
</media_url>
<media_url_https>https://pbs.twimg.com/media/Bwjza4gCAAA9xto.jpg
</media_url_https>
<url>http://t.co/UXXzndKwH1</url>
<display_url>pic.twitter.com/UXXzndKwH1</display_url>
<expanded_url>
http://twitter.com/DB__TWEETS/status/506922604003725313/photo/1
</expanded_url>
<type>photo</type>
<sizes>
<small>
<w>288</w>
<h>204</h>
<resize>fit</resize>
</small>
<medium>
<w>288</w>
<h>204</h>
<resize>fit</resize>
</medium>
<large>
<w>288</w>
<h>204</h>
<resize>fit</resize>
</large>
<thumb>
<w>150</w>
<h>150</h>
<resize>crop</resize>
</thumb>
</sizes>
</media>
</entities>
<favorited>false</favorited>
<retweeted>false</retweeted>
<possibly_sensitive>false</possibly_sensitive>
<lang>und</lang>
</statuses>
<statuses>
<metadata>
<result_type>popular</result_type>
<iso_language_code>nl</iso_language_code>
</metadata>
<created_at>Fri Aug 29 22:44:20 +0000 2014</created_at>
<id>505486135469309952</id>
<id_str>505486135469309952</id_str>
<text>#ALS Foundation geeft toe dat 73% van de donaties niet wordt
gebruikt voor ALS onderzoek: http://t.co/dgfSvFQC2Q
#Icebucketchallange</text>
<source><a href="http://tapbots.com/tweetbot"
rel="nofollow">Tweetbot for iΟS</a></source>
<truncated>false</truncated>
<in_reply_to_status_id/>
<in_reply_to_status_id_str/>
<in_reply_to_user_id/>
<in_reply_to_user_id_str/>
<in_reply_to_screen_name/>
<user>
<id>98353402</id>
<id_str>98353402</id_str>
<name>Petra Blankwaard</name>
<screen_name>indigonl</screen_name>
<location>Den Haag</location>
<description>Webnerd ~ Apple ~ MINI ~ Magento ~ WordPress ~ SEO ~
motorrijden ~ duiken ~ humor ~ psychologie ~ wetenschap ~ Recht is vaak
krom</description>
<url>http://t.co/t1B53SkiYm</url>
<entities>
<url>
<urls>
<url>http://t.co/t1B53SkiYm</url>
<expanded_url>http://www.indigowebstudio.nl</expanded_url>
<display_url>indigowebstudio.nl</display_url>
<indices>0</indices>
<indices>22</indices>
</urls>
</url>
<description/>
</entities>
<protected>false</protected>
<followers_count>3120</followers_count>
<friends_count>1847</friends_count>
<listed_count>177</listed_count>
<created_at>Mon Dec 21 11:20:58 +0000 2009</created_at>
<favourites_count>30</favourites_count>
<utc_offset>7200</utc_offset>
<time_zone>Amsterdam</time_zone>
<geo_enabled>true</geo_enabled>
<verified>false</verified>
<statuses_count>60574</statuses_count>
<lang>nl</lang>
<contributors_enabled>false</contributors_enabled>
<is_translator>false</is_translator>
<is_translation_enabled>false</is_translation_enabled>
<profile_background_color>273182</profile_background_color>
<profile_background_image_url>
http://pbs.twimg.com/profile_background_images/265611238/Twitter_page.png
</profile_background_image_url>
<profile_background_image_url_https>
https://pbs.twimg.com/profile_background_images/265611238/Twitter_page.png
</profile_background_image_url_https>
<profile_background_tile>false</profile_background_tile>
<profile_image_url>
http://pbs.twimg.com/profile_images/378800000577198041/b27b8688897f286e45ec1c8aee8afbe2_normal.jpeg
</profile_image_url>
<profile_image_url_https>
https://pbs.twimg.com/profile_images/378800000577198041/b27b8688897f286e45ec1c8aee8afbe2_normal.jpeg
</profile_image_url_https>
<profile_banner_url>
https://pbs.twimg.com/profile_banners/98353402/1394458520
</profile_banner_url>
<profile_link_color>E39517</profile_link_color>
<profile_sidebar_border_color>FFFFFF</profile_sidebar_border_color>
<profile_sidebar_fill_color>DEDEDE</profile_sidebar_fill_color>
<profile_text_color>273182</profile_text_color>
<profile_use_background_image>false</profile_use_background_image>
<default_profile>false</default_profile>
<default_profile_image>false</default_profile_image>
<following/>
<follow_request_sent/>
<notifications/>
</user>
<geo/>
<coordinates/>
<place/>
<contributors/>
<retweet_count>54</retweet_count>
<favorite_count>3</favorite_count>
<entities>
<hashtags>
<text>ALS</text>
<indices>0</indices>
<indices>4</indices>
</hashtags>
<hashtags>
<text>Icebucketchallange</text>
<indices>113</indices>
<indices>132</indices>
</hashtags>
<urls>
<url>http://t.co/dgfSvFQC2Q</url>
<expanded_url>http://bit.ly/1rF4RY4</expanded_url>
<display_url>bit.ly/1rF4RY4</display_url>
<indices>90</indices>
<indices>112</indices>
</urls>
</entities>
<favorited>false</favorited>
<retweeted>false</retweeted>
<possibly_sensitive>false</possibly_sensitive>
<lang>nl</lang>
</statuses>
The problem is: by writing a mediator class we can extract the relevant
information, but the content of the stream is not descriptive enough to test
the functions.
- Is there any better way/inputs to do the testing?
- If we try to get the stream continuously, is there any playback option
to retrieve the data in CEP?
- Would it be better to write a mediator class that extracts the data and
pushes it to CEP from the ESB as a scheduled task?
Thank you.
On Tue, Sep 2, 2014 at 5:51 PM, Malithi Edirisinghe <[email protected]>
wrote:
> Hi All,
>
> After having a discussion on $subject with Srinath and Suho we agreed on
> following changes for our implementation.
>
> 1. The 2nd operation findNLRegexPattern(sentence, regex) is renamed to
> findTokensRegexPattern(sentence, regex) since this exposes the TokensRegex
> support in Stanford NLP library.
>
> 2. Introduced the following operation to expose the Semgrex regular
> expression support in Stanford NLP.
>
>
> - findSemgrexPattern(sentence, regex)
>
> Description:
>
> This operation takes a sentence and a regular expression as its inputs.
> It will return each match in the sentence, as an event.
>
> inputs:
>
> sentence : sentence to be processed
> regex : regular expression to be matched. Regex syntax should be in
> Stanford NLP Semgrex
> output: matching phrase(s) as event(s)
>
>
> example:
>
> inputs:
> sentence : They win the lottery
> regex : {} >/nsubj|agent/ {}
>
> output: win
>
>
> 3. Introduced following two operations to extract relationships instead of
> the 3rd operation findRelationship(sentence, regex) defined above.
>
>
> - findRelationshipByVerb(sentence, verb)
>
> Description:
>
> This operation takes a sentence and a verb as its inputs. It extracts the
> subject and the object for the defined verb. For each such relationship
> extracted, the operation will return a triplet (subject, object, verb) as
> an event.
>
> inputs:
>
> sentence : sentence to be processed
> verb : verb to extract the relationship
> output: triplet(s) of (subject, object, verb) as event(s)
>
> example:
>
> inputs:
> sentence : Bob works for WSO2
> verb : works for
>
> output: (Bob, WSO2, works for)
> inputs:
> sentence : The man has been killed by the police
>
> verb : killed
>
>
> output: (police, man, killed)
>
>
>
> - findRelationshipByRegex(sentence, regex)
>
> This operation takes a sentence and a regex as its inputs. The regex
> should define a regular expression to extract subject, object and
> relationship. If the regex is defined as per the syntax, all matches found
> will be returned as triplets (subject, object, relationship), each as an
> event; otherwise an error is thrown.
>
> inputs:
>
> sentence : sentence to be processed
> regex : regex to extract the relationship
> output: triplet(s) of (subject, object, verb) as event(s)
>
> example:
>
> inputs:
> sentence : They win the lottery
> regex : {}=verb >/nsubj|agent/ {}=subject ?>/dobj/ {}=object
>
> output: (They, lottery, win)
> Note:
>
> With the NLP library we can either simply get the match of the above regular
> expression, which is "win" in this case, or get each node named via the
> regular expression, i.e. verb -> "win", subject -> "They", object ->
> "lottery".
>
> Any comments you might have on the above changes are welcome.
>
> Thank You.
> Malithi.
>
>
>
> On Mon, Sep 1, 2014 at 3:06 PM, Chanuka Dissanayake <[email protected]>
> wrote:
>
>> Yes, sure.
>>
>> Thanks.
>>
>>
>> On Mon, Sep 1, 2014 at 2:42 PM, Srinath Perera <[email protected]> wrote:
>>
>>> How about 2pm? (Someone had a conflict in the AM)
>>>
>>>
>>> On Mon, Sep 1, 2014 at 2:40 PM, Srinath Perera <[email protected]> wrote:
>>>
>>>> Can we meet and discuss? How about tomorrow 11am?
>>>>
>>>>
>>>> On Thu, Aug 28, 2014 at 6:49 PM, Malithi Edirisinghe <[email protected]
>>>> > wrote:
>>>>
>>>>> Hi,
>>>>>
>>>>> I have looked at how Stanford NLP extracts grammatical dependencies in
>>>>> detail and have the following concerns with regard to the implementation
>>>>> of the 3rd query (findRelationship(sentence, regex)).
>>>>>
>>>>> When a sentence is given Stanford NLP can recognise around 50
>>>>> grammatical relationships. I have listed some with simple examples below.
>>>>>
>>>>>
>>>>> - acomp:adjective complement
>>>>>
>>>>> This is an adjectival phrase which functions as the complement (like
>>>>> an object of the verb).
>>>>>
>>>>> ex:
>>>>>
>>>>> “She looks very beautiful” -> acomp(looks, beautiful)
>>>>>
>>>>>
>>>>> - agent
>>>>>
>>>>> This is a complement of a passive verb which is introduced by the
>>>>> preposition “by” and does the action.
>>>>>
>>>>> ex:
>>>>>
>>>>> “The man has been killed by the police” -> agent(killed, police)
>>>>> “Effects caused by the protein are important” -> agent(caused,
>>>>> protein)
>>>>>
>>>>>
>>>>> - aux:auxiliary
>>>>>
>>>>> This is the non-main verb of the clause
>>>>>
>>>>> ex:
>>>>>
>>>>> "Reagan has died" -> aux(died, has)
>>>>> "He should leave" -> aux(leave,should)
>>>>>
>>>>>
>>>>> - conj:conjunct
>>>>>
>>>>> This is the relation between two elements connected by a coordinating
>>>>> conjunction, such as “and”, “or”, etc.
>>>>>
>>>>> ex:
>>>>>
>>>>> “Bill is big and honest” -> conj(big, honest)
>>>>> “They either ski or snowboard” -> conj(ski, snowboard)
>>>>>
>>>>>
>>>>> - dobj:direct object
>>>>>
>>>>> This is the noun phrase which is the object of the verb.
>>>>>
>>>>> ex:
>>>>>
>>>>> “They win the lottery” -> dobj(win, lottery)
>>>>>
>>>>>
>>>>> - nsubj:nominal subject
>>>>>
>>>>> This is a noun phrase which is the syntactic subject of a clause.
>>>>>
>>>>> ex:
>>>>> “The baby is cute” -> nsubj(cute, baby)
>>>>>
>>>>> With this library support, I would like to clarify on following.
>>>>>
>>>>> 1. How should we use the regular expression to extract the
>>>>> relationship while the library is extracting relationships itself?
>>>>> 2. What kind of relationships should we extract? For example, is it
>>>>> just simple relationships, such as identifying the subject, verb and
>>>>> object, or any others?
>>>>>
>>>>>
>>>>> I look forward to your thoughts on this.
>>>>>
>>>>> Thanks,
>>>>> Malithi.
>>>>>
>>>>>
>>>>>
>>>>> On Fri, Aug 22, 2014 at 6:11 PM, Malithi Edirisinghe <
>>>>> [email protected]> wrote:
>>>>>
>>>>>> Hi,
>>>>>>
>>>>>> We started the implementation with Stanford NLP due to reasons below.
>>>>>>
>>>>>> 1. Stanford NLP provides a rich regular expression support in writing
>>>>>> patterns over tokens, rather than working at character level with normal
>>>>>> java regular expressions.
>>>>>>
>>>>>> 2. Stanford NLP can extract grammatical relationships from the parsed
>>>>>> tree thus we can easily implement the 3rd query.
>>>>>>
>>>>>> Thanks,
>>>>>>
>>>>>> Malithi.
>>>>>>
>>>>>>
>>>>>> On Thu, Aug 21, 2014 at 12:58 PM, Malithi Edirisinghe <
>>>>>> [email protected]> wrote:
>>>>>>
>>>>>>> Hi Suho,
>>>>>>>
>>>>>>> Since Named Entity Recognition is supported by both libraries we can
>>>>>>> implement the first function with either of them. Both can identify
>>>>>>> entities like person, location, organization, etc. For the fourth
>>>>>>> function we found a way to simply define dictionaries in OpenNLP. There
>>>>>>> is a class called DictionaryNameFinder which takes a Dictionary and
>>>>>>> identifies any entry in the sentence that matches the dictionary. In
>>>>>>> Stanford NLP, we could find that there is an implementation for a
>>>>>>> Dictionary, but we have not yet found a way of using it for our
>>>>>>> requirement. It lacks samples, and it seems we should look into their
>>>>>>> code to find how they have used it. We will work on it. Anyhow, I think
>>>>>>> it should be possible to define such a Dictionary in Stanford NLP also.
>>>>>>>
>>>>>>> Thanks,
>>>>>>> Malithi.
>>>>>>>
>>>>>>>
>>>>>>> On Thu, Aug 21, 2014 at 10:09 AM, Sriskandarajah Suhothayan <
>>>>>>> [email protected]> wrote:
>>>>>>>
>>>>>>>> That's a good comparison.
>>>>>>>> Based on this I believe we have issues in implementing functions 2
>>>>>>>> & 3 using OpenNLP.
>>>>>>>> Can you evaluate the other functions as well?
>>>>>>>>
>>>>>>>> Suho
>>>>>>>>
>>>>>>>>
>>>>>>>> On Thu, Aug 21, 2014 at 9:54 AM, Chanuka Dissanayake <
>>>>>>>> [email protected]> wrote:
>>>>>>>>
>>>>>>>>> We did a study on both OpenNLP and Stanford NLP libraries and
>>>>>>>>> looked at the features that could support our implementation.
>>>>>>>>> Our findings are summarised below.
>>>>>>>>>
>>>>>>>>> It seems that Stanford NLP has better capabilities when
>>>>>>>>> considering support for regular expressions and parsing.
>>>>>>>>> We would like to discuss this further and choose the appropriate
>>>>>>>>> library.
>>>>>>>>>
>>>>>>>>>
>>>>>>>>> Feature OpenNLP StanfordNLP Named Entity Recognizer Will
>>>>>>>>> identify the person,location,organization,time,date,money,percentage
>>>>>>>>> inside
>>>>>>>>> the given sentence but sentence need to be tokenized first. Includes
>>>>>>>>> a 4 class model trained for CoNLL, a 7 class model trained for MUC,
>>>>>>>>> and a 3
>>>>>>>>> class model trained on both data sets for the intersection of those
>>>>>>>>> class
>>>>>>>>> sets.
>>>>>>>>> 3 class: Location, Person, Organization
>>>>>>>>> 4 class: Location, Person, Organization, Misc
>>>>>>>>> 7 class: Time, Location, Organization, Person, Money, Percent, Date
>>>>>>>>> POS Tagger Identify:
>>>>>>>>> VP(Verb Phrase) ,NP(Noun Phrase) ,JJ(Adjective)…etc
>>>>>>>>>
>>>>>>>>> Input: Hi. How are you? This is Mike
>>>>>>>>> output: Hi_NNP How_WRB are_VBP you? _JJ This_DT is_VBZ Mike._NNP Label
>>>>>>>>> each token with the POS Tag, such as noun, verb, adjective, etc.,
>>>>>>>>> Tokenizing Separates the words which have white spaces in-between
>>>>>>>>> by default. Otherwise it can be trained to tokenize by different
>>>>>>>>> options. Can
>>>>>>>>> tokenize the text either by whitespace or as per the options defined
>>>>>>>>> Parsing Once given a tokenized sentence, It will construct the
>>>>>>>>> tree structure. This works out the grammatical structure of
>>>>>>>>> sentences in a tree structure. The parser provides Stanford
>>>>>>>>> Dependencies as
>>>>>>>>> well. They represent the grammatical relations between words in a
>>>>>>>>> sentence.
>>>>>>>>> Dependencies are triplets: name of the relation, governor and
>>>>>>>>> dependent.
>>>>>>>>> Ex: Bell, based in Los Angeles, makes and distributes electronic,
>>>>>>>>> computer and building products.
>>>>>>>>> Dependency: nsubj(distributes-10, Bell-1)
>>>>>>>>> This is like saying “the subject of distributes is Bell.” Sentence
>>>>>>>>> Detection Detect sentence boundaries given a paragraph. Available
>>>>>>>>> as ssplit. Can split sentences as per the options defined Regular
>>>>>>>>> Expressions Character wise regular expression only. Cannot
>>>>>>>>> identify named entities or PoS tags via regular expression Two
>>>>>>>>> tools are provided to deal with regular expressions.
>>>>>>>>> RegexNER:Can define simple rules with regular expressions and
>>>>>>>>> label entities with NE labels that are not provided.
>>>>>>>>> Ex: Bachelor of (Arts|Laws|Science|Engineering) DEGREE
>>>>>>>>> This rule will label tokens matching with the regex in first
>>>>>>>>> column as DEGREE
>>>>>>>>> TokensRegex: Can identify patterns over a list of tokens. In
>>>>>>>>> addition to java regex matching this provides syntax to match part of
>>>>>>>>> speech tags, named entity tags and lemma.
>>>>>>>>> Ex: [ { tag:VBD } ], /University/ /of/ [{ ner:LOCATION }]
>>>>>>>>>
>>>>>>>>>
>>>>>>>>> Thanks,
>>>>>>>>> Chanuka.
>>>>>>>>>
>>>>>>>>>
>>>>>>>>> On Tue, Aug 19, 2014 at 11:11 PM, Sriskandarajah Suhothayan <
>>>>>>>>> [email protected]> wrote:
>>>>>>>>>
>>>>>>>>>> +1 looks good
>>>>>>>>>>
>>>>>>>>>> Suho
>>>>>>>>>>
>>>>>>>>>>
>>>>>>>>>> On Tue, Aug 19, 2014 at 9:56 PM, Srinath Perera <[email protected]
>>>>>>>>>> > wrote:
>>>>>>>>>>
>>>>>>>>>>> Looks good. If possible we should do this with OpenNLP as it has the
>>>>>>>>>>> Apache licence. However, I could not find an NLP regex impl there.
>>>>>>>>>>> Please look at it in detail.
>>>>>>>>>>>
>>>>>>>>>>> --Srinath
>>>>>>>>>>>
>>>>>>>>>>>
>>>>>>>>>>> On Tue, Aug 19, 2014 at 9:52 PM, Malithi Edirisinghe <
>>>>>>>>>>> [email protected]> wrote:
>>>>>>>>>>>
>>>>>>>>>>>>
>>>>>>>>>>>> Hi All,
>>>>>>>>>>>>
>>>>>>>>>>>> We are working on an NLP Toolbox improvement in CEP. The main
>>>>>>>>>>>> idea of this improvement is to use an NLP library and let users do
>>>>>>>>>>>> some NLP operations as Siddhi extensions.
>>>>>>>>>>>>
>>>>>>>>>>>> So in our implementation we have decided to support following
>>>>>>>>>>>> NLP operations.
>>>>>>>>>>>>
>>>>>>>>>>>> *1. findNameEntityType(sentence, entityType)*
>>>>>>>>>>>>
>>>>>>>>>>>> *Description:*
>>>>>>>>>>>>
>>>>>>>>>>>> This operation takes a sentence and a predefined entity type as
>>>>>>>>>>>> its inputs. It will return noun(s) in the sentence that match the
>>>>>>>>>>>> defined
>>>>>>>>>>>> entity type, as event(s).
>>>>>>>>>>>>
>>>>>>>>>>>> *inputs:*
>>>>>>>>>>>>
>>>>>>>>>>>> sentence : sentence to be processed
>>>>>>>>>>>> entityType: predefined entity type
>>>>>>>>>>>> ORGANIZATION
>>>>>>>>>>>> NAME
>>>>>>>>>>>> LOCATION
>>>>>>>>>>>> *output:*
>>>>>>>>>>>>
>>>>>>>>>>>> matching noun(s) as event(s)
>>>>>>>>>>>>
>>>>>>>>>>>> *example:*
>>>>>>>>>>>>
>>>>>>>>>>>> inputs:
>>>>>>>>>>>> sentence : Alice works at WSO2
>>>>>>>>>>>> entityType : NAME
>>>>>>>>>>>>
>>>>>>>>>>>> output: Alice
>>>>>>>>>>>>
>>>>>>>>>>>> *2. findNLRegexPattern(sentence, regex)*
>>>>>>>>>>>>
>>>>>>>>>>>> *Description:*
>>>>>>>>>>>>
>>>>>>>>>>>> This operation takes a sentence and a regular expression as
>>>>>>>>>>>> its inputs. It will return each match in the sentence, as an
>>>>>>>>>>>> event.
>>>>>>>>>>>>
>>>>>>>>>>>> *inputs:*
>>>>>>>>>>>>
>>>>>>>>>>>> sentence : sentence to be processed
>>>>>>>>>>>> regex : regular expression to be matched
>>>>>>>>>>>> *output:*
>>>>>>>>>>>>
>>>>>>>>>>>> matching phrase(s) as event(s)
>>>>>>>>>>>>
>>>>>>>>>>>> *example:*
>>>>>>>>>>>>
>>>>>>>>>>>> inputs:
>>>>>>>>>>>> sentence : WSO2 was found in 2005
>>>>>>>>>>>> regex : \\d{4}
>>>>>>>>>>>>
>>>>>>>>>>>> output: 2005
>>>>>>>>>>>>
>>>>>>>>>>>> *3. findRelationship(sentence, regex)*
>>>>>>>>>>>>
>>>>>>>>>>>> *Description:*
>>>>>>>>>>>>
>>>>>>>>>>>> This operation takes a sentence and a regular expression as
>>>>>>>>>>>> its inputs. For each relationship extracted from the regular
>>>>>>>>>>>> expression
>>>>>>>>>>>> the operation will return a triplet; subject, object and
>>>>>>>>>>>> relationship as an
>>>>>>>>>>>> event.
>>>>>>>>>>>>
>>>>>>>>>>>> *inputs:*
>>>>>>>>>>>>
>>>>>>>>>>>> sentence : sentence to be processed
>>>>>>>>>>>> regex : regular expression to extract the relationship
>>>>>>>>>>>> *output:*
>>>>>>>>>>>>
>>>>>>>>>>>> triplet(s) of (subject, object, relationship) as event(s)
>>>>>>>>>>>>
>>>>>>>>>>>> *example:*
>>>>>>>>>>>>
>>>>>>>>>>>> inputs:
>>>>>>>>>>>> sentence : Bob works for WSO2
>>>>>>>>>>>> regex : works for
>>>>>>>>>>>>
>>>>>>>>>>>> output: (Bob, WSO2, works for)
>>>>>>>>>>>> *4. findNameEntityTypeViaDictionary(sentence, dictionary,
>>>>>>>>>>>> entityType)*
>>>>>>>>>>>>
>>>>>>>>>>>> *Description:*
>>>>>>>>>>>>
>>>>>>>>>>>> This operation takes a sentence, dictionary file and a
>>>>>>>>>>>> predefined entity type as its inputs. It will return noun(s) in
>>>>>>>>>>>> the
>>>>>>>>>>>> sentence of the defined entity type, that also exists in the
>>>>>>>>>>>> dictionary as
>>>>>>>>>>>> event(s).
>>>>>>>>>>>>
>>>>>>>>>>>> *inputs:*
>>>>>>>>>>>>
>>>>>>>>>>>> sentence : sentence to be processed
>>>>>>>>>>>> dictionary : dictionary of entities of the defined entity type
>>>>>>>>>>>> entityType : predefined entity type
>>>>>>>>>>>> ORGANIZATION
>>>>>>>>>>>> NAME
>>>>>>>>>>>> LOCATION
>>>>>>>>>>>> *output:*
>>>>>>>>>>>>
>>>>>>>>>>>> matching noun(s) as event(s)
>>>>>>>>>>>>
>>>>>>>>>>>> *example:*
>>>>>>>>>>>>
>>>>>>>>>>>> inputs:
>>>>>>>>>>>> sentence : Bob works at WSO2
>>>>>>>>>>>> dictionary : (WSO2,ORACLE,IBM)
>>>>>>>>>>>> entityType : ORGANIZATION
>>>>>>>>>>>>
>>>>>>>>>>>> output: WSO2
>>>>>>>>>>>>
>>>>>>>>>>>> Each NLP operation defined here will be implemented as a
>>>>>>>>>>>> transformer extension to Siddhi.
>>>>>>>>>>>> --
>>>>>>>>>>>>
>>>>>>>>>>>> *Malithi Edirisinghe*
>>>>>>>>>>>> Senior Software Engineer
>>>>>>>>>>>> WSO2 Inc.
>>>>>>>>>>>>
>>>>>>>>>>>> Mobile : +94 (0) 718176807
>>>>>>>>>>>> [email protected]
>>>>>>>>>>>>
>>>>>>>>>>>
>>>>>>>>>>>
>>>>>>>>>>>
>>>>>>>>>>> --
>>>>>>>>>>> ============================
>>>>>>>>>>> Director, Research, WSO2 Inc.
>>>>>>>>>>> Visiting Faculty, University of Moratuwa
>>>>>>>>>>> Member, Apache Software Foundation
>>>>>>>>>>> Research Scientist, Lanka Software Foundation
>>>>>>>>>>> Blog: http://srinathsview.blogspot.com twitter:@srinath_perera
>>>>>>>>>>> Site: http://people.apache.org/~hemapani/
>>>>>>>>>>> Photos: http://www.flickr.com/photos/hemapani/
>>>>>>>>>>> Phone: 0772360902
>>>>>>>>>>>
>>>>>>>>>>
>>>>>>>>>>
>>>>>>>>>>
>>>>>>>>>> --
>>>>>>>>>>
>>>>>>>>>> *S. Suhothayan*
>>>>>>>>>> Technical Lead & Team Lead of WSO2 Complex Event Processor
>>>>>>>>>> *WSO2 Inc. *http://wso2.com
>>>>>>>>>> * <http://wso2.com/>*
>>>>>>>>>> lean . enterprise . middleware
>>>>>>>>>>
>>>>>>>>>>
>>>>>>>>>> *cell: (+94) 779 756 757 <%28%2B94%29%20779%20756%20757> | blog:
>>>>>>>>>> http://suhothayan.blogspot.com/ <http://suhothayan.blogspot.com/>
>>>>>>>>>> twitter:
>>>>>>>>>> http://twitter.com/suhothayan <http://twitter.com/suhothayan> |
>>>>>>>>>> linked-in:
>>>>>>>>>> http://lk.linkedin.com/in/suhothayan
>>>>>>>>>> <http://lk.linkedin.com/in/suhothayan>*
>>>>>>>>>>
>>>>>>>>>
>>>>>>>>>
>>>>>>>>>
>>>>>>>>> --
>>>>>>>>> Chanuka Dissanayake
>>>>>>>>> *Software Engineer | **WSO2 Inc.*; http://wso2.com
>>>>>>>>>
>>>>>>>>> Mobile: +94 71 33 63 596
>>>>>>>>> Email: [email protected]
>>>>>>>>>
>>>>>>>>
>>>>>>>>
>>>>>>>>
>>>>>>>> --
>>>>>>>>
>>>>>>>> *S. Suhothayan*
>>>>>>>> Technical Lead & Team Lead of WSO2 Complex Event Processor
>>>>>>>> *WSO2 Inc. *http://wso2.com
>>>>>>>> * <http://wso2.com/>*
>>>>>>>> lean . enterprise . middleware
>>>>>>>>
>>>>>>>>
>>>>>>>> *cell: (+94) 779 756 757 <%28%2B94%29%20779%20756%20757> | blog:
>>>>>>>> http://suhothayan.blogspot.com/ <http://suhothayan.blogspot.com/>
>>>>>>>> twitter:
>>>>>>>> http://twitter.com/suhothayan <http://twitter.com/suhothayan> |
>>>>>>>> linked-in:
>>>>>>>> http://lk.linkedin.com/in/suhothayan
>>>>>>>> <http://lk.linkedin.com/in/suhothayan>*
>>>>>>>>
>>>>>>>
>>>>>>>
>>>>>>>
>>>>>>> --
>>>>>>>
>>>>>>> *Malithi Edirisinghe*
>>>>>>> Senior Software Engineer
>>>>>>> WSO2 Inc.
>>>>>>>
>>>>>>> Mobile : +94 (0) 718176807
>>>>>>> [email protected]
>>>>>>>
>>>>>>
>>>>>>
>>>>>>
>>>>>> --
>>>>>>
>>>>>> *Malithi Edirisinghe*
>>>>>> Senior Software Engineer
>>>>>> WSO2 Inc.
>>>>>>
>>>>>> Mobile : +94 (0) 718176807
>>>>>> [email protected]
>>>>>>
>>>>>
>>>>>
>>>>>
>>>>> --
>>>>>
>>>>> *Malithi Edirisinghe*
>>>>> Senior Software Engineer
>>>>> WSO2 Inc.
>>>>>
>>>>> Mobile : +94 (0) 718176807
>>>>> [email protected]
>>>>>
>>>>
>>>>
>>>>
>>>> --
>>>> ============================
>>>> Director, Research, WSO2 Inc.
>>>> Visiting Faculty, University of Moratuwa
>>>> Member, Apache Software Foundation
>>>> Research Scientist, Lanka Software Foundation
>>>> Blog: http://srinathsview.blogspot.com twitter:@srinath_perera
>>>> Site: http://people.apache.org/~hemapani/
>>>> Photos: http://www.flickr.com/photos/hemapani/
>>>> Phone: 0772360902
>>>>
>>>
>>>
>>>
>>> --
>>> ============================
>>> Director, Research, WSO2 Inc.
>>> Visiting Faculty, University of Moratuwa
>>> Member, Apache Software Foundation
>>> Research Scientist, Lanka Software Foundation
>>> Blog: http://srinathsview.blogspot.com twitter:@srinath_perera
>>> Site: http://people.apache.org/~hemapani/
>>> Photos: http://www.flickr.com/photos/hemapani/
>>> Phone: 0772360902
>>>
>>
>>
>>
>> --
>> Chanuka Dissanayake
>> *Software Engineer | **WSO2 Inc.*; http://wso2.com
>>
>> Mobile: +94 71 33 63 596
>> Email: [email protected]
>>
>
>
>
> --
>
> *Malithi Edirisinghe*
> Senior Software Engineer
> WSO2 Inc.
>
> Mobile : +94 (0) 718176807
> [email protected]
>
--
Chanuka Dissanayake
*Software Engineer | **WSO2 Inc.*; http://wso2.com
Mobile: +94 71 33 63 596
Email: [email protected]
_______________________________________________
Architecture mailing list
[email protected]
https://mail.wso2.org/cgi-bin/mailman/listinfo/architecture