[NO ISSUE] User-defined Function Documentation update - user model changes: no - storage format changes: no - interface changes: no
Details: 1. Updated the UDF documentation to be consisten with current master. 2. Cleaned default UDF package to remove useless UDFs. 3. Added the example in documentation as a test case for IT. 4. Reorganized the documentation to keep up with the new structure. 5. Minor changes to other documentation pages to keep style consistent. Change-Id: I17b1b4d639ca38689298ce88145257e794eb90e1 Reviewed-on: https://asterix-gerrit.ics.uci.edu/2804 Sonar-Qube: Jenkins <jenk...@fulliautomatix.ics.uci.edu> Tested-by: Jenkins <jenk...@fulliautomatix.ics.uci.edu> Contrib: Jenkins <jenk...@fulliautomatix.ics.uci.edu> Reviewed-by: Taewoo Kim <wangs...@gmail.com> Integration-Tests: Jenkins <jenk...@fulliautomatix.ics.uci.edu> Project: http://git-wip-us.apache.org/repos/asf/asterixdb/repo Commit: http://git-wip-us.apache.org/repos/asf/asterixdb/commit/511e1c83 Tree: http://git-wip-us.apache.org/repos/asf/asterixdb/tree/511e1c83 Diff: http://git-wip-us.apache.org/repos/asf/asterixdb/diff/511e1c83 Branch: refs/heads/master Commit: 511e1c83d3aed5fd40c19491ab9833c35663bb13 Parents: 1aeb8b6 Author: Xikui Wang <xkk...@gmail.com> Authored: Fri Sep 28 14:35:02 2018 -0700 Committer: Xikui Wang <xkk...@gmail.com> Committed: Fri Sep 28 16:58:19 2018 -0700 ---------------------------------------------------------------------- .../external-library/mysum/mysum.3.query.sqlpp | 3 +- .../feed-with-external-function.1.ddl.sqlpp | 30 +- .../feed-with-external-function.2.lib.sqlpp | 2 +- .../feed-with-external-function.3.update.sqlpp | 6 +- ...eed-with-external-function.5.pollquery.sqlpp | 4 +- .../feed-with-external-function.6.lib.sqlpp | 2 +- .../feed-with-external-function.7.ddl.sqlpp | 2 +- .../validate-default-library.1.adm | 2 +- .../feed-with-external-function.1.adm | 33 +- asterixdb/asterix-doc/pom.xml | 6 + .../src/main/data_ingestion/feeds.md | 358 ++++++++++++++++++ .../src/main/data_ingestion/feeds_title.md | 25 ++ .../src/main/installation/ansible_title.md | 2 + 
.../src/main/installation/aws_title.md | 2 + .../src/main/user-defined_function/udf.md | 147 ++++++++ .../src/main/user-defined_function/udf_title.md | 27 ++ .../src/site/markdown/feeds/tutorial.md | 361 ------------------- .../asterix-doc/src/site/markdown/ncservice.md | 2 + asterixdb/asterix-doc/src/site/markdown/udf.md | 189 ---------- asterixdb/asterix-doc/src/site/site.xml | 2 +- .../apache/asterix/external/util/Datatypes.java | 18 - .../external/library/AddHashTagsFactory.java | 31 -- .../external/library/AddHashTagsFunction.java | 85 ----- .../library/AddHashTagsInPlaceFunction.java | 2 +- .../library/AddMentionedUsersFactory.java | 31 ++ .../library/addMentionedUsersFunction.java | 63 ++++ .../src/test/resources/library_descriptor.xml | 13 +- .../functionDataset/functionDataset.1.adm | 2 +- 28 files changed, 710 insertions(+), 740 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/asterixdb/blob/511e1c83/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/external-library/mysum/mysum.3.query.sqlpp ---------------------------------------------------------------------- diff --git a/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/external-library/mysum/mysum.3.query.sqlpp b/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/external-library/mysum/mysum.3.query.sqlpp index 9402e1f..a6a1cdc 100644 --- a/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/external-library/mysum/mysum.3.query.sqlpp +++ b/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/external-library/mysum/mysum.3.query.sqlpp @@ -18,5 +18,4 @@ */ use externallibtest; -let x=testlib#mysum(3,4) -select VALUE x; +testlib#mysum(3,4); http://git-wip-us.apache.org/repos/asf/asterixdb/blob/511e1c83/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/feeds/feed-with-external-function/feed-with-external-function.1.ddl.sqlpp 
---------------------------------------------------------------------- diff --git a/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/feeds/feed-with-external-function/feed-with-external-function.1.ddl.sqlpp b/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/feeds/feed-with-external-function/feed-with-external-function.1.ddl.sqlpp index 4fdc669..3bc33de 100644 --- a/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/feeds/feed-with-external-function/feed-with-external-function.1.ddl.sqlpp +++ b/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/feeds/feed-with-external-function/feed-with-external-function.1.ddl.sqlpp @@ -22,34 +22,20 @@ * Date : 4th Oct 2017 */ -drop dataverse externallibtest if exists; -create dataverse externallibtest; -use externallibtest; +drop dataverse udfs if exists; +create dataverse udfs; +use udfs; -create type TweetInputType as open { - id: string, - username : string, - location : string, - text : string, - timestamp : string -}; - -create type TweetOutputType as open { - id: string, - username : string, - location : string, - text : string, - timestamp : string, - topics : {{string}} +create type TweetType if not exists as open { + id: int64 }; create feed TweetFeed with { "adapter-name" : "localfs", - "type-name" : "TweetInputType", - "path" : "asterix_nc1://data/twitter/obamatweets.adm", + "type-name" : "TweetType", + "path" : "asterix_nc1://data/twitter/extrasmalltweets.txt", "format" : "adm" }; -create dataset TweetsFeedIngest(TweetOutputType) -primary key id; +create dataset ProcessedTweets(TweetType) primary key id; \ No newline at end of file http://git-wip-us.apache.org/repos/asf/asterixdb/blob/511e1c83/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/feeds/feed-with-external-function/feed-with-external-function.2.lib.sqlpp ---------------------------------------------------------------------- diff --git 
a/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/feeds/feed-with-external-function/feed-with-external-function.2.lib.sqlpp b/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/feeds/feed-with-external-function/feed-with-external-function.2.lib.sqlpp index d1e0e87..4f0c6d3 100644 --- a/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/feeds/feed-with-external-function/feed-with-external-function.2.lib.sqlpp +++ b/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/feeds/feed-with-external-function/feed-with-external-function.2.lib.sqlpp @@ -16,4 +16,4 @@ * specific language governing permissions and limitations * under the License. */ -install externallibtest testlib target/data/externallib/asterix-external-data-testlib.zip \ No newline at end of file +install udfs testlib target/data/externallib/asterix-external-data-testlib.zip \ No newline at end of file http://git-wip-us.apache.org/repos/asf/asterixdb/blob/511e1c83/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/feeds/feed-with-external-function/feed-with-external-function.3.update.sqlpp ---------------------------------------------------------------------- diff --git a/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/feeds/feed-with-external-function/feed-with-external-function.3.update.sqlpp b/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/feeds/feed-with-external-function/feed-with-external-function.3.update.sqlpp index 0d46387..1407514b 100644 --- a/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/feeds/feed-with-external-function/feed-with-external-function.3.update.sqlpp +++ b/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/feeds/feed-with-external-function/feed-with-external-function.3.update.sqlpp @@ -21,10 +21,8 @@ * Expected Res : Success * Date : 4th Oct 2017 */ -use externallibtest; +use udfs; -SET `compiler.parallelism` "5"; - -connect feed TweetFeed to 
dataset TweetsFeedIngest apply function `testlib#parseTweet`; +connect feed TweetFeed to dataset ProcessedTweets apply function testlib#addMentionedUsers; start feed TweetFeed; \ No newline at end of file http://git-wip-us.apache.org/repos/asf/asterixdb/blob/511e1c83/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/feeds/feed-with-external-function/feed-with-external-function.5.pollquery.sqlpp ---------------------------------------------------------------------- diff --git a/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/feeds/feed-with-external-function/feed-with-external-function.5.pollquery.sqlpp b/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/feeds/feed-with-external-function/feed-with-external-function.5.pollquery.sqlpp index 607e5bd..b95294a 100644 --- a/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/feeds/feed-with-external-function/feed-with-external-function.5.pollquery.sqlpp +++ b/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/feeds/feed-with-external-function/feed-with-external-function.5.pollquery.sqlpp @@ -22,8 +22,8 @@ * Date : 4th Oct 2017 */ // polltimeoutsecs=5 -use externallibtest; +use udfs; -select value t from TweetsFeedIngest t +select value t from ProcessedTweets t ORDER BY t.id; http://git-wip-us.apache.org/repos/asf/asterixdb/blob/511e1c83/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/feeds/feed-with-external-function/feed-with-external-function.6.lib.sqlpp ---------------------------------------------------------------------- diff --git a/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/feeds/feed-with-external-function/feed-with-external-function.6.lib.sqlpp b/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/feeds/feed-with-external-function/feed-with-external-function.6.lib.sqlpp index 86af80f..98c334d 100644 --- 
a/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/feeds/feed-with-external-function/feed-with-external-function.6.lib.sqlpp +++ b/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/feeds/feed-with-external-function/feed-with-external-function.6.lib.sqlpp @@ -16,4 +16,4 @@ * specific language governing permissions and limitations * under the License. */ -uninstall externallibtest testlib \ No newline at end of file +uninstall udfs testlib \ No newline at end of file http://git-wip-us.apache.org/repos/asf/asterixdb/blob/511e1c83/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/feeds/feed-with-external-function/feed-with-external-function.7.ddl.sqlpp ---------------------------------------------------------------------- diff --git a/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/feeds/feed-with-external-function/feed-with-external-function.7.ddl.sqlpp b/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/feeds/feed-with-external-function/feed-with-external-function.7.ddl.sqlpp index 2a7acef..128c793 100644 --- a/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/feeds/feed-with-external-function/feed-with-external-function.7.ddl.sqlpp +++ b/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/feeds/feed-with-external-function/feed-with-external-function.7.ddl.sqlpp @@ -16,4 +16,4 @@ * specific language governing permissions and limitations * under the License. 
*/ -drop dataverse externallibtest if exists; +drop dataverse udfs if exists; http://git-wip-us.apache.org/repos/asf/asterixdb/blob/511e1c83/asterixdb/asterix-app/src/test/resources/runtimets/results/external-library/validate-default-library/validate-default-library.1.adm ---------------------------------------------------------------------- diff --git a/asterixdb/asterix-app/src/test/resources/runtimets/results/external-library/validate-default-library/validate-default-library.1.adm b/asterixdb/asterix-app/src/test/resources/runtimets/results/external-library/validate-default-library/validate-default-library.1.adm index f0ad2b2..9b2714a 100644 --- a/asterixdb/asterix-app/src/test/resources/runtimets/results/external-library/validate-default-library/validate-default-library.1.adm +++ b/asterixdb/asterix-app/src/test/resources/runtimets/results/external-library/validate-default-library/validate-default-library.1.adm @@ -1,5 +1,5 @@ -{ "Function": { "DataverseName": "externallibtest", "Name": "testlib#addHashTags", "Arity": "1", "Params": [ "Tweet" ], "ReturnType": "ProcessedTweet", "Definition": "org.apache.asterix.external.library.AddHashTagsFactory", "Language": "JAVA", "Kind": "SCALAR", "Dependencies": [ [ ], [ ] ] } } { "Function": { "DataverseName": "externallibtest", "Name": "testlib#addHashTagsInPlace", "Arity": "1", "Params": [ "Tweet" ], "ReturnType": "ProcessedTweet", "Definition": "org.apache.asterix.external.library.AddHashTagsInPlaceFactory", "Language": "JAVA", "Kind": "SCALAR", "Dependencies": [ [ ], [ ] ] } } +{ "Function": { "DataverseName": "externallibtest", "Name": "testlib#addMentionedUsers", "Arity": "1", "Params": [ "TweetType" ], "ReturnType": "TweetType", "Definition": "org.apache.asterix.external.library.AddMentionedUsersFactory", "Language": "JAVA", "Kind": "SCALAR", "Dependencies": [ [ ], [ ] ] } } { "Function": { "DataverseName": "externallibtest", "Name": "testlib#allTypes", "Arity": "1", "Params": [ "AllType" ], "ReturnType": 
"AllType", "Definition": "org.apache.asterix.external.library.AllTypesFactory", "Language": "JAVA", "Kind": "SCALAR", "Dependencies": [ [ ], [ ] ] } } { "Function": { "DataverseName": "externallibtest", "Name": "testlib#echoDelay", "Arity": "1", "Params": [ "TweetMessageType" ], "ReturnType": "TweetMessageType", "Definition": "org.apache.asterix.external.library.EchoDelayFactory", "Language": "JAVA", "Kind": "SCALAR", "Dependencies": [ [ ], [ ] ] } } { "Function": { "DataverseName": "externallibtest", "Name": "testlib#fnameDetector", "Arity": "1", "Params": [ "InputRecordType" ], "ReturnType": "DetectResultType", "Definition": "org.apache.asterix.external.library.KeywordsDetectorFactory", "Language": "JAVA", "Kind": "SCALAR", "Dependencies": [ [ ], [ ] ] } } http://git-wip-us.apache.org/repos/asf/asterixdb/blob/511e1c83/asterixdb/asterix-app/src/test/resources/runtimets/results/feeds/feed-with-external-function/feed-with-external-function.1.adm ---------------------------------------------------------------------- diff --git a/asterixdb/asterix-app/src/test/resources/runtimets/results/feeds/feed-with-external-function/feed-with-external-function.1.adm b/asterixdb/asterix-app/src/test/resources/runtimets/results/feeds/feed-with-external-function/feed-with-external-function.1.adm index 1291213..0f7eb82 100644 --- a/asterixdb/asterix-app/src/test/resources/runtimets/results/feeds/feed-with-external-function/feed-with-external-function.1.adm +++ b/asterixdb/asterix-app/src/test/resources/runtimets/results/feeds/feed-with-external-function/feed-with-external-function.1.adm @@ -1,12 +1,21 @@ -{ "id": "nc1:1", "username": "BronsonMike", "location": "", "text": "@GottaLaff @reutersus Christie and obama just foul weather friends", "timestamp": "Thu Dec 06 16:53:06 PST 2012", "topics": {{ }} } -{ "id": "nc1:100", "username": "KidrauhlProuds", "location": "", "text": "RT @01Direclieber: A filha do Michael Jackson uma Belieber,a filha do Eminem e uma Belieber,as filhas de 
Obama sao Beliebers, e a filha do meu pai e Belieber", "timestamp": "Thu Dec 06 16:53:16 PST 2012", "topics": {{ }} } -{ "id": "nc1:102", "username": "jaysauce82", "location": "", "text": "Not voting for President Obama #BadDecision", "timestamp": "Thu Dec 06 16:53:16 PST 2012", "topics": {{ "#BadDecision" }} } -{ "id": "nc1:104", "username": "princeofsupras", "location": "", "text": "RT @01Direclieber: A filha do Michael Jackson e uma Belieber,a filha do Eminem e uma Belieber,as filhas de Obama sao Beliebers, e a filha do meu pai e Belieber", "timestamp": "Thu Dec 06 16:53:15 PST 2012", "topics": {{ }} } -{ "id": "nc1:106", "username": "GulfDogs", "location": "", "text": "Obama Admin Knew Libyan Terrorists Had US-Provided Weaponsteaparty #tcot #ccot #NewGuards #BreitbartArmy #patriotwttp://t.co/vJxzrQUE", "timestamp": "Thu Dec 06 16:53:14 PST 2012", "topics": {{ "#tcot", "#ccot", "#NewGuards", "#BreitbartArmy", "#patriotwttp://t.co/vJxzrQUE" }} } -{ "id": "nc1:108", "username": "Laugzpz", "location": "", "text": "@AlfredoJalife Maestro Obama se hace de la vista gorda, es un acuerdo de siempre creo yo.", "timestamp": "Thu Dec 06 16:53:14 PST 2012", "topics": {{ }} } -{ "id": "nc1:11", "username": "magarika", "location": "", "text": "RT @ken24xavier: Obama tells SOROS - our plan is ALMOST finished http://t.co/WvzK0GtU", "timestamp": "Thu Dec 06 16:53:05 PST 2012", "topics": {{ }} } -{ "id": "nc1:111", "username": "ToucanMall", "location": "", "text": "RT @WorldWar3Watch: Michelle Obama Gets More Grammy Nominations Than Justin ... #Obama #WW3 http://t.co/0Wv2GKij", "timestamp": "Thu Dec 06 16:53:13 PST 2012", "topics": {{ "#Obama", "#WW3" }} } -{ "id": "nc1:113", "username": "ToucanMall", "location": "", "text": "RT @ObamaPalooza: Tiffany Shared What $2,000 Meant to Her ... 
and the President Stopped by to Talk About It http://t.co/sgT7lsNV #Obama", "timestamp": "Thu Dec 06 16:53:12 PST 2012", "topics": {{ "#Obama" }} } -{ "id": "nc1:115", "username": "thewildpitch", "location": "", "text": "RT @RevkahJC: Dennis Miller: Obama Should Just Say He Wants To Tax Successful People http://t.co/Ihlemy9Y", "timestamp": "Thu Dec 06 16:53:11 PST 2012", "topics": {{ }} } -{ "id": "nc1:117", "username": "Rnugent24", "location": "", "text": "RT @ConservativeQuo: unemployment is above 8% again. I wonder how long it will take for Obama to start blaming Bush? 3-2-1 #tcot #antiobama", "timestamp": "Thu Dec 06 16:53:10 PST 2012", "topics": {{ "#tcot", "#antiobama" }} } -{ "id": "nc1:119", "username": "ToucanMall", "location": "", "text": "RT @Newitrsdotcom: I hope #Obama will win re-election... Other four years without meaningless #wars", "timestamp": "Thu Dec 06 16:53:09 PST 2012", "topics": {{ "#Obama", "#wars" }} } +{ "id": 21, "tweetid": 69902639026020352, "loc": point("34.5,-100.5"), "time": datetime("2011-05-15T16:11:02.000Z"), "text": "thats that smokers cough maam <<<<<--- @x_incredibleL :: Allergies. i got that "cough" lol", "mentionedUsers": [ "@x_incredibleL" ] } +{ "id": 22, "tweetid": 69988755800465408, "loc": point("34.5,-97.5"), "time": datetime("2011-05-15T21:53:14.000Z"), "text": "Allergies fuckin over me..#damn", "mentionedUsers": [ ] } +{ "id": 23, "tweetid": 69940039605432320, "loc": point("34.5,-97.5"), "time": datetime("2011-05-15T18:39:39.000Z"), "text": "Natural Asthma Remedy - Deal With Your Asthma in a Natural Way.. Allergies", "mentionedUsers": [ ] } +{ "id": 24, "tweetid": 69834276929159169, "loc": point("25.5,-100.5"), "time": datetime("2011-05-15T11:39:23.000Z"), "text": "Damn Allergies... sneezing like crazy! 
>_<", "mentionedUsers": [ ] } +{ "id": 25, "tweetid": 69950146787553281, "loc": point("25.5,-97.5"), "time": datetime("2011-05-15T19:19:49.000Z"), "text": "pass me an asthma pump", "mentionedUsers": [ ] } +{ "id": 26, "tweetid": 69754524767756289, "loc": point("25.5,-97.5"), "time": datetime("2011-05-15T06:22:29.000Z"), "text": "Never knew allergies could actually keep me from sleeping", "mentionedUsers": [ ] } +{ "id": 27, "tweetid": 69999864498487297, "loc": point("25.5,-80.5"), "time": datetime("2011-05-15T22:37:22.000Z"), "text": "@ItsCrystal320 gooodd mommy! Except my allergies have been acting up :( and Im having issues with you know who. Smh nothing new. Lol", "mentionedUsers": [ "@ItsCrystal320" ] } +{ "id": 28, "tweetid": 69996796616777728, "loc": point("25.5,-80.5"), "time": datetime("2011-05-15T22:25:11.000Z"), "text": "My allergies act up so much while Im in this house!!! Idk why! Sneezing, now my eye is swollen!! Smh.", "mentionedUsers": [ ] } +{ "id": 29, "tweetid": 69977295351316480, "loc": point("25.5,-80.5"), "time": datetime("2011-05-15T21:07:41.000Z"), "text": "@GOLDenNote6 lmmmaaaoooo!!!! nnnnooo! ur the one that needs the asthma pump!", "mentionedUsers": [ "@GOLDenNote6" ] } +{ "id": 30, "tweetid": 69972022586912768, "loc": point("25.5,-80.5"), "time": datetime("2011-05-15T20:46:44.000Z"), "text": "@TinaLee90 hell yeah ! He snapped cause she got allergies and heavy she be snorting and coughing while he trying to study", "mentionedUsers": [ "@TinaLee90" ] } +{ "id": 31, "tweetid": 69965044678524928, "loc": point("25.5,-80.5"), "time": datetime("2011-05-15T20:19:01.000Z"), "text": "Back home and my ears begin to itch!!! Omg allergies go away please! #thingsicanlivewithout", "mentionedUsers": [ ] } +{ "id": 32, "tweetid": 69961997680246784, "loc": point("25.5,-80.5"), "time": datetime("2011-05-15T20:06:54.000Z"), "text": "@BravoAndy allergies acting up again or you just digging the glasses? 
Haha u rock it though!", "mentionedUsers": [ "@BravoAndy" ] } +{ "id": 33, "tweetid": 69946356248215552, "loc": point("25.5,-80.5"), "time": datetime("2011-05-15T19:04:45.000Z"), "text": "My allergies act up at the worst times -_-", "mentionedUsers": [ ] } +{ "id": 34, "tweetid": 69929466691993600, "loc": point("25.5,-80.5"), "time": datetime("2011-05-15T17:57:38.000Z"), "text": "Hate being sick!!! -_____- I hate you allergies! :/", "mentionedUsers": [ ] } +{ "id": 35, "tweetid": 69928014615556096, "loc": point("25.5,-80.5"), "time": datetime("2011-05-15T17:51:52.000Z"), "text": "Allergies please go away :(", "mentionedUsers": [ ] } +{ "id": 36, "tweetid": 69916338092654592, "loc": point("25.5,-80.5"), "time": datetime("2011-05-15T17:05:28.000Z"), "text": "I feel tired....i got asthma :( but it was still an awesome birthday", "mentionedUsers": [ ] } +{ "id": 37, "tweetid": 69911241975529474, "loc": point("25.5,-80.5"), "time": datetime("2011-05-15T16:45:13.000Z"), "text": "Cant stand that asthma commercial with the gold fish -__-", "mentionedUsers": [ ] } +{ "id": 38, "tweetid": 69910467233062912, "loc": point("25.5,-80.5"), "time": datetime("2011-05-15T16:42:08.000Z"), "text": "@PapisFavWave whats wrong? Got a cold? Asthma ?", "mentionedUsers": [ "@PapisFavWave" ] } +{ "id": 39, "tweetid": 69908652202536961, "loc": point("25.5,-80.5"), "time": datetime("2011-05-15T16:34:56.000Z"), "text": "My allergies are killing me!", "mentionedUsers": [ ] } +{ "id": 40, "tweetid": 69897794273546240, "loc": point("25.5,-80.5"), "time": datetime("2011-05-15T15:51:47.000Z"), "text": "and allergies", "mentionedUsers": [ ] } +{ "id": 41, "tweetid": 69893733449080832, "loc": point("25.5,-80.5"), "time": datetime("2011-05-15T15:35:39.000Z"), "text": "Repeated splashing of water about the skin, specifically following an exposure to pollution and dirt, makes sure... 
http://bit.ly/mnWnJo", "mentionedUsers": [ ] } http://git-wip-us.apache.org/repos/asf/asterixdb/blob/511e1c83/asterixdb/asterix-doc/pom.xml ---------------------------------------------------------------------- diff --git a/asterixdb/asterix-doc/pom.xml b/asterixdb/asterix-doc/pom.xml index 8ddc1d8..0cc43e4 100644 --- a/asterixdb/asterix-doc/pom.xml +++ b/asterixdb/asterix-doc/pom.xml @@ -69,6 +69,12 @@ <concat destfile="${project.build.directory}/generated-site/markdown/aws.md"> <filelist dir="${project.basedir}/src/main/installation/" files="aws_title.md,aws.md" /> </concat> + <concat destfile="${project.build.directory}/generated-site/markdown/feeds.md"> + <filelist dir="${project.basedir}/src/main/data_ingestion/" files="feeds_title.md,feeds.md" /> + </concat> + <concat destfile="${project.build.directory}/generated-site/markdown/udf.md"> + <filelist dir="${project.basedir}/src/main/user-defined_function/" files="udf_title.md,udf.md" /> + </concat> </target> </configuration> <goals> http://git-wip-us.apache.org/repos/asf/asterixdb/blob/511e1c83/asterixdb/asterix-doc/src/main/data_ingestion/feeds.md ---------------------------------------------------------------------- diff --git a/asterixdb/asterix-doc/src/main/data_ingestion/feeds.md b/asterixdb/asterix-doc/src/main/data_ingestion/feeds.md new file mode 100644 index 0000000..0dd6789 --- /dev/null +++ b/asterixdb/asterix-doc/src/main/data_ingestion/feeds.md @@ -0,0 +1,358 @@ +<!-- + ! Licensed to the Apache Software Foundation (ASF) under one + ! or more contributor license agreements. See the NOTICE file + ! distributed with this work for additional information + ! regarding copyright ownership. The ASF licenses this file + ! to you under the Apache License, Version 2.0 (the + ! "License"); you may not use this file except in compliance + ! with the License. You may obtain a copy of the License at + ! + ! http://www.apache.org/licenses/LICENSE-2.0 + ! + ! 
Unless required by applicable law or agreed to in writing, + ! software distributed under the License is distributed on an + ! "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + ! KIND, either express or implied. See the License for the + ! specific language governing permissions and limitations + ! under the License. + !--> + +## <a name="Introduction">Introduction</a> ## + +In this document, we describe the support for data ingestion in +AsterixDB. Data feeds are a new mechanism for having continuous +data arrive into a BDMS from external sources and incrementally +populate a persisted dataset and associated indexes. We add a new BDMS +architectural component, called a data feed, that makes a Big Data system the caretaker for functionality that +used to live outside, and we show how it improves users' lives and system performance. + +## <a name="FeedAdapters">Feed Adapters</a> ## + +The functionality of establishing a connection with a data source +and receiving, parsing and translating its data into ADM objects +(for storage inside AsterixDB) is contained in a feed adapter. A +feed adapter is an implementation of an interface and its details are +specific to a given data source. An adapter may optionally be given +parameters to configure its runtime behavior. Depending upon the +data transfer protocol/APIs offered by the data source, a feed adapter +may operate in a push or a pull mode. Push mode involves just +one initial request by the adapter to the data source for setting up +the connection. Once a connection is authorized, the data source +"pushes" data to the adapter without any subsequent requests by +the adapter. In contrast, when operating in a pull mode, the adapter +makes a separate request each time to receive data. +AsterixDB currently provides built-in adapters for several popular +data sources such as Twitter and RSS feeds. 
AsterixDB additionally +provides a generic socket-based adapter that can be used +to ingest data that is directed at a prescribed socket. + + +In this tutorial, we shall describe building three example data ingestion pipelines +that cover the popular scenarios of ingesting data from (a) Twitter, (b) RSS, and (c) socket feed sources. + +####Ingesting Twitter Stream +We shall use the built-in push-based Twitter adapter. +As a pre-requisite, we must define a Tweet using the AsterixDB Data Model (ADM) +and the query language SQL++. Given below are the type definitions in SQL++ +that create a Tweet datatype which is representative of a real tweet as obtained from Twitter. + + drop dataverse feeds if exists; + + create dataverse feeds; + use feeds; + + create type TwitterUser as closed { + screen_name: string, + lang: string, + friends_count: int32, + statuses_count: int32 + }; + + create type Tweet as open { + id: int64, + user: TwitterUser + }; + + create dataset Tweets (Tweet) primary key id; + +We also create a dataset that we shall use to persist the tweets in AsterixDB. +Next we make use of the `create feed` SQL++ statement to define our example data feed. + +#####Using the "push_twitter" feed adapter##### +The "push_twitter" adapter requires setting up an application account with Twitter. To retrieve +tweets, Twitter requires registering an application. Registration involves providing +a name and a brief description for the application. Each application has associated OAuth +authentication credentials that include OAuth keys and tokens. Accessing the +Twitter API requires providing the following. + +1. Consumer Key (API Key) +2. Consumer Secret (API Secret) +3. Access Token +4. Access Token Secret + +The "push_twitter" adapter takes as configuration the above mentioned +parameters. End users are required to obtain the above authentication credentials prior to +using the "push_twitter" adapter. 
For further information on obtaining OAuth keys and tokens and +registering an application with Twitter, please visit http://apps.twitter.com. + +Note that AsterixDB uses the Twitter4J API for getting data from Twitter. Due to a license conflict, +Apache AsterixDB cannot ship the Twitter4J library. To use the Twitter adapter in AsterixDB, +please download the necessary dependencies (`twitter4j-core-4.0.x.jar` and `twitter4j-stream-4.0.x.jar`) and drop +them into the `repo/` directory before AsterixDB starts. + +Given below is an example SQL++ statement that creates a feed called "TwitterFeed" by using the +"push_twitter" adapter. + + use feeds; + + create feed TwitterFeed with { + "adapter-name": "push_twitter", + "type-name": "Tweet", + "format": "twitter-status", + "consumer.key": "************", + "consumer.secret": "************", + "access.token": "**********", + "access.token.secret": "*************" + }; + +The authentication parameters above must be valid. +Note that the `create feed` statement does not initiate the flow of data from Twitter into +the AsterixDB instance. Instead, the `create feed` statement only results in registering +the feed with the instance. The flow of data along a feed is initiated when it is connected +to a target dataset using the connect feed statement and activated using the start feed statement. + +The Twitter adapter also supports several Twitter streaming APIs as follows: + +1. Track filter `"keywords": "AsterixDB, Apache"` +2. Locations filter `"locations": "-29.7, 79.2, 36.7, 72.0; -124.848974,-66.885444, 24.396308, 49.384358"` +3. Language filter `"language": "en"` +4. 
Filter level `"filter-level": "low"` + +An example of Twitter adapter tracking tweets with keyword "news" can be described using the following DDL: + + use feeds; + + create feed TwitterFeed with { + "adapter-name": "push_twitter", + "type-name": "Tweet", + "format": "twitter-status", + "consumer.key": "************", + "consumer.secret": "************", + "access.token": "**********", + "access.token.secret": "*************", + "keywords": "news" + }; + +For more details about these APIs, please visit https://dev.twitter.com/streaming/overview/request-parameters + +####Lifecycle of a Feed#### + +A feed is a logical artifact that is brought to life (i.e., its data flow +is initiated) only when it is activated using the `start feed` statement. +Before we activate a feed, we need to designate the dataset where the data is to be persisted +using the `connect feed` statement. +Subsequent to a `connect feed` statement, the feed is said to be in the connected state. +After that, the `start feed` statement will activate the feed, and start the dataflow from the feed to its connected dataset. +Multiple feeds can simultaneously be connected to a dataset such that the +contents of the dataset represent the union of the connected feeds. +Also one feed can be simultaneously connected to multiple target datasets. + + use feeds; + + connect feed TwitterFeed to dataset Tweets; + + start feed TwitterFeed; + +The `connect feed` statement above directs AsterixDB to persist +the data from `TwitterFeed` feed into the `Tweets` dataset. The `start feed` statement will activate the feed and +start the dataflow. +If it is required (by the high-level application) to also retain the raw +tweets obtained from Twitter, the end user may additionally choose +to connect TwitterFeed to a different dataset. + +Let the feed run for a minute, then run the following query to see the +latest tweets that are stored into the data set. 
+ + use feeds; + + select * from Tweets limit 10; + +The flow of data from a feed can be terminated explicitly with the `stop feed` statement. + + use feeds; + + stop feed TwitterFeed; + +The `disconnect feed` statement can be used to disconnect the feed from a certain dataset. + + use feeds; + + disconnect feed TwitterFeed from dataset Tweets; + +###Ingesting with Other Adapters +AsterixDB has several builtin feed adapters for data ingestion. Users can also +implement their own adapters and plug them into AsterixDB. +Here we introduce the `socket_adapter` and `localfs` +feed adapters, which cover most of the common application scenarios. + +#####Using the "socket_adapter" feed adapter##### +A `socket_adapter` feed opens a socket on the given node, which allows users to push data into +AsterixDB directly. Here is an example: + + drop dataverse feeds if exists; + create dataverse feeds; + use feeds; + + create type TestDataType as open { + screenName: string + }; + + create dataset TestDataset(TestDataType) primary key screenName; + + create feed TestSocketFeed with { + "adapter-name": "socket_adapter", + "sockets": "127.0.0.1:10001", + "address-type": "IP", + "type-name": "TestDataType", + "format": "adm" + }; + + connect feed TestSocketFeed to dataset TestDataset; + + use feeds; + start feed TestSocketFeed; + +The above statements create a socket feed which is listening on port "10001" of the host machine. This feed accepts data +records in "adm" format. As an example, you can download the sample dataset [Chirp Users](../data/chu.adm) and push them line +by line into the socket feed using any socket client you like.
Following is a socket client example in Python: + + from socket import socket + + ip = '127.0.0.1' + port1 = 10001 + filePath = 'chu.adm' + + sock1 = socket() + sock1.connect((ip, port1)) + + with open(filePath) as inputData: + for line in inputData: + sock1.sendall(line) + sock1.close() + + +####Using the "localfs" feed adapter#### +`localfs` adapter enables data ingestion from local file system. It allows user to feed data records on local disk +into a dataset. A DDL example for creating a `localfs` feed is given as follow: + + use feeds; + + create type TestDataType as open { + screenName: string + }; + + create dataset TestDataset(TestDataType) primary key screenName; + + create feed TestFileFeed with { + "adapter-name": "localfs", + "type-name": "TestDataType", + "path": "HOSTNAME://LOCAL_FILE_PATH", + "format": "adm" + }; + + connect feed TestFileFeed to dataset TestDataset; + + start feed TestFileFeed; + +Similar to previous examples, we need to define the datatype and dataset this feed uses. +The "path" parameter refers to the local data file that we want to ingest data from. +`HOSTNAME` can either be the IP address or node name of the machine which holds the file. +`LOCAL_FILE_PATH` indicates the absolute path to the file on that machine. Similarly to `socket_adapter`, +this feed takes `adm` formatted data records. + +### Datatype for feed and target dataset + +The "type-name" parameter in create feed statement defines the `datatype` of the datasource. In most use cases, +feed will have the same `datatype` as the target dataset. However, if we want to perform certain preprocess before the +data records gets into the target dataset (append autogenerated key, apply user defined functions, etc.), we will +need to define the datatypes for feed and dataset separately. + +#### Ingestion with autogenerated key + +AsterixDB supports using autogenerated uuid as the primary key for dataset. 
When we use this feature, we will need to +define a datatype with the primary key field, and specify that field to be autogenerated when creating the dataset. +Use that same datatype in feed definition will cause a type discrepancy since there is no such field in the datasource. +Thus, we will need to define two separate datatypes for feed and dataset: + + use feeds; + + create type DBLPFeedType as closed { + dblpid: string, + title: string, + authors: string, + misc: string + } + + create type DBLPDataSetType as open { + id: uuid, + dblpid: string, + title: string, + authors: string, + misc: string + } + create dataset DBLPDataset(DBLPDataSetType) primary key id autogenerated; + + create feed DBLPFeed with { + "adapter-name": "socket_adapter", + "sockets": "127.0.0.1:10001", + "address-type": "IP", + "type-name": "DBLPFeedType", + "format": "adm" + }; + + connect feed DBLPFeed to dataset DBLPDataset; + + start feed DBLPFeed; + +## <a name="FeedPolicies">Policies for Feed Ingestion</a> ## + +Multiple feeds may be concurrently operational on an AsterixDB +cluster, each competing for resources (CPU cycles, network bandwidth, +disk IO) to maintain pace with their respective data sources. +As a data management system, AsterixDB is able to manage a set of concurrent +feeds and make dynamic decisions related to the allocation of +resources, resolving resource bottlenecks and the handling of failures. +Each feed has its own set of constraints, influenced largely +by the nature of its data source and the applications that intend +to consume and process the ingested data. Consider an application +that intends to discover the trending topics on Twitter by analyzing +tweets that are being processed. Losing a few tweets may be +acceptable. In contrast, when ingesting from a data source that +provides a click-stream of ad clicks, losing data would translate to +a loss of revenue for an application that tracks revenue by charging +advertisers per click. 
+ +AsterixDB allows a data feed to have an associated ingestion +policy that is expressed as a collection of parameters and associated +values. An ingestion policy dictates the runtime behavior of +the feed in response to resource bottlenecks and failures. AsterixDB provides +a set of policies that help customize the +system's runtime behavior when handling excess objects. + +####Policies + +- *Spill*: Objects that cannot be processed by an operator for lack of resources +(referred to as excess objects hereafter) should be persisted to the local disk for deferred processing. + +- *Discard*: Excess objects should be discarded. + +Note that the end user may choose to form a custom policy. For example, +it is possible in AsterixDB to create a custom policy that spills excess +objects to disk and subsequently resorts to throttling if the +spillage crosses a configured threshold. In all cases, the desired +ingestion policy is specified as part of the `connect feed` statement +or else the "Basic" policy will be chosen as the default. + + use feeds; + + connect feed TwitterFeed to dataset Tweets using policy Basic; \ No newline at end of file http://git-wip-us.apache.org/repos/asf/asterixdb/blob/511e1c83/asterixdb/asterix-doc/src/main/data_ingestion/feeds_title.md ---------------------------------------------------------------------- diff --git a/asterixdb/asterix-doc/src/main/data_ingestion/feeds_title.md b/asterixdb/asterix-doc/src/main/data_ingestion/feeds_title.md new file mode 100644 index 0000000..1b7293d --- /dev/null +++ b/asterixdb/asterix-doc/src/main/data_ingestion/feeds_title.md @@ -0,0 +1,25 @@ +<!-- + ! Licensed to the Apache Software Foundation (ASF) under one + ! or more contributor license agreements. See the NOTICE file + ! distributed with this work for additional information + ! regarding copyright ownership. The ASF licenses this file + ! to you under the Apache License, Version 2.0 (the + ! 
"License"); you may not use this file except in compliance + ! with the License. You may obtain a copy of the License at + ! + ! http://www.apache.org/licenses/LICENSE-2.0 + ! + ! Unless required by applicable law or agreed to in writing, + ! software distributed under the License is distributed on an + ! "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + ! KIND, either express or implied. See the License for the + ! specific language governing permissions and limitations + ! under the License. + !--> + +# Data Ingestion with Feeds # + +## <a id="#toc">Table of Contents</a> ## +* [Introduction](#Introduction) +* [Feed Adapters](#FeedAdapters) +* [Feed Policies](#FeedPolicies) \ No newline at end of file http://git-wip-us.apache.org/repos/asf/asterixdb/blob/511e1c83/asterixdb/asterix-doc/src/main/installation/ansible_title.md ---------------------------------------------------------------------- diff --git a/asterixdb/asterix-doc/src/main/installation/ansible_title.md b/asterixdb/asterix-doc/src/main/installation/ansible_title.md index 307580a..d72801f 100644 --- a/asterixdb/asterix-doc/src/main/installation/ansible_title.md +++ b/asterixdb/asterix-doc/src/main/installation/ansible_title.md @@ -16,7 +16,9 @@ ! specific language governing permissions and limitations ! under the License. !--> +# Installation using Ansible # +## <a id="#toc">Table of Contents</a> ## * [Introduction](#Introduction) * [Prerequisites](#Prerequisites) * [Cluster Configuration](#config) http://git-wip-us.apache.org/repos/asf/asterixdb/blob/511e1c83/asterixdb/asterix-doc/src/main/installation/aws_title.md ---------------------------------------------------------------------- diff --git a/asterixdb/asterix-doc/src/main/installation/aws_title.md b/asterixdb/asterix-doc/src/main/installation/aws_title.md index abf01c9..9af36a9 100644 --- a/asterixdb/asterix-doc/src/main/installation/aws_title.md +++ b/asterixdb/asterix-doc/src/main/installation/aws_title.md @@ -16,7 +16,9 @@ ! 
specific language governing permissions and limitations ! under the License. !--> +# Installation using Amazon Web Services # +## <a id="#toc">Table of Contents</a> ## * [Introduction](#Introduction) * [Prerequisites](#Prerequisites) * [Cluster Configuration](#config) http://git-wip-us.apache.org/repos/asf/asterixdb/blob/511e1c83/asterixdb/asterix-doc/src/main/user-defined_function/udf.md ---------------------------------------------------------------------- diff --git a/asterixdb/asterix-doc/src/main/user-defined_function/udf.md b/asterixdb/asterix-doc/src/main/user-defined_function/udf.md new file mode 100644 index 0000000..2431448 --- /dev/null +++ b/asterixdb/asterix-doc/src/main/user-defined_function/udf.md @@ -0,0 +1,147 @@ +<!-- + ! Licensed to the Apache Software Foundation (ASF) under one + ! or more contributor license agreements. See the NOTICE file + ! distributed with this work for additional information + ! regarding copyright ownership. The ASF licenses this file + ! to you under the Apache License, Version 2.0 (the + ! "License"); you may not use this file except in compliance + ! with the License. You may obtain a copy of the License at + ! + ! http://www.apache.org/licenses/LICENSE-2.0 + ! + ! Unless required by applicable law or agreed to in writing, + ! software distributed under the License is distributed on an + ! "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + ! KIND, either express or implied. See the License for the + ! specific language governing permissions and limitations + ! under the License. + !--> + +## <a name="introduction">Introduction</a>## + +Apache AsterixDB supports two languages for writing user-defined functions (UDFs): SQL++ and Java. +A user can encapsulate data processing logic into a UDF and invoke it +later repeatedly. For SQL++ functions, a user can refer to [SQL++ Functions](sqlpp/manual.html#Functions) +for their usages. 
In this document, we +focus on how to install/invoke/uninstall a Java function library using the Ansible script that we provide. + + +## <a name="installingUDF">Installing an UDF Library</a>## + +UDFs have to be installed offline. +This section describes the process assuming that you have followed the preceding [ansible installation instructions](ansible.html) +to deploy an AsterixDB instance on your local machine or cluster. Here are the +instructions to install an UDF library: + +- Step 1: Stop the AsterixDB instance if it is ACTIVE. + + $ bin/stop.sh + +- Step 2: Deploy the UDF package. + + $ bin/udf.sh -m i -d DATAVERSE_NAME -l LIBRARY_NAME -p UDF_PACKAGE_PATH + +- Step 3: Start AsterixDB. + + $ bin/start.sh + +After AsterixDB starts, you can use the following query to check whether your UDFs have been successfully registered with the system. + + SELECT * FROM Metadata.`Function`; + +In the AsterixDB source release, we provide several sample UDFs that you can try out. +You need to build the AsterixDB source to get the compiled UDF package. It can be found under +the `asterixdb-external` sub-project. Assuming that these UDFs have been installed into the `udfs` dataverse and `testlib` library, +here is an example that uses the sample UDF `mysum` to compute the sum of two input integers. + + use udfs; + + testlib#mysum(3,4); + +## <a id="UDFOnFeeds">Attaching a UDF on Data Feeds</a> ## + +In [Data Ingestion using feeds](feeds.html), we introduced an efficient way for users to get data into AsterixDB. In +some use cases, users may want to pre-process the incoming data before storing it into the dataset. To meet this need, +AsterixDB allows +the user to attach a UDF onto the ingestion pipeline. Following the example in [Data Ingestion](feeds.html), here we +show an example of how to attach a UDF that extracts the user names mentioned from the incoming Tweet text, storing the +processed Tweets into a dataset.
+ +We start by creating the datatype and dataset that will be used for the feed and UDF. One thing to keep in mind is that +data flows from the feed to the UDF and then to the dataset. This means that the feed's datatype +should be the same as the input type of the UDF, and the output datatype of the UDF should be the same as the dataset's +datatype. Thus, users should make sure that their datatypes are consistent in the UDF configuration. Users can also +take advantage of open datatypes in AsterixDB by creating a minimum description of the data for simplicity. +Here we use open datatypes: + + use udfs; + + create type TweetType if not exists as open { + id: int64 + }; + + create dataset ProcessedTweets(TweetType) primary key id; + +As the `TweetType` is an open datatype, processed Tweets can be stored into the dataset after they are annotated +with an extra attribute. Given the datatype and dataset above, we can create a Twitter Feed with the same datatype. +Please refer to section [Data Ingestion](feeds.html) if you have any trouble in creating feeds. + + use udfs; + + create feed TwitterFeed with { + "adapter-name": "push_twitter", + "type-name": "TweetType", + "format": "twitter-status", + "consumer.key": "************", + "consumer.secret": "************", + "access.token": "**********", + "access.token.secret": "*************" + }; + +After creating the feed, we attach the UDF onto the feed pipeline and start the feed with following statements: + + use udfs; + + connect feed TwitterFeed to dataset ProcessedTweets apply function udfs#addMentionedUsers; + + start feed TwitterFeed; + +You can check the annotated Tweets by querying the `ProcessedTweets` dataset: + + SELECT * FROM ProcessedTweets LIMIT 10; + +## <a name="udfConfiguration">A quick look of the UDF configuration</a>## + +AsterixDB uses an XML configuration file to describe the UDFs. A user can use it to define and reuse their compiled UDFs +for different purposes. 
Here is a snippet of the configuration used in our [previous example](#UDFOnFeeds): + + <libraryFunction> + <name>addMentionedUsers</name> + <function_type>SCALAR</function_type> + <argument_type>TweetType</argument_type> + <return_type>TweetType</return_type> + <definition>org.apache.asterix.external.library.AddMentionedUsersFactory</definition> + <parameters>text</parameters> + </libraryFunction> + +Here are the explanations of the fields in the configuration file: + + name: The proper name that is used to invoke the function. + function_type: The type of the function. + argument_type: The datatype of the arguments passed in. If there is more than one parameter, separate them with comma(s), e.g., `AINT32,AINT32`. + return_type: The datatype of the return value. + definition: A reference to the function factory. + parameters: The parameters passed into the function. + +In our feeds example, we passed in `"text"` as a parameter to the function so it knows which field to look at to get the Tweet text. +If the Twitter API were to change its field names in the future, we can accommodate that change by simply modifying the configuration file +instead of recompiling the whole UDF package. This feature can be further utilized in use cases where a user has a Machine Learning +algorithm with different trained model files.
If you are interested, you can find more examples [here](https://github.com/apache/asterixdb/tree/master/asterixdb/asterix-external-data/src/test/java/org/apache/asterix/external/library). + +## <a name="uninstall">Uninstalling an UDF Library</a>## + +If you want to uninstall the UDF library, put AsterixDB into `INACTIVE` mode and run the following command: + + $ bin/udf.sh -m u -d DATAVERSE_NAME -l LIBRARY_NAME + + http://git-wip-us.apache.org/repos/asf/asterixdb/blob/511e1c83/asterixdb/asterix-doc/src/main/user-defined_function/udf_title.md ---------------------------------------------------------------------- diff --git a/asterixdb/asterix-doc/src/main/user-defined_function/udf_title.md b/asterixdb/asterix-doc/src/main/user-defined_function/udf_title.md new file mode 100644 index 0000000..659c13b --- /dev/null +++ b/asterixdb/asterix-doc/src/main/user-defined_function/udf_title.md @@ -0,0 +1,27 @@ +<!-- + ! Licensed to the Apache Software Foundation (ASF) under one + ! or more contributor license agreements. See the NOTICE file + ! distributed with this work for additional information + ! regarding copyright ownership. The ASF licenses this file + ! to you under the Apache License, Version 2.0 (the + ! "License"); you may not use this file except in compliance + ! with the License. You may obtain a copy of the License at + ! + ! http://www.apache.org/licenses/LICENSE-2.0 + ! + ! Unless required by applicable law or agreed to in writing, + ! software distributed under the License is distributed on an + ! "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + ! KIND, either express or implied. See the License for the + ! specific language governing permissions and limitations + ! under the License.
+ !--> + +# User-defined Functions # + +## <a id="#toc">Table of Contents</a> ## +* [Introduction](#introduction) +* [Installing an UDF Library](#installingUDF) +* [Attaching an UDF on Data Feeds](#UDFOnFeeds) +* [A quick look of the UDF configuration](#udfConfiguration) +* [Unstalling an UDF Library](#uninstall) \ No newline at end of file http://git-wip-us.apache.org/repos/asf/asterixdb/blob/511e1c83/asterixdb/asterix-doc/src/site/markdown/feeds/tutorial.md ---------------------------------------------------------------------- diff --git a/asterixdb/asterix-doc/src/site/markdown/feeds/tutorial.md b/asterixdb/asterix-doc/src/site/markdown/feeds/tutorial.md deleted file mode 100644 index f5635b8..0000000 --- a/asterixdb/asterix-doc/src/site/markdown/feeds/tutorial.md +++ /dev/null @@ -1,361 +0,0 @@ -<!-- - ! Licensed to the Apache Software Foundation (ASF) under one - ! or more contributor license agreements. See the NOTICE file - ! distributed with this work for additional information - ! regarding copyright ownership. The ASF licenses this file - ! to you under the Apache License, Version 2.0 (the - ! "License"); you may not use this file except in compliance - ! with the License. You may obtain a copy of the License at - ! - ! http://www.apache.org/licenses/LICENSE-2.0 - ! - ! Unless required by applicable law or agreed to in writing, - ! software distributed under the License is distributed on an - ! "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - ! KIND, either express or implied. See the License for the - ! specific language governing permissions and limitations - ! under the License. - !--> - -# Support for Data Ingestion in AsterixDB # - -## <a id="#toc">Table of Contents</a> ## - -* [Introduction](#Introduction) -* [Feed Adapters](#FeedAdapters) -* [Feed Policies](#FeedPolicies) - -## <a name="Introduction">Introduction</a> ## - -In this document, we describe the support for data ingestion in -AsterixDB. 
Data feeds are a new mechanism for having continuous -data arrive into a BDMS from external sources and incrementally -populate a persisted dataset and associated indexes. We add a new BDMS -architectural component, called a data feed, that makes a Big Data system the caretaker for functionality that -used to live outside, and we show how it improves users' lives and system performance. - -## <a name="FeedAdapters">Feed Adapters</a> ## - -The functionality of establishing a connection with a data source -and receiving, parsing and translating its data into ADM objects -(for storage inside AsterixDB) is contained in a feed adapter. A -feed adapter is an implementation of an interface and its details are -specific to a given data source. An adapter may optionally be given -parameters to configure its runtime behavior. Depending upon the -data transfer protocol/APIs offered by the data source, a feed adapter -may operate in a push or a pull mode. Push mode involves just -one initial request by the adapter to the data source for setting up -the connection. Once a connection is authorized, the data source -"pushes" data to the adapter without any subsequent requests by -the adapter. In contrast, when operating in a pull mode, the adapter -makes a separate request each time to receive data. -AsterixDB currently provides built-in adapters for several popular -data sources such as Twitter and RSS feeds. AsterixDB additionally -provides a generic socket-based adapter that can be used -to ingest data that is directed at a prescribed socket. - - -In this tutorial, we shall describe building two example data ingestion pipelines -that cover the popular scenarios of ingesting data from (a) Twitter (b) RSS (c) Socket Feed source. - -####Ingesting Twitter Stream -We shall use the built-in push-based Twitter adapter. -As a pre-requisite, we must define a Tweet using the AsterixDB Data Model (ADM) -and the query language SQL++. 
Given below are the type definitions in SQL++ -that create a Tweet datatype which is representative of a real tweet as obtained from Twitter. - - drop dataverse feeds if exists; - - create dataverse feeds; - use feeds; - - create type TwitterUser as closed { - screen_name: string, - lang: string, - friends_count: int32, - statuses_count: int32 - }; - - create type Tweet as open { - id: int64, - user: TwitterUser - }; - - create dataset Tweets (Tweet) primary key id; - -We also create a dataset that we shall use to persist the tweets in AsterixDB. -Next we make use of the `create feed` SQL++ statement to define our example data feed. - -#####Using the "push_twitter" feed adapter##### -The "push_twitter" adapter requires setting up an application account with Twitter. To retrieve -tweets, Twitter requires registering an application. Registration involves providing -a name and a brief description for the application. Each application has associated OAuth -authentication credentials that include OAuth keys and tokens. Accessing the -Twitter API requires providing the following. - -1. Consumer Key (API Key) -2. Consumer Secret (API Secret) -3. Access Token -4. Access Token Secret - -The "push_twitter" adapter takes as configuration the above mentioned -parameters. End users are required to obtain the above authentication credentials prior to -using the "push_twitter" adapter. For further information on obtaining OAuth keys and tokens and -registering an application with Twitter, please visit http://apps.twitter.com - -Given below is an example SQL++ statement that creates a feed called "TwitterFeed" by using the -"push_twitter" adapter. 
- - use feeds; - - create feed TwitterFeed with { - "adapter-name": "push_twitter", - "type-name": "Tweet", - "format": "twitter-status", - "consumer.key": "************", - "consumer.secret": "************", - "access.token": "**********", - "access.token.secret": "*************" - }; - -It is required that the above authentication parameters are provided valid. -Note that the `create feed` statement does not initiate the flow of data from Twitter into -the AsterixDB instance. Instead, the `create feed` statement only results in registering -the feed with the instance. The flow of data along a feed is initiated when it is connected -to a target dataset using the connect feed statement and activated using the start feed statement. - -The Twitter adapter also supports several Twitter streaming APIs as follow: - -1. Track filter `"keywords": "AsterixDB, Apache"` -2. Locations filter `"locations": "-29.7, 79.2, 36.7, 72.0; -124.848974,-66.885444, 24.396308, 49.384358"` -3. Language filter `"language": "en"` -4. Filter level `"filter-level": "low"` - -An example of Twitter adapter tracking tweets with keyword "news" can be described using following ddl: - - use feeds; - - create feed TwitterFeed with { - "adapter-name": "push_twitter", - "type-name": "Tweet", - "format": "twitter-status", - "consumer.key": "************", - "consumer.secret": "************", - "access.token": "**********", - "access.token.secret": "*************", - "keywords": "news" - }; - -For more details about these APIs, please visit https://dev.twitter.com/streaming/overview/request-parameters - -####Lifecycle of a Feed#### - -A feed is a logical artifact that is brought to life (i.e., its data flow -is initiated) only when it is activated using the `start feed` statement. -Before we active a feed, we need to designate the dataset where the data to be persisted -using `connect feed` statement. -Subsequent to a `connect feed` statement, the feed is said to be in the connected state. 
-After that, `start feed` statement will activate the feed, and start the dataflow from feed to its connected dataset. -Multiple feeds can simultaneously be connected to a dataset such that the -contents of the dataset represent the union of the connected feeds. -Also one feed can be simultaneously connected to multiple target datasets. - - use feeds; - - connect feed TwitterFeed to dataset Tweets; - - start feed TwitterFeed; - -The `connect feed` statement above directs AsterixDB to persist -the data from `TwitterFeed` feed into the `Tweets` dataset. The `start feed` statement will activate the feed and -start the dataflow. -If it is required (by the high-level application) to also retain the raw -tweets obtained from Twitter, the end user may additionally choose -to connect TwitterFeed to a different dataset. - -Let the feed run for a minute, then run the following query to see the -latest tweets that are stored into the data set. - - use feeds; - - select * from Tweets limit 10; - -The dataflow of data from a feed can be terminated explicitly by `stop feed` statement. - - use feeds; - - stop feed TwitterFeed; - -The `disconnnect statement` can be used to disconnect the feed from certain dataset. - - use feeds; - - disconnect feed TwitterFeed from dataset Tweets; - -###Ingesting with Other Adapters -AsterixDB has several builtin feed adapters for data ingestion. User can also -implement their own adapters and plug them into AsterixDB. -Here we introduce `socket_adapter` and `localfs` -feed adapter that cover most of the common application scenarios. - -#####Using the "socket_adapter" feed adapter##### -`socket_adapter` feed opens a web socket on the given node which allows user to push data into -AsterixDB directly. 
Here is an example: - - drop dataverse feeds if exists; - create dataverse feeds; - use feeds; - - create type TestDataType as open { - screenName: string - }; - - create dataset TestDataset(TestDataType) primary key screenName; - - create feed TestSocketFeed with { - "adapter-name": "socket_adapter", - "sockets": "127.0.0.1:10001", - "address-type": "IP", - "type-name": "TestDataType", - "format": "adm" - }; - - connect feed TestSocketFeed to dataset TestDataset; - - use feeds; - start feed TestSocketFeed; - -The above statements create a socket feed which is listening to "10001" port of the host machine. This feed accepts data -records in "adm" format. As an example, you can download the sample dataset [Chirp Users](../data/chu.adm) and push them line -by line into the socket feed using any socket client you like. Following is a socket client example in Python: - - from socket import socket - - ip = '127.0.0.1' - port1 = 10001 - filePath = 'chu.adm' - - sock1 = socket() - sock1.connect((ip, port1)) - - with open(filePath) as inputData: - for line in inputData: - sock1.sendall(line) - sock1.close() - - -####Using the "localfs" feed adapter#### -`localfs` adapter enables data ingestion from local file system. It allows user to feed data records on local disk -into a dataset. A DDL example for creating a `localfs` feed is given as follow: - - use feeds; - - create type TestDataType as open { - screenName: string - }; - - create dataset TestDataset(TestDataType) primary key screenName; - - create feed TestFileFeed with { - "adapter-name": "localfs", - "type-name": "TestDataType", - "path": "HOSTNAME://LOCAL_FILE_PATH", - "format": "adm" - }; - - connect feed TestFileFeed to dataset TestDataset; - - start feed TestFileFeed; - -Similar to previous examples, we need to define the datatype and dataset this feed uses. -The "path" parameter refers to the local data file that we want to ingest data from. 
-`HOSTNAME` can either be the IP address or node name of the machine which holds the file. -`LOCAL_FILE_PATH` indicates the absolute path to the file on that machine. Similarly to `socket_adapter`, -this feed takes `adm` formatted data records. - -### Datatype for feed and target dataset - -The "type-name" parameter in create feed statement defines the `datatype` of the datasource. In most use cases, -feed will have the same `datatype` as the target dataset. However, if we want to perform certain preprocess before the -data records gets into the target dataset (append autogenerated key, apply user defined functions, etc.), we will -need to define the datatypes for feed and dataset separately. - -#### Ingestion with autogenerated key - -AsterixDB supports using autogenerated uuid as the primary key for dataset. When we use this feature, we will need to -define a datatype with the primary key field, and specify that field to be autogenerated when creating the dataset. -Use that same datatype in feed definition will cause a type discrepancy since there is no such field in the datasource. 
-Thus, we will need to define two separate datatypes for feed and dataset: - - use feeds; - - create type DBLPFeedType as closed { - dblpid: string, - title: string, - authors: string, - misc: string - } - - create type DBLPDataSetType as open { - id: uuid, - dblpid: string, - title: string, - authors: string, - misc: string - } - create dataset DBLPDataset(DBLPDataSetType) primary key id autogenerated; - - create feed DBLPFeed with { - "adapter-name": "socket_adapter", - "sockets": "127.0.0.1:10001", - "address-type": "IP", - "type-name": "DBLPFeedType", - "format": "adm" - }; - - connect feed DBLPFeed to dataset DBLPDataset; - - start feed DBLPFeed; - -## <a name="FeedPolicies">Policies for Feed Ingestion</a> ## - -Multiple feeds may be concurrently operational on an AsterixDB -cluster, each competing for resources (CPU cycles, network bandwidth, -disk IO) to maintain pace with their respective data sources. -As a data management system, AsterixDB is able to manage a set of concurrent -feeds and make dynamic decisions related to the allocation of -resources, resolving resource bottlenecks and the handling of failures. -Each feed has its own set of constraints, influenced largely -by the nature of its data source and the applications that intend -to consume and process the ingested data. Consider an application -that intends to discover the trending topics on Twitter by analyzing -tweets that are being processed. Losing a few tweets may be -acceptable. In contrast, when ingesting from a data source that -provides a click-stream of ad clicks, losing data would translate to -a loss of revenue for an application that tracks revenue by charging -advertisers per click. - -AsterixDB allows a data feed to have an associated ingestion -policy that is expressed as a collection of parameters and associated -values. An ingestion policy dictates the runtime behavior of -the feed in response to resource bottlenecks and failures. 
AsterixDB provides -a set of policies that help customize the -system's runtime behavior when handling excess objects. - -####Policies - -- *Spill*: Objects that cannot be processed by an operator for lack of resources -(referred to as excess objects hereafter) should be persisted to the local disk for deferred processing. - -- *Discard*: Excess objects should be discarded. - -Note that the end user may choose to form a custom policy. For example, -it is possible in AsterixDB to create a custom policy that spills excess -objects to disk and subsequently resorts to throttling if the -spillage crosses a configured threshold. In all cases, the desired -ingestion policy is specified as part of the `connect feed` statement -or else the "Basic" policy will be chosen as the default. - - use feeds; - - connect feed TwitterFeed to dataset Tweets using policy Basic; \ No newline at end of file http://git-wip-us.apache.org/repos/asf/asterixdb/blob/511e1c83/asterixdb/asterix-doc/src/site/markdown/ncservice.md ---------------------------------------------------------------------- diff --git a/asterixdb/asterix-doc/src/site/markdown/ncservice.md b/asterixdb/asterix-doc/src/site/markdown/ncservice.md index 2b309ce..ef2ac9b 100644 --- a/asterixdb/asterix-doc/src/site/markdown/ncservice.md +++ b/asterixdb/asterix-doc/src/site/markdown/ncservice.md @@ -17,6 +17,8 @@ ! under the License. !--> +# Installation using NCService # + ## <a id="toc">Table of Contents</a> ## * [Quick Start](#quickstart) http://git-wip-us.apache.org/repos/asf/asterixdb/blob/511e1c83/asterixdb/asterix-doc/src/site/markdown/udf.md ---------------------------------------------------------------------- diff --git a/asterixdb/asterix-doc/src/site/markdown/udf.md b/asterixdb/asterix-doc/src/site/markdown/udf.md deleted file mode 100644 index b2ef2bc..0000000 --- a/asterixdb/asterix-doc/src/site/markdown/udf.md +++ /dev/null @@ -1,189 +0,0 @@ -<!-- - ! Licensed to the Apache Software Foundation (ASF) under one - ! 
or more contributor license agreements. See the NOTICE file - ! distributed with this work for additional information - ! regarding copyright ownership. The ASF licenses this file - ! to you under the Apache License, Version 2.0 (the - ! "License"); you may not use this file except in compliance - ! with the License. You may obtain a copy of the License at - ! - ! http://www.apache.org/licenses/LICENSE-2.0 - ! - ! Unless required by applicable law or agreed to in writing, - ! software distributed under the License is distributed on an - ! "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - ! KIND, either express or implied. See the License for the - ! specific language governing permissions and limitations - ! under the License. - !--> - -# Support for User Defined Functions in AsterixDB # - -## <a id="#toc">Table of Contents</a> ## -* [Using UDF to preprocess feed-collected data](#PreprocessingCollectedData) -* [Writing an External UDF](#WritingAnExternalUDF) -* [Creating an AsterixDB Library](#CreatingAnAsterixDBLibrary) -* [Installing an AsterixDB Library](#installingUDF) - -In this document, we describe the support for implementing, using, and installing user-defined functions (UDF) in -AsterixDB. We will explain how we can use UDFs to preprocess, e.g., data collected using feeds (see the [feeds tutorial](feeds/tutorial.html)). - - -### <a name="installingUDF">Installing an AsterixDB Library</a>### - -We assume you have followed the [installation instructions](../install.html) to set up a running AsterixDB instance. Let us refer your AsterixDB instance by the name "my_asterix". - -- Step 1: Stop the AsterixDB instance if it is in the ACTIVE state. - - $ managix stop -n my_asterix - -- Step 2: Install the library using Managix install command. Just to illustrate, we use the help command to look up the syntax - - $ managix help -cmd install - Installs a library to an asterix instance. 
- Options - n Name of Asterix Instance - d Name of the dataverse under which the library will be installed - l Name of the library - p Path to library zip bundle - -Above is a sample output and explains the usage and the required parameters. Each library has a name and is installed under a dataverse. Recall that we had created a dataverse by the name - "feeds" prior to creating our datatypes and dataset. We shall name our library - "testlib". - -We assume you have a library zip bundle that needs to be installed. -To install the library, use the Managix install command. An example is shown below. - - $ managix install -n my_asterix -d feeds -l testlib -p extlibs/asterix-external-data-0.8.7-binary-assembly.zip - -You should see the following message: - - INFO: Installed library testlib - -We shall next start our AsterixDB instance using the start command as shown below. - - $ managix start -n my_asterix - -You may now use the AsterixDB library in AQL statements and queries. To look at the installed artifacts, you may execute the following query at the AsterixDB web-console. - - for $x in dataset Metadata.Function - return $x - - for $x in dataset Metadata.Library - return $x - -Our library is now installed and is ready to be used. - - -## <a id="PreprocessingCollectedData">Preprocessing Collected Data</a> ### - -In the following we assume that you already created the `TwitterFeed` and its corresponding data types and dataset following the instruction explained in the [feeds tutorial](feeds/tutorial.html). - -A feed definition may optionally include the specification of a -user-defined function that is to be applied to each feed object prior -to persistence. Examples of pre-processing might include adding -attributes, filtering out objects, sampling, sentiment analysis, feature -extraction, etc. We can express a UDF, which can be defined in AQL or in a programming -language such as Java, to perform such pre-processing. 
An AQL UDF is a good fit when -pre-processing an object requires the result of a query (join or aggregate) -over data contained in AsterixDB datasets. More sophisticated -processing such as sentiment analysis of text is better handled -by providing a Java UDF. A Java UDF has an initialization phase -that allows the UDF to access any resources it may need to initialize -itself prior to being used in a data flow. It is assumed by the -AsterixDB compiler to be stateless and thus usable as an embarrassingly -parallel black box. In contrast, the AsterixDB compiler can -reason about an AQL UDF and involve the use of indexes during -its invocation. - -We consider an example transformation of a raw tweet into its -lightweight version called `ProcessedTweet`, which is defined next. - - use dataverse feeds; - - create type ProcessedTweet if not exists as open { - id: string, - user_name:string, - location:point, - created_at:string, - message_text:string, - country: string, - topics: {{string}} - }; - - create dataset ProcessedTweets(ProcessedTweet) - primary key id; - -The processing required in transforming a collected tweet to its lighter version of type `ProcessedTweet` involves extracting the topics or hash-tags (if any) in a tweet -and collecting them in the referred "topics" attribute for the tweet. -Additionally, the latitude and longitude values (doubles) are combined into the spatial point type. Note that spatial data types are considered as first-class citizens that come with the support for creating indexes. Next we show a revised version of our example TwitterFeed that involves the use of a UDF. We assume that the UDF that contains the transformation logic into a "ProcessedTweet" is available as a Java UDF inside an AsterixDB library named 'testlib'. We defer the writing of a Java UDF and its installation as part of an AsterixDB library to a later section of this document. 
- - use dataverse feeds; - - create feed ProcessedTwitterFeed if not exists - using "push_twitter" - (("type-name"="Tweet"), - ("consumer.key"="************"), - ("consumer.secret"="**************"), - ("access.token"="**********"), - ("access.token.secret"="*************")) - - apply function testlib#addHashTagsInPlace; - -Note that a feed adaptor and a UDF act as pluggable components. These -contribute towards providing a generic "plug-and-play" model where -custom implementations can be provided to cater to specific requirements. - -####Building a Cascade Network of Feeds#### -Multiple high-level applications may wish to consume the data -ingested from a data feed. Each such application might perceive the -feed in a different way and require the arriving data to be processed -and/or persisted differently. Building a separate flow of data from -the external source for each application is wasteful of resources as -the pre-processing or transformations required by each application -might overlap and could be done together in an incremental fashion -to avoid redundancy. A single flow of data from the external source -could provide data for multiple applications. To achieve this, we -introduce the notion of primary and secondary feeds in AsterixDB. - -A feed in AsterixDB is considered to be a primary feed if it gets -its data from an external data source. The objects contained in a -feed (subsequent to any pre-processing) are directed to a designated -AsterixDB dataset. Alternatively or additionally, these objects can -be used to derive other feeds known as secondary feeds. A secondary -feed is similar to its parent feed in every other aspect; it can -have an associated UDF to allow for any subsequent processing, -can be persisted into a dataset, and/or can be made to derive other -secondary feeds to form a cascade network. A primary feed and a -dependent secondary feed form a hierarchy. 
As an example, we next show an -example AQL statement that redefines the previous feed -"ProcessedTwitterFeed" in terms of its -parent feed (TwitterFeed). - - use dataverse feeds; - - drop feed ProcessedTwitterFeed if exists; - - create secondary feed ProcessedTwitterFeed from feed TwitterFeed - apply function testlib#addHashTags; - - connect feed ProcessedTwitterFeed to dataset ProcessedTweets; - -The `addHashTags` function is already provided in the example UDF. To see what objects -are being inserted into the dataset, we can perform a simple dataset scan after -allowing a few moments for the feed to start ingesting data: - - use dataverse feeds; - - for $i in dataset ProcessedTweets limit 10 return $i; - -For an example of how to write a Java UDF from scratch, the source for the example -UDF that has been used in this tutorial is available [here](https://github.com/apache/asterixdb/tree/master/asterixdb/asterix-external-data/src/test/java/org/apache/asterix/external/library). - -## <a name="uninstallingUDF">Uninstalling an AsterixDB Library</a> ## - -To uninstall a library, use the Managix uninstall command as follows: - - $ managix stop -n my_asterix - - $ managix uninstall -n my_asterix -d feeds -l testlib - - http://git-wip-us.apache.org/repos/asf/asterixdb/blob/511e1c83/asterixdb/asterix-doc/src/site/site.xml ---------------------------------------------------------------------- diff --git a/asterixdb/asterix-doc/src/site/site.xml b/asterixdb/asterix-doc/src/site/site.xml index 1167c37..6db028e 100644 --- a/asterixdb/asterix-doc/src/site/site.xml +++ b/asterixdb/asterix-doc/src/site/site.xml @@ -90,7 +90,7 @@ <menu name="Advanced Features"> <item name="Accessing External Data" href="aql/externaldata.html"/> - <item name="Support for Data Ingestion" href="feeds/tutorial.html"/> + <item name="Data Ingestion with Feeds" href="feeds.html"/> <item name="User Defined Functions" href="udf.html"/> <item name="Filter-Based LSM Index Acceleration" 
href="sqlpp/filters.html"/> <item name="Support of Full-text Queries" href="sqlpp/fulltext.html"/> http://git-wip-us.apache.org/repos/asf/asterixdb/blob/511e1c83/asterixdb/asterix-external-data/src/main/java/org/apache/asterix/external/util/Datatypes.java ---------------------------------------------------------------------- diff --git a/asterixdb/asterix-external-data/src/main/java/org/apache/asterix/external/util/Datatypes.java b/asterixdb/asterix-external-data/src/main/java/org/apache/asterix/external/util/Datatypes.java index d915559..9381d09 100644 --- a/asterixdb/asterix-external-data/src/main/java/org/apache/asterix/external/util/Datatypes.java +++ b/asterixdb/asterix-external-data/src/main/java/org/apache/asterix/external/util/Datatypes.java @@ -146,22 +146,4 @@ public class Datatypes { private Tweet_User() { } } - - /* - The following assumes this DDL (but ignoring the field name orders): - create type ProcessedTweet if not exists as open { - id: string, - user_name:string, - location:point, - created_at:string, - message_text:string, - country: string, - topics: [string] - }; - */ - public static final class ProcessedTweet { - public static final String USER_NAME = "user_name"; - public static final String LOCATION = "location"; - public static final String TOPICS = "topics"; - } } http://git-wip-us.apache.org/repos/asf/asterixdb/blob/511e1c83/asterixdb/asterix-external-data/src/test/java/org/apache/asterix/external/library/AddHashTagsFactory.java ---------------------------------------------------------------------- diff --git a/asterixdb/asterix-external-data/src/test/java/org/apache/asterix/external/library/AddHashTagsFactory.java b/asterixdb/asterix-external-data/src/test/java/org/apache/asterix/external/library/AddHashTagsFactory.java deleted file mode 100644 index db693a1..0000000 --- a/asterixdb/asterix-external-data/src/test/java/org/apache/asterix/external/library/AddHashTagsFactory.java +++ /dev/null @@ -1,31 +0,0 @@ -/* - * Licensed to the 
Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.asterix.external.library; - -import org.apache.asterix.external.api.IExternalScalarFunction; -import org.apache.asterix.external.api.IFunctionFactory; - -public class AddHashTagsFactory implements IFunctionFactory { - - @Override - public IExternalScalarFunction getExternalFunction() { - return new AddHashTagsFunction(); - } - -} http://git-wip-us.apache.org/repos/asf/asterixdb/blob/511e1c83/asterixdb/asterix-external-data/src/test/java/org/apache/asterix/external/library/AddHashTagsFunction.java ---------------------------------------------------------------------- diff --git a/asterixdb/asterix-external-data/src/test/java/org/apache/asterix/external/library/AddHashTagsFunction.java b/asterixdb/asterix-external-data/src/test/java/org/apache/asterix/external/library/AddHashTagsFunction.java deleted file mode 100644 index 1b5fecd..0000000 --- a/asterixdb/asterix-external-data/src/test/java/org/apache/asterix/external/library/AddHashTagsFunction.java +++ /dev/null @@ -1,85 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. 
See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.asterix.external.library; - -import org.apache.asterix.external.library.java.JBuiltinType; -import org.apache.asterix.external.library.java.base.JDouble; -import org.apache.asterix.external.library.java.base.JPoint; -import org.apache.asterix.external.library.java.base.JRecord; -import org.apache.asterix.external.library.java.base.JString; -import org.apache.asterix.external.library.java.base.JUnorderedList; -import org.apache.asterix.external.api.IExternalScalarFunction; -import org.apache.asterix.external.api.IFunctionHelper; -import org.apache.asterix.external.library.java.JTypeTag; -import org.apache.asterix.external.util.Datatypes; - -public class AddHashTagsFunction implements IExternalScalarFunction { - - private JUnorderedList list = null; - private JPoint location = null; - - @Override - public void initialize(IFunctionHelper functionHelper) { - list = new JUnorderedList(JBuiltinType.JSTRING); - location = new JPoint(0, 0); - } - - @Override - public void deinitialize() { - } - - @Override - public void evaluate(IFunctionHelper functionHelper) throws Exception { - list.clear(); - JRecord inputRecord = (JRecord) functionHelper.getArgument(0); - JString text = (JString) inputRecord.getValueByName(Datatypes.Tweet.MESSAGE); - JDouble latitude = 
(JDouble) inputRecord.getValueByName(Datatypes.Tweet.LATITUDE); - JDouble longitude = (JDouble) inputRecord.getValueByName(Datatypes.Tweet.LONGITUDE); - - if (latitude != null && longitude != null) { - location.setValue(latitude.getValue(), longitude.getValue()); - } else { - location.setValue(0, 0); - } - - String[] tokens = text.getValue().split(" "); - for (String tk : tokens) { - if (tk.startsWith("#")) { - JString newField = (JString) functionHelper.getObject(JTypeTag.STRING); - newField.setValue(tk); - list.add(newField); - } - } - - JRecord outputRecord = (JRecord) functionHelper.getResultObject(); - outputRecord.setField(Datatypes.Tweet.ID, inputRecord.getValueByName(Datatypes.Tweet.ID)); - - JRecord userRecord = (JRecord) inputRecord.getValueByName(Datatypes.Tweet.USER); - outputRecord.setField(Datatypes.ProcessedTweet.USER_NAME, - userRecord.getValueByName(Datatypes.Tweet.SCREEN_NAME)); - - outputRecord.setField(Datatypes.ProcessedTweet.LOCATION, location); - outputRecord.setField(Datatypes.Tweet.CREATED_AT, inputRecord.getValueByName(Datatypes.Tweet.CREATED_AT)); - outputRecord.setField(Datatypes.Tweet.MESSAGE, text); - outputRecord.setField(Datatypes.ProcessedTweet.TOPICS, list); - - functionHelper.setResult(outputRecord); - } - -} http://git-wip-us.apache.org/repos/asf/asterixdb/blob/511e1c83/asterixdb/asterix-external-data/src/test/java/org/apache/asterix/external/library/AddHashTagsInPlaceFunction.java ---------------------------------------------------------------------- diff --git a/asterixdb/asterix-external-data/src/test/java/org/apache/asterix/external/library/AddHashTagsInPlaceFunction.java b/asterixdb/asterix-external-data/src/test/java/org/apache/asterix/external/library/AddHashTagsInPlaceFunction.java index 7873835..ecee876 100644 --- a/asterixdb/asterix-external-data/src/test/java/org/apache/asterix/external/library/AddHashTagsInPlaceFunction.java +++ 
b/asterixdb/asterix-external-data/src/test/java/org/apache/asterix/external/library/AddHashTagsInPlaceFunction.java @@ -54,7 +54,7 @@ public class AddHashTagsInPlaceFunction implements IExternalScalarFunction { list.add(newField); } } - inputRecord.addField(Datatypes.ProcessedTweet.TOPICS, list); + inputRecord.addField("topics", list); functionHelper.setResult(inputRecord); }